diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 919b89c..2f1832d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,12 +16,18 @@ jobs: matrix: version: - '1.10' + - '~1.11.0-0' os: - ubuntu-latest - macOS-latest - windows-latest arch: - x64 + exclude: + - version: '~1.11.0-0' + os: macOS-latest # JET crashes on one unit test and hangs on another + - version: '~1.11.0-0' + os: windows-latest # JET crashes on 3 unit tests steps: - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v1 diff --git a/Project.toml b/Project.toml index 62f5f0a..2dd7065 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,17 @@ name = "UnrolledUtilities" uuid = "0fe1646c-419e-43be-ac14-22321958931b" authors = ["CliMA Contributors "] -version = "0.1.2" +version = "0.1.3" [compat] julia = "1.10" +StaticArrays = "1" + +[weakdeps] +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" + +[extensions] +UnrolledUtilitiesStaticArraysExt = "StaticArrays" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" diff --git a/README.md b/README.md index 29346ae..8f49b68 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@

- + Shows the logo of UnrolledUtilities.jl diff --git a/docs/Project.toml b/docs/Project.toml index 5ea11ab..5daf06b 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -5,3 +5,4 @@ JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +UnrolledUtilities = "0fe1646c-419e-43be-ac14-22321958931b" diff --git a/docs/make.jl b/docs/make.jl index ee728d5..e702add 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,28 +2,37 @@ using Documenter include(joinpath("..", "test", "test_and_analyze.jl")) -comparison_table_file = joinpath("docs", "src", "comparison_table.md") - -open(comparison_table_file, "w") do io - println(io, "# Comparison Table\n```@raw html") - println(io, "
") # use 80% of viewport - print_comparison_table(io, true) - println(io, "
") - println(io, "```") +comparison_tables_file = joinpath("docs", "src", "comparison_tables.md") +preamble_file = joinpath("docs", "src", "comparison_tables_preamble.md") +cp(preamble_file, comparison_tables_file; force = true) +open(comparison_tables_file, "a") do io + for (title, comparison_table_dict) in comparison_table_dicts + print_comparison_table(title, comparison_table_dict, io) + end end makedocs(; sitename = "UnrolledUtilities.jl", modules = [UnrolledUtilities], - pages = ["Home" => "index.md", "Comparison Table" => "comparison_table.md"], + pages = [ + "Home" => "index.md", + "Introduction" => "introduction.md", + "User Guide" => "user_guide.md", + "Developer Guide" => "developer_guide.md", + "Comparison Tables" => basename(comparison_tables_file), + ], format = Documenter.HTML( prettyurls = get(ENV, "CI", nothing) == "true", - size_threshold_ignore = ["comparison_table.md"], + sidebar_sitename = false, + size_threshold_ignore = [ + "introduction.md", + basename(comparison_tables_file), + ], ), clean = true, ) -rm(comparison_table_file) +rm(comparison_tables_file) deploydocs( repo = "github.com/CliMA/UnrolledUtilities.jl.git", diff --git a/logo-white.svg b/docs/src/assets/logo-dark.svg similarity index 99% rename from logo-white.svg rename to docs/src/assets/logo-dark.svg index 2daf34e..c8c43ac 100644 --- a/logo-white.svg +++ b/docs/src/assets/logo-dark.svg @@ -7,7 +7,7 @@ width="1567.9242" height="279.37802" viewBox="0 0 1567.9242 279.37802" - sodipodi:docname="logo-white.svg" + sodipodi:docname="logo-dark.svg" inkscape:version="1.2.2 (b0a8486541, 2022-12-01)" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" diff --git a/docs/src/assets/logo.svg b/docs/src/assets/logo.svg new file mode 100644 index 0000000..f02f78e --- /dev/null +++ b/docs/src/assets/logo.svg @@ -0,0 +1,131 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + UnrolledUtilities.jl + + diff --git 
a/docs/src/comparison_tables_preamble.md b/docs/src/comparison_tables_preamble.md new file mode 100644 index 0000000..bd34f5d --- /dev/null +++ b/docs/src/comparison_tables_preamble.md @@ -0,0 +1,61 @@ +The following autogenerated tables contain a representative set of potential use +cases for this package, along with measurements that summarize each case's +performance, compilation, and memory usage: +- run time (best of several trial measurements) +- compilation time (as reported by the compiler) +- overall level of optimization (type stability, constant propagation, etc.) and + allocations during run time (as reported by the garbage collector) +- total allocations during compilation and first run (as reported by the garbage + collector and, when possible, the Julia process's resident set size estimator) + +The rows of the tables are highlighted as follows: +- ```@raw html + light blue + ``` + indicates better optimization and either an improvement or no change in run + time, compilation time, and total allocations +- ```@raw html + dark blue + ``` + indicates better optimization and either slower run time, slower compilation, + or more total allocations +- ```@raw html + green + ``` + indicates similar optimization, either faster run time or fewer allocations + during run time, and either an improvement or no change in compilation time + and total allocations +- ```@raw html + yellow + ``` + indicates similar optimization, either faster run time or fewer allocations + during run time, and either slower compilation or more total allocations +- ```@raw html + magenta + ``` + indicates no change in performance and either an improvement or no change in + compilation time and total allocations +- ```@raw html + light gray + ``` + indicates no change in performance and either faster compilation with more + total allocations or slower compilation with fewer total allocations +- ```@raw html + dark gray + ``` + indicates no change in performance, compilation time, or 
total allocations +- ```@raw html + red + ``` + indicates a deterioration in performance, or no change in performance and + either slower compilation or more total allocations + +Rows highlighted in gray present no clear advantage to loop unrolling, while +those highlighted in red present a clear disadvantage. It is recommended that +you only call unrolled functions when your use case is similar to a row in one +of the remaining categories, each of which demonstrates some advantage to loop +unrolling. + +The tables are also printed out by this package's test suite, so they can be +compared across different operating systems by consulting the +[CI pipeline](https://github.com/CliMA/UnrolledUtilities.jl/actions/workflows/ci.yml). diff --git a/docs/src/developer_guide.md b/docs/src/developer_guide.md new file mode 100644 index 0000000..9ec99cd --- /dev/null +++ b/docs/src/developer_guide.md @@ -0,0 +1,96 @@ +```@meta +CurrentModule = UnrolledUtilities +``` + +## How to Unroll + +There are two general ways to implement loop unrolling in Julia—recursively +splatting iterator contents and manually generating unrolled expressions. For +example, the recursively unrolled version of `foreach` is + +```julia +unrolled_foreach(f, itr) = _unrolled_foreach(f, itr...) +_unrolled_foreach(f) = nothing +_unrolled_foreach(f, item, items...) = (f(item); _unrolled_foreach(f, items...)) +``` + +In contrast, the generatively unrolled version of `foreach` is + +```julia +unrolled_foreach(f, itr) = _unrolled_foreach(Val(length(itr)), f, itr) +@generated _unrolled_foreach(::Val{N}, f, itr) where {N} = + Expr(:block, (:(f(generic_getindex(itr, $n))) for n in 1:N)..., nothing) +``` + +To switch between recursive and generative unrolling, this package defines the +following function: + +```@docs +rec_unroll +``` + +!!! 
tip "Tip" + Recursive loop unrolling can be disabled globally with the following + function redefinition: + + ```julia + rec_unroll(itr) = false + ``` + +The cutoff length of 16 for switching to generative unrolling is motivated by +the benchmarks for [Generative vs. Recursive Unrolling](@ref). + +## Interface API + +The functions exported by this package can be used with any statically sized +iterators, as long as those iterators make appropriate use of the following +interface: + +```@docs +generic_getindex +output_type_for_promotion +AmbiguousOutputType +NoOutputType +ConditionalOutputType +output_promote_rule +constructor_from_tuple +empty_output +``` + +## How to Use the Interface + +To unroll over a statically sized iterator of some user-defined type `T`, follow +these steps: +- To enable recursive unrolling, add a method for `iterate(::T, [state])` +- To enable generative unrolling, add a method for `getindex(::T, n)` (or for + `generic_getindex(::T, n)` if `getindex` should not be defined for iterators + of type `T`) +- If every unrolled function that needs to construct an iterator when given an + iterator of type `T` can return a `Tuple` instead, stop here +- Otherwise, to return a non-`Tuple` iterator whenever it is efficient to do so, + follow these steps: + - Add a method for `output_type_for_promotion(::T) = O`, where `O` can be + `T`, a supertype of `T`, some other `Type`, or an `AmbiguousOutputType` + - If an iterator whose output type is `O` can be used together with an + iterator whose output type is `O′`, add a method for + `output_promote_rule(O, O′)` + - If `O` is a `NoOutputType`, stop here + - Otherwise, to handle the unambiguous output type `U` that underlies `O` + (where `U` is equivalent to `O` unless `O` is a `ConditionalOutputType`), + follow these steps: + - If an iterator of type `U` can be efficiently constructed from a + `Tuple`, add a method for `constructor_from_tuple(U)` + - Otherwise, for each of the following functions, add a 
method if it can + be implemented to construct an iterator of type `U` without first + storing the iterator's contents in a `Tuple`: + - `empty_output(U)` + - `unrolled_map_into(U, f, itr)` + - `unrolled_accumulate_into(U, op, itr, init, transform)` + - `unrolled_push_into(U, itr, item)` + - `unrolled_append_into(U, itr1, itr2)` + - `unrolled_take_into(U, itr, val_N)` + - `unrolled_drop_into(U, itr, val_N)` + +!!! note "Note" + When a relevant method for the interface is not defined, unrolled functions + will typically fall back to using `Tuple`s instead of other iterator types. diff --git a/docs/src/index.md b/docs/src/index.md index 8aaec38..c317460 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,75 +1,87 @@ +```@setup inference_test +using UnrolledUtilities +``` + # UnrolledUtilities.jl -A collection of generated functions in which all loops are unrolled and inlined: -- `unrolled_any(f, itr)`: similar to `any` -- `unrolled_all(f, itr)`: similar to `all` -- `unrolled_foreach(f, itrs...)`: similar to `foreach` -- `unrolled_map(f, itrs...)`: similar to `map` -- `unrolled_reduce(op, itr; [init])`: similar to `reduce` -- `unrolled_mapreduce(f, op, itrs...; [init])`: similar to `mapreduce` -- `unrolled_zip(itrs...)`: similar to `zip` -- `unrolled_enumerate(itrs...)`: similar to `enumerate`, but with the ability to - handle multiple iterators -- `unrolled_in(item, itr)`: similar to `in` -- `unrolled_unique(itr)`: similar to `unique` -- `unrolled_filter(f, itr)`: similar to `filter` -- `unrolled_split(f, itr)`: similar to `(filter(f, itr), filter(!f, itr))`, but - without duplicate calls to `f` -- `unrolled_flatten(itr)`: similar to `Iterators.flatten` -- `unrolled_flatmap(f, itrs...)`: similar to `Iterators.flatmap` -- `unrolled_product(itrs...)`: similar to `Iterators.product` -- `unrolled_applyat(f, n, itrs...)`: similar to `f(map(itr -> itr[n], itrs)...)` -- `unrolled_take(itr, ::Val{N})`: similar to `itr[1:N]` (and to - `Iterators.take`), but with `N` 
wrapped in a `Val` -- `unrolled_drop(itr, ::Val{N})`: similar to `itr[(N + 1):end]` (and to -  `Iterators.drop`), but with `N` wrapped in a `Val` +A toolkit for low-level optimization of Julia code in which iterator sizes are +known during compilation. + +This package can be used with all *statically sized* iterators (`Tuple`s, +`NamedTuple`s, [`StaticArray`s](https://github.com/JuliaArrays/StaticArrays.jl), +etc.), including ones that are very long or ones that have elements of different +types, both of which are cases that Julia's standard library often handles +inefficiently. For example, the standard library function `in` performs worse +than this package's `unrolled_in` for `Tuple`s with elements of different types: + +```@repl inference_test +@allocated () in ((1, 2), (1, 2, 3)) +@allocated unrolled_in((), ((1, 2), (1, 2, 3))) +``` -These functions are guaranteed to be type-stable whenever they are given -iterators with inferrable lengths and element types, including when -- the iterators have many elements (e.g., more than 32, which is when `map`, -  `reduce`, and `mapreduce` tend to stop getting compiled efficiently) -- the iterators have nonuniform element types (most functions from `Base` and -  `Base.Iterators` tend to encounter type-instabilities and allocations when -  this is the case, especially when there are more than 32 elements) -- `f` and/or `op` recursively call the function to which they are passed, up to -  an arbitrarily large recursion depth (e.g., if `f` calls `map(f, itrs)`, it -  will be type-unstable when the recursion depth exceeds 2, but this will not be -  the case with `unrolled_map`) +The [loop unrolling](https://en.wikipedia.org/wiki/Loop_unrolling) automatically +performed by this package offers the following benefits for statically sized +iterators: +- better support for *static compilation* +  - compilation of [executables](https://github.com/tshort/StaticCompiler.jl) +  - compilation of [GPU kernels](https://github.com/JuliaGPU/CUDA.jl) 
+- better performance (usually) + - reduced run times + - reduced memory footprints while code is running +- better compilation efficiency (occasionally) + - reduced compilation times + - reduced memory footprints while code is compiling -In addition, these functions have been written in a way that makes them very -likely to get fully optimized out through constant propagation when the -iterators have singleton element types (and when the result of calling `f` -and/or `op` on these elements is inferrable). However, they can also be much -more expensive to compile than their counterparts from `Base` and -`Base.Iterators`, in which case they should not be used unless there is a clear -performance benefit. Some notable exceptions to this are `unrolled_zip`, -`unrolled_take`, and `unrolled_drop`, which tend to be easier to compile than -`zip`, `Iterators.take`, `Iterators.drop`, and standard indexing notation. +To find out more about loop unrolling and when it is useful, see the +[Introduction](introduction.md). 
+ +## Package Features + +This package exports a number of analogues to functions from `Base` and +`Base.Iterators`, each of which has been optimized for statically sized +iterators (in terms of both performance and compilation time): +- `unrolled_any(f, itr)`—similar to `any` +- `unrolled_all(f, itr)`—similar to `all` +- `unrolled_foreach(f, itrs...)`—similar to `foreach` +- `unrolled_map(f, itrs...)`—similar to `map` +- `unrolled_reduce(op, itr; [init])`—similar to `reduce` +- `unrolled_mapreduce(f, op, itrs...; [init])`—similar to `mapreduce` +- `unrolled_accumulate(op, itr; [init], [transform])`—similar to `accumulate`, + but with a `transform` that can be applied to every value in the output +- `unrolled_push(itr, item)`—similar to `push!`, but non-mutating +- `unrolled_append(itr1, itr2)`—similar to `append!`, but non-mutating +- `unrolled_take(itr, ::Val{N})`—similar to `Iterators.take` (i.e., `itr[1:N]`), + but with `N` wrapped in a `Val` +- `unrolled_drop(itr, ::Val{N})`—similar to `Iterators.drop` (i.e., + `itr[(N + 1):end]`), but with `N` wrapped in a `Val` +- `unrolled_in(item, itr)`—similar to `in` +- `unrolled_unique(itr)`—similar to `unique` +- `unrolled_filter(f, itr)`—similar to `filter` +- `unrolled_flatten(itr)`—similar to `Iterators.flatten` +- `unrolled_flatmap(f, itrs...)`—similar to `Iterators.flatmap` +- `unrolled_product(itrs...)`—similar to `Iterators.product` + +In addition, this package exports two functions that do not have public +analogues in `Base` or `Base.Iterators`: +- `unrolled_applyat(f, n, itrs...)`—similar to `f(itrs[1][n], itrs[2][n], ...)`, + but with a `Core.Const` index in every call to `getindex` +- `unrolled_split(f, itr)`—similar to `(filter(f, itr), filter(!f, itr))`, but + without duplicate calls to `f` -For a more precise indication of whether you should use `UnrolledUtilities`, -please consult the autogenerated [Comparison Table](@ref). 
This table contains a -comprehensive set of potential use cases, each with a measurement of performance -optimization, the time required for compilation, and the memory usage during -compilation. Most cases involve simple functions `f` and/or `op`, but the last -few demonstrate the benefits of unrolling with non-trivial recursive functions. +These unrolled functions are compatible with the following types of iterators: +- statically sized iterators from `Base` (e.g., `Tuple` and `NamedTuple`) +- statically sized iterators from `StaticArrays` (e.g., `SVector` and `MVector`) +- lazy iterators from `Base` (e.g., the results of `enumerate`, `zip`, + `Iterators.map`, and generator expressions) that are used as wrappers for + statically sized iterators -The rows of the table are highlighted as follows: -- green indicates an improvement in performance and either no change in - compilation or easier compilation (i.e., either similar or smaller values of - compilation time and memory usage) -- dark blue indicates an improvement in performance and harder compilation - (i.e., larger values of compilation time and/or memory usage) -- light blue indicates no change in performance and easier compilation -- yellow indicates no change in performance and no change in compilation -- magenta indicates no change in performance, an increase in compilation time, - and a decrease in compilation memory usage -- red indicates no change in performance and harder compilation +They are also compatible with two new types of statically sized iterators +exported by this package: +- `StaticOneTo`—similar to `Base.OneTo` +- `StaticBitVector`—similar to `BitVector` -Rows highlighted in green and blue present a clear advantage for unrolling, -whereas those highlighted in yellow, magenta, and red either have no clear -advantage, or they have a clear disadvantage. It is recommended that you only -unroll when your use case is similar to a row in the first category. 
+See the [User Guide](@ref "When to Use StaticOneTo and StaticBitVector") for +additional information about these new types of iterators. -The table is also printed out by this package's unit tests, so these -measurements can be compared across different operating systems by checking the -[CI pipeline](https://github.com/CliMA/UnrolledUtilities.jl/actions/workflows/ci.yml). +See the [Developer Guide](@ref "How to Use the Interface") to learn how +user-defined iterator types can be made compatible with unrolled functions. diff --git a/docs/src/introduction.md b/docs/src/introduction.md new file mode 100644 index 0000000..9db926a --- /dev/null +++ b/docs/src/introduction.md @@ -0,0 +1,277 @@ +```@setup inference_test +using UnrolledUtilities, InteractiveUtils, Test +``` + +```@setup fake_inference_test +macro code_warntype(_...) nothing end +macro code_llvm(_...) nothing end +``` + +```@raw html + +``` + +## Motivation for Loop Unrolling + +Although the iteration utilities in `Base` and `Base.Iterators` are sufficiently +performant for most common use cases, those who choose to dive into the world of +low-level optimization will often discover +[type instabilities](https://docs.julialang.org/en/v1/manual/faq/#man-type-stability) +in unexpected situations. Here is a particularly simple example: + +```@repl inference_test +Test.@inferred map(one, Tuple(1:31)); +Test.@inferred map(one, Tuple(1:32)); +``` + +This type instability is present in all `map`s over iterators with lengths +greater than 31, regardless of whether they are statically sized. As with most +type instabilities in Julia, this leads to memory allocations every time `map` +is called with sufficiently long iterators. 
+ +[`Test.@inferred`](https://docs.julialang.org/en/v1/stdlib/Test/#Test.@inferred) +is helpful for checking whether the return type of a function call is stable, +but looking directly at the generated [LLVM](https://llvm.org/docs/LangRef.html) +code reveals just how different the two function calls above are: + +```@repl inference_test +@code_llvm debuginfo=:none map(one, Tuple(1:31)) +``` +```@raw html +
+``` +```@repl fake_inference_test +@code_llvm debuginfo=:none map(one, Tuple(1:32)) +``` +```@raw html + +``` +```@repl inference_test +@code_llvm debuginfo=:none map(one, Tuple(1:32)) # hide +``` +```@raw html +

+``` + +The type instability (and all of the resulting LLVM code complexity) in the +second function call can be eliminated by replacing `map` with `unrolled_map`: + +```@repl inference_test +Test.@inferred unrolled_map(one, Tuple(1:32)); +@code_llvm debuginfo=:none unrolled_map(one, Tuple(1:32)) +``` + +The minimum iterator length for type instability is not always 32; for instance, +it can also be 14: + +```@repl inference_test +first_11(itr) = itr[1:11] +Test.@inferred first_11(Tuple(1:13)); +Test.@inferred first_11(Tuple(1:14)); +``` + +!!! note "Note" + ##### *Why is the function definition needed in this example?* + + On the first line of the example above, `[1:11]` is enclosed in a function + so that it does not get evaluated in global scope. This turns the range + `1:11` into a `Core.Const`, which the compiler can propagate into the call + to `getindex` in order to infer the length of the result: + + ```@setup first_11_code_warntype + using InteractiveUtils + first_11(itr) = itr[1:11] + ``` + + ```@repl first_11_code_warntype + @code_warntype first_11(Tuple(1:13)) + ``` + + In contrast, running `Test.@inferred Tuple(1:13)[1:11]` would amount to + checking whether the compiler can compute the result type of `getindex` + given only the argument types `NTuple{13, Int64}` and `UnitRange{Int64}`, + which it cannot do: + + ```@raw html +
+ ``` + ```@repl fake_inference_test + @code_warntype Tuple(1:13)[1:11] + ``` + ```@raw html + + ``` + ```@repl inference_test + @code_warntype Tuple(1:13)[1:11] # hide + ``` + ```@raw html +

+ ``` + +Although `itr[1:10]` is always inferrable when `itr` is a `Tuple`, `itr[1:11]` +has a type instability whenever `itr` contains more than 13 items. More +generally, `itr[1:N]` seems to be unstable for all `N > 10` whenever `itr` +contains more than `N + 2` items. This type instability can be fixed by +replacing `getindex` with `unrolled_take`: + +```@repl inference_test +unrolled_first_11(itr) = unrolled_take(itr, Val(11)) +Test.@inferred unrolled_first_11(Tuple(1:14)); +``` + +Even when the final result of a function is inferred, there can be intermediate +steps in the function with type instabilities that trigger allocations: + +```@repl inference_test +function add_lengths(itr) + length_sum = 0 + for n in 1:length(itr) + length_sum += length(itr[n]) + end +end +Test.@inferred add_lengths(((1, 2), (1, 2, 3))) +@allocated add_lengths(((1, 2), (1, 2, 3))) +@code_warntype add_lengths(((1, 2), (1, 2, 3))) +``` + +The output of `@code_warntype` is quite cluttered, but the most important detail +here is that the call to `getindex` does not get inferred because it can result +in either a `Tuple` of length 2 or a `Tuple` of length 3. This type instability +can be fixed by replacing `getindex` with `unrolled_applyat`: + +```@repl inference_test +function unrolled_add_lengths(itr) + length_sum = 0 + for n in 1:length(itr) + length_sum += unrolled_applyat(length, n, itr) + end +end +unrolled_add_lengths(((1, 2), (1, 2, 3))) # hide +@allocated unrolled_add_lengths(((1, 2), (1, 2, 3))) +@code_warntype unrolled_add_lengths(((1, 2), (1, 2, 3))) +``` + +For a detailed breakdown of when the tools provided by this package can improve +performance, see the [User Guide](user_guide.md). + +## What Does Loop Unrolling Do + +When a loop over `N` indices is unrolled, it gets compiled into `N` lines of +LLVM code, where each line has a constant (`Core.Const`) index. 
For example, an +unrolled loop that prints every integer from 1 to 33 is compiled into the +following: + +```@raw html +
+``` +```@repl fake_inference_test +@code_llvm debuginfo=:none unrolled_foreach(println, Tuple(1:33)) +``` +```@raw html + +``` +```@repl inference_test +@code_llvm debuginfo=:none unrolled_foreach(println, Tuple(1:33)) # hide +``` +```@raw html +

+``` + +This LLVM code consists of 33 `getelementptr` instructions (each of which +extracts a value from a `Tuple` at a particular index), 33 `load` instructions, +and 33 `call` instructions (each of which switches execution to `println`). +Every `getelementptr` instruction has a constant index between 0 and 32; in more +complex examples where the `call` instructions get inlined, this constant index +can be propagated into the LLVM code of the function being called. On the other +hand, here is the LLVM code for the non-unrolled version of this loop: + +```@repl inference_test +@code_llvm debuginfo=:none foreach(println, Tuple(1:33)) +``` + +Although the first `getelementptr` instruction here has the constant index 0, +the other `getelementptr` instruction has a non-constant integer index. Also, +this LLVM code has conditional jump instructions for checking whether the last +index of the `Tuple` has been reached after each `getelementptr` instruction. + +## Downsides of Loop Unrolling + +```@setup tuple_of_tuples_test +using UnrolledUtilities, Test +tup32 = ntuple(Returns((1, 2)), 32) +``` + +Given the performance benefits of loop unrolling, it might seem at first that +the standard library needs more of it. However, the standard library is not just +meant for writing high-performance code with statically sized iterators—many of +its use cases involve code that is only executed once or several times. In such +cases, most of the execution time is required for compilation, and minimizing +run time makes no practical difference. 
Although unrolled functions can +occasionally be faster to compile than non-unrolled functions, they are +typically slower to compile, which means that using them instead of standard +library functions can often increase total execution time: + +```@repl tuple_of_tuples_test +tup32 = ntuple(Returns((1, 2)), 32); +@elapsed map(first, tup32) +@elapsed unrolled_map(first, tup32) +``` + +The increase in compilation time is usually no more than a factor of 5 for small +iterators, but it grows as iterator length increases: + +```@repl tuple_of_tuples_test +tup320 = ntuple(Returns((1, 2)), 320); +@elapsed map(first, tup320) +@elapsed unrolled_map(first, tup320) +``` + +Moreover, loop unrolling can sometimes increase the run time of a function in +addition to its compilation time: + +```@repl tuple_of_tuples_test +@elapsed Tuple(Iterators.product(tup32, tup32)) # compilation time + run time +@elapsed Tuple(Iterators.product(tup32, tup32)) # only run time +@elapsed unrolled_product(tup32, tup32) # compilation time + run time +@elapsed unrolled_product(tup32, tup32) # only run time +``` + +This increase in run time is most likely due to the larger size of unrolled +code, which makes it take longer to load. Nevertheless, loop unrolling still +offers the benefit of eliminating the unstable return type in this example: + +```@repl tuple_of_tuples_test +Test.@inferred Tuple(Iterators.product(tup32, tup32)); +Test.@inferred unrolled_product(tup32, tup32); +``` + +So, when type instabilities and memory allocations need to be removed +([as is required for static compilation](https://github.com/brenhinkeller/StaticTools.jl#limitations)) +and the cost to total execution time is more or less irrelevant, using unrolled +functions is probably worthwhile. Otherwise, if a significant increase in +compilation time (and potentially also run time) needs to be avoided, using +standard library functions might be a better option. 
+ +It is usually a good idea to compare the performance of unrolled code against +non-unrolled code before settling on a particular design. Many examples of such +comparisons can be found in the [tables of benchmarks](comparison_tables.md) +that are automatically generated for this package. diff --git a/docs/src/user_guide.md b/docs/src/user_guide.md new file mode 100644 index 0000000..676de77 --- /dev/null +++ b/docs/src/user_guide.md @@ -0,0 +1,257 @@ +```@meta +CurrentModule = UnrolledUtilities +``` + +```@setup inference_test +using UnrolledUtilities, InteractiveUtils, Test +``` + +# When to Use UnrolledUtilities + +The functions and types exported by this package tend to perform better than +their counterparts from `Base` and `Base.Iterators` in the scenarios listed +below. Additional examples and more precise measurements can be found in the +automatically generated [tables of benchmarks](comparison_tables.md). + +##### Outline: + +```@contents +Pages = ["user_guide.md"] +Depth = 2:3 +``` + +## When to Use Unrolled Functions + +### Long iterators + +- `map` has an unstable return type for iterators with lengths greater than 32: + + ```@repl inference_test + Test.@inferred map(one, Tuple(1:31)); + Test.@inferred map(one, Tuple(1:32)); + Test.@inferred unrolled_map(one, Tuple(1:32)); + ``` + +- `getindex` has an unstable return type for `Core.Const` slices of length + `N > 10` from iterators with lengths greater than `N + 2`: + + ```@repl inference_test + first_11(itr) = itr[1:11] + Test.@inferred first_11(Tuple(1:13)); + Test.@inferred first_11(Tuple(1:14)); + unrolled_first_11(itr) = unrolled_take(itr, Val(11)) + Test.@inferred unrolled_first_11(Tuple(1:14)); + ``` + +- For benchmarks that indicate performance improvements when using unrolled + functions with long iterators, see [Isolated Unrolled Functions](@ref) + +### Iterators with elements of different types + +- `in` has an intermediate type instability that triggers allocations for + nonuniform 
iterators: + + ```@repl inference_test + @allocated () in ((1, 2), (1, 2, 3)) + @allocated unrolled_in((), ((1, 2), (1, 2, 3))) + ``` + +- `any`, `all`, and `foreach` have intermediate type instabilities that trigger + allocations for nonuniform iterators with lengths greater than 32: + + ```@repl inference_test + const nonuniform_itr_of_length_32 = (ntuple(Returns((1, 2)), 31)..., (1, 2, 3)); + const nonuniform_itr_of_length_33 = (ntuple(Returns((1, 2)), 32)..., (1, 2, 3)); + @allocated any(isempty, nonuniform_itr_of_length_32) + @allocated any(isempty, nonuniform_itr_of_length_33) + @allocated unrolled_any(isempty, nonuniform_itr_of_length_33) + ``` + +- `getindex` has an unstable return type for nonuniform iterators when given + non-constant (i.e., not `Core.Const`) indices, which can lead to intermediate + type instabilities that trigger allocations: + + ```@repl inference_test + function add_lengths(itr) + length_sum = 0 + for n in 1:length(itr) + length_sum += length(itr[n]) + end + end + add_lengths(((1, 2), (1, 2, 3))) # hide + @allocated add_lengths(((1, 2), (1, 2, 3))) + function unrolled_add_lengths(itr) + length_sum = 0 + for n in 1:length(itr) + length_sum += unrolled_applyat(length, n, itr) + end + end + unrolled_add_lengths(((1, 2), (1, 2, 3))) # hide + @allocated unrolled_add_lengths(((1, 2), (1, 2, 3))) + ``` + + !!! note "Note" + ##### *How can `unrolled_applyat` be stable if `n` isn't a `Core.Const`?* + + For the example of `add_lengths`, the compiler must infer the return + type of `itr[::Int64]` before it can compile the call to `length`. + Since this return type depends on the index `n`, the compiler needs to + insert a runtime lookup into the method table that determines which + method of `length` to call, `length(::Tuple{Int64, Int64})` or + `length(::Tuple{Int64, Int64, Int64})`, and this triggers allocations. 
+ + For the example of `unrolled_add_lengths`, the compiler instead infers + the return types of `itr[::Core.Const(1)]`, `itr[::Core.Const(2)]`, + and so on for every index into `itr`. Then, it compiles a call to + `length` for each of these return types, and it inserts a runtime + [switch instruction](https://llvm.org/docs/LangRef.html#switch-instruction) + that determines which result of `length` to return for a particular + value of `n`. As long as `length` itself only returns one type (in this + case, `Int64`), this ensures that `unrolled_add_lengths` has no + intermediate type instabilities. + + In other words, `unrolled_applyat` combines multiple methods for `length` + and `getindex` into a single method, replacing the inefficient method + table lookup that switches between them with a simpler switch instruction. + + !!! tip "Tip" + ##### *When should `getindex` be replaced with `unrolled_applyat`?* + + The specific example above could be simplified by using `mapreduce`, + instead of using a `for`-loop in conjunction with `unrolled_applyat`: + + ```@repl + @allocated mapreduce(length, +, ((1, 2), (1, 2, 3))) + ``` + + However, there are often situations in which it is not possible to + replace loops with function calls, like when those loops are parallelized + over CPU or GPU threads. Moreover, CUDA is unable to compile any kernels + with type instabilities that trigger allocations, so `unrolled_applyat` is + *required* in order to parallelize over nonuniform iterators on GPUs. 
+ +- For benchmarks that indicate performance improvements when using unrolled + functions with nonuniform iterators, see [Isolated Unrolled Functions](@ref) + and [Nested Unrolled Functions](@ref) + +### Reduction operations with non-constant return types + +- `reduce` and `accumulate` have unstable return types when the return type of + `op` is not constant, but only for iterator lengths greater than 32: + + ```@repl inference_test + Test.@inferred reduce(tuple, Tuple(1:32)); + Test.@inferred reduce(tuple, Tuple(1:33)); + Test.@inferred unrolled_reduce(tuple, Tuple(1:33)); + ``` + +- For benchmarks that indicate performance improvements when using unrolled + functions with nonuniform reductions, see [Isolated Unrolled Functions](@ref) + +### Operations with more than 2 levels of recursion + +- All functions in Julia have a default "recursion limit" of 2; unless this + limit is modified, it forces any function that recursively calls itself 2 or + more times to have an unstable return type: + + ```@repl inference_test + recursive_length(itr) = + eltype(itr) <: Tuple ? mapreduce(recursive_length, +, itr) : length(itr) + Test.@inferred recursive_length(((1, 2), (1, 2, 3))); + Test.@inferred recursive_length((((1,), (2,)), (1, 2, 3))); + unrolled_recursive_length(itr) = + eltype(itr) <: Tuple ? + unrolled_mapreduce(unrolled_recursive_length, +, itr) : length(itr) + Test.@inferred unrolled_recursive_length((((1,), (2,)), (1, 2, 3))); + ``` + + !!! note "Note" + ##### *Is there any other way to avoid the default recursion limit?* + + The default recursion limit applies to all functions defined in `Base` and + `Base.Iterators`, so those functions will have unstable return types for + more than 2 levels of recursion, even when all user-defined functions + passed to them have had their recursion limits disabled. It is also + impossible to modify the recursion limits of functions defined in `Base` + from external packages. 
This means that the only way to avoid the default + recursion limit is to not use certain functions from `Base`, and instead + to define alternatives without any recursion limits. + +- For benchmarks that indicate performance improvements when using unrolled + functions with recursive operations, see [Recursive Unrolled Functions](@ref) + +## When to Use `StaticOneTo` and `StaticBitVector` + +### Iterators of `Int`s from 1 to `N` + +```@docs +StaticOneTo +``` + +If an iterator only contains the integers from 1 to `N ≥ 0`, it is possible to +provide the compiler with the values in the iterator in addition to their types +by using a `StaticOneTo`, as opposed to a `Tuple` or something similar. This +can allow the compiler to fully optimize out code that depends on those values, +essentially moving the code's execution from run time to compilation time: + +```@repl inference_test +@code_llvm debuginfo=:none reduce(+, (1, 2, 3)) +@code_llvm debuginfo=:none reduce(+, StaticOneTo(3)) +``` + +Standard library functions can sometimes take advantage of this optimization, +but for most non-trivial operations it is necessary to use unrolled functions: + +```@repl inference_test +@code_llvm debuginfo=:none mapreduce(abs2, +, StaticOneTo(3)) +@code_llvm debuginfo=:none mapreduce(log, +, StaticOneTo(3)) +@code_llvm debuginfo=:none unrolled_mapreduce(log, +, StaticOneTo(3)) +``` + +For benchmarks that indicate performance improvements when using `StaticOneTo`s, +see [Very Long Iterators](@ref). + +!!! note "Note" + ##### *Can the compiler infer iterator values in other scenarios?* + + The compiler can usually infer the values of iterators that only contain + [singletons](https://docs.julialang.org/en/v1/manual/types/#man-singleton-types) + when they are accessed using `Core.Const` indices, but this is not possible + for non-singletons (e.g., integers) unless some special type of iterator is + used (e.g., a `StaticOneTo`). 
+ +### Long iterators of `Bool`s that get modified across loop iterations + +```@docs +StaticBitVector +``` + +Loops in Julia often allocate memory when a value larger than 32 bytes in size +is modified across loop iterations (regardless of whether the loops are unrolled +or not). Since `Bool`s are represented by bytes, this limits certain types of +loops to modifying [bitmasks](https://en.wikipedia.org/wiki/Mask_(computing)) of +no more than 32 `Bool`s in order to avoid allocations. Unlike an iterator of +`Bool`s, though, a `StaticBitVector` stores 8 bits in every byte, which makes it +possible to modify up to 256 bits at a time in loops without any allocations: + +```@repl inference_test +random_bit_flips(itr) = reduce( + (itr′, i) -> Base.setindex(itr′, !itr′[rand(1:i)], i), + 1:length(itr); + init = itr, +) +@allocated random_bit_flips(ntuple(Returns(true), Val(32))) # hide +@allocated random_bit_flips(ntuple(Returns(true), Val(32))) +@allocated random_bit_flips(ntuple(Returns(true), Val(33))) # hide +@allocated random_bit_flips(ntuple(Returns(true), Val(33))) +@allocated random_bit_flips(StaticBitVector{256}(true)) # hide +@allocated random_bit_flips(StaticBitVector{256}(true)) +``` + +As with `StaticOneTo`s, standard library functions can occasionally take +advantage of the optimization allowed by `StaticBitVector`s, but most complex +use cases require unrolled functions. + +For benchmarks that indicate performance improvements when using long +`StaticBitVector`s that get modified across loop iterations, see +[Nested Unrolled Closures](@ref). 
diff --git a/ext/UnrolledUtilitiesStaticArraysExt.jl b/ext/UnrolledUtilitiesStaticArraysExt.jl new file mode 100644 index 0000000..67058a7 --- /dev/null +++ b/ext/UnrolledUtilitiesStaticArraysExt.jl @@ -0,0 +1,12 @@ +module UnrolledUtilitiesStaticArraysExt + +import UnrolledUtilities +import StaticArrays: SVector, MVector + +@inline UnrolledUtilities.output_type_for_promotion(::SVector) = SVector +@inline UnrolledUtilities.constructor_from_tuple(::Type{SVector}) = SVector + +@inline UnrolledUtilities.output_type_for_promotion(::MVector) = MVector +@inline UnrolledUtilities.constructor_from_tuple(::Type{MVector}) = MVector + +end diff --git a/logo-white.png b/logo-dark.png similarity index 100% rename from logo-white.png rename to logo-dark.png diff --git a/logo-dark.svg b/logo-dark.svg new file mode 100644 index 0000000..c8c43ac --- /dev/null +++ b/logo-dark.svg @@ -0,0 +1,131 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + UnrolledUtilities.jl + + diff --git a/src/StaticBitVector.jl b/src/StaticBitVector.jl new file mode 100644 index 0000000..d41bffa --- /dev/null +++ b/src/StaticBitVector.jl @@ -0,0 +1,194 @@ +""" + StaticBitVector{N, [U]}(f) + StaticBitVector{N, [U]}([bit]) + +A statically sized analogue of `BitVector` with `Unsigned` chunks of type `U`, +which can be constructed using either a function `f(n)` or a constant `bit`. By +default, `U` is set to `UInt8` and `bit` is set to `false`. + +This iterator can only store `Bool`s, so its `output_type_for_promotion` is a +`ConditionalOutputType`. Efficient implementations are provided for all unrolled +functions, though the methods for `unrolled_map` and `unrolled_accumulate` only +apply when the first item in the output is a `Bool`. +""" +struct StaticBitVector{N, U <: Unsigned, I <: NTuple{<:Any, U}} <: + StaticSequence{N} + ints::I +end +@inline StaticBitVector{N, U}(ints) where {N, U} = + StaticBitVector{N, U, typeof(ints)}(ints) +@inline StaticBitVector{N}(args...) 
where {N} = + StaticBitVector{N, UInt8}(args...) + +@inline function StaticBitVector{N, U}(bit::Bool = false) where {N, U} + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(N, n_bits_per_int) + ints = ntuple(Returns(bit ? ~zero(U) : zero(U)), Val(n_ints)) + return StaticBitVector{N, U}(ints) +end + +@inline function StaticBitVector{N, U}(f::Function) where {N, U} + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(N, n_bits_per_int) + ints = ntuple(Val(n_ints)) do int_index + @inline + first_index = n_bits_per_int * (int_index - 1) + 1 + unrolled_reduce( + StaticOneTo(min(n_bits_per_int, N - first_index + 1)); + init = zero(U), + ) do int, bit_index + @inline + bit_offset = bit_index - 1 + int | U(f(first_index + bit_offset)::Bool) << bit_offset + end + end + return StaticBitVector{N, U}(ints) +end + +@inline function int_index_and_bit_offset(::Type{U}, n) where {U} + int_offset, bit_offset = divrem(n - 1, 8 * sizeof(U)) + return (int_offset + 1, bit_offset) +end + +@inline function generic_getindex( + itr::StaticBitVector{<:Any, U}, + n::Integer, +) where {U} + int_index, bit_offset = int_index_and_bit_offset(U, n) + int = itr.ints[int_index] + return Bool(int >> bit_offset & one(int)) +end + +@inline function Base.setindex( + itr::StaticBitVector{N, U}, + bit::Bool, + n::Integer, +) where {N, U} + int_index, bit_offset = int_index_and_bit_offset(U, n) + int = itr.ints[int_index] + new_int = int & ~(one(U) << bit_offset) | U(bit) << bit_offset + ints = Base.setindex(itr.ints, new_int, int_index) + return StaticBitVector{N, U}(ints) +end + +@inline output_type_for_promotion(::StaticBitVector{<:Any, U}) where {U} = + ConditionalOutputType(Bool, StaticBitVector{<:Any, U}) + +@inline empty_output(::Type{StaticBitVector{<:Any, U}}) where {U} = + StaticBitVector{0, U}() + +@inline unrolled_map_into(::Type{StaticBitVector{<:Any, U}}, f, itr) where {U} = + StaticBitVector{length(itr), U}( + Base.Fix1(generic_getindex, Iterators.map(f, itr)), + ) + +@inline function 
unrolled_accumulate_into( + ::Type{StaticBitVector{<:Any, U}}, + op, + itr, + init, + transform, +) where {U} + N = length(itr) + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(N, n_bits_per_int) + ints = unrolled_accumulate( + StaticOneTo(n_ints); + init = (nothing, init), + transform = first, + ) do (_, init_value_for_new_int), int_index + @inline + first_index = n_bits_per_int * (int_index - 1) + 1 + unrolled_reduce( + StaticOneTo(min(n_bits_per_int, N - first_index + 1)); + init = (zero(U), init_value_for_new_int), + ) do (int, prev_value), bit_index + @inline + bit_offset = bit_index - 1 + item = generic_getindex(itr, first_index + bit_offset) + new_value = + first_index + bit_offset == 1 && prev_value isa NoInit ? + item : op(prev_value, item) + (int | U(transform(new_value)::Bool) << bit_offset, new_value) + end + end + return StaticBitVector{N, U}(ints) +end + +@inline function unrolled_push_into( + ::Type{StaticBitVector{<:Any, U}}, + itr, + bit, +) where {U} + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(length(itr), n_bits_per_int) + bit_offset = length(itr) % n_bits_per_int + ints = if bit_offset == 0 + (itr.ints..., U(bit)) + else + last_int = itr.ints[n_ints] + new_last_int = + last_int & ~(one(U) << bit_offset) | U(bit) << bit_offset + (unrolled_take(itr.ints, Val(n_ints - 1))..., new_last_int) + end + return StaticBitVector{length(itr) + 1, U}(ints) +end + +@inline function unrolled_append_into( + ::Type{StaticBitVector{<:Any, U}}, + itr1, + itr2, +) where {U} + n_bits_per_int = 8 * sizeof(U) + n_ints1 = cld(length(itr1), n_bits_per_int) + bit_offset = length(itr1) % n_bits_per_int + ints = if bit_offset == 0 || length(itr2) == 0 + (itr1.ints..., itr2.ints...) + else + mid_int1 = itr1.ints[n_ints1] + mid_int2 = itr2.ints[1] + mid_int = + mid_int1 & ~(~zero(U) << bit_offset) | mid_int2 << bit_offset + final_ints = + length(itr2) + bit_offset <= n_bits_per_int ? 
() : + unrolled_drop(itr2, Val(n_bits_per_int - bit_offset)).ints + (unrolled_take(itr1.ints, Val(n_ints1 - 1))..., mid_int, final_ints...) + end + return StaticBitVector{length(itr1) + length(itr2), U}(ints) +end + +@inline function unrolled_take_into( + ::Type{StaticBitVector{<:Any, U}}, + itr, + ::Val{N}, +) where {N, U} + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(N, n_bits_per_int) + ints = unrolled_take(itr.ints, Val(n_ints)) + return StaticBitVector{N, U}(ints) +end + +@inline function unrolled_drop_into( + ::Type{StaticBitVector{<:Any, U}}, + itr, + ::Val{N}, +) where {N, U} + n_bits_per_int = 8 * sizeof(U) + n_ints = cld(length(itr) - N, n_bits_per_int) + n_dropped_ints = fld(N, n_bits_per_int) + bit_offset = N - n_bits_per_int * n_dropped_ints + ints_without_offset = unrolled_drop(itr.ints, Val(n_dropped_ints)) + ints = if bit_offset == 0 || length(itr) <= N + ints_without_offset + else + next_ints = + length(ints_without_offset) == 1 ? (nothing,) : + (unrolled_drop(ints_without_offset, Val(1))..., nothing) + unrolled_map(ints_without_offset, next_ints) do cur_int, next_int + @inline + isnothing(next_int) ? cur_int >> bit_offset : + cur_int >> bit_offset | next_int << (n_bits_per_int - bit_offset) + end + end + return StaticBitVector{length(itr) - N, U}(ints) +end diff --git a/src/StaticOneTo.jl b/src/StaticOneTo.jl new file mode 100644 index 0000000..3a80b0d --- /dev/null +++ b/src/StaticOneTo.jl @@ -0,0 +1,18 @@ +""" + StaticOneTo(N) + +A lazy and statically sized analogue of `Base.OneTo(N)`. + +This iterator can only store the integers from 1 to `N`, so its +`output_type_for_promotion` is `NoOutputType()`. An efficient method is provided +for `unrolled_take`, but no other unrolled functions can use `StaticOneTo`s as +output types. 
+""" +struct StaticOneTo{N} <: StaticSequence{N} end +@inline StaticOneTo(N) = StaticOneTo{N}() + +@inline generic_getindex(::StaticOneTo, n) = n + +@inline output_type_for_promotion(::StaticOneTo) = NoOutputType() + +@inline unrolled_take(::StaticOneTo, ::Val{N}) where {N} = StaticOneTo(N) diff --git a/src/UnrolledUtilities.jl b/src/UnrolledUtilities.jl index dc69559..da4d860 100644 --- a/src/UnrolledUtilities.jl +++ b/src/UnrolledUtilities.jl @@ -4,10 +4,14 @@ export unrolled_any, unrolled_all, unrolled_foreach, unrolled_map, + unrolled_applyat, unrolled_reduce, unrolled_mapreduce, - unrolled_zip, - unrolled_enumerate, + unrolled_accumulate, + unrolled_push, + unrolled_append, + unrolled_take, + unrolled_drop, unrolled_in, unrolled_unique, unrolled_filter, @@ -15,114 +19,161 @@ export unrolled_any, unrolled_flatten, unrolled_flatmap, unrolled_product, - unrolled_applyat, - unrolled_take, - unrolled_drop - -inferred_length(::Type{<:NTuple{N, Any}}) where {N} = N -# We could also add support for statically-sized iterators that are not Tuples. - -f_exprs(itr_type) = (:(f(itr[$n])) for n in 1:inferred_length(itr_type)) -@inline @generated unrolled_any(f, itr) = Expr(:||, f_exprs(itr)...) -@inline @generated unrolled_all(f, itr) = Expr(:&&, f_exprs(itr)...) - -function zipped_f_exprs(itr_types) - L = length(itr_types) - L == 0 && error("unrolled functions need at least one iterator as input") - N = minimum(inferred_length, itr_types) - return (:(f($((:(itrs[$l][$n]) for l in 1:L)...))) for n in 1:N) -end -@inline @generated unrolled_foreach(f, itrs...) = - Expr(:block, zipped_f_exprs(itrs)..., nothing) -@inline @generated unrolled_map(f, itrs...) = - Expr(:tuple, zipped_f_exprs(itrs)...) 
- -function nested_op_expr(itr_type) - N = inferred_length(itr_type) - N == 0 && error("unrolled_reduce needs an `init` value for empty iterators") - item_exprs = (:(itr[$n]) for n in 1:N) - return reduce((expr1, expr2) -> :(op($expr1, $expr2)), item_exprs) -end -@inline @generated unrolled_reduce_without_init(op, itr) = nested_op_expr(itr) - -struct NoInit end + StaticOneTo, + StaticBitVector + +include("unrollable_iterator_interface.jl") +include("recursively_unrolled_functions.jl") +include("generatively_unrolled_functions.jl") + +struct NoInit end # Analogue of Base._InitialValue for reduction/accumulation. + +@inline unrolled_any(f, itr) = + (rec_unroll(itr) ? rec_unrolled_any : gen_unrolled_any)(f, itr) +@inline unrolled_any(itr) = unrolled_any(identity, itr) + +@inline unrolled_all(f, itr) = + (rec_unroll(itr) ? rec_unrolled_all : gen_unrolled_all)(f, itr) +@inline unrolled_all(itr) = unrolled_all(identity, itr) + +@inline unrolled_foreach(f, itr) = + (rec_unroll(itr) ? rec_unrolled_foreach : gen_unrolled_foreach)(f, itr) +@inline unrolled_foreach(f, itrs...) = unrolled_foreach(splat(f), zip(itrs...)) + +@inline unrolled_map_into_tuple(f, itr) = + (rec_unroll(itr) ? rec_unrolled_map : gen_unrolled_map)(f, itr) +@inline unrolled_map_into(output_type, f, itr) = + constructor_from_tuple(output_type)(unrolled_map_into_tuple(f, itr)) +@inline unrolled_map(f, itr) = + unrolled_map_into(inferred_output_type(Iterators.map(f, itr)), f, itr) +@inline unrolled_map(f, itrs...) = unrolled_map(splat(f), zip(itrs...)) + +@inline unrolled_applyat(f, n, itr) = + (rec_unroll(itr) ? rec_unrolled_applyat : gen_unrolled_applyat)(f, n, itr) +@inline unrolled_applyat(f, n, itrs...) = + unrolled_applyat(splat(f), n, zip(itrs...)) +@inline unrolled_applyat_bounds_error() = + error("unrolled_applyat has detected an out-of-bounds index") + +@inline unrolled_reduce(op, itr, init) = + (rec_unroll(itr) ? 
rec_unrolled_reduce : gen_unrolled_reduce)(op, itr, init) @inline unrolled_reduce(op, itr; init = NoInit()) = - unrolled_reduce_without_init(op, init isa NoInit ? itr : (init, itr...)) + isempty(itr) && init isa NoInit ? + error("unrolled_reduce requires an init value for empty iterators") : + unrolled_reduce(op, itr, init) + +# TODO: Figure out why unrolled_reduce(op, Val(N), init) compiles faster than +# unrolled_reduce(op, StaticOneTo(N), init) for the non-orographic gravity wave +# parametrization test in ClimaAtmos, to the point where the StaticOneTo version +# appears to completely hang while the Val version compiles in under a second. +@inline unrolled_reduce(op, val_N::Val, init) = + val_unrolled_reduce(op, val_N, init) +@inline unrolled_reduce(op, val_N::Val; init = NoInit()) = + val_N isa Val{0} && init isa NoInit ? + error("unrolled_reduce requires an init value for empty iterators") : + unrolled_reduce(op, val_N, init) @inline unrolled_mapreduce(f, op, itrs...; init = NoInit()) = - unrolled_reduce(op, unrolled_map(f, itrs...); init) + unrolled_reduce(op, Iterators.map(f, itrs...), init) + +@inline unrolled_accumulate_into_tuple(op, itr, init, transform) = + (rec_unroll(itr) ? 
rec_unrolled_accumulate : gen_unrolled_accumulate)( + op, + itr, + init, + transform, + ) +@inline unrolled_accumulate_into(output_type, op, itr, init, transform) = + constructor_from_tuple(output_type)( + unrolled_accumulate_into_tuple(op, itr, init, transform), + ) +@inline unrolled_accumulate(op, itr; init = NoInit(), transform = identity) = + unrolled_accumulate_into( + accumulate_output_type(op, itr, init, transform), + op, + itr, + init, + transform, + ) + +@inline unrolled_push_into(output_type, itr, item) = + constructor_from_tuple(output_type)((itr..., item)) +@inline unrolled_push(itr, item) = + unrolled_push_into(inferred_output_type(itr), itr, item) + +@inline unrolled_append_into(output_type, itr1, itr2) = + constructor_from_tuple(output_type)((itr1..., itr2...)) +@inline unrolled_append(itr1, itr2) = + unrolled_append_into(promoted_output_type((itr1, itr2)), itr1, itr2) -@inline unrolled_zip(itrs...) = unrolled_map(tuple, itrs...) +@inline unrolled_take_into(output_type, itr, ::Val{N}) where {N} = + constructor_from_tuple(output_type)( + ntuple(Base.Fix1(generic_getindex, itr), Val(N)), + ) +@inline unrolled_take(itr, val_N) = + unrolled_take_into(inferred_output_type(itr), itr, val_N) -@inline unrolled_enumerate(itrs...) = - unrolled_zip(ntuple(identity, Val(length(itrs[1]))), itrs...) +@inline unrolled_drop_into(output_type, itr, ::Val{N}) where {N} = + constructor_from_tuple(output_type)( + ntuple(n -> generic_getindex(itr, N + n), Val(length(itr) - N)), + ) +@inline unrolled_drop(itr, val_N) = + unrolled_drop_into(inferred_output_type(itr), itr, val_N) @inline unrolled_in(item, itr) = unrolled_any(Base.Fix1(===, item), itr) # Using === instead of == or isequal improves type stability for singletons. @inline unrolled_unique(itr) = - unrolled_reduce(itr; init = ()) do unique_items, item + unrolled_reduce(itr; init = inferred_empty(itr)) do unique_items, item @inline - unrolled_in(item, unique_items) ? 
unique_items : (unique_items..., item) + unrolled_in(item, unique_items) ? unique_items : + unrolled_push(unique_items, item) end @inline unrolled_filter(f, itr) = - unrolled_reduce(itr; init = ()) do filtered_items, item + unrolled_reduce(itr; init = inferred_empty(itr)) do items_with_true_f, item @inline - f(item) ? (filtered_items..., item) : filtered_items + f(item) ? unrolled_push(items_with_true_f, item) : items_with_true_f end @inline unrolled_split(f, itr) = - unrolled_reduce(itr; init = ((), ())) do (f_items, not_f_items), item + unrolled_reduce( + itr; + init = (inferred_empty(itr), inferred_empty(itr)), + ) do (items_with_true_f, items_with_false_f), item @inline - f(item) ? ((f_items..., item), not_f_items) : - (f_items, (not_f_items..., item)) + f(item) ? (unrolled_push(items_with_true_f, item), items_with_false_f) : + (items_with_true_f, unrolled_push(items_with_false_f, item)) end @inline unrolled_flatten(itr) = - unrolled_reduce((item1, item2) -> (item1..., item2...), itr; init = ()) + unrolled_reduce(unrolled_append, itr; init = promoted_empty(itr)) @inline unrolled_flatmap(f, itrs...) = - unrolled_flatten(unrolled_map(f, itrs...)) + unrolled_flatten(Iterators.map(f, itrs...)) @inline unrolled_product(itrs...) = - unrolled_reduce(itrs; init = ((),)) do product_itr, itr + unrolled_reduce(itrs; init = (promoted_empty(itrs),)) do product_itr, itr @inline unrolled_flatmap(itr) do item @inline - unrolled_map(product_tuple -> (product_tuple..., item), product_itr) + unrolled_map_into_tuple(Base.Fix2(unrolled_push, item), product_itr) end end -@inline unrolled_applyat(f, n, itrs...) = unrolled_foreach( - (i, items...) -> i == n && f(items...), - unrolled_enumerate(itrs...), -) - -@inline unrolled_take(itr, ::Val{N}) where {N} = ntuple(i -> itr[i], Val(N)) -@inline unrolled_drop(itr, ::Val{N}) where {N} = - ntuple(i -> itr[N + i], Val(length(itr) - N)) -# When its second argument is a Val, ntuple is unrolled via Base.@ntuple. 
- -@static if hasfield(Method, :recursion_relation) - # Remove recursion limits for functions whose arguments are also functions. - for func in ( - unrolled_any, - unrolled_all, - unrolled_foreach, - unrolled_map, - unrolled_reduce_without_init, - unrolled_reduce, - unrolled_mapreduce, - unrolled_filter, - unrolled_split, - unrolled_flatmap, - unrolled_applyat, - ) - for method in methods(func) - method.recursion_relation = (_...) -> true - end - end -end +abstract type StaticSequence{N} end + +@inline Base.length(::StaticSequence{N}) where {N} = N +@inline Base.firstindex(::StaticSequence) = 1 +@inline Base.lastindex(itr::StaticSequence) = length(itr) +@inline Base.getindex(itr::StaticSequence, n::Integer) = + generic_getindex(itr, n) +@inline Base.iterate(itr::StaticSequence, n = 1) = + n > length(itr) ? nothing : (generic_getindex(itr, n), n + 1) + +include("StaticOneTo.jl") +include("StaticBitVector.jl") + +include("recursion_limits.jl") # This must be included at the end of the module. end diff --git a/src/generatively_unrolled_functions.jl b/src/generatively_unrolled_functions.jl new file mode 100644 index 0000000..91ec3af --- /dev/null +++ b/src/generatively_unrolled_functions.jl @@ -0,0 +1,60 @@ +@inline @generated _gen_unrolled_any(::Val{N}, f, itr) where {N} = + Expr(:||, (:(f(generic_getindex(itr, $n))) for n in 1:N)...) +@inline gen_unrolled_any(f, itr) = _gen_unrolled_any(Val(length(itr)), f, itr) + +@inline @generated _gen_unrolled_all(::Val{N}, f, itr) where {N} = + Expr(:&&, (:(f(generic_getindex(itr, $n))) for n in 1:N)...) 
+@inline gen_unrolled_all(f, itr) = _gen_unrolled_all(Val(length(itr)), f, itr) + +@inline @generated _gen_unrolled_foreach(::Val{N}, f, itr) where {N} = + Expr(:block, (:(f(generic_getindex(itr, $n))) for n in 1:N)..., nothing) +@inline gen_unrolled_foreach(f, itr) = + _gen_unrolled_foreach(Val(length(itr)), f, itr) + +@inline @generated _gen_unrolled_map(::Val{N}, f, itr) where {N} = + Expr(:tuple, (:(f(generic_getindex(itr, $n))) for n in 1:N)...) +@inline gen_unrolled_map(f, itr) = _gen_unrolled_map(Val(length(itr)), f, itr) + +@inline @generated _gen_unrolled_applyat(::Val{N}, f, n, itr) where {N} = Expr( + :block, + (:(n == $n && return f(generic_getindex(itr, $n))) for n in 1:N)..., + :(unrolled_applyat_bounds_error()), +) # This block gets optimized into a switch instruction during LLVM codegen. +@inline gen_unrolled_applyat(f, n, itr) = + _gen_unrolled_applyat(Val(length(itr)), f, n, itr) + +@inline @generated _gen_unrolled_reduce(::Val{N}, op, itr, init) where {N} = + foldl( + init <: NoInit ? (2:N) : (1:N); + init = init <: NoInit ? :(generic_getindex(itr, 1)) : :init, + ) do prev_op_expr, n + :(op($prev_op_expr, generic_getindex(itr, $n))) + end # Use foldl instead of reduce to guarantee left associativity. +@inline gen_unrolled_reduce(op, itr, init) = + _gen_unrolled_reduce(Val(length(itr)), op, itr, init) + +@inline @generated function _gen_unrolled_accumulate( + ::Val{N}, + op, + itr, + init, + transform, +) where {N} + first_item_expr = :(generic_getindex(itr, 1)) + init_expr = init <: NoInit ? first_item_expr : :(op(init, $first_item_expr)) + transformed_exprs_and_op_exprs = + accumulate(1:N; init = (nothing, init_expr)) do (_, prev_op_expr), n + var = gensym() + op_expr = :(op($var, generic_getindex(itr, $(n + 1)))) + (:($var = $prev_op_expr; transform($var)), op_expr) + end + return Expr(:tuple, Iterators.map(first, transformed_exprs_and_op_exprs)...) 
+end +@inline gen_unrolled_accumulate(op, itr, init, transform) = + _gen_unrolled_accumulate(Val(length(itr)), op, itr, init, transform) + +# NOTE: The following is experimental and will likely be removed in the future. +@inline @generated val_unrolled_reduce(op, ::Val{N}, init) where {N} = + foldl(init <: NoInit ? (1:N) : (:init, 1:N...)) do prev_op_expr, item_expr + :(op($prev_op_expr, $item_expr)) + end # Use foldl instead of reduce to guarantee left associativity. diff --git a/src/recursion_limits.jl b/src/recursion_limits.jl new file mode 100644 index 0000000..9f9c279 --- /dev/null +++ b/src/recursion_limits.jl @@ -0,0 +1,56 @@ +# Remove recursion limits from functions that call themselves, and also from all +# functions whose arguments can be arbitrary functions (including themselves). +@static if hasfield(Method, :recursion_relation) + for func in ( + generic_getindex, + output_type_for_promotion, + _rec_unrolled_any, + _rec_unrolled_all, + _rec_unrolled_foreach, + _rec_unrolled_map, + _rec_unrolled_applyat, + _rec_unrolled_reduce, + _rec_unrolled_accumulate, + rec_unrolled_any, + rec_unrolled_all, + rec_unrolled_foreach, + rec_unrolled_map, + rec_unrolled_applyat, + rec_unrolled_reduce, + rec_unrolled_accumulate, + _gen_unrolled_any, + _gen_unrolled_all, + _gen_unrolled_foreach, + _gen_unrolled_map, + _gen_unrolled_applyat, + _gen_unrolled_reduce, + _gen_unrolled_accumulate, + gen_unrolled_any, + gen_unrolled_all, + gen_unrolled_foreach, + gen_unrolled_map, + gen_unrolled_applyat, + gen_unrolled_reduce, + gen_unrolled_accumulate, + val_unrolled_reduce, + unrolled_any, + unrolled_all, + unrolled_foreach, + unrolled_map_into_tuple, + unrolled_map_into, + unrolled_map, + unrolled_applyat, + unrolled_reduce, + unrolled_mapreduce, + unrolled_accumulate_into_tuple, + unrolled_accumulate_into, + unrolled_accumulate, + unrolled_filter, + unrolled_split, + unrolled_flatmap, + ) + for method in methods(func) + method.recursion_relation = Returns(true) + end + end 
+end diff --git a/src/recursively_unrolled_functions.jl b/src/recursively_unrolled_functions.jl new file mode 100644 index 0000000..db88bef --- /dev/null +++ b/src/recursively_unrolled_functions.jl @@ -0,0 +1,47 @@ +@inline _rec_unrolled_any(f) = false +@inline _rec_unrolled_any(f, item, items...) = + f(item) || _rec_unrolled_any(f, items...) +@inline rec_unrolled_any(f, itr) = _rec_unrolled_any(f, itr...) + +@inline _rec_unrolled_all(f) = true +@inline _rec_unrolled_all(f, item, items...) = + f(item) && _rec_unrolled_all(f, items...) +@inline rec_unrolled_all(f, itr) = _rec_unrolled_all(f, itr...) + +@inline _rec_unrolled_foreach(f) = nothing +@inline _rec_unrolled_foreach(f, item, items...) = + (f(item); _rec_unrolled_foreach(f, items...)) +@inline rec_unrolled_foreach(f, itr) = _rec_unrolled_foreach(f, itr...) + +@inline _rec_unrolled_map(f) = () +@inline _rec_unrolled_map(f, item, items...) = + (f(item), _rec_unrolled_map(f, items...)...) +@inline rec_unrolled_map(f, itr) = _rec_unrolled_map(f, itr...) + +@inline _rec_unrolled_applyat(f, offset_n) = unrolled_applyat_bounds_error() +@inline _rec_unrolled_applyat(f, offset_n, item, items...) = + offset_n == 1 ? f(item) : _rec_unrolled_applyat(f, offset_n - 1, items...) +@inline rec_unrolled_applyat(f, n, itr) = _rec_unrolled_applyat(f, n, itr...) + +@inline _rec_unrolled_reduce(op, prev_value) = prev_value +@inline _rec_unrolled_reduce(op, prev_value, item, items...) = + _rec_unrolled_reduce(op, op(prev_value, item), items...) +@inline rec_unrolled_reduce(op, itr, init) = + init isa NoInit ? _rec_unrolled_reduce(op, itr...) : + _rec_unrolled_reduce(op, init, itr...) + +@inline _rec_unrolled_accumulate(op, transform, prev_value) = + (transform(prev_value),) +@inline _rec_unrolled_accumulate(op, transform, prev_value, item, items...) 
= ( + transform(prev_value), + _rec_unrolled_accumulate(op, transform, op(prev_value, item), items...)..., +) +@inline rec_unrolled_accumulate(op, itr, init, transform) = + isempty(itr) ? () : + init isa NoInit ? _rec_unrolled_accumulate(op, transform, itr...) : + _rec_unrolled_accumulate( + op, + transform, + op(init, generic_getindex(itr, 1)), + unrolled_drop(itr, Val(1))..., + ) diff --git a/src/unrollable_iterator_interface.jl b/src/unrollable_iterator_interface.jl new file mode 100644 index 0000000..f17705f --- /dev/null +++ b/src/unrollable_iterator_interface.jl @@ -0,0 +1,205 @@ +""" + rec_unroll(itr) + +Whether to use recursive loop unrolling instead of generative loop unrolling for +the iterator `itr`. + +In general, recursive loop unrolling is faster to compile for small iterators, +but it becomes extremely slow to compile for long iterators, and it usually +generates suboptimal LLVM code for long iterators. On the other hand, generative +loop unrolling is slow to compile for small iterators, but its compilation time +does not grow as rapidly with respect to iterator size, and it always generates +optimal LLVM code. The default is currently to use recursive unrolling for +iterator lengths up to 16, and to use generative unrolling for longer iterators. +""" +@inline rec_unroll(itr) = length(itr) <= 16 + +""" + generic_getindex(itr, n) + +Identical to `getindex(itr, n)`, but with the added ability to handle lazy +iterator types defined in the standard library, such as `Base.Generator` and +`Base.Iterators.Enumerate`. 
+""" +@inline generic_getindex(itr, n) = getindex(itr, n) +@inline generic_getindex(itr::Base.Generator, n) = + itr.f(generic_getindex(itr.iter, n)) +@inline generic_getindex(itr::Base.Iterators.Enumerate, n) = + (n, generic_getindex(itr.itr, n)) +@inline generic_getindex(itr::Base.Iterators.Zip, n) = + unrolled_map_into_tuple(Base.Fix2(generic_getindex, n), itr.is) + +@inline first_item_type(itr) = + Base.promote_op(Base.Fix2(generic_getindex, 1), typeof(itr)) +@inline second_item_type(itr) = + Base.promote_op(Base.Fix2(generic_getindex, 2), typeof(itr)) + +""" + output_type_for_promotion(itr) + +The type of output that unrolled functions should try to generate for the input +iterator `itr`, or a `ConditionalOutputType` if the output type depends on the +type of items that need to be stored in it, or `NoOutputType()` if `itr` is a +lazy iterator without any associated output type. Defaults to `Tuple`. +""" +@inline output_type_for_promotion(_) = Tuple +@inline output_type_for_promotion(::NamedTuple{names}) where {names} = + NamedTuple{names} +@inline output_type_for_promotion(itr::Base.Generator) = + output_type_for_promotion(itr.iter) +@inline output_type_for_promotion(itr::Base.Iterators.Enumerate) = + output_type_for_promotion(itr.itr) +@inline output_type_for_promotion(itr::Base.Iterators.Zip) = + maybe_ambiguous_promoted_output_type(itr.is) + +""" + AmbiguousOutputType + +The result of `output_type_for_promotion` for iterators that do not have +well-defined output types. +""" +abstract type AmbiguousOutputType end + +""" + NoOutputType() + +The `AmbiguousOutputType` of lazy iterators. +""" +struct NoOutputType <: AmbiguousOutputType end + +""" + ConditionalOutputType(allowed_item_type, output_type, [fallback_type]) + +An `AmbiguousOutputType` that can have one of two possible values. 
If the first +item in the output is a subtype of `allowed_item_type`, the output will have the +type `output_type`; otherwise, it will have the type `fallback_type`, which is +set to `Tuple` by default. +""" +struct ConditionalOutputType{I, O, O′} <: AmbiguousOutputType end +@inline ConditionalOutputType( + allowed_item_type::Type, + output_type::Type, + fallback_type::Type = Tuple, +) = ConditionalOutputType{allowed_item_type, output_type, fallback_type}() + +@inline unambiguous_output_type(_, ::Type{O}) where {O} = O +@inline unambiguous_output_type(_, ::NoOutputType) = Tuple +@inline unambiguous_output_type( + get_first_item_type, + ::ConditionalOutputType{I, O, O′}, +) where {I, O, O′} = get_first_item_type() <: I ? O : O′ + +""" + output_promote_rule(output_type1, output_type2) + +The type of output that should be generated when two iterators do not have the +same `output_type_for_promotion`, or `Union{}` if these iterators should not be +used together. Only one method of `output_promote_rule` needs to be defined for +any pair of output types. + +By default, all types take precedence over `NoOutputType()`, and the conditional +part of any `ConditionalOutputType` takes precedence over an unconditional type +(so that only the `fallback_type` of any conditional type gets promoted). The +default result for all other pairs of unequal output types is `Union{}`. 
+""" +@inline output_promote_rule(_, _) = Union{} +@inline output_promote_rule(::Type{O}, ::Type{O}) where {O} = O +@inline output_promote_rule(::NoOutputType, output_type) = output_type +@inline output_promote_rule( + ::ConditionalOutputType{I, O, O′}, + ::Type{O′′}, +) where {I, O, O′, O′′} = + ConditionalOutputType(I, O, output_promote_rule(O′, O′′)) +@inline output_promote_rule( + ::Type{O′}, + ::ConditionalOutputType{I, O, O′′}, +) where {I, O, O′, O′′} = + ConditionalOutputType(I, O, output_promote_rule(O′, O′′)) +@inline output_promote_rule( + ::ConditionalOutputType{I, O, O′}, + ::ConditionalOutputType{I, O, O′′}, +) where {I, O, O′, O′′} = + ConditionalOutputType(I, O, output_promote_rule(O′, O′′)) + +@inline function output_promote_result(O1, O2) + O12 = output_promote_rule(O1, O2) + O21 = output_promote_rule(O2, O1) + O12 == O21 == Union{} && + error("output_promote_rule is undefined for $O1 and $O2") + (O12 == O21 || O21 == Union{}) && return O12 + O12 == Union{} && return O21 + error("output_promote_rule yields inconsistent results for $O1 and $O2: \ + $O12 for $O1 followed by $O2, versus $O21 for $O2 followed by $O1") +end + +@inline maybe_ambiguous_promoted_output_type(itrs) = + isempty(itrs) ? Tuple : # Generate a Tuple when given 0 inputs. 
+ unrolled_mapreduce(output_type_for_promotion, output_promote_result, itrs) + +@inline inferred_output_type(itr) = + unambiguous_output_type(output_type_for_promotion(itr)) do + @inline + first_item_type(itr) + end +@inline inferred_output_type(itr::Base.Generator) = + unambiguous_output_type(output_type_for_promotion(itr.iter)) do + @inline + Base.promote_op(itr.f, first_item_type(itr.iter)) + end +@inline inferred_output_type(itr::Base.Iterators.Enumerate) = + unambiguous_output_type(output_type_for_promotion(itr.itr)) do + @inline + Tuple{Int, first_item_type(itr.itr)} + end +@inline inferred_output_type(itr::Base.Iterators.Zip) = + unambiguous_output_type(maybe_ambiguous_promoted_output_type(itr.is)) do + @inline + Tuple{unrolled_map_into_tuple(first_item_type, itr.is)...} + end + +@inline promoted_output_type(itrs) = + unambiguous_output_type(maybe_ambiguous_promoted_output_type(itrs)) do + @inline + first_item_type(generic_getindex(itrs, 1)) + end + +@inline accumulate_output_type(op, itr, init, transform) = + unambiguous_output_type(output_type_for_promotion(itr)) do + @inline + no_init = init isa NoInit + arg1_type = no_init ? first_item_type(itr) : typeof(init) + arg2_type = no_init ? second_item_type(itr) : first_item_type(itr) + Base.promote_op(transform, Base.promote_op(op, arg1_type, arg2_type)) + end + +""" + constructor_from_tuple(output_type) + +A function that can be used to efficiently construct an output of type +`output_type` from a `Tuple`, or `identity` if such an output should not be +constructed from a `Tuple`. Defaults to `identity`, which also handles the case +where `output_type` is already `Tuple`. The `output_type` here is guaranteed to +be a `Type`, rather than a `ConditionalOutputType` or `NoOutputType`. + +Many statically sized iterators (e.g., `SVector`s) are essentially wrappers for +`Tuple`s, and their constructors for `Tuple`s can be reduced to no-ops. 
The main +exceptions are [`StaticOneTo`](@ref UnrolledUtilities.StaticOneTo)s and +[`StaticBitVector`](@ref UnrolledUtilities.StaticBitVector)s, which do not +provide constructors for `Tuple`s because there is no performance benefit to +making a lazy or low-storage data structure once a corresponding high-storage +data structure has already been constructed. +""" +@inline constructor_from_tuple(::Type) = identity +@inline constructor_from_tuple(::Type{NT}) where {NT <: NamedTuple} = NT + +""" + empty_output(output_type) + +An empty output of type `output_type`. Defaults to applying the +`constructor_from_tuple` for the given type to an empty `Tuple`. +""" +@inline empty_output(output_type) = constructor_from_tuple(output_type)(()) + +@inline inferred_empty(itr) = empty_output(inferred_output_type(itr)) + +@inline promoted_empty(itrs) = empty_output(promoted_output_type(itrs)) diff --git a/test/aqua.jl b/test/aqua.jl index d7becf1..ff1edd1 100644 --- a/test/aqua.jl +++ b/test/aqua.jl @@ -1,3 +1,4 @@ +using Test import Aqua, UnrolledUtilities # This is separate from all the other tests because Aqua.test_all checks for diff --git a/test/runtests.jl b/test/runtests.jl index 631181c..0cfddab 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,7 +2,9 @@ using SafeTestsets @safetestset "Test and Analyze" begin @time include("test_and_analyze.jl") - print_comparison_table() + for (title, comparison_table_dict) in comparison_table_dicts + print_comparison_table(title, comparison_table_dict) + end end @safetestset "Aqua" begin @time include("aqua.jl") diff --git a/test/test_and_analyze.jl b/test/test_and_analyze.jl index 70415e3..b1beb7d 100644 --- a/test/test_and_analyze.jl +++ b/test/test_and_analyze.jl @@ -6,45 +6,79 @@ using InteractiveUtils using UnrolledUtilities -comparison_table_dict = OrderedDict() +comparison_table_dicts = OrderedDict() -function print_comparison_table(io = stdout, generate_html = false) +function print_comparison_table(title, 
comparison_table_dict, io = stdout) table_data = mapreduce(vcat, collect(comparison_table_dict)) do (key, entries) stack(entry -> (key..., entry...), entries; dims = 1) end - highlighter(f, color) = - generate_html ? HtmlHighlighter(f, HtmlDecoration(; color)) : - Highlighter(f, Crayon(; foreground = Symbol(color))) - - better_performance_but_harder_to_compile = - highlighter(generate_html ? "royalblue" : "blue") do data, i, j - data[i, 4] != data[i, 5] && - (endswith(data[i, 6], "slower") || endswith(data[i, 7], "more")) - end - better_performance = - highlighter(generate_html ? "mediumseagreen" : "green") do data, i, j - data[i, 4] != data[i, 5] - end - mixed_compilation = - highlighter(generate_html ? "mediumorchid" : "magenta") do data, i, j - (endswith(data[i, 6], "slower") && endswith(data[i, 7], "less")) || - (endswith(data[i, 6], "faster") && endswith(data[i, 7], "more")) - end - harder_to_compile = - highlighter(generate_html ? "indianred" : "red") do data, i, j - endswith(data[i, 6], "slower") || endswith(data[i, 7], "more") - end - easier_to_compile = - highlighter(generate_html ? "darkturquoise" : "cyan") do data, i, j - endswith(data[i, 6], "faster") || endswith(data[i, 7], "less") + writing_to_docs = io isa IOStream + + color(color_str) = + writing_to_docs ? HtmlDecoration(; color = color_str) : + Crayon(; foreground = Symbol(color_str)) + highlighter_color(optimization, run_time, compile_time, allocs) = + if contains(optimization, "better") || + contains(optimization, "fewer allocs") && + !contains(run_time, "more") || + contains(optimization, "identical") && contains(run_time, "less") + # better performance + if !contains(run_time, "more") && + !contains(compile_time, "more") && + !contains(allocs, "more") + # similar or better run time, compilation, and total allocations + if contains(optimization, "better") + # better optimization + color(writing_to_docs ? 
"darkturquoise" : "cyan") + else + # faster run time or fewer allocations at run time + color(writing_to_docs ? "mediumseagreen" : "green") + end + else + # worse run time, compilation, or total allocations + if contains(optimization, "better") + # better optimization + color(writing_to_docs ? "royalblue" : "blue") + else + # faster run time or fewer allocations at run time + color(writing_to_docs ? "khaki" : "yellow") + end + end + elseif contains(optimization, "identical") && + contains(run_time, "similar") + # similar performance + if contains(compile_time, "less") && !contains(allocs, "more") || + !contains(compile_time, "more") && contains(allocs, "less") + # better compilation or total allocations + color(writing_to_docs ? "mediumorchid" : "magenta") + elseif contains(compile_time, "less") && contains(allocs, "more") || + contains(compile_time, "more") && contains(allocs, "less") + # mixed compilation and total allocations + color(writing_to_docs ? "silver" : "light_gray") + elseif contains(compile_time, "similar") && + contains(allocs, "similar") + # similar compilation and total allocations + color(writing_to_docs ? "gray" : "dark_gray") + else + # worse compilation or total allocations + color(writing_to_docs ? "indianred" : "red") + end + else + # worse performance + color(writing_to_docs ? "indianred" : "red") end - no_difference = - highlighter((data, i, j) -> true, generate_html ? "khaki" : "yellow") + highlighter = (writing_to_docs ? HtmlHighlighter : Highlighter)( + Returns(true), + (_, data, row, _) -> highlighter_color(data[row, 6:9]...), + ) + + # TODO: Why does Sys.maxrss() always seem to be 0 on Ubuntu systems? + has_rss = any(contains('['), table_data[:, 9]) other_kwargs = - generate_html ? + writing_to_docs ? 
(; backend = Val(:html), table_style = Dict( @@ -53,38 +87,86 @@ function print_comparison_table(io = stdout, generate_html = false) ), ) : (; + title, + title_alignment = :c, title_same_width_as_table = true, - columns_width = [45, 45, 0, 0, 0, 0, 0], + columns_width = [45, 45, 15, 10, 30, 25, 20, 20, has_rss ? 30 : 20], linebreaks = true, autowrap = true, crop = :none, ) + if writing_to_docs + println(io, "## $title") + println(io, "```@raw html") + println(io, "
") # 80% of viewport + end pretty_table( io, table_data; - title = "Comparison of UnrolledUtilities to Base and Base.Iterators", - title_alignment = :c, alignment = :l, header = [ "Unrolled Expression", "Reference Expression", - "Iterator Contents", - "Unrolled Performance", - "Reference Performance", - "Unrolled Compilation Time", - "Unrolled Compilation Memory", + "Itr Type", + "Itr Length", + "Itr Contents", + "Optimization", + "Run Time", + "Compilation Time", + "Total $(has_rss ? "GC [and RSS] " : "")Allocations", ], - highlighters = ( - better_performance_but_harder_to_compile, - better_performance, - mixed_compilation, - harder_to_compile, - easier_to_compile, - no_difference, - ), + highlighters = highlighter, other_kwargs..., ) + if writing_to_docs + println(io, "
") + println(io, "```") + else + println(io) + end +end + +function time_string(nanoseconds) + nanoseconds == 0 && return "$nanoseconds ns" + n_decimal_digits = floor(Int, log10(nanoseconds) + 1) + return if n_decimal_digits <= 3 + "$nanoseconds ns" + elseif n_decimal_digits <= 6 + "$(round(Int, nanoseconds / 10^3)) μs" + elseif n_decimal_digits <= 9 + "$(round(Int, nanoseconds / 10^6)) ms" + else + "$(round(Int, nanoseconds / 10^9)) s" + end +end + +function memory_string(bytes) + bytes == 0 && return "$bytes B" + n_binary_digits = floor(Int, log2(bytes) + 1) + return if n_binary_digits <= 10 + "$bytes B" + elseif n_binary_digits <= 20 + "$(round(Int, bytes / 2^10)) kB" + elseif n_binary_digits <= 30 + "$(round(Int, bytes / 2^20)) MB" + else + "$(round(Int, bytes / 2^30)) GB" + end +end + +function comparison_string(value1, value2, to_string, to_number = identity) + ratio = to_number(value1) / to_number(value2) + ratio_str = if ratio >= 2 + floored_ratio = ratio == Inf ? Inf : floor(Int, ratio) + "$floored_ratio times more" + elseif inv(ratio) >= 2 + floored_inv_ratio = ratio == 0 ? Inf : floor(Int, inv(ratio)) + "$floored_inv_ratio times less" + else + "similar" + end + return "$ratio_str ($(to_string(value1)) vs. $(to_string(value2)))" end function drop_line_numbers(expr) @@ -118,18 +200,59 @@ function code_instance(f, args...) end end -macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str) +macro benchmark(expression) + return quote + prev_time = time_ns() + $(esc(expression)) + new_time = time_ns() + best_time = new_time - prev_time + + # Benchmark for at most 0.1 s (10^8 ns), ignoring the first call above. 
+ n_trials = 0 + start_time = new_time + while n_trials < 10^4 && new_time - start_time < 10^8 + prev_time = time_ns() + $(esc(expression)) + new_time = time_ns() + best_time = min(best_time, new_time - prev_time) + n_trials += 1 + end + + best_time + end +end + +macro test_unrolled( + args_expr, + unrolled_expr, + reference_expr, + itr_contents_str, + skip_allocations_test = false, + skip_type_stability_test = false, +) @assert Meta.isexpr(args_expr, :tuple) arg_names = args_expr.args @assert all(arg_name -> arg_name isa Symbol, arg_names) args = map(esc, arg_names) unrolled_expr_str = simplified_expression_string(unrolled_expr) reference_expr_str = simplified_expression_string(reference_expr) - expr_info_str = - length(args) == 1 ? "$unrolled_expr_str with 1 iterator that contains" : - "$unrolled_expr_str with $(length(args)) iterators that each contain" + contains_str = length(args) == 1 ? " that contains" : "s that each contain" quote - @info "Testing $($expr_info_str) $($(esc(contents_info_str)))" + itr_types = map(arg -> typeof(arg).name.wrapper, ($(args...),)) + itr_lengths = map(length, ($(args...),)) + + itr_type_str = + length(unique(itr_types)) == 1 ? string(itr_types[1]) : + join(itr_types, '/') + itr_length_str = + length(unique(itr_lengths)) == 1 ? string(itr_lengths[1]) : + join(itr_lengths, '/') + itr_str = + $(isempty(args)) ? "nothing" : + "$($(length(args))) $itr_type_str$($contains_str) $itr_length_str \ + $($(esc(itr_contents_str)))" + + @info "Testing $($unrolled_expr_str) with $itr_str" unrolled_func($(arg_names...)) = $(esc(unrolled_expr)) reference_func($(arg_names...)) = $(esc(reference_expr)) @@ -146,26 +269,27 @@ macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str) reference_func_and_nothing($(args...)) # Test for allocations. 
- @test (@allocated unrolled_func_and_nothing($(args...))) == 0 - is_reference_non_allocating = - (@allocated reference_func_and_nothing($(args...))) == 0 + unrolled_run_memory = @allocated unrolled_func_and_nothing($(args...)) + reference_run_memory = @allocated reference_func_and_nothing($(args...)) + $(esc(skip_allocations_test)) || @test unrolled_run_memory == 0 # Test for type-stability. - @test_opt unrolled_func($(args...)) + is_unrolled_stable = + isempty(JET.get_reports(@report_opt unrolled_func($(args...)))) is_reference_stable = isempty(JET.get_reports(@report_opt reference_func($(args...)))) - - unrolled_instance = code_instance(unrolled_func, $(args...)) - reference_instance = code_instance(reference_func, $(args...)) + $(esc(skip_type_stability_test)) || @test_opt unrolled_func($(args...)) # Test for constant propagation. - is_unrolled_const = isdefined(unrolled_instance, :rettype_const) - Base.issingletontype(typeof(($(args...),))) && @test is_unrolled_const - is_reference_const = isdefined(reference_instance, :rettype_const) + is_unrolled_const = + isdefined(code_instance(unrolled_func, $(args...)), :rettype_const) + is_reference_const = + isdefined(code_instance(reference_func, $(args...)), :rettype_const) + # Base.issingletontype(typeof(($(args...),))) && @test is_unrolled_const buffer = IOBuffer() - # Check whether the functions are fully optimized out. + # Determine whether the functions are fully optimized out. args_type = Tuple{map(typeof, ($(args...),))...} code_llvm(buffer, unrolled_func, args_type; debuginfo = :none) is_unrolled_optimized_out = @@ -174,86 +298,115 @@ macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str) is_reference_optimized_out = length(split(String(take!(buffer)), '\n')) == 5 + # Test the overall level of optimization. 
+ unrolled_opt_str, unrolled_opt_score = if unrolled_run_memory > 0 + "$(memory_string(unrolled_run_memory)) allocs", 1 / unrolled_run_memory + elseif !is_unrolled_stable + "type-unstable", 2 + elseif !is_unrolled_const && !is_unrolled_optimized_out + "type-stable", 3 + elseif !is_unrolled_optimized_out + "constant", 4 + else + "optimized out", 5 + end + reference_opt_str, reference_opt_score = if reference_run_memory > 0 + "$(memory_string(reference_run_memory)) allocs", + 1 / reference_run_memory + elseif !is_reference_stable + "type-unstable", 2 + elseif !is_reference_const && !is_reference_optimized_out + "type-stable", 3 + elseif !is_reference_optimized_out + "constant", 4 + else + "optimized out", 5 + end + $(esc(skip_type_stability_test)) || + @test unrolled_opt_score >= reference_opt_score + + # Measure the run times. + unrolled_run_time = @benchmark unrolled_func($(args...)) + reference_run_time = @benchmark reference_func($(args...)) + + # Measure the compilation times and memory allocations in separate + # processes to ensure that they are not under-counted. 
arg_name_strs = ($(map(string, arg_names)...),) arg_names_str = join(arg_name_strs, ", ") arg_definition_strs = map((name, value) -> "$name = $value", arg_name_strs, ($(args...),)) arg_definitions_str = join(arg_definition_strs, '\n') - unrolled_command_str = """ + command_str(func_str) = """ using UnrolledUtilities - unrolled_func($arg_names_str) = $($(string(unrolled_expr))) - $arg_definitions_str - stats1 = @timed unrolled_func($arg_names_str) - stats2 = @timed unrolled_func($arg_names_str) - print(stats1.time - stats2.time, ',', stats1.bytes - stats2.bytes) - """ - reference_command_str = """ - reference_func($arg_names_str) = $($(string(reference_expr))) $arg_definitions_str - stats1 = @timed reference_func($arg_names_str) - stats2 = @timed reference_func($arg_names_str) - print(stats1.time - stats2.time, ',', stats1.bytes - stats2.bytes) + Base.cumulative_compile_timing(true) + nanoseconds1 = Base.cumulative_compile_time_ns()[1] + rss_bytes_1 = Sys.maxrss() + Δgc_bytes = @allocated $func_str + rss_bytes_2 = Sys.maxrss() + nanoseconds2 = Base.cumulative_compile_time_ns()[1] + Base.cumulative_compile_timing(false) + Δnanoseconds = nanoseconds2 - nanoseconds1 + Δrss_bytes = rss_bytes_2 - rss_bytes_1 + print(Δnanoseconds, ", ", Δgc_bytes, ", ", Δrss_bytes) """ - # Get the unrolled function's time-to-first-run and its memory usage. 
+ unrolled_command_str = command_str($(string(unrolled_expr))) run(pipeline(`julia --project -e $unrolled_command_str`, buffer)) - unrolled_time, unrolled_memory = - parse.((Float64, Int), split(String(take!(buffer)), ',')) + unrolled_compile_time, unrolled_total_memory, unrolled_total_rss = + parse.((Int, Int, Int), split(String(take!(buffer)), ',')) # Make a new buffer to avoid a potential data race: - # https://discourse.julialang.org/t/iobuffer-becomes-not-writable-after-run/92323/3 + # discourse.julialang.org/t/iobuffer-becomes-not-writable-after-run/92323/3 close(buffer) buffer = IOBuffer() - # Get the reference function's time-to-first-run and its memory usage. + reference_command_str = command_str($(string(reference_expr))) run(pipeline(`julia --project -e $reference_command_str`, buffer)) - reference_time, reference_memory = - parse.((Float64, Int), split(String(take!(buffer)), ',')) + reference_compile_time, reference_total_memory, reference_total_rss = + parse.((Int, Int, Int), split(String(take!(buffer)), ',')) close(buffer) - # Record all relevant information in comparison_table_dict. 
- unrolled_performance_str = if !is_unrolled_const - "type-stable" - elseif !is_unrolled_optimized_out - "const return value" - else - "fully optimized out" - end - reference_performance_str = if !is_reference_non_allocating - "allocating" - elseif !is_reference_stable - "type-unstable" - elseif !is_reference_const - "type-stable" - elseif !is_reference_optimized_out - "const return value" - else - "fully optimized out" - end - time_ratio = unrolled_time / reference_time - time_ratio_str = if time_ratio >= 1.5 - "$(round(Int, time_ratio)) times slower" - elseif inv(time_ratio) >= 1.5 - "$(round(Int, inv(time_ratio))) times faster" - else - "similar" - end - memory_ratio = unrolled_memory / reference_memory - memory_ratio_str = if memory_ratio >= 1.5 - "$(round(Int, memory_ratio)) times more" - elseif inv(memory_ratio) >= 1.5 - "$(round(Int, inv(memory_ratio))) times less" + optimization_str = if unrolled_opt_score > reference_opt_score + if unrolled_opt_score <= 1 + "fewer allocs ($unrolled_opt_str vs. $reference_opt_str)" + else + "better ($unrolled_opt_str vs. $reference_opt_str)" + end + elseif unrolled_opt_score < reference_opt_score + "worse ($unrolled_opt_str vs. $reference_opt_str)" else - "similar" + "identical ($unrolled_opt_str)" end + run_time_str = comparison_string( + unrolled_run_time, + reference_run_time, + time_string, + ) + compile_time_str = comparison_string( + unrolled_compile_time, + reference_compile_time, + time_string, + ) + memory_str = comparison_string( + (unrolled_total_memory, unrolled_total_rss), + (reference_total_memory, reference_total_rss), + ((gc_bytes, rss_bytes),) -> + rss_bytes == 0 ? memory_string(gc_bytes) : + "$(memory_string(gc_bytes)) [$(memory_string(rss_bytes))]", + first, # Use GC value for comparison since RSS might be unavailable. 
+ ) + dict_key = ($unrolled_expr_str, $reference_expr_str) dict_entry = ( - $(esc(contents_info_str)), - unrolled_performance_str, - reference_performance_str, - time_ratio_str, - memory_ratio_str, + itr_type_str, + itr_length_str, + $(esc(itr_contents_str)), + optimization_str, + run_time_str, + compile_time_str, + memory_str, ) if dict_key in keys(comparison_table_dict) push!(comparison_table_dict[dict_key], dict_entry) @@ -263,160 +416,219 @@ macro test_unrolled(args_expr, unrolled_expr, reference_expr, contents_info_str) end end -@testset "empty iterators" begin - itr = () - str = "nothing" - @test_unrolled (itr,) unrolled_any(error, itr) any(error, itr) str - @test_unrolled (itr,) unrolled_all(error, itr) all(error, itr) str - @test_unrolled (itr,) unrolled_foreach(error, itr) foreach(error, itr) str - @test_unrolled (itr,) unrolled_map(error, itr, itr) map(error, itr, itr) str - @test_unrolled( - (itr,), - unrolled_reduce(error, itr; init = 0), - reduce(error, itr; init = 0), - str, - ) +tuple_of_tuples(num_tuples, min_tuple_length, singleton, identical) = + ntuple(num_tuples) do index + tuple_length = min_tuple_length + (identical ? 0 : (index - 1) % 7) + ntuple(singleton ? Val : identity, tuple_length) + end +function tuples_of_tuples_contents_str(itrs...) + str = "" + all(itr -> length(itr) > 1 && length(unique(itr)) == 1, itrs) && + (str *= "identical ") + all(itr -> length(itr) > 1 && length(unique(itr)) != 1, itrs) && + (str *= "distinct ") + all(itr -> all(isempty, itr), itrs) && (str *= "empty ") + all(itr -> all(!isempty, itr), itrs) && (str *= "nonempty ") + all(itr -> any(isempty, itr) && any(!isempty, itr), itrs) && + (str *= "empty & nonempty ") + all(itr -> Base.issingletontype(typeof(itr)), itrs) && (str *= "singleton ") + all(itr -> !Base.issingletontype(typeof(itr)), itrs) && + (str *= "non-singleton ") + str *= "Tuple" + all(itr -> length(itr) > 1, itrs) && (str *= "s") + return str end -for n in (1, 8, 32, 33, 128), identical in (n == 1 ? 
(true,) : (true, false)) - itr1 = ntuple(i -> ntuple(Val, identical ? 0 : (i - 1) % 7), n) - itr2 = ntuple(i -> ntuple(Val, identical ? 1 : (i - 1) % 7 + 1), n) - itr3 = ntuple(i -> ntuple(identity, identical ? 1 : (i - 1) % 7 + 1), n) - if n == 1 - str1 = "1 empty tuple" - str2 = "1 nonempty singleton tuple" - str3 = "1 nonempty non-singleton tuple" - str12 = "1 singleton tuple" - str23 = "1 nonempty tuple" - str123 = "1 tuple" - elseif identical - str1 = "$n empty tuples" - str2 = "$n identical nonempty singleton tuples" - str3 = "$n identical nonempty non-singleton tuples" - str12 = "$n identical singleton tuples" - str23 = "$n identical nonempty tuples" - str123 = "$n identical tuples" - else - str1 = "$n empty and nonempty singleton tuples" - str2 = "$n nonempty singleton tuples" - str3 = "$n nonempty non-singleton tuples" - str12 = "$n singleton tuples" - str23 = "$n nonempty tuples" - str123 = "$n tuples" - end - @testset "iterators of $str123" begin - for (itr, str) in ((itr1, str1), (itr2, str2), (itr3, str3)) - @test_unrolled (itr,) unrolled_any(isempty, itr) any(isempty, itr) str - @test_unrolled (itr,) unrolled_any(!isempty, itr) any(!isempty, itr) str +# NOTE: In the tests below, random numbers are meant to emulate values that +# cannot be inferred during compilation. 
+ +title = "Isolated Unrolled Functions" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +for itr in ( + tuple_of_tuples(1, 0, true, true), + tuple_of_tuples(1, 1, true, true), + tuple_of_tuples(1, 1, false, true), + map(n -> tuple_of_tuples(n, 0, true, true), (8, 32, 33, 128))..., + map(n -> tuple_of_tuples(n, 1, true, true), (8, 32, 33, 128))..., + map(n -> tuple_of_tuples(n, 1, false, true), (8, 32, 33, 128))..., + map(n -> tuple_of_tuples(n, 0, true, false), (8, 32, 33, 128))..., + map(n -> tuple_of_tuples(n, 1, true, false), (8, 32, 33, 128))..., + map(n -> tuple_of_tuples(n, 1, false, false), (8, 32, 33, 128))..., +) + str = tuples_of_tuples_contents_str(itr) + itr_description = "a Tuple that contains $(length(itr)) $str" + @testset "individual unrolled functions of $itr_description" begin + @test_unrolled (itr,) unrolled_any(isempty, itr) any(isempty, itr) str + @test_unrolled( + (itr,), + unrolled_any(x -> length(x) == rand(8:10), itr), + any(x -> length(x) == rand(8:10), itr), + str, + ) - @test_unrolled (itr,) unrolled_all(isempty, itr) all(isempty, itr) str - @test_unrolled (itr,) unrolled_all(!isempty, itr) all(!isempty, itr) str + @test_unrolled (itr,) unrolled_all(isempty, itr) all(isempty, itr) str + @test_unrolled( + (itr,), + unrolled_all(x -> length(x) == rand(8:10), itr), + all(x -> length(x) == rand(8:10), itr), + str, + ) - @test_unrolled( - (itr,), - unrolled_foreach(x -> @assert(length(x) <= 7), itr), - foreach(x -> @assert(length(x) <= 7), itr), - str, - ) + @test_unrolled( + (itr,), + unrolled_foreach(x -> @assert(length(x) <= 7), itr), + foreach(x -> @assert(length(x) <= 7), itr), + str, + ) - @test_unrolled (itr,) unrolled_map(length, itr) map(length, itr) str + @test_unrolled (itr,) unrolled_map(length, itr) map(length, itr) str - @test_unrolled (itr,) unrolled_reduce(tuple, itr) reduce(tuple, itr) str - @test_unrolled( - (itr,), - unrolled_reduce(tuple, itr; init = ()), - reduce(tuple, itr; init = ()), - str, 
- ) + @test_unrolled( + (itr,), + unrolled_applyat(length, rand(1:7:length(itr)), itr), + length(itr[rand(1:7:length(itr))]), + str, + ) + + @test_unrolled (itr,) unrolled_reduce(tuple, itr) reduce(tuple, itr) str + @test_unrolled( + (itr,), + unrolled_reduce(tuple, itr; init = ()), + reduce(tuple, itr; init = ()), + str, + ) + + @test_unrolled( + (itr,), + unrolled_mapreduce(length, +, itr), + mapreduce(length, +, itr), + str, + ) + @test_unrolled( + (itr,), + unrolled_mapreduce(length, +, itr; init = 0), + mapreduce(length, +, itr; init = 0), + str, + ) + if length(itr) <= 33 @test_unrolled( (itr,), - unrolled_mapreduce(length, +, itr), - mapreduce(length, +, itr), + unrolled_accumulate(tuple, itr), + accumulate(tuple, itr), str, ) @test_unrolled( (itr,), - unrolled_mapreduce(length, +, itr; init = 0), - mapreduce(length, +, itr; init = 0), + unrolled_accumulate(tuple, itr; init = ()), + accumulate(tuple, itr; init = ()), str, ) + end # These can take half a minute to compile when the length is 128. - @test_unrolled (itr,) unrolled_zip(itr) Tuple(zip(itr)) str + @test_unrolled (itr,) unrolled_push(itr, itr[1]) (itr..., itr[1]) str + @test_unrolled (itr,) unrolled_append(itr, itr) (itr..., itr...) 
str - @test_unrolled (itr,) unrolled_enumerate(itr) Tuple(enumerate(itr)) str + @test_unrolled( + (itr,), + unrolled_take(itr, Val(length(itr) ÷ 2)), + itr[1:(length(itr) ÷ 2)], + str, + ) + @test_unrolled( + (itr,), + unrolled_drop(itr, Val(length(itr) ÷ 2)), + itr[(length(itr) ÷ 2 + 1):end], + str, + ) - @test_unrolled (itr,) unrolled_in(nothing, itr) (nothing in itr) str - @test_unrolled (itr,) unrolled_in(itr[1], itr) (itr[1] in itr) str - @test_unrolled (itr,) unrolled_in(itr[end], itr) (itr[end] in itr) str + @test_unrolled (itr,) unrolled_in(nothing, itr) (nothing in itr) str + @test_unrolled (itr,) unrolled_in(itr[1], itr) (itr[1] in itr) str + @test_unrolled (itr,) unrolled_in(itr[end], itr) (itr[end] in itr) str - # unrolled_unique is only type-stable for singletons - if Base.issingletontype(typeof(itr)) - @test_unrolled (itr,) unrolled_unique(itr) Tuple(unique(itr)) str - end + @test_unrolled( + (itr,), + unrolled_unique(itr), + Tuple(unique(itr)), + str, + !Base.issingletontype(typeof(itr)), + !Base.issingletontype(typeof(itr)), + ) # unrolled_unique is type-unstable for non-singleton values - @test_unrolled( - (itr,), - unrolled_filter(!isempty, itr), - filter(!isempty, itr), - str, - ) + @test_unrolled( + (itr,), + unrolled_filter(!isempty, itr), + filter(!isempty, itr), + str, + ) - @test_unrolled( - (itr,), - unrolled_split(isempty, itr), - (filter(isempty, itr), filter(!isempty, itr)), - str, - ) + @test_unrolled( + (itr,), + unrolled_split(isempty, itr), + (filter(isempty, itr), filter(!isempty, itr)), + str, + ) - @test_unrolled( - (itr,), - unrolled_flatten(itr), - Tuple(Iterators.flatten(itr)), - str, - ) + @test_unrolled( + (itr,), + unrolled_flatten(itr), + Tuple(Iterators.flatten(itr)), + str, + ) - @test_unrolled( - (itr,), - unrolled_flatmap(reverse, itr), - Tuple(Iterators.flatmap(reverse, itr)), - str, - ) + @test_unrolled( + (itr,), + unrolled_flatmap(reverse, itr), + Tuple(Iterators.flatmap(reverse, itr)), + str, + ) + if length(itr) 
<= 33 @test_unrolled( (itr,), - unrolled_product(itr), - Tuple(Iterators.product(itr)), + unrolled_product(itr, itr), + Tuple(Iterators.product(itr, itr)), str, ) - + end + if length(itr) <= 8 @test_unrolled( (itr,), - unrolled_applyat( - x -> @assert(length(x) <= 7), - rand(1:length(itr)), - itr, - ), - @assert(length(itr[rand(1:length(itr))]) <= 7), + unrolled_product(itr, itr, itr), + Tuple(Iterators.product(itr, itr, itr)), str, ) + end # This can take several minutes to compile when the length is 32. + end +end - if n > 1 - @test_unrolled( - (itr,), - unrolled_take(itr, Val(7)), - itr[1:7], - str, - ) - @test_unrolled( - (itr,), - unrolled_drop(itr, Val(7)), - itr[8:end], - str, - ) - end - end - +title = "Nested Unrolled Functions" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +for (itr1, itr2, itr3) in ( + ( + tuple_of_tuples(1, 0, true, true), + tuple_of_tuples(1, 1, true, true), + tuple_of_tuples(1, 1, false, true), + ), + zip( + map(n -> tuple_of_tuples(n, 0, true, true), (8, 32, 33, 128)), + map(n -> tuple_of_tuples(n, 1, true, true), (8, 32, 33, 128)), + map(n -> tuple_of_tuples(n, 1, false, true), (8, 32, 33, 128)), + )..., + zip( + map(n -> tuple_of_tuples(n, 0, true, false), (8, 32, 33, 128)), + map(n -> tuple_of_tuples(n, 1, true, false), (8, 32, 33, 128)), + map(n -> tuple_of_tuples(n, 1, false, false), (8, 32, 33, 128)), + )..., +) + str3 = tuples_of_tuples_contents_str(itr3) + str12 = tuples_of_tuples_contents_str(itr1, itr2) + str23 = tuples_of_tuples_contents_str(itr2, itr3) + str123 = tuples_of_tuples_contents_str(itr1, itr2, itr3) + itr_description = "Tuples that contain $(length(itr1)) $str123" + @testset "nested unrolled functions of $itr_description" begin @test_unrolled( (itr3,), unrolled_any(x -> unrolled_reduce(+, x) > 7, itr3), @@ -434,11 +646,11 @@ for n in (1, 8, 32, 33, 128), identical in (n == 1 ? 
(true,) : (true, false)) @test_unrolled( (itr1, itr2), unrolled_foreach( - (x1, x2) -> @assert(length(x1) < length(x2)), + (x1, x2) -> @assert(x1 == unrolled_take(x2, Val(length(x1)))), itr1, itr2, ), - foreach((x1, x2) -> @assert(length(x1) < length(x2)), itr1, itr2), + foreach((x1, x2) -> @assert(x1 == x2[1:length(x1)]), itr1, itr2), str12, ) @test_unrolled( @@ -455,13 +667,13 @@ for n in (1, 8, 32, 33, 128), identical in (n == 1 ? (true,) : (true, false)) @test_unrolled( (itr1, itr2), unrolled_applyat( - (x1, x2) -> @assert(length(x1) < length(x2)), + (x1, x2) -> @assert(x1 == unrolled_take(x2, Val(length(x1)))), rand(1:length(itr1)), itr1, itr2, ), let n = rand(1:length(itr1)) - @assert(length(itr1[n]) < length(itr2[n])) + @assert(itr1[n] == itr2[n][1:length(itr1[n])]) end, str12, ) @@ -478,53 +690,27 @@ for n in (1, 8, 32, 33, 128), identical in (n == 1 ? (true,) : (true, false)) end, str23, ) - - @test_unrolled( - (itr1, itr2), - unrolled_zip(itr1, itr2), - Tuple(zip(itr1, itr2)), - str12, - ) - @test_unrolled( - (itr1, itr2, itr3), - unrolled_zip(itr1, itr2, itr3), - Tuple(zip(itr1, itr2, itr3)), - str123, - ) - - # unrolled_product can take several minutes to compile when n is large - if n <= 33 - @test_unrolled( - (itr1, itr2), - unrolled_product(itr1, itr2), - Tuple(Iterators.product(itr1, itr2)), - str12, - ) - end - if n <= 8 - @test_unrolled( - (itr1, itr2, itr3), - unrolled_product(itr1, itr2, itr3), - Tuple(Iterators.product(itr1, itr2, itr3)), - str123, - ) - end end end nested_iterator(depth, n, inner_n) = depth == 1 ? 
ntuple(identity, n) : - ntuple(inner_n) do _ - nested_iterator(depth - 1, Int(n / inner_n), inner_n) - end + ntuple( + Returns(nested_iterator(depth - 1, Int(n / inner_n), inner_n)), + inner_n, + ) + +title = "Recursive Unrolled Functions" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) for n in (8, 32, 128) - @testset "iterators of $n values in nested tuples" begin + itr_description = "a Tuple that contains $n values in nested Tuples" + @testset "recursive unrolled functions of $itr_description" begin for depth in (2, 3, 4:2:(Int(log2(n)) + 1)...) itr = nested_iterator(depth, n, 2) - str = "$n values in nested tuples of depth $depth" + str = "$itr_description of depth $depth" # In the following definitions, use var"#self#" to avoid boxing: - # https://discourse.julialang.org/t/performant-recursive-anonymous-functions/90984/5 + # discourse.julialang.org/t/performant-recursive-anonymous-functions/90984/5 @test_unrolled( (itr,), map( @@ -561,3 +747,254 @@ for n in (8, 32, 128) end end end + +title = "Nested Unrolled Closures" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +@testset "nested unrolled closures of Tuples vs. 
StaticBitVectors" begin + for (itr, skip_allocations_test) in ( + (ntuple(Returns(true), 32), false), + (ntuple(Returns(true), 33), true), + (StaticBitVector{256}(true), false), + (StaticBitVector{257}(true), true), + ) + @test_unrolled( + (itr,), + unrolled_reduce( + (itr′, i) -> Base.setindex(itr′, !itr′[i], i), + StaticOneTo(length(itr)); + init = itr, + ), + reduce( + (itr′, i) -> Base.setindex(itr′, !itr′[i], i), + StaticOneTo(length(itr)); + init = itr, + ), + "Bools", + skip_allocations_test, + ) + @test_unrolled( + (itr,), + unrolled_reduce( + (itr′, i) -> unrolled_reduce( + (itr′′, j) -> + Base.setindex(itr′′, !itr′′[min(i, j)], j), + StaticOneTo(length(itr′)); + init = itr′, + ), + StaticOneTo(length(itr)); + init = itr, + ), + reduce( + (itr′, i) -> reduce( + (itr′′, j) -> + Base.setindex(itr′′, !itr′′[min(i, j)], j), + StaticOneTo(length(itr′)); + init = itr′, + ), + StaticOneTo(length(itr)); + init = itr, + ), + "Bools", + skip_allocations_test, + ) + if length(itr) <= 256 + @test_unrolled( + (itr,), + unrolled_reduce( + (itr′, i) -> unrolled_reduce( + (itr′′, j) -> unrolled_reduce( + (itr′′′, k) -> Base.setindex( + itr′′′, + !itr′′′[min(i, j, k)], + k, + ), + StaticOneTo(length(itr′′)); + init = itr′′, + ), + StaticOneTo(length(itr′)); + init = itr′, + ), + StaticOneTo(length(itr)); + init = itr, + ), + reduce( + (itr′, i) -> reduce( + (itr′′, j) -> reduce( + (itr′′′, k) -> Base.setindex( + itr′′′, + !itr′′′[min(i, j, k)], + k, + ), + StaticOneTo(length(itr′′)); + init = itr′′, + ), + StaticOneTo(length(itr′)); + init = itr′, + ), + StaticOneTo(length(itr)); + init = itr, + ), + "Bools", + skip_allocations_test, + ) + end # The StaticBitVector{257} allocates over 2 GB for this test. 
+ end +end + +title = "Empty Iterators" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +@testset "unrolled functions of an empty Tuple" begin + itr = () + str = "nothing" + @test_unrolled (itr,) unrolled_any(error, itr) any(error, itr) str + @test_unrolled (itr,) unrolled_all(error, itr) all(error, itr) str + @test_unrolled (itr,) unrolled_foreach(error, itr) foreach(error, itr) str + @test_unrolled (itr,) unrolled_map(error, itr) map(error, itr) str + @test_throws "init" unrolled_reduce(error, itr) + @test_unrolled( + (itr,), + unrolled_reduce(error, itr; init = 0), + reduce(error, itr; init = 0), + str, + ) + @test_unrolled( + (itr,), + unrolled_accumulate(error, itr), + accumulate(error, itr), + str, + ) + @test_unrolled( + (itr,), + unrolled_accumulate(error, itr; init = 0), + accumulate(error, itr; init = 0), + str, + ) +end + +title = "Very Long Iterators" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +@testset "unrolled functions of Tuples vs. StaticOneTos" begin + for itr in (ntuple(identity, 2000), StaticOneTo(2000), StaticOneTo(8186)) + @test_unrolled (itr,) unrolled_reduce(+, itr) reduce(+, itr) "Ints" + @test_unrolled( + (itr,), + unrolled_mapreduce(log, +, itr), + mapreduce(log, +, itr), + "Ints", + ) + end # These can each take 40 seconds to compile for ntuple(identity, 8186). + for itr in (ntuple(identity, 8187), StaticOneTo(8187)) + @test_throws "gc handles" unrolled_reduce(+, itr) + @test_throws "gc handles" unrolled_mapreduce(log, +, itr) + end + # TODO: Why does the compiler throw an error when generating functions that + # get unrolled into more than 8186 lines of LLVM code? + + for itr in (StaticOneTo(8186), StaticOneTo(8187)) + @test_unrolled( + (itr,), + unrolled_reduce(+, Val(length(itr))), + reduce(+, itr), + "Ints", + ) + end + @test_throws "gc handles" unrolled_reduce(+, Val(8188)) + # TODO: Why is the limit 8187 for the Val version of unrolled_reduce? 
+end + +title = "Generative vs. Recursive Unrolling" +comparison_table_dict = (comparison_table_dicts[title] = OrderedDict()) + +for itr in ( + tuple_of_tuples(1, 0, true, true), + tuple_of_tuples(1, 1, true, true), + tuple_of_tuples(1, 1, false, true), + map(n -> tuple_of_tuples(n, 0, true, true), (8, 16, 32, 33, 128, 256))..., + map(n -> tuple_of_tuples(n, 1, true, true), (8, 16, 32, 33, 128, 256))..., + map(n -> tuple_of_tuples(n, 1, false, true), (8, 16, 32, 33, 128, 256))..., + map(n -> tuple_of_tuples(n, 0, true, false), (8, 16, 32, 33, 128, 256))..., + map(n -> tuple_of_tuples(n, 1, true, false), (8, 16, 32, 33, 128, 256))..., + map(n -> tuple_of_tuples(n, 1, false, false), (8, 16, 32, 33, 128, 256))..., +) + str = tuples_of_tuples_contents_str(itr) + itr_description = "a Tuple that contains $(length(itr)) $str" + @testset "generative vs. recursive unrolling of $itr_description" begin + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_any(isempty, itr), + UnrolledUtilities.rec_unrolled_any(isempty, itr), + str, + ) + + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_all(isempty, itr), + UnrolledUtilities.rec_unrolled_all(isempty, itr), + str, + ) + + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_foreach( + x -> @assert(length(x) <= 7), + itr, + ), + UnrolledUtilities.rec_unrolled_foreach( + x -> @assert(length(x) <= 7), + itr, + ), + str, + ) + + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_map(length, itr), + UnrolledUtilities.rec_unrolled_map(length, itr), + str, + ) + + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_applyat( + length, + rand(1:7:length(itr)), + itr, + ), + UnrolledUtilities.rec_unrolled_applyat( + length, + rand(1:7:length(itr)), + itr, + ), + str, + ) + + if length(itr) <= 33 + @test_unrolled( + (itr,), + UnrolledUtilities.gen_unrolled_reduce(tuple, itr, ()), + UnrolledUtilities.rec_unrolled_reduce(tuple, itr, ()), + str, + ) + + @test_unrolled( + (itr,), + 
UnrolledUtilities.gen_unrolled_accumulate( + tuple, + itr, + (), + identity, + ), + UnrolledUtilities.rec_unrolled_accumulate( + tuple, + itr, + (), + identity, + ), + str, + ) + end # These can take over a minute to compile when the length is 128. + end +end