From 5393a4468019c5a432ac6bbe06a1cfc032857096 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Tue, 12 Nov 2024 09:55:42 +0000
Subject: [PATCH 01/25] Extra comments

---
 .../src/ssa/ir/function_inserter.rs           |   1 +
 .../noirc_evaluator/src/ssa/ir/post_order.rs  |   2 +-
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 147 ++++++++++++++++--
 .../noirc_evaluator/src/ssa/ssa_gen/mod.rs    |   8 +-
 4 files changed, 142 insertions(+), 16 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/ir/function_inserter.rs b/compiler/noirc_evaluator/src/ssa/ir/function_inserter.rs
index 991ff22c902..4ff0921984d 100644
--- a/compiler/noirc_evaluator/src/ssa/ir/function_inserter.rs
+++ b/compiler/noirc_evaluator/src/ssa/ir/function_inserter.rs
@@ -80,6 +80,7 @@ impl<'f> FunctionInserter<'f> {
         }
     }
 
+    /// Get an instruction and make sure all the values in it are freshly resolved.
     pub(crate) fn map_instruction(&mut self, id: InstructionId) -> (Instruction, CallStack) {
         (
             self.function.dfg[id].clone().map_values(|id| self.resolve(id)),
diff --git a/compiler/noirc_evaluator/src/ssa/ir/post_order.rs b/compiler/noirc_evaluator/src/ssa/ir/post_order.rs
index 94ff96ba1d7..398ce887b96 100644
--- a/compiler/noirc_evaluator/src/ssa/ir/post_order.rs
+++ b/compiler/noirc_evaluator/src/ssa/ir/post_order.rs
@@ -22,7 +22,7 @@ impl PostOrder {
 }
 
 impl PostOrder {
-    /// Allocate and compute a function's block post-order. Pos
+    /// Allocate and compute a function's block post-order.
     pub(crate) fn with_function(func: &Function) -> Self {
         PostOrder(Self::compute_post_order(func))
     }
diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 661109c1786..8c01327843f 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -7,11 +7,11 @@
 //!    b. If we have previously modified any of the blocks in the loop,
 //!       restart from step 1 to refresh the context.
 //!    c. If not, try to unroll the loop. If successful, remember the modified
-//!       blocks. If unsuccessfully either error if the abort_on_error flag is set,
+//!       blocks. If unsuccessful either error if the abort_on_error flag is set,
 //!       or otherwise remember that the loop failed to unroll and leave it unmodified.
 //!
 //! Note that this pass also often creates superfluous jmp instructions in the
-//! program that will need to be removed by a later simplify cfg pass.
+//! program that will need to be removed by a later simplify CFG pass.
 //! Note also that unrolling is skipped for Brillig runtime and as a result
 //! we remove reference count instructions because they are only used by Brillig bytecode
 use std::collections::HashSet;
@@ -116,6 +116,32 @@ struct Loops {
 
 /// Find a loop in the program by finding a node that dominates any predecessor node.
 /// The edge where this happens will be the back-edge of the loop.
+///
+/// For example consider the following SSA of a basic loop:
+/// ```text
+/// main():
+///   v0 = ... start ...
+///   v1 = ... end ...
+///   jmp loop_entry(v0)
+/// loop_entry(i: Field):
+///   v2 = lt i v1
+///   jmpif v2, then: loop_body, else: loop_end
+/// loop_body():
+///   v3 = ... body ...
+///   v4 = add 1, i
+///   jmp loop_entry(v4)
+/// loop_end():
+/// ```
+///
+/// The CFG will look something like this:
+/// ```text
+/// main
+///   ↓
+/// loop_entry ←---↰
+///   ↓        ↘   |
+/// loop_end    loop_body     
+/// ```
+/// `loop_entry` has two predecessors: `main` and `loop_body`, and it dominates `loop_body`.
 fn find_all_loops(function: &Function) -> Loops {
     let cfg = ControlFlowGraph::with_function(function);
     let post_order = PostOrder::with_function(function);
@@ -127,6 +153,7 @@ fn find_all_loops(function: &Function) -> Loops {
         // These reachable checks wouldn't be needed if we only iterated over reachable blocks
         if dom_tree.is_reachable(block) {
             for predecessor in cfg.predecessors(block) {
+                // In the above example, we're looking for when `block` is `loop_entry` and `predecessor` is `loop_body`.
                 if dom_tree.is_reachable(predecessor) && dom_tree.dominates(block, predecessor) {
                     // predecessor -> block is the back-edge of a loop
                     loops.push(find_blocks_in_loop(block, predecessor, &cfg));
@@ -137,7 +164,7 @@ fn find_all_loops(function: &Function) -> Loops {
 
     // Sort loops by block size so that we unroll the larger, outer loops of nested loops first.
     // This is needed because inner loops may use the induction variable from their outer loops in
-    // their loop range.
+    // their loop range. We will start popping loops from the back.
     loops.sort_by_key(|loop_| loop_.blocks.len());
 
     Loops {
@@ -212,7 +239,63 @@ fn find_blocks_in_loop(
 }
 
 /// Unroll a single loop in the function.
-/// Returns Err(()) if it failed to unroll and Ok(()) otherwise.
+/// Returns Ok(()) if it succeeded, Err(callstack) if it failed,
+/// where the callstack indicates the location of the instruction
+/// that could not be processed, or empty if such information was
+/// not available.
+///
+/// Consider this example:
+/// ```text
+/// main():
+///   v0 = 0
+///   v1 = 3
+///   jmp loop_entry(v0)
+/// loop_entry(i: Field):
+///   v2 = lt i v1
+///   jmpif v2, then: loop_body, else: loop_end
+/// ```
+///
+/// The first step is to unroll the header by recognizing that jump condition
+/// is a constant, which means it will go to `loop_body`:
+/// ```text
+/// main():
+///   v0 = 0
+///   v1 = 2
+///   v2 = lt v0 v1
+///   // jmpif v2, then: loop_body, else: loop_end
+///   jmp dest: loop_body
+/// ```
+///
+/// Following that we unroll the loop body, which is the next source, replace
+/// the induction variable with the new value created in the body, and have
+/// another go at the header.
+/// ```text
+/// main():
+///   v0 = 0
+///   v1 = 2
+///   v2 = lt v0 v1
+///   v3 = ... body ...
+///   v4 = add 1, 0
+///   jmp loop_entry(v4)
+/// ```
+///
+/// At the end we reach a point where the condition evaluates to 0 and we jump to the end.
+/// ```text
+/// main():
+///   v0 = 0
+///   v1 = 2
+///   v2 = lt 0
+///   v3 = ... body ...
+///   v4 = add 1, v0
+///   v5 = lt v4 v1
+///   v6 = ... body ...
+///   v7 = add v4, 1
+///   v8 = lt v5 v1
+///   jmp loop_end
+/// ```
+///
+/// When e.g. `v8 = lt v5 v1` cannot be evaluated to a constant, the loop signals by returning `Err`
+/// that a few SSA passes are required to evaluate and simplify these values.
 fn unroll_loop(
     function: &mut Function,
     cfg: &ControlFlowGraph,
@@ -232,7 +315,7 @@ fn unroll_loop(
 
 /// The loop pre-header is the block that comes before the loop begins. Generally a header block
 /// is expected to have 2 predecessors: the pre-header and the final block of the loop which jumps
-/// back to the beginning.
+/// back to the beginning. Other predecessors can come from `break` or `continue`.
 fn get_pre_header(cfg: &ControlFlowGraph, loop_: &Loop) -> BasicBlockId {
     let mut pre_header = cfg
         .predecessors(loop_.header)
@@ -247,6 +330,17 @@ fn get_pre_header(cfg: &ControlFlowGraph, loop_: &Loop) -> BasicBlockId {
 ///
 /// Expects the current block to terminate in `jmp h(N)` where h is the loop header and N is
 /// a Field value.
+///
+/// Consider the following example:
+/// ```text
+/// main():
+///   v0 = ... start ...
+///   v1 = ... end ...
+///   jmp loop_entry(v0)
+/// loop_entry(i: Field):
+///   ...
+/// ```
+/// We're looking for the terminating jump of the `main` predecessor of `loop_entry`.
 fn get_induction_variable(function: &Function, block: BasicBlockId) -> Result<ValueId, CallStack> {
     match function.dfg[block].terminator() {
         Some(TerminatorInstruction::Jmp { arguments, call_stack: location, .. }) => {
@@ -288,8 +382,9 @@ fn unroll_loop_header<'a>(
     // Insert the current value of the loop induction variable into our context.
     let first_param = source_block.parameters()[0];
     context.inserter.try_map_value(first_param, induction_value);
+    // Copy over all instructions and a fresh terminator.
     context.inline_instructions_from_block();
-
+    // Mutate the terminator if possible so that it points at the iteration block.
     match context.dfg()[fresh_block].unwrap_terminator() {
         TerminatorInstruction::JmpIf { condition, then_destination, else_destination, call_stack } => {
             let condition = *condition;
@@ -304,6 +399,11 @@ fn unroll_loop_header<'a>(
                 // unroll_into block from now on.
                 context.insert_block = unroll_into;
 
+                // In the last iteration, `handle_jmpif` will have replaced `context.source_block`
+                // with the `else_destination`, that is, the `loop_end`, which signals that we 
+                // have no more loops to unroll, because that block was not part of the loop itself, 
+                // ie. it wasn't between `loop_header` and `loop_body`. Otherwise we have the `loop_body`
+                // in `source_block` and can unroll that into the destination.
                 Ok(loop_.blocks.contains(&context.source_block).then_some(context))
             } else {
                 // If this case is reached the loop either uses non-constant indices or we need
@@ -376,7 +476,8 @@ impl<'f> LoopIteration<'f> {
                 next_blocks.append(&mut blocks);
             }
         }
-
+        // After having unrolled all blocks in the loop body, we must know how to get back to the header;
+        // this is also the block into which we have to unroll into next.
         self.induction_value
             .expect("Expected to find the induction variable by end of loop iteration")
     }
@@ -384,6 +485,9 @@ impl<'f> LoopIteration<'f> {
     /// Unroll a single block in the current iteration of the loop
     fn unroll_loop_block(&mut self) -> Vec<BasicBlockId> {
         let mut next_blocks = self.unroll_loop_block_helper();
+        // If the loop body ends with a JmpIf then eliminate either destinations
+        // which are not one of the block we have seen as a destination of a terminator
+        // of a block we have already inlined.
         next_blocks.retain(|block| {
             let b = self.get_original_block(*block);
             self.loop_.blocks.contains(&b)
@@ -393,6 +497,7 @@ impl<'f> LoopIteration<'f> {
 
     /// Unroll a single block in the current iteration of the loop
     fn unroll_loop_block_helper(&mut self) -> Vec<BasicBlockId> {
+        // Copy instructions from the loop body to the unroll destination, replacing the terminator.
         self.inline_instructions_from_block();
         self.visited_blocks.insert(self.source_block);
 
@@ -402,14 +507,19 @@ impl<'f> LoopIteration<'f> {
                 then_destination,
                 else_destination,
                 call_stack,
-            } => self.handle_jmpif(
-                *condition,
-                *then_destination,
-                *else_destination,
-                call_stack.clone(),
-            ),
+            } =>
+            // Check where the loop body wants to jump (e.g if-then blocks).
+            {
+                self.handle_jmpif(
+                    *condition,
+                    *then_destination,
+                    *else_destination,
+                    call_stack.clone(),
+                )
+            }
             TerminatorInstruction::Jmp { destination, arguments, call_stack: _ } => {
                 if self.get_original_block(*destination) == self.loop_.header {
+                    // We found the back-edge of the loop.
                     assert_eq!(arguments.len(), 1);
                     self.induction_value = Some((self.insert_block, arguments[0]));
                 }
@@ -421,7 +531,10 @@ impl<'f> LoopIteration<'f> {
 
     /// Find the next branch(es) to take from a jmpif terminator and return them.
     /// If only one block is returned, it means the jmpif condition evaluated to a known
-    /// constant and we can safely take only the given branch.
+    /// constant and we can safely take only the given branch. In this case the method
+    /// also replaces the terminator of the insert block (a.k.a fresh block) to be a JMP,
+    /// and changes the source block in the context for the next iteration to be the
+    /// destination indicated by the constant condition (ie. the `then` or the `else`).
     fn handle_jmpif(
         &mut self,
         condition: ValueId,
@@ -467,10 +580,13 @@ impl<'f> LoopIteration<'f> {
         }
     }
 
+    /// Find the original ID of a block that replaced it.
     fn get_original_block(&self, block: BasicBlockId) -> BasicBlockId {
         self.original_blocks.get(&block).copied().unwrap_or(block)
     }
 
+    /// Copy over instructions from the source into the insert block,  
+    /// while simplifying instructions and keeping track of original block IDs.
     fn inline_instructions_from_block(&mut self) {
         let source_block = &self.dfg()[self.source_block];
         let instructions = source_block.instructions().to_vec();
@@ -479,6 +595,7 @@ impl<'f> LoopIteration<'f> {
         // instances of the induction variable or any values that were changed as a result
         // of the new induction variable value.
         for instruction in instructions {
+            // TODO (#6470): Now Brillig will be inlined as well.
             // Skip reference count instructions since they are only used for brillig, and brillig code is not unrolled
             if !matches!(
                 self.dfg()[instruction],
@@ -492,6 +609,8 @@ impl<'f> LoopIteration<'f> {
             .clone()
             .map_values(|value| self.inserter.resolve(value));
 
+        // Replace the blocks in the terminator with fresh one with the same parameters,
+        // while remembering which were the original block IDs.
         terminator.mutate_blocks(|block| self.get_or_insert_block(block));
         self.inserter.function.dfg.set_block_terminator(self.insert_block, terminator);
     }
diff --git a/compiler/noirc_evaluator/src/ssa/ssa_gen/mod.rs b/compiler/noirc_evaluator/src/ssa/ssa_gen/mod.rs
index 8bf3a740b3a..5c95958a8d0 100644
--- a/compiler/noirc_evaluator/src/ssa/ssa_gen/mod.rs
+++ b/compiler/noirc_evaluator/src/ssa/ssa_gen/mod.rs
@@ -466,6 +466,7 @@ impl<'a> FunctionContext<'a> {
     ///
     /// For example, the loop `for i in start .. end { body }` is codegen'd as:
     ///
+    /// ```text
     ///   v0 = ... codegen start ...
     ///   v1 = ... codegen end ...
     ///   br loop_entry(v0)
@@ -478,6 +479,7 @@ impl<'a> FunctionContext<'a> {
     ///   br loop_entry(v4)
     /// loop_end():
     ///   ... This is the current insert point after codegen_for finishes ...
+    /// ```
     fn codegen_for(&mut self, for_expr: &ast::For) -> Result<Values, RuntimeError> {
         let loop_entry = self.builder.insert_block();
         let loop_body = self.builder.insert_block();
@@ -529,6 +531,7 @@ impl<'a> FunctionContext<'a> {
     ///
     /// For example, the expression `if cond { a } else { b }` is codegen'd as:
     ///
+    /// ```text
     ///   v0 = ... codegen cond ...
     ///   brif v0, then: then_block, else: else_block
     /// then_block():
@@ -539,16 +542,19 @@ impl<'a> FunctionContext<'a> {
     ///   br end_if(v2)
     /// end_if(v3: ?):  // Type of v3 matches the type of a and b
     ///   ... This is the current insert point after codegen_if finishes ...
+    /// ```
     ///
     /// As another example, the expression `if cond { a }` is codegen'd as:
     ///
+    /// ```text
     ///   v0 = ... codegen cond ...
-    ///   brif v0, then: then_block, else: end_block
+    ///   brif v0, then: then_block, else: end_if
     /// then_block:
     ///   v1 = ... codegen a ...
     ///   br end_if()
     /// end_if:  // No block parameter is needed. Without an else, the unit value is always returned.
     ///   ... This is the current insert point after codegen_if finishes ...
+    /// ```
     fn codegen_if(&mut self, if_expr: &ast::If) -> Result<Values, RuntimeError> {
         let condition = self.codegen_non_tuple_expression(&if_expr.condition)?;
 

From 43f1f254ff7d69bb8947ebff82c4183c9e85ab3d Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Tue, 12 Nov 2024 15:53:23 +0000
Subject: [PATCH 02/25] Use SSA parser in unrolling tests

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 222 ++++++------------
 1 file changed, 73 insertions(+), 149 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 8c01327843f..bf9539f626a 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -626,10 +626,7 @@ impl<'f> LoopIteration<'f> {
 
 #[cfg(test)]
 mod tests {
-    use crate::ssa::{
-        function_builder::FunctionBuilder,
-        ir::{instruction::BinaryOp, map::Id, types::Type},
-    };
+    use crate::ssa::{opt::assert_normalized_ssa_equals, Ssa};
 
     #[test]
     fn unroll_nested_loops() {
@@ -640,162 +637,89 @@ mod tests {
         //         }
         //     }
         // }
-        //
-        // fn main f0 {
-        //   b0():
-        //     jmp b1(Field 0)
-        //   b1(v0: Field):  // header of outer loop
-        //     v1 = lt v0, Field 3
-        //     jmpif v1, then: b2, else: b3
-        //   b2():
-        //     jmp b4(Field 0)
-        //   b4(v2: Field):  // header of inner loop
-        //     v3 = lt v2, Field 4
-        //     jmpif v3, then: b5, else: b6
-        //   b5():
-        //     v4 = add v0, v2
-        //     v5 = lt Field 10, v4
-        //     constrain v5
-        //     v6 = add v2, Field 1
-        //     jmp b4(v6)
-        //   b6(): // end of inner loop
-        //     v7 = add v0, Field 1
-        //     jmp b1(v7)
-        //   b3(): // end of outer loop
-        //     return Field 0
-        // }
-        let main_id = Id::test_new(0);
-
-        // Compiling main
-        let mut builder = FunctionBuilder::new("main".into(), main_id);
-
-        let b1 = builder.insert_block();
-        let b2 = builder.insert_block();
-        let b3 = builder.insert_block();
-        let b4 = builder.insert_block();
-        let b5 = builder.insert_block();
-        let b6 = builder.insert_block();
-
-        let v0 = builder.add_block_parameter(b1, Type::field());
-        let v2 = builder.add_block_parameter(b4, Type::field());
-
-        let zero = builder.field_constant(0u128);
-        let one = builder.field_constant(1u128);
-        let three = builder.field_constant(3u128);
-        let four = builder.field_constant(4u128);
-        let ten = builder.field_constant(10u128);
-
-        builder.terminate_with_jmp(b1, vec![zero]);
-
-        // b1
-        builder.switch_to_block(b1);
-        let v1 = builder.insert_binary(v0, BinaryOp::Lt, three);
-        builder.terminate_with_jmpif(v1, b2, b3);
-
-        // b2
-        builder.switch_to_block(b2);
-        builder.terminate_with_jmp(b4, vec![zero]);
-
-        // b3
-        builder.switch_to_block(b3);
-        builder.terminate_with_return(vec![zero]);
-
-        // b4
-        builder.switch_to_block(b4);
-        let v3 = builder.insert_binary(v2, BinaryOp::Lt, four);
-        builder.terminate_with_jmpif(v3, b5, b6);
-
-        // b5
-        builder.switch_to_block(b5);
-        let v4 = builder.insert_binary(v0, BinaryOp::Add, v2);
-        let v5 = builder.insert_binary(ten, BinaryOp::Lt, v4);
-        builder.insert_constrain(v5, one, None);
-        let v6 = builder.insert_binary(v2, BinaryOp::Add, one);
-        builder.terminate_with_jmp(b4, vec![v6]);
-
-        // b6
-        builder.switch_to_block(b6);
-        let v7 = builder.insert_binary(v0, BinaryOp::Add, one);
-        builder.terminate_with_jmp(b1, vec![v7]);
-
-        let ssa = builder.finish();
-        assert_eq!(ssa.main().reachable_blocks().len(), 7);
-
-        // Expected output:
-        //
-        // fn main f0 {
-        //   b0():
-        //     constrain Field 0
-        //     constrain Field 0
-        //     constrain Field 0
-        //     constrain Field 0
-        //     jmp b23()
-        //   b23():
-        //     constrain Field 0
-        //     constrain Field 0
-        //     constrain Field 0
-        //     constrain Field 0
-        //     jmp b27()
-        //   b27():
-        //     constrain Field 0
-        //     constrain Field 0
-        //     constrain Field 0
-        //     constrain Field 0
-        //     jmp b31()
-        //   b31():
-        //     jmp b3()
-        //   b3():
-        //     return Field 0
-        // }
+        let src = "
+            acir(inline) fn main f0 {
+                b0():
+                    jmp b1(Field 0)
+                b1(v0: Field):  // header of outer loop
+                    v1 = lt v0, Field 3
+                    jmpif v1 then: b2, else: b3
+                b2():
+                    jmp b4(Field 0)
+                b4(v2: Field):  // header of inner loop
+                    v3 = lt v2, Field 4
+                    jmpif v3 then: b5, else: b6
+                b5():
+                    v4 = add v0, v2
+                    v5 = lt Field 10, v4
+                    constrain v5 == Field 1
+                    v6 = add v2, Field 1
+                    jmp b4(v6)
+                b6(): // end of inner loop
+                    v7 = add v0, Field 1
+                    jmp b1(v7)
+                b3(): // end of outer loop
+                    return Field 0
+            }
+        ";
+        let ssa = Ssa::from_str(src).unwrap();
+
+        let expected = "
+            acir(inline) fn main f0 {
+              b0():
+                constrain u1 0 == Field 1
+                constrain u1 0 == Field 1
+                constrain u1 0 == Field 1
+                constrain u1 0 == Field 1
+                jmp b1()
+              b1():
+                constrain u1 0 == Field 1
+                constrain u1 0 == Field 1
+                constrain u1 0 == Field 1
+                constrain u1 0 == Field 1
+                jmp b2()
+              b2():
+                constrain u1 0 == Field 1
+                constrain u1 0 == Field 1
+                constrain u1 0 == Field 1
+                constrain u1 0 == Field 1
+                jmp b3()
+              b3():
+                jmp b4()
+              b4():
+                return Field 0
+            }
+        ";
+
         // The final block count is not 1 because unrolling creates some unnecessary jmps.
         // If a simplify cfg pass is ran afterward, the expected block count will be 1.
         let (ssa, errors) = ssa.try_to_unroll_loops();
         assert_eq!(errors.len(), 0, "All loops should be unrolled");
         assert_eq!(ssa.main().reachable_blocks().len(), 5);
+
+        assert_normalized_ssa_equals(ssa, expected);
     }
 
     // Test that the pass can still be run on loops which fail to unroll properly
     #[test]
     fn fail_to_unroll_loop() {
-        // fn main f0 {
-        //   b0(v0: Field):
-        //     jmp b1(v0)
-        //   b1(v1: Field):
-        //     v2 = lt v1, 5
-        //     jmpif v2, then: b2, else: b3
-        //   b2():
-        //     v3 = add v1, Field 1
-        //     jmp b1(v3)
-        //   b3():
-        //     return Field 0
-        // }
-        let main_id = Id::test_new(0);
-        let mut builder = FunctionBuilder::new("main".into(), main_id);
-
-        let b1 = builder.insert_block();
-        let b2 = builder.insert_block();
-        let b3 = builder.insert_block();
-
-        let v0 = builder.add_parameter(Type::field());
-        let v1 = builder.add_block_parameter(b1, Type::field());
-
-        builder.terminate_with_jmp(b1, vec![v0]);
-
-        builder.switch_to_block(b1);
-        let five = builder.field_constant(5u128);
-        let v2 = builder.insert_binary(v1, BinaryOp::Lt, five);
-        builder.terminate_with_jmpif(v2, b2, b3);
-
-        builder.switch_to_block(b2);
-        let one = builder.field_constant(1u128);
-        let v3 = builder.insert_binary(v1, BinaryOp::Add, one);
-        builder.terminate_with_jmp(b1, vec![v3]);
-
-        builder.switch_to_block(b3);
-        let zero = builder.field_constant(0u128);
-        builder.terminate_with_return(vec![zero]);
+        let src = "
+        acir(inline) fn main f0 {
+          b0(v0: Field):
+            jmp b1(v0)
+          b1(v1: Field):
+            v2 = lt v1, Field 5
+            jmpif v2 then: b2, else: b3
+          b2():
+            v3 = add v1, Field 1
+            jmp b1(v3)
+          b3():
+            return Field 0
+        }
+        ";
+        let ssa = Ssa::from_str(src).unwrap();
 
-        let ssa = builder.finish();
+        // Sanity check
         assert_eq!(ssa.main().reachable_blocks().len(), 4);
 
         // Expected that we failed to unroll the loop

From 9dbc2f8594057d0d8133e42428dadbbe36de98e0 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Tue, 12 Nov 2024 21:52:11 +0000
Subject: [PATCH 03/25] Move functions under Loops and Loop where there is an
 obvious self

---
 .../noirc_evaluator/src/ssa/ir/function.rs    |   8 +
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 514 +++++++++---------
 2 files changed, 272 insertions(+), 250 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/ir/function.rs b/compiler/noirc_evaluator/src/ssa/ir/function.rs
index e8245ff6036..b1233e3063e 100644
--- a/compiler/noirc_evaluator/src/ssa/ir/function.rs
+++ b/compiler/noirc_evaluator/src/ssa/ir/function.rs
@@ -46,6 +46,14 @@ impl RuntimeType {
                 | RuntimeType::Brillig(InlineType::NoPredicates)
         )
     }
+
+    pub(crate) fn is_brillig(&self) -> bool {
+        matches!(self, RuntimeType::Brillig(_))
+    }
+
+    pub(crate) fn is_acir(&self) -> bool {
+        matches!(self, RuntimeType::Acir(_))
+    }
 }
 
 /// A function holds a list of instructions.
diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index bf9539f626a..71408e21981 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -12,8 +12,12 @@
 //!
 //! Note that this pass also often creates superfluous jmp instructions in the
 //! program that will need to be removed by a later simplify CFG pass.
-//! Note also that unrolling is skipped for Brillig runtime and as a result
-//! we remove reference count instructions because they are only used by Brillig bytecode
+//!
+//! Note also that unrolling is skipped for Brillig runtime, unless the loops are deemed
+//! sufficiently small that inlining can be done without increasing the bytecode.
+//!
+//! When unrolling ACIR code, we remove reference count instructions because they are
+//! only used by Brillig bytecode.
 use std::collections::HashSet;
 
 use acvm::acir::AcirField;
@@ -26,9 +30,9 @@ use crate::{
             cfg::ControlFlowGraph,
             dfg::{CallStack, DataFlowGraph},
             dom::DominatorTree,
-            function::{Function, RuntimeType},
+            function::Function,
             function_inserter::FunctionInserter,
-            instruction::{Instruction, TerminatorInstruction},
+            instruction::{Instruction, InstructionId, TerminatorInstruction},
             post_order::PostOrder,
             value::ValueId,
         },
@@ -43,7 +47,7 @@ impl Ssa {
     pub(crate) fn unroll_loops_iteratively(mut ssa: Ssa) -> Result<Ssa, RuntimeError> {
         // Try to unroll loops first:
         let mut unroll_errors;
-        (ssa, unroll_errors) = ssa.try_to_unroll_loops();
+        (ssa, unroll_errors) = ssa.try_unroll_loops();
 
         // Keep unrolling until no more errors are found
         while !unroll_errors.is_empty() {
@@ -58,7 +62,7 @@ impl Ssa {
             ssa = ssa.mem2reg();
 
             // Unroll again
-            (ssa, unroll_errors) = ssa.try_to_unroll_loops();
+            (ssa, unroll_errors) = ssa.try_unroll_loops();
             // If we didn't manage to unroll any more loops, exit
             if unroll_errors.len() >= prev_unroll_err_count {
                 return Err(unroll_errors.swap_remove(0));
@@ -71,23 +75,22 @@ impl Ssa {
     /// If any loop cannot be unrolled, it is left as-is or in a partially unrolled state.
     /// Returns the ssa along with all unrolling errors encountered
     #[tracing::instrument(level = "trace", skip(self))]
-    pub(crate) fn try_to_unroll_loops(mut self) -> (Ssa, Vec<RuntimeError>) {
+    fn try_unroll_loops(mut self) -> (Ssa, Vec<RuntimeError>) {
         let mut errors = vec![];
         for function in self.functions.values_mut() {
-            function.try_to_unroll_loops(&mut errors);
+            errors.extend(function.try_unroll_loops());
         }
         (self, errors)
     }
 }
 
 impl Function {
-    pub(crate) fn try_to_unroll_loops(&mut self, errors: &mut Vec<RuntimeError>) {
-        // Loop unrolling in brillig can lead to a code explosion currently. This can
-        // also be true for ACIR, but we have no alternative to unrolling in ACIR.
-        // Brillig also generally prefers smaller code rather than faster code.
-        if !matches!(self.runtime(), RuntimeType::Brillig(_)) {
-            errors.extend(find_all_loops(self).unroll_each_loop(self));
-        }
+    fn try_unroll_loops(&mut self) -> Vec<RuntimeError> {
+        // Loop unrolling in brillig can lead to a code explosion currently.
+        // This can also be true for ACIR, but we have no alternative to unrolling in ACIR.
+        // Brillig also generally prefers smaller code rather than faster code,
+        // so we only attempt to unroll small loops, which we decide on a case-by-case basis.
+        Loops::find_all(self).unroll_each(self)
     }
 }
 
@@ -101,7 +104,7 @@ struct Loop {
     back_edge_start: BasicBlockId,
 
     /// All the blocks contained within the loop, including `header` and `back_edge_start`.
-    pub(crate) blocks: HashSet<BasicBlockId>,
+    blocks: HashSet<BasicBlockId>,
 }
 
 struct Loops {
@@ -114,87 +117,89 @@ struct Loops {
     cfg: ControlFlowGraph,
 }
 
-/// Find a loop in the program by finding a node that dominates any predecessor node.
-/// The edge where this happens will be the back-edge of the loop.
-///
-/// For example consider the following SSA of a basic loop:
-/// ```text
-/// main():
-///   v0 = ... start ...
-///   v1 = ... end ...
-///   jmp loop_entry(v0)
-/// loop_entry(i: Field):
-///   v2 = lt i v1
-///   jmpif v2, then: loop_body, else: loop_end
-/// loop_body():
-///   v3 = ... body ...
-///   v4 = add 1, i
-///   jmp loop_entry(v4)
-/// loop_end():
-/// ```
-///
-/// The CFG will look something like this:
-/// ```text
-/// main
-///   ↓
-/// loop_entry ←---↰
-///   ↓        ↘   |
-/// loop_end    loop_body     
-/// ```
-/// `loop_entry` has two predecessors: `main` and `loop_body`, and it dominates `loop_body`.
-fn find_all_loops(function: &Function) -> Loops {
-    let cfg = ControlFlowGraph::with_function(function);
-    let post_order = PostOrder::with_function(function);
-    let mut dom_tree = DominatorTree::with_cfg_and_post_order(&cfg, &post_order);
-
-    let mut loops = vec![];
-
-    for (block, _) in function.dfg.basic_blocks_iter() {
-        // These reachable checks wouldn't be needed if we only iterated over reachable blocks
-        if dom_tree.is_reachable(block) {
-            for predecessor in cfg.predecessors(block) {
-                // In the above example, we're looking for when `block` is `loop_entry` and `predecessor` is `loop_body`.
-                if dom_tree.is_reachable(predecessor) && dom_tree.dominates(block, predecessor) {
-                    // predecessor -> block is the back-edge of a loop
-                    loops.push(find_blocks_in_loop(block, predecessor, &cfg));
+impl Loops {
+    /// Find a loop in the program by finding a node that dominates any predecessor node.
+    /// The edge where this happens will be the back-edge of the loop.
+    ///
+    /// For example consider the following SSA of a basic loop:
+    /// ```text
+    /// main():
+    ///   v0 = ... start ...
+    ///   v1 = ... end ...
+    ///   jmp loop_entry(v0)
+    /// loop_entry(i: Field):
+    ///   v2 = lt i v1
+    ///   jmpif v2, then: loop_body, else: loop_end
+    /// loop_body():
+    ///   v3 = ... body ...
+    ///   v4 = add 1, i
+    ///   jmp loop_entry(v4)
+    /// loop_end():
+    /// ```
+    ///
+    /// The CFG will look something like this:
+    /// ```text
+    /// main
+    ///   ↓
+    /// loop_entry ←---↰
+    ///   ↓        ↘   |
+    /// loop_end    loop_body     
+    /// ```
+    /// `loop_entry` has two predecessors: `main` and `loop_body`, and it dominates `loop_body`.
+    fn find_all(function: &Function) -> Self {
+        let cfg = ControlFlowGraph::with_function(function);
+        let post_order = PostOrder::with_function(function);
+        let mut dom_tree = DominatorTree::with_cfg_and_post_order(&cfg, &post_order);
+
+        let mut loops = vec![];
+
+        for (block, _) in function.dfg.basic_blocks_iter() {
+            // These reachable checks wouldn't be needed if we only iterated over reachable blocks
+            if dom_tree.is_reachable(block) {
+                for predecessor in cfg.predecessors(block) {
+                    // In the above example, we're looking for when `block` is `loop_entry` and `predecessor` is `loop_body`.
+                    if dom_tree.is_reachable(predecessor) && dom_tree.dominates(block, predecessor)
+                    {
+                        // predecessor -> block is the back-edge of a loop
+                        loops.push(Loop::find_blocks_in_loop(block, predecessor, &cfg));
+                    }
                 }
             }
         }
-    }
 
-    // Sort loops by block size so that we unroll the larger, outer loops of nested loops first.
-    // This is needed because inner loops may use the induction variable from their outer loops in
-    // their loop range. We will start popping loops from the back.
-    loops.sort_by_key(|loop_| loop_.blocks.len());
+        // Sort loops by block size so that we unroll the larger, outer loops of nested loops first.
+        // This is needed because inner loops may use the induction variable from their outer loops in
+        // their loop range. We will start popping loops from the back.
+        loops.sort_by_key(|loop_| loop_.blocks.len());
 
-    Loops {
-        failed_to_unroll: HashSet::new(),
-        yet_to_unroll: loops,
-        modified_blocks: HashSet::new(),
-        cfg,
+        Self {
+            failed_to_unroll: HashSet::new(),
+            yet_to_unroll: loops,
+            modified_blocks: HashSet::new(),
+            cfg,
+        }
     }
-}
 
-impl Loops {
     /// Unroll all loops within a given function.
     /// Any loops which fail to be unrolled (due to using non-constant indices) will be unmodified.
-    fn unroll_each_loop(mut self, function: &mut Function) -> Vec<RuntimeError> {
+    fn unroll_each(mut self, function: &mut Function) -> Vec<RuntimeError> {
         let mut unroll_errors = vec![];
         while let Some(next_loop) = self.yet_to_unroll.pop() {
+            if function.runtime().is_brillig() {
+                // TODO (#6470): Decide whether to unroll this loop.
+                continue;
+            }
             // If we've previously modified a block in this loop we need to refresh the context.
             // This happens any time we have nested loops.
             if next_loop.blocks.iter().any(|block| self.modified_blocks.contains(block)) {
-                let mut new_context = find_all_loops(function);
-                new_context.failed_to_unroll = self.failed_to_unroll;
-                return unroll_errors
-                    .into_iter()
-                    .chain(new_context.unroll_each_loop(function))
-                    .collect();
+                let mut new_loops = Self::find_all(function);
+                new_loops.failed_to_unroll = self.failed_to_unroll;
+                return unroll_errors.into_iter().chain(new_loops.unroll_each(function)).collect();
             }
 
             // Don't try to unroll the loop again if it is known to fail
             if !self.failed_to_unroll.contains(&next_loop.header) {
-                match unroll_loop(function, &self.cfg, &next_loop) {
+                match next_loop.unroll(function, &self.cfg) {
                     Ok(_) => self.modified_blocks.extend(next_loop.blocks),
                     Err(call_stack) => {
                         self.failed_to_unroll.insert(next_loop.header);
@@ -207,123 +212,179 @@ impl Loops {
     }
 }
 
-/// Return each block that is in a loop starting in the given header block.
-/// Expects back_edge_start -> header to be the back edge of the loop.
-fn find_blocks_in_loop(
-    header: BasicBlockId,
-    back_edge_start: BasicBlockId,
-    cfg: &ControlFlowGraph,
-) -> Loop {
-    let mut blocks = HashSet::new();
-    blocks.insert(header);
-
-    let mut insert = |block, stack: &mut Vec<BasicBlockId>| {
-        if !blocks.contains(&block) {
-            blocks.insert(block);
-            stack.push(block);
-        }
-    };
+impl Loop {
+    /// Return each block that is in a loop starting in the given header block.
+    /// Expects back_edge_start -> header to be the back edge of the loop.
+    fn find_blocks_in_loop(
+        header: BasicBlockId,
+        back_edge_start: BasicBlockId,
+        cfg: &ControlFlowGraph,
+    ) -> Self {
+        let mut blocks = HashSet::new();
+        blocks.insert(header);
 
-    // Starting from the back edge of the loop, each predecessor of this block until
-    // the header is within the loop.
-    let mut stack = vec![];
-    insert(back_edge_start, &mut stack);
+        let mut insert = |block, stack: &mut Vec<BasicBlockId>| {
+            if !blocks.contains(&block) {
+                blocks.insert(block);
+                stack.push(block);
+            }
+        };
+
+        // Starting from the back edge of the loop, each predecessor of this block until
+        // the header is within the loop.
+        let mut stack = vec![];
+        insert(back_edge_start, &mut stack);
 
-    while let Some(block) = stack.pop() {
-        for predecessor in cfg.predecessors(block) {
-            insert(predecessor, &mut stack);
+        while let Some(block) = stack.pop() {
+            for predecessor in cfg.predecessors(block) {
+                insert(predecessor, &mut stack);
+            }
         }
+
+        Self { header, back_edge_start, blocks }
     }
 
-    Loop { header, back_edge_start, blocks }
-}
+    /// Unroll a single loop in the function.
+    /// Returns Ok(()) if it succeeded, Err(callstack) if it failed,
+    /// where the callstack indicates the location of the instruction
+    /// that could not be processed, or empty if such information was
+    /// not available.
+    ///
+    /// Consider this example:
+    /// ```text
+    /// main():
+    ///   v0 = 0
+    ///   v1 = 3
+    ///   jmp loop_entry(v0)
+    /// loop_entry(i: Field):
+    ///   v2 = lt i v1
+    ///   jmpif v2, then: loop_body, else: loop_end
+    /// ```
+    ///
+    /// The first step is to unroll the header by recognizing that jump condition
+    /// is a constant, which means it will go to `loop_body`:
+    /// ```text
+    /// main():
+    ///   v0 = 0
+    ///   v1 = 2
+    ///   v2 = lt v0 v1
+    ///   // jmpif v2, then: loop_body, else: loop_end
+    ///   jmp dest: loop_body
+    /// ```
+    ///
+    /// Following that we unroll the loop body, which is the next source, replace
+    /// the induction variable with the new value created in the body, and have
+    /// another go at the header.
+    /// ```text
+    /// main():
+    ///   v0 = 0
+    ///   v1 = 2
+    ///   v2 = lt v0 v1
+    ///   v3 = ... body ...
+    ///   v4 = add 1, 0
+    ///   jmp loop_entry(v4)
+    /// ```
+    ///
+    /// At the end we reach a point where the condition evaluates to 0 and we jump to the end.
+    /// ```text
+    /// main():
+    ///   v0 = 0
+    ///   v1 = 2
+    ///   v2 = lt 0
+    ///   v3 = ... body ...
+    ///   v4 = add 1, v0
+    ///   v5 = lt v4 v1
+    ///   v6 = ... body ...
+    ///   v7 = add v4, 1
+    ///   v8 = lt v5 v1
+    ///   jmp loop_end
+    /// ```
+    ///
+    /// When e.g. `v8 = lt v5 v1` cannot be evaluated to a constant, the loop signals by returning `Err`
+    /// that a few SSA passes are required to evaluate and simplify these values.
+    fn unroll(
+        self: &Loop,
+        function: &mut Function,
+        cfg: &ControlFlowGraph,
+    ) -> Result<(), CallStack> {
+        let mut unroll_into = self.get_pre_header(cfg);
+        let mut jump_value = get_induction_variable(function, unroll_into)?;
+
+        while let Some(context) = self.unroll_header(function, unroll_into, jump_value)? {
+            let (last_block, last_value) = context.unroll_loop_iteration();
+            unroll_into = last_block;
+            jump_value = last_value;
+        }
 
-/// Unroll a single loop in the function.
-/// Returns Ok(()) if it succeeded, Err(callstack) if it failed,
-/// where the callstack indicates the location of the instruction
-/// that could not be processed, or empty if such information was
-/// not available.
-///
-/// Consider this example:
-/// ```text
-/// main():
-///   v0 = 0
-///   v1 = 3
-///   jmp loop_entry(v0)
-/// loop_entry(i: Field):
-///   v2 = lt i v1
-///   jmpif v2, then: loop_body, else: loop_end
-/// ```
-///
-/// The first step is to unroll the header by recognizing that jump condition
-/// is a constant, which means it will go to `loop_body`:
-/// ```text
-/// main():
-///   v0 = 0
-///   v1 = 2
-///   v2 = lt v0 v1
-///   // jmpif v2, then: loop_body, else: loop_end
-///   jmp dest: loop_body
-/// ```
-///
-/// Following that we unroll the loop body, which is the next source, replace
-/// the induction variable with the new value created in the body, and have
-/// another go at the header.
-/// ```text
-/// main():
-///   v0 = 0
-///   v1 = 2
-///   v2 = lt v0 v1
-///   v3 = ... body ...
-///   v4 = add 1, 0
-///   jmp loop_entry(v4)
-/// ```
-///
-/// At the end we reach a point where the condition evaluates to 0 and we jump to the end.
-/// ```text
-/// main():
-///   v0 = 0
-///   v1 = 2
-///   v2 = lt 0
-///   v3 = ... body ...
-///   v4 = add 1, v0
-///   v5 = lt v4 v1
-///   v6 = ... body ...
-///   v7 = add v4, 1
-///   v8 = lt v5 v1
-///   jmp loop_end
-/// ```
-///
-/// When e.g. `v8 = lt v5 v1` cannot be evaluated to a constant, the loop signals by returning `Err`
-/// that a few SSA passes are required to evaluate and simplify these values.
-fn unroll_loop(
-    function: &mut Function,
-    cfg: &ControlFlowGraph,
-    loop_: &Loop,
-) -> Result<(), CallStack> {
-    let mut unroll_into = get_pre_header(cfg, loop_);
-    let mut jump_value = get_induction_variable(function, unroll_into)?;
-
-    while let Some(context) = unroll_loop_header(function, loop_, unroll_into, jump_value)? {
-        let (last_block, last_value) = context.unroll_loop_iteration();
-        unroll_into = last_block;
-        jump_value = last_value;
+        Ok(())
     }
 
-    Ok(())
-}
+    /// The loop pre-header is the block that comes before the loop begins. Generally a header block
+    /// is expected to have 2 predecessors: the pre-header and the final block of the loop which jumps
+    /// back to the beginning. Other predecessors can come from `break` or `continue`.
+    fn get_pre_header(&self, cfg: &ControlFlowGraph) -> BasicBlockId {
+        let mut pre_header = cfg
+            .predecessors(self.header)
+            .filter(|predecessor| *predecessor != self.back_edge_start)
+            .collect::<Vec<_>>();
+
+        assert_eq!(pre_header.len(), 1);
+        pre_header.remove(0)
+    }
+
+    /// Unrolls the header block of the loop. This is the block that dominates all other blocks in the
+    /// loop and contains the jmpif instruction that lets us know if we should continue looping.
+    /// Returns Some(iteration context) if we should perform another iteration.
+    fn unroll_header<'a>(
+        &'a self,
+        function: &'a mut Function,
+        unroll_into: BasicBlockId,
+        induction_value: ValueId,
+    ) -> Result<Option<LoopIteration<'a>>, CallStack> {
+        // We insert into a fresh block first and move instructions into the unroll_into block later
+        // only once we verify the jmpif instruction has a constant condition. If it does not, we can
+        // just discard this fresh block and leave the loop unmodified.
+        let fresh_block = function.dfg.make_block();
+
+        let mut context = LoopIteration::new(function, self, fresh_block, self.header);
+        let source_block = &context.dfg()[context.source_block];
+        assert_eq!(source_block.parameters().len(), 1, "Expected only 1 argument in loop header");
+
+        // Insert the current value of the loop induction variable into our context.
+        let first_param = source_block.parameters()[0];
+        context.inserter.try_map_value(first_param, induction_value);
+        // Copy over all instructions and a fresh terminator.
+        context.inline_instructions_from_block();
+        // Mutate the terminator if possible so that it points at the iteration block.
+        match context.dfg()[fresh_block].unwrap_terminator() {
+        TerminatorInstruction::JmpIf { condition, then_destination, else_destination, call_stack } => {
+            let condition = *condition;
+            let next_blocks = context.handle_jmpif(condition, *then_destination, *else_destination, call_stack.clone());
+
+            // If there is only 1 next block the jmpif evaluated to a single known block.
+            // This is the expected case and lets us know if we should loop again or not.
+            if next_blocks.len() == 1 {
+                context.dfg_mut().inline_block(fresh_block, unroll_into);
+
+                // The fresh block is gone now so we're committing to insert into the original
+                // unroll_into block from now on.
+                context.insert_block = unroll_into;
 
-/// The loop pre-header is the block that comes before the loop begins. Generally a header block
-/// is expected to have 2 predecessors: the pre-header and the final block of the loop which jumps
-/// back to the beginning. Other predecessors can come from `break` or `continue`.
-fn get_pre_header(cfg: &ControlFlowGraph, loop_: &Loop) -> BasicBlockId {
-    let mut pre_header = cfg
-        .predecessors(loop_.header)
-        .filter(|predecessor| *predecessor != loop_.back_edge_start)
-        .collect::<Vec<_>>();
-
-    assert_eq!(pre_header.len(), 1);
-    pre_header.remove(0)
+                // In the last iteration, `handle_jmpif` will have replaced `context.source_block`
+                // with the `else_destination`, that is, the `loop_end`, which signals that we 
+                // have no more loops to unroll, because that block was not part of the loop itself, 
+                // ie. it wasn't between `loop_header` and `loop_body`. Otherwise we have the `loop_body`
+                // in `source_block` and can unroll that into the destination.
+                Ok(self.blocks.contains(&context.source_block).then_some(context))
+            } else {
+                // If this case is reached the loop either uses non-constant indices or we need
+                // another pass, such as mem2reg to resolve them to constants.
+                Err(context.inserter.function.dfg.get_value_call_stack(condition))
+            }
+        }
+        other => unreachable!("Expected loop header to terminate in a JmpIf to the loop body, but found {other:?} instead"),
+    }
+    }
 }
 
 /// Return the induction value of the current iteration of the loop, from the given block's jmp arguments.
@@ -361,60 +422,6 @@ fn get_induction_variable(function: &Function, block: BasicBlockId) -> Result<Va
     }
 }
 
-/// Unrolls the header block of the loop. This is the block that dominates all other blocks in the
-/// loop and contains the jmpif instruction that lets us know if we should continue looping.
-/// Returns Some(iteration context) if we should perform another iteration.
-fn unroll_loop_header<'a>(
-    function: &'a mut Function,
-    loop_: &'a Loop,
-    unroll_into: BasicBlockId,
-    induction_value: ValueId,
-) -> Result<Option<LoopIteration<'a>>, CallStack> {
-    // We insert into a fresh block first and move instructions into the unroll_into block later
-    // only once we verify the jmpif instruction has a constant condition. If it does not, we can
-    // just discard this fresh block and leave the loop unmodified.
-    let fresh_block = function.dfg.make_block();
-
-    let mut context = LoopIteration::new(function, loop_, fresh_block, loop_.header);
-    let source_block = &context.dfg()[context.source_block];
-    assert_eq!(source_block.parameters().len(), 1, "Expected only 1 argument in loop header");
-
-    // Insert the current value of the loop induction variable into our context.
-    let first_param = source_block.parameters()[0];
-    context.inserter.try_map_value(first_param, induction_value);
-    // Copy over all instructions and a fresh terminator.
-    context.inline_instructions_from_block();
-    // Mutate the terminator if possible so that it points at the iteration block.
-    match context.dfg()[fresh_block].unwrap_terminator() {
-        TerminatorInstruction::JmpIf { condition, then_destination, else_destination, call_stack } => {
-            let condition = *condition;
-            let next_blocks = context.handle_jmpif(condition, *then_destination, *else_destination, call_stack.clone());
-
-            // If there is only 1 next block the jmpif evaluated to a single known block.
-            // This is the expected case and lets us know if we should loop again or not.
-            if next_blocks.len() == 1 {
-                context.dfg_mut().inline_block(fresh_block, unroll_into);
-
-                // The fresh block is gone now so we're committing to insert into the original
-                // unroll_into block from now on.
-                context.insert_block = unroll_into;
-
-                // In the last iteration, `handle_jmpif` will have replaced `context.source_block`
-                // with the `else_destination`, that is, the `loop_end`, which signals that we 
-                // have no more loops to unroll, because that block was not part of the loop itself, 
-                // ie. it wasn't between `loop_header` and `loop_body`. Otherwise we have the `loop_body`
-                // in `source_block` and can unroll that into the destination.
-                Ok(loop_.blocks.contains(&context.source_block).then_some(context))
-            } else {
-                // If this case is reached the loop either uses non-constant indices or we need
-                // another pass, such as mem2reg to resolve them to constants.
-                Err(context.inserter.function.dfg.get_value_call_stack(condition))
-            }
-        }
-        other => unreachable!("Expected loop header to terminate in a JmpIf to the loop body, but found {other:?} instead"),
-    }
-}
-
 /// The context object for each loop iteration.
 /// Notably each loop iteration maps each loop block to a fresh, unrolled block.
 struct LoopIteration<'f> {
@@ -437,6 +444,9 @@ struct LoopIteration<'f> {
     /// This is None until we visit the block which jumps back to the start of the
     /// loop, at which point we record its value and the block it was found in.
     induction_value: Option<(BasicBlockId, ValueId)>,
+
+    /// Whether to discard reference count instructions, which ACIR doesn't need but Brillig does.
+    skip_ref_counts: bool,
 }
 
 impl<'f> LoopIteration<'f> {
@@ -447,6 +457,7 @@ impl<'f> LoopIteration<'f> {
         source_block: BasicBlockId,
     ) -> Self {
         Self {
+            skip_ref_counts: function.runtime().is_acir(),
             inserter: FunctionInserter::new(function),
             loop_,
             insert_block,
@@ -595,14 +606,10 @@ impl<'f> LoopIteration<'f> {
         // instances of the induction variable or any values that were changed as a result
         // of the new induction variable value.
         for instruction in instructions {
-            // TODO (#6470): Now Brillig will be inlined as well.
-            // Skip reference count instructions since they are only used for brillig, and brillig code is not unrolled
-            if !matches!(
-                self.dfg()[instruction],
-                Instruction::IncrementRc { .. } | Instruction::DecrementRc { .. }
-            ) {
-                self.inserter.push_instruction(instruction, self.insert_block);
+            if self.skip_ref_counts && self.is_ref_count(instruction) {
+                continue;
             }
+            self.inserter.push_instruction(instruction, self.insert_block);
         }
         let mut terminator = self.dfg()[self.source_block]
             .unwrap_terminator()
@@ -615,6 +622,13 @@ impl<'f> LoopIteration<'f> {
         self.inserter.function.dfg.set_block_terminator(self.insert_block, terminator);
     }
 
+    fn is_ref_count(&self, instruction: InstructionId) -> bool {
+        matches!(
+            self.dfg()[instruction],
+            Instruction::IncrementRc { .. } | Instruction::DecrementRc { .. }
+        )
+    }
+
     fn dfg(&self) -> &DataFlowGraph {
         &self.inserter.function.dfg
     }
@@ -693,7 +707,7 @@ mod tests {
 
         // The final block count is not 1 because unrolling creates some unnecessary jmps.
         // If a simplify cfg pass is ran afterward, the expected block count will be 1.
-        let (ssa, errors) = ssa.try_to_unroll_loops();
+        let (ssa, errors) = ssa.try_unroll_loops();
         assert_eq!(errors.len(), 0, "All loops should be unrolled");
         assert_eq!(ssa.main().reachable_blocks().len(), 5);
 
@@ -723,7 +737,7 @@ mod tests {
         assert_eq!(ssa.main().reachable_blocks().len(), 4);
 
         // Expected that we failed to unroll the loop
-        let (_, errors) = ssa.try_to_unroll_loops();
+        let (_, errors) = ssa.try_unroll_loops();
         assert_eq!(errors.len(), 1, "Expected to fail to unroll loop");
     }
 }

From 2d14db55b033b8eb8c236d23fce786419285ed4c Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Tue, 12 Nov 2024 23:54:11 +0000
Subject: [PATCH 04/25] Test getting numeric bounds

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 140 +++++++++++++++++-
 1 file changed, 132 insertions(+), 8 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 71408e21981..15fd4ee8acc 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -20,7 +20,7 @@
 //! only used by Brillig bytecode.
 use std::collections::HashSet;
 
-use acvm::acir::AcirField;
+use acvm::{acir::AcirField, FieldElement};
 
 use crate::{
     errors::RuntimeError,
@@ -32,7 +32,7 @@ use crate::{
             dom::DominatorTree,
             function::Function,
             function_inserter::FunctionInserter,
-            instruction::{Instruction, InstructionId, TerminatorInstruction},
+            instruction::{Binary, BinaryOp, Instruction, InstructionId, TerminatorInstruction},
             post_order::PostOrder,
             value::ValueId,
         },
@@ -244,6 +244,75 @@ impl Loop {
         Self { header, back_edge_start, blocks }
     }
 
+    /// Find the lower bound of the loop in the pre-header and return it
+    /// if it's a numeric constant, which it will be if the previous SSA
+    /// steps managed to inline it.
+    ///
+    /// Consider the following example of a `for i in 0..4` loop:
+    /// ```text
+    /// brillig(inline) fn main f0 {
+    ///   b0(v0: u32):                  // Pre-header
+    ///     ...
+    ///     jmp b1(u32 0)               // Lower-bound
+    ///   b1(v1: u32):
+    ///     v5 = lt v1, u32 4
+    ///     jmpif v5 then: b3, else: b2
+    /// ```
+    fn get_const_lower_bound(
+        &self,
+        function: &Function,
+        cfg: &ControlFlowGraph,
+    ) -> Result<Option<FieldElement>, CallStack> {
+        let pre_header = self.get_pre_header(cfg);
+        let jump_value = get_induction_variable(function, pre_header)?;
+        Ok(function.dfg.get_numeric_constant(jump_value))
+    }
+
+    /// Find the upper bound of the loop in the loop header and return it
+    /// if it's a numeric constant, which it will be if the previous SSA
+    /// steps managed to inline it.
+    ///
+    /// Consider the following example of a `for i in 0..4` loop:
+    /// ```text
+    /// brillig(inline) fn main f0 {
+    ///   b0(v0: u32):            
+    ///     ...
+    ///     jmp b1(u32 0)         
+    ///   b1(v1: u32):                  // Loop header
+    ///     v5 = lt v1, u32 4           // Upper bound
+    ///     jmpif v5 then: b3, else: b2
+    /// ```
+    fn get_const_upper_bound(&self, function: &Function) -> Option<FieldElement> {
+        let block = &function.dfg[self.header];
+        let instructions = block.instructions();
+        assert_eq!(
+            instructions.len(),
+            1,
+            "The header should just compare the induction variable and jump"
+        );
+        match &function.dfg[instructions[0]] {
+            Instruction::Binary(Binary { lhs: _, operator: BinaryOp::Lt, rhs }) => {
+                function.dfg.get_numeric_constant(*rhs)
+            }
+            other => panic!("Unexpected instruction in header: {other:?}"),
+        }
+    }
+
+    /// Get the lower and upper bounds of the loop if both are constant numeric values.
+    fn get_const_bounds(
+        &self,
+        function: &Function,
+        cfg: &ControlFlowGraph,
+    ) -> Result<Option<(FieldElement, FieldElement)>, CallStack> {
+        let Some(lower) = self.get_const_lower_bound(function, cfg)? else {
+            return Ok(None);
+        };
+        let Some(upper) = self.get_const_upper_bound(function) else {
+            return Ok(None);
+        };
+        Ok(Some((lower, upper)))
+    }
+
     /// Unroll a single loop in the function.
     /// Returns Ok(()) if it succeeded, Err(callstack) if it failed,
     /// where the callstack indicates the location of the instruction
@@ -302,11 +371,7 @@ impl Loop {
     ///
     /// When e.g. `v8 = lt v5 v1` cannot be evaluated to a constant, the loop signals by returning `Err`
     /// that a few SSA passes are required to evaluate and simplify these values.
-    fn unroll(
-        self: &Loop,
-        function: &mut Function,
-        cfg: &ControlFlowGraph,
-    ) -> Result<(), CallStack> {
+    fn unroll(&self, function: &mut Function, cfg: &ControlFlowGraph) -> Result<(), CallStack> {
         let mut unroll_into = self.get_pre_header(cfg);
         let mut jump_value = get_induction_variable(function, unroll_into)?;
 
@@ -390,7 +455,7 @@ impl Loop {
 /// Return the induction value of the current iteration of the loop, from the given block's jmp arguments.
 ///
 /// Expects the current block to terminate in `jmp h(N)` where h is the loop header and N is
-/// a Field value.
+/// a Field value. Returns an `Err` if this isn't the case.
 ///
 /// Consider the following example:
 /// ```text
@@ -640,8 +705,12 @@ impl<'f> LoopIteration<'f> {
 
 #[cfg(test)]
 mod tests {
+    use acvm::FieldElement;
+
     use crate::ssa::{opt::assert_normalized_ssa_equals, Ssa};
 
+    use super::Loops;
+
     #[test]
     fn unroll_nested_loops() {
         // fn main() {
@@ -740,4 +809,59 @@ mod tests {
         let (_, errors) = ssa.try_unroll_loops();
         assert_eq!(errors.len(), 1, "Expected to fail to unroll loop");
     }
+
+    #[test]
+    fn test_get_const_bounds() {
+        let ssa = brillig_unroll_test_case();
+        let function = ssa.main();
+        let loops = Loops::find_all(function);
+        assert_eq!(loops.yet_to_unroll.len(), 1);
+        let (lower, upper) = loops.yet_to_unroll[0]
+            .get_const_bounds(function, &loops.cfg)
+            .expect("should find bounds")
+            .expect("bounds are numeric const");
+        assert_eq!(lower, FieldElement::from(0u32));
+        assert_eq!(upper, FieldElement::from(4u32));
+    }
+
+    /// Simple test loop:
+    /// ```text
+    /// unconstrained fn main(sum: u32) {
+    ///     assert(loop(0, 4) == sum);
+    /// }
+    ///
+    /// fn loop(from: u32, to: u32) -> u32 {
+    ///      let mut sum = 0;
+    ///      for i in from..to {
+    ///          sum = sum + i;
+    ///      }
+    ///      sum
+    ///  }
+    /// ```
+    fn brillig_unroll_test_case() -> Ssa {
+        let src = "
+        // After `static_assert` and `assert_constant`:
+        brillig(inline) fn main f0 {
+          b0(v0: u32):
+            v2 = allocate -> &mut u32
+            store u32 0 at v2
+            jmp b1(u32 0)
+          b1(v1: u32):
+            v5 = lt v1, u32 4
+            jmpif v5 then: b3, else: b2
+          b3():
+            v8 = load v2 -> u32
+            v9 = add v8, v1
+            store v9 at v2
+            v11 = add v1, u32 1
+            jmp b1(v11)
+          b2():
+            v6 = load v2 -> u32
+            v7 = eq v6, v0
+            constrain v6 == v0
+            return
+        }
+        ";
+        Ssa::from_str(src).unwrap()
+    }
 }

From 49d92d285ada3456b46eb1eb0f36d8e78c91eed3 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 12:49:45 +0000
Subject: [PATCH 05/25] Normalise during SSA parsing

---
 .../noirc_evaluator/src/ssa/parser/into_ssa.rs     | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/parser/into_ssa.rs b/compiler/noirc_evaluator/src/ssa/parser/into_ssa.rs
index 2a94a4fd1eb..b49420ec775 100644
--- a/compiler/noirc_evaluator/src/ssa/parser/into_ssa.rs
+++ b/compiler/noirc_evaluator/src/ssa/parser/into_ssa.rs
@@ -27,7 +27,11 @@ struct Translator {
     /// Maps block names to their IDs
     blocks: HashMap<FunctionId, HashMap<String, BasicBlockId>>,
 
-    /// Maps variable names to their IDs
+    /// Maps variable names to their IDs.
+    ///
+    /// This is necessary because the SSA we parse might have undergone some
+    /// passes already which replaced some of the original IDs. The translator
+    /// will recreate the SSA step by step, which can result in a new ID layout.
     variables: HashMap<FunctionId, HashMap<String, ValueId>>,
 }
 
@@ -308,7 +312,13 @@ impl Translator {
     }
 
     fn finish(self) -> Ssa {
-        self.builder.finish()
+        let mut ssa = self.builder.finish();
+        // Normalize the IDs so we have a better chance of matching the SSA we parsed
+        // after the step-by-step reconstruction done during translation. This assumes
+        // that the SSA we parsed was printed by the `SsaBuilder`, which normalizes
+        // before each print.
+        ssa.normalize_ids();
+        ssa
     }
 
     fn current_function_id(&self) -> FunctionId {

From 16c799e70ebbe84488e91cfcfd7abd5e3fda8a3f Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 13:03:30 +0000
Subject: [PATCH 06/25] Trim comments from expected SSA before comparing
 strings

---
 compiler/noirc_evaluator/src/lib.rs         | 19 +++++++++++++++++++
 compiler/noirc_evaluator/src/ssa/opt/mod.rs |  3 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/compiler/noirc_evaluator/src/lib.rs b/compiler/noirc_evaluator/src/lib.rs
index 453e8b59f58..5be7a0df0a9 100644
--- a/compiler/noirc_evaluator/src/lib.rs
+++ b/compiler/noirc_evaluator/src/lib.rs
@@ -30,3 +30,22 @@ pub(crate) fn trim_leading_whitespace_from_lines(src: &str) -> String {
     }
     result
 }
+
+/// Trim comments from the lines, ie. content starting with `//`.
+#[cfg(test)]
+pub(crate) fn trim_comments_from_lines(src: &str) -> String {
+    let mut result = String::new();
+    let mut first = true;
+    for line in src.lines() {
+        if !first {
+            result.push('\n');
+        }
+        if let Some(comment) = line.find("//") {
+            result.push_str(line[..comment].trim_end());
+        } else {
+            result.push_str(line);
+        }
+        first = false;
+    }
+    result
+}
diff --git a/compiler/noirc_evaluator/src/ssa/opt/mod.rs b/compiler/noirc_evaluator/src/ssa/opt/mod.rs
index 5576b494570..9123ae939b7 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/mod.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/mod.rs
@@ -35,13 +35,14 @@ pub(crate) fn assert_normalized_ssa_equals(mut ssa: super::Ssa, expected: &str)
         panic!("`expected` argument of `assert_ssa_equals` is not valid SSA:\n{:?}", err);
     }
 
-    use crate::{ssa::Ssa, trim_leading_whitespace_from_lines};
+    use crate::{ssa::Ssa, trim_comments_from_lines, trim_leading_whitespace_from_lines};
 
     ssa.normalize_ids();
 
     let ssa = ssa.to_string();
     let ssa = trim_leading_whitespace_from_lines(&ssa);
     let expected = trim_leading_whitespace_from_lines(expected);
+    let expected = trim_comments_from_lines(&expected);
 
     if ssa != expected {
         println!("Got:\n~~~\n{}\n~~~\nExpected:\n~~~\n{}\n~~~", ssa, expected);

From 6e2468e2c8cba3e5f4efd2a16de913f6faecd25d Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 13:03:48 +0000
Subject: [PATCH 07/25] Collect references before the pre-header

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 147 ++++++++++++++----
 1 file changed, 118 insertions(+), 29 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 15fd4ee8acc..dbe5bca5d8f 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -143,7 +143,7 @@ impl Loops {
     ///   ↓
     /// loop_entry ←---↰
     ///   ↓        ↘   |
-    /// loop_end    loop_body     
+    /// loop_end    loop_body
     /// ```
     /// `loop_entry` has two predecessors: `main` and `loop_body`, and it dominates `loop_body`.
     fn find_all(function: &Function) -> Self {
@@ -275,9 +275,9 @@ impl Loop {
     /// Consider the following example of a `for i in 0..4` loop:
     /// ```text
     /// brillig(inline) fn main f0 {
-    ///   b0(v0: u32):            
+    ///   b0(v0: u32):
     ///     ...
-    ///     jmp b1(u32 0)         
+    ///     jmp b1(u32 0)
     ///   b1(v1: u32):                  // Loop header
     ///     v5 = lt v1, u32 4           // Upper bound
     ///     jmpif v5 then: b3, else: b2
@@ -422,33 +422,111 @@ impl Loop {
         context.inline_instructions_from_block();
         // Mutate the terminator if possible so that it points at the iteration block.
         match context.dfg()[fresh_block].unwrap_terminator() {
-        TerminatorInstruction::JmpIf { condition, then_destination, else_destination, call_stack } => {
-            let condition = *condition;
-            let next_blocks = context.handle_jmpif(condition, *then_destination, *else_destination, call_stack.clone());
-
-            // If there is only 1 next block the jmpif evaluated to a single known block.
-            // This is the expected case and lets us know if we should loop again or not.
-            if next_blocks.len() == 1 {
-                context.dfg_mut().inline_block(fresh_block, unroll_into);
-
-                // The fresh block is gone now so we're committing to insert into the original
-                // unroll_into block from now on.
-                context.insert_block = unroll_into;
-
-                // In the last iteration, `handle_jmpif` will have replaced `context.source_block`
-                // with the `else_destination`, that is, the `loop_end`, which signals that we 
-                // have no more loops to unroll, because that block was not part of the loop itself, 
-                // ie. it wasn't between `loop_header` and `loop_body`. Otherwise we have the `loop_body`
-                // in `source_block` and can unroll that into the destination.
-                Ok(self.blocks.contains(&context.source_block).then_some(context))
-            } else {
-                // If this case is reached the loop either uses non-constant indices or we need
-                // another pass, such as mem2reg to resolve them to constants.
-                Err(context.inserter.function.dfg.get_value_call_stack(condition))
+            TerminatorInstruction::JmpIf { condition, then_destination, else_destination, call_stack } => {
+                let condition = *condition;
+                let next_blocks = context.handle_jmpif(condition, *then_destination, *else_destination, call_stack.clone());
+
+                // If there is only 1 next block the jmpif evaluated to a single known block.
+                // This is the expected case and lets us know if we should loop again or not.
+                if next_blocks.len() == 1 {
+                    context.dfg_mut().inline_block(fresh_block, unroll_into);
+
+                    // The fresh block is gone now so we're committing to insert into the original
+                    // unroll_into block from now on.
+                    context.insert_block = unroll_into;
+
+                    // In the last iteration, `handle_jmpif` will have replaced `context.source_block`
+                    // with the `else_destination`, that is, the `loop_end`, which signals that we
+                    // have no more loops to unroll, because that block was not part of the loop itself,
+                    // ie. it wasn't between `loop_header` and `loop_body`. Otherwise we have the `loop_body`
+                    // in `source_block` and can unroll that into the destination.
+                    Ok(self.blocks.contains(&context.source_block).then_some(context))
+                } else {
+                    // If this case is reached the loop either uses non-constant indices or we need
+                    // another pass, such as mem2reg to resolve them to constants.
+                    Err(context.inserter.function.dfg.get_value_call_stack(condition))
+                }
             }
+            other => unreachable!("Expected loop header to terminate in a JmpIf to the loop body, but found {other:?} instead"),
         }
-        other => unreachable!("Expected loop header to terminate in a JmpIf to the loop body, but found {other:?} instead"),
     }
+
+    /// Find all reference values which were allocated before the pre-header.
+    ///
+    /// These are accessible inside the loop body, and they can be involved
+    /// in load/store operations that could be eliminated if we unrolled the
+    /// body into the pre-header.
+    ///
+    /// Consider this loop:
+    /// ```text
+    /// let mut sum = 0;
+    /// let mut arr = &[];
+    /// for i in 0..3 {
+    ///     sum = sum + i;
+    ///     arr.push_back(sum)
+    /// }
+    /// sum
+    /// ```
+    ///
+    /// The SSA has a load+store for the `sum` and a load+push for the `arr`:
+    /// ```text
+    /// b0(v0: u32):
+    ///   v2 = allocate -> &mut u32     // reference allocated for `sum`
+    ///   store u32 0 at v2             // initial value for `sum`
+    ///   v4 = allocate -> &mut u32     // reference allocated for the length of `arr`
+    ///   store u32 0 at v4             // initial length of `arr`
+    ///   inc_rc [] of u32              // storage for `arr`
+    ///   v6 = allocate -> &mut [u32]   // reference allocated to point at the storage of `arr`
+    ///   store [] of u32 at v6         // initial value for the storage of `arr`
+    ///   jmp b1(u32 0)                 // start looping from 0
+    /// b1(v1: u32):                    // `i` induction variable
+    ///   v8 = lt v1, u32 3             // loop until 3
+    ///   jmpif v8 then: b3, else: b2
+    /// b3():
+    ///   v11 = load v2 -> u32          // load `sum`
+    ///   v12 = add v11, v1             // add `i` to `sum`
+    ///   store v12 at v2               // store updated `sum`
+    ///   v13 = load v4 -> u32          // load length of `arr`
+    ///   v14 = load v6 -> [u32]        // load storage of `arr`
+    ///   v16, v17 = call slice_push_back(v13, v14, v12) -> (u32, [u32]) // builtin to push, will store to storage and length references
+    ///   v19 = add v1, u32 1           // increase `arr`
+    ///   jmp b1(v19)                   // back-edge of the loop
+    /// b2():                           // after the loop
+    ///   v9 = load v2 -> u32           // read final value of `sum`
+    /// ```
+    ///
+    /// We won't always find load _and_ store ops (e.g. the push above doesn't come with a store),
+    /// but it's likely that we `mem2reg` could eliminate a lot of the loads we can find, so we can
+    /// use this as an approximation of the gains we would see.
+    fn find_pre_header_reference_values(
+        &self,
+        function: &Function,
+        cfg: &ControlFlowGraph,
+    ) -> HashSet<ValueId> {
+        // We need to traverse blocks from the pre-header up to the block entry point.
+        let pre_header = self.get_pre_header(cfg);
+        let function_entry = function.entry_block();
+
+        // The algorithm to collect blocks in the loop is fine for us.
+        let blocks = Self::find_blocks_in_loop(function_entry, pre_header, cfg).blocks;
+
+        // Collect allocations in all blocks above the header.
+        let allocations = blocks.iter().flat_map(|b| {
+            function.dfg[*b]
+                .instructions()
+                .iter()
+                .filter(|i| matches!(&function.dfg[**i], Instruction::Allocate))
+                .map(|i| {
+                    // Get the value into which the allocation was stored.
+                    function.dfg.instruction_results(*i)[0]
+                })
+        });
+
+        // Collect reference parameters of the function itself.
+        let params =
+            function.parameters().iter().filter(|p| function.dfg.value_is_reference(**p)).copied();
+
+        params.chain(allocations).collect()
     }
 }
 
@@ -661,7 +739,7 @@ impl<'f> LoopIteration<'f> {
         self.original_blocks.get(&block).copied().unwrap_or(block)
     }
 
-    /// Copy over instructions from the source into the insert block,  
+    /// Copy over instructions from the source into the insert block,
     /// while simplifying instructions and keeping track of original block IDs.
     fn inline_instructions_from_block(&mut self) {
         let source_block = &self.dfg()[self.source_block];
@@ -707,7 +785,7 @@ impl<'f> LoopIteration<'f> {
 mod tests {
     use acvm::FieldElement;
 
-    use crate::ssa::{opt::assert_normalized_ssa_equals, Ssa};
+    use crate::ssa::{ir::value::ValueId, opt::assert_normalized_ssa_equals, Ssa};
 
     use super::Loops;
 
@@ -824,6 +902,17 @@ mod tests {
         assert_eq!(upper, FieldElement::from(4u32));
     }
 
+    #[test]
+    fn test_find_pre_header_reference_values() {
+        let ssa = brillig_unroll_test_case();
+        let function = ssa.main();
+        let loops = Loops::find_all(function);
+        assert_eq!(loops.yet_to_unroll.len(), 1);
+        let refs = loops.yet_to_unroll[0].find_pre_header_reference_values(function, &loops.cfg);
+        assert_eq!(refs.len(), 1);
+        assert!(refs.contains(&ValueId::new(2)));
+    }
+
     /// Simple test loop:
     /// ```text
     /// unconstrained fn main(sum: u32) {

From 71cccc25448970302be2de47b23e4fff4501b02d Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 13:19:52 +0000
Subject: [PATCH 08/25] Fix parser test now that the expectation is a
 normalised SSA

---
 compiler/noirc_evaluator/src/ssa/parser/tests.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/parser/tests.rs b/compiler/noirc_evaluator/src/ssa/parser/tests.rs
index 3ed6be57b5e..0b661c1cc4f 100644
--- a/compiler/noirc_evaluator/src/ssa/parser/tests.rs
+++ b/compiler/noirc_evaluator/src/ssa/parser/tests.rs
@@ -103,8 +103,8 @@ fn test_multiple_blocks_and_jmp() {
         acir(inline) fn main f0 {
           b0():
             jmp b1(Field 1)
-          b1(v1: Field):
-            return v1
+          b1(v0: Field):
+            return v0
         }
         ";
     assert_ssa_roundtrip(src);

From cc0c6c9bf2c7ed936a2b030fc646e727814f4ab4 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 13:26:29 +0000
Subject: [PATCH 09/25] Count loads and stores

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 39 +++++++++++++++++--
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index dbe5bca5d8f..0fd8c850c0a 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -528,6 +528,32 @@ impl Loop {
 
         params.chain(allocations).collect()
     }
+
+    /// Count the number of load and store instructions of specific variables in the loop.
+    ///
+    /// Returns `(loads, stores)` in case we want to differentiate in the estimates.
+    fn count_loads_and_stores(
+        &self,
+        function: &Function,
+        refs: &HashSet<ValueId>,
+    ) -> (usize, usize) {
+        let mut loads = 0;
+        let mut stores = 0;
+        for block in &self.blocks {
+            for instruction in function.dfg[*block].instructions() {
+                match &function.dfg[*instruction] {
+                    Instruction::Load { address } if refs.contains(address) => {
+                        loads += 1;
+                    }
+                    Instruction::Store { address, .. } if refs.contains(address) => {
+                        stores += 1;
+                    }
+                    _ => {}
+                }
+            }
+        }
+        (loads, stores)
+    }
 }
 
 /// Return the induction value of the current iteration of the loop, from the given block's jmp arguments.
@@ -894,10 +920,12 @@ mod tests {
         let function = ssa.main();
         let loops = Loops::find_all(function);
         assert_eq!(loops.yet_to_unroll.len(), 1);
+
         let (lower, upper) = loops.yet_to_unroll[0]
             .get_const_bounds(function, &loops.cfg)
             .expect("should find bounds")
             .expect("bounds are numeric const");
+
         assert_eq!(lower, FieldElement::from(0u32));
         assert_eq!(upper, FieldElement::from(4u32));
     }
@@ -906,11 +934,16 @@ mod tests {
     fn test_find_pre_header_reference_values() {
         let ssa = brillig_unroll_test_case();
         let function = ssa.main();
-        let loops = Loops::find_all(function);
-        assert_eq!(loops.yet_to_unroll.len(), 1);
-        let refs = loops.yet_to_unroll[0].find_pre_header_reference_values(function, &loops.cfg);
+        let mut loops = Loops::find_all(function);
+        let loop0 = loops.yet_to_unroll.pop().unwrap();
+
+        let refs = loop0.find_pre_header_reference_values(function, &loops.cfg);
         assert_eq!(refs.len(), 1);
         assert!(refs.contains(&ValueId::new(2)));
+
+        let (loads, stores) = loop0.count_loads_and_stores(function, &refs);
+        assert_eq!(loads, 1);
+        assert_eq!(stores, 1);
     }
 
     /// Simple test loop:

From a9c2c8ea9465b22fd2dbf6e2646356f1f5671d7b Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 13:40:03 +0000
Subject: [PATCH 10/25] Fix parser test now that the expectation is a
 normalised SSA

---
 compiler/noirc_evaluator/src/ssa/parser/tests.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/parser/tests.rs b/compiler/noirc_evaluator/src/ssa/parser/tests.rs
index 0b661c1cc4f..312a874ed3c 100644
--- a/compiler/noirc_evaluator/src/ssa/parser/tests.rs
+++ b/compiler/noirc_evaluator/src/ssa/parser/tests.rs
@@ -115,11 +115,11 @@ fn test_jmpif() {
     let src = "
         acir(inline) fn main f0 {
           b0(v0: Field):
-            jmpif v0 then: b1, else: b2
-          b1():
-            return
+            jmpif v0 then: b2, else: b1
           b2():
             return
+          b1():
+            return
         }
         ";
     assert_ssa_roundtrip(src);

From 944ae376223fb8b403884cb28059ddb79755df36 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 13:48:57 +0000
Subject: [PATCH 11/25] Count all instructions

---
 compiler/noirc_evaluator/src/ssa/opt/unrolling.rs | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 0fd8c850c0a..eceb798dcaa 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -554,6 +554,17 @@ impl Loop {
         }
         (loads, stores)
     }
+
+    /// Count the number of instructions in the loop, including the terminating jumps.
+    fn count_all_instructions(&self, function: &Function) -> usize {
+        self.blocks
+            .iter()
+            .map(|block| {
+                let block = &function.dfg[*block];
+                block.instructions().len() + block.terminator().map(|_| 1).unwrap_or_default()
+            })
+            .sum()
+    }
 }
 
 /// Return the induction value of the current iteration of the loop, from the given block's jmp arguments.
@@ -944,6 +955,9 @@ mod tests {
         let (loads, stores) = loop0.count_loads_and_stores(function, &refs);
         assert_eq!(loads, 1);
         assert_eq!(stores, 1);
+
+        let all = loop0.count_all_instructions(function);
+        assert_eq!(all, 7);
     }
 
     /// Simple test loop:

From f41ec36f685ebaf748c22782471b784a65a36513 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 14:27:08 +0000
Subject: [PATCH 12/25] Add method to decide if a loop is small

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 54 +++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index eceb798dcaa..1077347628f 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -41,6 +41,29 @@ use crate::{
 };
 use fxhash::FxHashMap as HashMap;
 
+/// Number of instructions which are complete loop boilerplate,
+/// the ones facilitating jumps and the increment of the loop
+/// variable.
+///
+/// All the instructions in the following example are boilerplate:
+/// ```text
+/// brillig(inline) fn main f0 {
+///   b0(v0: u32):
+///     ...
+///     jmp b1(u32 0)
+///   b1(v1: u32):
+///     v5 = lt v1, u32 4
+///     jmpif v5 then: b3, else: b2
+///   b3():
+///     ...
+///     v11 = add v1, u32 1
+///     jmp b1(v11)
+///   b2():
+///     ...
+/// }
+/// ```
+const LOOP_BOILERPLATE_COUNT: usize = 5;
+
 impl Ssa {
     /// Loop unrolling can return errors, since ACIR functions need to be fully unrolled.
     /// This meta-pass will keep trying to unroll loops and simplifying the SSA until no more errors are found.
@@ -565,6 +588,27 @@ impl Loop {
             })
             .sum()
     }
+
+    /// Decide if this loop is small enough that it can be inlined in a way that the number
+    /// of unrolled instructions times the number of iterations would result in smaller bytecode
+    /// than if we keep the loops with their overheads.
+    fn is_small_loop(&self, function: &Function, cfg: &ControlFlowGraph) -> bool {
+        let Ok(Some((lower, upper))) = self.get_const_bounds(function, cfg) else {
+            return false;
+        };
+        let Some(lower) = lower.try_to_u64() else {
+            return false;
+        };
+        let Some(upper) = upper.try_to_u64() else {
+            return false;
+        };
+        let num_iterations = (upper - lower) as usize;
+        let refs = self.find_pre_header_reference_values(function, cfg);
+        let (loads, stores) = self.count_loads_and_stores(function, &refs);
+        let all_instructions = self.count_all_instructions(function);
+        let useful_instructions = all_instructions - loads - stores - LOOP_BOILERPLATE_COUNT;
+        useful_instructions * num_iterations < all_instructions
+    }
 }
 
 /// Return the induction value of the current iteration of the loop, from the given block's jmp arguments.
@@ -960,6 +1004,16 @@ mod tests {
         assert_eq!(all, 7);
     }
 
+    #[test]
+    fn test_is_small_loop() {
+        let ssa = brillig_unroll_test_case();
+        let function = ssa.main();
+        let mut loops = Loops::find_all(function);
+        let loop0 = loops.yet_to_unroll.pop().unwrap();
+
+        assert!(loop0.is_small_loop(function, &loops.cfg));
+    }
+
     /// Simple test loop:
     /// ```text
     /// unconstrained fn main(sum: u32) {

From b35fa25d03164a2cad907c8e8c7b5d8a3775ee98 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 16:44:45 +0000
Subject: [PATCH 13/25] Unroll small Brillig loops

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 193 +++++++++++++++++-
 1 file changed, 186 insertions(+), 7 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 1077347628f..ca90b6d101f 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -209,8 +209,9 @@ impl Loops {
         let mut unroll_errors = vec![];
         while let Some(next_loop) = self.yet_to_unroll.pop() {
             if function.runtime().is_brillig() {
-                // TODO (#6470): Decide whether to unroll this loop.
-                continue;
+                if !next_loop.is_small_loop(function, &self.cfg) {
+                    continue;
+                }
             }
             // If we've previously modified a block in this loop we need to refresh the context.
             // This happens any time we have nested loops.
@@ -593,21 +594,54 @@ impl Loop {
     /// of unrolled instructions times the number of iterations would result in smaller bytecode
     /// than if we keep the loops with their overheads.
     fn is_small_loop(&self, function: &Function, cfg: &ControlFlowGraph) -> bool {
+        self.boilerplate_stats(function, cfg).map(|s| s.is_small()).unwrap_or_default()
+    }
+
+    /// Collect boilerplate stats if we can figure out the upper and lower bounds of the loop.
+    fn boilerplate_stats(
+        &self,
+        function: &Function,
+        cfg: &ControlFlowGraph,
+    ) -> Option<BoilerplateStats> {
         let Ok(Some((lower, upper))) = self.get_const_bounds(function, cfg) else {
-            return false;
+            return None;
         };
         let Some(lower) = lower.try_to_u64() else {
-            return false;
+            return None;
         };
         let Some(upper) = upper.try_to_u64() else {
-            return false;
+            return None;
         };
-        let num_iterations = (upper - lower) as usize;
         let refs = self.find_pre_header_reference_values(function, cfg);
         let (loads, stores) = self.count_loads_and_stores(function, &refs);
         let all_instructions = self.count_all_instructions(function);
         let useful_instructions = all_instructions - loads - stores - LOOP_BOILERPLATE_COUNT;
-        useful_instructions * num_iterations < all_instructions
+        Some(BoilerplateStats {
+            iterations: (upper - lower) as usize,
+            loads,
+            stores,
+            all_instructions,
+            useful_instructions,
+        })
+    }
+}
+
+#[derive(Debug)]
+struct BoilerplateStats {
+    iterations: usize,
+    loads: usize,
+    stores: usize,
+    all_instructions: usize,
+    useful_instructions: usize,
+}
+
+impl BoilerplateStats {
+    /// A small loop is where if we unroll it into the pre-header then considering the
+    /// number of iterations we still end up with a smaller bytecode than if we leave
+    /// the blocks in tact with all the boilerplate involved in jumping, and the extra
+    /// reference access instructions.
+    fn is_small(&self) -> bool {
+        self.useful_instructions * self.iterations < self.all_instructions
     }
 }
 
@@ -1014,6 +1048,105 @@ mod tests {
         assert!(loop0.is_small_loop(function, &loops.cfg));
     }
 
+    #[test]
+    fn test_brillig_unroll_small_loop() {
+        let ssa = brillig_unroll_test_case();
+
+        // Example taken from an equivalent ACIR program (ie. remove the `unconstrained`) and run
+        // `cargo run -q -p nargo_cli --  --program-dir . compile --show-ssa`
+        let expected = "
+        brillig(inline) fn main f0 {
+          b0(v0: u32):
+            v1 = allocate -> &mut u32
+            store u32 0 at v1
+            v3 = load v1 -> u32
+            store v3 at v1
+            v4 = load v1 -> u32
+            v6 = add v4, u32 1
+            store v6 at v1
+            v7 = load v1 -> u32
+            v9 = add v7, u32 2
+            store v9 at v1
+            v10 = load v1 -> u32
+            v12 = add v10, u32 3
+            store v12 at v1
+            jmp b1()
+          b1():
+            v13 = load v1 -> u32
+            v14 = eq v13, v0
+            constrain v13 == v0
+            return
+        }
+        ";
+
+        let (ssa, errors) = ssa.try_unroll_loops();
+        assert_eq!(errors.len(), 0, "Unroll should have no errors");
+        assert_eq!(ssa.main().reachable_blocks().len(), 2, "The loop should be unrolled");
+
+        assert_normalized_ssa_equals(ssa, expected);
+    }
+
+    #[test]
+    fn test_brillig_unroll_6470_small() {
+        // Few enough iterations so that we can perform the unroll.
+        let ssa = brillig_unroll_test_case_6470(3);
+        let (ssa, errors) = ssa.try_unroll_loops();
+        assert_eq!(errors.len(), 0, "Unroll should have no errors");
+        assert_eq!(ssa.main().reachable_blocks().len(), 2, "The loop should be unrolled");
+
+        // The IDs are shifted by one compared to what the ACIR version printed.
+        let expected = "
+        brillig(inline) fn __validate_gt_remainder f0 {
+          b0(v0: [u64; 6]):
+            inc_rc v0
+            inc_rc [u64 0, u64 0, u64 0, u64 0, u64 0, u64 0] of u64
+            v3 = allocate -> &mut [u64; 6]
+            store [u64 0, u64 0, u64 0, u64 0, u64 0, u64 0] of u64 at v3
+            v5 = load v3 -> [u64; 6]
+            v7 = array_get v0, index u32 0 -> u64
+            v9 = add v7, u64 1
+            v10 = array_set v5, index u32 0, value v9
+            store v10 at v3
+            v11 = load v3 -> [u64; 6]
+            v13 = array_get v0, index u32 1 -> u64
+            v14 = add v13, u64 1
+            v15 = array_set v11, index u32 1, value v14
+            store v15 at v3
+            v16 = load v3 -> [u64; 6]
+            v18 = array_get v0, index u32 2 -> u64
+            v19 = add v18, u64 1
+            v20 = array_set v16, index u32 2, value v19
+            store v20 at v3
+            jmp b1()
+          b1():
+            v21 = load v3 -> [u64; 6]
+            dec_rc v0
+            return v21
+        }
+        ";
+        assert_normalized_ssa_equals(ssa, expected);
+    }
+
+    #[test]
+    fn test_brillig_unroll_6470_large() {
+        // More iterations than it can unroll
+        let ssa = brillig_unroll_test_case_6470(6);
+
+        let function = ssa.main();
+        let mut loops = Loops::find_all(function);
+        let loop0 = loops.yet_to_unroll.pop().unwrap();
+        let stats = loop0.boilerplate_stats(function, &loops.cfg).unwrap();
+        assert_eq!(stats.is_small(), false);
+
+        let (ssa, errors) = ssa.try_unroll_loops();
+        assert_eq!(errors.len(), 0, "Unroll should have no errors");
+        assert_eq!(
+            ssa.main().reachable_blocks().len(),
+            4,
+            "The loop should be considered too costly to unroll"
+        );
+    }
+
     /// Simple test loop:
     /// ```text
     /// unconstrained fn main(sum: u32) {
@@ -1054,4 +1187,50 @@ mod tests {
         ";
         Ssa::from_str(src).unwrap()
     }
+
+    /// Test case from #6470:
+    /// ```text
+    /// unconstrained fn __validate_gt_remainder(a_u60: [u64; 6]) -> [u64; 6] {
+    ///     let mut result_u60: [u64; 6] = [0; 6];
+    ///
+    ///     for i in 0..6 {
+    ///         result_u60[i] = a_u60[i] + 1;
+    ///     }
+    ///
+    ///     result_u60
+    /// }
+    /// ```
+    /// The `num_iterations` parameter can be used to make it more costly to inline.
+    fn brillig_unroll_test_case_6470(num_iterations: usize) -> Ssa {
+        let src = format!(
+            "
+        // After `static_assert` and `assert_constant`:
+        brillig(inline) fn __validate_gt_remainder f0 {{
+          b0(v0: [u64; 6]):
+            inc_rc v0
+            inc_rc [u64 0, u64 0, u64 0, u64 0, u64 0, u64 0] of u64
+            v4 = allocate -> &mut [u64; 6]
+            store [u64 0, u64 0, u64 0, u64 0, u64 0, u64 0] of u64 at v4
+            jmp b1(u32 0)
+          b1(v1: u32):
+            v7 = lt v1, u32 {num_iterations}
+            jmpif v7 then: b3, else: b2
+          b3():
+            v9 = load v4 -> [u64; 6]
+            v10 = array_get v0, index v1 -> u64
+            v12 = add v10, u64 1
+            v13 = array_set v9, index v1, value v12
+            v15 = add v1, u32 1
+            store v13 at v4
+            v16 = add v1, u32 1
+            jmp b1(v16)
+          b2():
+            v8 = load v4 -> [u64; 6]
+            dec_rc v0
+            return v8
+        }}
+        "
+        );
+        Ssa::from_str(&src).unwrap()
+    }
 }

From 7ffda9527cf62bce0552e9226b6be142ff4586b6 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 16:50:32 +0000
Subject: [PATCH 14/25] Fix clippy

---
 compiler/noirc_evaluator/src/ssa/opt/unrolling.rs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index ca90b6d101f..58664a6f1fd 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -208,10 +208,8 @@ impl Loops {
     fn unroll_each(mut self, function: &mut Function) -> Vec<RuntimeError> {
         let mut unroll_errors = vec![];
         while let Some(next_loop) = self.yet_to_unroll.pop() {
-            if function.runtime().is_brillig() {
-                if !next_loop.is_small_loop(function, &self.cfg) {
-                    continue;
-                }
+            if function.runtime().is_brillig() && !next_loop.is_small_loop(function, &self.cfg) {
+                continue;
             }
             // If we've previously modified a block in this loop we need to refresh the context.
             // This happens any time we have nested loops.
@@ -1136,7 +1134,7 @@ mod tests {
         let mut loops = Loops::find_all(function);
         let loop0 = loops.yet_to_unroll.pop().unwrap();
         let stats = loop0.boilerplate_stats(function, &loops.cfg).unwrap();
-        assert_eq!(stats.is_small(), false);
+        assert!(!stats.is_small(), "the loop should be considered large");
 
         let (ssa, errors) = ssa.try_unroll_loops();
         assert_eq!(errors.len(), 0, "Unroll should have no errors");

From 4aed51a2b02db750b15ec123de8594edefddb3be Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 20:38:01 +0000
Subject: [PATCH 15/25] Test stats, count duplicate increments

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 142 +++++++++++++-----
 1 file changed, 105 insertions(+), 37 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 58664a6f1fd..b233152383a 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -41,29 +41,6 @@ use crate::{
 };
 use fxhash::FxHashMap as HashMap;
 
-/// Number of instructions which are complete loop boilerplate,
-/// the ones facilitating jumps and the increment of the loop
-/// variable.
-///
-/// All the instructions in the following example are boilerplate:
-/// ```text
-/// brillig(inline) fn main f0 {
-///   b0(v0: u32):
-///     ...
-///     jmp b1(u32 0)
-///   b1(v1: u32):
-///     v5 = lt v1, u32 4
-///     jmpif v5 then: b3, else: b2
-///   b3():
-///     ...
-///     v11 = add v1, u32 1
-///     jmp b1(v11)
-///   b2():
-///     ...
-/// }
-/// ```
-const LOOP_BOILERPLATE_COUNT: usize = 5;
-
 impl Ssa {
     /// Loop unrolling can return errors, since ACIR functions need to be fully unrolled.
     /// This meta-pass will keep trying to unroll loops and simplifying the SSA until no more errors are found.
@@ -276,7 +253,7 @@ impl Loop {
     ///   b0(v0: u32):                  // Pre-header
     ///     ...
     ///     jmp b1(u32 0)               // Lower-bound
-    ///   b1(v1: u32):
+    ///   b1(v1: u32):                  // Induction variable
     ///     v5 = lt v1, u32 4
     ///     jmpif v5 then: b3, else: b2
     /// ```
@@ -588,6 +565,25 @@ impl Loop {
             .sum()
     }
 
+    /// Count the number of increments to the induction variable.
+    /// It should be one, but it can be duplicated.
+    /// The increment should be in the block where the back-edge was found.
+    fn count_induction_increments(&self, function: &Function) -> usize {
+        let back = &function.dfg[self.back_edge_start];
+        let header = &function.dfg[self.header];
+        let induction_var = header.parameters()[0];
+
+        let mut increments = 0;
+        for instruction in back.instructions() {
+            let instruction = &function.dfg[*instruction];
+            if matches!(instruction, Instruction::Binary(Binary { lhs, operator: BinaryOp::Add, rhs: _ }) if *lhs == induction_var)
+            {
+                increments += 1;
+            }
+        }
+        increments
+    }
+
     /// Decide if this loop is small enough that it can be inlined in a way that the number
     /// of unrolled instructions times the number of iterations would result in smaller bytecode
     /// than if we keep the loops with their overheads.
@@ -612,34 +608,77 @@ impl Loop {
         };
         let refs = self.find_pre_header_reference_values(function, cfg);
         let (loads, stores) = self.count_loads_and_stores(function, &refs);
+        let increments = self.count_induction_increments(function);
         let all_instructions = self.count_all_instructions(function);
-        let useful_instructions = all_instructions - loads - stores - LOOP_BOILERPLATE_COUNT;
+
         Some(BoilerplateStats {
             iterations: (upper - lower) as usize,
             loads,
             stores,
+            increments,
             all_instructions,
-            useful_instructions,
         })
     }
 }
 
+/// All the instructions in the following example are boilerplate:
+/// ```text
+/// brillig(inline) fn main f0 {
+///   b0(v0: u32):
+///     ...
+///     jmp b1(u32 0)
+///   b1(v1: u32):
+///     v5 = lt v1, u32 4
+///     jmpif v5 then: b3, else: b2
+///   b3():
+///     ...
+///     v11 = add v1, u32 1
+///     jmp b1(v11)
+///   b2():
+///     ...
+/// }
+/// ```
+const LOOP_BOILERPLATE_COUNT: usize = 5;
 #[derive(Debug)]
 struct BoilerplateStats {
+    /// Number of iterations in the loop.
     iterations: usize,
+    /// Number of loads  pre-header references in the loop.
     loads: usize,
+    /// Number of stores into pre-header references in the loop.
     stores: usize,
+    /// Number of increments to the induction variable (might be duplicated).
+    increments: usize,
+    /// Number of instructions in the loop, including boilerplate,
+    /// but excluding the boilerplate which is outside the loop.
     all_instructions: usize,
-    useful_instructions: usize,
 }
 
 impl BoilerplateStats {
+    /// Instruction count if we leave the loop as-is.
+    /// It's the instructions in the loop, plus the one to kick it off in the pre-header.
+    fn baseline_instructions(&self) -> usize {
+        self.all_instructions + 1
+    }
+
+    /// Estimated number of _useful_ instructions, which is the ones in the loop
+    /// minus all in-loop boilerplate.
+    fn useful_instructions(&self) -> usize {
+        let boilerplate = 3; // Two jumps + plus the comparison with the upper bound
+        self.all_instructions - self.loads - self.stores - self.increments - boilerplate
+    }
+
+    /// Estimated number of instructions if we unroll the loop.
+    fn unrolled_instructions(&self) -> usize {
+        self.useful_instructions() * self.iterations
+    }
+
     /// A small loop is where if we unroll it into the pre-header then considering the
     /// number of iterations we still end up with a smaller bytecode than if we leave
     /// the blocks in tact with all the boilerplate involved in jumping, and the extra
     /// reference access instructions.
     fn is_small(&self) -> bool {
-        self.useful_instructions * self.iterations < self.all_instructions
+        self.unrolled_instructions() < self.baseline_instructions()
     }
 }
 
@@ -862,7 +901,7 @@ impl<'f> LoopIteration<'f> {
         // instances of the induction variable or any values that were changed as a result
         // of the new induction variable value.
         for instruction in instructions {
-            if self.skip_ref_counts && self.is_ref_count(instruction) {
+            if self.skip_ref_counts && self.is_refcount(instruction) {
                 continue;
             }
             self.inserter.push_instruction(instruction, self.insert_block);
@@ -878,7 +917,8 @@ impl<'f> LoopIteration<'f> {
         self.inserter.function.dfg.set_block_terminator(self.insert_block, terminator);
     }
 
-    fn is_ref_count(&self, instruction: InstructionId) -> bool {
+    /// Is the instruction an `Rc`?
+    fn is_refcount(&self, instruction: InstructionId) -> bool {
         matches!(
             self.dfg()[instruction],
             Instruction::IncrementRc { .. } | Instruction::DecrementRc { .. }
@@ -900,7 +940,7 @@ mod tests {
 
     use crate::ssa::{ir::value::ValueId, opt::assert_normalized_ssa_equals, Ssa};
 
-    use super::Loops;
+    use super::{BoilerplateStats, Loops};
 
     #[test]
     fn unroll_nested_loops() {
@@ -1037,15 +1077,34 @@ mod tests {
     }
 
     #[test]
-    fn test_is_small_loop() {
+    fn test_boilerplate_stats() {
         let ssa = brillig_unroll_test_case();
-        let function = ssa.main();
-        let mut loops = Loops::find_all(function);
-        let loop0 = loops.yet_to_unroll.pop().unwrap();
+        let stats = loop0_stats(&ssa);
+        assert_eq!(stats.iterations, 4);
+        assert_eq!(stats.all_instructions, 2 + 5); // Instructions in b1 and b3
+        assert_eq!(stats.increments, 1);
+        assert_eq!(stats.loads, 1);
+        assert_eq!(stats.stores, 1);
+        assert_eq!(stats.useful_instructions(), 1); // Adding to sum
+        assert_eq!(stats.baseline_instructions(), 8);
+        assert!(stats.is_small());
+    }
 
-        assert!(loop0.is_small_loop(function, &loops.cfg));
+    #[test]
+    fn test_boilerplate_stats_6470() {
+        let ssa = brillig_unroll_test_case_6470(3);
+        let stats = loop0_stats(&ssa);
+        assert_eq!(stats.iterations, 3);
+        assert_eq!(stats.all_instructions, 2 + 8); // Instructions in b1 and b3
+        assert_eq!(stats.increments, 2);
+        assert_eq!(stats.loads, 1);
+        assert_eq!(stats.stores, 1);
+        assert_eq!(stats.useful_instructions(), 3); // array get, add, array set
+        assert_eq!(stats.baseline_instructions(), 11);
+        assert!(stats.is_small());
     }
 
+    /// Test that we can unroll a small loop.
     #[test]
     fn test_brillig_unroll_small_loop() {
         let ssa = brillig_unroll_test_case();
@@ -1084,6 +1143,7 @@ mod tests {
         assert_normalized_ssa_equals(ssa, expected);
     }
 
+    /// Test that we can unroll the loop in the ticket if we don't have too many iterations.
     #[test]
     fn test_brillig_unroll_6470_small() {
         // Few enough iterations so that we can perform the unroll.
@@ -1125,6 +1185,7 @@ mod tests {
         assert_normalized_ssa_equals(ssa, expected);
     }
 
+    /// Test that with more iterations it's not unrolled.
     #[test]
     fn test_brillig_unroll_6470_large() {
         // More iterations than it can unroll
@@ -1218,7 +1279,7 @@ mod tests {
             v10 = array_get v0, index v1 -> u64
             v12 = add v10, u64 1
             v13 = array_set v9, index v1, value v12
-            v15 = add v1, u32 1
+            v15 = add v1, u32 1                        // duplicate unused increment
             store v13 at v4
             v16 = add v1, u32 1
             jmp b1(v16)
@@ -1231,4 +1292,11 @@ mod tests {
         );
         Ssa::from_str(&src).unwrap()
     }
+
+    fn loop0_stats(ssa: &Ssa) -> BoilerplateStats {
+        let function = ssa.main();
+        let mut loops = Loops::find_all(function);
+        let loop0 = loops.yet_to_unroll.pop().expect("there should be a loop");
+        loop0.boilerplate_stats(function, &loops.cfg).expect("there should be stats")
+    }
 }

From d2adf53e9cefb6b2909baf549add108425bc4339 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 20:54:38 +0000
Subject: [PATCH 16/25] Handle 0..1

---
 compiler/noirc_evaluator/src/ssa/opt/unrolling.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index b233152383a..756ba93279b 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -293,6 +293,13 @@ impl Loop {
             Instruction::Binary(Binary { lhs: _, operator: BinaryOp::Lt, rhs }) => {
                 function.dfg.get_numeric_constant(*rhs)
             }
+            Instruction::Binary(Binary { lhs: _, operator: BinaryOp::Eq, rhs }) => {
+                // `for i in 0..1` is turned into:
+                // b1(v0: u32):
+                //   v12 = eq v0, u32 0
+                //   jmpif v12 then: b3, else: b2
+                function.dfg.get_numeric_constant(*rhs).map(|c| c + FieldElement::one())
+            }
             other => panic!("Unexpected instruction in header: {other:?}"),
         }
     }

From 7cd33fe1a3f758a308d8fdd928c24130f840b4d5 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 21:14:00 +0000
Subject: [PATCH 17/25] No panic on break and continue

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 98 ++++++++++++++++---
 1 file changed, 82 insertions(+), 16 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 756ba93279b..715f01614fb 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -262,7 +262,7 @@ impl Loop {
         function: &Function,
         cfg: &ControlFlowGraph,
     ) -> Result<Option<FieldElement>, CallStack> {
-        let pre_header = self.get_pre_header(cfg);
+        let pre_header = self.get_pre_header(function, cfg)?;
         let jump_value = get_induction_variable(function, pre_header)?;
         Ok(function.dfg.get_numeric_constant(jump_value))
     }
@@ -378,7 +378,7 @@ impl Loop {
     /// When e.g. `v8 = lt v5 v1` cannot be evaluated to a constant, the loop signals by returning `Err`
     /// that a few SSA passes are required to evaluate and simplify these values.
     fn unroll(&self, function: &mut Function, cfg: &ControlFlowGraph) -> Result<(), CallStack> {
-        let mut unroll_into = self.get_pre_header(cfg);
+        let mut unroll_into = self.get_pre_header(function, cfg)?;
         let mut jump_value = get_induction_variable(function, unroll_into)?;
 
         while let Some(context) = self.unroll_header(function, unroll_into, jump_value)? {
@@ -393,14 +393,27 @@ impl Loop {
     /// The loop pre-header is the block that comes before the loop begins. Generally a header block
     /// is expected to have 2 predecessors: the pre-header and the final block of the loop which jumps
     /// back to the beginning. Other predecessors can come from `break` or `continue`.
-    fn get_pre_header(&self, cfg: &ControlFlowGraph) -> BasicBlockId {
+    fn get_pre_header(
+        &self,
+        function: &Function,
+        cfg: &ControlFlowGraph,
+    ) -> Result<BasicBlockId, CallStack> {
         let mut pre_header = cfg
             .predecessors(self.header)
             .filter(|predecessor| *predecessor != self.back_edge_start)
             .collect::<Vec<_>>();
 
-        assert_eq!(pre_header.len(), 1);
-        pre_header.remove(0)
+        if function.runtime().is_acir() {
+            assert_eq!(pre_header.len(), 1);
+            Ok(pre_header.remove(0))
+        } else {
+            if pre_header.len() == 1 {
+                Ok(pre_header.remove(0))
+            } else {
+                // We can come back into the header from multiple blocks, so we can't unroll this.
+                Err(CallStack::new())
+            }
+        }
     }
 
     /// Unrolls the header block of the loop. This is the block that dominates all other blocks in the
@@ -508,9 +521,9 @@ impl Loop {
         &self,
         function: &Function,
         cfg: &ControlFlowGraph,
-    ) -> HashSet<ValueId> {
+    ) -> Result<HashSet<ValueId>, CallStack> {
         // We need to traverse blocks from the pre-header up to the block entry point.
-        let pre_header = self.get_pre_header(cfg);
+        let pre_header = self.get_pre_header(function, cfg)?;
         let function_entry = function.entry_block();
 
         // The algorithm to collect blocks in the loop is fine for us.
@@ -532,7 +545,7 @@ impl Loop {
         let params =
             function.parameters().iter().filter(|p| function.dfg.value_is_reference(**p)).copied();
 
-        params.chain(allocations).collect()
+        Ok(params.chain(allocations).collect())
     }
 
     /// Count the number of load and store instructions of specific variables in the loop.
@@ -598,7 +611,8 @@ impl Loop {
         self.boilerplate_stats(function, cfg).map(|s| s.is_small()).unwrap_or_default()
     }
 
-    /// Collect boilerplate stats if we can figure out the upper and lower bounds of the loop.
+    /// Collect boilerplate stats if we can figure out the upper and lower bounds of the loop,
+    /// and the loop doesn't have multiple back-edges from breaks and continues.
     fn boilerplate_stats(
         &self,
         function: &Function,
@@ -613,7 +627,9 @@ impl Loop {
         let Some(upper) = upper.try_to_u64() else {
             return None;
         };
-        let refs = self.find_pre_header_reference_values(function, cfg);
+        let Ok(refs) = self.find_pre_header_reference_values(function, cfg) else {
+            return None;
+        };
         let (loads, stores) = self.count_loads_and_stores(function, &refs);
         let increments = self.count_induction_increments(function);
         let all_instructions = self.count_all_instructions(function);
@@ -1071,7 +1087,7 @@ mod tests {
         let mut loops = Loops::find_all(function);
         let loop0 = loops.yet_to_unroll.pop().unwrap();
 
-        let refs = loop0.find_pre_header_reference_values(function, &loops.cfg);
+        let refs = loop0.find_pre_header_reference_values(function, &loops.cfg).unwrap();
         assert_eq!(refs.len(), 1);
         assert!(refs.contains(&ValueId::new(2)));
 
@@ -1197,11 +1213,7 @@ mod tests {
     fn test_brillig_unroll_6470_large() {
         // More iterations than it can unroll
         let ssa = brillig_unroll_test_case_6470(6);
-
-        let function = ssa.main();
-        let mut loops = Loops::find_all(function);
-        let loop0 = loops.yet_to_unroll.pop().unwrap();
-        let stats = loop0.boilerplate_stats(function, &loops.cfg).unwrap();
+        let stats = loop0_stats(&ssa);
         assert!(!stats.is_small(), "the loop should be considered large");
 
         let (ssa, errors) = ssa.try_unroll_loops();
@@ -1213,6 +1225,60 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_brillig_unroll_break_and_continue() {
+        // unconstrained fn main() {
+        //     let mut count = 0;
+        //     for i in 0..10 {
+        //         if i == 2 {
+        //             continue;
+        //         }
+        //         if i == 5 {
+        //             break;
+        //         }
+        //         count += 1;
+        //     }
+        //     assert(count == 4);
+        // }
+        let src = "
+        brillig(inline) fn main f0 {
+          b0():
+            v1 = allocate -> &mut Field
+            store Field 0 at v1
+            jmp b1(u32 0)
+          b1(v0: u32):
+            v5 = lt v0, u32 10
+            jmpif v5 then: b2, else: b6
+          b2():
+            v7 = eq v0, u32 2
+            jmpif v7 then: b7, else: b3
+          b7():
+            v18 = add v0, u32 1
+            jmp b1(v18)
+          b3():
+            v9 = eq v0, u32 5
+            jmpif v9 then: b5, else: b4
+          b5():
+            jmp b6()
+          b6():
+            v15 = load v1 -> Field
+            v17 = eq v15, Field 4
+            constrain v15 == Field 4
+            return
+          b4():
+            v10 = load v1 -> Field
+            v12 = add v10, Field 1
+            store v12 at v1
+            v14 = add v0, u32 1
+            jmp b1(v14)
+        }
+        ";
+        let ssa = Ssa::from_str(src).unwrap();
+        let (ssa, errors) = ssa.try_unroll_loops();
+        assert_eq!(errors.len(), 0, "Unroll should have no errors");
+        assert_eq!(ssa.main().reachable_blocks().len(), 8, "Nothing should be unrolled");
+    }
+
     /// Simple test loop:
     /// ```text
     /// unconstrained fn main(sum: u32) {

From 030d794068e68058db229e4f2c37dffe9d11bcef Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 21:38:53 +0000
Subject: [PATCH 18/25] Fix clippy

---
 compiler/noirc_evaluator/src/ssa/opt/unrolling.rs | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 3900fed4e92..134514a40cc 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -408,13 +408,11 @@ impl Loop {
         if function.runtime().is_acir() {
             assert_eq!(pre_header.len(), 1);
             Ok(pre_header.remove(0))
+        } else if pre_header.len() == 1 {
+            Ok(pre_header.remove(0))
         } else {
-            if pre_header.len() == 1 {
-                Ok(pre_header.remove(0))
-            } else {
-                // We can come back into the header from multiple blocks, so we can't unroll this.
-                Err(CallStack::new())
-            }
+            // We can come back into the header from multiple blocks, so we can't unroll this.
+            Err(CallStack::new())
         }
     }
 

From 7b0face111ef96b8d0edf8877be948c89ed85514 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 21:43:01 +0000
Subject: [PATCH 19/25] Remove leftover method after merge

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 33 +++++--------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 134514a40cc..e02e3f55a35 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -72,18 +72,6 @@ impl Ssa {
         }
         Ok(ssa)
     }
-
-    /// Tries to unroll all loops in each SSA function.
-    /// If any loop cannot be unrolled, it is left as-is or in a partially unrolled state.
-    /// Returns the ssa along with all unrolling errors encountered
-    #[tracing::instrument(level = "trace", skip(self))]
-    fn try_unroll_loops(mut self) -> (Ssa, Vec<RuntimeError>) {
-        let mut errors = vec![];
-        for function in self.functions.values_mut() {
-            errors.extend(function.try_unroll_loops());
-        }
-        (self, errors)
-    }
 }
 
 impl Function {
@@ -593,15 +581,10 @@ impl Loop {
         let header = &function.dfg[self.header];
         let induction_var = header.parameters()[0];
 
-        let mut increments = 0;
-        for instruction in back.instructions() {
-            let instruction = &function.dfg[*instruction];
-            if matches!(instruction, Instruction::Binary(Binary { lhs, operator: BinaryOp::Add, rhs: _ }) if *lhs == induction_var)
-            {
-                increments += 1;
-            }
-        }
-        increments
+        back.instructions().iter().filter(|instruction|  {
+            let instruction = &function.dfg[**instruction];
+            matches!(instruction, Instruction::Binary(Binary { lhs, operator: BinaryOp::Add, rhs: _ }) if *lhs == induction_var)
+        }).count()
     }
 
     /// Decide if this loop is small enough that it can be inlined in a way that the number
@@ -1170,7 +1153,7 @@ mod tests {
         }
         ";
 
-        let (ssa, errors) = ssa.try_unroll_loops();
+        let (ssa, errors) = try_unroll_loops(ssa);
         assert_eq!(errors.len(), 0, "Unroll should have no errors");
         assert_eq!(ssa.main().reachable_blocks().len(), 2, "The loop should be unrolled");
 
@@ -1182,7 +1165,7 @@ mod tests {
     fn test_brillig_unroll_6470_small() {
         // Few enough iterations so that we can perform the unroll.
         let ssa = brillig_unroll_test_case_6470(3);
-        let (ssa, errors) = ssa.try_unroll_loops();
+        let (ssa, errors) = try_unroll_loops(ssa);
         assert_eq!(errors.len(), 0, "Unroll should have no errors");
         assert_eq!(ssa.main().reachable_blocks().len(), 2, "The loop should be unrolled");
 
@@ -1227,7 +1210,7 @@ mod tests {
         let stats = loop0_stats(&ssa);
         assert!(!stats.is_small(), "the loop should be considered large");
 
-        let (ssa, errors) = ssa.try_unroll_loops();
+        let (ssa, errors) = try_unroll_loops(ssa);
         assert_eq!(errors.len(), 0, "Unroll should have no errors");
         assert_eq!(
             ssa.main().reachable_blocks().len(),
@@ -1285,7 +1268,7 @@ mod tests {
         }
         ";
         let ssa = Ssa::from_str(src).unwrap();
-        let (ssa, errors) = ssa.try_unroll_loops();
+        let (ssa, errors) = try_unroll_loops(ssa);
         assert_eq!(errors.len(), 0, "Unroll should have no errors");
         assert_eq!(ssa.main().reachable_blocks().len(), 8, "Nothing should be unrolled");
     }

From efd763cb77b3709948ef54ae534cbc3f1a8de0e9 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Wed, 13 Nov 2024 22:46:18 +0000
Subject: [PATCH 20/25] Remove leftover const

---
 compiler/noirc_evaluator/src/ssa/opt/unrolling.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index e02e3f55a35..1fb88655a59 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -319,7 +319,7 @@ impl Loop {
     /// ```text
     /// main():
     ///   v0 = 0
-    ///   v1 = 3
+    ///   v1 = 2
     ///   jmp loop_entry(v0)
     /// loop_entry(i: Field):
     ///   v2 = lt i v1
@@ -503,7 +503,7 @@ impl Loop {
     /// ```
     ///
     /// We won't always find load _and_ store ops (e.g. the push above doesn't come with a store),
-    /// but it's likely that we `mem2reg` could eliminate a lot of the loads we can find, so we can
+    /// but it's likely that mem2reg could eliminate a lot of the loads we can find, so we can
     /// use this as an approximation of the gains we would see.
     fn find_pre_header_reference_values(
         &self,
@@ -644,7 +644,6 @@ impl Loop {
 ///     ...
 /// }
 /// ```
-const LOOP_BOILERPLATE_COUNT: usize = 5;
 #[derive(Debug)]
 struct BoilerplateStats {
     /// Number of iterations in the loop.

From 7cd6f3aa3a37db0f40fbb0227d91dbfcfebd4844 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Thu, 14 Nov 2024 19:00:40 +0000
Subject: [PATCH 21/25] Be more conservative and only assume mem2reg can get
 rid of load+store pairs

---
 compiler/noirc_evaluator/src/ssa/opt/unrolling.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 1fb88655a59..297b2846ed4 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -46,6 +46,7 @@ impl Ssa {
     /// This meta-pass will keep trying to unroll loops and simplifying the SSA until no more errors are found.
     #[tracing::instrument(level = "trace", skip(ssa))]
     pub(crate) fn unroll_loops_iteratively(mut ssa: Ssa) -> Result<Ssa, RuntimeError> {
+        ssa.normalize_ids();
         for (_, function) in ssa.functions.iter_mut() {
             // Try to unroll loops first:
             let mut unroll_errors = function.try_unroll_loops();
@@ -669,8 +670,12 @@ impl BoilerplateStats {
     /// Estimated number of _useful_ instructions, which is the ones in the loop
     /// minus all in-loop boilerplate.
     fn useful_instructions(&self) -> usize {
-        let boilerplate = 3; // Two jumps + plus the comparison with the upper bound
-        self.all_instructions - self.loads - self.stores - self.increments - boilerplate
+        // Two jumps + plus the comparison with the upper bound
+        let boilerplate = 3;
+        // Be conservative and only assume that mem2reg gets rid of load followed by store.
+        // NB we have not checked that these are actual pairs.
+        let load_and_store = self.loads.min(self.stores) * 2;
+        self.all_instructions - self.increments - load_and_store - boilerplate
     }
 
     /// Estimated number of instructions if we unroll the loop.

From 4050a85191a2857235e78861a9e90fb8b595be26 Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Mon, 18 Nov 2024 12:25:41 +0000
Subject: [PATCH 22/25] Fix comments, remove skip_ref_counts

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 26 +++++++------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 482faf839d4..19cea499f12 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -514,7 +514,8 @@ impl Loop {
         let pre_header = self.get_pre_header(function, cfg)?;
         let function_entry = function.entry_block();
 
-        // The algorithm to collect blocks in the loop is fine for us.
+        // The algorithm in `find_blocks_in_loop` expects to collect the blocks between the header and the back-edge of the loop,
+        // but technically works the same if we go from the pre-header up to the function entry as well.
         let blocks = Self::find_blocks_in_loop(function_entry, pre_header, cfg).blocks;
 
         // Collect allocations in all blocks above the header.
@@ -748,9 +749,6 @@ struct LoopIteration<'f> {
     /// This is None until we visit the block which jumps back to the start of the
     /// loop, at which point we record its value and the block it was found in.
     induction_value: Option<(BasicBlockId, ValueId)>,
-
-    /// Whether to discard reference count instructions, which ACIR doesn't need but Brillig does.
-    skip_ref_counts: bool,
 }
 
 impl<'f> LoopIteration<'f> {
@@ -761,7 +759,6 @@ impl<'f> LoopIteration<'f> {
         source_block: BasicBlockId,
     ) -> Self {
         Self {
-            skip_ref_counts: function.runtime().is_acir(),
             inserter: FunctionInserter::new(function),
             loop_,
             insert_block,
@@ -822,16 +819,12 @@ impl<'f> LoopIteration<'f> {
                 then_destination,
                 else_destination,
                 call_stack,
-            } =>
-            // Check where the loop body wants to jump (e.g if-then blocks).
-            {
-                self.handle_jmpif(
-                    *condition,
-                    *then_destination,
-                    *else_destination,
-                    call_stack.clone(),
-                )
-            }
+            } => self.handle_jmpif(
+                *condition,
+                *then_destination,
+                *else_destination,
+                call_stack.clone(),
+            ),
             TerminatorInstruction::Jmp { destination, arguments, call_stack: _ } => {
                 if self.get_original_block(*destination) == self.loop_.header {
                     // We found the back-edge of the loop.
@@ -910,7 +903,8 @@ impl<'f> LoopIteration<'f> {
         // instances of the induction variable or any values that were changed as a result
         // of the new induction variable value.
         for instruction in instructions {
-            if self.skip_ref_counts && self.is_refcount(instruction) {
+            // Reference counting is only used by Brillig, ACIR doesn't need them.
+            if self.inserter.function.runtime().is_acir() && self.is_refcount(instruction) {
                 continue;
             }
             self.inserter.push_instruction(instruction, self.insert_block);

From bc84d48f189e034f10e6394e9e2a62e97ae17b2b Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Mon, 18 Nov 2024 12:43:27 +0000
Subject: [PATCH 23/25] Compare agains original SSA source when testing that
 nothing is unrolled

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 19cea499f12..9a588a96798 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -1123,8 +1123,8 @@ mod tests {
     fn test_brillig_unroll_small_loop() {
         let ssa = brillig_unroll_test_case();
 
-        // Example taken from an equivalent ACIR program (ie. remove the `unconstrained`) and run
-        // `cargo run -q -p nargo_cli --  --program-dir . compile --show-ssa`
+        // Expectation taken by compiling the Noir program as ACIR,
+        // ie. by removing the `unconstrained` from `main`.
         let expected = "
         brillig(inline) fn main f0 {
           b0(v0: u32):
@@ -1203,19 +1203,17 @@ mod tests {
     #[test]
     fn test_brillig_unroll_6470_large() {
         // More iterations than it can unroll
-        let ssa = brillig_unroll_test_case_6470(6);
+        let parse_ssa = || brillig_unroll_test_case_6470(6);
+        let ssa = parse_ssa();
         let stats = loop0_stats(&ssa);
         assert!(!stats.is_small(), "the loop should be considered large");
 
         let (ssa, errors) = try_unroll_loops(ssa);
         assert_eq!(errors.len(), 0, "Unroll should have no errors");
-        assert_eq!(
-            ssa.main().reachable_blocks().len(),
-            4,
-            "The loop should be considered too costly to unroll"
-        );
+        assert_normalized_ssa_equals(ssa, parse_ssa().to_string().as_str());
     }
 
+    /// Test that `break` and `continue` stop unrolling without any panic.
     #[test]
     fn test_brillig_unroll_break_and_continue() {
         // unconstrained fn main() {
@@ -1267,7 +1265,7 @@ mod tests {
         let ssa = Ssa::from_str(src).unwrap();
         let (ssa, errors) = try_unroll_loops(ssa);
         assert_eq!(errors.len(), 0, "Unroll should have no errors");
-        assert_eq!(ssa.main().reachable_blocks().len(), 8, "Nothing should be unrolled");
+        assert_normalized_ssa_equals(ssa, src);
     }
 
     /// Simple test loop:
@@ -1284,6 +1282,9 @@ mod tests {
     ///      sum
     ///  }
     /// ```
+    /// We can check what the ACIR unrolling behavior would be by
+    /// removing the `unconstrained` from the `main` function and
+    /// compiling the program with `nargo --test-program . compile --show-ssa`.
     fn brillig_unroll_test_case() -> Ssa {
         let src = "
         // After `static_assert` and `assert_constant`:
@@ -1357,6 +1358,7 @@ mod tests {
         Ssa::from_str(&src).unwrap()
     }
 
+    // Boilerplate stats of the first loop in the SSA.
     fn loop0_stats(ssa: &Ssa) -> BoilerplateStats {
         let function = ssa.main();
         let mut loops = Loops::find_all(function);

From 8845ca0b44315cdf851edadea1474268f24268cd Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Mon, 18 Nov 2024 14:11:48 +0000
Subject: [PATCH 24/25] Rephrase comment

---
 compiler/noirc_evaluator/src/ssa/opt/unrolling.rs | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index 9a588a96798..5bb2c90b1cc 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -797,9 +797,8 @@ impl<'f> LoopIteration<'f> {
     /// Unroll a single block in the current iteration of the loop
     fn unroll_loop_block(&mut self) -> Vec<BasicBlockId> {
         let mut next_blocks = self.unroll_loop_block_helper();
-        // If the loop body ends with a JmpIf then eliminate either destinations
-        // which are not one of the block we have seen as a destination of a terminator
-        // of a block we have already inlined.
+        // Guarantee that the next blocks we set up to be unrolled, are actually part of the loop,
+        // which we recorded while inlining the instructions of the blocks already processed.
         next_blocks.retain(|block| {
             let b = self.get_original_block(*block);
             self.loop_.blocks.contains(&b)
@@ -840,7 +839,7 @@ impl<'f> LoopIteration<'f> {
     /// Find the next branch(es) to take from a jmpif terminator and return them.
     /// If only one block is returned, it means the jmpif condition evaluated to a known
     /// constant and we can safely take only the given branch. In this case the method
-    /// also replaces the terminator of the insert block (a.k.a fresh block) to be a JMP,
+    /// also replaces the terminator of the insert block (a.k.a fresh block) to be a `Jmp`,
     /// and changes the source block in the context for the next iteration to be the
     /// destination indicated by the constant condition (ie. the `then` or the `else`).
     fn handle_jmpif(

From ca32afef69de89551f996372ddb73e00cf6d4b1f Mon Sep 17 00:00:00 2001
From: Akosh Farkash <aakoshh@gmail.com>
Date: Tue, 19 Nov 2024 16:21:19 +0000
Subject: [PATCH 25/25] Update SSA after make_array

---
 .../noirc_evaluator/src/ssa/opt/unrolling.rs  | 52 ++++++++++---------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
index e9aad0b2075..89f1b2b2d7d 100644
--- a/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
+++ b/compiler/noirc_evaluator/src/ssa/opt/unrolling.rs
@@ -1180,32 +1180,33 @@ mod tests {
 
         // The IDs are shifted by one compared to what the ACIR version printed.
         let expected = "
-        brillig(inline) fn __validate_gt_remainder f0 {
+        brillig(inline) fn main f0 {
           b0(v0: [u64; 6]):
             inc_rc v0
-            inc_rc [u64 0, u64 0, u64 0, u64 0, u64 0, u64 0] of u64
+            v2 = make_array [u64 0, u64 0, u64 0, u64 0, u64 0, u64 0] : [u64; 6]
+            inc_rc v2
             v3 = allocate -> &mut [u64; 6]
-            store [u64 0, u64 0, u64 0, u64 0, u64 0, u64 0] of u64 at v3
-            v5 = load v3 -> [u64; 6]
-            v7 = array_get v0, index u32 0 -> u64
-            v9 = add v7, u64 1
-            v10 = array_set v5, index u32 0, value v9
-            store v10 at v3
-            v11 = load v3 -> [u64; 6]
-            v13 = array_get v0, index u32 1 -> u64
-            v14 = add v13, u64 1
-            v15 = array_set v11, index u32 1, value v14
-            store v15 at v3
-            v16 = load v3 -> [u64; 6]
-            v18 = array_get v0, index u32 2 -> u64
-            v19 = add v18, u64 1
-            v20 = array_set v16, index u32 2, value v19
-            store v20 at v3
+            store v2 at v3
+            v4 = load v3 -> [u64; 6]
+            v6 = array_get v0, index u32 0 -> u64
+            v8 = add v6, u64 1
+            v9 = array_set v4, index u32 0, value v8
+            store v9 at v3
+            v10 = load v3 -> [u64; 6]
+            v12 = array_get v0, index u32 1 -> u64
+            v13 = add v12, u64 1
+            v14 = array_set v10, index u32 1, value v13
+            store v14 at v3
+            v15 = load v3 -> [u64; 6]
+            v17 = array_get v0, index u32 2 -> u64
+            v18 = add v17, u64 1
+            v19 = array_set v15, index u32 2, value v18
+            store v19 at v3
             jmp b1()
           b1():
-            v21 = load v3 -> [u64; 6]
+            v20 = load v3 -> [u64; 6]
             dec_rc v0
-            return v21
+            return v20
         }
         ";
         assert_normalized_ssa_equals(ssa, expected);
@@ -1341,12 +1342,13 @@ mod tests {
         let src = format!(
             "
         // After `static_assert` and `assert_constant`:
-        brillig(inline) fn __validate_gt_remainder f0 {{
+        brillig(inline) fn main f0 {{
           b0(v0: [u64; 6]):
             inc_rc v0
-            inc_rc [u64 0, u64 0, u64 0, u64 0, u64 0, u64 0] of u64
+            v3 = make_array [u64 0, u64 0, u64 0, u64 0, u64 0, u64 0] : [u64; 6]
+            inc_rc v3
             v4 = allocate -> &mut [u64; 6]
-            store [u64 0, u64 0, u64 0, u64 0, u64 0, u64 0] of u64 at v4
+            store v3 at v4
             jmp b1(u32 0)
           b1(v1: u32):
             v7 = lt v1, u32 {num_iterations}
@@ -1356,9 +1358,9 @@ mod tests {
             v10 = array_get v0, index v1 -> u64
             v12 = add v10, u64 1
             v13 = array_set v9, index v1, value v12
-            v15 = add v1, u32 1                        // duplicate unused increment
+            v15 = add v1, u32 1
             store v13 at v4
-            v16 = add v1, u32 1
+            v16 = add v1, u32 1 // duplicate
             jmp b1(v16)
           b2():
             v8 = load v4 -> [u64; 6]