From 07e2070cbf3f6007944a8e5b60019a2e5ac0a9eb Mon Sep 17 00:00:00 2001
From: Eduard-Mihai Burtescu <eddyb@lyken.rs>
Date: Thu, 28 Sep 2023 18:03:45 +0300
Subject: [PATCH 1/3] cfg: use Tarjan's SCC algorithm to get "minimal loops"
 (instead of "maximal").

---
 README.md        |   8 +-
 src/cfg.rs       | 270 ++++++++++++++++++++++++++++++++++++++++++-----
 src/spv/lower.rs |  21 +++-
 3 files changed, 268 insertions(+), 31 deletions(-)
diff --git a/README.md b/README.md
index eea77ff..d46121e 100644
--- a/README.md
+++ b/README.md
@@ -140,16 +140,16 @@ global_var GV0 in spv.StorageClass.Output: s32
 func F0() -> spv.OpTypeVoid {
   loop(v0: s32 <- 1s32, v1: s32 <- 1s32) {
     v2 = spv.OpSLessThan(v1, 10s32): bool
-    (v3: bool, v4: s32, v5: s32) = if v2 {
+    (v3: bool, v4: s32, v5: s32, _: bool) = if v2 {
       v6 = spv.OpIMul(v0, v1): s32
       v7 = spv.OpIAdd(v1, 1s32): s32
-      (true, v6, v7)
+      (true, v6, v7, false)
     } else {
-      spv.OpStore(Pointer: &GV0, Object: v0)
-      (false, spv.OpUndef: s32, spv.OpUndef: s32)
+      (false, spv.OpUndef: s32, spv.OpUndef: s32, true)
     }
     (v4, v5) -> (v0, v1)
   } while v3
+  spv.OpStore(Pointer: &GV0, Object: v0)
 }
 ```
 <!-- END tests/data/for-loop.wgsl.spvasm.structured.spirt -->
diff --git a/src/cfg.rs b/src/cfg.rs
index 9606131..3dc70ae 100644
--- a/src/cfg.rs
+++ b/src/cfg.rs
@@ -1,10 +1,10 @@
 //! Control-flow graph (CFG) abstractions and utilities.
 
-use crate::func_at::FuncAt;
 use crate::{
     spv, AttrSet, Const, ConstCtor, ConstDef, Context, ControlNode, ControlNodeDef,
     ControlNodeKind, ControlNodeOutputDecl, ControlRegion, ControlRegionDef, EntityList,
-    EntityOrientedDenseMap, FuncDefBody, FxIndexMap, SelectionKind, Type, TypeCtor, TypeDef, Value,
+    EntityOrientedDenseMap, FuncDefBody, FxIndexMap, FxIndexSet, SelectionKind, Type, TypeCtor,
+    TypeDef, Value,
 };
 use smallvec::SmallVec;
 use std::mem;
@@ -15,6 +15,10 @@ use std::mem;
 #[derive(Clone, Default)]
 pub struct ControlFlowGraph {
     pub control_inst_on_exit_from: EntityOrientedDenseMap<ControlRegion, ControlInst>,
+
+    // HACK(eddyb) this currently only comes from `OpLoopMerge`, and cannot be
+    // inferred (because implies too strong of an ownership/uniqueness notion).
+    pub loop_merge_to_loop_header: FxIndexMap<ControlRegion, ControlRegion>,
 }
 
 #[derive(Clone)]
@@ -148,16 +152,14 @@ impl ControlFlowGraph {
         assert!(std::ptr::eq(func_def_body.unstructured_cfg.as_ref().unwrap(), self));
         assert!(func_at_body.def().outputs.is_empty());
 
-        self.traverse(func_at_body, state);
+        self.traverse(func_def_body.body, state);
     }
 
     fn traverse(
         &self,
-        func_at_region: FuncAt<'_, ControlRegion>,
+        region: ControlRegion,
         state: &mut TraversalState<impl FnMut(ControlRegion), impl FnMut(ControlRegion)>,
     ) {
-        let region = func_at_region.position;
-
         // FIXME(eddyb) `EntityOrientedDenseMap` should have an `entry` API.
         if let Some(existing_count) = state.incoming_edge_counts.get_mut(region) {
             *existing_count += IncomingEdgeCount::ONE;
@@ -179,13 +181,181 @@ impl ControlFlowGraph {
             Either::Right(targets)
         };
         for target in targets {
-            self.traverse(func_at_region.at(target), state);
+            self.traverse(target, state);
         }
 
         (state.post_order_visit)(region);
     }
 }
 
+/// Minimal loop analysis, based on Tarjan's SCC (strongly connected components)
+/// algorithm, applied recursively (for every level of loop nesting).
+///
+/// Here "minimal" means that each loops is the smallest CFG subgraph possible
+/// (excluding any control-flow paths that cannot reach a backedge and cycle),
+/// i.e. each loop is a CFG SCC (strongly connected component).
+///
+/// These "minimal loops" contrast with the "maximal loops" that the greedy
+/// architecture of the structurizer would naively produce, with the main impact
+/// of the difference being where loop exits (`break`s) "merge" (or "reconverge"),
+/// which SPIR-V encodes via `OpLoopMerge`, and is significant for almost anything
+/// where shared memory and/or subgroup ops can allow observing when invocations
+/// "wait for others in the subgroup to exit the loop" (or when they fail to wait).
+///
+/// This analysis was added to because of two observations wrt "reconvergence":
+/// 1. syntactic loops (from some high-level language), when truly structured
+///    (i.e. only using `while`/`do`-`while` exit conditions, not `break` etc.),
+///    *always* map to "minimal loops" on a CFG, as the only loop exit edge is
+///    built-in, and no part of the syntactic "loop body" can be its successor
+/// 2. more pragmatically, compiling shader languages to SPIR-V seems to (almost?)
+///    always *either* fully preserve syntactic loops (via SPIR-V `OpLoopMerge`),
+///    *or* structurize CFGs in a way that produces "minimal loops", which can
+///    be misleading with explicit `break`s (moving user code from just before
+///    the `break` to after the loop), but is less impactful than "maximal loops"
+struct LoopFinder<'a> {
+    cfg: &'a ControlFlowGraph,
+
+    // FIXME(eddyb) this feels a bit inefficient (are many-exit loops rare?).
+    loop_header_to_exit_targets: FxIndexMap<ControlRegion, FxIndexSet<ControlRegion>>,
+
+    /// SCC accumulation stack, where CFG nodes collect during the depth-first
+    /// traversal, and are only popped when their "SCC root" (loop header) is
+    /// (note that multiple SCCs on the stack does *not* indicate SCC nesting,
+    /// but rather a path between two SCCs, i.e. a loop *following* another).
+    scc_stack: Vec<ControlRegion>,
+    /// Per-CFG-node traversal state (often just pointing to a `scc_stack` slot).
+    scc_state: EntityOrientedDenseMap<ControlRegion, SccState>,
+}
+
+// FIXME(eddyb) make `Option<Option<SccStackIdx>>` the same size somehow.
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
+struct SccStackIdx(u32);
+
+#[derive(PartialEq, Eq)]
+enum SccState {
+    /// CFG node has been reached and ended up somewhere on the `scc_stack`,
+    /// where it will remain after the SCC it's part of will be completed.
+    Pending(SccStackIdx),
+
+    /// CFG node had been reached once, but is no longer on the `scc_stack`, its
+    /// parent SCC having been completed (or it wasn't in an SCC to begin with).
+    Complete,
+}
+
+impl<'a> LoopFinder<'a> {
+    fn new(cfg: &'a ControlFlowGraph) -> Self {
+        Self {
+            cfg,
+            loop_header_to_exit_targets: FxIndexMap::default(),
+            scc_stack: vec![],
+            scc_state: EntityOrientedDenseMap::new(),
+        }
+    }
+
+    /// Tarjan's SCC algorithm works by computing the "earliest" reachable node,
+    /// from every node (often using the name `lowlink`), which will be equal
+    /// to the origin node itself iff that node is an "SCC root" (loop header),
+    /// and always point to an "earlier" node if a cycle (via loop backedge) was
+    /// found from somewhere else in the SCC (i.e. from inside the loop body).
+    ///
+    /// Here we track stack indices (as the stack order is the traversal order),
+    /// and distinguish the acyclic case to avoid treating most nodes as self-loops.
+    fn find_earliest_scc_root_of(&mut self, node: ControlRegion) -> Option<SccStackIdx> {
+        let state_entry = self.scc_state.entry(node);
+        if let Some(state) = &state_entry {
+            return match *state {
+                SccState::Pending(scc_stack_idx) => Some(scc_stack_idx),
+                SccState::Complete => None,
+            };
+        }
+        let scc_stack_idx = SccStackIdx(self.scc_stack.len().try_into().unwrap());
+        self.scc_stack.push(node);
+        *state_entry = Some(SccState::Pending(scc_stack_idx));
+
+        let control_inst = self
+            .cfg
+            .control_inst_on_exit_from
+            .get(node)
+            .expect("cfg: missing `ControlInst`, despite having left structured control-flow");
+
+        let earliest_scc_root = control_inst
+            .targets
+            .iter()
+            .flat_map(|&target| {
+                // HACK(eddyb) if one of the edges is already known to be a loop exit
+                // (from `OpLoopMerge` specifically), treat it almost like a backedge,
+                // but with the additional requirement that the loop header is already
+                // on the stack (i.e. this `node` is reachable from that loop header).
+                let root_candidate_from_loop_merge =
+                    self.cfg.loop_merge_to_loop_header.get(&target).and_then(|&loop_header| {
+                        match self.scc_state.get(loop_header) {
+                            Some(&SccState::Pending(scc_stack_idx)) => Some(scc_stack_idx),
+                            _ => None,
+                        }
+                    });
+
+                self.find_earliest_scc_root_of(target)
+                    .into_iter()
+                    .chain(root_candidate_from_loop_merge)
+            })
+            .min();
+
+        // If this node has been chosen as the root of an SCC, complete that SCC.
+        if earliest_scc_root == Some(scc_stack_idx) {
+            let scc_start = scc_stack_idx.0 as usize;
+
+            // It's now possible to find all the loop exits: they're all the
+            // edges from nodes of this SCC (loop) to nodes not in the SCC.
+            let target_in_scc = |target| match self.scc_state.get(target) {
+                Some(&SccState::Pending(i)) => i >= scc_stack_idx,
+                _ => false,
+            };
+            self.loop_header_to_exit_targets.insert(
+                node,
+                self.scc_stack[scc_start..]
+                    .iter()
+                    .flat_map(|&scc_node| {
+                        self.cfg.control_inst_on_exit_from[scc_node]
+                            .targets
+                            .iter()
+                            .copied()
+                            .filter(|&target| !target_in_scc(target))
+                    })
+                    .collect(),
+            );
+
+            // Find nested loops by marking *only* the loop header as complete,
+            // clearing loop body nodes' state, and recursing on them: all the
+            // nodes outside the loop (otherwise reachable from within), and the
+            // loop header itself, are already marked as complete, meaning that
+            // all exits and backedges will be ignored, and the recursion will
+            // only find more SCCs within the loop body (i.e. nested loops).
+            self.scc_state[node] = SccState::Complete;
+            let loop_body_range = scc_start + 1..self.scc_stack.len();
+            for &scc_node in &self.scc_stack[loop_body_range.clone()] {
+                self.scc_state.remove(scc_node);
+            }
+            for i in loop_body_range.clone() {
+                self.find_earliest_scc_root_of(self.scc_stack[i]);
+            }
+            assert_eq!(self.scc_stack.len(), loop_body_range.end);
+
+            // Remove the entire SCC from the accumulation stack all at once.
+            self.scc_stack.truncate(scc_start);
+
+            return None;
+        }
+
+        // Not actually in an SCC at all, just some node outside any CFG cycles.
+        if earliest_scc_root.is_none() {
+            assert!(self.scc_stack.pop() == Some(node));
+            self.scc_state[node] = SccState::Complete;
+        }
+
+        earliest_scc_root
+    }
+}
+
 #[allow(rustdoc::private_intra_doc_links)]
 /// Control-flow "structurizer", which attempts to convert as much of the CFG
 /// as possible into structural control-flow (regions).
@@ -225,7 +395,15 @@ pub struct Structurizer<'a> {
     const_false: Const,
 
     func_def_body: &'a mut FuncDefBody,
-    incoming_edge_counts: EntityOrientedDenseMap<ControlRegion, IncomingEdgeCount>,
+
+    // FIXME(eddyb) this feels a bit inefficient (are many-exit loops rare?).
+    loop_header_to_exit_targets: FxIndexMap<ControlRegion, FxIndexSet<ControlRegion>>,
+
+    // HACK(eddyb) this also tracks all of `loop_header_to_exit_targets`, as
+    // "false edges" from every loop header to each exit target of that loop,
+    // which structurizing that loop consumes to "unlock" its own exits.
+    incoming_edge_counts_including_loop_exits:
+        EntityOrientedDenseMap<ControlRegion, IncomingEdgeCount>,
 
     /// Keyed by the input to `structurize_region_from` (the start [`ControlRegion`]),
     /// and describing the state of that partial structurization step.
@@ -384,21 +562,42 @@ impl<'a> Structurizer<'a> {
             ctor_args: [].into_iter().collect(),
         });
 
-        let incoming_edge_counts = func_def_body
-            .unstructured_cfg
-            .as_ref()
-            .map(|cfg| {
-                let mut state = TraversalState {
-                    incoming_edge_counts: EntityOrientedDenseMap::new(),
-
-                    pre_order_visit: |_| {},
-                    post_order_visit: |_| {},
-                    reverse_targets: false,
-                };
-                cfg.traverse_whole_func(func_def_body, &mut state);
-                state.incoming_edge_counts
-            })
-            .unwrap_or_default();
+        let (loop_header_to_exit_targets, incoming_edge_counts_including_loop_exits) =
+            func_def_body
+                .unstructured_cfg
+                .as_ref()
+                .map(|cfg| {
+                    let loop_header_to_exit_targets = {
+                        let mut loop_finder = LoopFinder::new(cfg);
+                        loop_finder.find_earliest_scc_root_of(func_def_body.body);
+                        loop_finder.loop_header_to_exit_targets
+                    };
+
+                    let mut state = TraversalState {
+                        incoming_edge_counts: EntityOrientedDenseMap::new(),
+
+                        pre_order_visit: |_| {},
+                        post_order_visit: |_| {},
+                        reverse_targets: false,
+                    };
+                    cfg.traverse_whole_func(func_def_body, &mut state);
+
+                    // HACK(eddyb) treat loop exits as "false edges", that their
+                    // respective loop header "owns", such that structurization
+                    // naturally stops at those loop exits, instead of continuing
+                    // greedily into the loop exterior (producing "maximal loops").
+                    for loop_exit_targets in loop_header_to_exit_targets.values() {
+                        for &exit_target in loop_exit_targets {
+                            *state
+                                .incoming_edge_counts
+                                .entry(exit_target)
+                                .get_or_insert(Default::default()) += IncomingEdgeCount::ONE;
+                        }
+                    }
+
+                    (loop_header_to_exit_targets, state.incoming_edge_counts)
+                })
+                .unwrap_or_default();
 
         Self {
             cx,
@@ -407,7 +606,9 @@ impl<'a> Structurizer<'a> {
             const_false,
 
             func_def_body,
-            incoming_edge_counts,
+
+            loop_header_to_exit_targets,
+            incoming_edge_counts_including_loop_exits,
 
             structurize_region_state: FxIndexMap::default(),
             control_region_input_replacements: EntityOrientedDenseMap::new(),
@@ -530,7 +731,9 @@ impl<'a> Structurizer<'a> {
         };
         let backedge_count = backedge.map(|e| e.edge_bundle.accumulated_count).unwrap_or_default();
 
-        if self.incoming_edge_counts[target] != edge_bundle.accumulated_count + backedge_count {
+        if self.incoming_edge_counts_including_loop_exits[target]
+            != edge_bundle.accumulated_count + backedge_count
+        {
             return Err(DeferredEdgeBundle {
                 condition: Value::Const(self.const_true),
                 edge_bundle,
@@ -612,6 +815,20 @@ impl<'a> Structurizer<'a> {
             // being encoded in `region.deferred_*`.
             region.children = EntityList::empty();
             region.children.insert_last(loop_node, &mut self.func_def_body.control_nodes);
+
+            // HACK(eddyb) we've treated loop exits as extra "false edges", so
+            // here they have to be added to the loop (potentially unlocking
+            // structurization to the outside of the loop, in the caller).
+            if let Some(exit_targets) = self.loop_header_to_exit_targets.get(&target) {
+                for &exit_target in exit_targets {
+                    // FIXME(eddyb) what if this is `None`, is that impossible?
+                    if let Some(deferred) =
+                        region.deferred_edges.target_to_deferred.get_mut(&exit_target)
+                    {
+                        deferred.edge_bundle.accumulated_count += IncomingEdgeCount::ONE;
+                    }
+                }
+            }
         }
 
         if !edge_bundle.target_inputs.is_empty() {
@@ -759,7 +976,8 @@ impl<'a> Structurizer<'a> {
                     .control_inst_on_exit_from
                     .insert(proxy, control_inst);
                 self.structurize_region_state.insert(proxy, StructurizeRegionState::InProgress);
-                self.incoming_edge_counts.insert(proxy, IncomingEdgeCount::ONE);
+                self.incoming_edge_counts_including_loop_exits
+                    .insert(proxy, IncomingEdgeCount::ONE);
                 let deferred_proxy = DeferredEdgeBundle {
                     condition: Value::Const(self.const_true),
                     edge_bundle: IncomingEdgeBundle {
diff --git a/src/spv/lower.rs b/src/spv/lower.rs
index f9b8d2c..b0a8bad 100644
--- a/src/spv/lower.rs
+++ b/src/spv/lower.rs
@@ -1265,7 +1265,26 @@ impl Module {
                         ));
                     }
 
-                    // HACK(eddyb) merges are ignored - this may be lossy,
+                    // HACK(eddyb) we want to at least record `OpLoopMerge`s'
+                    // impact on the shape of a loop, for restructurization.
+                    if opcode == wk.OpLoopMerge {
+                        assert_eq!(ids.len(), 2);
+                        let loop_merge_target =
+                            match lookup_global_or_local_id_for_data_or_control_inst_input(ids[0])?
+                            {
+                                LocalIdDef::Value(_) => return Err(invalid("expected label ID")),
+                                LocalIdDef::BlockLabel(target) => target,
+                            };
+
+                        func_def_body
+                            .unstructured_cfg
+                            .as_mut()
+                            .unwrap()
+                            .loop_merge_to_loop_header
+                            .insert(loop_merge_target, current_block_control_region);
+                    }
+
+                    // HACK(eddyb) merges are mostly ignored - this may be lossy,
                     // especially wrt the `SelectionControl` and `LoopControl`
                     // operands, but it's not obvious how they should map to
                     // some "structured regions" replacement for the CFG.

From 2991d833f7fa86af9fcfe852629f40187da00cf3 Mon Sep 17 00:00:00 2001
From: Eduard-Mihai Burtescu <eddyb@lyken.rs>
Date: Thu, 28 Sep 2023 18:07:17 +0300
Subject: [PATCH 2/3] Pacify rustc/clippy warnings.

---
 build.rs         | 16 ++++++++--------
 src/print/mod.rs |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/build.rs b/build.rs
index 88b5f7f..8f2cdc4 100644
--- a/build.rs
+++ b/build.rs
@@ -32,17 +32,17 @@ fn main() {
         .collect();
     extinsts_names_and_grammars.sort();
 
-    let all_jsons = format!(
+    let mut all_jsons = format!(
         "pub(super) const SPIRV_CORE_GRAMMAR: &str = include_str!({:?});\n\
-         pub(super) const EXTINST_NAMES_AND_GRAMMARS: &[(&str, &str)] = &[\n{}];",
+         pub(super) const EXTINST_NAMES_AND_GRAMMARS: &[(&str, &str)] = &[\n",
         khr_spv_include_dir.join(core_grammar),
-        extinsts_names_and_grammars
-            .into_iter()
-            .map(|(name, grammar)| {
-                format!("({:?}, include_str!({:?})),\n", name, khr_spv_include_dir.join(grammar),)
-            })
-            .collect::<String>()
     );
+    for (name, grammar) in extinsts_names_and_grammars {
+        use std::fmt::Write as _;
+        writeln!(all_jsons, "({:?}, include_str!({:?})),", name, khr_spv_include_dir.join(grammar))
+            .unwrap();
+    }
+    all_jsons += "];";
     std::fs::write(
         std::path::PathBuf::from(std::env::var_os("OUT_DIR").unwrap())
             .join("khr_spv_grammar_jsons.rs"),
diff --git a/src/print/mod.rs b/src/print/mod.rs
index 9f65570..a7b821c 100644
--- a/src/print/mod.rs
+++ b/src/print/mod.rs
@@ -227,7 +227,7 @@ impl Use {
     // HACK(eddyb) this is used in `AttrsAndDef::insert_name_before_def` to
     // detect alignment anchors specifically, so it needs to not overlap with
     // any other name (including those containing escaped `OpName` strings).
-    const ANCHOR_ALIGNMENT_NAME_PREFIX: &str = "AA.";
+    const ANCHOR_ALIGNMENT_NAME_PREFIX: &'static str = "AA.";
 
     fn keyword_and_name_prefix(self) -> (&'static str, &'static str) {
         match self {

From b8e8300f9a94d12e52c1fc145e47ccb60e3c065e Mon Sep 17 00:00:00 2001
From: Eduard-Mihai Burtescu <eddyb@lyken.rs>
Date: Thu, 28 Sep 2023 18:14:06 +0300
Subject: [PATCH 3/3] Update CHANGELOG.

---
 CHANGELOG.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dca4421..4546176 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -33,10 +33,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased] - ReleaseDate
 
+### Changed 🛠
+- [PR#48](https://github.com/EmbarkStudios/spirt/pull/48) changed CFG structurization
+  from "maximal loops" to "minimal loops" (computed using Tarjan's SCC algorithm),
+  and added `OpLoopMerge` support on top (by extending a "minimal loop" as needed)
+
 ## [0.3.0] - 2023-07-25
 
 ### Added ⭐
-- [PR#45](https://github.com/EmbarkStudios/spirt/pull/45) added the abilit to
+- [PR#45](https://github.com/EmbarkStudios/spirt/pull/45) added the ability to
   pretty-print `OpExtInst`s (SPIR-V "extended instructions") using official
   `extinst.*.grammar.json` descriptions and/or custom ones (registered via `Context`)