From 875d2758b1f7892c14a7d71a39d3c1e944a82ec6 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 11:46:53 -0700 Subject: [PATCH 01/12] ARM64 backend, part 1 / 11: misc changes to existing code. - Add a `simple_legalize()` function that invokes a predetermined set of legalizations, without depending on the details of the current backend design. This will be used by the new backend pipeline. - Separate out `has_side_effect()` from the DCE pass. This will be used by the new backends' lowering code. - Add documentation for the `Arm64Call` relocation type. --- cranelift/codegen/meta/src/isa/arm64/mod.rs | 2 + cranelift/codegen/src/binemit/mod.rs | 4 +- cranelift/codegen/src/dce.rs | 13 ++++-- cranelift/codegen/src/ir/immediates.rs | 5 +++ cranelift/codegen/src/legalizer/mod.rs | 49 +++++++++++++++++++++ 5 files changed, 68 insertions(+), 5 deletions(-) diff --git a/cranelift/codegen/meta/src/isa/arm64/mod.rs b/cranelift/codegen/meta/src/isa/arm64/mod.rs index 3440c8af8229..5d8bc76fc444 100644 --- a/cranelift/codegen/meta/src/isa/arm64/mod.rs +++ b/cranelift/codegen/meta/src/isa/arm64/mod.rs @@ -54,7 +54,9 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { let mut a64 = CpuMode::new("A64"); // TODO refine these. + let expand_flags = shared_defs.transform_groups.by_name("expand_flags"); let narrow_flags = shared_defs.transform_groups.by_name("narrow_flags"); + a64.legalize_monomorphic(expand_flags); a64.legalize_default(narrow_flags); let cpu_modes = vec![a64]; diff --git a/cranelift/codegen/src/binemit/mod.rs b/cranelift/codegen/src/binemit/mod.rs index 3a33649d4de0..33655a26bd44 100644 --- a/cranelift/codegen/src/binemit/mod.rs +++ b/cranelift/codegen/src/binemit/mod.rs @@ -54,7 +54,9 @@ pub enum Reloc { X86GOTPCRel4, /// Arm32 call target Arm32Call, - /// Arm64 call target + /// Arm64 call target. Encoded as bottom 26 bits of instruction. This + /// value is sign-extended, multiplied by 4, and added to the PC of + /// the call instruction to form the destination address. Arm64Call, /// RISC-V call target RiscvCall, diff --git a/cranelift/codegen/src/dce.rs b/cranelift/codegen/src/dce.rs index b217534c3e55..827ae98ec4ce 100644 --- a/cranelift/codegen/src/dce.rs +++ b/cranelift/codegen/src/dce.rs @@ -40,6 +40,14 @@ fn is_load_with_defined_trapping(opcode: Opcode, data: &InstructionData) -> bool } } +/// Does the given instruction have any side-effect that would preclude it from being removed when +/// its value is unused? +pub fn has_side_effect(func: &Function, inst: Inst) -> bool { + let data = &func.dfg[inst]; + let opcode = data.opcode(); + trivially_unsafe_for_dce(opcode) || is_load_with_defined_trapping(opcode, data) +} + /// Perform DCE on `func`. 
pub fn do_dce(func: &mut Function, domtree: &mut DominatorTree) { let _tt = timing::dce(); @@ -50,10 +58,7 @@ pub fn do_dce(func: &mut Function, domtree: &mut DominatorTree) { let mut pos = FuncCursor::new(func).at_bottom(block); while let Some(inst) = pos.prev_inst() { { - let data = &pos.func.dfg[inst]; - let opcode = data.opcode(); - if trivially_unsafe_for_dce(opcode) - || is_load_with_defined_trapping(opcode, &data) + if has_side_effect(pos.func, inst) || any_inst_results_used(inst, &live, &pos.func.dfg) { for arg in pos.func.dfg.inst_args(inst) { diff --git a/cranelift/codegen/src/ir/immediates.rs b/cranelift/codegen/src/ir/immediates.rs index b1d142bd9e41..5104d83f9dbc 100644 --- a/cranelift/codegen/src/ir/immediates.rs +++ b/cranelift/codegen/src/ir/immediates.rs @@ -57,6 +57,11 @@ impl Imm64 { pub fn wrapping_neg(self) -> Self { Self(self.0.wrapping_neg()) } + + /// Return bits of this immediate. + pub fn bits(&self) -> i64 { + self.0 + } } impl Into for Imm64 { diff --git a/cranelift/codegen/src/legalizer/mod.rs b/cranelift/codegen/src/legalizer/mod.rs index 781767336a5c..e28cc47d6ab7 100644 --- a/cranelift/codegen/src/legalizer/mod.rs +++ b/cranelift/codegen/src/legalizer/mod.rs @@ -196,6 +196,55 @@ pub fn legalize_function(func: &mut ir::Function, cfg: &mut ControlFlowGraph, is } } +/// Perform a simple legalization by expansion of the function, without +/// platform-specific transforms. +pub fn simple_legalize(func: &mut ir::Function, cfg: &mut ControlFlowGraph, isa: &dyn TargetIsa) { + let mut pos = FuncCursor::new(func); + let func_begin = pos.position(); + pos.set_position(func_begin); + while let Some(_block) = pos.next_block() { + let mut prev_pos = pos.position(); + while let Some(inst) = pos.next_inst() { + let expanded = match pos.func.dfg[inst].opcode() { + ir::Opcode::BrIcmp + | ir::Opcode::GlobalValue + | ir::Opcode::HeapAddr + | ir::Opcode::StackLoad + | ir::Opcode::StackStore + | ir::Opcode::TableAddr + | ir::Opcode::Trapnz + | ir::Opcode::Trapz + | ir::Opcode::BandImm + | ir::Opcode::BorImm + | ir::Opcode::BxorImm + | ir::Opcode::IaddImm + | ir::Opcode::IfcmpImm + | ir::Opcode::ImulImm + | ir::Opcode::IrsubImm + | ir::Opcode::IshlImm + | ir::Opcode::RotlImm + | ir::Opcode::RotrImm + | ir::Opcode::SdivImm + | ir::Opcode::SremImm + | ir::Opcode::SshrImm + | ir::Opcode::UdivImm + | ir::Opcode::UremImm + | ir::Opcode::UshrImm + | ir::Opcode::IcmpImm => expand(inst, &mut pos.func, cfg, isa), + _ => false, + }; + + if expanded { + // Legalization implementations require fixpoint loop + // here. TODO: fix this. + pos.set_position(prev_pos); + } else { + prev_pos = pos.position(); + } + } + } +} + // Include legalization patterns that were generated by `gen_legalizer.rs` from the // `TransformGroup` in `cranelift-codegen/meta/shared/legalize.rs`. // From f80fe949c63c6361468325a93c2d8b56e5cd11a9 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 12:01:11 -0700 Subject: [PATCH 02/12] ARM64 backend, part 2 / 11: remove old ARM64 backend. This removes the old ARM64 backend completely, leaving only an empty `arm64` module. The tree at this state will not build with the `arm64` feature enabled, but that feature has to be enabled explicitly (it is not default). Subsequent patches will fill in the new backend. 
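The address computation described in the `Arm64Call` doc comment added in part 1 can be written out concretely. The helper below is a minimal illustrative sketch and is not part of the patch; the function name and signature are invented.

    /// Resolve an `Arm64Call` relocation: the bottom 26 bits of the call
    /// instruction are a signed word offset from the PC of the call itself.
    /// (Illustrative helper only.)
    fn arm64_call_target(call_insn: u32, call_pc: u64) -> u64 {
        let imm26 = (call_insn & 0x03ff_ffff) as i64;
        let offset = (imm26 << 38) >> 38;          // sign-extend from 26 bits
        call_pc.wrapping_add((offset * 4) as u64)  // words -> bytes, added to the call's PC
    }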
--- cranelift/codegen/src/isa/arm64/abi.rs | 31 ---- cranelift/codegen/src/isa/arm64/binemit.rs | 8 -- cranelift/codegen/src/isa/arm64/enc_tables.rs | 10 -- cranelift/codegen/src/isa/arm64/mod.rs | 133 +----------------- cranelift/codegen/src/isa/arm64/registers.rs | 39 ----- cranelift/codegen/src/isa/arm64/settings.rs | 9 -- cranelift/codegen/src/isa/mod.rs | 1 - 7 files changed, 1 insertion(+), 230 deletions(-) delete mode 100644 cranelift/codegen/src/isa/arm64/abi.rs delete mode 100644 cranelift/codegen/src/isa/arm64/binemit.rs delete mode 100644 cranelift/codegen/src/isa/arm64/enc_tables.rs delete mode 100644 cranelift/codegen/src/isa/arm64/registers.rs delete mode 100644 cranelift/codegen/src/isa/arm64/settings.rs diff --git a/cranelift/codegen/src/isa/arm64/abi.rs b/cranelift/codegen/src/isa/arm64/abi.rs deleted file mode 100644 index 8d486d4193f6..000000000000 --- a/cranelift/codegen/src/isa/arm64/abi.rs +++ /dev/null @@ -1,31 +0,0 @@ -//! ARM 64 ABI implementation. - -use super::registers::{FPR, GPR}; -use crate::ir; -use crate::isa::RegClass; -use crate::regalloc::RegisterSet; -use crate::settings as shared_settings; -use alloc::borrow::Cow; - -/// Legalize `sig`. -pub fn legalize_signature( - _sig: &mut Cow, - _flags: &shared_settings::Flags, - _current: bool, -) { - unimplemented!() -} - -/// Get register class for a type appearing in a legalized signature. -pub fn regclass_for_abi_type(ty: ir::Type) -> RegClass { - if ty.is_int() { - GPR - } else { - FPR - } -} - -/// Get the set of allocatable registers for `func`. -pub fn allocatable_registers(_func: &ir::Function) -> RegisterSet { - unimplemented!() -} diff --git a/cranelift/codegen/src/isa/arm64/binemit.rs b/cranelift/codegen/src/isa/arm64/binemit.rs deleted file mode 100644 index 4401b6d6f59d..000000000000 --- a/cranelift/codegen/src/isa/arm64/binemit.rs +++ /dev/null @@ -1,8 +0,0 @@ -//! Emitting binary ARM64 machine code. - -use crate::binemit::{bad_encoding, CodeSink}; -use crate::ir::{Function, Inst}; -use crate::isa::TargetIsa; -use crate::regalloc::RegDiversions; - -include!(concat!(env!("OUT_DIR"), "/binemit-arm64.rs")); diff --git a/cranelift/codegen/src/isa/arm64/enc_tables.rs b/cranelift/codegen/src/isa/arm64/enc_tables.rs deleted file mode 100644 index 6040a9b866ea..000000000000 --- a/cranelift/codegen/src/isa/arm64/enc_tables.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! Encoding tables for ARM64 ISA. - -use crate::ir; -use crate::isa; -use crate::isa::constraints::*; -use crate::isa::enc_tables::*; -use crate::isa::encoding::RecipeSizing; - -include!(concat!(env!("OUT_DIR"), "/encoding-arm64.rs")); -include!(concat!(env!("OUT_DIR"), "/legalize-arm64.rs")); diff --git a/cranelift/codegen/src/isa/arm64/mod.rs b/cranelift/codegen/src/isa/arm64/mod.rs index f00062b2afc3..2bd6dce476e4 100644 --- a/cranelift/codegen/src/isa/arm64/mod.rs +++ b/cranelift/codegen/src/isa/arm64/mod.rs @@ -1,132 +1 @@ -//! ARM 64-bit Instruction Set Architecture. 
- -mod abi; -mod binemit; -mod enc_tables; -mod registers; -pub mod settings; - -use super::super::settings as shared_settings; -#[cfg(feature = "testing_hooks")] -use crate::binemit::CodeSink; -use crate::binemit::{emit_function, MemoryCodeSink}; -use crate::ir; -use crate::isa::enc_tables::{lookup_enclist, Encodings}; -use crate::isa::Builder as IsaBuilder; -use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa}; -use crate::regalloc; -use alloc::borrow::Cow; -use alloc::boxed::Box; -use core::fmt; -use target_lexicon::Triple; - -#[allow(dead_code)] -struct Isa { - triple: Triple, - shared_flags: shared_settings::Flags, - isa_flags: settings::Flags, -} - -/// Get an ISA builder for creating ARM64 targets. -pub fn isa_builder(triple: Triple) -> IsaBuilder { - IsaBuilder { - triple, - setup: settings::builder(), - constructor: isa_constructor, - } -} - -fn isa_constructor( - triple: Triple, - shared_flags: shared_settings::Flags, - builder: shared_settings::Builder, -) -> Box { - Box::new(Isa { - triple, - isa_flags: settings::Flags::new(&shared_flags, builder), - shared_flags, - }) -} - -impl TargetIsa for Isa { - fn name(&self) -> &'static str { - "arm64" - } - - fn triple(&self) -> &Triple { - &self.triple - } - - fn flags(&self) -> &shared_settings::Flags { - &self.shared_flags - } - - fn register_info(&self) -> RegInfo { - registers::INFO.clone() - } - - fn encoding_info(&self) -> EncInfo { - enc_tables::INFO.clone() - } - - fn legal_encodings<'a>( - &'a self, - func: &'a ir::Function, - inst: &'a ir::InstructionData, - ctrl_typevar: ir::Type, - ) -> Encodings<'a> { - lookup_enclist( - ctrl_typevar, - inst, - func, - &enc_tables::LEVEL1_A64[..], - &enc_tables::LEVEL2[..], - &enc_tables::ENCLISTS[..], - &enc_tables::LEGALIZE_ACTIONS[..], - &enc_tables::RECIPE_PREDICATES[..], - &enc_tables::INST_PREDICATES[..], - self.isa_flags.predicate_view(), - ) - } - - fn legalize_signature(&self, sig: &mut Cow, current: bool) { - abi::legalize_signature(sig, &self.shared_flags, current) - } - - fn regclass_for_abi_type(&self, ty: ir::Type) -> RegClass { - abi::regclass_for_abi_type(ty) - } - - fn allocatable_registers(&self, func: &ir::Function) -> regalloc::RegisterSet { - abi::allocatable_registers(func) - } - - #[cfg(feature = "testing_hooks")] - fn emit_inst( - &self, - func: &ir::Function, - inst: ir::Inst, - divert: &mut regalloc::RegDiversions, - sink: &mut dyn CodeSink, - ) { - binemit::emit_inst(func, inst, divert, sink, self) - } - - fn emit_function_to_memory(&self, func: &ir::Function, sink: &mut MemoryCodeSink) { - emit_function(func, binemit::emit_inst, sink, self) - } - - fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC { - ir::condcodes::IntCC::UnsignedLessThan - } - - fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC { - ir::condcodes::IntCC::UnsignedGreaterThanOrEqual - } -} - -impl fmt::Display for Isa { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}\n{}", self.shared_flags, self.isa_flags) - } -} +// Empty. diff --git a/cranelift/codegen/src/isa/arm64/registers.rs b/cranelift/codegen/src/isa/arm64/registers.rs deleted file mode 100644 index c02f6b7d4d11..000000000000 --- a/cranelift/codegen/src/isa/arm64/registers.rs +++ /dev/null @@ -1,39 +0,0 @@ -//! ARM64 register descriptions. 
- -use crate::isa::registers::{RegBank, RegClass, RegClassData, RegInfo, RegUnit}; - -include!(concat!(env!("OUT_DIR"), "/registers-arm64.rs")); - -#[cfg(test)] -mod tests { - use super::INFO; - use crate::isa::RegUnit; - use alloc::string::{String, ToString}; - - #[test] - fn unit_encodings() { - assert_eq!(INFO.parse_regunit("x0"), Some(0)); - assert_eq!(INFO.parse_regunit("x31"), Some(31)); - assert_eq!(INFO.parse_regunit("v0"), Some(32)); - assert_eq!(INFO.parse_regunit("v31"), Some(63)); - - assert_eq!(INFO.parse_regunit("x32"), None); - assert_eq!(INFO.parse_regunit("v32"), None); - } - - #[test] - fn unit_names() { - fn uname(ru: RegUnit) -> String { - INFO.display_regunit(ru).to_string() - } - - assert_eq!(uname(0), "%x0"); - assert_eq!(uname(1), "%x1"); - assert_eq!(uname(31), "%x31"); - assert_eq!(uname(32), "%v0"); - assert_eq!(uname(33), "%v1"); - assert_eq!(uname(63), "%v31"); - assert_eq!(uname(64), "%nzcv"); - assert_eq!(uname(65), "%INVALID65"); - } -} diff --git a/cranelift/codegen/src/isa/arm64/settings.rs b/cranelift/codegen/src/isa/arm64/settings.rs deleted file mode 100644 index 56d0f4ee0b44..000000000000 --- a/cranelift/codegen/src/isa/arm64/settings.rs +++ /dev/null @@ -1,9 +0,0 @@ -//! ARM64 Settings. - -use crate::settings::{self, detail, Builder}; -use core::fmt; - -// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a -// public `Flags` struct with an impl for all of the settings defined in -// `cranelift-codegen/meta/src/isa/arm64/mod.rs`. -include!(concat!(env!("OUT_DIR"), "/settings-arm64.rs")); diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index 9c91d4219390..bad6fd7e795f 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -116,7 +116,6 @@ pub fn lookup(triple: Triple) -> Result { isa_builder!(x86, "x86", triple) } Architecture::Arm { .. } => isa_builder!(arm32, "arm32", triple), - Architecture::Aarch64 { .. } => isa_builder!(arm64, "arm64", triple), _ => Err(LookupError::Unsupported), } } From d83574261c913e7c1bf1f4f44c721b848f5bf2d7 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 12:27:26 -0700 Subject: [PATCH 03/12] ARM64 backend, part 3 / 11: MachInst infrastructure. This patch adds the MachInst, or Machine Instruction, infrastructure. This is the machine-independent portion of the new backend design. It contains the implementation of the "vcode" (virtual-registerized code) container, the top-level lowering algorithm and compilation pipeline, and the trait definitions that the machine backends will fill in. This backend infrastructure is included in the compilation of the `codegen` crate, but it is not yet tied into the public APIs; that patch will come last, after all the other pieces are filled in. This patch contains code written by Julian Seward and Benjamin Bouvier , originally developed on a side-branch before rebasing and condensing into this patch series. See the `arm64` branch at `https://github.com/cfallin/wasmtime` for original development history. 
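To make the shape of the new lowering interface concrete, the sketch below shows how a backend might lower a simple two-operand instruction through the `LowerCtx` trait defined in `machinst/lower.rs` below. It is illustrative only: the helper name and the `make_add` closure (standing in for construction of a concrete machine instruction) are invented, and a real backend does this work inside its `LowerBackend::lower` implementation.

    use crate::ir::Inst;
    use crate::machinst::LowerCtx;
    use regalloc::{Reg, Writable};

    /// Illustrative lowering of a two-operand IR instruction (e.g. `iadd`)
    /// into a single machine instruction.
    fn lower_simple_add<I, C: LowerCtx<I>>(
        ctx: &mut C,
        inst: Inst,
        make_add: impl FnOnce(Writable<Reg>, Reg, Reg) -> I,
    ) {
        let rn = ctx.input(inst, 0);    // first IR operand, as a (virtual) register
        let rm = ctx.input(inst, 1);    // second IR operand
        let rd = ctx.output(inst, 0);   // register for the single result
        ctx.emit(make_add(rd, rn, rm)); // one machine instruction for this IR instruction
    }

A real lowering would also consult `input_inst` to fold immediates or addressing modes into the machine instruction, and then call `merged` so that the producing IR instruction can be skipped if it becomes unused.
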
Co-authored-by: Julian Seward Co-authored-by: Benjamin Bouvier --- Cargo.lock | 17 + cranelift/codegen/Cargo.toml | 3 +- cranelift/codegen/src/isa/mod.rs | 8 +- cranelift/codegen/src/lib.rs | 2 + cranelift/codegen/src/machinst/abi.rs | 142 ++++ cranelift/codegen/src/machinst/adapter.rs | 123 ++++ cranelift/codegen/src/machinst/blockorder.rs | 59 ++ cranelift/codegen/src/machinst/compile.rs | 76 ++ cranelift/codegen/src/machinst/lower.rs | 723 ++++++++++++++++++ cranelift/codegen/src/machinst/mod.rs | 288 ++++++++ cranelift/codegen/src/machinst/pp.rs | 66 ++ cranelift/codegen/src/machinst/sections.rs | 351 +++++++++ cranelift/codegen/src/machinst/vcode.rs | 738 +++++++++++++++++++ cranelift/codegen/src/num_uses.rs | 68 ++ 14 files changed, 2662 insertions(+), 2 deletions(-) create mode 100644 cranelift/codegen/src/machinst/abi.rs create mode 100644 cranelift/codegen/src/machinst/adapter.rs create mode 100644 cranelift/codegen/src/machinst/blockorder.rs create mode 100644 cranelift/codegen/src/machinst/compile.rs create mode 100644 cranelift/codegen/src/machinst/lower.rs create mode 100644 cranelift/codegen/src/machinst/mod.rs create mode 100644 cranelift/codegen/src/machinst/pp.rs create mode 100644 cranelift/codegen/src/machinst/sections.rs create mode 100644 cranelift/codegen/src/machinst/vcode.rs create mode 100644 cranelift/codegen/src/num_uses.rs diff --git a/Cargo.lock b/Cargo.lock index b8c92cbc46c0..8d7d237b6e0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -379,6 +379,7 @@ dependencies = [ "gimli", "hashbrown 0.7.1", "log", + "regalloc", "serde", "smallvec", "target-lexicon", @@ -1599,6 +1600,16 @@ dependencies = [ "rust-argon2", ] +[[package]] +name = "regalloc" +version = "0.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ce0cd835fa6e91bbf5d010beee19d0c2e97e4ad5e13c399a31122cfc83bdd6" +dependencies = [ + "log", + "rustc-hash", +] + [[package]] name = "regex" version = "1.3.6" @@ -1663,6 +1674,12 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c691c0e608126e00913e33f0ccf3727d5fc84573623b8d65b2df340b5201783" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc_version" version = "0.2.3" diff --git a/cranelift/codegen/Cargo.toml b/cranelift/codegen/Cargo.toml index 148fcf93273f..83219d42e652 100644 --- a/cranelift/codegen/Cargo.toml +++ b/cranelift/codegen/Cargo.toml @@ -24,6 +24,7 @@ gimli = { version = "0.20.0", default-features = false, features = ["write"], op smallvec = { version = "1.0.0" } thiserror = "1.0.4" byteorder = { version = "1.3.2", default-features = false } +regalloc = "0.0.17" # It is a goal of the cranelift-codegen crate to have minimal external dependencies. # Please don't add any unless they are essential to the task of creating binary # machine code. Integration tests that need external dependencies can be @@ -33,7 +34,7 @@ byteorder = { version = "1.3.2", default-features = false } cranelift-codegen-meta = { path = "meta", version = "0.62.0" } [features] -default = ["std", "unwind"] +default = ["std", "unwind", "all-arch"] # The "std" feature enables use of libstd. The "core" feature enables use # of some minimal std-like replacement libraries. 
At least one of these two diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index bad6fd7e795f..c94707690a51 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -55,9 +55,10 @@ pub use crate::isa::stack::{StackBase, StackBaseMask, StackRef}; use crate::binemit; use crate::flowgraph; use crate::ir; -use crate::isa::enc_tables::Encodings; +pub use crate::isa::enc_tables::Encodings; #[cfg(feature = "unwind")] use crate::isa::fde::RegisterMappingError; +use crate::machinst::MachBackend; use crate::regalloc; use crate::result::CodegenResult; use crate::settings; @@ -400,6 +401,11 @@ pub trait TargetIsa: fmt::Display + Send + Sync { ) { // No-op by default } + + /// Get the new-style MachBackend, if this is an adapter around one. + fn get_mach_backend(&self) -> Option<&dyn MachBackend> { + None + } } impl Debug for &dyn TargetIsa { diff --git a/cranelift/codegen/src/lib.rs b/cranelift/codegen/src/lib.rs index 772562b916cc..2d6651a67e30 100644 --- a/cranelift/codegen/src/lib.rs +++ b/cranelift/codegen/src/lib.rs @@ -71,6 +71,7 @@ pub mod flowgraph; pub mod ir; pub mod isa; pub mod loop_analysis; +pub mod machinst; pub mod print_errors; pub mod settings; pub mod timing; @@ -90,6 +91,7 @@ mod iterators; mod legalizer; mod licm; mod nan_canonicalization; +mod num_uses; mod partition_slice; mod postopt; mod predicates; diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs new file mode 100644 index 000000000000..7aaa66fe1471 --- /dev/null +++ b/cranelift/codegen/src/machinst/abi.rs @@ -0,0 +1,142 @@ +//! ABI definitions. + +use crate::ir; +use crate::ir::StackSlot; +use crate::machinst::*; +use crate::settings; + +use regalloc::{Reg, Set, SpillSlot, VirtualReg, Writable}; + +/// Trait implemented by an object that tracks ABI-related state (e.g., stack +/// layout) and can generate code while emitting the *body* of a function. +pub trait ABIBody { + /// Get the liveins of the function. + fn liveins(&self) -> Set; + + /// Get the liveouts of the function. + fn liveouts(&self) -> Set; + + /// Number of arguments. + fn num_args(&self) -> usize; + + /// Number of return values. + fn num_retvals(&self) -> usize; + + /// Number of stack slots (not spill slots). + fn num_stackslots(&self) -> usize; + + /// Generate an instruction which copies an argument to a destination + /// register. + fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable) -> I; + + /// Generate an instruction which copies a source register to a return + /// value slot. + fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Reg) -> I; + + /// Generate a return instruction. + fn gen_ret(&self) -> I; + + /// Generate an epilogue placeholder. + fn gen_epilogue_placeholder(&self) -> I; + + // ----------------------------------------------------------------- + // Every function above this line may only be called pre-regalloc. + // Every function below this line may only be called post-regalloc. + // `spillslots()` must be called before any other post-regalloc + // function. + // ---------------------------------------------------------------- + + /// Update with the number of spillslots, post-regalloc. + fn set_num_spillslots(&mut self, slots: usize); + + /// Update with the clobbered registers, post-regalloc. + fn set_clobbered(&mut self, clobbered: Set>); + + /// Load from a stackslot. + fn load_stackslot( + &self, + slot: StackSlot, + offset: usize, + ty: Type, + into_reg: Writable, + ) -> I; + + /// Store to a stackslot. 
+ fn store_stackslot(&self, slot: StackSlot, offset: usize, ty: Type, from_reg: Reg) -> I; + + /// Load from a spillslot. + fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable) -> I; + + /// Store to a spillslot. + fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> I; + + /// Generate a prologue, post-regalloc. This should include any stack + /// frame or other setup necessary to use the other methods (`load_arg`, + /// `store_retval`, and spillslot accesses.) |self| is mutable so that we + /// can store information in it which will be useful when creating the + /// epilogue. + fn gen_prologue(&mut self, flags: &settings::Flags) -> Vec; + + /// Generate an epilogue, post-regalloc. Note that this must generate the + /// actual return instruction (rather than emitting this in the lowering + /// logic), because the epilogue code comes before the return and the two are + /// likely closely related. + fn gen_epilogue(&self, flags: &settings::Flags) -> Vec; + + /// Returns the full frame size for the given function, after prologue emission has run. This + /// comprises the spill space, incoming argument space, alignment padding, etc. + fn frame_size(&self) -> u32; + + /// Get the spill-slot size. + fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32; + + /// Generate a spill. + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Type) -> I; + + /// Generate a reload (fill). + fn gen_reload(&self, to_reg: Writable, from_slot: SpillSlot, ty: Type) -> I; +} + +/// Trait implemented by an object that tracks ABI-related state and can +/// generate code while emitting a *call* to a function. +/// +/// An instance of this trait returns information for a *particular* +/// callsite. It will usually be computed from the called function's +/// signature. +/// +/// Unlike `ABIBody` above, methods on this trait are not invoked directly +/// by the machine-independent code. Rather, the machine-specific lowering +/// code will typically create an `ABICall` when creating machine instructions +/// for an IR call instruction inside `lower()`, directly emit the arg and +/// and retval copies, and attach the register use/def info to the call. +/// +/// This trait is thus provided for convenience to the backends. +pub trait ABICall { + /// Get the number of arguments expected. + fn num_args(&self) -> usize; + + /// Save the clobbered registers. + /// Copy an argument value from a source register, prior to the call. + fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> I; + + /// Copy a return value into a destination register, after the call returns. + fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable) -> I; + + /// Pre-adjust the stack, prior to argument copies and call. + fn gen_stack_pre_adjust(&self) -> Vec; + + /// Post-adjust the satck, after call return and return-value copies. + fn gen_stack_post_adjust(&self) -> Vec; + + /// Generate the call itself. + /// + /// The returned instruction should have proper use- and def-sets according + /// to the argument registers, return-value registers, and clobbered + /// registers for this function signature in this ABI. + /// + /// (Arg registers are uses, and retval registers are defs. Clobbered + /// registers are also logically defs, but should never be read; their + /// values are "defined" (to the regalloc) but "undefined" in every other + /// sense.) 
+ fn gen_call(&self) -> Vec; +} diff --git a/cranelift/codegen/src/machinst/adapter.rs b/cranelift/codegen/src/machinst/adapter.rs new file mode 100644 index 000000000000..3f7c5b7b57f0 --- /dev/null +++ b/cranelift/codegen/src/machinst/adapter.rs @@ -0,0 +1,123 @@ +//! Adapter for a `MachBackend` to implement the `TargetIsa` trait. + +use crate::binemit; +use crate::ir; +use crate::isa::{EncInfo, Encoding, Encodings, Legalize, RegClass, RegInfo, TargetIsa}; +use crate::machinst::*; +use crate::regalloc::{RegDiversions, RegisterSet}; +use crate::settings::Flags; + +use std::borrow::Cow; +use std::fmt; +use target_lexicon::Triple; + +/// A wrapper around a `MachBackend` that provides a `TargetIsa` impl. +pub struct TargetIsaAdapter { + backend: Box, + triple: Triple, +} + +impl TargetIsaAdapter { + /// Create a new `TargetIsa` wrapper around a `MachBackend`. + pub fn new(backend: B) -> TargetIsaAdapter { + let triple = backend.triple(); + TargetIsaAdapter { + backend: Box::new(backend), + triple, + } + } +} + +impl fmt::Display for TargetIsaAdapter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "MachBackend") + } +} + +impl TargetIsa for TargetIsaAdapter { + fn name(&self) -> &'static str { + self.backend.name() + } + + fn triple(&self) -> &Triple { + &self.triple + } + + fn flags(&self) -> &Flags { + self.backend.flags() + } + + fn register_info(&self) -> RegInfo { + // Called from function's Display impl, so we need a stub here. + RegInfo { + banks: &[], + classes: &[], + } + } + + fn legal_encodings<'a>( + &'a self, + _func: &'a ir::Function, + _inst: &'a ir::InstructionData, + _ctrl_typevar: ir::Type, + ) -> Encodings<'a> { + panic!("Should not be called when new-style backend is available!") + } + + fn encode( + &self, + _func: &ir::Function, + _inst: &ir::InstructionData, + _ctrl_typevar: ir::Type, + ) -> Result { + panic!("Should not be called when new-style backend is available!") + } + + fn encoding_info(&self) -> EncInfo { + panic!("Should not be called when new-style backend is available!") + } + + fn legalize_signature(&self, _sig: &mut Cow, _current: bool) { + panic!("Should not be called when new-style backend is available!") + } + + fn regclass_for_abi_type(&self, _ty: ir::Type) -> RegClass { + panic!("Should not be called when new-style backend is available!") + } + + fn allocatable_registers(&self, _func: &ir::Function) -> RegisterSet { + panic!("Should not be called when new-style backend is available!") + } + + fn prologue_epilogue(&self, _func: &mut ir::Function) -> CodegenResult<()> { + panic!("Should not be called when new-style backend is available!") + } + + #[cfg(feature = "testing_hooks")] + fn emit_inst( + &self, + _func: &ir::Function, + _inst: ir::Inst, + _divert: &mut RegDiversions, + _sink: &mut dyn binemit::CodeSink, + ) { + panic!("Should not be called when new-style backend is available!") + } + + /// Emit a whole function into memory. 
+ fn emit_function_to_memory(&self, _func: &ir::Function, _sink: &mut binemit::MemoryCodeSink) { + panic!("Should not be called when new-style backend is available!") + } + + fn get_mach_backend(&self) -> Option<&dyn MachBackend> { + Some(&*self.backend) + } + + fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC { + self.backend.unsigned_add_overflow_condition() + } + + fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC { + self.backend.unsigned_sub_overflow_condition() + } +} diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs new file mode 100644 index 000000000000..bfd4bf665af7 --- /dev/null +++ b/cranelift/codegen/src/machinst/blockorder.rs @@ -0,0 +1,59 @@ +//! Computation of basic block order in emitted code. + +use crate::machinst::*; + +/// Simple reverse postorder-based block order emission. +/// +/// TODO: use a proper algorithm, such as the bottom-up straight-line-section +/// construction algorithm. +struct BlockRPO { + visited: Vec, + postorder: Vec, + deferred_last: Option, +} + +impl BlockRPO { + fn new(vcode: &VCode) -> BlockRPO { + BlockRPO { + visited: vec![false; vcode.num_blocks()], + postorder: vec![], + deferred_last: None, + } + } + + fn visit(&mut self, vcode: &VCode, block: BlockIndex) { + self.visited[block as usize] = true; + for succ in vcode.succs(block) { + if !self.visited[*succ as usize] { + self.visit(vcode, *succ); + } + } + + let (start, end) = &vcode.block_ranges[block as usize]; + for i in *start..*end { + if vcode.insts[i as usize].is_epilogue_placeholder() { + debug_assert!(self.deferred_last.is_none()); + self.deferred_last = Some(block); + return; + } + } + + self.postorder.push(block); + } + + fn rpo(self) -> Vec { + let mut rpo = self.postorder; + rpo.reverse(); + if let Some(block) = self.deferred_last { + rpo.push(block); + } + rpo + } +} + +/// Compute the final block order. +pub fn compute_final_block_order(vcode: &VCode) -> Vec { + let mut rpo = BlockRPO::new(vcode); + rpo.visit(vcode, vcode.entry()); + rpo.rpo() +} diff --git a/cranelift/codegen/src/machinst/compile.rs b/cranelift/codegen/src/machinst/compile.rs new file mode 100644 index 000000000000..458db9ea368d --- /dev/null +++ b/cranelift/codegen/src/machinst/compile.rs @@ -0,0 +1,76 @@ +//! Compilation backend pipeline: optimized IR to VCode / binemit. + +use crate::ir::Function; +use crate::machinst::*; +use crate::settings; +use crate::timing; + +use log::debug; +use regalloc::{allocate_registers, RegAllocAlgorithm}; +use std::env; + +/// Compile the given function down to VCode with allocated registers, ready +/// for binary emission. +pub fn compile( + f: &mut Function, + b: &B, + abi: Box>, + flags: &settings::Flags, +) -> VCode +where + B::MInst: ShowWithRRU, +{ + // This lowers the CL IR. + let mut vcode = Lower::new(f, abi).lower(b); + + let universe = &B::MInst::reg_universe(); + + debug!("vcode from lowering: \n{}", vcode.show_rru(Some(universe))); + + // Perform register allocation. + let algorithm = match env::var("REGALLOC") { + Ok(str) => match str.as_str() { + "lsrac" => RegAllocAlgorithm::LinearScanChecked, + "lsra" => RegAllocAlgorithm::LinearScan, + // to wit: btc doesn't mean "bitcoin" here + "btc" => RegAllocAlgorithm::BacktrackingChecked, + _ => RegAllocAlgorithm::Backtracking, + }, + // By default use backtracking, which is the fastest. 
+ Err(_) => RegAllocAlgorithm::Backtracking, + }; + + let result = { + let _tt = timing::regalloc(); + allocate_registers( + &mut vcode, algorithm, universe, /*request_block_annotations=*/ false, + ) + .map_err(|err| { + debug!( + "Register allocation error for vcode\n{}\nError: {:?}", + vcode.show_rru(Some(universe)), + err + ); + err + }) + .expect("register allocation") + }; + + // Reorder vcode into final order and copy out final instruction sequence + // all at once. This also inserts prologues/epilogues. + vcode.replace_insns_from_regalloc(result, flags); + + vcode.remove_redundant_branches(); + + // Do final passes over code to finalize branches. + vcode.finalize_branches(); + + debug!( + "vcode after regalloc: final version:\n{}", + vcode.show_rru(Some(universe)) + ); + + //println!("{}\n", vcode.show_rru(Some(&B::MInst::reg_universe()))); + + vcode +} diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs new file mode 100644 index 000000000000..2165416ebccb --- /dev/null +++ b/cranelift/codegen/src/machinst/lower.rs @@ -0,0 +1,723 @@ +//! This module implements lowering (instruction selection) from Cranelift IR +//! to machine instructions with virtual registers. This is *almost* the final +//! machine code, except for register allocation. + +use crate::binemit::CodeSink; +use crate::dce::has_side_effect; +use crate::entity::SecondaryMap; +use crate::ir::{ + Block, ExternalName, Function, GlobalValueData, Inst, InstructionData, MemFlags, Opcode, + Signature, SourceLoc, Type, Value, ValueDef, +}; +use crate::isa::registers::RegUnit; +use crate::machinst::{ + ABIBody, BlockIndex, MachInst, MachInstEmit, VCode, VCodeBuilder, VCodeInst, +}; +use crate::num_uses::NumUses; + +use regalloc::Function as RegallocFunction; +use regalloc::{RealReg, Reg, RegClass, Set, VirtualReg, Writable}; + +use alloc::boxed::Box; +use alloc::vec::Vec; +use log::debug; +use smallvec::SmallVec; +use std::collections::VecDeque; +use std::ops::Range; + +/// A context that machine-specific lowering code can use to emit lowered instructions. This is the +/// view of the machine-independent per-function lowering context that is seen by the machine +/// backend. +pub trait LowerCtx { + /// Get the instdata for a given IR instruction. + fn data(&self, ir_inst: Inst) -> &InstructionData; + /// Get the controlling type for a polymorphic IR instruction. + fn ty(&self, ir_inst: Inst) -> Type; + /// Emit a machine instruction. + fn emit(&mut self, mach_inst: I); + /// Indicate that an IR instruction has been merged, and so one of its + /// uses is gone (replaced by uses of the instruction's inputs). This + /// helps the lowering algorithm to perform on-the-fly DCE, skipping over + /// unused instructions (such as immediates incorporated directly). + fn merged(&mut self, from_inst: Inst); + /// Get the producing instruction, if any, and output number, for the `idx`th input to the + /// given IR instruction + fn input_inst(&self, ir_inst: Inst, idx: usize) -> Option<(Inst, usize)>; + /// Map a Value to its associated writable (probably virtual) Reg. + fn value_to_writable_reg(&self, val: Value) -> Writable; + /// Map a Value to its associated (probably virtual) Reg. + fn value_to_reg(&self, val: Value) -> Reg; + /// Get the `idx`th input to the given IR instruction as a virtual register. + fn input(&self, ir_inst: Inst, idx: usize) -> Reg; + /// Get the `idx`th output of the given IR instruction as a virtual register. 
+ fn output(&self, ir_inst: Inst, idx: usize) -> Writable; + /// Get the number of inputs to the given IR instruction. + fn num_inputs(&self, ir_inst: Inst) -> usize; + /// Get the number of outputs to the given IR instruction. + fn num_outputs(&self, ir_inst: Inst) -> usize; + /// Get the type for an instruction's input. + fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type; + /// Get the type for an instruction's output. + fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type; + /// Get a new temp. + fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable; + /// Get the number of block params. + fn num_bb_params(&self, bb: Block) -> usize; + /// Get the register for a block param. + fn bb_param(&self, bb: Block, idx: usize) -> Reg; + /// Get the register for a return value. + fn retval(&self, idx: usize) -> Writable; + /// Get the target for a call instruction, as an `ExternalName`. + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<&'b ExternalName>; + /// Get the signature for a call or call-indirect instruction. + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature>; + /// Get the symbol name and offset for a symbol_value instruction. + fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, i64)>; + /// Returns the memory flags of a given memory access. + fn memflags(&self, ir_inst: Inst) -> Option; + /// Get the source location for a given instruction. + fn srcloc(&self, ir_inst: Inst) -> SourceLoc; +} + +/// A machine backend. +pub trait LowerBackend { + /// The machine instruction type. + type MInst: VCodeInst; + + /// Lower a single instruction. Instructions are lowered in reverse order. + /// This function need not handle branches; those are always passed to + /// `lower_branch_group` below. + fn lower>(&self, ctx: &mut C, inst: Inst); + + /// Lower a block-terminating group of branches (which together can be seen as one + /// N-way branch), given a vcode BlockIndex for each target. + fn lower_branch_group>( + &self, + ctx: &mut C, + insts: &[Inst], + targets: &[BlockIndex], + fallthrough: Option, + ); +} + +/// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence +/// from original Inst to MachInsts. +pub struct Lower<'a, I: VCodeInst> { + // The function to lower. + f: &'a Function, + + // Lowered machine instructions. + vcode: VCodeBuilder, + + // Number of active uses (minus `dec_use()` calls by backend) of each instruction. + num_uses: SecondaryMap, + + // Mapping from `Value` (SSA value in IR) to virtual register. + value_regs: SecondaryMap, + + // Return-value vregs. + retval_regs: Vec, + + // Next virtual register number to allocate. + next_vreg: u32, +} + +fn alloc_vreg( + value_regs: &mut SecondaryMap, + regclass: RegClass, + value: Value, + next_vreg: &mut u32, +) -> VirtualReg { + if value_regs[value].get_index() == 0 { + // default value in map. + let v = *next_vreg; + *next_vreg += 1; + value_regs[value] = Reg::new_virtual(regclass, v); + } + value_regs[value].as_virtual_reg().unwrap() +} + +enum GenerateReturn { + Yes, + No, +} + +impl<'a, I: VCodeInst> Lower<'a, I> { + /// Prepare a new lowering context for the given IR function. + pub fn new(f: &'a Function, abi: Box>) -> Lower<'a, I> { + let mut vcode = VCodeBuilder::new(abi); + + let num_uses = NumUses::compute(f).take_uses(); + + let mut next_vreg: u32 = 1; + + // Default register should never be seen, but the `value_regs` map needs a default and we + // don't want to push `Option` everywhere. 
All values will be assigned registers by the + // loops over block parameters and instruction results below. + // + // We do not use vreg 0 so that we can detect any unassigned register that leaks through. + let default_register = Reg::new_virtual(RegClass::I32, 0); + let mut value_regs = SecondaryMap::with_default(default_register); + + // Assign a vreg to each value. + for bb in f.layout.blocks() { + for param in f.dfg.block_params(bb) { + let vreg = alloc_vreg( + &mut value_regs, + I::rc_for_type(f.dfg.value_type(*param)), + *param, + &mut next_vreg, + ); + vcode.set_vreg_type(vreg, f.dfg.value_type(*param)); + } + for inst in f.layout.block_insts(bb) { + for result in f.dfg.inst_results(inst) { + let vreg = alloc_vreg( + &mut value_regs, + I::rc_for_type(f.dfg.value_type(*result)), + *result, + &mut next_vreg, + ); + vcode.set_vreg_type(vreg, f.dfg.value_type(*result)); + } + } + } + + // Assign a vreg to each return value. + let mut retval_regs = vec![]; + for ret in &f.signature.returns { + let v = next_vreg; + next_vreg += 1; + let regclass = I::rc_for_type(ret.value_type); + let vreg = Reg::new_virtual(regclass, v); + retval_regs.push(vreg); + vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ret.value_type); + } + + Lower { + f, + vcode, + num_uses, + value_regs, + retval_regs, + next_vreg, + } + } + + fn gen_arg_setup(&mut self) { + if let Some(entry_bb) = self.f.layout.entry_block() { + debug!( + "gen_arg_setup: entry BB {} args are:\n{:?}", + entry_bb, + self.f.dfg.block_params(entry_bb) + ); + for (i, param) in self.f.dfg.block_params(entry_bb).iter().enumerate() { + let reg = Writable::from_reg(self.value_regs[*param]); + let insn = self.vcode.abi().gen_copy_arg_to_reg(i, reg); + self.vcode.push(insn); + } + } + } + + fn gen_retval_setup(&mut self, gen_ret_inst: GenerateReturn) { + for (i, reg) in self.retval_regs.iter().enumerate() { + let insn = self.vcode.abi().gen_copy_reg_to_retval(i, *reg); + self.vcode.push(insn); + } + let inst = match gen_ret_inst { + GenerateReturn::Yes => self.vcode.abi().gen_ret(), + GenerateReturn::No => self.vcode.abi().gen_epilogue_placeholder(), + }; + self.vcode.push(inst); + } + + fn find_reachable_bbs(&self) -> SmallVec<[Block; 16]> { + if let Some(entry) = self.f.layout.entry_block() { + let mut ret = SmallVec::new(); + let mut queue = VecDeque::new(); + let mut visited = SecondaryMap::with_default(false); + queue.push_back(entry); + visited[entry] = true; + while !queue.is_empty() { + let b = queue.pop_front().unwrap(); + ret.push(b); + let mut succs: SmallVec<[Block; 16]> = SmallVec::new(); + for inst in self.f.layout.block_insts(b) { + if self.f.dfg[inst].opcode().is_branch() { + succs.extend(branch_targets(self.f, b, inst).into_iter()); + } + } + for succ in succs.into_iter() { + if !visited[succ] { + queue.push_back(succ); + visited[succ] = true; + } + } + } + + ret + } else { + SmallVec::new() + } + } + + /// Lower the function. + pub fn lower>(mut self, backend: &B) -> VCode { + // Find all reachable blocks. + let mut bbs = self.find_reachable_bbs(); + // Work backward (reverse block order, reverse through each block), skipping insns with zero + // uses. + bbs.reverse(); + + // This records a Block-to-BlockIndex map so that branch targets can be resolved. + let mut next_bindex = self.vcode.init_bb_map(&bbs[..]); + + // Allocate a separate BlockIndex for each control-flow instruction so that we can create + // the edge blocks later. 
Each entry for a control-flow inst is the edge block; the list + // has (cf-inst, edge block, orig block) tuples. + let mut edge_blocks_by_inst: SecondaryMap> = + SecondaryMap::with_default(vec![]); + let mut edge_blocks: Vec<(Inst, BlockIndex, Block)> = vec![]; + + debug!("about to lower function: {:?}", self.f); + debug!("bb map: {:?}", self.vcode.blocks_by_bb()); + + for bb in bbs.iter() { + for inst in self.f.layout.block_insts(*bb) { + let op = self.f.dfg[inst].opcode(); + if op.is_branch() { + // Find the original target. + let mut add_succ = |next_bb| { + let edge_block = next_bindex; + next_bindex += 1; + edge_blocks_by_inst[inst].push(edge_block); + edge_blocks.push((inst, edge_block, next_bb)); + }; + for succ in branch_targets(self.f, *bb, inst).into_iter() { + add_succ(succ); + } + } + } + } + + for bb in bbs.iter() { + debug!("lowering bb: {}", bb); + + // If this is a return block, produce the return value setup. + let last_insn = self.f.layout.block_insts(*bb).last().unwrap(); + let last_insn_opcode = self.f.dfg[last_insn].opcode(); + if last_insn_opcode.is_return() { + let gen_ret = if last_insn_opcode == Opcode::Return { + GenerateReturn::Yes + } else { + debug_assert!(last_insn_opcode == Opcode::FallthroughReturn); + GenerateReturn::No + }; + self.gen_retval_setup(gen_ret); + self.vcode.end_ir_inst(); + } + + // Find the branches at the end first, and process those, if any. + let mut branches: SmallVec<[Inst; 2]> = SmallVec::new(); + let mut targets: SmallVec<[BlockIndex; 2]> = SmallVec::new(); + + for inst in self.f.layout.block_insts(*bb).rev() { + debug!("lower: inst {}", inst); + if edge_blocks_by_inst[inst].len() > 0 { + branches.push(inst); + for target in edge_blocks_by_inst[inst].iter().rev().cloned() { + targets.push(target); + } + } else { + // We've reached the end of the branches -- process all as a group, first. + if branches.len() > 0 { + let fallthrough = self.f.layout.next_block(*bb); + let fallthrough = fallthrough.map(|bb| self.vcode.bb_to_bindex(bb)); + branches.reverse(); + targets.reverse(); + debug!( + "lower_branch_group: targets = {:?} branches = {:?}", + targets, branches + ); + backend.lower_branch_group( + &mut self, + &branches[..], + &targets[..], + fallthrough, + ); + self.vcode.end_ir_inst(); + branches.clear(); + targets.clear(); + } + + // Only codegen an instruction if it either has a side + // effect, or has at least one use of one of its results. + let num_uses = self.num_uses[inst]; + let side_effect = has_side_effect(self.f, inst); + if side_effect || num_uses > 0 { + backend.lower(&mut self, inst); + self.vcode.end_ir_inst(); + } else { + // If we're skipping the instruction, we need to dec-ref + // its arguments. + for arg in self.f.dfg.inst_args(inst) { + let val = self.f.dfg.resolve_aliases(*arg); + match self.f.dfg.value_def(val) { + ValueDef::Result(src_inst, _) => { + self.dec_use(src_inst); + } + _ => {} + } + } + } + } + } + + // There are possibly some branches left if the block contained only branches. + if branches.len() > 0 { + let fallthrough = self.f.layout.next_block(*bb); + let fallthrough = fallthrough.map(|bb| self.vcode.bb_to_bindex(bb)); + branches.reverse(); + targets.reverse(); + debug!( + "lower_branch_group: targets = {:?} branches = {:?}", + targets, branches + ); + backend.lower_branch_group(&mut self, &branches[..], &targets[..], fallthrough); + self.vcode.end_ir_inst(); + branches.clear(); + targets.clear(); + } + + // If this is the entry block, produce the argument setup. 
+ if Some(*bb) == self.f.layout.entry_block() { + self.gen_arg_setup(); + self.vcode.end_ir_inst(); + } + + let vcode_bb = self.vcode.end_bb(); + debug!("finished building bb: BlockIndex {}", vcode_bb); + debug!("bb_to_bindex map says: {}", self.vcode.bb_to_bindex(*bb)); + assert!(vcode_bb == self.vcode.bb_to_bindex(*bb)); + if Some(*bb) == self.f.layout.entry_block() { + self.vcode.set_entry(vcode_bb); + } + } + + // Now create the edge blocks, with phi lowering (block parameter copies). + for (inst, edge_block, orig_block) in edge_blocks.into_iter() { + debug!( + "creating edge block: inst {}, edge_block {}, orig_block {}", + inst, edge_block, orig_block + ); + + // Create a temporary for each block parameter. + let phi_classes: Vec<(Type, RegClass)> = self + .f + .dfg + .block_params(orig_block) + .iter() + .map(|p| self.f.dfg.value_type(*p)) + .map(|ty| (ty, I::rc_for_type(ty))) + .collect(); + + // FIXME sewardj 2020Feb29: use SmallVec + let mut src_regs = vec![]; + let mut dst_regs = vec![]; + + // Create all of the phi uses (reads) from jump args to temps. + + // Round up all the source and destination regs + for (i, arg) in self.f.dfg.inst_variable_args(inst).iter().enumerate() { + let arg = self.f.dfg.resolve_aliases(*arg); + debug!("jump arg {} is {}", i, arg); + src_regs.push(self.value_regs[arg]); + } + for (i, param) in self.f.dfg.block_params(orig_block).iter().enumerate() { + debug!("bb arg {} is {}", i, param); + dst_regs.push(Writable::from_reg(self.value_regs[*param])); + } + debug_assert!(src_regs.len() == dst_regs.len()); + debug_assert!(phi_classes.len() == dst_regs.len()); + + // If, as is mostly the case, the source and destination register + // sets are non overlapping, then we can copy directly, so as to + // save the register allocator work. + if !Set::::from_vec(src_regs.clone()).intersects(&Set::::from_vec( + dst_regs.iter().map(|r| r.to_reg()).collect(), + )) { + for (dst_reg, (src_reg, (ty, _))) in + dst_regs.iter().zip(src_regs.iter().zip(phi_classes)) + { + self.vcode.push(I::gen_move(*dst_reg, *src_reg, ty)); + } + } else { + // There's some overlap, so play safe and copy via temps. + + let tmp_regs: Vec> = phi_classes + .iter() + .map(|&(ty, rc)| self.tmp(rc, ty)) // borrows `self` mutably. + .collect(); + + debug!("phi_temps = {:?}", tmp_regs); + debug_assert!(tmp_regs.len() == src_regs.len()); + + for (tmp_reg, (src_reg, &(ty, _))) in + tmp_regs.iter().zip(src_regs.iter().zip(phi_classes.iter())) + { + self.vcode.push(I::gen_move(*tmp_reg, *src_reg, ty)); + } + for (dst_reg, (tmp_reg, &(ty, _))) in + dst_regs.iter().zip(tmp_regs.iter().zip(phi_classes.iter())) + { + self.vcode.push(I::gen_move(*dst_reg, tmp_reg.to_reg(), ty)); + } + } + + // Create the unconditional jump to the original target block. + self.vcode + .push(I::gen_jump(self.vcode.bb_to_bindex(orig_block))); + + // End the IR inst and block. (We lower this as if it were one IR instruction so that + // we can emit machine instructions in forward order.) + self.vcode.end_ir_inst(); + let blocknum = self.vcode.end_bb(); + assert!(blocknum == edge_block); + } + + // Now that we've emitted all instructions into the VCodeBuilder, let's build the VCode. + self.vcode.build() + } + + /// Reduce the use-count of an IR instruction. Use this when, e.g., isel incorporates the + /// computation of an input instruction directly, so that input instruction has one + /// fewer use. 
+ fn dec_use(&mut self, ir_inst: Inst) { + assert!(self.num_uses[ir_inst] > 0); + self.num_uses[ir_inst] -= 1; + debug!( + "incref: ir_inst {} now has {} uses", + ir_inst, self.num_uses[ir_inst] + ); + } + + /// Increase the use-count of an IR instruction. Use this when, e.g., isel incorporates + /// the computation of an input instruction directly, so that input instruction's + /// inputs are now used directly by the merged instruction. + fn inc_use(&mut self, ir_inst: Inst) { + self.num_uses[ir_inst] += 1; + debug!( + "decref: ir_inst {} now has {} uses", + ir_inst, self.num_uses[ir_inst] + ); + } +} + +impl<'a, I: VCodeInst> LowerCtx for Lower<'a, I> { + /// Get the instdata for a given IR instruction. + fn data(&self, ir_inst: Inst) -> &InstructionData { + &self.f.dfg[ir_inst] + } + + /// Get the controlling type for a polymorphic IR instruction. + fn ty(&self, ir_inst: Inst) -> Type { + self.f.dfg.ctrl_typevar(ir_inst) + } + + /// Emit a machine instruction. + fn emit(&mut self, mach_inst: I) { + self.vcode.push(mach_inst); + } + + /// Indicate that a merge has occurred. + fn merged(&mut self, from_inst: Inst) { + debug!("merged: inst {}", from_inst); + // First, inc-ref all inputs of `from_inst`, because they are now used + // directly by `into_inst`. + for arg in self.f.dfg.inst_args(from_inst) { + let arg = self.f.dfg.resolve_aliases(*arg); + match self.f.dfg.value_def(arg) { + ValueDef::Result(src_inst, _) => { + debug!(" -> inc-reffing src inst {}", src_inst); + self.inc_use(src_inst); + } + _ => {} + } + } + // Then, dec-ref the merged instruction itself. It still retains references + // to its arguments (inc-ref'd above). If its refcount has reached zero, + // it will be skipped during emission and its args will be dec-ref'd at that + // time. + self.dec_use(from_inst); + } + + /// Get the producing instruction, if any, and output number, for the `idx`th input to the + /// given IR instruction. + fn input_inst(&self, ir_inst: Inst, idx: usize) -> Option<(Inst, usize)> { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + match self.f.dfg.value_def(val) { + ValueDef::Result(src_inst, result_idx) => Some((src_inst, result_idx)), + _ => None, + } + } + + /// Map a Value to its associated writable (probably virtual) Reg. + fn value_to_writable_reg(&self, val: Value) -> Writable { + let val = self.f.dfg.resolve_aliases(val); + Writable::from_reg(self.value_regs[val]) + } + + /// Map a Value to its associated (probably virtual) Reg. + fn value_to_reg(&self, val: Value) -> Reg { + let val = self.f.dfg.resolve_aliases(val); + self.value_regs[val] + } + + /// Get the `idx`th input to the given IR instruction as a virtual register. + fn input(&self, ir_inst: Inst, idx: usize) -> Reg { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.value_to_reg(val) + } + + /// Get the `idx`th output of the given IR instruction as a virtual register. + fn output(&self, ir_inst: Inst, idx: usize) -> Writable { + let val = self.f.dfg.inst_results(ir_inst)[idx]; + self.value_to_writable_reg(val) + } + + /// Get a new temp. + fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable { + let v = self.next_vreg; + self.next_vreg += 1; + let vreg = Reg::new_virtual(rc, v); + self.vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ty); + Writable::from_reg(vreg) + } + + /// Get the number of inputs for the given IR instruction. 
+ fn num_inputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_args(ir_inst).len() + } + + /// Get the number of outputs for the given IR instruction. + fn num_outputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_results(ir_inst).len() + } + + /// Get the type for an instruction's input. + fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.f.dfg.value_type(val) + } + + /// Get the type for an instruction's output. + fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type { + self.f.dfg.value_type(self.f.dfg.inst_results(ir_inst)[idx]) + } + + /// Get the number of block params. + fn num_bb_params(&self, bb: Block) -> usize { + self.f.dfg.block_params(bb).len() + } + + /// Get the register for a block param. + fn bb_param(&self, bb: Block, idx: usize) -> Reg { + let val = self.f.dfg.block_params(bb)[idx]; + self.value_regs[val] + } + + /// Get the register for a return value. + fn retval(&self, idx: usize) -> Writable { + Writable::from_reg(self.retval_regs[idx]) + } + + /// Get the target for a call instruction, as an `ExternalName`. + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<&'b ExternalName> { + match &self.f.dfg[ir_inst] { + &InstructionData::Call { func_ref, .. } + | &InstructionData::FuncAddr { func_ref, .. } => { + let funcdata = &self.f.dfg.ext_funcs[func_ref]; + Some(&funcdata.name) + } + _ => None, + } + } + /// Get the signature for a call or call-indirect instruction. + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature> { + match &self.f.dfg[ir_inst] { + &InstructionData::Call { func_ref, .. } => { + let funcdata = &self.f.dfg.ext_funcs[func_ref]; + Some(&self.f.dfg.signatures[funcdata.signature]) + } + &InstructionData::CallIndirect { sig_ref, .. } => Some(&self.f.dfg.signatures[sig_ref]), + _ => None, + } + } + + /// Get the symbol name and offset for a symbol_value instruction. + fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, i64)> { + match &self.f.dfg[ir_inst] { + &InstructionData::UnaryGlobalValue { global_value, .. } => { + let gvdata = &self.f.global_values[global_value]; + match gvdata { + &GlobalValueData::Symbol { + ref name, + ref offset, + .. + } => { + let offset = offset.bits(); + Some((name, offset)) + } + _ => None, + } + } + _ => None, + } + } + + /// Returns the memory flags of a given memory access. + fn memflags(&self, ir_inst: Inst) -> Option { + match &self.f.dfg[ir_inst] { + &InstructionData::Load { flags, .. } + | &InstructionData::LoadComplex { flags, .. } + | &InstructionData::Store { flags, .. } + | &InstructionData::StoreComplex { flags, .. } => Some(flags), + _ => None, + } + } + + /// Get the source location for a given instruction. + fn srcloc(&self, ir_inst: Inst) -> SourceLoc { + self.f.srclocs[ir_inst] + } +} + +fn branch_targets(f: &Function, block: Block, inst: Inst) -> SmallVec<[Block; 16]> { + let mut ret = SmallVec::new(); + if f.dfg[inst].opcode() == Opcode::Fallthrough { + ret.push(f.layout.next_block(block).unwrap()); + } else { + match &f.dfg[inst] { + &InstructionData::Jump { destination, .. } + | &InstructionData::Branch { destination, .. } + | &InstructionData::BranchInt { destination, .. } + | &InstructionData::BranchIcmp { destination, .. } + | &InstructionData::BranchFloat { destination, .. } => { + ret.push(destination); + } + &InstructionData::BranchTable { + destination, table, .. 
+ } => { + ret.push(destination); + for dest in f.jump_tables[table].as_slice() { + ret.push(*dest); + } + } + _ => {} + } + } + ret +} diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs new file mode 100644 index 000000000000..93c9126b320f --- /dev/null +++ b/cranelift/codegen/src/machinst/mod.rs @@ -0,0 +1,288 @@ +//! This module exposes the machine-specific backend definition pieces. +//! +//! The MachInst infrastructure is the compiler backend, from CLIF +//! (ir::Function) to machine code. The purpose of this infrastructure is, at a +//! high level, to do instruction selection/lowering (to machine instructions), +//! register allocation, and then perform all the fixups to branches, constant +//! data references, etc., needed to actually generate machine code. +//! +//! The container for machine instructions, at various stages of construction, +//! is the `VCode` struct. We refer to a sequence of machine instructions organized +//! into basic blocks as "vcode". This is short for "virtual-register code", though +//! it's a bit of a misnomer because near the end of the pipeline, vcode has all +//! real registers. Nevertheless, the name is catchy and we like it. +//! +//! The compilation pipeline, from an `ir::Function` (already optimized as much as +//! you like by machine-independent optimization passes) onward, is as follows. +//! (N.B.: though we show the VCode separately at each stage, the passes +//! mutate the VCode in place; these are not separate copies of the code.) +//! +//! | ir::Function (SSA IR, machine-independent opcodes) +//! | | +//! | | [lower] +//! | | +//! | VCode (machine instructions: +//! | | - mostly virtual registers. +//! | | - cond branches in two-target form. +//! | | - branch targets are block indices. +//! | | - in-memory constants held by insns, +//! | | with unknown offsets. +//! | | - critical edges (actually all edges) +//! | | are split.) +//! | | [regalloc] +//! | | +//! | VCode (machine instructions: +//! | | - all real registers. +//! | | - new instruction sequence returned +//! | | out-of-band in RegAllocResult. +//! | | - instruction sequence has spills, +//! | | reloads, and moves inserted. +//! | | - other invariants same as above.) +//! | | +//! | | [preamble/postamble] +//! | | +//! | VCode (machine instructions: +//! | | - stack-frame size known. +//! | | - out-of-band instruction sequence +//! | | has preamble prepended to entry +//! | | block, and postamble injected before +//! | | every return instruction. +//! | | - all symbolic stack references to +//! | | stackslots and spillslots are resolved +//! | | to concrete FP-offset mem addresses.) +//! | | [block/insn ordering] +//! | | +//! | VCode (machine instructions: +//! | | - vcode.final_block_order is filled in. +//! | | - new insn sequence from regalloc is +//! | | placed back into vcode and block +//! | | boundaries are updated.) +//! | | [redundant branch/block +//! | | removal] +//! | | +//! | VCode (machine instructions: +//! | | - all blocks that were just an +//! | | unconditional branch are removed.) +//! | | +//! | | [branch finalization +//! | | (fallthroughs)] +//! | | +//! | VCode (machine instructions: +//! | | - all branches are in lowered one- +//! | | target form, but targets are still +//! | | block indices.) +//! | | +//! | | [branch finalization +//! | | (offsets)] +//! | | +//! | VCode (machine instructions: +//! | | - all branch offsets from start of +//! | | function are known, and all branches +//! 
| | have resolved-offset targets.) +//! | | +//! | | [MemArg finalization] +//! | | +//! | VCode (machine instructions: +//! | | - all MemArg references to the constant +//! | | pool are replaced with offsets. +//! | | - all constant-pool data is collected +//! | | in the VCode.) +//! | | +//! | | [binary emission] +//! | | +//! | Vec (machine code!) +//! | + +#![allow(unused_imports)] + +use crate::binemit::{ + CodeInfo, CodeOffset, CodeSink, MemoryCodeSink, RelocSink, StackmapSink, TrapSink, +}; +use crate::entity::EntityRef; +use crate::entity::SecondaryMap; +use crate::ir::condcodes::IntCC; +use crate::ir::ValueLocations; +use crate::ir::{DataFlowGraph, Function, Inst, Opcode, Type, Value}; +use crate::isa::RegUnit; +use crate::result::CodegenResult; +use crate::settings::Flags; +use crate::HashMap; +use alloc::boxed::Box; +use alloc::vec::Vec; +use core::fmt::Debug; +use core::iter::Sum; +use regalloc::Map as RegallocMap; +use regalloc::RegUsageCollector; +use regalloc::{RealReg, RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable}; +use smallvec::SmallVec; +use std::hash::Hash; +use std::string::String; +use target_lexicon::Triple; + +pub mod lower; +pub use lower::*; +pub mod vcode; +pub use vcode::*; +pub mod compile; +pub use compile::*; +pub mod blockorder; +pub use blockorder::*; +pub mod abi; +pub use abi::*; +pub mod pp; +pub use pp::*; +pub mod sections; +pub use sections::*; +pub mod adapter; +pub use adapter::*; + +/// A machine instruction. +pub trait MachInst: Clone + Debug { + /// Return the registers referenced by this machine instruction along with + /// the modes of reference (use, def, modify). + fn get_regs(&self, collector: &mut RegUsageCollector); + + /// Map virtual registers to physical registers using the given virt->phys + /// maps corresponding to the program points prior to, and after, this instruction. + fn map_regs( + &mut self, + pre_map: &RegallocMap, + post_map: &RegallocMap, + ); + + /// If this is a simple move, return the (source, destination) tuple of registers. + fn is_move(&self) -> Option<(Writable, Reg)>; + + /// Is this a terminator (branch or ret)? If so, return its type + /// (ret/uncond/cond) and target if applicable. + fn is_term<'a>(&'a self) -> MachTerminator<'a>; + + /// Returns true if the instruction is an epilogue placeholder. + fn is_epilogue_placeholder(&self) -> bool; + + /// Generate a move. + fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Self; + + /// Generate a zero-length no-op. + fn gen_zero_len_nop() -> Self; + + /// Possibly operate on a value directly in a spill-slot rather than a + /// register. Useful if the machine has register-memory instruction forms + /// (e.g., add directly from or directly to memory), like x86. + fn maybe_direct_reload(&self, reg: VirtualReg, slot: SpillSlot) -> Option; + + /// Determine a register class to store the given CraneLift type. + fn rc_for_type(ty: Type) -> RegClass; + + /// Generate a jump to another target. Used during lowering of + /// control flow. + fn gen_jump(target: BlockIndex) -> Self; + + /// Generate a NOP. The `preferred_size` parameter allows the caller to + /// request a NOP of that size, or as close to it as possible. The machine + /// backend may return a NOP whose binary encoding is smaller than the + /// preferred size, but must not return a NOP that is larger. However, + /// the instruction must have a nonzero size. + fn gen_nop(preferred_size: usize) -> Self; + + /// Rewrite block targets using the block-target map. 
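+ ///
+ /// As an illustrative sketch (the `Jump` variant here is hypothetical, not
+ /// part of any real instruction set in this patch), a single-target jump
+ /// might implement this by indexing the map with its old target:
+ ///
+ /// ```ignore
+ /// Inst::Jump { ref mut target } => {
+ ///     // `block_target_map[i]` is the new index for old block `i`.
+ ///     *target = block_target_map[*target as usize];
+ /// }
+ /// ```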
+ fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]); + + /// Finalize branches once the block order (fallthrough) is known. + fn with_fallthrough_block(&mut self, fallthrough_block: Option); + + /// Update instruction once block offsets are known. These offsets are + /// relative to the beginning of the function. `targets` is indexed by + /// BlockIndex. + fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]); + + /// Get the register universe for this backend. + fn reg_universe() -> RealRegUniverse; + + /// Align a basic block offset (from start of function). By default, no + /// alignment occurs. + fn align_basic_block(offset: CodeOffset) -> CodeOffset { + offset + } +} + +/// Describes a block terminator (not call) in the vcode, when its branches +/// have not yet been finalized (so a branch may have two targets). +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum MachTerminator<'a> { + /// Not a terminator. + None, + /// A return instruction. + Ret, + /// An unconditional branch to another block. + Uncond(BlockIndex), + /// A conditional branch to one of two other blocks. + Cond(BlockIndex, BlockIndex), + /// An indirect branch with known possible targets. + Indirect(&'a [BlockIndex]), +} + +/// A trait describing the ability to encode a MachInst into binary machine code. +pub trait MachInstEmit { + /// Emit the instruction. + fn emit(&self, code: &mut O); +} + +/// The result of a `MachBackend::compile_function()` call. Contains machine +/// code (as bytes) and a disassembly, if requested. +pub struct MachCompileResult { + /// Machine code. + pub sections: MachSections, + /// Size of stack frame, in bytes. + pub frame_size: u32, + /// Disassembly, if requested. + pub disasm: Option, +} + +impl MachCompileResult { + /// Get a `CodeInfo` describing section sizes from this compilation result. + pub fn code_info(&self) -> CodeInfo { + let code_size = self.sections.total_size(); + CodeInfo { + code_size, + jumptables_size: 0, + rodata_size: 0, + total_size: code_size, + } + } +} + +/// Top-level machine backend trait, which wraps all monomorphized code and +/// allows a virtual call from the machine-independent `Function::compile()`. +pub trait MachBackend { + /// Compile the given function. Consumes the function. + fn compile_function( + &self, + func: Function, + want_disasm: bool, + ) -> CodegenResult; + + /// Return flags for this backend. + fn flags(&self) -> &Flags; + + /// Return triple for this backend. + fn triple(&self) -> Triple; + + /// Return name for this backend. + fn name(&self) -> &'static str; + + /// Return the register universe for this backend. + fn reg_universe(&self) -> RealRegUniverse; + + /// Machine-specific condcode info needed by TargetIsa. + fn unsigned_add_overflow_condition(&self) -> IntCC { + // TODO: this is what x86 specifies. Is this right for arm64? + IntCC::UnsignedLessThan + } + + /// Machine-specific condcode info needed by TargetIsa. + fn unsigned_sub_overflow_condition(&self) -> IntCC { + // TODO: this is what x86 specifies. Is this right for arm64? + IntCC::UnsignedLessThan + } +} diff --git a/cranelift/codegen/src/machinst/pp.rs b/cranelift/codegen/src/machinst/pp.rs new file mode 100644 index 000000000000..40e7c1b84298 --- /dev/null +++ b/cranelift/codegen/src/machinst/pp.rs @@ -0,0 +1,66 @@ +//! Pretty-printing for machine code (virtual-registerized or final). 
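+//!
+//! As a brief illustrative sketch (assuming a register `reg` and a
+//! `RealRegUniverse` named `rru` are in scope):
+//!
+//! ```ignore
+//! // Real registers print with their universe-assigned names; virtual
+//! // registers (or a missing universe) fall back to a generic `%`-form.
+//! let pretty = reg.show_rru(Some(&rru));
+//! ```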
+ +use regalloc::{RealRegUniverse, Reg, Writable}; + +use std::fmt::Debug; +use std::hash::Hash; +use std::string::{String, ToString}; + +// FIXME: Should this go into regalloc.rs instead? + +/// A trait for printing instruction bits and pieces, with the the ability to +/// take a contextualising RealRegUniverse that is used to give proper names to +/// registers. +pub trait ShowWithRRU { + /// Return a string that shows the implementing object in context of the + /// given `RealRegUniverse`, if provided. + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String; + + /// The same as |show_rru|, but with an optional hint giving a size in + /// bytes. Its interpretation is object-dependent, and it is intended to + /// pass around enough information to facilitate printing sub-parts of + /// real registers correctly. Objects may ignore size hints that are + /// irrelevant to them. + fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, _size: u8) -> String { + // Default implementation is to ignore the hint. + self.show_rru(mb_rru) + } +} + +impl ShowWithRRU for Reg { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + if self.is_real() { + if let Some(rru) = mb_rru { + let reg_ix = self.get_index(); + if reg_ix < rru.regs.len() { + return rru.regs[reg_ix].1.to_string(); + } else { + // We have a real reg which isn't listed in the universe. + // Per the regalloc.rs interface requirements, this is + // Totally Not Allowed. Print it generically anyway, so + // we have something to debug. + return format!("!!{:?}!!", self); + } + } + } + // The reg is virtual, or we have no universe. Be generic. + format!("%{:?}", self) + } + + fn show_rru_sized(&self, _mb_rru: Option<&RealRegUniverse>, _size: u8) -> String { + // For the specific case of Reg, we demand not to have a size hint, + // since interpretation of the size is target specific, but this code + // is used by all targets. + panic!("Reg::show_rru_sized: impossible to implement"); + } +} + +impl ShowWithRRU for Writable { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + self.to_reg().show_rru(mb_rru) + } + + fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String { + self.to_reg().show_rru_sized(mb_rru, size) + } +} diff --git a/cranelift/codegen/src/machinst/sections.rs b/cranelift/codegen/src/machinst/sections.rs new file mode 100644 index 000000000000..3e387239d074 --- /dev/null +++ b/cranelift/codegen/src/machinst/sections.rs @@ -0,0 +1,351 @@ +//! In-memory representation of compiled machine code, in multiple sections +//! (text, constant pool / rodata, etc). Emission occurs into multiple sections +//! simultaneously, so we buffer the result in memory and hand off to the +//! caller at the end of compilation. + +use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc, RelocSink, StackmapSink, TrapSink}; +use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; + +use alloc::vec::Vec; + +/// A collection of sections with defined start-offsets. +pub struct MachSections { + /// Sections, in offset order. + pub sections: Vec, +} + +impl MachSections { + /// New, empty set of sections. + pub fn new() -> MachSections { + MachSections { sections: vec![] } + } + + /// Add a section with a known offset and size. Returns the index. + pub fn add_section(&mut self, start: CodeOffset, length: CodeOffset) -> usize { + let idx = self.sections.len(); + self.sections.push(MachSection::new(start, length)); + idx + } + + /// Mutably borrow the given section by index. 
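+ ///
+ /// Illustrative sketch (offsets and sizes are made up for the example):
+ ///
+ /// ```ignore
+ /// let mut sections = MachSections::new();
+ /// // A code section starting at offset 0 with a 0x1000-byte limit.
+ /// let text = sections.add_section(0, 0x1000);
+ /// sections.get_section(text).put4(0xd503201f); // AArch64 NOP encoding
+ /// ```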
+ pub fn get_section<'a>(&'a mut self, idx: usize) -> &'a mut MachSection { + &mut self.sections[idx] + } + + /// Get mutable borrows of two sections simultaneously. Used during + /// instruction emission to provide references to the .text and .rodata + /// (constant pool) sections. + pub fn two_sections<'a>( + &'a mut self, + idx1: usize, + idx2: usize, + ) -> (&'a mut MachSection, &'a mut MachSection) { + assert!(idx1 < idx2); + assert!(idx1 < self.sections.len()); + assert!(idx2 < self.sections.len()); + let (first, rest) = self.sections.split_at_mut(idx2); + (&mut first[idx1], &mut rest[0]) + } + + /// Emit this set of sections to a set of sinks for the code, + /// relocations, traps, and stackmap. + pub fn emit(&self, sink: &mut CS) { + // N.B.: we emit every section into the .text section as far as + // the `CodeSink` is concerned; we do not bother to segregate + // the contents into the actual program text, the jumptable and the + // rodata (constant pool). This allows us to generate code assuming + // that these will not be relocated relative to each other, and avoids + // having to designate each section as belonging in one of the three + // fixed categories defined by `CodeSink`. If this becomes a problem + // later (e.g. because of memory permissions or similar), we can + // add this designation and segregate the output; take care, however, + // to add the appropriate relocations in this case. + + for section in &self.sections { + if section.data.len() > 0 { + while sink.offset() < section.start_offset { + sink.put1(0); + } + section.emit(sink); + } + } + sink.begin_jumptables(); + sink.begin_rodata(); + sink.end_codegen(); + } + + /// Get the total required size for these sections. + pub fn total_size(&self) -> CodeOffset { + if self.sections.len() == 0 { + 0 + } else { + // Find the last non-empty section. + self.sections + .iter() + .rev() + .find(|s| s.data.len() > 0) + .map(|s| s.cur_offset_from_start()) + .unwrap_or(0) + } + } +} + +/// An abstraction over MachSection and MachSectionSize: some +/// receiver of section data. +pub trait MachSectionOutput { + /// Get the current offset from the start of all sections. + fn cur_offset_from_start(&self) -> CodeOffset; + + /// Get the start offset of this section. + fn start_offset(&self) -> CodeOffset; + + /// Add 1 byte to the section. + fn put1(&mut self, _: u8); + + /// Add 2 bytes to the section. + fn put2(&mut self, value: u16) { + self.put1((value & 0xff) as u8); + self.put1(((value >> 8) & 0xff) as u8); + } + + /// Add 4 bytes to the section. + fn put4(&mut self, value: u32) { + self.put1((value & 0xff) as u8); + self.put1(((value >> 8) & 0xff) as u8); + self.put1(((value >> 16) & 0xff) as u8); + self.put1(((value >> 24) & 0xff) as u8); + } + + /// Add 8 bytes to the section. + fn put8(&mut self, value: u64) { + self.put1((value & 0xff) as u8); + self.put1(((value >> 8) & 0xff) as u8); + self.put1(((value >> 16) & 0xff) as u8); + self.put1(((value >> 24) & 0xff) as u8); + self.put1(((value >> 32) & 0xff) as u8); + self.put1(((value >> 40) & 0xff) as u8); + self.put1(((value >> 48) & 0xff) as u8); + self.put1(((value >> 56) & 0xff) as u8); + } + + /// Add a slice of bytes to the section. + fn put_data(&mut self, data: &[u8]); + + /// Add a relocation at the current offset. + fn add_reloc(&mut self, loc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend); + + /// Add a trap record at the current offset. 
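+ ///
+ /// For example (illustrative; the encoding bits are hypothetical), a
+ /// backend records the trap at the offset of the trapping instruction and
+ /// then emits its bytes:
+ ///
+ /// ```ignore
+ /// section.add_trap(srcloc, TrapCode::HeapOutOfBounds);
+ /// section.put4(trap_encoding);
+ /// ```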
+ fn add_trap(&mut self, loc: SourceLoc, code: TrapCode); + + /// Add a call return address record at the current offset. + fn add_call_site(&mut self, loc: SourceLoc, opcode: Opcode); + + /// Align up to the given alignment. + fn align_to(&mut self, align_to: CodeOffset) { + assert!(align_to.is_power_of_two()); + while self.cur_offset_from_start() & (align_to - 1) != 0 { + self.put1(0); + } + } +} + +/// A section of output to be emitted to a CodeSink / RelocSink in bulk. +/// Multiple sections may be created with known start offsets in advance; the +/// usual use-case is to create the .text (code) and .rodata (constant pool) at +/// once, after computing the length of the code, so that constant references +/// can use known offsets as instructions are emitted. +pub struct MachSection { + /// The starting offset of this section. + pub start_offset: CodeOffset, + /// The limit of this section, defined by the start of the next section. + pub length_limit: CodeOffset, + /// The section contents, as raw bytes. + pub data: Vec, + /// Any relocations referring to this section. + pub relocs: Vec, + /// Any trap records referring to this section. + pub traps: Vec, + /// Any call site record referring to this section. + pub call_sites: Vec, +} + +impl MachSection { + /// Create a new section, known to start at `start_offset` and with a size limited to `length_limit`. + pub fn new(start_offset: CodeOffset, length_limit: CodeOffset) -> MachSection { + MachSection { + start_offset, + length_limit, + data: vec![], + relocs: vec![], + traps: vec![], + call_sites: vec![], + } + } + + /// Emit this section to the CodeSink and other associated sinks. The + /// current offset of the CodeSink must match the starting offset of this + /// section. + pub fn emit(&self, sink: &mut CS) { + assert!(sink.offset() == self.start_offset); + + let mut next_reloc = 0; + let mut next_trap = 0; + let mut next_call_site = 0; + for (idx, byte) in self.data.iter().enumerate() { + if next_reloc < self.relocs.len() { + let reloc = &self.relocs[next_reloc]; + if reloc.offset == idx as CodeOffset { + sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend); + next_reloc += 1; + } + } + if next_trap < self.traps.len() { + let trap = &self.traps[next_trap]; + if trap.offset == idx as CodeOffset { + sink.trap(trap.code, trap.srcloc); + next_trap += 1; + } + } + if next_call_site < self.call_sites.len() { + let call_site = &self.call_sites[next_call_site]; + if call_site.ret_addr == idx as CodeOffset { + sink.add_call_site(call_site.opcode, call_site.srcloc); + next_call_site += 1; + } + } + sink.put1(*byte); + } + } +} + +impl MachSectionOutput for MachSection { + fn cur_offset_from_start(&self) -> CodeOffset { + self.start_offset + self.data.len() as CodeOffset + } + + fn start_offset(&self) -> CodeOffset { + self.start_offset + } + + fn put1(&mut self, value: u8) { + assert!(((self.data.len() + 1) as CodeOffset) <= self.length_limit); + self.data.push(value); + } + + fn put_data(&mut self, data: &[u8]) { + assert!(((self.data.len() + data.len()) as CodeOffset) <= self.length_limit); + self.data.extend_from_slice(data); + } + + fn add_reloc(&mut self, srcloc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend) { + let name = name.clone(); + self.relocs.push(MachReloc { + offset: self.data.len() as CodeOffset, + srcloc, + kind, + name, + addend, + }); + } + + fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) { + self.traps.push(MachTrap { + offset: self.data.len() as CodeOffset, + srcloc, + code, + 
}); + } + + fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) { + self.call_sites.push(MachCallSite { + ret_addr: self.data.len() as CodeOffset, + srcloc, + opcode, + }); + } +} + +/// A MachSectionOutput implementation that records only size. +pub struct MachSectionSize { + /// The starting offset of this section. + pub start_offset: CodeOffset, + /// The current offset of this section. + pub offset: CodeOffset, +} + +impl MachSectionSize { + /// Create a new size-counting dummy section. + pub fn new(start_offset: CodeOffset) -> MachSectionSize { + MachSectionSize { + start_offset, + offset: start_offset, + } + } + + /// Return the size this section would take if emitted with a real sink. + pub fn size(&self) -> CodeOffset { + self.offset - self.start_offset + } +} + +impl MachSectionOutput for MachSectionSize { + fn cur_offset_from_start(&self) -> CodeOffset { + // All size-counting sections conceptually start at offset 0; this doesn't + // matter when counting code size. + self.offset + } + + fn start_offset(&self) -> CodeOffset { + self.start_offset + } + + fn put1(&mut self, _: u8) { + self.offset += 1; + } + + fn put_data(&mut self, data: &[u8]) { + self.offset += data.len() as CodeOffset; + } + + fn add_reloc(&mut self, _: SourceLoc, _: Reloc, _: &ExternalName, _: Addend) {} + + fn add_trap(&mut self, _: SourceLoc, _: TrapCode) {} + + fn add_call_site(&mut self, _: SourceLoc, _: Opcode) {} +} + +/// A relocation resulting from a compilation. +pub struct MachReloc { + /// The offset at which the relocation applies, *relative to the + /// containing section*. + pub offset: CodeOffset, + /// The original source location. + pub srcloc: SourceLoc, + /// The kind of relocation. + pub kind: Reloc, + /// The external symbol / name to which this relocation refers. + pub name: ExternalName, + /// The addend to add to the symbol value. + pub addend: i64, +} + +/// A trap record resulting from a compilation. +pub struct MachTrap { + /// The offset at which the trap instruction occurs, *relative to the + /// containing section*. + pub offset: CodeOffset, + /// The original source location. + pub srcloc: SourceLoc, + /// The trap code. + pub code: TrapCode, +} + +/// A call site record resulting from a compilation. +pub struct MachCallSite { + /// The offset of the call's return address, *relative to the containing section*. + pub ret_addr: CodeOffset, + /// The original source location. + pub srcloc: SourceLoc, + /// The call's opcode. + pub opcode: Opcode, +} diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs new file mode 100644 index 000000000000..64b1a4012af8 --- /dev/null +++ b/cranelift/codegen/src/machinst/vcode.rs @@ -0,0 +1,738 @@ +//! This implements the VCode container: a CFG of Insts that have been lowered. +//! +//! VCode is virtual-register code. An instruction in VCode is almost a machine +//! instruction; however, its register slots can refer to virtual registers in +//! addition to real machine registers. +//! +//! VCode is structured with traditional basic blocks, and +//! each block must be terminated by an unconditional branch (one target), a +//! conditional branch (two targets), or a return (no targets). Note that this +//! slightly differs from the machine code of most ISAs: in most ISAs, a +//! conditional branch has one target (and the not-taken case falls through). +//! However, we expect that machine backends will elide branches to the following +//! 
block (i.e., zero-offset jumps), and will be able to codegen a branch-cond / +//! branch-uncond pair if *both* targets are not fallthrough. This allows us to +//! play with layout prior to final binary emission, as well, if we want. +//! +//! See the main module comment in `mod.rs` for more details on the VCode-based +//! backend pipeline. + +use crate::binemit::Reloc; +use crate::ir; +use crate::machinst::*; +use crate::settings; + +use regalloc::Function as RegallocFunction; +use regalloc::Set as RegallocSet; +use regalloc::{BlockIx, InstIx, Range, RegAllocResult, RegClass, RegUsageCollector}; + +use alloc::boxed::Box; +use alloc::vec::Vec; +use log::debug; +use smallvec::SmallVec; +use std::fmt; +use std::iter; +use std::ops::Index; +use std::string::String; + +/// Index referring to an instruction in VCode. +pub type InsnIndex = u32; +/// Index referring to a basic block in VCode. +pub type BlockIndex = u32; + +/// VCodeInst wraps all requirements for a MachInst to be in VCode: it must be +/// a `MachInst` and it must be able to emit itself at least to a `SizeCodeSink`. +pub trait VCodeInst: MachInst + MachInstEmit + MachInstEmit {} +impl + MachInstEmit> VCodeInst for I {} + +/// A function in "VCode" (virtualized-register code) form, after lowering. +/// This is essentially a standard CFG of basic blocks, where each basic block +/// consists of lowered instructions produced by the machine-specific backend. +pub struct VCode { + /// Function liveins. + liveins: RegallocSet, + + /// Function liveouts. + liveouts: RegallocSet, + + /// VReg IR-level types. + vreg_types: Vec, + + /// Lowered machine instructions in order corresponding to the original IR. + pub insts: Vec, + + /// Entry block. + entry: BlockIndex, + + /// Block instruction indices. + pub block_ranges: Vec<(InsnIndex, InsnIndex)>, + + /// Block successors: index range in the successor-list below. + block_succ_range: Vec<(usize, usize)>, + + /// Block successor lists, concatenated into one Vec. The `block_succ_range` + /// list of tuples above gives (start, end) ranges within this list that + /// correspond to each basic block's successors. + block_succs: Vec, + + /// Block indices by IR block. + block_by_bb: SecondaryMap, + + /// IR block for each VCode Block. The length of this Vec will likely be + /// less than the total number of Blocks, because new Blocks (for edge + /// splits, for example) are appended during lowering. + bb_by_block: Vec, + + /// Order of block IDs in final generated code. + final_block_order: Vec, + + /// Final block offsets. Computed during branch finalization and used + /// during emission. + final_block_offsets: Vec, + + /// Size of code, accounting for block layout / alignment. + code_size: CodeOffset, + + /// ABI object. + abi: Box>, +} + +/// A builder for a VCode function body. This builder is designed for the +/// lowering approach that we take: we traverse basic blocks in forward +/// (original IR) order, but within each basic block, we generate code from +/// bottom to top; and within each IR instruction that we visit in this reverse +/// order, we emit machine instructions in *forward* order again. +/// +/// Hence, to produce the final instructions in proper order, we perform two +/// swaps. First, the machine instructions (`I` instances) are produced in +/// forward order for an individual IR instruction. Then these are *reversed* +/// and concatenated to `bb_insns` at the end of the IR instruction lowering. 
+/// The `bb_insns` vec will thus contain all machine instructions for a basic +/// block, in reverse order. Finally, when we're done with a basic block, we +/// reverse the whole block's vec of instructions again, and concatenate onto +/// the VCode's insts. +pub struct VCodeBuilder { + /// In-progress VCode. + vcode: VCode, + + /// Current basic block instructions, in reverse order (because blocks are + /// built bottom-to-top). + bb_insns: SmallVec<[I; 32]>, + + /// Current IR-inst instructions, in forward order. + ir_inst_insns: SmallVec<[I; 4]>, + + /// Start of succs for the current block in the concatenated succs list. + succ_start: usize, +} + +impl VCodeBuilder { + /// Create a new VCodeBuilder. + pub fn new(abi: Box>) -> VCodeBuilder { + let vcode = VCode::new(abi); + VCodeBuilder { + vcode, + bb_insns: SmallVec::new(), + ir_inst_insns: SmallVec::new(), + succ_start: 0, + } + } + + /// Access the ABI object. + pub fn abi(&mut self) -> &mut dyn ABIBody { + &mut *self.vcode.abi + } + + /// Set the type of a VReg. + pub fn set_vreg_type(&mut self, vreg: VirtualReg, ty: Type) { + while self.vcode.vreg_types.len() <= vreg.get_index() { + self.vcode.vreg_types.push(ir::types::I8); // Default type. + } + self.vcode.vreg_types[vreg.get_index()] = ty; + } + + /// Return the underlying bb-to-BlockIndex map. + pub fn blocks_by_bb(&self) -> &SecondaryMap { + &self.vcode.block_by_bb + } + + /// Initialize the bb-to-BlockIndex map. Returns the first free + /// BlockIndex. + pub fn init_bb_map(&mut self, blocks: &[ir::Block]) -> BlockIndex { + let mut bindex: BlockIndex = 0; + for bb in blocks.iter() { + self.vcode.block_by_bb[*bb] = bindex; + self.vcode.bb_by_block.push(*bb); + bindex += 1; + } + bindex + } + + /// Get the BlockIndex for an IR block. + pub fn bb_to_bindex(&self, bb: ir::Block) -> BlockIndex { + self.vcode.block_by_bb[bb] + } + + /// Set the current block as the entry block. + pub fn set_entry(&mut self, block: BlockIndex) { + self.vcode.entry = block; + } + + /// End the current IR instruction. Must be called after pushing any + /// instructions and prior to ending the basic block. + pub fn end_ir_inst(&mut self) { + while let Some(i) = self.ir_inst_insns.pop() { + self.bb_insns.push(i); + } + } + + /// End the current basic block. Must be called after emitting vcode insts + /// for IR insts and prior to ending the function (building the VCode). + pub fn end_bb(&mut self) -> BlockIndex { + assert!(self.ir_inst_insns.is_empty()); + let block_num = self.vcode.block_ranges.len() as BlockIndex; + // Push the instructions. + let start_idx = self.vcode.insts.len() as InsnIndex; + while let Some(i) = self.bb_insns.pop() { + self.vcode.insts.push(i); + } + let end_idx = self.vcode.insts.len() as InsnIndex; + // Add the instruction index range to the list of blocks. + self.vcode.block_ranges.push((start_idx, end_idx)); + // End the successors list. + let succ_end = self.vcode.block_succs.len(); + self.vcode + .block_succ_range + .push((self.succ_start, succ_end)); + self.succ_start = succ_end; + + block_num + } + + /// Push an instruction for the current BB and current IR inst within the BB. 
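+ ///
+ /// Illustrative sketch of the expected calling protocol (the machine
+ /// instructions themselves are placeholders):
+ ///
+ /// ```ignore
+ /// // Instructions for one IR inst are pushed in *forward* order...
+ /// builder.push(first_machine_inst);
+ /// builder.push(second_machine_inst);
+ /// builder.end_ir_inst();
+ /// // ...and the block is sealed after all of its IR insts are lowered.
+ /// let bindex = builder.end_bb();
+ /// ```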
+ pub fn push(&mut self, insn: I) { + match insn.is_term() { + MachTerminator::None | MachTerminator::Ret => {} + MachTerminator::Uncond(target) => { + self.vcode.block_succs.push(target); + } + MachTerminator::Cond(true_branch, false_branch) => { + self.vcode.block_succs.push(true_branch); + self.vcode.block_succs.push(false_branch); + } + MachTerminator::Indirect(targets) => { + for target in targets { + self.vcode.block_succs.push(*target); + } + } + } + self.ir_inst_insns.push(insn); + } + + /// Build the final VCode. + pub fn build(self) -> VCode { + assert!(self.ir_inst_insns.is_empty()); + assert!(self.bb_insns.is_empty()); + self.vcode + } +} + +fn block_ranges(indices: &[InstIx], len: usize) -> Vec<(usize, usize)> { + let v = indices + .iter() + .map(|iix| iix.get() as usize) + .chain(iter::once(len)) + .collect::>(); + v.windows(2).map(|p| (p[0], p[1])).collect() +} + +fn is_redundant_move(insn: &I) -> bool { + if let Some((to, from)) = insn.is_move() { + to.to_reg() == from + } else { + false + } +} + +fn is_trivial_jump_block(vcode: &VCode, block: BlockIndex) -> Option { + let range = vcode.block_insns(BlockIx::new(block)); + + debug!( + "is_trivial_jump_block: block {} has len {}", + block, + range.len() + ); + + if range.len() != 1 { + return None; + } + let insn = range.first(); + + debug!( + " -> only insn is: {:?} with terminator {:?}", + vcode.get_insn(insn), + vcode.get_insn(insn).is_term() + ); + + match vcode.get_insn(insn).is_term() { + MachTerminator::Uncond(target) => Some(target), + _ => None, + } +} + +impl VCode { + /// New empty VCode. + fn new(abi: Box>) -> VCode { + VCode { + liveins: abi.liveins(), + liveouts: abi.liveouts(), + vreg_types: vec![], + insts: vec![], + entry: 0, + block_ranges: vec![], + block_succ_range: vec![], + block_succs: vec![], + block_by_bb: SecondaryMap::with_default(0), + bb_by_block: vec![], + final_block_order: vec![], + final_block_offsets: vec![], + code_size: 0, + abi, + } + } + + /// Get the IR-level type of a VReg. + pub fn vreg_type(&self, vreg: VirtualReg) -> Type { + self.vreg_types[vreg.get_index()] + } + + /// Get the entry block. + pub fn entry(&self) -> BlockIndex { + self.entry + } + + /// Get the number of blocks. Block indices will be in the range `0 .. + /// (self.num_blocks() - 1)`. + pub fn num_blocks(&self) -> usize { + self.block_ranges.len() + } + + /// Stack frame size for the full function's body. + pub fn frame_size(&self) -> u32 { + self.abi.frame_size() + } + + /// Get the successors for a block. + pub fn succs(&self, block: BlockIndex) -> &[BlockIndex] { + let (start, end) = self.block_succ_range[block as usize]; + &self.block_succs[start..end] + } + + /// Take the results of register allocation, with a sequence of + /// instructions including spliced fill/reload/move instructions, and replace + /// the VCode with them. + pub fn replace_insns_from_regalloc( + &mut self, + result: RegAllocResult, + flags: &settings::Flags, + ) { + self.final_block_order = compute_final_block_order(self); + + // Record the spillslot count and clobbered registers for the ABI/stack + // setup code. + self.abi.set_num_spillslots(result.num_spill_slots as usize); + self.abi + .set_clobbered(result.clobbered_registers.map(|r| Writable::from_reg(*r))); + + // We want to move instructions over in final block order, using the new + // block-start map given by the regalloc. 
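+ // `result.target_map` gives the index of the first instruction of each
+ // block in the regalloc's output sequence; `block_ranges()` pairs each
+ // start with the next start (or the sequence length) to form ranges.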
+ let block_ranges: Vec<(usize, usize)> = + block_ranges(result.target_map.elems(), result.insns.len()); + let mut final_insns = vec![]; + let mut final_block_ranges = vec![(0, 0); self.num_blocks()]; + + for block in &self.final_block_order { + let (start, end) = block_ranges[*block as usize]; + let final_start = final_insns.len() as InsnIndex; + + if *block == self.entry { + // Start with the prologue. + final_insns.extend(self.abi.gen_prologue(flags).into_iter()); + } + + for i in start..end { + let insn = &result.insns[i]; + + // Elide redundant moves at this point (we only know what is + // redundant once registers are allocated). + if is_redundant_move(insn) { + continue; + } + + // Whenever encountering a return instruction, replace it + // with the epilogue. + let is_ret = insn.is_term() == MachTerminator::Ret; + if is_ret { + final_insns.extend(self.abi.gen_epilogue(flags).into_iter()); + } else { + final_insns.push(insn.clone()); + } + } + + let final_end = final_insns.len() as InsnIndex; + final_block_ranges[*block as usize] = (final_start, final_end); + } + + self.insts = final_insns; + self.block_ranges = final_block_ranges; + } + + /// Removes redundant branches, rewriting targets to point directly to the + /// ultimate block at the end of a chain of trivial one-target jumps. + pub fn remove_redundant_branches(&mut self) { + // For each block, compute the actual target block, looking through up to one + // block with single-target jumps (this will remove empty edge blocks inserted + // by phi-lowering). + let block_rewrites: Vec = (0..self.num_blocks() as u32) + .map(|bix| is_trivial_jump_block(self, bix).unwrap_or(bix)) + .collect(); + let mut refcounts: Vec = vec![0; self.num_blocks()]; + + debug!( + "remove_redundant_branches: block_rewrites = {:?}", + block_rewrites + ); + + refcounts[self.entry as usize] = 1; + + for block in 0..self.num_blocks() as u32 { + for insn in self.block_insns(BlockIx::new(block)) { + self.get_insn_mut(insn) + .with_block_rewrites(&block_rewrites[..]); + match self.get_insn(insn).is_term() { + MachTerminator::Uncond(bix) => { + refcounts[bix as usize] += 1; + } + MachTerminator::Cond(bix1, bix2) => { + refcounts[bix1 as usize] += 1; + refcounts[bix2 as usize] += 1; + } + MachTerminator::Indirect(blocks) => { + for block in blocks { + refcounts[*block as usize] += 1; + } + } + _ => {} + } + } + } + + let deleted: Vec = refcounts.iter().map(|r| *r == 0).collect(); + + let block_order = std::mem::replace(&mut self.final_block_order, vec![]); + self.final_block_order = block_order + .into_iter() + .filter(|b| !deleted[*b as usize]) + .collect(); + + // Rewrite successor information based on the block-rewrite map. + for succ in &mut self.block_succs { + let new_succ = block_rewrites[*succ as usize]; + *succ = new_succ; + } + } + + /// Mutate branch instructions to (i) lower two-way condbrs to one-way, + /// depending on fallthrough; and (ii) use concrete offsets. + pub fn finalize_branches(&mut self) + where + I: MachInstEmit, + { + // Compute fallthrough block, indexed by block. + let num_final_blocks = self.final_block_order.len(); + let mut block_fallthrough: Vec> = vec![None; self.num_blocks()]; + for i in 0..(num_final_blocks - 1) { + let from = self.final_block_order[i]; + let to = self.final_block_order[i + 1]; + block_fallthrough[from as usize] = Some(to); + } + + // Pass over VCode instructions and finalize two-way branches into + // one-way branches with fallthrough. 
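+ // A two-target conditional branch whose not-taken (or taken) target is
+ // the physically-next block can drop that target and simply fall through.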
+ for block in 0..self.num_blocks() { + let next_block = block_fallthrough[block]; + let (start, end) = self.block_ranges[block]; + + for iix in start..end { + let insn = &mut self.insts[iix as usize]; + insn.with_fallthrough_block(next_block); + } + } + + // Compute block offsets. + let mut code_section = MachSectionSize::new(0); + let mut block_offsets = vec![0; self.num_blocks()]; + for block in &self.final_block_order { + code_section.offset = I::align_basic_block(code_section.offset); + block_offsets[*block as usize] = code_section.offset; + let (start, end) = self.block_ranges[*block as usize]; + for iix in start..end { + self.insts[iix as usize].emit(&mut code_section); + } + } + + // We now have the section layout. + self.final_block_offsets = block_offsets; + self.code_size = code_section.size(); + + // Update branches with known block offsets. This looks like the + // traversal above, but (i) does not update block_offsets, rather uses + // it (so forward references are now possible), and (ii) mutates the + // instructions. + let mut code_section = MachSectionSize::new(0); + for block in &self.final_block_order { + code_section.offset = I::align_basic_block(code_section.offset); + let (start, end) = self.block_ranges[*block as usize]; + for iix in start..end { + self.insts[iix as usize] + .with_block_offsets(code_section.offset, &self.final_block_offsets[..]); + self.insts[iix as usize].emit(&mut code_section); + } + } + } + + /// Emit the instructions to a list of sections. + pub fn emit(&self) -> MachSections + where + I: MachInstEmit, + { + let mut sections = MachSections::new(); + let code_idx = sections.add_section(0, self.code_size); + let code_section = sections.get_section(code_idx); + + for block in &self.final_block_order { + let new_offset = I::align_basic_block(code_section.cur_offset_from_start()); + while new_offset > code_section.cur_offset_from_start() { + // Pad with NOPs up to the aligned block offset. + let nop = I::gen_nop((new_offset - code_section.cur_offset_from_start()) as usize); + nop.emit(code_section); + } + assert_eq!(code_section.cur_offset_from_start(), new_offset); + + let (start, end) = self.block_ranges[*block as usize]; + for iix in start..end { + self.insts[iix as usize].emit(code_section); + } + } + + sections + } + + /// Get the IR block for a BlockIndex, if one exists. + pub fn bindex_to_bb(&self, block: BlockIndex) -> Option { + if (block as usize) < self.bb_by_block.len() { + Some(self.bb_by_block[block as usize]) + } else { + None + } + } +} + +impl RegallocFunction for VCode { + type Inst = I; + + fn insns(&self) -> &[I] { + &self.insts[..] + } + + fn insns_mut(&mut self) -> &mut [I] { + &mut self.insts[..] 
+ } + + fn get_insn(&self, insn: InstIx) -> &I { + &self.insts[insn.get() as usize] + } + + fn get_insn_mut(&mut self, insn: InstIx) -> &mut I { + &mut self.insts[insn.get() as usize] + } + + fn blocks(&self) -> Range { + Range::new(BlockIx::new(0), self.block_ranges.len()) + } + + fn entry_block(&self) -> BlockIx { + BlockIx::new(self.entry) + } + + fn block_insns(&self, block: BlockIx) -> Range { + let (start, end) = self.block_ranges[block.get() as usize]; + Range::new(InstIx::new(start), (end - start) as usize) + } + + fn block_succs(&self, block: BlockIx) -> Vec { + let (start, end) = self.block_succ_range[block.get() as usize]; + self.block_succs[start..end] + .iter() + .cloned() + .map(BlockIx::new) + .collect() + } + + fn is_ret(&self, insn: InstIx) -> bool { + match self.insts[insn.get() as usize].is_term() { + MachTerminator::Ret => true, + _ => false, + } + } + + fn get_regs(insn: &I, collector: &mut RegUsageCollector) { + insn.get_regs(collector) + } + + fn map_regs( + insn: &mut I, + pre_map: &RegallocMap, + post_map: &RegallocMap, + ) { + insn.map_regs(pre_map, post_map); + } + + fn is_move(&self, insn: &I) -> Option<(Writable, Reg)> { + insn.is_move() + } + + fn get_spillslot_size(&self, regclass: RegClass, vreg: VirtualReg) -> u32 { + let ty = self.vreg_type(vreg); + self.abi.get_spillslot_size(regclass, ty) + } + + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, vreg: VirtualReg) -> I { + let ty = self.vreg_type(vreg); + self.abi.gen_spill(to_slot, from_reg, ty) + } + + fn gen_reload(&self, to_reg: Writable, from_slot: SpillSlot, vreg: VirtualReg) -> I { + let ty = self.vreg_type(vreg); + self.abi.gen_reload(to_reg, from_slot, ty) + } + + fn gen_move(&self, to_reg: Writable, from_reg: RealReg, vreg: VirtualReg) -> I { + let ty = self.vreg_type(vreg); + I::gen_move(to_reg.map(|r| r.to_reg()), from_reg.to_reg(), ty) + } + + fn gen_zero_len_nop(&self) -> I { + I::gen_zero_len_nop() + } + + fn maybe_direct_reload(&self, insn: &I, reg: VirtualReg, slot: SpillSlot) -> Option { + insn.maybe_direct_reload(reg, slot) + } + + fn func_liveins(&self) -> RegallocSet { + self.liveins.clone() + } + + fn func_liveouts(&self) -> RegallocSet { + self.liveouts.clone() + } +} + +// N.B.: Debug impl assumes that VCode has already been through all compilation +// passes, and so has a final block order and offsets. + +impl fmt::Debug for VCode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "VCode_Debug {{")?; + writeln!(f, " Entry block: {}", self.entry)?; + writeln!(f, " Final block order: {:?}", self.final_block_order)?; + + for block in 0..self.num_blocks() { + writeln!(f, "Block {}:", block,)?; + for succ in self.succs(block as BlockIndex) { + writeln!(f, " (successor: Block {})", succ)?; + } + let (start, end) = self.block_ranges[block]; + writeln!(f, " (instruction range: {} .. {})", start, end)?; + for inst in start..end { + writeln!(f, " Inst {}: {:?}", inst, self.insts[inst as usize])?; + } + } + + writeln!(f, "}}")?; + Ok(()) + } +} + +// Pretty-printing with `RealRegUniverse` context. +impl ShowWithRRU for VCode { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + use crate::alloc::string::ToString; + use std::fmt::Write; + + // Calculate an order in which to display the blocks. This is the same + // as final_block_order, but also includes blocks which are in the + // representation but not in final_block_order. 
+ let mut display_order = Vec::::new(); + // First display blocks in |final_block_order| + for bix in &self.final_block_order { + assert!((*bix as usize) < self.num_blocks()); + display_order.push(*bix as usize); + } + // Now also take care of those not listed in |final_block_order|. + // This is quadratic, but it's also debug-only code. + for bix in 0..self.num_blocks() { + if display_order.contains(&bix) { + continue; + } + display_order.push(bix); + } + + let mut s = String::new(); + s = s + &format!("VCode_ShowWithRRU {{{{"); + s = s + &"\n".to_string(); + s = s + &format!(" Entry block: {}", self.entry); + s = s + &"\n".to_string(); + s = s + &format!(" Final block order: {:?}", self.final_block_order); + s = s + &"\n".to_string(); + + for i in 0..self.num_blocks() { + let block = display_order[i]; + + let omitted = + (if !self.final_block_order.is_empty() && i >= self.final_block_order.len() { + "** OMITTED **" + } else { + "" + }) + .to_string(); + + s = s + &format!("Block {}: {}", block, omitted); + s = s + &"\n".to_string(); + if let Some(bb) = self.bindex_to_bb(block as BlockIndex) { + s = s + &format!(" (original IR block: {})\n", bb); + } + for succ in self.succs(block as BlockIndex) { + s = s + &format!(" (successor: Block {})", succ); + s = s + &"\n".to_string(); + } + let (start, end) = self.block_ranges[block]; + s = s + &format!(" (instruction range: {} .. {})", start, end); + s = s + &"\n".to_string(); + for inst in start..end { + s = s + &format!( + " Inst {}: {}", + inst, + self.insts[inst as usize].show_rru(mb_rru) + ); + s = s + &"\n".to_string(); + } + } + + s = s + &format!("}}}}"); + s = s + &"\n".to_string(); + + s + } +} diff --git a/cranelift/codegen/src/num_uses.rs b/cranelift/codegen/src/num_uses.rs new file mode 100644 index 000000000000..c08741020c79 --- /dev/null +++ b/cranelift/codegen/src/num_uses.rs @@ -0,0 +1,68 @@ +//! A pass that computes the number of uses of any given instruction. + +#![allow(dead_code)] +#![allow(unused_imports)] + +use crate::cursor::{Cursor, FuncCursor}; +use crate::dce::has_side_effect; +use crate::entity::SecondaryMap; +use crate::ir::dfg::ValueDef; +use crate::ir::instructions::InstructionData; +use crate::ir::Value; +use crate::ir::{DataFlowGraph, Function, Inst, Opcode}; + +/// Auxiliary data structure that counts the number of uses of any given +/// instruction in a Function. This is used during instruction selection +/// to essentially do incremental DCE: when an instruction is no longer +/// needed because its computation has been isel'd into another machine +/// instruction at every use site, we can skip it. +#[derive(Clone, Debug)] +pub struct NumUses { + uses: SecondaryMap, +} + +impl NumUses { + fn new() -> NumUses { + NumUses { + uses: SecondaryMap::with_default(0), + } + } + + /// Compute the NumUses analysis result for a function. + pub fn compute(func: &Function) -> NumUses { + let mut uses = NumUses::new(); + for bb in func.layout.blocks() { + for inst in func.layout.block_insts(bb) { + for arg in func.dfg.inst_args(inst) { + let v = func.dfg.resolve_aliases(*arg); + uses.add_value(&func.dfg, v); + } + } + } + uses + } + + fn add_value(&mut self, dfg: &DataFlowGraph, v: Value) { + match dfg.value_def(v) { + ValueDef::Result(inst, _) => { + self.uses[inst] += 1; + } + _ => {} + } + } + + /// How many times is an instruction used? + pub fn use_count(&self, i: Inst) -> usize { + self.uses[i] as usize + } + + /// Is an instruction used at all? 
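+ ///
+ /// Illustrative sketch (not from the original code):
+ ///
+ /// ```ignore
+ /// let num_uses = NumUses::compute(&func);
+ /// if !num_uses.is_used(inst) {
+ ///     // No instruction consumes this result; lowering may skip it if
+ ///     // it also has no side effects.
+ /// }
+ /// ```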
+ pub fn is_used(&self, i: Inst) -> bool { + self.use_count(i) > 0 + } + + /// Take the complete uses map, consuming this analysis result. + pub fn take_uses(self) -> SecondaryMap { + self.uses + } +} From 548ce947bf8e4447bf756fc2c6fc4e3bc1aab5ca Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 12:36:21 -0700 Subject: [PATCH 04/12] ARM64 backend, part 4 / 11: ARM64 instruction definitions. This patch provides the bottom layer of the ARM64 backend: it defines the `Inst` type, which represents a single machine instruction, and defines emission routines to produce machine code from a `VCode` container of `Insts`. The backend cannot produce `Inst`s with just this patch; that will come with later parts. This patch contains code written by Julian Seward and Benjamin Bouvier , originally developed on a side-branch before rebasing and condensing into this patch series. See the `arm64` branch at `https://github.com/cfallin/wasmtime` for original development history. This patch also contains code written by Joey Gouly and contributed to the above branch. These contributions are "Copyright (c) 2020, Arm Limited." Finally, a contribution from Joey Gouly contains the following notice: This is a port of VIXL's Assembler::IsImmLogical. Arm has the original copyright on the VIXL code this was ported from and is relicensing it under Apache 2 for Cranelift. Co-authored-by: Julian Seward Co-authored-by: Benjamin Bouvier Co-authored-by: Joey Gouly --- cranelift/codegen/src/isa/arm64/inst/args.rs | 501 +++ cranelift/codegen/src/isa/arm64/inst/emit.rs | 4106 ++++++++++++++++++ cranelift/codegen/src/isa/arm64/inst/imms.rs | 753 ++++ cranelift/codegen/src/isa/arm64/inst/mod.rs | 2515 +++++++++++ cranelift/codegen/src/isa/arm64/inst/regs.rs | 273 ++ cranelift/codegen/src/isa/arm64/mod.rs | 2 +- 6 files changed, 8149 insertions(+), 1 deletion(-) create mode 100644 cranelift/codegen/src/isa/arm64/inst/args.rs create mode 100644 cranelift/codegen/src/isa/arm64/inst/emit.rs create mode 100644 cranelift/codegen/src/isa/arm64/inst/imms.rs create mode 100644 cranelift/codegen/src/isa/arm64/inst/mod.rs create mode 100644 cranelift/codegen/src/isa/arm64/inst/regs.rs diff --git a/cranelift/codegen/src/isa/arm64/inst/args.rs b/cranelift/codegen/src/isa/arm64/inst/args.rs new file mode 100644 index 000000000000..75cf12283b0e --- /dev/null +++ b/cranelift/codegen/src/isa/arm64/inst/args.rs @@ -0,0 +1,501 @@ +//! ARM64 ISA definitions: instruction arguments. + +#![allow(dead_code)] +#![allow(non_snake_case)] + +use crate::binemit::{CodeOffset, CodeSink}; +use crate::ir::constant::{ConstantData, ConstantOffset}; +use crate::ir::Type; +use crate::isa::arm64::inst::*; +use crate::machinst::*; + +use regalloc::{ + RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, SpillSlot, VirtualReg, Writable, + NUM_REG_CLASSES, +}; + +use std::string::{String, ToString}; + +/// A shift operator for a register or immediate. +#[derive(Clone, Copy, Debug)] +pub enum ShiftOp { + ASR, + LSR, + LSL, + ROR, +} + +impl ShiftOp { + /// Get the encoding of this shift op. + pub fn bits(&self) -> u8 { + match self { + &ShiftOp::LSL => 0b00, + &ShiftOp::LSR => 0b01, + &ShiftOp::ASR => 0b10, + &ShiftOp::ROR => 0b11, + } + } +} + +/// A shift operator with an amount, guaranteed to be within range. +#[derive(Clone, Debug)] +pub struct ShiftOpAndAmt { + op: ShiftOp, + shift: ShiftOpShiftImm, +} + +/// A shift operator amount. 
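+///
+/// For example (illustrative), building a left-shift-by-3 operand:
+///
+/// ```ignore
+/// let amt = ShiftOpShiftImm::maybe_from_shift(3).unwrap();
+/// let shift = ShiftOpAndAmt::new(ShiftOp::LSL, amt);
+/// assert_eq!(shift.amt().value(), 3);
+/// ```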
+#[derive(Clone, Copy, Debug)] +pub struct ShiftOpShiftImm(u8); + +impl ShiftOpShiftImm { + /// Maximum shift for shifted-register operands. + pub const MAX_SHIFT: u64 = 63; + + /// Create a new shiftop shift amount, if possible. + pub fn maybe_from_shift(shift: u64) -> Option { + if shift <= Self::MAX_SHIFT { + Some(ShiftOpShiftImm(shift as u8)) + } else { + None + } + } + + /// Return the shift amount. + pub fn value(&self) -> u8 { + self.0 + } +} + +impl ShiftOpAndAmt { + pub fn new(op: ShiftOp, shift: ShiftOpShiftImm) -> ShiftOpAndAmt { + ShiftOpAndAmt { op, shift } + } + + /// Get the shift op. + pub fn op(&self) -> ShiftOp { + self.op.clone() + } + + /// Get the shift amount. + pub fn amt(&self) -> ShiftOpShiftImm { + self.shift + } +} + +/// An extend operator for a register. +#[derive(Clone, Copy, Debug)] +pub enum ExtendOp { + SXTB, + SXTH, + SXTW, + SXTX, + UXTB, + UXTH, + UXTW, + UXTX, +} + +impl ExtendOp { + /// Encoding of this op. + pub fn bits(&self) -> u8 { + match self { + &ExtendOp::UXTB => 0b000, + &ExtendOp::UXTH => 0b001, + &ExtendOp::UXTW => 0b010, + &ExtendOp::UXTX => 0b011, + &ExtendOp::SXTB => 0b100, + &ExtendOp::SXTH => 0b101, + &ExtendOp::SXTW => 0b110, + &ExtendOp::SXTX => 0b111, + } + } +} + +//============================================================================= +// Instruction sub-components (memory addresses): definitions + +/// A reference to some memory address. +#[derive(Clone, Debug)] +pub enum MemLabel { + /// An address in the code, a constant pool or jumptable, with relative + /// offset from this instruction. This form must be used at emission time; + /// see `memlabel_finalize()` for how other forms are lowered to this one. + PCRel(i32), +} + +/// A memory argument to load/store, encapsulating the possible addressing modes. +#[derive(Clone, Debug)] +pub enum MemArg { + Label(MemLabel), + PostIndexed(Writable, SImm9), + PreIndexed(Writable, SImm9), + // N.B.: RegReg, RegScaled, and RegScaledExtended all correspond to + // what the ISA calls the "register offset" addressing mode. We split out + // several options here for more ergonomic codegen. + RegReg(Reg, Reg), + RegScaled(Reg, Reg, Type), + RegScaledExtended(Reg, Reg, Type, ExtendOp), + Unscaled(Reg, SImm9), + UnsignedOffset(Reg, UImm12Scaled), + /// Offset from the stack pointer or frame pointer. + SPOffset(i64), + FPOffset(i64), +} + +impl MemArg { + /// Memory reference using an address in a register. + pub fn reg(reg: Reg) -> MemArg { + // Use UnsignedOffset rather than Unscaled to use ldr rather than ldur. + // This also does not use PostIndexed / PreIndexed as they update the register. + MemArg::UnsignedOffset(reg, UImm12Scaled::zero(I64)) + } + + /// Memory reference using an address in a register and an offset, if possible. + pub fn reg_maybe_offset(reg: Reg, offset: i64, value_type: Type) -> Option { + if offset == 0 { + Some(MemArg::Unscaled(reg, SImm9::zero())) + } else if let Some(simm9) = SImm9::maybe_from_i64(offset) { + Some(MemArg::Unscaled(reg, simm9)) + } else if let Some(uimm12s) = UImm12Scaled::maybe_from_i64(offset, value_type) { + Some(MemArg::UnsignedOffset(reg, uimm12s)) + } else { + None + } + } + + /// Memory reference using the sum of two registers as an address. + pub fn reg_reg(reg1: Reg, reg2: Reg) -> MemArg { + MemArg::RegReg(reg1, reg2) + } + + /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address. 
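+ ///
+ /// Illustrative sketch (`base` and `index` are assumed registers): an I64
+ /// access scales the index by 8, printed as `[base, index, LSL #3]`.
+ ///
+ /// ```ignore
+ /// let mem = MemArg::reg_reg_scaled(base, index, I64);
+ /// ```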
+ pub fn reg_reg_scaled(reg1: Reg, reg2: Reg, ty: Type) -> MemArg { + MemArg::RegScaled(reg1, reg2, ty) + } + + /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address. + pub fn reg_reg_scaled_extended(reg1: Reg, reg2: Reg, ty: Type, op: ExtendOp) -> MemArg { + MemArg::RegScaledExtended(reg1, reg2, ty, op) + } + + /// Memory reference to a label: a global function or value, or data in the constant pool. + pub fn label(label: MemLabel) -> MemArg { + MemArg::Label(label) + } +} + +/// A memory argument to a load/store-pair. +#[derive(Clone, Debug)] +pub enum PairMemArg { + SignedOffset(Reg, SImm7Scaled), + PreIndexed(Writable, SImm7Scaled), + PostIndexed(Writable, SImm7Scaled), +} + +//============================================================================= +// Instruction sub-components (conditions, branches and branch targets): +// definitions + +/// Condition for conditional branches. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Cond { + Eq, + Ne, + Hs, + Lo, + Mi, + Pl, + Vs, + Vc, + Hi, + Ls, + Ge, + Lt, + Gt, + Le, + Al, + Nv, +} + +impl Cond { + /// Return the inverted condition. + pub fn invert(self) -> Cond { + match self { + Cond::Eq => Cond::Ne, + Cond::Ne => Cond::Eq, + Cond::Hs => Cond::Lo, + Cond::Lo => Cond::Hs, + Cond::Mi => Cond::Pl, + Cond::Pl => Cond::Mi, + Cond::Vs => Cond::Vc, + Cond::Vc => Cond::Vs, + Cond::Hi => Cond::Ls, + Cond::Ls => Cond::Hi, + Cond::Ge => Cond::Lt, + Cond::Lt => Cond::Ge, + Cond::Gt => Cond::Le, + Cond::Le => Cond::Gt, + Cond::Al => Cond::Nv, + Cond::Nv => Cond::Al, + } + } + + /// Return the machine encoding of this condition. + pub fn bits(self) -> u32 { + match self { + Cond::Eq => 0, + Cond::Ne => 1, + Cond::Hs => 2, + Cond::Lo => 3, + Cond::Mi => 4, + Cond::Pl => 5, + Cond::Vs => 6, + Cond::Vc => 7, + Cond::Hi => 8, + Cond::Ls => 9, + Cond::Ge => 10, + Cond::Lt => 11, + Cond::Gt => 12, + Cond::Le => 13, + Cond::Al => 14, + Cond::Nv => 15, + } + } +} + +/// The kind of conditional branch: the common-case-optimized "reg-is-zero" / +/// "reg-is-nonzero" variants, or the generic one that tests the machine +/// condition codes. +#[derive(Clone, Copy, Debug)] +pub enum CondBrKind { + /// Condition: given register is zero. + Zero(Reg), + /// Condition: given register is nonzero. + NotZero(Reg), + /// Condition: the given condition-code test is true. + Cond(Cond), +} + +impl CondBrKind { + /// Return the inverted branch condition. + pub fn invert(self) -> CondBrKind { + match self { + CondBrKind::Zero(reg) => CondBrKind::NotZero(reg), + CondBrKind::NotZero(reg) => CondBrKind::Zero(reg), + CondBrKind::Cond(c) => CondBrKind::Cond(c.invert()), + } + } +} + +/// A branch target. Either unresolved (basic-block index) or resolved (offset +/// from end of current instruction). +#[derive(Clone, Copy, Debug)] +pub enum BranchTarget { + /// An unresolved reference to a BlockIndex, as passed into + /// `lower_branch_group()`. + Block(BlockIndex), + /// A resolved reference to another instruction, after + /// `Inst::with_block_offsets()`. + ResolvedOffset(isize), +} + +impl BranchTarget { + /// Lower the branch target given offsets of each block. 
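+ ///
+ /// Illustrative sketch (offsets are made up): a branch at offset 8 whose
+ /// target block starts at offset 32 resolves to a +24-byte offset.
+ ///
+ /// ```ignore
+ /// let mut target = BranchTarget::Block(1);
+ /// target.lower(&[0, 32], 8);
+ /// // Now ResolvedOffset(24); as_offset_words() == 6 (4-byte words).
+ /// ```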
+ pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) { + match self { + &mut BranchTarget::Block(bix) => { + let bix = bix as usize; + assert!(bix < targets.len()); + let block_offset_in_func = targets[bix]; + let branch_offset = (block_offset_in_func as isize) - (my_offset as isize); + *self = BranchTarget::ResolvedOffset(branch_offset); + } + &mut BranchTarget::ResolvedOffset(..) => {} + } + } + + /// Get the block index. + pub fn as_block_index(&self) -> Option { + match self { + &BranchTarget::Block(bix) => Some(bix), + _ => None, + } + } + + /// Get the offset as 4-byte words. Returns `0` if not + /// yet resolved (in that case, we're only computing + /// size and the offset doesn't matter). + pub fn as_offset_words(&self) -> isize { + match self { + &BranchTarget::ResolvedOffset(off) => off >> 2, + _ => 0, + } + } + + /// Get the offset as a 26-bit offset suitable for a 26-bit jump, or `None` if overflow. + pub fn as_off26(&self) -> Option { + let off = self.as_offset_words(); + if (off < (1 << 25)) && (off >= -(1 << 25)) { + Some((off as u32) & ((1 << 26) - 1)) + } else { + None + } + } + + /// Get the offset as a 16-bit offset, or `None` if overflow. + pub fn as_off19(&self) -> Option { + let off = self.as_offset_words(); + if (off < (1 << 18)) && (off >= -(1 << 18)) { + Some((off as u32) & ((1 << 19) - 1)) + } else { + None + } + } + + /// Map the block index given a transform map. + pub fn map(&mut self, block_index_map: &[BlockIndex]) { + match self { + &mut BranchTarget::Block(ref mut bix) => { + let n = block_index_map[*bix as usize]; + *bix = n; + } + &mut BranchTarget::ResolvedOffset(_) => {} + } + } +} + +impl ShowWithRRU for ShiftOpAndAmt { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("{:?} {}", self.op(), self.amt().value()) + } +} + +impl ShowWithRRU for ExtendOp { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("{:?}", self) + } +} + +impl ShowWithRRU for MemLabel { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &MemLabel::PCRel(off) => format!("pc+{}", off), + } + } +} + +fn shift_for_type(ty: Type) -> usize { + match ty.bytes() { + 1 => 0, + 2 => 1, + 4 => 2, + 8 => 3, + 16 => 4, + _ => panic!("unknown type"), + } +} + +impl ShowWithRRU for MemArg { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &MemArg::Unscaled(reg, simm9) => { + if simm9.value != 0 { + format!("[{}, {}]", reg.show_rru(mb_rru), simm9.show_rru(mb_rru)) + } else { + format!("[{}]", reg.show_rru(mb_rru)) + } + } + &MemArg::UnsignedOffset(reg, uimm12) => { + if uimm12.value != 0 { + format!("[{}, {}]", reg.show_rru(mb_rru), uimm12.show_rru(mb_rru)) + } else { + format!("[{}]", reg.show_rru(mb_rru)) + } + } + &MemArg::RegReg(r1, r2) => { + format!("[{}, {}]", r1.show_rru(mb_rru), r2.show_rru(mb_rru),) + } + &MemArg::RegScaled(r1, r2, ty) => { + let shift = shift_for_type(ty); + format!( + "[{}, {}, LSL #{}]", + r1.show_rru(mb_rru), + r2.show_rru(mb_rru), + shift, + ) + } + &MemArg::RegScaledExtended(r1, r2, ty, op) => { + let shift = shift_for_type(ty); + let is32 = match op { + ExtendOp::SXTW | ExtendOp::UXTW => true, + _ => false, + }; + let op = op.show_rru(mb_rru); + format!( + "[{}, {}, {} #{}]", + r1.show_rru(mb_rru), + show_ireg_sized(r2, mb_rru, is32), + op, + shift + ) + } + &MemArg::Label(ref label) => label.show_rru(mb_rru), + &MemArg::PreIndexed(r, simm9) => format!( + "[{}, {}]!", + r.to_reg().show_rru(mb_rru), + simm9.show_rru(mb_rru) 
+ ), + &MemArg::PostIndexed(r, simm9) => format!( + "[{}], {}", + r.to_reg().show_rru(mb_rru), + simm9.show_rru(mb_rru) + ), + // Eliminated by `mem_finalize()`. + &MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { + panic!("Unexpected stack-offset mem-arg mode!") + } + } + } +} + +impl ShowWithRRU for PairMemArg { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &PairMemArg::SignedOffset(reg, simm7) => { + if simm7.value != 0 { + format!("[{}, {}]", reg.show_rru(mb_rru), simm7.show_rru(mb_rru)) + } else { + format!("[{}]", reg.show_rru(mb_rru)) + } + } + &PairMemArg::PreIndexed(reg, simm7) => format!( + "[{}, {}]!", + reg.to_reg().show_rru(mb_rru), + simm7.show_rru(mb_rru) + ), + &PairMemArg::PostIndexed(reg, simm7) => format!( + "[{}], {}", + reg.to_reg().show_rru(mb_rru), + simm7.show_rru(mb_rru) + ), + } + } +} + +impl ShowWithRRU for Cond { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + let mut s = format!("{:?}", self); + s.make_ascii_lowercase(); + s + } +} + +impl ShowWithRRU for BranchTarget { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &BranchTarget::Block(block) => format!("block{}", block), + &BranchTarget::ResolvedOffset(off) => format!("{}", off), + } + } +} diff --git a/cranelift/codegen/src/isa/arm64/inst/emit.rs b/cranelift/codegen/src/isa/arm64/inst/emit.rs new file mode 100644 index 000000000000..20eefdeaae08 --- /dev/null +++ b/cranelift/codegen/src/isa/arm64/inst/emit.rs @@ -0,0 +1,4106 @@ +//! ARM64 ISA: binary code emission. + +#![allow(dead_code)] +#![allow(non_snake_case)] + +use crate::binemit::{CodeOffset, CodeSink, Reloc}; +use crate::ir::constant::ConstantData; +use crate::ir::types::*; +use crate::ir::{Opcode, TrapCode, Type}; +use crate::isa::arm64::inst::*; +use crate::machinst::*; +use cranelift_entity::EntityRef; + +use std::env; + +use regalloc::{ + RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, SpillSlot, VirtualReg, Writable, + NUM_REG_CLASSES, +}; + +use alloc::vec::Vec; + +/// Memory label/reference finalization: convert a MemLabel to a PC-relative +/// offset, possibly emitting relocation(s) as necessary. +pub fn memlabel_finalize(_insn_off: CodeOffset, label: &MemLabel) -> i32 { + match label { + &MemLabel::PCRel(rel) => rel, + } +} + +/// Memory addressing mode finalization: convert "special" modes (e.g., +/// generic arbitrary stack offset) into real addressing modes, possibly by +/// emitting some helper instructions that come immediately before the use +/// of this amode. +pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg) -> (Vec, MemArg) { + match mem { + &MemArg::SPOffset(off) | &MemArg::FPOffset(off) => { + let basereg = match mem { + &MemArg::SPOffset(..) => stack_reg(), + &MemArg::FPOffset(..) => fp_reg(), + _ => unreachable!(), + }; + if let Some(simm9) = SImm9::maybe_from_i64(off) { + let mem = MemArg::Unscaled(basereg, simm9); + (vec![], mem) + } else { + let tmp = writable_spilltmp_reg(); + let mut const_insts = Inst::load_constant(tmp, off as u64); + let add_inst = Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: basereg, + }; + const_insts.push(add_inst); + (const_insts.to_vec(), MemArg::reg(tmp.to_reg())) + } + } + &MemArg::Label(ref label) => { + let off = memlabel_finalize(insn_off, label); + (vec![], MemArg::Label(MemLabel::PCRel(off))) + } + _ => (vec![], mem.clone()), + } +} + +/// Helper: get a ConstantData from a u64. 
+pub fn u64_constant(bits: u64) -> ConstantData { + let data = [ + (bits & 0xff) as u8, + ((bits >> 8) & 0xff) as u8, + ((bits >> 16) & 0xff) as u8, + ((bits >> 24) & 0xff) as u8, + ((bits >> 32) & 0xff) as u8, + ((bits >> 40) & 0xff) as u8, + ((bits >> 48) & 0xff) as u8, + ((bits >> 56) & 0xff) as u8, + ]; + ConstantData::from(&data[..]) +} + +//============================================================================= +// Instructions and subcomponents: emission + +fn machreg_to_gpr(m: Reg) -> u32 { + assert!(m.get_class() == RegClass::I64); + assert!(m.is_real()); + m.to_real_reg().get_hw_encoding() as u32 +} + +fn machreg_to_vec(m: Reg) -> u32 { + assert!(m.get_class() == RegClass::V128); + assert!(m.is_real()); + m.to_real_reg().get_hw_encoding() as u32 +} + +fn machreg_to_gpr_or_vec(m: Reg) -> u32 { + m.to_real_reg().get_hw_encoding() as u32 +} + +fn enc_arith_rrr(bits_31_21: u16, bits_15_10: u8, rd: Writable, rn: Reg, rm: Reg) -> u32 { + ((bits_31_21 as u32) << 21) + | ((bits_15_10 as u32) << 10) + | machreg_to_gpr(rd.to_reg()) + | (machreg_to_gpr(rn) << 5) + | (machreg_to_gpr(rm) << 16) +} + +fn enc_arith_rr_imm12(bits_31_24: u8, immshift: u8, imm12: u16, rn: Reg, rd: Writable) -> u32 { + ((bits_31_24 as u32) << 24) + | ((immshift as u32) << 22) + | ((imm12 as u32) << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_arith_rr_imml(bits_31_23: u16, imm_bits: u16, rn: Reg, rd: Writable) -> u32 { + ((bits_31_23 as u32) << 23) + | ((imm_bits as u32) << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_arith_rrrr(top11: u32, rm: Reg, bit15: u32, ra: Reg, rn: Reg, rd: Writable) -> u32 { + (top11 << 21) + | (machreg_to_gpr(rm) << 16) + | (bit15 << 15) + | (machreg_to_gpr(ra) << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_jump26(op_31_26: u32, off_26_0: u32) -> u32 { + assert!(off_26_0 < (1 << 26)); + (op_31_26 << 26) | off_26_0 +} + +fn enc_cmpbr(op_31_24: u32, off_18_0: u32, reg: Reg) -> u32 { + assert!(off_18_0 < (1 << 19)); + (op_31_24 << 24) | (off_18_0 << 5) | machreg_to_gpr(reg) +} + +fn enc_cbr(op_31_24: u32, off_18_0: u32, op_4: u32, cond: u32) -> u32 { + assert!(off_18_0 < (1 << 19)); + assert!(cond < (1 << 4)); + (op_31_24 << 24) | (off_18_0 << 5) | (op_4 << 4) | cond +} + +const MOVE_WIDE_FIXED: u32 = 0x92800000; + +#[repr(u32)] +enum MoveWideOpcode { + MOVN = 0b00, + MOVZ = 0b10, + MOVK = 0b11, +} + +fn enc_move_wide(op: MoveWideOpcode, rd: Writable, imm: MoveWideConst) -> u32 { + assert!(imm.shift <= 0b11); + MOVE_WIDE_FIXED + | (op as u32) << 29 + | (imm.shift as u32) << 21 + | (imm.bits as u32) << 5 + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_ldst_pair(op_31_22: u32, simm7: SImm7Scaled, rn: Reg, rt: Reg, rt2: Reg) -> u32 { + (op_31_22 << 22) + | (simm7.bits() << 15) + | (machreg_to_gpr(rt2) << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rt) +} + +fn enc_ldst_simm9(op_31_22: u32, simm9: SImm9, op_11_10: u32, rn: Reg, rd: Reg) -> u32 { + (op_31_22 << 22) + | (simm9.bits() << 12) + | (op_11_10 << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr_or_vec(rd) +} + +fn enc_ldst_uimm12(op_31_22: u32, uimm12: UImm12Scaled, rn: Reg, rd: Reg) -> u32 { + (op_31_22 << 22) + | (0b1 << 24) + | (uimm12.bits() << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr_or_vec(rd) +} + +fn enc_ldst_reg( + op_31_22: u32, + rn: Reg, + rm: Reg, + s_bit: bool, + extendop: Option, + rd: Reg, +) -> u32 { + let s_bit = if s_bit { 1 } else { 0 }; + let extend_bits = match extendop { + 
Some(ExtendOp::UXTW) => 0b010,
+        Some(ExtendOp::SXTW) => 0b110,
+        Some(ExtendOp::SXTX) => 0b111,
+        None => 0b011, /* LSL */
+        _ => panic!("bad extend mode for ld/st MemArg"),
+    };
+    (op_31_22 << 22)
+        | (1 << 21)
+        | (machreg_to_gpr(rm) << 16)
+        | (extend_bits << 13)
+        | (s_bit << 12)
+        | (0b10 << 10)
+        | (machreg_to_gpr(rn) << 5)
+        | machreg_to_gpr_or_vec(rd)
+}
+
+fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 {
+    (op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd)
+}
+
+fn enc_extend(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
+    (top11 << 21)
+        | (machreg_to_vec(rm) << 16)
+        | (bit15_10 << 10)
+        | (machreg_to_vec(rn) << 5)
+        | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_bit_rr(size: u32, opcode2: u32, opcode1: u32, rn: Reg, rd: Writable<Reg>) -> u32 {
+    (0b01011010110 << 21)
+        | size << 31
+        | opcode2 << 16
+        | opcode1 << 10
+        | machreg_to_gpr(rn) << 5
+        | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_br(rn: Reg) -> u32 {
+    0b1101011_0000_11111_000000_00000_00000 | (machreg_to_gpr(rn) << 5)
+}
+
+fn enc_adr(off: i32, rd: Writable<Reg>) -> u32 {
+    let off = off as u32;
+    let immlo = off & 3;
+    let immhi = (off >> 2) & ((1 << 19) - 1);
+    (0b00010000 << 24) | (immlo << 29) | (immhi << 5) | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_csel(rd: Writable<Reg>, rn: Reg, rm: Reg, cond: Cond) -> u32 {
+    0b100_11010100_00000_0000_00_00000_00000
+        | (machreg_to_gpr(rm) << 16)
+        | (machreg_to_gpr(rn) << 5)
+        | machreg_to_gpr(rd.to_reg())
+        | (cond.bits() << 12)
+}
+
+fn enc_fcsel(rd: Writable<Reg>, rn: Reg, rm: Reg, cond: Cond, is32: bool) -> u32 {
+    let ty_bit = if is32 { 0 } else { 1 };
+    0b000_11110_00_1_00000_0000_11_00000_00000
+        | (machreg_to_vec(rm) << 16)
+        | (machreg_to_vec(rn) << 5)
+        | machreg_to_vec(rd.to_reg())
+        | (cond.bits() << 12)
+        | (ty_bit << 22)
+}
+
+fn enc_cset(rd: Writable<Reg>, cond: Cond) -> u32 {
+    0b100_11010100_11111_0000_01_11111_00000
+        | machreg_to_gpr(rd.to_reg())
+        | (cond.invert().bits() << 12)
+}
+
+fn enc_vecmov(is_16b: bool, rd: Writable<Reg>, rn: Reg) -> u32 {
+    debug_assert!(!is_16b); // to be supported later.
+    0b00001110_101_00000_00011_1_00000_00000
+        | machreg_to_vec(rd.to_reg())
+        | (machreg_to_vec(rn) << 16)
+        | (machreg_to_vec(rn) << 5)
+}
+
+fn enc_fpurr(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_fpurrr(top22: u32, rd: Writable<Reg>, rn: Reg, rm: Reg) -> u32 {
+    (top22 << 10)
+        | (machreg_to_vec(rm) << 16)
+        | (machreg_to_vec(rn) << 5)
+        | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_fpurrrr(top17: u32, rd: Writable<Reg>, rn: Reg, rm: Reg, ra: Reg) -> u32 {
+    (top17 << 15)
+        | (machreg_to_vec(rm) << 16)
+        | (machreg_to_vec(ra) << 10)
+        | (machreg_to_vec(rn) << 5)
+        | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_fcmp(is32: bool, rn: Reg, rm: Reg) -> u32 {
+    let bits = if is32 {
+        0b000_11110_00_1_00000_00_1000_00000_00000
+    } else {
+        0b000_11110_01_1_00000_00_1000_00000_00000
+    };
+    bits | (machreg_to_vec(rm) << 16) | (machreg_to_vec(rn) << 5)
+}
+
+fn enc_fputoint(top16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    (top16 << 16) | (machreg_to_vec(rn) << 5) | machreg_to_gpr(rd.to_reg())
+}
+
+fn enc_inttofpu(top16: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    (top16 << 16) | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg())
+}
+
+fn enc_fround(top22: u32, rd: Writable<Reg>, rn: Reg) -> u32 {
+    (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg())
+}
+
+impl<O: MachSectionOutput> MachInstEmit<O> for Inst {
+    fn emit(&self, sink: &mut O) {
+        match self {
+            &Inst::AluRRR { alu_op, rd, rn, rm } => {
+                let top11 = match alu_op {
+                    ALUOp::Add32 => 0b00001011_000,
+                    ALUOp::Add64 => 0b10001011_000,
+                    ALUOp::Sub32 => 0b01001011_000,
+                    ALUOp::Sub64 => 0b11001011_000,
+                    ALUOp::Orr32 => 0b00101010_000,
+                    ALUOp::Orr64 => 0b10101010_000,
+                    ALUOp::And32 => 0b00001010_000,
+                    ALUOp::And64 => 0b10001010_000,
+                    ALUOp::Eor32 => 0b01001010_000,
+                    ALUOp::Eor64 => 0b11001010_000,
+                    ALUOp::OrrNot32 => 0b00101010_001,
+                    ALUOp::OrrNot64 => 0b10101010_001,
+                    ALUOp::AndNot32 => 0b00001010_001,
+                    ALUOp::AndNot64 => 0b10001010_001,
+                    ALUOp::EorNot32 => 0b01001010_001,
+                    ALUOp::EorNot64 => 0b11001010_001,
+                    ALUOp::AddS32 => 0b00101011_000,
+                    ALUOp::AddS64 => 0b10101011_000,
+                    ALUOp::SubS32 => 0b01101011_000,
+                    ALUOp::SubS64 => 0b11101011_000,
+                    ALUOp::SDiv64 => 0b10011010_110,
+                    ALUOp::UDiv64 => 0b10011010_110,
+                    ALUOp::RotR32 | ALUOp::Lsr32 | ALUOp::Asr32 | ALUOp::Lsl32 => 0b00011010_110,
+                    ALUOp::RotR64 | ALUOp::Lsr64 | ALUOp::Asr64 | ALUOp::Lsl64 => 0b10011010_110,
+
+                    ALUOp::MAdd32
+                    | ALUOp::MAdd64
+                    | ALUOp::MSub32
+                    | ALUOp::MSub64
+                    | ALUOp::SMulH
+                    | ALUOp::UMulH => {
+                        // RRRR ops.
+ panic!("Bad ALUOp in RRR form!"); + } + }; + let bit15_10 = match alu_op { + ALUOp::SDiv64 => 0b000011, + ALUOp::UDiv64 => 0b000010, + ALUOp::RotR32 | ALUOp::RotR64 => 0b001011, + ALUOp::Lsr32 | ALUOp::Lsr64 => 0b001001, + ALUOp::Asr32 | ALUOp::Asr64 => 0b001010, + ALUOp::Lsl32 | ALUOp::Lsl64 => 0b001000, + _ => 0b000000, + }; + assert_ne!(writable_stack_reg(), rd); + sink.put4(enc_arith_rrr(top11, bit15_10, rd, rn, rm)); + } + &Inst::AluRRRR { + alu_op, + rd, + rm, + rn, + ra, + } => { + let (top11, bit15) = match alu_op { + ALUOp::MAdd32 => (0b0_00_11011_000, 0), + ALUOp::MSub32 => (0b0_00_11011_000, 1), + ALUOp::MAdd64 => (0b1_00_11011_000, 0), + ALUOp::MSub64 => (0b1_00_11011_000, 1), + ALUOp::SMulH => (0b1_00_11011_010, 0), + ALUOp::UMulH => (0b1_00_11011_110, 0), + _ => unimplemented!("{:?}", alu_op), + }; + sink.put4(enc_arith_rrrr(top11, rm, bit15, ra, rn, rd)); + } + &Inst::AluRRImm12 { + alu_op, + rd, + rn, + ref imm12, + } => { + let top8 = match alu_op { + ALUOp::Add32 => 0b000_10001, + ALUOp::Add64 => 0b100_10001, + ALUOp::Sub32 => 0b010_10001, + ALUOp::Sub64 => 0b110_10001, + ALUOp::AddS32 => 0b001_10001, + ALUOp::AddS64 => 0b101_10001, + ALUOp::SubS32 => 0b011_10001, + ALUOp::SubS64 => 0b111_10001, + _ => unimplemented!("{:?}", alu_op), + }; + sink.put4(enc_arith_rr_imm12( + top8, + imm12.shift_bits(), + imm12.imm_bits(), + rn, + rd, + )); + } + &Inst::AluRRImmLogic { + alu_op, + rd, + rn, + ref imml, + } => { + let (top9, inv) = match alu_op { + ALUOp::Orr32 => (0b001_100100, false), + ALUOp::Orr64 => (0b101_100100, false), + ALUOp::And32 => (0b000_100100, false), + ALUOp::And64 => (0b100_100100, false), + ALUOp::Eor32 => (0b010_100100, false), + ALUOp::Eor64 => (0b110_100100, false), + ALUOp::OrrNot32 => (0b001_100100, true), + ALUOp::OrrNot64 => (0b101_100100, true), + ALUOp::AndNot32 => (0b000_100100, true), + ALUOp::AndNot64 => (0b100_100100, true), + ALUOp::EorNot32 => (0b010_100100, true), + ALUOp::EorNot64 => (0b110_100100, true), + _ => unimplemented!("{:?}", alu_op), + }; + let imml = if inv { imml.invert() } else { imml.clone() }; + sink.put4(enc_arith_rr_imml(top9, imml.enc_bits(), rn, rd)); + } + + &Inst::AluRRImmShift { + alu_op, + rd, + rn, + ref immshift, + } => { + let amt = immshift.value(); + let (top10, immr, imms) = match alu_op { + ALUOp::RotR32 => (0b0001001110, machreg_to_gpr(rn), amt as u32), + ALUOp::RotR64 => (0b1001001111, machreg_to_gpr(rn), amt as u32), + ALUOp::Lsr32 => (0b0101001100, amt as u32, 0b011111), + ALUOp::Lsr64 => (0b1101001101, amt as u32, 0b111111), + ALUOp::Asr32 => (0b0001001100, amt as u32, 0b011111), + ALUOp::Asr64 => (0b1001001101, amt as u32, 0b111111), + ALUOp::Lsl32 => (0b0101001100, (32 - amt) as u32, (31 - amt) as u32), + ALUOp::Lsl64 => (0b1101001101, (64 - amt) as u32, (63 - amt) as u32), + _ => unimplemented!("{:?}", alu_op), + }; + sink.put4( + (top10 << 22) + | (immr << 16) + | (imms << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()), + ); + } + + &Inst::AluRRRShift { + alu_op, + rd, + rn, + rm, + ref shiftop, + } => { + let top11: u16 = match alu_op { + ALUOp::Add32 => 0b000_01011000, + ALUOp::Add64 => 0b100_01011000, + ALUOp::AddS32 => 0b001_01011000, + ALUOp::AddS64 => 0b101_01011000, + ALUOp::Sub32 => 0b010_01011000, + ALUOp::Sub64 => 0b110_01011000, + ALUOp::SubS32 => 0b011_01011000, + ALUOp::SubS64 => 0b111_01011000, + ALUOp::Orr32 => 0b001_01010000, + ALUOp::Orr64 => 0b101_01010000, + ALUOp::And32 => 0b000_01010000, + ALUOp::And64 => 0b100_01010000, + ALUOp::Eor32 => 0b010_01010000, + 
ALUOp::Eor64 => 0b110_01010000, + ALUOp::OrrNot32 => 0b001_01010001, + ALUOp::OrrNot64 => 0b101_01010001, + ALUOp::EorNot32 => 0b010_01010001, + ALUOp::EorNot64 => 0b110_01010001, + ALUOp::AndNot32 => 0b000_01010001, + ALUOp::AndNot64 => 0b100_01010001, + _ => unimplemented!("{:?}", alu_op), + }; + let top11 = top11 | ((shiftop.op().bits() as u16) << 1); + let bits_15_10 = shiftop.amt().value(); + sink.put4(enc_arith_rrr(top11, bits_15_10, rd, rn, rm)); + } + + &Inst::AluRRRExtend { + alu_op, + rd, + rn, + rm, + extendop, + } => { + let top11 = match alu_op { + ALUOp::Add32 => 0b00001011001, + ALUOp::Add64 => 0b10001011001, + ALUOp::Sub32 => 0b01001011001, + ALUOp::Sub64 => 0b11001011001, + ALUOp::AddS32 => 0b00101011001, + ALUOp::AddS64 => 0b10101011001, + ALUOp::SubS32 => 0b01101011001, + ALUOp::SubS64 => 0b11101011001, + _ => unimplemented!("{:?}", alu_op), + }; + let bits_15_10 = extendop.bits() << 3; + sink.put4(enc_arith_rrr(top11, bits_15_10, rd, rn, rm)); + } + + &Inst::BitRR { op, rd, rn, .. } => { + let size = if op.is_32_bit() { 0b0 } else { 0b1 }; + let (op1, op2) = match op { + BitOp::RBit32 | BitOp::RBit64 => (0b00000, 0b000000), + BitOp::Clz32 | BitOp::Clz64 => (0b00000, 0b000100), + BitOp::Cls32 | BitOp::Cls64 => (0b00000, 0b000101), + }; + sink.put4(enc_bit_rr(size, op1, op2, rn, rd)) + } + + &Inst::ULoad8 { + rd, + ref mem, + srcloc, + } + | &Inst::SLoad8 { + rd, + ref mem, + srcloc, + } + | &Inst::ULoad16 { + rd, + ref mem, + srcloc, + } + | &Inst::SLoad16 { + rd, + ref mem, + srcloc, + } + | &Inst::ULoad32 { + rd, + ref mem, + srcloc, + } + | &Inst::SLoad32 { + rd, + ref mem, + srcloc, + } + | &Inst::ULoad64 { + rd, + ref mem, + srcloc, + .. + } + | &Inst::FpuLoad32 { + rd, + ref mem, + srcloc, + } + | &Inst::FpuLoad64 { + rd, + ref mem, + srcloc, + } + | &Inst::FpuLoad128 { + rd, + ref mem, + srcloc, + } => { + let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem); + + for inst in mem_insts.into_iter() { + inst.emit(sink); + } + + // ldst encoding helpers take Reg, not Writable. + let rd = rd.to_reg(); + + // This is the base opcode (top 10 bits) for the "unscaled + // immediate" form (Unscaled). Other addressing modes will OR in + // other values for bits 24/25 (bits 1/2 of this constant). + let op = match self { + &Inst::ULoad8 { .. } => 0b0011100001, + &Inst::SLoad8 { .. } => 0b0011100010, + &Inst::ULoad16 { .. } => 0b0111100001, + &Inst::SLoad16 { .. } => 0b0111100010, + &Inst::ULoad32 { .. } => 0b1011100001, + &Inst::SLoad32 { .. } => 0b1011100010, + &Inst::ULoad64 { .. } => 0b1111100001, + &Inst::FpuLoad32 { .. } => 0b1011110001, + &Inst::FpuLoad64 { .. } => 0b1111110001, + &Inst::FpuLoad128 { .. } => 0b0011110011, + _ => unreachable!(), + }; + + if let Some(srcloc) = srcloc { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::OutOfBounds); + } + + match &mem { + &MemArg::Unscaled(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd)); + } + &MemArg::UnsignedOffset(reg, uimm12scaled) => { + sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd)); + } + &MemArg::RegReg(r1, r2) => { + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ false, /* extendop = */ None, rd, + )); + } + &MemArg::RegScaled(r1, r2, ty) | &MemArg::RegScaledExtended(r1, r2, ty, _) => { + match (ty, self) { + (I8, &Inst::ULoad8 { .. }) => {} + (I8, &Inst::SLoad8 { .. }) => {} + (I16, &Inst::ULoad16 { .. }) => {} + (I16, &Inst::SLoad16 { .. }) => {} + (I32, &Inst::ULoad32 { .. 
}) => {} + (I32, &Inst::SLoad32 { .. }) => {} + (I64, &Inst::ULoad64 { .. }) => {} + (F32, &Inst::FpuLoad32 { .. }) => {} + (F64, &Inst::FpuLoad64 { .. }) => {} + (I128, &Inst::FpuLoad128 { .. }) => {} + _ => panic!("Mismatching reg-scaling type in MemArg"), + } + let extendop = match &mem { + &MemArg::RegScaled(..) => None, + &MemArg::RegScaledExtended(_, _, _, op) => Some(op), + _ => unreachable!(), + }; + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ true, extendop, rd, + )); + } + &MemArg::Label(ref label) => { + let offset = match label { + &MemLabel::PCRel(off) => off as u32, + } / 4; + assert!(offset < (1 << 19)); + match self { + &Inst::ULoad32 { .. } => { + sink.put4(enc_ldst_imm19(0b00011000, offset, rd)); + } + &Inst::SLoad32 { .. } => { + sink.put4(enc_ldst_imm19(0b10011000, offset, rd)); + } + &Inst::FpuLoad32 { .. } => { + sink.put4(enc_ldst_imm19(0b00011100, offset, rd)); + } + &Inst::ULoad64 { .. } => { + sink.put4(enc_ldst_imm19(0b01011000, offset, rd)); + } + &Inst::FpuLoad64 { .. } => { + sink.put4(enc_ldst_imm19(0b01011100, offset, rd)); + } + &Inst::FpuLoad128 { .. } => { + sink.put4(enc_ldst_imm19(0b10011100, offset, rd)); + } + _ => panic!("Unspported size for LDR from constant pool!"), + } + } + &MemArg::PreIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b11, reg.to_reg(), rd)); + } + &MemArg::PostIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd)); + } + // Eliminated by `mem_finalize()` above. + &MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { + panic!("Should not see stack-offset here!") + } + } + } + + &Inst::Store8 { + rd, + ref mem, + srcloc, + } + | &Inst::Store16 { + rd, + ref mem, + srcloc, + } + | &Inst::Store32 { + rd, + ref mem, + srcloc, + } + | &Inst::Store64 { + rd, + ref mem, + srcloc, + .. + } + | &Inst::FpuStore32 { + rd, + ref mem, + srcloc, + } + | &Inst::FpuStore64 { + rd, + ref mem, + srcloc, + } + | &Inst::FpuStore128 { + rd, + ref mem, + srcloc, + } => { + let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem); + + for inst in mem_insts.into_iter() { + inst.emit(sink); + } + + let op = match self { + &Inst::Store8 { .. } => 0b0011100000, + &Inst::Store16 { .. } => 0b0111100000, + &Inst::Store32 { .. } => 0b1011100000, + &Inst::Store64 { .. } => 0b1111100000, + &Inst::FpuStore32 { .. } => 0b1011110000, + &Inst::FpuStore64 { .. } => 0b1111110000, + &Inst::FpuStore128 { .. } => 0b0011110010, + _ => unreachable!(), + }; + + if let Some(srcloc) = srcloc { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::OutOfBounds); + } + + match &mem { + &MemArg::Unscaled(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd)); + } + &MemArg::UnsignedOffset(reg, uimm12scaled) => { + sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd)); + } + &MemArg::RegReg(r1, r2) => { + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ false, /* extendop = */ None, rd, + )); + } + &MemArg::RegScaled(r1, r2, _ty) + | &MemArg::RegScaledExtended(r1, r2, _ty, _) => { + let extendop = match &mem { + &MemArg::RegScaled(..) => None, + &MemArg::RegScaledExtended(_, _, _, op) => Some(op), + _ => unreachable!(), + }; + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ true, extendop, rd, + )); + } + &MemArg::Label(..) 
=> { + panic!("Store to a MemLabel not implemented!"); + } + &MemArg::PreIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b11, reg.to_reg(), rd)); + } + &MemArg::PostIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd)); + } + // Eliminated by `mem_finalize()` above. + &MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { + panic!("Should not see stack-offset here!") + } + } + } + + &Inst::StoreP64 { rt, rt2, ref mem } => match mem { + &PairMemArg::SignedOffset(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100100, simm7, reg, rt, rt2)); + } + &PairMemArg::PreIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100110, simm7, reg.to_reg(), rt, rt2)); + } + &PairMemArg::PostIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100010, simm7, reg.to_reg(), rt, rt2)); + } + }, + &Inst::LoadP64 { rt, rt2, ref mem } => { + let rt = rt.to_reg(); + let rt2 = rt2.to_reg(); + match mem { + &PairMemArg::SignedOffset(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100101, simm7, reg, rt, rt2)); + } + &PairMemArg::PreIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100111, simm7, reg.to_reg(), rt, rt2)); + } + &PairMemArg::PostIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100011, simm7, reg.to_reg(), rt, rt2)); + } + } + } + &Inst::Mov { rd, rm } => { + assert!(rd.to_reg().get_class() == rm.get_class()); + assert!(rm.get_class() == RegClass::I64); + // Encoded as ORR rd, rm, zero. + sink.put4(enc_arith_rrr(0b10101010_000, 0b000_000, rd, zero_reg(), rm)); + } + &Inst::Mov32 { rd, rm } => { + // Encoded as ORR rd, rm, zero. 
+ sink.put4(enc_arith_rrr(0b00101010_000, 0b000_000, rd, zero_reg(), rm)); + } + &Inst::MovZ { rd, imm } => sink.put4(enc_move_wide(MoveWideOpcode::MOVZ, rd, imm)), + &Inst::MovN { rd, imm } => sink.put4(enc_move_wide(MoveWideOpcode::MOVN, rd, imm)), + &Inst::MovK { rd, imm } => sink.put4(enc_move_wide(MoveWideOpcode::MOVK, rd, imm)), + &Inst::CSel { rd, rn, rm, cond } => { + sink.put4(enc_csel(rd, rn, rm, cond)); + } + &Inst::CSet { rd, cond } => { + sink.put4(enc_cset(rd, cond)); + } + &Inst::FpuMove64 { rd, rn } => { + sink.put4(enc_vecmov(/* 16b = */ false, rd, rn)); + } + &Inst::FpuRR { fpu_op, rd, rn } => { + let top22 = match fpu_op { + FPUOp1::Abs32 => 0b000_11110_00_1_000001_10000, + FPUOp1::Abs64 => 0b000_11110_01_1_000001_10000, + FPUOp1::Neg32 => 0b000_11110_00_1_000010_10000, + FPUOp1::Neg64 => 0b000_11110_01_1_000010_10000, + FPUOp1::Sqrt32 => 0b000_11110_00_1_000011_10000, + FPUOp1::Sqrt64 => 0b000_11110_01_1_000011_10000, + FPUOp1::Cvt32To64 => 0b000_11110_00_1_000101_10000, + FPUOp1::Cvt64To32 => 0b000_11110_01_1_000100_10000, + }; + sink.put4(enc_fpurr(top22, rd, rn)); + } + &Inst::FpuRRR { fpu_op, rd, rn, rm } => { + let top22 = match fpu_op { + FPUOp2::Add32 => 0b000_11110_00_1_00000_001010, + FPUOp2::Add64 => 0b000_11110_01_1_00000_001010, + FPUOp2::Sub32 => 0b000_11110_00_1_00000_001110, + FPUOp2::Sub64 => 0b000_11110_01_1_00000_001110, + FPUOp2::Mul32 => 0b000_11110_00_1_00000_000010, + FPUOp2::Mul64 => 0b000_11110_01_1_00000_000010, + FPUOp2::Div32 => 0b000_11110_00_1_00000_000110, + FPUOp2::Div64 => 0b000_11110_01_1_00000_000110, + FPUOp2::Max32 => 0b000_11110_00_1_00000_010010, + FPUOp2::Max64 => 0b000_11110_01_1_00000_010010, + FPUOp2::Min32 => 0b000_11110_00_1_00000_010110, + FPUOp2::Min64 => 0b000_11110_01_1_00000_010110, + }; + sink.put4(enc_fpurrr(top22, rd, rn, rm)); + } + &Inst::FpuRRRR { + fpu_op, + rd, + rn, + rm, + ra, + } => { + let top17 = match fpu_op { + FPUOp3::MAdd32 => 0b000_11111_00_0_00000_0, + FPUOp3::MAdd64 => 0b000_11111_01_0_00000_0, + }; + sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra)); + } + &Inst::FpuCmp32 { rn, rm } => { + sink.put4(enc_fcmp(/* is32 = */ true, rn, rm)); + } + &Inst::FpuCmp64 { rn, rm } => { + sink.put4(enc_fcmp(/* is32 = */ false, rn, rm)); + } + &Inst::FpuToInt { op, rd, rn } => { + let top16 = match op { + // FCVTZS (32/32-bit) + FpuToIntOp::F32ToI32 => 0b000_11110_00_1_11_000, + // FCVTZU (32/32-bit) + FpuToIntOp::F32ToU32 => 0b000_11110_00_1_11_001, + // FCVTZS (32/64-bit) + FpuToIntOp::F32ToI64 => 0b100_11110_00_1_11_000, + // FCVTZU (32/64-bit) + FpuToIntOp::F32ToU64 => 0b100_11110_00_1_11_001, + // FCVTZS (64/32-bit) + FpuToIntOp::F64ToI32 => 0b000_11110_01_1_11_000, + // FCVTZU (64/32-bit) + FpuToIntOp::F64ToU32 => 0b000_11110_01_1_11_001, + // FCVTZS (64/64-bit) + FpuToIntOp::F64ToI64 => 0b100_11110_01_1_11_000, + // FCVTZU (64/64-bit) + FpuToIntOp::F64ToU64 => 0b100_11110_01_1_11_001, + }; + sink.put4(enc_fputoint(top16, rd, rn)); + } + &Inst::IntToFpu { op, rd, rn } => { + let top16 = match op { + // SCVTF (32/32-bit) + IntToFpuOp::I32ToF32 => 0b000_11110_00_1_00_010, + // UCVTF (32/32-bit) + IntToFpuOp::U32ToF32 => 0b000_11110_00_1_00_011, + // SCVTF (64/32-bit) + IntToFpuOp::I64ToF32 => 0b100_11110_00_1_00_010, + // UCVTF (64/32-bit) + IntToFpuOp::U64ToF32 => 0b100_11110_00_1_00_011, + // SCVTF (32/64-bit) + IntToFpuOp::I32ToF64 => 0b000_11110_01_1_00_010, + // UCVTF (32/64-bit) + IntToFpuOp::U32ToF64 => 0b000_11110_01_1_00_011, + // SCVTF (64/64-bit) + IntToFpuOp::I64ToF64 => 0b100_11110_01_1_00_010, + // 
UCVTF (64/64-bit) + IntToFpuOp::U64ToF64 => 0b100_11110_01_1_00_011, + }; + sink.put4(enc_inttofpu(top16, rd, rn)); + } + &Inst::LoadFpuConst32 { rd, const_data } => { + let inst = Inst::FpuLoad32 { + rd, + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, + }; + inst.emit(sink); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(8), + }; + inst.emit(sink); + sink.put4(const_data.to_bits()); + } + &Inst::LoadFpuConst64 { rd, const_data } => { + let inst = Inst::FpuLoad64 { + rd, + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, + }; + inst.emit(sink); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(12), + }; + inst.emit(sink); + sink.put8(const_data.to_bits()); + } + &Inst::FpuCSel32 { rd, rn, rm, cond } => { + sink.put4(enc_fcsel(rd, rn, rm, cond, /* is32 = */ true)); + } + &Inst::FpuCSel64 { rd, rn, rm, cond } => { + sink.put4(enc_fcsel(rd, rn, rm, cond, /* is32 = */ false)); + } + &Inst::FpuRound { op, rd, rn } => { + let top22 = match op { + FpuRoundMode::Minus32 => 0b000_11110_00_1_001_010_10000, + FpuRoundMode::Minus64 => 0b000_11110_01_1_001_010_10000, + FpuRoundMode::Plus32 => 0b000_11110_00_1_001_001_10000, + FpuRoundMode::Plus64 => 0b000_11110_01_1_001_001_10000, + FpuRoundMode::Zero32 => 0b000_11110_00_1_001_011_10000, + FpuRoundMode::Zero64 => 0b000_11110_01_1_001_011_10000, + FpuRoundMode::Nearest32 => 0b000_11110_00_1_001_000_10000, + FpuRoundMode::Nearest64 => 0b000_11110_01_1_001_000_10000, + }; + sink.put4(enc_fround(top22, rd, rn)); + } + &Inst::MovToVec64 { rd, rn } => { + sink.put4( + 0b010_01110000_01000_0_0011_1_00000_00000 + | (machreg_to_gpr(rn) << 5) + | machreg_to_vec(rd.to_reg()), + ); + } + &Inst::MovFromVec64 { rd, rn } => { + sink.put4( + 0b010_01110000_01000_0_0111_1_00000_00000 + | (machreg_to_vec(rn) << 5) + | machreg_to_gpr(rd.to_reg()), + ); + } + &Inst::VecRRR { rd, rn, rm, alu_op } => { + let (top11, bit15_10) = match alu_op { + VecALUOp::SQAddScalar => (0b010_11110_11_1, 0b000011), + VecALUOp::SQSubScalar => (0b010_11110_11_1, 0b001011), + VecALUOp::UQAddScalar => (0b011_11110_11_1, 0b000011), + VecALUOp::UQSubScalar => (0b011_11110_11_1, 0b001011), + }; + sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd)); + } + &Inst::MovToNZCV { rn } => { + sink.put4(0xd51b4200 | machreg_to_gpr(rn)); + } + &Inst::MovFromNZCV { rd } => { + sink.put4(0xd53b4200 | machreg_to_gpr(rd.to_reg())); + } + &Inst::CondSet { rd, cond } => { + sink.put4( + 0b100_11010100_11111_0000_01_11111_00000 + | (cond.invert().bits() << 12) + | machreg_to_gpr(rd.to_reg()), + ); + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits >= 8 => { + let top22 = match (signed, from_bits, to_bits) { + (false, 8, 32) => 0b010_100110_0_000000_000111, // UXTB (32) + (false, 16, 32) => 0b010_100110_0_000000_001111, // UXTH (32) + (true, 8, 32) => 0b000_100110_0_000000_000111, // SXTB (32) + (true, 16, 32) => 0b000_100110_0_000000_001111, // SXTH (32) + // The 64-bit unsigned variants are the same as the 32-bit ones, + // because writes to Wn zero out the top 32 bits of Xn + (false, 8, 64) => 0b010_100110_0_000000_000111, // UXTB (64) + (false, 16, 64) => 0b010_100110_0_000000_001111, // UXTH (64) + (true, 8, 64) => 0b100_100110_1_000000_000111, // SXTB (64) + (true, 16, 64) => 0b100_100110_1_000000_001111, // SXTH (64) + // 32-to-64: the unsigned case is a 'mov' (special-cased below). 
+ (false, 32, 64) => 0, // MOV + (true, 32, 64) => 0b100_100110_1_000000_011111, // SXTW (64) + _ => panic!( + "Unsupported extend combination: signed = {}, from_bits = {}, to_bits = {}", + signed, from_bits, to_bits + ), + }; + if top22 != 0 { + sink.put4(enc_extend(top22, rd, rn)); + } else { + Inst::mov32(rd, rn).emit(sink); + } + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits == 1 && signed => { + assert!(to_bits <= 64); + // Reduce sign-extend-from-1-bit to: + // - and rd, rn, #1 + // - sub rd, zr, rd + + // We don't have ImmLogic yet, so we just hardcode this. FIXME. + sink.put4(0x92400000 | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())); + let sub_inst = Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd, + rn: zero_reg(), + rm: rd.to_reg(), + }; + sub_inst.emit(sink); + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits == 1 && !signed => { + assert!(to_bits <= 64); + // Reduce zero-extend-from-1-bit to: + // - and rd, rn, #1 + + // We don't have ImmLogic yet, so we just hardcode this. FIXME. + sink.put4(0x92400000 | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())); + } + &Inst::Extend { .. } => { + panic!("Unsupported extend variant"); + } + &Inst::Jump { ref dest } => { + // TODO: differentiate between as_off26() returning `None` for + // out-of-range vs. not-yet-finalized. The latter happens when we + // do early (fake) emission for size computation. + sink.put4(enc_jump26(0b000101, dest.as_off26().unwrap())); + } + &Inst::Ret {} => { + sink.put4(0xd65f03c0); + } + &Inst::EpiloguePlaceholder {} => { + // Noop; this is just a placeholder for epilogues. + } + &Inst::Call { + ref dest, + loc, + opcode, + .. + } => { + sink.add_reloc(loc, Reloc::Arm64Call, dest, 0); + sink.put4(enc_jump26(0b100101, 0)); + if opcode.is_call() { + sink.add_call_site(loc, opcode); + } + } + &Inst::CallInd { + rn, loc, opcode, .. + } => { + sink.put4(0b1101011_0001_11111_000000_00000_00000 | (machreg_to_gpr(rn) << 5)); + if opcode.is_call() { + sink.add_call_site(loc, opcode); + } + } + &Inst::CondBr { .. } => panic!("Unlowered CondBr during binemit!"), + &Inst::CondBrLowered { target, kind } => match kind { + // TODO: handle >2^19 case by emitting a compound sequence with + // an unconditional (26-bit) branch. We need branch-relaxation + // adjustment machinery to enable this (because we don't want to + // always emit the long form). + CondBrKind::Zero(reg) => { + sink.put4(enc_cmpbr(0b1_011010_0, target.as_off19().unwrap(), reg)); + } + CondBrKind::NotZero(reg) => { + sink.put4(enc_cmpbr(0b1_011010_1, target.as_off19().unwrap(), reg)); + } + CondBrKind::Cond(c) => { + sink.put4(enc_cbr( + 0b01010100, + target.as_off19().unwrap_or(0), + 0b0, + c.bits(), + )); + } + }, + &Inst::CondBrLoweredCompound { + taken, + not_taken, + kind, + } => { + // Conditional part first. + match kind { + CondBrKind::Zero(reg) => { + sink.put4(enc_cmpbr(0b1_011010_0, taken.as_off19().unwrap(), reg)); + } + CondBrKind::NotZero(reg) => { + sink.put4(enc_cmpbr(0b1_011010_1, taken.as_off19().unwrap(), reg)); + } + CondBrKind::Cond(c) => { + sink.put4(enc_cbr( + 0b01010100, + taken.as_off19().unwrap_or(0), + 0b0, + c.bits(), + )); + } + } + // Unconditional part. + sink.put4(enc_jump26(0b000101, not_taken.as_off26().unwrap_or(0))); + } + &Inst::IndirectBr { rn, .. 
} => { + sink.put4(enc_br(rn)); + } + &Inst::Nop => {} + &Inst::Nop4 => { + sink.put4(0xd503201f); + } + &Inst::Brk => { + sink.put4(0xd4200000); + } + &Inst::Udf { trap_info } => { + let (srcloc, code) = trap_info; + sink.add_trap(srcloc, code); + sink.put4(0xd4a00000); + } + &Inst::Adr { rd, ref label } => { + let off = memlabel_finalize(sink.cur_offset_from_start(), label); + assert!(off > -(1 << 20)); + assert!(off < (1 << 20)); + sink.put4(enc_adr(off, rd)); + } + &Inst::Word4 { data } => { + sink.put4(data); + } + &Inst::Word8 { data } => { + sink.put8(data); + } + &Inst::JTSequence { + ridx, + rtmp1, + rtmp2, + ref targets, + .. + } => { + // This sequence is *one* instruction in the vcode, and is expanded only here at + // emission time, because we cannot allow the regalloc to insert spills/reloads in + // the middle; we depend on hardcoded PC-rel addressing below. + // + // N.B.: if PC-rel addressing on ADR below is changed, also update + // `Inst::with_block_offsets()` in arm64/inst/mod.rs. + + // Save index in a tmp (the live range of ridx only goes to start of this + // sequence; rtmp1 or rtmp2 may overwrite it). + let inst = Inst::gen_move(rtmp2, ridx, I64); + inst.emit(sink); + // Load address of jump table + let inst = Inst::Adr { + rd: rtmp1, + label: MemLabel::PCRel(16), + }; + inst.emit(sink); + // Load value out of jump table + let inst = Inst::SLoad32 { + rd: rtmp2, + mem: MemArg::reg_reg_scaled_extended( + rtmp1.to_reg(), + rtmp2.to_reg(), + I32, + ExtendOp::UXTW, + ), + srcloc: None, // can't cause a user trap. + }; + inst.emit(sink); + // Add base of jump table to jump-table-sourced block offset + let inst = Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: rtmp1, + rn: rtmp1.to_reg(), + rm: rtmp2.to_reg(), + }; + inst.emit(sink); + // Branch to computed address. (`targets` here is only used for successor queries + // and is not needed for emission.) + let inst = Inst::IndirectBr { + rn: rtmp1.to_reg(), + targets: vec![], + }; + inst.emit(sink); + // Emit jump table (table of 32-bit offsets). + for target in targets { + let off = target.as_offset_words() * 4; + let off = off as i32 as u32; + sink.put4(off); + } + } + &Inst::LoadConst64 { rd, const_data } => { + let inst = Inst::ULoad64 { + rd, + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, // can't cause a user trap. + }; + inst.emit(sink); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(12), + }; + inst.emit(sink); + sink.put8(const_data); + } + &Inst::LoadExtName { + rd, + ref name, + offset, + srcloc, + } => { + let inst = Inst::ULoad64 { + rd, + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, // can't cause a user trap. + }; + inst.emit(sink); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(12), + }; + inst.emit(sink); + sink.add_reloc(srcloc, Reloc::Abs8, name, offset); + sink.put8(0); + } + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::isa::test_utils; + + #[test] + fn test_arm64_binemit() { + let mut insns = Vec::<(Inst, &str, &str)>::new(); + + // N.B.: the architecture is little-endian, so when transcribing the 32-bit + // hex instructions from e.g. objdump disassembly, one must swap the bytes + // seen below. (E.g., a `ret` is normally written as the u32 `D65F03C0`, + // but we write it here as C0035FD6.) 
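+        // Illustrative aside (not part of the original patch): the expected strings
+        // in `insns` are the little-endian byte image of each 32-bit instruction
+        // word, so a hypothetical helper like `le_hex` below reproduces them from
+        // the numeric encoding described in the comment above.
+        fn le_hex(word: u32) -> String {
+            word.to_le_bytes().iter().map(|b| format!("{:02X}", b)).collect()
+        }
+        assert_eq!(le_hex(0xD65F03C0), "C0035FD6"); // `ret`, transcribed as noted above.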
+ + // Useful helper script to produce the encodings from the text: + // + // #!/bin/sh + // tmp=`mktemp /tmp/XXXXXXXX.o` + // aarch64-linux-gnu-as /dev/stdin -o $tmp + // aarch64-linux-gnu-objdump -d $tmp + // rm -f $tmp + // + // Then: + // + // $ echo "mov x1, x2" | arm64inst.sh + insns.push((Inst::Ret {}, "C0035FD6", "ret")); + insns.push((Inst::Nop {}, "", "nop-zero-len")); + insns.push((Inst::Nop4 {}, "1F2003D5", "nop")); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Add32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100030B", + "add w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400068B", + "add x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Sub32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100034B", + "sub w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006CB", + "sub x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Orr32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100032A", + "orr w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Orr64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006AA", + "orr x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::And32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100030A", + "and w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::And64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400068A", + "and x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SubS32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100036B", + "subs w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SubS64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006EB", + "subs x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AddS32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100032B", + "adds w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AddS64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006AB", + "adds x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SDiv64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40CC69A", + "sdiv x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::UDiv64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A408C69A", + "udiv x4, x5, x6", + )); + + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Eor32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400064A", + "eor w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Eor64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006CA", + "eor x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AndNot32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400260A", + "bic w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AndNot64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400268A", + "bic x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::OrrNot32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400262A", + "orn w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::OrrNot64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40026AA", + "orn x4, x5, x6", + 
)); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::EorNot32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400264A", + "eon w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::EorNot64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40026CA", + "eon x4, x5, x6", + )); + + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::RotR32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A42CC61A", + "ror w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::RotR64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A42CC69A", + "ror x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsr32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A424C61A", + "lsr w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsr64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A424C69A", + "lsr x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Asr32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A428C61A", + "asr w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Asr64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A428C69A", + "asr x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsl32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A420C61A", + "lsl w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsl64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A420C69A", + "lsl x4, x5, x6", + )); + + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0411", + "add w7, w8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: true, + }, + }, + "078D4411", + "add w7, w8, #1191936", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0491", + "add x7, x8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Sub32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0451", + "sub w7, w8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Sub64, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D04D1", + "sub x7, x8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::SubS32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0471", + "subs w7, w8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::SubS64, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D04F1", + "subs x7, x8, #291", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Add32, + rd: writable_xreg(7), + rn: xreg(8), + rm: xreg(9), + extendop: ExtendOp::SXTB, + }, + "0781290B", + "add w7, w8, w9, SXTB", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Add64, + rd: writable_xreg(15), + rn: xreg(16), + rm: xreg(17), + extendop: ExtendOp::UXTB, + }, + "0F02318B", + "add x15, x16, x17, UXTB", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Sub32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + extendop: ExtendOp::SXTH, + }, + "41A0234B", + "sub 
w1, w2, w3, SXTH", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Sub64, + rd: writable_xreg(20), + rn: xreg(21), + rm: xreg(22), + extendop: ExtendOp::UXTW, + }, + "B44236CB", + "sub x20, x21, x22, UXTW", + )); + + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Add32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(20).unwrap(), + ), + }, + "6A510C0B", + "add w10, w11, w12, LSL 20", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::ASR, + ShiftOpShiftImm::maybe_from_shift(42).unwrap(), + ), + }, + "6AA98C8B", + "add x10, x11, x12, ASR 42", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sub32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C4B", + "sub w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sub64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CCB", + "sub x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Orr32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C2A", + "orr w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Orr64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CAA", + "orr x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::And32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C0A", + "and w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::And64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C8A", + "and x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Eor32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C4A", + "eor w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Eor64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CCA", + "eor x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::OrrNot32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C2A", + "orn w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::OrrNot64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2CAA", + "orn x10, x11, x12, LSL 23", + )); + 
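+        // Illustrative aside (not part of the original patch): the AluRRRShift cases
+        // above are emitted via `enc_arith_rrr`, with the shift amount in bits 15..10,
+        // rm/rn/rd in bits 16/5/0, and the shift kind folded into `top11`. Recomputing
+        // "add w10, w11, w12, LSL 20" by hand yields the word that the little-endian
+        // test string "6A510C0B" above encodes.
+        let add_w10_w11_w12_lsl20: u32 =
+            (0b000_01011000 << 21) | (12 << 16) | (20 << 10) | (11 << 5) | 10;
+        assert_eq!(add_w10_w11_w12_lsl20, 0x0B0C516A);
+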
insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AndNot32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C0A", + "bic w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AndNot64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C8A", + "bic x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::EorNot32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C4A", + "eon w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::EorNot64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2CCA", + "eon x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AddS32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C2B", + "adds w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AddS64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CAB", + "adds x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::SubS32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C6B", + "subs w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::SubS64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CEB", + "subs x10, x11, x12, LSL 23", + )); + + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::MAdd32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4110031B", + "madd w1, w2, w3, w4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::MAdd64, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4110039B", + "madd x1, x2, x3, x4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::MSub32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4190031B", + "msub w1, w2, w3, w4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::MSub64, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4190039B", + "msub x1, x2, x3, x4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::SMulH, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: zero_reg(), + }, + "417C439B", + "smulh x1, x2, x3", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::UMulH, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: zero_reg(), + }, + "417CC39B", + "umulh x1, x2, x3", + )); + + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::RotR32, + rd: writable_xreg(20), + rn: xreg(21), + immshift: ImmShift::maybe_from_u64(19).unwrap(), + }, + "B44E9513", + "ror w20, w21, #19", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: 
ALUOp::RotR64, + rd: writable_xreg(20), + rn: xreg(21), + immshift: ImmShift::maybe_from_u64(42).unwrap(), + }, + "B4AAD593", + "ror x20, x21, #42", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsr32, + rd: writable_xreg(10), + rn: xreg(11), + immshift: ImmShift::maybe_from_u64(13).unwrap(), + }, + "6A7D0D53", + "lsr w10, w11, #13", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsr64, + rd: writable_xreg(10), + rn: xreg(11), + immshift: ImmShift::maybe_from_u64(57).unwrap(), + }, + "6AFD79D3", + "lsr x10, x11, #57", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Asr32, + rd: writable_xreg(4), + rn: xreg(5), + immshift: ImmShift::maybe_from_u64(7).unwrap(), + }, + "A47C0713", + "asr w4, w5, #7", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Asr64, + rd: writable_xreg(4), + rn: xreg(5), + immshift: ImmShift::maybe_from_u64(35).unwrap(), + }, + "A4FC6393", + "asr x4, x5, #35", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsl32, + rd: writable_xreg(8), + rn: xreg(9), + immshift: ImmShift::maybe_from_u64(24).unwrap(), + }, + "281D0853", + "lsl w8, w9, #24", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsl64, + rd: writable_xreg(8), + rn: xreg(9), + immshift: ImmShift::maybe_from_u64(63).unwrap(), + }, + "280141D3", + "lsl x8, x9, #63", + )); + + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::And32, + rd: writable_xreg(21), + rn: xreg(27), + imml: ImmLogic::maybe_from_u64(0x80003fff, I32).unwrap(), + }, + "753B0112", + "and w21, w27, #2147500031", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: writable_xreg(7), + rn: xreg(6), + imml: ImmLogic::maybe_from_u64(0x3fff80003fff800, I64).unwrap(), + }, + "C7381592", + "and x7, x6, #288221580125796352", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Orr32, + rd: writable_xreg(1), + rn: xreg(5), + imml: ImmLogic::maybe_from_u64(0x100000, I32).unwrap(), + }, + "A1000C32", + "orr w1, w5, #1048576", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Orr64, + rd: writable_xreg(4), + rn: xreg(5), + imml: ImmLogic::maybe_from_u64(0x8181818181818181, I64).unwrap(), + }, + "A4C401B2", + "orr x4, x5, #9331882296111890817", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Eor32, + rd: writable_xreg(1), + rn: xreg(5), + imml: ImmLogic::maybe_from_u64(0x00007fff, I32).unwrap(), + }, + "A1380052", + "eor w1, w5, #32767", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Eor64, + rd: writable_xreg(10), + rn: xreg(8), + imml: ImmLogic::maybe_from_u64(0x8181818181818181, I64).unwrap(), + }, + "0AC501D2", + "eor x10, x8, #9331882296111890817", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::RBit32, + rd: writable_xreg(1), + rn: xreg(10), + }, + "4101C05A", + "rbit w1, w10", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::RBit64, + rd: writable_xreg(1), + rn: xreg(10), + }, + "4101C0DA", + "rbit x1, x10", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Clz32, + rd: writable_xreg(15), + rn: xreg(3), + }, + "6F10C05A", + "clz w15, w3", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Clz64, + rd: writable_xreg(15), + rn: xreg(3), + }, + "6F10C0DA", + "clz x15, x3", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Cls32, + rd: writable_xreg(21), + rn: xreg(16), + }, + "1516C05A", + "cls w21, w16", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Cls64, + rd: writable_xreg(21), + rn: xreg(16), + }, + "1516C0DA", + "cls x21, x16", + )); + + insns.push(( + Inst::ULoad8 { + 
rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "41004038", + "ldurb w1, [x2]", + )); + insns.push(( + Inst::ULoad8 { + rd: writable_xreg(1), + mem: MemArg::UnsignedOffset(xreg(2), UImm12Scaled::zero(I8)), + srcloc: None, + }, + "41004039", + "ldrb w1, [x2]", + )); + insns.push(( + Inst::ULoad8 { + rd: writable_xreg(1), + mem: MemArg::RegReg(xreg(2), xreg(5)), + srcloc: None, + }, + "41686538", + "ldrb w1, [x2, x5]", + )); + insns.push(( + Inst::SLoad8 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "41008038", + "ldursb x1, [x2]", + )); + insns.push(( + Inst::SLoad8 { + rd: writable_xreg(1), + mem: MemArg::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(63, I8).unwrap()), + srcloc: None, + }, + "41FC8039", + "ldrsb x1, [x2, #63]", + )); + insns.push(( + Inst::SLoad8 { + rd: writable_xreg(1), + mem: MemArg::RegReg(xreg(2), xreg(5)), + srcloc: None, + }, + "4168A538", + "ldrsb x1, [x2, x5]", + )); + insns.push(( + Inst::ULoad16 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::maybe_from_i64(5).unwrap()), + srcloc: None, + }, + "41504078", + "ldurh w1, [x2, #5]", + )); + insns.push(( + Inst::ULoad16 { + rd: writable_xreg(1), + mem: MemArg::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(8, I16).unwrap()), + srcloc: None, + }, + "41104079", + "ldrh w1, [x2, #8]", + )); + insns.push(( + Inst::ULoad16 { + rd: writable_xreg(1), + mem: MemArg::RegScaled(xreg(2), xreg(3), I16), + srcloc: None, + }, + "41786378", + "ldrh w1, [x2, x3, LSL #1]", + )); + insns.push(( + Inst::SLoad16 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "41008078", + "ldursh x1, [x2]", + )); + insns.push(( + Inst::SLoad16 { + rd: writable_xreg(28), + mem: MemArg::UnsignedOffset( + xreg(20), + UImm12Scaled::maybe_from_i64(24, I16).unwrap(), + ), + srcloc: None, + }, + "9C328079", + "ldrsh x28, [x20, #24]", + )); + insns.push(( + Inst::SLoad16 { + rd: writable_xreg(28), + mem: MemArg::RegScaled(xreg(20), xreg(20), I16), + srcloc: None, + }, + "9C7AB478", + "ldrsh x28, [x20, x20, LSL #1]", + )); + insns.push(( + Inst::ULoad32 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "410040B8", + "ldur w1, [x2]", + )); + insns.push(( + Inst::ULoad32 { + rd: writable_xreg(12), + mem: MemArg::UnsignedOffset( + xreg(0), + UImm12Scaled::maybe_from_i64(204, I32).unwrap(), + ), + srcloc: None, + }, + "0CCC40B9", + "ldr w12, [x0, #204]", + )); + insns.push(( + Inst::ULoad32 { + rd: writable_xreg(1), + mem: MemArg::RegScaled(xreg(2), xreg(12), I32), + srcloc: None, + }, + "41786CB8", + "ldr w1, [x2, x12, LSL #2]", + )); + insns.push(( + Inst::SLoad32 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "410080B8", + "ldursw x1, [x2]", + )); + insns.push(( + Inst::SLoad32 { + rd: writable_xreg(12), + mem: MemArg::UnsignedOffset( + xreg(1), + UImm12Scaled::maybe_from_i64(16380, I32).unwrap(), + ), + srcloc: None, + }, + "2CFCBFB9", + "ldrsw x12, [x1, #16380]", + )); + insns.push(( + Inst::SLoad32 { + rd: writable_xreg(1), + mem: MemArg::RegScaled(xreg(5), xreg(1), I32), + srcloc: None, + }, + "A178A1B8", + "ldrsw x1, [x5, x1, LSL #2]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "410040F8", + "ldur x1, [x2]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: 
MemArg::Unscaled(xreg(2), SImm9::maybe_from_i64(-256).unwrap()), + srcloc: None, + }, + "410050F8", + "ldur x1, [x2, #-256]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::maybe_from_i64(255).unwrap()), + srcloc: None, + }, + "41F04FF8", + "ldur x1, [x2, #255]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::UnsignedOffset( + xreg(2), + UImm12Scaled::maybe_from_i64(32760, I64).unwrap(), + ), + srcloc: None, + }, + "41FC7FF9", + "ldr x1, [x2, #32760]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::RegReg(xreg(2), xreg(3)), + srcloc: None, + }, + "416863F8", + "ldr x1, [x2, x3]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::RegScaled(xreg(2), xreg(3), I64), + srcloc: None, + }, + "417863F8", + "ldr x1, [x2, x3, LSL #3]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::RegScaledExtended(xreg(2), xreg(3), I64, ExtendOp::SXTW), + srcloc: None, + }, + "41D863F8", + "ldr x1, [x2, w3, SXTW #3]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::Label(MemLabel::PCRel(64)), + srcloc: None, + }, + "01020058", + "ldr x1, pc+64", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::PreIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + srcloc: None, + }, + "410C41F8", + "ldr x1, [x2, #16]!", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::PostIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + srcloc: None, + }, + "410441F8", + "ldr x1, [x2], #16", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::FPOffset(32768), + srcloc: None, + }, + "0F0090D2EF011D8BE10140F9", + "movz x15, #32768 ; add x15, x15, fp ; ldr x1, [x15]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::FPOffset(-32768), + srcloc: None, + }, + "EFFF8F92EF011D8BE10140F9", + "movn x15, #32767 ; add x15, x15, fp ; ldr x1, [x15]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::FPOffset(1048576), // 2^20 + srcloc: None, + }, + "0F02A0D2EF011D8BE10140F9", + "movz x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::FPOffset(1048576 + 1), // 2^20 + 1 + srcloc: None, + }, + "2F0080D20F02A0F2EF011D8BE10140F9", + "movz x15, #1 ; movk x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]", + )); + + insns.push(( + Inst::Store8 { + rd: xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "41000038", + "sturb w1, [x2]", + )); + insns.push(( + Inst::Store8 { + rd: xreg(1), + mem: MemArg::UnsignedOffset( + xreg(2), + UImm12Scaled::maybe_from_i64(4095, I8).unwrap(), + ), + srcloc: None, + }, + "41FC3F39", + "strb w1, [x2, #4095]", + )); + insns.push(( + Inst::Store16 { + rd: xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "41000078", + "sturh w1, [x2]", + )); + insns.push(( + Inst::Store16 { + rd: xreg(1), + mem: MemArg::UnsignedOffset( + xreg(2), + UImm12Scaled::maybe_from_i64(8190, I16).unwrap(), + ), + srcloc: None, + }, + "41FC3F79", + "strh w1, [x2, #8190]", + )); + insns.push(( + Inst::Store32 { + rd: xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "410000B8", + "stur w1, [x2]", + )); + insns.push(( + Inst::Store32 { + rd: xreg(1), + mem: MemArg::UnsignedOffset( + xreg(2), + 
UImm12Scaled::maybe_from_i64(16380, I32).unwrap(), + ), + srcloc: None, + }, + "41FC3FB9", + "str w1, [x2, #16380]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "410000F8", + "stur x1, [x2]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::UnsignedOffset( + xreg(2), + UImm12Scaled::maybe_from_i64(32760, I64).unwrap(), + ), + srcloc: None, + }, + "41FC3FF9", + "str x1, [x2, #32760]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::RegReg(xreg(2), xreg(3)), + srcloc: None, + }, + "416823F8", + "str x1, [x2, x3]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::RegScaled(xreg(2), xreg(3), I64), + srcloc: None, + }, + "417823F8", + "str x1, [x2, x3, LSL #3]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::RegScaledExtended(xreg(2), xreg(3), I64, ExtendOp::UXTW), + srcloc: None, + }, + "415823F8", + "str x1, [x2, w3, UXTW #3]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::PreIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + srcloc: None, + }, + "410C01F8", + "str x1, [x2, #16]!", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::PostIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + srcloc: None, + }, + "410401F8", + "str x1, [x2], #16", + )); + + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairMemArg::SignedOffset(xreg(10), SImm7Scaled::zero(I64)), + }, + "482500A9", + "stp x8, x9, [x10]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairMemArg::SignedOffset( + xreg(10), + SImm7Scaled::maybe_from_i64(504, I64).unwrap(), + ), + }, + "48A51FA9", + "stp x8, x9, [x10, #504]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairMemArg::SignedOffset( + xreg(10), + SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), + ), + }, + "48253CA9", + "stp x8, x9, [x10, #-64]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(21), + rt2: xreg(28), + mem: PairMemArg::SignedOffset( + xreg(1), + SImm7Scaled::maybe_from_i64(-512, I64).unwrap(), + ), + }, + "357020A9", + "stp x21, x28, [x1, #-512]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairMemArg::PreIndexed( + writable_xreg(10), + SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), + ), + }, + "4825BCA9", + "stp x8, x9, [x10, #-64]!", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(15), + rt2: xreg(16), + mem: PairMemArg::PostIndexed( + writable_xreg(20), + SImm7Scaled::maybe_from_i64(504, I64).unwrap(), + ), + }, + "8FC29FA8", + "stp x15, x16, [x20], #504", + )); + + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairMemArg::SignedOffset(xreg(10), SImm7Scaled::zero(I64)), + }, + "482540A9", + "ldp x8, x9, [x10]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairMemArg::SignedOffset( + xreg(10), + SImm7Scaled::maybe_from_i64(504, I64).unwrap(), + ), + }, + "48A55FA9", + "ldp x8, x9, [x10, #504]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairMemArg::SignedOffset( + xreg(10), + SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), + ), + }, + "48257CA9", + "ldp x8, x9, [x10, #-64]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairMemArg::SignedOffset( + xreg(10), + SImm7Scaled::maybe_from_i64(-512, I64).unwrap(), + ), + 
}, + "482560A9", + "ldp x8, x9, [x10, #-512]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairMemArg::PreIndexed( + writable_xreg(10), + SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), + ), + }, + "4825FCA9", + "ldp x8, x9, [x10, #-64]!", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(25), + mem: PairMemArg::PostIndexed( + writable_xreg(12), + SImm7Scaled::maybe_from_i64(504, I64).unwrap(), + ), + }, + "88E5DFA8", + "ldp x8, x25, [x12], #504", + )); + + insns.push(( + Inst::Mov { + rd: writable_xreg(8), + rm: xreg(9), + }, + "E80309AA", + "mov x8, x9", + )); + insns.push(( + Inst::Mov32 { + rd: writable_xreg(8), + rm: xreg(9), + }, + "E803092A", + "mov w8, w9", + )); + + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), + }, + "E8FF9FD2", + "movz x8, #65535", + )); + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), + }, + "E8FFBFD2", + "movz x8, #65535, LSL #16", + )); + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(), + }, + "E8FFDFD2", + "movz x8, #65535, LSL #32", + )); + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(), + }, + "E8FFFFD2", + "movz x8, #65535, LSL #48", + )); + + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), + }, + "E8FF9F92", + "movn x8, #65535", + )); + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), + }, + "E8FFBF92", + "movn x8, #65535, LSL #16", + )); + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(), + }, + "E8FFDF92", + "movn x8, #65535, LSL #32", + )); + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(), + }, + "E8FFFF92", + "movn x8, #65535, LSL #48", + )); + + insns.push(( + Inst::MovK { + rd: writable_xreg(12), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_0000).unwrap(), + }, + "0C0080F2", + "movk x12, #0", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(19), + imm: MoveWideConst::maybe_with_shift(0x0000, 16).unwrap(), + }, + "1300A0F2", + "movk x19, #0, LSL #16", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(3), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), + }, + "E3FF9FF2", + "movk x3, #65535", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), + }, + "E8FFBFF2", + "movk x8, #65535, LSL #16", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(), + }, + "E8FFDFF2", + "movk x8, #65535, LSL #32", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(), + }, + "E8FFFFF2", + "movk x8, #65535, LSL #48", + )); + + insns.push(( + Inst::CSel { + rd: writable_xreg(10), + rn: xreg(12), + rm: xreg(14), + cond: Cond::Hs, + }, + "8A218E9A", + "csel x10, x12, x14, hs", + )); + insns.push(( + Inst::CSet { + rd: writable_xreg(15), + cond: Cond::Ge, + }, + "EFB79F9A", + "cset x15, ge", + )); + insns.push(( + Inst::MovToVec64 { + rd: 
writable_vreg(20), + rn: xreg(21), + }, + "B41E084E", + "mov v20.d[0], x21", + )); + insns.push(( + Inst::MovFromVec64 { + rd: writable_xreg(21), + rn: vreg(20), + }, + "953E084E", + "mov x21, v20.d[0]", + )); + insns.push(( + Inst::MovToNZCV { rn: xreg(13) }, + "0D421BD5", + "msr nzcv, x13", + )); + insns.push(( + Inst::MovFromNZCV { + rd: writable_xreg(27), + }, + "1B423BD5", + "mrs x27, nzcv", + )); + insns.push(( + Inst::CondSet { + rd: writable_xreg(5), + cond: Cond::Hi, + }, + "E5979F9A", + "cset x5, hi", + )); + insns.push(( + Inst::VecRRR { + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + alu_op: VecALUOp::UQAddScalar, + }, + "D50EF77E", + "uqadd d21, d22, d23", + )); + insns.push(( + Inst::VecRRR { + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + alu_op: VecALUOp::SQAddScalar, + }, + "D50EF75E", + "sqadd d21, d22, d23", + )); + insns.push(( + Inst::VecRRR { + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + alu_op: VecALUOp::UQSubScalar, + }, + "D52EF77E", + "uqsub d21, d22, d23", + )); + insns.push(( + Inst::VecRRR { + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + alu_op: VecALUOp::SQSubScalar, + }, + "D52EF75E", + "sqsub d21, d22, d23", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 8, + to_bits: 32, + }, + "411C0053", + "uxtb w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 8, + to_bits: 32, + }, + "411C0013", + "sxtb w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 16, + to_bits: 32, + }, + "413C0053", + "uxth w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 16, + to_bits: 32, + }, + "413C0013", + "sxth w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 8, + to_bits: 64, + }, + "411C0053", + "uxtb x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 8, + to_bits: 64, + }, + "411C4093", + "sxtb x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 16, + to_bits: 64, + }, + "413C0053", + "uxth x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 16, + to_bits: 64, + }, + "413C4093", + "sxth x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 32, + to_bits: 64, + }, + "E103022A", + "mov w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 32, + to_bits: 64, + }, + "417C4093", + "sxtw x1, w2", + )); + + insns.push(( + Inst::Jump { + dest: BranchTarget::ResolvedOffset(64), + }, + "10000014", + "b 64", + )); + + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Zero(xreg(8)), + }, + "080200B4", + "cbz x8, 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::NotZero(xreg(8)), + }, + "080200B5", + "cbnz x8, 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Eq), + }, + "00020054", + "b.eq 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Ne), + }, + "01020054", + 
"b.ne 64", + )); + + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Hs), + }, + "02020054", + "b.hs 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Lo), + }, + "03020054", + "b.lo 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Mi), + }, + "04020054", + "b.mi 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Pl), + }, + "05020054", + "b.pl 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Vs), + }, + "06020054", + "b.vs 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Vc), + }, + "07020054", + "b.vc 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Hi), + }, + "08020054", + "b.hi 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Ls), + }, + "09020054", + "b.ls 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Ge), + }, + "0A020054", + "b.ge 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Lt), + }, + "0B020054", + "b.lt 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Gt), + }, + "0C020054", + "b.gt 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Le), + }, + "0D020054", + "b.le 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Al), + }, + "0E020054", + "b.al 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Nv), + }, + "0F020054", + "b.nv 64", + )); + + insns.push(( + Inst::CondBrLoweredCompound { + taken: BranchTarget::ResolvedOffset(64), + not_taken: BranchTarget::ResolvedOffset(128), + kind: CondBrKind::Cond(Cond::Le), + }, + "0D02005420000014", + "b.le 64 ; b 128", + )); + + insns.push(( + Inst::Call { + dest: ExternalName::testcase("test0"), + uses: Set::empty(), + defs: Set::empty(), + loc: SourceLoc::default(), + opcode: Opcode::Call, + }, + "00000094", + "bl 0", + )); + + insns.push(( + Inst::CallInd { + rn: xreg(10), + uses: Set::empty(), + defs: Set::empty(), + loc: SourceLoc::default(), + opcode: Opcode::CallIndirect, + }, + "40013FD6", + "blr x10", + )); + + insns.push(( + Inst::IndirectBr { + rn: xreg(3), + targets: vec![1, 2, 3], + }, + "60001FD6", + "br x3", + )); + + insns.push((Inst::Brk, "000020D4", "brk #0")); + + insns.push(( + Inst::Adr { + rd: writable_xreg(15), + label: MemLabel::PCRel((1 << 20) - 4), + }, + "EFFF7F10", + "adr x15, pc+1048572", + )); + + insns.push(( + Inst::FpuMove64 { + rd: writable_vreg(8), + rn: vreg(4), + }, + "881CA40E", + "mov v8.8b, v4.8b", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Abs32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3201E", + "fabs s15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Abs64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3601E", + "fabs 
d15, d30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Neg32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CF43211E", + "fneg s15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Neg64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CF43611E", + "fneg d15, d30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Sqrt32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3211E", + "fsqrt s15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Sqrt64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3611E", + "fsqrt d15, d30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Cvt32To64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3221E", + "fcvt d15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Cvt64To32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CF43621E", + "fcvt s15, d30", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Add32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF2B3F1E", + "fadd s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Add64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF2B7F1E", + "fadd d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sub32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF3B3F1E", + "fsub s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sub64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF3B7F1E", + "fsub d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Mul32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF0B3F1E", + "fmul s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Mul64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF0B7F1E", + "fmul d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Div32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF1B3F1E", + "fdiv s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Div64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF1B7F1E", + "fdiv d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Max32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF4B3F1E", + "fmax s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Max64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF4B7F1E", + "fmax d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Min32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF5B3F1E", + "fmin s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Min64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF5B7F1E", + "fmin d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MAdd32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + ra: vreg(1), + }, + "CF071F1F", + "fmadd s15, s30, s31, s1", + )); + + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MAdd64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + ra: vreg(1), + }, + "CF075F1F", + "fmadd d15, d30, d31, d1", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToU32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100391E", + "fcvtzu w1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToU64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100399E", + "fcvtzu x1, s4", + )); + + insns.push(( + 
Inst::FpuToInt { + op: FpuToIntOp::F32ToI32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100381E", + "fcvtzs w1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToI64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100389E", + "fcvtzs x1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToU32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100791E", + "fcvtzu w1, d4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToU64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100799E", + "fcvtzu x1, d4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToI32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100781E", + "fcvtzs w1, d4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToI64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100789E", + "fcvtzs x1, d4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U32ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100231E", + "ucvtf s1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I32ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100221E", + "scvtf s1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U32ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100631E", + "ucvtf d1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I32ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100621E", + "scvtf d1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U64ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100239E", + "ucvtf s1, x4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I64ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100229E", + "scvtf s1, x4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U64ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100639E", + "ucvtf d1, x4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I64ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100629E", + "scvtf d1, x4", + )); + + insns.push(( + Inst::FpuCmp32 { + rn: vreg(23), + rm: vreg(24), + }, + "E022381E", + "fcmp s23, s24", + )); + + insns.push(( + Inst::FpuCmp64 { + rn: vreg(23), + rm: vreg(24), + }, + "E022781E", + "fcmp d23, d24", + )); + + insns.push(( + Inst::FpuLoad32 { + rd: writable_vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), F32), + srcloc: None, + }, + "107969BC", + "ldr s16, [x8, x9, LSL #2]", + )); + + insns.push(( + Inst::FpuLoad64 { + rd: writable_vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), F64), + srcloc: None, + }, + "107969FC", + "ldr d16, [x8, x9, LSL #3]", + )); + + insns.push(( + Inst::FpuLoad128 { + rd: writable_vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), I128), + srcloc: None, + }, + "1079E93C", + "ldr q16, [x8, x9, LSL #4]", + )); + + insns.push(( + Inst::FpuLoad32 { + rd: writable_vreg(16), + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, + }, + "5000001C", + "ldr s16, pc+8", + )); + + insns.push(( + Inst::FpuLoad64 { + rd: writable_vreg(16), + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, + }, + "5000005C", + "ldr d16, pc+8", + )); + + insns.push(( + Inst::FpuLoad128 { + rd: writable_vreg(16), + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, + }, + "5000009C", + "ldr q16, pc+8", + )); + + insns.push(( + Inst::FpuStore32 { + rd: vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), F32), + srcloc: None, + }, + "107929BC", + "str s16, [x8, x9, LSL #2]", + )); + + insns.push(( + Inst::FpuStore64 { + rd: vreg(16), + mem: 
MemArg::RegScaled(xreg(8), xreg(9), F64), + srcloc: None, + }, + "107929FC", + "str d16, [x8, x9, LSL #3]", + )); + + insns.push(( + Inst::FpuStore128 { + rd: vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), I128), + srcloc: None, + }, + "1079A93C", + "str q16, [x8, x9, LSL #4]", + )); + + insns.push(( + Inst::LoadFpuConst32 { + rd: writable_vreg(16), + const_data: 1.0, + }, + "5000001C020000140000803F", + "ldr s16, pc+8 ; b 8 ; data.f32 1", + )); + + insns.push(( + Inst::LoadFpuConst64 { + rd: writable_vreg(16), + const_data: 1.0, + }, + "5000005C03000014000000000000F03F", + "ldr d16, pc+8 ; b 12 ; data.f64 1", + )); + + insns.push(( + Inst::FpuCSel32 { + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(3), + cond: Cond::Hi, + }, + "418C231E", + "fcsel s1, s2, s3, hi", + )); + + insns.push(( + Inst::FpuCSel64 { + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(3), + cond: Cond::Eq, + }, + "410C631E", + "fcsel d1, d2, d3, eq", + )); + + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Minus32, + }, + "1743251E", + "frintm s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Minus64, + }, + "1743651E", + "frintm d23, d24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Plus32, + }, + "17C3241E", + "frintp s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Plus64, + }, + "17C3641E", + "frintp d23, d24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Zero32, + }, + "17C3251E", + "frintz s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Zero64, + }, + "17C3651E", + "frintz d23, d24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Nearest32, + }, + "1743241E", + "frintn s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Nearest64, + }, + "1743641E", + "frintn d23, d24", + )); + + let rru = create_reg_universe(); + for (insn, expected_encoding, expected_printing) in insns { + println!( + "ARM64: {:?}, {}, {}", + insn, expected_encoding, expected_printing + ); + + // Check the printed text is as expected. + let actual_printing = insn.show_rru(Some(&rru)); + assert_eq!(expected_printing, actual_printing); + + // Check the encoding is as expected. 
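+        // Note: emission happens in two passes here. The instruction is first
+        // emitted into a `MachSectionSize` to compute the byte length needed,
+        // then emitted again into a real section of that size, whose bytes are
+        // compared against the expected hex string.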
+ let text_size = { + let mut code_sec = MachSectionSize::new(0); + insn.emit(&mut code_sec); + code_sec.size() + }; + + let mut sink = test_utils::TestCodeSink::new(); + let mut sections = MachSections::new(); + let code_idx = sections.add_section(0, text_size); + let code_sec = sections.get_section(code_idx); + insn.emit(code_sec); + sections.emit(&mut sink); + let actual_encoding = &sink.stringify(); + assert_eq!(expected_encoding, actual_encoding); + } + } + + #[test] + fn test_cond_invert() { + for cond in vec![ + Cond::Eq, + Cond::Ne, + Cond::Hs, + Cond::Lo, + Cond::Mi, + Cond::Pl, + Cond::Vs, + Cond::Vc, + Cond::Hi, + Cond::Ls, + Cond::Ge, + Cond::Lt, + Cond::Gt, + Cond::Le, + Cond::Al, + Cond::Nv, + ] + .into_iter() + { + assert_eq!(cond.invert().invert(), cond); + } + } +} diff --git a/cranelift/codegen/src/isa/arm64/inst/imms.rs b/cranelift/codegen/src/isa/arm64/inst/imms.rs new file mode 100644 index 000000000000..eda68af7b12a --- /dev/null +++ b/cranelift/codegen/src/isa/arm64/inst/imms.rs @@ -0,0 +1,753 @@ +//! ARM64 ISA definitions: immediate constants. + +#![allow(dead_code)] +#![allow(non_snake_case)] + +use crate::ir::types::*; +use crate::ir::Type; +use crate::machinst::*; + +use regalloc::RealRegUniverse; + +use core::convert::TryFrom; +use std::string::String; + +/// A signed, scaled 7-bit offset. +#[derive(Clone, Copy, Debug)] +pub struct SImm7Scaled { + /// The value. + pub value: i16, + /// multiplied by the size of this type + pub scale_ty: Type, +} + +impl SImm7Scaled { + /// Create a SImm7Scaled from a raw offset and the known scale type, if + /// possible. + pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option { + assert!(scale_ty == I64 || scale_ty == I32); + let scale = scale_ty.bytes(); + assert!(scale.is_power_of_two()); + let scale = scale as i64; + let upper_limit = 63 * scale; + let lower_limit = -(64 * scale); + if value >= lower_limit && value <= upper_limit && (value & (scale - 1)) == 0 { + Some(SImm7Scaled { + value: value as i16, + scale_ty, + }) + } else { + None + } + } + + /// Create a zero immediate of this format. + pub fn zero(scale_ty: Type) -> SImm7Scaled { + SImm7Scaled { value: 0, scale_ty } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + ((self.value / self.scale_ty.bytes() as i16) as u32) & 0x7f + } +} + +/// a 9-bit signed offset. +#[derive(Clone, Copy, Debug)] +pub struct SImm9 { + /// The value. + pub value: i16, +} + +impl SImm9 { + /// Create a signed 9-bit offset from a full-range value, if possible. + pub fn maybe_from_i64(value: i64) -> Option { + if value >= -256 && value <= 255 { + Some(SImm9 { + value: value as i16, + }) + } else { + None + } + } + + /// Create a zero immediate of this format. + pub fn zero() -> SImm9 { + SImm9 { value: 0 } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + (self.value as u32) & 0x1ff + } +} + +/// An unsigned, scaled 12-bit offset. +#[derive(Clone, Copy, Debug)] +pub struct UImm12Scaled { + /// The value. + pub value: u16, + /// multiplied by the size of this type + pub scale_ty: Type, +} + +impl UImm12Scaled { + /// Create a UImm12Scaled from a raw offset and the known scale type, if + /// possible. 
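+    /// For example (illustrative):
+    ///
+    /// ```ignore
+    /// assert!(UImm12Scaled::maybe_from_i64(32760, I64).is_some()); // 4095 * 8, the maximum
+    /// assert!(UImm12Scaled::maybe_from_i64(32768, I64).is_none()); // exceeds 4095 * 8
+    /// assert!(UImm12Scaled::maybe_from_i64(12, I64).is_none());    // not 8-byte aligned
+    /// ```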
+ pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option { + let scale = scale_ty.bytes(); + assert!(scale.is_power_of_two()); + let scale = scale as i64; + let limit = 4095 * scale; + if value >= 0 && value <= limit && (value & (scale - 1)) == 0 { + Some(UImm12Scaled { + value: value as u16, + scale_ty, + }) + } else { + None + } + } + + /// Create a zero immediate of this format. + pub fn zero(scale_ty: Type) -> UImm12Scaled { + UImm12Scaled { value: 0, scale_ty } + } + + /// Encoded bits. + pub fn bits(&self) -> u32 { + (self.value as u32 / self.scale_ty.bytes()) & 0xfff + } +} + +/// A shifted immediate value in 'imm12' format: supports 12 bits, shifted +/// left by 0 or 12 places. +#[derive(Clone, Debug)] +pub struct Imm12 { + /// The immediate bits. + pub bits: usize, + /// Whether the immediate bits are shifted left by 12 or not. + pub shift12: bool, +} + +impl Imm12 { + /// Compute a Imm12 from raw bits, if possible. + pub fn maybe_from_u64(val: u64) -> Option { + if val == 0 { + Some(Imm12 { + bits: 0, + shift12: false, + }) + } else if val < 0xfff { + Some(Imm12 { + bits: val as usize, + shift12: false, + }) + } else if val < 0xfff_000 && (val & 0xfff == 0) { + Some(Imm12 { + bits: (val as usize) >> 12, + shift12: true, + }) + } else { + None + } + } + + /// Bits for 2-bit "shift" field in e.g. AddI. + pub fn shift_bits(&self) -> u8 { + if self.shift12 { + 0b01 + } else { + 0b00 + } + } + + /// Bits for 12-bit "imm" field in e.g. AddI. + pub fn imm_bits(&self) -> u16 { + self.bits as u16 + } +} + +/// An immediate for logical instructions. +#[derive(Clone, Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub struct ImmLogic { + /// The actual value. + value: u64, + /// `N` flag. + pub N: bool, + /// `S` field: element size and element bits. + pub R: u8, + /// `R` field: rotate amount. + pub S: u8, +} + +impl ImmLogic { + /// Compute an ImmLogic from raw bits, if possible. + pub fn maybe_from_u64(value: u64, ty: Type) -> Option { + // Note: This function is a port of VIXL's Assembler::IsImmLogical. + + if ty != I64 && ty != I32 { + return None; + } + + let original_value = value; + + let value = if ty == I32 { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // Avoid making the assumption that the most-significant 32 bits are zero by + // shifting the value left and duplicating it. + let value = value << 32; + value | value >> 32 + } else { + value + }; + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 bits + // are set. The pattern is rotated right by R, and repeated across a 32 or + // 64-bit value, depending on destination register width. + // + // Put another way: the basic format of a logical immediate is a single + // contiguous stretch of 1 bits, repeated across the whole word at intervals + // given by a power of 2. 
To identify them quickly, we first locate the + // lowest stretch of 1 bits, then the next 1 bit above that; that combination + // is different for every logical immediate, so it gives us all the + // information we need to identify the only logical immediate that our input + // could be, and then we simply check if that's the value we actually have. + // + // (The rotation parameter does give the possibility of the stretch of 1 bits + // going 'round the end' of the word. To deal with that, we observe that in + // any situation where that happens the bitwise NOT of the value is also a + // valid logical immediate. So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) + let (value, inverted) = if value & 1 == 1 { + (!value, true) + } else { + (value, false) + }; + + if value == 0 { + return None; + } + + // The basic analysis idea: imagine our input word looks like this. + // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + fn lowest_set_bit(value: u64) -> u64 { + let bit = value.trailing_zeros(); + 1u64.checked_shl(bit).unwrap_or(0) + } + let a = lowest_set_bit(value); + assert_ne!(0, a); + let value_plus_a = value.wrapping_add(a); + let b = lowest_set_bit(value_plus_a); + let value_plus_a_minus_b = value_plus_a - b; + let c = lowest_set_bit(value_plus_a_minus_b); + + let (d, clz_a, out_n, mask) = if c != 0 { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + let clz_a = a.leading_zeros(); + let clz_c = c.leading_zeros(); + let d = clz_a - clz_c; + let mask = (1 << d) - 1; + (d, clz_a, 0, mask) + } else { + (64, a.leading_zeros(), 1, u64::max_value()) + }; + + // If the repeat period d is not a power of two, it can't be encoded. + if !d.is_power_of_two() { + return None; + } + + if ((b.wrapping_sub(a)) & !mask) != 0 { + // If the bit stretch (b - a) does not fit within the mask derived from the + // repeat period, then fail. + return None; + } + + // The only possible option is b - a repeated every d bits. Now we're going to + // actually construct the valid logical immediate derived from that + // specification, and see if it equals our original input. + // + // To repeat a value every d bits, we multiply it by a number of the form + // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can + // be derived using a table lookup on CLZ(d). 
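+        // (Worked example, illustrative: for the input 0x3c3c_3c3c_3c3c_3c3c
+        // the steps above give a = 0x4, b = 0x40 and d = 8, so the multiplier
+        // chosen below is MULTIPLIERS[3] = 0x0101_0101_0101_0101 and the
+        // candidate is (b - a) * multiplier = 0x3c * 0x0101_0101_0101_0101
+        // = 0x3c3c_3c3c_3c3c_3c3c, which matches the input, so the value is
+        // encodable.)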
+ const MULTIPLIERS: [u64; 6] = [ + 0x0000000000000001, + 0x0000000100000001, + 0x0001000100010001, + 0x0101010101010101, + 0x1111111111111111, + 0x5555555555555555, + ]; + let multiplier = MULTIPLIERS[(u64::from(d).leading_zeros() - 57) as usize]; + let candidate = b.wrapping_sub(a) * multiplier; + + if value != candidate { + // The candidate pattern doesn't match our input value, so fail. + return None; + } + + // We have a match! This is a valid logical immediate, so now we have to + // construct the bits and pieces of the instruction encoding that generates + // it. + + // Count the set bits in our basic stretch. The special case of clz(0) == -1 + // makes the answer come out right for stretches that reach the very top of + // the word (e.g. numbers like 0xffffc00000000000). + let clz_b = if b == 0 { + u32::max_value() // -1 + } else { + b.leading_zeros() + }; + let s = clz_a.wrapping_sub(clz_b); + + // Decide how many bits to rotate right by, to put the low bit of that basic + // stretch in position a. + let (s, r) = if inverted { + // If we inverted the input right at the start of this function, here's + // where we compensate: the number of set bits becomes the number of clear + // bits, and the rotation count is based on position b rather than position + // a (since b is the location of the 'lowest' 1 bit after inversion). + // Need wrapping for when clz_b is max_value() (for when b == 0). + (d - s, clz_b.wrapping_add(1) & (d - 1)) + } else { + (s, (clz_a + 1) & (d - 1)) + }; + + // Now we're done, except for having to encode the S output in such a way that + // it gives both the number of set bits and the length of the repeated + // segment. The s field is encoded like this: + // + // imms size S + // ssssss 64 UInt(ssssss) + // 0sssss 32 UInt(sssss) + // 10ssss 16 UInt(ssss) + // 110sss 8 UInt(sss) + // 1110ss 4 UInt(ss) + // 11110s 2 UInt(s) + // + // So we 'or' (2 * -d) with our computed s to form imms. + let s = ((d * 2).wrapping_neg() | (s - 1)) & 0x3f; + debug_assert!(u8::try_from(r).is_ok()); + debug_assert!(u8::try_from(s).is_ok()); + Some(ImmLogic { + value: original_value, + N: out_n != 0, + R: r as u8, + S: s as u8, + }) + } + + pub fn from_raw(value: u64, n: bool, r: u8, s: u8) -> ImmLogic { + ImmLogic { + N: n, + R: r, + S: s, + value, + } + } + + /// Returns bits ready for encoding: (N:1, R:6, S:6) + pub fn enc_bits(&self) -> u16 { + ((self.N as u16) << 12) | ((self.R as u16) << 6) | (self.S as u16) + } + + /// Returns the value that this immediate represents. + pub fn value(&self) -> u64 { + self.value + } + + /// Return an immediate for the bitwise-inverted value. + pub fn invert(&self) -> ImmLogic { + // For every ImmLogical immediate, the inverse can also be encoded. + Self::maybe_from_u64(!self.value, I64).unwrap() + } +} + +/// An immediate for shift instructions. +#[derive(Clone, Debug)] +pub struct ImmShift { + /// 6-bit shift amount. + pub imm: u8, +} + +impl ImmShift { + /// Create an ImmShift from raw bits, if possible. + pub fn maybe_from_u64(val: u64) -> Option { + if val < 64 { + Some(ImmShift { imm: val as u8 }) + } else { + None + } + } + + /// Get the immediate value. + pub fn value(&self) -> u8 { + self.imm + } +} + +/// A 16-bit immediate for a MOVZ instruction, with a {0,16,32,48}-bit shift. +#[derive(Clone, Copy, Debug)] +pub struct MoveWideConst { + /// The value. + pub bits: u16, + /// shifted 16*shift bits to the left. + pub shift: u8, +} + +impl MoveWideConst { + /// Construct a MoveWideConst from an arbitrary 64-bit constant if possible. 
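+    /// For example (illustrative): `0xffff` yields `bits: 0xffff, shift: 0`;
+    /// `0xffff_0000` yields `bits: 0xffff, shift: 1` (printed as `LSL #16`);
+    /// a value such as `0x0001_0001`, which spans two halfwords, yields `None`.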
+ pub fn maybe_from_u64(value: u64) -> Option { + let mask0 = 0x0000_0000_0000_ffffu64; + let mask1 = 0x0000_0000_ffff_0000u64; + let mask2 = 0x0000_ffff_0000_0000u64; + let mask3 = 0xffff_0000_0000_0000u64; + + if value == (value & mask0) { + return Some(MoveWideConst { + bits: (value & mask0) as u16, + shift: 0, + }); + } + if value == (value & mask1) { + return Some(MoveWideConst { + bits: ((value >> 16) & mask0) as u16, + shift: 1, + }); + } + if value == (value & mask2) { + return Some(MoveWideConst { + bits: ((value >> 32) & mask0) as u16, + shift: 2, + }); + } + if value == (value & mask3) { + return Some(MoveWideConst { + bits: ((value >> 48) & mask0) as u16, + shift: 3, + }); + } + None + } + + pub fn maybe_with_shift(imm: u16, shift: u8) -> Option { + let shift_enc = shift / 16; + if shift_enc > 3 { + None + } else { + Some(MoveWideConst { + bits: imm, + shift: shift_enc, + }) + } + } + + /// Returns the value that this constant represents. + pub fn value(&self) -> u64 { + (self.bits as u64) << (16 * self.shift) + } +} + +impl ShowWithRRU for Imm12 { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + let shift = if self.shift12 { 12 } else { 0 }; + let value = self.bits << shift; + format!("#{}", value) + } +} + +impl ShowWithRRU for SImm7Scaled { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl ShowWithRRU for SImm9 { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl ShowWithRRU for UImm12Scaled { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl ShowWithRRU for ImmLogic { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value()) + } +} + +impl ShowWithRRU for ImmShift { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.imm) + } +} + +impl ShowWithRRU for MoveWideConst { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + if self.shift == 0 { + format!("#{}", self.bits) + } else { + format!("#{}, LSL #{}", self.bits, self.shift * 16) + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn imm_logical_test() { + assert_eq!(None, ImmLogic::maybe_from_u64(0, I64)); + assert_eq!(None, ImmLogic::maybe_from_u64(u64::max_value(), I64)); + + assert_eq!( + Some(ImmLogic { + value: 1, + N: true, + R: 0, + S: 0 + }), + ImmLogic::maybe_from_u64(1, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 2, + N: true, + R: 63, + S: 0 + }), + ImmLogic::maybe_from_u64(2, I64) + ); + + assert_eq!(None, ImmLogic::maybe_from_u64(5, I64)); + + assert_eq!(None, ImmLogic::maybe_from_u64(11, I64)); + + assert_eq!( + Some(ImmLogic { + value: 248, + N: true, + R: 61, + S: 4 + }), + ImmLogic::maybe_from_u64(248, I64) + ); + + assert_eq!(None, ImmLogic::maybe_from_u64(249, I64)); + + assert_eq!( + Some(ImmLogic { + value: 1920, + N: true, + R: 57, + S: 3 + }), + ImmLogic::maybe_from_u64(1920, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x7ffe, + N: true, + R: 63, + S: 13 + }), + ImmLogic::maybe_from_u64(0x7ffe, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x30000, + N: true, + R: 48, + S: 1 + }), + ImmLogic::maybe_from_u64(0x30000, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x100000, + N: true, + R: 44, + S: 0 + }), + ImmLogic::maybe_from_u64(0x100000, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: u64::max_value() - 1, + N: true, + R: 63, + S: 62 + }), + 
ImmLogic::maybe_from_u64(u64::max_value() - 1, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0xaaaaaaaaaaaaaaaa, + N: false, + R: 1, + S: 60 + }), + ImmLogic::maybe_from_u64(0xaaaaaaaaaaaaaaaa, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x8181818181818181, + N: false, + R: 1, + S: 49 + }), + ImmLogic::maybe_from_u64(0x8181818181818181, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0xffc3ffc3ffc3ffc3, + N: false, + R: 10, + S: 43 + }), + ImmLogic::maybe_from_u64(0xffc3ffc3ffc3ffc3, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x100000001, + N: false, + R: 0, + S: 0 + }), + ImmLogic::maybe_from_u64(0x100000001, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x1111111111111111, + N: false, + R: 0, + S: 56 + }), + ImmLogic::maybe_from_u64(0x1111111111111111, I64) + ); + + for n in 0..2 { + let types = if n == 0 { vec![I64, I32] } else { vec![I64] }; + for s in 0..64 { + for r in 0..64 { + let imm = get_logical_imm(n, s, r); + for &ty in &types { + match ImmLogic::maybe_from_u64(imm, ty) { + Some(ImmLogic { value, .. }) => { + assert_eq!(imm, value); + ImmLogic::maybe_from_u64(!value, ty).unwrap(); + } + None => assert_eq!(0, imm), + }; + } + } + } + } + } + + // Repeat a value that has `width` bits, across a 64-bit value. + fn repeat(value: u64, width: u64) -> u64 { + let mut result = value & ((1 << width) - 1); + let mut i = width; + while i < 64 { + result |= result << i; + i *= 2; + } + result + } + + // Get the logical immediate, from the encoding N/R/S bits. + fn get_logical_imm(n: u32, s: u32, r: u32) -> u64 { + // An integer is constructed from the n, imm_s and imm_r bits according to + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 + // bits are set. The pattern is rotated right by R, and repeated across a + // 64-bit value. + + if n == 1 { + if s == 0x3f { + return 0; + } + let bits = (1u64 << (s + 1)) - 1; + bits.rotate_right(r) + } else { + if (s >> 1) == 0x1f { + return 0; + } + let mut width = 0x20; + while width >= 0x2 { + if (s & width) == 0 { + let mask = width - 1; + if (s & mask) == mask { + return 0; + } + let bits = (1u64 << ((s & mask) + 1)) - 1; + return repeat(bits.rotate_right(r & mask), width.into()); + } + width >>= 1; + } + unreachable!(); + } + } +} diff --git a/cranelift/codegen/src/isa/arm64/inst/mod.rs b/cranelift/codegen/src/isa/arm64/inst/mod.rs new file mode 100644 index 000000000000..ecc948cc706a --- /dev/null +++ b/cranelift/codegen/src/isa/arm64/inst/mod.rs @@ -0,0 +1,2515 @@ +//! This module defines arm64-specific machine instruction types. 
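+//!
+//! (Overview, for orientation: the `Inst` enum below defines the machine-level
+//! instruction forms; operation kinds such as `ALUOp`, `FPUOp1`/`FPUOp2`/`FPUOp3`,
+//! `BitOp` and `VecALUOp` select the concrete operation within a form; the
+//! `regs`, `imms`, `args` and `emit` submodules are declared and re-exported here.)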
+ +#![allow(non_snake_case)] +#![allow(unused_imports)] +#![allow(non_camel_case_types)] +#![allow(dead_code)] + +use crate::binemit::CodeOffset; +use crate::ir::constant::{ConstantData, ConstantOffset}; +use crate::ir::types::{ + B1, B128, B16, B32, B64, B8, F32, F64, FFLAGS, I128, I16, I32, I64, I8, IFLAGS, +}; +use crate::ir::{ExternalName, GlobalValue, JumpTable, Opcode, SourceLoc, TrapCode, Type}; +use crate::machinst::*; + +use regalloc::Map as RegallocMap; +use regalloc::{ + RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, SpillSlot, VirtualReg, Writable, + NUM_REG_CLASSES, +}; +use regalloc::{RegUsageCollector, Set}; + +use alloc::vec::Vec; +use smallvec::{smallvec, SmallVec}; +use std::mem; +use std::string::{String, ToString}; + +pub mod regs; +pub use self::regs::*; +pub mod imms; +pub use self::imms::*; +pub mod args; +pub use self::args::*; +pub mod emit; +pub use self::emit::*; + +//============================================================================= +// Instructions (top level): definition + +/// An ALU operation. This can be paired with several instruction formats +/// below (see `Inst`) in any combination. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum ALUOp { + Add32, + Add64, + Sub32, + Sub64, + Orr32, + Orr64, + OrrNot32, + OrrNot64, + And32, + And64, + AndNot32, + AndNot64, + Eor32, + Eor64, + EorNot32, + EorNot64, + AddS32, + AddS64, + SubS32, + SubS64, + MAdd32, // multiply-add + MAdd64, + MSub32, + MSub64, + SMulH, + UMulH, + SDiv64, + UDiv64, + RotR32, + RotR64, + Lsr32, + Lsr64, + Asr32, + Asr64, + Lsl32, + Lsl64, +} + +/// A floating-point unit (FPU) operation with one arg. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FPUOp1 { + Abs32, + Abs64, + Neg32, + Neg64, + Sqrt32, + Sqrt64, + Cvt32To64, + Cvt64To32, +} + +/// A floating-point unit (FPU) operation with two args. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FPUOp2 { + Add32, + Add64, + Sub32, + Sub64, + Mul32, + Mul64, + Div32, + Div64, + Max32, + Max64, + Min32, + Min64, +} + +/// A floating-point unit (FPU) operation with three args. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FPUOp3 { + MAdd32, + MAdd64, +} + +/// A conversion from an FP to an integer value. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FpuToIntOp { + F32ToU32, + F32ToI32, + F32ToU64, + F32ToI64, + F64ToU32, + F64ToI32, + F64ToU64, + F64ToI64, +} + +/// A conversion from an integer to an FP value. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum IntToFpuOp { + U32ToF32, + I32ToF32, + U32ToF64, + I32ToF64, + U64ToF32, + I64ToF32, + U64ToF64, + I64ToF64, +} + +/// Modes for FP rounding ops: round down (floor) or up (ceil), or toward zero (trunc), or to +/// nearest, and for 32- or 64-bit FP values. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FpuRoundMode { + Minus32, + Minus64, + Plus32, + Plus64, + Zero32, + Zero64, + Nearest32, + Nearest64, +} + +/// A vector ALU operation. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecALUOp { + SQAddScalar, // signed saturating add + UQAddScalar, // unsigned saturating add + SQSubScalar, // signed saturating subtract + UQSubScalar, // unsigned saturating subtract +} + +/// An operation on the bits of a register. This can be paired with several instruction formats +/// below (see `Inst`) in any combination. 
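+/// For example (illustrative), `BitOp::Clz64` paired with `Inst::BitRR`
+/// corresponds to `clz xN, xM`, and `BitOp::RBit32` to `rbit wN, wM`.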
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum BitOp { + RBit32, + RBit64, + Clz32, + Clz64, + Cls32, + Cls64, +} + +impl BitOp { + /// Is the opcode a 32-bit operation. + pub fn is_32_bit(&self) -> bool { + match self { + BitOp::RBit32 => true, + BitOp::Clz32 => true, + BitOp::Cls32 => true, + _ => false, + } + } + + /// Get the assembly mnemonic for this opcode. + pub fn op_str(&self) -> &'static str { + match self { + BitOp::RBit32 | BitOp::RBit64 => "rbit", + BitOp::Clz32 | BitOp::Clz64 => "clz", + BitOp::Cls32 | BitOp::Cls64 => "cls", + } + } +} + +impl From<(Opcode, Type)> for BitOp { + /// Get the BitOp from the IR opcode. + fn from(op_ty: (Opcode, Type)) -> BitOp { + match op_ty { + (Opcode::Bitrev, I32) => BitOp::RBit32, + (Opcode::Bitrev, I64) => BitOp::RBit64, + (Opcode::Clz, I32) => BitOp::Clz32, + (Opcode::Clz, I64) => BitOp::Clz64, + (Opcode::Cls, I32) => BitOp::Cls32, + (Opcode::Cls, I64) => BitOp::Cls64, + _ => unreachable!("Called with non-bit op!"), + } + } +} + +/// Instruction formats. +#[derive(Clone, Debug)] +pub enum Inst { + /// A no-op of zero size. + Nop, + + /// A no-op that is one instruction large. + Nop4, + + /// An ALU operation with two register sources and a register destination. + AluRRR { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + rm: Reg, + }, + /// An ALU operation with three register sources and a register destination. + AluRRRR { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + rm: Reg, + ra: Reg, + }, + /// An ALU operation with a register source and an immediate-12 source, and a register + /// destination. + AluRRImm12 { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + imm12: Imm12, + }, + /// An ALU operation with a register source and an immediate-logic source, and a register destination. + AluRRImmLogic { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + imml: ImmLogic, + }, + /// An ALU operation with a register source and an immediate-shiftamt source, and a register destination. + AluRRImmShift { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + immshift: ImmShift, + }, + /// An ALU operation with two register sources, one of which can be shifted, and a register + /// destination. + AluRRRShift { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + rm: Reg, + shiftop: ShiftOpAndAmt, + }, + /// An ALU operation with two register sources, one of which can be {zero,sign}-extended and + /// shifted, and a register destination. + AluRRRExtend { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + rm: Reg, + extendop: ExtendOp, + }, + + /// A bit op instruction with a single register source. + BitRR { + op: BitOp, + rd: Writable, + rn: Reg, + }, + + /// An unsigned (zero-extending) 8-bit load. + ULoad8 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// A signed (sign-extending) 8-bit load. + SLoad8 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// An unsigned (zero-extending) 16-bit load. + ULoad16 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// A signed (sign-extending) 16-bit load. + SLoad16 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// An unsigned (zero-extending) 32-bit load. + ULoad32 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// A signed (sign-extending) 32-bit load. + SLoad32 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// A 64-bit load. + ULoad64 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + + /// An 8-bit store. + Store8 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + /// A 16-bit store. 
+ Store16 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + /// A 32-bit store. + Store32 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + /// A 64-bit store. + Store64 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + + /// A store of a pair of registers. + StoreP64 { + rt: Reg, + rt2: Reg, + mem: PairMemArg, + }, + /// A load of a pair of registers. + LoadP64 { + rt: Writable, + rt2: Writable, + mem: PairMemArg, + }, + + /// A MOV instruction. These are encoded as ORR's (AluRRR form) but we + /// keep them separate at the `Inst` level for better pretty-printing + /// and faster `is_move()` logic. + Mov { + rd: Writable, + rm: Reg, + }, + + /// A 32-bit MOV. Zeroes the top 32 bits of the destination. This is + /// effectively an alias for an unsigned 32-to-64-bit extension. + Mov32 { + rd: Writable, + rm: Reg, + }, + + /// A MOVZ with a 16-bit immediate. + MovZ { + rd: Writable, + imm: MoveWideConst, + }, + + /// A MOVN with a 16-bit immediate. + MovN { + rd: Writable, + imm: MoveWideConst, + }, + + /// A MOVK with a 16-bit immediate. + MovK { + rd: Writable, + imm: MoveWideConst, + }, + + /// A sign- or zero-extend operation. + Extend { + rd: Writable, + rn: Reg, + signed: bool, + from_bits: u8, + to_bits: u8, + }, + + /// A conditional-select operation. + CSel { + rd: Writable, + cond: Cond, + rn: Reg, + rm: Reg, + }, + + /// A conditional-set operation. + CSet { + rd: Writable, + cond: Cond, + }, + + /// FPU move. Note that this is distinct from a vector-register + /// move; moving just 64 bits seems to be significantly faster. + FpuMove64 { + rd: Writable, + rn: Reg, + }, + + /// 1-op FPU instruction. + FpuRR { + fpu_op: FPUOp1, + rd: Writable, + rn: Reg, + }, + + /// 2-op FPU instruction. + FpuRRR { + fpu_op: FPUOp2, + rd: Writable, + rn: Reg, + rm: Reg, + }, + + /// 3-op FPU instruction. + FpuRRRR { + fpu_op: FPUOp3, + rd: Writable, + rn: Reg, + rm: Reg, + ra: Reg, + }, + + /// FPU comparison, single-precision (32 bit). + FpuCmp32 { + rn: Reg, + rm: Reg, + }, + + /// FPU comparison, double-precision (64 bit). + FpuCmp64 { + rn: Reg, + rm: Reg, + }, + + /// Floating-point loads and stores. + FpuLoad32 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + FpuStore32 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + FpuLoad64 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + FpuStore64 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + FpuLoad128 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + FpuStore128 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + + LoadFpuConst32 { + rd: Writable, + const_data: f32, + }, + + LoadFpuConst64 { + rd: Writable, + const_data: f64, + }, + + /// Conversions between FP and integer values. + FpuToInt { + op: FpuToIntOp, + rd: Writable, + rn: Reg, + }, + + IntToFpu { + op: IntToFpuOp, + rd: Writable, + rn: Reg, + }, + + // FP conditional select. + FpuCSel32 { + rd: Writable, + rn: Reg, + rm: Reg, + cond: Cond, + }, + FpuCSel64 { + rd: Writable, + rn: Reg, + rm: Reg, + cond: Cond, + }, + + // Round to integer. + FpuRound { + op: FpuRoundMode, + rd: Writable, + rn: Reg, + }, + + /// Move to a vector register from a GPR. + MovToVec64 { + rd: Writable, + rn: Reg, + }, + + /// Move to a GPR from a vector register. + MovFromVec64 { + rd: Writable, + rn: Reg, + }, + + /// A vector ALU op. + VecRRR { + alu_op: VecALUOp, + rd: Writable, + rn: Reg, + rm: Reg, + }, + + /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn). + MovToNZCV { + rn: Reg, + }, + + /// Move from the NZCV flags (actually a `MRS Xn, NZCV` insn). 
+ MovFromNZCV { + rd: Writable, + }, + + /// Set a register to 1 if condition, else 0. + CondSet { + rd: Writable, + cond: Cond, + }, + + /// A machine call instruction. + Call { + dest: ExternalName, + uses: Set, + defs: Set>, + loc: SourceLoc, + opcode: Opcode, + }, + /// A machine indirect-call instruction. + CallInd { + rn: Reg, + uses: Set, + defs: Set>, + loc: SourceLoc, + opcode: Opcode, + }, + + // ---- branches (exactly one must appear at end of BB) ---- + /// A machine return instruction. + Ret {}, + + /// A placeholder instruction, generating no code, meaning that a function epilogue must be + /// inserted there. + EpiloguePlaceholder {}, + + /// An unconditional branch. + Jump { + dest: BranchTarget, + }, + + /// A conditional branch. + CondBr { + taken: BranchTarget, + not_taken: BranchTarget, + kind: CondBrKind, + }, + + /// Lowered conditional branch: contains the original branch kind (or the + /// inverse), but only one BranchTarget is retained. The other is + /// implicitly the next instruction, given the final basic-block layout. + CondBrLowered { + target: BranchTarget, + kind: CondBrKind, + }, + + /// As for `CondBrLowered`, but represents a condbr/uncond-br sequence (two + /// actual machine instructions). Needed when the final block layout implies + /// that neither arm of a conditional branch targets the fallthrough block. + CondBrLoweredCompound { + taken: BranchTarget, + not_taken: BranchTarget, + kind: CondBrKind, + }, + + /// An indirect branch through a register, augmented with set of all + /// possible successors. + IndirectBr { + rn: Reg, + targets: Vec, + }, + + /// A "break" instruction, used for e.g. traps and debug breakpoints. + Brk, + + /// An instruction guaranteed to always be undefined and to trigger an illegal instruction at + /// runtime. + Udf { + trap_info: (SourceLoc, TrapCode), + }, + + /// Load the address (using a PC-relative offset) of a MemLabel, using the + /// `ADR` instruction. + Adr { + rd: Writable, + label: MemLabel, + }, + + /// Raw 32-bit word, used for inline constants and jump-table entries. + Word4 { + data: u32, + }, + + /// Raw 64-bit word, used for inline constants. + Word8 { + data: u64, + }, + + /// Jump-table sequence, as one compound instruction (see note in lower.rs + /// for rationale). + JTSequence { + targets: Vec, + targets_for_term: Vec, // needed for MachTerminator. + ridx: Reg, + rtmp1: Writable, + rtmp2: Writable, + }, + + /// Load an inline constant. + LoadConst64 { + rd: Writable, + const_data: u64, + }, + + /// Load an inline symbol reference. + LoadExtName { + rd: Writable, + name: ExternalName, + srcloc: SourceLoc, + offset: i64, + }, +} + +fn count_clear_half_words(mut value: u64) -> usize { + let mut count = 0; + for _ in 0..4 { + if value & 0xffff == 0 { + count += 1; + } + value >>= 16; + } + + count +} + +impl Inst { + /// Create a move instruction. + pub fn mov(to_reg: Writable, from_reg: Reg) -> Inst { + assert!(to_reg.to_reg().get_class() == from_reg.get_class()); + if from_reg.get_class() == RegClass::I64 { + Inst::Mov { + rd: to_reg, + rm: from_reg, + } + } else { + Inst::FpuMove64 { + rd: to_reg, + rn: from_reg, + } + } + } + + /// Create a 32-bit move instruction. + pub fn mov32(to_reg: Writable, from_reg: Reg) -> Inst { + Inst::Mov32 { + rd: to_reg, + rm: from_reg, + } + } + + /// Create an instruction that loads a constant, using one of serveral options (MOVZ, MOVN, + /// logical immediate, or constant pool). 
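A small sketch of the helper above (illustrative only): `count_clear_half_words` counts how many of a value's four 16-bit half-words are zero, and `load_constant` below compares this count for `value` and `!value` to decide whether the emitted sequence should start with MOVZ or MOVN.

fn count_clear_half_words_example() {
    assert_eq!(count_clear_half_words(0), 4);
    assert_eq!(count_clear_half_words(0x0000_ffff_0000_0000), 3);
    assert_eq!(count_clear_half_words(0xffff_ffff_ffff_ffff), 0);
    assert_eq!(count_clear_half_words(0x0000_0000_dead_beef), 2);
}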
+ pub fn load_constant(rd: Writable, value: u64) -> SmallVec<[Inst; 4]> { + if let Some(imm) = MoveWideConst::maybe_from_u64(value) { + // 16-bit immediate (shifted by 0, 16, 32 or 48 bits) in MOVZ + smallvec![Inst::MovZ { rd, imm }] + } else if let Some(imm) = MoveWideConst::maybe_from_u64(!value) { + // 16-bit immediate (shifted by 0, 16, 32 or 48 bits) in MOVN + smallvec![Inst::MovN { rd, imm }] + } else if let Some(imml) = ImmLogic::maybe_from_u64(value, I64) { + // Weird logical-instruction immediate in ORI using zero register + smallvec![Inst::AluRRImmLogic { + alu_op: ALUOp::Orr64, + rd, + rn: zero_reg(), + imml, + }] + } else { + let mut insts = smallvec![]; + + // If the number of 0xffff half words is greater than the number of 0x0000 half words + // it is more efficient to use `movn` for the first instruction. + let first_is_inverted = count_clear_half_words(!value) > count_clear_half_words(value); + // Either 0xffff or 0x0000 half words can be skipped, depending on the first + // instruction used. + let ignored_halfword = if first_is_inverted { 0xffff } else { 0 }; + let mut first_mov_emitted = false; + + for i in 0..4 { + let imm16 = (value >> (16 * i)) & 0xffff; + if imm16 != ignored_halfword { + if !first_mov_emitted { + first_mov_emitted = true; + if first_is_inverted { + let imm = + MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16) + .unwrap(); + insts.push(Inst::MovN { rd, imm }); + } else { + let imm = + MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap(); + insts.push(Inst::MovZ { rd, imm }); + } + } else { + let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap(); + insts.push(Inst::MovK { rd, imm }); + } + } + } + + assert!(first_mov_emitted); + + insts + } + } + + /// Create an instruction that loads a 32-bit floating-point constant. + pub fn load_fp_constant32(rd: Writable, value: f32) -> Inst { + // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits. + Inst::LoadFpuConst32 { + rd, + const_data: value, + } + } + + /// Create an instruction that loads a 64-bit floating-point constant. + pub fn load_fp_constant64(rd: Writable, value: f64) -> Inst { + // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits. + Inst::LoadFpuConst64 { + rd, + const_data: value, + } + } +} + +//============================================================================= +// Instructions: get_regs + +fn memarg_regs(memarg: &MemArg, collector: &mut RegUsageCollector) { + match memarg { + &MemArg::Unscaled(reg, ..) | &MemArg::UnsignedOffset(reg, ..) => { + collector.add_use(reg); + } + &MemArg::RegReg(r1, r2, ..) + | &MemArg::RegScaled(r1, r2, ..) + | &MemArg::RegScaledExtended(r1, r2, ..) => { + collector.add_use(r1); + collector.add_use(r2); + } + &MemArg::Label(..) => {} + &MemArg::PreIndexed(reg, ..) | &MemArg::PostIndexed(reg, ..) => { + collector.add_mod(reg); + } + &MemArg::FPOffset(..) => { + collector.add_use(fp_reg()); + } + &MemArg::SPOffset(..) => { + collector.add_use(stack_reg()); + } + } +} + +fn pairmemarg_regs(pairmemarg: &PairMemArg, collector: &mut RegUsageCollector) { + match pairmemarg { + &PairMemArg::SignedOffset(reg, ..) => { + collector.add_use(reg); + } + &PairMemArg::PreIndexed(reg, ..) | &PairMemArg::PostIndexed(reg, ..) => { + collector.add_mod(reg); + } + } +} + +fn arm64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { + match inst { + &Inst::AluRRR { rd, rn, rm, .. 
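To make the selection ladder concrete, here is an illustrative sketch of the sequences `load_constant` produces (register number 10 is an arbitrary choice for the example):

fn load_constant_example() {
    let rd = writable_xreg(10);
    // A 16-bit immediate shifted by 16 bits: a single MOVZ.
    assert_eq!(Inst::load_constant(rd, 0x0001_0000).len(), 1);
    // The bitwise NOT is a shiftable 16-bit immediate: a single MOVN.
    assert_eq!(Inst::load_constant(rd, 0xffff_ffff_ffff_5555).len(), 1);
    // No zero or 0xffff half-words and not a logical immediate:
    // one MOVZ followed by three MOVKs.
    assert_eq!(Inst::load_constant(rd, 0x1234_5678_9abc_def0).len(), 4);
}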
} => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::AluRRRR { rd, rn, rm, ra, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + collector.add_use(ra); + } + &Inst::AluRRImm12 { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRRImmLogic { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRRImmShift { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRRRShift { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::AluRRRExtend { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::BitRR { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::ULoad8 { rd, ref mem, .. } + | &Inst::SLoad8 { rd, ref mem, .. } + | &Inst::ULoad16 { rd, ref mem, .. } + | &Inst::SLoad16 { rd, ref mem, .. } + | &Inst::ULoad32 { rd, ref mem, .. } + | &Inst::SLoad32 { rd, ref mem, .. } + | &Inst::ULoad64 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::Store8 { rd, ref mem, .. } + | &Inst::Store16 { rd, ref mem, .. } + | &Inst::Store32 { rd, ref mem, .. } + | &Inst::Store64 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::StoreP64 { + rt, rt2, ref mem, .. + } => { + collector.add_use(rt); + collector.add_use(rt2); + pairmemarg_regs(mem, collector); + } + &Inst::LoadP64 { + rt, rt2, ref mem, .. + } => { + collector.add_def(rt); + collector.add_def(rt2); + pairmemarg_regs(mem, collector); + } + &Inst::Mov { rd, rm } => { + collector.add_def(rd); + collector.add_use(rm); + } + &Inst::Mov32 { rd, rm } => { + collector.add_def(rd); + collector.add_use(rm); + } + &Inst::MovZ { rd, .. } | &Inst::MovN { rd, .. } => { + collector.add_def(rd); + } + &Inst::MovK { rd, .. } => { + collector.add_mod(rd); + } + &Inst::CSel { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::CSet { rd, .. } => { + collector.add_def(rd); + } + &Inst::FpuMove64 { rd, rn } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuRR { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuRRR { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::FpuRRRR { rd, rn, rm, ra, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + collector.add_use(ra); + } + &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => { + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::FpuLoad32 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::FpuLoad64 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::FpuLoad128 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::FpuStore32 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::FpuStore64 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::FpuStore128 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::LoadFpuConst32 { rd, .. } | &Inst::LoadFpuConst64 { rd, .. } => { + collector.add_def(rd); + } + &Inst::FpuToInt { rd, rn, .. 
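For readers new to the regalloc interface, a sketch of the three classifications used above (illustrative only): a "def" is written in full, a "use" is only read, and a "mod" is both read and written. `MovK` is the canonical "mod": it replaces just one 16-bit field of `rd`, so the old value stays live; pre/post-indexed addressing modes likewise modify their base register in place.

fn usage_kind_example(inst: &Inst) -> &'static str {
    match inst {
        Inst::MovZ { .. } | Inst::MovN { .. } => "rd: def (written in full)",
        Inst::MovK { .. } => "rd: mod (16-bit partial write, old value read)",
        Inst::MovToNZCV { .. } => "rn: use (read only)",
        _ => "see arm64_get_regs above",
    }
}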
} => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::IntToFpu { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuCSel32 { rd, rn, rm, .. } | &Inst::FpuCSel64 { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::FpuRound { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::MovToVec64 { rd, rn } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::MovFromVec64 { rd, rn } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::VecRRR { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::MovToNZCV { rn } => { + collector.add_use(rn); + } + &Inst::MovFromNZCV { rd } => { + collector.add_def(rd); + } + &Inst::CondSet { rd, .. } => { + collector.add_def(rd); + } + &Inst::Extend { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::Jump { .. } | &Inst::Ret { .. } | &Inst::EpiloguePlaceholder { .. } => {} + &Inst::Call { + ref uses, ref defs, .. + } => { + collector.add_uses(uses); + collector.add_defs(defs); + } + &Inst::CallInd { + ref uses, + ref defs, + rn, + .. + } => { + collector.add_uses(uses); + collector.add_defs(defs); + collector.add_use(rn); + } + &Inst::CondBr { ref kind, .. } + | &Inst::CondBrLowered { ref kind, .. } + | &Inst::CondBrLoweredCompound { ref kind, .. } => match kind { + CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => { + collector.add_use(*rt); + } + CondBrKind::Cond(_) => {} + }, + &Inst::IndirectBr { rn, .. } => { + collector.add_use(rn); + } + &Inst::Nop | Inst::Nop4 => {} + &Inst::Brk => {} + &Inst::Udf { .. } => {} + &Inst::Adr { rd, .. } => { + collector.add_def(rd); + } + &Inst::Word4 { .. } | &Inst::Word8 { .. } => {} + &Inst::JTSequence { + ridx, rtmp1, rtmp2, .. + } => { + collector.add_use(ridx); + collector.add_def(rtmp1); + collector.add_def(rtmp2); + } + &Inst::LoadConst64 { rd, .. } | &Inst::LoadExtName { rd, .. } => { + collector.add_def(rd); + } + } +} + +//============================================================================= +// Instructions: map_regs + +fn arm64_map_regs( + inst: &mut Inst, + pre_map: &RegallocMap, + post_map: &RegallocMap, +) { + fn map(m: &RegallocMap, r: Reg) -> Reg { + if r.is_virtual() { + m.get(&r.to_virtual_reg()).cloned().unwrap().to_reg() + } else { + r + } + } + + fn map_wr(m: &RegallocMap, r: Writable) -> Writable { + Writable::from_reg(map(m, r.to_reg())) + } + + fn map_mem(u: &RegallocMap, mem: &MemArg) -> MemArg { + // N.B.: we take only the pre-map here, but this is OK because the + // only addressing modes that update registers (pre/post-increment on + // ARM64) both read and write registers, so they are "mods" rather + // than "defs", so must be the same in both the pre- and post-map. 
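As a sketch of the rule in `map` (illustrative only, with a plain `HashMap` standing in for the regalloc crate's map type): real registers pass through unchanged, while virtual registers must already have an assignment, so the lookup may unwrap. Sources are mapped through the pre-instruction assignment and destinations through the post-instruction assignment, because an instruction reads its inputs before it writes its outputs.

fn map_like(assignment: &std::collections::HashMap<u32, u32>, reg: u32, is_virtual: bool) -> u32 {
    if is_virtual {
        // Every virtual register has been assigned by this point.
        *assignment.get(&reg).expect("regalloc assigned every vreg")
    } else {
        // Real (physical) registers are left untouched.
        reg
    }
}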
+ match mem { + &MemArg::Unscaled(reg, simm9) => MemArg::Unscaled(map(u, reg), simm9), + &MemArg::UnsignedOffset(reg, uimm12) => MemArg::UnsignedOffset(map(u, reg), uimm12), + &MemArg::RegReg(r1, r2) => MemArg::RegReg(map(u, r1), map(u, r2)), + &MemArg::RegScaled(r1, r2, ty) => MemArg::RegScaled(map(u, r1), map(u, r2), ty), + &MemArg::RegScaledExtended(r1, r2, ty, op) => { + MemArg::RegScaledExtended(map(u, r1), map(u, r2), ty, op) + } + &MemArg::Label(ref l) => MemArg::Label(l.clone()), + &MemArg::PreIndexed(r, simm9) => MemArg::PreIndexed(map_wr(u, r), simm9), + &MemArg::PostIndexed(r, simm9) => MemArg::PostIndexed(map_wr(u, r), simm9), + &MemArg::FPOffset(off) => MemArg::FPOffset(off), + &MemArg::SPOffset(off) => MemArg::SPOffset(off), + } + } + + fn map_pairmem(u: &RegallocMap, mem: &PairMemArg) -> PairMemArg { + match mem { + &PairMemArg::SignedOffset(reg, simm7) => PairMemArg::SignedOffset(map(u, reg), simm7), + &PairMemArg::PreIndexed(reg, simm7) => PairMemArg::PreIndexed(map_wr(u, reg), simm7), + &PairMemArg::PostIndexed(reg, simm7) => PairMemArg::PostIndexed(map_wr(u, reg), simm7), + } + } + + fn map_br(u: &RegallocMap, br: &CondBrKind) -> CondBrKind { + match br { + &CondBrKind::Zero(reg) => CondBrKind::Zero(map(u, reg)), + &CondBrKind::NotZero(reg) => CondBrKind::NotZero(map(u, reg)), + &CondBrKind::Cond(c) => CondBrKind::Cond(c), + } + } + + let u = pre_map; // For brevity below. + let d = post_map; + + let newval = match inst { + &mut Inst::AluRRR { alu_op, rd, rn, rm } => Inst::AluRRR { + alu_op, + rd: map_wr(d, rd), + rn: map(u, rn), + rm: map(u, rm), + }, + &mut Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra, + } => Inst::AluRRRR { + alu_op, + rd: map_wr(d, rd), + rn: map(u, rn), + rm: map(u, rm), + ra: map(u, ra), + }, + &mut Inst::AluRRImm12 { + alu_op, + rd, + rn, + ref imm12, + } => Inst::AluRRImm12 { + alu_op, + rd: map_wr(d, rd), + rn: map(u, rn), + imm12: imm12.clone(), + }, + &mut Inst::AluRRImmLogic { + alu_op, + rd, + rn, + ref imml, + } => Inst::AluRRImmLogic { + alu_op, + rd: map_wr(d, rd), + rn: map(u, rn), + imml: imml.clone(), + }, + &mut Inst::AluRRImmShift { + alu_op, + rd, + rn, + ref immshift, + } => Inst::AluRRImmShift { + alu_op, + rd: map_wr(d, rd), + rn: map(u, rn), + immshift: immshift.clone(), + }, + &mut Inst::AluRRRShift { + alu_op, + rd, + rn, + rm, + ref shiftop, + } => Inst::AluRRRShift { + alu_op, + rd: map_wr(d, rd), + rn: map(u, rn), + rm: map(u, rm), + shiftop: shiftop.clone(), + }, + &mut Inst::AluRRRExtend { + alu_op, + rd, + rn, + rm, + ref extendop, + } => Inst::AluRRRExtend { + alu_op, + rd: map_wr(d, rd), + rn: map(u, rn), + rm: map(u, rm), + extendop: extendop.clone(), + }, + &mut Inst::BitRR { op, rd, rn } => Inst::BitRR { + op, + rd: map_wr(d, rd), + rn: map(u, rn), + }, + &mut Inst::ULoad8 { + rd, + ref mem, + srcloc, + } => Inst::ULoad8 { + rd: map_wr(d, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::SLoad8 { + rd, + ref mem, + srcloc, + } => Inst::SLoad8 { + rd: map_wr(d, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::ULoad16 { + rd, + ref mem, + srcloc, + } => Inst::ULoad16 { + rd: map_wr(d, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::SLoad16 { + rd, + ref mem, + srcloc, + } => Inst::SLoad16 { + rd: map_wr(d, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::ULoad32 { + rd, + ref mem, + srcloc, + } => Inst::ULoad32 { + rd: map_wr(d, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::SLoad32 { + rd, + ref mem, + srcloc, + } => Inst::SLoad32 { + rd: map_wr(d, rd), + mem: map_mem(u, 
mem), + srcloc, + }, + &mut Inst::ULoad64 { + rd, + ref mem, + srcloc, + } => Inst::ULoad64 { + rd: map_wr(d, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::Store8 { + rd, + ref mem, + srcloc, + } => Inst::Store8 { + rd: map(u, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::Store16 { + rd, + ref mem, + srcloc, + } => Inst::Store16 { + rd: map(u, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::Store32 { + rd, + ref mem, + srcloc, + } => Inst::Store32 { + rd: map(u, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::Store64 { + rd, + ref mem, + srcloc, + } => Inst::Store64 { + rd: map(u, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::StoreP64 { rt, rt2, ref mem } => Inst::StoreP64 { + rt: map(u, rt), + rt2: map(u, rt2), + mem: map_pairmem(u, mem), + }, + &mut Inst::LoadP64 { rt, rt2, ref mem } => Inst::LoadP64 { + rt: map_wr(d, rt), + rt2: map_wr(d, rt2), + mem: map_pairmem(u, mem), + }, + &mut Inst::Mov { rd, rm } => Inst::Mov { + rd: map_wr(d, rd), + rm: map(u, rm), + }, + &mut Inst::Mov32 { rd, rm } => Inst::Mov32 { + rd: map_wr(d, rd), + rm: map(u, rm), + }, + &mut Inst::MovZ { rd, ref imm } => Inst::MovZ { + rd: map_wr(d, rd), + imm: imm.clone(), + }, + &mut Inst::MovN { rd, ref imm } => Inst::MovN { + rd: map_wr(d, rd), + imm: imm.clone(), + }, + &mut Inst::MovK { rd, ref imm } => Inst::MovK { + rd: map_wr(d, rd), + imm: imm.clone(), + }, + &mut Inst::CSel { rd, rn, rm, cond } => Inst::CSel { + cond, + rd: map_wr(d, rd), + rn: map(u, rn), + rm: map(u, rm), + }, + &mut Inst::CSet { rd, cond } => Inst::CSet { + cond, + rd: map_wr(d, rd), + }, + &mut Inst::FpuMove64 { rd, rn } => Inst::FpuMove64 { + rd: map_wr(d, rd), + rn: map(u, rn), + }, + &mut Inst::FpuRR { fpu_op, rd, rn } => Inst::FpuRR { + fpu_op, + rd: map_wr(d, rd), + rn: map(u, rn), + }, + &mut Inst::FpuRRR { fpu_op, rd, rn, rm } => Inst::FpuRRR { + fpu_op, + rd: map_wr(d, rd), + rn: map(u, rn), + rm: map(u, rm), + }, + &mut Inst::FpuRRRR { + fpu_op, + rd, + rn, + rm, + ra, + } => Inst::FpuRRRR { + fpu_op, + rd: map_wr(d, rd), + rn: map(u, rn), + rm: map(u, rm), + ra: map(u, ra), + }, + &mut Inst::FpuCmp32 { rn, rm } => Inst::FpuCmp32 { + rn: map(u, rn), + rm: map(u, rm), + }, + &mut Inst::FpuCmp64 { rn, rm } => Inst::FpuCmp64 { + rn: map(u, rn), + rm: map(u, rm), + }, + &mut Inst::FpuLoad32 { + rd, + ref mem, + srcloc, + } => Inst::FpuLoad32 { + rd: map_wr(d, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::FpuLoad64 { + rd, + ref mem, + srcloc, + } => Inst::FpuLoad64 { + rd: map_wr(d, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::FpuLoad128 { + rd, + ref mem, + srcloc, + } => Inst::FpuLoad64 { + rd: map_wr(d, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::FpuStore32 { + rd, + ref mem, + srcloc, + } => Inst::FpuStore32 { + rd: map(u, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::FpuStore64 { + rd, + ref mem, + srcloc, + } => Inst::FpuStore64 { + rd: map(u, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::FpuStore128 { + rd, + ref mem, + srcloc, + } => Inst::FpuStore64 { + rd: map(u, rd), + mem: map_mem(u, mem), + srcloc, + }, + &mut Inst::LoadFpuConst32 { rd, const_data } => Inst::LoadFpuConst32 { + rd: map_wr(d, rd), + const_data, + }, + &mut Inst::LoadFpuConst64 { rd, const_data } => Inst::LoadFpuConst64 { + rd: map_wr(d, rd), + const_data, + }, + &mut Inst::FpuToInt { op, rd, rn } => Inst::FpuToInt { + op, + rd: map_wr(d, rd), + rn: map(u, rn), + }, + &mut Inst::IntToFpu { op, rd, rn } => Inst::IntToFpu { + op, + rd: map_wr(d, rd), + 
rn: map(u, rn), + }, + &mut Inst::FpuCSel32 { rd, rn, rm, cond } => Inst::FpuCSel32 { + cond, + rd: map_wr(d, rd), + rn: map(u, rn), + rm: map(u, rm), + }, + &mut Inst::FpuCSel64 { rd, rn, rm, cond } => Inst::FpuCSel64 { + cond, + rd: map_wr(d, rd), + rn: map(u, rn), + rm: map(u, rm), + }, + &mut Inst::FpuRound { op, rd, rn } => Inst::FpuRound { + op, + rd: map_wr(d, rd), + rn: map(u, rn), + }, + &mut Inst::MovToVec64 { rd, rn } => Inst::MovToVec64 { + rd: map_wr(d, rd), + rn: map(u, rn), + }, + &mut Inst::MovFromVec64 { rd, rn } => Inst::MovFromVec64 { + rd: map_wr(d, rd), + rn: map(u, rn), + }, + &mut Inst::VecRRR { rd, rn, rm, alu_op } => Inst::VecRRR { + rd: map_wr(d, rd), + rn: map(u, rn), + rm: map(u, rm), + alu_op, + }, + &mut Inst::MovToNZCV { rn } => Inst::MovToNZCV { rn: map(u, rn) }, + &mut Inst::MovFromNZCV { rd } => Inst::MovFromNZCV { rd: map_wr(d, rd) }, + &mut Inst::CondSet { rd, cond } => Inst::CondSet { + rd: map_wr(d, rd), + cond, + }, + &mut Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } => Inst::Extend { + rd: map_wr(d, rd), + rn: map(u, rn), + signed, + from_bits, + to_bits, + }, + &mut Inst::Jump { dest } => Inst::Jump { dest }, + &mut Inst::Call { + ref uses, + ref defs, + ref dest, + loc, + opcode, + } => { + let uses = uses.map(|r| map(u, *r)); + let defs = defs.map(|r| map_wr(d, *r)); + let dest = dest.clone(); + Inst::Call { + dest, + uses, + defs, + loc, + opcode, + } + } + &mut Inst::Ret {} => Inst::Ret {}, + &mut Inst::EpiloguePlaceholder {} => Inst::EpiloguePlaceholder {}, + &mut Inst::CallInd { + ref uses, + ref defs, + rn, + loc, + opcode, + } => { + let uses = uses.map(|r| map(u, *r)); + let defs = defs.map(|r| map_wr(d, *r)); + Inst::CallInd { + uses, + defs, + rn: map(u, rn), + loc, + opcode, + } + } + &mut Inst::CondBr { + taken, + not_taken, + kind, + } => Inst::CondBr { + taken, + not_taken, + kind: map_br(u, &kind), + }, + &mut Inst::CondBrLowered { target, kind } => Inst::CondBrLowered { + target, + kind: map_br(u, &kind), + }, + &mut Inst::CondBrLoweredCompound { + taken, + not_taken, + kind, + } => Inst::CondBrLoweredCompound { + taken, + not_taken, + kind: map_br(u, &kind), + }, + &mut Inst::IndirectBr { rn, ref targets } => Inst::IndirectBr { + rn: map(u, rn), + targets: targets.clone(), + }, + &mut Inst::Nop => Inst::Nop, + &mut Inst::Nop4 => Inst::Nop4, + &mut Inst::Brk => Inst::Brk, + &mut Inst::Udf { trap_info } => Inst::Udf { trap_info }, + &mut Inst::Adr { rd, ref label } => Inst::Adr { + rd: map_wr(d, rd), + label: label.clone(), + }, + &mut Inst::Word4 { data } => Inst::Word4 { data }, + &mut Inst::Word8 { data } => Inst::Word8 { data }, + &mut Inst::JTSequence { + ridx, + rtmp1, + rtmp2, + ref targets, + ref targets_for_term, + } => Inst::JTSequence { + targets: targets.clone(), + targets_for_term: targets_for_term.clone(), + ridx: map(u, ridx), + rtmp1: map_wr(d, rtmp1), + rtmp2: map_wr(d, rtmp2), + }, + &mut Inst::LoadConst64 { rd, const_data } => Inst::LoadConst64 { + rd: map_wr(d, rd), + const_data, + }, + &mut Inst::LoadExtName { + rd, + ref name, + offset, + srcloc, + } => Inst::LoadExtName { + rd: map_wr(d, rd), + name: name.clone(), + offset, + srcloc, + }, + }; + *inst = newval; +} + +//============================================================================= +// Instructions: misc functions and external interface + +impl MachInst for Inst { + fn get_regs(&self, collector: &mut RegUsageCollector) { + arm64_get_regs(self, collector) + } + + fn map_regs( + &mut self, + pre_map: &RegallocMap, + post_map: 
&RegallocMap, + ) { + arm64_map_regs(self, pre_map, post_map); + } + + fn is_move(&self) -> Option<(Writable, Reg)> { + match self { + &Inst::Mov { rd, rm } => Some((rd, rm)), + &Inst::FpuMove64 { rd, rn } => Some((rd, rn)), + _ => None, + } + } + + fn is_epilogue_placeholder(&self) -> bool { + if let Inst::EpiloguePlaceholder { .. } = self { + true + } else { + false + } + } + + fn is_term<'a>(&'a self) -> MachTerminator<'a> { + match self { + &Inst::Ret {} | &Inst::EpiloguePlaceholder {} => MachTerminator::Ret, + &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_block_index().unwrap()), + &Inst::CondBr { + taken, not_taken, .. + } => MachTerminator::Cond( + taken.as_block_index().unwrap(), + not_taken.as_block_index().unwrap(), + ), + &Inst::CondBrLowered { .. } => { + // When this is used prior to branch finalization for branches + // within an open-coded sequence, i.e. with ResolvedOffsets, + // do not consider it a terminator. From the point of view of CFG analysis, + // it is part of a black-box single-in single-out region, hence is not + // denoted a terminator. + MachTerminator::None + } + &Inst::CondBrLoweredCompound { .. } => { + panic!("is_term() called after lowering branches"); + } + &Inst::IndirectBr { ref targets, .. } => MachTerminator::Indirect(&targets[..]), + &Inst::JTSequence { + ref targets_for_term, + .. + } => MachTerminator::Indirect(&targets_for_term[..]), + _ => MachTerminator::None, + } + } + + fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Inst { + assert!(ty.bits() <= 64); // no vector support yet! + Inst::mov(to_reg, from_reg) + } + + fn gen_zero_len_nop() -> Inst { + Inst::Nop + } + + fn gen_nop(preferred_size: usize) -> Inst { + // We can't give a NOP (or any insn) < 4 bytes. + assert!(preferred_size >= 4); + Inst::Nop4 + } + + fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option { + None + } + + fn rc_for_type(ty: Type) -> RegClass { + match ty { + I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 => RegClass::I64, + F32 | F64 => RegClass::V128, + I128 | B128 => RegClass::V128, + IFLAGS | FFLAGS => RegClass::I64, + _ => panic!("Unexpected SSA-value type: {}", ty), + } + } + + fn gen_jump(blockindex: BlockIndex) -> Inst { + Inst::Jump { + dest: BranchTarget::Block(blockindex), + } + } + + fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]) { + match self { + &mut Inst::Jump { ref mut dest } => { + dest.map(block_target_map); + } + &mut Inst::CondBr { + ref mut taken, + ref mut not_taken, + .. + } => { + taken.map(block_target_map); + not_taken.map(block_target_map); + } + &mut Inst::CondBrLowered { .. } => { + // See note in `is_term()`: this is used in open-coded sequences + // within blocks and should be left alone. + } + &mut Inst::CondBrLoweredCompound { .. } => { + panic!("with_block_rewrites called after branch lowering!"); + } + _ => {} + } + } + + fn with_fallthrough_block(&mut self, fallthrough: Option) { + match self { + &mut Inst::CondBr { + taken, + not_taken, + kind, + } => { + if taken.as_block_index() == fallthrough + && not_taken.as_block_index() == fallthrough + { + *self = Inst::Nop; + } else if taken.as_block_index() == fallthrough { + *self = Inst::CondBrLowered { + target: not_taken, + kind: kind.invert(), + }; + } else if not_taken.as_block_index() == fallthrough { + *self = Inst::CondBrLowered { + target: taken, + kind, + }; + } else { + // We need a compound sequence (condbr / uncond-br). 
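A sketch of the type-to-register-class mapping defined by `rc_for_type` above (illustrative only): integer, boolean, and flags types go to X-registers, while floating-point and 128-bit types go to V-registers.

fn rc_for_type_example() {
    assert!(Inst::rc_for_type(I64) == RegClass::I64);
    assert!(Inst::rc_for_type(B1) == RegClass::I64);
    assert!(Inst::rc_for_type(F64) == RegClass::V128);
    assert!(Inst::rc_for_type(I128) == RegClass::V128);
}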
+ *self = Inst::CondBrLoweredCompound { + taken, + not_taken, + kind, + }; + } + } + &mut Inst::Jump { dest } => { + if dest.as_block_index() == fallthrough { + *self = Inst::Nop; + } + } + _ => {} + } + } + + fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]) { + match self { + &mut Inst::CondBrLowered { ref mut target, .. } => { + target.lower(targets, my_offset); + } + &mut Inst::CondBrLoweredCompound { + ref mut taken, + ref mut not_taken, + .. + } => { + taken.lower(targets, my_offset); + not_taken.lower(targets, my_offset + 4); + } + &mut Inst::Jump { ref mut dest } => { + dest.lower(targets, my_offset); + } + &mut Inst::JTSequence { + targets: ref mut t, .. + } => { + for target in t { + // offset+20: jumptable is 20 bytes into compound sequence. + target.lower(targets, my_offset + 20); + } + } + _ => {} + } + } + + fn reg_universe() -> RealRegUniverse { + create_reg_universe() + } +} + +//============================================================================= +// Pretty-printing of instructions. + +fn mem_finalize_for_show(mem: &MemArg, mb_rru: Option<&RealRegUniverse>) -> (String, MemArg) { + let (mem_insts, mem) = mem_finalize(0, mem); + let mut mem_str = mem_insts + .into_iter() + .map(|inst| inst.show_rru(mb_rru)) + .collect::>() + .join(" ; "); + if !mem_str.is_empty() { + mem_str += " ; "; + } + + (mem_str, mem) +} + +impl ShowWithRRU for Inst { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + fn op_is32(alu_op: ALUOp) -> (&'static str, bool) { + match alu_op { + ALUOp::Add32 => ("add", true), + ALUOp::Add64 => ("add", false), + ALUOp::Sub32 => ("sub", true), + ALUOp::Sub64 => ("sub", false), + ALUOp::Orr32 => ("orr", true), + ALUOp::Orr64 => ("orr", false), + ALUOp::And32 => ("and", true), + ALUOp::And64 => ("and", false), + ALUOp::Eor32 => ("eor", true), + ALUOp::Eor64 => ("eor", false), + ALUOp::AddS32 => ("adds", true), + ALUOp::AddS64 => ("adds", false), + ALUOp::SubS32 => ("subs", true), + ALUOp::SubS64 => ("subs", false), + ALUOp::MAdd32 => ("madd", true), + ALUOp::MAdd64 => ("madd", false), + ALUOp::MSub32 => ("msub", true), + ALUOp::MSub64 => ("msub", false), + ALUOp::SMulH => ("smulh", false), + ALUOp::UMulH => ("umulh", false), + ALUOp::SDiv64 => ("sdiv", false), + ALUOp::UDiv64 => ("udiv", false), + ALUOp::AndNot32 => ("bic", true), + ALUOp::AndNot64 => ("bic", false), + ALUOp::OrrNot32 => ("orn", true), + ALUOp::OrrNot64 => ("orn", false), + ALUOp::EorNot32 => ("eon", true), + ALUOp::EorNot64 => ("eon", false), + ALUOp::RotR32 => ("ror", true), + ALUOp::RotR64 => ("ror", false), + ALUOp::Lsr32 => ("lsr", true), + ALUOp::Lsr64 => ("lsr", false), + ALUOp::Asr32 => ("asr", true), + ALUOp::Asr64 => ("asr", false), + ALUOp::Lsl32 => ("lsl", true), + ALUOp::Lsl64 => ("lsl", false), + } + } + + match self { + &Inst::Nop => "nop-zero-len".to_string(), + &Inst::Nop4 => "nop".to_string(), + &Inst::AluRRR { alu_op, rd, rn, rm } => { + let (op, is32) = op_is32(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_ireg_sized(rn, mb_rru, is32); + let rm = show_ireg_sized(rm, mb_rru, is32); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra, + } => { + let (op, is32) = op_is32(alu_op); + let four_args = alu_op != ALUOp::SMulH && alu_op != ALUOp::UMulH; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_ireg_sized(rn, mb_rru, is32); + let rm = show_ireg_sized(rm, mb_rru, is32); + let ra = show_ireg_sized(ra, mb_rru, is32); + if 
four_args { + format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra) + } else { + // smulh and umulh have Ra "hard-wired" to the zero register + // and the canonical assembly form has only three regs. + format!("{} {}, {}, {}", op, rd, rn, rm) + } + } + &Inst::AluRRImm12 { + alu_op, + rd, + rn, + ref imm12, + } => { + let (op, is32) = op_is32(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_ireg_sized(rn, mb_rru, is32); + + if imm12.bits == 0 && alu_op == ALUOp::Add64 { + // special-case MOV (used for moving into SP). + format!("mov {}, {}", rd, rn) + } else { + let imm12 = imm12.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, imm12) + } + } + &Inst::AluRRImmLogic { + alu_op, + rd, + rn, + ref imml, + } => { + let (op, is32) = op_is32(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_ireg_sized(rn, mb_rru, is32); + let imml = imml.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, imml) + } + &Inst::AluRRImmShift { + alu_op, + rd, + rn, + ref immshift, + } => { + let (op, is32) = op_is32(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_ireg_sized(rn, mb_rru, is32); + let immshift = immshift.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, immshift) + } + &Inst::AluRRRShift { + alu_op, + rd, + rn, + rm, + ref shiftop, + } => { + let (op, is32) = op_is32(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_ireg_sized(rn, mb_rru, is32); + let rm = show_ireg_sized(rm, mb_rru, is32); + let shiftop = shiftop.show_rru(mb_rru); + format!("{} {}, {}, {}, {}", op, rd, rn, rm, shiftop) + } + &Inst::AluRRRExtend { + alu_op, + rd, + rn, + rm, + ref extendop, + } => { + let (op, is32) = op_is32(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_ireg_sized(rn, mb_rru, is32); + let rm = show_ireg_sized(rm, mb_rru, is32); + let extendop = extendop.show_rru(mb_rru); + format!("{} {}, {}, {}, {}", op, rd, rn, rm, extendop) + } + &Inst::BitRR { op, rd, rn } => { + let is32 = op.is_32_bit(); + let op = op.op_str(); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_ireg_sized(rn, mb_rru, is32); + format!("{} {}, {}", op, rd, rn) + } + &Inst::ULoad8 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::SLoad8 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::ULoad16 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::SLoad16 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::ULoad32 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::SLoad32 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::ULoad64 { + rd, + ref mem, + srcloc: _srcloc, + .. + } => { + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru); + + let is_unscaled = match &mem { + &MemArg::Unscaled(..) => true, + _ => false, + }; + let (op, is32) = match (self, is_unscaled) { + (&Inst::ULoad8 { .. }, false) => ("ldrb", true), + (&Inst::ULoad8 { .. }, true) => ("ldurb", true), + (&Inst::SLoad8 { .. }, false) => ("ldrsb", false), + (&Inst::SLoad8 { .. }, true) => ("ldursb", false), + (&Inst::ULoad16 { .. }, false) => ("ldrh", true), + (&Inst::ULoad16 { .. }, true) => ("ldurh", true), + (&Inst::SLoad16 { .. }, false) => ("ldrsh", false), + (&Inst::SLoad16 { .. }, true) => ("ldursh", false), + (&Inst::ULoad32 { .. }, false) => ("ldr", true), + (&Inst::ULoad32 { .. }, true) => ("ldur", true), + (&Inst::SLoad32 { .. }, false) => ("ldrsw", false), + (&Inst::SLoad32 { .. }, true) => ("ldursw", false), + (&Inst::ULoad64 { .. 
}, false) => ("ldr", false), + (&Inst::ULoad64 { .. }, true) => ("ldur", false), + _ => unreachable!(), + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); + let mem = mem.show_rru(mb_rru); + format!("{}{} {}, {}", mem_str, op, rd, mem) + } + &Inst::Store8 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::Store16 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::Store32 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::Store64 { + rd, + ref mem, + srcloc: _srcloc, + .. + } => { + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru); + + let is_unscaled = match &mem { + &MemArg::Unscaled(..) => true, + _ => false, + }; + let (op, is32) = match (self, is_unscaled) { + (&Inst::Store8 { .. }, false) => ("strb", true), + (&Inst::Store8 { .. }, true) => ("sturb", true), + (&Inst::Store16 { .. }, false) => ("strh", true), + (&Inst::Store16 { .. }, true) => ("sturh", true), + (&Inst::Store32 { .. }, false) => ("str", true), + (&Inst::Store32 { .. }, true) => ("stur", true), + (&Inst::Store64 { .. }, false) => ("str", false), + (&Inst::Store64 { .. }, true) => ("stur", false), + _ => unreachable!(), + }; + let rd = show_ireg_sized(rd, mb_rru, is32); + let mem = mem.show_rru(mb_rru); + format!("{}{} {}, {}", mem_str, op, rd, mem) + } + &Inst::StoreP64 { rt, rt2, ref mem } => { + let rt = rt.show_rru(mb_rru); + let rt2 = rt2.show_rru(mb_rru); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("stp {}, {}, {}", rt, rt2, mem) + } + &Inst::LoadP64 { rt, rt2, ref mem } => { + let rt = rt.to_reg().show_rru(mb_rru); + let rt2 = rt2.to_reg().show_rru(mb_rru); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("ldp {}, {}, {}", rt, rt2, mem) + } + &Inst::Mov { rd, rm } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + format!("mov {}, {}", rd, rm) + } + &Inst::Mov32 { rd, rm } => { + let rd = show_ireg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); + let rm = show_ireg_sized(rm, mb_rru, /* is32 = */ true); + format!("mov {}, {}", rd, rm) + } + &Inst::MovZ { rd, ref imm } => { + let rd = rd.to_reg().show_rru(mb_rru); + let imm = imm.show_rru(mb_rru); + format!("movz {}, {}", rd, imm) + } + &Inst::MovN { rd, ref imm } => { + let rd = rd.to_reg().show_rru(mb_rru); + let imm = imm.show_rru(mb_rru); + format!("movn {}, {}", rd, imm) + } + &Inst::MovK { rd, ref imm } => { + let rd = rd.to_reg().show_rru(mb_rru); + let imm = imm.show_rru(mb_rru); + format!("movk {}, {}", rd, imm) + } + &Inst::CSel { rd, rn, rm, cond } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + let cond = cond.show_rru(mb_rru); + format!("csel {}, {}, {}, {}", rd, rn, rm, cond) + } + &Inst::CSet { rd, cond } => { + let rd = rd.to_reg().show_rru(mb_rru); + let cond = cond.show_rru(mb_rru); + format!("cset {}, {}", rd, cond) + } + &Inst::FpuMove64 { rd, rn } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + format!("mov {}.8b, {}.8b", rd, rn) + } + &Inst::FpuRR { fpu_op, rd, rn } => { + let (op, is32src, is32dst) = match fpu_op { + FPUOp1::Abs32 => ("fabs", true, true), + FPUOp1::Abs64 => ("fabs", false, false), + FPUOp1::Neg32 => ("fneg", true, true), + FPUOp1::Neg64 => ("fneg", false, false), + FPUOp1::Sqrt32 => ("fsqrt", true, true), + FPUOp1::Sqrt64 => ("fsqrt", false, false), + FPUOp1::Cvt32To64 => ("fcvt", true, false), + FPUOp1::Cvt64To32 => ("fcvt", false, true), + }; + let rd = show_freg_sized(rd.to_reg(), mb_rru, is32dst); + let rn = show_freg_sized(rn, mb_rru, 
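A sketch of the pretty-printer's size handling (illustrative only, assuming `show_rru` with the ARM64 register universe yields the names defined in regs.rs): 32-bit ALU forms rename X-registers to W-registers, while plain register moves always print the 64-bit names.

fn show_size_example() {
    let rru = create_reg_universe();
    let add32 = Inst::AluRRR {
        alu_op: ALUOp::Add32,
        rd: writable_xreg(0),
        rn: xreg(1),
        rm: xreg(2),
    };
    assert_eq!(add32.show_rru(Some(&rru)), "add w0, w1, w2");

    let mov = Inst::Mov {
        rd: writable_xreg(3),
        rm: xreg(4),
    };
    assert_eq!(mov.show_rru(Some(&rru)), "mov x3, x4");
}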
is32src); + format!("{} {}, {}", op, rd, rn) + } + &Inst::FpuRRR { fpu_op, rd, rn, rm } => { + let (op, is32) = match fpu_op { + FPUOp2::Add32 => ("fadd", true), + FPUOp2::Add64 => ("fadd", false), + FPUOp2::Sub32 => ("fsub", true), + FPUOp2::Sub64 => ("fsub", false), + FPUOp2::Mul32 => ("fmul", true), + FPUOp2::Mul64 => ("fmul", false), + FPUOp2::Div32 => ("fdiv", true), + FPUOp2::Div64 => ("fdiv", false), + FPUOp2::Max32 => ("fmax", true), + FPUOp2::Max64 => ("fmax", false), + FPUOp2::Min32 => ("fmin", true), + FPUOp2::Min64 => ("fmin", false), + }; + let rd = show_freg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_freg_sized(rn, mb_rru, is32); + let rm = show_freg_sized(rm, mb_rru, is32); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::FpuRRRR { + fpu_op, + rd, + rn, + rm, + ra, + } => { + let (op, is32) = match fpu_op { + FPUOp3::MAdd32 => ("fmadd", true), + FPUOp3::MAdd64 => ("fmadd", false), + }; + let rd = show_freg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_freg_sized(rn, mb_rru, is32); + let rm = show_freg_sized(rm, mb_rru, is32); + let ra = show_freg_sized(ra, mb_rru, is32); + format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra) + } + &Inst::FpuCmp32 { rn, rm } => { + let rn = show_freg_sized(rn, mb_rru, /* is32 = */ true); + let rm = show_freg_sized(rm, mb_rru, /* is32 = */ true); + format!("fcmp {}, {}", rn, rm) + } + &Inst::FpuCmp64 { rn, rm } => { + let rn = show_freg_sized(rn, mb_rru, /* is32 = */ false); + let rm = show_freg_sized(rm, mb_rru, /* is32 = */ false); + format!("fcmp {}, {}", rn, rm) + } + &Inst::FpuLoad32 { rd, ref mem, .. } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 4); + format!("ldr {}, {}", rd, mem) + } + &Inst::FpuLoad64 { rd, ref mem, .. } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ false); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("ldr {}, {}", rd, mem) + } + &Inst::FpuLoad128 { rd, ref mem, .. } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rd = "q".to_string() + &rd[1..]; + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("ldr {}, {}", rd, mem) + } + &Inst::FpuStore32 { rd, ref mem, .. } => { + let rd = show_freg_sized(rd, mb_rru, /* is32 = */ true); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 4); + format!("str {}, {}", rd, mem) + } + &Inst::FpuStore64 { rd, ref mem, .. } => { + let rd = show_freg_sized(rd, mb_rru, /* is32 = */ false); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("str {}, {}", rd, mem) + } + &Inst::FpuStore128 { rd, ref mem, .. 
} => { + let rd = rd.show_rru(mb_rru); + let rd = "q".to_string() + &rd[1..]; + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("str {}, {}", rd, mem) + } + &Inst::LoadFpuConst32 { rd, const_data } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); + format!("ldr {}, pc+8 ; b 8 ; data.f32 {}", rd, const_data) + } + &Inst::LoadFpuConst64 { rd, const_data } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ false); + format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, const_data) + } + &Inst::FpuToInt { op, rd, rn } => { + let (op, is32src, is32dest) = match op { + FpuToIntOp::F32ToI32 => ("fcvtzs", true, true), + FpuToIntOp::F32ToU32 => ("fcvtzu", true, true), + FpuToIntOp::F32ToI64 => ("fcvtzs", true, false), + FpuToIntOp::F32ToU64 => ("fcvtzu", true, false), + FpuToIntOp::F64ToI32 => ("fcvtzs", false, true), + FpuToIntOp::F64ToU32 => ("fcvtzu", false, true), + FpuToIntOp::F64ToI64 => ("fcvtzs", false, false), + FpuToIntOp::F64ToU64 => ("fcvtzu", false, false), + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32dest); + let rn = show_freg_sized(rn, mb_rru, is32src); + format!("{} {}, {}", op, rd, rn) + } + &Inst::IntToFpu { op, rd, rn } => { + let (op, is32src, is32dest) = match op { + IntToFpuOp::I32ToF32 => ("scvtf", true, true), + IntToFpuOp::U32ToF32 => ("ucvtf", true, true), + IntToFpuOp::I64ToF32 => ("scvtf", false, true), + IntToFpuOp::U64ToF32 => ("ucvtf", false, true), + IntToFpuOp::I32ToF64 => ("scvtf", true, false), + IntToFpuOp::U32ToF64 => ("ucvtf", true, false), + IntToFpuOp::I64ToF64 => ("scvtf", false, false), + IntToFpuOp::U64ToF64 => ("ucvtf", false, false), + }; + let rd = show_freg_sized(rd.to_reg(), mb_rru, is32dest); + let rn = show_ireg_sized(rn, mb_rru, is32src); + format!("{} {}, {}", op, rd, rn) + } + &Inst::FpuCSel32 { rd, rn, rm, cond } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); + let rn = show_freg_sized(rn, mb_rru, /* is32 = */ true); + let rm = show_freg_sized(rm, mb_rru, /* is32 = */ true); + let cond = cond.show_rru(mb_rru); + format!("fcsel {}, {}, {}, {}", rd, rn, rm, cond) + } + &Inst::FpuCSel64 { rd, rn, rm, cond } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ false); + let rn = show_freg_sized(rn, mb_rru, /* is32 = */ false); + let rm = show_freg_sized(rm, mb_rru, /* is32 = */ false); + let cond = cond.show_rru(mb_rru); + format!("fcsel {}, {}, {}, {}", rd, rn, rm, cond) + } + &Inst::FpuRound { op, rd, rn } => { + let (inst, is32) = match op { + FpuRoundMode::Minus32 => ("frintm", true), + FpuRoundMode::Minus64 => ("frintm", false), + FpuRoundMode::Plus32 => ("frintp", true), + FpuRoundMode::Plus64 => ("frintp", false), + FpuRoundMode::Zero32 => ("frintz", true), + FpuRoundMode::Zero64 => ("frintz", false), + FpuRoundMode::Nearest32 => ("frintn", true), + FpuRoundMode::Nearest64 => ("frintn", false), + }; + let rd = show_freg_sized(rd.to_reg(), mb_rru, is32); + let rn = show_freg_sized(rn, mb_rru, is32); + format!("{} {}, {}", inst, rd, rn) + } + &Inst::MovToVec64 { rd, rn } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + format!("mov {}.d[0], {}", rd, rn) + } + &Inst::MovFromVec64 { rd, rn } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + format!("mov {}, {}.d[0]", rd, rn) + } + &Inst::VecRRR { rd, rn, rm, alu_op } => { + let op = match alu_op { + VecALUOp::SQAddScalar => "sqadd", + VecALUOp::UQAddScalar => "uqadd", + VecALUOp::SQSubScalar => "sqsub", + VecALUOp::UQSubScalar => 
"uqsub", + }; + let rd = show_vreg_scalar(rd.to_reg(), mb_rru); + let rn = show_vreg_scalar(rn, mb_rru); + let rm = show_vreg_scalar(rm, mb_rru); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::MovToNZCV { rn } => { + let rn = rn.show_rru(mb_rru); + format!("msr nzcv, {}", rn) + } + &Inst::MovFromNZCV { rd } => { + let rd = rd.to_reg().show_rru(mb_rru); + format!("mrs {}, nzcv", rd) + } + &Inst::CondSet { rd, cond } => { + let rd = rd.to_reg().show_rru(mb_rru); + let cond = cond.show_rru(mb_rru); + format!("cset {}, {}", rd, cond) + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits >= 8 => { + // Is the destination a 32-bit register? Corresponds to whether + // extend-to width is <= 32 bits, *unless* we have an unsigned + // 32-to-64-bit extension, which is implemented with a "mov" to a + // 32-bit (W-reg) dest, because this zeroes the top 32 bits. + let dest_is32 = if !signed && from_bits == 32 && to_bits == 64 { + true + } else { + to_bits <= 32 + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_is32); + let rn = show_ireg_sized(rn, mb_rru, from_bits <= 32); + let op = match (signed, from_bits, to_bits) { + (false, 8, 32) => "uxtb", + (true, 8, 32) => "sxtb", + (false, 16, 32) => "uxth", + (true, 16, 32) => "sxth", + (false, 8, 64) => "uxtb", + (true, 8, 64) => "sxtb", + (false, 16, 64) => "uxth", + (true, 16, 64) => "sxth", + (false, 32, 64) => "mov", // special case (see above). + (true, 32, 64) => "sxtw", + _ => panic!("Unsupported Extend case: {:?}", self), + }; + format!("{} {}, {}", op, rd, rn) + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits == 1 && signed => { + let dest_is32 = to_bits <= 32; + let zr = if dest_is32 { "wzr" } else { "xzr" }; + let rd32 = show_ireg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_is32); + let rn = show_ireg_sized(rn, mb_rru, /* is32 = */ true); + format!("and {}, {}, #1 ; sub {}, {}, {}", rd32, rn, rd, zr, rd) + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + .. + } if from_bits == 1 && !signed => { + let rd = show_ireg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); + let rn = show_ireg_sized(rn, mb_rru, /* is32 = */ true); + format!("and {}, {}, #1", rd, rn) + } + &Inst::Extend { .. } => { + panic!("Unsupported Extend case"); + } + &Inst::Call { dest: _, .. } => format!("bl 0"), + &Inst::CallInd { rn, .. 
} => { + let rn = rn.show_rru(mb_rru); + format!("blr {}", rn) + } + &Inst::Ret {} => "ret".to_string(), + &Inst::EpiloguePlaceholder {} => "epilogue placeholder".to_string(), + &Inst::Jump { ref dest } => { + let dest = dest.show_rru(mb_rru); + format!("b {}", dest) + } + &Inst::CondBr { + ref taken, + ref not_taken, + ref kind, + } => { + let taken = taken.show_rru(mb_rru); + let not_taken = not_taken.show_rru(mb_rru); + match kind { + &CondBrKind::Zero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbz {}, {} ; b {}", reg, taken, not_taken) + } + &CondBrKind::NotZero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbnz {}, {} ; b {}", reg, taken, not_taken) + } + &CondBrKind::Cond(c) => { + let c = c.show_rru(mb_rru); + format!("b.{} {} ; b {}", c, taken, not_taken) + } + } + } + &Inst::CondBrLowered { + ref target, + ref kind, + } => { + let target = target.show_rru(mb_rru); + match &kind { + &CondBrKind::Zero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbz {}, {}", reg, target) + } + &CondBrKind::NotZero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbnz {}, {}", reg, target) + } + &CondBrKind::Cond(c) => { + let c = c.show_rru(mb_rru); + format!("b.{} {}", c, target) + } + } + } + &Inst::CondBrLoweredCompound { + ref taken, + ref not_taken, + ref kind, + } => { + let first = Inst::CondBrLowered { + target: taken.clone(), + kind: kind.clone(), + }; + let second = Inst::Jump { + dest: not_taken.clone(), + }; + first.show_rru(mb_rru) + " ; " + &second.show_rru(mb_rru) + } + &Inst::IndirectBr { rn, .. } => { + let rn = rn.show_rru(mb_rru); + format!("br {}", rn) + } + &Inst::Brk => "brk #0".to_string(), + &Inst::Udf { .. } => "udf".to_string(), + &Inst::Adr { rd, ref label } => { + let rd = rd.show_rru(mb_rru); + let label = label.show_rru(mb_rru); + format!("adr {}, {}", rd, label) + } + &Inst::Word4 { data } => format!("data.i32 {}", data), + &Inst::Word8 { data } => format!("data.i64 {}", data), + &Inst::JTSequence { + ref targets, + ridx, + rtmp1, + rtmp2, + .. + } => { + let ridx = ridx.show_rru(mb_rru); + let rtmp1 = rtmp1.show_rru(mb_rru); + let rtmp2 = rtmp2.show_rru(mb_rru); + format!( + concat!( + "adr {}, pc+16 ; ", + "ldrsw {}, [{}, {}, LSL 2] ; ", + "add {}, {}, {} ; ", + "br {} ; ", + "jt_entries {:?}" + ), + rtmp1, rtmp2, rtmp1, ridx, rtmp1, rtmp1, rtmp2, rtmp1, targets + ) + } + &Inst::LoadConst64 { rd, const_data } => { + let rd = rd.show_rru(mb_rru); + format!("ldr {}, 8 ; b 12 ; data {:?}", rd, const_data) + } + &Inst::LoadExtName { + rd, + ref name, + offset, + srcloc: _srcloc, + } => { + let rd = rd.show_rru(mb_rru); + format!("ldr {}, 8 ; b 12 ; data {:?} + {}", rd, name, offset) + } + } + } +} diff --git a/cranelift/codegen/src/isa/arm64/inst/regs.rs b/cranelift/codegen/src/isa/arm64/inst/regs.rs new file mode 100644 index 000000000000..31a915410a97 --- /dev/null +++ b/cranelift/codegen/src/isa/arm64/inst/regs.rs @@ -0,0 +1,273 @@ +//! ARM64 ISA definitions: registers. 
+ +#![allow(dead_code)] + +use crate::machinst::*; + +use regalloc::{ + RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, SpillSlot, VirtualReg, Writable, + NUM_REG_CLASSES, +}; + +use std::string::{String, ToString}; + +//============================================================================= +// Registers, the Universe thereof, and printing + +#[rustfmt::skip] +const XREG_INDICES: [u8; 31] = [ + // X0 - X7 + 32, 33, 34, 35, 36, 37, 38, 39, + // X8 - X14 + 40, 41, 42, 43, 44, 45, 46, + // X15 + 59, + // X16, X17 + 47, 48, + // X18 + 60, + // X19 - X28 + 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + // X29 + 61, + // X30 + 62, +]; + +const ZERO_REG_INDEX: u8 = 63; + +const SP_REG_INDEX: u8 = 64; + +/// Get a reference to an X-register (integer register). +pub fn xreg(num: u8) -> Reg { + assert!(num < 31); + Reg::new_real( + RegClass::I64, + /* enc = */ num, + /* index = */ XREG_INDICES[num as usize], + ) +} + +/// Get a writable reference to an X-register. +pub fn writable_xreg(num: u8) -> Writable { + Writable::from_reg(xreg(num)) +} + +/// Get a reference to a V-register (vector/FP register). +pub fn vreg(num: u8) -> Reg { + assert!(num < 32); + Reg::new_real(RegClass::V128, /* enc = */ num, /* index = */ num) +} + +/// Get a writable reference to a V-register. +pub fn writable_vreg(num: u8) -> Writable { + Writable::from_reg(vreg(num)) +} + +/// Get a reference to the zero-register. +pub fn zero_reg() -> Reg { + // This should be the same as what xreg(31) returns, except that + // we use the special index into the register index space. + Reg::new_real( + RegClass::I64, + /* enc = */ 31, + /* index = */ ZERO_REG_INDEX, + ) +} + +/// Get a writable reference to the zero-register (this discards a result). +pub fn writable_zero_reg() -> Writable { + Writable::from_reg(zero_reg()) +} + +/// Get a reference to the stack-pointer register. +pub fn stack_reg() -> Reg { + // XSP (stack) and XZR (zero) are logically different registers which have + // the same hardware encoding, and whose meaning, in real arm64 + // instructions, is context-dependent. For convenience of + // universe-construction and for correct printing, we make them be two + // different real registers. + Reg::new_real( + RegClass::I64, + /* enc = */ 31, + /* index = */ SP_REG_INDEX, + ) +} + +/// Get a writable reference to the stack-pointer register. +pub fn writable_stack_reg() -> Writable { + Writable::from_reg(stack_reg()) +} + +/// Get a reference to the link register (x30). +pub fn link_reg() -> Reg { + xreg(30) +} + +/// Get a writable reference to the link register. +pub fn writable_link_reg() -> Writable { + Writable::from_reg(link_reg()) +} + +/// Get a reference to the frame pointer (x29). +pub fn fp_reg() -> Reg { + xreg(29) +} + +/// Get a writable reference to the frame pointer. +pub fn writable_fp_reg() -> Writable { + Writable::from_reg(fp_reg()) +} + +/// Get a reference to the "spill temp" register. This register is used to +/// compute the address of a spill slot when a direct offset addressing mode from +/// FP is not sufficient (+/- 2^11 words). We exclude this register from regalloc +/// and reserve it for this purpose for simplicity; otherwise we need a +/// multi-stage analysis where we first determine how many spill slots we have, +/// then perhaps remove the reg from the pool and recompute regalloc. +pub fn spilltmp_reg() -> Reg { + xreg(15) +} + +/// Get a writable reference to the spilltmp reg. 
+pub fn writable_spilltmp_reg() -> Writable { + Writable::from_reg(spilltmp_reg()) +} + +/// Create the register universe for ARM64. +pub fn create_reg_universe() -> RealRegUniverse { + let mut regs = vec![]; + let mut allocable_by_class = [None; NUM_REG_CLASSES]; + + // Numbering Scheme: we put V-regs first, then X-regs. The X-regs + // exclude several registers: x18 (globally reserved for platform-specific + // purposes), x29 (frame pointer), x30 (link register), x31 (stack pointer + // or zero register, depending on context). + + let v_reg_base = 0u8; // in contiguous real-register index space + let v_reg_count = 32; + for i in 0u8..v_reg_count { + let reg = Reg::new_real( + RegClass::V128, + /* enc = */ i, + /* index = */ v_reg_base + i, + ) + .to_real_reg(); + let name = format!("v{}", i); + regs.push((reg, name)); + } + let v_reg_last = v_reg_base + v_reg_count - 1; + + // Add the X registers. N.B.: the order here must match the order implied + // by XREG_INDICES, ZERO_REG_INDEX, and SP_REG_INDEX above. + + let x_reg_base = 32u8; // in contiguous real-register index space + let mut x_reg_count = 0; + for i in 0u8..32u8 { + // See above for excluded registers. + if i == 15 || i == 18 || i == 29 || i == 30 || i == 31 { + continue; + } + let reg = Reg::new_real( + RegClass::I64, + /* enc = */ i, + /* index = */ x_reg_base + x_reg_count, + ) + .to_real_reg(); + let name = format!("x{}", i); + regs.push((reg, name)); + x_reg_count += 1; + } + let x_reg_last = x_reg_base + x_reg_count - 1; + + allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo { + first: x_reg_base as usize, + last: x_reg_last as usize, + suggested_scratch: Some(XREG_INDICES[13] as usize), + }); + allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo { + first: v_reg_base as usize, + last: v_reg_last as usize, + suggested_scratch: Some(/* V31: */ 31), + }); + + // Other regs, not available to the allocator. + let allocable = regs.len(); + regs.push((xreg(15).to_real_reg(), "x15".to_string())); + regs.push((xreg(18).to_real_reg(), "x18".to_string())); + regs.push((fp_reg().to_real_reg(), "fp".to_string())); + regs.push((link_reg().to_real_reg(), "lr".to_string())); + regs.push((zero_reg().to_real_reg(), "xzr".to_string())); + regs.push((stack_reg().to_real_reg(), "sp".to_string())); + // FIXME JRS 2020Feb06: unfortunately this pushes the number of real regs + // to 65, which is potentially inconvenient from a compiler performance + // standpoint. We could possibly drop back to 64 by "losing" a vector + // register in future. + + // Assert sanity: the indices in the register structs must match their + // actual indices in the array. + for (i, reg) in regs.iter().enumerate() { + assert_eq!(i, reg.0.get_index()); + } + + RealRegUniverse { + regs, + allocable, + allocable_by_class, + } +} + +/// If |ireg| denotes an I64-classed reg, make a best-effort attempt to show +/// its name at the 32-bit size. +pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, is32: bool) -> String { + let mut s = reg.show_rru(mb_rru); + if reg.get_class() != RegClass::I64 || !is32 { + // We can't do any better. 
+ return s; + } + + if reg.is_real() { + // Change (eg) "x42" into "w42" as appropriate + if reg.get_class() == RegClass::I64 && is32 && s.starts_with("x") { + s = "w".to_string() + &s[1..]; + } + } else { + // Add a "w" suffix to RegClass::I64 vregs used in a 32-bit role + if reg.get_class() == RegClass::I64 && is32 { + s = s + &"w"; + } + } + s +} + +/// Show a vector register when its use as a 32-bit or 64-bit float is known. +pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, is32: bool) -> String { + let s = reg.show_rru(mb_rru); + if reg.get_class() != RegClass::V128 { + return s; + } + let prefix = if is32 { "s" } else { "d" }; + prefix.to_string() + &s[1..] +} + +/// Show a vector register used in a scalar context. +pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String { + let mut s = reg.show_rru(mb_rru); + if reg.get_class() != RegClass::V128 { + // We can't do any better. + return s; + } + + if reg.is_real() { + // Change (eg) "v0" into "d0". + if reg.get_class() == RegClass::V128 && s.starts_with("v") { + s = "d".to_string() + &s[1..]; + } + } else { + // Add a "d" suffix to RegClass::V128 vregs. + if reg.get_class() == RegClass::V128 { + s = s + &"d"; + } + } + s +} diff --git a/cranelift/codegen/src/isa/arm64/mod.rs b/cranelift/codegen/src/isa/arm64/mod.rs index 2bd6dce476e4..b6a28a5dbdcc 100644 --- a/cranelift/codegen/src/isa/arm64/mod.rs +++ b/cranelift/codegen/src/isa/arm64/mod.rs @@ -1 +1 @@ -// Empty. +mod inst; From 0f725a3c5cccba78edfbebd72995a39a3b7bc468 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 12:44:47 -0700 Subject: [PATCH 05/12] ARM64 backend, part 5 / 11: ABI implementation. This patch provides an ARM64 implementation of the ABI-related traits required by the new backend infrasturcture. It will be used by the lowering code, when that is in place in a subsequent patch. This patch contains code written by Julian Seward and Benjamin Bouvier , originally developed on a side-branch before rebasing and condensing into this patch series. See the `arm64` branch at `https://github.com/cfallin/wasmtime` for original development history. This patch also contains code written by Joey Gouly and contributed to the above branch. These contributions are "Copyright (c) 2020, Arm Limited." Co-authored-by: Julian Seward Co-authored-by: Benjamin Bouvier Co-authored-by: Joey Gouly --- cranelift/codegen/src/isa/arm64/abi.rs | 875 +++++++++++++++++++++++++ cranelift/codegen/src/isa/arm64/mod.rs | 1 + 2 files changed, 876 insertions(+) create mode 100644 cranelift/codegen/src/isa/arm64/abi.rs diff --git a/cranelift/codegen/src/isa/arm64/abi.rs b/cranelift/codegen/src/isa/arm64/abi.rs new file mode 100644 index 000000000000..13abb6233a05 --- /dev/null +++ b/cranelift/codegen/src/isa/arm64/abi.rs @@ -0,0 +1,875 @@ +//! Implementation of the standard ARM64 ABI. + +use crate::ir; +use crate::ir::types; +use crate::ir::types::*; +use crate::ir::StackSlot; +use crate::isa; +use crate::isa::arm64::inst::*; +use crate::machinst::*; +use crate::settings; + +use alloc::vec::Vec; + +use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable}; + +use log::debug; + +// A location for an argument or return value. +#[derive(Clone, Debug)] +enum ABIArg { + // In a real register. + Reg(RealReg, ir::Type), + // Arguments only: on stack, at given offset from SP at entry. + Stack(i64, ir::Type), + // (first and only) return value only: in memory pointed to by x8 on entry. 
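A sketch of the sized-name helpers defined in regs.rs (illustrative only, assuming that module's items are in scope): X-registers are renamed to W-registers when used at 32 bits, and V-registers are shown with `s`/`d` prefixes according to the float width.

fn show_sized_example() {
    let rru = create_reg_universe();
    assert_eq!(show_ireg_sized(xreg(5), Some(&rru), /* is32 = */ true), "w5");
    assert_eq!(show_ireg_sized(xreg(5), Some(&rru), /* is32 = */ false), "x5");
    assert_eq!(show_freg_sized(vreg(3), Some(&rru), /* is32 = */ true), "s3");
    assert_eq!(show_freg_sized(vreg(3), Some(&rru), /* is32 = */ false), "d3");
    assert_eq!(show_vreg_scalar(vreg(3), Some(&rru)), "d3");
}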
+ #[allow(dead_code)] + RetMem(ir::Type), +} + +/// ARM64 ABI information shared between body (callee) and caller. +struct ABISig { + args: Vec, + rets: Vec, + stack_arg_space: i64, + call_conv: isa::CallConv, +} + +// Spidermonkey specific ABI convention. + +/// This is SpiderMonkey's `WasmTableCallSigReg`. +static BALDRDASH_SIG_REG: u8 = 10; + +/// This is SpiderMonkey's `WasmTlsReg`. +static BALDRDASH_TLS_REG: u8 = 23; + +// These two lists represent the registers the JIT may *not* use at any point in generated code. +// +// So these are callee-preserved from the JIT's point of view, and every register not in this list +// has to be caller-preserved by definition. +// +// Keep these lists in sync with the NonAllocatableMask set in Spidermonkey's +// Architecture-arm64.cpp. + +// Indexed by physical register number. +#[rustfmt::skip] +static BALDRDASH_JIT_CALLEE_SAVED_GPR: &[bool] = &[ + /* 0 = */ false, false, false, false, false, false, false, false, + /* 8 = */ false, false, false, false, false, false, false, false, + /* 16 = */ true /* x16 / ip1 */, true /* x17 / ip2 */, true /* x18 / TLS */, false, + /* 20 = */ false, false, false, false, + /* 24 = */ false, false, false, false, + // There should be 28, the pseudo stack pointer in this list, however the wasm stubs trash it + // gladly right now. + /* 28 = */ false, false, true /* x30 = FP */, true /* x31 = SP */ +]; + +#[rustfmt::skip] +static BALDRDASH_JIT_CALLEE_SAVED_FPU: &[bool] = &[ + /* 0 = */ false, false, false, false, false, false, false, false, + /* 8 = */ false, false, false, false, false, false, false, false, + /* 16 = */ false, false, false, false, false, false, false, false, + /* 24 = */ false, false, false, false, false, false, false, true /* v31 / d31 */ +]; + +/// Try to fill a Baldrdash register, returning it if it was found. +fn try_fill_baldrdash_reg(call_conv: isa::CallConv, param: &ir::AbiParam) -> Option { + if call_conv.extends_baldrdash() { + match ¶m.purpose { + &ir::ArgumentPurpose::VMContext => { + // This is SpiderMonkey's `WasmTlsReg`. + Some(ABIArg::Reg( + xreg(BALDRDASH_TLS_REG).to_real_reg(), + ir::types::I64, + )) + } + &ir::ArgumentPurpose::SignatureId => { + // This is SpiderMonkey's `WasmTableCallSigReg`. + Some(ABIArg::Reg( + xreg(BALDRDASH_SIG_REG).to_real_reg(), + ir::types::I64, + )) + } + _ => None, + } + } else { + None + } +} + +/// Process a list of parameters or return values and allocate them to X-regs, +/// V-regs, and stack slots. +/// +/// Returns the list of argument locations, and the stack-space used (rounded up +/// to a 16-byte-aligned boundary). +fn compute_arg_locs(call_conv: isa::CallConv, params: &[ir::AbiParam]) -> (Vec, i64) { + // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), sections 5.4. + let mut next_xreg = 0; + let mut next_vreg = 0; + let mut next_stack: u64 = 0; + let mut ret = vec![]; + for param in params { + // Validate "purpose". 
+ match ¶m.purpose { + &ir::ArgumentPurpose::VMContext + | &ir::ArgumentPurpose::Normal + | &ir::ArgumentPurpose::SignatureId => {} + _ => panic!( + "Unsupported argument purpose {:?} in signature: {:?}", + param.purpose, params + ), + } + + if in_int_reg(param.value_type) { + if let Some(param) = try_fill_baldrdash_reg(call_conv, param) { + ret.push(param); + } else if next_xreg < 8 { + ret.push(ABIArg::Reg(xreg(next_xreg).to_real_reg(), param.value_type)); + next_xreg += 1; + } else { + ret.push(ABIArg::Stack(next_stack as i64, param.value_type)); + next_stack += 8; + } + } else if in_vec_reg(param.value_type) { + if next_vreg < 8 { + ret.push(ABIArg::Reg(vreg(next_vreg).to_real_reg(), param.value_type)); + next_vreg += 1; + } else { + let size: u64 = match param.value_type { + F32 | F64 => 8, + _ => panic!("Unsupported vector-reg argument type"), + }; + // Align. + assert!(size.is_power_of_two()); + next_stack = (next_stack + size - 1) & !(size - 1); + ret.push(ABIArg::Stack(next_stack as i64, param.value_type)); + next_stack += size; + } + } + } + + next_stack = (next_stack + 15) & !15; + + (ret, next_stack as i64) +} + +impl ABISig { + fn from_func_sig(sig: &ir::Signature) -> ABISig { + // Compute args and retvals from signature. + // TODO: pass in arg-mode or ret-mode. (Does not matter + // for the types of arguments/return values that we support.) + let (args, stack_arg_space) = compute_arg_locs(sig.call_conv, &sig.params); + let (rets, _) = compute_arg_locs(sig.call_conv, &sig.returns); + + // Verify that there are no arguments in return-memory area. + assert!(args.iter().all(|a| match a { + &ABIArg::RetMem(..) => false, + _ => true, + })); + // Verify that there are no return values on the stack. + assert!(rets.iter().all(|a| match a { + &ABIArg::Stack(..) => false, + _ => true, + })); + + ABISig { + args, + rets, + stack_arg_space, + call_conv: sig.call_conv, + } + } +} + +/// ARM64 ABI object for a function body. +pub struct ARM64ABIBody { + sig: ABISig, // signature: arg and retval regs + stackslots: Vec, // offsets to each stackslot + stackslots_size: usize, // total stack size of all stackslots + clobbered: Set>, // clobbered registers, from regalloc. + spillslots: Option, // total number of spillslots, from regalloc. + frame_size: Option, + call_conv: isa::CallConv, +} + +fn in_int_reg(ty: ir::Type) -> bool { + match ty { + types::I8 | types::I16 | types::I32 | types::I64 => true, + types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true, + _ => false, + } +} + +fn in_vec_reg(ty: ir::Type) -> bool { + match ty { + types::F32 | types::F64 => true, + _ => false, + } +} + +impl ARM64ABIBody { + /// Create a new body ABI instance. + pub fn new(f: &ir::Function) -> Self { + debug!("ARM64 ABI: func signature {:?}", f.signature); + + let sig = ABISig::from_func_sig(&f.signature); + + // Compute stackslot locations and total stackslot size. 
+ let mut stack_offset: usize = 0; + let mut stackslots = vec![]; + for (stackslot, data) in f.stack_slots.iter() { + let off = stack_offset; + stack_offset += data.size as usize; + stack_offset = (stack_offset + 7) & !7usize; + assert_eq!(stackslot.as_u32() as usize, stackslots.len()); + stackslots.push(off); + } + + Self { + sig, + stackslots, + stackslots_size: stack_offset, + clobbered: Set::empty(), + spillslots: None, + frame_size: None, + call_conv: f.signature.call_conv, + } + } +} + +fn load_stack(fp_offset: i64, into_reg: Writable, ty: Type) -> Inst { + let mem = MemArg::FPOffset(fp_offset); + + match ty { + types::B1 + | types::B8 + | types::I8 + | types::B16 + | types::I16 + | types::B32 + | types::I32 + | types::B64 + | types::I64 => Inst::ULoad64 { + rd: into_reg, + mem, + srcloc: None, + }, + types::F32 => Inst::FpuLoad32 { + rd: into_reg, + mem, + srcloc: None, + }, + types::F64 => Inst::FpuLoad64 { + rd: into_reg, + mem, + srcloc: None, + }, + _ => unimplemented!(), + } +} + +fn store_stack(fp_offset: i64, from_reg: Reg, ty: Type) -> Inst { + let mem = MemArg::FPOffset(fp_offset); + + match ty { + types::B1 + | types::B8 + | types::I8 + | types::B16 + | types::I16 + | types::B32 + | types::I32 + | types::B64 + | types::I64 => Inst::Store64 { + rd: from_reg, + mem, + srcloc: None, + }, + types::F32 => Inst::FpuStore32 { + rd: from_reg, + mem, + srcloc: None, + }, + types::F64 => Inst::FpuStore64 { + rd: from_reg, + mem, + srcloc: None, + }, + _ => unimplemented!(), + } +} + +fn is_callee_save(call_conv: isa::CallConv, r: RealReg) -> bool { + if call_conv.extends_baldrdash() { + match r.get_class() { + RegClass::I64 => { + let enc = r.get_hw_encoding(); + if BALDRDASH_JIT_CALLEE_SAVED_GPR[enc] { + return true; + } + // Otherwise, fall through to preserve native ABI registers. + } + RegClass::V128 => { + let enc = r.get_hw_encoding(); + if BALDRDASH_JIT_CALLEE_SAVED_FPU[enc] { + return true; + } + // Otherwise, fall through to preserve native ABI registers. + } + _ => unimplemented!("baldrdash callee saved on non-i64 reg classes"), + }; + } + + match r.get_class() { + RegClass::I64 => { + // x19 - x28 inclusive are callee-saves. + r.get_hw_encoding() >= 19 && r.get_hw_encoding() <= 28 + } + RegClass::V128 => { + // v8 - v15 inclusive are callee-saves. + r.get_hw_encoding() >= 8 && r.get_hw_encoding() <= 15 + } + _ => panic!("Unexpected RegClass"), + } +} + +fn get_callee_saves( + call_conv: isa::CallConv, + regs: Vec>, +) -> (Vec>, Vec>) { + let mut int_saves = vec![]; + let mut vec_saves = vec![]; + for reg in regs.into_iter() { + if is_callee_save(call_conv, reg.to_reg()) { + match reg.to_reg().get_class() { + RegClass::I64 => int_saves.push(reg), + RegClass::V128 => vec_saves.push(reg), + _ => panic!("Unexpected RegClass"), + } + } + } + (int_saves, vec_saves) +} + +fn is_caller_save(call_conv: isa::CallConv, r: RealReg) -> bool { + if call_conv.extends_baldrdash() { + match r.get_class() { + RegClass::I64 => { + let enc = r.get_hw_encoding(); + if !BALDRDASH_JIT_CALLEE_SAVED_GPR[enc] { + return true; + } + // Otherwise, fall through to preserve native's ABI caller-saved. + } + RegClass::V128 => { + let enc = r.get_hw_encoding(); + if !BALDRDASH_JIT_CALLEE_SAVED_FPU[enc] { + return true; + } + // Otherwise, fall through to preserve native's ABI caller-saved. + } + _ => unimplemented!("baldrdash callee saved on non-i64 reg classes"), + }; + } + + match r.get_class() { + RegClass::I64 => { + // x0 - x17 inclusive are caller-saves. 
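+            // (x19 - x28 are the callee-saves handled by `is_callee_save`
+            // above; x18, FP, LR and SP are never allocated in the first place.)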
+ r.get_hw_encoding() <= 17 + } + RegClass::V128 => { + // v0 - v7 inclusive and v16 - v31 inclusive are caller-saves. + r.get_hw_encoding() <= 7 || (r.get_hw_encoding() >= 16 && r.get_hw_encoding() <= 31) + } + _ => panic!("Unexpected RegClass"), + } +} + +fn get_caller_saves_set(call_conv: isa::CallConv) -> Set> { + let mut set = Set::empty(); + for i in 0..29 { + let x = writable_xreg(i); + if is_caller_save(call_conv, x.to_reg().to_real_reg()) { + set.insert(x); + } + } + for i in 0..32 { + let v = writable_vreg(i); + if is_caller_save(call_conv, v.to_reg().to_real_reg()) { + set.insert(v); + } + } + set +} + +impl ABIBody for ARM64ABIBody { + fn liveins(&self) -> Set { + let mut set: Set = Set::empty(); + for arg in &self.sig.args { + if let &ABIArg::Reg(r, _) = arg { + set.insert(r); + } + } + set + } + + fn liveouts(&self) -> Set { + let mut set: Set = Set::empty(); + for ret in &self.sig.rets { + if let &ABIArg::Reg(r, _) = ret { + set.insert(r); + } + } + set + } + + fn num_args(&self) -> usize { + self.sig.args.len() + } + + fn num_retvals(&self) -> usize { + self.sig.rets.len() + } + + fn num_stackslots(&self) -> usize { + self.stackslots.len() + } + + fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable) -> Inst { + match &self.sig.args[idx] { + &ABIArg::Reg(r, ty) => Inst::gen_move(into_reg, r.to_reg(), ty), + &ABIArg::Stack(off, ty) => load_stack(off + 16, into_reg, ty), + _ => unimplemented!(), + } + } + + fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Reg) -> Inst { + match &self.sig.rets[idx] { + &ABIArg::Reg(r, ty) => Inst::gen_move(Writable::from_reg(r.to_reg()), from_reg, ty), + &ABIArg::Stack(off, ty) => store_stack(off + 16, from_reg, ty), + _ => unimplemented!(), + } + } + + fn gen_ret(&self) -> Inst { + Inst::Ret {} + } + + fn gen_epilogue_placeholder(&self) -> Inst { + Inst::EpiloguePlaceholder {} + } + + fn set_num_spillslots(&mut self, slots: usize) { + self.spillslots = Some(slots); + } + + fn set_clobbered(&mut self, clobbered: Set>) { + self.clobbered = clobbered; + } + + fn load_stackslot( + &self, + slot: StackSlot, + offset: usize, + ty: Type, + into_reg: Writable, + ) -> Inst { + // Offset from beginning of stackslot area, which is at FP - stackslots_size. + let stack_off = self.stackslots[slot.as_u32() as usize] as i64; + let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); + load_stack(fp_off, into_reg, ty) + } + + fn store_stackslot(&self, slot: StackSlot, offset: usize, ty: Type, from_reg: Reg) -> Inst { + // Offset from beginning of stackslot area, which is at FP - stackslots_size. + let stack_off = self.stackslots[slot.as_u32() as usize] as i64; + let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); + store_stack(fp_off, from_reg, ty) + } + + // Load from a spillslot. + fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable) -> Inst { + // Note that when spills/fills are generated, we don't yet know how many + // spillslots there will be, so we allocate *downward* from the beginning + // of the stackslot area. Hence: FP - stackslot_size - 8*spillslot - + // sizeof(ty). + let islot = slot.get() as i64; + let ty_size = self.get_spillslot_size(into_reg.to_reg().get_class(), ty) * 8; + let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64; + load_stack(fp_off, into_reg, ty) + } + + // Store to a spillslot. 
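+    // Uses the same addressing as `load_spillslot` above: for example, with a
+    // 32-byte stackslot area, an I64 in spillslot 0 is at FP - 40 and
+    // spillslot 1 is at FP - 48.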
+ fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Inst { + let islot = slot.get() as i64; + let ty_size = self.get_spillslot_size(from_reg.get_class(), ty) * 8; + let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64; + store_stack(fp_off, from_reg, ty) + } + + fn gen_prologue(&mut self, flags: &settings::Flags) -> Vec { + let mut insts = vec![]; + if !self.call_conv.extends_baldrdash() { + // stp fp (x29), lr (x30), [sp, #-16]! + insts.push(Inst::StoreP64 { + rt: fp_reg(), + rt2: link_reg(), + mem: PairMemArg::PreIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(), + ), + }); + // mov fp (x29), sp. This uses the ADDI rd, rs, 0 form of `MOV` because + // the usual encoding (`ORR`) does not work with SP. + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_fp_reg(), + rn: stack_reg(), + imm12: Imm12 { + bits: 0, + shift12: false, + }, + }); + } + + let mut total_stacksize = self.stackslots_size + 8 * self.spillslots.unwrap(); + if self.call_conv.extends_baldrdash() { + debug_assert!( + !flags.enable_probestack(), + "baldrdash does not expect cranelift to emit stack probes" + ); + total_stacksize += flags.baldrdash_prologue_words() as usize * 8; + } + let total_stacksize = (total_stacksize + 15) & !15; // 16-align the stack. + + if !self.call_conv.extends_baldrdash() && total_stacksize > 0 { + // sub sp, sp, #total_stacksize + if let Some(imm12) = Imm12::maybe_from_u64(total_stacksize as u64) { + let sub_inst = Inst::AluRRImm12 { + alu_op: ALUOp::Sub64, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12, + }; + insts.push(sub_inst); + } else { + let tmp = writable_spilltmp_reg(); + let const_inst = Inst::LoadConst64 { + rd: tmp, + const_data: total_stacksize as u64, + }; + let sub_inst = Inst::AluRRRExtend { + alu_op: ALUOp::Sub64, + rd: writable_stack_reg(), + rn: stack_reg(), + rm: tmp.to_reg(), + extendop: ExtendOp::UXTX, + }; + insts.push(const_inst); + insts.push(sub_inst); + } + } + + // Save clobbered registers. + let (clobbered_int, clobbered_vec) = + get_callee_saves(self.call_conv, self.clobbered.to_vec()); + for reg_pair in clobbered_int.chunks(2) { + let (r1, r2) = if reg_pair.len() == 2 { + // .to_reg().to_reg(): Writable --> RealReg --> Reg + (reg_pair[0].to_reg().to_reg(), reg_pair[1].to_reg().to_reg()) + } else { + (reg_pair[0].to_reg().to_reg(), zero_reg()) + }; + + debug_assert!(r1.get_class() == RegClass::I64); + debug_assert!(r2.get_class() == RegClass::I64); + + // stp r1, r2, [sp, #-16]! + insts.push(Inst::StoreP64 { + rt: r1, + rt2: r2, + mem: PairMemArg::PreIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(), + ), + }); + } + let vec_save_bytes = clobbered_vec.len() * 16; + if vec_save_bytes != 0 { + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Sub64, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(), + }); + } + for (i, reg) in clobbered_vec.iter().enumerate() { + insts.push(Inst::FpuStore128 { + rd: reg.to_reg().to_reg(), + mem: MemArg::Unscaled(stack_reg(), SImm9::maybe_from_i64((i * 16) as i64).unwrap()), + srcloc: None, + }); + } + + self.frame_size = Some(total_stacksize); + insts + } + + fn gen_epilogue(&self, _flags: &settings::Flags) -> Vec { + let mut insts = vec![]; + + // Restore clobbered registers. 
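+        // (Mirror image of the prologue: the vector saves sit below the
+        // integer save pairs, so reload the vectors first, pop that area,
+        // then pop the integer register pairs in reverse order.)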
+ let (clobbered_int, clobbered_vec) = + get_callee_saves(self.call_conv, self.clobbered.to_vec()); + + for (i, reg) in clobbered_vec.iter().enumerate() { + insts.push(Inst::FpuLoad128 { + rd: Writable::from_reg(reg.to_reg().to_reg()), + mem: MemArg::Unscaled(stack_reg(), SImm9::maybe_from_i64((i * 16) as i64).unwrap()), + srcloc: None, + }); + } + let vec_save_bytes = clobbered_vec.len() * 16; + if vec_save_bytes != 0 { + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(), + }); + } + + for reg_pair in clobbered_int.chunks(2).rev() { + let (r1, r2) = if reg_pair.len() == 2 { + ( + reg_pair[0].map(|r| r.to_reg()), + reg_pair[1].map(|r| r.to_reg()), + ) + } else { + (reg_pair[0].map(|r| r.to_reg()), writable_zero_reg()) + }; + + debug_assert!(r1.to_reg().get_class() == RegClass::I64); + debug_assert!(r2.to_reg().get_class() == RegClass::I64); + + // ldp r1, r2, [sp], #16 + insts.push(Inst::LoadP64 { + rt: r1, + rt2: r2, + mem: PairMemArg::PostIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(), + ), + }); + } + + if !self.call_conv.extends_baldrdash() { + // The MOV (alias of ORR) interprets x31 as XZR, so use an ADD here. + // MOV to SP is an alias of ADD. + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_stack_reg(), + rn: fp_reg(), + imm12: Imm12 { + bits: 0, + shift12: false, + }, + }); + insts.push(Inst::LoadP64 { + rt: writable_fp_reg(), + rt2: writable_link_reg(), + mem: PairMemArg::PostIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(), + ), + }); + insts.push(Inst::Ret {}); + } + + debug!("Epilogue: {:?}", insts); + insts + } + + fn frame_size(&self) -> u32 { + self.frame_size + .expect("frame size not computed before prologue generation") as u32 + } + + fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32 { + // We allocate in terms of 8-byte slots. + match (rc, ty) { + (RegClass::I64, _) => 1, + (RegClass::V128, F32) | (RegClass::V128, F64) => 1, + (RegClass::V128, _) => 2, + _ => panic!("Unexpected register class!"), + } + } + + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Type) -> Inst { + self.store_spillslot(to_slot, ty, from_reg.to_reg()) + } + + fn gen_reload(&self, to_reg: Writable, from_slot: SpillSlot, ty: Type) -> Inst { + self.load_spillslot(from_slot, ty, to_reg.map(|r| r.to_reg())) + } +} + +enum CallDest { + ExtName(ir::ExternalName), + Reg(Reg), +} + +/// ARM64 ABI object for a function call. +pub struct ARM64ABICall { + sig: ABISig, + uses: Set, + defs: Set>, + dest: CallDest, + loc: ir::SourceLoc, + opcode: ir::Opcode, +} + +fn abisig_to_uses_and_defs(sig: &ABISig) -> (Set, Set>) { + // Compute uses: all arg regs. + let mut uses = Set::empty(); + for arg in &sig.args { + match arg { + &ABIArg::Reg(reg, _) => uses.insert(reg.to_reg()), + _ => {} + } + } + + // Compute defs: all retval regs, and all caller-save (clobbered) regs. + let mut defs = get_caller_saves_set(sig.call_conv); + for ret in &sig.rets { + match ret { + &ABIArg::Reg(reg, _) => defs.insert(Writable::from_reg(reg.to_reg())), + _ => {} + } + } + + (uses, defs) +} + +impl ARM64ABICall { + /// Create a callsite ABI object for a call directly to the specified function. 
+ pub fn from_func( + sig: &ir::Signature, + extname: &ir::ExternalName, + loc: ir::SourceLoc, + ) -> ARM64ABICall { + let sig = ABISig::from_func_sig(sig); + let (uses, defs) = abisig_to_uses_and_defs(&sig); + ARM64ABICall { + sig, + uses, + defs, + dest: CallDest::ExtName(extname.clone()), + loc, + opcode: ir::Opcode::Call, + } + } + + /// Create a callsite ABI object for a call to a function pointer with the + /// given signature. + pub fn from_ptr( + sig: &ir::Signature, + ptr: Reg, + loc: ir::SourceLoc, + opcode: ir::Opcode, + ) -> ARM64ABICall { + let sig = ABISig::from_func_sig(sig); + let (uses, defs) = abisig_to_uses_and_defs(&sig); + ARM64ABICall { + sig, + uses, + defs, + dest: CallDest::Reg(ptr), + loc, + opcode, + } + } +} + +fn adjust_stack(amt: u64, is_sub: bool) -> Vec { + if amt > 0 { + let alu_op = if is_sub { ALUOp::Sub64 } else { ALUOp::Add64 }; + if let Some(imm12) = Imm12::maybe_from_u64(amt) { + vec![Inst::AluRRImm12 { + alu_op, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12, + }] + } else { + let const_load = Inst::LoadConst64 { + rd: writable_spilltmp_reg(), + const_data: amt, + }; + let adj = Inst::AluRRRExtend { + alu_op, + rd: writable_stack_reg(), + rn: stack_reg(), + rm: spilltmp_reg(), + extendop: ExtendOp::UXTX, + }; + vec![const_load, adj] + } + } else { + vec![] + } +} + +impl ABICall for ARM64ABICall { + fn num_args(&self) -> usize { + self.sig.args.len() + } + + fn gen_stack_pre_adjust(&self) -> Vec { + adjust_stack(self.sig.stack_arg_space as u64, /* is_sub = */ true) + } + + fn gen_stack_post_adjust(&self) -> Vec { + adjust_stack(self.sig.stack_arg_space as u64, /* is_sub = */ false) + } + + fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Inst { + match &self.sig.args[idx] { + &ABIArg::Reg(reg, ty) => Inst::gen_move(Writable::from_reg(reg.to_reg()), from_reg, ty), + &ABIArg::Stack(off, _) => Inst::Store64 { + rd: from_reg, + mem: MemArg::SPOffset(off), + srcloc: None, + }, + _ => unimplemented!(), + } + } + + fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable) -> Inst { + match &self.sig.rets[idx] { + &ABIArg::Reg(reg, ty) => Inst::gen_move(into_reg, reg.to_reg(), ty), + &ABIArg::RetMem(..) => panic!("Return-memory area not yet supported"), + _ => unimplemented!(), + } + } + + fn gen_call(&self) -> Vec { + let (uses, defs) = (self.uses.clone(), self.defs.clone()); + match &self.dest { + &CallDest::ExtName(ref name) => vec![Inst::Call { + dest: name.clone(), + uses, + defs, + loc: self.loc, + opcode: self.opcode, + }], + &CallDest::Reg(reg) => vec![Inst::CallInd { + rn: reg, + uses, + defs, + loc: self.loc, + opcode: self.opcode, + }], + } + } +} diff --git a/cranelift/codegen/src/isa/arm64/mod.rs b/cranelift/codegen/src/isa/arm64/mod.rs index b6a28a5dbdcc..8f0324904b86 100644 --- a/cranelift/codegen/src/isa/arm64/mod.rs +++ b/cranelift/codegen/src/isa/arm64/mod.rs @@ -1 +1,2 @@ +mod abi; mod inst; From aaa5a127c8c2d70b024a4a9b89d40572839eae20 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 13:08:14 -0700 Subject: [PATCH 06/12] ARM64 backend, part 6 / 11: CLIF -> VCode lowering. This patch adds the lowering implementation that translates Cranelift IR (CLIF) function bodies to VCode, i.e., ARM64 machine instructions. This patch contains code written by Julian Seward and Benjamin Bouvier , originally developed on a side-branch before rebasing and condensing into this patch series. See the `arm64` branch at `https://github.com/cfallin/wasmtime` for original development history. 
This patch also contains code written by Joey Gouly and contributed to the above branch. These contributions are "Copyright (c) 2020, Arm Limited." Co-authored-by: Julian Seward Co-authored-by: Benjamin Bouvier Co-authored-by: Joey Gouly --- cranelift/codegen/src/isa/arm64/lower.rs | 2805 ++++++++++++++++++++++ cranelift/codegen/src/isa/arm64/mod.rs | 4 + 2 files changed, 2809 insertions(+) create mode 100644 cranelift/codegen/src/isa/arm64/lower.rs diff --git a/cranelift/codegen/src/isa/arm64/lower.rs b/cranelift/codegen/src/isa/arm64/lower.rs new file mode 100644 index 000000000000..9979802c792b --- /dev/null +++ b/cranelift/codegen/src/isa/arm64/lower.rs @@ -0,0 +1,2805 @@ +//! Lowering rules for ARM64. +//! +//! TODO: opportunities for better code generation: +//! +//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns; recognize +//! and incorporate sign/zero extension on indicies. Recognize pre/post-index +//! opportunities. +//! +//! - Logical-immediate args. +//! +//! - Floating-point immediates. + +#![allow(dead_code)] + +use crate::ir::condcodes::{FloatCC, IntCC}; +use crate::ir::types::*; +use crate::ir::Inst as IRInst; +use crate::ir::{Block, InstructionData, Opcode, TrapCode, Type}; +use crate::machinst::lower::*; +use crate::machinst::*; + +use crate::isa::arm64::abi::*; +use crate::isa::arm64::inst::*; +use crate::isa::arm64::Arm64Backend; + +use regalloc::{Reg, RegClass, Writable}; + +use alloc::vec::Vec; +use smallvec::SmallVec; + +//============================================================================ +// Helpers: opcode conversions + +fn op_to_aluop(op: Opcode, ty: Type) -> Option { + match (op, ty) { + (Opcode::Iadd, I32) => Some(ALUOp::Add32), + (Opcode::Iadd, I64) => Some(ALUOp::Add64), + (Opcode::Isub, I32) => Some(ALUOp::Sub32), + (Opcode::Isub, I64) => Some(ALUOp::Sub64), + _ => None, + } +} + +fn is_alu_op(op: Opcode, ctrl_typevar: Type) -> bool { + op_to_aluop(op, ctrl_typevar).is_some() +} + +//============================================================================ +// Result enum types. +// +// Lowering of a given value results in one of these enums, depending on the +// modes in which we can accept the value. + +/// A lowering result: register, register-shift. An SSA value can always be +/// lowered into one of these options; the register form is the fallback. +#[derive(Clone, Debug)] +enum ResultRS { + Reg(Reg), + RegShift(Reg, ShiftOpAndAmt), +} + +/// A lowering result: register, register-shift, register-extend. An SSA value can always be +/// lowered into one of these options; the register form is the fallback. +#[derive(Clone, Debug)] +enum ResultRSE { + Reg(Reg), + RegShift(Reg, ShiftOpAndAmt), + RegExtend(Reg, ExtendOp), +} + +impl ResultRSE { + fn from_rs(rs: ResultRS) -> ResultRSE { + match rs { + ResultRS::Reg(r) => ResultRSE::Reg(r), + ResultRS::RegShift(r, s) => ResultRSE::RegShift(r, s), + } + } +} + +/// A lowering result: register, register-shift, register-extend, or 12-bit immediate form. +/// An SSA value can always be lowered into one of these options; the register form is the +/// fallback. 
+#[derive(Clone, Debug)] +enum ResultRSEImm12 { + Reg(Reg), + RegShift(Reg, ShiftOpAndAmt), + RegExtend(Reg, ExtendOp), + Imm12(Imm12), +} + +impl ResultRSEImm12 { + fn from_rse(rse: ResultRSE) -> ResultRSEImm12 { + match rse { + ResultRSE::Reg(r) => ResultRSEImm12::Reg(r), + ResultRSE::RegShift(r, s) => ResultRSEImm12::RegShift(r, s), + ResultRSE::RegExtend(r, e) => ResultRSEImm12::RegExtend(r, e), + } + } +} + +/// A lowering result: register, register-shift, or logical immediate form. +/// An SSA value can always be lowered into one of these options; the register form is the +/// fallback. +#[derive(Clone, Debug)] +enum ResultRSImmLogic { + Reg(Reg), + RegShift(Reg, ShiftOpAndAmt), + ImmLogic(ImmLogic), +} + +impl ResultRSImmLogic { + fn from_rs(rse: ResultRS) -> ResultRSImmLogic { + match rse { + ResultRS::Reg(r) => ResultRSImmLogic::Reg(r), + ResultRS::RegShift(r, s) => ResultRSImmLogic::RegShift(r, s), + } + } +} + +/// A lowering result: register or immediate shift amount (arg to a shift op). +/// An SSA value can always be lowered into one of these options; the register form is the +/// fallback. +#[derive(Clone, Debug)] +enum ResultRegImmShift { + Reg(Reg), + ImmShift(ImmShift), +} + +//============================================================================ +// Instruction input and output "slots". +// +// We use these types to refer to operand numbers, and result numbers, together +// with the associated instruction, in a type-safe way. + +/// Identifier for a particular output of an instruction. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +struct InsnOutput { + insn: IRInst, + output: usize, +} + +/// Identifier for a particular input of an instruction. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +struct InsnInput { + insn: IRInst, + input: usize, +} + +/// Producer of a value: either a previous instruction's output, or a register that will be +/// codegen'd separately. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum InsnInputSource { + Output(InsnOutput), + Reg(Reg), +} + +impl InsnInputSource { + fn as_output(self) -> Option { + match self { + InsnInputSource::Output(o) => Some(o), + _ => None, + } + } +} + +fn get_input>(ctx: &mut C, output: InsnOutput, num: usize) -> InsnInput { + assert!(num <= ctx.num_inputs(output.insn)); + InsnInput { + insn: output.insn, + input: num, + } +} + +/// Convert an instruction input to a producing instruction's output if possible (in same BB), or a +/// register otherwise. +fn input_source>(ctx: &mut C, input: InsnInput) -> InsnInputSource { + if let Some((input_inst, result_num)) = ctx.input_inst(input.insn, input.input) { + let out = InsnOutput { + insn: input_inst, + output: result_num, + }; + InsnInputSource::Output(out) + } else { + let reg = ctx.input(input.insn, input.input); + InsnInputSource::Reg(reg) + } +} + +//============================================================================ +// Lowering: convert instruction outputs to result types. + +/// Lower an instruction output to a 64-bit constant, if possible. +fn output_to_const>(ctx: &mut C, out: InsnOutput) -> Option { + if out.output > 0 { + None + } else { + let inst_data = ctx.data(out.insn); + if inst_data.opcode() == Opcode::Null { + Some(0) + } else { + match inst_data { + &InstructionData::UnaryImm { opcode: _, imm } => { + // Only has Into for i64; we use u64 elsewhere, so we cast. 
+ let imm: i64 = imm.into(); + Some(imm as u64) + } + &InstructionData::UnaryIeee32 { opcode: _, imm } => Some(imm.bits() as u64), + &InstructionData::UnaryIeee64 { opcode: _, imm } => Some(imm.bits()), + _ => None, + } + } + } +} + +fn output_to_const_f32>(ctx: &mut C, out: InsnOutput) -> Option { + output_to_const(ctx, out).map(|value| f32::from_bits(value as u32)) +} + +fn output_to_const_f64>(ctx: &mut C, out: InsnOutput) -> Option { + output_to_const(ctx, out).map(|value| f64::from_bits(value)) +} + +/// Lower an instruction output to a constant register-shift amount, if possible. +fn output_to_shiftimm>(ctx: &mut C, out: InsnOutput) -> Option { + output_to_const(ctx, out).and_then(ShiftOpShiftImm::maybe_from_shift) +} + +/// How to handle narrow values loaded into registers; see note on `narrow_mode` +/// parameter to `input_to_*` below. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum NarrowValueMode { + None, + /// Zero-extend to 32 bits if original is < 32 bits. + ZeroExtend32, + /// Sign-extend to 32 bits if original is < 32 bits. + SignExtend32, + /// Zero-extend to 64 bits if original is < 64 bits. + ZeroExtend64, + /// Sign-extend to 64 bits if original is < 64 bits. + SignExtend64, +} + +impl NarrowValueMode { + fn is_32bit(&self) -> bool { + match self { + NarrowValueMode::None => false, + NarrowValueMode::ZeroExtend32 | NarrowValueMode::SignExtend32 => true, + NarrowValueMode::ZeroExtend64 | NarrowValueMode::SignExtend64 => false, + } + } +} + +/// Lower an instruction output to a reg. +fn output_to_reg>(ctx: &mut C, out: InsnOutput) -> Writable { + ctx.output(out.insn, out.output) +} + +/// Lower an instruction input to a reg. +/// +/// The given register will be extended appropriately, according to +/// `narrow_mode` and the input's type. If extended, the value is +/// always extended to 64 bits, for simplicity. +fn input_to_reg>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> Reg { + let ty = ctx.input_ty(input.insn, input.input); + let from_bits = ty_bits(ty) as u8; + let in_reg = ctx.input(input.insn, input.input); + match (narrow_mode, from_bits) { + (NarrowValueMode::None, _) => in_reg, + (NarrowValueMode::ZeroExtend32, n) if n < 32 => { + let tmp = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: false, + from_bits, + to_bits: 32, + }); + tmp.to_reg() + } + (NarrowValueMode::SignExtend32, n) if n < 32 => { + let tmp = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: true, + from_bits, + to_bits: 32, + }); + tmp.to_reg() + } + (NarrowValueMode::ZeroExtend32, n) | (NarrowValueMode::SignExtend32, n) if n == 32 => { + in_reg + } + + (NarrowValueMode::ZeroExtend64, n) if n < 64 => { + let tmp = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: false, + from_bits, + to_bits: 64, + }); + tmp.to_reg() + } + (NarrowValueMode::SignExtend64, n) if n < 64 => { + let tmp = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: true, + from_bits, + to_bits: 64, + }); + tmp.to_reg() + } + (_, n) if n == 64 => in_reg, + + _ => panic!( + "Unsupported input width: input ty {} bits {} mode {:?}", + ty, from_bits, narrow_mode + ), + } +} + +/// Lower an instruction input to a reg or reg/shift, or reg/extend operand. +/// This does not actually codegen the source instruction; it just uses the +/// vreg into which the source instruction will generate its value. 
+/// +/// The `narrow_mode` flag indicates whether the consumer of this value needs +/// the high bits clear. For many operations, such as an add/sub/mul or any +/// bitwise logical operation, the low-bit results depend only on the low-bit +/// inputs, so e.g. we can do an 8 bit add on 32 bit registers where the 8-bit +/// value is stored in the low 8 bits of the register and the high 24 bits are +/// undefined. If the op truly needs the high N bits clear (such as for a +/// divide or a right-shift or a compare-to-zero), `narrow_mode` should be +/// set to `ZeroExtend` or `SignExtend` as appropriate, and the resulting +/// register will be provided the extended value. +fn input_to_rs>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRS { + if let InsnInputSource::Output(out) = input_source(ctx, input) { + let insn = out.insn; + assert!(out.output <= ctx.num_outputs(insn)); + let op = ctx.data(insn).opcode(); + + if op == Opcode::Ishl { + let shiftee = get_input(ctx, out, 0); + let shift_amt = get_input(ctx, out, 1); + + // Can we get the shift amount as an immediate? + if let Some(shift_amt_out) = input_source(ctx, shift_amt).as_output() { + if let Some(shiftimm) = output_to_shiftimm(ctx, shift_amt_out) { + let reg = input_to_reg(ctx, shiftee, narrow_mode); + ctx.merged(insn); + ctx.merged(shift_amt_out.insn); + return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm)); + } + } + } + } + + ResultRS::Reg(input_to_reg(ctx, input, narrow_mode)) +} + +/// Lower an instruction input to a reg or reg/shift, or reg/extend operand. +/// This does not actually codegen the source instruction; it just uses the +/// vreg into which the source instruction will generate its value. +/// +/// See note on `input_to_rs` for a description of `narrow_mode`. +fn input_to_rse>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRSE { + if let InsnInputSource::Output(out) = input_source(ctx, input) { + let insn = out.insn; + assert!(out.output <= ctx.num_outputs(insn)); + let op = ctx.data(insn).opcode(); + let out_ty = ctx.output_ty(insn, out.output); + let out_bits = ty_bits(out_ty); + + // If `out_ty` is smaller than 32 bits and we need to zero- or sign-extend, + // then get the result into a register and return an Extend-mode operand on + // that register. + if narrow_mode != NarrowValueMode::None + && ((narrow_mode.is_32bit() && out_bits < 32) + || (!narrow_mode.is_32bit() && out_bits < 64)) + { + let reg = output_to_reg(ctx, out); + let extendop = match (narrow_mode, out_bits) { + (NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => { + ExtendOp::SXTB + } + (NarrowValueMode::ZeroExtend32, 1) | (NarrowValueMode::ZeroExtend64, 1) => { + ExtendOp::UXTB + } + (NarrowValueMode::SignExtend32, 8) | (NarrowValueMode::SignExtend64, 8) => { + ExtendOp::SXTB + } + (NarrowValueMode::ZeroExtend32, 8) | (NarrowValueMode::ZeroExtend64, 8) => { + ExtendOp::UXTB + } + (NarrowValueMode::SignExtend32, 16) | (NarrowValueMode::SignExtend64, 16) => { + ExtendOp::SXTH + } + (NarrowValueMode::ZeroExtend32, 16) | (NarrowValueMode::ZeroExtend64, 16) => { + ExtendOp::UXTH + } + (NarrowValueMode::SignExtend64, 32) => ExtendOp::SXTW, + (NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW, + _ => unreachable!(), + }; + return ResultRSE::RegExtend(reg.to_reg(), extendop); + } + + // Is this a zero-extend or sign-extend and can we handle that with a register-mode operator? 
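+        // For example, an `iadd` whose second operand is a `uextend` from i32
+        // to i64 can be emitted as a single ADD with a UXTW-extended register
+        // operand, folding the extend into the consuming instruction.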
+ if op == Opcode::Uextend || op == Opcode::Sextend { + assert!(out_bits == 32 || out_bits == 64); + let sign_extend = op == Opcode::Sextend; + let extendee = get_input(ctx, out, 0); + let inner_ty = ctx.input_ty(extendee.insn, extendee.input); + let inner_bits = ty_bits(inner_ty); + assert!(inner_bits < out_bits); + let extendop = match (sign_extend, inner_bits) { + (true, 1) => ExtendOp::SXTB, + (false, 1) => ExtendOp::UXTB, + (true, 8) => ExtendOp::SXTB, + (false, 8) => ExtendOp::UXTB, + (true, 16) => ExtendOp::SXTH, + (false, 16) => ExtendOp::UXTH, + (true, 32) => ExtendOp::SXTW, + (false, 32) => ExtendOp::UXTW, + _ => unreachable!(), + }; + let reg = input_to_reg(ctx, extendee, NarrowValueMode::None); + ctx.merged(insn); + return ResultRSE::RegExtend(reg, extendop); + } + } + + ResultRSE::from_rs(input_to_rs(ctx, input, narrow_mode)) +} + +fn input_to_rse_imm12>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRSEImm12 { + if let InsnInputSource::Output(out) = input_source(ctx, input) { + if let Some(imm_value) = output_to_const(ctx, out) { + if let Some(i) = Imm12::maybe_from_u64(imm_value) { + ctx.merged(out.insn); + return ResultRSEImm12::Imm12(i); + } + } + } + + ResultRSEImm12::from_rse(input_to_rse(ctx, input, narrow_mode)) +} + +fn input_to_rs_immlogic>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRSImmLogic { + if let InsnInputSource::Output(out) = input_source(ctx, input) { + if let Some(imm_value) = output_to_const(ctx, out) { + let ty = ctx.output_ty(out.insn, out.output); + let ty = if ty_bits(ty) < 32 { I32 } else { ty }; + if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) { + ctx.merged(out.insn); + return ResultRSImmLogic::ImmLogic(i); + } + } + } + + ResultRSImmLogic::from_rs(input_to_rs(ctx, input, narrow_mode)) +} + +fn input_to_reg_immshift>(ctx: &mut C, input: InsnInput) -> ResultRegImmShift { + if let InsnInputSource::Output(out) = input_source(ctx, input) { + if let Some(imm_value) = output_to_const(ctx, out) { + if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) { + ctx.merged(out.insn); + return ResultRegImmShift::ImmShift(immshift); + } + } + } + + ResultRegImmShift::Reg(input_to_reg(ctx, input, NarrowValueMode::None)) +} + +//============================================================================ +// ALU instruction constructors. 
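+//
+// These helpers pick the `Inst` variant that matches whichever operand form
+// the operand lowering produced (plain register, shifted register, extended
+// register, or immediate), so the per-opcode lowering code does not have to.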
+ +fn alu_inst_imm12(op: ALUOp, rd: Writable, rn: Reg, rm: ResultRSEImm12) -> Inst { + match rm { + ResultRSEImm12::Imm12(imm12) => Inst::AluRRImm12 { + alu_op: op, + rd, + rn, + imm12, + }, + ResultRSEImm12::Reg(rm) => Inst::AluRRR { + alu_op: op, + rd, + rn, + rm, + }, + ResultRSEImm12::RegShift(rm, shiftop) => Inst::AluRRRShift { + alu_op: op, + rd, + rn, + rm, + shiftop, + }, + ResultRSEImm12::RegExtend(rm, extendop) => Inst::AluRRRExtend { + alu_op: op, + rd, + rn, + rm, + extendop, + }, + } +} + +fn alu_inst_immlogic(op: ALUOp, rd: Writable, rn: Reg, rm: ResultRSImmLogic) -> Inst { + match rm { + ResultRSImmLogic::ImmLogic(imml) => Inst::AluRRImmLogic { + alu_op: op, + rd, + rn, + imml, + }, + ResultRSImmLogic::Reg(rm) => Inst::AluRRR { + alu_op: op, + rd, + rn, + rm, + }, + ResultRSImmLogic::RegShift(rm, shiftop) => Inst::AluRRRShift { + alu_op: op, + rd, + rn, + rm, + shiftop, + }, + } +} + +fn alu_inst_immshift(op: ALUOp, rd: Writable, rn: Reg, rm: ResultRegImmShift) -> Inst { + match rm { + ResultRegImmShift::ImmShift(immshift) => Inst::AluRRImmShift { + alu_op: op, + rd, + rn, + immshift, + }, + ResultRegImmShift::Reg(rm) => Inst::AluRRR { + alu_op: op, + rd, + rn, + rm, + }, + } +} + +//============================================================================ +// Lowering: addressing mode support. Takes instruction directly, rather +// than an `InsnInput`, to do more introspection. + +/// Lower the address of a load or store. +fn lower_address>( + ctx: &mut C, + elem_ty: Type, + addends: &[InsnInput], + offset: i32, +) -> MemArg { + // TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or + // mul instructions (Load/StoreComplex don't include scale factors). + + // Handle one reg and offset that fits in immediate, if possible. + if addends.len() == 1 { + let reg = input_to_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64); + if let Some(memarg) = MemArg::reg_maybe_offset(reg, offset as i64, elem_ty) { + return memarg; + } + } + + // Handle two regs and a zero offset, if possible. + if addends.len() == 2 && offset == 0 { + let ra = input_to_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64); + let rb = input_to_reg(ctx, addends[1], NarrowValueMode::ZeroExtend64); + return MemArg::reg_reg(ra, rb); + } + + // Otherwise, generate add instructions. + let addr = ctx.tmp(RegClass::I64, I64); + + // Get the const into a reg. + lower_constant_u64(ctx, addr.clone(), offset as u64); + + // Add each addend to the address. 
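+    // Each addend is zero-extended to 64 bits first, since the address
+    // arithmetic is done at full register width.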
+ for addend in addends { + let reg = input_to_reg(ctx, *addend, NarrowValueMode::ZeroExtend64); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: addr.clone(), + rn: addr.to_reg(), + rm: reg.clone(), + }); + } + + MemArg::reg(addr.to_reg()) +} + +fn lower_constant_u64>(ctx: &mut C, rd: Writable, value: u64) { + for inst in Inst::load_constant(rd, value) { + ctx.emit(inst); + } +} + +fn lower_constant_f32>(ctx: &mut C, rd: Writable, value: f32) { + ctx.emit(Inst::load_fp_constant32(rd, value)); +} + +fn lower_constant_f64>(ctx: &mut C, rd: Writable, value: f64) { + ctx.emit(Inst::load_fp_constant64(rd, value)); +} + +fn lower_condcode(cc: IntCC) -> Cond { + match cc { + IntCC::Equal => Cond::Eq, + IntCC::NotEqual => Cond::Ne, + IntCC::SignedGreaterThanOrEqual => Cond::Ge, + IntCC::SignedGreaterThan => Cond::Gt, + IntCC::SignedLessThanOrEqual => Cond::Le, + IntCC::SignedLessThan => Cond::Lt, + IntCC::UnsignedGreaterThanOrEqual => Cond::Hs, + IntCC::UnsignedGreaterThan => Cond::Hi, + IntCC::UnsignedLessThanOrEqual => Cond::Ls, + IntCC::UnsignedLessThan => Cond::Lo, + IntCC::Overflow => Cond::Vs, + IntCC::NotOverflow => Cond::Vc, + } +} + +fn lower_fp_condcode(cc: FloatCC) -> Cond { + // Refer to `codegen/shared/src/condcodes.rs` and to the `FCMP` ARM64 docs. + // The FCMP instruction sets: + // NZCV + // - PCSR.NZCV = 0011 on UN (unordered), + // 0110 on EQ, + // 1000 on LT, + // 0010 on GT. + match cc { + // EQ | LT | GT. Vc => V clear. + FloatCC::Ordered => Cond::Vc, + // UN. Vs => V set. + FloatCC::Unordered => Cond::Vs, + // EQ. Eq => Z set. + FloatCC::Equal => Cond::Eq, + // UN | LT | GT. Ne => Z clear. + FloatCC::NotEqual => Cond::Ne, + // LT | GT. + FloatCC::OrderedNotEqual => unimplemented!(), + // UN | EQ + FloatCC::UnorderedOrEqual => unimplemented!(), + // LT. Mi => N set. + FloatCC::LessThan => Cond::Mi, + // LT | EQ. Ls => C clear or Z set. + FloatCC::LessThanOrEqual => Cond::Ls, + // GT. Gt => Z clear, N = V. + FloatCC::GreaterThan => Cond::Gt, + // GT | EQ. Ge => N = V. + FloatCC::GreaterThanOrEqual => Cond::Ge, + // UN | LT + FloatCC::UnorderedOrLessThan => unimplemented!(), + // UN | LT | EQ + FloatCC::UnorderedOrLessThanOrEqual => unimplemented!(), + // UN | GT + FloatCC::UnorderedOrGreaterThan => unimplemented!(), + // UN | GT | EQ + FloatCC::UnorderedOrGreaterThanOrEqual => unimplemented!(), + } +} + +/// Determines whether this condcode interprets inputs as signed or +/// unsigned. See the documentation for the `icmp` instruction in +/// cranelift-codegen/meta/src/shared/instructions.rs for further insights +/// into this. +pub fn condcode_is_signed(cc: IntCC) -> bool { + match cc { + IntCC::Equal => false, + IntCC::NotEqual => false, + IntCC::SignedGreaterThanOrEqual => true, + IntCC::SignedGreaterThan => true, + IntCC::SignedLessThanOrEqual => true, + IntCC::SignedLessThan => true, + IntCC::UnsignedGreaterThanOrEqual => false, + IntCC::UnsignedGreaterThan => false, + IntCC::UnsignedLessThanOrEqual => false, + IntCC::UnsignedLessThan => false, + IntCC::Overflow => true, + IntCC::NotOverflow => true, + } +} + +//============================================================================= +// Top-level instruction lowering entry point, for one instruction. + +/// Actually codegen an instruction's results into registers. 
+fn lower_insn_to_regs>(ctx: &mut C, insn: IRInst) { + let op = ctx.data(insn).opcode(); + let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) + .map(|i| InsnInput { insn, input: i }) + .collect(); + let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn)) + .map(|i| InsnOutput { insn, output: i }) + .collect(); + let ty = if outputs.len() > 0 { + Some(ctx.output_ty(insn, 0)) + } else { + None + }; + + match op { + Opcode::Iconst | Opcode::Bconst | Opcode::Null => { + let value = output_to_const(ctx, outputs[0]).unwrap(); + let rd = output_to_reg(ctx, outputs[0]); + lower_constant_u64(ctx, rd, value); + } + Opcode::F32const => { + let value = output_to_const_f32(ctx, outputs[0]).unwrap(); + let rd = output_to_reg(ctx, outputs[0]); + lower_constant_f32(ctx, rd, value); + } + Opcode::F64const => { + let value = output_to_const_f64(ctx, outputs[0]).unwrap(); + let rd = output_to_reg(ctx, outputs[0]); + lower_constant_f64(ctx, rd, value); + } + Opcode::Iadd => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_rse_imm12(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::Add32, ALUOp::Add64); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + } + Opcode::Isub => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_rse_imm12(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + } + Opcode::UaddSat | Opcode::SaddSat => { + // We use the vector instruction set's saturating adds (UQADD / + // SQADD), which require vector registers. 
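+            // The scalar inputs are moved into the low 64 bits of two
+            // temporary vector registers, the saturating operation is
+            // performed there, and the result is moved back to a
+            // general-purpose register.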
+ let is_signed = op == Opcode::SaddSat; + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let alu_op = if is_signed { + VecALUOp::SQAddScalar + } else { + VecALUOp::UQAddScalar + }; + let va = ctx.tmp(RegClass::V128, I128); + let vb = ctx.tmp(RegClass::V128, I128); + let ra = input_to_reg(ctx, inputs[0], narrow_mode); + let rb = input_to_reg(ctx, inputs[1], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::MovToVec64 { rd: va, rn: ra }); + ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb }); + ctx.emit(Inst::VecRRR { + rd: va, + rn: va.to_reg(), + rm: vb.to_reg(), + alu_op, + }); + ctx.emit(Inst::MovFromVec64 { + rd, + rn: va.to_reg(), + }); + } + + Opcode::UsubSat | Opcode::SsubSat => { + let is_signed = op == Opcode::SsubSat; + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let alu_op = if is_signed { + VecALUOp::SQSubScalar + } else { + VecALUOp::UQSubScalar + }; + let va = ctx.tmp(RegClass::V128, I128); + let vb = ctx.tmp(RegClass::V128, I128); + let ra = input_to_reg(ctx, inputs[0], narrow_mode); + let rb = input_to_reg(ctx, inputs[1], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::MovToVec64 { rd: va, rn: ra }); + ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb }); + ctx.emit(Inst::VecRRR { + rd: va, + rn: va.to_reg(), + rm: vb.to_reg(), + alu_op, + }); + ctx.emit(Inst::MovFromVec64 { + rd, + rn: va.to_reg(), + }); + } + + Opcode::Ineg => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = zero_reg(); + let rm = input_to_rse_imm12(ctx, inputs[0], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + } + + Opcode::Imul => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::MAdd32, ALUOp::MAdd64); + ctx.emit(Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra: zero_reg(), + }); + } + + Opcode::Umulhi | Opcode::Smulhi => { + let rd = output_to_reg(ctx, outputs[0]); + let is_signed = op == Opcode::Smulhi; + let input_ty = ctx.input_ty(insn, 0); + assert!(ctx.input_ty(insn, 1) == input_ty); + assert!(ctx.output_ty(insn, 0) == input_ty); + + match input_ty { + I64 => { + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let ra = zero_reg(); + let alu_op = if is_signed { + ALUOp::SMulH + } else { + ALUOp::UMulH + }; + ctx.emit(Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra, + }); + } + I32 | I16 | I8 => { + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_reg(ctx, inputs[1], narrow_mode); + let ra = zero_reg(); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp::MAdd64, + rd, + rn, + rm, + ra, + }); + let shift_op = if is_signed { + ALUOp::Asr64 + } else { + ALUOp::Lsr64 + }; + let shift_amt = match input_ty { + I32 => 32, + I16 => 16, + I8 => 8, + _ => unreachable!(), + }; + ctx.emit(Inst::AluRRImmShift { + alu_op: shift_op, + rd, + rn: rd.to_reg(), + immshift: ImmShift::maybe_from_u64(shift_amt).unwrap(), + }); + } + _ => { + panic!("Unsupported argument type for umulhi/smulhi: {}", input_ty); + } + } + } + + Opcode::Udiv | 
Opcode::Sdiv | Opcode::Urem | Opcode::Srem => { + let is_signed = match op { + Opcode::Udiv | Opcode::Urem => false, + Opcode::Sdiv | Opcode::Srem => true, + _ => unreachable!(), + }; + let is_rem = match op { + Opcode::Udiv | Opcode::Sdiv => false, + Opcode::Urem | Opcode::Srem => true, + _ => unreachable!(), + }; + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let div_op = if is_signed { + ALUOp::SDiv64 + } else { + ALUOp::UDiv64 + }; + + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + if !is_rem { + let rm = input_to_reg(ctx, inputs[1], narrow_mode); + ctx.emit(Inst::AluRRR { + alu_op: div_op, + rd, + rn, + rm, + }); + } else { + let rm = input_to_reg(ctx, inputs[1], narrow_mode); + // Remainder (rn % rm) is implemented as: + // + // tmp = rn / rm + // rd = rn - (tmp*rm) + // + // use 'rd' for tmp and you have: + // + // div rd, rn, rm ; rd = rn / rm + // msub rd, rd, rm, rn ; rd = rn - rd * rm + ctx.emit(Inst::AluRRR { + alu_op: div_op, + rd, + rn, + rm, + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp::MSub64, + rd: rd, + rn: rd.to_reg(), + rm: rm, + ra: rn, + }); + } + } + + Opcode::Uextend | Opcode::Sextend => { + let output_ty = ty.unwrap(); + let input_ty = ctx.input_ty(insn, 0); + let from_bits = ty_bits(input_ty) as u8; + let to_bits = ty_bits(output_ty) as u8; + let to_bits = std::cmp::max(32, to_bits); + assert!(from_bits <= to_bits); + if from_bits < to_bits { + let signed = op == Opcode::Sextend; + // If we reach this point, we weren't able to incorporate the extend as + // a register-mode on another instruction, so we have a 'None' + // narrow-value/extend mode here, and we emit the explicit instruction. + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + }); + } + } + + Opcode::Bnot => { + let rd = output_to_reg(ctx, outputs[0]); + let rm = input_to_rs_immlogic(ctx, inputs[0], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64); + // NOT rd, rm ==> ORR_NOT rd, zero, rm + ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm)); + } + + Opcode::Band + | Opcode::Bor + | Opcode::Bxor + | Opcode::BandNot + | Opcode::BorNot + | Opcode::BxorNot => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_rs_immlogic(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = match op { + Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64), + Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64), + Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64), + Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64), + Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64), + Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64), + _ => unreachable!(), + }; + ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm)); + } + + Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => { + let ty = ty.unwrap(); + let is32 = ty_bits(ty) <= 32; + let narrow_mode = match (op, is32) { + (Opcode::Ishl, _) => NarrowValueMode::None, + (Opcode::Ushr, false) => NarrowValueMode::ZeroExtend64, + (Opcode::Ushr, true) => NarrowValueMode::ZeroExtend32, + (Opcode::Sshr, false) => NarrowValueMode::SignExtend64, + (Opcode::Sshr, true) => NarrowValueMode::SignExtend32, + _ 
=> unreachable!(), + }; + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_reg_immshift(ctx, inputs[1]); + let alu_op = match op { + Opcode::Ishl => choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64), + Opcode::Ushr => choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64), + Opcode::Sshr => choose_32_64(ty, ALUOp::Asr32, ALUOp::Asr64), + _ => unreachable!(), + }; + ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm)); + } + + Opcode::Rotr => { + // For a 32-bit or 64-bit rotate-right, we can use the ROR + // instruction directly. + // + // For a < 32-bit rotate-right, we synthesize this as: + // + // rotr rd, rn, rm + // + // => + // + // zero-extend rn, <32-or-64> + // sub tmp1, rm, + // sub tmp1, zero, tmp1 ; neg + // lsr tmp2, rn, rm + // lsl rd, rn, tmp1 + // orr rd, rd, tmp2 + // + // For a constant amount, we can instead do: + // + // zero-extend rn, <32-or-64> + // lsr tmp2, rn, # + // lsl rd, rn, + // orr rd, rd, tmp2 + + let ty = ty.unwrap(); + let bits = ty_bits(ty); + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg( + ctx, + inputs[0], + if bits <= 32 { + NarrowValueMode::ZeroExtend32 + } else { + NarrowValueMode::ZeroExtend64 + }, + ); + let rm = input_to_reg_immshift(ctx, inputs[1]); + + if bits == 32 || bits == 64 { + let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64); + ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm)); + } else { + assert!(bits < 32); + match rm { + ResultRegImmShift::Reg(reg) => { + let tmp1 = ctx.tmp(RegClass::I64, I32); + let tmp2 = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::Sub32, + rd: tmp1, + rn: reg, + imm12: Imm12::maybe_from_u64(bits as u64).unwrap(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub32, + rd: tmp1, + rn: zero_reg(), + rm: tmp1.to_reg(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Lsr32, + rd: tmp2, + rn: rn, + rm: reg, + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Lsl32, + rd: rd, + rn: rn, + rm: tmp1.to_reg(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Orr32, + rd: rd, + rn: rd.to_reg(), + rm: tmp2.to_reg(), + }); + } + ResultRegImmShift::ImmShift(immshift) => { + let tmp1 = ctx.tmp(RegClass::I64, I32); + let amt = immshift.value(); + assert!(amt <= bits as u8); + let opp_shift = ImmShift::maybe_from_u64(bits as u64 - amt as u64).unwrap(); + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsr32, + rd: tmp1, + rn: rn, + immshift: immshift, + }); + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsl32, + rd: rd, + rn: rn, + immshift: opp_shift, + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Orr32, + rd: rd, + rn: rd.to_reg(), + rm: tmp1.to_reg(), + }); + } + } + } + } + + Opcode::Rotl => { + // ARM64 does not have a ROL instruction, so we always synthesize + // this as: + // + // rotl rd, rn, rm + // + // => + // + // zero-extend rn, <32-or-64> + // sub tmp1, rm, + // sub tmp1, zero, tmp1 ; neg + // lsl tmp2, rn, rm + // lsr rd, rn, tmp1 + // orr rd, rd, tmp2 + // + // For a constant amount, we can instead do: + // + // zero-extend rn, <32-or-64> + // lsl tmp2, rn, # + // lsr rd, rn, # + // orr rd, rd, tmp2 + + let ty = ty.unwrap(); + let bits = ty_bits(ty); + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg( + ctx, + inputs[0], + if bits <= 32 { + NarrowValueMode::ZeroExtend32 + } else { + NarrowValueMode::ZeroExtend64 + }, + ); + let rm = input_to_reg_immshift(ctx, inputs[1]); + + match rm { + ResultRegImmShift::Reg(reg) => { + let tmp1 = ctx.tmp(RegClass::I64, I32); + let tmp2 = 
ctx.tmp(RegClass::I64, I64); + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::Sub32, + rd: tmp1, + rn: reg, + imm12: Imm12::maybe_from_u64(bits as u64).unwrap(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub32, + rd: tmp1, + rn: zero_reg(), + rm: tmp1.to_reg(), + }); + ctx.emit(Inst::AluRRR { + alu_op: choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64), + rd: tmp2, + rn: rn, + rm: reg, + }); + ctx.emit(Inst::AluRRR { + alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64), + rd: rd, + rn: rn, + rm: tmp1.to_reg(), + }); + ctx.emit(Inst::AluRRR { + alu_op: choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64), + rd: rd, + rn: rd.to_reg(), + rm: tmp2.to_reg(), + }); + } + ResultRegImmShift::ImmShift(immshift) => { + let tmp1 = ctx.tmp(RegClass::I64, I64); + let amt = immshift.value(); + assert!(amt <= bits as u8); + let opp_shift = ImmShift::maybe_from_u64(bits as u64 - amt as u64).unwrap(); + ctx.emit(Inst::AluRRImmShift { + alu_op: choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64), + rd: tmp1, + rn: rn, + immshift: immshift, + }); + ctx.emit(Inst::AluRRImmShift { + alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64), + rd: rd, + rn: rn, + immshift: opp_shift, + }); + ctx.emit(Inst::AluRRR { + alu_op: choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64), + rd: rd, + rn: rd.to_reg(), + rm: tmp1.to_reg(), + }); + } + } + } + + Opcode::Bitrev | Opcode::Clz | Opcode::Cls => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let op = BitOp::from((op, ty.unwrap())); + ctx.emit(Inst::BitRR { rd, rn, op }); + } + + Opcode::Ctz => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let op = BitOp::from((Opcode::Bitrev, ty.unwrap())); + ctx.emit(Inst::BitRR { rd, rn, op }); + let op = BitOp::from((Opcode::Clz, ty.unwrap())); + ctx.emit(Inst::BitRR { + rd, + rn: rd.to_reg(), + op, + }); + } + + Opcode::Popcnt => { + // Lower popcount using the following algorithm: + // + // x -= (x >> 1) & 0x5555555555555555 + // x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333) + // x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f + // x += x << 8 + // x += x << 16 + // x += x << 32 + // x >> 56 + let ty = ty.unwrap(); + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let tmp = ctx.tmp(RegClass::I64, I64); + + // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then + // the rest of the code is identical to the 64-bit version. 
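+ // Worked example on one byte, x = 0b1011 (three set bits): after the
+ // first step each 2-bit field holds its own popcount (0b0110); the
+ // second step folds those into 4-bit fields (0b0011 = 3); the third
+ // step folds nibble pairs into a per-byte count. The three shifted adds
+ // then accumulate every byte's count into the top byte, which the final
+ // `lsr #56` extracts.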
+ // lsr [wx]d, [wx]n, #1 + ctx.emit(Inst::AluRRImmShift { + alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64), + rd: rd, + rn: rn, + immshift: ImmShift::maybe_from_u64(1).unwrap(), + }); + + // and xd, xd, #0x5555555555555555 + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: rd, + rn: rd.to_reg(), + imml: ImmLogic::maybe_from_u64(0x5555555555555555, I64).unwrap(), + }); + + // sub xd, xn, xd + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd: rd, + rn: rn, + rm: rd.to_reg(), + }); + + // and xt, xd, #0x3333333333333333 + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: tmp, + rn: rd.to_reg(), + imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(), + }); + + // lsr xd, xd, #2 + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsr64, + rd: rd, + rn: rd.to_reg(), + immshift: ImmShift::maybe_from_u64(2).unwrap(), + }); + + // and xd, xd, #0x3333333333333333 + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: rd, + rn: rd.to_reg(), + imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(), + }); + + // add xt, xd, xt + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: tmp, + rn: rd.to_reg(), + rm: tmp.to_reg(), + }); + + // add xt, xt, xt LSR #4 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSR, + ShiftOpShiftImm::maybe_from_shift(4).unwrap(), + ), + }); + + // and xt, xt, #0x0f0f0f0f0f0f0f0f + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: tmp, + rn: tmp.to_reg(), + imml: ImmLogic::maybe_from_u64(0x0f0f0f0f0f0f0f0f, I64).unwrap(), + }); + + // add xt, xt, xt, LSL #8 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(8).unwrap(), + ), + }); + + // add xt, xt, xt, LSL #16 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(16).unwrap(), + ), + }); + + // add xt, xt, xt, LSL #32 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(32).unwrap(), + ), + }); + + // lsr xd, xt, #56 + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsr64, + rd: rd, + rn: tmp.to_reg(), + immshift: ImmShift::maybe_from_u64(56).unwrap(), + }); + } + + Opcode::Load + | Opcode::Uload8 + | Opcode::Sload8 + | Opcode::Uload16 + | Opcode::Sload16 + | Opcode::Uload32 + | Opcode::Sload32 + | Opcode::LoadComplex + | Opcode::Uload8Complex + | Opcode::Sload8Complex + | Opcode::Uload16Complex + | Opcode::Sload16Complex + | Opcode::Uload32Complex + | Opcode::Sload32Complex => { + let off = ldst_offset(ctx.data(insn)).unwrap(); + let elem_ty = match op { + Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => { + I8 + } + Opcode::Sload16 + | Opcode::Uload16 + | Opcode::Sload16Complex + | Opcode::Uload16Complex => I16, + Opcode::Sload32 + | Opcode::Uload32 + | Opcode::Sload32Complex + | Opcode::Uload32Complex => I32, + Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0), + _ => unreachable!(), + }; + let sign_extend = match op { + Opcode::Sload8 + | Opcode::Sload8Complex + | Opcode::Sload16 + | Opcode::Sload16Complex + | Opcode::Sload32 + | Opcode::Sload32Complex => true, + _ => false, + }; + let is_float = 
ty_is_float(elem_ty); + + let mem = lower_address(ctx, elem_ty, &inputs[..], off); + let rd = output_to_reg(ctx, outputs[0]); + + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + + ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) { + (1, _, _) => Inst::ULoad8 { rd, mem, srcloc }, + (8, false, _) => Inst::ULoad8 { rd, mem, srcloc }, + (8, true, _) => Inst::SLoad8 { rd, mem, srcloc }, + (16, false, _) => Inst::ULoad16 { rd, mem, srcloc }, + (16, true, _) => Inst::SLoad16 { rd, mem, srcloc }, + (32, false, false) => Inst::ULoad32 { rd, mem, srcloc }, + (32, true, false) => Inst::SLoad32 { rd, mem, srcloc }, + (32, _, true) => Inst::FpuLoad32 { rd, mem, srcloc }, + (64, _, false) => Inst::ULoad64 { rd, mem, srcloc }, + (64, _, true) => Inst::FpuLoad64 { rd, mem, srcloc }, + _ => panic!("Unsupported size in load"), + }); + } + + Opcode::Store + | Opcode::Istore8 + | Opcode::Istore16 + | Opcode::Istore32 + | Opcode::StoreComplex + | Opcode::Istore8Complex + | Opcode::Istore16Complex + | Opcode::Istore32Complex => { + let off = ldst_offset(ctx.data(insn)).unwrap(); + let elem_ty = match op { + Opcode::Istore8 | Opcode::Istore8Complex => I8, + Opcode::Istore16 | Opcode::Istore16Complex => I16, + Opcode::Istore32 | Opcode::Istore32Complex => I32, + Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0), + _ => unreachable!(), + }; + let is_float = ty_is_float(elem_ty); + + let mem = lower_address(ctx, elem_ty, &inputs[1..], off); + let rd = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + + ctx.emit(match (ty_bits(elem_ty), is_float) { + (1, _) | (8, _) => Inst::Store8 { rd, mem, srcloc }, + (16, _) => Inst::Store16 { rd, mem, srcloc }, + (32, false) => Inst::Store32 { rd, mem, srcloc }, + (32, true) => Inst::FpuStore32 { rd, mem, srcloc }, + (64, false) => Inst::Store64 { rd, mem, srcloc }, + (64, true) => Inst::FpuStore64 { rd, mem, srcloc }, + _ => panic!("Unsupported size in store"), + }); + } + + Opcode::StackLoad | Opcode::StackStore | Opcode::StackAddr => { + panic!("Direct stack memory access not supported; should not be used by Wasm"); + } + + Opcode::HeapAddr => { + panic!("heap_addr should have been removed by legalization!"); + } + + Opcode::TableAddr => { + panic!("table_addr should have been removed by legalization!"); + } + + Opcode::Nop => { + // Nothing. + } + + Opcode::Select | Opcode::Selectif => { + let cond = if op == Opcode::Select { + let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 { + (ALUOp::SubS64, NarrowValueMode::ZeroExtend64) + } else { + (ALUOp::SubS32, NarrowValueMode::ZeroExtend32) + }; + + let rcond = input_to_reg(ctx, inputs[0], narrow_mode); + // cmp rcond, #0 + ctx.emit(Inst::AluRRR { + alu_op: cmp_op, + rd: writable_zero_reg(), + rn: rcond, + rm: zero_reg(), + }); + Cond::Ne + } else { + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + // Verification ensures that the input is always a + // single-def ifcmp. 
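+ // That lets us merge the ifcmp here and lower it directly to a
+ // flags-setting compare.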
+ let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + cond + }; + + // csel.COND rd, rn, rm + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[2], NarrowValueMode::None); + let ty = ctx.output_ty(insn, 0); + let bits = ty_bits(ty); + if ty_is_float(ty) && bits == 32 { + ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm }); + } else if ty_is_float(ty) && bits == 64 { + ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm }); + } else { + ctx.emit(Inst::CSel { cond, rd, rn, rm }); + } + } + + Opcode::Bitselect => { + let tmp = ctx.tmp(RegClass::I64, I64); + let rd = output_to_reg(ctx, outputs[0]); + let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[2], NarrowValueMode::None); + // AND rTmp, rn, rcond + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::And64, + rd: tmp, + rn, + rm: rcond, + }); + // BIC rd, rm, rcond + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::AndNot64, + rd, + rn: rm, + rm: rcond, + }); + // ORR rd, rd, rTmp + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Orr64, + rd, + rn: rd.to_reg(), + rm: tmp.to_reg(), + }); + } + + Opcode::Trueif => { + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + // Verification ensures that the input is always a + // single-def ifcmp. + let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::CSet { rd, cond }); + } + + Opcode::Trueff => { + let condcode = inst_fp_condcode(ctx.data(insn)).unwrap(); + let cond = lower_fp_condcode(condcode); + let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap(); + lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::CSet { rd, cond }); + } + + Opcode::IsNull | Opcode::IsInvalid => { + panic!("Reference types not supported"); + } + + Opcode::Copy => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ctx.input_ty(insn, 0); + ctx.emit(Inst::gen_move(rd, rn, ty)); + } + + Opcode::Bint | Opcode::Breduce | Opcode::Bextend | Opcode::Ireduce => { + // All of these ops are simply a move from a zero-extended source. + // Here is why this works, in each case: + // + // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we + // merely need to zero-extend here. + // + // - Breduce, Bextend: changing width of a boolean. We represent a + // bool as a 0 or 1, so again, this is a zero-extend / no-op. + // + // - Ireduce: changing width of an integer. Smaller ints are stored + // with undefined high-order bits, so we can simply do a copy. + + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); + let rd = output_to_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + ctx.emit(Inst::gen_move(rd, rn, ty)); + } + + Opcode::Bmask => { + // Bool is {0, 1}, so we can subtract from 0 to get all-1s. 
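+ // (For a true input, 0 - 1 wraps to all-ones; for a false input the
+ // result stays zero.)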
+ let rd = output_to_reg(ctx, outputs[0]); + let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd, + rn: zero_reg(), + rm, + }); + } + + Opcode::Bitcast => { + let rd = output_to_reg(ctx, outputs[0]); + let ity = ctx.input_ty(insn, 0); + let oty = ctx.output_ty(insn, 0); + match (ty_is_float(ity), ty_is_float(oty)) { + (true, true) => { + let narrow_mode = if ty_bits(ity) <= 32 && ty_bits(oty) <= 32 { + NarrowValueMode::ZeroExtend32 + } else { + NarrowValueMode::ZeroExtend64 + }; + let rm = input_to_reg(ctx, inputs[0], narrow_mode); + ctx.emit(Inst::gen_move(rd, rm, oty)); + } + (false, false) => { + let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::gen_move(rd, rm, oty)); + } + (false, true) => { + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); + ctx.emit(Inst::MovToVec64 { rd, rn }); + } + (true, false) => { + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::MovFromVec64 { rd, rn }); + } + } + } + + Opcode::FallthroughReturn | Opcode::Return => { + for (i, input) in inputs.iter().enumerate() { + // N.B.: according to the AArch64 ABI, the top bits of a register + // (above the bits for the value's type) are undefined, so we + // need not extend the return values. + let reg = input_to_reg(ctx, *input, NarrowValueMode::None); + let retval_reg = ctx.retval(i); + let ty = ctx.input_ty(insn, i); + ctx.emit(Inst::gen_move(retval_reg, reg, ty)); + } + // N.B.: the Ret itself is generated by the ABI. + } + + Opcode::Ifcmp | Opcode::Ffcmp => { + // An Ifcmp/Ffcmp must always be seen as a use of a brif/brff or trueif/trueff + // instruction. This will always be the case as long as the IR uses an Ifcmp/Ffcmp from + // the same block, or a dominating block. In other words, it cannot pass through a BB + // param (phi). The flags pass of the verifier will ensure this. 
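+ // Consequently, an Ifcmp/Ffcmp encountered as a lowering root indicates
+ // a bug upstream.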
+ panic!("Should never reach ifcmp as isel root!"); + } + + Opcode::Icmp => { + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + let ty = ctx.input_ty(insn, 0); + let bits = ty_bits(ty); + let narrow_mode = match (bits <= 32, is_signed) { + (true, true) => NarrowValueMode::SignExtend32, + (true, false) => NarrowValueMode::ZeroExtend32, + (false, true) => NarrowValueMode::SignExtend64, + (false, false) => NarrowValueMode::ZeroExtend64, + }; + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm)); + ctx.emit(Inst::CondSet { cond, rd }); + } + + Opcode::Fcmp => { + let condcode = inst_fp_condcode(ctx.data(insn)).unwrap(); + let cond = lower_fp_condcode(condcode); + let ty = ctx.input_ty(insn, 0); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + match ty_bits(ty) { + 32 => { + ctx.emit(Inst::FpuCmp32 { rn, rm }); + } + 64 => { + ctx.emit(Inst::FpuCmp64 { rn, rm }); + } + _ => panic!("Bad float size"), + } + ctx.emit(Inst::CondSet { cond, rd }); + } + + Opcode::JumpTableEntry | Opcode::JumpTableBase => { + panic!("Should not appear: we handle BrTable directly"); + } + + Opcode::Debugtrap => { + ctx.emit(Inst::Brk); + } + + Opcode::Trap => { + let trap_info = (ctx.srcloc(insn), inst_trapcode(ctx.data(insn)).unwrap()); + ctx.emit(Inst::Udf { trap_info }) + } + + Opcode::Trapif | Opcode::Trapff => { + let trap_info = (ctx.srcloc(insn), inst_trapcode(ctx.data(insn)).unwrap()); + + let cond = if op == Opcode::Trapif { + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + + // Verification ensures that the input is always a single-def ifcmp. + let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + cond + } else { + let condcode = inst_fp_condcode(ctx.data(insn)).unwrap(); + let cond = lower_fp_condcode(condcode); + + // Verification ensures that the input is always a + // single-def ffcmp. + let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap(); + lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); + cond + }; + + // Branch around the break instruction with inverted cond. Go straight to lowered + // one-target form; this is logically part of a single-in single-out template lowering. 
+ let cond = cond.invert(); + ctx.emit(Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(8), + kind: CondBrKind::Cond(cond), + }); + + ctx.emit(Inst::Udf { trap_info }) + } + + Opcode::Safepoint => { + panic!("safepoint support not implemented!"); + } + + Opcode::Trapz | Opcode::Trapnz => { + panic!("trapz / trapnz should have been removed by legalization!"); + } + + Opcode::ResumableTrap => { + panic!("Resumable traps not supported"); + } + + Opcode::FuncAddr => { + let rd = output_to_reg(ctx, outputs[0]); + let extname = ctx.call_target(insn).unwrap().clone(); + let loc = ctx.srcloc(insn); + ctx.emit(Inst::LoadExtName { + rd, + name: extname, + srcloc: loc, + offset: 0, + }); + } + + Opcode::GlobalValue => { + panic!("global_value should have been removed by legalization!"); + } + + Opcode::SymbolValue => { + let rd = output_to_reg(ctx, outputs[0]); + let (extname, offset) = ctx.symbol_value(insn).unwrap(); + let extname = extname.clone(); + let loc = ctx.srcloc(insn); + ctx.emit(Inst::LoadExtName { + rd, + name: extname, + srcloc: loc, + offset, + }); + } + + Opcode::Call | Opcode::CallIndirect => { + let loc = ctx.srcloc(insn); + let (abi, inputs) = match op { + Opcode::Call => { + let extname = ctx.call_target(insn).unwrap(); + let extname = extname.clone(); + // HACK: get the function address with an Abs8 reloc in the constant pool. + //let tmp = ctx.tmp(RegClass::I64, I64); + //ctx.emit(Inst::LoadExtName { + //rd: tmp, + //name: extname, + //srcloc: loc, + //offset: 0, + //}); + let sig = ctx.call_sig(insn).unwrap(); + assert!(inputs.len() == sig.params.len()); + assert!(outputs.len() == sig.returns.len()); + (ARM64ABICall::from_func(sig, &extname, loc), &inputs[..]) + //(ARM64ABICall::from_ptr(sig, tmp.to_reg(), loc), &inputs[..]) + } + Opcode::CallIndirect => { + let ptr = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); + let sig = ctx.call_sig(insn).unwrap(); + assert!(inputs.len() - 1 == sig.params.len()); + assert!(outputs.len() == sig.returns.len()); + (ARM64ABICall::from_ptr(sig, ptr, loc, op), &inputs[1..]) + } + _ => unreachable!(), + }; + + for inst in abi.gen_stack_pre_adjust().into_iter() { + ctx.emit(inst); + } + assert!(inputs.len() == abi.num_args()); + for (i, input) in inputs.iter().enumerate() { + let arg_reg = input_to_reg(ctx, *input, NarrowValueMode::None); + ctx.emit(abi.gen_copy_reg_to_arg(i, arg_reg)); + } + for inst in abi.gen_call().into_iter() { + ctx.emit(inst); + } + for (i, output) in outputs.iter().enumerate() { + let retval_reg = output_to_reg(ctx, *output); + ctx.emit(abi.gen_copy_retval_to_reg(i, retval_reg)); + } + for inst in abi.gen_stack_post_adjust().into_iter() { + ctx.emit(inst); + } + } + + Opcode::GetPinnedReg + | Opcode::SetPinnedReg + | Opcode::Spill + | Opcode::Fill + | Opcode::FillNop + | Opcode::Regmove + | Opcode::CopySpecial + | Opcode::CopyToSsa + | Opcode::CopyNop + | Opcode::AdjustSpDown + | Opcode::AdjustSpUpImm + | Opcode::AdjustSpDownImm + | Opcode::IfcmpSp + | Opcode::Regspill + | Opcode::Regfill => { + panic!("Unused opcode should not be encountered."); + } + + Opcode::Jump + | Opcode::Fallthrough + | Opcode::Brz + | Opcode::Brnz + | Opcode::BrIcmp + | Opcode::Brif + | Opcode::Brff + | Opcode::IndirectJumpTableBr + | Opcode::BrTable => { + panic!("Branch opcode reached non-branch lowering logic!"); + } + + Opcode::Vconst + | Opcode::Shuffle + | Opcode::Vsplit + | Opcode::Vconcat + | Opcode::Vselect + | Opcode::VanyTrue + | Opcode::VallTrue + | Opcode::Splat + | Opcode::Insertlane + | 
Opcode::Extractlane + | Opcode::RawBitcast + | Opcode::ScalarToVector + | Opcode::Swizzle + | Opcode::Uload8x8 + | Opcode::Sload8x8 + | Opcode::Uload16x4 + | Opcode::Sload16x4 + | Opcode::Uload32x2 + | Opcode::Sload32x2 => { + // TODO + panic!("Vector ops not implemented."); + } + + Opcode::Isplit | Opcode::Iconcat => panic!("Vector ops not supported."), + Opcode::Imax | Opcode::Imin | Opcode::Umin | Opcode::Umax => { + panic!("Vector ops not supported.") + } + + Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => { + let bits = ty_bits(ctx.output_ty(insn, 0)); + let fpu_op = match (op, bits) { + (Opcode::Fadd, 32) => FPUOp2::Add32, + (Opcode::Fadd, 64) => FPUOp2::Add64, + (Opcode::Fsub, 32) => FPUOp2::Sub32, + (Opcode::Fsub, 64) => FPUOp2::Sub64, + (Opcode::Fmul, 32) => FPUOp2::Mul32, + (Opcode::Fmul, 64) => FPUOp2::Mul64, + (Opcode::Fdiv, 32) => FPUOp2::Div32, + (Opcode::Fdiv, 64) => FPUOp2::Div64, + (Opcode::Fmin, 32) => FPUOp2::Min32, + (Opcode::Fmin, 64) => FPUOp2::Min64, + (Opcode::Fmax, 32) => FPUOp2::Max32, + (Opcode::Fmax, 64) => FPUOp2::Max64, + _ => panic!("Unknown op/bits combination"), + }; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm }); + } + + Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => { + let bits = ty_bits(ctx.output_ty(insn, 0)); + let fpu_op = match (op, bits) { + (Opcode::Sqrt, 32) => FPUOp1::Sqrt32, + (Opcode::Sqrt, 64) => FPUOp1::Sqrt64, + (Opcode::Fneg, 32) => FPUOp1::Neg32, + (Opcode::Fneg, 64) => FPUOp1::Neg64, + (Opcode::Fabs, 32) => FPUOp1::Abs32, + (Opcode::Fabs, 64) => FPUOp1::Abs64, + (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"), + (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64, + (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32, + (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"), + _ => panic!("Unknown op/bits combination"), + }; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuRR { fpu_op, rd, rn }); + } + + Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => { + let bits = ty_bits(ctx.output_ty(insn, 0)); + let op = match (op, bits) { + (Opcode::Ceil, 32) => FpuRoundMode::Plus32, + (Opcode::Ceil, 64) => FpuRoundMode::Plus64, + (Opcode::Floor, 32) => FpuRoundMode::Minus32, + (Opcode::Floor, 64) => FpuRoundMode::Minus64, + (Opcode::Trunc, 32) => FpuRoundMode::Zero32, + (Opcode::Trunc, 64) => FpuRoundMode::Zero64, + (Opcode::Nearest, 32) => FpuRoundMode::Nearest32, + (Opcode::Nearest, 64) => FpuRoundMode::Nearest64, + _ => panic!("Unknown op/bits combination"), + }; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuRound { op, rd, rn }); + } + + Opcode::Fma => { + let bits = ty_bits(ctx.output_ty(insn, 0)); + let fpu_op = match bits { + 32 => FPUOp3::MAdd32, + 64 => FPUOp3::MAdd64, + _ => panic!("Unknown op size"), + }; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let ra = input_to_reg(ctx, inputs[2], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuRRRR { + fpu_op, + rn, + rm, + ra, + rd, + }); + } + + Opcode::Fcopysign => { + // Copy the sign bit from inputs[1] to inputs[0]. 
We use the following sequence: + // + // (64 bits for example, 32-bit sequence is analogous): + // + // MOV Xtmp1, Dinput0 + // MOV Xtmp2, Dinput1 + // AND Xtmp2, 0x8000_0000_0000_0000 + // ORR Xtmp1, Xtmp1, Xtmp2 + // MOV Doutput, Xtmp1 + + let ty = ctx.output_ty(insn, 0); + let bits = ty_bits(ty); + assert!(bits == 32 || bits == 64); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + let tmp1 = ctx.tmp(RegClass::I64, I64); + let tmp2 = ctx.tmp(RegClass::I64, I64); + ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn }); + ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm }); + let imml = if bits == 32 { + ImmLogic::from_raw( + /* value = */ 0x8000_0000, + /* n = */ false, + /* r = */ 1, + /* s = */ 0, + ) + } else { + ImmLogic::from_raw( + /* value = */ 0x8000_0000_0000_0000, + /* n = */ true, + /* r = */ 1, + /* s = */ 0, + ) + }; + let alu_op = choose_32_64(ty, ALUOp::And32, ALUOp::And64); + ctx.emit(Inst::AluRRImmLogic { + alu_op, + rd: tmp2, + rn: tmp2.to_reg(), + imml, + }); + let alu_op = choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64); + ctx.emit(Inst::AluRRR { + alu_op, + rd: tmp1, + rn: tmp1.to_reg(), + rm: tmp2.to_reg(), + }); + ctx.emit(Inst::MovToVec64 { + rd, + rn: tmp1.to_reg(), + }); + } + + Opcode::FcvtToUint | Opcode::FcvtToSint => { + let in_bits = ty_bits(ctx.input_ty(insn, 0)); + let out_bits = ty_bits(ctx.output_ty(insn, 0)); + let signed = op == Opcode::FcvtToSint; + let op = match (signed, in_bits, out_bits) { + (false, 32, 32) => FpuToIntOp::F32ToU32, + (true, 32, 32) => FpuToIntOp::F32ToI32, + (false, 32, 64) => FpuToIntOp::F32ToU64, + (true, 32, 64) => FpuToIntOp::F32ToI64, + (false, 64, 32) => FpuToIntOp::F64ToU32, + (true, 64, 32) => FpuToIntOp::F64ToI32, + (false, 64, 64) => FpuToIntOp::F64ToU64, + (true, 64, 64) => FpuToIntOp::F64ToI64, + _ => panic!("Unknown input/output-bits combination"), + }; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuToInt { op, rd, rn }); + } + + Opcode::FcvtFromUint | Opcode::FcvtFromSint => { + let in_bits = ty_bits(ctx.input_ty(insn, 0)); + let out_bits = ty_bits(ctx.output_ty(insn, 0)); + let signed = op == Opcode::FcvtFromSint; + let op = match (signed, in_bits, out_bits) { + (false, 32, 32) => IntToFpuOp::U32ToF32, + (true, 32, 32) => IntToFpuOp::I32ToF32, + (false, 32, 64) => IntToFpuOp::U32ToF64, + (true, 32, 64) => IntToFpuOp::I32ToF64, + (false, 64, 32) => IntToFpuOp::U64ToF32, + (true, 64, 32) => IntToFpuOp::I64ToF32, + (false, 64, 64) => IntToFpuOp::U64ToF64, + (true, 64, 64) => IntToFpuOp::I64ToF64, + _ => panic!("Unknown input/output-bits combination"), + }; + let narrow_mode = match (signed, in_bits) { + (false, 32) => NarrowValueMode::ZeroExtend32, + (true, 32) => NarrowValueMode::SignExtend32, + (false, 64) => NarrowValueMode::ZeroExtend64, + (true, 64) => NarrowValueMode::SignExtend64, + _ => panic!("Unknown input size"), + }; + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::IntToFpu { op, rd, rn }); + } + + Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => { + let in_ty = ctx.input_ty(insn, 0); + let in_bits = ty_bits(in_ty); + let out_ty = ctx.output_ty(insn, 0); + let out_bits = ty_bits(out_ty); + let out_signed = op == Opcode::FcvtToSintSat; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + + // FIMM 
Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX + // FMIN Vtmp2, Vin, Vtmp1 + // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN + // FMAX Vtmp2, Vtmp2, Vtmp + // FCMP Vin, Vin + // FCSEL Vtmp2, Vtmp1, Vtmp2, NE // on NaN, select 0 + // convert Rout, Vtmp2 + + assert!(in_bits == 32 || in_bits == 64); + assert!(out_bits == 32 || out_bits == 64); + + let min: f64 = match (out_bits, out_signed) { + (32, true) => std::i32::MIN as f64, + (32, false) => 0.0, + (64, true) => std::i64::MIN as f64, + (64, false) => 0.0, + _ => unreachable!(), + }; + + let max = match (out_bits, out_signed) { + (32, true) => std::i32::MAX as f64, + (32, false) => std::u32::MAX as f64, + (64, true) => std::i64::MAX as f64, + (64, false) => std::u64::MAX as f64, + _ => unreachable!(), + }; + + let rtmp1 = ctx.tmp(RegClass::V128, in_ty); + let rtmp2 = ctx.tmp(RegClass::V128, in_ty); + + if in_bits == 32 { + ctx.emit(Inst::LoadFpuConst32 { + rd: rtmp1, + const_data: max as f32, + }); + } else { + ctx.emit(Inst::LoadFpuConst64 { + rd: rtmp1, + const_data: max, + }); + } + ctx.emit(Inst::FpuRRR { + fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64), + rd: rtmp2, + rn: rn, + rm: rtmp1.to_reg(), + }); + if in_bits == 32 { + ctx.emit(Inst::LoadFpuConst32 { + rd: rtmp1, + const_data: min as f32, + }); + } else { + ctx.emit(Inst::LoadFpuConst64 { + rd: rtmp1, + const_data: min, + }); + } + ctx.emit(Inst::FpuRRR { + fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64), + rd: rtmp2, + rn: rtmp2.to_reg(), + rm: rtmp1.to_reg(), + }); + if in_bits == 32 { + ctx.emit(Inst::FpuCmp32 { rn: rn, rm: rn }); + ctx.emit(Inst::FpuCSel32 { + rd: rtmp2, + rn: rtmp1.to_reg(), + rm: rtmp2.to_reg(), + cond: Cond::Ne, + }); + } else { + ctx.emit(Inst::FpuCmp64 { rn: rn, rm: rn }); + ctx.emit(Inst::FpuCSel64 { + rd: rtmp2, + rn: rtmp1.to_reg(), + rm: rtmp2.to_reg(), + cond: Cond::Ne, + }); + } + + let cvt = match (in_bits, out_bits, out_signed) { + (32, 32, false) => FpuToIntOp::F32ToU32, + (32, 32, true) => FpuToIntOp::F32ToI32, + (32, 64, false) => FpuToIntOp::F32ToU64, + (32, 64, true) => FpuToIntOp::F32ToI64, + (64, 32, false) => FpuToIntOp::F64ToU32, + (64, 32, true) => FpuToIntOp::F64ToI32, + (64, 64, false) => FpuToIntOp::F64ToU64, + (64, 64, true) => FpuToIntOp::F64ToI64, + _ => unreachable!(), + }; + ctx.emit(Inst::FpuToInt { + op: cvt, + rd, + rn: rtmp2.to_reg(), + }); + } + + Opcode::IaddImm + | Opcode::ImulImm + | Opcode::UdivImm + | Opcode::SdivImm + | Opcode::UremImm + | Opcode::SremImm + | Opcode::IrsubImm + | Opcode::IaddCin + | Opcode::IaddIfcin + | Opcode::IaddCout + | Opcode::IaddIfcout + | Opcode::IaddCarry + | Opcode::IaddIfcarry + | Opcode::IsubBin + | Opcode::IsubIfbin + | Opcode::IsubBout + | Opcode::IsubIfbout + | Opcode::IsubBorrow + | Opcode::IsubIfborrow + | Opcode::BandImm + | Opcode::BorImm + | Opcode::BxorImm + | Opcode::RotlImm + | Opcode::RotrImm + | Opcode::IshlImm + | Opcode::UshrImm + | Opcode::SshrImm + | Opcode::IcmpImm + | Opcode::IfcmpImm => { + panic!("ALU+imm and ALU+carry ops should not appear here!"); + } + + #[cfg(feature = "x86")] + Opcode::X86Udivmodx + | Opcode::X86Sdivmodx + | Opcode::X86Umulx + | Opcode::X86Smulx + | Opcode::X86Cvtt2si + | Opcode::X86Fmin + | Opcode::X86Fmax + | Opcode::X86Push + | Opcode::X86Pop + | Opcode::X86Bsr + | Opcode::X86Bsf + | Opcode::X86Pshufd + | Opcode::X86Pshufb + | Opcode::X86Pextr + | Opcode::X86Pinsr + | Opcode::X86Insertps + | Opcode::X86Movsd + | Opcode::X86Movlhps + | Opcode::X86Psll + | Opcode::X86Psrl + | Opcode::X86Psra + | Opcode::X86Ptest + | 
Opcode::X86Pmaxs + | Opcode::X86Pmaxu + | Opcode::X86Pmins + | Opcode::X86Pminu + | Opcode::X86ElfTlsGetAddr + | Opcode::X86MachoTlsGetAddr => { + panic!("x86-specific opcode in supposedly arch-neutral IR!"); + } + + Opcode::AvgRound => unimplemented!(), + Opcode::TlsValue => unimplemented!(), + } +} + +//============================================================================= +// Helpers for instruction lowering. +fn ty_bits(ty: Type) -> usize { + match ty { + B1 => 1, + B8 | I8 => 8, + B16 | I16 => 16, + B32 | I32 | F32 => 32, + B64 | I64 | F64 => 64, + B128 | I128 => 128, + IFLAGS | FFLAGS => 32, + _ => panic!("ty_bits() on unknown type: {:?}", ty), + } +} + +fn ty_is_int(ty: Type) -> bool { + match ty { + B1 | B8 | I8 | B16 | I16 | B32 | I32 | B64 | I64 => true, + F32 | F64 | B128 | I128 => false, + IFLAGS | FFLAGS => panic!("Unexpected flags type"), + _ => panic!("ty_is_int() on unknown type: {:?}", ty), + } +} + +fn ty_is_float(ty: Type) -> bool { + !ty_is_int(ty) +} + +fn choose_32_64(ty: Type, op32: T, op64: T) -> T { + let bits = ty_bits(ty); + if bits <= 32 { + op32 + } else if bits == 64 { + op64 + } else { + panic!("choose_32_64 on > 64 bits!") + } +} + +fn branch_target(data: &InstructionData) -> Option { + match data { + &InstructionData::BranchIcmp { destination, .. } + | &InstructionData::Branch { destination, .. } + | &InstructionData::BranchInt { destination, .. } + | &InstructionData::Jump { destination, .. } + | &InstructionData::BranchTable { destination, .. } + | &InstructionData::BranchFloat { destination, .. } => Some(destination), + _ => { + assert!(!data.opcode().is_branch()); + None + } + } +} + +fn ldst_offset(data: &InstructionData) -> Option { + match data { + &InstructionData::Load { offset, .. } + | &InstructionData::StackLoad { offset, .. } + | &InstructionData::LoadComplex { offset, .. } + | &InstructionData::Store { offset, .. } + | &InstructionData::StackStore { offset, .. } + | &InstructionData::StoreComplex { offset, .. } => Some(offset.into()), + _ => None, + } +} + +fn inst_condcode(data: &InstructionData) -> Option { + match data { + &InstructionData::IntCond { cond, .. } + | &InstructionData::BranchIcmp { cond, .. } + | &InstructionData::IntCompare { cond, .. } + | &InstructionData::IntCondTrap { cond, .. } + | &InstructionData::BranchInt { cond, .. } + | &InstructionData::IntSelect { cond, .. } + | &InstructionData::IntCompareImm { cond, .. } => Some(cond), + _ => None, + } +} + +fn inst_fp_condcode(data: &InstructionData) -> Option { + match data { + &InstructionData::BranchFloat { cond, .. } + | &InstructionData::FloatCompare { cond, .. } + | &InstructionData::FloatCond { cond, .. } + | &InstructionData::FloatCondTrap { cond, .. } => Some(cond), + _ => None, + } +} + +fn inst_trapcode(data: &InstructionData) -> Option { + match data { + &InstructionData::Trap { code, .. } + | &InstructionData::CondTrap { code, .. } + | &InstructionData::IntCondTrap { code, .. } + | &InstructionData::FloatCondTrap { code, .. } => Some(code), + _ => None, + } +} + +/// Checks for an instance of `op` feeding the given input. Marks as merged (decrementing refcount) if so. +fn maybe_input_insn>(c: &mut C, input: InsnInput, op: Opcode) -> Option { + if let InsnInputSource::Output(out) = input_source(c, input) { + let data = c.data(out.insn); + if data.opcode() == op { + c.merged(out.insn); + return Some(out.insn); + } + } + None +} + +/// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g., +/// Bint or a bitcast). 
Marks one or both as merged if so, as appropriate. +/// +/// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it +/// a bit more generic. +fn maybe_input_insn_via_conv>( + c: &mut C, + input: InsnInput, + op: Opcode, + conv: Opcode, +) -> Option { + if let Some(ret) = maybe_input_insn(c, input, op) { + return Some(ret); + } + + if let InsnInputSource::Output(out) = input_source(c, input) { + let data = c.data(out.insn); + if data.opcode() == conv { + let conv_insn = out.insn; + let conv_input = InsnInput { + insn: conv_insn, + input: 0, + }; + if let Some(inner) = maybe_input_insn(c, conv_input, op) { + c.merged(conv_insn); + return Some(inner); + } + } + } + None +} + +fn lower_icmp_or_ifcmp_to_flags>(ctx: &mut C, insn: IRInst, is_signed: bool) { + let ty = ctx.input_ty(insn, 0); + let bits = ty_bits(ty); + let narrow_mode = match (bits <= 32, is_signed) { + (true, true) => NarrowValueMode::SignExtend32, + (true, false) => NarrowValueMode::ZeroExtend32, + (false, true) => NarrowValueMode::SignExtend64, + (false, false) => NarrowValueMode::ZeroExtend64, + }; + let inputs = [ + InsnInput { + insn: insn, + input: 0, + }, + InsnInput { + insn: insn, + input: 1, + }, + ]; + let ty = ctx.input_ty(insn, 0); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode); + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rd = writable_zero_reg(); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); +} + +fn lower_fcmp_or_ffcmp_to_flags>(ctx: &mut C, insn: IRInst) { + let ty = ctx.input_ty(insn, 0); + let bits = ty_bits(ty); + let inputs = [ + InsnInput { + insn: insn, + input: 0, + }, + InsnInput { + insn: insn, + input: 1, + }, + ]; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + match bits { + 32 => { + ctx.emit(Inst::FpuCmp32 { rn, rm }); + } + 64 => { + ctx.emit(Inst::FpuCmp64 { rn, rm }); + } + _ => panic!("Unknown float size"), + } +} + +//============================================================================= +// Lowering-backend trait implementation. + +impl LowerBackend for Arm64Backend { + type MInst = Inst; + + fn lower>(&self, ctx: &mut C, ir_inst: IRInst) { + lower_insn_to_regs(ctx, ir_inst); + } + + fn lower_branch_group>( + &self, + ctx: &mut C, + branches: &[IRInst], + targets: &[BlockIndex], + fallthrough: Option, + ) { + // A block should end with at most two branches. The first may be a + // conditional branch; a conditional branch can be followed only by an + // unconditional branch or fallthrough. Otherwise, if only one branch, + // it may be an unconditional branch, a fallthrough, a return, or a + // trap. These conditions are verified by `is_ebb_basic()` during the + // verifier pass. + assert!(branches.len() <= 2); + + if branches.len() == 2 { + // Must be a conditional branch followed by an unconditional branch. + let op0 = ctx.data(branches[0]).opcode(); + let op1 = ctx.data(branches[1]).opcode(); + + //println!( + // "lowering two-branch group: opcodes are {:?} and {:?}", + // op0, op1 + //); + + assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough); + let taken = BranchTarget::Block(targets[0]); + let not_taken = match op1 { + Opcode::Jump => BranchTarget::Block(targets[1]), + Opcode::Fallthrough => BranchTarget::Block(fallthrough.unwrap()), + _ => unreachable!(), // assert above. 
+ }; + match op0 { + Opcode::Brz | Opcode::Brnz => { + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + if let Some(icmp_insn) = + maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint) + { + let condcode = inst_condcode(ctx.data(icmp_insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + let negated = op0 == Opcode::Brz; + let cond = if negated { cond.invert() } else { cond }; + + lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } else if let Some(fcmp_insn) = + maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint) + { + let condcode = inst_fp_condcode(ctx.data(fcmp_insn)).unwrap(); + let cond = lower_fp_condcode(condcode); + let negated = op0 == Opcode::Brz; + let cond = if negated { cond.invert() } else { cond }; + + lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } else { + let rt = input_to_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + NarrowValueMode::ZeroExtend64, + ); + let kind = match op0 { + Opcode::Brz => CondBrKind::Zero(rt), + Opcode::Brnz => CondBrKind::NotZero(rt), + _ => unreachable!(), + }; + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind, + }); + } + } + Opcode::BrIcmp => { + let condcode = inst_condcode(ctx.data(branches[0])).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + let ty = ctx.input_ty(branches[0], 0); + let bits = ty_bits(ty); + let narrow_mode = match (bits <= 32, is_signed) { + (true, true) => NarrowValueMode::SignExtend32, + (true, false) => NarrowValueMode::ZeroExtend32, + (false, true) => NarrowValueMode::SignExtend64, + (false, false) => NarrowValueMode::ZeroExtend64, + }; + let rn = input_to_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + narrow_mode, + ); + let rm = input_to_rse_imm12( + ctx, + InsnInput { + insn: branches[0], + input: 1, + }, + narrow_mode, + ); + + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rd = writable_zero_reg(); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } + + Opcode::Brif => { + let condcode = inst_condcode(ctx.data(branches[0])).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + if let Some(ifcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ifcmp) { + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } else { + // If the ifcmp result is actually placed in a + // register, we need to move it back into the flags. 
+ let rn = input_to_reg(ctx, flag_input, NarrowValueMode::None); + ctx.emit(Inst::MovToNZCV { rn }); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } + } + + Opcode::Brff => { + let condcode = inst_fp_condcode(ctx.data(branches[0])).unwrap(); + let cond = lower_fp_condcode(condcode); + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + if let Some(ffcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ffcmp) { + lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } else { + // If the ffcmp result is actually placed in a + // register, we need to move it back into the flags. + let rn = input_to_reg(ctx, flag_input, NarrowValueMode::None); + ctx.emit(Inst::MovToNZCV { rn }); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } + } + + _ => unimplemented!(), + } + } else { + // Must be an unconditional branch or an indirect branch. + let op = ctx.data(branches[0]).opcode(); + match op { + Opcode::Jump | Opcode::Fallthrough => { + assert!(branches.len() == 1); + // In the Fallthrough case, the machine-independent driver + // fills in `targets[0]` with our fallthrough block, so this + // is valid for both Jump and Fallthrough. + ctx.emit(Inst::Jump { + dest: BranchTarget::Block(targets[0]), + }); + } + Opcode::BrTable => { + // Expand `br_table index, default, JT` to: + // + // subs idx, #jt_size + // b.hs default + // adr vTmp1, PC+16 + // ldr vTmp2, [vTmp1, idx, lsl #2] + // add vTmp2, vTmp2, vTmp1 + // br vTmp2 + // [jumptable offsets relative to JT base] + let jt_size = targets.len() - 1; + assert!(jt_size <= std::u32::MAX as usize); + let ridx = input_to_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + NarrowValueMode::ZeroExtend32, + ); + + let rtmp1 = ctx.tmp(RegClass::I64, I32); + let rtmp2 = ctx.tmp(RegClass::I64, I32); + + // Bounds-check and branch to default. + if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) { + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::SubS32, + rd: writable_zero_reg(), + rn: ridx, + imm12, + }); + } else { + lower_constant_u64(ctx, rtmp1, jt_size as u64); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::SubS32, + rd: writable_zero_reg(), + rn: ridx, + rm: rtmp1.to_reg(), + }); + } + let default_target = BranchTarget::Block(targets[0]); + ctx.emit(Inst::CondBrLowered { + kind: CondBrKind::Cond(Cond::Hs), // unsigned >= + target: default_target.clone(), + }); + + // Emit the compound instruction that does: + // + // adr rA, jt + // ldrsw rB, [rA, rIndex, UXTW 2] + // add rA, rA, rB + // br rA + // [jt entries] + // + // This must be *one* instruction in the vcode because + // we cannot allow regalloc to insert any spills/fills + // in the middle of the sequence; otherwise, the ADR's + // PC-rel offset to the jumptable would be incorrect. + // (The alternative is to introduce a relocation pass + // for inlined jumptables, which is much worse, IMHO.) 
+ + let jt_targets: Vec = targets + .iter() + .skip(1) + .map(|bix| BranchTarget::Block(*bix)) + .collect(); + let targets_for_term: Vec = targets.to_vec(); + ctx.emit(Inst::JTSequence { + ridx, + rtmp1, + rtmp2, + targets: jt_targets, + targets_for_term, + }); + } + + _ => panic!("Unknown branch type!"), + } + } + } +} diff --git a/cranelift/codegen/src/isa/arm64/mod.rs b/cranelift/codegen/src/isa/arm64/mod.rs index 8f0324904b86..7f4b9ecaa68e 100644 --- a/cranelift/codegen/src/isa/arm64/mod.rs +++ b/cranelift/codegen/src/isa/arm64/mod.rs @@ -1,2 +1,6 @@ mod abi; mod inst; +mod lower; + +/// Placeholder for later implementation. +pub struct Arm64Backend {} From a0e629ecfb74c8f7d5dd19a5ba3b6d26549f78d7 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 13:27:48 -0700 Subject: [PATCH 07/12] ARM64 backend, part 7 / 11: Arm64Backend toplevel. This patch ties together the previously-committed pieces to implement the `MachBackend` trait for ARM64. --- cranelift/codegen/src/isa/arm64/mod.rs | 217 ++++++++++++++++++++++++- 1 file changed, 215 insertions(+), 2 deletions(-) diff --git a/cranelift/codegen/src/isa/arm64/mod.rs b/cranelift/codegen/src/isa/arm64/mod.rs index 7f4b9ecaa68e..fb3543933228 100644 --- a/cranelift/codegen/src/isa/arm64/mod.rs +++ b/cranelift/codegen/src/isa/arm64/mod.rs @@ -1,6 +1,219 @@ +//! ARM 64-bit Instruction Set Architecture. + +use crate::ir::Function; +use crate::isa::Builder as IsaBuilder; +use crate::isa::TargetIsa; +use crate::machinst::{ + compile, MachBackend, MachCompileResult, ShowWithRRU, TargetIsaAdapter, VCode, +}; +use crate::result::CodegenResult; +use crate::settings; + +use alloc::boxed::Box; +use std::str::FromStr; + +use regalloc::RealRegUniverse; +use target_lexicon::Triple; + +// New backend: mod abi; mod inst; mod lower; -/// Placeholder for later implementation. -pub struct Arm64Backend {} +use inst::create_reg_universe; + +/// An ARM64 backend. +pub struct Arm64Backend { + flags: settings::Flags, +} + +impl Arm64Backend { + /// Create a new ARM64 backend with the given (shared) flags. + pub fn new_with_flags(flags: settings::Flags) -> Arm64Backend { + Arm64Backend { flags } + } + + fn compile_vcode(&self, mut func: Function, flags: &settings::Flags) -> VCode { + // This performs lowering to VCode, register-allocates the code, computes + // block layout and finalizes branches. The result is ready for binary emission. + let abi = Box::new(abi::ARM64ABIBody::new(&func)); + compile::compile::(&mut func, self, abi, flags) + } +} + +impl MachBackend for Arm64Backend { + fn compile_function( + &self, + func: Function, + want_disasm: bool, + ) -> CodegenResult { + let flags = self.flags(); + let vcode = self.compile_vcode(func, flags); + let sections = vcode.emit(); + let frame_size = vcode.frame_size(); + + let disasm = if want_disasm { + Some(vcode.show_rru(Some(&create_reg_universe()))) + } else { + None + }; + + Ok(MachCompileResult { + sections, + frame_size, + disasm, + }) + } + + fn name(&self) -> &'static str { + "arm64" + } + + fn triple(&self) -> Triple { + FromStr::from_str("arm64").unwrap() + } + + fn flags(&self) -> &settings::Flags { + &self.flags + } + + fn reg_universe(&self) -> RealRegUniverse { + create_reg_universe() + } +} + +/// Create a new `isa::Builder`. 
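+/// The resulting builder constructs an `Arm64Backend` and wraps it in a
+/// `TargetIsaAdapter`, so it can be driven through the existing `TargetIsa`
+/// interface.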
+pub fn isa_builder(triple: Triple) -> IsaBuilder { + IsaBuilder { + triple, + setup: settings::builder(), + constructor: isa_constructor, + } +} + +fn isa_constructor( + _: Triple, + shared_flags: settings::Flags, + _arch_flag_builder: settings::Builder, +) -> Box { + let backend = Arm64Backend::new_with_flags(shared_flags); + Box::new(TargetIsaAdapter::new(backend)) +} + +#[cfg(test)] +mod test { + use super::*; + use crate::binemit::{NullRelocSink, NullStackmapSink, NullTrapSink}; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::types::*; + use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature}; + use crate::isa::CallConv; + use crate::settings; + use crate::settings::Configurable; + + #[test] + fn test_compile_function() { + let name = ExternalName::testcase("test0"); + let mut sig = Signature::new(CallConv::SystemV); + sig.params.push(AbiParam::new(I32)); + sig.returns.push(AbiParam::new(I32)); + let mut func = Function::with_name_signature(name, sig); + + let bb0 = func.dfg.make_block(); + let arg0 = func.dfg.append_block_param(bb0, I32); + + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(bb0); + let v0 = pos.ins().iconst(I32, 0x1234); + let v1 = pos.ins().iadd(arg0, v0); + pos.ins().return_(&[v1]); + + let mut shared_flags = settings::builder(); + shared_flags.set("opt_level", "none").unwrap(); + let backend = Arm64Backend::new_with_flags(settings::Flags::new(shared_flags)); + let sections = backend.compile_function(func, false).unwrap().sections; + let code = §ions.sections[0].data; + + // stp x29, x30, [sp, #-16]! + // mov x29, sp + // mov x1, #0x1234 + // add w0, w0, w1 + // mov sp, x29 + // ldp x29, x30, [sp], #16 + // ret + let golden = vec![ + 0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0x81, 0x46, 0x82, 0xd2, 0x00, 0x00, + 0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, + ]; + + assert_eq!(code, &golden); + } + + #[test] + fn test_branch_lowering() { + let name = ExternalName::testcase("test0"); + let mut sig = Signature::new(CallConv::SystemV); + sig.params.push(AbiParam::new(I32)); + sig.returns.push(AbiParam::new(I32)); + let mut func = Function::with_name_signature(name, sig); + + let bb0 = func.dfg.make_block(); + let arg0 = func.dfg.append_block_param(bb0, I32); + let bb1 = func.dfg.make_block(); + let bb2 = func.dfg.make_block(); + let bb3 = func.dfg.make_block(); + + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(bb0); + let v0 = pos.ins().iconst(I32, 0x1234); + let v1 = pos.ins().iadd(arg0, v0); + pos.ins().brnz(v1, bb1, &[]); + pos.ins().jump(bb2, &[]); + pos.insert_block(bb1); + pos.ins().brnz(v1, bb2, &[]); + pos.ins().jump(bb3, &[]); + pos.insert_block(bb2); + let v2 = pos.ins().iadd(v1, v0); + pos.ins().brnz(v2, bb2, &[]); + pos.ins().jump(bb1, &[]); + pos.insert_block(bb3); + let v3 = pos.ins().isub(v1, v0); + pos.ins().return_(&[v3]); + + let mut shared_flags = settings::builder(); + shared_flags.set("opt_level", "none").unwrap(); + let backend = Arm64Backend::new_with_flags(settings::Flags::new(shared_flags)); + let result = backend + .compile_function(func, /* want_disasm = */ false) + .unwrap(); + let code = &result.sections.sections[0].data; + + // stp x29, x30, [sp, #-16]! + // mov x29, sp + // mov x1, x0 + // mov x0, #0x1234 + // add w1, w1, w0 + // mov w2, w1 + // cbz x2, ... + // mov w2, w1 + // cbz x2, ... + // sub w0, w1, w0 + // mov sp, x29 + // ldp x29, x30, [sp], #16 + // ret + // add w2, w1, w0 + // mov w2, w2 + // cbnz x2, ... 
<---- compound branch (cond / uncond) + // b ... <---- + + let golden = vec![ + 0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0xe1, 0x03, 0x00, 0xaa, 0x80, 0x46, + 0x82, 0xd2, 0x21, 0x00, 0x00, 0x0b, 0xe2, 0x03, 0x01, 0x2a, 0xe2, 0x00, 0x00, 0xb4, + 0xe2, 0x03, 0x01, 0x2a, 0xa2, 0x00, 0x00, 0xb5, 0x20, 0x00, 0x00, 0x4b, 0xbf, 0x03, + 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, 0x22, 0x00, 0x00, 0x0b, + 0xe2, 0x03, 0x02, 0x2a, 0xc2, 0xff, 0xff, 0xb5, 0xf7, 0xff, 0xff, 0x17, + ]; + + assert_eq!(code, &golden); + } +} From 60990aeaae28d70b675c194d8b6fef5d5a9fdc8e Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 13:38:58 -0700 Subject: [PATCH 08/12] ARM64 backend, part 8 / 11: integration. This patch ties together the new backend infrastructure with the existing Cranelift codegen APIs. With all patches in this series up to this patch applied, the ARM64 compiler is now functional and can be used. Two uses of this functionality -- filecheck-based tests and integration into wasmtime -- will come in subsequent patches. --- cranelift/codegen/src/context.rs | 83 +++++++++++++++++++------ cranelift/codegen/src/ir/function.rs | 15 ++++- cranelift/codegen/src/isa/mod.rs | 5 +- cranelift/codegen/src/isa/test_utils.rs | 83 +++++++++++++++++++++++++ cranelift/codegen/src/postopt.rs | 17 ++--- cranelift/codegen/src/verifier/flags.rs | 18 +++--- cranelift/src/compile.rs | 52 ++++++++-------- 7 files changed, 209 insertions(+), 64 deletions(-) create mode 100644 cranelift/codegen/src/isa/test_utils.rs diff --git a/cranelift/codegen/src/context.rs b/cranelift/codegen/src/context.rs index ca70293c05fb..2c3a84509e27 100644 --- a/cranelift/codegen/src/context.rs +++ b/cranelift/codegen/src/context.rs @@ -19,8 +19,10 @@ use crate::flowgraph::ControlFlowGraph; use crate::ir::Function; use crate::isa::TargetIsa; use crate::legalize_function; +use crate::legalizer::simple_legalize; use crate::licm::do_licm; use crate::loop_analysis::LoopAnalysis; +use crate::machinst::MachCompileResult; use crate::nan_canonicalization::do_nan_canonicalization; use crate::postopt::do_postopt; use crate::redundant_reload_remover::RedundantReloadRemover; @@ -55,6 +57,12 @@ pub struct Context { /// Redundant-reload remover context. pub redundant_reload_remover: RedundantReloadRemover, + + /// Result of MachBackend compilation, if computed. + pub mach_compile_result: Option, + + /// Flag: do we want a disassembly with the MachCompileResult? + pub want_disasm: bool, } impl Context { @@ -78,6 +86,8 @@ impl Context { regalloc: regalloc::Context::new(), loop_analysis: LoopAnalysis::new(), redundant_reload_remover: RedundantReloadRemover::new(), + mach_compile_result: None, + want_disasm: false, } } @@ -89,6 +99,14 @@ impl Context { self.regalloc.clear(); self.loop_analysis.clear(); self.redundant_reload_remover.clear(); + self.mach_compile_result = None; + self.want_disasm = false; + } + + /// Set the flag to request a disassembly when compiling with a + /// `MachBackend` backend. + pub fn set_disasm(&mut self, val: bool) { + self.want_disasm = val; } /// Compile the function, and emit machine code into a `Vec`. 
@@ -130,9 +148,13 @@ impl Context { pub fn compile(&mut self, isa: &dyn TargetIsa) -> CodegenResult { let _tt = timing::compile(); self.verify_if(isa)?; - debug!("Compiling:\n{}", self.func.display(isa)); let opt_level = isa.flags().opt_level(); + debug!( + "Compiling (opt level {:?}):\n{}", + opt_level, + self.func.display(isa) + ); self.compute_cfg(); if opt_level != OptLevel::None { @@ -141,6 +163,7 @@ impl Context { if isa.flags().enable_nan_canonicalization() { self.canonicalize_nans(isa)?; } + self.legalize(isa)?; if opt_level != OptLevel::None { self.postopt(isa)?; @@ -149,23 +172,33 @@ impl Context { self.licm(isa)?; self.simple_gvn(isa)?; } + self.compute_domtree(); self.eliminate_unreachable_code(isa)?; if opt_level != OptLevel::None { self.dce(isa)?; } - self.regalloc(isa)?; - self.prologue_epilogue(isa)?; - if opt_level == OptLevel::Speed || opt_level == OptLevel::SpeedAndSize { - self.redundant_reload_remover(isa)?; - } - if opt_level == OptLevel::SpeedAndSize { - self.shrink_instructions(isa)?; - } - let result = self.relax_branches(isa); - debug!("Compiled:\n{}", self.func.display(isa)); - result + if let Some(backend) = isa.get_mach_backend() { + let func = std::mem::replace(&mut self.func, Function::new()); + let result = backend.compile_function(func, self.want_disasm)?; + let info = result.code_info(); + self.mach_compile_result = Some(result); + Ok(info) + } else { + self.regalloc(isa)?; + self.prologue_epilogue(isa)?; + if opt_level == OptLevel::Speed || opt_level == OptLevel::SpeedAndSize { + self.redundant_reload_remover(isa)?; + } + if opt_level == OptLevel::SpeedAndSize { + self.shrink_instructions(isa)?; + } + let result = self.relax_branches(isa); + + debug!("Compiled:\n{}", self.func.display(isa)); + result + } } /// Emit machine code directly into raw memory. @@ -191,7 +224,11 @@ impl Context { ) -> CodeInfo { let _tt = timing::binemit(); let mut sink = MemoryCodeSink::new(mem, relocs, traps, stackmaps); - isa.emit_function_to_memory(&self.func, &mut sink); + if let Some(ref result) = &self.mach_compile_result { + result.sections.emit(&mut sink); + } else { + isa.emit_function_to_memory(&self.func, &mut sink); + } sink.info } @@ -275,13 +312,19 @@ impl Context { /// Run the legalizer for `isa` on the function. pub fn legalize(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> { - // Legalization invalidates the domtree and loop_analysis by mutating the CFG. - // TODO: Avoid doing this when legalization doesn't actually mutate the CFG. - self.domtree.clear(); - self.loop_analysis.clear(); - legalize_function(&mut self.func, &mut self.cfg, isa); - debug!("Legalized:\n{}", self.func.display(isa)); - self.verify_if(isa) + if isa.get_mach_backend().is_some() { + // Run some specific legalizations only. + simple_legalize(&mut self.func, &mut self.cfg, isa); + Ok(()) + } else { + // Legalization invalidates the domtree and loop_analysis by mutating the CFG. + // TODO: Avoid doing this when legalization doesn't actually mutate the CFG. + self.domtree.clear(); + self.loop_analysis.clear(); + legalize_function(&mut self.func, &mut self.cfg, isa); + debug!("Legalized:\n{}", self.func.display(isa)); + self.verify_if(isa) + } } /// Perform post-legalization rewrites on the function. diff --git a/cranelift/codegen/src/ir/function.rs b/cranelift/codegen/src/ir/function.rs index 1e72d2bc48c8..7e3cf719563c 100644 --- a/cranelift/codegen/src/ir/function.rs +++ b/cranelift/codegen/src/ir/function.rs @@ -3,6 +3,8 @@ //! 
The `Function` struct defined in this module owns all of its basic blocks and //! instructions. +#![allow(unused_imports)] + use crate::binemit::CodeOffset; use crate::entity::{PrimaryMap, SecondaryMap}; use crate::ir; @@ -17,6 +19,7 @@ use crate::isa::{CallConv, EncInfo, Encoding, Legalize, TargetIsa}; use crate::regalloc::{EntryRegDiversions, RegDiversions}; use crate::value_label::ValueLabelsRanges; use crate::write::write_function; +use alloc::boxed::Box; use core::fmt; /// A function. @@ -238,13 +241,21 @@ impl Function { /// Wrapper around `encode` which assigns `inst` the resulting encoding. pub fn update_encoding(&mut self, inst: ir::Inst, isa: &dyn TargetIsa) -> Result<(), Legalize> { - self.encode(inst, isa).map(|e| self.encodings[inst] = e) + if isa.get_mach_backend().is_some() { + Ok(()) + } else { + self.encode(inst, isa).map(|e| self.encodings[inst] = e) + } } /// Wrapper around `TargetIsa::encode` for encoding an existing instruction /// in the `Function`. pub fn encode(&self, inst: ir::Inst, isa: &dyn TargetIsa) -> Result { - isa.encode(&self, &self.dfg[inst], self.dfg.ctrl_typevar(inst)) + if isa.get_mach_backend().is_some() { + Ok(Encoding::new(0, 0)) + } else { + isa.encode(&self, &self.dfg[inst], self.dfg.ctrl_typevar(inst)) + } } /// Starts collection of debug information. diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index c94707690a51..a0a2a5de878e 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -48,6 +48,7 @@ pub use crate::isa::call_conv::CallConv; pub use crate::isa::constraints::{ BranchRange, ConstraintKind, OperandConstraint, RecipeConstraints, }; +pub use crate::isa::enc_tables::Encodings; pub use crate::isa::encoding::{base_size, EncInfo, Encoding}; pub use crate::isa::registers::{regs_overlap, RegClass, RegClassIndex, RegInfo, RegUnit}; pub use crate::isa::stack::{StackBase, StackBaseMask, StackRef}; @@ -55,9 +56,8 @@ pub use crate::isa::stack::{StackBase, StackBaseMask, StackRef}; use crate::binemit; use crate::flowgraph; use crate::ir; -pub use crate::isa::enc_tables::Encodings; -#[cfg(feature = "unwind")] use crate::isa::fde::RegisterMappingError; +#[cfg(feature = "unwind")] use crate::machinst::MachBackend; use crate::regalloc; use crate::result::CodegenResult; @@ -117,6 +117,7 @@ pub fn lookup(triple: Triple) -> Result { isa_builder!(x86, "x86", triple) } Architecture::Arm { .. } => isa_builder!(arm32, "arm32", triple), + Architecture::Aarch64 { .. } => isa_builder!(arm64, "arm64", triple), _ => Err(LookupError::Unsupported), } } diff --git a/cranelift/codegen/src/isa/test_utils.rs b/cranelift/codegen/src/isa/test_utils.rs new file mode 100644 index 000000000000..826fabf949f9 --- /dev/null +++ b/cranelift/codegen/src/isa/test_utils.rs @@ -0,0 +1,83 @@ +use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; +use crate::ir::Value; +use crate::ir::{ConstantOffset, ExternalName, Function, JumpTable, Opcode, SourceLoc, TrapCode}; +use crate::isa::TargetIsa; + +use alloc::vec::Vec; +use std::string::{String, ToString}; + +pub struct TestCodeSink { + bytes: Vec, +} + +impl TestCodeSink { + /// Create a new TestCodeSink. + pub fn new() -> TestCodeSink { + TestCodeSink { bytes: vec![] } + } + + /// This is pretty lame, but whatever .. 
+ pub fn stringify(&self) -> String { + let mut s = "".to_string(); + for b in &self.bytes { + s = s + &format!("{:02X}", b).to_string(); + } + s + } +} + +impl CodeSink for TestCodeSink { + fn offset(&self) -> CodeOffset { + self.bytes.len() as CodeOffset + } + + fn put1(&mut self, x: u8) { + self.bytes.push(x); + } + + fn put2(&mut self, x: u16) { + self.bytes.push((x >> 0) as u8); + self.bytes.push((x >> 8) as u8); + } + + fn put4(&mut self, mut x: u32) { + for _ in 0..4 { + self.bytes.push(x as u8); + x >>= 8; + } + } + + fn put8(&mut self, mut x: u64) { + for _ in 0..8 { + self.bytes.push(x as u8); + x >>= 8; + } + } + + fn reloc_block(&mut self, _rel: Reloc, _block_offset: CodeOffset) {} + + fn reloc_external( + &mut self, + _srcloc: SourceLoc, + _rel: Reloc, + _name: &ExternalName, + _addend: Addend, + ) { + } + + fn reloc_constant(&mut self, _rel: Reloc, _constant_offset: ConstantOffset) {} + + fn reloc_jt(&mut self, _rel: Reloc, _jt: JumpTable) {} + + fn trap(&mut self, _code: TrapCode, _srcloc: SourceLoc) {} + + fn begin_jumptables(&mut self) {} + + fn begin_rodata(&mut self) {} + + fn end_codegen(&mut self) {} + + fn add_stackmap(&mut self, _val_list: &[Value], _func: &Function, _isa: &dyn TargetIsa) {} + + fn add_call_site(&mut self, _opcode: Opcode, _srcloc: SourceLoc) {} +} diff --git a/cranelift/codegen/src/postopt.rs b/cranelift/codegen/src/postopt.rs index 42121817d5f7..b6c36434a152 100644 --- a/cranelift/codegen/src/postopt.rs +++ b/cranelift/codegen/src/postopt.rs @@ -360,6 +360,7 @@ fn optimize_complex_addresses(pos: &mut EncCursor, inst: Inst, isa: &dyn TargetI pub fn do_postopt(func: &mut Function, isa: &dyn TargetIsa) { let _tt = timing::postopt(); let mut pos = EncCursor::new(func, isa); + let is_mach_backend = isa.get_mach_backend().is_some(); while let Some(_block) = pos.next_block() { let mut last_flags_clobber = None; while let Some(inst) = pos.next_inst() { @@ -367,13 +368,15 @@ pub fn do_postopt(func: &mut Function, isa: &dyn TargetIsa) { // Optimize instructions to make use of flags. optimize_cpu_flags(&mut pos, inst, last_flags_clobber, isa); - // Track the most recent seen instruction that clobbers the flags. - if let Some(constraints) = isa - .encoding_info() - .operand_constraints(pos.func.encodings[inst]) - { - if constraints.clobbers_flags { - last_flags_clobber = Some(inst) + if !is_mach_backend { + // Track the most recent seen instruction that clobbers the flags. 
+ if let Some(constraints) = isa + .encoding_info() + .operand_constraints(pos.func.encodings[inst]) + { + if constraints.clobbers_flags { + last_flags_clobber = Some(inst) + } } } } diff --git a/cranelift/codegen/src/verifier/flags.rs b/cranelift/codegen/src/verifier/flags.rs index 1a20303d20c3..76e83ab88a8a 100644 --- a/cranelift/codegen/src/verifier/flags.rs +++ b/cranelift/codegen/src/verifier/flags.rs @@ -28,13 +28,17 @@ pub fn verify_flags( errors: &mut VerifierErrors, ) -> VerifierStepResult<()> { let _tt = timing::verify_flags(); - let mut verifier = FlagsVerifier { - func, - cfg, - encinfo: isa.map(|isa| isa.encoding_info()), - livein: SecondaryMap::new(), - }; - verifier.check(errors) + if isa.is_none() || isa.unwrap().get_mach_backend().is_none() { + let mut verifier = FlagsVerifier { + func, + cfg, + encinfo: isa.map(|isa| isa.encoding_info()), + livein: SecondaryMap::new(), + }; + verifier.check(errors) + } else { + Ok(()) + } } struct FlagsVerifier<'a> { diff --git a/cranelift/src/compile.rs b/cranelift/src/compile.rs index 7d888f311325..4d7111887606 100644 --- a/cranelift/src/compile.rs +++ b/cranelift/src/compile.rs @@ -49,42 +49,42 @@ fn handle_module( // If we have an isa from the command-line, use that. Otherwise if the // file contains a unique isa, use that. - let isa = if let Some(isa) = fisa.isa { - isa - } else if let Some(isa) = test_file.isa_spec.unique_isa() { - isa - } else { + let isa = fisa.isa.or(test_file.isa_spec.unique_isa()); + + if isa.is_none() { return Err(String::from("compilation requires a target isa")); }; for (func, _) in test_file.functions { - let mut context = Context::new(); - context.func = func; - let mut relocs = PrintRelocs::new(flag_print); let mut traps = PrintTraps::new(flag_print); let mut stackmaps = PrintStackmaps::new(flag_print); - let mut mem = vec![]; - // Compile and encode the result to machine code. - let code_info = context - .compile_and_emit(isa, &mut mem, &mut relocs, &mut traps, &mut stackmaps) - .map_err(|err| pretty_error(&context.func, Some(isa), err))?; + if let Some(isa) = isa { + let mut context = Context::new(); + context.func = func; + let mut mem = vec![]; - if flag_print { - println!("{}", context.func.display(isa)); - } + // Compile and encode the result to machine code. + let code_info = context + .compile_and_emit(isa, &mut mem, &mut relocs, &mut traps, &mut stackmaps) + .map_err(|err| pretty_error(&context.func, Some(isa), err))?; + + if flag_print { + println!("{}", context.func.display(isa)); + } - if flag_disasm { - print_all( - isa, - &mem, - code_info.code_size, - code_info.jumptables_size + code_info.rodata_size, - &relocs, - &traps, - &stackmaps, - )?; + if flag_disasm { + print_all( + isa, + &mem, + code_info.code_size, + code_info.jumptables_size + code_info.rodata_size, + &relocs, + &traps, + &stackmaps, + )?; + } } } From bab0c79c3107431c0c819bc42ad56d54c2f85433 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 13:54:29 -0700 Subject: [PATCH 09/12] ARM64 backend, part 9 / 11: wasmtime support. This commit adds a few odds and ends required to build wasmtime on ARM64 with the new backend. In particular, it adds: - Support for the `Arm64Call` relocation type. - Support for fetching the trap PC when a signal is received. - A hook for `SIGTRAP`, which is sent by the `brk` opcode (in contrast to x86's `SIGILL`). With the patch sequence up to and including this patch applied, `wasmtime` can now compile and successfully execute code on arm64. 
Not all tests pass yet, but basic Wasm/WASI tests work correctly. --- crates/jit/src/link.rs | 30 +++++++++++++++++++++++------- crates/runtime/src/helpers.c | 9 +++++++++ crates/runtime/src/traphandlers.rs | 11 +++++++++++ tests/custom_signal_handler.rs | 4 ++-- 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/crates/jit/src/link.rs b/crates/jit/src/link.rs index c8313b5d8603..8ffe7295260c 100644 --- a/crates/jit/src/link.rs +++ b/crates/jit/src/link.rs @@ -2,7 +2,7 @@ use crate::Compilation; use cranelift_codegen::binemit::Reloc; -use std::ptr::write_unaligned; +use std::ptr::{read_unaligned, write_unaligned}; use wasmtime_environ::{Module, Relocation, RelocationTarget}; use wasmtime_runtime::libcalls; use wasmtime_runtime::VMFunctionBody; @@ -101,6 +101,23 @@ fn apply_reloc( Reloc::X86PCRelRodata4 => { // ignore } + Reloc::Arm64Call => unsafe { + let reloc_address = body.add(r.offset as usize) as usize; + let reloc_addend = r.addend as isize; + let reloc_delta = (target_func_address as u64).wrapping_sub(reloc_address as u64); + // TODO: come up with a PLT-like solution for longer calls. We can't extend the + // code segment at this point, but we could conservatively allocate space at the + // end of the function during codegen, a fixed amount per call, to allow for + // potential branch islands. + assert!((reloc_delta as i64) < (1 << 27)); + assert!((reloc_delta as i64) >= -(1 << 27)); + let reloc_delta = reloc_delta as u32; + let reloc_delta = reloc_delta.wrapping_add(reloc_addend as u32); + let delta_bits = reloc_delta >> 2; + let insn = read_unaligned(reloc_address as *const u32); + let new_insn = (insn & 0xfc00_0000) | (delta_bits & 0x03ff_ffff); + write_unaligned(reloc_address as *mut u32, new_insn); + }, _ => panic!("unsupported reloc kind"), } } @@ -108,14 +125,11 @@ fn apply_reloc( // A declaration for the stack probe function in Rust's standard library, for // catching callstack overflow. cfg_if::cfg_if! { - if #[cfg(any( - target_arch="aarch64", - all( + if #[cfg(all( target_os = "windows", target_env = "msvc", target_pointer_width = "64" - ) - ))] { + ))] { extern "C" { pub fn __chkstk(); } @@ -132,6 +146,8 @@ cfg_if::cfg_if! { extern "C" { pub fn __rust_probestack(); } - static PROBESTACK: unsafe extern "C" fn() = __rust_probestack; + static PROBESTACK: unsafe extern "C" fn() = empty_probestack; } } + +extern "C" fn empty_probestack() {} diff --git a/crates/runtime/src/helpers.c b/crates/runtime/src/helpers.c index 213f34e5938d..6436922243da 100644 --- a/crates/runtime/src/helpers.c +++ b/crates/runtime/src/helpers.c @@ -26,3 +26,12 @@ void* GetPcFromUContext(ucontext_t *cx) { return (void*) cx->uc_mcontext->__ss.__rip; } #endif + +#if defined(__linux__) && defined(__aarch64__) +#include + +void* GetPcFromUContext(ucontext_t *cx) { + return (void*) cx->uc_mcontext.pc; +} + +#endif // __linux__ && __aarch64__ diff --git a/crates/runtime/src/traphandlers.rs b/crates/runtime/src/traphandlers.rs index 657abb62212d..571f823b3f63 100644 --- a/crates/runtime/src/traphandlers.rs +++ b/crates/runtime/src/traphandlers.rs @@ -31,6 +31,7 @@ cfg_if::cfg_if! { static mut PREV_SIGBUS: MaybeUninit = MaybeUninit::uninit(); static mut PREV_SIGILL: MaybeUninit = MaybeUninit::uninit(); static mut PREV_SIGFPE: MaybeUninit = MaybeUninit::uninit(); + static mut PREV_SIGTRAP: MaybeUninit = MaybeUninit::uninit(); unsafe fn platform_init() { let register = |slot: &mut MaybeUninit, signal: i32| { @@ -70,6 +71,9 @@ cfg_if::cfg_if! 
{ register(&mut PREV_SIGFPE, libc::SIGFPE); } + // on ARM64, we use `brk` to report traps, which generates SIGTRAP. + register(&mut PREV_SIGTRAP, libc::SIGTRAP); + // On ARM, handle Unaligned Accesses. // On Darwin, guard page accesses are raised as SIGBUS. if cfg!(target_arch = "arm") || cfg!(target_os = "macos") { @@ -87,6 +91,7 @@ cfg_if::cfg_if! { libc::SIGBUS => &PREV_SIGBUS, libc::SIGFPE => &PREV_SIGFPE, libc::SIGILL => &PREV_SIGILL, + libc::SIGTRAP => &PREV_SIGTRAP, _ => panic!("unknown signal: {}", signum), }; let handled = tls::with(|info| { @@ -158,6 +163,12 @@ cfg_if::cfg_if! { if #[cfg(all(target_os = "linux", target_arch = "x86_64"))] { let cx = &*(cx as *const libc::ucontext_t); cx.uc_mcontext.gregs[libc::REG_RIP as usize] as *const u8 + } else if #[cfg(all(target_os = "linux", target_arch = "aarch64"))] { + // libc doesn't seem to support Linux/aarch64 at the moment? + extern "C" { + fn GetPcFromUContext(cx: *mut libc::c_void) -> *const u8; + } + GetPcFromUContext(cx) } else if #[cfg(target_os = "macos")] { // FIXME(rust-lang/libc#1702) - once that lands and is // released we should inline the definition here diff --git a/tests/custom_signal_handler.rs b/tests/custom_signal_handler.rs index 27d14fc910a5..8b3c8cd478de 100644 --- a/tests/custom_signal_handler.rs +++ b/tests/custom_signal_handler.rs @@ -122,7 +122,7 @@ mod tests { .downcast::()?; assert!( trap.message() - .starts_with("wasm trap: out of bounds memory access"), + .starts_with("wasm trap: out of bounds"), "bad trap message: {:?}", trap.message() ); @@ -149,7 +149,7 @@ mod tests { .downcast::()?; assert!(trap .message() - .starts_with("wasm trap: out of bounds memory access")); + .starts_with("wasm trap: out of bounds")); } Ok(()) } From 402303f67ad348773fdb10effcbfedd0885a913a Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 14:00:13 -0700 Subject: [PATCH 10/12] ARM64 backend, part 10 / 11: filetest support for VCode tests. This patch adds support for filetests with the `vcode` type. This allows test cases to feed CLIF into the new backend, produce VCode output with machine instructions, and then perform matching against the pretty-printed text representation of the VCode. Tests for the new ARM64 backend using this infrastructure will come in a followup patch. --- Cargo.lock | 1 + cranelift/filetests/Cargo.toml | 1 + cranelift/filetests/src/lib.rs | 2 + cranelift/filetests/src/test_vcode.rs | 71 +++++++++++++++++++++++++++ 4 files changed, 75 insertions(+) create mode 100644 cranelift/filetests/src/test_vcode.rs diff --git a/Cargo.lock b/Cargo.lock index 8d7d237b6e0b..63fdb07c47fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -433,6 +433,7 @@ dependencies = [ "memmap", "num_cpus", "region", + "target-lexicon", ] [[package]] diff --git a/cranelift/filetests/Cargo.toml b/cranelift/filetests/Cargo.toml index 35b46b87c85d..3bf3090507ae 100644 --- a/cranelift/filetests/Cargo.toml +++ b/cranelift/filetests/Cargo.toml @@ -22,3 +22,4 @@ memmap = "0.7.0" num_cpus = "1.8.0" region = "2.1.2" byteorder = { version = "1.3.2", default-features = false } +target-lexicon = "0.10" diff --git a/cranelift/filetests/src/lib.rs b/cranelift/filetests/src/lib.rs index 0d3b12e45890..0fee249f12c4 100644 --- a/cranelift/filetests/src/lib.rs +++ b/cranelift/filetests/src/lib.rs @@ -56,6 +56,7 @@ mod test_shrink; mod test_simple_gvn; mod test_simple_preopt; mod test_unwind; +mod test_vcode; mod test_verifier; /// The result of running the test in a file. 
@@ -134,6 +135,7 @@ fn new_subtest(parsed: &TestCommand) -> subtest::SubtestResult test_run::subtest(parsed), "shrink" => test_shrink::subtest(parsed), "simple-gvn" => test_simple_gvn::subtest(parsed), + "vcode" => test_vcode::subtest(parsed), "verifier" => test_verifier::subtest(parsed), "preopt" => test_preopt::subtest(parsed), "safepoint" => test_safepoint::subtest(parsed), diff --git a/cranelift/filetests/src/test_vcode.rs b/cranelift/filetests/src/test_vcode.rs new file mode 100644 index 000000000000..f97aef47ea24 --- /dev/null +++ b/cranelift/filetests/src/test_vcode.rs @@ -0,0 +1,71 @@ +use crate::subtest::{run_filecheck, Context, SubTest, SubtestResult}; +use cranelift_codegen::ir::Function; +use cranelift_codegen::isa::lookup; +use cranelift_codegen::settings; +use cranelift_codegen::Context as CodegenContext; +use cranelift_reader::{TestCommand, TestOption}; +use target_lexicon::Triple; + +use log::info; +use std::borrow::Cow; +use std::str::FromStr; +use std::string::String; + +struct TestVCode { + arch: String, +} + +pub fn subtest(parsed: &TestCommand) -> SubtestResult> { + assert_eq!(parsed.command, "vcode"); + + let mut arch = "arm64".to_string(); + for option in &parsed.options { + match option { + TestOption::Value(k, v) if k == &"arch" => { + arch = v.to_string(); + } + _ => {} + } + } + + Ok(Box::new(TestVCode { arch })) +} + +impl SubTest for TestVCode { + fn name(&self) -> &'static str { + "vcode" + } + + fn is_mutating(&self) -> bool { + true + } + + fn needs_isa(&self) -> bool { + false + } + + fn run(&self, func: Cow, context: &Context) -> SubtestResult<()> { + let func = func.into_owned(); + + let triple = + Triple::from_str(&self.arch).map_err(|_| format!("Unknown arch: '{}'", self.arch))?; + + let mut isa = lookup(triple) + .map_err(|_| format!("Could not look up backend for arch '{}'", self.arch))? + .finish(settings::Flags::new(settings::builder())); + + let mut codectx = CodegenContext::for_function(func); + codectx.set_disasm(true); + + codectx + .compile(&mut *isa) + .map_err(|e| format!("Could not compile with arch '{}': {:?}", self.arch, e))?; + + let result = codectx.mach_compile_result.take().unwrap(); + let text = result.disasm.unwrap(); + + info!("text input to filecheck is:\n{}\n", text); + + run_filecheck(&text, context) + } +} From 3de504c24c2c9c23fb3b8a9e8ee185c47a99aef1 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Thu, 9 Apr 2020 14:02:19 -0700 Subject: [PATCH 11/12] ARM64 backend, part 11 / 11: filetests for ARM64 VCode. This patch, the last in the series, adds the filetests for the new ARM64 backend. The filetests cover most of the opcodes, except for the recently-added floating point support. This patch contains code written by Julian Seward and Benjamin Bouvier , originally developed on a side-branch before rebasing and condensing into this patch series. See the `arm64` branch at `https://github.com/cfallin/wasmtime` for original development history. This patch also contains code written by Joey Gouly and contributed to the above branch. These contributions are "Copyright (c) 2020, Arm Limited." 
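For reference, these files run through the standard filetest driver like any other test type; assuming the usual `clif-util` binary, a single file can be exercised with something like:

    clif-util test cranelift/filetests/filetests/vcode/arm64/basic1.clif

The `test vcode arch=arm64` directive at the top of each file selects the `vcode` subtest registered in the previous patch.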
Co-authored-by: Julian Seward Co-authored-by: Benjamin Bouvier Co-authored-by: Joey Gouly --- .../filetests/vcode/arm64/arithmetic.clif | 242 ++++++++++ .../filetests/vcode/arm64/basic1.clif | 13 + .../filetests/vcode/arm64/bitops.clif | 157 +++++++ .../filetests/vcode/arm64/call-indirect.clif | 15 + .../filetests/filetests/vcode/arm64/call.clif | 16 + .../filetests/vcode/arm64/condbr.clif | 65 +++ .../filetests/vcode/arm64/condops.clif | 42 ++ .../filetests/vcode/arm64/constants.clif | 175 +++++++ .../filetests/vcode/arm64/extend-op.clif | 17 + .../filetests/vcode/arm64/jumptable.clif | 43 ++ .../vcode/arm64/narrow-arithmetic.clif | 68 +++ .../filetests/vcode/arm64/saturating-ops.clif | 35 ++ .../filetests/vcode/arm64/shift-op.clif | 16 + .../filetests/vcode/arm64/shift-rotate.clif | 439 ++++++++++++++++++ .../filetests/vcode/arm64/symbol-value.clif | 16 + .../filetests/vcode/arm64/traps.clif | 28 ++ .../vcode/arm64/uextend-sextend.clif | 157 +++++++ 17 files changed, 1544 insertions(+) create mode 100644 cranelift/filetests/filetests/vcode/arm64/arithmetic.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/basic1.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/bitops.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/call-indirect.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/call.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/condbr.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/condops.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/constants.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/extend-op.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/jumptable.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/narrow-arithmetic.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/saturating-ops.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/shift-op.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/shift-rotate.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/symbol-value.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/traps.clif create mode 100644 cranelift/filetests/filetests/vcode/arm64/uextend-sextend.clif diff --git a/cranelift/filetests/filetests/vcode/arm64/arithmetic.clif b/cranelift/filetests/filetests/vcode/arm64/arithmetic.clif new file mode 100644 index 000000000000..7fbda32d081f --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/arithmetic.clif @@ -0,0 +1,242 @@ +test vcode arch=arm64 + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = iadd.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = isub.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = imul.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: madd x0, x0, x1, xzr +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = umulhi.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: umulh x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = smulhi.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: smulh x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = sdiv.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sdiv x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 2 + v2 = sdiv.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x1, #2 +; nextln: sdiv x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = udiv.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: udiv x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 2 + v2 = udiv.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x1, #2 +; nextln: udiv x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = srem.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sdiv x2, x0, x1 +; nextln: msub x0, x2, x1, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = urem.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: udiv x2, x0, x1 +; nextln: msub x0, x2, x1, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = band.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: and x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = bor.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: orr x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = bxor.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: eor x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = band_not.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: bic x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = bor_not.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: orn x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = bxor_not.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: eon x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = bnot.i64 v0 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: orn x0, xzr, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/basic1.clif b/cranelift/filetests/filetests/vcode/arm64/basic1.clif new file mode 100644 index 000000000000..29713d3427ce --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/basic1.clif @@ -0,0 +1,13 @@ +test vcode arch=arm64 + +function %f(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + ; check: stp fp, lr, [sp, #-16]! + ; check: mov fp, sp + v2 = iadd v0, v1 + ; check: add w0, w0, w1 + return v2 + ; check: mov sp, fp + ; check: ldp fp, lr, [sp], #16 + ; check: ret +} diff --git a/cranelift/filetests/filetests/vcode/arm64/bitops.clif b/cranelift/filetests/filetests/vcode/arm64/bitops.clif new file mode 100644 index 000000000000..f2ebc5f003cf --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/bitops.clif @@ -0,0 +1,157 @@ +test vcode arch=arm64 + +function %a(i32) -> i32 { +block0(v0: i32): + v1 = bitrev v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: rbit w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %a(i64) -> i64 { +block0(v0: i64): + v1 = bitrev v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: rbit x0, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %b(i32) -> i32 { +block0(v0: i32): + v1 = clz v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: clz w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %b(i64) -> i64 { +block0(v0: i64): + v1 = clz v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: clz x0, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %c(i32) -> i32 { +block0(v0: i32): + v1 = cls v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: cls w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %c(i64) -> i64 { +block0(v0: i64): + v1 = cls v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: cls x0, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %d(i32) -> i32 { +block0(v0: i32): + v1 = ctz v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: rbit w0, w0 +; nextln: clz w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %d(i64) -> i64 { +block0(v0: i64): + v1 = ctz v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: rbit x0, x0 +; nextln: clz x0, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %d(i64) -> i64 { +block0(v0: i64): + v1 = popcnt v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: lsr x1, x0, #1 +; nextln: and x1, x1, #6148914691236517205 +; nextln: sub x1, x0, x1 +; nextln: and x0, x1, #3689348814741910323 +; nextln: lsr x1, x1, #2 +; nextln: and x1, x1, #3689348814741910323 +; nextln: add x0, x1, x0 +; nextln: add x0, x0, x0, LSR 4 +; nextln: and x0, x0, #1085102592571150095 +; nextln: add x0, x0, x0, LSL 8 +; nextln: add x0, x0, x0, LSL 16 +; nextln: add x0, x0, x0, LSL 32 +; nextln: lsr x0, x0, #56 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %d(i32) -> i32 { +block0(v0: i32): + v1 = popcnt v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsr w1, w0, #1 +; nextln: and x1, x1, #6148914691236517205 +; nextln: sub x1, x0, x1 +; nextln: and x0, x1, #3689348814741910323 +; nextln: lsr x1, x1, #2 +; nextln: and x1, x1, #3689348814741910323 +; nextln: add x0, x1, x0 +; nextln: add x0, x0, x0, LSR 4 +; nextln: and x0, x0, #1085102592571150095 +; nextln: add x0, x0, x0, LSL 8 +; nextln: add x0, x0, x0, LSL 16 +; nextln: add x0, x0, x0, LSL 32 +; nextln: lsr x0, x0, #56 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/call-indirect.clif b/cranelift/filetests/filetests/vcode/arm64/call-indirect.clif new file mode 100644 index 000000000000..84fa72d2db29 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/call-indirect.clif @@ -0,0 +1,15 @@ +test vcode arch=arm64 + +function %f(i64, i64) -> i64 { + sig0 = (i64) -> i64 +block0(v0: i64, v1: i64): + v2 = call_indirect.i64 sig0, v1(v0) + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: blr x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/call.clif b/cranelift/filetests/filetests/vcode/arm64/call.clif new file mode 100644 index 000000000000..3210db3959c1 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/call.clif @@ -0,0 +1,16 @@ +test vcode arch=arm64 + +function %f(i64) -> i64 { + fn0 = %g(i64) -> i64 + +block0(v0: i64): + v1 = call fn0(v0) + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: bl 0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/condbr.clif b/cranelift/filetests/filetests/vcode/arm64/condbr.clif new file mode 100644 index 000000000000..e85e309ce5a1 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/condbr.clif @@ -0,0 +1,65 @@ +test vcode arch=arm64 + +function %f(i64, i64) -> b1 { +block0(v0: i64, v1: i64): + v2 = icmp eq v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: subs xzr, x0, x1 +; nextln: cset x0, eq +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = ifcmp v0, v1 + brif eq v2, block1 + jump block2 + +block1: + v4 = iconst.i64 1 + return v4 + +block2: + v5 = iconst.i64 2 + return v5 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: subs xzr, x0, x1 +; nextln: b.eq 20 +; check: Block 0: +; check: movz x0, #2 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret +; check: Block 1: +; check: movz x0, #1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = ifcmp v0, v1 + brif eq v2, block1 + jump block1 + +block1: + v4 = iconst.i64 1 + return v4 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: subs xzr, x0, x1 +; check: Block 0: +; check: movz x0, #1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/condops.clif b/cranelift/filetests/filetests/vcode/arm64/condops.clif new file mode 100644 index 000000000000..01d2637e889a --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/condops.clif @@ -0,0 +1,42 @@ +test vcode arch=arm64 + +function %f(i8, i64, i64) -> i64 { +block0(v0: i8, v1: i64, v2: i64): + v3 = iconst.i8 42 + v4 = ifcmp v0, v3 + v5 = selectif.i64 eq v4, v1, v2 + return v5 +} + +; check: subs wzr +; check: csel x0, $(=x[0-9]+, x[0-9]+), eq + +function %g(i8) -> b1 { +block0(v0: i8): + v3 = iconst.i8 42 + v4 = ifcmp v0, v3 + v5 = trueif eq v4 + return v5 +} + +; check: subs wzr +; check: cset x0, eq + +function %h(i8, i8, i8) -> i8 { +block0(v0: i8, v1: i8, v2: i8): + v3 = bitselect.i8 v0, v1, v2 + return v3 +} + +; check: and +; nextln: bic +; nextln: orr + +function %i(b1, i8, i8) -> i8 { +block0(v0: b1, v1: i8, v2: i8): + v3 = select.i8 v0, v1, v2 + return v3 +} + +; check: subs wzr +; nextln: csel diff --git a/cranelift/filetests/filetests/vcode/arm64/constants.clif b/cranelift/filetests/filetests/vcode/arm64/constants.clif new file mode 100644 index 000000000000..5eca5402d7d9 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/constants.clif @@ -0,0 +1,175 @@ +test vcode arch=arm64 + +function %f() -> i64 { +block0: + v0 = iconst.i64 0 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffff + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #65535 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffff0000 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #65535, LSL #16 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffff00000000 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #65535, LSL #32 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffff000000000000 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #65535, LSL #48 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffffffffffffffff + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movn x0, #0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffffffffffff0000 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: movn x0, #65535 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffffffff0000ffff + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movn x0, #65535, LSL #16 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffff0000ffffffff + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movn x0, #65535, LSL #32 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0x0000ffffffffffff + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movn x0, #65535, LSL #48 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xf34bf0a31212003a ; random digits + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #58 +; nextln: movk x0, #4626, LSL #16 +; nextln: movk x0, #61603, LSL #32 +; nextln: movk x0, #62283, LSL #48 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0x12e900001ef40000 ; random digits with 2 clear half words + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #7924, LSL #16 +; nextln: movk x0, #4841, LSL #48 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0x12e9ffff1ef4ffff ; random digits with 2 full half words + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movn x0, #57611, LSL #16 +; nextln: movk x0, #4841, LSL #48 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/extend-op.clif b/cranelift/filetests/filetests/vcode/arm64/extend-op.clif new file mode 100644 index 000000000000..74879c8c11f5 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/extend-op.clif @@ -0,0 +1,17 @@ +test vcode arch=arm64 + +function %f(i8) -> i64 { +block0(v0: i8): + v1 = sextend.i64 v0 + v2 = iconst.i64 42 + v3 = iadd.i64 v2, v1 + return v3 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: movz x1, #42 +; nextln: add x0, x1, x0, SXTB +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/jumptable.clif b/cranelift/filetests/filetests/vcode/arm64/jumptable.clif new file mode 100644 index 000000000000..0677c3cb7d59 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/jumptable.clif @@ -0,0 +1,43 @@ +test vcode arch=arm64 + +function %f(i64) -> i64 { + jt0 = jump_table [block1, block2, block3] + +block0(v0: i64): + br_table v0, block4, jt0 + +block1: + v1 = iconst.i64 1 + jump block5(v1) + +block2: + v2 = iconst.i64 2 + jump block5(v2) + +block3: + v3 = iconst.i64 3 + jump block5(v3) + +block4: + v4 = iconst.i64 4 + jump block5(v4) + +block5(v5: i64): + v6 = iadd.i64 v0, v5 + return v6 +} + +; check: subs wzr, w0, #3 +; nextln: b.hs +; nextln: adr x2, pc+16 ; ldrsw x1, [x2, x0, LSL 2] ; add x2, x2, x1 ; br x2 ; jt_entries + +; check: movz x1, #3 +; nextln: b + +; check: movz x1, #2 +; nextln: b + +; check: movz x1, #1 + +; check: add x0, x0, x1 + diff --git a/cranelift/filetests/filetests/vcode/arm64/narrow-arithmetic.clif b/cranelift/filetests/filetests/vcode/arm64/narrow-arithmetic.clif new file mode 100644 index 000000000000..345a527d8839 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/narrow-arithmetic.clif @@ -0,0 +1,68 @@ +test vcode arch=arm64 + +function %add8(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = iadd.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %add16(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = iadd.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %add32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = iadd.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %add32_8(i32, i8) -> i32 { +block0(v0: i32, v1: i8): + v2 = sextend.i32 v1 + v3 = iadd.i32 v0, v2 + return v3 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add w0, w0, w1, SXTB +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %add64_32(i64, i32) -> i64 { +block0(v0: i64, v1: i32): + v2 = sextend.i64 v1 + v3 = iadd.i64 v0, v2 + return v3 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add x0, x0, x1, SXTW +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/saturating-ops.clif b/cranelift/filetests/filetests/vcode/arm64/saturating-ops.clif new file mode 100644 index 000000000000..a281a25e4b4b --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/saturating-ops.clif @@ -0,0 +1,35 @@ +test vcode arch=arm64 + +function %uaddsat64(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = uadd_sat.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov v0.d[0], x0 +; nextln: mov v1.d[0], x1 +; nextln: uqadd d0, d0, d1 +; nextln: mov x0, v0.d[0] +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %uaddsat8(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = uadd_sat.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: uxtb x0, w0 +; nextln: uxtb x1, w1 +; nextln: mov v0.d[0], x0 +; nextln: mov v1.d[0], x1 +; nextln: uqadd d0, d0, d1 +; nextln: mov x0, v0.d[0] +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/shift-op.clif b/cranelift/filetests/filetests/vcode/arm64/shift-op.clif new file mode 100644 index 000000000000..852668081d15 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/shift-op.clif @@ -0,0 +1,16 @@ +test vcode arch=arm64 + +function %f(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 3 + v2 = ishl.i64 v0, v1 + v3 = iadd.i64 v0, v2 + return v3 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add x0, x0, x0, LSL 3 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/shift-rotate.clif b/cranelift/filetests/filetests/vcode/arm64/shift-rotate.clif new file mode 100644 index 000000000000..bd56d4da5a64 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/shift-rotate.clif @@ -0,0 +1,439 @@ +test vcode arch=arm64 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ROR, variable +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f0(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = rotr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ror x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f1(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = rotr.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ror w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f2(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = rotr.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth w0, w0 +; nextln: sub w2, w1, #16 +; nextln: sub w2, wzr, w2 +; nextln: lsr w1, w0, w1 +; nextln: lsl w0, w0, w2 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f3(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = rotr.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: sub w2, w1, #8 +; nextln: sub w2, wzr, w2 +; nextln: lsr w1, w0, w1 +; nextln: lsl w0, w0, w2 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ROL, variable +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f4(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = rotl.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub w2, w1, #64 +; nextln: sub w2, wzr, w2 +; nextln: lsl x1, x0, x1 +; nextln: lsr x0, x0, x2 +; nextln: orr x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f5(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = rotl.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub w2, w1, #32 +; nextln: sub w2, wzr, w2 +; nextln: lsl w1, w0, w1 +; nextln: lsr w0, w0, w2 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f6(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = rotl.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: uxth w0, w0 +; nextln: sub w2, w1, #16 +; nextln: sub w2, wzr, w2 +; nextln: lsl w1, w0, w1 +; nextln: lsr w0, w0, w2 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f7(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = rotl.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: sub w2, w1, #8 +; nextln: sub w2, wzr, w2 +; nextln: lsl w1, w0, w1 +; nextln: lsr w0, w0, w2 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; LSR, variable +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f8(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = ushr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsr x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f9(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = ushr.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f10(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = ushr.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth w0, w0 +; nextln: lsr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f11(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = ushr.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: lsr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; LSL, variable +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f12(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = ishl.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f13(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = ishl.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f14(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = ishl.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f15(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = ishl.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ASR, variable +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f16(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = sshr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: asr x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f17(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = sshr.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: asr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f18(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = sshr.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxth w0, w0 +; nextln: asr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f19(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = sshr.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxtb w0, w0 +; nextln: asr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; immediate forms +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f20(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 17 + v2 = rotr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ror x0, x0, #17 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f21(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 17 + v2 = rotl.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl x1, x0, #17 +; nextln: lsr x0, x0, #47 +; nextln: orr x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f22(i32) -> i32 { +block0(v0: i32): + v1 = iconst.i32 17 + v2 = rotl.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl w1, w0, #17 +; nextln: lsr w0, w0, #15 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f23(i16) -> i16 { +block0(v0: i16): + v1 = iconst.i32 10 + v2 = rotl.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth w0, w0 +; nextln: lsl w1, w0, #10 +; nextln: lsr w0, w0, #6 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f24(i8) -> i8 { +block0(v0: i8): + v1 = iconst.i32 3 + v2 = rotl.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: lsl w1, w0, #3 +; nextln: lsr w0, w0, #5 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f25(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 17 + v2 = ushr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsr x0, x0, #17 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f26(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 17 + v2 = sshr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: asr x0, x0, #17 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f27(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 17 + v2 = ishl.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: lsl x0, x0, #17 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/symbol-value.clif b/cranelift/filetests/filetests/vcode/arm64/symbol-value.clif new file mode 100644 index 000000000000..cf22b20ff9b9 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/symbol-value.clif @@ -0,0 +1,16 @@ +test vcode arch=arm64 + +function %f() -> i64 { + gv0 = symbol %my_global + +block0: + v0 = symbol_value.i64 gv0 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ldr x0, 8 ; b 12 ; data +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/arm64/traps.clif b/cranelift/filetests/filetests/vcode/arm64/traps.clif new file mode 100644 index 000000000000..9f4a40ef12e1 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/traps.clif @@ -0,0 +1,28 @@ +test vcode arch=arm64 + +function %f() { +block0: + trap user0 +} + +; check: udf + +function %g(i64) { +block0(v0: i64): + v1 = iconst.i64 42 + v2 = ifcmp v0, v1 + trapif eq v2, user0 + return +} + +; check: subs xzr, x0, #42 +; nextln: b.ne 8 +; nextln: udf + +function %h() { +block0: + debugtrap + return +} + +; check: brk #0 diff --git a/cranelift/filetests/filetests/vcode/arm64/uextend-sextend.clif b/cranelift/filetests/filetests/vcode/arm64/uextend-sextend.clif new file mode 100644 index 000000000000..85a5c488a280 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/arm64/uextend-sextend.clif @@ -0,0 +1,157 @@ +test vcode arch=arm64 + +function %f_u_8_64(i8) -> i64 { +block0(v0: i8): + v1 = uextend.i64 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb x0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_u_8_32(i8) -> i32 { +block0(v0: i8): + v1 = uextend.i32 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_u_8_16(i8) -> i16 { +block0(v0: i8): + v1 = uextend.i16 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_s_8_64(i8) -> i64 { +block0(v0: i8): + v1 = sextend.i64 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxtb x0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_s_8_32(i8) -> i32 { +block0(v0: i8): + v1 = sextend.i32 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxtb w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_s_8_16(i8) -> i16 { +block0(v0: i8): + v1 = sextend.i16 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxtb w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_u_16_64(i16) -> i64 { +block0(v0: i16): + v1 = uextend.i64 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth x0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_u_16_32(i16) -> i32 { +block0(v0: i16): + v1 = uextend.i32 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: uxth w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_s_16_64(i16) -> i64 { +block0(v0: i16): + v1 = sextend.i64 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxth x0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_s_16_32(i16) -> i32 { +block0(v0: i16): + v1 = sextend.i32 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxth w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_u_32_64(i32) -> i64 { +block0(v0: i32): + v1 = uextend.i64 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: mov w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_s_32_64(i32) -> i64 { +block0(v0: i32): + v1 = sextend.i64 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxtw x0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret From 48cf2c2f50ff8aa64b450b482f62ec5eaab22d36 Mon Sep 17 00:00:00 2001 From: Chris Fallin Date: Wed, 15 Apr 2020 16:31:44 -0700 Subject: [PATCH 12/12] Address review comments: - Undo temporary changes to default features (`all-arch`) and a signal-handler test. - Remove `SIGTRAP` handler: no longer needed now that we've found an "undefined opcode" option on ARM64. - Rename pp.rs to pretty_print.rs in machinst/. - Only use empty stack-probe on non-x86. As per a comment in rust-lang/compiler-builtins [1], LLVM only supports stack probes on x86 and x86-64. Thus, on any other CPU architecture, we cannot refer to `__rust_probestack`, because it does not exist. - Rename arm64 to aarch64. - Use `target` directive in vcode filetests. - Run the flags verifier, but without encinfo, when using new backends. - Clean up warning overrides. - Fix up use of casts: use u32::from(x) and siblings when possible, u32::try_from(x).unwrap() when not, to avoid silent truncation. - Take immutable `Function` borrows as input; we don't actually mutate the input IR. - Lots of other miscellaneous cleanups. 
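To spell out the cast convention mentioned above (a sketch; the variable names are illustrative, not from the patch, and `std::convert::TryFrom` must be in scope on the 2018 edition):

    // Lossless widening: prefer the From impl over `as`, which can never truncate.
    let wide: u32 = u32::from(narrow_u16);
    // Possibly-lossy conversion: make truncation an explicit, loud failure.
    let checked: u32 = u32::try_from(wide_usize).unwrap();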
[1] https://github.com/rust-lang/compiler-builtins/blob/cae3e6ea23739166504f9f9fb50ec070097979d4/src/probestack.rs#L39 --- cranelift/codegen/Cargo.toml | 2 +- cranelift/codegen/src/context.rs | 13 +- cranelift/codegen/src/dce.rs | 42 +- cranelift/codegen/src/inst_predicates.rs | 42 + cranelift/codegen/src/ir/function.rs | 3 - .../codegen/src/isa/{arm64 => aarch64}/abi.rs | 116 +- .../src/isa/{arm64 => aarch64}/inst/args.rs | 227 +-- .../src/isa/{arm64 => aarch64}/inst/emit.rs | 153 +- .../src/isa/{arm64 => aarch64}/inst/imms.rs | 135 +- .../src/isa/{arm64 => aarch64}/inst/mod.rs | 1472 +++++++++-------- .../src/isa/{arm64 => aarch64}/inst/regs.rs | 39 +- .../src/isa/{arm64 => aarch64}/lower.rs | 147 +- .../codegen/src/isa/{arm64 => aarch64}/mod.rs | 63 +- cranelift/codegen/src/isa/mod.rs | 7 +- cranelift/codegen/src/isa/test_utils.rs | 13 +- cranelift/codegen/src/lib.rs | 1 + cranelift/codegen/src/machinst/abi.rs | 55 +- cranelift/codegen/src/machinst/adapter.rs | 11 +- cranelift/codegen/src/machinst/blockorder.rs | 6 +- cranelift/codegen/src/machinst/compile.rs | 21 +- cranelift/codegen/src/machinst/lower.rs | 95 +- cranelift/codegen/src/machinst/mod.rs | 176 +- .../src/machinst/{pp.rs => pretty_print.rs} | 0 cranelift/codegen/src/machinst/sections.rs | 33 +- cranelift/codegen/src/machinst/vcode.rs | 88 +- cranelift/codegen/src/num_uses.rs | 18 +- cranelift/codegen/src/postopt.rs | 18 +- cranelift/codegen/src/verifier/flags.rs | 21 +- .../vcode/{arm64 => aarch64}/arithmetic.clif | 3 +- .../vcode/{arm64 => aarch64}/basic1.clif | 3 +- .../vcode/{arm64 => aarch64}/bitops.clif | 3 +- .../{arm64 => aarch64}/call-indirect.clif | 3 +- .../vcode/{arm64 => aarch64}/call.clif | 3 +- .../vcode/{arm64 => aarch64}/condbr.clif | 7 +- .../vcode/{arm64 => aarch64}/condops.clif | 3 +- .../vcode/{arm64 => aarch64}/constants.clif | 3 +- .../vcode/{arm64 => aarch64}/extend-op.clif | 3 +- .../vcode/{arm64 => aarch64}/jumptable.clif | 3 +- .../{arm64 => aarch64}/narrow-arithmetic.clif | 3 +- .../{arm64 => aarch64}/saturating-ops.clif | 3 +- .../vcode/{arm64 => aarch64}/shift-op.clif | 3 +- .../{arm64 => aarch64}/shift-rotate.clif | 3 +- .../{arm64 => aarch64}/symbol-value.clif | 3 +- .../vcode/{arm64 => aarch64}/traps.clif | 3 +- .../{arm64 => aarch64}/uextend-sextend.clif | 3 +- cranelift/filetests/src/test_vcode.rs | 8 +- crates/jit/src/link.rs | 11 +- crates/runtime/src/traphandlers.rs | 5 - tests/custom_signal_handler.rs | 4 +- 49 files changed, 1553 insertions(+), 1547 deletions(-) create mode 100644 cranelift/codegen/src/inst_predicates.rs rename cranelift/codegen/src/isa/{arm64 => aarch64}/abi.rs (92%) rename cranelift/codegen/src/isa/{arm64 => aarch64}/inst/args.rs (78%) rename cranelift/codegen/src/isa/{arm64 => aarch64}/inst/emit.rs (97%) rename cranelift/codegen/src/isa/{arm64 => aarch64}/inst/imms.rs (91%) rename cranelift/codegen/src/isa/{arm64 => aarch64}/inst/mod.rs (66%) rename cranelift/codegen/src/isa/{arm64 => aarch64}/inst/regs.rs (90%) rename cranelift/codegen/src/isa/{arm64 => aarch64}/lower.rs (95%) rename cranelift/codegen/src/isa/{arm64 => aarch64}/mod.rs (78%) rename cranelift/codegen/src/machinst/{pp.rs => pretty_print.rs} (100%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/arithmetic.clif (99%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/basic1.clif (90%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/bitops.clif (99%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/call-indirect.clif (91%) rename 
cranelift/filetests/filetests/vcode/{arm64 => aarch64}/call.clif (90%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/condbr.clif (94%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/condops.clif (96%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/constants.clif (99%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/extend-op.clif (92%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/jumptable.clif (96%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/narrow-arithmetic.clif (98%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/saturating-ops.clif (96%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/shift-op.clif (91%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/shift-rotate.clif (99%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/symbol-value.clif (90%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/traps.clif (91%) rename cranelift/filetests/filetests/vcode/{arm64 => aarch64}/uextend-sextend.clif (99%) diff --git a/cranelift/codegen/Cargo.toml b/cranelift/codegen/Cargo.toml index 83219d42e652..8bf10759c4be 100644 --- a/cranelift/codegen/Cargo.toml +++ b/cranelift/codegen/Cargo.toml @@ -34,7 +34,7 @@ regalloc = "0.0.17" cranelift-codegen-meta = { path = "meta", version = "0.62.0" } [features] -default = ["std", "unwind", "all-arch"] +default = ["std", "unwind"] # The "std" feature enables use of libstd. The "core" feature enables use # of some minimal std-like replacement libraries. At least one of these two diff --git a/cranelift/codegen/src/context.rs b/cranelift/codegen/src/context.rs index 2c3a84509e27..0fe5b38ad002 100644 --- a/cranelift/codegen/src/context.rs +++ b/cranelift/codegen/src/context.rs @@ -180,8 +180,7 @@ impl Context { } if let Some(backend) = isa.get_mach_backend() { - let func = std::mem::replace(&mut self.func, Function::new()); - let result = backend.compile_function(func, self.want_disasm)?; + let result = backend.compile_function(&mut self.func, self.want_disasm)?; let info = result.code_info(); self.mach_compile_result = Some(result); Ok(info) @@ -312,15 +311,15 @@ impl Context { /// Run the legalizer for `isa` on the function. pub fn legalize(&mut self, isa: &dyn TargetIsa) -> CodegenResult<()> { + // Legalization invalidates the domtree and loop_analysis by mutating the CFG. + // TODO: Avoid doing this when legalization doesn't actually mutate the CFG. + self.domtree.clear(); + self.loop_analysis.clear(); if isa.get_mach_backend().is_some() { // Run some specific legalizations only. simple_legalize(&mut self.func, &mut self.cfg, isa); - Ok(()) + self.verify_if(isa) } else { - // Legalization invalidates the domtree and loop_analysis by mutating the CFG. - // TODO: Avoid doing this when legalization doesn't actually mutate the CFG. 
- self.domtree.clear(); - self.loop_analysis.clear(); legalize_function(&mut self.func, &mut self.cfg, isa); debug!("Legalized:\n{}", self.func.display(isa)); self.verify_if(isa) diff --git a/cranelift/codegen/src/dce.rs b/cranelift/codegen/src/dce.rs index 827ae98ec4ce..e3e855806da8 100644 --- a/cranelift/codegen/src/dce.rs +++ b/cranelift/codegen/src/dce.rs @@ -6,48 +6,10 @@ use crate::cursor::{Cursor, FuncCursor}; use crate::dominator_tree::DominatorTree; use crate::entity::EntityRef; -use crate::ir::instructions::InstructionData; -use crate::ir::{DataFlowGraph, Function, Inst, Opcode}; +use crate::inst_predicates::{any_inst_results_used, has_side_effect}; +use crate::ir::Function; use crate::timing; -/// Test whether the given opcode is unsafe to even consider for DCE. -fn trivially_unsafe_for_dce(opcode: Opcode) -> bool { - opcode.is_call() - || opcode.is_branch() - || opcode.is_terminator() - || opcode.is_return() - || opcode.can_trap() - || opcode.other_side_effects() - || opcode.can_store() -} - -/// Preserve instructions with used result values. -fn any_inst_results_used(inst: Inst, live: &[bool], dfg: &DataFlowGraph) -> bool { - dfg.inst_results(inst).iter().any(|v| live[v.index()]) -} - -/// Load instructions without the `notrap` flag are defined to trap when -/// operating on inaccessible memory, so we can't DCE them even if the -/// loaded value is unused. -fn is_load_with_defined_trapping(opcode: Opcode, data: &InstructionData) -> bool { - if !opcode.can_load() { - return false; - } - match *data { - InstructionData::StackLoad { .. } => false, - InstructionData::Load { flags, .. } => !flags.notrap(), - _ => true, - } -} - -/// Does the given instruction have any side-effect that would preclude it from being removed when -/// its value is unused? -pub fn has_side_effect(func: &Function, inst: Inst) -> bool { - let data = &func.dfg[inst]; - let opcode = data.opcode(); - trivially_unsafe_for_dce(opcode) || is_load_with_defined_trapping(opcode, data) -} - /// Perform DCE on `func`. pub fn do_dce(func: &mut Function, domtree: &mut DominatorTree) { let _tt = timing::dce(); diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs new file mode 100644 index 000000000000..9cefbc38f921 --- /dev/null +++ b/cranelift/codegen/src/inst_predicates.rs @@ -0,0 +1,42 @@ +//! Instruction predicates/properties, shared by various analyses. + +use crate::ir::{DataFlowGraph, Function, Inst, InstructionData, Opcode}; +use cranelift_entity::EntityRef; + +/// Preserve instructions with used result values. +pub fn any_inst_results_used(inst: Inst, live: &[bool], dfg: &DataFlowGraph) -> bool { + dfg.inst_results(inst).iter().any(|v| live[v.index()]) +} + +/// Test whether the given opcode is unsafe to even consider as side-effect-free. +fn trivially_has_side_effects(opcode: Opcode) -> bool { + opcode.is_call() + || opcode.is_branch() + || opcode.is_terminator() + || opcode.is_return() + || opcode.can_trap() + || opcode.other_side_effects() + || opcode.can_store() +} + +/// Load instructions without the `notrap` flag are defined to trap when +/// operating on inaccessible memory, so we can't treat them as side-effect-free even if the loaded +/// value is unused. +fn is_load_with_defined_trapping(opcode: Opcode, data: &InstructionData) -> bool { + if !opcode.can_load() { + return false; + } + match *data { + InstructionData::StackLoad { .. } => false, + InstructionData::Load { flags, .. 
} => !flags.notrap(), + _ => true, + } +} + +/// Does the given instruction have any side-effect that would preclude it from being removed when +/// its value is unused? +pub fn has_side_effect(func: &Function, inst: Inst) -> bool { + let data = &func.dfg[inst]; + let opcode = data.opcode(); + trivially_has_side_effects(opcode) || is_load_with_defined_trapping(opcode, data) +} diff --git a/cranelift/codegen/src/ir/function.rs b/cranelift/codegen/src/ir/function.rs index 7e3cf719563c..4a3829780bb0 100644 --- a/cranelift/codegen/src/ir/function.rs +++ b/cranelift/codegen/src/ir/function.rs @@ -3,8 +3,6 @@ //! The `Function` struct defined in this module owns all of its basic blocks and //! instructions. -#![allow(unused_imports)] - use crate::binemit::CodeOffset; use crate::entity::{PrimaryMap, SecondaryMap}; use crate::ir; @@ -19,7 +17,6 @@ use crate::isa::{CallConv, EncInfo, Encoding, Legalize, TargetIsa}; use crate::regalloc::{EntryRegDiversions, RegDiversions}; use crate::value_label::ValueLabelsRanges; use crate::write::write_function; -use alloc::boxed::Box; use core::fmt; /// A function. diff --git a/cranelift/codegen/src/isa/arm64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs similarity index 92% rename from cranelift/codegen/src/isa/arm64/abi.rs rename to cranelift/codegen/src/isa/aarch64/abi.rs index 13abb6233a05..88aa60f8af0f 100644 --- a/cranelift/codegen/src/isa/arm64/abi.rs +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -1,11 +1,11 @@ -//! Implementation of the standard ARM64 ABI. +//! Implementation of the standard AArch64 ABI. use crate::ir; use crate::ir::types; use crate::ir::types::*; use crate::ir::StackSlot; use crate::isa; -use crate::isa::arm64::inst::*; +use crate::isa::aarch64::inst::*; use crate::machinst::*; use crate::settings; @@ -15,19 +15,16 @@ use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable}; use log::debug; -// A location for an argument or return value. -#[derive(Clone, Debug)] +/// A location for an argument or return value. +#[derive(Clone, Copy, Debug)] enum ABIArg { - // In a real register. + /// In a real register. Reg(RealReg, ir::Type), - // Arguments only: on stack, at given offset from SP at entry. + /// Arguments only: on stack, at given offset from SP at entry. Stack(i64, ir::Type), - // (first and only) return value only: in memory pointed to by x8 on entry. - #[allow(dead_code)] - RetMem(ir::Type), } -/// ARM64 ABI information shared between body (callee) and caller. +/// AArch64 ABI information shared between body (callee) and caller. struct ABISig { args: Vec, rets: Vec, @@ -161,11 +158,6 @@ impl ABISig { let (args, stack_arg_space) = compute_arg_locs(sig.call_conv, &sig.params); let (rets, _) = compute_arg_locs(sig.call_conv, &sig.returns); - // Verify that there are no arguments in return-memory area. - assert!(args.iter().all(|a| match a { - &ABIArg::RetMem(..) => false, - _ => true, - })); // Verify that there are no return values on the stack. assert!(rets.iter().all(|a| match a { &ABIArg::Stack(..) => false, @@ -181,14 +173,21 @@ impl ABISig { } } -/// ARM64 ABI object for a function body. -pub struct ARM64ABIBody { - sig: ABISig, // signature: arg and retval regs - stackslots: Vec, // offsets to each stackslot - stackslots_size: usize, // total stack size of all stackslots - clobbered: Set>, // clobbered registers, from regalloc. - spillslots: Option, // total number of spillslots, from regalloc. - frame_size: Option, +/// AArch64 ABI object for a function body. 
+pub struct AArch64ABIBody { + /// signature: arg and retval regs + sig: ABISig, + /// offsets to each stackslot + stackslots: Vec, + /// total stack size of all stackslots + stackslots_size: u32, + /// clobbered registers, from regalloc. + clobbered: Set>, + /// total number of spillslots, from regalloc. + spillslots: Option, + /// Total frame size. + frame_size: Option, + /// Calling convention this function expects. call_conv: isa::CallConv, } @@ -207,20 +206,31 @@ fn in_vec_reg(ty: ir::Type) -> bool { } } -impl ARM64ABIBody { +impl AArch64ABIBody { /// Create a new body ABI instance. pub fn new(f: &ir::Function) -> Self { - debug!("ARM64 ABI: func signature {:?}", f.signature); + debug!("AArch64 ABI: func signature {:?}", f.signature); let sig = ABISig::from_func_sig(&f.signature); + let call_conv = f.signature.call_conv; + // Only these calling conventions are supported. + assert!( + call_conv == isa::CallConv::SystemV + || call_conv == isa::CallConv::Fast + || call_conv == isa::CallConv::Cold + || call_conv.extends_baldrdash(), + "Unsupported calling convention: {:?}", + call_conv + ); + // Compute stackslot locations and total stackslot size. - let mut stack_offset: usize = 0; + let mut stack_offset: u32 = 0; let mut stackslots = vec![]; for (stackslot, data) in f.stack_slots.iter() { let off = stack_offset; - stack_offset += data.size as usize; - stack_offset = (stack_offset + 7) & !7usize; + stack_offset += data.size; + stack_offset = (stack_offset + 7) & !7; assert_eq!(stackslot.as_u32() as usize, stackslots.len()); stackslots.push(off); } @@ -232,7 +242,7 @@ impl ARM64ABIBody { clobbered: Set::empty(), spillslots: None, frame_size: None, - call_conv: f.signature.call_conv, + call_conv, } } } @@ -264,7 +274,7 @@ fn load_stack(fp_offset: i64, into_reg: Writable, ty: Type) -> Inst { mem, srcloc: None, }, - _ => unimplemented!(), + _ => unimplemented!("load_stack({})", ty), } } @@ -295,7 +305,7 @@ fn store_stack(fp_offset: i64, from_reg: Reg, ty: Type) -> Inst { mem, srcloc: None, }, - _ => unimplemented!(), + _ => unimplemented!("store_stack({})", ty), } } @@ -402,11 +412,13 @@ fn get_caller_saves_set(call_conv: isa::CallConv) -> Set> { set } -impl ABIBody for ARM64ABIBody { +impl ABIBody for AArch64ABIBody { + type I = Inst; + fn liveins(&self) -> Set { let mut set: Set = Set::empty(); - for arg in &self.sig.args { - if let &ABIArg::Reg(r, _) = arg { + for &arg in &self.sig.args { + if let ABIArg::Reg(r, _) = arg { set.insert(r); } } @@ -415,8 +427,8 @@ impl ABIBody for ARM64ABIBody { fn liveouts(&self) -> Set { let mut set: Set = Set::empty(); - for ret in &self.sig.rets { - if let &ABIArg::Reg(r, _) = ret { + for &ret in &self.sig.rets { + if let ABIArg::Reg(r, _) = ret { set.insert(r); } } @@ -439,7 +451,6 @@ impl ABIBody for ARM64ABIBody { match &self.sig.args[idx] { &ABIArg::Reg(r, ty) => Inst::gen_move(into_reg, r.to_reg(), ty), &ABIArg::Stack(off, ty) => load_stack(off + 16, into_reg, ty), - _ => unimplemented!(), } } @@ -447,7 +458,6 @@ impl ABIBody for ARM64ABIBody { match &self.sig.rets[idx] { &ABIArg::Reg(r, ty) => Inst::gen_move(Writable::from_reg(r.to_reg()), from_reg, ty), &ABIArg::Stack(off, ty) => store_stack(off + 16, from_reg, ty), - _ => unimplemented!(), } } @@ -470,7 +480,7 @@ impl ABIBody for ARM64ABIBody { fn load_stackslot( &self, slot: StackSlot, - offset: usize, + offset: u32, ty: Type, into_reg: Writable, ) -> Inst { @@ -480,7 +490,7 @@ impl ABIBody for ARM64ABIBody { load_stack(fp_off, into_reg, ty) } - fn store_stackslot(&self, slot: StackSlot, offset: 
usize, ty: Type, from_reg: Reg) -> Inst { + fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Inst { // Offset from beginning of stackslot area, which is at FP - stackslots_size. let stack_off = self.stackslots[slot.as_u32() as usize] as i64; let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); @@ -532,13 +542,13 @@ impl ABIBody for ARM64ABIBody { }); } - let mut total_stacksize = self.stackslots_size + 8 * self.spillslots.unwrap(); + let mut total_stacksize = self.stackslots_size + 8 * self.spillslots.unwrap() as u32; if self.call_conv.extends_baldrdash() { debug_assert!( !flags.enable_probestack(), "baldrdash does not expect cranelift to emit stack probes" ); - total_stacksize += flags.baldrdash_prologue_words() as usize * 8; + total_stacksize += flags.baldrdash_prologue_words() as u32 * 8; } let total_stacksize = (total_stacksize + 15) & !15; // 16-align the stack. @@ -692,7 +702,7 @@ impl ABIBody for ARM64ABIBody { fn frame_size(&self) -> u32 { self.frame_size - .expect("frame size not computed before prologue generation") as u32 + .expect("frame size not computed before prologue generation") } fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32 { @@ -719,8 +729,8 @@ enum CallDest { Reg(Reg), } -/// ARM64 ABI object for a function call. -pub struct ARM64ABICall { +/// AArch64 ABI object for a function call. +pub struct AArch64ABICall { sig: ABISig, uses: Set, defs: Set>, @@ -751,16 +761,16 @@ fn abisig_to_uses_and_defs(sig: &ABISig) -> (Set, Set>) { (uses, defs) } -impl ARM64ABICall { +impl AArch64ABICall { /// Create a callsite ABI object for a call directly to the specified function. pub fn from_func( sig: &ir::Signature, extname: &ir::ExternalName, loc: ir::SourceLoc, - ) -> ARM64ABICall { + ) -> AArch64ABICall { let sig = ABISig::from_func_sig(sig); let (uses, defs) = abisig_to_uses_and_defs(&sig); - ARM64ABICall { + AArch64ABICall { sig, uses, defs, @@ -777,10 +787,10 @@ impl ARM64ABICall { ptr: Reg, loc: ir::SourceLoc, opcode: ir::Opcode, - ) -> ARM64ABICall { + ) -> AArch64ABICall { let sig = ABISig::from_func_sig(sig); let (uses, defs) = abisig_to_uses_and_defs(&sig); - ARM64ABICall { + AArch64ABICall { sig, uses, defs, @@ -820,7 +830,9 @@ fn adjust_stack(amt: u64, is_sub: bool) -> Vec { } } -impl ABICall for ARM64ABICall { +impl ABICall for AArch64ABICall { + type I = Inst; + fn num_args(&self) -> usize { self.sig.args.len() } @@ -841,14 +853,12 @@ impl ABICall for ARM64ABICall { mem: MemArg::SPOffset(off), srcloc: None, }, - _ => unimplemented!(), } } fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable) -> Inst { match &self.sig.rets[idx] { &ABIArg::Reg(reg, ty) => Inst::gen_move(into_reg, reg.to_reg(), ty), - &ABIArg::RetMem(..) => panic!("Return-memory area not yet supported"), _ => unimplemented!(), } } diff --git a/cranelift/codegen/src/isa/arm64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs similarity index 78% rename from cranelift/codegen/src/isa/arm64/inst/args.rs rename to cranelift/codegen/src/isa/aarch64/inst/args.rs index 75cf12283b0e..b83f375bcf6d 100644 --- a/cranelift/codegen/src/isa/arm64/inst/args.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -1,49 +1,34 @@ -//! ARM64 ISA definitions: instruction arguments. +//! AArch64 ISA definitions: instruction arguments. +// Some variants are never constructed, but we still want them as options in the future. 
#![allow(dead_code)] -#![allow(non_snake_case)] -use crate::binemit::{CodeOffset, CodeSink}; -use crate::ir::constant::{ConstantData, ConstantOffset}; +use crate::binemit::CodeOffset; use crate::ir::Type; -use crate::isa::arm64::inst::*; -use crate::machinst::*; +use crate::isa::aarch64::inst::*; -use regalloc::{ - RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, SpillSlot, VirtualReg, Writable, - NUM_REG_CLASSES, -}; +use regalloc::{RealRegUniverse, Reg, Writable}; -use std::string::{String, ToString}; +use core::convert::{Into, TryFrom}; +use std::string::String; /// A shift operator for a register or immediate. #[derive(Clone, Copy, Debug)] +#[repr(u8)] pub enum ShiftOp { - ASR, - LSR, - LSL, - ROR, + LSL = 0b00, + LSR = 0b01, + ASR = 0b10, + ROR = 0b11, } impl ShiftOp { /// Get the encoding of this shift op. - pub fn bits(&self) -> u8 { - match self { - &ShiftOp::LSL => 0b00, - &ShiftOp::LSR => 0b01, - &ShiftOp::ASR => 0b10, - &ShiftOp::ROR => 0b11, - } + pub fn bits(self) -> u8 { + self as u8 } } -/// A shift operator with an amount, guaranteed to be within range. -#[derive(Clone, Debug)] -pub struct ShiftOpAndAmt { - op: ShiftOp, - shift: ShiftOpShiftImm, -} - /// A shift operator amount. #[derive(Clone, Copy, Debug)] pub struct ShiftOpShiftImm(u8); @@ -62,11 +47,18 @@ impl ShiftOpShiftImm { } /// Return the shift amount. - pub fn value(&self) -> u8 { + pub fn value(self) -> u8 { self.0 } } +/// A shift operator with an amount, guaranteed to be within range. +#[derive(Clone, Debug)] +pub struct ShiftOpAndAmt { + op: ShiftOp, + shift: ShiftOpShiftImm, +} + impl ShiftOpAndAmt { pub fn new(op: ShiftOp, shift: ShiftOpShiftImm) -> ShiftOpAndAmt { ShiftOpAndAmt { op, shift } @@ -74,7 +66,7 @@ impl ShiftOpAndAmt { /// Get the shift op. pub fn op(&self) -> ShiftOp { - self.op.clone() + self.op } /// Get the shift amount. @@ -85,30 +77,22 @@ impl ShiftOpAndAmt { /// An extend operator for a register. #[derive(Clone, Copy, Debug)] +#[repr(u8)] pub enum ExtendOp { - SXTB, - SXTH, - SXTW, - SXTX, - UXTB, - UXTH, - UXTW, - UXTX, + UXTB = 0b000, + UXTH = 0b001, + UXTW = 0b010, + UXTX = 0b011, + SXTB = 0b100, + SXTH = 0b101, + SXTW = 0b110, + SXTX = 0b111, } impl ExtendOp { /// Encoding of this op. - pub fn bits(&self) -> u8 { - match self { - &ExtendOp::UXTB => 0b000, - &ExtendOp::UXTH => 0b001, - &ExtendOp::UXTW => 0b010, - &ExtendOp::UXTX => 0b011, - &ExtendOp::SXTB => 0b100, - &ExtendOp::SXTH => 0b101, - &ExtendOp::SXTW => 0b110, - &ExtendOp::SXTX => 0b111, - } + pub fn bits(self) -> u8 { + self as u8 } } @@ -128,18 +112,34 @@ pub enum MemLabel { #[derive(Clone, Debug)] pub enum MemArg { Label(MemLabel), + /// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation. PostIndexed(Writable, SImm9), + /// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation. PreIndexed(Writable, SImm9), + // N.B.: RegReg, RegScaled, and RegScaledExtended all correspond to // what the ISA calls the "register offset" addressing mode. We split out // several options here for more ergonomic codegen. + /// Register plus register offset. RegReg(Reg, Reg), + + /// Register plus register offset, scaled by type's size. RegScaled(Reg, Reg, Type), + + /// Register plus register offset, scaled by type's size, with index sign- or zero-extended + /// first. RegScaledExtended(Reg, Reg, Type, ExtendOp), + + /// Unscaled signed 9-bit immediate offset from reg. Unscaled(Reg, SImm9), + + /// Scaled (by size of a type) unsigned 12-bit immediate offset from reg. 
UnsignedOffset(Reg, UImm12Scaled), - /// Offset from the stack pointer or frame pointer. + + /// Offset from the stack pointer. Lowered into a real amode at emission. SPOffset(i64), + + /// Offset from the frame pointer. Lowered into a real amode at emission. FPOffset(i64), } @@ -153,9 +153,7 @@ impl MemArg { /// Memory reference using an address in a register and an offset, if possible. pub fn reg_maybe_offset(reg: Reg, offset: i64, value_type: Type) -> Option { - if offset == 0 { - Some(MemArg::Unscaled(reg, SImm9::zero())) - } else if let Some(simm9) = SImm9::maybe_from_i64(offset) { + if let Some(simm9) = SImm9::maybe_from_i64(offset) { Some(MemArg::Unscaled(reg, simm9)) } else if let Some(uimm12s) = UImm12Scaled::maybe_from_i64(offset, value_type) { Some(MemArg::UnsignedOffset(reg, uimm12s)) @@ -165,17 +163,18 @@ impl MemArg { } /// Memory reference using the sum of two registers as an address. - pub fn reg_reg(reg1: Reg, reg2: Reg) -> MemArg { + pub fn reg_plus_reg(reg1: Reg, reg2: Reg) -> MemArg { MemArg::RegReg(reg1, reg2) } /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address. - pub fn reg_reg_scaled(reg1: Reg, reg2: Reg, ty: Type) -> MemArg { + pub fn reg_plus_reg_scaled(reg1: Reg, reg2: Reg, ty: Type) -> MemArg { MemArg::RegScaled(reg1, reg2, ty) } - /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address. - pub fn reg_reg_scaled_extended(reg1: Reg, reg2: Reg, ty: Type, op: ExtendOp) -> MemArg { + /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address, with `reg2` sign- or + /// zero-extended as per `op`. + pub fn reg_plus_reg_scaled_extended(reg1: Reg, reg2: Reg, ty: Type, op: ExtendOp) -> MemArg { MemArg::RegScaledExtended(reg1, reg2, ty, op) } @@ -199,23 +198,24 @@ pub enum PairMemArg { /// Condition for conditional branches. #[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(u8)] pub enum Cond { - Eq, - Ne, - Hs, - Lo, - Mi, - Pl, - Vs, - Vc, - Hi, - Ls, - Ge, - Lt, - Gt, - Le, - Al, - Nv, + Eq = 0, + Ne = 1, + Hs = 2, + Lo = 3, + Mi = 4, + Pl = 5, + Vs = 6, + Vc = 7, + Hi = 8, + Ls = 9, + Ge = 10, + Lt = 11, + Gt = 12, + Le = 13, + Al = 14, + Nv = 15, } impl Cond { @@ -224,18 +224,25 @@ impl Cond { match self { Cond::Eq => Cond::Ne, Cond::Ne => Cond::Eq, + Cond::Hs => Cond::Lo, Cond::Lo => Cond::Hs, + Cond::Mi => Cond::Pl, Cond::Pl => Cond::Mi, + Cond::Vs => Cond::Vc, Cond::Vc => Cond::Vs, + Cond::Hi => Cond::Ls, Cond::Ls => Cond::Hi, + Cond::Ge => Cond::Lt, Cond::Lt => Cond::Ge, + Cond::Gt => Cond::Le, Cond::Le => Cond::Gt, + Cond::Al => Cond::Nv, Cond::Nv => Cond::Al, } @@ -243,24 +250,7 @@ impl Cond { /// Return the machine encoding of this condition. pub fn bits(self) -> u32 { - match self { - Cond::Eq => 0, - Cond::Ne => 1, - Cond::Hs => 2, - Cond::Lo => 3, - Cond::Mi => 4, - Cond::Pl => 5, - Cond::Vs => 6, - Cond::Vc => 7, - Cond::Hi => 8, - Cond::Ls => 9, - Cond::Ge => 10, - Cond::Lt => 11, - Cond::Gt => 12, - Cond::Le => 13, - Cond::Al => 14, - Cond::Nv => 15, - } + self as u32 } } @@ -305,7 +295,7 @@ impl BranchTarget { pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) { match self { &mut BranchTarget::Block(bix) => { - let bix = bix as usize; + let bix = usize::try_from(bix).unwrap(); assert!(bix < targets.len()); let block_offset_in_func = targets[bix]; let branch_offset = (block_offset_in_func as isize) - (my_offset as isize); @@ -343,7 +333,7 @@ impl BranchTarget { } } - /// Get the offset as a 16-bit offset, or `None` if overflow. + /// Get the offset as a 19-bit offset, or `None` if overflow. 
pub fn as_off19(&self) -> Option { let off = self.as_offset_words(); if (off < (1 << 18)) && (off >= -(1 << 18)) { @@ -357,7 +347,7 @@ impl BranchTarget { pub fn map(&mut self, block_index_map: &[BlockIndex]) { match self { &mut BranchTarget::Block(ref mut bix) => { - let n = block_index_map[*bix as usize]; + let n = block_index_map[usize::try_from(*bix).unwrap()]; *bix = n; } &mut BranchTarget::ResolvedOffset(_) => {} @@ -392,7 +382,7 @@ fn shift_for_type(ty: Type) -> usize { 4 => 2, 8 => 3, 16 => 4, - _ => panic!("unknown type"), + _ => panic!("unknown type: {}", ty), } } @@ -427,15 +417,15 @@ impl ShowWithRRU for MemArg { } &MemArg::RegScaledExtended(r1, r2, ty, op) => { let shift = shift_for_type(ty); - let is32 = match op { - ExtendOp::SXTW | ExtendOp::UXTW => true, - _ => false, + let size = match op { + ExtendOp::SXTW | ExtendOp::UXTW => InstSize::Size32, + _ => InstSize::Size64, }; let op = op.show_rru(mb_rru); format!( "[{}, {}, {} #{}]", r1.show_rru(mb_rru), - show_ireg_sized(r2, mb_rru, is32), + show_ireg_sized(r2, mb_rru, size), op, shift ) @@ -499,3 +489,40 @@ impl ShowWithRRU for BranchTarget { } } } + +/// Type used to communicate the operand size of a machine instruction, as AArch64 has 32- and +/// 64-bit variants of many instructions (and integer registers). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum InstSize { + Size32, + Size64, +} + +impl InstSize { + /// 32-bit case? + pub fn is32(self) -> bool { + self == InstSize::Size32 + } + /// 64-bit case? + pub fn is64(self) -> bool { + self == InstSize::Size64 + } + /// Convert from an `is32` boolean flag to an `InstSize`. + pub fn from_is32(is32: bool) -> InstSize { + if is32 { + InstSize::Size32 + } else { + InstSize::Size64 + } + } + /// Convert from a needed width to the smallest size that fits. + pub fn from_bits>(bits: I) -> InstSize { + let bits: usize = bits.into(); + assert!(bits <= 64); + if bits <= 32 { + InstSize::Size32 + } else { + InstSize::Size64 + } + } +} diff --git a/cranelift/codegen/src/isa/arm64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs similarity index 97% rename from cranelift/codegen/src/isa/arm64/inst/emit.rs rename to cranelift/codegen/src/isa/aarch64/inst/emit.rs index 20eefdeaae08..f01746543ce7 100644 --- a/cranelift/codegen/src/isa/arm64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -1,22 +1,14 @@ -//! ARM64 ISA: binary code emission. +//! AArch64 ISA: binary code emission. -#![allow(dead_code)] -#![allow(non_snake_case)] - -use crate::binemit::{CodeOffset, CodeSink, Reloc}; +use crate::binemit::{CodeOffset, Reloc}; use crate::ir::constant::ConstantData; use crate::ir::types::*; -use crate::ir::{Opcode, TrapCode, Type}; -use crate::isa::arm64::inst::*; -use crate::machinst::*; -use cranelift_entity::EntityRef; +use crate::ir::TrapCode; +use crate::isa::aarch64::inst::*; -use std::env; +use core::convert::TryFrom; -use regalloc::{ - RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, SpillSlot, VirtualReg, Writable, - NUM_REG_CLASSES, -}; +use regalloc::{Reg, RegClass, Writable}; use alloc::vec::Vec; @@ -66,16 +58,7 @@ pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg) -> (Vec, MemArg) { /// Helper: get a ConstantData from a u64. 
pub fn u64_constant(bits: u64) -> ConstantData { - let data = [ - (bits & 0xff) as u8, - ((bits >> 8) & 0xff) as u8, - ((bits >> 16) & 0xff) as u8, - ((bits >> 24) & 0xff) as u8, - ((bits >> 32) & 0xff) as u8, - ((bits >> 40) & 0xff) as u8, - ((bits >> 48) & 0xff) as u8, - ((bits >> 56) & 0xff) as u8, - ]; + let data = bits.to_le_bytes(); ConstantData::from(&data[..]) } @@ -84,41 +67,42 @@ pub fn u64_constant(bits: u64) -> ConstantData { fn machreg_to_gpr(m: Reg) -> u32 { assert!(m.get_class() == RegClass::I64); - assert!(m.is_real()); - m.to_real_reg().get_hw_encoding() as u32 + u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap() } fn machreg_to_vec(m: Reg) -> u32 { assert!(m.get_class() == RegClass::V128); - assert!(m.is_real()); - m.to_real_reg().get_hw_encoding() as u32 + u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap() } fn machreg_to_gpr_or_vec(m: Reg) -> u32 { - m.to_real_reg().get_hw_encoding() as u32 + u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap() } -fn enc_arith_rrr(bits_31_21: u16, bits_15_10: u8, rd: Writable, rn: Reg, rm: Reg) -> u32 { - ((bits_31_21 as u32) << 21) - | ((bits_15_10 as u32) << 10) +fn enc_arith_rrr(bits_31_21: u32, bits_15_10: u32, rd: Writable, rn: Reg, rm: Reg) -> u32 { + (bits_31_21 << 21) + | (bits_15_10 << 10) | machreg_to_gpr(rd.to_reg()) | (machreg_to_gpr(rn) << 5) | (machreg_to_gpr(rm) << 16) } -fn enc_arith_rr_imm12(bits_31_24: u8, immshift: u8, imm12: u16, rn: Reg, rd: Writable) -> u32 { - ((bits_31_24 as u32) << 24) - | ((immshift as u32) << 22) - | ((imm12 as u32) << 10) +fn enc_arith_rr_imm12( + bits_31_24: u32, + immshift: u32, + imm12: u32, + rn: Reg, + rd: Writable, +) -> u32 { + (bits_31_24 << 24) + | (immshift << 22) + | (imm12 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg()) } -fn enc_arith_rr_imml(bits_31_23: u16, imm_bits: u16, rn: Reg, rd: Writable) -> u32 { - ((bits_31_23 as u32) << 23) - | ((imm_bits as u32) << 10) - | (machreg_to_gpr(rn) << 5) - | machreg_to_gpr(rd.to_reg()) +fn enc_arith_rr_imml(bits_31_23: u32, imm_bits: u32, rn: Reg, rd: Writable) -> u32 { + (bits_31_23 << 23) | (imm_bits << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg()) } fn enc_arith_rrrr(top11: u32, rm: Reg, bit15: u32, ra: Reg, rn: Reg, rd: Writable) -> u32 { @@ -159,8 +143,8 @@ fn enc_move_wide(op: MoveWideOpcode, rd: Writable, imm: MoveWideConst) -> u assert!(imm.shift <= 0b11); MOVE_WIDE_FIXED | (op as u32) << 29 - | (imm.shift as u32) << 21 - | (imm.bits as u32) << 5 + | u32::from(imm.shift) << 21 + | u32::from(imm.bits) << 5 | machreg_to_gpr(rd.to_reg()) } @@ -201,7 +185,7 @@ fn enc_ldst_reg( Some(ExtendOp::UXTW) => 0b010, Some(ExtendOp::SXTW) => 0b110, Some(ExtendOp::SXTX) => 0b111, - None => 0b011, /* LSL */ + None => 0b011, // LSL _ => panic!("bad extend mode for ld/st MemArg"), }; (op_31_22 << 22) @@ -244,7 +228,7 @@ fn enc_br(rn: Reg) -> u32 { } fn enc_adr(off: i32, rd: Writable) -> u32 { - let off = off as u32; + let off = u32::try_from(off).unwrap(); let immlo = off & 3; let immhi = (off >> 2) & ((1 << 19) - 1); (0b00010000 << 24) | (immlo << 29) | (immhi << 5) | machreg_to_gpr(rd.to_reg()) @@ -258,8 +242,8 @@ fn enc_csel(rd: Writable, rn: Reg, rm: Reg, cond: Cond) -> u32 { | (cond.bits() << 12) } -fn enc_fcsel(rd: Writable, rn: Reg, rm: Reg, cond: Cond, is32: bool) -> u32 { - let ty_bit = if is32 { 0 } else { 1 }; +fn enc_fcsel(rd: Writable, rn: Reg, rm: Reg, cond: Cond, size: InstSize) -> u32 { + let ty_bit = if size.is32() { 0 } else { 1 }; 0b000_11110_00_1_00000_0000_11_00000_00000 | 
(machreg_to_vec(rm) << 16) | (machreg_to_vec(rn) << 5) @@ -301,8 +285,8 @@ fn enc_fpurrrr(top17: u32, rd: Writable, rn: Reg, rm: Reg, ra: Reg) -> u32 | machreg_to_vec(rd.to_reg()) } -fn enc_fcmp(is32: bool, rn: Reg, rm: Reg) -> u32 { - let bits = if is32 { +fn enc_fcmp(size: InstSize, rn: Reg, rm: Reg) -> u32 { + let bits = if size.is32() { 0b000_11110_00_1_00000_00_1000_00000_00000 } else { 0b000_11110_01_1_00000_00_1000_00000_00000 @@ -359,7 +343,7 @@ impl MachInstEmit for Inst { | ALUOp::SMulH | ALUOp::UMulH => { //// RRRR ops. - panic!("Bad ALUOp in RRR form!"); + panic!("Bad ALUOp {:?} in RRR form!", alu_op); } }; let bit15_10 = match alu_op { @@ -450,14 +434,14 @@ impl MachInstEmit for Inst { } => { let amt = immshift.value(); let (top10, immr, imms) = match alu_op { - ALUOp::RotR32 => (0b0001001110, machreg_to_gpr(rn), amt as u32), - ALUOp::RotR64 => (0b1001001111, machreg_to_gpr(rn), amt as u32), - ALUOp::Lsr32 => (0b0101001100, amt as u32, 0b011111), - ALUOp::Lsr64 => (0b1101001101, amt as u32, 0b111111), - ALUOp::Asr32 => (0b0001001100, amt as u32, 0b011111), - ALUOp::Asr64 => (0b1001001101, amt as u32, 0b111111), - ALUOp::Lsl32 => (0b0101001100, (32 - amt) as u32, (31 - amt) as u32), - ALUOp::Lsl64 => (0b1101001101, (64 - amt) as u32, (63 - amt) as u32), + ALUOp::RotR32 => (0b0001001110, machreg_to_gpr(rn), u32::from(amt)), + ALUOp::RotR64 => (0b1001001111, machreg_to_gpr(rn), u32::from(amt)), + ALUOp::Lsr32 => (0b0101001100, u32::from(amt), 0b011111), + ALUOp::Lsr64 => (0b1101001101, u32::from(amt), 0b111111), + ALUOp::Asr32 => (0b0001001100, u32::from(amt), 0b011111), + ALUOp::Asr64 => (0b1001001101, u32::from(amt), 0b111111), + ALUOp::Lsl32 => (0b0101001100, u32::from(32 - amt), u32::from(31 - amt)), + ALUOp::Lsl64 => (0b1101001101, u32::from(64 - amt), u32::from(63 - amt)), _ => unimplemented!("{:?}", alu_op), }; sink.put4( @@ -476,7 +460,7 @@ impl MachInstEmit for Inst { rm, ref shiftop, } => { - let top11: u16 = match alu_op { + let top11: u32 = match alu_op { ALUOp::Add32 => 0b000_01011000, ALUOp::Add64 => 0b100_01011000, ALUOp::AddS32 => 0b001_01011000, @@ -499,8 +483,8 @@ impl MachInstEmit for Inst { ALUOp::AndNot64 => 0b100_01010001, _ => unimplemented!("{:?}", alu_op), }; - let top11 = top11 | ((shiftop.op().bits() as u16) << 1); - let bits_15_10 = shiftop.amt().value(); + let top11 = top11 | (u32::from(shiftop.op().bits()) << 1); + let bits_15_10 = u32::from(shiftop.amt().value()); sink.put4(enc_arith_rrr(top11, bits_15_10, rd, rn, rm)); } @@ -511,7 +495,7 @@ impl MachInstEmit for Inst { rm, extendop, } => { - let top11 = match alu_op { + let top11: u32 = match alu_op { ALUOp::Add32 => 0b00001011001, ALUOp::Add64 => 0b10001011001, ALUOp::Sub32 => 0b01001011001, @@ -522,12 +506,12 @@ impl MachInstEmit for Inst { ALUOp::SubS64 => 0b11101011001, _ => unimplemented!("{:?}", alu_op), }; - let bits_15_10 = extendop.bits() << 3; + let bits_15_10 = u32::from(extendop.bits()) << 3; sink.put4(enc_arith_rrr(top11, bits_15_10, rd, rn, rm)); } &Inst::BitRR { op, rd, rn, .. 
} => { - let size = if op.is_32_bit() { 0b0 } else { 0b1 }; + let size = if op.inst_size().is32() { 0b0 } else { 0b1 }; let (op1, op2) = match op { BitOp::RBit32 | BitOp::RBit64 => (0b00000, 0b000000), BitOp::Clz32 | BitOp::Clz64 => (0b00000, 0b000100), @@ -655,6 +639,7 @@ impl MachInstEmit for Inst { } &MemArg::Label(ref label) => { let offset = match label { + // cast i32 to u32 (two's-complement) &MemLabel::PCRel(off) => off as u32, } / 4; assert!(offset < (1 << 19)); @@ -825,10 +810,16 @@ impl MachInstEmit for Inst { &Inst::Mov { rd, rm } => { assert!(rd.to_reg().get_class() == rm.get_class()); assert!(rm.get_class() == RegClass::I64); + // MOV to SP is interpreted as MOV to XZR instead. And our codegen + // should never MOV to XZR. + assert!(machreg_to_gpr(rd.to_reg()) != 31); // Encoded as ORR rd, rm, zero. sink.put4(enc_arith_rrr(0b10101010_000, 0b000_000, rd, zero_reg(), rm)); } &Inst::Mov32 { rd, rm } => { + // MOV to SP is interpreted as MOV to XZR instead. And our codegen + // should never MOV to XZR. + assert!(machreg_to_gpr(rd.to_reg()) != 31); // Encoded as ORR rd, rm, zero. sink.put4(enc_arith_rrr(0b00101010_000, 0b000_000, rd, zero_reg(), rm)); } @@ -888,10 +879,10 @@ impl MachInstEmit for Inst { sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra)); } &Inst::FpuCmp32 { rn, rm } => { - sink.put4(enc_fcmp(/* is32 = */ true, rn, rm)); + sink.put4(enc_fcmp(InstSize::Size32, rn, rm)); } &Inst::FpuCmp64 { rn, rm } => { - sink.put4(enc_fcmp(/* is32 = */ false, rn, rm)); + sink.put4(enc_fcmp(InstSize::Size64, rn, rm)); } &Inst::FpuToInt { op, rd, rn } => { let top16 = match op { @@ -962,10 +953,10 @@ impl MachInstEmit for Inst { sink.put8(const_data.to_bits()); } &Inst::FpuCSel32 { rd, rn, rm, cond } => { - sink.put4(enc_fcsel(rd, rn, rm, cond, /* is32 = */ true)); + sink.put4(enc_fcsel(rd, rn, rm, cond, InstSize::Size32)); } &Inst::FpuCSel64 { rd, rn, rm, cond } => { - sink.put4(enc_fcsel(rd, rn, rm, cond, /* is32 = */ false)); + sink.put4(enc_fcsel(rd, rn, rm, cond, InstSize::Size64)); } &Inst::FpuRound { op, rd, rn } => { let top22 = match op { @@ -1093,10 +1084,10 @@ impl MachInstEmit for Inst { // do early (fake) emission for size computation. sink.put4(enc_jump26(0b000101, dest.as_off26().unwrap())); } - &Inst::Ret {} => { + &Inst::Ret => { sink.put4(0xd65f03c0); } - &Inst::EpiloguePlaceholder {} => { + &Inst::EpiloguePlaceholder => { // Noop; this is just a placeholder for epilogues. } &Inst::Call { @@ -1168,7 +1159,7 @@ impl MachInstEmit for Inst { &Inst::IndirectBr { rn, .. } => { sink.put4(enc_br(rn)); } - &Inst::Nop => {} + &Inst::Nop0 => {} &Inst::Nop4 => { sink.put4(0xd503201f); } @@ -1204,7 +1195,7 @@ impl MachInstEmit for Inst { // the middle; we depend on hardcoded PC-rel addressing below. // // N.B.: if PC-rel addressing on ADR below is changed, also update - // `Inst::with_block_offsets()` in arm64/inst/mod.rs. + // `Inst::with_block_offsets()` in aarch64/inst/mod.rs. // Save index in a tmp (the live range of ridx only goes to start of this // sequence; rtmp1 or rtmp2 may overwrite it). @@ -1219,7 +1210,7 @@ impl MachInstEmit for Inst { // Load value out of jump table let inst = Inst::SLoad32 { rd: rtmp2, - mem: MemArg::reg_reg_scaled_extended( + mem: MemArg::reg_plus_reg_scaled_extended( rtmp1.to_reg(), rtmp2.to_reg(), I32, @@ -1246,7 +1237,9 @@ impl MachInstEmit for Inst { // Emit jump table (table of 32-bit offsets). 
for target in targets { let off = target.as_offset_words() * 4; - let off = off as i32 as u32; + let off = i32::try_from(off).unwrap(); + // cast i32 to u32 (two's-complement) + let off = off as u32; sink.put4(off); } } @@ -1292,7 +1285,7 @@ mod test { use crate::isa::test_utils; #[test] - fn test_arm64_binemit() { + fn test_aarch64_binemit() { let mut insns = Vec::<(Inst, &str, &str)>::new(); // N.B.: the architecture is little-endian, so when transcribing the 32-bit @@ -1310,10 +1303,10 @@ mod test { // // Then: // - // $ echo "mov x1, x2" | arm64inst.sh - insns.push((Inst::Ret {}, "C0035FD6", "ret")); - insns.push((Inst::Nop {}, "", "nop-zero-len")); - insns.push((Inst::Nop4 {}, "1F2003D5", "nop")); + // $ echo "mov x1, x2" | aarch64inst.sh + insns.push((Inst::Ret, "C0035FD6", "ret")); + insns.push((Inst::Nop0, "", "nop-zero-len")); + insns.push((Inst::Nop4, "1F2003D5", "nop")); insns.push(( Inst::AluRRR { alu_op: ALUOp::Add32, @@ -4052,7 +4045,7 @@ mod test { let rru = create_reg_universe(); for (insn, expected_encoding, expected_printing) in insns { println!( - "ARM64: {:?}, {}, {}", + "AArch64: {:?}, {}, {}", insn, expected_encoding, expected_printing ); diff --git a/cranelift/codegen/src/isa/arm64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs similarity index 91% rename from cranelift/codegen/src/isa/arm64/inst/imms.rs rename to cranelift/codegen/src/isa/aarch64/inst/imms.rs index eda68af7b12a..7230b4f44e3f 100644 --- a/cranelift/codegen/src/isa/arm64/inst/imms.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs @@ -1,8 +1,7 @@ -//! ARM64 ISA definitions: immediate constants. - -#![allow(dead_code)] -#![allow(non_snake_case)] +//! AArch64 ISA definitions: immediate constants. +// Some variants are never constructed, but we still want them as options in the future. +#[allow(dead_code)] use crate::ir::types::*; use crate::ir::Type; use crate::machinst::*; @@ -28,12 +27,12 @@ impl SImm7Scaled { assert!(scale_ty == I64 || scale_ty == I32); let scale = scale_ty.bytes(); assert!(scale.is_power_of_two()); - let scale = scale as i64; + let scale = i64::from(scale); let upper_limit = 63 * scale; let lower_limit = -(64 * scale); if value >= lower_limit && value <= upper_limit && (value & (scale - 1)) == 0 { Some(SImm7Scaled { - value: value as i16, + value: i16::try_from(value).unwrap(), scale_ty, }) } else { @@ -48,7 +47,12 @@ impl SImm7Scaled { /// Bits for encoding. pub fn bits(&self) -> u32 { - ((self.value / self.scale_ty.bytes() as i16) as u32) & 0x7f + let ty_bytes: i16 = self.scale_ty.bytes() as i16; + let scaled: i16 = self.value / ty_bytes; + assert!(scaled <= 63 && scaled >= -64); + let scaled: i8 = scaled as i8; + let encoded: u32 = scaled as u32; + encoded & 0x7f } } @@ -125,7 +129,7 @@ impl UImm12Scaled { #[derive(Clone, Debug)] pub struct Imm12 { /// The immediate bits. - pub bits: usize, + pub bits: u16, /// Whether the immediate bits are shifted left by 12 or not. pub shift12: bool, } @@ -140,12 +144,12 @@ impl Imm12 { }) } else if val < 0xfff { Some(Imm12 { - bits: val as usize, + bits: val as u16, shift12: false, }) } else if val < 0xfff_000 && (val & 0xfff == 0) { Some(Imm12 { - bits: (val as usize) >> 12, + bits: (val >> 12) as u16, shift12: true, }) } else { @@ -154,7 +158,7 @@ impl Imm12 { } /// Bits for 2-bit "shift" field in e.g. AddI. - pub fn shift_bits(&self) -> u8 { + pub fn shift_bits(&self) -> u32 { if self.shift12 { 0b01 } else { @@ -163,8 +167,8 @@ impl Imm12 { } /// Bits for 12-bit "imm" field in e.g. AddI. 
- pub fn imm_bits(&self) -> u16 { - self.bits as u16 + pub fn imm_bits(&self) -> u32 { + self.bits as u32 } } @@ -175,11 +179,11 @@ pub struct ImmLogic { /// The actual value. value: u64, /// `N` flag. - pub N: bool, + pub n: bool, /// `S` field: element size and element bits. - pub R: u8, + pub r: u8, /// `R` field: rotate amount. - pub S: u8, + pub s: u8, } impl ImmLogic { @@ -367,24 +371,19 @@ impl ImmLogic { debug_assert!(u8::try_from(s).is_ok()); Some(ImmLogic { value: original_value, - N: out_n != 0, - R: r as u8, - S: s as u8, + n: out_n != 0, + r: r as u8, + s: s as u8, }) } pub fn from_raw(value: u64, n: bool, r: u8, s: u8) -> ImmLogic { - ImmLogic { - N: n, - R: r, - S: s, - value, - } + ImmLogic { n, r, s, value } } /// Returns bits ready for encoding: (N:1, R:6, S:6) - pub fn enc_bits(&self) -> u16 { - ((self.N as u16) << 12) | ((self.R as u16) << 6) | (self.S as u16) + pub fn enc_bits(&self) -> u32 { + ((self.n as u32) << 12) | ((self.r as u32) << 6) | (self.s as u32) } /// Returns the value that this immediate represents. @@ -427,7 +426,7 @@ impl ImmShift { pub struct MoveWideConst { /// The value. pub bits: u16, - /// shifted 16*shift bits to the left. + /// Result is `bits` shifted 16*shift bits to the left. pub shift: u8, } @@ -487,7 +486,7 @@ impl MoveWideConst { impl ShowWithRRU for Imm12 { fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { let shift = if self.shift12 { 12 } else { 0 }; - let value = self.bits << shift; + let value = u32::from(self.bits) << shift; format!("#{}", value) } } @@ -544,9 +543,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 1, - N: true, - R: 0, - S: 0 + n: true, + r: 0, + s: 0 }), ImmLogic::maybe_from_u64(1, I64) ); @@ -554,9 +553,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 2, - N: true, - R: 63, - S: 0 + n: true, + r: 63, + s: 0 }), ImmLogic::maybe_from_u64(2, I64) ); @@ -568,9 +567,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 248, - N: true, - R: 61, - S: 4 + n: true, + r: 61, + s: 4 }), ImmLogic::maybe_from_u64(248, I64) ); @@ -580,9 +579,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 1920, - N: true, - R: 57, - S: 3 + n: true, + r: 57, + s: 3 }), ImmLogic::maybe_from_u64(1920, I64) ); @@ -590,9 +589,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 0x7ffe, - N: true, - R: 63, - S: 13 + n: true, + r: 63, + s: 13 }), ImmLogic::maybe_from_u64(0x7ffe, I64) ); @@ -600,9 +599,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 0x30000, - N: true, - R: 48, - S: 1 + n: true, + r: 48, + s: 1 }), ImmLogic::maybe_from_u64(0x30000, I64) ); @@ -610,9 +609,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 0x100000, - N: true, - R: 44, - S: 0 + n: true, + r: 44, + s: 0 }), ImmLogic::maybe_from_u64(0x100000, I64) ); @@ -620,9 +619,9 @@ mod test { assert_eq!( Some(ImmLogic { value: u64::max_value() - 1, - N: true, - R: 63, - S: 62 + n: true, + r: 63, + s: 62 }), ImmLogic::maybe_from_u64(u64::max_value() - 1, I64) ); @@ -630,9 +629,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 0xaaaaaaaaaaaaaaaa, - N: false, - R: 1, - S: 60 + n: false, + r: 1, + s: 60 }), ImmLogic::maybe_from_u64(0xaaaaaaaaaaaaaaaa, I64) ); @@ -640,9 +639,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 0x8181818181818181, - N: false, - R: 1, - S: 49 + n: false, + r: 1, + s: 49 }), ImmLogic::maybe_from_u64(0x8181818181818181, I64) ); @@ -650,9 +649,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 0xffc3ffc3ffc3ffc3, - N: false, - R: 10, - S: 43 + n: false, + r: 10, + s: 43 }), ImmLogic::maybe_from_u64(0xffc3ffc3ffc3ffc3, I64) ); @@ -660,9 +659,9 
@@ mod test { assert_eq!( Some(ImmLogic { value: 0x100000001, - N: false, - R: 0, - S: 0 + n: false, + r: 0, + s: 0 }), ImmLogic::maybe_from_u64(0x100000001, I64) ); @@ -670,9 +669,9 @@ mod test { assert_eq!( Some(ImmLogic { value: 0x1111111111111111, - N: false, - R: 0, - S: 56 + n: false, + r: 0, + s: 56 }), ImmLogic::maybe_from_u64(0x1111111111111111, I64) ); diff --git a/cranelift/codegen/src/isa/arm64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs similarity index 66% rename from cranelift/codegen/src/isa/arm64/inst/mod.rs rename to cranelift/codegen/src/isa/aarch64/inst/mod.rs index ecc948cc706a..44da584b444d 100644 --- a/cranelift/codegen/src/isa/arm64/inst/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -1,28 +1,19 @@ -//! This module defines arm64-specific machine instruction types. +//! This module defines aarch64-specific machine instruction types. -#![allow(non_snake_case)] -#![allow(unused_imports)] -#![allow(non_camel_case_types)] +// Some variants are not constructed, but we still want them as options in the future. #![allow(dead_code)] use crate::binemit::CodeOffset; -use crate::ir::constant::{ConstantData, ConstantOffset}; -use crate::ir::types::{ - B1, B128, B16, B32, B64, B8, F32, F64, FFLAGS, I128, I16, I32, I64, I8, IFLAGS, -}; -use crate::ir::{ExternalName, GlobalValue, JumpTable, Opcode, SourceLoc, TrapCode, Type}; +use crate::ir::types::{B1, B16, B32, B64, B8, F32, F64, FFLAGS, I16, I32, I64, I8, IFLAGS}; +use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type}; use crate::machinst::*; use regalloc::Map as RegallocMap; -use regalloc::{ - RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, SpillSlot, VirtualReg, Writable, - NUM_REG_CLASSES, -}; +use regalloc::{RealReg, RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable}; use regalloc::{RegUsageCollector, Set}; use alloc::vec::Vec; use smallvec::{smallvec, SmallVec}; -use std::mem; use std::string::{String, ToString}; pub mod regs; @@ -47,25 +38,43 @@ pub enum ALUOp { Sub64, Orr32, Orr64, + /// NOR OrrNot32, + /// NOR OrrNot64, And32, And64, + /// NAND AndNot32, + /// NAND AndNot64, + /// XOR (AArch64 calls this "EOR") Eor32, + /// XOR (AArch64 calls this "EOR") Eor64, + /// XNOR (AArch64 calls this "EOR-NOT") EorNot32, + /// XNOR (AArch64 calls this "EOR-NOT") EorNot64, + /// Add, setting flags AddS32, + /// Add, setting flags AddS64, + /// Sub, setting flags SubS32, + /// Sub, setting flags SubS64, - MAdd32, // multiply-add + /// Multiply-add + MAdd32, + /// Multiply-add MAdd64, + /// Multiply-sub MSub32, + /// Multiply-sub MSub64, + /// Signed multiply, high-word result SMulH, + /// Unsigned multiply, high-word result UMulH, SDiv64, UDiv64, @@ -159,17 +168,23 @@ pub enum FpuRoundMode { /// A vector ALU operation. #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub enum VecALUOp { - SQAddScalar, // signed saturating add - UQAddScalar, // unsigned saturating add - SQSubScalar, // signed saturating subtract - UQSubScalar, // unsigned saturating subtract + /// Signed saturating add + SQAddScalar, + /// Unsigned saturating add + UQAddScalar, + /// Signed saturating subtract + SQSubScalar, + /// Unsigned saturating subtract + UQSubScalar, } /// An operation on the bits of a register. This can be paired with several instruction formats /// below (see `Inst`) in any combination. 
#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub enum BitOp { + /// Bit reverse RBit32, + /// Bit reverse RBit64, Clz32, Clz64, @@ -178,13 +193,11 @@ pub enum BitOp { } impl BitOp { - /// Is the opcode a 32-bit operation. - pub fn is_32_bit(&self) -> bool { + /// What is the opcode's native width? + pub fn inst_size(&self) -> InstSize { match self { - BitOp::RBit32 => true, - BitOp::Clz32 => true, - BitOp::Cls32 => true, - _ => false, + BitOp::RBit32 | BitOp::Clz32 | BitOp::Cls32 => InstSize::Size32, + _ => InstSize::Size64, } } @@ -217,7 +230,7 @@ impl From<(Opcode, Type)> for BitOp { #[derive(Clone, Debug)] pub enum Inst { /// A no-op of zero size. - Nop, + Nop0, /// A no-op that is one instruction large. Nop4, @@ -465,32 +478,37 @@ pub enum Inst { rm: Reg, }, - /// Floating-point loads and stores. + /// Floating-point load, single-precision (32 bit). FpuLoad32 { rd: Writable, mem: MemArg, srcloc: Option, }, + /// Floating-point store, single-precision (32 bit). FpuStore32 { rd: Reg, mem: MemArg, srcloc: Option, }, + /// Floating-point load, double-precision (64 bit). FpuLoad64 { rd: Writable, mem: MemArg, srcloc: Option, }, + /// Floating-point store, double-precision (64 bit). FpuStore64 { rd: Reg, mem: MemArg, srcloc: Option, }, + /// Floating-point/vector load, 128 bit. FpuLoad128 { rd: Writable, mem: MemArg, srcloc: Option, }, + /// Floating-point/vector store, 128 bit. FpuStore128 { rd: Reg, mem: MemArg, @@ -507,26 +525,28 @@ pub enum Inst { const_data: f64, }, - /// Conversions between FP and integer values. + /// Conversion: FP -> integer. FpuToInt { op: FpuToIntOp, rd: Writable, rn: Reg, }, + /// Conversion: integer -> FP. IntToFpu { op: IntToFpuOp, rd: Writable, rn: Reg, }, - // FP conditional select. + /// FP conditional select, 32 bit. FpuCSel32 { rd: Writable, rn: Reg, rm: Reg, cond: Cond, }, + /// FP conditional select, 64 bit. FpuCSel64 { rd: Writable, rn: Reg, @@ -534,7 +554,7 @@ pub enum Inst { cond: Cond, }, - // Round to integer. + /// Round to integer. FpuRound { op: FpuRoundMode, rd: Writable, @@ -596,11 +616,11 @@ pub enum Inst { // ---- branches (exactly one must appear at end of BB) ---- /// A machine return instruction. - Ret {}, + Ret, /// A placeholder instruction, generating no code, meaning that a function epilogue must be /// inserted there. - EpiloguePlaceholder {}, + EpiloguePlaceholder, /// An unconditional branch. Jump { @@ -689,7 +709,7 @@ pub enum Inst { }, } -fn count_clear_half_words(mut value: u64) -> usize { +fn count_zero_half_words(mut value: u64) -> usize { let mut count = 0; for _ in 0..4 { if value & 0xffff == 0 { @@ -748,7 +768,7 @@ impl Inst { // If the number of 0xffff half words is greater than the number of 0x0000 half words // it is more efficient to use `movn` for the first instruction. - let first_is_inverted = count_clear_half_words(!value) > count_clear_half_words(value); + let first_is_inverted = count_zero_half_words(!value) > count_zero_half_words(value); // Either 0xffff or 0x0000 half words can be skipped, depending on the first // instruction used. let ignored_halfword = if first_is_inverted { 0xffff } else { 0 }; @@ -839,7 +859,7 @@ fn pairmemarg_regs(pairmemarg: &PairMemArg, collector: &mut RegUsageCollector) { } } -fn arm64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { +fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { match inst { &Inst::AluRRR { rd, rn, rm, .. 
} => { collector.add_def(rd); @@ -1024,7 +1044,7 @@ fn arm64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { collector.add_def(rd); collector.add_use(rn); } - &Inst::Jump { .. } | &Inst::Ret { .. } | &Inst::EpiloguePlaceholder { .. } => {} + &Inst::Jump { .. } | &Inst::Ret | &Inst::EpiloguePlaceholder => {} &Inst::Call { ref uses, ref defs, .. } => { @@ -1052,7 +1072,7 @@ fn arm64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { &Inst::IndirectBr { rn, .. } => { collector.add_use(rn); } - &Inst::Nop | Inst::Nop4 => {} + &Inst::Nop0 | Inst::Nop4 => {} &Inst::Brk => {} &Inst::Udf { .. } => {} &Inst::Adr { rd, .. } => { @@ -1075,548 +1095,555 @@ fn arm64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { //============================================================================= // Instructions: map_regs -fn arm64_map_regs( +fn aarch64_map_regs( inst: &mut Inst, pre_map: &RegallocMap, post_map: &RegallocMap, ) { - fn map(m: &RegallocMap, r: Reg) -> Reg { + fn map(m: &RegallocMap, r: &mut Reg) { if r.is_virtual() { - m.get(&r.to_virtual_reg()).cloned().unwrap().to_reg() - } else { - r + let new = m.get(&r.to_virtual_reg()).cloned().unwrap().to_reg(); + *r = new; } } - fn map_wr(m: &RegallocMap, r: Writable) -> Writable { - Writable::from_reg(map(m, r.to_reg())) + fn map_wr(m: &RegallocMap, r: &mut Writable) { + let mut reg = r.to_reg(); + map(m, &mut reg); + *r = Writable::from_reg(reg); } - fn map_mem(u: &RegallocMap, mem: &MemArg) -> MemArg { + fn map_mem(u: &RegallocMap, mem: &mut MemArg) { // N.B.: we take only the pre-map here, but this is OK because the // only addressing modes that update registers (pre/post-increment on - // ARM64) both read and write registers, so they are "mods" rather + // AArch64) both read and write registers, so they are "mods" rather // than "defs", so must be the same in both the pre- and post-map. match mem { - &MemArg::Unscaled(reg, simm9) => MemArg::Unscaled(map(u, reg), simm9), - &MemArg::UnsignedOffset(reg, uimm12) => MemArg::UnsignedOffset(map(u, reg), uimm12), - &MemArg::RegReg(r1, r2) => MemArg::RegReg(map(u, r1), map(u, r2)), - &MemArg::RegScaled(r1, r2, ty) => MemArg::RegScaled(map(u, r1), map(u, r2), ty), - &MemArg::RegScaledExtended(r1, r2, ty, op) => { - MemArg::RegScaledExtended(map(u, r1), map(u, r2), ty, op) - } - &MemArg::Label(ref l) => MemArg::Label(l.clone()), - &MemArg::PreIndexed(r, simm9) => MemArg::PreIndexed(map_wr(u, r), simm9), - &MemArg::PostIndexed(r, simm9) => MemArg::PostIndexed(map_wr(u, r), simm9), - &MemArg::FPOffset(off) => MemArg::FPOffset(off), - &MemArg::SPOffset(off) => MemArg::SPOffset(off), - } + &mut MemArg::Unscaled(ref mut reg, ..) => map(u, reg), + &mut MemArg::UnsignedOffset(ref mut reg, ..) => map(u, reg), + &mut MemArg::RegReg(ref mut r1, ref mut r2) => { + map(u, r1); + map(u, r2); + } + &mut MemArg::RegScaled(ref mut r1, ref mut r2, ..) => { + map(u, r1); + map(u, r2); + } + &mut MemArg::RegScaledExtended(ref mut r1, ref mut r2, ..) => { + map(u, r1); + map(u, r2); + } + &mut MemArg::Label(..) => {} + &mut MemArg::PreIndexed(ref mut r, ..) => map_wr(u, r), + &mut MemArg::PostIndexed(ref mut r, ..) => map_wr(u, r), + &mut MemArg::FPOffset(..) | &mut MemArg::SPOffset(..) 
=> {} + }; } - fn map_pairmem(u: &RegallocMap, mem: &PairMemArg) -> PairMemArg { + fn map_pairmem(u: &RegallocMap, mem: &mut PairMemArg) { match mem { - &PairMemArg::SignedOffset(reg, simm7) => PairMemArg::SignedOffset(map(u, reg), simm7), - &PairMemArg::PreIndexed(reg, simm7) => PairMemArg::PreIndexed(map_wr(u, reg), simm7), - &PairMemArg::PostIndexed(reg, simm7) => PairMemArg::PostIndexed(map_wr(u, reg), simm7), + &mut PairMemArg::SignedOffset(ref mut reg, ..) => map(u, reg), + &mut PairMemArg::PreIndexed(ref mut reg, ..) => map_wr(u, reg), + &mut PairMemArg::PostIndexed(ref mut reg, ..) => map_wr(u, reg), } } - fn map_br(u: &RegallocMap, br: &CondBrKind) -> CondBrKind { + fn map_br(u: &RegallocMap, br: &mut CondBrKind) { match br { - &CondBrKind::Zero(reg) => CondBrKind::Zero(map(u, reg)), - &CondBrKind::NotZero(reg) => CondBrKind::NotZero(map(u, reg)), - &CondBrKind::Cond(c) => CondBrKind::Cond(c), - } + &mut CondBrKind::Zero(ref mut reg) => map(u, reg), + &mut CondBrKind::NotZero(ref mut reg) => map(u, reg), + &mut CondBrKind::Cond(..) => {} + }; } let u = pre_map; // For brevity below. let d = post_map; - let newval = match inst { - &mut Inst::AluRRR { alu_op, rd, rn, rm } => Inst::AluRRR { - alu_op, - rd: map_wr(d, rd), - rn: map(u, rn), - rm: map(u, rm), - }, + match inst { + &mut Inst::AluRRR { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } &mut Inst::AluRRRR { - alu_op, - rd, - rn, - rm, - ra, - } => Inst::AluRRRR { - alu_op, - rd: map_wr(d, rd), - rn: map(u, rn), - rm: map(u, rm), - ra: map(u, ra), - }, + ref mut rd, + ref mut rn, + ref mut rm, + ref mut ra, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + map(u, ra); + } &mut Inst::AluRRImm12 { - alu_op, - rd, - rn, - ref imm12, - } => Inst::AluRRImm12 { - alu_op, - rd: map_wr(d, rd), - rn: map(u, rn), - imm12: imm12.clone(), - }, + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } &mut Inst::AluRRImmLogic { - alu_op, - rd, - rn, - ref imml, - } => Inst::AluRRImmLogic { - alu_op, - rd: map_wr(d, rd), - rn: map(u, rn), - imml: imml.clone(), - }, + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } &mut Inst::AluRRImmShift { - alu_op, - rd, - rn, - ref immshift, - } => Inst::AluRRImmShift { - alu_op, - rd: map_wr(d, rd), - rn: map(u, rn), - immshift: immshift.clone(), - }, + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } &mut Inst::AluRRRShift { - alu_op, - rd, - rn, - rm, - ref shiftop, - } => Inst::AluRRRShift { - alu_op, - rd: map_wr(d, rd), - rn: map(u, rn), - rm: map(u, rm), - shiftop: shiftop.clone(), - }, + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } &mut Inst::AluRRRExtend { - alu_op, - rd, - rn, - rm, - ref extendop, - } => Inst::AluRRRExtend { - alu_op, - rd: map_wr(d, rd), - rn: map(u, rn), - rm: map(u, rm), - extendop: extendop.clone(), - }, - &mut Inst::BitRR { op, rd, rn } => Inst::BitRR { - op, - rd: map_wr(d, rd), - rn: map(u, rn), - }, + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::BitRR { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } &mut Inst::ULoad8 { - rd, - ref mem, - srcloc, - } => Inst::ULoad8 { - rd: map_wr(d, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. 
+ } => { + map_wr(d, rd); + map_mem(u, mem); + } &mut Inst::SLoad8 { - rd, - ref mem, - srcloc, - } => Inst::SLoad8 { - rd: map_wr(d, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } &mut Inst::ULoad16 { - rd, - ref mem, - srcloc, - } => Inst::ULoad16 { - rd: map_wr(d, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } &mut Inst::SLoad16 { - rd, - ref mem, - srcloc, - } => Inst::SLoad16 { - rd: map_wr(d, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } &mut Inst::ULoad32 { - rd, - ref mem, - srcloc, - } => Inst::ULoad32 { - rd: map_wr(d, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } &mut Inst::SLoad32 { - rd, - ref mem, - srcloc, - } => Inst::SLoad32 { - rd: map_wr(d, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + &mut Inst::ULoad64 { - rd, - ref mem, - srcloc, - } => Inst::ULoad64 { - rd: map_wr(d, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } &mut Inst::Store8 { - rd, - ref mem, - srcloc, - } => Inst::Store8 { - rd: map(u, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } &mut Inst::Store16 { - rd, - ref mem, - srcloc, - } => Inst::Store16 { - rd: map(u, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } &mut Inst::Store32 { - rd, - ref mem, - srcloc, - } => Inst::Store32 { - rd: map(u, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } &mut Inst::Store64 { - rd, - ref mem, - srcloc, - } => Inst::Store64 { - rd: map(u, rd), - mem: map_mem(u, mem), - srcloc, - }, - &mut Inst::StoreP64 { rt, rt2, ref mem } => Inst::StoreP64 { - rt: map(u, rt), - rt2: map(u, rt2), - mem: map_pairmem(u, mem), - }, - &mut Inst::LoadP64 { rt, rt2, ref mem } => Inst::LoadP64 { - rt: map_wr(d, rt), - rt2: map_wr(d, rt2), - mem: map_pairmem(u, mem), - }, - &mut Inst::Mov { rd, rm } => Inst::Mov { - rd: map_wr(d, rd), - rm: map(u, rm), - }, - &mut Inst::Mov32 { rd, rm } => Inst::Mov32 { - rd: map_wr(d, rd), - rm: map(u, rm), - }, - &mut Inst::MovZ { rd, ref imm } => Inst::MovZ { - rd: map_wr(d, rd), - imm: imm.clone(), - }, - &mut Inst::MovN { rd, ref imm } => Inst::MovN { - rd: map_wr(d, rd), - imm: imm.clone(), - }, - &mut Inst::MovK { rd, ref imm } => Inst::MovK { - rd: map_wr(d, rd), - imm: imm.clone(), - }, - &mut Inst::CSel { rd, rn, rm, cond } => Inst::CSel { - cond, - rd: map_wr(d, rd), - rn: map(u, rn), - rm: map(u, rm), - }, - &mut Inst::CSet { rd, cond } => Inst::CSet { - cond, - rd: map_wr(d, rd), - }, - &mut Inst::FpuMove64 { rd, rn } => Inst::FpuMove64 { - rd: map_wr(d, rd), - rn: map(u, rn), - }, - &mut Inst::FpuRR { fpu_op, rd, rn } => Inst::FpuRR { - fpu_op, - rd: map_wr(d, rd), - rn: map(u, rn), - }, - &mut Inst::FpuRRR { fpu_op, rd, rn, rm } => Inst::FpuRRR { - fpu_op, - rd: map_wr(d, rd), - rn: map(u, rn), - rm: map(u, rm), - }, + ref mut rd, + ref mut mem, + .. 
+ } => { + map(u, rd); + map_mem(u, mem); + } + + &mut Inst::StoreP64 { + ref mut rt, + ref mut rt2, + ref mut mem, + } => { + map(u, rt); + map(u, rt2); + map_pairmem(u, mem); + } + &mut Inst::LoadP64 { + ref mut rt, + ref mut rt2, + ref mut mem, + } => { + map_wr(d, rt); + map_wr(d, rt2); + map_pairmem(u, mem); + } + &mut Inst::Mov { + ref mut rd, + ref mut rm, + } => { + map_wr(d, rd); + map(u, rm); + } + &mut Inst::Mov32 { + ref mut rd, + ref mut rm, + } => { + map_wr(d, rd); + map(u, rm); + } + &mut Inst::MovZ { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::MovN { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::MovK { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::CSel { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::CSet { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::FpuMove64 { + ref mut rd, + ref mut rn, + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::FpuRR { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::FpuRRR { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } &mut Inst::FpuRRRR { - fpu_op, - rd, - rn, - rm, - ra, - } => Inst::FpuRRRR { - fpu_op, - rd: map_wr(d, rd), - rn: map(u, rn), - rm: map(u, rm), - ra: map(u, ra), - }, - &mut Inst::FpuCmp32 { rn, rm } => Inst::FpuCmp32 { - rn: map(u, rn), - rm: map(u, rm), - }, - &mut Inst::FpuCmp64 { rn, rm } => Inst::FpuCmp64 { - rn: map(u, rn), - rm: map(u, rm), - }, + ref mut rd, + ref mut rn, + ref mut rm, + ref mut ra, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + map(u, ra); + } + &mut Inst::FpuCmp32 { + ref mut rn, + ref mut rm, + } => { + map(u, rn); + map(u, rm); + } + &mut Inst::FpuCmp64 { + ref mut rn, + ref mut rm, + } => { + map(u, rn); + map(u, rm); + } &mut Inst::FpuLoad32 { - rd, - ref mem, - srcloc, - } => Inst::FpuLoad32 { - rd: map_wr(d, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } &mut Inst::FpuLoad64 { - rd, - ref mem, - srcloc, - } => Inst::FpuLoad64 { - rd: map_wr(d, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } &mut Inst::FpuLoad128 { - rd, - ref mem, - srcloc, - } => Inst::FpuLoad64 { - rd: map_wr(d, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } &mut Inst::FpuStore32 { - rd, - ref mem, - srcloc, - } => Inst::FpuStore32 { - rd: map(u, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } &mut Inst::FpuStore64 { - rd, - ref mem, - srcloc, - } => Inst::FpuStore64 { - rd: map(u, rd), - mem: map_mem(u, mem), - srcloc, - }, + ref mut rd, + ref mut mem, + .. 
+ } => { + map(u, rd); + map_mem(u, mem); + } &mut Inst::FpuStore128 { - rd, - ref mem, - srcloc, - } => Inst::FpuStore64 { - rd: map(u, rd), - mem: map_mem(u, mem), - srcloc, - }, - &mut Inst::LoadFpuConst32 { rd, const_data } => Inst::LoadFpuConst32 { - rd: map_wr(d, rd), - const_data, - }, - &mut Inst::LoadFpuConst64 { rd, const_data } => Inst::LoadFpuConst64 { - rd: map_wr(d, rd), - const_data, - }, - &mut Inst::FpuToInt { op, rd, rn } => Inst::FpuToInt { - op, - rd: map_wr(d, rd), - rn: map(u, rn), - }, - &mut Inst::IntToFpu { op, rd, rn } => Inst::IntToFpu { - op, - rd: map_wr(d, rd), - rn: map(u, rn), - }, - &mut Inst::FpuCSel32 { rd, rn, rm, cond } => Inst::FpuCSel32 { - cond, - rd: map_wr(d, rd), - rn: map(u, rn), - rm: map(u, rm), - }, - &mut Inst::FpuCSel64 { rd, rn, rm, cond } => Inst::FpuCSel64 { - cond, - rd: map_wr(d, rd), - rn: map(u, rn), - rm: map(u, rm), - }, - &mut Inst::FpuRound { op, rd, rn } => Inst::FpuRound { - op, - rd: map_wr(d, rd), - rn: map(u, rn), - }, - &mut Inst::MovToVec64 { rd, rn } => Inst::MovToVec64 { - rd: map_wr(d, rd), - rn: map(u, rn), - }, - &mut Inst::MovFromVec64 { rd, rn } => Inst::MovFromVec64 { - rd: map_wr(d, rd), - rn: map(u, rn), - }, - &mut Inst::VecRRR { rd, rn, rm, alu_op } => Inst::VecRRR { - rd: map_wr(d, rd), - rn: map(u, rn), - rm: map(u, rm), - alu_op, - }, - &mut Inst::MovToNZCV { rn } => Inst::MovToNZCV { rn: map(u, rn) }, - &mut Inst::MovFromNZCV { rd } => Inst::MovFromNZCV { rd: map_wr(d, rd) }, - &mut Inst::CondSet { rd, cond } => Inst::CondSet { - rd: map_wr(d, rd), - cond, - }, + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } + &mut Inst::LoadFpuConst32 { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::LoadFpuConst64 { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::FpuToInt { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::IntToFpu { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::FpuCSel32 { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::FpuCSel64 { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::FpuRound { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::MovToVec64 { + ref mut rd, + ref mut rn, + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::MovFromVec64 { + ref mut rd, + ref mut rn, + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::VecRRR { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::MovToNZCV { ref mut rn } => { + map(u, rn); + } + &mut Inst::MovFromNZCV { ref mut rd } => { + map_wr(d, rd); + } + &mut Inst::CondSet { ref mut rd, .. } => { + map_wr(d, rd); + } &mut Inst::Extend { - rd, - rn, - signed, - from_bits, - to_bits, - } => Inst::Extend { - rd: map_wr(d, rd), - rn: map(u, rn), - signed, - from_bits, - to_bits, - }, - &mut Inst::Jump { dest } => Inst::Jump { dest }, + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::Jump { .. } => {} &mut Inst::Call { - ref uses, - ref defs, - ref dest, - loc, - opcode, + ref mut uses, + ref mut defs, + .. 
} => { - let uses = uses.map(|r| map(u, *r)); - let defs = defs.map(|r| map_wr(d, *r)); - let dest = dest.clone(); - Inst::Call { - dest, - uses, - defs, - loc, - opcode, - } - } - &mut Inst::Ret {} => Inst::Ret {}, - &mut Inst::EpiloguePlaceholder {} => Inst::EpiloguePlaceholder {}, + // TODO: add `map_mut()` to regalloc.rs's Set. + let new_uses = uses.map(|r| { + let mut r = *r; + map(u, &mut r); + r + }); + let new_defs = defs.map(|r| { + let mut r = *r; + map_wr(d, &mut r); + r + }); + *uses = new_uses; + *defs = new_defs; + } + &mut Inst::Ret | &mut Inst::EpiloguePlaceholder => {} &mut Inst::CallInd { - ref uses, - ref defs, - rn, - loc, - opcode, + ref mut uses, + ref mut defs, + ref mut rn, + .. } => { - let uses = uses.map(|r| map(u, *r)); - let defs = defs.map(|r| map_wr(d, *r)); - Inst::CallInd { - uses, - defs, - rn: map(u, rn), - loc, - opcode, - } - } - &mut Inst::CondBr { - taken, - not_taken, - kind, - } => Inst::CondBr { - taken, - not_taken, - kind: map_br(u, &kind), - }, - &mut Inst::CondBrLowered { target, kind } => Inst::CondBrLowered { - target, - kind: map_br(u, &kind), - }, - &mut Inst::CondBrLoweredCompound { - taken, - not_taken, - kind, - } => Inst::CondBrLoweredCompound { - taken, - not_taken, - kind: map_br(u, &kind), - }, - &mut Inst::IndirectBr { rn, ref targets } => Inst::IndirectBr { - rn: map(u, rn), - targets: targets.clone(), - }, - &mut Inst::Nop => Inst::Nop, - &mut Inst::Nop4 => Inst::Nop4, - &mut Inst::Brk => Inst::Brk, - &mut Inst::Udf { trap_info } => Inst::Udf { trap_info }, - &mut Inst::Adr { rd, ref label } => Inst::Adr { - rd: map_wr(d, rd), - label: label.clone(), - }, - &mut Inst::Word4 { data } => Inst::Word4 { data }, - &mut Inst::Word8 { data } => Inst::Word8 { data }, + // TODO: add `map_mut()` to regalloc.rs's Set. + let new_uses = uses.map(|r| { + let mut r = *r; + map(u, &mut r); + r + }); + let new_defs = defs.map(|r| { + let mut r = *r; + map_wr(d, &mut r); + r + }); + *uses = new_uses; + *defs = new_defs; + map(u, rn); + } + &mut Inst::CondBr { ref mut kind, .. } => { + map_br(u, kind); + } + &mut Inst::CondBrLowered { ref mut kind, .. } => { + map_br(u, kind); + } + &mut Inst::CondBrLoweredCompound { ref mut kind, .. } => { + map_br(u, kind); + } + &mut Inst::IndirectBr { ref mut rn, .. } => { + map(u, rn); + } + &mut Inst::Nop0 | &mut Inst::Nop4 | &mut Inst::Brk | &mut Inst::Udf { .. } => {} + &mut Inst::Adr { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::Word4 { .. } | &mut Inst::Word8 { .. } => {} &mut Inst::JTSequence { - ridx, - rtmp1, - rtmp2, - ref targets, - ref targets_for_term, - } => Inst::JTSequence { - targets: targets.clone(), - targets_for_term: targets_for_term.clone(), - ridx: map(u, ridx), - rtmp1: map_wr(d, rtmp1), - rtmp2: map_wr(d, rtmp2), - }, - &mut Inst::LoadConst64 { rd, const_data } => Inst::LoadConst64 { - rd: map_wr(d, rd), - const_data, - }, - &mut Inst::LoadExtName { - rd, - ref name, - offset, - srcloc, - } => Inst::LoadExtName { - rd: map_wr(d, rd), - name: name.clone(), - offset, - srcloc, - }, - }; - *inst = newval; + ref mut ridx, + ref mut rtmp1, + ref mut rtmp2, + .. + } => { + map(u, ridx); + map_wr(d, rtmp1); + map_wr(d, rtmp2); + } + &mut Inst::LoadConst64 { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::LoadExtName { ref mut rd, .. 
} => { + map_wr(d, rd); + } + } } //============================================================================= @@ -1624,7 +1651,7 @@ fn arm64_map_regs( impl MachInst for Inst { fn get_regs(&self, collector: &mut RegUsageCollector) { - arm64_get_regs(self, collector) + aarch64_get_regs(self, collector) } fn map_regs( @@ -1632,7 +1659,7 @@ impl MachInst for Inst { pre_map: &RegallocMap, post_map: &RegallocMap, ) { - arm64_map_regs(self, pre_map, post_map); + aarch64_map_regs(self, pre_map, post_map); } fn is_move(&self) -> Option<(Writable, Reg)> { @@ -1644,7 +1671,7 @@ impl MachInst for Inst { } fn is_epilogue_placeholder(&self) -> bool { - if let Inst::EpiloguePlaceholder { .. } = self { + if let Inst::EpiloguePlaceholder = self { true } else { false @@ -1653,7 +1680,7 @@ impl MachInst for Inst { fn is_term<'a>(&'a self) -> MachTerminator<'a> { match self { - &Inst::Ret {} | &Inst::EpiloguePlaceholder {} => MachTerminator::Ret, + &Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret, &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_block_index().unwrap()), &Inst::CondBr { taken, not_taken, .. @@ -1687,7 +1714,7 @@ impl MachInst for Inst { } fn gen_zero_len_nop() -> Inst { - Inst::Nop + Inst::Nop0 } fn gen_nop(preferred_size: usize) -> Inst { @@ -1704,7 +1731,6 @@ impl MachInst for Inst { match ty { I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 => RegClass::I64, F32 | F64 => RegClass::V128, - I128 | B128 => RegClass::V128, IFLAGS | FFLAGS => RegClass::I64, _ => panic!("Unexpected SSA-value type: {}", ty), } @@ -1750,7 +1776,7 @@ impl MachInst for Inst { if taken.as_block_index() == fallthrough && not_taken.as_block_index() == fallthrough { - *self = Inst::Nop; + *self = Inst::Nop0; } else if taken.as_block_index() == fallthrough { *self = Inst::CondBrLowered { target: not_taken, @@ -1772,7 +1798,7 @@ impl MachInst for Inst { } &mut Inst::Jump { dest } => { if dest.as_block_index() == fallthrough { - *self = Inst::Nop; + *self = Inst::Nop0; } } _ => {} @@ -1831,55 +1857,55 @@ fn mem_finalize_for_show(mem: &MemArg, mb_rru: Option<&RealRegUniverse>) -> (Str impl ShowWithRRU for Inst { fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { - fn op_is32(alu_op: ALUOp) -> (&'static str, bool) { + fn op_name_size(alu_op: ALUOp) -> (&'static str, InstSize) { match alu_op { - ALUOp::Add32 => ("add", true), - ALUOp::Add64 => ("add", false), - ALUOp::Sub32 => ("sub", true), - ALUOp::Sub64 => ("sub", false), - ALUOp::Orr32 => ("orr", true), - ALUOp::Orr64 => ("orr", false), - ALUOp::And32 => ("and", true), - ALUOp::And64 => ("and", false), - ALUOp::Eor32 => ("eor", true), - ALUOp::Eor64 => ("eor", false), - ALUOp::AddS32 => ("adds", true), - ALUOp::AddS64 => ("adds", false), - ALUOp::SubS32 => ("subs", true), - ALUOp::SubS64 => ("subs", false), - ALUOp::MAdd32 => ("madd", true), - ALUOp::MAdd64 => ("madd", false), - ALUOp::MSub32 => ("msub", true), - ALUOp::MSub64 => ("msub", false), - ALUOp::SMulH => ("smulh", false), - ALUOp::UMulH => ("umulh", false), - ALUOp::SDiv64 => ("sdiv", false), - ALUOp::UDiv64 => ("udiv", false), - ALUOp::AndNot32 => ("bic", true), - ALUOp::AndNot64 => ("bic", false), - ALUOp::OrrNot32 => ("orn", true), - ALUOp::OrrNot64 => ("orn", false), - ALUOp::EorNot32 => ("eon", true), - ALUOp::EorNot64 => ("eon", false), - ALUOp::RotR32 => ("ror", true), - ALUOp::RotR64 => ("ror", false), - ALUOp::Lsr32 => ("lsr", true), - ALUOp::Lsr64 => ("lsr", false), - ALUOp::Asr32 => ("asr", true), - ALUOp::Asr64 => ("asr", false), - ALUOp::Lsl32 => ("lsl", true), 
- ALUOp::Lsl64 => ("lsl", false), + ALUOp::Add32 => ("add", InstSize::Size32), + ALUOp::Add64 => ("add", InstSize::Size64), + ALUOp::Sub32 => ("sub", InstSize::Size32), + ALUOp::Sub64 => ("sub", InstSize::Size64), + ALUOp::Orr32 => ("orr", InstSize::Size32), + ALUOp::Orr64 => ("orr", InstSize::Size64), + ALUOp::And32 => ("and", InstSize::Size32), + ALUOp::And64 => ("and", InstSize::Size64), + ALUOp::Eor32 => ("eor", InstSize::Size32), + ALUOp::Eor64 => ("eor", InstSize::Size64), + ALUOp::AddS32 => ("adds", InstSize::Size32), + ALUOp::AddS64 => ("adds", InstSize::Size64), + ALUOp::SubS32 => ("subs", InstSize::Size32), + ALUOp::SubS64 => ("subs", InstSize::Size64), + ALUOp::MAdd32 => ("madd", InstSize::Size32), + ALUOp::MAdd64 => ("madd", InstSize::Size64), + ALUOp::MSub32 => ("msub", InstSize::Size32), + ALUOp::MSub64 => ("msub", InstSize::Size64), + ALUOp::SMulH => ("smulh", InstSize::Size64), + ALUOp::UMulH => ("umulh", InstSize::Size64), + ALUOp::SDiv64 => ("sdiv", InstSize::Size64), + ALUOp::UDiv64 => ("udiv", InstSize::Size64), + ALUOp::AndNot32 => ("bic", InstSize::Size32), + ALUOp::AndNot64 => ("bic", InstSize::Size64), + ALUOp::OrrNot32 => ("orn", InstSize::Size32), + ALUOp::OrrNot64 => ("orn", InstSize::Size64), + ALUOp::EorNot32 => ("eon", InstSize::Size32), + ALUOp::EorNot64 => ("eon", InstSize::Size64), + ALUOp::RotR32 => ("ror", InstSize::Size32), + ALUOp::RotR64 => ("ror", InstSize::Size64), + ALUOp::Lsr32 => ("lsr", InstSize::Size32), + ALUOp::Lsr64 => ("lsr", InstSize::Size64), + ALUOp::Asr32 => ("asr", InstSize::Size32), + ALUOp::Asr64 => ("asr", InstSize::Size64), + ALUOp::Lsl32 => ("lsl", InstSize::Size32), + ALUOp::Lsl64 => ("lsl", InstSize::Size64), } } match self { - &Inst::Nop => "nop-zero-len".to_string(), + &Inst::Nop0 => "nop-zero-len".to_string(), &Inst::Nop4 => "nop".to_string(), &Inst::AluRRR { alu_op, rd, rn, rm } => { - let (op, is32) = op_is32(alu_op); - let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_ireg_sized(rn, mb_rru, is32); - let rm = show_ireg_sized(rm, mb_rru, is32); + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); format!("{} {}, {}, {}", op, rd, rn, rm) } &Inst::AluRRRR { @@ -1889,12 +1915,12 @@ impl ShowWithRRU for Inst { rm, ra, } => { - let (op, is32) = op_is32(alu_op); + let (op, size) = op_name_size(alu_op); let four_args = alu_op != ALUOp::SMulH && alu_op != ALUOp::UMulH; - let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_ireg_sized(rn, mb_rru, is32); - let rm = show_ireg_sized(rm, mb_rru, is32); - let ra = show_ireg_sized(ra, mb_rru, is32); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); + let ra = show_ireg_sized(ra, mb_rru, size); if four_args { format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra) } else { @@ -1909,9 +1935,9 @@ impl ShowWithRRU for Inst { rn, ref imm12, } => { - let (op, is32) = op_is32(alu_op); - let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_ireg_sized(rn, mb_rru, is32); + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); if imm12.bits == 0 && alu_op == ALUOp::Add64 { // special-case MOV (used for moving into SP). 
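// A minimal sketch of what the InstSize-aware printing above is expected to
// produce (illustrative only, not part of this hunk; `xreg`/`writable_xreg`
// are assumed to be the real-register constructors from `regs.rs`, and the
// register numbers are made up):
//
//     let rru = create_reg_universe();
//     let add32 = Inst::AluRRR {
//         alu_op: ALUOp::Add32,
//         rd: writable_xreg(0),
//         rn: xreg(1),
//         rm: xreg(2),
//     };
//     // A 32-bit op renders its operands with W-register names...
//     assert_eq!(add32.show_rru(Some(&rru)), "add w0, w1, w2");
//     // ...while the same instruction with ALUOp::Add64 keeps the X names:
//     // "add x0, x1, x2".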
@@ -1927,9 +1953,9 @@ impl ShowWithRRU for Inst { rn, ref imml, } => { - let (op, is32) = op_is32(alu_op); - let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_ireg_sized(rn, mb_rru, is32); + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); let imml = imml.show_rru(mb_rru); format!("{} {}, {}, {}", op, rd, rn, imml) } @@ -1939,9 +1965,9 @@ impl ShowWithRRU for Inst { rn, ref immshift, } => { - let (op, is32) = op_is32(alu_op); - let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_ireg_sized(rn, mb_rru, is32); + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); let immshift = immshift.show_rru(mb_rru); format!("{} {}, {}, {}", op, rd, rn, immshift) } @@ -1952,10 +1978,10 @@ impl ShowWithRRU for Inst { rm, ref shiftop, } => { - let (op, is32) = op_is32(alu_op); - let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_ireg_sized(rn, mb_rru, is32); - let rm = show_ireg_sized(rm, mb_rru, is32); + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); let shiftop = shiftop.show_rru(mb_rru); format!("{} {}, {}, {}, {}", op, rd, rn, rm, shiftop) } @@ -1966,18 +1992,18 @@ impl ShowWithRRU for Inst { rm, ref extendop, } => { - let (op, is32) = op_is32(alu_op); - let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_ireg_sized(rn, mb_rru, is32); - let rm = show_ireg_sized(rm, mb_rru, is32); + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); let extendop = extendop.show_rru(mb_rru); format!("{} {}, {}, {}, {}", op, rd, rn, rm, extendop) } &Inst::BitRR { op, rd, rn } => { - let is32 = op.is_32_bit(); + let size = op.inst_size(); let op = op.op_str(); - let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_ireg_sized(rn, mb_rru, is32); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); format!("{} {}, {}", op, rd, rn) } &Inst::ULoad8 { @@ -2022,24 +2048,24 @@ impl ShowWithRRU for Inst { &MemArg::Unscaled(..) => true, _ => false, }; - let (op, is32) = match (self, is_unscaled) { - (&Inst::ULoad8 { .. }, false) => ("ldrb", true), - (&Inst::ULoad8 { .. }, true) => ("ldurb", true), - (&Inst::SLoad8 { .. }, false) => ("ldrsb", false), - (&Inst::SLoad8 { .. }, true) => ("ldursb", false), - (&Inst::ULoad16 { .. }, false) => ("ldrh", true), - (&Inst::ULoad16 { .. }, true) => ("ldurh", true), - (&Inst::SLoad16 { .. }, false) => ("ldrsh", false), - (&Inst::SLoad16 { .. }, true) => ("ldursh", false), - (&Inst::ULoad32 { .. }, false) => ("ldr", true), - (&Inst::ULoad32 { .. }, true) => ("ldur", true), - (&Inst::SLoad32 { .. }, false) => ("ldrsw", false), - (&Inst::SLoad32 { .. }, true) => ("ldursw", false), - (&Inst::ULoad64 { .. }, false) => ("ldr", false), - (&Inst::ULoad64 { .. }, true) => ("ldur", false), + let (op, size) = match (self, is_unscaled) { + (&Inst::ULoad8 { .. }, false) => ("ldrb", InstSize::Size32), + (&Inst::ULoad8 { .. }, true) => ("ldurb", InstSize::Size32), + (&Inst::SLoad8 { .. }, false) => ("ldrsb", InstSize::Size64), + (&Inst::SLoad8 { .. }, true) => ("ldursb", InstSize::Size64), + (&Inst::ULoad16 { .. 
}, false) => ("ldrh", InstSize::Size32), + (&Inst::ULoad16 { .. }, true) => ("ldurh", InstSize::Size32), + (&Inst::SLoad16 { .. }, false) => ("ldrsh", InstSize::Size64), + (&Inst::SLoad16 { .. }, true) => ("ldursh", InstSize::Size64), + (&Inst::ULoad32 { .. }, false) => ("ldr", InstSize::Size32), + (&Inst::ULoad32 { .. }, true) => ("ldur", InstSize::Size32), + (&Inst::SLoad32 { .. }, false) => ("ldrsw", InstSize::Size64), + (&Inst::SLoad32 { .. }, true) => ("ldursw", InstSize::Size64), + (&Inst::ULoad64 { .. }, false) => ("ldr", InstSize::Size64), + (&Inst::ULoad64 { .. }, true) => ("ldur", InstSize::Size64), _ => unreachable!(), }; - let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); let mem = mem.show_rru(mb_rru); format!("{}{} {}, {}", mem_str, op, rd, mem) } @@ -2070,18 +2096,18 @@ impl ShowWithRRU for Inst { &MemArg::Unscaled(..) => true, _ => false, }; - let (op, is32) = match (self, is_unscaled) { - (&Inst::Store8 { .. }, false) => ("strb", true), - (&Inst::Store8 { .. }, true) => ("sturb", true), - (&Inst::Store16 { .. }, false) => ("strh", true), - (&Inst::Store16 { .. }, true) => ("sturh", true), - (&Inst::Store32 { .. }, false) => ("str", true), - (&Inst::Store32 { .. }, true) => ("stur", true), - (&Inst::Store64 { .. }, false) => ("str", false), - (&Inst::Store64 { .. }, true) => ("stur", false), + let (op, size) = match (self, is_unscaled) { + (&Inst::Store8 { .. }, false) => ("strb", InstSize::Size32), + (&Inst::Store8 { .. }, true) => ("sturb", InstSize::Size32), + (&Inst::Store16 { .. }, false) => ("strh", InstSize::Size32), + (&Inst::Store16 { .. }, true) => ("sturh", InstSize::Size32), + (&Inst::Store32 { .. }, false) => ("str", InstSize::Size32), + (&Inst::Store32 { .. }, true) => ("stur", InstSize::Size32), + (&Inst::Store64 { .. }, false) => ("str", InstSize::Size64), + (&Inst::Store64 { .. 
}, true) => ("stur", InstSize::Size64), _ => unreachable!(), }; - let rd = show_ireg_sized(rd, mb_rru, is32); + let rd = show_ireg_sized(rd, mb_rru, size); let mem = mem.show_rru(mb_rru); format!("{}{} {}, {}", mem_str, op, rd, mem) } @@ -2103,8 +2129,8 @@ impl ShowWithRRU for Inst { format!("mov {}, {}", rd, rm) } &Inst::Mov32 { rd, rm } => { - let rd = show_ireg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); - let rm = show_ireg_sized(rm, mb_rru, /* is32 = */ true); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, InstSize::Size32); + let rm = show_ireg_sized(rm, mb_rru, InstSize::Size32); format!("mov {}, {}", rd, rm) } &Inst::MovZ { rd, ref imm } => { @@ -2140,38 +2166,38 @@ impl ShowWithRRU for Inst { format!("mov {}.8b, {}.8b", rd, rn) } &Inst::FpuRR { fpu_op, rd, rn } => { - let (op, is32src, is32dst) = match fpu_op { - FPUOp1::Abs32 => ("fabs", true, true), - FPUOp1::Abs64 => ("fabs", false, false), - FPUOp1::Neg32 => ("fneg", true, true), - FPUOp1::Neg64 => ("fneg", false, false), - FPUOp1::Sqrt32 => ("fsqrt", true, true), - FPUOp1::Sqrt64 => ("fsqrt", false, false), - FPUOp1::Cvt32To64 => ("fcvt", true, false), - FPUOp1::Cvt64To32 => ("fcvt", false, true), + let (op, sizesrc, sizedest) = match fpu_op { + FPUOp1::Abs32 => ("fabs", InstSize::Size32, InstSize::Size32), + FPUOp1::Abs64 => ("fabs", InstSize::Size64, InstSize::Size64), + FPUOp1::Neg32 => ("fneg", InstSize::Size32, InstSize::Size32), + FPUOp1::Neg64 => ("fneg", InstSize::Size64, InstSize::Size64), + FPUOp1::Sqrt32 => ("fsqrt", InstSize::Size32, InstSize::Size32), + FPUOp1::Sqrt64 => ("fsqrt", InstSize::Size64, InstSize::Size64), + FPUOp1::Cvt32To64 => ("fcvt", InstSize::Size32, InstSize::Size64), + FPUOp1::Cvt64To32 => ("fcvt", InstSize::Size64, InstSize::Size32), }; - let rd = show_freg_sized(rd.to_reg(), mb_rru, is32dst); - let rn = show_freg_sized(rn, mb_rru, is32src); + let rd = show_freg_sized(rd.to_reg(), mb_rru, sizedest); + let rn = show_freg_sized(rn, mb_rru, sizesrc); format!("{} {}, {}", op, rd, rn) } &Inst::FpuRRR { fpu_op, rd, rn, rm } => { - let (op, is32) = match fpu_op { - FPUOp2::Add32 => ("fadd", true), - FPUOp2::Add64 => ("fadd", false), - FPUOp2::Sub32 => ("fsub", true), - FPUOp2::Sub64 => ("fsub", false), - FPUOp2::Mul32 => ("fmul", true), - FPUOp2::Mul64 => ("fmul", false), - FPUOp2::Div32 => ("fdiv", true), - FPUOp2::Div64 => ("fdiv", false), - FPUOp2::Max32 => ("fmax", true), - FPUOp2::Max64 => ("fmax", false), - FPUOp2::Min32 => ("fmin", true), - FPUOp2::Min64 => ("fmin", false), + let (op, size) = match fpu_op { + FPUOp2::Add32 => ("fadd", InstSize::Size32), + FPUOp2::Add64 => ("fadd", InstSize::Size64), + FPUOp2::Sub32 => ("fsub", InstSize::Size32), + FPUOp2::Sub64 => ("fsub", InstSize::Size64), + FPUOp2::Mul32 => ("fmul", InstSize::Size32), + FPUOp2::Mul64 => ("fmul", InstSize::Size64), + FPUOp2::Div32 => ("fdiv", InstSize::Size32), + FPUOp2::Div64 => ("fdiv", InstSize::Size64), + FPUOp2::Max32 => ("fmax", InstSize::Size32), + FPUOp2::Max64 => ("fmax", InstSize::Size64), + FPUOp2::Min32 => ("fmin", InstSize::Size32), + FPUOp2::Min64 => ("fmin", InstSize::Size64), }; - let rd = show_freg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_freg_sized(rn, mb_rru, is32); - let rm = show_freg_sized(rm, mb_rru, is32); + let rd = show_freg_sized(rd.to_reg(), mb_rru, size); + let rn = show_freg_sized(rn, mb_rru, size); + let rm = show_freg_sized(rm, mb_rru, size); format!("{} {}, {}, {}", op, rd, rn, rm) } &Inst::FpuRRRR { @@ -2181,33 +2207,33 @@ impl ShowWithRRU for Inst { rm, ra, } => { - let (op, 
is32) = match fpu_op { - FPUOp3::MAdd32 => ("fmadd", true), - FPUOp3::MAdd64 => ("fmadd", false), + let (op, size) = match fpu_op { + FPUOp3::MAdd32 => ("fmadd", InstSize::Size32), + FPUOp3::MAdd64 => ("fmadd", InstSize::Size64), }; - let rd = show_freg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_freg_sized(rn, mb_rru, is32); - let rm = show_freg_sized(rm, mb_rru, is32); - let ra = show_freg_sized(ra, mb_rru, is32); + let rd = show_freg_sized(rd.to_reg(), mb_rru, size); + let rn = show_freg_sized(rn, mb_rru, size); + let rm = show_freg_sized(rm, mb_rru, size); + let ra = show_freg_sized(ra, mb_rru, size); format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra) } &Inst::FpuCmp32 { rn, rm } => { - let rn = show_freg_sized(rn, mb_rru, /* is32 = */ true); - let rm = show_freg_sized(rm, mb_rru, /* is32 = */ true); + let rn = show_freg_sized(rn, mb_rru, InstSize::Size32); + let rm = show_freg_sized(rm, mb_rru, InstSize::Size32); format!("fcmp {}, {}", rn, rm) } &Inst::FpuCmp64 { rn, rm } => { - let rn = show_freg_sized(rn, mb_rru, /* is32 = */ false); - let rm = show_freg_sized(rm, mb_rru, /* is32 = */ false); + let rn = show_freg_sized(rn, mb_rru, InstSize::Size64); + let rm = show_freg_sized(rm, mb_rru, InstSize::Size64); format!("fcmp {}, {}", rn, rm) } &Inst::FpuLoad32 { rd, ref mem, .. } => { - let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size32); let mem = mem.show_rru_sized(mb_rru, /* size = */ 4); format!("ldr {}, {}", rd, mem) } &Inst::FpuLoad64 { rd, ref mem, .. } => { - let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ false); + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size64); let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); format!("ldr {}, {}", rd, mem) } @@ -2218,12 +2244,12 @@ impl ShowWithRRU for Inst { format!("ldr {}, {}", rd, mem) } &Inst::FpuStore32 { rd, ref mem, .. } => { - let rd = show_freg_sized(rd, mb_rru, /* is32 = */ true); + let rd = show_freg_sized(rd, mb_rru, InstSize::Size32); let mem = mem.show_rru_sized(mb_rru, /* size = */ 4); format!("str {}, {}", rd, mem) } &Inst::FpuStore64 { rd, ref mem, .. 
} => { - let rd = show_freg_sized(rd, mb_rru, /* is32 = */ false); + let rd = show_freg_sized(rd, mb_rru, InstSize::Size64); let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); format!("str {}, {}", rd, mem) } @@ -2234,70 +2260,70 @@ impl ShowWithRRU for Inst { format!("str {}, {}", rd, mem) } &Inst::LoadFpuConst32 { rd, const_data } => { - let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size32); format!("ldr {}, pc+8 ; b 8 ; data.f32 {}", rd, const_data) } &Inst::LoadFpuConst64 { rd, const_data } => { - let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ false); + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size64); format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, const_data) } &Inst::FpuToInt { op, rd, rn } => { - let (op, is32src, is32dest) = match op { - FpuToIntOp::F32ToI32 => ("fcvtzs", true, true), - FpuToIntOp::F32ToU32 => ("fcvtzu", true, true), - FpuToIntOp::F32ToI64 => ("fcvtzs", true, false), - FpuToIntOp::F32ToU64 => ("fcvtzu", true, false), - FpuToIntOp::F64ToI32 => ("fcvtzs", false, true), - FpuToIntOp::F64ToU32 => ("fcvtzu", false, true), - FpuToIntOp::F64ToI64 => ("fcvtzs", false, false), - FpuToIntOp::F64ToU64 => ("fcvtzu", false, false), + let (op, sizesrc, sizedest) = match op { + FpuToIntOp::F32ToI32 => ("fcvtzs", InstSize::Size32, InstSize::Size32), + FpuToIntOp::F32ToU32 => ("fcvtzu", InstSize::Size32, InstSize::Size32), + FpuToIntOp::F32ToI64 => ("fcvtzs", InstSize::Size32, InstSize::Size64), + FpuToIntOp::F32ToU64 => ("fcvtzu", InstSize::Size32, InstSize::Size64), + FpuToIntOp::F64ToI32 => ("fcvtzs", InstSize::Size64, InstSize::Size32), + FpuToIntOp::F64ToU32 => ("fcvtzu", InstSize::Size64, InstSize::Size32), + FpuToIntOp::F64ToI64 => ("fcvtzs", InstSize::Size64, InstSize::Size64), + FpuToIntOp::F64ToU64 => ("fcvtzu", InstSize::Size64, InstSize::Size64), }; - let rd = show_ireg_sized(rd.to_reg(), mb_rru, is32dest); - let rn = show_freg_sized(rn, mb_rru, is32src); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, sizedest); + let rn = show_freg_sized(rn, mb_rru, sizesrc); format!("{} {}, {}", op, rd, rn) } &Inst::IntToFpu { op, rd, rn } => { - let (op, is32src, is32dest) = match op { - IntToFpuOp::I32ToF32 => ("scvtf", true, true), - IntToFpuOp::U32ToF32 => ("ucvtf", true, true), - IntToFpuOp::I64ToF32 => ("scvtf", false, true), - IntToFpuOp::U64ToF32 => ("ucvtf", false, true), - IntToFpuOp::I32ToF64 => ("scvtf", true, false), - IntToFpuOp::U32ToF64 => ("ucvtf", true, false), - IntToFpuOp::I64ToF64 => ("scvtf", false, false), - IntToFpuOp::U64ToF64 => ("ucvtf", false, false), + let (op, sizesrc, sizedest) = match op { + IntToFpuOp::I32ToF32 => ("scvtf", InstSize::Size32, InstSize::Size32), + IntToFpuOp::U32ToF32 => ("ucvtf", InstSize::Size32, InstSize::Size32), + IntToFpuOp::I64ToF32 => ("scvtf", InstSize::Size64, InstSize::Size32), + IntToFpuOp::U64ToF32 => ("ucvtf", InstSize::Size64, InstSize::Size32), + IntToFpuOp::I32ToF64 => ("scvtf", InstSize::Size32, InstSize::Size64), + IntToFpuOp::U32ToF64 => ("ucvtf", InstSize::Size32, InstSize::Size64), + IntToFpuOp::I64ToF64 => ("scvtf", InstSize::Size64, InstSize::Size64), + IntToFpuOp::U64ToF64 => ("ucvtf", InstSize::Size64, InstSize::Size64), }; - let rd = show_freg_sized(rd.to_reg(), mb_rru, is32dest); - let rn = show_ireg_sized(rn, mb_rru, is32src); + let rd = show_freg_sized(rd.to_reg(), mb_rru, sizedest); + let rn = show_ireg_sized(rn, mb_rru, sizesrc); format!("{} {}, {}", op, rd, rn) } &Inst::FpuCSel32 { rd, rn, rm, 
cond } => { - let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); - let rn = show_freg_sized(rn, mb_rru, /* is32 = */ true); - let rm = show_freg_sized(rm, mb_rru, /* is32 = */ true); + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size32); + let rn = show_freg_sized(rn, mb_rru, InstSize::Size32); + let rm = show_freg_sized(rm, mb_rru, InstSize::Size32); let cond = cond.show_rru(mb_rru); format!("fcsel {}, {}, {}, {}", rd, rn, rm, cond) } &Inst::FpuCSel64 { rd, rn, rm, cond } => { - let rd = show_freg_sized(rd.to_reg(), mb_rru, /* is32 = */ false); - let rn = show_freg_sized(rn, mb_rru, /* is32 = */ false); - let rm = show_freg_sized(rm, mb_rru, /* is32 = */ false); + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size64); + let rn = show_freg_sized(rn, mb_rru, InstSize::Size64); + let rm = show_freg_sized(rm, mb_rru, InstSize::Size64); let cond = cond.show_rru(mb_rru); format!("fcsel {}, {}, {}, {}", rd, rn, rm, cond) } &Inst::FpuRound { op, rd, rn } => { - let (inst, is32) = match op { - FpuRoundMode::Minus32 => ("frintm", true), - FpuRoundMode::Minus64 => ("frintm", false), - FpuRoundMode::Plus32 => ("frintp", true), - FpuRoundMode::Plus64 => ("frintp", false), - FpuRoundMode::Zero32 => ("frintz", true), - FpuRoundMode::Zero64 => ("frintz", false), - FpuRoundMode::Nearest32 => ("frintn", true), - FpuRoundMode::Nearest64 => ("frintn", false), + let (inst, size) = match op { + FpuRoundMode::Minus32 => ("frintm", InstSize::Size32), + FpuRoundMode::Minus64 => ("frintm", InstSize::Size64), + FpuRoundMode::Plus32 => ("frintp", InstSize::Size32), + FpuRoundMode::Plus64 => ("frintp", InstSize::Size64), + FpuRoundMode::Zero32 => ("frintz", InstSize::Size32), + FpuRoundMode::Zero64 => ("frintz", InstSize::Size64), + FpuRoundMode::Nearest32 => ("frintn", InstSize::Size32), + FpuRoundMode::Nearest64 => ("frintn", InstSize::Size64), }; - let rd = show_freg_sized(rd.to_reg(), mb_rru, is32); - let rn = show_freg_sized(rn, mb_rru, is32); + let rd = show_freg_sized(rd.to_reg(), mb_rru, size); + let rn = show_freg_sized(rn, mb_rru, size); format!("{} {}, {}", inst, rd, rn) } &Inst::MovToVec64 { rd, rn } => { @@ -2346,13 +2372,13 @@ impl ShowWithRRU for Inst { // extend-to width is <= 32 bits, *unless* we have an unsigned // 32-to-64-bit extension, which is implemented with a "mov" to a // 32-bit (W-reg) dest, because this zeroes the top 32 bits. 
- let dest_is32 = if !signed && from_bits == 32 && to_bits == 64 { - true + let dest_size = if !signed && from_bits == 32 && to_bits == 64 { + InstSize::Size32 } else { - to_bits <= 32 + InstSize::from_bits(to_bits) }; - let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_is32); - let rn = show_ireg_sized(rn, mb_rru, from_bits <= 32); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size); + let rn = show_ireg_sized(rn, mb_rru, InstSize::from_bits(from_bits)); let op = match (signed, from_bits, to_bits) { (false, 8, 32) => "uxtb", (true, 8, 32) => "sxtb", @@ -2375,11 +2401,11 @@ impl ShowWithRRU for Inst { from_bits, to_bits, } if from_bits == 1 && signed => { - let dest_is32 = to_bits <= 32; - let zr = if dest_is32 { "wzr" } else { "xzr" }; - let rd32 = show_ireg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); - let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_is32); - let rn = show_ireg_sized(rn, mb_rru, /* is32 = */ true); + let dest_size = InstSize::from_bits(to_bits); + let zr = if dest_size.is32() { "wzr" } else { "xzr" }; + let rd32 = show_ireg_sized(rd.to_reg(), mb_rru, InstSize::Size32); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size); + let rn = show_ireg_sized(rn, mb_rru, InstSize::Size32); format!("and {}, {}, #1 ; sub {}, {}, {}", rd32, rn, rd, zr, rd) } &Inst::Extend { @@ -2389,8 +2415,8 @@ impl ShowWithRRU for Inst { from_bits, .. } if from_bits == 1 && !signed => { - let rd = show_ireg_sized(rd.to_reg(), mb_rru, /* is32 = */ true); - let rn = show_ireg_sized(rn, mb_rru, /* is32 = */ true); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, InstSize::Size32); + let rn = show_ireg_sized(rn, mb_rru, InstSize::Size32); format!("and {}, {}, #1", rd, rn) } &Inst::Extend { .. } => { @@ -2401,8 +2427,8 @@ impl ShowWithRRU for Inst { let rn = rn.show_rru(mb_rru); format!("blr {}", rn) } - &Inst::Ret {} => "ret".to_string(), - &Inst::EpiloguePlaceholder {} => "epilogue placeholder".to_string(), + &Inst::Ret => "ret".to_string(), + &Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(), &Inst::Jump { ref dest } => { let dest = dest.show_rru(mb_rru); format!("b {}", dest) diff --git a/cranelift/codegen/src/isa/arm64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs similarity index 90% rename from cranelift/codegen/src/isa/arm64/inst/regs.rs rename to cranelift/codegen/src/isa/aarch64/inst/regs.rs index 31a915410a97..b675d7f4d722 100644 --- a/cranelift/codegen/src/isa/arm64/inst/regs.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs @@ -1,13 +1,9 @@ -//! ARM64 ISA definitions: registers. - -#![allow(dead_code)] +//! AArch64 ISA definitions: registers. +use crate::isa::aarch64::inst::InstSize; use crate::machinst::*; -use regalloc::{ - RealReg, RealRegUniverse, Reg, RegClass, RegClassInfo, SpillSlot, VirtualReg, Writable, - NUM_REG_CLASSES, -}; +use regalloc::{RealRegUniverse, Reg, RegClass, RegClassInfo, Writable, NUM_REG_CLASSES}; use std::string::{String, ToString}; @@ -83,7 +79,7 @@ pub fn writable_zero_reg() -> Writable { /// Get a reference to the stack-pointer register. pub fn stack_reg() -> Reg { // XSP (stack) and XZR (zero) are logically different registers which have - // the same hardware encoding, and whose meaning, in real arm64 + // the same hardware encoding, and whose meaning, in real aarch64 // instructions, is context-dependent. For convenience of // universe-construction and for correct printing, we make them be two // different real registers. 
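// A concrete ISA-level illustration of the comment above (nothing here is new
// code in this patch): hardware register number 31 is read as SP when it
// appears as a load/store base or in an ADD/SUB-immediate operand, but as XZR
// in most other data-processing operands, e.g. `ldr x0, [sp]` versus
// `add x0, x1, xzr`. Modeling XSP and XZR as two distinct `Reg`s lets the
// printer pick "sp" or "xzr" without having to know which instruction form is
// being displayed.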
@@ -134,7 +130,7 @@ pub fn writable_spilltmp_reg() -> Writable { Writable::from_reg(spilltmp_reg()) } -/// Create the register universe for ARM64. +/// Create the register universe for AArch64. pub fn create_reg_universe() -> RealRegUniverse { let mut regs = vec![]; let mut allocable_by_class = [None; NUM_REG_CLASSES]; @@ -217,37 +213,38 @@ pub fn create_reg_universe() -> RealRegUniverse { } } -/// If |ireg| denotes an I64-classed reg, make a best-effort attempt to show +/// If `ireg` denotes an I64-classed reg, make a best-effort attempt to show /// its name at the 32-bit size. -pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, is32: bool) -> String { +pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSize) -> String { let mut s = reg.show_rru(mb_rru); - if reg.get_class() != RegClass::I64 || !is32 { + if reg.get_class() != RegClass::I64 || !size.is32() { // We can't do any better. return s; } if reg.is_real() { // Change (eg) "x42" into "w42" as appropriate - if reg.get_class() == RegClass::I64 && is32 && s.starts_with("x") { + if reg.get_class() == RegClass::I64 && size.is32() && s.starts_with("x") { s = "w".to_string() + &s[1..]; } } else { // Add a "w" suffix to RegClass::I64 vregs used in a 32-bit role - if reg.get_class() == RegClass::I64 && is32 { - s = s + &"w"; + if reg.get_class() == RegClass::I64 && size.is32() { + s.push('w'); } } s } /// Show a vector register when its use as a 32-bit or 64-bit float is known. -pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, is32: bool) -> String { - let s = reg.show_rru(mb_rru); +pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSize) -> String { + let mut s = reg.show_rru(mb_rru); if reg.get_class() != RegClass::V128 { return s; } - let prefix = if is32 { "s" } else { "d" }; - prefix.to_string() + &s[1..] + let prefix = if size.is32() { "s" } else { "d" }; + s.replace_range(0..1, prefix); + s } /// Show a vector register used in a scalar context. @@ -261,12 +258,12 @@ pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String { if reg.is_real() { // Change (eg) "v0" into "d0". if reg.get_class() == RegClass::V128 && s.starts_with("v") { - s = "d".to_string() + &s[1..]; + s.replace_range(0..1, "d"); } } else { // Add a "d" suffix to RegClass::V128 vregs. if reg.get_class() == RegClass::V128 { - s = s + &"d"; + s.push('d'); } } s diff --git a/cranelift/codegen/src/isa/arm64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs similarity index 95% rename from cranelift/codegen/src/isa/arm64/lower.rs rename to cranelift/codegen/src/isa/aarch64/lower.rs index 9979802c792b..07a8e896e684 100644 --- a/cranelift/codegen/src/isa/arm64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -1,4 +1,4 @@ -//! Lowering rules for ARM64. +//! Lowering rules for AArch64. //! //! TODO: opportunities for better code generation: //! @@ -6,45 +6,24 @@ //! and incorporate sign/zero extension on indicies. Recognize pre/post-index //! opportunities. //! -//! - Logical-immediate args. -//! -//! - Floating-point immediates. - -#![allow(dead_code)] +//! - Floating-point immediates (FIMM instruction). 
use crate::ir::condcodes::{FloatCC, IntCC}; use crate::ir::types::*; use crate::ir::Inst as IRInst; -use crate::ir::{Block, InstructionData, Opcode, TrapCode, Type}; +use crate::ir::{InstructionData, Opcode, TrapCode, Type}; use crate::machinst::lower::*; use crate::machinst::*; -use crate::isa::arm64::abi::*; -use crate::isa::arm64::inst::*; -use crate::isa::arm64::Arm64Backend; +use crate::isa::aarch64::abi::*; +use crate::isa::aarch64::inst::*; +use crate::isa::aarch64::AArch64Backend; use regalloc::{Reg, RegClass, Writable}; use alloc::vec::Vec; use smallvec::SmallVec; -//============================================================================ -// Helpers: opcode conversions - -fn op_to_aluop(op: Opcode, ty: Type) -> Option { - match (op, ty) { - (Opcode::Iadd, I32) => Some(ALUOp::Add32), - (Opcode::Iadd, I64) => Some(ALUOp::Add64), - (Opcode::Isub, I32) => Some(ALUOp::Sub32), - (Opcode::Isub, I64) => Some(ALUOp::Sub64), - _ => None, - } -} - -fn is_alu_op(op: Opcode, ctrl_typevar: Type) -> bool { - op_to_aluop(op, ctrl_typevar).is_some() -} - //============================================================================ // Result enum types. // @@ -163,7 +142,7 @@ impl InsnInputSource { } } -fn get_input>(ctx: &mut C, output: InsnOutput, num: usize) -> InsnInput { +fn get_input>(ctx: &mut C, output: InsnOutput, num: usize) -> InsnInput { assert!(num <= ctx.num_inputs(output.insn)); InsnInput { insn: output.insn, @@ -173,7 +152,7 @@ fn get_input>(ctx: &mut C, output: InsnOutput, num: usize) -> /// Convert an instruction input to a producing instruction's output if possible (in same BB), or a /// register otherwise. -fn input_source>(ctx: &mut C, input: InsnInput) -> InsnInputSource { +fn input_source>(ctx: &mut C, input: InsnInput) -> InsnInputSource { if let Some((input_inst, result_num)) = ctx.input_inst(input.insn, input.input) { let out = InsnOutput { insn: input_inst, @@ -190,7 +169,7 @@ fn input_source>(ctx: &mut C, input: InsnInput) -> InsnInputSo // Lowering: convert instruction outputs to result types. /// Lower an instruction output to a 64-bit constant, if possible. -fn output_to_const>(ctx: &mut C, out: InsnOutput) -> Option { +fn output_to_const>(ctx: &mut C, out: InsnOutput) -> Option { if out.output > 0 { None } else { @@ -204,7 +183,7 @@ fn output_to_const>(ctx: &mut C, out: InsnOutput) -> Option Some(imm.bits() as u64), + &InstructionData::UnaryIeee32 { opcode: _, imm } => Some(u64::from(imm.bits())), &InstructionData::UnaryIeee64 { opcode: _, imm } => Some(imm.bits()), _ => None, } @@ -212,16 +191,19 @@ fn output_to_const>(ctx: &mut C, out: InsnOutput) -> Option>(ctx: &mut C, out: InsnOutput) -> Option { +fn output_to_const_f32>(ctx: &mut C, out: InsnOutput) -> Option { output_to_const(ctx, out).map(|value| f32::from_bits(value as u32)) } -fn output_to_const_f64>(ctx: &mut C, out: InsnOutput) -> Option { +fn output_to_const_f64>(ctx: &mut C, out: InsnOutput) -> Option { output_to_const(ctx, out).map(|value| f64::from_bits(value)) } /// Lower an instruction output to a constant register-shift amount, if possible. -fn output_to_shiftimm>(ctx: &mut C, out: InsnOutput) -> Option { +fn output_to_shiftimm>( + ctx: &mut C, + out: InsnOutput, +) -> Option { output_to_const(ctx, out).and_then(ShiftOpShiftImm::maybe_from_shift) } @@ -251,7 +233,7 @@ impl NarrowValueMode { } /// Lower an instruction output to a reg. 
-fn output_to_reg>(ctx: &mut C, out: InsnOutput) -> Writable { +fn output_to_reg>(ctx: &mut C, out: InsnOutput) -> Writable { ctx.output(out.insn, out.output) } @@ -260,7 +242,7 @@ fn output_to_reg>(ctx: &mut C, out: InsnOutput) -> Writable>( +fn input_to_reg>( ctx: &mut C, input: InsnInput, narrow_mode: NarrowValueMode, @@ -292,9 +274,7 @@ fn input_to_reg>( }); tmp.to_reg() } - (NarrowValueMode::ZeroExtend32, n) | (NarrowValueMode::SignExtend32, n) if n == 32 => { - in_reg - } + (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg, (NarrowValueMode::ZeroExtend64, n) if n < 64 => { let tmp = ctx.tmp(RegClass::I64, I32); @@ -318,7 +298,7 @@ fn input_to_reg>( }); tmp.to_reg() } - (_, n) if n == 64 => in_reg, + (_, 64) => in_reg, _ => panic!( "Unsupported input width: input ty {} bits {} mode {:?}", @@ -340,7 +320,7 @@ fn input_to_reg>( /// divide or a right-shift or a compare-to-zero), `narrow_mode` should be /// set to `ZeroExtend` or `SignExtend` as appropriate, and the resulting /// register will be provided the extended value. -fn input_to_rs>( +fn input_to_rs>( ctx: &mut C, input: InsnInput, narrow_mode: NarrowValueMode, @@ -374,7 +354,7 @@ fn input_to_rs>( /// vreg into which the source instruction will generate its value. /// /// See note on `input_to_rs` for a description of `narrow_mode`. -fn input_to_rse>( +fn input_to_rse>( ctx: &mut C, input: InsnInput, narrow_mode: NarrowValueMode, @@ -448,7 +428,7 @@ fn input_to_rse>( ResultRSE::from_rs(input_to_rs(ctx, input, narrow_mode)) } -fn input_to_rse_imm12>( +fn input_to_rse_imm12>( ctx: &mut C, input: InsnInput, narrow_mode: NarrowValueMode, @@ -465,7 +445,7 @@ fn input_to_rse_imm12>( ResultRSEImm12::from_rse(input_to_rse(ctx, input, narrow_mode)) } -fn input_to_rs_immlogic>( +fn input_to_rs_immlogic>( ctx: &mut C, input: InsnInput, narrow_mode: NarrowValueMode, @@ -484,7 +464,10 @@ fn input_to_rs_immlogic>( ResultRSImmLogic::from_rs(input_to_rs(ctx, input, narrow_mode)) } -fn input_to_reg_immshift>(ctx: &mut C, input: InsnInput) -> ResultRegImmShift { +fn input_to_reg_immshift>( + ctx: &mut C, + input: InsnInput, +) -> ResultRegImmShift { if let InsnInputSource::Output(out) = input_source(ctx, input) { if let Some(imm_value) = output_to_const(ctx, out) { if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) { @@ -577,7 +560,7 @@ fn alu_inst_immshift(op: ALUOp, rd: Writable, rn: Reg, rm: ResultRegImmShif // than an `InsnInput`, to do more introspection. /// Lower the address of a load or store. -fn lower_address>( +fn lower_address>( ctx: &mut C, elem_ty: Type, addends: &[InsnInput], @@ -598,7 +581,7 @@ fn lower_address>( if addends.len() == 2 && offset == 0 { let ra = input_to_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64); let rb = input_to_reg(ctx, addends[1], NarrowValueMode::ZeroExtend64); - return MemArg::reg_reg(ra, rb); + return MemArg::reg_plus_reg(ra, rb); } // Otherwise, generate add instructions. 
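// For example (illustrative only, not part of this hunk): an access whose
// address reaches `lower_address` as exactly two register addends and a zero
// constant offset, such as a `load_complex` fed directly by two registers,
// takes the fast path above and uses the register-plus-register addressing
// mode (roughly `ldr x?, [x?, x?]`). Anything with more addends or a nonzero
// offset falls through to the code below, which first materializes the
// address into a temporary register with explicit `add` instructions.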
@@ -621,17 +604,17 @@ fn lower_address>( MemArg::reg(addr.to_reg()) } -fn lower_constant_u64>(ctx: &mut C, rd: Writable, value: u64) { +fn lower_constant_u64>(ctx: &mut C, rd: Writable, value: u64) { for inst in Inst::load_constant(rd, value) { ctx.emit(inst); } } -fn lower_constant_f32>(ctx: &mut C, rd: Writable, value: f32) { +fn lower_constant_f32>(ctx: &mut C, rd: Writable, value: f32) { ctx.emit(Inst::load_fp_constant32(rd, value)); } -fn lower_constant_f64>(ctx: &mut C, rd: Writable, value: f64) { +fn lower_constant_f64>(ctx: &mut C, rd: Writable, value: f64) { ctx.emit(Inst::load_fp_constant64(rd, value)); } @@ -653,7 +636,7 @@ fn lower_condcode(cc: IntCC) -> Cond { } fn lower_fp_condcode(cc: FloatCC) -> Cond { - // Refer to `codegen/shared/src/condcodes.rs` and to the `FCMP` ARM64 docs. + // Refer to `codegen/shared/src/condcodes.rs` and to the `FCMP` AArch64 docs. // The FCMP instruction sets: // NZCV // - PCSR.NZCV = 0011 on UN (unordered), @@ -717,7 +700,7 @@ pub fn condcode_is_signed(cc: IntCC) -> bool { // Top-level instruction lowering entry point, for one instruction. /// Actually codegen an instruction's results into registers. -fn lower_insn_to_regs>(ctx: &mut C, insn: IRInst) { +fn lower_insn_to_regs>(ctx: &mut C, insn: IRInst) { let op = ctx.data(insn).opcode(); let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) .map(|i| InsnInput { insn, input: i }) @@ -1032,13 +1015,13 @@ fn lower_insn_to_regs>(ctx: &mut C, insn: IRInst) { Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => { let ty = ty.unwrap(); - let is32 = ty_bits(ty) <= 32; - let narrow_mode = match (op, is32) { + let size = InstSize::from_bits(ty_bits(ty)); + let narrow_mode = match (op, size) { (Opcode::Ishl, _) => NarrowValueMode::None, - (Opcode::Ushr, false) => NarrowValueMode::ZeroExtend64, - (Opcode::Ushr, true) => NarrowValueMode::ZeroExtend32, - (Opcode::Sshr, false) => NarrowValueMode::SignExtend64, - (Opcode::Sshr, true) => NarrowValueMode::SignExtend32, + (Opcode::Ushr, InstSize::Size64) => NarrowValueMode::ZeroExtend64, + (Opcode::Ushr, InstSize::Size32) => NarrowValueMode::ZeroExtend32, + (Opcode::Sshr, InstSize::Size64) => NarrowValueMode::SignExtend64, + (Opcode::Sshr, InstSize::Size32) => NarrowValueMode::SignExtend32, _ => unreachable!(), }; let rd = output_to_reg(ctx, outputs[0]); @@ -1160,7 +1143,7 @@ fn lower_insn_to_regs>(ctx: &mut C, insn: IRInst) { } Opcode::Rotl => { - // ARM64 does not have a ROL instruction, so we always synthesize + // AArch64 does not have a ROL instruction, so we always synthesize // this as: // // rotl rd, rn, rm @@ -1854,26 +1837,17 @@ fn lower_insn_to_regs>(ctx: &mut C, insn: IRInst) { Opcode::Call => { let extname = ctx.call_target(insn).unwrap(); let extname = extname.clone(); - // HACK: get the function address with an Abs8 reloc in the constant pool. 
- //let tmp = ctx.tmp(RegClass::I64, I64); - //ctx.emit(Inst::LoadExtName { - //rd: tmp, - //name: extname, - //srcloc: loc, - //offset: 0, - //}); let sig = ctx.call_sig(insn).unwrap(); assert!(inputs.len() == sig.params.len()); assert!(outputs.len() == sig.returns.len()); - (ARM64ABICall::from_func(sig, &extname, loc), &inputs[..]) - //(ARM64ABICall::from_ptr(sig, tmp.to_reg(), loc), &inputs[..]) + (AArch64ABICall::from_func(sig, &extname, loc), &inputs[..]) } Opcode::CallIndirect => { let ptr = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); let sig = ctx.call_sig(insn).unwrap(); assert!(inputs.len() - 1 == sig.params.len()); assert!(outputs.len() == sig.returns.len()); - (ARM64ABICall::from_ptr(sig, ptr, loc, op), &inputs[1..]) + (AArch64ABICall::from_ptr(sig, ptr, loc, op), &inputs[1..]) } _ => unreachable!(), }; @@ -2357,21 +2331,6 @@ fn choose_32_64(ty: Type, op32: T, op64: T) -> T { } } -fn branch_target(data: &InstructionData) -> Option { - match data { - &InstructionData::BranchIcmp { destination, .. } - | &InstructionData::Branch { destination, .. } - | &InstructionData::BranchInt { destination, .. } - | &InstructionData::Jump { destination, .. } - | &InstructionData::BranchTable { destination, .. } - | &InstructionData::BranchFloat { destination, .. } => Some(destination), - _ => { - assert!(!data.opcode().is_branch()); - None - } - } -} - fn ldst_offset(data: &InstructionData) -> Option { match data { &InstructionData::Load { offset, .. } @@ -2418,7 +2377,11 @@ fn inst_trapcode(data: &InstructionData) -> Option { } /// Checks for an instance of `op` feeding the given input. Marks as merged (decrementing refcount) if so. -fn maybe_input_insn>(c: &mut C, input: InsnInput, op: Opcode) -> Option { +fn maybe_input_insn>( + c: &mut C, + input: InsnInput, + op: Opcode, +) -> Option { if let InsnInputSource::Output(out) = input_source(c, input) { let data = c.data(out.insn); if data.opcode() == op { @@ -2434,7 +2397,7 @@ fn maybe_input_insn>(c: &mut C, input: InsnInput, op: Opcode) /// /// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it /// a bit more generic. -fn maybe_input_insn_via_conv>( +fn maybe_input_insn_via_conv>( c: &mut C, input: InsnInput, op: Opcode, @@ -2461,7 +2424,7 @@ fn maybe_input_insn_via_conv>( None } -fn lower_icmp_or_ifcmp_to_flags>(ctx: &mut C, insn: IRInst, is_signed: bool) { +fn lower_icmp_or_ifcmp_to_flags>(ctx: &mut C, insn: IRInst, is_signed: bool) { let ty = ctx.input_ty(insn, 0); let bits = ty_bits(ty); let narrow_mode = match (bits <= 32, is_signed) { @@ -2488,7 +2451,7 @@ fn lower_icmp_or_ifcmp_to_flags>(ctx: &mut C, insn: IRInst, is ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); } -fn lower_fcmp_or_ffcmp_to_flags>(ctx: &mut C, insn: IRInst) { +fn lower_fcmp_or_ffcmp_to_flags>(ctx: &mut C, insn: IRInst) { let ty = ctx.input_ty(insn, 0); let bits = ty_bits(ty); let inputs = [ @@ -2517,14 +2480,14 @@ fn lower_fcmp_or_ffcmp_to_flags>(ctx: &mut C, insn: IRInst) { //============================================================================= // Lowering-backend trait implementation. 
-impl LowerBackend for Arm64Backend { +impl LowerBackend for AArch64Backend { type MInst = Inst; - fn lower>(&self, ctx: &mut C, ir_inst: IRInst) { + fn lower>(&self, ctx: &mut C, ir_inst: IRInst) { lower_insn_to_regs(ctx, ir_inst); } - fn lower_branch_group>( + fn lower_branch_group>( &self, ctx: &mut C, branches: &[IRInst], diff --git a/cranelift/codegen/src/isa/arm64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs similarity index 78% rename from cranelift/codegen/src/isa/arm64/mod.rs rename to cranelift/codegen/src/isa/aarch64/mod.rs index fb3543933228..2a71085929e7 100644 --- a/cranelift/codegen/src/isa/arm64/mod.rs +++ b/cranelift/codegen/src/isa/aarch64/mod.rs @@ -2,7 +2,6 @@ use crate::ir::Function; use crate::isa::Builder as IsaBuilder; -use crate::isa::TargetIsa; use crate::machinst::{ compile, MachBackend, MachCompileResult, ShowWithRRU, TargetIsaAdapter, VCode, }; @@ -10,10 +9,9 @@ use crate::result::CodegenResult; use crate::settings; use alloc::boxed::Box; -use std::str::FromStr; use regalloc::RealRegUniverse; -use target_lexicon::Triple; +use target_lexicon::{Aarch64Architecture, Architecture, Triple}; // New backend: mod abi; @@ -22,29 +20,30 @@ mod lower; use inst::create_reg_universe; -/// An ARM64 backend. -pub struct Arm64Backend { +/// An AArch64 backend. +pub struct AArch64Backend { + triple: Triple, flags: settings::Flags, } -impl Arm64Backend { - /// Create a new ARM64 backend with the given (shared) flags. - pub fn new_with_flags(flags: settings::Flags) -> Arm64Backend { - Arm64Backend { flags } +impl AArch64Backend { + /// Create a new AArch64 backend with the given (shared) flags. + pub fn new_with_flags(triple: Triple, flags: settings::Flags) -> AArch64Backend { + AArch64Backend { triple, flags } } - fn compile_vcode(&self, mut func: Function, flags: &settings::Flags) -> VCode { + fn compile_vcode(&self, func: &Function, flags: &settings::Flags) -> VCode { // This performs lowering to VCode, register-allocates the code, computes // block layout and finalizes branches. The result is ready for binary emission. - let abi = Box::new(abi::ARM64ABIBody::new(&func)); - compile::compile::(&mut func, self, abi, flags) + let abi = Box::new(abi::AArch64ABIBody::new(func)); + compile::compile::(func, self, abi, flags) } } -impl MachBackend for Arm64Backend { +impl MachBackend for AArch64Backend { fn compile_function( &self, - func: Function, + func: &Function, want_disasm: bool, ) -> CodegenResult { let flags = self.flags(); @@ -66,11 +65,11 @@ impl MachBackend for Arm64Backend { } fn name(&self) -> &'static str { - "arm64" + "aarch64" } fn triple(&self) -> Triple { - FromStr::from_str("arm64").unwrap() + self.triple.clone() } fn flags(&self) -> &settings::Flags { @@ -84,32 +83,28 @@ impl MachBackend for Arm64Backend { /// Create a new `isa::Builder`. 
pub fn isa_builder(triple: Triple) -> IsaBuilder { + assert!(triple.architecture == Architecture::Aarch64(Aarch64Architecture::Aarch64)); IsaBuilder { triple, setup: settings::builder(), - constructor: isa_constructor, + constructor: |triple, shared_flags, _| { + let backend = AArch64Backend::new_with_flags(triple, shared_flags); + Box::new(TargetIsaAdapter::new(backend)) + }, } } -fn isa_constructor( - _: Triple, - shared_flags: settings::Flags, - _arch_flag_builder: settings::Builder, -) -> Box { - let backend = Arm64Backend::new_with_flags(shared_flags); - Box::new(TargetIsaAdapter::new(backend)) -} - #[cfg(test)] mod test { use super::*; - use crate::binemit::{NullRelocSink, NullStackmapSink, NullTrapSink}; use crate::cursor::{Cursor, FuncCursor}; use crate::ir::types::*; use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature}; use crate::isa::CallConv; use crate::settings; use crate::settings::Configurable; + use core::str::FromStr; + use target_lexicon::Triple; #[test] fn test_compile_function() { @@ -130,8 +125,11 @@ mod test { let mut shared_flags = settings::builder(); shared_flags.set("opt_level", "none").unwrap(); - let backend = Arm64Backend::new_with_flags(settings::Flags::new(shared_flags)); - let sections = backend.compile_function(func, false).unwrap().sections; + let backend = AArch64Backend::new_with_flags( + Triple::from_str("aarch64").unwrap(), + settings::Flags::new(shared_flags), + ); + let sections = backend.compile_function(&mut func, false).unwrap().sections; let code = §ions.sections[0].data; // stp x29, x30, [sp, #-16]! @@ -182,9 +180,12 @@ mod test { let mut shared_flags = settings::builder(); shared_flags.set("opt_level", "none").unwrap(); - let backend = Arm64Backend::new_with_flags(settings::Flags::new(shared_flags)); + let backend = AArch64Backend::new_with_flags( + Triple::from_str("aarch64").unwrap(), + settings::Flags::new(shared_flags), + ); let result = backend - .compile_function(func, /* want_disasm = */ false) + .compile_function(&mut func, /* want_disasm = */ false) .unwrap(); let code = &result.sections.sections[0].data; diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index a0a2a5de878e..c07082836f7c 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -84,7 +84,7 @@ pub mod fde; mod arm32; #[cfg(feature = "arm64")] -mod arm64; +mod aarch64; mod call_conv; mod constraints; @@ -93,6 +93,9 @@ mod encoding; pub mod registers; mod stack; +#[cfg(test)] +mod test_utils; + /// Returns a builder that can create a corresponding `TargetIsa` /// or `Err(LookupError::SupportDisabled)` if not enabled. macro_rules! isa_builder { @@ -117,7 +120,7 @@ pub fn lookup(triple: Triple) -> Result { isa_builder!(x86, "x86", triple) } Architecture::Arm { .. } => isa_builder!(arm32, "arm32", triple), - Architecture::Aarch64 { .. } => isa_builder!(arm64, "arm64", triple), + Architecture::Aarch64 { .. } => isa_builder!(aarch64, "arm64", triple), _ => Err(LookupError::Unsupported), } } diff --git a/cranelift/codegen/src/isa/test_utils.rs b/cranelift/codegen/src/isa/test_utils.rs index 826fabf949f9..c7802b052a21 100644 --- a/cranelift/codegen/src/isa/test_utils.rs +++ b/cranelift/codegen/src/isa/test_utils.rs @@ -1,10 +1,13 @@ +// This is unused when no platforms with the new backend are enabled. 
+#![allow(dead_code)] + use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; use crate::ir::Value; use crate::ir::{ConstantOffset, ExternalName, Function, JumpTable, Opcode, SourceLoc, TrapCode}; use crate::isa::TargetIsa; use alloc::vec::Vec; -use std::string::{String, ToString}; +use std::string::String; pub struct TestCodeSink { bytes: Vec, @@ -16,11 +19,13 @@ impl TestCodeSink { TestCodeSink { bytes: vec![] } } - /// This is pretty lame, but whatever .. + /// Return the code emitted to this sink as a hex string. pub fn stringify(&self) -> String { - let mut s = "".to_string(); + // This is pretty lame, but whatever .. + use std::fmt::Write; + let mut s = String::with_capacity(self.bytes.len() * 2); for b in &self.bytes { - s = s + &format!("{:02X}", b).to_string(); + write!(&mut s, "{:02X}", b).unwrap(); } s } diff --git a/cranelift/codegen/src/lib.rs b/cranelift/codegen/src/lib.rs index 2d6651a67e30..d87bbf26b86b 100644 --- a/cranelift/codegen/src/lib.rs +++ b/cranelift/codegen/src/lib.rs @@ -87,6 +87,7 @@ mod context; mod dce; mod divconst_magic_numbers; mod fx; +mod inst_predicates; mod iterators; mod legalizer; mod licm; diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs index 7aaa66fe1471..11a96c58b2be 100644 --- a/cranelift/codegen/src/machinst/abi.rs +++ b/cranelift/codegen/src/machinst/abi.rs @@ -1,15 +1,17 @@ //! ABI definitions. -use crate::ir; use crate::ir::StackSlot; use crate::machinst::*; use crate::settings; -use regalloc::{Reg, Set, SpillSlot, VirtualReg, Writable}; +use regalloc::{Reg, Set, SpillSlot, Writable}; /// Trait implemented by an object that tracks ABI-related state (e.g., stack /// layout) and can generate code while emitting the *body* of a function. -pub trait ABIBody { +pub trait ABIBody { + /// The instruction type for the ISA associated with this ABI. + type I: VCodeInst; + /// Get the liveins of the function. fn liveins(&self) -> Set; @@ -27,17 +29,19 @@ pub trait ABIBody { /// Generate an instruction which copies an argument to a destination /// register. - fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable) -> I; + fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable) -> Self::I; /// Generate an instruction which copies a source register to a return /// value slot. - fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Reg) -> I; + fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Reg) -> Self::I; /// Generate a return instruction. - fn gen_ret(&self) -> I; + fn gen_ret(&self) -> Self::I; - /// Generate an epilogue placeholder. - fn gen_epilogue_placeholder(&self) -> I; + /// Generate an epilogue placeholder. The returned instruction should return `true` from + /// `is_epilogue_placeholder()`; this is used to indicate to the lowering driver when + /// the epilogue should be inserted. + fn gen_epilogue_placeholder(&self) -> Self::I; // ----------------------------------------------------------------- // Every function above this line may only be called pre-regalloc. @@ -56,32 +60,32 @@ pub trait ABIBody { fn load_stackslot( &self, slot: StackSlot, - offset: usize, + offset: u32, ty: Type, into_reg: Writable, - ) -> I; + ) -> Self::I; /// Store to a stackslot. - fn store_stackslot(&self, slot: StackSlot, offset: usize, ty: Type, from_reg: Reg) -> I; + fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Self::I; /// Load from a spillslot. 
- fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable) -> I; + fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable) -> Self::I; /// Store to a spillslot. - fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> I; + fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Self::I; /// Generate a prologue, post-regalloc. This should include any stack /// frame or other setup necessary to use the other methods (`load_arg`, - /// `store_retval`, and spillslot accesses.) |self| is mutable so that we + /// `store_retval`, and spillslot accesses.) `self` is mutable so that we /// can store information in it which will be useful when creating the /// epilogue. - fn gen_prologue(&mut self, flags: &settings::Flags) -> Vec; + fn gen_prologue(&mut self, flags: &settings::Flags) -> Vec; /// Generate an epilogue, post-regalloc. Note that this must generate the /// actual return instruction (rather than emitting this in the lowering /// logic), because the epilogue code comes before the return and the two are /// likely closely related. - fn gen_epilogue(&self, flags: &settings::Flags) -> Vec; + fn gen_epilogue(&self, flags: &settings::Flags) -> Vec; /// Returns the full frame size for the given function, after prologue emission has run. This /// comprises the spill space, incoming argument space, alignment padding, etc. @@ -91,10 +95,10 @@ pub trait ABIBody { fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32; /// Generate a spill. - fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Type) -> I; + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Type) -> Self::I; /// Generate a reload (fill). - fn gen_reload(&self, to_reg: Writable, from_slot: SpillSlot, ty: Type) -> I; + fn gen_reload(&self, to_reg: Writable, from_slot: SpillSlot, ty: Type) -> Self::I; } /// Trait implemented by an object that tracks ABI-related state and can @@ -111,22 +115,25 @@ pub trait ABIBody { /// and retval copies, and attach the register use/def info to the call. /// /// This trait is thus provided for convenience to the backends. -pub trait ABICall { +pub trait ABICall { + /// The instruction type for the ISA associated with this ABI. + type I: VCodeInst; + /// Get the number of arguments expected. fn num_args(&self) -> usize; /// Save the clobbered registers. /// Copy an argument value from a source register, prior to the call. - fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> I; + fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Self::I; /// Copy a return value into a destination register, after the call returns. - fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable) -> I; + fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable) -> Self::I; /// Pre-adjust the stack, prior to argument copies and call. - fn gen_stack_pre_adjust(&self) -> Vec; + fn gen_stack_pre_adjust(&self) -> Vec; /// Post-adjust the satck, after call return and return-value copies. - fn gen_stack_post_adjust(&self) -> Vec; + fn gen_stack_post_adjust(&self) -> Vec; /// Generate the call itself. /// @@ -138,5 +145,5 @@ pub trait ABICall { /// registers are also logically defs, but should never be read; their /// values are "defined" (to the regalloc) but "undefined" in every other /// sense.) 
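The change running through `ABIBody` and `ABICall` in this hunk moves the machine-instruction type from a trait type parameter to an associated type (`Self::I`). A minimal sketch of that pattern with made-up stand-in types (not the real Cranelift traits), showing what callers gain: bounds no longer have to repeat the instruction type everywhere.

```rust
#![allow(dead_code)]

// Stand-in types only; not the real cranelift-codegen definitions.
#[derive(Debug)]
struct FakeInst(u32);

// Before: the machine-instruction type is a trait parameter, so every user
// must spell out `ABIBodyParam<FakeInst>` in its own bounds.
trait ABIBodyParam<I> {
    fn gen_ret(&self) -> I;
}

// After: the machine-instruction type is an associated type, so users can
// bound on the trait alone (or `I = FakeInst` when it matters).
trait ABIBodyAssoc {
    type I;
    fn gen_ret(&self) -> Self::I;
}

struct FakeAbi;

impl ABIBodyAssoc for FakeAbi {
    type I = FakeInst;
    fn gen_ret(&self) -> FakeInst {
        FakeInst(0xd65f_03c0) // the AArch64 `ret` encoding, purely for flavor
    }
}

// A generic consumer no longer threads `I` through its own signature.
fn emit_ret<A: ABIBodyAssoc>(abi: &A) -> A::I {
    abi.gen_ret()
}

fn main() {
    println!("{:?}", emit_ret(&FakeAbi));
}
```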
- fn gen_call(&self) -> Vec; + fn gen_call(&self) -> Vec; } diff --git a/cranelift/codegen/src/machinst/adapter.rs b/cranelift/codegen/src/machinst/adapter.rs index 3f7c5b7b57f0..c9cf41f359b2 100644 --- a/cranelift/codegen/src/machinst/adapter.rs +++ b/cranelift/codegen/src/machinst/adapter.rs @@ -4,9 +4,12 @@ use crate::binemit; use crate::ir; use crate::isa::{EncInfo, Encoding, Encodings, Legalize, RegClass, RegInfo, TargetIsa}; use crate::machinst::*; -use crate::regalloc::{RegDiversions, RegisterSet}; +use crate::regalloc::RegisterSet; use crate::settings::Flags; +#[cfg(feature = "testing_hooks")] +use crate::regalloc::RegDiversions; + use std::borrow::Cow; use std::fmt; use target_lexicon::Triple; @@ -30,7 +33,11 @@ impl TargetIsaAdapter { impl fmt::Display for TargetIsaAdapter { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "MachBackend") + f.debug_struct("MachBackend") + .field("name", &self.backend.name()) + .field("triple", &self.backend.triple()) + .field("flags", &format!("{}", self.backend.flags())) + .finish() } } diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs index bfd4bf665af7..847f2a6b663c 100644 --- a/cranelift/codegen/src/machinst/blockorder.rs +++ b/cranelift/codegen/src/machinst/blockorder.rs @@ -1,6 +1,7 @@ //! Computation of basic block order in emitted code. use crate::machinst::*; +use regalloc::{BlockIx, Function}; /// Simple reverse postorder-based block order emission. /// @@ -29,9 +30,8 @@ impl BlockRPO { } } - let (start, end) = &vcode.block_ranges[block as usize]; - for i in *start..*end { - if vcode.insts[i as usize].is_epilogue_placeholder() { + for i in vcode.block_insns(BlockIx::new(block)) { + if vcode.get_insn(i).is_epilogue_placeholder() { debug_assert!(self.deferred_last.is_none()); self.deferred_last = Some(block); return; diff --git a/cranelift/codegen/src/machinst/compile.rs b/cranelift/codegen/src/machinst/compile.rs index 458db9ea368d..eda3955f88d3 100644 --- a/cranelift/codegen/src/machinst/compile.rs +++ b/cranelift/codegen/src/machinst/compile.rs @@ -7,14 +7,13 @@ use crate::timing; use log::debug; use regalloc::{allocate_registers, RegAllocAlgorithm}; -use std::env; /// Compile the given function down to VCode with allocated registers, ready /// for binary emission. pub fn compile( - f: &mut Function, + f: &Function, b: &B, - abi: Box>, + abi: Box>, flags: &settings::Flags, ) -> VCode where @@ -28,18 +27,8 @@ where debug!("vcode from lowering: \n{}", vcode.show_rru(Some(universe))); // Perform register allocation. - let algorithm = match env::var("REGALLOC") { - Ok(str) => match str.as_str() { - "lsrac" => RegAllocAlgorithm::LinearScanChecked, - "lsra" => RegAllocAlgorithm::LinearScan, - // to wit: btc doesn't mean "bitcoin" here - "btc" => RegAllocAlgorithm::BacktrackingChecked, - _ => RegAllocAlgorithm::Backtracking, - }, - // By default use backtracking, which is the fastest. - Err(_) => RegAllocAlgorithm::Backtracking, - }; - + // TODO: select register allocation algorithm from flags. 
+ let algorithm = RegAllocAlgorithm::Backtracking; let result = { let _tt = timing::regalloc(); allocate_registers( @@ -70,7 +59,5 @@ where vcode.show_rru(Some(universe)) ); - //println!("{}\n", vcode.show_rru(Some(&B::MInst::reg_universe()))); - vcode } diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs index 2165416ebccb..0d8fb1ff0e0e 100644 --- a/cranelift/codegen/src/machinst/lower.rs +++ b/cranelift/codegen/src/machinst/lower.rs @@ -2,39 +2,37 @@ //! to machine instructions with virtual registers. This is *almost* the final //! machine code, except for register allocation. -use crate::binemit::CodeSink; -use crate::dce::has_side_effect; use crate::entity::SecondaryMap; +use crate::inst_predicates::has_side_effect; +use crate::ir::instructions::BranchInfo; use crate::ir::{ Block, ExternalName, Function, GlobalValueData, Inst, InstructionData, MemFlags, Opcode, Signature, SourceLoc, Type, Value, ValueDef, }; -use crate::isa::registers::RegUnit; -use crate::machinst::{ - ABIBody, BlockIndex, MachInst, MachInstEmit, VCode, VCodeBuilder, VCodeInst, -}; +use crate::machinst::{ABIBody, BlockIndex, VCode, VCodeBuilder, VCodeInst}; use crate::num_uses::NumUses; -use regalloc::Function as RegallocFunction; -use regalloc::{RealReg, Reg, RegClass, Set, VirtualReg, Writable}; +use regalloc::{Reg, RegClass, Set, VirtualReg, Writable}; use alloc::boxed::Box; use alloc::vec::Vec; use log::debug; use smallvec::SmallVec; use std::collections::VecDeque; -use std::ops::Range; /// A context that machine-specific lowering code can use to emit lowered instructions. This is the /// view of the machine-independent per-function lowering context that is seen by the machine /// backend. -pub trait LowerCtx { +pub trait LowerCtx { + /// The instruction type for which this lowering framework is instantiated. + type I; + /// Get the instdata for a given IR instruction. fn data(&self, ir_inst: Inst) -> &InstructionData; /// Get the controlling type for a polymorphic IR instruction. fn ty(&self, ir_inst: Inst) -> Type; /// Emit a machine instruction. - fn emit(&mut self, mach_inst: I); + fn emit(&mut self, mach_inst: Self::I); /// Indicate that an IR instruction has been merged, and so one of its /// uses is gone (replaced by uses of the instruction's inputs). This /// helps the lowering algorithm to perform on-the-fly DCE, skipping over @@ -87,11 +85,11 @@ pub trait LowerBackend { /// Lower a single instruction. Instructions are lowered in reverse order. /// This function need not handle branches; those are always passed to /// `lower_branch_group` below. - fn lower>(&self, ctx: &mut C, inst: Inst); + fn lower>(&self, ctx: &mut C, inst: Inst); /// Lower a block-terminating group of branches (which together can be seen as one /// N-way branch), given a vcode BlockIndex for each target. - fn lower_branch_group>( + fn lower_branch_group>( &self, ctx: &mut C, insts: &[Inst], @@ -103,22 +101,22 @@ pub trait LowerBackend { /// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence /// from original Inst to MachInsts. pub struct Lower<'a, I: VCodeInst> { - // The function to lower. + /// The function to lower. f: &'a Function, - // Lowered machine instructions. + /// Lowered machine instructions. vcode: VCodeBuilder, - // Number of active uses (minus `dec_use()` calls by backend) of each instruction. + /// Number of active uses (minus `dec_use()` calls by backend) of each instruction. 
num_uses: SecondaryMap, - // Mapping from `Value` (SSA value in IR) to virtual register. + /// Mapping from `Value` (SSA value in IR) to virtual register. value_regs: SecondaryMap, - // Return-value vregs. + /// Return-value vregs. retval_regs: Vec, - // Next virtual register number to allocate. + /// Next virtual register number to allocate. next_vreg: u32, } @@ -144,7 +142,7 @@ enum GenerateReturn { impl<'a, I: VCodeInst> Lower<'a, I> { /// Prepare a new lowering context for the given IR function. - pub fn new(f: &'a Function, abi: Box>) -> Lower<'a, I> { + pub fn new(f: &'a Function, abi: Box>) -> Lower<'a, I> { let mut vcode = VCodeBuilder::new(abi); let num_uses = NumUses::compute(f).take_uses(); @@ -244,7 +242,9 @@ impl<'a, I: VCodeInst> Lower<'a, I> { let mut succs: SmallVec<[Block; 16]> = SmallVec::new(); for inst in self.f.layout.block_insts(b) { if self.f.dfg[inst].opcode().is_branch() { - succs.extend(branch_targets(self.f, b, inst).into_iter()); + visit_branch_targets(self.f, b, inst, |succ| { + succs.push(succ); + }); } } for succ in succs.into_iter() { @@ -264,17 +264,14 @@ impl<'a, I: VCodeInst> Lower<'a, I> { /// Lower the function. pub fn lower>(mut self, backend: &B) -> VCode { // Find all reachable blocks. - let mut bbs = self.find_reachable_bbs(); - // Work backward (reverse block order, reverse through each block), skipping insns with zero - // uses. - bbs.reverse(); + let bbs = self.find_reachable_bbs(); // This records a Block-to-BlockIndex map so that branch targets can be resolved. let mut next_bindex = self.vcode.init_bb_map(&bbs[..]); // Allocate a separate BlockIndex for each control-flow instruction so that we can create // the edge blocks later. Each entry for a control-flow inst is the edge block; the list - // has (cf-inst, edge block, orig block) tuples. + // has (control flow inst, edge block, orig block) tuples. let mut edge_blocks_by_inst: SecondaryMap> = SecondaryMap::with_default(vec![]); let mut edge_blocks: Vec<(Inst, BlockIndex, Block)> = vec![]; @@ -282,7 +279,9 @@ impl<'a, I: VCodeInst> Lower<'a, I> { debug!("about to lower function: {:?}", self.f); debug!("bb map: {:?}", self.vcode.blocks_by_bb()); - for bb in bbs.iter() { + // Work backward (reverse block order, reverse through each block), skipping insns with zero + // uses. + for bb in bbs.iter().rev() { for inst in self.f.layout.block_insts(*bb) { let op = self.f.dfg[inst].opcode(); if op.is_branch() { @@ -293,9 +292,9 @@ impl<'a, I: VCodeInst> Lower<'a, I> { edge_blocks_by_inst[inst].push(edge_block); edge_blocks.push((inst, edge_block, next_bb)); }; - for succ in branch_targets(self.f, *bb, inst).into_iter() { + visit_branch_targets(self.f, *bb, inst, |succ| { add_succ(succ); - } + }); } } } @@ -303,7 +302,9 @@ impl<'a, I: VCodeInst> Lower<'a, I> { for bb in bbs.iter() { debug!("lowering bb: {}", bb); - // If this is a return block, produce the return value setup. + // If this is a return block, produce the return value setup. N.B.: this comes + // *before* the below because it must occur *after* any other instructions, and + // instructions are lowered in reverse order. let last_insn = self.f.layout.block_insts(*bb).last().unwrap(); let last_insn_opcode = self.f.dfg[last_insn].opcode(); if last_insn_opcode.is_return() { @@ -513,7 +514,9 @@ impl<'a, I: VCodeInst> Lower<'a, I> { } } -impl<'a, I: VCodeInst> LowerCtx for Lower<'a, I> { +impl<'a, I: VCodeInst> LowerCtx for Lower<'a, I> { + type I = I; + /// Get the instdata for a given IR instruction. 
fn data(&self, ir_inst: Inst) -> &InstructionData { &self.f.dfg[ir_inst] @@ -695,29 +698,23 @@ impl<'a, I: VCodeInst> LowerCtx for Lower<'a, I> { } } -fn branch_targets(f: &Function, block: Block, inst: Inst) -> SmallVec<[Block; 16]> { - let mut ret = SmallVec::new(); +fn visit_branch_targets(f: &Function, block: Block, inst: Inst, mut visit: F) { if f.dfg[inst].opcode() == Opcode::Fallthrough { - ret.push(f.layout.next_block(block).unwrap()); + visit(f.layout.next_block(block).unwrap()); } else { - match &f.dfg[inst] { - &InstructionData::Jump { destination, .. } - | &InstructionData::Branch { destination, .. } - | &InstructionData::BranchInt { destination, .. } - | &InstructionData::BranchIcmp { destination, .. } - | &InstructionData::BranchFloat { destination, .. } => { - ret.push(destination); + match f.dfg[inst].analyze_branch(&f.dfg.value_lists) { + BranchInfo::NotABranch => {} + BranchInfo::SingleDest(dest, _) => { + visit(dest); } - &InstructionData::BranchTable { - destination, table, .. - } => { - ret.push(destination); - for dest in f.jump_tables[table].as_slice() { - ret.push(*dest); + BranchInfo::Table(table, maybe_dest) => { + if let Some(dest) = maybe_dest { + visit(dest); + } + for &dest in f.jump_tables[table].as_slice() { + visit(dest); } } - _ => {} } } - ret } diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs index 93c9126b320f..844d0d1a4f48 100644 --- a/cranelift/codegen/src/machinst/mod.rs +++ b/cranelift/codegen/src/machinst/mod.rs @@ -17,105 +17,97 @@ //! (N.B.: though we show the VCode separately at each stage, the passes //! mutate the VCode in place; these are not separate copies of the code.) //! -//! | ir::Function (SSA IR, machine-independent opcodes) -//! | | -//! | | [lower] -//! | | -//! | VCode (machine instructions: -//! | | - mostly virtual registers. -//! | | - cond branches in two-target form. -//! | | - branch targets are block indices. -//! | | - in-memory constants held by insns, -//! | | with unknown offsets. -//! | | - critical edges (actually all edges) -//! | | are split.) -//! | | [regalloc] -//! | | -//! | VCode (machine instructions: -//! | | - all real registers. -//! | | - new instruction sequence returned -//! | | out-of-band in RegAllocResult. -//! | | - instruction sequence has spills, -//! | | reloads, and moves inserted. -//! | | - other invariants same as above.) -//! | | -//! | | [preamble/postamble] -//! | | -//! | VCode (machine instructions: -//! | | - stack-frame size known. -//! | | - out-of-band instruction sequence -//! | | has preamble prepended to entry -//! | | block, and postamble injected before -//! | | every return instruction. -//! | | - all symbolic stack references to -//! | | stackslots and spillslots are resolved -//! | | to concrete FP-offset mem addresses.) -//! | | [block/insn ordering] -//! | | -//! | VCode (machine instructions: -//! | | - vcode.final_block_order is filled in. -//! | | - new insn sequence from regalloc is -//! | | placed back into vcode and block -//! | | boundaries are updated.) -//! | | [redundant branch/block -//! | | removal] -//! | | -//! | VCode (machine instructions: -//! | | - all blocks that were just an -//! | | unconditional branch are removed.) -//! | | -//! | | [branch finalization -//! | | (fallthroughs)] -//! | | -//! | VCode (machine instructions: -//! | | - all branches are in lowered one- -//! | | target form, but targets are still -//! | | block indices.) -//! | | -//! | | [branch finalization -//! | | (offsets)] -//! | | -//! 
| VCode (machine instructions: -//! | | - all branch offsets from start of -//! | | function are known, and all branches -//! | | have resolved-offset targets.) -//! | | -//! | | [MemArg finalization] -//! | | -//! | VCode (machine instructions: -//! | | - all MemArg references to the constant -//! | | pool are replaced with offsets. -//! | | - all constant-pool data is collected -//! | | in the VCode.) -//! | | -//! | | [binary emission] -//! | | -//! | Vec (machine code!) -//! | - -#![allow(unused_imports)] +//! ```plain +//! +//! ir::Function (SSA IR, machine-independent opcodes) +//! | +//! | [lower] +//! | +//! VCode (machine instructions: +//! | - mostly virtual registers. +//! | - cond branches in two-target form. +//! | - branch targets are block indices. +//! | - in-memory constants held by insns, +//! | with unknown offsets. +//! | - critical edges (actually all edges) +//! | are split.) +//! | [regalloc] +//! | +//! VCode (machine instructions: +//! | - all real registers. +//! | - new instruction sequence returned +//! | out-of-band in RegAllocResult. +//! | - instruction sequence has spills, +//! | reloads, and moves inserted. +//! | - other invariants same as above.) +//! | +//! | [preamble/postamble] +//! | +//! VCode (machine instructions: +//! | - stack-frame size known. +//! | - out-of-band instruction sequence +//! | has preamble prepended to entry +//! | block, and postamble injected before +//! | every return instruction. +//! | - all symbolic stack references to +//! | stackslots and spillslots are resolved +//! | to concrete FP-offset mem addresses.) +//! | [block/insn ordering] +//! | +//! VCode (machine instructions: +//! | - vcode.final_block_order is filled in. +//! | - new insn sequence from regalloc is +//! | placed back into vcode and block +//! | boundaries are updated.) +//! | [redundant branch/block +//! | removal] +//! | +//! VCode (machine instructions: +//! | - all blocks that were just an +//! | unconditional branch are removed.) +//! | +//! | [branch finalization +//! | (fallthroughs)] +//! | +//! VCode (machine instructions: +//! | - all branches are in lowered one- +//! | target form, but targets are still +//! | block indices.) +//! | +//! | [branch finalization +//! | (offsets)] +//! | +//! VCode (machine instructions: +//! | - all branch offsets from start of +//! | function are known, and all branches +//! | have resolved-offset targets.) +//! | +//! | [MemArg finalization] +//! | +//! VCode (machine instructions: +//! | - all MemArg references to the constant +//! | pool are replaced with offsets. +//! | - all constant-pool data is collected +//! | in the VCode.) +//! | +//! | [binary emission] +//! | +//! Vec (machine code!) +//! +//! 
``` -use crate::binemit::{ - CodeInfo, CodeOffset, CodeSink, MemoryCodeSink, RelocSink, StackmapSink, TrapSink, -}; -use crate::entity::EntityRef; +use crate::binemit::{CodeInfo, CodeOffset}; use crate::entity::SecondaryMap; use crate::ir::condcodes::IntCC; -use crate::ir::ValueLocations; -use crate::ir::{DataFlowGraph, Function, Inst, Opcode, Type, Value}; -use crate::isa::RegUnit; +use crate::ir::{Function, Type}; use crate::result::CodegenResult; use crate::settings::Flags; -use crate::HashMap; use alloc::boxed::Box; use alloc::vec::Vec; use core::fmt::Debug; -use core::iter::Sum; use regalloc::Map as RegallocMap; use regalloc::RegUsageCollector; use regalloc::{RealReg, RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable}; -use smallvec::SmallVec; -use std::hash::Hash; use std::string::String; use target_lexicon::Triple; @@ -129,8 +121,8 @@ pub mod blockorder; pub use blockorder::*; pub mod abi; pub use abi::*; -pub mod pp; -pub use pp::*; +pub mod pretty_print; +pub use pretty_print::*; pub mod sections; pub use sections::*; pub mod adapter; @@ -255,10 +247,10 @@ impl MachCompileResult { /// Top-level machine backend trait, which wraps all monomorphized code and /// allows a virtual call from the machine-independent `Function::compile()`. pub trait MachBackend { - /// Compile the given function. Consumes the function. + /// Compile the given function. fn compile_function( &self, - func: Function, + func: &Function, want_disasm: bool, ) -> CodegenResult; diff --git a/cranelift/codegen/src/machinst/pp.rs b/cranelift/codegen/src/machinst/pretty_print.rs similarity index 100% rename from cranelift/codegen/src/machinst/pp.rs rename to cranelift/codegen/src/machinst/pretty_print.rs diff --git a/cranelift/codegen/src/machinst/sections.rs b/cranelift/codegen/src/machinst/sections.rs index 3e387239d074..247adf5cef48 100644 --- a/cranelift/codegen/src/machinst/sections.rs +++ b/cranelift/codegen/src/machinst/sections.rs @@ -3,7 +3,7 @@ //! simultaneously, so we buffer the result in memory and hand off to the //! caller at the end of compilation. -use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc, RelocSink, StackmapSink, TrapSink}; +use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; use alloc::vec::Vec; @@ -104,28 +104,31 @@ pub trait MachSectionOutput { /// Add 2 bytes to the section. fn put2(&mut self, value: u16) { - self.put1((value & 0xff) as u8); - self.put1(((value >> 8) & 0xff) as u8); + let [b0, b1] = value.to_le_bytes(); + self.put1(b0); + self.put1(b1); } /// Add 4 bytes to the section. fn put4(&mut self, value: u32) { - self.put1((value & 0xff) as u8); - self.put1(((value >> 8) & 0xff) as u8); - self.put1(((value >> 16) & 0xff) as u8); - self.put1(((value >> 24) & 0xff) as u8); + let [b0, b1, b2, b3] = value.to_le_bytes(); + self.put1(b0); + self.put1(b1); + self.put1(b2); + self.put1(b3); } /// Add 8 bytes to the section. 
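The `put2`/`put4` rewrite just above (and `put8` just below) swaps manual shift-and-mask byte extraction for `to_le_bytes`. A quick standalone equivalence check in plain Rust, outside the `MachSectionOutput` trait:

```rust
// Not the real MachSectionOutput: just confirm the two emission styles agree.
fn put4_shifts(out: &mut Vec<u8>, value: u32) {
    out.push((value & 0xff) as u8);
    out.push(((value >> 8) & 0xff) as u8);
    out.push(((value >> 16) & 0xff) as u8);
    out.push(((value >> 24) & 0xff) as u8);
}

fn put4_le_bytes(out: &mut Vec<u8>, value: u32) {
    out.extend_from_slice(&value.to_le_bytes());
}

fn main() {
    for &v in &[0u32, 1, 0xdead_beef, u32::MAX] {
        let (mut a, mut b) = (Vec::new(), Vec::new());
        put4_shifts(&mut a, v);
        put4_le_bytes(&mut b, v);
        assert_eq!(a, b, "little-endian byte order must match for {:#x}", v);
    }
    println!("shift form and to_le_bytes form agree");
}
```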
fn put8(&mut self, value: u64) { - self.put1((value & 0xff) as u8); - self.put1(((value >> 8) & 0xff) as u8); - self.put1(((value >> 16) & 0xff) as u8); - self.put1(((value >> 24) & 0xff) as u8); - self.put1(((value >> 32) & 0xff) as u8); - self.put1(((value >> 40) & 0xff) as u8); - self.put1(((value >> 48) & 0xff) as u8); - self.put1(((value >> 56) & 0xff) as u8); + let [b0, b1, b2, b3, b4, b5, b6, b7] = value.to_le_bytes(); + self.put1(b0); + self.put1(b1); + self.put1(b2); + self.put1(b3); + self.put1(b4); + self.put1(b5); + self.put1(b6); + self.put1(b7); } /// Add a slice of bytes to the section. diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs index 64b1a4012af8..6e3adea53aec 100644 --- a/cranelift/codegen/src/machinst/vcode.rs +++ b/cranelift/codegen/src/machinst/vcode.rs @@ -17,7 +17,6 @@ //! See the main module comment in `mod.rs` for more details on the VCode-based //! backend pipeline. -use crate::binemit::Reloc; use crate::ir; use crate::machinst::*; use crate::settings; @@ -32,7 +31,6 @@ use log::debug; use smallvec::SmallVec; use std::fmt; use std::iter; -use std::ops::Index; use std::string::String; /// Index referring to an instruction in VCode. @@ -59,13 +57,13 @@ pub struct VCode { vreg_types: Vec, /// Lowered machine instructions in order corresponding to the original IR. - pub insts: Vec, + insts: Vec, /// Entry block. entry: BlockIndex, /// Block instruction indices. - pub block_ranges: Vec<(InsnIndex, InsnIndex)>, + block_ranges: Vec<(InsnIndex, InsnIndex)>, /// Block successors: index range in the successor-list below. block_succ_range: Vec<(usize, usize)>, @@ -94,7 +92,7 @@ pub struct VCode { code_size: CodeOffset, /// ABI object. - abi: Box>, + abi: Box>, } /// A builder for a VCode function body. This builder is designed for the @@ -128,7 +126,7 @@ pub struct VCodeBuilder { impl VCodeBuilder { /// Create a new VCodeBuilder. - pub fn new(abi: Box>) -> VCodeBuilder { + pub fn new(abi: Box>) -> VCodeBuilder { let vcode = VCode::new(abi); VCodeBuilder { vcode, @@ -139,7 +137,7 @@ impl VCodeBuilder { } /// Access the ABI object. - pub fn abi(&mut self) -> &mut dyn ABIBody { + pub fn abi(&mut self) -> &mut dyn ABIBody { &mut *self.vcode.abi } @@ -282,7 +280,7 @@ fn is_trivial_jump_block(vcode: &VCode, block: BlockIndex) -> O impl VCode { /// New empty VCode. - fn new(abi: Box>) -> VCode { + fn new(abi: Box>) -> VCode { VCode { liveins: abi.liveins(), liveouts: abi.liveouts(), @@ -472,10 +470,10 @@ impl VCode { // Compute block offsets. let mut code_section = MachSectionSize::new(0); let mut block_offsets = vec![0; self.num_blocks()]; - for block in &self.final_block_order { + for &block in &self.final_block_order { code_section.offset = I::align_basic_block(code_section.offset); - block_offsets[*block as usize] = code_section.offset; - let (start, end) = self.block_ranges[*block as usize]; + block_offsets[block as usize] = code_section.offset; + let (start, end) = self.block_ranges[block as usize]; for iix in start..end { self.insts[iix as usize].emit(&mut code_section); } @@ -490,9 +488,9 @@ impl VCode { // it (so forward references are now possible), and (ii) mutates the // instructions. 
let mut code_section = MachSectionSize::new(0); - for block in &self.final_block_order { + for &block in &self.final_block_order { code_section.offset = I::align_basic_block(code_section.offset); - let (start, end) = self.block_ranges[*block as usize]; + let (start, end) = self.block_ranges[block as usize]; for iix in start..end { self.insts[iix as usize] .with_block_offsets(code_section.offset, &self.final_block_offsets[..]); @@ -510,7 +508,7 @@ impl VCode { let code_idx = sections.add_section(0, self.code_size); let code_section = sections.get_section(code_idx); - for block in &self.final_block_order { + for &block in &self.final_block_order { let new_offset = I::align_basic_block(code_section.cur_offset_from_start()); while new_offset > code_section.cur_offset_from_start() { // Pad with NOPs up to the aligned block offset. @@ -519,7 +517,7 @@ impl VCode { } assert_eq!(code_section.cur_offset_from_start(), new_offset); - let (start, end) = self.block_ranges[*block as usize]; + let (start, end) = self.block_ranges[block as usize]; for iix in start..end { self.insts[iix as usize].emit(code_section); } @@ -639,9 +637,6 @@ impl RegallocFunction for VCode { } } -// N.B.: Debug impl assumes that VCode has already been through all compilation -// passes, and so has a final block order and offsets. - impl fmt::Debug for VCode { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { writeln!(f, "VCode_Debug {{")?; @@ -665,22 +660,21 @@ impl fmt::Debug for VCode { } } -// Pretty-printing with `RealRegUniverse` context. +/// Pretty-printing with `RealRegUniverse` context. impl ShowWithRRU for VCode { fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { - use crate::alloc::string::ToString; use std::fmt::Write; // Calculate an order in which to display the blocks. This is the same // as final_block_order, but also includes blocks which are in the // representation but not in final_block_order. let mut display_order = Vec::::new(); - // First display blocks in |final_block_order| + // First display blocks in `final_block_order` for bix in &self.final_block_order { assert!((*bix as usize) < self.num_blocks()); display_order.push(*bix as usize); } - // Now also take care of those not listed in |final_block_order|. + // Now also take care of those not listed in `final_block_order`. // This is quadratic, but it's also debug-only code. 
for bix in 0..self.num_blocks() { if display_order.contains(&bix) { @@ -690,48 +684,46 @@ impl ShowWithRRU for VCode { } let mut s = String::new(); - s = s + &format!("VCode_ShowWithRRU {{{{"); - s = s + &"\n".to_string(); - s = s + &format!(" Entry block: {}", self.entry); - s = s + &"\n".to_string(); - s = s + &format!(" Final block order: {:?}", self.final_block_order); - s = s + &"\n".to_string(); + write!(&mut s, "VCode_ShowWithRRU {{{{\n").unwrap(); + write!(&mut s, " Entry block: {}\n", self.entry).unwrap(); + write!( + &mut s, + " Final block order: {:?}\n", + self.final_block_order + ) + .unwrap(); for i in 0..self.num_blocks() { let block = display_order[i]; - let omitted = - (if !self.final_block_order.is_empty() && i >= self.final_block_order.len() { - "** OMITTED **" - } else { - "" - }) - .to_string(); + let omitted = if !self.final_block_order.is_empty() && i >= self.final_block_order.len() + { + "** OMITTED **" + } else { + "" + }; - s = s + &format!("Block {}: {}", block, omitted); - s = s + &"\n".to_string(); + write!(&mut s, "Block {}: {}\n", block, omitted).unwrap(); if let Some(bb) = self.bindex_to_bb(block as BlockIndex) { - s = s + &format!(" (original IR block: {})\n", bb); + write!(&mut s, " (original IR block: {})\n", bb).unwrap(); } for succ in self.succs(block as BlockIndex) { - s = s + &format!(" (successor: Block {})", succ); - s = s + &"\n".to_string(); + write!(&mut s, " (successor: Block {})\n", succ).unwrap(); } let (start, end) = self.block_ranges[block]; - s = s + &format!(" (instruction range: {} .. {})", start, end); - s = s + &"\n".to_string(); + write!(&mut s, " (instruction range: {} .. {})\n", start, end).unwrap(); for inst in start..end { - s = s + &format!( - " Inst {}: {}", + write!( + &mut s, + " Inst {}: {}\n", inst, self.insts[inst as usize].show_rru(mb_rru) - ); - s = s + &"\n".to_string(); + ) + .unwrap(); } } - s = s + &format!("}}}}"); - s = s + &"\n".to_string(); + write!(&mut s, "}}}}\n").unwrap(); s } diff --git a/cranelift/codegen/src/num_uses.rs b/cranelift/codegen/src/num_uses.rs index c08741020c79..fd6eee8ec152 100644 --- a/cranelift/codegen/src/num_uses.rs +++ b/cranelift/codegen/src/num_uses.rs @@ -1,15 +1,9 @@ //! A pass that computes the number of uses of any given instruction. -#![allow(dead_code)] -#![allow(unused_imports)] - -use crate::cursor::{Cursor, FuncCursor}; -use crate::dce::has_side_effect; use crate::entity::SecondaryMap; use crate::ir::dfg::ValueDef; -use crate::ir::instructions::InstructionData; use crate::ir::Value; -use crate::ir::{DataFlowGraph, Function, Inst, Opcode}; +use crate::ir::{DataFlowGraph, Function, Inst}; /// Auxiliary data structure that counts the number of uses of any given /// instruction in a Function. This is used during instruction selection @@ -51,16 +45,6 @@ impl NumUses { } } - /// How many times is an instruction used? - pub fn use_count(&self, i: Inst) -> usize { - self.uses[i] as usize - } - - /// Is an instruction used at all? - pub fn is_used(&self, i: Inst) -> bool { - self.use_count(i) > 0 - } - /// Take the complete uses map, consuming this analysis result. 
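Both the `TestCodeSink::stringify` change and the `ShowWithRRU` rewrite above replace repeated `s = s + &format!(..)` concatenation with `write!` into a single pre-sized `String` via `std::fmt::Write`. A small standalone version of that pattern, hex-dumping a byte slice the way the test sink does:

```rust
use std::fmt::Write;

// Build the hex string in place instead of allocating a new String per byte.
fn hex_string(bytes: &[u8]) -> String {
    let mut s = String::with_capacity(bytes.len() * 2);
    for b in bytes {
        // Writing to a String cannot fail, so the unwrap never fires.
        write!(&mut s, "{:02X}", b).unwrap();
    }
    s
}

fn main() {
    // `stp fp, lr, [sp, #-16]!` encodes as the bytes FD 7B BF A9.
    assert_eq!(hex_string(&[0xfd, 0x7b, 0xbf, 0xa9]), "FD7BBFA9");
    println!("{}", hex_string(&[0xc0, 0x03, 0x5f, 0xd6]));
}
```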
pub fn take_uses(self) -> SecondaryMap { self.uses diff --git a/cranelift/codegen/src/postopt.rs b/cranelift/codegen/src/postopt.rs index b6c36434a152..9e2179982d3f 100644 --- a/cranelift/codegen/src/postopt.rs +++ b/cranelift/codegen/src/postopt.rs @@ -364,19 +364,17 @@ pub fn do_postopt(func: &mut Function, isa: &dyn TargetIsa) { while let Some(_block) = pos.next_block() { let mut last_flags_clobber = None; while let Some(inst) = pos.next_inst() { - if isa.uses_cpu_flags() { + if !is_mach_backend && isa.uses_cpu_flags() { // Optimize instructions to make use of flags. optimize_cpu_flags(&mut pos, inst, last_flags_clobber, isa); - if !is_mach_backend { - // Track the most recent seen instruction that clobbers the flags. - if let Some(constraints) = isa - .encoding_info() - .operand_constraints(pos.func.encodings[inst]) - { - if constraints.clobbers_flags { - last_flags_clobber = Some(inst) - } + // Track the most recent seen instruction that clobbers the flags. + if let Some(constraints) = isa + .encoding_info() + .operand_constraints(pos.func.encodings[inst]) + { + if constraints.clobbers_flags { + last_flags_clobber = Some(inst) } } } diff --git a/cranelift/codegen/src/verifier/flags.rs b/cranelift/codegen/src/verifier/flags.rs index 76e83ab88a8a..e4cfc8046220 100644 --- a/cranelift/codegen/src/verifier/flags.rs +++ b/cranelift/codegen/src/verifier/flags.rs @@ -28,17 +28,18 @@ pub fn verify_flags( errors: &mut VerifierErrors, ) -> VerifierStepResult<()> { let _tt = timing::verify_flags(); - if isa.is_none() || isa.unwrap().get_mach_backend().is_none() { - let mut verifier = FlagsVerifier { - func, - cfg, - encinfo: isa.map(|isa| isa.encoding_info()), - livein: SecondaryMap::new(), - }; - verifier.check(errors) + let encinfo = if isa.is_none() || isa.unwrap().get_mach_backend().is_some() { + None } else { - Ok(()) - } + Some(isa.unwrap().encoding_info()) + }; + let mut verifier = FlagsVerifier { + func, + cfg, + encinfo, + livein: SecondaryMap::new(), + }; + verifier.check(errors) } struct FlagsVerifier<'a> { diff --git a/cranelift/filetests/filetests/vcode/arm64/arithmetic.clif b/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif similarity index 99% rename from cranelift/filetests/filetests/vcode/arm64/arithmetic.clif rename to cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif index 7fbda32d081f..1f6dcf6b8206 100644 --- a/cranelift/filetests/filetests/vcode/arm64/arithmetic.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f(i64, i64) -> i64 { block0(v0: i64, v1: i64): diff --git a/cranelift/filetests/filetests/vcode/arm64/basic1.clif b/cranelift/filetests/filetests/vcode/aarch64/basic1.clif similarity index 90% rename from cranelift/filetests/filetests/vcode/arm64/basic1.clif rename to cranelift/filetests/filetests/vcode/aarch64/basic1.clif index 29713d3427ce..b5ec1ae16075 100644 --- a/cranelift/filetests/filetests/vcode/arm64/basic1.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/basic1.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f(i32, i32) -> i32 { block0(v0: i32, v1: i32): diff --git a/cranelift/filetests/filetests/vcode/arm64/bitops.clif b/cranelift/filetests/filetests/vcode/aarch64/bitops.clif similarity index 99% rename from cranelift/filetests/filetests/vcode/arm64/bitops.clif rename to cranelift/filetests/filetests/vcode/aarch64/bitops.clif index f2ebc5f003cf..8f5e81d32241 100644 --- 
a/cranelift/filetests/filetests/vcode/arm64/bitops.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/bitops.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %a(i32) -> i32 { block0(v0: i32): diff --git a/cranelift/filetests/filetests/vcode/arm64/call-indirect.clif b/cranelift/filetests/filetests/vcode/aarch64/call-indirect.clif similarity index 91% rename from cranelift/filetests/filetests/vcode/arm64/call-indirect.clif rename to cranelift/filetests/filetests/vcode/aarch64/call-indirect.clif index 84fa72d2db29..c5e8ea059667 100644 --- a/cranelift/filetests/filetests/vcode/arm64/call-indirect.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/call-indirect.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f(i64, i64) -> i64 { sig0 = (i64) -> i64 diff --git a/cranelift/filetests/filetests/vcode/arm64/call.clif b/cranelift/filetests/filetests/vcode/aarch64/call.clif similarity index 90% rename from cranelift/filetests/filetests/vcode/arm64/call.clif rename to cranelift/filetests/filetests/vcode/aarch64/call.clif index 3210db3959c1..1429dceed6f0 100644 --- a/cranelift/filetests/filetests/vcode/arm64/call.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/call.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f(i64) -> i64 { fn0 = %g(i64) -> i64 diff --git a/cranelift/filetests/filetests/vcode/arm64/condbr.clif b/cranelift/filetests/filetests/vcode/aarch64/condbr.clif similarity index 94% rename from cranelift/filetests/filetests/vcode/arm64/condbr.clif rename to cranelift/filetests/filetests/vcode/aarch64/condbr.clif index e85e309ce5a1..596557d8e07f 100644 --- a/cranelift/filetests/filetests/vcode/arm64/condbr.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/condbr.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f(i64, i64) -> b1 { block0(v0: i64, v1: i64): @@ -33,7 +34,7 @@ block2: ; nextln: mov fp, sp ; nextln: subs xzr, x0, x1 ; nextln: b.eq 20 -; check: Block 0: +; check: Block 2: ; check: movz x0, #2 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 @@ -58,7 +59,7 @@ block1: ; check: stp fp, lr, [sp, #-16]! 
; nextln: mov fp, sp ; nextln: subs xzr, x0, x1 -; check: Block 0: +; check: Block 1: ; check: movz x0, #1 ; nextln: mov sp, fp ; nextln: ldp fp, lr, [sp], #16 diff --git a/cranelift/filetests/filetests/vcode/arm64/condops.clif b/cranelift/filetests/filetests/vcode/aarch64/condops.clif similarity index 96% rename from cranelift/filetests/filetests/vcode/arm64/condops.clif rename to cranelift/filetests/filetests/vcode/aarch64/condops.clif index 01d2637e889a..e489836527df 100644 --- a/cranelift/filetests/filetests/vcode/arm64/condops.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/condops.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f(i8, i64, i64) -> i64 { block0(v0: i8, v1: i64, v2: i64): diff --git a/cranelift/filetests/filetests/vcode/arm64/constants.clif b/cranelift/filetests/filetests/vcode/aarch64/constants.clif similarity index 99% rename from cranelift/filetests/filetests/vcode/arm64/constants.clif rename to cranelift/filetests/filetests/vcode/aarch64/constants.clif index 5eca5402d7d9..67667d59c1f2 100644 --- a/cranelift/filetests/filetests/vcode/arm64/constants.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/constants.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f() -> i64 { block0: diff --git a/cranelift/filetests/filetests/vcode/arm64/extend-op.clif b/cranelift/filetests/filetests/vcode/aarch64/extend-op.clif similarity index 92% rename from cranelift/filetests/filetests/vcode/arm64/extend-op.clif rename to cranelift/filetests/filetests/vcode/aarch64/extend-op.clif index 74879c8c11f5..6194dd563f81 100644 --- a/cranelift/filetests/filetests/vcode/arm64/extend-op.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/extend-op.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f(i8) -> i64 { block0(v0: i8): diff --git a/cranelift/filetests/filetests/vcode/arm64/jumptable.clif b/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif similarity index 96% rename from cranelift/filetests/filetests/vcode/arm64/jumptable.clif rename to cranelift/filetests/filetests/vcode/aarch64/jumptable.clif index 0677c3cb7d59..0789173acbfa 100644 --- a/cranelift/filetests/filetests/vcode/arm64/jumptable.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f(i64) -> i64 { jt0 = jump_table [block1, block2, block3] diff --git a/cranelift/filetests/filetests/vcode/arm64/narrow-arithmetic.clif b/cranelift/filetests/filetests/vcode/aarch64/narrow-arithmetic.clif similarity index 98% rename from cranelift/filetests/filetests/vcode/arm64/narrow-arithmetic.clif rename to cranelift/filetests/filetests/vcode/aarch64/narrow-arithmetic.clif index 345a527d8839..d11fc224176c 100644 --- a/cranelift/filetests/filetests/vcode/arm64/narrow-arithmetic.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/narrow-arithmetic.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %add8(i8, i8) -> i8 { block0(v0: i8, v1: i8): diff --git a/cranelift/filetests/filetests/vcode/arm64/saturating-ops.clif b/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif similarity index 96% rename from cranelift/filetests/filetests/vcode/arm64/saturating-ops.clif rename to cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif index a281a25e4b4b..60b45cc07aeb 100644 --- a/cranelift/filetests/filetests/vcode/arm64/saturating-ops.clif +++ 
b/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %uaddsat64(i64, i64) -> i64 { block0(v0: i64, v1: i64): diff --git a/cranelift/filetests/filetests/vcode/arm64/shift-op.clif b/cranelift/filetests/filetests/vcode/aarch64/shift-op.clif similarity index 91% rename from cranelift/filetests/filetests/vcode/arm64/shift-op.clif rename to cranelift/filetests/filetests/vcode/aarch64/shift-op.clif index 852668081d15..12984620a1e9 100644 --- a/cranelift/filetests/filetests/vcode/arm64/shift-op.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/shift-op.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f(i64) -> i64 { block0(v0: i64): diff --git a/cranelift/filetests/filetests/vcode/arm64/shift-rotate.clif b/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif similarity index 99% rename from cranelift/filetests/filetests/vcode/arm64/shift-rotate.clif rename to cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif index bd56d4da5a64..b865cc29027b 100644 --- a/cranelift/filetests/filetests/vcode/arm64/shift-rotate.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; ROR, variable diff --git a/cranelift/filetests/filetests/vcode/arm64/symbol-value.clif b/cranelift/filetests/filetests/vcode/aarch64/symbol-value.clif similarity index 90% rename from cranelift/filetests/filetests/vcode/arm64/symbol-value.clif rename to cranelift/filetests/filetests/vcode/aarch64/symbol-value.clif index cf22b20ff9b9..01c0a8a46b19 100644 --- a/cranelift/filetests/filetests/vcode/arm64/symbol-value.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/symbol-value.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f() -> i64 { gv0 = symbol %my_global diff --git a/cranelift/filetests/filetests/vcode/arm64/traps.clif b/cranelift/filetests/filetests/vcode/aarch64/traps.clif similarity index 91% rename from cranelift/filetests/filetests/vcode/arm64/traps.clif rename to cranelift/filetests/filetests/vcode/aarch64/traps.clif index 9f4a40ef12e1..b4c4be344b31 100644 --- a/cranelift/filetests/filetests/vcode/arm64/traps.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/traps.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f() { block0: diff --git a/cranelift/filetests/filetests/vcode/arm64/uextend-sextend.clif b/cranelift/filetests/filetests/vcode/aarch64/uextend-sextend.clif similarity index 99% rename from cranelift/filetests/filetests/vcode/arm64/uextend-sextend.clif rename to cranelift/filetests/filetests/vcode/aarch64/uextend-sextend.clif index 85a5c488a280..86084ff0cc57 100644 --- a/cranelift/filetests/filetests/vcode/arm64/uextend-sextend.clif +++ b/cranelift/filetests/filetests/vcode/aarch64/uextend-sextend.clif @@ -1,4 +1,5 @@ -test vcode arch=arm64 +test vcode +target aarch64 function %f_u_8_64(i8) -> i64 { block0(v0: i8): diff --git a/cranelift/filetests/src/test_vcode.rs b/cranelift/filetests/src/test_vcode.rs index f97aef47ea24..93bce57a59e7 100644 --- a/cranelift/filetests/src/test_vcode.rs +++ b/cranelift/filetests/src/test_vcode.rs @@ -4,11 +4,9 @@ use cranelift_codegen::isa::lookup; use cranelift_codegen::settings; use cranelift_codegen::Context as CodegenContext; use cranelift_reader::{TestCommand, TestOption}; -use target_lexicon::Triple; use log::info; use 
std::borrow::Cow; -use std::str::FromStr; use std::string::String; struct TestVCode { @@ -41,15 +39,13 @@ impl SubTest for TestVCode { } fn needs_isa(&self) -> bool { - false + true } fn run(&self, func: Cow, context: &Context) -> SubtestResult<()> { + let triple = context.isa.unwrap().triple().clone(); let func = func.into_owned(); - let triple = - Triple::from_str(&self.arch).map_err(|_| format!("Unknown arch: '{}'", self.arch))?; - let mut isa = lookup(triple) .map_err(|_| format!("Could not look up backend for arch '{}'", self.arch))? .finish(settings::Flags::new(settings::builder())); diff --git a/crates/jit/src/link.rs b/crates/jit/src/link.rs index 8ffe7295260c..824c35ced6a9 100644 --- a/crates/jit/src/link.rs +++ b/crates/jit/src/link.rs @@ -142,12 +142,17 @@ cfg_if::cfg_if! { pub fn ___chkstk(); } const PROBESTACK: unsafe extern "C" fn() = ___chkstk; + } else if #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] { + // As per + // https://github.com/rust-lang/compiler-builtins/blob/cae3e6ea23739166504f9f9fb50ec070097979d4/src/probestack.rs#L39, + // LLVM only has stack-probe support on x86-64 and x86. Thus, on any other CPU + // architecture, we simply use an empty stack-probe function. + extern "C" fn empty_probestack() {} + const PROBESTACK: unsafe extern "C" fn() = empty_probestack; } else { extern "C" { pub fn __rust_probestack(); } - static PROBESTACK: unsafe extern "C" fn() = empty_probestack; + static PROBESTACK: unsafe extern "C" fn() = __rust_probestack; } } - -extern "C" fn empty_probestack() {} diff --git a/crates/runtime/src/traphandlers.rs b/crates/runtime/src/traphandlers.rs index 571f823b3f63..e180b6c91bad 100644 --- a/crates/runtime/src/traphandlers.rs +++ b/crates/runtime/src/traphandlers.rs @@ -31,7 +31,6 @@ cfg_if::cfg_if! { static mut PREV_SIGBUS: MaybeUninit = MaybeUninit::uninit(); static mut PREV_SIGILL: MaybeUninit = MaybeUninit::uninit(); static mut PREV_SIGFPE: MaybeUninit = MaybeUninit::uninit(); - static mut PREV_SIGTRAP: MaybeUninit = MaybeUninit::uninit(); unsafe fn platform_init() { let register = |slot: &mut MaybeUninit, signal: i32| { @@ -71,9 +70,6 @@ cfg_if::cfg_if! { register(&mut PREV_SIGFPE, libc::SIGFPE); } - // on ARM64, we use `brk` to report traps, which generates SIGTRAP. - register(&mut PREV_SIGTRAP, libc::SIGTRAP); - // On ARM, handle Unaligned Accesses. // On Darwin, guard page accesses are raised as SIGBUS. if cfg!(target_arch = "arm") || cfg!(target_os = "macos") { @@ -91,7 +87,6 @@ cfg_if::cfg_if! { libc::SIGBUS => &PREV_SIGBUS, libc::SIGFPE => &PREV_SIGFPE, libc::SIGILL => &PREV_SIGILL, - libc::SIGTRAP => &PREV_SIGTRAP, _ => panic!("unknown signal: {}", signum), }; let handled = tls::with(|info| { diff --git a/tests/custom_signal_handler.rs b/tests/custom_signal_handler.rs index 8b3c8cd478de..27d14fc910a5 100644 --- a/tests/custom_signal_handler.rs +++ b/tests/custom_signal_handler.rs @@ -122,7 +122,7 @@ mod tests { .downcast::()?; assert!( trap.message() - .starts_with("wasm trap: out of bounds"), + .starts_with("wasm trap: out of bounds memory access"), "bad trap message: {:?}", trap.message() ); @@ -149,7 +149,7 @@ mod tests { .downcast::()?; assert!(trap .message() - .starts_with("wasm trap: out of bounds")); + .starts_with("wasm trap: out of bounds memory access")); } Ok(()) }
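The `link.rs` change above selects a stack-probe shim per architecture because LLVM only emits stack-probe calls on x86 and x86-64. A condensed sketch of that selection, assuming the `cfg-if` crate as a dependency; the names mirror the diff but this is not the full wasmtime-jit module (the Windows `___chkstk` branch is omitted):

```rust
// Sketch only: pick a PROBESTACK function pointer per target architecture.
cfg_if::cfg_if! {
    if #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] {
        // LLVM never emits stack-probe calls on these targets, so an empty
        // shim is enough to satisfy the relocation.
        extern "C" fn empty_probestack() {}
        // Safe `extern "C" fn` pointers coerce to unsafe ones.
        const PROBESTACK: unsafe extern "C" fn() = empty_probestack;
    } else {
        extern "C" {
            // Provided by compiler-builtins on x86/x86-64.
            fn __rust_probestack();
        }
        const PROBESTACK: unsafe extern "C" fn() = __rust_probestack;
    }
}

fn main() {
    println!("stack probe handler lives at {:p}", PROBESTACK as *const ());
}
```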