diff --git a/Cargo.lock b/Cargo.lock index 86ab801ad16f..83d35409f1ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -379,6 +379,7 @@ dependencies = [ "gimli", "hashbrown 0.7.1", "log", + "regalloc", "serde", "smallvec", "target-lexicon", @@ -432,6 +433,7 @@ dependencies = [ "memmap", "num_cpus", "region", + "target-lexicon", ] [[package]] @@ -1589,6 +1591,16 @@ dependencies = [ "rust-argon2", ] +[[package]] +name = "regalloc" +version = "0.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ce0cd835fa6e91bbf5d010beee19d0c2e97e4ad5e13c399a31122cfc83bdd6" +dependencies = [ + "log", + "rustc-hash", +] + [[package]] name = "regex" version = "1.3.6" @@ -1653,6 +1665,12 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c691c0e608126e00913e33f0ccf3727d5fc84573623b8d65b2df340b5201783" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustc_version" version = "0.2.3" diff --git a/cranelift/codegen/Cargo.toml b/cranelift/codegen/Cargo.toml index 148fcf93273f..8bf10759c4be 100644 --- a/cranelift/codegen/Cargo.toml +++ b/cranelift/codegen/Cargo.toml @@ -24,6 +24,7 @@ gimli = { version = "0.20.0", default-features = false, features = ["write"], op smallvec = { version = "1.0.0" } thiserror = "1.0.4" byteorder = { version = "1.3.2", default-features = false } +regalloc = "0.0.17" # It is a goal of the cranelift-codegen crate to have minimal external dependencies. # Please don't add any unless they are essential to the task of creating binary # machine code. Integration tests that need external dependencies can be diff --git a/cranelift/codegen/meta/src/isa/arm64/mod.rs b/cranelift/codegen/meta/src/isa/arm64/mod.rs index 3440c8af8229..5d8bc76fc444 100644 --- a/cranelift/codegen/meta/src/isa/arm64/mod.rs +++ b/cranelift/codegen/meta/src/isa/arm64/mod.rs @@ -54,7 +54,9 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { let mut a64 = CpuMode::new("A64"); // TODO refine these. + let expand_flags = shared_defs.transform_groups.by_name("expand_flags"); let narrow_flags = shared_defs.transform_groups.by_name("narrow_flags"); + a64.legalize_monomorphic(expand_flags); a64.legalize_default(narrow_flags); let cpu_modes = vec![a64]; diff --git a/cranelift/codegen/src/binemit/mod.rs b/cranelift/codegen/src/binemit/mod.rs index 3a33649d4de0..33655a26bd44 100644 --- a/cranelift/codegen/src/binemit/mod.rs +++ b/cranelift/codegen/src/binemit/mod.rs @@ -54,7 +54,9 @@ pub enum Reloc { X86GOTPCRel4, /// Arm32 call target Arm32Call, - /// Arm64 call target + /// Arm64 call target. Encoded as bottom 26 bits of instruction. This + /// value is sign-extended, multiplied by 4, and added to the PC of + /// the call instruction to form the destination address. 
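+    /// For example (illustrative values only): a branch-and-link whose target
+    /// lies 0x100 bytes past the branch instruction stores 0x40 (0x100 / 4) in
+    /// its low 26 bits, while backward targets use the two's-complement
+    /// encoding of the negative word offset.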
Arm64Call, /// RISC-V call target RiscvCall, diff --git a/cranelift/codegen/src/context.rs b/cranelift/codegen/src/context.rs index 967d302ecb9f..faf8c23d4e39 100644 --- a/cranelift/codegen/src/context.rs +++ b/cranelift/codegen/src/context.rs @@ -19,8 +19,10 @@ use crate::flowgraph::ControlFlowGraph; use crate::ir::Function; use crate::isa::TargetIsa; use crate::legalize_function; +use crate::legalizer::simple_legalize; use crate::licm::do_licm; use crate::loop_analysis::LoopAnalysis; +use crate::machinst::MachCompileResult; use crate::nan_canonicalization::do_nan_canonicalization; use crate::postopt::do_postopt; use crate::redundant_reload_remover::RedundantReloadRemover; @@ -55,6 +57,12 @@ pub struct Context { /// Redundant-reload remover context. pub redundant_reload_remover: RedundantReloadRemover, + + /// Result of MachBackend compilation, if computed. + pub mach_compile_result: Option, + + /// Flag: do we want a disassembly with the MachCompileResult? + pub want_disasm: bool, } impl Context { @@ -78,6 +86,8 @@ impl Context { regalloc: regalloc::Context::new(), loop_analysis: LoopAnalysis::new(), redundant_reload_remover: RedundantReloadRemover::new(), + mach_compile_result: None, + want_disasm: false, } } @@ -89,6 +99,14 @@ impl Context { self.regalloc.clear(); self.loop_analysis.clear(); self.redundant_reload_remover.clear(); + self.mach_compile_result = None; + self.want_disasm = false; + } + + /// Set the flag to request a disassembly when compiling with a + /// `MachBackend` backend. + pub fn set_disasm(&mut self, val: bool) { + self.want_disasm = val; } /// Compile the function, and emit machine code into a `Vec`. @@ -130,9 +148,13 @@ impl Context { pub fn compile(&mut self, isa: &dyn TargetIsa) -> CodegenResult { let _tt = timing::compile(); self.verify_if(isa)?; - debug!("Compiling:\n{}", self.func.display(isa)); let opt_level = isa.flags().opt_level(); + debug!( + "Compiling (opt level {:?}):\n{}", + opt_level, + self.func.display(isa) + ); self.compute_cfg(); if opt_level != OptLevel::None { @@ -141,6 +163,7 @@ impl Context { if isa.flags().enable_nan_canonicalization() { self.canonicalize_nans(isa)?; } + self.legalize(isa)?; if opt_level != OptLevel::None { self.postopt(isa)?; @@ -149,23 +172,32 @@ impl Context { self.licm(isa)?; self.simple_gvn(isa)?; } + self.compute_domtree(); self.eliminate_unreachable_code(isa)?; if opt_level != OptLevel::None { self.dce(isa)?; } - self.regalloc(isa)?; - self.prologue_epilogue(isa)?; - if opt_level == OptLevel::Speed || opt_level == OptLevel::SpeedAndSize { - self.redundant_reload_remover(isa)?; - } - if opt_level == OptLevel::SpeedAndSize { - self.shrink_instructions(isa)?; - } - let result = self.relax_branches(isa); - debug!("Compiled:\n{}", self.func.display(isa)); - result + if let Some(backend) = isa.get_mach_backend() { + let result = backend.compile_function(&mut self.func, self.want_disasm)?; + let info = result.code_info(); + self.mach_compile_result = Some(result); + Ok(info) + } else { + self.regalloc(isa)?; + self.prologue_epilogue(isa)?; + if opt_level == OptLevel::Speed || opt_level == OptLevel::SpeedAndSize { + self.redundant_reload_remover(isa)?; + } + if opt_level == OptLevel::SpeedAndSize { + self.shrink_instructions(isa)?; + } + let result = self.relax_branches(isa); + + debug!("Compiled:\n{}", self.func.display(isa)); + result + } } /// Emit machine code directly into raw memory. 
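A minimal usage sketch of the new `want_disasm` flag and `mach_compile_result` field through the existing `Context` API, assuming an `isa` obtained from the usual `isa::lookup` flow whose `get_mach_backend()` returns `Some`, and a `func` already built by a frontend:

use cranelift_codegen::ir::Function;
use cranelift_codegen::isa::TargetIsa;
use cranelift_codegen::{CodegenResult, Context};

/// Compile `func` for `isa`, requesting a disassembly from the new backend.
fn compile_with_mach_backend(isa: &dyn TargetIsa, func: Function) -> CodegenResult<()> {
    let mut ctx = Context::new();
    ctx.func = func;
    ctx.set_disasm(true); // ask the MachBackend to record a disassembly
    let info = ctx.compile(isa)?; // takes the MachBackend path when one is present
    // The emitted sections now live on the context for later emission.
    assert!(ctx.mach_compile_result.is_some());
    println!("compiled {} bytes of code", info.code_size);
    Ok(())
}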
@@ -191,7 +223,11 @@ impl Context { ) -> CodeInfo { let _tt = timing::binemit(); let mut sink = MemoryCodeSink::new(mem, relocs, traps, stackmaps); - isa.emit_function_to_memory(&self.func, &mut sink); + if let Some(ref result) = &self.mach_compile_result { + result.sections.emit(&mut sink); + } else { + isa.emit_function_to_memory(&self.func, &mut sink); + } sink.info } @@ -279,9 +315,15 @@ impl Context { // TODO: Avoid doing this when legalization doesn't actually mutate the CFG. self.domtree.clear(); self.loop_analysis.clear(); - legalize_function(&mut self.func, &mut self.cfg, isa); - debug!("Legalized:\n{}", self.func.display(isa)); - self.verify_if(isa) + if isa.get_mach_backend().is_some() { + // Run some specific legalizations only. + simple_legalize(&mut self.func, &mut self.cfg, isa); + self.verify_if(isa) + } else { + legalize_function(&mut self.func, &mut self.cfg, isa); + debug!("Legalized:\n{}", self.func.display(isa)); + self.verify_if(isa) + } } /// Perform post-legalization rewrites on the function. diff --git a/cranelift/codegen/src/dce.rs b/cranelift/codegen/src/dce.rs index b217534c3e55..e3e855806da8 100644 --- a/cranelift/codegen/src/dce.rs +++ b/cranelift/codegen/src/dce.rs @@ -6,40 +6,10 @@ use crate::cursor::{Cursor, FuncCursor}; use crate::dominator_tree::DominatorTree; use crate::entity::EntityRef; -use crate::ir::instructions::InstructionData; -use crate::ir::{DataFlowGraph, Function, Inst, Opcode}; +use crate::inst_predicates::{any_inst_results_used, has_side_effect}; +use crate::ir::Function; use crate::timing; -/// Test whether the given opcode is unsafe to even consider for DCE. -fn trivially_unsafe_for_dce(opcode: Opcode) -> bool { - opcode.is_call() - || opcode.is_branch() - || opcode.is_terminator() - || opcode.is_return() - || opcode.can_trap() - || opcode.other_side_effects() - || opcode.can_store() -} - -/// Preserve instructions with used result values. -fn any_inst_results_used(inst: Inst, live: &[bool], dfg: &DataFlowGraph) -> bool { - dfg.inst_results(inst).iter().any(|v| live[v.index()]) -} - -/// Load instructions without the `notrap` flag are defined to trap when -/// operating on inaccessible memory, so we can't DCE them even if the -/// loaded value is unused. -fn is_load_with_defined_trapping(opcode: Opcode, data: &InstructionData) -> bool { - if !opcode.can_load() { - return false; - } - match *data { - InstructionData::StackLoad { .. } => false, - InstructionData::Load { flags, .. } => !flags.notrap(), - _ => true, - } -} - /// Perform DCE on `func`. pub fn do_dce(func: &mut Function, domtree: &mut DominatorTree) { let _tt = timing::dce(); @@ -50,10 +20,7 @@ pub fn do_dce(func: &mut Function, domtree: &mut DominatorTree) { let mut pos = FuncCursor::new(func).at_bottom(block); while let Some(inst) = pos.prev_inst() { { - let data = &pos.func.dfg[inst]; - let opcode = data.opcode(); - if trivially_unsafe_for_dce(opcode) - || is_load_with_defined_trapping(opcode, &data) + if has_side_effect(pos.func, inst) || any_inst_results_used(inst, &live, &pos.func.dfg) { for arg in pos.func.dfg.inst_args(inst) { diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs new file mode 100644 index 000000000000..9cefbc38f921 --- /dev/null +++ b/cranelift/codegen/src/inst_predicates.rs @@ -0,0 +1,42 @@ +//! Instruction predicates/properties, shared by various analyses. 
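+//!
+//! These helpers were previously private to the DCE pass; they are hoisted
+//! here so that other analyses can share the same notion of side effects and
+//! result liveness.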
+ +use crate::ir::{DataFlowGraph, Function, Inst, InstructionData, Opcode}; +use cranelift_entity::EntityRef; + +/// Preserve instructions with used result values. +pub fn any_inst_results_used(inst: Inst, live: &[bool], dfg: &DataFlowGraph) -> bool { + dfg.inst_results(inst).iter().any(|v| live[v.index()]) +} + +/// Test whether the given opcode is unsafe to even consider as side-effect-free. +fn trivially_has_side_effects(opcode: Opcode) -> bool { + opcode.is_call() + || opcode.is_branch() + || opcode.is_terminator() + || opcode.is_return() + || opcode.can_trap() + || opcode.other_side_effects() + || opcode.can_store() +} + +/// Load instructions without the `notrap` flag are defined to trap when +/// operating on inaccessible memory, so we can't treat them as side-effect-free even if the loaded +/// value is unused. +fn is_load_with_defined_trapping(opcode: Opcode, data: &InstructionData) -> bool { + if !opcode.can_load() { + return false; + } + match *data { + InstructionData::StackLoad { .. } => false, + InstructionData::Load { flags, .. } => !flags.notrap(), + _ => true, + } +} + +/// Does the given instruction have any side-effect that would preclude it from being removed when +/// its value is unused? +pub fn has_side_effect(func: &Function, inst: Inst) -> bool { + let data = &func.dfg[inst]; + let opcode = data.opcode(); + trivially_has_side_effects(opcode) || is_load_with_defined_trapping(opcode, data) +} diff --git a/cranelift/codegen/src/ir/function.rs b/cranelift/codegen/src/ir/function.rs index 1e72d2bc48c8..4a3829780bb0 100644 --- a/cranelift/codegen/src/ir/function.rs +++ b/cranelift/codegen/src/ir/function.rs @@ -238,13 +238,21 @@ impl Function { /// Wrapper around `encode` which assigns `inst` the resulting encoding. pub fn update_encoding(&mut self, inst: ir::Inst, isa: &dyn TargetIsa) -> Result<(), Legalize> { - self.encode(inst, isa).map(|e| self.encodings[inst] = e) + if isa.get_mach_backend().is_some() { + Ok(()) + } else { + self.encode(inst, isa).map(|e| self.encodings[inst] = e) + } } /// Wrapper around `TargetIsa::encode` for encoding an existing instruction /// in the `Function`. pub fn encode(&self, inst: ir::Inst, isa: &dyn TargetIsa) -> Result { - isa.encode(&self, &self.dfg[inst], self.dfg.ctrl_typevar(inst)) + if isa.get_mach_backend().is_some() { + Ok(Encoding::new(0, 0)) + } else { + isa.encode(&self, &self.dfg[inst], self.dfg.ctrl_typevar(inst)) + } } /// Starts collection of debug information. diff --git a/cranelift/codegen/src/ir/immediates.rs b/cranelift/codegen/src/ir/immediates.rs index b1d142bd9e41..5104d83f9dbc 100644 --- a/cranelift/codegen/src/ir/immediates.rs +++ b/cranelift/codegen/src/ir/immediates.rs @@ -57,6 +57,11 @@ impl Imm64 { pub fn wrapping_neg(self) -> Self { Self(self.0.wrapping_neg()) } + + /// Return bits of this immediate. + pub fn bits(&self) -> i64 { + self.0 + } } impl Into for Imm64 { diff --git a/cranelift/codegen/src/isa/aarch64/abi.rs b/cranelift/codegen/src/isa/aarch64/abi.rs new file mode 100644 index 000000000000..88aa60f8af0f --- /dev/null +++ b/cranelift/codegen/src/isa/aarch64/abi.rs @@ -0,0 +1,885 @@ +//! Implementation of the standard AArch64 ABI. + +use crate::ir; +use crate::ir::types; +use crate::ir::types::*; +use crate::ir::StackSlot; +use crate::isa; +use crate::isa::aarch64::inst::*; +use crate::machinst::*; +use crate::settings; + +use alloc::vec::Vec; + +use regalloc::{RealReg, Reg, RegClass, Set, SpillSlot, Writable}; + +use log::debug; + +/// A location for an argument or return value. 
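+/// For example, under the standard AAPCS mapping below, the first `i64`
+/// argument becomes `Reg(x0, I64)` while the ninth becomes `Stack(0, I64)`,
+/// i.e. the first 8-byte slot above SP at entry.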
+#[derive(Clone, Copy, Debug)] +enum ABIArg { + /// In a real register. + Reg(RealReg, ir::Type), + /// Arguments only: on stack, at given offset from SP at entry. + Stack(i64, ir::Type), +} + +/// AArch64 ABI information shared between body (callee) and caller. +struct ABISig { + args: Vec, + rets: Vec, + stack_arg_space: i64, + call_conv: isa::CallConv, +} + +// Spidermonkey specific ABI convention. + +/// This is SpiderMonkey's `WasmTableCallSigReg`. +static BALDRDASH_SIG_REG: u8 = 10; + +/// This is SpiderMonkey's `WasmTlsReg`. +static BALDRDASH_TLS_REG: u8 = 23; + +// These two lists represent the registers the JIT may *not* use at any point in generated code. +// +// So these are callee-preserved from the JIT's point of view, and every register not in this list +// has to be caller-preserved by definition. +// +// Keep these lists in sync with the NonAllocatableMask set in Spidermonkey's +// Architecture-arm64.cpp. + +// Indexed by physical register number. +#[rustfmt::skip] +static BALDRDASH_JIT_CALLEE_SAVED_GPR: &[bool] = &[ + /* 0 = */ false, false, false, false, false, false, false, false, + /* 8 = */ false, false, false, false, false, false, false, false, + /* 16 = */ true /* x16 / ip1 */, true /* x17 / ip2 */, true /* x18 / TLS */, false, + /* 20 = */ false, false, false, false, + /* 24 = */ false, false, false, false, + // There should be 28, the pseudo stack pointer in this list, however the wasm stubs trash it + // gladly right now. + /* 28 = */ false, false, true /* x30 = FP */, true /* x31 = SP */ +]; + +#[rustfmt::skip] +static BALDRDASH_JIT_CALLEE_SAVED_FPU: &[bool] = &[ + /* 0 = */ false, false, false, false, false, false, false, false, + /* 8 = */ false, false, false, false, false, false, false, false, + /* 16 = */ false, false, false, false, false, false, false, false, + /* 24 = */ false, false, false, false, false, false, false, true /* v31 / d31 */ +]; + +/// Try to fill a Baldrdash register, returning it if it was found. +fn try_fill_baldrdash_reg(call_conv: isa::CallConv, param: &ir::AbiParam) -> Option { + if call_conv.extends_baldrdash() { + match ¶m.purpose { + &ir::ArgumentPurpose::VMContext => { + // This is SpiderMonkey's `WasmTlsReg`. + Some(ABIArg::Reg( + xreg(BALDRDASH_TLS_REG).to_real_reg(), + ir::types::I64, + )) + } + &ir::ArgumentPurpose::SignatureId => { + // This is SpiderMonkey's `WasmTableCallSigReg`. + Some(ABIArg::Reg( + xreg(BALDRDASH_SIG_REG).to_real_reg(), + ir::types::I64, + )) + } + _ => None, + } + } else { + None + } +} + +/// Process a list of parameters or return values and allocate them to X-regs, +/// V-regs, and stack slots. +/// +/// Returns the list of argument locations, and the stack-space used (rounded up +/// to a 16-byte-aligned boundary). +fn compute_arg_locs(call_conv: isa::CallConv, params: &[ir::AbiParam]) -> (Vec, i64) { + // See AArch64 ABI (https://c9x.me/compile/bib/abi-arm64.pdf), sections 5.4. + let mut next_xreg = 0; + let mut next_vreg = 0; + let mut next_stack: u64 = 0; + let mut ret = vec![]; + for param in params { + // Validate "purpose". 
+ match ¶m.purpose { + &ir::ArgumentPurpose::VMContext + | &ir::ArgumentPurpose::Normal + | &ir::ArgumentPurpose::SignatureId => {} + _ => panic!( + "Unsupported argument purpose {:?} in signature: {:?}", + param.purpose, params + ), + } + + if in_int_reg(param.value_type) { + if let Some(param) = try_fill_baldrdash_reg(call_conv, param) { + ret.push(param); + } else if next_xreg < 8 { + ret.push(ABIArg::Reg(xreg(next_xreg).to_real_reg(), param.value_type)); + next_xreg += 1; + } else { + ret.push(ABIArg::Stack(next_stack as i64, param.value_type)); + next_stack += 8; + } + } else if in_vec_reg(param.value_type) { + if next_vreg < 8 { + ret.push(ABIArg::Reg(vreg(next_vreg).to_real_reg(), param.value_type)); + next_vreg += 1; + } else { + let size: u64 = match param.value_type { + F32 | F64 => 8, + _ => panic!("Unsupported vector-reg argument type"), + }; + // Align. + assert!(size.is_power_of_two()); + next_stack = (next_stack + size - 1) & !(size - 1); + ret.push(ABIArg::Stack(next_stack as i64, param.value_type)); + next_stack += size; + } + } + } + + next_stack = (next_stack + 15) & !15; + + (ret, next_stack as i64) +} + +impl ABISig { + fn from_func_sig(sig: &ir::Signature) -> ABISig { + // Compute args and retvals from signature. + // TODO: pass in arg-mode or ret-mode. (Does not matter + // for the types of arguments/return values that we support.) + let (args, stack_arg_space) = compute_arg_locs(sig.call_conv, &sig.params); + let (rets, _) = compute_arg_locs(sig.call_conv, &sig.returns); + + // Verify that there are no return values on the stack. + assert!(rets.iter().all(|a| match a { + &ABIArg::Stack(..) => false, + _ => true, + })); + + ABISig { + args, + rets, + stack_arg_space, + call_conv: sig.call_conv, + } + } +} + +/// AArch64 ABI object for a function body. +pub struct AArch64ABIBody { + /// signature: arg and retval regs + sig: ABISig, + /// offsets to each stackslot + stackslots: Vec, + /// total stack size of all stackslots + stackslots_size: u32, + /// clobbered registers, from regalloc. + clobbered: Set>, + /// total number of spillslots, from regalloc. + spillslots: Option, + /// Total frame size. + frame_size: Option, + /// Calling convention this function expects. + call_conv: isa::CallConv, +} + +fn in_int_reg(ty: ir::Type) -> bool { + match ty { + types::I8 | types::I16 | types::I32 | types::I64 => true, + types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true, + _ => false, + } +} + +fn in_vec_reg(ty: ir::Type) -> bool { + match ty { + types::F32 | types::F64 => true, + _ => false, + } +} + +impl AArch64ABIBody { + /// Create a new body ABI instance. + pub fn new(f: &ir::Function) -> Self { + debug!("AArch64 ABI: func signature {:?}", f.signature); + + let sig = ABISig::from_func_sig(&f.signature); + + let call_conv = f.signature.call_conv; + // Only these calling conventions are supported. + assert!( + call_conv == isa::CallConv::SystemV + || call_conv == isa::CallConv::Fast + || call_conv == isa::CallConv::Cold + || call_conv.extends_baldrdash(), + "Unsupported calling convention: {:?}", + call_conv + ); + + // Compute stackslot locations and total stackslot size. 
+ let mut stack_offset: u32 = 0; + let mut stackslots = vec![]; + for (stackslot, data) in f.stack_slots.iter() { + let off = stack_offset; + stack_offset += data.size; + stack_offset = (stack_offset + 7) & !7; + assert_eq!(stackslot.as_u32() as usize, stackslots.len()); + stackslots.push(off); + } + + Self { + sig, + stackslots, + stackslots_size: stack_offset, + clobbered: Set::empty(), + spillslots: None, + frame_size: None, + call_conv, + } + } +} + +fn load_stack(fp_offset: i64, into_reg: Writable, ty: Type) -> Inst { + let mem = MemArg::FPOffset(fp_offset); + + match ty { + types::B1 + | types::B8 + | types::I8 + | types::B16 + | types::I16 + | types::B32 + | types::I32 + | types::B64 + | types::I64 => Inst::ULoad64 { + rd: into_reg, + mem, + srcloc: None, + }, + types::F32 => Inst::FpuLoad32 { + rd: into_reg, + mem, + srcloc: None, + }, + types::F64 => Inst::FpuLoad64 { + rd: into_reg, + mem, + srcloc: None, + }, + _ => unimplemented!("load_stack({})", ty), + } +} + +fn store_stack(fp_offset: i64, from_reg: Reg, ty: Type) -> Inst { + let mem = MemArg::FPOffset(fp_offset); + + match ty { + types::B1 + | types::B8 + | types::I8 + | types::B16 + | types::I16 + | types::B32 + | types::I32 + | types::B64 + | types::I64 => Inst::Store64 { + rd: from_reg, + mem, + srcloc: None, + }, + types::F32 => Inst::FpuStore32 { + rd: from_reg, + mem, + srcloc: None, + }, + types::F64 => Inst::FpuStore64 { + rd: from_reg, + mem, + srcloc: None, + }, + _ => unimplemented!("store_stack({})", ty), + } +} + +fn is_callee_save(call_conv: isa::CallConv, r: RealReg) -> bool { + if call_conv.extends_baldrdash() { + match r.get_class() { + RegClass::I64 => { + let enc = r.get_hw_encoding(); + if BALDRDASH_JIT_CALLEE_SAVED_GPR[enc] { + return true; + } + // Otherwise, fall through to preserve native ABI registers. + } + RegClass::V128 => { + let enc = r.get_hw_encoding(); + if BALDRDASH_JIT_CALLEE_SAVED_FPU[enc] { + return true; + } + // Otherwise, fall through to preserve native ABI registers. + } + _ => unimplemented!("baldrdash callee saved on non-i64 reg classes"), + }; + } + + match r.get_class() { + RegClass::I64 => { + // x19 - x28 inclusive are callee-saves. + r.get_hw_encoding() >= 19 && r.get_hw_encoding() <= 28 + } + RegClass::V128 => { + // v8 - v15 inclusive are callee-saves. + r.get_hw_encoding() >= 8 && r.get_hw_encoding() <= 15 + } + _ => panic!("Unexpected RegClass"), + } +} + +fn get_callee_saves( + call_conv: isa::CallConv, + regs: Vec>, +) -> (Vec>, Vec>) { + let mut int_saves = vec![]; + let mut vec_saves = vec![]; + for reg in regs.into_iter() { + if is_callee_save(call_conv, reg.to_reg()) { + match reg.to_reg().get_class() { + RegClass::I64 => int_saves.push(reg), + RegClass::V128 => vec_saves.push(reg), + _ => panic!("Unexpected RegClass"), + } + } + } + (int_saves, vec_saves) +} + +fn is_caller_save(call_conv: isa::CallConv, r: RealReg) -> bool { + if call_conv.extends_baldrdash() { + match r.get_class() { + RegClass::I64 => { + let enc = r.get_hw_encoding(); + if !BALDRDASH_JIT_CALLEE_SAVED_GPR[enc] { + return true; + } + // Otherwise, fall through to preserve native's ABI caller-saved. + } + RegClass::V128 => { + let enc = r.get_hw_encoding(); + if !BALDRDASH_JIT_CALLEE_SAVED_FPU[enc] { + return true; + } + // Otherwise, fall through to preserve native's ABI caller-saved. + } + _ => unimplemented!("baldrdash callee saved on non-i64 reg classes"), + }; + } + + match r.get_class() { + RegClass::I64 => { + // x0 - x17 inclusive are caller-saves. 
+ r.get_hw_encoding() <= 17 + } + RegClass::V128 => { + // v0 - v7 inclusive and v16 - v31 inclusive are caller-saves. + r.get_hw_encoding() <= 7 || (r.get_hw_encoding() >= 16 && r.get_hw_encoding() <= 31) + } + _ => panic!("Unexpected RegClass"), + } +} + +fn get_caller_saves_set(call_conv: isa::CallConv) -> Set> { + let mut set = Set::empty(); + for i in 0..29 { + let x = writable_xreg(i); + if is_caller_save(call_conv, x.to_reg().to_real_reg()) { + set.insert(x); + } + } + for i in 0..32 { + let v = writable_vreg(i); + if is_caller_save(call_conv, v.to_reg().to_real_reg()) { + set.insert(v); + } + } + set +} + +impl ABIBody for AArch64ABIBody { + type I = Inst; + + fn liveins(&self) -> Set { + let mut set: Set = Set::empty(); + for &arg in &self.sig.args { + if let ABIArg::Reg(r, _) = arg { + set.insert(r); + } + } + set + } + + fn liveouts(&self) -> Set { + let mut set: Set = Set::empty(); + for &ret in &self.sig.rets { + if let ABIArg::Reg(r, _) = ret { + set.insert(r); + } + } + set + } + + fn num_args(&self) -> usize { + self.sig.args.len() + } + + fn num_retvals(&self) -> usize { + self.sig.rets.len() + } + + fn num_stackslots(&self) -> usize { + self.stackslots.len() + } + + fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable) -> Inst { + match &self.sig.args[idx] { + &ABIArg::Reg(r, ty) => Inst::gen_move(into_reg, r.to_reg(), ty), + &ABIArg::Stack(off, ty) => load_stack(off + 16, into_reg, ty), + } + } + + fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Reg) -> Inst { + match &self.sig.rets[idx] { + &ABIArg::Reg(r, ty) => Inst::gen_move(Writable::from_reg(r.to_reg()), from_reg, ty), + &ABIArg::Stack(off, ty) => store_stack(off + 16, from_reg, ty), + } + } + + fn gen_ret(&self) -> Inst { + Inst::Ret {} + } + + fn gen_epilogue_placeholder(&self) -> Inst { + Inst::EpiloguePlaceholder {} + } + + fn set_num_spillslots(&mut self, slots: usize) { + self.spillslots = Some(slots); + } + + fn set_clobbered(&mut self, clobbered: Set>) { + self.clobbered = clobbered; + } + + fn load_stackslot( + &self, + slot: StackSlot, + offset: u32, + ty: Type, + into_reg: Writable, + ) -> Inst { + // Offset from beginning of stackslot area, which is at FP - stackslots_size. + let stack_off = self.stackslots[slot.as_u32() as usize] as i64; + let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); + load_stack(fp_off, into_reg, ty) + } + + fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Inst { + // Offset from beginning of stackslot area, which is at FP - stackslots_size. + let stack_off = self.stackslots[slot.as_u32() as usize] as i64; + let fp_off: i64 = -(self.stackslots_size as i64) + stack_off + (offset as i64); + store_stack(fp_off, from_reg, ty) + } + + // Load from a spillslot. + fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable) -> Inst { + // Note that when spills/fills are generated, we don't yet know how many + // spillslots there will be, so we allocate *downward* from the beginning + // of the stackslot area. Hence: FP - stackslot_size - 8*spillslot - + // sizeof(ty). + let islot = slot.get() as i64; + let ty_size = self.get_spillslot_size(into_reg.to_reg().get_class(), ty) * 8; + let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64; + load_stack(fp_off, into_reg, ty) + } + + // Store to a spillslot. 
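+    // For example (hypothetical layout): with `stackslots_size` = 32, an I64
+    // value in spillslot 2 is addressed at FP - 32 - 8*2 - 8 = FP - 56.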
+ fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Inst { + let islot = slot.get() as i64; + let ty_size = self.get_spillslot_size(from_reg.get_class(), ty) * 8; + let fp_off: i64 = -(self.stackslots_size as i64) - (8 * islot) - ty_size as i64; + store_stack(fp_off, from_reg, ty) + } + + fn gen_prologue(&mut self, flags: &settings::Flags) -> Vec { + let mut insts = vec![]; + if !self.call_conv.extends_baldrdash() { + // stp fp (x29), lr (x30), [sp, #-16]! + insts.push(Inst::StoreP64 { + rt: fp_reg(), + rt2: link_reg(), + mem: PairMemArg::PreIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(), + ), + }); + // mov fp (x29), sp. This uses the ADDI rd, rs, 0 form of `MOV` because + // the usual encoding (`ORR`) does not work with SP. + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_fp_reg(), + rn: stack_reg(), + imm12: Imm12 { + bits: 0, + shift12: false, + }, + }); + } + + let mut total_stacksize = self.stackslots_size + 8 * self.spillslots.unwrap() as u32; + if self.call_conv.extends_baldrdash() { + debug_assert!( + !flags.enable_probestack(), + "baldrdash does not expect cranelift to emit stack probes" + ); + total_stacksize += flags.baldrdash_prologue_words() as u32 * 8; + } + let total_stacksize = (total_stacksize + 15) & !15; // 16-align the stack. + + if !self.call_conv.extends_baldrdash() && total_stacksize > 0 { + // sub sp, sp, #total_stacksize + if let Some(imm12) = Imm12::maybe_from_u64(total_stacksize as u64) { + let sub_inst = Inst::AluRRImm12 { + alu_op: ALUOp::Sub64, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12, + }; + insts.push(sub_inst); + } else { + let tmp = writable_spilltmp_reg(); + let const_inst = Inst::LoadConst64 { + rd: tmp, + const_data: total_stacksize as u64, + }; + let sub_inst = Inst::AluRRRExtend { + alu_op: ALUOp::Sub64, + rd: writable_stack_reg(), + rn: stack_reg(), + rm: tmp.to_reg(), + extendop: ExtendOp::UXTX, + }; + insts.push(const_inst); + insts.push(sub_inst); + } + } + + // Save clobbered registers. + let (clobbered_int, clobbered_vec) = + get_callee_saves(self.call_conv, self.clobbered.to_vec()); + for reg_pair in clobbered_int.chunks(2) { + let (r1, r2) = if reg_pair.len() == 2 { + // .to_reg().to_reg(): Writable --> RealReg --> Reg + (reg_pair[0].to_reg().to_reg(), reg_pair[1].to_reg().to_reg()) + } else { + (reg_pair[0].to_reg().to_reg(), zero_reg()) + }; + + debug_assert!(r1.get_class() == RegClass::I64); + debug_assert!(r2.get_class() == RegClass::I64); + + // stp r1, r2, [sp, #-16]! + insts.push(Inst::StoreP64 { + rt: r1, + rt2: r2, + mem: PairMemArg::PreIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(-16, types::I64).unwrap(), + ), + }); + } + let vec_save_bytes = clobbered_vec.len() * 16; + if vec_save_bytes != 0 { + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Sub64, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(), + }); + } + for (i, reg) in clobbered_vec.iter().enumerate() { + insts.push(Inst::FpuStore128 { + rd: reg.to_reg().to_reg(), + mem: MemArg::Unscaled(stack_reg(), SImm9::maybe_from_i64((i * 16) as i64).unwrap()), + srcloc: None, + }); + } + + self.frame_size = Some(total_stacksize); + insts + } + + fn gen_epilogue(&self, _flags: &settings::Flags) -> Vec { + let mut insts = vec![]; + + // Restore clobbered registers. 
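+        // For a SystemV (non-Baldrdash) frame that clobbered only x19/x20 and
+        // no vector registers, the code below emits roughly:
+        //   ldp x19, x20, [sp], #16
+        //   add sp, x29, #0            (mov sp, fp)
+        //   ldp x29, x30, [sp], #16
+        //   ret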
+ let (clobbered_int, clobbered_vec) = + get_callee_saves(self.call_conv, self.clobbered.to_vec()); + + for (i, reg) in clobbered_vec.iter().enumerate() { + insts.push(Inst::FpuLoad128 { + rd: Writable::from_reg(reg.to_reg().to_reg()), + mem: MemArg::Unscaled(stack_reg(), SImm9::maybe_from_i64((i * 16) as i64).unwrap()), + srcloc: None, + }); + } + let vec_save_bytes = clobbered_vec.len() * 16; + if vec_save_bytes != 0 { + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12: Imm12::maybe_from_u64(vec_save_bytes as u64).unwrap(), + }); + } + + for reg_pair in clobbered_int.chunks(2).rev() { + let (r1, r2) = if reg_pair.len() == 2 { + ( + reg_pair[0].map(|r| r.to_reg()), + reg_pair[1].map(|r| r.to_reg()), + ) + } else { + (reg_pair[0].map(|r| r.to_reg()), writable_zero_reg()) + }; + + debug_assert!(r1.to_reg().get_class() == RegClass::I64); + debug_assert!(r2.to_reg().get_class() == RegClass::I64); + + // ldp r1, r2, [sp], #16 + insts.push(Inst::LoadP64 { + rt: r1, + rt2: r2, + mem: PairMemArg::PostIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(), + ), + }); + } + + if !self.call_conv.extends_baldrdash() { + // The MOV (alias of ORR) interprets x31 as XZR, so use an ADD here. + // MOV to SP is an alias of ADD. + insts.push(Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_stack_reg(), + rn: fp_reg(), + imm12: Imm12 { + bits: 0, + shift12: false, + }, + }); + insts.push(Inst::LoadP64 { + rt: writable_fp_reg(), + rt2: writable_link_reg(), + mem: PairMemArg::PostIndexed( + writable_stack_reg(), + SImm7Scaled::maybe_from_i64(16, types::I64).unwrap(), + ), + }); + insts.push(Inst::Ret {}); + } + + debug!("Epilogue: {:?}", insts); + insts + } + + fn frame_size(&self) -> u32 { + self.frame_size + .expect("frame size not computed before prologue generation") + } + + fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32 { + // We allocate in terms of 8-byte slots. + match (rc, ty) { + (RegClass::I64, _) => 1, + (RegClass::V128, F32) | (RegClass::V128, F64) => 1, + (RegClass::V128, _) => 2, + _ => panic!("Unexpected register class!"), + } + } + + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Type) -> Inst { + self.store_spillslot(to_slot, ty, from_reg.to_reg()) + } + + fn gen_reload(&self, to_reg: Writable, from_slot: SpillSlot, ty: Type) -> Inst { + self.load_spillslot(from_slot, ty, to_reg.map(|r| r.to_reg())) + } +} + +enum CallDest { + ExtName(ir::ExternalName), + Reg(Reg), +} + +/// AArch64 ABI object for a function call. +pub struct AArch64ABICall { + sig: ABISig, + uses: Set, + defs: Set>, + dest: CallDest, + loc: ir::SourceLoc, + opcode: ir::Opcode, +} + +fn abisig_to_uses_and_defs(sig: &ABISig) -> (Set, Set>) { + // Compute uses: all arg regs. + let mut uses = Set::empty(); + for arg in &sig.args { + match arg { + &ABIArg::Reg(reg, _) => uses.insert(reg.to_reg()), + _ => {} + } + } + + // Compute defs: all retval regs, and all caller-save (clobbered) regs. + let mut defs = get_caller_saves_set(sig.call_conv); + for ret in &sig.rets { + match ret { + &ABIArg::Reg(reg, _) => defs.insert(Writable::from_reg(reg.to_reg())), + _ => {} + } + } + + (uses, defs) +} + +impl AArch64ABICall { + /// Create a callsite ABI object for a call directly to the specified function. 
+ pub fn from_func( + sig: &ir::Signature, + extname: &ir::ExternalName, + loc: ir::SourceLoc, + ) -> AArch64ABICall { + let sig = ABISig::from_func_sig(sig); + let (uses, defs) = abisig_to_uses_and_defs(&sig); + AArch64ABICall { + sig, + uses, + defs, + dest: CallDest::ExtName(extname.clone()), + loc, + opcode: ir::Opcode::Call, + } + } + + /// Create a callsite ABI object for a call to a function pointer with the + /// given signature. + pub fn from_ptr( + sig: &ir::Signature, + ptr: Reg, + loc: ir::SourceLoc, + opcode: ir::Opcode, + ) -> AArch64ABICall { + let sig = ABISig::from_func_sig(sig); + let (uses, defs) = abisig_to_uses_and_defs(&sig); + AArch64ABICall { + sig, + uses, + defs, + dest: CallDest::Reg(ptr), + loc, + opcode, + } + } +} + +fn adjust_stack(amt: u64, is_sub: bool) -> Vec { + if amt > 0 { + let alu_op = if is_sub { ALUOp::Sub64 } else { ALUOp::Add64 }; + if let Some(imm12) = Imm12::maybe_from_u64(amt) { + vec![Inst::AluRRImm12 { + alu_op, + rd: writable_stack_reg(), + rn: stack_reg(), + imm12, + }] + } else { + let const_load = Inst::LoadConst64 { + rd: writable_spilltmp_reg(), + const_data: amt, + }; + let adj = Inst::AluRRRExtend { + alu_op, + rd: writable_stack_reg(), + rn: stack_reg(), + rm: spilltmp_reg(), + extendop: ExtendOp::UXTX, + }; + vec![const_load, adj] + } + } else { + vec![] + } +} + +impl ABICall for AArch64ABICall { + type I = Inst; + + fn num_args(&self) -> usize { + self.sig.args.len() + } + + fn gen_stack_pre_adjust(&self) -> Vec { + adjust_stack(self.sig.stack_arg_space as u64, /* is_sub = */ true) + } + + fn gen_stack_post_adjust(&self) -> Vec { + adjust_stack(self.sig.stack_arg_space as u64, /* is_sub = */ false) + } + + fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Inst { + match &self.sig.args[idx] { + &ABIArg::Reg(reg, ty) => Inst::gen_move(Writable::from_reg(reg.to_reg()), from_reg, ty), + &ABIArg::Stack(off, _) => Inst::Store64 { + rd: from_reg, + mem: MemArg::SPOffset(off), + srcloc: None, + }, + } + } + + fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable) -> Inst { + match &self.sig.rets[idx] { + &ABIArg::Reg(reg, ty) => Inst::gen_move(into_reg, reg.to_reg(), ty), + _ => unimplemented!(), + } + } + + fn gen_call(&self) -> Vec { + let (uses, defs) = (self.uses.clone(), self.defs.clone()); + match &self.dest { + &CallDest::ExtName(ref name) => vec![Inst::Call { + dest: name.clone(), + uses, + defs, + loc: self.loc, + opcode: self.opcode, + }], + &CallDest::Reg(reg) => vec![Inst::CallInd { + rn: reg, + uses, + defs, + loc: self.loc, + opcode: self.opcode, + }], + } + } +} diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs new file mode 100644 index 000000000000..b83f375bcf6d --- /dev/null +++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs @@ -0,0 +1,528 @@ +//! AArch64 ISA definitions: instruction arguments. + +// Some variants are never constructed, but we still want them as options in the future. +#![allow(dead_code)] + +use crate::binemit::CodeOffset; +use crate::ir::Type; +use crate::isa::aarch64::inst::*; + +use regalloc::{RealRegUniverse, Reg, Writable}; + +use core::convert::{Into, TryFrom}; +use std::string::String; + +/// A shift operator for a register or immediate. +#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum ShiftOp { + LSL = 0b00, + LSR = 0b01, + ASR = 0b10, + ROR = 0b11, +} + +impl ShiftOp { + /// Get the encoding of this shift op. + pub fn bits(self) -> u8 { + self as u8 + } +} + +/// A shift operator amount. 
+#[derive(Clone, Copy, Debug)] +pub struct ShiftOpShiftImm(u8); + +impl ShiftOpShiftImm { + /// Maximum shift for shifted-register operands. + pub const MAX_SHIFT: u64 = 63; + + /// Create a new shiftop shift amount, if possible. + pub fn maybe_from_shift(shift: u64) -> Option { + if shift <= Self::MAX_SHIFT { + Some(ShiftOpShiftImm(shift as u8)) + } else { + None + } + } + + /// Return the shift amount. + pub fn value(self) -> u8 { + self.0 + } +} + +/// A shift operator with an amount, guaranteed to be within range. +#[derive(Clone, Debug)] +pub struct ShiftOpAndAmt { + op: ShiftOp, + shift: ShiftOpShiftImm, +} + +impl ShiftOpAndAmt { + pub fn new(op: ShiftOp, shift: ShiftOpShiftImm) -> ShiftOpAndAmt { + ShiftOpAndAmt { op, shift } + } + + /// Get the shift op. + pub fn op(&self) -> ShiftOp { + self.op + } + + /// Get the shift amount. + pub fn amt(&self) -> ShiftOpShiftImm { + self.shift + } +} + +/// An extend operator for a register. +#[derive(Clone, Copy, Debug)] +#[repr(u8)] +pub enum ExtendOp { + UXTB = 0b000, + UXTH = 0b001, + UXTW = 0b010, + UXTX = 0b011, + SXTB = 0b100, + SXTH = 0b101, + SXTW = 0b110, + SXTX = 0b111, +} + +impl ExtendOp { + /// Encoding of this op. + pub fn bits(self) -> u8 { + self as u8 + } +} + +//============================================================================= +// Instruction sub-components (memory addresses): definitions + +/// A reference to some memory address. +#[derive(Clone, Debug)] +pub enum MemLabel { + /// An address in the code, a constant pool or jumptable, with relative + /// offset from this instruction. This form must be used at emission time; + /// see `memlabel_finalize()` for how other forms are lowered to this one. + PCRel(i32), +} + +/// A memory argument to load/store, encapsulating the possible addressing modes. +#[derive(Clone, Debug)] +pub enum MemArg { + Label(MemLabel), + /// "post-indexed" mode as per AArch64 docs: postincrement reg after address computation. + PostIndexed(Writable, SImm9), + /// "pre-indexed" mode as per AArch64 docs: preincrement reg before address computation. + PreIndexed(Writable, SImm9), + + // N.B.: RegReg, RegScaled, and RegScaledExtended all correspond to + // what the ISA calls the "register offset" addressing mode. We split out + // several options here for more ergonomic codegen. + /// Register plus register offset. + RegReg(Reg, Reg), + + /// Register plus register offset, scaled by type's size. + RegScaled(Reg, Reg, Type), + + /// Register plus register offset, scaled by type's size, with index sign- or zero-extended + /// first. + RegScaledExtended(Reg, Reg, Type, ExtendOp), + + /// Unscaled signed 9-bit immediate offset from reg. + Unscaled(Reg, SImm9), + + /// Scaled (by size of a type) unsigned 12-bit immediate offset from reg. + UnsignedOffset(Reg, UImm12Scaled), + + /// Offset from the stack pointer. Lowered into a real amode at emission. + SPOffset(i64), + + /// Offset from the frame pointer. Lowered into a real amode at emission. + FPOffset(i64), +} + +impl MemArg { + /// Memory reference using an address in a register. + pub fn reg(reg: Reg) -> MemArg { + // Use UnsignedOffset rather than Unscaled to use ldr rather than ldur. + // This also does not use PostIndexed / PreIndexed as they update the register. + MemArg::UnsignedOffset(reg, UImm12Scaled::zero(I64)) + } + + /// Memory reference using an address in a register and an offset, if possible. 
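+    /// For example, offset 32 fits the signed 9-bit form and yields
+    /// `Unscaled`; an I64 access at offset 4096 yields `UnsignedOffset`
+    /// (scaled index 512); an offset of 1 << 20 fits neither form and
+    /// returns `None`.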
+ pub fn reg_maybe_offset(reg: Reg, offset: i64, value_type: Type) -> Option { + if let Some(simm9) = SImm9::maybe_from_i64(offset) { + Some(MemArg::Unscaled(reg, simm9)) + } else if let Some(uimm12s) = UImm12Scaled::maybe_from_i64(offset, value_type) { + Some(MemArg::UnsignedOffset(reg, uimm12s)) + } else { + None + } + } + + /// Memory reference using the sum of two registers as an address. + pub fn reg_plus_reg(reg1: Reg, reg2: Reg) -> MemArg { + MemArg::RegReg(reg1, reg2) + } + + /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address. + pub fn reg_plus_reg_scaled(reg1: Reg, reg2: Reg, ty: Type) -> MemArg { + MemArg::RegScaled(reg1, reg2, ty) + } + + /// Memory reference using `reg1 + sizeof(ty) * reg2` as an address, with `reg2` sign- or + /// zero-extended as per `op`. + pub fn reg_plus_reg_scaled_extended(reg1: Reg, reg2: Reg, ty: Type, op: ExtendOp) -> MemArg { + MemArg::RegScaledExtended(reg1, reg2, ty, op) + } + + /// Memory reference to a label: a global function or value, or data in the constant pool. + pub fn label(label: MemLabel) -> MemArg { + MemArg::Label(label) + } +} + +/// A memory argument to a load/store-pair. +#[derive(Clone, Debug)] +pub enum PairMemArg { + SignedOffset(Reg, SImm7Scaled), + PreIndexed(Writable, SImm7Scaled), + PostIndexed(Writable, SImm7Scaled), +} + +//============================================================================= +// Instruction sub-components (conditions, branches and branch targets): +// definitions + +/// Condition for conditional branches. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +#[repr(u8)] +pub enum Cond { + Eq = 0, + Ne = 1, + Hs = 2, + Lo = 3, + Mi = 4, + Pl = 5, + Vs = 6, + Vc = 7, + Hi = 8, + Ls = 9, + Ge = 10, + Lt = 11, + Gt = 12, + Le = 13, + Al = 14, + Nv = 15, +} + +impl Cond { + /// Return the inverted condition. + pub fn invert(self) -> Cond { + match self { + Cond::Eq => Cond::Ne, + Cond::Ne => Cond::Eq, + + Cond::Hs => Cond::Lo, + Cond::Lo => Cond::Hs, + + Cond::Mi => Cond::Pl, + Cond::Pl => Cond::Mi, + + Cond::Vs => Cond::Vc, + Cond::Vc => Cond::Vs, + + Cond::Hi => Cond::Ls, + Cond::Ls => Cond::Hi, + + Cond::Ge => Cond::Lt, + Cond::Lt => Cond::Ge, + + Cond::Gt => Cond::Le, + Cond::Le => Cond::Gt, + + Cond::Al => Cond::Nv, + Cond::Nv => Cond::Al, + } + } + + /// Return the machine encoding of this condition. + pub fn bits(self) -> u32 { + self as u32 + } +} + +/// The kind of conditional branch: the common-case-optimized "reg-is-zero" / +/// "reg-is-nonzero" variants, or the generic one that tests the machine +/// condition codes. +#[derive(Clone, Copy, Debug)] +pub enum CondBrKind { + /// Condition: given register is zero. + Zero(Reg), + /// Condition: given register is nonzero. + NotZero(Reg), + /// Condition: the given condition-code test is true. + Cond(Cond), +} + +impl CondBrKind { + /// Return the inverted branch condition. + pub fn invert(self) -> CondBrKind { + match self { + CondBrKind::Zero(reg) => CondBrKind::NotZero(reg), + CondBrKind::NotZero(reg) => CondBrKind::Zero(reg), + CondBrKind::Cond(c) => CondBrKind::Cond(c.invert()), + } + } +} + +/// A branch target. Either unresolved (basic-block index) or resolved (offset +/// from end of current instruction). +#[derive(Clone, Copy, Debug)] +pub enum BranchTarget { + /// An unresolved reference to a BlockIndex, as passed into + /// `lower_branch_group()`. + Block(BlockIndex), + /// A resolved reference to another instruction, after + /// `Inst::with_block_offsets()`. 
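+    /// The offset is in bytes relative to the branch instruction itself;
+    /// `as_offset_words()` below converts it to the 4-byte units used by the
+    /// branch encodings.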
+ ResolvedOffset(isize), +} + +impl BranchTarget { + /// Lower the branch target given offsets of each block. + pub fn lower(&mut self, targets: &[CodeOffset], my_offset: CodeOffset) { + match self { + &mut BranchTarget::Block(bix) => { + let bix = usize::try_from(bix).unwrap(); + assert!(bix < targets.len()); + let block_offset_in_func = targets[bix]; + let branch_offset = (block_offset_in_func as isize) - (my_offset as isize); + *self = BranchTarget::ResolvedOffset(branch_offset); + } + &mut BranchTarget::ResolvedOffset(..) => {} + } + } + + /// Get the block index. + pub fn as_block_index(&self) -> Option { + match self { + &BranchTarget::Block(bix) => Some(bix), + _ => None, + } + } + + /// Get the offset as 4-byte words. Returns `0` if not + /// yet resolved (in that case, we're only computing + /// size and the offset doesn't matter). + pub fn as_offset_words(&self) -> isize { + match self { + &BranchTarget::ResolvedOffset(off) => off >> 2, + _ => 0, + } + } + + /// Get the offset as a 26-bit offset suitable for a 26-bit jump, or `None` if overflow. + pub fn as_off26(&self) -> Option { + let off = self.as_offset_words(); + if (off < (1 << 25)) && (off >= -(1 << 25)) { + Some((off as u32) & ((1 << 26) - 1)) + } else { + None + } + } + + /// Get the offset as a 19-bit offset, or `None` if overflow. + pub fn as_off19(&self) -> Option { + let off = self.as_offset_words(); + if (off < (1 << 18)) && (off >= -(1 << 18)) { + Some((off as u32) & ((1 << 19) - 1)) + } else { + None + } + } + + /// Map the block index given a transform map. + pub fn map(&mut self, block_index_map: &[BlockIndex]) { + match self { + &mut BranchTarget::Block(ref mut bix) => { + let n = block_index_map[usize::try_from(*bix).unwrap()]; + *bix = n; + } + &mut BranchTarget::ResolvedOffset(_) => {} + } + } +} + +impl ShowWithRRU for ShiftOpAndAmt { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("{:?} {}", self.op(), self.amt().value()) + } +} + +impl ShowWithRRU for ExtendOp { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("{:?}", self) + } +} + +impl ShowWithRRU for MemLabel { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &MemLabel::PCRel(off) => format!("pc+{}", off), + } + } +} + +fn shift_for_type(ty: Type) -> usize { + match ty.bytes() { + 1 => 0, + 2 => 1, + 4 => 2, + 8 => 3, + 16 => 4, + _ => panic!("unknown type: {}", ty), + } +} + +impl ShowWithRRU for MemArg { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &MemArg::Unscaled(reg, simm9) => { + if simm9.value != 0 { + format!("[{}, {}]", reg.show_rru(mb_rru), simm9.show_rru(mb_rru)) + } else { + format!("[{}]", reg.show_rru(mb_rru)) + } + } + &MemArg::UnsignedOffset(reg, uimm12) => { + if uimm12.value != 0 { + format!("[{}, {}]", reg.show_rru(mb_rru), uimm12.show_rru(mb_rru)) + } else { + format!("[{}]", reg.show_rru(mb_rru)) + } + } + &MemArg::RegReg(r1, r2) => { + format!("[{}, {}]", r1.show_rru(mb_rru), r2.show_rru(mb_rru),) + } + &MemArg::RegScaled(r1, r2, ty) => { + let shift = shift_for_type(ty); + format!( + "[{}, {}, LSL #{}]", + r1.show_rru(mb_rru), + r2.show_rru(mb_rru), + shift, + ) + } + &MemArg::RegScaledExtended(r1, r2, ty, op) => { + let shift = shift_for_type(ty); + let size = match op { + ExtendOp::SXTW | ExtendOp::UXTW => InstSize::Size32, + _ => InstSize::Size64, + }; + let op = op.show_rru(mb_rru); + format!( + "[{}, {}, {} #{}]", + r1.show_rru(mb_rru), + show_ireg_sized(r2, mb_rru, size), + op, + shift + ) 
+ } + &MemArg::Label(ref label) => label.show_rru(mb_rru), + &MemArg::PreIndexed(r, simm9) => format!( + "[{}, {}]!", + r.to_reg().show_rru(mb_rru), + simm9.show_rru(mb_rru) + ), + &MemArg::PostIndexed(r, simm9) => format!( + "[{}], {}", + r.to_reg().show_rru(mb_rru), + simm9.show_rru(mb_rru) + ), + // Eliminated by `mem_finalize()`. + &MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { + panic!("Unexpected stack-offset mem-arg mode!") + } + } + } +} + +impl ShowWithRRU for PairMemArg { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &PairMemArg::SignedOffset(reg, simm7) => { + if simm7.value != 0 { + format!("[{}, {}]", reg.show_rru(mb_rru), simm7.show_rru(mb_rru)) + } else { + format!("[{}]", reg.show_rru(mb_rru)) + } + } + &PairMemArg::PreIndexed(reg, simm7) => format!( + "[{}, {}]!", + reg.to_reg().show_rru(mb_rru), + simm7.show_rru(mb_rru) + ), + &PairMemArg::PostIndexed(reg, simm7) => format!( + "[{}], {}", + reg.to_reg().show_rru(mb_rru), + simm7.show_rru(mb_rru) + ), + } + } +} + +impl ShowWithRRU for Cond { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + let mut s = format!("{:?}", self); + s.make_ascii_lowercase(); + s + } +} + +impl ShowWithRRU for BranchTarget { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + match self { + &BranchTarget::Block(block) => format!("block{}", block), + &BranchTarget::ResolvedOffset(off) => format!("{}", off), + } + } +} + +/// Type used to communicate the operand size of a machine instruction, as AArch64 has 32- and +/// 64-bit variants of many instructions (and integer registers). +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum InstSize { + Size32, + Size64, +} + +impl InstSize { + /// 32-bit case? + pub fn is32(self) -> bool { + self == InstSize::Size32 + } + /// 64-bit case? + pub fn is64(self) -> bool { + self == InstSize::Size64 + } + /// Convert from an `is32` boolean flag to an `InstSize`. + pub fn from_is32(is32: bool) -> InstSize { + if is32 { + InstSize::Size32 + } else { + InstSize::Size64 + } + } + /// Convert from a needed width to the smallest size that fits. + pub fn from_bits>(bits: I) -> InstSize { + let bits: usize = bits.into(); + assert!(bits <= 64); + if bits <= 32 { + InstSize::Size32 + } else { + InstSize::Size64 + } + } +} diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs new file mode 100644 index 000000000000..f01746543ce7 --- /dev/null +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -0,0 +1,4099 @@ +//! AArch64 ISA: binary code emission. + +use crate::binemit::{CodeOffset, Reloc}; +use crate::ir::constant::ConstantData; +use crate::ir::types::*; +use crate::ir::TrapCode; +use crate::isa::aarch64::inst::*; + +use core::convert::TryFrom; + +use regalloc::{Reg, RegClass, Writable}; + +use alloc::vec::Vec; + +/// Memory label/reference finalization: convert a MemLabel to a PC-relative +/// offset, possibly emitting relocation(s) as necessary. +pub fn memlabel_finalize(_insn_off: CodeOffset, label: &MemLabel) -> i32 { + match label { + &MemLabel::PCRel(rel) => rel, + } +} + +/// Memory addressing mode finalization: convert "special" modes (e.g., +/// generic arbitrary stack offset) into real addressing modes, possibly by +/// emitting some helper instructions that come immediately before the use +/// of this amode. 
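+/// For example, `FPOffset(32)` becomes `Unscaled(fp, #32)` with no helper
+/// instructions, while `FPOffset(1 << 20)` is out of SImm9 range and is
+/// lowered to a constant load into the spill temporary plus an add, with the
+/// final amode addressing through that temporary.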
+pub fn mem_finalize(insn_off: CodeOffset, mem: &MemArg) -> (Vec, MemArg) { + match mem { + &MemArg::SPOffset(off) | &MemArg::FPOffset(off) => { + let basereg = match mem { + &MemArg::SPOffset(..) => stack_reg(), + &MemArg::FPOffset(..) => fp_reg(), + _ => unreachable!(), + }; + if let Some(simm9) = SImm9::maybe_from_i64(off) { + let mem = MemArg::Unscaled(basereg, simm9); + (vec![], mem) + } else { + let tmp = writable_spilltmp_reg(); + let mut const_insts = Inst::load_constant(tmp, off as u64); + let add_inst = Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: basereg, + }; + const_insts.push(add_inst); + (const_insts.to_vec(), MemArg::reg(tmp.to_reg())) + } + } + &MemArg::Label(ref label) => { + let off = memlabel_finalize(insn_off, label); + (vec![], MemArg::Label(MemLabel::PCRel(off))) + } + _ => (vec![], mem.clone()), + } +} + +/// Helper: get a ConstantData from a u64. +pub fn u64_constant(bits: u64) -> ConstantData { + let data = bits.to_le_bytes(); + ConstantData::from(&data[..]) +} + +//============================================================================= +// Instructions and subcomponents: emission + +fn machreg_to_gpr(m: Reg) -> u32 { + assert!(m.get_class() == RegClass::I64); + u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap() +} + +fn machreg_to_vec(m: Reg) -> u32 { + assert!(m.get_class() == RegClass::V128); + u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap() +} + +fn machreg_to_gpr_or_vec(m: Reg) -> u32 { + u32::try_from(m.to_real_reg().get_hw_encoding()).unwrap() +} + +fn enc_arith_rrr(bits_31_21: u32, bits_15_10: u32, rd: Writable, rn: Reg, rm: Reg) -> u32 { + (bits_31_21 << 21) + | (bits_15_10 << 10) + | machreg_to_gpr(rd.to_reg()) + | (machreg_to_gpr(rn) << 5) + | (machreg_to_gpr(rm) << 16) +} + +fn enc_arith_rr_imm12( + bits_31_24: u32, + immshift: u32, + imm12: u32, + rn: Reg, + rd: Writable, +) -> u32 { + (bits_31_24 << 24) + | (immshift << 22) + | (imm12 << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_arith_rr_imml(bits_31_23: u32, imm_bits: u32, rn: Reg, rd: Writable) -> u32 { + (bits_31_23 << 23) | (imm_bits << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg()) +} + +fn enc_arith_rrrr(top11: u32, rm: Reg, bit15: u32, ra: Reg, rn: Reg, rd: Writable) -> u32 { + (top11 << 21) + | (machreg_to_gpr(rm) << 16) + | (bit15 << 15) + | (machreg_to_gpr(ra) << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_jump26(op_31_26: u32, off_26_0: u32) -> u32 { + assert!(off_26_0 < (1 << 26)); + (op_31_26 << 26) | off_26_0 +} + +fn enc_cmpbr(op_31_24: u32, off_18_0: u32, reg: Reg) -> u32 { + assert!(off_18_0 < (1 << 19)); + (op_31_24 << 24) | (off_18_0 << 5) | machreg_to_gpr(reg) +} + +fn enc_cbr(op_31_24: u32, off_18_0: u32, op_4: u32, cond: u32) -> u32 { + assert!(off_18_0 < (1 << 19)); + assert!(cond < (1 << 4)); + (op_31_24 << 24) | (off_18_0 << 5) | (op_4 << 4) | cond +} + +const MOVE_WIDE_FIXED: u32 = 0x92800000; + +#[repr(u32)] +enum MoveWideOpcode { + MOVN = 0b00, + MOVZ = 0b10, + MOVK = 0b11, +} + +fn enc_move_wide(op: MoveWideOpcode, rd: Writable, imm: MoveWideConst) -> u32 { + assert!(imm.shift <= 0b11); + MOVE_WIDE_FIXED + | (op as u32) << 29 + | u32::from(imm.shift) << 21 + | u32::from(imm.bits) << 5 + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_ldst_pair(op_31_22: u32, simm7: SImm7Scaled, rn: Reg, rt: Reg, rt2: Reg) -> u32 { + (op_31_22 << 22) + | (simm7.bits() << 15) + | (machreg_to_gpr(rt2) << 10) + | (machreg_to_gpr(rn) << 5) + | 
machreg_to_gpr(rt) +} + +fn enc_ldst_simm9(op_31_22: u32, simm9: SImm9, op_11_10: u32, rn: Reg, rd: Reg) -> u32 { + (op_31_22 << 22) + | (simm9.bits() << 12) + | (op_11_10 << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr_or_vec(rd) +} + +fn enc_ldst_uimm12(op_31_22: u32, uimm12: UImm12Scaled, rn: Reg, rd: Reg) -> u32 { + (op_31_22 << 22) + | (0b1 << 24) + | (uimm12.bits() << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr_or_vec(rd) +} + +fn enc_ldst_reg( + op_31_22: u32, + rn: Reg, + rm: Reg, + s_bit: bool, + extendop: Option, + rd: Reg, +) -> u32 { + let s_bit = if s_bit { 1 } else { 0 }; + let extend_bits = match extendop { + Some(ExtendOp::UXTW) => 0b010, + Some(ExtendOp::SXTW) => 0b110, + Some(ExtendOp::SXTX) => 0b111, + None => 0b011, // LSL + _ => panic!("bad extend mode for ld/st MemArg"), + }; + (op_31_22 << 22) + | (1 << 21) + | (machreg_to_gpr(rm) << 16) + | (extend_bits << 13) + | (s_bit << 12) + | (0b10 << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr_or_vec(rd) +} + +fn enc_ldst_imm19(op_31_24: u32, imm19: u32, rd: Reg) -> u32 { + (op_31_24 << 24) | (imm19 << 5) | machreg_to_gpr_or_vec(rd) +} + +fn enc_extend(top22: u32, rd: Writable, rn: Reg) -> u32 { + (top22 << 10) | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg()) +} + +fn enc_vec_rrr(top11: u32, rm: Reg, bit15_10: u32, rn: Reg, rd: Writable) -> u32 { + (top11 << 21) + | (machreg_to_vec(rm) << 16) + | (bit15_10 << 10) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()) +} + +fn enc_bit_rr(size: u32, opcode2: u32, opcode1: u32, rn: Reg, rd: Writable) -> u32 { + (0b01011010110 << 21) + | size << 31 + | opcode2 << 16 + | opcode1 << 10 + | machreg_to_gpr(rn) << 5 + | machreg_to_gpr(rd.to_reg()) +} + +fn enc_br(rn: Reg) -> u32 { + 0b1101011_0000_11111_000000_00000_00000 | (machreg_to_gpr(rn) << 5) +} + +fn enc_adr(off: i32, rd: Writable) -> u32 { + let off = u32::try_from(off).unwrap(); + let immlo = off & 3; + let immhi = (off >> 2) & ((1 << 19) - 1); + (0b00010000 << 24) | (immlo << 29) | (immhi << 5) | machreg_to_gpr(rd.to_reg()) +} + +fn enc_csel(rd: Writable, rn: Reg, rm: Reg, cond: Cond) -> u32 { + 0b100_11010100_00000_0000_00_00000_00000 + | (machreg_to_gpr(rm) << 16) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()) + | (cond.bits() << 12) +} + +fn enc_fcsel(rd: Writable, rn: Reg, rm: Reg, cond: Cond, size: InstSize) -> u32 { + let ty_bit = if size.is32() { 0 } else { 1 }; + 0b000_11110_00_1_00000_0000_11_00000_00000 + | (machreg_to_vec(rm) << 16) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()) + | (cond.bits() << 12) + | (ty_bit << 22) +} + +fn enc_cset(rd: Writable, cond: Cond) -> u32 { + 0b100_11010100_11111_0000_01_11111_00000 + | machreg_to_gpr(rd.to_reg()) + | (cond.invert().bits() << 12) +} + +fn enc_vecmov(is_16b: bool, rd: Writable, rn: Reg) -> u32 { + debug_assert!(!is_16b); // to be supported later. 
+ 0b00001110_101_00000_00011_1_00000_00000 + | machreg_to_vec(rd.to_reg()) + | (machreg_to_vec(rn) << 16) + | (machreg_to_vec(rn) << 5) +} + +fn enc_fpurr(top22: u32, rd: Writable, rn: Reg) -> u32 { + (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg()) +} + +fn enc_fpurrr(top22: u32, rd: Writable, rn: Reg, rm: Reg) -> u32 { + (top22 << 10) + | (machreg_to_vec(rm) << 16) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()) +} + +fn enc_fpurrrr(top17: u32, rd: Writable, rn: Reg, rm: Reg, ra: Reg) -> u32 { + (top17 << 15) + | (machreg_to_vec(rm) << 16) + | (machreg_to_vec(ra) << 10) + | (machreg_to_vec(rn) << 5) + | machreg_to_vec(rd.to_reg()) +} + +fn enc_fcmp(size: InstSize, rn: Reg, rm: Reg) -> u32 { + let bits = if size.is32() { + 0b000_11110_00_1_00000_00_1000_00000_00000 + } else { + 0b000_11110_01_1_00000_00_1000_00000_00000 + }; + bits | (machreg_to_vec(rm) << 16) | (machreg_to_vec(rn) << 5) +} + +fn enc_fputoint(top16: u32, rd: Writable, rn: Reg) -> u32 { + (top16 << 16) | (machreg_to_vec(rn) << 5) | machreg_to_gpr(rd.to_reg()) +} + +fn enc_inttofpu(top16: u32, rd: Writable, rn: Reg) -> u32 { + (top16 << 16) | (machreg_to_gpr(rn) << 5) | machreg_to_vec(rd.to_reg()) +} + +fn enc_fround(top22: u32, rd: Writable, rn: Reg) -> u32 { + (top22 << 10) | (machreg_to_vec(rn) << 5) | machreg_to_vec(rd.to_reg()) +} + +impl MachInstEmit for Inst { + fn emit(&self, sink: &mut O) { + match self { + &Inst::AluRRR { alu_op, rd, rn, rm } => { + let top11 = match alu_op { + ALUOp::Add32 => 0b00001011_000, + ALUOp::Add64 => 0b10001011_000, + ALUOp::Sub32 => 0b01001011_000, + ALUOp::Sub64 => 0b11001011_000, + ALUOp::Orr32 => 0b00101010_000, + ALUOp::Orr64 => 0b10101010_000, + ALUOp::And32 => 0b00001010_000, + ALUOp::And64 => 0b10001010_000, + ALUOp::Eor32 => 0b01001010_000, + ALUOp::Eor64 => 0b11001010_000, + ALUOp::OrrNot32 => 0b00101010_001, + ALUOp::OrrNot64 => 0b10101010_001, + ALUOp::AndNot32 => 0b00001010_001, + ALUOp::AndNot64 => 0b10001010_001, + ALUOp::EorNot32 => 0b01001010_001, + ALUOp::EorNot64 => 0b11001010_001, + ALUOp::AddS32 => 0b00101011_000, + ALUOp::AddS64 => 0b10101011_000, + ALUOp::SubS32 => 0b01101011_000, + ALUOp::SubS64 => 0b11101011_000, + ALUOp::SDiv64 => 0b10011010_110, + ALUOp::UDiv64 => 0b10011010_110, + ALUOp::RotR32 | ALUOp::Lsr32 | ALUOp::Asr32 | ALUOp::Lsl32 => 0b00011010_110, + ALUOp::RotR64 | ALUOp::Lsr64 | ALUOp::Asr64 | ALUOp::Lsl64 => 0b10011010_110, + + ALUOp::MAdd32 + | ALUOp::MAdd64 + | ALUOp::MSub32 + | ALUOp::MSub64 + | ALUOp::SMulH + | ALUOp::UMulH => { + //// RRRR ops. 
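+                    // These are the three-source (multiply-add / high-multiply)
+                    // forms; they are emitted via `Inst::AluRRRR` and
+                    // `enc_arith_rrrr` below, not in the RRR form.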
+ panic!("Bad ALUOp {:?} in RRR form!", alu_op); + } + }; + let bit15_10 = match alu_op { + ALUOp::SDiv64 => 0b000011, + ALUOp::UDiv64 => 0b000010, + ALUOp::RotR32 | ALUOp::RotR64 => 0b001011, + ALUOp::Lsr32 | ALUOp::Lsr64 => 0b001001, + ALUOp::Asr32 | ALUOp::Asr64 => 0b001010, + ALUOp::Lsl32 | ALUOp::Lsl64 => 0b001000, + _ => 0b000000, + }; + assert_ne!(writable_stack_reg(), rd); + sink.put4(enc_arith_rrr(top11, bit15_10, rd, rn, rm)); + } + &Inst::AluRRRR { + alu_op, + rd, + rm, + rn, + ra, + } => { + let (top11, bit15) = match alu_op { + ALUOp::MAdd32 => (0b0_00_11011_000, 0), + ALUOp::MSub32 => (0b0_00_11011_000, 1), + ALUOp::MAdd64 => (0b1_00_11011_000, 0), + ALUOp::MSub64 => (0b1_00_11011_000, 1), + ALUOp::SMulH => (0b1_00_11011_010, 0), + ALUOp::UMulH => (0b1_00_11011_110, 0), + _ => unimplemented!("{:?}", alu_op), + }; + sink.put4(enc_arith_rrrr(top11, rm, bit15, ra, rn, rd)); + } + &Inst::AluRRImm12 { + alu_op, + rd, + rn, + ref imm12, + } => { + let top8 = match alu_op { + ALUOp::Add32 => 0b000_10001, + ALUOp::Add64 => 0b100_10001, + ALUOp::Sub32 => 0b010_10001, + ALUOp::Sub64 => 0b110_10001, + ALUOp::AddS32 => 0b001_10001, + ALUOp::AddS64 => 0b101_10001, + ALUOp::SubS32 => 0b011_10001, + ALUOp::SubS64 => 0b111_10001, + _ => unimplemented!("{:?}", alu_op), + }; + sink.put4(enc_arith_rr_imm12( + top8, + imm12.shift_bits(), + imm12.imm_bits(), + rn, + rd, + )); + } + &Inst::AluRRImmLogic { + alu_op, + rd, + rn, + ref imml, + } => { + let (top9, inv) = match alu_op { + ALUOp::Orr32 => (0b001_100100, false), + ALUOp::Orr64 => (0b101_100100, false), + ALUOp::And32 => (0b000_100100, false), + ALUOp::And64 => (0b100_100100, false), + ALUOp::Eor32 => (0b010_100100, false), + ALUOp::Eor64 => (0b110_100100, false), + ALUOp::OrrNot32 => (0b001_100100, true), + ALUOp::OrrNot64 => (0b101_100100, true), + ALUOp::AndNot32 => (0b000_100100, true), + ALUOp::AndNot64 => (0b100_100100, true), + ALUOp::EorNot32 => (0b010_100100, true), + ALUOp::EorNot64 => (0b110_100100, true), + _ => unimplemented!("{:?}", alu_op), + }; + let imml = if inv { imml.invert() } else { imml.clone() }; + sink.put4(enc_arith_rr_imml(top9, imml.enc_bits(), rn, rd)); + } + + &Inst::AluRRImmShift { + alu_op, + rd, + rn, + ref immshift, + } => { + let amt = immshift.value(); + let (top10, immr, imms) = match alu_op { + ALUOp::RotR32 => (0b0001001110, machreg_to_gpr(rn), u32::from(amt)), + ALUOp::RotR64 => (0b1001001111, machreg_to_gpr(rn), u32::from(amt)), + ALUOp::Lsr32 => (0b0101001100, u32::from(amt), 0b011111), + ALUOp::Lsr64 => (0b1101001101, u32::from(amt), 0b111111), + ALUOp::Asr32 => (0b0001001100, u32::from(amt), 0b011111), + ALUOp::Asr64 => (0b1001001101, u32::from(amt), 0b111111), + ALUOp::Lsl32 => (0b0101001100, u32::from(32 - amt), u32::from(31 - amt)), + ALUOp::Lsl64 => (0b1101001101, u32::from(64 - amt), u32::from(63 - amt)), + _ => unimplemented!("{:?}", alu_op), + }; + sink.put4( + (top10 << 22) + | (immr << 16) + | (imms << 10) + | (machreg_to_gpr(rn) << 5) + | machreg_to_gpr(rd.to_reg()), + ); + } + + &Inst::AluRRRShift { + alu_op, + rd, + rn, + rm, + ref shiftop, + } => { + let top11: u32 = match alu_op { + ALUOp::Add32 => 0b000_01011000, + ALUOp::Add64 => 0b100_01011000, + ALUOp::AddS32 => 0b001_01011000, + ALUOp::AddS64 => 0b101_01011000, + ALUOp::Sub32 => 0b010_01011000, + ALUOp::Sub64 => 0b110_01011000, + ALUOp::SubS32 => 0b011_01011000, + ALUOp::SubS64 => 0b111_01011000, + ALUOp::Orr32 => 0b001_01010000, + ALUOp::Orr64 => 0b101_01010000, + ALUOp::And32 => 0b000_01010000, + ALUOp::And64 => 
0b100_01010000, + ALUOp::Eor32 => 0b010_01010000, + ALUOp::Eor64 => 0b110_01010000, + ALUOp::OrrNot32 => 0b001_01010001, + ALUOp::OrrNot64 => 0b101_01010001, + ALUOp::EorNot32 => 0b010_01010001, + ALUOp::EorNot64 => 0b110_01010001, + ALUOp::AndNot32 => 0b000_01010001, + ALUOp::AndNot64 => 0b100_01010001, + _ => unimplemented!("{:?}", alu_op), + }; + let top11 = top11 | (u32::from(shiftop.op().bits()) << 1); + let bits_15_10 = u32::from(shiftop.amt().value()); + sink.put4(enc_arith_rrr(top11, bits_15_10, rd, rn, rm)); + } + + &Inst::AluRRRExtend { + alu_op, + rd, + rn, + rm, + extendop, + } => { + let top11: u32 = match alu_op { + ALUOp::Add32 => 0b00001011001, + ALUOp::Add64 => 0b10001011001, + ALUOp::Sub32 => 0b01001011001, + ALUOp::Sub64 => 0b11001011001, + ALUOp::AddS32 => 0b00101011001, + ALUOp::AddS64 => 0b10101011001, + ALUOp::SubS32 => 0b01101011001, + ALUOp::SubS64 => 0b11101011001, + _ => unimplemented!("{:?}", alu_op), + }; + let bits_15_10 = u32::from(extendop.bits()) << 3; + sink.put4(enc_arith_rrr(top11, bits_15_10, rd, rn, rm)); + } + + &Inst::BitRR { op, rd, rn, .. } => { + let size = if op.inst_size().is32() { 0b0 } else { 0b1 }; + let (op1, op2) = match op { + BitOp::RBit32 | BitOp::RBit64 => (0b00000, 0b000000), + BitOp::Clz32 | BitOp::Clz64 => (0b00000, 0b000100), + BitOp::Cls32 | BitOp::Cls64 => (0b00000, 0b000101), + }; + sink.put4(enc_bit_rr(size, op1, op2, rn, rd)) + } + + &Inst::ULoad8 { + rd, + ref mem, + srcloc, + } + | &Inst::SLoad8 { + rd, + ref mem, + srcloc, + } + | &Inst::ULoad16 { + rd, + ref mem, + srcloc, + } + | &Inst::SLoad16 { + rd, + ref mem, + srcloc, + } + | &Inst::ULoad32 { + rd, + ref mem, + srcloc, + } + | &Inst::SLoad32 { + rd, + ref mem, + srcloc, + } + | &Inst::ULoad64 { + rd, + ref mem, + srcloc, + .. + } + | &Inst::FpuLoad32 { + rd, + ref mem, + srcloc, + } + | &Inst::FpuLoad64 { + rd, + ref mem, + srcloc, + } + | &Inst::FpuLoad128 { + rd, + ref mem, + srcloc, + } => { + let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem); + + for inst in mem_insts.into_iter() { + inst.emit(sink); + } + + // ldst encoding helpers take Reg, not Writable. + let rd = rd.to_reg(); + + // This is the base opcode (top 10 bits) for the "unscaled + // immediate" form (Unscaled). Other addressing modes will OR in + // other values for bits 24/25 (bits 1/2 of this constant). + let op = match self { + &Inst::ULoad8 { .. } => 0b0011100001, + &Inst::SLoad8 { .. } => 0b0011100010, + &Inst::ULoad16 { .. } => 0b0111100001, + &Inst::SLoad16 { .. } => 0b0111100010, + &Inst::ULoad32 { .. } => 0b1011100001, + &Inst::SLoad32 { .. } => 0b1011100010, + &Inst::ULoad64 { .. } => 0b1111100001, + &Inst::FpuLoad32 { .. } => 0b1011110001, + &Inst::FpuLoad64 { .. } => 0b1111110001, + &Inst::FpuLoad128 { .. } => 0b0011110011, + _ => unreachable!(), + }; + + if let Some(srcloc) = srcloc { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::OutOfBounds); + } + + match &mem { + &MemArg::Unscaled(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd)); + } + &MemArg::UnsignedOffset(reg, uimm12scaled) => { + sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd)); + } + &MemArg::RegReg(r1, r2) => { + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ false, /* extendop = */ None, rd, + )); + } + &MemArg::RegScaled(r1, r2, ty) | &MemArg::RegScaledExtended(r1, r2, ty, _) => { + match (ty, self) { + (I8, &Inst::ULoad8 { .. }) => {} + (I8, &Inst::SLoad8 { .. }) => {} + (I16, &Inst::ULoad16 { .. 
}) => {} + (I16, &Inst::SLoad16 { .. }) => {} + (I32, &Inst::ULoad32 { .. }) => {} + (I32, &Inst::SLoad32 { .. }) => {} + (I64, &Inst::ULoad64 { .. }) => {} + (F32, &Inst::FpuLoad32 { .. }) => {} + (F64, &Inst::FpuLoad64 { .. }) => {} + (I128, &Inst::FpuLoad128 { .. }) => {} + _ => panic!("Mismatching reg-scaling type in MemArg"), + } + let extendop = match &mem { + &MemArg::RegScaled(..) => None, + &MemArg::RegScaledExtended(_, _, _, op) => Some(op), + _ => unreachable!(), + }; + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ true, extendop, rd, + )); + } + &MemArg::Label(ref label) => { + let offset = match label { + // cast i32 to u32 (two's-complement) + &MemLabel::PCRel(off) => off as u32, + } / 4; + assert!(offset < (1 << 19)); + match self { + &Inst::ULoad32 { .. } => { + sink.put4(enc_ldst_imm19(0b00011000, offset, rd)); + } + &Inst::SLoad32 { .. } => { + sink.put4(enc_ldst_imm19(0b10011000, offset, rd)); + } + &Inst::FpuLoad32 { .. } => { + sink.put4(enc_ldst_imm19(0b00011100, offset, rd)); + } + &Inst::ULoad64 { .. } => { + sink.put4(enc_ldst_imm19(0b01011000, offset, rd)); + } + &Inst::FpuLoad64 { .. } => { + sink.put4(enc_ldst_imm19(0b01011100, offset, rd)); + } + &Inst::FpuLoad128 { .. } => { + sink.put4(enc_ldst_imm19(0b10011100, offset, rd)); + } + _ => panic!("Unspported size for LDR from constant pool!"), + } + } + &MemArg::PreIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b11, reg.to_reg(), rd)); + } + &MemArg::PostIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd)); + } + // Eliminated by `mem_finalize()` above. + &MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { + panic!("Should not see stack-offset here!") + } + } + } + + &Inst::Store8 { + rd, + ref mem, + srcloc, + } + | &Inst::Store16 { + rd, + ref mem, + srcloc, + } + | &Inst::Store32 { + rd, + ref mem, + srcloc, + } + | &Inst::Store64 { + rd, + ref mem, + srcloc, + .. + } + | &Inst::FpuStore32 { + rd, + ref mem, + srcloc, + } + | &Inst::FpuStore64 { + rd, + ref mem, + srcloc, + } + | &Inst::FpuStore128 { + rd, + ref mem, + srcloc, + } => { + let (mem_insts, mem) = mem_finalize(sink.cur_offset_from_start(), mem); + + for inst in mem_insts.into_iter() { + inst.emit(sink); + } + + let op = match self { + &Inst::Store8 { .. } => 0b0011100000, + &Inst::Store16 { .. } => 0b0111100000, + &Inst::Store32 { .. } => 0b1011100000, + &Inst::Store64 { .. } => 0b1111100000, + &Inst::FpuStore32 { .. } => 0b1011110000, + &Inst::FpuStore64 { .. } => 0b1111110000, + &Inst::FpuStore128 { .. } => 0b0011110010, + _ => unreachable!(), + }; + + if let Some(srcloc) = srcloc { + // Register the offset at which the actual load instruction starts. + sink.add_trap(srcloc, TrapCode::OutOfBounds); + } + + match &mem { + &MemArg::Unscaled(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b00, reg, rd)); + } + &MemArg::UnsignedOffset(reg, uimm12scaled) => { + sink.put4(enc_ldst_uimm12(op, uimm12scaled, reg, rd)); + } + &MemArg::RegReg(r1, r2) => { + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ false, /* extendop = */ None, rd, + )); + } + &MemArg::RegScaled(r1, r2, _ty) + | &MemArg::RegScaledExtended(r1, r2, _ty, _) => { + let extendop = match &mem { + &MemArg::RegScaled(..) => None, + &MemArg::RegScaledExtended(_, _, _, op) => Some(op), + _ => unreachable!(), + }; + sink.put4(enc_ldst_reg( + op, r1, r2, /* scaled = */ true, extendop, rd, + )); + } + &MemArg::Label(..) 
=> { + panic!("Store to a MemLabel not implemented!"); + } + &MemArg::PreIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b11, reg.to_reg(), rd)); + } + &MemArg::PostIndexed(reg, simm9) => { + sink.put4(enc_ldst_simm9(op, simm9, 0b01, reg.to_reg(), rd)); + } + // Eliminated by `mem_finalize()` above. + &MemArg::SPOffset(..) | &MemArg::FPOffset(..) => { + panic!("Should not see stack-offset here!") + } + } + } + + &Inst::StoreP64 { rt, rt2, ref mem } => match mem { + &PairMemArg::SignedOffset(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100100, simm7, reg, rt, rt2)); + } + &PairMemArg::PreIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100110, simm7, reg.to_reg(), rt, rt2)); + } + &PairMemArg::PostIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100010, simm7, reg.to_reg(), rt, rt2)); + } + }, + &Inst::LoadP64 { rt, rt2, ref mem } => { + let rt = rt.to_reg(); + let rt2 = rt2.to_reg(); + match mem { + &PairMemArg::SignedOffset(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100101, simm7, reg, rt, rt2)); + } + &PairMemArg::PreIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100111, simm7, reg.to_reg(), rt, rt2)); + } + &PairMemArg::PostIndexed(reg, simm7) => { + assert_eq!(simm7.scale_ty, I64); + sink.put4(enc_ldst_pair(0b1010100011, simm7, reg.to_reg(), rt, rt2)); + } + } + } + &Inst::Mov { rd, rm } => { + assert!(rd.to_reg().get_class() == rm.get_class()); + assert!(rm.get_class() == RegClass::I64); + // MOV to SP is interpreted as MOV to XZR instead. And our codegen + // should never MOV to XZR. + assert!(machreg_to_gpr(rd.to_reg()) != 31); + // Encoded as ORR rd, rm, zero. + sink.put4(enc_arith_rrr(0b10101010_000, 0b000_000, rd, zero_reg(), rm)); + } + &Inst::Mov32 { rd, rm } => { + // MOV to SP is interpreted as MOV to XZR instead. And our codegen + // should never MOV to XZR. + assert!(machreg_to_gpr(rd.to_reg()) != 31); + // Encoded as ORR rd, rm, zero. 
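+                // This is the 32-bit (sf=0) counterpart of `Inst::Mov` above:
+                // ORR Wd, WZR, Wm.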
+ sink.put4(enc_arith_rrr(0b00101010_000, 0b000_000, rd, zero_reg(), rm)); + } + &Inst::MovZ { rd, imm } => sink.put4(enc_move_wide(MoveWideOpcode::MOVZ, rd, imm)), + &Inst::MovN { rd, imm } => sink.put4(enc_move_wide(MoveWideOpcode::MOVN, rd, imm)), + &Inst::MovK { rd, imm } => sink.put4(enc_move_wide(MoveWideOpcode::MOVK, rd, imm)), + &Inst::CSel { rd, rn, rm, cond } => { + sink.put4(enc_csel(rd, rn, rm, cond)); + } + &Inst::CSet { rd, cond } => { + sink.put4(enc_cset(rd, cond)); + } + &Inst::FpuMove64 { rd, rn } => { + sink.put4(enc_vecmov(/* 16b = */ false, rd, rn)); + } + &Inst::FpuRR { fpu_op, rd, rn } => { + let top22 = match fpu_op { + FPUOp1::Abs32 => 0b000_11110_00_1_000001_10000, + FPUOp1::Abs64 => 0b000_11110_01_1_000001_10000, + FPUOp1::Neg32 => 0b000_11110_00_1_000010_10000, + FPUOp1::Neg64 => 0b000_11110_01_1_000010_10000, + FPUOp1::Sqrt32 => 0b000_11110_00_1_000011_10000, + FPUOp1::Sqrt64 => 0b000_11110_01_1_000011_10000, + FPUOp1::Cvt32To64 => 0b000_11110_00_1_000101_10000, + FPUOp1::Cvt64To32 => 0b000_11110_01_1_000100_10000, + }; + sink.put4(enc_fpurr(top22, rd, rn)); + } + &Inst::FpuRRR { fpu_op, rd, rn, rm } => { + let top22 = match fpu_op { + FPUOp2::Add32 => 0b000_11110_00_1_00000_001010, + FPUOp2::Add64 => 0b000_11110_01_1_00000_001010, + FPUOp2::Sub32 => 0b000_11110_00_1_00000_001110, + FPUOp2::Sub64 => 0b000_11110_01_1_00000_001110, + FPUOp2::Mul32 => 0b000_11110_00_1_00000_000010, + FPUOp2::Mul64 => 0b000_11110_01_1_00000_000010, + FPUOp2::Div32 => 0b000_11110_00_1_00000_000110, + FPUOp2::Div64 => 0b000_11110_01_1_00000_000110, + FPUOp2::Max32 => 0b000_11110_00_1_00000_010010, + FPUOp2::Max64 => 0b000_11110_01_1_00000_010010, + FPUOp2::Min32 => 0b000_11110_00_1_00000_010110, + FPUOp2::Min64 => 0b000_11110_01_1_00000_010110, + }; + sink.put4(enc_fpurrr(top22, rd, rn, rm)); + } + &Inst::FpuRRRR { + fpu_op, + rd, + rn, + rm, + ra, + } => { + let top17 = match fpu_op { + FPUOp3::MAdd32 => 0b000_11111_00_0_00000_0, + FPUOp3::MAdd64 => 0b000_11111_01_0_00000_0, + }; + sink.put4(enc_fpurrrr(top17, rd, rn, rm, ra)); + } + &Inst::FpuCmp32 { rn, rm } => { + sink.put4(enc_fcmp(InstSize::Size32, rn, rm)); + } + &Inst::FpuCmp64 { rn, rm } => { + sink.put4(enc_fcmp(InstSize::Size64, rn, rm)); + } + &Inst::FpuToInt { op, rd, rn } => { + let top16 = match op { + // FCVTZS (32/32-bit) + FpuToIntOp::F32ToI32 => 0b000_11110_00_1_11_000, + // FCVTZU (32/32-bit) + FpuToIntOp::F32ToU32 => 0b000_11110_00_1_11_001, + // FCVTZS (32/64-bit) + FpuToIntOp::F32ToI64 => 0b100_11110_00_1_11_000, + // FCVTZU (32/64-bit) + FpuToIntOp::F32ToU64 => 0b100_11110_00_1_11_001, + // FCVTZS (64/32-bit) + FpuToIntOp::F64ToI32 => 0b000_11110_01_1_11_000, + // FCVTZU (64/32-bit) + FpuToIntOp::F64ToU32 => 0b000_11110_01_1_11_001, + // FCVTZS (64/64-bit) + FpuToIntOp::F64ToI64 => 0b100_11110_01_1_11_000, + // FCVTZU (64/64-bit) + FpuToIntOp::F64ToU64 => 0b100_11110_01_1_11_001, + }; + sink.put4(enc_fputoint(top16, rd, rn)); + } + &Inst::IntToFpu { op, rd, rn } => { + let top16 = match op { + // SCVTF (32/32-bit) + IntToFpuOp::I32ToF32 => 0b000_11110_00_1_00_010, + // UCVTF (32/32-bit) + IntToFpuOp::U32ToF32 => 0b000_11110_00_1_00_011, + // SCVTF (64/32-bit) + IntToFpuOp::I64ToF32 => 0b100_11110_00_1_00_010, + // UCVTF (64/32-bit) + IntToFpuOp::U64ToF32 => 0b100_11110_00_1_00_011, + // SCVTF (32/64-bit) + IntToFpuOp::I32ToF64 => 0b000_11110_01_1_00_010, + // UCVTF (32/64-bit) + IntToFpuOp::U32ToF64 => 0b000_11110_01_1_00_011, + // SCVTF (64/64-bit) + IntToFpuOp::I64ToF64 => 0b100_11110_01_1_00_010, + // UCVTF 
(64/64-bit) + IntToFpuOp::U64ToF64 => 0b100_11110_01_1_00_011, + }; + sink.put4(enc_inttofpu(top16, rd, rn)); + } + &Inst::LoadFpuConst32 { rd, const_data } => { + let inst = Inst::FpuLoad32 { + rd, + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, + }; + inst.emit(sink); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(8), + }; + inst.emit(sink); + sink.put4(const_data.to_bits()); + } + &Inst::LoadFpuConst64 { rd, const_data } => { + let inst = Inst::FpuLoad64 { + rd, + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, + }; + inst.emit(sink); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(12), + }; + inst.emit(sink); + sink.put8(const_data.to_bits()); + } + &Inst::FpuCSel32 { rd, rn, rm, cond } => { + sink.put4(enc_fcsel(rd, rn, rm, cond, InstSize::Size32)); + } + &Inst::FpuCSel64 { rd, rn, rm, cond } => { + sink.put4(enc_fcsel(rd, rn, rm, cond, InstSize::Size64)); + } + &Inst::FpuRound { op, rd, rn } => { + let top22 = match op { + FpuRoundMode::Minus32 => 0b000_11110_00_1_001_010_10000, + FpuRoundMode::Minus64 => 0b000_11110_01_1_001_010_10000, + FpuRoundMode::Plus32 => 0b000_11110_00_1_001_001_10000, + FpuRoundMode::Plus64 => 0b000_11110_01_1_001_001_10000, + FpuRoundMode::Zero32 => 0b000_11110_00_1_001_011_10000, + FpuRoundMode::Zero64 => 0b000_11110_01_1_001_011_10000, + FpuRoundMode::Nearest32 => 0b000_11110_00_1_001_000_10000, + FpuRoundMode::Nearest64 => 0b000_11110_01_1_001_000_10000, + }; + sink.put4(enc_fround(top22, rd, rn)); + } + &Inst::MovToVec64 { rd, rn } => { + sink.put4( + 0b010_01110000_01000_0_0011_1_00000_00000 + | (machreg_to_gpr(rn) << 5) + | machreg_to_vec(rd.to_reg()), + ); + } + &Inst::MovFromVec64 { rd, rn } => { + sink.put4( + 0b010_01110000_01000_0_0111_1_00000_00000 + | (machreg_to_vec(rn) << 5) + | machreg_to_gpr(rd.to_reg()), + ); + } + &Inst::VecRRR { rd, rn, rm, alu_op } => { + let (top11, bit15_10) = match alu_op { + VecALUOp::SQAddScalar => (0b010_11110_11_1, 0b000011), + VecALUOp::SQSubScalar => (0b010_11110_11_1, 0b001011), + VecALUOp::UQAddScalar => (0b011_11110_11_1, 0b000011), + VecALUOp::UQSubScalar => (0b011_11110_11_1, 0b001011), + }; + sink.put4(enc_vec_rrr(top11, rm, bit15_10, rn, rd)); + } + &Inst::MovToNZCV { rn } => { + sink.put4(0xd51b4200 | machreg_to_gpr(rn)); + } + &Inst::MovFromNZCV { rd } => { + sink.put4(0xd53b4200 | machreg_to_gpr(rd.to_reg())); + } + &Inst::CondSet { rd, cond } => { + sink.put4( + 0b100_11010100_11111_0000_01_11111_00000 + | (cond.invert().bits() << 12) + | machreg_to_gpr(rd.to_reg()), + ); + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits >= 8 => { + let top22 = match (signed, from_bits, to_bits) { + (false, 8, 32) => 0b010_100110_0_000000_000111, // UXTB (32) + (false, 16, 32) => 0b010_100110_0_000000_001111, // UXTH (32) + (true, 8, 32) => 0b000_100110_0_000000_000111, // SXTB (32) + (true, 16, 32) => 0b000_100110_0_000000_001111, // SXTH (32) + // The 64-bit unsigned variants are the same as the 32-bit ones, + // because writes to Wn zero out the top 32 bits of Xn + (false, 8, 64) => 0b010_100110_0_000000_000111, // UXTB (64) + (false, 16, 64) => 0b010_100110_0_000000_001111, // UXTH (64) + (true, 8, 64) => 0b100_100110_1_000000_000111, // SXTB (64) + (true, 16, 64) => 0b100_100110_1_000000_001111, // SXTH (64) + // 32-to-64: the unsigned case is a 'mov' (special-cased below). 
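+                    // A write to a W register zeroes bits 63:32 of the X
+                    // register, so `mov wd, wn` performs the unsigned
+                    // 32-to-64 extension for free.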
+ (false, 32, 64) => 0, // MOV + (true, 32, 64) => 0b100_100110_1_000000_011111, // SXTW (64) + _ => panic!( + "Unsupported extend combination: signed = {}, from_bits = {}, to_bits = {}", + signed, from_bits, to_bits + ), + }; + if top22 != 0 { + sink.put4(enc_extend(top22, rd, rn)); + } else { + Inst::mov32(rd, rn).emit(sink); + } + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits == 1 && signed => { + assert!(to_bits <= 64); + // Reduce sign-extend-from-1-bit to: + // - and rd, rn, #1 + // - sub rd, zr, rd + + // We don't have ImmLogic yet, so we just hardcode this. FIXME. + sink.put4(0x92400000 | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())); + let sub_inst = Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd, + rn: zero_reg(), + rm: rd.to_reg(), + }; + sub_inst.emit(sink); + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits == 1 && !signed => { + assert!(to_bits <= 64); + // Reduce zero-extend-from-1-bit to: + // - and rd, rn, #1 + + // We don't have ImmLogic yet, so we just hardcode this. FIXME. + sink.put4(0x92400000 | (machreg_to_gpr(rn) << 5) | machreg_to_gpr(rd.to_reg())); + } + &Inst::Extend { .. } => { + panic!("Unsupported extend variant"); + } + &Inst::Jump { ref dest } => { + // TODO: differentiate between as_off26() returning `None` for + // out-of-range vs. not-yet-finalized. The latter happens when we + // do early (fake) emission for size computation. + sink.put4(enc_jump26(0b000101, dest.as_off26().unwrap())); + } + &Inst::Ret => { + sink.put4(0xd65f03c0); + } + &Inst::EpiloguePlaceholder => { + // Noop; this is just a placeholder for epilogues. + } + &Inst::Call { + ref dest, + loc, + opcode, + .. + } => { + sink.add_reloc(loc, Reloc::Arm64Call, dest, 0); + sink.put4(enc_jump26(0b100101, 0)); + if opcode.is_call() { + sink.add_call_site(loc, opcode); + } + } + &Inst::CallInd { + rn, loc, opcode, .. + } => { + sink.put4(0b1101011_0001_11111_000000_00000_00000 | (machreg_to_gpr(rn) << 5)); + if opcode.is_call() { + sink.add_call_site(loc, opcode); + } + } + &Inst::CondBr { .. } => panic!("Unlowered CondBr during binemit!"), + &Inst::CondBrLowered { target, kind } => match kind { + // TODO: handle >2^19 case by emitting a compound sequence with + // an unconditional (26-bit) branch. We need branch-relaxation + // adjustment machinery to enable this (because we don't want to + // always emit the long form). + CondBrKind::Zero(reg) => { + sink.put4(enc_cmpbr(0b1_011010_0, target.as_off19().unwrap(), reg)); + } + CondBrKind::NotZero(reg) => { + sink.put4(enc_cmpbr(0b1_011010_1, target.as_off19().unwrap(), reg)); + } + CondBrKind::Cond(c) => { + sink.put4(enc_cbr( + 0b01010100, + target.as_off19().unwrap_or(0), + 0b0, + c.bits(), + )); + } + }, + &Inst::CondBrLoweredCompound { + taken, + not_taken, + kind, + } => { + // Conditional part first. + match kind { + CondBrKind::Zero(reg) => { + sink.put4(enc_cmpbr(0b1_011010_0, taken.as_off19().unwrap(), reg)); + } + CondBrKind::NotZero(reg) => { + sink.put4(enc_cmpbr(0b1_011010_1, taken.as_off19().unwrap(), reg)); + } + CondBrKind::Cond(c) => { + sink.put4(enc_cbr( + 0b01010100, + taken.as_off19().unwrap_or(0), + 0b0, + c.bits(), + )); + } + } + // Unconditional part. + sink.put4(enc_jump26(0b000101, not_taken.as_off26().unwrap_or(0))); + } + &Inst::IndirectBr { rn, .. 
} => { + sink.put4(enc_br(rn)); + } + &Inst::Nop0 => {} + &Inst::Nop4 => { + sink.put4(0xd503201f); + } + &Inst::Brk => { + sink.put4(0xd4200000); + } + &Inst::Udf { trap_info } => { + let (srcloc, code) = trap_info; + sink.add_trap(srcloc, code); + sink.put4(0xd4a00000); + } + &Inst::Adr { rd, ref label } => { + let off = memlabel_finalize(sink.cur_offset_from_start(), label); + assert!(off > -(1 << 20)); + assert!(off < (1 << 20)); + sink.put4(enc_adr(off, rd)); + } + &Inst::Word4 { data } => { + sink.put4(data); + } + &Inst::Word8 { data } => { + sink.put8(data); + } + &Inst::JTSequence { + ridx, + rtmp1, + rtmp2, + ref targets, + .. + } => { + // This sequence is *one* instruction in the vcode, and is expanded only here at + // emission time, because we cannot allow the regalloc to insert spills/reloads in + // the middle; we depend on hardcoded PC-rel addressing below. + // + // N.B.: if PC-rel addressing on ADR below is changed, also update + // `Inst::with_block_offsets()` in aarch64/inst/mod.rs. + + // Save index in a tmp (the live range of ridx only goes to start of this + // sequence; rtmp1 or rtmp2 may overwrite it). + let inst = Inst::gen_move(rtmp2, ridx, I64); + inst.emit(sink); + // Load address of jump table + let inst = Inst::Adr { + rd: rtmp1, + label: MemLabel::PCRel(16), + }; + inst.emit(sink); + // Load value out of jump table + let inst = Inst::SLoad32 { + rd: rtmp2, + mem: MemArg::reg_plus_reg_scaled_extended( + rtmp1.to_reg(), + rtmp2.to_reg(), + I32, + ExtendOp::UXTW, + ), + srcloc: None, // can't cause a user trap. + }; + inst.emit(sink); + // Add base of jump table to jump-table-sourced block offset + let inst = Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: rtmp1, + rn: rtmp1.to_reg(), + rm: rtmp2.to_reg(), + }; + inst.emit(sink); + // Branch to computed address. (`targets` here is only used for successor queries + // and is not needed for emission.) + let inst = Inst::IndirectBr { + rn: rtmp1.to_reg(), + targets: vec![], + }; + inst.emit(sink); + // Emit jump table (table of 32-bit offsets). + for target in targets { + let off = target.as_offset_words() * 4; + let off = i32::try_from(off).unwrap(); + // cast i32 to u32 (two's-complement) + let off = off as u32; + sink.put4(off); + } + } + &Inst::LoadConst64 { rd, const_data } => { + let inst = Inst::ULoad64 { + rd, + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, // can't cause a user trap. + }; + inst.emit(sink); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(12), + }; + inst.emit(sink); + sink.put8(const_data); + } + &Inst::LoadExtName { + rd, + ref name, + offset, + srcloc, + } => { + let inst = Inst::ULoad64 { + rd, + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, // can't cause a user trap. + }; + inst.emit(sink); + let inst = Inst::Jump { + dest: BranchTarget::ResolvedOffset(12), + }; + inst.emit(sink); + sink.add_reloc(srcloc, Reloc::Abs8, name, offset); + sink.put8(0); + } + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::isa::test_utils; + + #[test] + fn test_aarch64_binemit() { + let mut insns = Vec::<(Inst, &str, &str)>::new(); + + // N.B.: the architecture is little-endian, so when transcribing the 32-bit + // hex instructions from e.g. objdump disassembly, one must swap the bytes + // seen below. (E.g., a `ret` is normally written as the u32 `D65F03C0`, + // but we write it here as C0035FD6.) 
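+        //
+        // As a worked example: the first `AluRRR` case below ("add w1, w2, w3")
+        // goes through enc_arith_rrr(0b00001011_000, 0b000000, rd, rn, rm) and
+        // produces the word 0x0B030041, which appears below in little-endian
+        // form as "4100030B".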
+ + // Useful helper script to produce the encodings from the text: + // + // #!/bin/sh + // tmp=`mktemp /tmp/XXXXXXXX.o` + // aarch64-linux-gnu-as /dev/stdin -o $tmp + // aarch64-linux-gnu-objdump -d $tmp + // rm -f $tmp + // + // Then: + // + // $ echo "mov x1, x2" | aarch64inst.sh + insns.push((Inst::Ret, "C0035FD6", "ret")); + insns.push((Inst::Nop0, "", "nop-zero-len")); + insns.push((Inst::Nop4, "1F2003D5", "nop")); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Add32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100030B", + "add w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400068B", + "add x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Sub32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100034B", + "sub w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006CB", + "sub x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Orr32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100032A", + "orr w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Orr64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006AA", + "orr x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::And32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100030A", + "and w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::And64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400068A", + "and x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SubS32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100036B", + "subs w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SubS64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006EB", + "subs x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AddS32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + }, + "4100032B", + "adds w1, w2, w3", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AddS64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006AB", + "adds x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::SDiv64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40CC69A", + "sdiv x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::UDiv64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A408C69A", + "udiv x4, x5, x6", + )); + + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Eor32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400064A", + "eor w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Eor64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40006CA", + "eor x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AndNot32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400260A", + "bic w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::AndNot64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400268A", + "bic x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::OrrNot32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400262A", + "orn w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::OrrNot64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40026AA", + "orn x4, x5, x6", + )); + 
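+        // The inverted logical ops map to the AArch64 aliases in the expected
+        // disassembly: AndNot -> bic, OrrNot -> orn, EorNot -> eon.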
insns.push(( + Inst::AluRRR { + alu_op: ALUOp::EorNot32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A400264A", + "eon w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::EorNot64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A40026CA", + "eon x4, x5, x6", + )); + + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::RotR32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A42CC61A", + "ror w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::RotR64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A42CC69A", + "ror x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsr32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A424C61A", + "lsr w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsr64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A424C69A", + "lsr x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Asr32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A428C61A", + "asr w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Asr64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A428C69A", + "asr x4, x5, x6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsl32, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A420C61A", + "lsl w4, w5, w6", + )); + insns.push(( + Inst::AluRRR { + alu_op: ALUOp::Lsl64, + rd: writable_xreg(4), + rn: xreg(5), + rm: xreg(6), + }, + "A420C69A", + "lsl x4, x5, x6", + )); + + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0411", + "add w7, w8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: true, + }, + }, + "078D4411", + "add w7, w8, #1191936", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Add64, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0491", + "add x7, x8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Sub32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0451", + "sub w7, w8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::Sub64, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D04D1", + "sub x7, x8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::SubS32, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D0471", + "subs w7, w8, #291", + )); + insns.push(( + Inst::AluRRImm12 { + alu_op: ALUOp::SubS64, + rd: writable_xreg(7), + rn: xreg(8), + imm12: Imm12 { + bits: 0x123, + shift12: false, + }, + }, + "078D04F1", + "subs x7, x8, #291", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Add32, + rd: writable_xreg(7), + rn: xreg(8), + rm: xreg(9), + extendop: ExtendOp::SXTB, + }, + "0781290B", + "add w7, w8, w9, SXTB", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Add64, + rd: writable_xreg(15), + rn: xreg(16), + rm: xreg(17), + extendop: ExtendOp::UXTB, + }, + "0F02318B", + "add x15, x16, x17, UXTB", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Sub32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + extendop: ExtendOp::SXTH, + }, + "41A0234B", + "sub w1, w2, 
w3, SXTH", + )); + + insns.push(( + Inst::AluRRRExtend { + alu_op: ALUOp::Sub64, + rd: writable_xreg(20), + rn: xreg(21), + rm: xreg(22), + extendop: ExtendOp::UXTW, + }, + "B44236CB", + "sub x20, x21, x22, UXTW", + )); + + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Add32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(20).unwrap(), + ), + }, + "6A510C0B", + "add w10, w11, w12, LSL 20", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::ASR, + ShiftOpShiftImm::maybe_from_shift(42).unwrap(), + ), + }, + "6AA98C8B", + "add x10, x11, x12, ASR 42", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sub32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C4B", + "sub w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Sub64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CCB", + "sub x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Orr32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C2A", + "orr w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Orr64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CAA", + "orr x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::And32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C0A", + "and w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::And64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C8A", + "and x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Eor32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C4A", + "eor w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::Eor64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CCA", + "eor x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::OrrNot32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C2A", + "orn w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::OrrNot64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2CAA", + "orn x10, x11, x12, LSL 23", + )); + insns.push(( 
+ Inst::AluRRRShift { + alu_op: ALUOp::AndNot32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C0A", + "bic w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AndNot64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C8A", + "bic x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::EorNot32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2C4A", + "eon w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::EorNot64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D2CCA", + "eon x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AddS32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C2B", + "adds w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::AddS64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CAB", + "adds x10, x11, x12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::SubS32, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0C6B", + "subs w10, w11, w12, LSL 23", + )); + insns.push(( + Inst::AluRRRShift { + alu_op: ALUOp::SubS64, + rd: writable_xreg(10), + rn: xreg(11), + rm: xreg(12), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(23).unwrap(), + ), + }, + "6A5D0CEB", + "subs x10, x11, x12, LSL 23", + )); + + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::MAdd32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4110031B", + "madd w1, w2, w3, w4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::MAdd64, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4110039B", + "madd x1, x2, x3, x4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::MSub32, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4190031B", + "msub w1, w2, w3, w4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::MSub64, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: xreg(4), + }, + "4190039B", + "msub x1, x2, x3, x4", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::SMulH, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: zero_reg(), + }, + "417C439B", + "smulh x1, x2, x3", + )); + insns.push(( + Inst::AluRRRR { + alu_op: ALUOp::UMulH, + rd: writable_xreg(1), + rn: xreg(2), + rm: xreg(3), + ra: zero_reg(), + }, + "417CC39B", + "umulh x1, x2, x3", + )); + + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::RotR32, + rd: writable_xreg(20), + rn: xreg(21), + immshift: ImmShift::maybe_from_u64(19).unwrap(), + }, + "B44E9513", + "ror w20, w21, #19", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::RotR64, + rd: 
writable_xreg(20), + rn: xreg(21), + immshift: ImmShift::maybe_from_u64(42).unwrap(), + }, + "B4AAD593", + "ror x20, x21, #42", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsr32, + rd: writable_xreg(10), + rn: xreg(11), + immshift: ImmShift::maybe_from_u64(13).unwrap(), + }, + "6A7D0D53", + "lsr w10, w11, #13", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsr64, + rd: writable_xreg(10), + rn: xreg(11), + immshift: ImmShift::maybe_from_u64(57).unwrap(), + }, + "6AFD79D3", + "lsr x10, x11, #57", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Asr32, + rd: writable_xreg(4), + rn: xreg(5), + immshift: ImmShift::maybe_from_u64(7).unwrap(), + }, + "A47C0713", + "asr w4, w5, #7", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Asr64, + rd: writable_xreg(4), + rn: xreg(5), + immshift: ImmShift::maybe_from_u64(35).unwrap(), + }, + "A4FC6393", + "asr x4, x5, #35", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsl32, + rd: writable_xreg(8), + rn: xreg(9), + immshift: ImmShift::maybe_from_u64(24).unwrap(), + }, + "281D0853", + "lsl w8, w9, #24", + )); + insns.push(( + Inst::AluRRImmShift { + alu_op: ALUOp::Lsl64, + rd: writable_xreg(8), + rn: xreg(9), + immshift: ImmShift::maybe_from_u64(63).unwrap(), + }, + "280141D3", + "lsl x8, x9, #63", + )); + + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::And32, + rd: writable_xreg(21), + rn: xreg(27), + imml: ImmLogic::maybe_from_u64(0x80003fff, I32).unwrap(), + }, + "753B0112", + "and w21, w27, #2147500031", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: writable_xreg(7), + rn: xreg(6), + imml: ImmLogic::maybe_from_u64(0x3fff80003fff800, I64).unwrap(), + }, + "C7381592", + "and x7, x6, #288221580125796352", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Orr32, + rd: writable_xreg(1), + rn: xreg(5), + imml: ImmLogic::maybe_from_u64(0x100000, I32).unwrap(), + }, + "A1000C32", + "orr w1, w5, #1048576", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Orr64, + rd: writable_xreg(4), + rn: xreg(5), + imml: ImmLogic::maybe_from_u64(0x8181818181818181, I64).unwrap(), + }, + "A4C401B2", + "orr x4, x5, #9331882296111890817", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Eor32, + rd: writable_xreg(1), + rn: xreg(5), + imml: ImmLogic::maybe_from_u64(0x00007fff, I32).unwrap(), + }, + "A1380052", + "eor w1, w5, #32767", + )); + insns.push(( + Inst::AluRRImmLogic { + alu_op: ALUOp::Eor64, + rd: writable_xreg(10), + rn: xreg(8), + imml: ImmLogic::maybe_from_u64(0x8181818181818181, I64).unwrap(), + }, + "0AC501D2", + "eor x10, x8, #9331882296111890817", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::RBit32, + rd: writable_xreg(1), + rn: xreg(10), + }, + "4101C05A", + "rbit w1, w10", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::RBit64, + rd: writable_xreg(1), + rn: xreg(10), + }, + "4101C0DA", + "rbit x1, x10", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Clz32, + rd: writable_xreg(15), + rn: xreg(3), + }, + "6F10C05A", + "clz w15, w3", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Clz64, + rd: writable_xreg(15), + rn: xreg(3), + }, + "6F10C0DA", + "clz x15, x3", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Cls32, + rd: writable_xreg(21), + rn: xreg(16), + }, + "1516C05A", + "cls w21, w16", + )); + + insns.push(( + Inst::BitRR { + op: BitOp::Cls64, + rd: writable_xreg(21), + rn: xreg(16), + }, + "1516C0DA", + "cls x21, x16", + )); + + insns.push(( + Inst::ULoad8 { + rd: writable_xreg(1), + 
mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "41004038", + "ldurb w1, [x2]", + )); + insns.push(( + Inst::ULoad8 { + rd: writable_xreg(1), + mem: MemArg::UnsignedOffset(xreg(2), UImm12Scaled::zero(I8)), + srcloc: None, + }, + "41004039", + "ldrb w1, [x2]", + )); + insns.push(( + Inst::ULoad8 { + rd: writable_xreg(1), + mem: MemArg::RegReg(xreg(2), xreg(5)), + srcloc: None, + }, + "41686538", + "ldrb w1, [x2, x5]", + )); + insns.push(( + Inst::SLoad8 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "41008038", + "ldursb x1, [x2]", + )); + insns.push(( + Inst::SLoad8 { + rd: writable_xreg(1), + mem: MemArg::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(63, I8).unwrap()), + srcloc: None, + }, + "41FC8039", + "ldrsb x1, [x2, #63]", + )); + insns.push(( + Inst::SLoad8 { + rd: writable_xreg(1), + mem: MemArg::RegReg(xreg(2), xreg(5)), + srcloc: None, + }, + "4168A538", + "ldrsb x1, [x2, x5]", + )); + insns.push(( + Inst::ULoad16 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::maybe_from_i64(5).unwrap()), + srcloc: None, + }, + "41504078", + "ldurh w1, [x2, #5]", + )); + insns.push(( + Inst::ULoad16 { + rd: writable_xreg(1), + mem: MemArg::UnsignedOffset(xreg(2), UImm12Scaled::maybe_from_i64(8, I16).unwrap()), + srcloc: None, + }, + "41104079", + "ldrh w1, [x2, #8]", + )); + insns.push(( + Inst::ULoad16 { + rd: writable_xreg(1), + mem: MemArg::RegScaled(xreg(2), xreg(3), I16), + srcloc: None, + }, + "41786378", + "ldrh w1, [x2, x3, LSL #1]", + )); + insns.push(( + Inst::SLoad16 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "41008078", + "ldursh x1, [x2]", + )); + insns.push(( + Inst::SLoad16 { + rd: writable_xreg(28), + mem: MemArg::UnsignedOffset( + xreg(20), + UImm12Scaled::maybe_from_i64(24, I16).unwrap(), + ), + srcloc: None, + }, + "9C328079", + "ldrsh x28, [x20, #24]", + )); + insns.push(( + Inst::SLoad16 { + rd: writable_xreg(28), + mem: MemArg::RegScaled(xreg(20), xreg(20), I16), + srcloc: None, + }, + "9C7AB478", + "ldrsh x28, [x20, x20, LSL #1]", + )); + insns.push(( + Inst::ULoad32 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "410040B8", + "ldur w1, [x2]", + )); + insns.push(( + Inst::ULoad32 { + rd: writable_xreg(12), + mem: MemArg::UnsignedOffset( + xreg(0), + UImm12Scaled::maybe_from_i64(204, I32).unwrap(), + ), + srcloc: None, + }, + "0CCC40B9", + "ldr w12, [x0, #204]", + )); + insns.push(( + Inst::ULoad32 { + rd: writable_xreg(1), + mem: MemArg::RegScaled(xreg(2), xreg(12), I32), + srcloc: None, + }, + "41786CB8", + "ldr w1, [x2, x12, LSL #2]", + )); + insns.push(( + Inst::SLoad32 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "410080B8", + "ldursw x1, [x2]", + )); + insns.push(( + Inst::SLoad32 { + rd: writable_xreg(12), + mem: MemArg::UnsignedOffset( + xreg(1), + UImm12Scaled::maybe_from_i64(16380, I32).unwrap(), + ), + srcloc: None, + }, + "2CFCBFB9", + "ldrsw x12, [x1, #16380]", + )); + insns.push(( + Inst::SLoad32 { + rd: writable_xreg(1), + mem: MemArg::RegScaled(xreg(5), xreg(1), I32), + srcloc: None, + }, + "A178A1B8", + "ldrsw x1, [x5, x1, LSL #2]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "410040F8", + "ldur x1, [x2]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), 
SImm9::maybe_from_i64(-256).unwrap()), + srcloc: None, + }, + "410050F8", + "ldur x1, [x2, #-256]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::maybe_from_i64(255).unwrap()), + srcloc: None, + }, + "41F04FF8", + "ldur x1, [x2, #255]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::UnsignedOffset( + xreg(2), + UImm12Scaled::maybe_from_i64(32760, I64).unwrap(), + ), + srcloc: None, + }, + "41FC7FF9", + "ldr x1, [x2, #32760]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::RegReg(xreg(2), xreg(3)), + srcloc: None, + }, + "416863F8", + "ldr x1, [x2, x3]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::RegScaled(xreg(2), xreg(3), I64), + srcloc: None, + }, + "417863F8", + "ldr x1, [x2, x3, LSL #3]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::RegScaledExtended(xreg(2), xreg(3), I64, ExtendOp::SXTW), + srcloc: None, + }, + "41D863F8", + "ldr x1, [x2, w3, SXTW #3]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::Label(MemLabel::PCRel(64)), + srcloc: None, + }, + "01020058", + "ldr x1, pc+64", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::PreIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + srcloc: None, + }, + "410C41F8", + "ldr x1, [x2, #16]!", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::PostIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + srcloc: None, + }, + "410441F8", + "ldr x1, [x2], #16", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::FPOffset(32768), + srcloc: None, + }, + "0F0090D2EF011D8BE10140F9", + "movz x15, #32768 ; add x15, x15, fp ; ldr x1, [x15]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::FPOffset(-32768), + srcloc: None, + }, + "EFFF8F92EF011D8BE10140F9", + "movn x15, #32767 ; add x15, x15, fp ; ldr x1, [x15]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::FPOffset(1048576), // 2^20 + srcloc: None, + }, + "0F02A0D2EF011D8BE10140F9", + "movz x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]", + )); + insns.push(( + Inst::ULoad64 { + rd: writable_xreg(1), + mem: MemArg::FPOffset(1048576 + 1), // 2^20 + 1 + srcloc: None, + }, + "2F0080D20F02A0F2EF011D8BE10140F9", + "movz x15, #1 ; movk x15, #16, LSL #16 ; add x15, x15, fp ; ldr x1, [x15]", + )); + + insns.push(( + Inst::Store8 { + rd: xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "41000038", + "sturb w1, [x2]", + )); + insns.push(( + Inst::Store8 { + rd: xreg(1), + mem: MemArg::UnsignedOffset( + xreg(2), + UImm12Scaled::maybe_from_i64(4095, I8).unwrap(), + ), + srcloc: None, + }, + "41FC3F39", + "strb w1, [x2, #4095]", + )); + insns.push(( + Inst::Store16 { + rd: xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "41000078", + "sturh w1, [x2]", + )); + insns.push(( + Inst::Store16 { + rd: xreg(1), + mem: MemArg::UnsignedOffset( + xreg(2), + UImm12Scaled::maybe_from_i64(8190, I16).unwrap(), + ), + srcloc: None, + }, + "41FC3F79", + "strh w1, [x2, #8190]", + )); + insns.push(( + Inst::Store32 { + rd: xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "410000B8", + "stur w1, [x2]", + )); + insns.push(( + Inst::Store32 { + rd: xreg(1), + mem: MemArg::UnsignedOffset( + xreg(2), + UImm12Scaled::maybe_from_i64(16380, I32).unwrap(), + ), 
+ srcloc: None, + }, + "41FC3FB9", + "str w1, [x2, #16380]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::Unscaled(xreg(2), SImm9::zero()), + srcloc: None, + }, + "410000F8", + "stur x1, [x2]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::UnsignedOffset( + xreg(2), + UImm12Scaled::maybe_from_i64(32760, I64).unwrap(), + ), + srcloc: None, + }, + "41FC3FF9", + "str x1, [x2, #32760]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::RegReg(xreg(2), xreg(3)), + srcloc: None, + }, + "416823F8", + "str x1, [x2, x3]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::RegScaled(xreg(2), xreg(3), I64), + srcloc: None, + }, + "417823F8", + "str x1, [x2, x3, LSL #3]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::RegScaledExtended(xreg(2), xreg(3), I64, ExtendOp::UXTW), + srcloc: None, + }, + "415823F8", + "str x1, [x2, w3, UXTW #3]", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::PreIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + srcloc: None, + }, + "410C01F8", + "str x1, [x2, #16]!", + )); + insns.push(( + Inst::Store64 { + rd: xreg(1), + mem: MemArg::PostIndexed(writable_xreg(2), SImm9::maybe_from_i64(16).unwrap()), + srcloc: None, + }, + "410401F8", + "str x1, [x2], #16", + )); + + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairMemArg::SignedOffset(xreg(10), SImm7Scaled::zero(I64)), + }, + "482500A9", + "stp x8, x9, [x10]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairMemArg::SignedOffset( + xreg(10), + SImm7Scaled::maybe_from_i64(504, I64).unwrap(), + ), + }, + "48A51FA9", + "stp x8, x9, [x10, #504]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairMemArg::SignedOffset( + xreg(10), + SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), + ), + }, + "48253CA9", + "stp x8, x9, [x10, #-64]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(21), + rt2: xreg(28), + mem: PairMemArg::SignedOffset( + xreg(1), + SImm7Scaled::maybe_from_i64(-512, I64).unwrap(), + ), + }, + "357020A9", + "stp x21, x28, [x1, #-512]", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(8), + rt2: xreg(9), + mem: PairMemArg::PreIndexed( + writable_xreg(10), + SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), + ), + }, + "4825BCA9", + "stp x8, x9, [x10, #-64]!", + )); + insns.push(( + Inst::StoreP64 { + rt: xreg(15), + rt2: xreg(16), + mem: PairMemArg::PostIndexed( + writable_xreg(20), + SImm7Scaled::maybe_from_i64(504, I64).unwrap(), + ), + }, + "8FC29FA8", + "stp x15, x16, [x20], #504", + )); + + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairMemArg::SignedOffset(xreg(10), SImm7Scaled::zero(I64)), + }, + "482540A9", + "ldp x8, x9, [x10]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairMemArg::SignedOffset( + xreg(10), + SImm7Scaled::maybe_from_i64(504, I64).unwrap(), + ), + }, + "48A55FA9", + "ldp x8, x9, [x10, #504]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairMemArg::SignedOffset( + xreg(10), + SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), + ), + }, + "48257CA9", + "ldp x8, x9, [x10, #-64]", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairMemArg::SignedOffset( + xreg(10), + SImm7Scaled::maybe_from_i64(-512, I64).unwrap(), + ), + }, + "482560A9", + "ldp x8, x9, [x10, #-512]", + )); + 
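+        // SImm7Scaled at I64 covers offsets -512..=504 in 8-byte steps, so the
+        // #504, #-64 and #-512 cases above exercise both ends of the LDP/STP
+        // signed-offset range.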
insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(9), + mem: PairMemArg::PreIndexed( + writable_xreg(10), + SImm7Scaled::maybe_from_i64(-64, I64).unwrap(), + ), + }, + "4825FCA9", + "ldp x8, x9, [x10, #-64]!", + )); + insns.push(( + Inst::LoadP64 { + rt: writable_xreg(8), + rt2: writable_xreg(25), + mem: PairMemArg::PostIndexed( + writable_xreg(12), + SImm7Scaled::maybe_from_i64(504, I64).unwrap(), + ), + }, + "88E5DFA8", + "ldp x8, x25, [x12], #504", + )); + + insns.push(( + Inst::Mov { + rd: writable_xreg(8), + rm: xreg(9), + }, + "E80309AA", + "mov x8, x9", + )); + insns.push(( + Inst::Mov32 { + rd: writable_xreg(8), + rm: xreg(9), + }, + "E803092A", + "mov w8, w9", + )); + + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), + }, + "E8FF9FD2", + "movz x8, #65535", + )); + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), + }, + "E8FFBFD2", + "movz x8, #65535, LSL #16", + )); + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(), + }, + "E8FFDFD2", + "movz x8, #65535, LSL #32", + )); + insns.push(( + Inst::MovZ { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(), + }, + "E8FFFFD2", + "movz x8, #65535, LSL #48", + )); + + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), + }, + "E8FF9F92", + "movn x8, #65535", + )); + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), + }, + "E8FFBF92", + "movn x8, #65535, LSL #16", + )); + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(), + }, + "E8FFDF92", + "movn x8, #65535, LSL #32", + )); + insns.push(( + Inst::MovN { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(), + }, + "E8FFFF92", + "movn x8, #65535, LSL #48", + )); + + insns.push(( + Inst::MovK { + rd: writable_xreg(12), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_0000).unwrap(), + }, + "0C0080F2", + "movk x12, #0", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(19), + imm: MoveWideConst::maybe_with_shift(0x0000, 16).unwrap(), + }, + "1300A0F2", + "movk x19, #0, LSL #16", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(3), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_0000_ffff).unwrap(), + }, + "E3FF9FF2", + "movk x3, #65535", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_0000_ffff_0000).unwrap(), + }, + "E8FFBFF2", + "movk x8, #65535, LSL #16", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0x0000_ffff_0000_0000).unwrap(), + }, + "E8FFDFF2", + "movk x8, #65535, LSL #32", + )); + insns.push(( + Inst::MovK { + rd: writable_xreg(8), + imm: MoveWideConst::maybe_from_u64(0xffff_0000_0000_0000).unwrap(), + }, + "E8FFFFF2", + "movk x8, #65535, LSL #48", + )); + + insns.push(( + Inst::CSel { + rd: writable_xreg(10), + rn: xreg(12), + rm: xreg(14), + cond: Cond::Hs, + }, + "8A218E9A", + "csel x10, x12, x14, hs", + )); + insns.push(( + Inst::CSet { + rd: writable_xreg(15), + cond: Cond::Ge, + }, + "EFB79F9A", + "cset x15, ge", + )); + insns.push(( + Inst::MovToVec64 { + rd: writable_vreg(20), + rn: xreg(21), + }, + "B41E084E", + "mov 
v20.d[0], x21", + )); + insns.push(( + Inst::MovFromVec64 { + rd: writable_xreg(21), + rn: vreg(20), + }, + "953E084E", + "mov x21, v20.d[0]", + )); + insns.push(( + Inst::MovToNZCV { rn: xreg(13) }, + "0D421BD5", + "msr nzcv, x13", + )); + insns.push(( + Inst::MovFromNZCV { + rd: writable_xreg(27), + }, + "1B423BD5", + "mrs x27, nzcv", + )); + insns.push(( + Inst::CondSet { + rd: writable_xreg(5), + cond: Cond::Hi, + }, + "E5979F9A", + "cset x5, hi", + )); + insns.push(( + Inst::VecRRR { + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + alu_op: VecALUOp::UQAddScalar, + }, + "D50EF77E", + "uqadd d21, d22, d23", + )); + insns.push(( + Inst::VecRRR { + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + alu_op: VecALUOp::SQAddScalar, + }, + "D50EF75E", + "sqadd d21, d22, d23", + )); + insns.push(( + Inst::VecRRR { + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + alu_op: VecALUOp::UQSubScalar, + }, + "D52EF77E", + "uqsub d21, d22, d23", + )); + insns.push(( + Inst::VecRRR { + rd: writable_vreg(21), + rn: vreg(22), + rm: vreg(23), + alu_op: VecALUOp::SQSubScalar, + }, + "D52EF75E", + "sqsub d21, d22, d23", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 8, + to_bits: 32, + }, + "411C0053", + "uxtb w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 8, + to_bits: 32, + }, + "411C0013", + "sxtb w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 16, + to_bits: 32, + }, + "413C0053", + "uxth w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 16, + to_bits: 32, + }, + "413C0013", + "sxth w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 8, + to_bits: 64, + }, + "411C0053", + "uxtb x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 8, + to_bits: 64, + }, + "411C4093", + "sxtb x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 16, + to_bits: 64, + }, + "413C0053", + "uxth x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 16, + to_bits: 64, + }, + "413C4093", + "sxth x1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: false, + from_bits: 32, + to_bits: 64, + }, + "E103022A", + "mov w1, w2", + )); + insns.push(( + Inst::Extend { + rd: writable_xreg(1), + rn: xreg(2), + signed: true, + from_bits: 32, + to_bits: 64, + }, + "417C4093", + "sxtw x1, w2", + )); + + insns.push(( + Inst::Jump { + dest: BranchTarget::ResolvedOffset(64), + }, + "10000014", + "b 64", + )); + + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Zero(xreg(8)), + }, + "080200B4", + "cbz x8, 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::NotZero(xreg(8)), + }, + "080200B5", + "cbnz x8, 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Eq), + }, + "00020054", + "b.eq 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Ne), + }, + "01020054", + "b.ne 64", + )); + + insns.push(( + Inst::CondBrLowered { + 
target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Hs), + }, + "02020054", + "b.hs 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Lo), + }, + "03020054", + "b.lo 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Mi), + }, + "04020054", + "b.mi 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Pl), + }, + "05020054", + "b.pl 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Vs), + }, + "06020054", + "b.vs 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Vc), + }, + "07020054", + "b.vc 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Hi), + }, + "08020054", + "b.hi 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Ls), + }, + "09020054", + "b.ls 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Ge), + }, + "0A020054", + "b.ge 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Lt), + }, + "0B020054", + "b.lt 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Gt), + }, + "0C020054", + "b.gt 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Le), + }, + "0D020054", + "b.le 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Al), + }, + "0E020054", + "b.al 64", + )); + insns.push(( + Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(64), + kind: CondBrKind::Cond(Cond::Nv), + }, + "0F020054", + "b.nv 64", + )); + + insns.push(( + Inst::CondBrLoweredCompound { + taken: BranchTarget::ResolvedOffset(64), + not_taken: BranchTarget::ResolvedOffset(128), + kind: CondBrKind::Cond(Cond::Le), + }, + "0D02005420000014", + "b.le 64 ; b 128", + )); + + insns.push(( + Inst::Call { + dest: ExternalName::testcase("test0"), + uses: Set::empty(), + defs: Set::empty(), + loc: SourceLoc::default(), + opcode: Opcode::Call, + }, + "00000094", + "bl 0", + )); + + insns.push(( + Inst::CallInd { + rn: xreg(10), + uses: Set::empty(), + defs: Set::empty(), + loc: SourceLoc::default(), + opcode: Opcode::CallIndirect, + }, + "40013FD6", + "blr x10", + )); + + insns.push(( + Inst::IndirectBr { + rn: xreg(3), + targets: vec![1, 2, 3], + }, + "60001FD6", + "br x3", + )); + + insns.push((Inst::Brk, "000020D4", "brk #0")); + + insns.push(( + Inst::Adr { + rd: writable_xreg(15), + label: MemLabel::PCRel((1 << 20) - 4), + }, + "EFFF7F10", + "adr x15, pc+1048572", + )); + + insns.push(( + Inst::FpuMove64 { + rd: writable_vreg(8), + rn: vreg(4), + }, + "881CA40E", + "mov v8.8b, v4.8b", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Abs32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3201E", + "fabs s15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Abs64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3601E", + "fabs d15, d30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: 
FPUOp1::Neg32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CF43211E", + "fneg s15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Neg64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CF43611E", + "fneg d15, d30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Sqrt32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3211E", + "fsqrt s15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Sqrt64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3611E", + "fsqrt d15, d30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Cvt32To64, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CFC3221E", + "fcvt d15, s30", + )); + + insns.push(( + Inst::FpuRR { + fpu_op: FPUOp1::Cvt64To32, + rd: writable_vreg(15), + rn: vreg(30), + }, + "CF43621E", + "fcvt s15, d30", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Add32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF2B3F1E", + "fadd s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Add64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF2B7F1E", + "fadd d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sub32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF3B3F1E", + "fsub s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Sub64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF3B7F1E", + "fsub d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Mul32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF0B3F1E", + "fmul s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Mul64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF0B7F1E", + "fmul d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Div32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF1B3F1E", + "fdiv s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Div64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF1B7F1E", + "fdiv d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Max32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF4B3F1E", + "fmax s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Max64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF4B7F1E", + "fmax d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Min32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF5B3F1E", + "fmin s15, s30, s31", + )); + + insns.push(( + Inst::FpuRRR { + fpu_op: FPUOp2::Min64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + }, + "CF5B7F1E", + "fmin d15, d30, d31", + )); + + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MAdd32, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + ra: vreg(1), + }, + "CF071F1F", + "fmadd s15, s30, s31, s1", + )); + + insns.push(( + Inst::FpuRRRR { + fpu_op: FPUOp3::MAdd64, + rd: writable_vreg(15), + rn: vreg(30), + rm: vreg(31), + ra: vreg(1), + }, + "CF075F1F", + "fmadd d15, d30, d31, d1", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToU32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100391E", + "fcvtzu w1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToU64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100399E", + "fcvtzu x1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToI32, + rd: 
writable_xreg(1), + rn: vreg(4), + }, + "8100381E", + "fcvtzs w1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F32ToI64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100389E", + "fcvtzs x1, s4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToU32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100791E", + "fcvtzu w1, d4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToU64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100799E", + "fcvtzu x1, d4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToI32, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100781E", + "fcvtzs w1, d4", + )); + + insns.push(( + Inst::FpuToInt { + op: FpuToIntOp::F64ToI64, + rd: writable_xreg(1), + rn: vreg(4), + }, + "8100789E", + "fcvtzs x1, d4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U32ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100231E", + "ucvtf s1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I32ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100221E", + "scvtf s1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U32ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100631E", + "ucvtf d1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I32ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100621E", + "scvtf d1, w4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U64ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100239E", + "ucvtf s1, x4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I64ToF32, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100229E", + "scvtf s1, x4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::U64ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100639E", + "ucvtf d1, x4", + )); + + insns.push(( + Inst::IntToFpu { + op: IntToFpuOp::I64ToF64, + rd: writable_vreg(1), + rn: xreg(4), + }, + "8100629E", + "scvtf d1, x4", + )); + + insns.push(( + Inst::FpuCmp32 { + rn: vreg(23), + rm: vreg(24), + }, + "E022381E", + "fcmp s23, s24", + )); + + insns.push(( + Inst::FpuCmp64 { + rn: vreg(23), + rm: vreg(24), + }, + "E022781E", + "fcmp d23, d24", + )); + + insns.push(( + Inst::FpuLoad32 { + rd: writable_vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), F32), + srcloc: None, + }, + "107969BC", + "ldr s16, [x8, x9, LSL #2]", + )); + + insns.push(( + Inst::FpuLoad64 { + rd: writable_vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), F64), + srcloc: None, + }, + "107969FC", + "ldr d16, [x8, x9, LSL #3]", + )); + + insns.push(( + Inst::FpuLoad128 { + rd: writable_vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), I128), + srcloc: None, + }, + "1079E93C", + "ldr q16, [x8, x9, LSL #4]", + )); + + insns.push(( + Inst::FpuLoad32 { + rd: writable_vreg(16), + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, + }, + "5000001C", + "ldr s16, pc+8", + )); + + insns.push(( + Inst::FpuLoad64 { + rd: writable_vreg(16), + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, + }, + "5000005C", + "ldr d16, pc+8", + )); + + insns.push(( + Inst::FpuLoad128 { + rd: writable_vreg(16), + mem: MemArg::Label(MemLabel::PCRel(8)), + srcloc: None, + }, + "5000009C", + "ldr q16, pc+8", + )); + + insns.push(( + Inst::FpuStore32 { + rd: vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), F32), + srcloc: None, + }, + "107929BC", + "str s16, [x8, x9, LSL #2]", + )); + + insns.push(( + Inst::FpuStore64 { + rd: vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), F64), + srcloc: None, + }, + 
"107929FC", + "str d16, [x8, x9, LSL #3]", + )); + + insns.push(( + Inst::FpuStore128 { + rd: vreg(16), + mem: MemArg::RegScaled(xreg(8), xreg(9), I128), + srcloc: None, + }, + "1079A93C", + "str q16, [x8, x9, LSL #4]", + )); + + insns.push(( + Inst::LoadFpuConst32 { + rd: writable_vreg(16), + const_data: 1.0, + }, + "5000001C020000140000803F", + "ldr s16, pc+8 ; b 8 ; data.f32 1", + )); + + insns.push(( + Inst::LoadFpuConst64 { + rd: writable_vreg(16), + const_data: 1.0, + }, + "5000005C03000014000000000000F03F", + "ldr d16, pc+8 ; b 12 ; data.f64 1", + )); + + insns.push(( + Inst::FpuCSel32 { + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(3), + cond: Cond::Hi, + }, + "418C231E", + "fcsel s1, s2, s3, hi", + )); + + insns.push(( + Inst::FpuCSel64 { + rd: writable_vreg(1), + rn: vreg(2), + rm: vreg(3), + cond: Cond::Eq, + }, + "410C631E", + "fcsel d1, d2, d3, eq", + )); + + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Minus32, + }, + "1743251E", + "frintm s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Minus64, + }, + "1743651E", + "frintm d23, d24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Plus32, + }, + "17C3241E", + "frintp s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Plus64, + }, + "17C3641E", + "frintp d23, d24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Zero32, + }, + "17C3251E", + "frintz s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Zero64, + }, + "17C3651E", + "frintz d23, d24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Nearest32, + }, + "1743241E", + "frintn s23, s24", + )); + insns.push(( + Inst::FpuRound { + rd: writable_vreg(23), + rn: vreg(24), + op: FpuRoundMode::Nearest64, + }, + "1743641E", + "frintn d23, d24", + )); + + let rru = create_reg_universe(); + for (insn, expected_encoding, expected_printing) in insns { + println!( + "AArch64: {:?}, {}, {}", + insn, expected_encoding, expected_printing + ); + + // Check the printed text is as expected. + let actual_printing = insn.show_rru(Some(&rru)); + assert_eq!(expected_printing, actual_printing); + + // Check the encoding is as expected. + let text_size = { + let mut code_sec = MachSectionSize::new(0); + insn.emit(&mut code_sec); + code_sec.size() + }; + + let mut sink = test_utils::TestCodeSink::new(); + let mut sections = MachSections::new(); + let code_idx = sections.add_section(0, text_size); + let code_sec = sections.get_section(code_idx); + insn.emit(code_sec); + sections.emit(&mut sink); + let actual_encoding = &sink.stringify(); + assert_eq!(expected_encoding, actual_encoding); + } + } + + #[test] + fn test_cond_invert() { + for cond in vec![ + Cond::Eq, + Cond::Ne, + Cond::Hs, + Cond::Lo, + Cond::Mi, + Cond::Pl, + Cond::Vs, + Cond::Vc, + Cond::Hi, + Cond::Ls, + Cond::Ge, + Cond::Lt, + Cond::Gt, + Cond::Le, + Cond::Al, + Cond::Nv, + ] + .into_iter() + { + assert_eq!(cond.invert().invert(), cond); + } + } +} diff --git a/cranelift/codegen/src/isa/aarch64/inst/imms.rs b/cranelift/codegen/src/isa/aarch64/inst/imms.rs new file mode 100644 index 000000000000..7230b4f44e3f --- /dev/null +++ b/cranelift/codegen/src/isa/aarch64/inst/imms.rs @@ -0,0 +1,752 @@ +//! 
AArch64 ISA definitions: immediate constants. + +// Some variants are never constructed, but we still want them as options in the future. +#[allow(dead_code)] +use crate::ir::types::*; +use crate::ir::Type; +use crate::machinst::*; + +use regalloc::RealRegUniverse; + +use core::convert::TryFrom; +use std::string::String; + +/// A signed, scaled 7-bit offset. +#[derive(Clone, Copy, Debug)] +pub struct SImm7Scaled { + /// The value. + pub value: i16, + /// multiplied by the size of this type + pub scale_ty: Type, +} + +impl SImm7Scaled { + /// Create a SImm7Scaled from a raw offset and the known scale type, if + /// possible. + pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option { + assert!(scale_ty == I64 || scale_ty == I32); + let scale = scale_ty.bytes(); + assert!(scale.is_power_of_two()); + let scale = i64::from(scale); + let upper_limit = 63 * scale; + let lower_limit = -(64 * scale); + if value >= lower_limit && value <= upper_limit && (value & (scale - 1)) == 0 { + Some(SImm7Scaled { + value: i16::try_from(value).unwrap(), + scale_ty, + }) + } else { + None + } + } + + /// Create a zero immediate of this format. + pub fn zero(scale_ty: Type) -> SImm7Scaled { + SImm7Scaled { value: 0, scale_ty } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + let ty_bytes: i16 = self.scale_ty.bytes() as i16; + let scaled: i16 = self.value / ty_bytes; + assert!(scaled <= 63 && scaled >= -64); + let scaled: i8 = scaled as i8; + let encoded: u32 = scaled as u32; + encoded & 0x7f + } +} + +/// a 9-bit signed offset. +#[derive(Clone, Copy, Debug)] +pub struct SImm9 { + /// The value. + pub value: i16, +} + +impl SImm9 { + /// Create a signed 9-bit offset from a full-range value, if possible. + pub fn maybe_from_i64(value: i64) -> Option { + if value >= -256 && value <= 255 { + Some(SImm9 { + value: value as i16, + }) + } else { + None + } + } + + /// Create a zero immediate of this format. + pub fn zero() -> SImm9 { + SImm9 { value: 0 } + } + + /// Bits for encoding. + pub fn bits(&self) -> u32 { + (self.value as u32) & 0x1ff + } +} + +/// An unsigned, scaled 12-bit offset. +#[derive(Clone, Copy, Debug)] +pub struct UImm12Scaled { + /// The value. + pub value: u16, + /// multiplied by the size of this type + pub scale_ty: Type, +} + +impl UImm12Scaled { + /// Create a UImm12Scaled from a raw offset and the known scale type, if + /// possible. + pub fn maybe_from_i64(value: i64, scale_ty: Type) -> Option { + let scale = scale_ty.bytes(); + assert!(scale.is_power_of_two()); + let scale = scale as i64; + let limit = 4095 * scale; + if value >= 0 && value <= limit && (value & (scale - 1)) == 0 { + Some(UImm12Scaled { + value: value as u16, + scale_ty, + }) + } else { + None + } + } + + /// Create a zero immediate of this format. + pub fn zero(scale_ty: Type) -> UImm12Scaled { + UImm12Scaled { value: 0, scale_ty } + } + + /// Encoded bits. + pub fn bits(&self) -> u32 { + (self.value as u32 / self.scale_ty.bytes()) & 0xfff + } +} + +/// A shifted immediate value in 'imm12' format: supports 12 bits, shifted +/// left by 0 or 12 places. +#[derive(Clone, Debug)] +pub struct Imm12 { + /// The immediate bits. + pub bits: u16, + /// Whether the immediate bits are shifted left by 12 or not. + pub shift12: bool, +} + +impl Imm12 { + /// Compute a Imm12 from raw bits, if possible. 
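// Worked examples for illustration (not from the original source): 0x345 encodes as
// bits = 0x345 with shift12 = false; 0x345_000 encodes as bits = 0x345 with shift12 = true;
// 0x345_678 is rejected because its low 12 bits are non-zero alongside higher bits. Note
// that, as written below, 0xfff itself is also rejected, since the first comparison is strict.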
+ pub fn maybe_from_u64(val: u64) -> Option { + if val == 0 { + Some(Imm12 { + bits: 0, + shift12: false, + }) + } else if val < 0xfff { + Some(Imm12 { + bits: val as u16, + shift12: false, + }) + } else if val < 0xfff_000 && (val & 0xfff == 0) { + Some(Imm12 { + bits: (val >> 12) as u16, + shift12: true, + }) + } else { + None + } + } + + /// Bits for 2-bit "shift" field in e.g. AddI. + pub fn shift_bits(&self) -> u32 { + if self.shift12 { + 0b01 + } else { + 0b00 + } + } + + /// Bits for 12-bit "imm" field in e.g. AddI. + pub fn imm_bits(&self) -> u32 { + self.bits as u32 + } +} + +/// An immediate for logical instructions. +#[derive(Clone, Debug)] +#[cfg_attr(test, derive(PartialEq))] +pub struct ImmLogic { + /// The actual value. + value: u64, + /// `N` flag. + pub n: bool, + /// `S` field: element size and element bits. + pub r: u8, + /// `R` field: rotate amount. + pub s: u8, +} + +impl ImmLogic { + /// Compute an ImmLogic from raw bits, if possible. + pub fn maybe_from_u64(value: u64, ty: Type) -> Option { + // Note: This function is a port of VIXL's Assembler::IsImmLogical. + + if ty != I64 && ty != I32 { + return None; + } + + let original_value = value; + + let value = if ty == I32 { + // To handle 32-bit logical immediates, the very easiest thing is to repeat + // the input value twice to make a 64-bit word. The correct encoding of that + // as a logical immediate will also be the correct encoding of the 32-bit + // value. + + // Avoid making the assumption that the most-significant 32 bits are zero by + // shifting the value left and duplicating it. + let value = value << 32; + value | value >> 32 + } else { + value + }; + + // Logical immediates are encoded using parameters n, imm_s and imm_r using + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 bits + // are set. The pattern is rotated right by R, and repeated across a 32 or + // 64-bit value, depending on destination register width. + // + // Put another way: the basic format of a logical immediate is a single + // contiguous stretch of 1 bits, repeated across the whole word at intervals + // given by a power of 2. To identify them quickly, we first locate the + // lowest stretch of 1 bits, then the next 1 bit above that; that combination + // is different for every logical immediate, so it gives us all the + // information we need to identify the only logical immediate that our input + // could be, and then we simply check if that's the value we actually have. + // + // (The rotation parameter does give the possibility of the stretch of 1 bits + // going 'round the end' of the word. To deal with that, we observe that in + // any situation where that happens the bitwise NOT of the value is also a + // valid logical immediate. So we simply invert the input whenever its low bit + // is set, and then we know that the rotated case can't arise.) + let (value, inverted) = if value & 1 == 1 { + (!value, true) + } else { + (value, false) + }; + + if value == 0 { + return None; + } + + // The basic analysis idea: imagine our input word looks like this. 
+ // + // 0011111000111110001111100011111000111110001111100011111000111110 + // c b a + // |<--d-->| + // + // We find the lowest set bit (as an actual power-of-2 value, not its index) + // and call it a. Then we add a to our original number, which wipes out the + // bottommost stretch of set bits and replaces it with a 1 carried into the + // next zero bit. Then we look for the new lowest set bit, which is in + // position b, and subtract it, so now our number is just like the original + // but with the lowest stretch of set bits completely gone. Now we find the + // lowest set bit again, which is position c in the diagram above. Then we'll + // measure the distance d between bit positions a and c (using CLZ), and that + // tells us that the only valid logical immediate that could possibly be equal + // to this number is the one in which a stretch of bits running from a to just + // below b is replicated every d bits. + fn lowest_set_bit(value: u64) -> u64 { + let bit = value.trailing_zeros(); + 1u64.checked_shl(bit).unwrap_or(0) + } + let a = lowest_set_bit(value); + assert_ne!(0, a); + let value_plus_a = value.wrapping_add(a); + let b = lowest_set_bit(value_plus_a); + let value_plus_a_minus_b = value_plus_a - b; + let c = lowest_set_bit(value_plus_a_minus_b); + + let (d, clz_a, out_n, mask) = if c != 0 { + // The general case, in which there is more than one stretch of set bits. + // Compute the repeat distance d, and set up a bitmask covering the basic + // unit of repetition (i.e. a word with the bottom d bits set). Also, in all + // of these cases the N bit of the output will be zero. + let clz_a = a.leading_zeros(); + let clz_c = c.leading_zeros(); + let d = clz_a - clz_c; + let mask = (1 << d) - 1; + (d, clz_a, 0, mask) + } else { + (64, a.leading_zeros(), 1, u64::max_value()) + }; + + // If the repeat period d is not a power of two, it can't be encoded. + if !d.is_power_of_two() { + return None; + } + + if ((b.wrapping_sub(a)) & !mask) != 0 { + // If the bit stretch (b - a) does not fit within the mask derived from the + // repeat period, then fail. + return None; + } + + // The only possible option is b - a repeated every d bits. Now we're going to + // actually construct the valid logical immediate derived from that + // specification, and see if it equals our original input. + // + // To repeat a value every d bits, we multiply it by a number of the form + // (1 + 2^d + 2^(2d) + ...), i.e. 0x0001000100010001 or similar. These can + // be derived using a table lookup on CLZ(d). + const MULTIPLIERS: [u64; 6] = [ + 0x0000000000000001, + 0x0000000100000001, + 0x0001000100010001, + 0x0101010101010101, + 0x1111111111111111, + 0x5555555555555555, + ]; + let multiplier = MULTIPLIERS[(u64::from(d).leading_zeros() - 57) as usize]; + let candidate = b.wrapping_sub(a) * multiplier; + + if value != candidate { + // The candidate pattern doesn't match our input value, so fail. + return None; + } + + // We have a match! This is a valid logical immediate, so now we have to + // construct the bits and pieces of the instruction encoding that generates + // it. + + // Count the set bits in our basic stretch. The special case of clz(0) == -1 + // makes the answer come out right for stretches that reach the very top of + // the word (e.g. numbers like 0xffffc00000000000). 
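// Worked illustration (not part of the original comment): for value = 0xf0 the steps
// above give a = 0x10, b = 0x100 and c = 0 (a single stretch of set bits), hence
// d = 64, N = 1 and candidate = 0xf0. The code below then finds s = 4 set bits and a
// rotate amount r = 60, encoding immr = 60 and imms = 0b000011 (3).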
+ let clz_b = if b == 0 { + u32::max_value() // -1 + } else { + b.leading_zeros() + }; + let s = clz_a.wrapping_sub(clz_b); + + // Decide how many bits to rotate right by, to put the low bit of that basic + // stretch in position a. + let (s, r) = if inverted { + // If we inverted the input right at the start of this function, here's + // where we compensate: the number of set bits becomes the number of clear + // bits, and the rotation count is based on position b rather than position + // a (since b is the location of the 'lowest' 1 bit after inversion). + // Need wrapping for when clz_b is max_value() (for when b == 0). + (d - s, clz_b.wrapping_add(1) & (d - 1)) + } else { + (s, (clz_a + 1) & (d - 1)) + }; + + // Now we're done, except for having to encode the S output in such a way that + // it gives both the number of set bits and the length of the repeated + // segment. The s field is encoded like this: + // + // imms size S + // ssssss 64 UInt(ssssss) + // 0sssss 32 UInt(sssss) + // 10ssss 16 UInt(ssss) + // 110sss 8 UInt(sss) + // 1110ss 4 UInt(ss) + // 11110s 2 UInt(s) + // + // So we 'or' (2 * -d) with our computed s to form imms. + let s = ((d * 2).wrapping_neg() | (s - 1)) & 0x3f; + debug_assert!(u8::try_from(r).is_ok()); + debug_assert!(u8::try_from(s).is_ok()); + Some(ImmLogic { + value: original_value, + n: out_n != 0, + r: r as u8, + s: s as u8, + }) + } + + pub fn from_raw(value: u64, n: bool, r: u8, s: u8) -> ImmLogic { + ImmLogic { n, r, s, value } + } + + /// Returns bits ready for encoding: (N:1, R:6, S:6) + pub fn enc_bits(&self) -> u32 { + ((self.n as u32) << 12) | ((self.r as u32) << 6) | (self.s as u32) + } + + /// Returns the value that this immediate represents. + pub fn value(&self) -> u64 { + self.value + } + + /// Return an immediate for the bitwise-inverted value. + pub fn invert(&self) -> ImmLogic { + // For every ImmLogical immediate, the inverse can also be encoded. + Self::maybe_from_u64(!self.value, I64).unwrap() + } +} + +/// An immediate for shift instructions. +#[derive(Clone, Debug)] +pub struct ImmShift { + /// 6-bit shift amount. + pub imm: u8, +} + +impl ImmShift { + /// Create an ImmShift from raw bits, if possible. + pub fn maybe_from_u64(val: u64) -> Option { + if val < 64 { + Some(ImmShift { imm: val as u8 }) + } else { + None + } + } + + /// Get the immediate value. + pub fn value(&self) -> u8 { + self.imm + } +} + +/// A 16-bit immediate for a MOVZ instruction, with a {0,16,32,48}-bit shift. +#[derive(Clone, Copy, Debug)] +pub struct MoveWideConst { + /// The value. + pub bits: u16, + /// Result is `bits` shifted 16*shift bits to the left. + pub shift: u8, +} + +impl MoveWideConst { + /// Construct a MoveWideConst from an arbitrary 64-bit constant if possible. 
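// Worked examples for illustration (not from the original source): 0x0000_ffff_0000_0000
// yields bits = 0xffff with shift = 2 (i.e. the value is bits << 32, printed as LSL #32),
// while 0x0001_0001 is rejected because more than one 16-bit half-word is non-zero; such a
// constant is instead built as a MOVZ/MOVK sequence by `load_constant` in inst/mod.rs.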
+ pub fn maybe_from_u64(value: u64) -> Option { + let mask0 = 0x0000_0000_0000_ffffu64; + let mask1 = 0x0000_0000_ffff_0000u64; + let mask2 = 0x0000_ffff_0000_0000u64; + let mask3 = 0xffff_0000_0000_0000u64; + + if value == (value & mask0) { + return Some(MoveWideConst { + bits: (value & mask0) as u16, + shift: 0, + }); + } + if value == (value & mask1) { + return Some(MoveWideConst { + bits: ((value >> 16) & mask0) as u16, + shift: 1, + }); + } + if value == (value & mask2) { + return Some(MoveWideConst { + bits: ((value >> 32) & mask0) as u16, + shift: 2, + }); + } + if value == (value & mask3) { + return Some(MoveWideConst { + bits: ((value >> 48) & mask0) as u16, + shift: 3, + }); + } + None + } + + pub fn maybe_with_shift(imm: u16, shift: u8) -> Option { + let shift_enc = shift / 16; + if shift_enc > 3 { + None + } else { + Some(MoveWideConst { + bits: imm, + shift: shift_enc, + }) + } + } + + /// Returns the value that this constant represents. + pub fn value(&self) -> u64 { + (self.bits as u64) << (16 * self.shift) + } +} + +impl ShowWithRRU for Imm12 { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + let shift = if self.shift12 { 12 } else { 0 }; + let value = u32::from(self.bits) << shift; + format!("#{}", value) + } +} + +impl ShowWithRRU for SImm7Scaled { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl ShowWithRRU for SImm9 { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl ShowWithRRU for UImm12Scaled { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value) + } +} + +impl ShowWithRRU for ImmLogic { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.value()) + } +} + +impl ShowWithRRU for ImmShift { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + format!("#{}", self.imm) + } +} + +impl ShowWithRRU for MoveWideConst { + fn show_rru(&self, _mb_rru: Option<&RealRegUniverse>) -> String { + if self.shift == 0 { + format!("#{}", self.bits) + } else { + format!("#{}, LSL #{}", self.bits, self.shift * 16) + } + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn imm_logical_test() { + assert_eq!(None, ImmLogic::maybe_from_u64(0, I64)); + assert_eq!(None, ImmLogic::maybe_from_u64(u64::max_value(), I64)); + + assert_eq!( + Some(ImmLogic { + value: 1, + n: true, + r: 0, + s: 0 + }), + ImmLogic::maybe_from_u64(1, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 2, + n: true, + r: 63, + s: 0 + }), + ImmLogic::maybe_from_u64(2, I64) + ); + + assert_eq!(None, ImmLogic::maybe_from_u64(5, I64)); + + assert_eq!(None, ImmLogic::maybe_from_u64(11, I64)); + + assert_eq!( + Some(ImmLogic { + value: 248, + n: true, + r: 61, + s: 4 + }), + ImmLogic::maybe_from_u64(248, I64) + ); + + assert_eq!(None, ImmLogic::maybe_from_u64(249, I64)); + + assert_eq!( + Some(ImmLogic { + value: 1920, + n: true, + r: 57, + s: 3 + }), + ImmLogic::maybe_from_u64(1920, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x7ffe, + n: true, + r: 63, + s: 13 + }), + ImmLogic::maybe_from_u64(0x7ffe, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x30000, + n: true, + r: 48, + s: 1 + }), + ImmLogic::maybe_from_u64(0x30000, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x100000, + n: true, + r: 44, + s: 0 + }), + ImmLogic::maybe_from_u64(0x100000, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: u64::max_value() - 1, + n: true, + r: 63, + s: 62 + 
}), + ImmLogic::maybe_from_u64(u64::max_value() - 1, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0xaaaaaaaaaaaaaaaa, + n: false, + r: 1, + s: 60 + }), + ImmLogic::maybe_from_u64(0xaaaaaaaaaaaaaaaa, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x8181818181818181, + n: false, + r: 1, + s: 49 + }), + ImmLogic::maybe_from_u64(0x8181818181818181, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0xffc3ffc3ffc3ffc3, + n: false, + r: 10, + s: 43 + }), + ImmLogic::maybe_from_u64(0xffc3ffc3ffc3ffc3, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x100000001, + n: false, + r: 0, + s: 0 + }), + ImmLogic::maybe_from_u64(0x100000001, I64) + ); + + assert_eq!( + Some(ImmLogic { + value: 0x1111111111111111, + n: false, + r: 0, + s: 56 + }), + ImmLogic::maybe_from_u64(0x1111111111111111, I64) + ); + + for n in 0..2 { + let types = if n == 0 { vec![I64, I32] } else { vec![I64] }; + for s in 0..64 { + for r in 0..64 { + let imm = get_logical_imm(n, s, r); + for &ty in &types { + match ImmLogic::maybe_from_u64(imm, ty) { + Some(ImmLogic { value, .. }) => { + assert_eq!(imm, value); + ImmLogic::maybe_from_u64(!value, ty).unwrap(); + } + None => assert_eq!(0, imm), + }; + } + } + } + } + } + + // Repeat a value that has `width` bits, across a 64-bit value. + fn repeat(value: u64, width: u64) -> u64 { + let mut result = value & ((1 << width) - 1); + let mut i = width; + while i < 64 { + result |= result << i; + i *= 2; + } + result + } + + // Get the logical immediate, from the encoding N/R/S bits. + fn get_logical_imm(n: u32, s: u32, r: u32) -> u64 { + // An integer is constructed from the n, imm_s and imm_r bits according to + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 + // bits are set. The pattern is rotated right by R, and repeated across a + // 64-bit value. + + if n == 1 { + if s == 0x3f { + return 0; + } + let bits = (1u64 << (s + 1)) - 1; + bits.rotate_right(r) + } else { + if (s >> 1) == 0x1f { + return 0; + } + let mut width = 0x20; + while width >= 0x2 { + if (s & width) == 0 { + let mask = width - 1; + if (s & mask) == mask { + return 0; + } + let bits = (1u64 << ((s & mask) + 1)) - 1; + return repeat(bits.rotate_right(r & mask), width.into()); + } + width >>= 1; + } + unreachable!(); + } + } +} diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs new file mode 100644 index 000000000000..44da584b444d --- /dev/null +++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs @@ -0,0 +1,2541 @@ +//! This module defines aarch64-specific machine instruction types. + +// Some variants are not constructed, but we still want them as options in the future. 
+#![allow(dead_code)] + +use crate::binemit::CodeOffset; +use crate::ir::types::{B1, B16, B32, B64, B8, F32, F64, FFLAGS, I16, I32, I64, I8, IFLAGS}; +use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode, Type}; +use crate::machinst::*; + +use regalloc::Map as RegallocMap; +use regalloc::{RealReg, RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable}; +use regalloc::{RegUsageCollector, Set}; + +use alloc::vec::Vec; +use smallvec::{smallvec, SmallVec}; +use std::string::{String, ToString}; + +pub mod regs; +pub use self::regs::*; +pub mod imms; +pub use self::imms::*; +pub mod args; +pub use self::args::*; +pub mod emit; +pub use self::emit::*; + +//============================================================================= +// Instructions (top level): definition + +/// An ALU operation. This can be paired with several instruction formats +/// below (see `Inst`) in any combination. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum ALUOp { + Add32, + Add64, + Sub32, + Sub64, + Orr32, + Orr64, + /// NOR + OrrNot32, + /// NOR + OrrNot64, + And32, + And64, + /// NAND + AndNot32, + /// NAND + AndNot64, + /// XOR (AArch64 calls this "EOR") + Eor32, + /// XOR (AArch64 calls this "EOR") + Eor64, + /// XNOR (AArch64 calls this "EOR-NOT") + EorNot32, + /// XNOR (AArch64 calls this "EOR-NOT") + EorNot64, + /// Add, setting flags + AddS32, + /// Add, setting flags + AddS64, + /// Sub, setting flags + SubS32, + /// Sub, setting flags + SubS64, + /// Multiply-add + MAdd32, + /// Multiply-add + MAdd64, + /// Multiply-sub + MSub32, + /// Multiply-sub + MSub64, + /// Signed multiply, high-word result + SMulH, + /// Unsigned multiply, high-word result + UMulH, + SDiv64, + UDiv64, + RotR32, + RotR64, + Lsr32, + Lsr64, + Asr32, + Asr64, + Lsl32, + Lsl64, +} + +/// A floating-point unit (FPU) operation with one arg. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FPUOp1 { + Abs32, + Abs64, + Neg32, + Neg64, + Sqrt32, + Sqrt64, + Cvt32To64, + Cvt64To32, +} + +/// A floating-point unit (FPU) operation with two args. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FPUOp2 { + Add32, + Add64, + Sub32, + Sub64, + Mul32, + Mul64, + Div32, + Div64, + Max32, + Max64, + Min32, + Min64, +} + +/// A floating-point unit (FPU) operation with three args. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FPUOp3 { + MAdd32, + MAdd64, +} + +/// A conversion from an FP to an integer value. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FpuToIntOp { + F32ToU32, + F32ToI32, + F32ToU64, + F32ToI64, + F64ToU32, + F64ToI32, + F64ToU64, + F64ToI64, +} + +/// A conversion from an integer to an FP value. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum IntToFpuOp { + U32ToF32, + I32ToF32, + U32ToF64, + I32ToF64, + U64ToF32, + I64ToF32, + U64ToF64, + I64ToF64, +} + +/// Modes for FP rounding ops: round down (floor) or up (ceil), or toward zero (trunc), or to +/// nearest, and for 32- or 64-bit FP values. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum FpuRoundMode { + Minus32, + Minus64, + Plus32, + Plus64, + Zero32, + Zero64, + Nearest32, + Nearest64, +} + +/// A vector ALU operation. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum VecALUOp { + /// Signed saturating add + SQAddScalar, + /// Unsigned saturating add + UQAddScalar, + /// Signed saturating subtract + SQSubScalar, + /// Unsigned saturating subtract + UQSubScalar, +} + +/// An operation on the bits of a register. 
This can be paired with several instruction formats +/// below (see `Inst`) in any combination. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum BitOp { + /// Bit reverse + RBit32, + /// Bit reverse + RBit64, + Clz32, + Clz64, + Cls32, + Cls64, +} + +impl BitOp { + /// What is the opcode's native width? + pub fn inst_size(&self) -> InstSize { + match self { + BitOp::RBit32 | BitOp::Clz32 | BitOp::Cls32 => InstSize::Size32, + _ => InstSize::Size64, + } + } + + /// Get the assembly mnemonic for this opcode. + pub fn op_str(&self) -> &'static str { + match self { + BitOp::RBit32 | BitOp::RBit64 => "rbit", + BitOp::Clz32 | BitOp::Clz64 => "clz", + BitOp::Cls32 | BitOp::Cls64 => "cls", + } + } +} + +impl From<(Opcode, Type)> for BitOp { + /// Get the BitOp from the IR opcode. + fn from(op_ty: (Opcode, Type)) -> BitOp { + match op_ty { + (Opcode::Bitrev, I32) => BitOp::RBit32, + (Opcode::Bitrev, I64) => BitOp::RBit64, + (Opcode::Clz, I32) => BitOp::Clz32, + (Opcode::Clz, I64) => BitOp::Clz64, + (Opcode::Cls, I32) => BitOp::Cls32, + (Opcode::Cls, I64) => BitOp::Cls64, + _ => unreachable!("Called with non-bit op!"), + } + } +} + +/// Instruction formats. +#[derive(Clone, Debug)] +pub enum Inst { + /// A no-op of zero size. + Nop0, + + /// A no-op that is one instruction large. + Nop4, + + /// An ALU operation with two register sources and a register destination. + AluRRR { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + rm: Reg, + }, + /// An ALU operation with three register sources and a register destination. + AluRRRR { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + rm: Reg, + ra: Reg, + }, + /// An ALU operation with a register source and an immediate-12 source, and a register + /// destination. + AluRRImm12 { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + imm12: Imm12, + }, + /// An ALU operation with a register source and an immediate-logic source, and a register destination. + AluRRImmLogic { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + imml: ImmLogic, + }, + /// An ALU operation with a register source and an immediate-shiftamt source, and a register destination. + AluRRImmShift { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + immshift: ImmShift, + }, + /// An ALU operation with two register sources, one of which can be shifted, and a register + /// destination. + AluRRRShift { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + rm: Reg, + shiftop: ShiftOpAndAmt, + }, + /// An ALU operation with two register sources, one of which can be {zero,sign}-extended and + /// shifted, and a register destination. + AluRRRExtend { + alu_op: ALUOp, + rd: Writable, + rn: Reg, + rm: Reg, + extendop: ExtendOp, + }, + + /// A bit op instruction with a single register source. + BitRR { + op: BitOp, + rd: Writable, + rn: Reg, + }, + + /// An unsigned (zero-extending) 8-bit load. + ULoad8 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// A signed (sign-extending) 8-bit load. + SLoad8 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// An unsigned (zero-extending) 16-bit load. + ULoad16 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// A signed (sign-extending) 16-bit load. + SLoad16 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// An unsigned (zero-extending) 32-bit load. + ULoad32 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// A signed (sign-extending) 32-bit load. + SLoad32 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// A 64-bit load. + ULoad64 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + + /// An 8-bit store. 
+ Store8 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + /// A 16-bit store. + Store16 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + /// A 32-bit store. + Store32 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + /// A 64-bit store. + Store64 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + + /// A store of a pair of registers. + StoreP64 { + rt: Reg, + rt2: Reg, + mem: PairMemArg, + }, + /// A load of a pair of registers. + LoadP64 { + rt: Writable, + rt2: Writable, + mem: PairMemArg, + }, + + /// A MOV instruction. These are encoded as ORR's (AluRRR form) but we + /// keep them separate at the `Inst` level for better pretty-printing + /// and faster `is_move()` logic. + Mov { + rd: Writable, + rm: Reg, + }, + + /// A 32-bit MOV. Zeroes the top 32 bits of the destination. This is + /// effectively an alias for an unsigned 32-to-64-bit extension. + Mov32 { + rd: Writable, + rm: Reg, + }, + + /// A MOVZ with a 16-bit immediate. + MovZ { + rd: Writable, + imm: MoveWideConst, + }, + + /// A MOVN with a 16-bit immediate. + MovN { + rd: Writable, + imm: MoveWideConst, + }, + + /// A MOVK with a 16-bit immediate. + MovK { + rd: Writable, + imm: MoveWideConst, + }, + + /// A sign- or zero-extend operation. + Extend { + rd: Writable, + rn: Reg, + signed: bool, + from_bits: u8, + to_bits: u8, + }, + + /// A conditional-select operation. + CSel { + rd: Writable, + cond: Cond, + rn: Reg, + rm: Reg, + }, + + /// A conditional-set operation. + CSet { + rd: Writable, + cond: Cond, + }, + + /// FPU move. Note that this is distinct from a vector-register + /// move; moving just 64 bits seems to be significantly faster. + FpuMove64 { + rd: Writable, + rn: Reg, + }, + + /// 1-op FPU instruction. + FpuRR { + fpu_op: FPUOp1, + rd: Writable, + rn: Reg, + }, + + /// 2-op FPU instruction. + FpuRRR { + fpu_op: FPUOp2, + rd: Writable, + rn: Reg, + rm: Reg, + }, + + /// 3-op FPU instruction. + FpuRRRR { + fpu_op: FPUOp3, + rd: Writable, + rn: Reg, + rm: Reg, + ra: Reg, + }, + + /// FPU comparison, single-precision (32 bit). + FpuCmp32 { + rn: Reg, + rm: Reg, + }, + + /// FPU comparison, double-precision (64 bit). + FpuCmp64 { + rn: Reg, + rm: Reg, + }, + + /// Floating-point load, single-precision (32 bit). + FpuLoad32 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// Floating-point store, single-precision (32 bit). + FpuStore32 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + /// Floating-point load, double-precision (64 bit). + FpuLoad64 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// Floating-point store, double-precision (64 bit). + FpuStore64 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + /// Floating-point/vector load, 128 bit. + FpuLoad128 { + rd: Writable, + mem: MemArg, + srcloc: Option, + }, + /// Floating-point/vector store, 128 bit. + FpuStore128 { + rd: Reg, + mem: MemArg, + srcloc: Option, + }, + + LoadFpuConst32 { + rd: Writable, + const_data: f32, + }, + + LoadFpuConst64 { + rd: Writable, + const_data: f64, + }, + + /// Conversion: FP -> integer. + FpuToInt { + op: FpuToIntOp, + rd: Writable, + rn: Reg, + }, + + /// Conversion: integer -> FP. + IntToFpu { + op: IntToFpuOp, + rd: Writable, + rn: Reg, + }, + + /// FP conditional select, 32 bit. + FpuCSel32 { + rd: Writable, + rn: Reg, + rm: Reg, + cond: Cond, + }, + /// FP conditional select, 64 bit. + FpuCSel64 { + rd: Writable, + rn: Reg, + rm: Reg, + cond: Cond, + }, + + /// Round to integer. 
+ FpuRound { + op: FpuRoundMode, + rd: Writable, + rn: Reg, + }, + + /// Move to a vector register from a GPR. + MovToVec64 { + rd: Writable, + rn: Reg, + }, + + /// Move to a GPR from a vector register. + MovFromVec64 { + rd: Writable, + rn: Reg, + }, + + /// A vector ALU op. + VecRRR { + alu_op: VecALUOp, + rd: Writable, + rn: Reg, + rm: Reg, + }, + + /// Move to the NZCV flags (actually a `MSR NZCV, Xn` insn). + MovToNZCV { + rn: Reg, + }, + + /// Move from the NZCV flags (actually a `MRS Xn, NZCV` insn). + MovFromNZCV { + rd: Writable, + }, + + /// Set a register to 1 if condition, else 0. + CondSet { + rd: Writable, + cond: Cond, + }, + + /// A machine call instruction. + Call { + dest: ExternalName, + uses: Set, + defs: Set>, + loc: SourceLoc, + opcode: Opcode, + }, + /// A machine indirect-call instruction. + CallInd { + rn: Reg, + uses: Set, + defs: Set>, + loc: SourceLoc, + opcode: Opcode, + }, + + // ---- branches (exactly one must appear at end of BB) ---- + /// A machine return instruction. + Ret, + + /// A placeholder instruction, generating no code, meaning that a function epilogue must be + /// inserted there. + EpiloguePlaceholder, + + /// An unconditional branch. + Jump { + dest: BranchTarget, + }, + + /// A conditional branch. + CondBr { + taken: BranchTarget, + not_taken: BranchTarget, + kind: CondBrKind, + }, + + /// Lowered conditional branch: contains the original branch kind (or the + /// inverse), but only one BranchTarget is retained. The other is + /// implicitly the next instruction, given the final basic-block layout. + CondBrLowered { + target: BranchTarget, + kind: CondBrKind, + }, + + /// As for `CondBrLowered`, but represents a condbr/uncond-br sequence (two + /// actual machine instructions). Needed when the final block layout implies + /// that neither arm of a conditional branch targets the fallthrough block. + CondBrLoweredCompound { + taken: BranchTarget, + not_taken: BranchTarget, + kind: CondBrKind, + }, + + /// An indirect branch through a register, augmented with set of all + /// possible successors. + IndirectBr { + rn: Reg, + targets: Vec, + }, + + /// A "break" instruction, used for e.g. traps and debug breakpoints. + Brk, + + /// An instruction guaranteed to always be undefined and to trigger an illegal instruction at + /// runtime. + Udf { + trap_info: (SourceLoc, TrapCode), + }, + + /// Load the address (using a PC-relative offset) of a MemLabel, using the + /// `ADR` instruction. + Adr { + rd: Writable, + label: MemLabel, + }, + + /// Raw 32-bit word, used for inline constants and jump-table entries. + Word4 { + data: u32, + }, + + /// Raw 64-bit word, used for inline constants. + Word8 { + data: u64, + }, + + /// Jump-table sequence, as one compound instruction (see note in lower.rs + /// for rationale). + JTSequence { + targets: Vec, + targets_for_term: Vec, // needed for MachTerminator. + ridx: Reg, + rtmp1: Writable, + rtmp2: Writable, + }, + + /// Load an inline constant. + LoadConst64 { + rd: Writable, + const_data: u64, + }, + + /// Load an inline symbol reference. + LoadExtName { + rd: Writable, + name: ExternalName, + srcloc: SourceLoc, + offset: i64, + }, +} + +fn count_zero_half_words(mut value: u64) -> usize { + let mut count = 0; + for _ in 0..4 { + if value & 0xffff == 0 { + count += 1; + } + value >>= 16; + } + + count +} + +impl Inst { + /// Create a move instruction. 
+ pub fn mov(to_reg: Writable, from_reg: Reg) -> Inst { + assert!(to_reg.to_reg().get_class() == from_reg.get_class()); + if from_reg.get_class() == RegClass::I64 { + Inst::Mov { + rd: to_reg, + rm: from_reg, + } + } else { + Inst::FpuMove64 { + rd: to_reg, + rn: from_reg, + } + } + } + + /// Create a 32-bit move instruction. + pub fn mov32(to_reg: Writable, from_reg: Reg) -> Inst { + Inst::Mov32 { + rd: to_reg, + rm: from_reg, + } + } + + /// Create an instruction that loads a constant, using one of serveral options (MOVZ, MOVN, + /// logical immediate, or constant pool). + pub fn load_constant(rd: Writable, value: u64) -> SmallVec<[Inst; 4]> { + if let Some(imm) = MoveWideConst::maybe_from_u64(value) { + // 16-bit immediate (shifted by 0, 16, 32 or 48 bits) in MOVZ + smallvec![Inst::MovZ { rd, imm }] + } else if let Some(imm) = MoveWideConst::maybe_from_u64(!value) { + // 16-bit immediate (shifted by 0, 16, 32 or 48 bits) in MOVN + smallvec![Inst::MovN { rd, imm }] + } else if let Some(imml) = ImmLogic::maybe_from_u64(value, I64) { + // Weird logical-instruction immediate in ORI using zero register + smallvec![Inst::AluRRImmLogic { + alu_op: ALUOp::Orr64, + rd, + rn: zero_reg(), + imml, + }] + } else { + let mut insts = smallvec![]; + + // If the number of 0xffff half words is greater than the number of 0x0000 half words + // it is more efficient to use `movn` for the first instruction. + let first_is_inverted = count_zero_half_words(!value) > count_zero_half_words(value); + // Either 0xffff or 0x0000 half words can be skipped, depending on the first + // instruction used. + let ignored_halfword = if first_is_inverted { 0xffff } else { 0 }; + let mut first_mov_emitted = false; + + for i in 0..4 { + let imm16 = (value >> (16 * i)) & 0xffff; + if imm16 != ignored_halfword { + if !first_mov_emitted { + first_mov_emitted = true; + if first_is_inverted { + let imm = + MoveWideConst::maybe_with_shift(((!imm16) & 0xffff) as u16, i * 16) + .unwrap(); + insts.push(Inst::MovN { rd, imm }); + } else { + let imm = + MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap(); + insts.push(Inst::MovZ { rd, imm }); + } + } else { + let imm = MoveWideConst::maybe_with_shift(imm16 as u16, i * 16).unwrap(); + insts.push(Inst::MovK { rd, imm }); + } + } + } + + assert!(first_mov_emitted); + + insts + } + } + + /// Create an instruction that loads a 32-bit floating-point constant. + pub fn load_fp_constant32(rd: Writable, value: f32) -> Inst { + // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits. + Inst::LoadFpuConst32 { + rd, + const_data: value, + } + } + + /// Create an instruction that loads a 64-bit floating-point constant. + pub fn load_fp_constant64(rd: Writable, value: f64) -> Inst { + // TODO: use FMOV immediate form when `value` has sufficiently few mantissa/exponent bits. + Inst::LoadFpuConst64 { + rd, + const_data: value, + } + } +} + +//============================================================================= +// Instructions: get_regs + +fn memarg_regs(memarg: &MemArg, collector: &mut RegUsageCollector) { + match memarg { + &MemArg::Unscaled(reg, ..) | &MemArg::UnsignedOffset(reg, ..) => { + collector.add_use(reg); + } + &MemArg::RegReg(r1, r2, ..) + | &MemArg::RegScaled(r1, r2, ..) + | &MemArg::RegScaledExtended(r1, r2, ..) => { + collector.add_use(r1); + collector.add_use(r2); + } + &MemArg::Label(..) => {} + &MemArg::PreIndexed(reg, ..) | &MemArg::PostIndexed(reg, ..) => { + collector.add_mod(reg); + } + &MemArg::FPOffset(..) 
=> { + collector.add_use(fp_reg()); + } + &MemArg::SPOffset(..) => { + collector.add_use(stack_reg()); + } + } +} + +fn pairmemarg_regs(pairmemarg: &PairMemArg, collector: &mut RegUsageCollector) { + match pairmemarg { + &PairMemArg::SignedOffset(reg, ..) => { + collector.add_use(reg); + } + &PairMemArg::PreIndexed(reg, ..) | &PairMemArg::PostIndexed(reg, ..) => { + collector.add_mod(reg); + } + } +} + +fn aarch64_get_regs(inst: &Inst, collector: &mut RegUsageCollector) { + match inst { + &Inst::AluRRR { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::AluRRRR { rd, rn, rm, ra, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + collector.add_use(ra); + } + &Inst::AluRRImm12 { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRRImmLogic { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRRImmShift { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::AluRRRShift { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::AluRRRExtend { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::BitRR { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::ULoad8 { rd, ref mem, .. } + | &Inst::SLoad8 { rd, ref mem, .. } + | &Inst::ULoad16 { rd, ref mem, .. } + | &Inst::SLoad16 { rd, ref mem, .. } + | &Inst::ULoad32 { rd, ref mem, .. } + | &Inst::SLoad32 { rd, ref mem, .. } + | &Inst::ULoad64 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::Store8 { rd, ref mem, .. } + | &Inst::Store16 { rd, ref mem, .. } + | &Inst::Store32 { rd, ref mem, .. } + | &Inst::Store64 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::StoreP64 { + rt, rt2, ref mem, .. + } => { + collector.add_use(rt); + collector.add_use(rt2); + pairmemarg_regs(mem, collector); + } + &Inst::LoadP64 { + rt, rt2, ref mem, .. + } => { + collector.add_def(rt); + collector.add_def(rt2); + pairmemarg_regs(mem, collector); + } + &Inst::Mov { rd, rm } => { + collector.add_def(rd); + collector.add_use(rm); + } + &Inst::Mov32 { rd, rm } => { + collector.add_def(rd); + collector.add_use(rm); + } + &Inst::MovZ { rd, .. } | &Inst::MovN { rd, .. } => { + collector.add_def(rd); + } + &Inst::MovK { rd, .. } => { + collector.add_mod(rd); + } + &Inst::CSel { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::CSet { rd, .. } => { + collector.add_def(rd); + } + &Inst::FpuMove64 { rd, rn } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuRR { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuRRR { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::FpuRRRR { rd, rn, rm, ra, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + collector.add_use(ra); + } + &Inst::FpuCmp32 { rn, rm } | &Inst::FpuCmp64 { rn, rm } => { + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::FpuLoad32 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::FpuLoad64 { rd, ref mem, .. } => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::FpuLoad128 { rd, ref mem, .. 
} => { + collector.add_def(rd); + memarg_regs(mem, collector); + } + &Inst::FpuStore32 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::FpuStore64 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::FpuStore128 { rd, ref mem, .. } => { + collector.add_use(rd); + memarg_regs(mem, collector); + } + &Inst::LoadFpuConst32 { rd, .. } | &Inst::LoadFpuConst64 { rd, .. } => { + collector.add_def(rd); + } + &Inst::FpuToInt { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::IntToFpu { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::FpuCSel32 { rd, rn, rm, .. } | &Inst::FpuCSel64 { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::FpuRound { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::MovToVec64 { rd, rn } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::MovFromVec64 { rd, rn } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::VecRRR { rd, rn, rm, .. } => { + collector.add_def(rd); + collector.add_use(rn); + collector.add_use(rm); + } + &Inst::MovToNZCV { rn } => { + collector.add_use(rn); + } + &Inst::MovFromNZCV { rd } => { + collector.add_def(rd); + } + &Inst::CondSet { rd, .. } => { + collector.add_def(rd); + } + &Inst::Extend { rd, rn, .. } => { + collector.add_def(rd); + collector.add_use(rn); + } + &Inst::Jump { .. } | &Inst::Ret | &Inst::EpiloguePlaceholder => {} + &Inst::Call { + ref uses, ref defs, .. + } => { + collector.add_uses(uses); + collector.add_defs(defs); + } + &Inst::CallInd { + ref uses, + ref defs, + rn, + .. + } => { + collector.add_uses(uses); + collector.add_defs(defs); + collector.add_use(rn); + } + &Inst::CondBr { ref kind, .. } + | &Inst::CondBrLowered { ref kind, .. } + | &Inst::CondBrLoweredCompound { ref kind, .. } => match kind { + CondBrKind::Zero(rt) | CondBrKind::NotZero(rt) => { + collector.add_use(*rt); + } + CondBrKind::Cond(_) => {} + }, + &Inst::IndirectBr { rn, .. } => { + collector.add_use(rn); + } + &Inst::Nop0 | Inst::Nop4 => {} + &Inst::Brk => {} + &Inst::Udf { .. } => {} + &Inst::Adr { rd, .. } => { + collector.add_def(rd); + } + &Inst::Word4 { .. } | &Inst::Word8 { .. } => {} + &Inst::JTSequence { + ridx, rtmp1, rtmp2, .. + } => { + collector.add_use(ridx); + collector.add_def(rtmp1); + collector.add_def(rtmp2); + } + &Inst::LoadConst64 { rd, .. } | &Inst::LoadExtName { rd, .. } => { + collector.add_def(rd); + } + } +} + +//============================================================================= +// Instructions: map_regs + +fn aarch64_map_regs( + inst: &mut Inst, + pre_map: &RegallocMap, + post_map: &RegallocMap, +) { + fn map(m: &RegallocMap, r: &mut Reg) { + if r.is_virtual() { + let new = m.get(&r.to_virtual_reg()).cloned().unwrap().to_reg(); + *r = new; + } + } + + fn map_wr(m: &RegallocMap, r: &mut Writable) { + let mut reg = r.to_reg(); + map(m, &mut reg); + *r = Writable::from_reg(reg); + } + + fn map_mem(u: &RegallocMap, mem: &mut MemArg) { + // N.B.: we take only the pre-map here, but this is OK because the + // only addressing modes that update registers (pre/post-increment on + // AArch64) both read and write registers, so they are "mods" rather + // than "defs", so must be the same in both the pre- and post-map. + match mem { + &mut MemArg::Unscaled(ref mut reg, ..) => map(u, reg), + &mut MemArg::UnsignedOffset(ref mut reg, ..) 
=> map(u, reg), + &mut MemArg::RegReg(ref mut r1, ref mut r2) => { + map(u, r1); + map(u, r2); + } + &mut MemArg::RegScaled(ref mut r1, ref mut r2, ..) => { + map(u, r1); + map(u, r2); + } + &mut MemArg::RegScaledExtended(ref mut r1, ref mut r2, ..) => { + map(u, r1); + map(u, r2); + } + &mut MemArg::Label(..) => {} + &mut MemArg::PreIndexed(ref mut r, ..) => map_wr(u, r), + &mut MemArg::PostIndexed(ref mut r, ..) => map_wr(u, r), + &mut MemArg::FPOffset(..) | &mut MemArg::SPOffset(..) => {} + }; + } + + fn map_pairmem(u: &RegallocMap, mem: &mut PairMemArg) { + match mem { + &mut PairMemArg::SignedOffset(ref mut reg, ..) => map(u, reg), + &mut PairMemArg::PreIndexed(ref mut reg, ..) => map_wr(u, reg), + &mut PairMemArg::PostIndexed(ref mut reg, ..) => map_wr(u, reg), + } + } + + fn map_br(u: &RegallocMap, br: &mut CondBrKind) { + match br { + &mut CondBrKind::Zero(ref mut reg) => map(u, reg), + &mut CondBrKind::NotZero(ref mut reg) => map(u, reg), + &mut CondBrKind::Cond(..) => {} + }; + } + + let u = pre_map; // For brevity below. + let d = post_map; + + match inst { + &mut Inst::AluRRR { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::AluRRRR { + ref mut rd, + ref mut rn, + ref mut rm, + ref mut ra, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + map(u, ra); + } + &mut Inst::AluRRImm12 { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::AluRRImmLogic { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::AluRRImmShift { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::AluRRRShift { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::AluRRRExtend { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::BitRR { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::ULoad8 { + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + &mut Inst::SLoad8 { + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + &mut Inst::ULoad16 { + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + &mut Inst::SLoad16 { + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + &mut Inst::ULoad32 { + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + &mut Inst::SLoad32 { + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + + &mut Inst::ULoad64 { + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + &mut Inst::Store8 { + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } + &mut Inst::Store16 { + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } + &mut Inst::Store32 { + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } + &mut Inst::Store64 { + ref mut rd, + ref mut mem, + .. 
+ } => { + map(u, rd); + map_mem(u, mem); + } + + &mut Inst::StoreP64 { + ref mut rt, + ref mut rt2, + ref mut mem, + } => { + map(u, rt); + map(u, rt2); + map_pairmem(u, mem); + } + &mut Inst::LoadP64 { + ref mut rt, + ref mut rt2, + ref mut mem, + } => { + map_wr(d, rt); + map_wr(d, rt2); + map_pairmem(u, mem); + } + &mut Inst::Mov { + ref mut rd, + ref mut rm, + } => { + map_wr(d, rd); + map(u, rm); + } + &mut Inst::Mov32 { + ref mut rd, + ref mut rm, + } => { + map_wr(d, rd); + map(u, rm); + } + &mut Inst::MovZ { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::MovN { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::MovK { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::CSel { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::CSet { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::FpuMove64 { + ref mut rd, + ref mut rn, + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::FpuRR { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::FpuRRR { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::FpuRRRR { + ref mut rd, + ref mut rn, + ref mut rm, + ref mut ra, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + map(u, ra); + } + &mut Inst::FpuCmp32 { + ref mut rn, + ref mut rm, + } => { + map(u, rn); + map(u, rm); + } + &mut Inst::FpuCmp64 { + ref mut rn, + ref mut rm, + } => { + map(u, rn); + map(u, rm); + } + &mut Inst::FpuLoad32 { + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + &mut Inst::FpuLoad64 { + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + &mut Inst::FpuLoad128 { + ref mut rd, + ref mut mem, + .. + } => { + map_wr(d, rd); + map_mem(u, mem); + } + &mut Inst::FpuStore32 { + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } + &mut Inst::FpuStore64 { + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } + &mut Inst::FpuStore128 { + ref mut rd, + ref mut mem, + .. + } => { + map(u, rd); + map_mem(u, mem); + } + &mut Inst::LoadFpuConst32 { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::LoadFpuConst64 { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::FpuToInt { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::IntToFpu { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::FpuCSel32 { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::FpuCSel64 { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::FpuRound { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::MovToVec64 { + ref mut rd, + ref mut rn, + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::MovFromVec64 { + ref mut rd, + ref mut rn, + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::VecRRR { + ref mut rd, + ref mut rn, + ref mut rm, + .. + } => { + map_wr(d, rd); + map(u, rn); + map(u, rm); + } + &mut Inst::MovToNZCV { ref mut rn } => { + map(u, rn); + } + &mut Inst::MovFromNZCV { ref mut rd } => { + map_wr(d, rd); + } + &mut Inst::CondSet { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::Extend { + ref mut rd, + ref mut rn, + .. + } => { + map_wr(d, rd); + map(u, rn); + } + &mut Inst::Jump { .. 
} => {} + &mut Inst::Call { + ref mut uses, + ref mut defs, + .. + } => { + // TODO: add `map_mut()` to regalloc.rs's Set. + let new_uses = uses.map(|r| { + let mut r = *r; + map(u, &mut r); + r + }); + let new_defs = defs.map(|r| { + let mut r = *r; + map_wr(d, &mut r); + r + }); + *uses = new_uses; + *defs = new_defs; + } + &mut Inst::Ret | &mut Inst::EpiloguePlaceholder => {} + &mut Inst::CallInd { + ref mut uses, + ref mut defs, + ref mut rn, + .. + } => { + // TODO: add `map_mut()` to regalloc.rs's Set. + let new_uses = uses.map(|r| { + let mut r = *r; + map(u, &mut r); + r + }); + let new_defs = defs.map(|r| { + let mut r = *r; + map_wr(d, &mut r); + r + }); + *uses = new_uses; + *defs = new_defs; + map(u, rn); + } + &mut Inst::CondBr { ref mut kind, .. } => { + map_br(u, kind); + } + &mut Inst::CondBrLowered { ref mut kind, .. } => { + map_br(u, kind); + } + &mut Inst::CondBrLoweredCompound { ref mut kind, .. } => { + map_br(u, kind); + } + &mut Inst::IndirectBr { ref mut rn, .. } => { + map(u, rn); + } + &mut Inst::Nop0 | &mut Inst::Nop4 | &mut Inst::Brk | &mut Inst::Udf { .. } => {} + &mut Inst::Adr { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::Word4 { .. } | &mut Inst::Word8 { .. } => {} + &mut Inst::JTSequence { + ref mut ridx, + ref mut rtmp1, + ref mut rtmp2, + .. + } => { + map(u, ridx); + map_wr(d, rtmp1); + map_wr(d, rtmp2); + } + &mut Inst::LoadConst64 { ref mut rd, .. } => { + map_wr(d, rd); + } + &mut Inst::LoadExtName { ref mut rd, .. } => { + map_wr(d, rd); + } + } +} + +//============================================================================= +// Instructions: misc functions and external interface + +impl MachInst for Inst { + fn get_regs(&self, collector: &mut RegUsageCollector) { + aarch64_get_regs(self, collector) + } + + fn map_regs( + &mut self, + pre_map: &RegallocMap, + post_map: &RegallocMap, + ) { + aarch64_map_regs(self, pre_map, post_map); + } + + fn is_move(&self) -> Option<(Writable, Reg)> { + match self { + &Inst::Mov { rd, rm } => Some((rd, rm)), + &Inst::FpuMove64 { rd, rn } => Some((rd, rn)), + _ => None, + } + } + + fn is_epilogue_placeholder(&self) -> bool { + if let Inst::EpiloguePlaceholder = self { + true + } else { + false + } + } + + fn is_term<'a>(&'a self) -> MachTerminator<'a> { + match self { + &Inst::Ret | &Inst::EpiloguePlaceholder => MachTerminator::Ret, + &Inst::Jump { dest } => MachTerminator::Uncond(dest.as_block_index().unwrap()), + &Inst::CondBr { + taken, not_taken, .. + } => MachTerminator::Cond( + taken.as_block_index().unwrap(), + not_taken.as_block_index().unwrap(), + ), + &Inst::CondBrLowered { .. } => { + // When this is used prior to branch finalization for branches + // within an open-coded sequence, i.e. with ResolvedOffsets, + // do not consider it a terminator. From the point of view of CFG analysis, + // it is part of a black-box single-in single-out region, hence is not + // denoted a terminator. + MachTerminator::None + } + &Inst::CondBrLoweredCompound { .. } => { + panic!("is_term() called after lowering branches"); + } + &Inst::IndirectBr { ref targets, .. } => MachTerminator::Indirect(&targets[..]), + &Inst::JTSequence { + ref targets_for_term, + .. + } => MachTerminator::Indirect(&targets_for_term[..]), + _ => MachTerminator::None, + } + } + + fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Inst { + assert!(ty.bits() <= 64); // no vector support yet! 
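+        // `Inst::mov` picks the integer or FPU form based on the register
+        // class; `ty` is not needed beyond the width check above.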
+ Inst::mov(to_reg, from_reg) + } + + fn gen_zero_len_nop() -> Inst { + Inst::Nop0 + } + + fn gen_nop(preferred_size: usize) -> Inst { + // We can't give a NOP (or any insn) < 4 bytes. + assert!(preferred_size >= 4); + Inst::Nop4 + } + + fn maybe_direct_reload(&self, _reg: VirtualReg, _slot: SpillSlot) -> Option { + None + } + + fn rc_for_type(ty: Type) -> RegClass { + match ty { + I8 | I16 | I32 | I64 | B1 | B8 | B16 | B32 | B64 => RegClass::I64, + F32 | F64 => RegClass::V128, + IFLAGS | FFLAGS => RegClass::I64, + _ => panic!("Unexpected SSA-value type: {}", ty), + } + } + + fn gen_jump(blockindex: BlockIndex) -> Inst { + Inst::Jump { + dest: BranchTarget::Block(blockindex), + } + } + + fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]) { + match self { + &mut Inst::Jump { ref mut dest } => { + dest.map(block_target_map); + } + &mut Inst::CondBr { + ref mut taken, + ref mut not_taken, + .. + } => { + taken.map(block_target_map); + not_taken.map(block_target_map); + } + &mut Inst::CondBrLowered { .. } => { + // See note in `is_term()`: this is used in open-coded sequences + // within blocks and should be left alone. + } + &mut Inst::CondBrLoweredCompound { .. } => { + panic!("with_block_rewrites called after branch lowering!"); + } + _ => {} + } + } + + fn with_fallthrough_block(&mut self, fallthrough: Option) { + match self { + &mut Inst::CondBr { + taken, + not_taken, + kind, + } => { + if taken.as_block_index() == fallthrough + && not_taken.as_block_index() == fallthrough + { + *self = Inst::Nop0; + } else if taken.as_block_index() == fallthrough { + *self = Inst::CondBrLowered { + target: not_taken, + kind: kind.invert(), + }; + } else if not_taken.as_block_index() == fallthrough { + *self = Inst::CondBrLowered { + target: taken, + kind, + }; + } else { + // We need a compound sequence (condbr / uncond-br). + *self = Inst::CondBrLoweredCompound { + taken, + not_taken, + kind, + }; + } + } + &mut Inst::Jump { dest } => { + if dest.as_block_index() == fallthrough { + *self = Inst::Nop0; + } + } + _ => {} + } + } + + fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]) { + match self { + &mut Inst::CondBrLowered { ref mut target, .. } => { + target.lower(targets, my_offset); + } + &mut Inst::CondBrLoweredCompound { + ref mut taken, + ref mut not_taken, + .. + } => { + taken.lower(targets, my_offset); + not_taken.lower(targets, my_offset + 4); + } + &mut Inst::Jump { ref mut dest } => { + dest.lower(targets, my_offset); + } + &mut Inst::JTSequence { + targets: ref mut t, .. + } => { + for target in t { + // offset+20: jumptable is 20 bytes into compound sequence. + target.lower(targets, my_offset + 20); + } + } + _ => {} + } + } + + fn reg_universe() -> RealRegUniverse { + create_reg_universe() + } +} + +//============================================================================= +// Pretty-printing of instructions. 
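+//
+// The `show_rru` implementations below print instructions in a syntax close to
+// standard AArch64 assembly; for example, an `AluRRR` with `ALUOp::Add64` and
+// registers x0, x1, x2 is shown as "add x0, x1, x2".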
+ +fn mem_finalize_for_show(mem: &MemArg, mb_rru: Option<&RealRegUniverse>) -> (String, MemArg) { + let (mem_insts, mem) = mem_finalize(0, mem); + let mut mem_str = mem_insts + .into_iter() + .map(|inst| inst.show_rru(mb_rru)) + .collect::>() + .join(" ; "); + if !mem_str.is_empty() { + mem_str += " ; "; + } + + (mem_str, mem) +} + +impl ShowWithRRU for Inst { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + fn op_name_size(alu_op: ALUOp) -> (&'static str, InstSize) { + match alu_op { + ALUOp::Add32 => ("add", InstSize::Size32), + ALUOp::Add64 => ("add", InstSize::Size64), + ALUOp::Sub32 => ("sub", InstSize::Size32), + ALUOp::Sub64 => ("sub", InstSize::Size64), + ALUOp::Orr32 => ("orr", InstSize::Size32), + ALUOp::Orr64 => ("orr", InstSize::Size64), + ALUOp::And32 => ("and", InstSize::Size32), + ALUOp::And64 => ("and", InstSize::Size64), + ALUOp::Eor32 => ("eor", InstSize::Size32), + ALUOp::Eor64 => ("eor", InstSize::Size64), + ALUOp::AddS32 => ("adds", InstSize::Size32), + ALUOp::AddS64 => ("adds", InstSize::Size64), + ALUOp::SubS32 => ("subs", InstSize::Size32), + ALUOp::SubS64 => ("subs", InstSize::Size64), + ALUOp::MAdd32 => ("madd", InstSize::Size32), + ALUOp::MAdd64 => ("madd", InstSize::Size64), + ALUOp::MSub32 => ("msub", InstSize::Size32), + ALUOp::MSub64 => ("msub", InstSize::Size64), + ALUOp::SMulH => ("smulh", InstSize::Size64), + ALUOp::UMulH => ("umulh", InstSize::Size64), + ALUOp::SDiv64 => ("sdiv", InstSize::Size64), + ALUOp::UDiv64 => ("udiv", InstSize::Size64), + ALUOp::AndNot32 => ("bic", InstSize::Size32), + ALUOp::AndNot64 => ("bic", InstSize::Size64), + ALUOp::OrrNot32 => ("orn", InstSize::Size32), + ALUOp::OrrNot64 => ("orn", InstSize::Size64), + ALUOp::EorNot32 => ("eon", InstSize::Size32), + ALUOp::EorNot64 => ("eon", InstSize::Size64), + ALUOp::RotR32 => ("ror", InstSize::Size32), + ALUOp::RotR64 => ("ror", InstSize::Size64), + ALUOp::Lsr32 => ("lsr", InstSize::Size32), + ALUOp::Lsr64 => ("lsr", InstSize::Size64), + ALUOp::Asr32 => ("asr", InstSize::Size32), + ALUOp::Asr64 => ("asr", InstSize::Size64), + ALUOp::Lsl32 => ("lsl", InstSize::Size32), + ALUOp::Lsl64 => ("lsl", InstSize::Size64), + } + } + + match self { + &Inst::Nop0 => "nop-zero-len".to_string(), + &Inst::Nop4 => "nop".to_string(), + &Inst::AluRRR { alu_op, rd, rn, rm } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra, + } => { + let (op, size) = op_name_size(alu_op); + let four_args = alu_op != ALUOp::SMulH && alu_op != ALUOp::UMulH; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); + let ra = show_ireg_sized(ra, mb_rru, size); + if four_args { + format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra) + } else { + // smulh and umulh have Ra "hard-wired" to the zero register + // and the canonical assembly form has only three regs. + format!("{} {}, {}, {}", op, rd, rn, rm) + } + } + &Inst::AluRRImm12 { + alu_op, + rd, + rn, + ref imm12, + } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + + if imm12.bits == 0 && alu_op == ALUOp::Add64 { + // special-case MOV (used for moving into SP). 
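+                // ("mov rd, rn" involving SP is an alias of "add rd, rn, #0".)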
+ format!("mov {}, {}", rd, rn) + } else { + let imm12 = imm12.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, imm12) + } + } + &Inst::AluRRImmLogic { + alu_op, + rd, + rn, + ref imml, + } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let imml = imml.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, imml) + } + &Inst::AluRRImmShift { + alu_op, + rd, + rn, + ref immshift, + } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let immshift = immshift.show_rru(mb_rru); + format!("{} {}, {}, {}", op, rd, rn, immshift) + } + &Inst::AluRRRShift { + alu_op, + rd, + rn, + rm, + ref shiftop, + } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); + let shiftop = shiftop.show_rru(mb_rru); + format!("{} {}, {}, {}, {}", op, rd, rn, rm, shiftop) + } + &Inst::AluRRRExtend { + alu_op, + rd, + rn, + rm, + ref extendop, + } => { + let (op, size) = op_name_size(alu_op); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + let rm = show_ireg_sized(rm, mb_rru, size); + let extendop = extendop.show_rru(mb_rru); + format!("{} {}, {}, {}, {}", op, rd, rn, rm, extendop) + } + &Inst::BitRR { op, rd, rn } => { + let size = op.inst_size(); + let op = op.op_str(); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let rn = show_ireg_sized(rn, mb_rru, size); + format!("{} {}, {}", op, rd, rn) + } + &Inst::ULoad8 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::SLoad8 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::ULoad16 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::SLoad16 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::ULoad32 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::SLoad32 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::ULoad64 { + rd, + ref mem, + srcloc: _srcloc, + .. + } => { + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru); + + let is_unscaled = match &mem { + &MemArg::Unscaled(..) => true, + _ => false, + }; + let (op, size) = match (self, is_unscaled) { + (&Inst::ULoad8 { .. }, false) => ("ldrb", InstSize::Size32), + (&Inst::ULoad8 { .. }, true) => ("ldurb", InstSize::Size32), + (&Inst::SLoad8 { .. }, false) => ("ldrsb", InstSize::Size64), + (&Inst::SLoad8 { .. }, true) => ("ldursb", InstSize::Size64), + (&Inst::ULoad16 { .. }, false) => ("ldrh", InstSize::Size32), + (&Inst::ULoad16 { .. }, true) => ("ldurh", InstSize::Size32), + (&Inst::SLoad16 { .. }, false) => ("ldrsh", InstSize::Size64), + (&Inst::SLoad16 { .. }, true) => ("ldursh", InstSize::Size64), + (&Inst::ULoad32 { .. }, false) => ("ldr", InstSize::Size32), + (&Inst::ULoad32 { .. }, true) => ("ldur", InstSize::Size32), + (&Inst::SLoad32 { .. }, false) => ("ldrsw", InstSize::Size64), + (&Inst::SLoad32 { .. }, true) => ("ldursw", InstSize::Size64), + (&Inst::ULoad64 { .. }, false) => ("ldr", InstSize::Size64), + (&Inst::ULoad64 { .. 
}, true) => ("ldur", InstSize::Size64), + _ => unreachable!(), + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, size); + let mem = mem.show_rru(mb_rru); + format!("{}{} {}, {}", mem_str, op, rd, mem) + } + &Inst::Store8 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::Store16 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::Store32 { + rd, + ref mem, + srcloc: _srcloc, + } + | &Inst::Store64 { + rd, + ref mem, + srcloc: _srcloc, + .. + } => { + let (mem_str, mem) = mem_finalize_for_show(mem, mb_rru); + + let is_unscaled = match &mem { + &MemArg::Unscaled(..) => true, + _ => false, + }; + let (op, size) = match (self, is_unscaled) { + (&Inst::Store8 { .. }, false) => ("strb", InstSize::Size32), + (&Inst::Store8 { .. }, true) => ("sturb", InstSize::Size32), + (&Inst::Store16 { .. }, false) => ("strh", InstSize::Size32), + (&Inst::Store16 { .. }, true) => ("sturh", InstSize::Size32), + (&Inst::Store32 { .. }, false) => ("str", InstSize::Size32), + (&Inst::Store32 { .. }, true) => ("stur", InstSize::Size32), + (&Inst::Store64 { .. }, false) => ("str", InstSize::Size64), + (&Inst::Store64 { .. }, true) => ("stur", InstSize::Size64), + _ => unreachable!(), + }; + let rd = show_ireg_sized(rd, mb_rru, size); + let mem = mem.show_rru(mb_rru); + format!("{}{} {}, {}", mem_str, op, rd, mem) + } + &Inst::StoreP64 { rt, rt2, ref mem } => { + let rt = rt.show_rru(mb_rru); + let rt2 = rt2.show_rru(mb_rru); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("stp {}, {}, {}", rt, rt2, mem) + } + &Inst::LoadP64 { rt, rt2, ref mem } => { + let rt = rt.to_reg().show_rru(mb_rru); + let rt2 = rt2.to_reg().show_rru(mb_rru); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("ldp {}, {}, {}", rt, rt2, mem) + } + &Inst::Mov { rd, rm } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + format!("mov {}, {}", rd, rm) + } + &Inst::Mov32 { rd, rm } => { + let rd = show_ireg_sized(rd.to_reg(), mb_rru, InstSize::Size32); + let rm = show_ireg_sized(rm, mb_rru, InstSize::Size32); + format!("mov {}, {}", rd, rm) + } + &Inst::MovZ { rd, ref imm } => { + let rd = rd.to_reg().show_rru(mb_rru); + let imm = imm.show_rru(mb_rru); + format!("movz {}, {}", rd, imm) + } + &Inst::MovN { rd, ref imm } => { + let rd = rd.to_reg().show_rru(mb_rru); + let imm = imm.show_rru(mb_rru); + format!("movn {}, {}", rd, imm) + } + &Inst::MovK { rd, ref imm } => { + let rd = rd.to_reg().show_rru(mb_rru); + let imm = imm.show_rru(mb_rru); + format!("movk {}, {}", rd, imm) + } + &Inst::CSel { rd, rn, rm, cond } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + let rm = rm.show_rru(mb_rru); + let cond = cond.show_rru(mb_rru); + format!("csel {}, {}, {}, {}", rd, rn, rm, cond) + } + &Inst::CSet { rd, cond } => { + let rd = rd.to_reg().show_rru(mb_rru); + let cond = cond.show_rru(mb_rru); + format!("cset {}, {}", rd, cond) + } + &Inst::FpuMove64 { rd, rn } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + format!("mov {}.8b, {}.8b", rd, rn) + } + &Inst::FpuRR { fpu_op, rd, rn } => { + let (op, sizesrc, sizedest) = match fpu_op { + FPUOp1::Abs32 => ("fabs", InstSize::Size32, InstSize::Size32), + FPUOp1::Abs64 => ("fabs", InstSize::Size64, InstSize::Size64), + FPUOp1::Neg32 => ("fneg", InstSize::Size32, InstSize::Size32), + FPUOp1::Neg64 => ("fneg", InstSize::Size64, InstSize::Size64), + FPUOp1::Sqrt32 => ("fsqrt", InstSize::Size32, InstSize::Size32), + FPUOp1::Sqrt64 => ("fsqrt", InstSize::Size64, InstSize::Size64), + 
FPUOp1::Cvt32To64 => ("fcvt", InstSize::Size32, InstSize::Size64), + FPUOp1::Cvt64To32 => ("fcvt", InstSize::Size64, InstSize::Size32), + }; + let rd = show_freg_sized(rd.to_reg(), mb_rru, sizedest); + let rn = show_freg_sized(rn, mb_rru, sizesrc); + format!("{} {}, {}", op, rd, rn) + } + &Inst::FpuRRR { fpu_op, rd, rn, rm } => { + let (op, size) = match fpu_op { + FPUOp2::Add32 => ("fadd", InstSize::Size32), + FPUOp2::Add64 => ("fadd", InstSize::Size64), + FPUOp2::Sub32 => ("fsub", InstSize::Size32), + FPUOp2::Sub64 => ("fsub", InstSize::Size64), + FPUOp2::Mul32 => ("fmul", InstSize::Size32), + FPUOp2::Mul64 => ("fmul", InstSize::Size64), + FPUOp2::Div32 => ("fdiv", InstSize::Size32), + FPUOp2::Div64 => ("fdiv", InstSize::Size64), + FPUOp2::Max32 => ("fmax", InstSize::Size32), + FPUOp2::Max64 => ("fmax", InstSize::Size64), + FPUOp2::Min32 => ("fmin", InstSize::Size32), + FPUOp2::Min64 => ("fmin", InstSize::Size64), + }; + let rd = show_freg_sized(rd.to_reg(), mb_rru, size); + let rn = show_freg_sized(rn, mb_rru, size); + let rm = show_freg_sized(rm, mb_rru, size); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::FpuRRRR { + fpu_op, + rd, + rn, + rm, + ra, + } => { + let (op, size) = match fpu_op { + FPUOp3::MAdd32 => ("fmadd", InstSize::Size32), + FPUOp3::MAdd64 => ("fmadd", InstSize::Size64), + }; + let rd = show_freg_sized(rd.to_reg(), mb_rru, size); + let rn = show_freg_sized(rn, mb_rru, size); + let rm = show_freg_sized(rm, mb_rru, size); + let ra = show_freg_sized(ra, mb_rru, size); + format!("{} {}, {}, {}, {}", op, rd, rn, rm, ra) + } + &Inst::FpuCmp32 { rn, rm } => { + let rn = show_freg_sized(rn, mb_rru, InstSize::Size32); + let rm = show_freg_sized(rm, mb_rru, InstSize::Size32); + format!("fcmp {}, {}", rn, rm) + } + &Inst::FpuCmp64 { rn, rm } => { + let rn = show_freg_sized(rn, mb_rru, InstSize::Size64); + let rm = show_freg_sized(rm, mb_rru, InstSize::Size64); + format!("fcmp {}, {}", rn, rm) + } + &Inst::FpuLoad32 { rd, ref mem, .. } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size32); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 4); + format!("ldr {}, {}", rd, mem) + } + &Inst::FpuLoad64 { rd, ref mem, .. } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size64); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("ldr {}, {}", rd, mem) + } + &Inst::FpuLoad128 { rd, ref mem, .. } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rd = "q".to_string() + &rd[1..]; + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("ldr {}, {}", rd, mem) + } + &Inst::FpuStore32 { rd, ref mem, .. } => { + let rd = show_freg_sized(rd, mb_rru, InstSize::Size32); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 4); + format!("str {}, {}", rd, mem) + } + &Inst::FpuStore64 { rd, ref mem, .. } => { + let rd = show_freg_sized(rd, mb_rru, InstSize::Size64); + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("str {}, {}", rd, mem) + } + &Inst::FpuStore128 { rd, ref mem, .. 
} => { + let rd = rd.show_rru(mb_rru); + let rd = "q".to_string() + &rd[1..]; + let mem = mem.show_rru_sized(mb_rru, /* size = */ 8); + format!("str {}, {}", rd, mem) + } + &Inst::LoadFpuConst32 { rd, const_data } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size32); + format!("ldr {}, pc+8 ; b 8 ; data.f32 {}", rd, const_data) + } + &Inst::LoadFpuConst64 { rd, const_data } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size64); + format!("ldr {}, pc+8 ; b 12 ; data.f64 {}", rd, const_data) + } + &Inst::FpuToInt { op, rd, rn } => { + let (op, sizesrc, sizedest) = match op { + FpuToIntOp::F32ToI32 => ("fcvtzs", InstSize::Size32, InstSize::Size32), + FpuToIntOp::F32ToU32 => ("fcvtzu", InstSize::Size32, InstSize::Size32), + FpuToIntOp::F32ToI64 => ("fcvtzs", InstSize::Size32, InstSize::Size64), + FpuToIntOp::F32ToU64 => ("fcvtzu", InstSize::Size32, InstSize::Size64), + FpuToIntOp::F64ToI32 => ("fcvtzs", InstSize::Size64, InstSize::Size32), + FpuToIntOp::F64ToU32 => ("fcvtzu", InstSize::Size64, InstSize::Size32), + FpuToIntOp::F64ToI64 => ("fcvtzs", InstSize::Size64, InstSize::Size64), + FpuToIntOp::F64ToU64 => ("fcvtzu", InstSize::Size64, InstSize::Size64), + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, sizedest); + let rn = show_freg_sized(rn, mb_rru, sizesrc); + format!("{} {}, {}", op, rd, rn) + } + &Inst::IntToFpu { op, rd, rn } => { + let (op, sizesrc, sizedest) = match op { + IntToFpuOp::I32ToF32 => ("scvtf", InstSize::Size32, InstSize::Size32), + IntToFpuOp::U32ToF32 => ("ucvtf", InstSize::Size32, InstSize::Size32), + IntToFpuOp::I64ToF32 => ("scvtf", InstSize::Size64, InstSize::Size32), + IntToFpuOp::U64ToF32 => ("ucvtf", InstSize::Size64, InstSize::Size32), + IntToFpuOp::I32ToF64 => ("scvtf", InstSize::Size32, InstSize::Size64), + IntToFpuOp::U32ToF64 => ("ucvtf", InstSize::Size32, InstSize::Size64), + IntToFpuOp::I64ToF64 => ("scvtf", InstSize::Size64, InstSize::Size64), + IntToFpuOp::U64ToF64 => ("ucvtf", InstSize::Size64, InstSize::Size64), + }; + let rd = show_freg_sized(rd.to_reg(), mb_rru, sizedest); + let rn = show_ireg_sized(rn, mb_rru, sizesrc); + format!("{} {}, {}", op, rd, rn) + } + &Inst::FpuCSel32 { rd, rn, rm, cond } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size32); + let rn = show_freg_sized(rn, mb_rru, InstSize::Size32); + let rm = show_freg_sized(rm, mb_rru, InstSize::Size32); + let cond = cond.show_rru(mb_rru); + format!("fcsel {}, {}, {}, {}", rd, rn, rm, cond) + } + &Inst::FpuCSel64 { rd, rn, rm, cond } => { + let rd = show_freg_sized(rd.to_reg(), mb_rru, InstSize::Size64); + let rn = show_freg_sized(rn, mb_rru, InstSize::Size64); + let rm = show_freg_sized(rm, mb_rru, InstSize::Size64); + let cond = cond.show_rru(mb_rru); + format!("fcsel {}, {}, {}, {}", rd, rn, rm, cond) + } + &Inst::FpuRound { op, rd, rn } => { + let (inst, size) = match op { + FpuRoundMode::Minus32 => ("frintm", InstSize::Size32), + FpuRoundMode::Minus64 => ("frintm", InstSize::Size64), + FpuRoundMode::Plus32 => ("frintp", InstSize::Size32), + FpuRoundMode::Plus64 => ("frintp", InstSize::Size64), + FpuRoundMode::Zero32 => ("frintz", InstSize::Size32), + FpuRoundMode::Zero64 => ("frintz", InstSize::Size64), + FpuRoundMode::Nearest32 => ("frintn", InstSize::Size32), + FpuRoundMode::Nearest64 => ("frintn", InstSize::Size64), + }; + let rd = show_freg_sized(rd.to_reg(), mb_rru, size); + let rn = show_freg_sized(rn, mb_rru, size); + format!("{} {}, {}", inst, rd, rn) + } + &Inst::MovToVec64 { rd, rn } => { + let rd = 
rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + format!("mov {}.d[0], {}", rd, rn) + } + &Inst::MovFromVec64 { rd, rn } => { + let rd = rd.to_reg().show_rru(mb_rru); + let rn = rn.show_rru(mb_rru); + format!("mov {}, {}.d[0]", rd, rn) + } + &Inst::VecRRR { rd, rn, rm, alu_op } => { + let op = match alu_op { + VecALUOp::SQAddScalar => "sqadd", + VecALUOp::UQAddScalar => "uqadd", + VecALUOp::SQSubScalar => "sqsub", + VecALUOp::UQSubScalar => "uqsub", + }; + let rd = show_vreg_scalar(rd.to_reg(), mb_rru); + let rn = show_vreg_scalar(rn, mb_rru); + let rm = show_vreg_scalar(rm, mb_rru); + format!("{} {}, {}, {}", op, rd, rn, rm) + } + &Inst::MovToNZCV { rn } => { + let rn = rn.show_rru(mb_rru); + format!("msr nzcv, {}", rn) + } + &Inst::MovFromNZCV { rd } => { + let rd = rd.to_reg().show_rru(mb_rru); + format!("mrs {}, nzcv", rd) + } + &Inst::CondSet { rd, cond } => { + let rd = rd.to_reg().show_rru(mb_rru); + let cond = cond.show_rru(mb_rru); + format!("cset {}, {}", rd, cond) + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits >= 8 => { + // Is the destination a 32-bit register? Corresponds to whether + // extend-to width is <= 32 bits, *unless* we have an unsigned + // 32-to-64-bit extension, which is implemented with a "mov" to a + // 32-bit (W-reg) dest, because this zeroes the top 32 bits. + let dest_size = if !signed && from_bits == 32 && to_bits == 64 { + InstSize::Size32 + } else { + InstSize::from_bits(to_bits) + }; + let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size); + let rn = show_ireg_sized(rn, mb_rru, InstSize::from_bits(from_bits)); + let op = match (signed, from_bits, to_bits) { + (false, 8, 32) => "uxtb", + (true, 8, 32) => "sxtb", + (false, 16, 32) => "uxth", + (true, 16, 32) => "sxth", + (false, 8, 64) => "uxtb", + (true, 8, 64) => "sxtb", + (false, 16, 64) => "uxth", + (true, 16, 64) => "sxth", + (false, 32, 64) => "mov", // special case (see above). + (true, 32, 64) => "sxtw", + _ => panic!("Unsupported Extend case: {:?}", self), + }; + format!("{} {}, {}", op, rd, rn) + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + } if from_bits == 1 && signed => { + let dest_size = InstSize::from_bits(to_bits); + let zr = if dest_size.is32() { "wzr" } else { "xzr" }; + let rd32 = show_ireg_sized(rd.to_reg(), mb_rru, InstSize::Size32); + let rd = show_ireg_sized(rd.to_reg(), mb_rru, dest_size); + let rn = show_ireg_sized(rn, mb_rru, InstSize::Size32); + format!("and {}, {}, #1 ; sub {}, {}, {}", rd32, rn, rd, zr, rd) + } + &Inst::Extend { + rd, + rn, + signed, + from_bits, + .. + } if from_bits == 1 && !signed => { + let rd = show_ireg_sized(rd.to_reg(), mb_rru, InstSize::Size32); + let rn = show_ireg_sized(rn, mb_rru, InstSize::Size32); + format!("and {}, {}, #1", rd, rn) + } + &Inst::Extend { .. } => { + panic!("Unsupported Extend case"); + } + &Inst::Call { dest: _, .. } => format!("bl 0"), + &Inst::CallInd { rn, .. 
} => { + let rn = rn.show_rru(mb_rru); + format!("blr {}", rn) + } + &Inst::Ret => "ret".to_string(), + &Inst::EpiloguePlaceholder => "epilogue placeholder".to_string(), + &Inst::Jump { ref dest } => { + let dest = dest.show_rru(mb_rru); + format!("b {}", dest) + } + &Inst::CondBr { + ref taken, + ref not_taken, + ref kind, + } => { + let taken = taken.show_rru(mb_rru); + let not_taken = not_taken.show_rru(mb_rru); + match kind { + &CondBrKind::Zero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbz {}, {} ; b {}", reg, taken, not_taken) + } + &CondBrKind::NotZero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbnz {}, {} ; b {}", reg, taken, not_taken) + } + &CondBrKind::Cond(c) => { + let c = c.show_rru(mb_rru); + format!("b.{} {} ; b {}", c, taken, not_taken) + } + } + } + &Inst::CondBrLowered { + ref target, + ref kind, + } => { + let target = target.show_rru(mb_rru); + match &kind { + &CondBrKind::Zero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbz {}, {}", reg, target) + } + &CondBrKind::NotZero(reg) => { + let reg = reg.show_rru(mb_rru); + format!("cbnz {}, {}", reg, target) + } + &CondBrKind::Cond(c) => { + let c = c.show_rru(mb_rru); + format!("b.{} {}", c, target) + } + } + } + &Inst::CondBrLoweredCompound { + ref taken, + ref not_taken, + ref kind, + } => { + let first = Inst::CondBrLowered { + target: taken.clone(), + kind: kind.clone(), + }; + let second = Inst::Jump { + dest: not_taken.clone(), + }; + first.show_rru(mb_rru) + " ; " + &second.show_rru(mb_rru) + } + &Inst::IndirectBr { rn, .. } => { + let rn = rn.show_rru(mb_rru); + format!("br {}", rn) + } + &Inst::Brk => "brk #0".to_string(), + &Inst::Udf { .. } => "udf".to_string(), + &Inst::Adr { rd, ref label } => { + let rd = rd.show_rru(mb_rru); + let label = label.show_rru(mb_rru); + format!("adr {}, {}", rd, label) + } + &Inst::Word4 { data } => format!("data.i32 {}", data), + &Inst::Word8 { data } => format!("data.i64 {}", data), + &Inst::JTSequence { + ref targets, + ridx, + rtmp1, + rtmp2, + .. + } => { + let ridx = ridx.show_rru(mb_rru); + let rtmp1 = rtmp1.show_rru(mb_rru); + let rtmp2 = rtmp2.show_rru(mb_rru); + format!( + concat!( + "adr {}, pc+16 ; ", + "ldrsw {}, [{}, {}, LSL 2] ; ", + "add {}, {}, {} ; ", + "br {} ; ", + "jt_entries {:?}" + ), + rtmp1, rtmp2, rtmp1, ridx, rtmp1, rtmp1, rtmp2, rtmp1, targets + ) + } + &Inst::LoadConst64 { rd, const_data } => { + let rd = rd.show_rru(mb_rru); + format!("ldr {}, 8 ; b 12 ; data {:?}", rd, const_data) + } + &Inst::LoadExtName { + rd, + ref name, + offset, + srcloc: _srcloc, + } => { + let rd = rd.show_rru(mb_rru); + format!("ldr {}, 8 ; b 12 ; data {:?} + {}", rd, name, offset) + } + } + } +} diff --git a/cranelift/codegen/src/isa/aarch64/inst/regs.rs b/cranelift/codegen/src/isa/aarch64/inst/regs.rs new file mode 100644 index 000000000000..b675d7f4d722 --- /dev/null +++ b/cranelift/codegen/src/isa/aarch64/inst/regs.rs @@ -0,0 +1,270 @@ +//! AArch64 ISA definitions: registers. 
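+//!
+//! Each architectural register (x0..x30, v0..v31, and the separate xzr/sp
+//! views of register 31) is given a fixed index in the regalloc.rs
+//! `RealRegUniverse` built by `create_reg_universe` below; the `show_*`
+//! helpers print size-qualified names (e.g. "w"/"x", "s"/"d") for registers.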
+
+use crate::isa::aarch64::inst::InstSize;
+use crate::machinst::*;
+
+use regalloc::{RealRegUniverse, Reg, RegClass, RegClassInfo, Writable, NUM_REG_CLASSES};
+
+use std::string::{String, ToString};
+
+//=============================================================================
+// Registers, the Universe thereof, and printing
+
+#[rustfmt::skip]
+const XREG_INDICES: [u8; 31] = [
+    // X0 - X7
+    32, 33, 34, 35, 36, 37, 38, 39,
+    // X8 - X14
+    40, 41, 42, 43, 44, 45, 46,
+    // X15
+    59,
+    // X16, X17
+    47, 48,
+    // X18
+    60,
+    // X19 - X28
+    49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+    // X29
+    61,
+    // X30
+    62,
+];
+
+const ZERO_REG_INDEX: u8 = 63;
+
+const SP_REG_INDEX: u8 = 64;
+
+/// Get a reference to an X-register (integer register).
+pub fn xreg(num: u8) -> Reg {
+    assert!(num < 31);
+    Reg::new_real(
+        RegClass::I64,
+        /* enc = */ num,
+        /* index = */ XREG_INDICES[num as usize],
+    )
+}
+
+/// Get a writable reference to an X-register.
+pub fn writable_xreg(num: u8) -> Writable<Reg> {
+    Writable::from_reg(xreg(num))
+}
+
+/// Get a reference to a V-register (vector/FP register).
+pub fn vreg(num: u8) -> Reg {
+    assert!(num < 32);
+    Reg::new_real(RegClass::V128, /* enc = */ num, /* index = */ num)
+}
+
+/// Get a writable reference to a V-register.
+pub fn writable_vreg(num: u8) -> Writable<Reg> {
+    Writable::from_reg(vreg(num))
+}
+
+/// Get a reference to the zero-register.
+pub fn zero_reg() -> Reg {
+    // This should be the same as what xreg(31) returns, except that
+    // we use the special index into the register index space.
+    Reg::new_real(
+        RegClass::I64,
+        /* enc = */ 31,
+        /* index = */ ZERO_REG_INDEX,
+    )
+}
+
+/// Get a writable reference to the zero-register (this discards a result).
+pub fn writable_zero_reg() -> Writable<Reg> {
+    Writable::from_reg(zero_reg())
+}
+
+/// Get a reference to the stack-pointer register.
+pub fn stack_reg() -> Reg {
+    // XSP (stack) and XZR (zero) are logically different registers which have
+    // the same hardware encoding, and whose meaning, in real aarch64
+    // instructions, is context-dependent. For convenience of
+    // universe-construction and for correct printing, we make them be two
+    // different real registers.
+    Reg::new_real(
+        RegClass::I64,
+        /* enc = */ 31,
+        /* index = */ SP_REG_INDEX,
+    )
+}
+
+/// Get a writable reference to the stack-pointer register.
+pub fn writable_stack_reg() -> Writable<Reg> {
+    Writable::from_reg(stack_reg())
+}
+
+/// Get a reference to the link register (x30).
+pub fn link_reg() -> Reg {
+    xreg(30)
+}
+
+/// Get a writable reference to the link register.
+pub fn writable_link_reg() -> Writable<Reg> {
+    Writable::from_reg(link_reg())
+}
+
+/// Get a reference to the frame pointer (x29).
+pub fn fp_reg() -> Reg {
+    xreg(29)
+}
+
+/// Get a writable reference to the frame pointer.
+pub fn writable_fp_reg() -> Writable<Reg> {
+    Writable::from_reg(fp_reg())
+}
+
+/// Get a reference to the "spill temp" register. This register is used to
+/// compute the address of a spill slot when a direct offset addressing mode from
+/// FP is not sufficient (+/- 2^11 words). We exclude this register from regalloc
+/// and reserve it for this purpose for simplicity; otherwise we need a
+/// multi-stage analysis where we first determine how many spill slots we have,
+/// then perhaps remove the reg from the pool and recompute regalloc.
+pub fn spilltmp_reg() -> Reg {
+    xreg(15)
+}
+
+/// Get a writable reference to the spilltmp reg.
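+/// See `spilltmp_reg` above for why x15 is reserved for this purpose.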
+pub fn writable_spilltmp_reg() -> Writable { + Writable::from_reg(spilltmp_reg()) +} + +/// Create the register universe for AArch64. +pub fn create_reg_universe() -> RealRegUniverse { + let mut regs = vec![]; + let mut allocable_by_class = [None; NUM_REG_CLASSES]; + + // Numbering Scheme: we put V-regs first, then X-regs. The X-regs + // exclude several registers: x18 (globally reserved for platform-specific + // purposes), x29 (frame pointer), x30 (link register), x31 (stack pointer + // or zero register, depending on context). + + let v_reg_base = 0u8; // in contiguous real-register index space + let v_reg_count = 32; + for i in 0u8..v_reg_count { + let reg = Reg::new_real( + RegClass::V128, + /* enc = */ i, + /* index = */ v_reg_base + i, + ) + .to_real_reg(); + let name = format!("v{}", i); + regs.push((reg, name)); + } + let v_reg_last = v_reg_base + v_reg_count - 1; + + // Add the X registers. N.B.: the order here must match the order implied + // by XREG_INDICES, ZERO_REG_INDEX, and SP_REG_INDEX above. + + let x_reg_base = 32u8; // in contiguous real-register index space + let mut x_reg_count = 0; + for i in 0u8..32u8 { + // See above for excluded registers. + if i == 15 || i == 18 || i == 29 || i == 30 || i == 31 { + continue; + } + let reg = Reg::new_real( + RegClass::I64, + /* enc = */ i, + /* index = */ x_reg_base + x_reg_count, + ) + .to_real_reg(); + let name = format!("x{}", i); + regs.push((reg, name)); + x_reg_count += 1; + } + let x_reg_last = x_reg_base + x_reg_count - 1; + + allocable_by_class[RegClass::I64.rc_to_usize()] = Some(RegClassInfo { + first: x_reg_base as usize, + last: x_reg_last as usize, + suggested_scratch: Some(XREG_INDICES[13] as usize), + }); + allocable_by_class[RegClass::V128.rc_to_usize()] = Some(RegClassInfo { + first: v_reg_base as usize, + last: v_reg_last as usize, + suggested_scratch: Some(/* V31: */ 31), + }); + + // Other regs, not available to the allocator. + let allocable = regs.len(); + regs.push((xreg(15).to_real_reg(), "x15".to_string())); + regs.push((xreg(18).to_real_reg(), "x18".to_string())); + regs.push((fp_reg().to_real_reg(), "fp".to_string())); + regs.push((link_reg().to_real_reg(), "lr".to_string())); + regs.push((zero_reg().to_real_reg(), "xzr".to_string())); + regs.push((stack_reg().to_real_reg(), "sp".to_string())); + // FIXME JRS 2020Feb06: unfortunately this pushes the number of real regs + // to 65, which is potentially inconvenient from a compiler performance + // standpoint. We could possibly drop back to 64 by "losing" a vector + // register in future. + + // Assert sanity: the indices in the register structs must match their + // actual indices in the array. + for (i, reg) in regs.iter().enumerate() { + assert_eq!(i, reg.0.get_index()); + } + + RealRegUniverse { + regs, + allocable, + allocable_by_class, + } +} + +/// If `ireg` denotes an I64-classed reg, make a best-effort attempt to show +/// its name at the 32-bit size. +pub fn show_ireg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSize) -> String { + let mut s = reg.show_rru(mb_rru); + if reg.get_class() != RegClass::I64 || !size.is32() { + // We can't do any better. 
+ return s; + } + + if reg.is_real() { + // Change (eg) "x42" into "w42" as appropriate + if reg.get_class() == RegClass::I64 && size.is32() && s.starts_with("x") { + s = "w".to_string() + &s[1..]; + } + } else { + // Add a "w" suffix to RegClass::I64 vregs used in a 32-bit role + if reg.get_class() == RegClass::I64 && size.is32() { + s.push('w'); + } + } + s +} + +/// Show a vector register when its use as a 32-bit or 64-bit float is known. +pub fn show_freg_sized(reg: Reg, mb_rru: Option<&RealRegUniverse>, size: InstSize) -> String { + let mut s = reg.show_rru(mb_rru); + if reg.get_class() != RegClass::V128 { + return s; + } + let prefix = if size.is32() { "s" } else { "d" }; + s.replace_range(0..1, prefix); + s +} + +/// Show a vector register used in a scalar context. +pub fn show_vreg_scalar(reg: Reg, mb_rru: Option<&RealRegUniverse>) -> String { + let mut s = reg.show_rru(mb_rru); + if reg.get_class() != RegClass::V128 { + // We can't do any better. + return s; + } + + if reg.is_real() { + // Change (eg) "v0" into "d0". + if reg.get_class() == RegClass::V128 && s.starts_with("v") { + s.replace_range(0..1, "d"); + } + } else { + // Add a "d" suffix to RegClass::V128 vregs. + if reg.get_class() == RegClass::V128 { + s.push('d'); + } + } + s +} diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs new file mode 100644 index 000000000000..07a8e896e684 --- /dev/null +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -0,0 +1,2768 @@ +//! Lowering rules for AArch64. +//! +//! TODO: opportunities for better code generation: +//! +//! - Smarter use of addressing modes. Recognize a+SCALE*b patterns; recognize +//! and incorporate sign/zero extension on indicies. Recognize pre/post-index +//! opportunities. +//! +//! - Floating-point immediates (FIMM instruction). + +use crate::ir::condcodes::{FloatCC, IntCC}; +use crate::ir::types::*; +use crate::ir::Inst as IRInst; +use crate::ir::{InstructionData, Opcode, TrapCode, Type}; +use crate::machinst::lower::*; +use crate::machinst::*; + +use crate::isa::aarch64::abi::*; +use crate::isa::aarch64::inst::*; +use crate::isa::aarch64::AArch64Backend; + +use regalloc::{Reg, RegClass, Writable}; + +use alloc::vec::Vec; +use smallvec::SmallVec; + +//============================================================================ +// Result enum types. +// +// Lowering of a given value results in one of these enums, depending on the +// modes in which we can accept the value. + +/// A lowering result: register, register-shift. An SSA value can always be +/// lowered into one of these options; the register form is the fallback. +#[derive(Clone, Debug)] +enum ResultRS { + Reg(Reg), + RegShift(Reg, ShiftOpAndAmt), +} + +/// A lowering result: register, register-shift, register-extend. An SSA value can always be +/// lowered into one of these options; the register form is the fallback. +#[derive(Clone, Debug)] +enum ResultRSE { + Reg(Reg), + RegShift(Reg, ShiftOpAndAmt), + RegExtend(Reg, ExtendOp), +} + +impl ResultRSE { + fn from_rs(rs: ResultRS) -> ResultRSE { + match rs { + ResultRS::Reg(r) => ResultRSE::Reg(r), + ResultRS::RegShift(r, s) => ResultRSE::RegShift(r, s), + } + } +} + +/// A lowering result: register, register-shift, register-extend, or 12-bit immediate form. +/// An SSA value can always be lowered into one of these options; the register form is the +/// fallback. 
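+///
+/// (`Imm12` is the AArch64 arithmetic immediate: a 12-bit value, optionally
+/// left-shifted by 12.)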
+#[derive(Clone, Debug)]
+enum ResultRSEImm12 {
+    Reg(Reg),
+    RegShift(Reg, ShiftOpAndAmt),
+    RegExtend(Reg, ExtendOp),
+    Imm12(Imm12),
+}
+
+impl ResultRSEImm12 {
+    fn from_rse(rse: ResultRSE) -> ResultRSEImm12 {
+        match rse {
+            ResultRSE::Reg(r) => ResultRSEImm12::Reg(r),
+            ResultRSE::RegShift(r, s) => ResultRSEImm12::RegShift(r, s),
+            ResultRSE::RegExtend(r, e) => ResultRSEImm12::RegExtend(r, e),
+        }
+    }
+}
+
+/// A lowering result: register, register-shift, or logical immediate form.
+/// An SSA value can always be lowered into one of these options; the register form is the
+/// fallback.
+#[derive(Clone, Debug)]
+enum ResultRSImmLogic {
+    Reg(Reg),
+    RegShift(Reg, ShiftOpAndAmt),
+    ImmLogic(ImmLogic),
+}
+
+impl ResultRSImmLogic {
+    fn from_rs(rse: ResultRS) -> ResultRSImmLogic {
+        match rse {
+            ResultRS::Reg(r) => ResultRSImmLogic::Reg(r),
+            ResultRS::RegShift(r, s) => ResultRSImmLogic::RegShift(r, s),
+        }
+    }
+}
+
+/// A lowering result: register or immediate shift amount (arg to a shift op).
+/// An SSA value can always be lowered into one of these options; the register form is the
+/// fallback.
+#[derive(Clone, Debug)]
+enum ResultRegImmShift {
+    Reg(Reg),
+    ImmShift(ImmShift),
+}
+
+//============================================================================
+// Instruction input and output "slots".
+//
+// We use these types to refer to operand numbers, and result numbers, together
+// with the associated instruction, in a type-safe way.
+
+/// Identifier for a particular output of an instruction.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+struct InsnOutput {
+    insn: IRInst,
+    output: usize,
+}
+
+/// Identifier for a particular input of an instruction.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+struct InsnInput {
+    insn: IRInst,
+    input: usize,
+}
+
+/// Producer of a value: either a previous instruction's output, or a register that will be
+/// codegen'd separately.
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+enum InsnInputSource {
+    Output(InsnOutput),
+    Reg(Reg),
+}
+
+impl InsnInputSource {
+    fn as_output(self) -> Option<InsnOutput> {
+        match self {
+            InsnInputSource::Output(o) => Some(o),
+            _ => None,
+        }
+    }
+}
+
+fn get_input<C: LowerCtx<Inst>>(ctx: &mut C, output: InsnOutput, num: usize) -> InsnInput {
+    assert!(num <= ctx.num_inputs(output.insn));
+    InsnInput {
+        insn: output.insn,
+        input: num,
+    }
+}
+
+/// Convert an instruction input to a producing instruction's output if possible (in same BB), or a
+/// register otherwise.
+fn input_source<C: LowerCtx<Inst>>(ctx: &mut C, input: InsnInput) -> InsnInputSource {
+    if let Some((input_inst, result_num)) = ctx.input_inst(input.insn, input.input) {
+        let out = InsnOutput {
+            insn: input_inst,
+            output: result_num,
+        };
+        InsnInputSource::Output(out)
+    } else {
+        let reg = ctx.input(input.insn, input.input);
+        InsnInputSource::Reg(reg)
+    }
+}
+
+//============================================================================
+// Lowering: convert instruction outputs to result types.
+
+/// Lower an instruction output to a 64-bit constant, if possible.
+fn output_to_const<C: LowerCtx<Inst>>(ctx: &mut C, out: InsnOutput) -> Option<u64> {
+    if out.output > 0 {
+        None
+    } else {
+        let inst_data = ctx.data(out.insn);
+        if inst_data.opcode() == Opcode::Null {
+            Some(0)
+        } else {
+            match inst_data {
+                &InstructionData::UnaryImm { opcode: _, imm } => {
+                    // Only has Into for i64; we use u64 elsewhere, so we cast.
+ let imm: i64 = imm.into(); + Some(imm as u64) + } + &InstructionData::UnaryIeee32 { opcode: _, imm } => Some(u64::from(imm.bits())), + &InstructionData::UnaryIeee64 { opcode: _, imm } => Some(imm.bits()), + _ => None, + } + } + } +} + +fn output_to_const_f32>(ctx: &mut C, out: InsnOutput) -> Option { + output_to_const(ctx, out).map(|value| f32::from_bits(value as u32)) +} + +fn output_to_const_f64>(ctx: &mut C, out: InsnOutput) -> Option { + output_to_const(ctx, out).map(|value| f64::from_bits(value)) +} + +/// Lower an instruction output to a constant register-shift amount, if possible. +fn output_to_shiftimm>( + ctx: &mut C, + out: InsnOutput, +) -> Option { + output_to_const(ctx, out).and_then(ShiftOpShiftImm::maybe_from_shift) +} + +/// How to handle narrow values loaded into registers; see note on `narrow_mode` +/// parameter to `input_to_*` below. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +enum NarrowValueMode { + None, + /// Zero-extend to 32 bits if original is < 32 bits. + ZeroExtend32, + /// Sign-extend to 32 bits if original is < 32 bits. + SignExtend32, + /// Zero-extend to 64 bits if original is < 64 bits. + ZeroExtend64, + /// Sign-extend to 64 bits if original is < 64 bits. + SignExtend64, +} + +impl NarrowValueMode { + fn is_32bit(&self) -> bool { + match self { + NarrowValueMode::None => false, + NarrowValueMode::ZeroExtend32 | NarrowValueMode::SignExtend32 => true, + NarrowValueMode::ZeroExtend64 | NarrowValueMode::SignExtend64 => false, + } + } +} + +/// Lower an instruction output to a reg. +fn output_to_reg>(ctx: &mut C, out: InsnOutput) -> Writable { + ctx.output(out.insn, out.output) +} + +/// Lower an instruction input to a reg. +/// +/// The given register will be extended appropriately, according to +/// `narrow_mode` and the input's type. If extended, the value is +/// always extended to 64 bits, for simplicity. +fn input_to_reg>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> Reg { + let ty = ctx.input_ty(input.insn, input.input); + let from_bits = ty_bits(ty) as u8; + let in_reg = ctx.input(input.insn, input.input); + match (narrow_mode, from_bits) { + (NarrowValueMode::None, _) => in_reg, + (NarrowValueMode::ZeroExtend32, n) if n < 32 => { + let tmp = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: false, + from_bits, + to_bits: 32, + }); + tmp.to_reg() + } + (NarrowValueMode::SignExtend32, n) if n < 32 => { + let tmp = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: true, + from_bits, + to_bits: 32, + }); + tmp.to_reg() + } + (NarrowValueMode::ZeroExtend32, 32) | (NarrowValueMode::SignExtend32, 32) => in_reg, + + (NarrowValueMode::ZeroExtend64, n) if n < 64 => { + let tmp = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: false, + from_bits, + to_bits: 64, + }); + tmp.to_reg() + } + (NarrowValueMode::SignExtend64, n) if n < 64 => { + let tmp = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::Extend { + rd: tmp, + rn: in_reg, + signed: true, + from_bits, + to_bits: 64, + }); + tmp.to_reg() + } + (_, 64) => in_reg, + + _ => panic!( + "Unsupported input width: input ty {} bits {} mode {:?}", + ty, from_bits, narrow_mode + ), + } +} + +/// Lower an instruction input to a reg or reg/shift, or reg/extend operand. +/// This does not actually codegen the source instruction; it just uses the +/// vreg into which the source instruction will generate its value. 
+/// +/// The `narrow_mode` flag indicates whether the consumer of this value needs +/// the high bits clear. For many operations, such as an add/sub/mul or any +/// bitwise logical operation, the low-bit results depend only on the low-bit +/// inputs, so e.g. we can do an 8 bit add on 32 bit registers where the 8-bit +/// value is stored in the low 8 bits of the register and the high 24 bits are +/// undefined. If the op truly needs the high N bits clear (such as for a +/// divide or a right-shift or a compare-to-zero), `narrow_mode` should be +/// set to `ZeroExtend` or `SignExtend` as appropriate, and the resulting +/// register will be provided the extended value. +fn input_to_rs>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRS { + if let InsnInputSource::Output(out) = input_source(ctx, input) { + let insn = out.insn; + assert!(out.output <= ctx.num_outputs(insn)); + let op = ctx.data(insn).opcode(); + + if op == Opcode::Ishl { + let shiftee = get_input(ctx, out, 0); + let shift_amt = get_input(ctx, out, 1); + + // Can we get the shift amount as an immediate? + if let Some(shift_amt_out) = input_source(ctx, shift_amt).as_output() { + if let Some(shiftimm) = output_to_shiftimm(ctx, shift_amt_out) { + let reg = input_to_reg(ctx, shiftee, narrow_mode); + ctx.merged(insn); + ctx.merged(shift_amt_out.insn); + return ResultRS::RegShift(reg, ShiftOpAndAmt::new(ShiftOp::LSL, shiftimm)); + } + } + } + } + + ResultRS::Reg(input_to_reg(ctx, input, narrow_mode)) +} + +/// Lower an instruction input to a reg or reg/shift, or reg/extend operand. +/// This does not actually codegen the source instruction; it just uses the +/// vreg into which the source instruction will generate its value. +/// +/// See note on `input_to_rs` for a description of `narrow_mode`. +fn input_to_rse>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRSE { + if let InsnInputSource::Output(out) = input_source(ctx, input) { + let insn = out.insn; + assert!(out.output <= ctx.num_outputs(insn)); + let op = ctx.data(insn).opcode(); + let out_ty = ctx.output_ty(insn, out.output); + let out_bits = ty_bits(out_ty); + + // If `out_ty` is smaller than 32 bits and we need to zero- or sign-extend, + // then get the result into a register and return an Extend-mode operand on + // that register. + if narrow_mode != NarrowValueMode::None + && ((narrow_mode.is_32bit() && out_bits < 32) + || (!narrow_mode.is_32bit() && out_bits < 64)) + { + let reg = output_to_reg(ctx, out); + let extendop = match (narrow_mode, out_bits) { + (NarrowValueMode::SignExtend32, 1) | (NarrowValueMode::SignExtend64, 1) => { + ExtendOp::SXTB + } + (NarrowValueMode::ZeroExtend32, 1) | (NarrowValueMode::ZeroExtend64, 1) => { + ExtendOp::UXTB + } + (NarrowValueMode::SignExtend32, 8) | (NarrowValueMode::SignExtend64, 8) => { + ExtendOp::SXTB + } + (NarrowValueMode::ZeroExtend32, 8) | (NarrowValueMode::ZeroExtend64, 8) => { + ExtendOp::UXTB + } + (NarrowValueMode::SignExtend32, 16) | (NarrowValueMode::SignExtend64, 16) => { + ExtendOp::SXTH + } + (NarrowValueMode::ZeroExtend32, 16) | (NarrowValueMode::ZeroExtend64, 16) => { + ExtendOp::UXTH + } + (NarrowValueMode::SignExtend64, 32) => ExtendOp::SXTW, + (NarrowValueMode::ZeroExtend64, 32) => ExtendOp::UXTW, + _ => unreachable!(), + }; + return ResultRSE::RegExtend(reg.to_reg(), extendop); + } + + // Is this a zero-extend or sign-extend and can we handle that with a register-mode operator? 
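+ // For example (illustrative), `iadd x, (uextend.i64 y)` with `y` of type i32
+ // can fold the extend into its consumer as `ADD Xd, Xn, Wm, UXTW` instead of
+ // emitting a separate extend instruction.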
+ if op == Opcode::Uextend || op == Opcode::Sextend { + assert!(out_bits == 32 || out_bits == 64); + let sign_extend = op == Opcode::Sextend; + let extendee = get_input(ctx, out, 0); + let inner_ty = ctx.input_ty(extendee.insn, extendee.input); + let inner_bits = ty_bits(inner_ty); + assert!(inner_bits < out_bits); + let extendop = match (sign_extend, inner_bits) { + (true, 1) => ExtendOp::SXTB, + (false, 1) => ExtendOp::UXTB, + (true, 8) => ExtendOp::SXTB, + (false, 8) => ExtendOp::UXTB, + (true, 16) => ExtendOp::SXTH, + (false, 16) => ExtendOp::UXTH, + (true, 32) => ExtendOp::SXTW, + (false, 32) => ExtendOp::UXTW, + _ => unreachable!(), + }; + let reg = input_to_reg(ctx, extendee, NarrowValueMode::None); + ctx.merged(insn); + return ResultRSE::RegExtend(reg, extendop); + } + } + + ResultRSE::from_rs(input_to_rs(ctx, input, narrow_mode)) +} + +fn input_to_rse_imm12>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRSEImm12 { + if let InsnInputSource::Output(out) = input_source(ctx, input) { + if let Some(imm_value) = output_to_const(ctx, out) { + if let Some(i) = Imm12::maybe_from_u64(imm_value) { + ctx.merged(out.insn); + return ResultRSEImm12::Imm12(i); + } + } + } + + ResultRSEImm12::from_rse(input_to_rse(ctx, input, narrow_mode)) +} + +fn input_to_rs_immlogic>( + ctx: &mut C, + input: InsnInput, + narrow_mode: NarrowValueMode, +) -> ResultRSImmLogic { + if let InsnInputSource::Output(out) = input_source(ctx, input) { + if let Some(imm_value) = output_to_const(ctx, out) { + let ty = ctx.output_ty(out.insn, out.output); + let ty = if ty_bits(ty) < 32 { I32 } else { ty }; + if let Some(i) = ImmLogic::maybe_from_u64(imm_value, ty) { + ctx.merged(out.insn); + return ResultRSImmLogic::ImmLogic(i); + } + } + } + + ResultRSImmLogic::from_rs(input_to_rs(ctx, input, narrow_mode)) +} + +fn input_to_reg_immshift>( + ctx: &mut C, + input: InsnInput, +) -> ResultRegImmShift { + if let InsnInputSource::Output(out) = input_source(ctx, input) { + if let Some(imm_value) = output_to_const(ctx, out) { + if let Some(immshift) = ImmShift::maybe_from_u64(imm_value) { + ctx.merged(out.insn); + return ResultRegImmShift::ImmShift(immshift); + } + } + } + + ResultRegImmShift::Reg(input_to_reg(ctx, input, NarrowValueMode::None)) +} + +//============================================================================ +// ALU instruction constructors. 
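+//
+// Each constructor takes a right-hand-side operand already classified by the
+// `input_to_*` helpers above (register, register+shift, register+extend, or
+// immediate) and picks the matching `Inst` form, so the per-opcode lowerings
+// below do not have to repeat this dispatch.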
+ +fn alu_inst_imm12(op: ALUOp, rd: Writable, rn: Reg, rm: ResultRSEImm12) -> Inst { + match rm { + ResultRSEImm12::Imm12(imm12) => Inst::AluRRImm12 { + alu_op: op, + rd, + rn, + imm12, + }, + ResultRSEImm12::Reg(rm) => Inst::AluRRR { + alu_op: op, + rd, + rn, + rm, + }, + ResultRSEImm12::RegShift(rm, shiftop) => Inst::AluRRRShift { + alu_op: op, + rd, + rn, + rm, + shiftop, + }, + ResultRSEImm12::RegExtend(rm, extendop) => Inst::AluRRRExtend { + alu_op: op, + rd, + rn, + rm, + extendop, + }, + } +} + +fn alu_inst_immlogic(op: ALUOp, rd: Writable, rn: Reg, rm: ResultRSImmLogic) -> Inst { + match rm { + ResultRSImmLogic::ImmLogic(imml) => Inst::AluRRImmLogic { + alu_op: op, + rd, + rn, + imml, + }, + ResultRSImmLogic::Reg(rm) => Inst::AluRRR { + alu_op: op, + rd, + rn, + rm, + }, + ResultRSImmLogic::RegShift(rm, shiftop) => Inst::AluRRRShift { + alu_op: op, + rd, + rn, + rm, + shiftop, + }, + } +} + +fn alu_inst_immshift(op: ALUOp, rd: Writable, rn: Reg, rm: ResultRegImmShift) -> Inst { + match rm { + ResultRegImmShift::ImmShift(immshift) => Inst::AluRRImmShift { + alu_op: op, + rd, + rn, + immshift, + }, + ResultRegImmShift::Reg(rm) => Inst::AluRRR { + alu_op: op, + rd, + rn, + rm, + }, + } +} + +//============================================================================ +// Lowering: addressing mode support. Takes instruction directly, rather +// than an `InsnInput`, to do more introspection. + +/// Lower the address of a load or store. +fn lower_address>( + ctx: &mut C, + elem_ty: Type, + addends: &[InsnInput], + offset: i32, +) -> MemArg { + // TODO: support base_reg + scale * index_reg. For this, we would need to pattern-match shl or + // mul instructions (Load/StoreComplex don't include scale factors). + + // Handle one reg and offset that fits in immediate, if possible. + if addends.len() == 1 { + let reg = input_to_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64); + if let Some(memarg) = MemArg::reg_maybe_offset(reg, offset as i64, elem_ty) { + return memarg; + } + } + + // Handle two regs and a zero offset, if possible. + if addends.len() == 2 && offset == 0 { + let ra = input_to_reg(ctx, addends[0], NarrowValueMode::ZeroExtend64); + let rb = input_to_reg(ctx, addends[1], NarrowValueMode::ZeroExtend64); + return MemArg::reg_plus_reg(ra, rb); + } + + // Otherwise, generate add instructions. + let addr = ctx.tmp(RegClass::I64, I64); + + // Get the const into a reg. + lower_constant_u64(ctx, addr.clone(), offset as u64); + + // Add each addend to the address. 
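+ // (Illustrative) with addends [A, B] and offset K, this path produces:
+ // <load K into addr> ; constant load above (typically a movz/movk sequence)
+ // add addr, addr, A
+ // add addr, addr, B
+ // and the returned MemArg is simply [addr].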
+ for addend in addends { + let reg = input_to_reg(ctx, *addend, NarrowValueMode::ZeroExtend64); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: addr.clone(), + rn: addr.to_reg(), + rm: reg.clone(), + }); + } + + MemArg::reg(addr.to_reg()) +} + +fn lower_constant_u64>(ctx: &mut C, rd: Writable, value: u64) { + for inst in Inst::load_constant(rd, value) { + ctx.emit(inst); + } +} + +fn lower_constant_f32>(ctx: &mut C, rd: Writable, value: f32) { + ctx.emit(Inst::load_fp_constant32(rd, value)); +} + +fn lower_constant_f64>(ctx: &mut C, rd: Writable, value: f64) { + ctx.emit(Inst::load_fp_constant64(rd, value)); +} + +fn lower_condcode(cc: IntCC) -> Cond { + match cc { + IntCC::Equal => Cond::Eq, + IntCC::NotEqual => Cond::Ne, + IntCC::SignedGreaterThanOrEqual => Cond::Ge, + IntCC::SignedGreaterThan => Cond::Gt, + IntCC::SignedLessThanOrEqual => Cond::Le, + IntCC::SignedLessThan => Cond::Lt, + IntCC::UnsignedGreaterThanOrEqual => Cond::Hs, + IntCC::UnsignedGreaterThan => Cond::Hi, + IntCC::UnsignedLessThanOrEqual => Cond::Ls, + IntCC::UnsignedLessThan => Cond::Lo, + IntCC::Overflow => Cond::Vs, + IntCC::NotOverflow => Cond::Vc, + } +} + +fn lower_fp_condcode(cc: FloatCC) -> Cond { + // Refer to `codegen/shared/src/condcodes.rs` and to the `FCMP` AArch64 docs. + // The FCMP instruction sets: + // NZCV + // - PCSR.NZCV = 0011 on UN (unordered), + // 0110 on EQ, + // 1000 on LT, + // 0010 on GT. + match cc { + // EQ | LT | GT. Vc => V clear. + FloatCC::Ordered => Cond::Vc, + // UN. Vs => V set. + FloatCC::Unordered => Cond::Vs, + // EQ. Eq => Z set. + FloatCC::Equal => Cond::Eq, + // UN | LT | GT. Ne => Z clear. + FloatCC::NotEqual => Cond::Ne, + // LT | GT. + FloatCC::OrderedNotEqual => unimplemented!(), + // UN | EQ + FloatCC::UnorderedOrEqual => unimplemented!(), + // LT. Mi => N set. + FloatCC::LessThan => Cond::Mi, + // LT | EQ. Ls => C clear or Z set. + FloatCC::LessThanOrEqual => Cond::Ls, + // GT. Gt => Z clear, N = V. + FloatCC::GreaterThan => Cond::Gt, + // GT | EQ. Ge => N = V. + FloatCC::GreaterThanOrEqual => Cond::Ge, + // UN | LT + FloatCC::UnorderedOrLessThan => unimplemented!(), + // UN | LT | EQ + FloatCC::UnorderedOrLessThanOrEqual => unimplemented!(), + // UN | GT + FloatCC::UnorderedOrGreaterThan => unimplemented!(), + // UN | GT | EQ + FloatCC::UnorderedOrGreaterThanOrEqual => unimplemented!(), + } +} + +/// Determines whether this condcode interprets inputs as signed or +/// unsigned. See the documentation for the `icmp` instruction in +/// cranelift-codegen/meta/src/shared/instructions.rs for further insights +/// into this. +pub fn condcode_is_signed(cc: IntCC) -> bool { + match cc { + IntCC::Equal => false, + IntCC::NotEqual => false, + IntCC::SignedGreaterThanOrEqual => true, + IntCC::SignedGreaterThan => true, + IntCC::SignedLessThanOrEqual => true, + IntCC::SignedLessThan => true, + IntCC::UnsignedGreaterThanOrEqual => false, + IntCC::UnsignedGreaterThan => false, + IntCC::UnsignedLessThanOrEqual => false, + IntCC::UnsignedLessThan => false, + IntCC::Overflow => true, + IntCC::NotOverflow => true, + } +} + +//============================================================================= +// Top-level instruction lowering entry point, for one instruction. + +/// Actually codegen an instruction's results into registers. 
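+///
+/// This is one large match over the CLIF opcode: each arm converts the
+/// instruction's inputs and outputs into registers or immediate forms using
+/// the helpers above, and emits one or more AArch64 `Inst`s via the context.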
+fn lower_insn_to_regs>(ctx: &mut C, insn: IRInst) { + let op = ctx.data(insn).opcode(); + let inputs: SmallVec<[InsnInput; 4]> = (0..ctx.num_inputs(insn)) + .map(|i| InsnInput { insn, input: i }) + .collect(); + let outputs: SmallVec<[InsnOutput; 2]> = (0..ctx.num_outputs(insn)) + .map(|i| InsnOutput { insn, output: i }) + .collect(); + let ty = if outputs.len() > 0 { + Some(ctx.output_ty(insn, 0)) + } else { + None + }; + + match op { + Opcode::Iconst | Opcode::Bconst | Opcode::Null => { + let value = output_to_const(ctx, outputs[0]).unwrap(); + let rd = output_to_reg(ctx, outputs[0]); + lower_constant_u64(ctx, rd, value); + } + Opcode::F32const => { + let value = output_to_const_f32(ctx, outputs[0]).unwrap(); + let rd = output_to_reg(ctx, outputs[0]); + lower_constant_f32(ctx, rd, value); + } + Opcode::F64const => { + let value = output_to_const_f64(ctx, outputs[0]).unwrap(); + let rd = output_to_reg(ctx, outputs[0]); + lower_constant_f64(ctx, rd, value); + } + Opcode::Iadd => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_rse_imm12(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::Add32, ALUOp::Add64); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + } + Opcode::Isub => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_rse_imm12(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + } + Opcode::UaddSat | Opcode::SaddSat => { + // We use the vector instruction set's saturating adds (UQADD / + // SQADD), which require vector registers. 
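+ // The scalar inputs are therefore moved into the low 64 bits of two vector
+ // temporaries, the saturating op is performed there, and the result is moved
+ // back into a general-purpose register.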
+ let is_signed = op == Opcode::SaddSat; + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let alu_op = if is_signed { + VecALUOp::SQAddScalar + } else { + VecALUOp::UQAddScalar + }; + let va = ctx.tmp(RegClass::V128, I128); + let vb = ctx.tmp(RegClass::V128, I128); + let ra = input_to_reg(ctx, inputs[0], narrow_mode); + let rb = input_to_reg(ctx, inputs[1], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::MovToVec64 { rd: va, rn: ra }); + ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb }); + ctx.emit(Inst::VecRRR { + rd: va, + rn: va.to_reg(), + rm: vb.to_reg(), + alu_op, + }); + ctx.emit(Inst::MovFromVec64 { + rd, + rn: va.to_reg(), + }); + } + + Opcode::UsubSat | Opcode::SsubSat => { + let is_signed = op == Opcode::SsubSat; + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let alu_op = if is_signed { + VecALUOp::SQSubScalar + } else { + VecALUOp::UQSubScalar + }; + let va = ctx.tmp(RegClass::V128, I128); + let vb = ctx.tmp(RegClass::V128, I128); + let ra = input_to_reg(ctx, inputs[0], narrow_mode); + let rb = input_to_reg(ctx, inputs[1], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::MovToVec64 { rd: va, rn: ra }); + ctx.emit(Inst::MovToVec64 { rd: vb, rn: rb }); + ctx.emit(Inst::VecRRR { + rd: va, + rn: va.to_reg(), + rm: vb.to_reg(), + alu_op, + }); + ctx.emit(Inst::MovFromVec64 { + rd, + rn: va.to_reg(), + }); + } + + Opcode::Ineg => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = zero_reg(); + let rm = input_to_rse_imm12(ctx, inputs[0], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::Sub32, ALUOp::Sub64); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + } + + Opcode::Imul => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::MAdd32, ALUOp::MAdd64); + ctx.emit(Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra: zero_reg(), + }); + } + + Opcode::Umulhi | Opcode::Smulhi => { + let rd = output_to_reg(ctx, outputs[0]); + let is_signed = op == Opcode::Smulhi; + let input_ty = ctx.input_ty(insn, 0); + assert!(ctx.input_ty(insn, 1) == input_ty); + assert!(ctx.output_ty(insn, 0) == input_ty); + + match input_ty { + I64 => { + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let ra = zero_reg(); + let alu_op = if is_signed { + ALUOp::SMulH + } else { + ALUOp::UMulH + }; + ctx.emit(Inst::AluRRRR { + alu_op, + rd, + rn, + rm, + ra, + }); + } + I32 | I16 | I8 => { + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_reg(ctx, inputs[1], narrow_mode); + let ra = zero_reg(); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp::MAdd64, + rd, + rn, + rm, + ra, + }); + let shift_op = if is_signed { + ALUOp::Asr64 + } else { + ALUOp::Lsr64 + }; + let shift_amt = match input_ty { + I32 => 32, + I16 => 16, + I8 => 8, + _ => unreachable!(), + }; + ctx.emit(Inst::AluRRImmShift { + alu_op: shift_op, + rd, + rn: rd.to_reg(), + immshift: ImmShift::maybe_from_u64(shift_amt).unwrap(), + }); + } + _ => { + panic!("Unsupported argument type for umulhi/smulhi: {}", input_ty); + } + } + } + + Opcode::Udiv | 
Opcode::Sdiv | Opcode::Urem | Opcode::Srem => { + let is_signed = match op { + Opcode::Udiv | Opcode::Urem => false, + Opcode::Sdiv | Opcode::Srem => true, + _ => unreachable!(), + }; + let is_rem = match op { + Opcode::Udiv | Opcode::Sdiv => false, + Opcode::Urem | Opcode::Srem => true, + _ => unreachable!(), + }; + let narrow_mode = if is_signed { + NarrowValueMode::SignExtend64 + } else { + NarrowValueMode::ZeroExtend64 + }; + let div_op = if is_signed { + ALUOp::SDiv64 + } else { + ALUOp::UDiv64 + }; + + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + if !is_rem { + let rm = input_to_reg(ctx, inputs[1], narrow_mode); + ctx.emit(Inst::AluRRR { + alu_op: div_op, + rd, + rn, + rm, + }); + } else { + let rm = input_to_reg(ctx, inputs[1], narrow_mode); + // Remainder (rn % rm) is implemented as: + // + // tmp = rn / rm + // rd = rn - (tmp*rm) + // + // use 'rd' for tmp and you have: + // + // div rd, rn, rm ; rd = rn / rm + // msub rd, rd, rm, rn ; rd = rn - rd * rm + ctx.emit(Inst::AluRRR { + alu_op: div_op, + rd, + rn, + rm, + }); + ctx.emit(Inst::AluRRRR { + alu_op: ALUOp::MSub64, + rd: rd, + rn: rd.to_reg(), + rm: rm, + ra: rn, + }); + } + } + + Opcode::Uextend | Opcode::Sextend => { + let output_ty = ty.unwrap(); + let input_ty = ctx.input_ty(insn, 0); + let from_bits = ty_bits(input_ty) as u8; + let to_bits = ty_bits(output_ty) as u8; + let to_bits = std::cmp::max(32, to_bits); + assert!(from_bits <= to_bits); + if from_bits < to_bits { + let signed = op == Opcode::Sextend; + // If we reach this point, we weren't able to incorporate the extend as + // a register-mode on another instruction, so we have a 'None' + // narrow-value/extend mode here, and we emit the explicit instruction. + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::Extend { + rd, + rn, + signed, + from_bits, + to_bits, + }); + } + } + + Opcode::Bnot => { + let rd = output_to_reg(ctx, outputs[0]); + let rm = input_to_rs_immlogic(ctx, inputs[0], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64); + // NOT rd, rm ==> ORR_NOT rd, zero, rm + ctx.emit(alu_inst_immlogic(alu_op, rd, zero_reg(), rm)); + } + + Opcode::Band + | Opcode::Bor + | Opcode::Bxor + | Opcode::BandNot + | Opcode::BorNot + | Opcode::BxorNot => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_rs_immlogic(ctx, inputs[1], NarrowValueMode::None); + let ty = ty.unwrap(); + let alu_op = match op { + Opcode::Band => choose_32_64(ty, ALUOp::And32, ALUOp::And64), + Opcode::Bor => choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64), + Opcode::Bxor => choose_32_64(ty, ALUOp::Eor32, ALUOp::Eor64), + Opcode::BandNot => choose_32_64(ty, ALUOp::AndNot32, ALUOp::AndNot64), + Opcode::BorNot => choose_32_64(ty, ALUOp::OrrNot32, ALUOp::OrrNot64), + Opcode::BxorNot => choose_32_64(ty, ALUOp::EorNot32, ALUOp::EorNot64), + _ => unreachable!(), + }; + ctx.emit(alu_inst_immlogic(alu_op, rd, rn, rm)); + } + + Opcode::Ishl | Opcode::Ushr | Opcode::Sshr => { + let ty = ty.unwrap(); + let size = InstSize::from_bits(ty_bits(ty)); + let narrow_mode = match (op, size) { + (Opcode::Ishl, _) => NarrowValueMode::None, + (Opcode::Ushr, InstSize::Size64) => NarrowValueMode::ZeroExtend64, + (Opcode::Ushr, InstSize::Size32) => NarrowValueMode::ZeroExtend32, + (Opcode::Sshr, InstSize::Size64) => NarrowValueMode::SignExtend64, + 
(Opcode::Sshr, InstSize::Size32) => NarrowValueMode::SignExtend32, + _ => unreachable!(), + }; + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_reg_immshift(ctx, inputs[1]); + let alu_op = match op { + Opcode::Ishl => choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64), + Opcode::Ushr => choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64), + Opcode::Sshr => choose_32_64(ty, ALUOp::Asr32, ALUOp::Asr64), + _ => unreachable!(), + }; + ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm)); + } + + Opcode::Rotr => { + // For a 32-bit or 64-bit rotate-right, we can use the ROR + // instruction directly. + // + // For a < 32-bit rotate-right, we synthesize this as: + // + // rotr rd, rn, rm + // + // => + // + // zero-extend rn, <32-or-64> + // sub tmp1, rm, + // sub tmp1, zero, tmp1 ; neg + // lsr tmp2, rn, rm + // lsl rd, rn, tmp1 + // orr rd, rd, tmp2 + // + // For a constant amount, we can instead do: + // + // zero-extend rn, <32-or-64> + // lsr tmp2, rn, # + // lsl rd, rn, + // orr rd, rd, tmp2 + + let ty = ty.unwrap(); + let bits = ty_bits(ty); + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg( + ctx, + inputs[0], + if bits <= 32 { + NarrowValueMode::ZeroExtend32 + } else { + NarrowValueMode::ZeroExtend64 + }, + ); + let rm = input_to_reg_immshift(ctx, inputs[1]); + + if bits == 32 || bits == 64 { + let alu_op = choose_32_64(ty, ALUOp::RotR32, ALUOp::RotR64); + ctx.emit(alu_inst_immshift(alu_op, rd, rn, rm)); + } else { + assert!(bits < 32); + match rm { + ResultRegImmShift::Reg(reg) => { + let tmp1 = ctx.tmp(RegClass::I64, I32); + let tmp2 = ctx.tmp(RegClass::I64, I32); + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::Sub32, + rd: tmp1, + rn: reg, + imm12: Imm12::maybe_from_u64(bits as u64).unwrap(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub32, + rd: tmp1, + rn: zero_reg(), + rm: tmp1.to_reg(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Lsr32, + rd: tmp2, + rn: rn, + rm: reg, + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Lsl32, + rd: rd, + rn: rn, + rm: tmp1.to_reg(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Orr32, + rd: rd, + rn: rd.to_reg(), + rm: tmp2.to_reg(), + }); + } + ResultRegImmShift::ImmShift(immshift) => { + let tmp1 = ctx.tmp(RegClass::I64, I32); + let amt = immshift.value(); + assert!(amt <= bits as u8); + let opp_shift = ImmShift::maybe_from_u64(bits as u64 - amt as u64).unwrap(); + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsr32, + rd: tmp1, + rn: rn, + immshift: immshift, + }); + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsl32, + rd: rd, + rn: rn, + immshift: opp_shift, + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Orr32, + rd: rd, + rn: rd.to_reg(), + rm: tmp1.to_reg(), + }); + } + } + } + } + + Opcode::Rotl => { + // AArch64 does not have a ROL instruction, so we always synthesize + // this as: + // + // rotl rd, rn, rm + // + // => + // + // zero-extend rn, <32-or-64> + // sub tmp1, rm, + // sub tmp1, zero, tmp1 ; neg + // lsl tmp2, rn, rm + // lsr rd, rn, tmp1 + // orr rd, rd, tmp2 + // + // For a constant amount, we can instead do: + // + // zero-extend rn, <32-or-64> + // lsl tmp2, rn, # + // lsr rd, rn, # + // orr rd, rd, tmp2 + + let ty = ty.unwrap(); + let bits = ty_bits(ty); + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg( + ctx, + inputs[0], + if bits <= 32 { + NarrowValueMode::ZeroExtend32 + } else { + NarrowValueMode::ZeroExtend64 + }, + ); + let rm = input_to_reg_immshift(ctx, inputs[1]); + + match rm { + ResultRegImmShift::Reg(reg) => 
{ + let tmp1 = ctx.tmp(RegClass::I64, I32); + let tmp2 = ctx.tmp(RegClass::I64, I64); + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::Sub32, + rd: tmp1, + rn: reg, + imm12: Imm12::maybe_from_u64(bits as u64).unwrap(), + }); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub32, + rd: tmp1, + rn: zero_reg(), + rm: tmp1.to_reg(), + }); + ctx.emit(Inst::AluRRR { + alu_op: choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64), + rd: tmp2, + rn: rn, + rm: reg, + }); + ctx.emit(Inst::AluRRR { + alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64), + rd: rd, + rn: rn, + rm: tmp1.to_reg(), + }); + ctx.emit(Inst::AluRRR { + alu_op: choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64), + rd: rd, + rn: rd.to_reg(), + rm: tmp2.to_reg(), + }); + } + ResultRegImmShift::ImmShift(immshift) => { + let tmp1 = ctx.tmp(RegClass::I64, I64); + let amt = immshift.value(); + assert!(amt <= bits as u8); + let opp_shift = ImmShift::maybe_from_u64(bits as u64 - amt as u64).unwrap(); + ctx.emit(Inst::AluRRImmShift { + alu_op: choose_32_64(ty, ALUOp::Lsl32, ALUOp::Lsl64), + rd: tmp1, + rn: rn, + immshift: immshift, + }); + ctx.emit(Inst::AluRRImmShift { + alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64), + rd: rd, + rn: rn, + immshift: opp_shift, + }); + ctx.emit(Inst::AluRRR { + alu_op: choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64), + rd: rd, + rn: rd.to_reg(), + rm: tmp1.to_reg(), + }); + } + } + } + + Opcode::Bitrev | Opcode::Clz | Opcode::Cls => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let op = BitOp::from((op, ty.unwrap())); + ctx.emit(Inst::BitRR { rd, rn, op }); + } + + Opcode::Ctz => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let op = BitOp::from((Opcode::Bitrev, ty.unwrap())); + ctx.emit(Inst::BitRR { rd, rn, op }); + let op = BitOp::from((Opcode::Clz, ty.unwrap())); + ctx.emit(Inst::BitRR { + rd, + rn: rd.to_reg(), + op, + }); + } + + Opcode::Popcnt => { + // Lower popcount using the following algorithm: + // + // x -= (x >> 1) & 0x5555555555555555 + // x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333) + // x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f + // x += x << 8 + // x += x << 16 + // x += x << 32 + // x >> 56 + let ty = ty.unwrap(); + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let tmp = ctx.tmp(RegClass::I64, I64); + + // If this is a 32-bit Popcnt, use Lsr32 to clear the top 32 bits of the register, then + // the rest of the code is identical to the 64-bit version. 
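+ //
+ // As a small worked example of the first step (one byte, illustrative):
+ // x = 0b1101_1011; (x >> 1) & 0x55 = 0b0100_0101; x - that = 0b1001_0110,
+ // i.e. the per-2-bit-pair population counts 10|01|01|10.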
+ // lsr [wx]d, [wx]n, #1 + ctx.emit(Inst::AluRRImmShift { + alu_op: choose_32_64(ty, ALUOp::Lsr32, ALUOp::Lsr64), + rd: rd, + rn: rn, + immshift: ImmShift::maybe_from_u64(1).unwrap(), + }); + + // and xd, xd, #0x5555555555555555 + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: rd, + rn: rd.to_reg(), + imml: ImmLogic::maybe_from_u64(0x5555555555555555, I64).unwrap(), + }); + + // sub xd, xn, xd + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd: rd, + rn: rn, + rm: rd.to_reg(), + }); + + // and xt, xd, #0x3333333333333333 + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: tmp, + rn: rd.to_reg(), + imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(), + }); + + // lsr xd, xd, #2 + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsr64, + rd: rd, + rn: rd.to_reg(), + immshift: ImmShift::maybe_from_u64(2).unwrap(), + }); + + // and xd, xd, #0x3333333333333333 + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: rd, + rn: rd.to_reg(), + imml: ImmLogic::maybe_from_u64(0x3333333333333333, I64).unwrap(), + }); + + // add xt, xd, xt + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Add64, + rd: tmp, + rn: rd.to_reg(), + rm: tmp.to_reg(), + }); + + // add xt, xt, xt LSR #4 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSR, + ShiftOpShiftImm::maybe_from_shift(4).unwrap(), + ), + }); + + // and xt, xt, #0x0f0f0f0f0f0f0f0f + ctx.emit(Inst::AluRRImmLogic { + alu_op: ALUOp::And64, + rd: tmp, + rn: tmp.to_reg(), + imml: ImmLogic::maybe_from_u64(0x0f0f0f0f0f0f0f0f, I64).unwrap(), + }); + + // add xt, xt, xt, LSL #8 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(8).unwrap(), + ), + }); + + // add xt, xt, xt, LSL #16 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(16).unwrap(), + ), + }); + + // add xt, xt, xt, LSL #32 + ctx.emit(Inst::AluRRRShift { + alu_op: ALUOp::Add64, + rd: tmp, + rn: tmp.to_reg(), + rm: tmp.to_reg(), + shiftop: ShiftOpAndAmt::new( + ShiftOp::LSL, + ShiftOpShiftImm::maybe_from_shift(32).unwrap(), + ), + }); + + // lsr xd, xt, #56 + ctx.emit(Inst::AluRRImmShift { + alu_op: ALUOp::Lsr64, + rd: rd, + rn: tmp.to_reg(), + immshift: ImmShift::maybe_from_u64(56).unwrap(), + }); + } + + Opcode::Load + | Opcode::Uload8 + | Opcode::Sload8 + | Opcode::Uload16 + | Opcode::Sload16 + | Opcode::Uload32 + | Opcode::Sload32 + | Opcode::LoadComplex + | Opcode::Uload8Complex + | Opcode::Sload8Complex + | Opcode::Uload16Complex + | Opcode::Sload16Complex + | Opcode::Uload32Complex + | Opcode::Sload32Complex => { + let off = ldst_offset(ctx.data(insn)).unwrap(); + let elem_ty = match op { + Opcode::Sload8 | Opcode::Uload8 | Opcode::Sload8Complex | Opcode::Uload8Complex => { + I8 + } + Opcode::Sload16 + | Opcode::Uload16 + | Opcode::Sload16Complex + | Opcode::Uload16Complex => I16, + Opcode::Sload32 + | Opcode::Uload32 + | Opcode::Sload32Complex + | Opcode::Uload32Complex => I32, + Opcode::Load | Opcode::LoadComplex => ctx.output_ty(insn, 0), + _ => unreachable!(), + }; + let sign_extend = match op { + Opcode::Sload8 + | Opcode::Sload8Complex + | Opcode::Sload16 + | Opcode::Sload16Complex + | Opcode::Sload32 + | Opcode::Sload32Complex => true, + _ => false, + }; + let is_float = 
ty_is_float(elem_ty); + + let mem = lower_address(ctx, elem_ty, &inputs[..], off); + let rd = output_to_reg(ctx, outputs[0]); + + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + + ctx.emit(match (ty_bits(elem_ty), sign_extend, is_float) { + (1, _, _) => Inst::ULoad8 { rd, mem, srcloc }, + (8, false, _) => Inst::ULoad8 { rd, mem, srcloc }, + (8, true, _) => Inst::SLoad8 { rd, mem, srcloc }, + (16, false, _) => Inst::ULoad16 { rd, mem, srcloc }, + (16, true, _) => Inst::SLoad16 { rd, mem, srcloc }, + (32, false, false) => Inst::ULoad32 { rd, mem, srcloc }, + (32, true, false) => Inst::SLoad32 { rd, mem, srcloc }, + (32, _, true) => Inst::FpuLoad32 { rd, mem, srcloc }, + (64, _, false) => Inst::ULoad64 { rd, mem, srcloc }, + (64, _, true) => Inst::FpuLoad64 { rd, mem, srcloc }, + _ => panic!("Unsupported size in load"), + }); + } + + Opcode::Store + | Opcode::Istore8 + | Opcode::Istore16 + | Opcode::Istore32 + | Opcode::StoreComplex + | Opcode::Istore8Complex + | Opcode::Istore16Complex + | Opcode::Istore32Complex => { + let off = ldst_offset(ctx.data(insn)).unwrap(); + let elem_ty = match op { + Opcode::Istore8 | Opcode::Istore8Complex => I8, + Opcode::Istore16 | Opcode::Istore16Complex => I16, + Opcode::Istore32 | Opcode::Istore32Complex => I32, + Opcode::Store | Opcode::StoreComplex => ctx.input_ty(insn, 0), + _ => unreachable!(), + }; + let is_float = ty_is_float(elem_ty); + + let mem = lower_address(ctx, elem_ty, &inputs[1..], off); + let rd = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + + let memflags = ctx.memflags(insn).expect("memory flags"); + let srcloc = if !memflags.notrap() { + Some(ctx.srcloc(insn)) + } else { + None + }; + + ctx.emit(match (ty_bits(elem_ty), is_float) { + (1, _) | (8, _) => Inst::Store8 { rd, mem, srcloc }, + (16, _) => Inst::Store16 { rd, mem, srcloc }, + (32, false) => Inst::Store32 { rd, mem, srcloc }, + (32, true) => Inst::FpuStore32 { rd, mem, srcloc }, + (64, false) => Inst::Store64 { rd, mem, srcloc }, + (64, true) => Inst::FpuStore64 { rd, mem, srcloc }, + _ => panic!("Unsupported size in store"), + }); + } + + Opcode::StackLoad | Opcode::StackStore | Opcode::StackAddr => { + panic!("Direct stack memory access not supported; should not be used by Wasm"); + } + + Opcode::HeapAddr => { + panic!("heap_addr should have been removed by legalization!"); + } + + Opcode::TableAddr => { + panic!("table_addr should have been removed by legalization!"); + } + + Opcode::Nop => { + // Nothing. + } + + Opcode::Select | Opcode::Selectif => { + let cond = if op == Opcode::Select { + let (cmp_op, narrow_mode) = if ty_bits(ctx.input_ty(insn, 0)) > 32 { + (ALUOp::SubS64, NarrowValueMode::ZeroExtend64) + } else { + (ALUOp::SubS32, NarrowValueMode::ZeroExtend32) + }; + + let rcond = input_to_reg(ctx, inputs[0], narrow_mode); + // cmp rcond, #0 + ctx.emit(Inst::AluRRR { + alu_op: cmp_op, + rd: writable_zero_reg(), + rn: rcond, + rm: zero_reg(), + }); + Cond::Ne + } else { + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + // Verification ensures that the input is always a + // single-def ifcmp. 
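+ // We lower that ifcmp directly into a flags-setting compare (a SUBS
+ // against the zero register) and then select on its condition below.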
+ let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + cond + }; + + // csel.COND rd, rn, rm + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[2], NarrowValueMode::None); + let ty = ctx.output_ty(insn, 0); + let bits = ty_bits(ty); + if ty_is_float(ty) && bits == 32 { + ctx.emit(Inst::FpuCSel32 { cond, rd, rn, rm }); + } else if ty_is_float(ty) && bits == 64 { + ctx.emit(Inst::FpuCSel64 { cond, rd, rn, rm }); + } else { + ctx.emit(Inst::CSel { cond, rd, rn, rm }); + } + } + + Opcode::Bitselect => { + let tmp = ctx.tmp(RegClass::I64, I64); + let rd = output_to_reg(ctx, outputs[0]); + let rcond = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rn = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[2], NarrowValueMode::None); + // AND rTmp, rn, rcond + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::And64, + rd: tmp, + rn, + rm: rcond, + }); + // BIC rd, rm, rcond + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::AndNot64, + rd, + rn: rm, + rm: rcond, + }); + // ORR rd, rd, rTmp + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Orr64, + rd, + rn: rd.to_reg(), + rm: tmp.to_reg(), + }); + } + + Opcode::Trueif => { + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + // Verification ensures that the input is always a + // single-def ifcmp. + let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::CSet { rd, cond }); + } + + Opcode::Trueff => { + let condcode = inst_fp_condcode(ctx.data(insn)).unwrap(); + let cond = lower_fp_condcode(condcode); + let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap(); + lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::CSet { rd, cond }); + } + + Opcode::IsNull | Opcode::IsInvalid => { + panic!("Reference types not supported"); + } + + Opcode::Copy => { + let rd = output_to_reg(ctx, outputs[0]); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let ty = ctx.input_ty(insn, 0); + ctx.emit(Inst::gen_move(rd, rn, ty)); + } + + Opcode::Bint | Opcode::Breduce | Opcode::Bextend | Opcode::Ireduce => { + // All of these ops are simply a move from a zero-extended source. + // Here is why this works, in each case: + // + // - Bint: Bool-to-int. We always represent a bool as a 0 or 1, so we + // merely need to zero-extend here. + // + // - Breduce, Bextend: changing width of a boolean. We represent a + // bool as a 0 or 1, so again, this is a zero-extend / no-op. + // + // - Ireduce: changing width of an integer. Smaller ints are stored + // with undefined high-order bits, so we can simply do a copy. + + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); + let rd = output_to_reg(ctx, outputs[0]); + let ty = ctx.input_ty(insn, 0); + ctx.emit(Inst::gen_move(rd, rn, ty)); + } + + Opcode::Bmask => { + // Bool is {0, 1}, so we can subtract from 0 to get all-1s. 
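+ // e.g. an input of 1 gives 0 - 1 = 0xFFFF_FFFF_FFFF_FFFF (all ones), and an
+ // input of 0 stays 0.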
+ let rd = output_to_reg(ctx, outputs[0]); + let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::Sub64, + rd, + rn: zero_reg(), + rm, + }); + } + + Opcode::Bitcast => { + let rd = output_to_reg(ctx, outputs[0]); + let ity = ctx.input_ty(insn, 0); + let oty = ctx.output_ty(insn, 0); + match (ty_is_float(ity), ty_is_float(oty)) { + (true, true) => { + let narrow_mode = if ty_bits(ity) <= 32 && ty_bits(oty) <= 32 { + NarrowValueMode::ZeroExtend32 + } else { + NarrowValueMode::ZeroExtend64 + }; + let rm = input_to_reg(ctx, inputs[0], narrow_mode); + ctx.emit(Inst::gen_move(rd, rm, oty)); + } + (false, false) => { + let rm = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::gen_move(rd, rm, oty)); + } + (false, true) => { + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); + ctx.emit(Inst::MovToVec64 { rd, rn }); + } + (true, false) => { + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + ctx.emit(Inst::MovFromVec64 { rd, rn }); + } + } + } + + Opcode::FallthroughReturn | Opcode::Return => { + for (i, input) in inputs.iter().enumerate() { + // N.B.: according to the AArch64 ABI, the top bits of a register + // (above the bits for the value's type) are undefined, so we + // need not extend the return values. + let reg = input_to_reg(ctx, *input, NarrowValueMode::None); + let retval_reg = ctx.retval(i); + let ty = ctx.input_ty(insn, i); + ctx.emit(Inst::gen_move(retval_reg, reg, ty)); + } + // N.B.: the Ret itself is generated by the ABI. + } + + Opcode::Ifcmp | Opcode::Ffcmp => { + // An Ifcmp/Ffcmp must always be seen as a use of a brif/brff or trueif/trueff + // instruction. This will always be the case as long as the IR uses an Ifcmp/Ffcmp from + // the same block, or a dominating block. In other words, it cannot pass through a BB + // param (phi). The flags pass of the verifier will ensure this. 
+ panic!("Should never reach ifcmp as isel root!"); + } + + Opcode::Icmp => { + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + let ty = ctx.input_ty(insn, 0); + let bits = ty_bits(ty); + let narrow_mode = match (bits <= 32, is_signed) { + (true, true) => NarrowValueMode::SignExtend32, + (true, false) => NarrowValueMode::ZeroExtend32, + (false, true) => NarrowValueMode::SignExtend64, + (false, false) => NarrowValueMode::ZeroExtend64, + }; + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(alu_inst_imm12(alu_op, writable_zero_reg(), rn, rm)); + ctx.emit(Inst::CondSet { cond, rd }); + } + + Opcode::Fcmp => { + let condcode = inst_fp_condcode(ctx.data(insn)).unwrap(); + let cond = lower_fp_condcode(condcode); + let ty = ctx.input_ty(insn, 0); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + match ty_bits(ty) { + 32 => { + ctx.emit(Inst::FpuCmp32 { rn, rm }); + } + 64 => { + ctx.emit(Inst::FpuCmp64 { rn, rm }); + } + _ => panic!("Bad float size"), + } + ctx.emit(Inst::CondSet { cond, rd }); + } + + Opcode::JumpTableEntry | Opcode::JumpTableBase => { + panic!("Should not appear: we handle BrTable directly"); + } + + Opcode::Debugtrap => { + ctx.emit(Inst::Brk); + } + + Opcode::Trap => { + let trap_info = (ctx.srcloc(insn), inst_trapcode(ctx.data(insn)).unwrap()); + ctx.emit(Inst::Udf { trap_info }) + } + + Opcode::Trapif | Opcode::Trapff => { + let trap_info = (ctx.srcloc(insn), inst_trapcode(ctx.data(insn)).unwrap()); + + let cond = if op == Opcode::Trapif { + let condcode = inst_condcode(ctx.data(insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + + // Verification ensures that the input is always a single-def ifcmp. + let ifcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ifcmp).unwrap(); + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + cond + } else { + let condcode = inst_fp_condcode(ctx.data(insn)).unwrap(); + let cond = lower_fp_condcode(condcode); + + // Verification ensures that the input is always a + // single-def ffcmp. + let ffcmp_insn = maybe_input_insn(ctx, inputs[0], Opcode::Ffcmp).unwrap(); + lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); + cond + }; + + // Branch around the break instruction with inverted cond. Go straight to lowered + // one-target form; this is logically part of a single-in single-out template lowering. 
+ let cond = cond.invert(); + ctx.emit(Inst::CondBrLowered { + target: BranchTarget::ResolvedOffset(8), + kind: CondBrKind::Cond(cond), + }); + + ctx.emit(Inst::Udf { trap_info }) + } + + Opcode::Safepoint => { + panic!("safepoint support not implemented!"); + } + + Opcode::Trapz | Opcode::Trapnz => { + panic!("trapz / trapnz should have been removed by legalization!"); + } + + Opcode::ResumableTrap => { + panic!("Resumable traps not supported"); + } + + Opcode::FuncAddr => { + let rd = output_to_reg(ctx, outputs[0]); + let extname = ctx.call_target(insn).unwrap().clone(); + let loc = ctx.srcloc(insn); + ctx.emit(Inst::LoadExtName { + rd, + name: extname, + srcloc: loc, + offset: 0, + }); + } + + Opcode::GlobalValue => { + panic!("global_value should have been removed by legalization!"); + } + + Opcode::SymbolValue => { + let rd = output_to_reg(ctx, outputs[0]); + let (extname, offset) = ctx.symbol_value(insn).unwrap(); + let extname = extname.clone(); + let loc = ctx.srcloc(insn); + ctx.emit(Inst::LoadExtName { + rd, + name: extname, + srcloc: loc, + offset, + }); + } + + Opcode::Call | Opcode::CallIndirect => { + let loc = ctx.srcloc(insn); + let (abi, inputs) = match op { + Opcode::Call => { + let extname = ctx.call_target(insn).unwrap(); + let extname = extname.clone(); + let sig = ctx.call_sig(insn).unwrap(); + assert!(inputs.len() == sig.params.len()); + assert!(outputs.len() == sig.returns.len()); + (AArch64ABICall::from_func(sig, &extname, loc), &inputs[..]) + } + Opcode::CallIndirect => { + let ptr = input_to_reg(ctx, inputs[0], NarrowValueMode::ZeroExtend64); + let sig = ctx.call_sig(insn).unwrap(); + assert!(inputs.len() - 1 == sig.params.len()); + assert!(outputs.len() == sig.returns.len()); + (AArch64ABICall::from_ptr(sig, ptr, loc, op), &inputs[1..]) + } + _ => unreachable!(), + }; + + for inst in abi.gen_stack_pre_adjust().into_iter() { + ctx.emit(inst); + } + assert!(inputs.len() == abi.num_args()); + for (i, input) in inputs.iter().enumerate() { + let arg_reg = input_to_reg(ctx, *input, NarrowValueMode::None); + ctx.emit(abi.gen_copy_reg_to_arg(i, arg_reg)); + } + for inst in abi.gen_call().into_iter() { + ctx.emit(inst); + } + for (i, output) in outputs.iter().enumerate() { + let retval_reg = output_to_reg(ctx, *output); + ctx.emit(abi.gen_copy_retval_to_reg(i, retval_reg)); + } + for inst in abi.gen_stack_post_adjust().into_iter() { + ctx.emit(inst); + } + } + + Opcode::GetPinnedReg + | Opcode::SetPinnedReg + | Opcode::Spill + | Opcode::Fill + | Opcode::FillNop + | Opcode::Regmove + | Opcode::CopySpecial + | Opcode::CopyToSsa + | Opcode::CopyNop + | Opcode::AdjustSpDown + | Opcode::AdjustSpUpImm + | Opcode::AdjustSpDownImm + | Opcode::IfcmpSp + | Opcode::Regspill + | Opcode::Regfill => { + panic!("Unused opcode should not be encountered."); + } + + Opcode::Jump + | Opcode::Fallthrough + | Opcode::Brz + | Opcode::Brnz + | Opcode::BrIcmp + | Opcode::Brif + | Opcode::Brff + | Opcode::IndirectJumpTableBr + | Opcode::BrTable => { + panic!("Branch opcode reached non-branch lowering logic!"); + } + + Opcode::Vconst + | Opcode::Shuffle + | Opcode::Vsplit + | Opcode::Vconcat + | Opcode::Vselect + | Opcode::VanyTrue + | Opcode::VallTrue + | Opcode::Splat + | Opcode::Insertlane + | Opcode::Extractlane + | Opcode::RawBitcast + | Opcode::ScalarToVector + | Opcode::Swizzle + | Opcode::Uload8x8 + | Opcode::Sload8x8 + | Opcode::Uload16x4 + | Opcode::Sload16x4 + | Opcode::Uload32x2 + | Opcode::Sload32x2 => { + // TODO + panic!("Vector ops not implemented."); + } + + Opcode::Isplit | 
Opcode::Iconcat => panic!("Vector ops not supported."), + Opcode::Imax | Opcode::Imin | Opcode::Umin | Opcode::Umax => { + panic!("Vector ops not supported.") + } + + Opcode::Fadd | Opcode::Fsub | Opcode::Fmul | Opcode::Fdiv | Opcode::Fmin | Opcode::Fmax => { + let bits = ty_bits(ctx.output_ty(insn, 0)); + let fpu_op = match (op, bits) { + (Opcode::Fadd, 32) => FPUOp2::Add32, + (Opcode::Fadd, 64) => FPUOp2::Add64, + (Opcode::Fsub, 32) => FPUOp2::Sub32, + (Opcode::Fsub, 64) => FPUOp2::Sub64, + (Opcode::Fmul, 32) => FPUOp2::Mul32, + (Opcode::Fmul, 64) => FPUOp2::Mul64, + (Opcode::Fdiv, 32) => FPUOp2::Div32, + (Opcode::Fdiv, 64) => FPUOp2::Div64, + (Opcode::Fmin, 32) => FPUOp2::Min32, + (Opcode::Fmin, 64) => FPUOp2::Min64, + (Opcode::Fmax, 32) => FPUOp2::Max32, + (Opcode::Fmax, 64) => FPUOp2::Max64, + _ => panic!("Unknown op/bits combination"), + }; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuRRR { fpu_op, rd, rn, rm }); + } + + Opcode::Sqrt | Opcode::Fneg | Opcode::Fabs | Opcode::Fpromote | Opcode::Fdemote => { + let bits = ty_bits(ctx.output_ty(insn, 0)); + let fpu_op = match (op, bits) { + (Opcode::Sqrt, 32) => FPUOp1::Sqrt32, + (Opcode::Sqrt, 64) => FPUOp1::Sqrt64, + (Opcode::Fneg, 32) => FPUOp1::Neg32, + (Opcode::Fneg, 64) => FPUOp1::Neg64, + (Opcode::Fabs, 32) => FPUOp1::Abs32, + (Opcode::Fabs, 64) => FPUOp1::Abs64, + (Opcode::Fpromote, 32) => panic!("Cannot promote to 32 bits"), + (Opcode::Fpromote, 64) => FPUOp1::Cvt32To64, + (Opcode::Fdemote, 32) => FPUOp1::Cvt64To32, + (Opcode::Fdemote, 64) => panic!("Cannot demote to 64 bits"), + _ => panic!("Unknown op/bits combination"), + }; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuRR { fpu_op, rd, rn }); + } + + Opcode::Ceil | Opcode::Floor | Opcode::Trunc | Opcode::Nearest => { + let bits = ty_bits(ctx.output_ty(insn, 0)); + let op = match (op, bits) { + (Opcode::Ceil, 32) => FpuRoundMode::Plus32, + (Opcode::Ceil, 64) => FpuRoundMode::Plus64, + (Opcode::Floor, 32) => FpuRoundMode::Minus32, + (Opcode::Floor, 64) => FpuRoundMode::Minus64, + (Opcode::Trunc, 32) => FpuRoundMode::Zero32, + (Opcode::Trunc, 64) => FpuRoundMode::Zero64, + (Opcode::Nearest, 32) => FpuRoundMode::Nearest32, + (Opcode::Nearest, 64) => FpuRoundMode::Nearest64, + _ => panic!("Unknown op/bits combination"), + }; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuRound { op, rd, rn }); + } + + Opcode::Fma => { + let bits = ty_bits(ctx.output_ty(insn, 0)); + let fpu_op = match bits { + 32 => FPUOp3::MAdd32, + 64 => FPUOp3::MAdd64, + _ => panic!("Unknown op size"), + }; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let ra = input_to_reg(ctx, inputs[2], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuRRRR { + fpu_op, + rn, + rm, + ra, + rd, + }); + } + + Opcode::Fcopysign => { + // Copy the sign bit from inputs[1] to inputs[0]. 
We use the following sequence: + // + // (64 bits for example, 32-bit sequence is analogous): + // + // MOV Xtmp1, Dinput0 + // MOV Xtmp2, Dinput1 + // AND Xtmp2, 0x8000_0000_0000_0000 + // ORR Xtmp1, Xtmp1, Xtmp2 + // MOV Doutput, Xtmp1 + + let ty = ctx.output_ty(insn, 0); + let bits = ty_bits(ty); + assert!(bits == 32 || bits == 64); + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + let tmp1 = ctx.tmp(RegClass::I64, I64); + let tmp2 = ctx.tmp(RegClass::I64, I64); + ctx.emit(Inst::MovFromVec64 { rd: tmp1, rn: rn }); + ctx.emit(Inst::MovFromVec64 { rd: tmp2, rn: rm }); + let imml = if bits == 32 { + ImmLogic::from_raw( + /* value = */ 0x8000_0000, + /* n = */ false, + /* r = */ 1, + /* s = */ 0, + ) + } else { + ImmLogic::from_raw( + /* value = */ 0x8000_0000_0000_0000, + /* n = */ true, + /* r = */ 1, + /* s = */ 0, + ) + }; + let alu_op = choose_32_64(ty, ALUOp::And32, ALUOp::And64); + ctx.emit(Inst::AluRRImmLogic { + alu_op, + rd: tmp2, + rn: tmp2.to_reg(), + imml, + }); + let alu_op = choose_32_64(ty, ALUOp::Orr32, ALUOp::Orr64); + ctx.emit(Inst::AluRRR { + alu_op, + rd: tmp1, + rn: tmp1.to_reg(), + rm: tmp2.to_reg(), + }); + ctx.emit(Inst::MovToVec64 { + rd, + rn: tmp1.to_reg(), + }); + } + + Opcode::FcvtToUint | Opcode::FcvtToSint => { + let in_bits = ty_bits(ctx.input_ty(insn, 0)); + let out_bits = ty_bits(ctx.output_ty(insn, 0)); + let signed = op == Opcode::FcvtToSint; + let op = match (signed, in_bits, out_bits) { + (false, 32, 32) => FpuToIntOp::F32ToU32, + (true, 32, 32) => FpuToIntOp::F32ToI32, + (false, 32, 64) => FpuToIntOp::F32ToU64, + (true, 32, 64) => FpuToIntOp::F32ToI64, + (false, 64, 32) => FpuToIntOp::F64ToU32, + (true, 64, 32) => FpuToIntOp::F64ToI32, + (false, 64, 64) => FpuToIntOp::F64ToU64, + (true, 64, 64) => FpuToIntOp::F64ToI64, + _ => panic!("Unknown input/output-bits combination"), + }; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::FpuToInt { op, rd, rn }); + } + + Opcode::FcvtFromUint | Opcode::FcvtFromSint => { + let in_bits = ty_bits(ctx.input_ty(insn, 0)); + let out_bits = ty_bits(ctx.output_ty(insn, 0)); + let signed = op == Opcode::FcvtFromSint; + let op = match (signed, in_bits, out_bits) { + (false, 32, 32) => IntToFpuOp::U32ToF32, + (true, 32, 32) => IntToFpuOp::I32ToF32, + (false, 32, 64) => IntToFpuOp::U32ToF64, + (true, 32, 64) => IntToFpuOp::I32ToF64, + (false, 64, 32) => IntToFpuOp::U64ToF32, + (true, 64, 32) => IntToFpuOp::I64ToF32, + (false, 64, 64) => IntToFpuOp::U64ToF64, + (true, 64, 64) => IntToFpuOp::I64ToF64, + _ => panic!("Unknown input/output-bits combination"), + }; + let narrow_mode = match (signed, in_bits) { + (false, 32) => NarrowValueMode::ZeroExtend32, + (true, 32) => NarrowValueMode::SignExtend32, + (false, 64) => NarrowValueMode::ZeroExtend64, + (true, 64) => NarrowValueMode::SignExtend64, + _ => panic!("Unknown input size"), + }; + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rd = output_to_reg(ctx, outputs[0]); + ctx.emit(Inst::IntToFpu { op, rd, rn }); + } + + Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => { + let in_ty = ctx.input_ty(insn, 0); + let in_bits = ty_bits(in_ty); + let out_ty = ctx.output_ty(insn, 0); + let out_bits = ty_bits(out_ty); + let out_signed = op == Opcode::FcvtToSintSat; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rd = output_to_reg(ctx, outputs[0]); + + // FIMM 
Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX + // FMIN Vtmp2, Vin, Vtmp1 + // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN + // FMAX Vtmp2, Vtmp2, Vtmp + // FCMP Vin, Vin + // FCSEL Vtmp2, Vtmp1, Vtmp2, NE // on NaN, select 0 + // convert Rout, Vtmp2 + + assert!(in_bits == 32 || in_bits == 64); + assert!(out_bits == 32 || out_bits == 64); + + let min: f64 = match (out_bits, out_signed) { + (32, true) => std::i32::MIN as f64, + (32, false) => 0.0, + (64, true) => std::i64::MIN as f64, + (64, false) => 0.0, + _ => unreachable!(), + }; + + let max = match (out_bits, out_signed) { + (32, true) => std::i32::MAX as f64, + (32, false) => std::u32::MAX as f64, + (64, true) => std::i64::MAX as f64, + (64, false) => std::u64::MAX as f64, + _ => unreachable!(), + }; + + let rtmp1 = ctx.tmp(RegClass::V128, in_ty); + let rtmp2 = ctx.tmp(RegClass::V128, in_ty); + + if in_bits == 32 { + ctx.emit(Inst::LoadFpuConst32 { + rd: rtmp1, + const_data: max as f32, + }); + } else { + ctx.emit(Inst::LoadFpuConst64 { + rd: rtmp1, + const_data: max, + }); + } + ctx.emit(Inst::FpuRRR { + fpu_op: choose_32_64(in_ty, FPUOp2::Min32, FPUOp2::Min64), + rd: rtmp2, + rn: rn, + rm: rtmp1.to_reg(), + }); + if in_bits == 32 { + ctx.emit(Inst::LoadFpuConst32 { + rd: rtmp1, + const_data: min as f32, + }); + } else { + ctx.emit(Inst::LoadFpuConst64 { + rd: rtmp1, + const_data: min, + }); + } + ctx.emit(Inst::FpuRRR { + fpu_op: choose_32_64(in_ty, FPUOp2::Max32, FPUOp2::Max64), + rd: rtmp2, + rn: rtmp2.to_reg(), + rm: rtmp1.to_reg(), + }); + if in_bits == 32 { + ctx.emit(Inst::FpuCmp32 { rn: rn, rm: rn }); + ctx.emit(Inst::FpuCSel32 { + rd: rtmp2, + rn: rtmp1.to_reg(), + rm: rtmp2.to_reg(), + cond: Cond::Ne, + }); + } else { + ctx.emit(Inst::FpuCmp64 { rn: rn, rm: rn }); + ctx.emit(Inst::FpuCSel64 { + rd: rtmp2, + rn: rtmp1.to_reg(), + rm: rtmp2.to_reg(), + cond: Cond::Ne, + }); + } + + let cvt = match (in_bits, out_bits, out_signed) { + (32, 32, false) => FpuToIntOp::F32ToU32, + (32, 32, true) => FpuToIntOp::F32ToI32, + (32, 64, false) => FpuToIntOp::F32ToU64, + (32, 64, true) => FpuToIntOp::F32ToI64, + (64, 32, false) => FpuToIntOp::F64ToU32, + (64, 32, true) => FpuToIntOp::F64ToI32, + (64, 64, false) => FpuToIntOp::F64ToU64, + (64, 64, true) => FpuToIntOp::F64ToI64, + _ => unreachable!(), + }; + ctx.emit(Inst::FpuToInt { + op: cvt, + rd, + rn: rtmp2.to_reg(), + }); + } + + Opcode::IaddImm + | Opcode::ImulImm + | Opcode::UdivImm + | Opcode::SdivImm + | Opcode::UremImm + | Opcode::SremImm + | Opcode::IrsubImm + | Opcode::IaddCin + | Opcode::IaddIfcin + | Opcode::IaddCout + | Opcode::IaddIfcout + | Opcode::IaddCarry + | Opcode::IaddIfcarry + | Opcode::IsubBin + | Opcode::IsubIfbin + | Opcode::IsubBout + | Opcode::IsubIfbout + | Opcode::IsubBorrow + | Opcode::IsubIfborrow + | Opcode::BandImm + | Opcode::BorImm + | Opcode::BxorImm + | Opcode::RotlImm + | Opcode::RotrImm + | Opcode::IshlImm + | Opcode::UshrImm + | Opcode::SshrImm + | Opcode::IcmpImm + | Opcode::IfcmpImm => { + panic!("ALU+imm and ALU+carry ops should not appear here!"); + } + + #[cfg(feature = "x86")] + Opcode::X86Udivmodx + | Opcode::X86Sdivmodx + | Opcode::X86Umulx + | Opcode::X86Smulx + | Opcode::X86Cvtt2si + | Opcode::X86Fmin + | Opcode::X86Fmax + | Opcode::X86Push + | Opcode::X86Pop + | Opcode::X86Bsr + | Opcode::X86Bsf + | Opcode::X86Pshufd + | Opcode::X86Pshufb + | Opcode::X86Pextr + | Opcode::X86Pinsr + | Opcode::X86Insertps + | Opcode::X86Movsd + | Opcode::X86Movlhps + | Opcode::X86Psll + | Opcode::X86Psrl + | Opcode::X86Psra + | Opcode::X86Ptest + | 
Opcode::X86Pmaxs + | Opcode::X86Pmaxu + | Opcode::X86Pmins + | Opcode::X86Pminu + | Opcode::X86ElfTlsGetAddr + | Opcode::X86MachoTlsGetAddr => { + panic!("x86-specific opcode in supposedly arch-neutral IR!"); + } + + Opcode::AvgRound => unimplemented!(), + Opcode::TlsValue => unimplemented!(), + } +} + +//============================================================================= +// Helpers for instruction lowering. +fn ty_bits(ty: Type) -> usize { + match ty { + B1 => 1, + B8 | I8 => 8, + B16 | I16 => 16, + B32 | I32 | F32 => 32, + B64 | I64 | F64 => 64, + B128 | I128 => 128, + IFLAGS | FFLAGS => 32, + _ => panic!("ty_bits() on unknown type: {:?}", ty), + } +} + +fn ty_is_int(ty: Type) -> bool { + match ty { + B1 | B8 | I8 | B16 | I16 | B32 | I32 | B64 | I64 => true, + F32 | F64 | B128 | I128 => false, + IFLAGS | FFLAGS => panic!("Unexpected flags type"), + _ => panic!("ty_is_int() on unknown type: {:?}", ty), + } +} + +fn ty_is_float(ty: Type) -> bool { + !ty_is_int(ty) +} + +fn choose_32_64(ty: Type, op32: T, op64: T) -> T { + let bits = ty_bits(ty); + if bits <= 32 { + op32 + } else if bits == 64 { + op64 + } else { + panic!("choose_32_64 on > 64 bits!") + } +} + +fn ldst_offset(data: &InstructionData) -> Option { + match data { + &InstructionData::Load { offset, .. } + | &InstructionData::StackLoad { offset, .. } + | &InstructionData::LoadComplex { offset, .. } + | &InstructionData::Store { offset, .. } + | &InstructionData::StackStore { offset, .. } + | &InstructionData::StoreComplex { offset, .. } => Some(offset.into()), + _ => None, + } +} + +fn inst_condcode(data: &InstructionData) -> Option { + match data { + &InstructionData::IntCond { cond, .. } + | &InstructionData::BranchIcmp { cond, .. } + | &InstructionData::IntCompare { cond, .. } + | &InstructionData::IntCondTrap { cond, .. } + | &InstructionData::BranchInt { cond, .. } + | &InstructionData::IntSelect { cond, .. } + | &InstructionData::IntCompareImm { cond, .. } => Some(cond), + _ => None, + } +} + +fn inst_fp_condcode(data: &InstructionData) -> Option { + match data { + &InstructionData::BranchFloat { cond, .. } + | &InstructionData::FloatCompare { cond, .. } + | &InstructionData::FloatCond { cond, .. } + | &InstructionData::FloatCondTrap { cond, .. } => Some(cond), + _ => None, + } +} + +fn inst_trapcode(data: &InstructionData) -> Option { + match data { + &InstructionData::Trap { code, .. } + | &InstructionData::CondTrap { code, .. } + | &InstructionData::IntCondTrap { code, .. } + | &InstructionData::FloatCondTrap { code, .. } => Some(code), + _ => None, + } +} + +/// Checks for an instance of `op` feeding the given input. Marks as merged (decrementing refcount) if so. +fn maybe_input_insn>( + c: &mut C, + input: InsnInput, + op: Opcode, +) -> Option { + if let InsnInputSource::Output(out) = input_source(c, input) { + let data = c.data(out.insn); + if data.opcode() == op { + c.merged(out.insn); + return Some(out.insn); + } + } + None +} + +/// Checks for an instance of `op` feeding the given input, possibly via a conversion `conv` (e.g., +/// Bint or a bitcast). Marks one or both as merged if so, as appropriate. +/// +/// FIXME cfallin 2020-03-30: this is really ugly. Factor out tree-matching stuff and make it +/// a bit more generic. 
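+///
+/// For example, the branch lowering below uses this to match a `brz`/`brnz`
+/// whose flag input is `bint(icmp ...)` or `bint(fcmp ...)`: `Bint` is the
+/// `conv` opcode and the compare is the `op` being searched for.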
+fn maybe_input_insn_via_conv>( + c: &mut C, + input: InsnInput, + op: Opcode, + conv: Opcode, +) -> Option { + if let Some(ret) = maybe_input_insn(c, input, op) { + return Some(ret); + } + + if let InsnInputSource::Output(out) = input_source(c, input) { + let data = c.data(out.insn); + if data.opcode() == conv { + let conv_insn = out.insn; + let conv_input = InsnInput { + insn: conv_insn, + input: 0, + }; + if let Some(inner) = maybe_input_insn(c, conv_input, op) { + c.merged(conv_insn); + return Some(inner); + } + } + } + None +} + +fn lower_icmp_or_ifcmp_to_flags>(ctx: &mut C, insn: IRInst, is_signed: bool) { + let ty = ctx.input_ty(insn, 0); + let bits = ty_bits(ty); + let narrow_mode = match (bits <= 32, is_signed) { + (true, true) => NarrowValueMode::SignExtend32, + (true, false) => NarrowValueMode::ZeroExtend32, + (false, true) => NarrowValueMode::SignExtend64, + (false, false) => NarrowValueMode::ZeroExtend64, + }; + let inputs = [ + InsnInput { + insn: insn, + input: 0, + }, + InsnInput { + insn: insn, + input: 1, + }, + ]; + let ty = ctx.input_ty(insn, 0); + let rn = input_to_reg(ctx, inputs[0], narrow_mode); + let rm = input_to_rse_imm12(ctx, inputs[1], narrow_mode); + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rd = writable_zero_reg(); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); +} + +fn lower_fcmp_or_ffcmp_to_flags>(ctx: &mut C, insn: IRInst) { + let ty = ctx.input_ty(insn, 0); + let bits = ty_bits(ty); + let inputs = [ + InsnInput { + insn: insn, + input: 0, + }, + InsnInput { + insn: insn, + input: 1, + }, + ]; + let rn = input_to_reg(ctx, inputs[0], NarrowValueMode::None); + let rm = input_to_reg(ctx, inputs[1], NarrowValueMode::None); + match bits { + 32 => { + ctx.emit(Inst::FpuCmp32 { rn, rm }); + } + 64 => { + ctx.emit(Inst::FpuCmp64 { rn, rm }); + } + _ => panic!("Unknown float size"), + } +} + +//============================================================================= +// Lowering-backend trait implementation. + +impl LowerBackend for AArch64Backend { + type MInst = Inst; + + fn lower>(&self, ctx: &mut C, ir_inst: IRInst) { + lower_insn_to_regs(ctx, ir_inst); + } + + fn lower_branch_group>( + &self, + ctx: &mut C, + branches: &[IRInst], + targets: &[BlockIndex], + fallthrough: Option, + ) { + // A block should end with at most two branches. The first may be a + // conditional branch; a conditional branch can be followed only by an + // unconditional branch or fallthrough. Otherwise, if only one branch, + // it may be an unconditional branch, a fallthrough, a return, or a + // trap. These conditions are verified by `is_ebb_basic()` during the + // verifier pass. + assert!(branches.len() <= 2); + + if branches.len() == 2 { + // Must be a conditional branch followed by an unconditional branch. + let op0 = ctx.data(branches[0]).opcode(); + let op1 = ctx.data(branches[1]).opcode(); + + //println!( + // "lowering two-branch group: opcodes are {:?} and {:?}", + // op0, op1 + //); + + assert!(op1 == Opcode::Jump || op1 == Opcode::Fallthrough); + let taken = BranchTarget::Block(targets[0]); + let not_taken = match op1 { + Opcode::Jump => BranchTarget::Block(targets[1]), + Opcode::Fallthrough => BranchTarget::Block(fallthrough.unwrap()), + _ => unreachable!(), // assert above. 
+ }; + match op0 { + Opcode::Brz | Opcode::Brnz => { + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + if let Some(icmp_insn) = + maybe_input_insn_via_conv(ctx, flag_input, Opcode::Icmp, Opcode::Bint) + { + let condcode = inst_condcode(ctx.data(icmp_insn)).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + let negated = op0 == Opcode::Brz; + let cond = if negated { cond.invert() } else { cond }; + + lower_icmp_or_ifcmp_to_flags(ctx, icmp_insn, is_signed); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } else if let Some(fcmp_insn) = + maybe_input_insn_via_conv(ctx, flag_input, Opcode::Fcmp, Opcode::Bint) + { + let condcode = inst_fp_condcode(ctx.data(fcmp_insn)).unwrap(); + let cond = lower_fp_condcode(condcode); + let negated = op0 == Opcode::Brz; + let cond = if negated { cond.invert() } else { cond }; + + lower_fcmp_or_ffcmp_to_flags(ctx, fcmp_insn); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } else { + let rt = input_to_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + NarrowValueMode::ZeroExtend64, + ); + let kind = match op0 { + Opcode::Brz => CondBrKind::Zero(rt), + Opcode::Brnz => CondBrKind::NotZero(rt), + _ => unreachable!(), + }; + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind, + }); + } + } + Opcode::BrIcmp => { + let condcode = inst_condcode(ctx.data(branches[0])).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + let ty = ctx.input_ty(branches[0], 0); + let bits = ty_bits(ty); + let narrow_mode = match (bits <= 32, is_signed) { + (true, true) => NarrowValueMode::SignExtend32, + (true, false) => NarrowValueMode::ZeroExtend32, + (false, true) => NarrowValueMode::SignExtend64, + (false, false) => NarrowValueMode::ZeroExtend64, + }; + let rn = input_to_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + narrow_mode, + ); + let rm = input_to_rse_imm12( + ctx, + InsnInput { + insn: branches[0], + input: 1, + }, + narrow_mode, + ); + + let alu_op = choose_32_64(ty, ALUOp::SubS32, ALUOp::SubS64); + let rd = writable_zero_reg(); + ctx.emit(alu_inst_imm12(alu_op, rd, rn, rm)); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } + + Opcode::Brif => { + let condcode = inst_condcode(ctx.data(branches[0])).unwrap(); + let cond = lower_condcode(condcode); + let is_signed = condcode_is_signed(condcode); + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + if let Some(ifcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ifcmp) { + lower_icmp_or_ifcmp_to_flags(ctx, ifcmp_insn, is_signed); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } else { + // If the ifcmp result is actually placed in a + // register, we need to move it back into the flags. 
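// Illustrative sketch (not part of the patch): the fallback just below handles the case
// where the `ifcmp` feeding this `brif` was not merged, so its flags value has been
// materialized in a general-purpose register. A hypothetical helper expressing the same
// two-instruction idiom (the helper name and all parameter names are illustrative only):
fn branch_on_materialized_flags<C: LowerCtx<I = Inst>>(
    ctx: &mut C,
    flags_reg: Reg,
    cond: Cond,
    taken: BranchTarget,
    not_taken: BranchTarget,
) {
    // Move the integer flags value back into NZCV, then branch on the condition.
    ctx.emit(Inst::MovToNZCV { rn: flags_reg });
    ctx.emit(Inst::CondBr {
        taken,
        not_taken,
        kind: CondBrKind::Cond(cond),
    });
}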
+ let rn = input_to_reg(ctx, flag_input, NarrowValueMode::None); + ctx.emit(Inst::MovToNZCV { rn }); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } + } + + Opcode::Brff => { + let condcode = inst_fp_condcode(ctx.data(branches[0])).unwrap(); + let cond = lower_fp_condcode(condcode); + let flag_input = InsnInput { + insn: branches[0], + input: 0, + }; + if let Some(ffcmp_insn) = maybe_input_insn(ctx, flag_input, Opcode::Ffcmp) { + lower_fcmp_or_ffcmp_to_flags(ctx, ffcmp_insn); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } else { + // If the ffcmp result is actually placed in a + // register, we need to move it back into the flags. + let rn = input_to_reg(ctx, flag_input, NarrowValueMode::None); + ctx.emit(Inst::MovToNZCV { rn }); + ctx.emit(Inst::CondBr { + taken, + not_taken, + kind: CondBrKind::Cond(cond), + }); + } + } + + _ => unimplemented!(), + } + } else { + // Must be an unconditional branch or an indirect branch. + let op = ctx.data(branches[0]).opcode(); + match op { + Opcode::Jump | Opcode::Fallthrough => { + assert!(branches.len() == 1); + // In the Fallthrough case, the machine-independent driver + // fills in `targets[0]` with our fallthrough block, so this + // is valid for both Jump and Fallthrough. + ctx.emit(Inst::Jump { + dest: BranchTarget::Block(targets[0]), + }); + } + Opcode::BrTable => { + // Expand `br_table index, default, JT` to: + // + // subs idx, #jt_size + // b.hs default + // adr vTmp1, PC+16 + // ldr vTmp2, [vTmp1, idx, lsl #2] + // add vTmp2, vTmp2, vTmp1 + // br vTmp2 + // [jumptable offsets relative to JT base] + let jt_size = targets.len() - 1; + assert!(jt_size <= std::u32::MAX as usize); + let ridx = input_to_reg( + ctx, + InsnInput { + insn: branches[0], + input: 0, + }, + NarrowValueMode::ZeroExtend32, + ); + + let rtmp1 = ctx.tmp(RegClass::I64, I32); + let rtmp2 = ctx.tmp(RegClass::I64, I32); + + // Bounds-check and branch to default. + if let Some(imm12) = Imm12::maybe_from_u64(jt_size as u64) { + ctx.emit(Inst::AluRRImm12 { + alu_op: ALUOp::SubS32, + rd: writable_zero_reg(), + rn: ridx, + imm12, + }); + } else { + lower_constant_u64(ctx, rtmp1, jt_size as u64); + ctx.emit(Inst::AluRRR { + alu_op: ALUOp::SubS32, + rd: writable_zero_reg(), + rn: ridx, + rm: rtmp1.to_reg(), + }); + } + let default_target = BranchTarget::Block(targets[0]); + ctx.emit(Inst::CondBrLowered { + kind: CondBrKind::Cond(Cond::Hs), // unsigned >= + target: default_target.clone(), + }); + + // Emit the compound instruction that does: + // + // adr rA, jt + // ldrsw rB, [rA, rIndex, UXTW 2] + // add rA, rA, rB + // br rA + // [jt entries] + // + // This must be *one* instruction in the vcode because + // we cannot allow regalloc to insert any spills/fills + // in the middle of the sequence; otherwise, the ADR's + // PC-rel offset to the jumptable would be incorrect. + // (The alternative is to introduce a relocation pass + // for inlined jumptables, which is much worse, IMHO.) 
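// Illustrative sketch (not part of the patch): the run-time effect of the compound
// jump-table sequence described in the comment above, written out in Rust. `jt_base`
// stands for the address produced by the ADR (the start of the inline table of 32-bit
// offsets) and `idx` for the bounds-checked index; both names are assumptions for the
// sake of the example.
fn jump_table_destination(jt_base: *const i32, idx: usize) -> usize {
    // ldrsw: load the sign-extended 32-bit entry for this index.
    let offset = unsafe { *jt_base.add(idx) } as isize;
    // add + br: entries are offsets relative to the table base, so the branch
    // target is base + entry.
    (jt_base as isize).wrapping_add(offset) as usize
}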
+ + let jt_targets: Vec = targets + .iter() + .skip(1) + .map(|bix| BranchTarget::Block(*bix)) + .collect(); + let targets_for_term: Vec = targets.to_vec(); + ctx.emit(Inst::JTSequence { + ridx, + rtmp1, + rtmp2, + targets: jt_targets, + targets_for_term, + }); + } + + _ => panic!("Unknown branch type!"), + } + } + } +} diff --git a/cranelift/codegen/src/isa/aarch64/mod.rs b/cranelift/codegen/src/isa/aarch64/mod.rs new file mode 100644 index 000000000000..2a71085929e7 --- /dev/null +++ b/cranelift/codegen/src/isa/aarch64/mod.rs @@ -0,0 +1,220 @@ +//! ARM 64-bit Instruction Set Architecture. + +use crate::ir::Function; +use crate::isa::Builder as IsaBuilder; +use crate::machinst::{ + compile, MachBackend, MachCompileResult, ShowWithRRU, TargetIsaAdapter, VCode, +}; +use crate::result::CodegenResult; +use crate::settings; + +use alloc::boxed::Box; + +use regalloc::RealRegUniverse; +use target_lexicon::{Aarch64Architecture, Architecture, Triple}; + +// New backend: +mod abi; +mod inst; +mod lower; + +use inst::create_reg_universe; + +/// An AArch64 backend. +pub struct AArch64Backend { + triple: Triple, + flags: settings::Flags, +} + +impl AArch64Backend { + /// Create a new AArch64 backend with the given (shared) flags. + pub fn new_with_flags(triple: Triple, flags: settings::Flags) -> AArch64Backend { + AArch64Backend { triple, flags } + } + + fn compile_vcode(&self, func: &Function, flags: &settings::Flags) -> VCode { + // This performs lowering to VCode, register-allocates the code, computes + // block layout and finalizes branches. The result is ready for binary emission. + let abi = Box::new(abi::AArch64ABIBody::new(func)); + compile::compile::(func, self, abi, flags) + } +} + +impl MachBackend for AArch64Backend { + fn compile_function( + &self, + func: &Function, + want_disasm: bool, + ) -> CodegenResult { + let flags = self.flags(); + let vcode = self.compile_vcode(func, flags); + let sections = vcode.emit(); + let frame_size = vcode.frame_size(); + + let disasm = if want_disasm { + Some(vcode.show_rru(Some(&create_reg_universe()))) + } else { + None + }; + + Ok(MachCompileResult { + sections, + frame_size, + disasm, + }) + } + + fn name(&self) -> &'static str { + "aarch64" + } + + fn triple(&self) -> Triple { + self.triple.clone() + } + + fn flags(&self) -> &settings::Flags { + &self.flags + } + + fn reg_universe(&self) -> RealRegUniverse { + create_reg_universe() + } +} + +/// Create a new `isa::Builder`. 
+pub fn isa_builder(triple: Triple) -> IsaBuilder { + assert!(triple.architecture == Architecture::Aarch64(Aarch64Architecture::Aarch64)); + IsaBuilder { + triple, + setup: settings::builder(), + constructor: |triple, shared_flags, _| { + let backend = AArch64Backend::new_with_flags(triple, shared_flags); + Box::new(TargetIsaAdapter::new(backend)) + }, + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::cursor::{Cursor, FuncCursor}; + use crate::ir::types::*; + use crate::ir::{AbiParam, ExternalName, Function, InstBuilder, Signature}; + use crate::isa::CallConv; + use crate::settings; + use crate::settings::Configurable; + use core::str::FromStr; + use target_lexicon::Triple; + + #[test] + fn test_compile_function() { + let name = ExternalName::testcase("test0"); + let mut sig = Signature::new(CallConv::SystemV); + sig.params.push(AbiParam::new(I32)); + sig.returns.push(AbiParam::new(I32)); + let mut func = Function::with_name_signature(name, sig); + + let bb0 = func.dfg.make_block(); + let arg0 = func.dfg.append_block_param(bb0, I32); + + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(bb0); + let v0 = pos.ins().iconst(I32, 0x1234); + let v1 = pos.ins().iadd(arg0, v0); + pos.ins().return_(&[v1]); + + let mut shared_flags = settings::builder(); + shared_flags.set("opt_level", "none").unwrap(); + let backend = AArch64Backend::new_with_flags( + Triple::from_str("aarch64").unwrap(), + settings::Flags::new(shared_flags), + ); + let sections = backend.compile_function(&mut func, false).unwrap().sections; + let code = §ions.sections[0].data; + + // stp x29, x30, [sp, #-16]! + // mov x29, sp + // mov x1, #0x1234 + // add w0, w0, w1 + // mov sp, x29 + // ldp x29, x30, [sp], #16 + // ret + let golden = vec![ + 0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0x81, 0x46, 0x82, 0xd2, 0x00, 0x00, + 0x01, 0x0b, 0xbf, 0x03, 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, + ]; + + assert_eq!(code, &golden); + } + + #[test] + fn test_branch_lowering() { + let name = ExternalName::testcase("test0"); + let mut sig = Signature::new(CallConv::SystemV); + sig.params.push(AbiParam::new(I32)); + sig.returns.push(AbiParam::new(I32)); + let mut func = Function::with_name_signature(name, sig); + + let bb0 = func.dfg.make_block(); + let arg0 = func.dfg.append_block_param(bb0, I32); + let bb1 = func.dfg.make_block(); + let bb2 = func.dfg.make_block(); + let bb3 = func.dfg.make_block(); + + let mut pos = FuncCursor::new(&mut func); + pos.insert_block(bb0); + let v0 = pos.ins().iconst(I32, 0x1234); + let v1 = pos.ins().iadd(arg0, v0); + pos.ins().brnz(v1, bb1, &[]); + pos.ins().jump(bb2, &[]); + pos.insert_block(bb1); + pos.ins().brnz(v1, bb2, &[]); + pos.ins().jump(bb3, &[]); + pos.insert_block(bb2); + let v2 = pos.ins().iadd(v1, v0); + pos.ins().brnz(v2, bb2, &[]); + pos.ins().jump(bb1, &[]); + pos.insert_block(bb3); + let v3 = pos.ins().isub(v1, v0); + pos.ins().return_(&[v3]); + + let mut shared_flags = settings::builder(); + shared_flags.set("opt_level", "none").unwrap(); + let backend = AArch64Backend::new_with_flags( + Triple::from_str("aarch64").unwrap(), + settings::Flags::new(shared_flags), + ); + let result = backend + .compile_function(&mut func, /* want_disasm = */ false) + .unwrap(); + let code = &result.sections.sections[0].data; + + // stp x29, x30, [sp, #-16]! + // mov x29, sp + // mov x1, x0 + // mov x0, #0x1234 + // add w1, w1, w0 + // mov w2, w1 + // cbz x2, ... + // mov w2, w1 + // cbz x2, ... 
+ // sub w0, w1, w0 + // mov sp, x29 + // ldp x29, x30, [sp], #16 + // ret + // add w2, w1, w0 + // mov w2, w2 + // cbnz x2, ... <---- compound branch (cond / uncond) + // b ... <---- + + let golden = vec![ + 0xfd, 0x7b, 0xbf, 0xa9, 0xfd, 0x03, 0x00, 0x91, 0xe1, 0x03, 0x00, 0xaa, 0x80, 0x46, + 0x82, 0xd2, 0x21, 0x00, 0x00, 0x0b, 0xe2, 0x03, 0x01, 0x2a, 0xe2, 0x00, 0x00, 0xb4, + 0xe2, 0x03, 0x01, 0x2a, 0xa2, 0x00, 0x00, 0xb5, 0x20, 0x00, 0x00, 0x4b, 0xbf, 0x03, + 0x00, 0x91, 0xfd, 0x7b, 0xc1, 0xa8, 0xc0, 0x03, 0x5f, 0xd6, 0x22, 0x00, 0x00, 0x0b, + 0xe2, 0x03, 0x02, 0x2a, 0xc2, 0xff, 0xff, 0xb5, 0xf7, 0xff, 0xff, 0x17, + ]; + + assert_eq!(code, &golden); + } +} diff --git a/cranelift/codegen/src/isa/arm64/abi.rs b/cranelift/codegen/src/isa/arm64/abi.rs deleted file mode 100644 index 8d486d4193f6..000000000000 --- a/cranelift/codegen/src/isa/arm64/abi.rs +++ /dev/null @@ -1,31 +0,0 @@ -//! ARM 64 ABI implementation. - -use super::registers::{FPR, GPR}; -use crate::ir; -use crate::isa::RegClass; -use crate::regalloc::RegisterSet; -use crate::settings as shared_settings; -use alloc::borrow::Cow; - -/// Legalize `sig`. -pub fn legalize_signature( - _sig: &mut Cow, - _flags: &shared_settings::Flags, - _current: bool, -) { - unimplemented!() -} - -/// Get register class for a type appearing in a legalized signature. -pub fn regclass_for_abi_type(ty: ir::Type) -> RegClass { - if ty.is_int() { - GPR - } else { - FPR - } -} - -/// Get the set of allocatable registers for `func`. -pub fn allocatable_registers(_func: &ir::Function) -> RegisterSet { - unimplemented!() -} diff --git a/cranelift/codegen/src/isa/arm64/binemit.rs b/cranelift/codegen/src/isa/arm64/binemit.rs deleted file mode 100644 index 4401b6d6f59d..000000000000 --- a/cranelift/codegen/src/isa/arm64/binemit.rs +++ /dev/null @@ -1,8 +0,0 @@ -//! Emitting binary ARM64 machine code. - -use crate::binemit::{bad_encoding, CodeSink}; -use crate::ir::{Function, Inst}; -use crate::isa::TargetIsa; -use crate::regalloc::RegDiversions; - -include!(concat!(env!("OUT_DIR"), "/binemit-arm64.rs")); diff --git a/cranelift/codegen/src/isa/arm64/enc_tables.rs b/cranelift/codegen/src/isa/arm64/enc_tables.rs deleted file mode 100644 index 6040a9b866ea..000000000000 --- a/cranelift/codegen/src/isa/arm64/enc_tables.rs +++ /dev/null @@ -1,10 +0,0 @@ -//! Encoding tables for ARM64 ISA. - -use crate::ir; -use crate::isa; -use crate::isa::constraints::*; -use crate::isa::enc_tables::*; -use crate::isa::encoding::RecipeSizing; - -include!(concat!(env!("OUT_DIR"), "/encoding-arm64.rs")); -include!(concat!(env!("OUT_DIR"), "/legalize-arm64.rs")); diff --git a/cranelift/codegen/src/isa/arm64/mod.rs b/cranelift/codegen/src/isa/arm64/mod.rs deleted file mode 100644 index f00062b2afc3..000000000000 --- a/cranelift/codegen/src/isa/arm64/mod.rs +++ /dev/null @@ -1,132 +0,0 @@ -//! ARM 64-bit Instruction Set Architecture. 
- -mod abi; -mod binemit; -mod enc_tables; -mod registers; -pub mod settings; - -use super::super::settings as shared_settings; -#[cfg(feature = "testing_hooks")] -use crate::binemit::CodeSink; -use crate::binemit::{emit_function, MemoryCodeSink}; -use crate::ir; -use crate::isa::enc_tables::{lookup_enclist, Encodings}; -use crate::isa::Builder as IsaBuilder; -use crate::isa::{EncInfo, RegClass, RegInfo, TargetIsa}; -use crate::regalloc; -use alloc::borrow::Cow; -use alloc::boxed::Box; -use core::fmt; -use target_lexicon::Triple; - -#[allow(dead_code)] -struct Isa { - triple: Triple, - shared_flags: shared_settings::Flags, - isa_flags: settings::Flags, -} - -/// Get an ISA builder for creating ARM64 targets. -pub fn isa_builder(triple: Triple) -> IsaBuilder { - IsaBuilder { - triple, - setup: settings::builder(), - constructor: isa_constructor, - } -} - -fn isa_constructor( - triple: Triple, - shared_flags: shared_settings::Flags, - builder: shared_settings::Builder, -) -> Box { - Box::new(Isa { - triple, - isa_flags: settings::Flags::new(&shared_flags, builder), - shared_flags, - }) -} - -impl TargetIsa for Isa { - fn name(&self) -> &'static str { - "arm64" - } - - fn triple(&self) -> &Triple { - &self.triple - } - - fn flags(&self) -> &shared_settings::Flags { - &self.shared_flags - } - - fn register_info(&self) -> RegInfo { - registers::INFO.clone() - } - - fn encoding_info(&self) -> EncInfo { - enc_tables::INFO.clone() - } - - fn legal_encodings<'a>( - &'a self, - func: &'a ir::Function, - inst: &'a ir::InstructionData, - ctrl_typevar: ir::Type, - ) -> Encodings<'a> { - lookup_enclist( - ctrl_typevar, - inst, - func, - &enc_tables::LEVEL1_A64[..], - &enc_tables::LEVEL2[..], - &enc_tables::ENCLISTS[..], - &enc_tables::LEGALIZE_ACTIONS[..], - &enc_tables::RECIPE_PREDICATES[..], - &enc_tables::INST_PREDICATES[..], - self.isa_flags.predicate_view(), - ) - } - - fn legalize_signature(&self, sig: &mut Cow, current: bool) { - abi::legalize_signature(sig, &self.shared_flags, current) - } - - fn regclass_for_abi_type(&self, ty: ir::Type) -> RegClass { - abi::regclass_for_abi_type(ty) - } - - fn allocatable_registers(&self, func: &ir::Function) -> regalloc::RegisterSet { - abi::allocatable_registers(func) - } - - #[cfg(feature = "testing_hooks")] - fn emit_inst( - &self, - func: &ir::Function, - inst: ir::Inst, - divert: &mut regalloc::RegDiversions, - sink: &mut dyn CodeSink, - ) { - binemit::emit_inst(func, inst, divert, sink, self) - } - - fn emit_function_to_memory(&self, func: &ir::Function, sink: &mut MemoryCodeSink) { - emit_function(func, binemit::emit_inst, sink, self) - } - - fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC { - ir::condcodes::IntCC::UnsignedLessThan - } - - fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC { - ir::condcodes::IntCC::UnsignedGreaterThanOrEqual - } -} - -impl fmt::Display for Isa { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}\n{}", self.shared_flags, self.isa_flags) - } -} diff --git a/cranelift/codegen/src/isa/arm64/registers.rs b/cranelift/codegen/src/isa/arm64/registers.rs deleted file mode 100644 index c02f6b7d4d11..000000000000 --- a/cranelift/codegen/src/isa/arm64/registers.rs +++ /dev/null @@ -1,39 +0,0 @@ -//! ARM64 register descriptions. 
- -use crate::isa::registers::{RegBank, RegClass, RegClassData, RegInfo, RegUnit}; - -include!(concat!(env!("OUT_DIR"), "/registers-arm64.rs")); - -#[cfg(test)] -mod tests { - use super::INFO; - use crate::isa::RegUnit; - use alloc::string::{String, ToString}; - - #[test] - fn unit_encodings() { - assert_eq!(INFO.parse_regunit("x0"), Some(0)); - assert_eq!(INFO.parse_regunit("x31"), Some(31)); - assert_eq!(INFO.parse_regunit("v0"), Some(32)); - assert_eq!(INFO.parse_regunit("v31"), Some(63)); - - assert_eq!(INFO.parse_regunit("x32"), None); - assert_eq!(INFO.parse_regunit("v32"), None); - } - - #[test] - fn unit_names() { - fn uname(ru: RegUnit) -> String { - INFO.display_regunit(ru).to_string() - } - - assert_eq!(uname(0), "%x0"); - assert_eq!(uname(1), "%x1"); - assert_eq!(uname(31), "%x31"); - assert_eq!(uname(32), "%v0"); - assert_eq!(uname(33), "%v1"); - assert_eq!(uname(63), "%v31"); - assert_eq!(uname(64), "%nzcv"); - assert_eq!(uname(65), "%INVALID65"); - } -} diff --git a/cranelift/codegen/src/isa/arm64/settings.rs b/cranelift/codegen/src/isa/arm64/settings.rs deleted file mode 100644 index 56d0f4ee0b44..000000000000 --- a/cranelift/codegen/src/isa/arm64/settings.rs +++ /dev/null @@ -1,9 +0,0 @@ -//! ARM64 Settings. - -use crate::settings::{self, detail, Builder}; -use core::fmt; - -// Include code generated by `cranelift-codegen/meta/src/gen_settings.rs`. This file contains a -// public `Flags` struct with an impl for all of the settings defined in -// `cranelift-codegen/meta/src/isa/arm64/mod.rs`. -include!(concat!(env!("OUT_DIR"), "/settings-arm64.rs")); diff --git a/cranelift/codegen/src/isa/mod.rs b/cranelift/codegen/src/isa/mod.rs index af263c2b5bf0..71c540dd91c5 100644 --- a/cranelift/codegen/src/isa/mod.rs +++ b/cranelift/codegen/src/isa/mod.rs @@ -48,6 +48,7 @@ pub use crate::isa::call_conv::CallConv; pub use crate::isa::constraints::{ BranchRange, ConstraintKind, OperandConstraint, RecipeConstraints, }; +pub use crate::isa::enc_tables::Encodings; pub use crate::isa::encoding::{base_size, EncInfo, Encoding}; pub use crate::isa::registers::{regs_overlap, RegClass, RegClassIndex, RegInfo, RegUnit}; pub use crate::isa::stack::{StackBase, StackBaseMask, StackRef}; @@ -55,9 +56,9 @@ pub use crate::isa::stack::{StackBase, StackBaseMask, StackRef}; use crate::binemit; use crate::flowgraph; use crate::ir; -use crate::isa::enc_tables::Encodings; -#[cfg(feature = "unwind")] use crate::isa::fde::RegisterMappingError; +#[cfg(feature = "unwind")] +use crate::machinst::MachBackend; use crate::regalloc; use crate::result::CodegenResult; use crate::settings; @@ -83,7 +84,7 @@ pub mod fde; mod arm32; #[cfg(feature = "arm64")] -mod arm64; +mod aarch64; mod call_conv; mod constraints; @@ -92,6 +93,9 @@ mod encoding; pub mod registers; mod stack; +#[cfg(test)] +mod test_utils; + /// Returns a builder that can create a corresponding `TargetIsa` /// or `Err(LookupError::SupportDisabled)` if not enabled. macro_rules! isa_builder { @@ -116,7 +120,7 @@ pub fn lookup(triple: Triple) -> Result { isa_builder!(x86, "x86", triple) } Architecture::Arm { .. } => isa_builder!(arm32, "arm32", triple), - Architecture::Aarch64 { .. } => isa_builder!(arm64, "arm64", triple), + Architecture::Aarch64 { .. } => isa_builder!(aarch64, "arm64", triple), _ => Err(LookupError::Unsupported), } } @@ -402,6 +406,11 @@ pub trait TargetIsa: fmt::Display + Send + Sync { // No-op by default Ok(()) } + + /// Get the new-style MachBackend, if this is an adapter around one. 
+ fn get_mach_backend(&self) -> Option<&dyn MachBackend> { + None + } } impl Debug for &dyn TargetIsa { diff --git a/cranelift/codegen/src/isa/test_utils.rs b/cranelift/codegen/src/isa/test_utils.rs new file mode 100644 index 000000000000..c7802b052a21 --- /dev/null +++ b/cranelift/codegen/src/isa/test_utils.rs @@ -0,0 +1,88 @@ +// This is unused when no platforms with the new backend are enabled. +#![allow(dead_code)] + +use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; +use crate::ir::Value; +use crate::ir::{ConstantOffset, ExternalName, Function, JumpTable, Opcode, SourceLoc, TrapCode}; +use crate::isa::TargetIsa; + +use alloc::vec::Vec; +use std::string::String; + +pub struct TestCodeSink { + bytes: Vec, +} + +impl TestCodeSink { + /// Create a new TestCodeSink. + pub fn new() -> TestCodeSink { + TestCodeSink { bytes: vec![] } + } + + /// Return the code emitted to this sink as a hex string. + pub fn stringify(&self) -> String { + // This is pretty lame, but whatever .. + use std::fmt::Write; + let mut s = String::with_capacity(self.bytes.len() * 2); + for b in &self.bytes { + write!(&mut s, "{:02X}", b).unwrap(); + } + s + } +} + +impl CodeSink for TestCodeSink { + fn offset(&self) -> CodeOffset { + self.bytes.len() as CodeOffset + } + + fn put1(&mut self, x: u8) { + self.bytes.push(x); + } + + fn put2(&mut self, x: u16) { + self.bytes.push((x >> 0) as u8); + self.bytes.push((x >> 8) as u8); + } + + fn put4(&mut self, mut x: u32) { + for _ in 0..4 { + self.bytes.push(x as u8); + x >>= 8; + } + } + + fn put8(&mut self, mut x: u64) { + for _ in 0..8 { + self.bytes.push(x as u8); + x >>= 8; + } + } + + fn reloc_block(&mut self, _rel: Reloc, _block_offset: CodeOffset) {} + + fn reloc_external( + &mut self, + _srcloc: SourceLoc, + _rel: Reloc, + _name: &ExternalName, + _addend: Addend, + ) { + } + + fn reloc_constant(&mut self, _rel: Reloc, _constant_offset: ConstantOffset) {} + + fn reloc_jt(&mut self, _rel: Reloc, _jt: JumpTable) {} + + fn trap(&mut self, _code: TrapCode, _srcloc: SourceLoc) {} + + fn begin_jumptables(&mut self) {} + + fn begin_rodata(&mut self) {} + + fn end_codegen(&mut self) {} + + fn add_stackmap(&mut self, _val_list: &[Value], _func: &Function, _isa: &dyn TargetIsa) {} + + fn add_call_site(&mut self, _opcode: Opcode, _srcloc: SourceLoc) {} +} diff --git a/cranelift/codegen/src/legalizer/mod.rs b/cranelift/codegen/src/legalizer/mod.rs index 781767336a5c..e28cc47d6ab7 100644 --- a/cranelift/codegen/src/legalizer/mod.rs +++ b/cranelift/codegen/src/legalizer/mod.rs @@ -196,6 +196,55 @@ pub fn legalize_function(func: &mut ir::Function, cfg: &mut ControlFlowGraph, is } } +/// Perform a simple legalization by expansion of the function, without +/// platform-specific transforms. 
+pub fn simple_legalize(func: &mut ir::Function, cfg: &mut ControlFlowGraph, isa: &dyn TargetIsa) { + let mut pos = FuncCursor::new(func); + let func_begin = pos.position(); + pos.set_position(func_begin); + while let Some(_block) = pos.next_block() { + let mut prev_pos = pos.position(); + while let Some(inst) = pos.next_inst() { + let expanded = match pos.func.dfg[inst].opcode() { + ir::Opcode::BrIcmp + | ir::Opcode::GlobalValue + | ir::Opcode::HeapAddr + | ir::Opcode::StackLoad + | ir::Opcode::StackStore + | ir::Opcode::TableAddr + | ir::Opcode::Trapnz + | ir::Opcode::Trapz + | ir::Opcode::BandImm + | ir::Opcode::BorImm + | ir::Opcode::BxorImm + | ir::Opcode::IaddImm + | ir::Opcode::IfcmpImm + | ir::Opcode::ImulImm + | ir::Opcode::IrsubImm + | ir::Opcode::IshlImm + | ir::Opcode::RotlImm + | ir::Opcode::RotrImm + | ir::Opcode::SdivImm + | ir::Opcode::SremImm + | ir::Opcode::SshrImm + | ir::Opcode::UdivImm + | ir::Opcode::UremImm + | ir::Opcode::UshrImm + | ir::Opcode::IcmpImm => expand(inst, &mut pos.func, cfg, isa), + _ => false, + }; + + if expanded { + // Legalization implementations require fixpoint loop + // here. TODO: fix this. + pos.set_position(prev_pos); + } else { + prev_pos = pos.position(); + } + } + } +} + // Include legalization patterns that were generated by `gen_legalizer.rs` from the // `TransformGroup` in `cranelift-codegen/meta/shared/legalize.rs`. // diff --git a/cranelift/codegen/src/lib.rs b/cranelift/codegen/src/lib.rs index 772562b916cc..d87bbf26b86b 100644 --- a/cranelift/codegen/src/lib.rs +++ b/cranelift/codegen/src/lib.rs @@ -71,6 +71,7 @@ pub mod flowgraph; pub mod ir; pub mod isa; pub mod loop_analysis; +pub mod machinst; pub mod print_errors; pub mod settings; pub mod timing; @@ -86,10 +87,12 @@ mod context; mod dce; mod divconst_magic_numbers; mod fx; +mod inst_predicates; mod iterators; mod legalizer; mod licm; mod nan_canonicalization; +mod num_uses; mod partition_slice; mod postopt; mod predicates; diff --git a/cranelift/codegen/src/machinst/abi.rs b/cranelift/codegen/src/machinst/abi.rs new file mode 100644 index 000000000000..11a96c58b2be --- /dev/null +++ b/cranelift/codegen/src/machinst/abi.rs @@ -0,0 +1,149 @@ +//! ABI definitions. + +use crate::ir::StackSlot; +use crate::machinst::*; +use crate::settings; + +use regalloc::{Reg, Set, SpillSlot, Writable}; + +/// Trait implemented by an object that tracks ABI-related state (e.g., stack +/// layout) and can generate code while emitting the *body* of a function. +pub trait ABIBody { + /// The instruction type for the ISA associated with this ABI. + type I: VCodeInst; + + /// Get the liveins of the function. + fn liveins(&self) -> Set; + + /// Get the liveouts of the function. + fn liveouts(&self) -> Set; + + /// Number of arguments. + fn num_args(&self) -> usize; + + /// Number of return values. + fn num_retvals(&self) -> usize; + + /// Number of stack slots (not spill slots). + fn num_stackslots(&self) -> usize; + + /// Generate an instruction which copies an argument to a destination + /// register. + fn gen_copy_arg_to_reg(&self, idx: usize, into_reg: Writable) -> Self::I; + + /// Generate an instruction which copies a source register to a return + /// value slot. + fn gen_copy_reg_to_retval(&self, idx: usize, from_reg: Reg) -> Self::I; + + /// Generate a return instruction. + fn gen_ret(&self) -> Self::I; + + /// Generate an epilogue placeholder. 
The returned instruction should return `true` from + /// `is_epilogue_placeholder()`; this is used to indicate to the lowering driver when + /// the epilogue should be inserted. + fn gen_epilogue_placeholder(&self) -> Self::I; + + // ----------------------------------------------------------------- + // Every function above this line may only be called pre-regalloc. + // Every function below this line may only be called post-regalloc. + // `spillslots()` must be called before any other post-regalloc + // function. + // ---------------------------------------------------------------- + + /// Update with the number of spillslots, post-regalloc. + fn set_num_spillslots(&mut self, slots: usize); + + /// Update with the clobbered registers, post-regalloc. + fn set_clobbered(&mut self, clobbered: Set>); + + /// Load from a stackslot. + fn load_stackslot( + &self, + slot: StackSlot, + offset: u32, + ty: Type, + into_reg: Writable, + ) -> Self::I; + + /// Store to a stackslot. + fn store_stackslot(&self, slot: StackSlot, offset: u32, ty: Type, from_reg: Reg) -> Self::I; + + /// Load from a spillslot. + fn load_spillslot(&self, slot: SpillSlot, ty: Type, into_reg: Writable) -> Self::I; + + /// Store to a spillslot. + fn store_spillslot(&self, slot: SpillSlot, ty: Type, from_reg: Reg) -> Self::I; + + /// Generate a prologue, post-regalloc. This should include any stack + /// frame or other setup necessary to use the other methods (`load_arg`, + /// `store_retval`, and spillslot accesses.) `self` is mutable so that we + /// can store information in it which will be useful when creating the + /// epilogue. + fn gen_prologue(&mut self, flags: &settings::Flags) -> Vec; + + /// Generate an epilogue, post-regalloc. Note that this must generate the + /// actual return instruction (rather than emitting this in the lowering + /// logic), because the epilogue code comes before the return and the two are + /// likely closely related. + fn gen_epilogue(&self, flags: &settings::Flags) -> Vec; + + /// Returns the full frame size for the given function, after prologue emission has run. This + /// comprises the spill space, incoming argument space, alignment padding, etc. + fn frame_size(&self) -> u32; + + /// Get the spill-slot size. + fn get_spillslot_size(&self, rc: RegClass, ty: Type) -> u32; + + /// Generate a spill. + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, ty: Type) -> Self::I; + + /// Generate a reload (fill). + fn gen_reload(&self, to_reg: Writable, from_slot: SpillSlot, ty: Type) -> Self::I; +} + +/// Trait implemented by an object that tracks ABI-related state and can +/// generate code while emitting a *call* to a function. +/// +/// An instance of this trait returns information for a *particular* +/// callsite. It will usually be computed from the called function's +/// signature. +/// +/// Unlike `ABIBody` above, methods on this trait are not invoked directly +/// by the machine-independent code. Rather, the machine-specific lowering +/// code will typically create an `ABICall` when creating machine instructions +/// for an IR call instruction inside `lower()`, directly emit the arg and +/// and retval copies, and attach the register use/def info to the call. +/// +/// This trait is thus provided for convenience to the backends. +pub trait ABICall { + /// The instruction type for the ISA associated with this ABI. + type I: VCodeInst; + + /// Get the number of arguments expected. + fn num_args(&self) -> usize; + + /// Save the clobbered registers. 
+ /// Copy an argument value from a source register, prior to the call. + fn gen_copy_reg_to_arg(&self, idx: usize, from_reg: Reg) -> Self::I; + + /// Copy a return value into a destination register, after the call returns. + fn gen_copy_retval_to_reg(&self, idx: usize, into_reg: Writable) -> Self::I; + + /// Pre-adjust the stack, prior to argument copies and call. + fn gen_stack_pre_adjust(&self) -> Vec; + + /// Post-adjust the stack, after call return and return-value copies. + fn gen_stack_post_adjust(&self) -> Vec; + + /// Generate the call itself. + /// + /// The returned instruction should have proper use- and def-sets according + /// to the argument registers, return-value registers, and clobbered + /// registers for this function signature in this ABI. + /// + /// (Arg registers are uses, and retval registers are defs. Clobbered + /// registers are also logically defs, but should never be read; their + /// values are "defined" (to the regalloc) but "undefined" in every other + /// sense.) + fn gen_call(&self) -> Vec; +} diff --git a/cranelift/codegen/src/machinst/adapter.rs b/cranelift/codegen/src/machinst/adapter.rs new file mode 100644 index 000000000000..c9cf41f359b2 --- /dev/null +++ b/cranelift/codegen/src/machinst/adapter.rs @@ -0,0 +1,130 @@ +//! Adapter for a `MachBackend` to implement the `TargetIsa` trait. + +use crate::binemit; +use crate::ir; +use crate::isa::{EncInfo, Encoding, Encodings, Legalize, RegClass, RegInfo, TargetIsa}; +use crate::machinst::*; +use crate::regalloc::RegisterSet; +use crate::settings::Flags; + +#[cfg(feature = "testing_hooks")] +use crate::regalloc::RegDiversions; + +use std::borrow::Cow; +use std::fmt; +use target_lexicon::Triple; + +/// A wrapper around a `MachBackend` that provides a `TargetIsa` impl. +pub struct TargetIsaAdapter { + backend: Box, + triple: Triple, +} + +impl TargetIsaAdapter { + /// Create a new `TargetIsa` wrapper around a `MachBackend`. + pub fn new(backend: B) -> TargetIsaAdapter { + let triple = backend.triple(); + TargetIsaAdapter { + backend: Box::new(backend), + triple, + } + } +} + +impl fmt::Display for TargetIsaAdapter { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("MachBackend") + .field("name", &self.backend.name()) + .field("triple", &self.backend.triple()) + .field("flags", &format!("{}", self.backend.flags())) + .finish() + } +} + +impl TargetIsa for TargetIsaAdapter { + fn name(&self) -> &'static str { + self.backend.name() + } + + fn triple(&self) -> &Triple { + &self.triple + } + + fn flags(&self) -> &Flags { + self.backend.flags() + } + + fn register_info(&self) -> RegInfo { + // Called from function's Display impl, so we need a stub here. 
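// Illustrative sketch (not part of the patch): the call-site emission order implied by
// the `ABICall` documentation in `machinst/abi.rs` above. The helper name, its
// parameters, and the assumption that argument/return registers have already been
// chosen by the lowering code are all assumptions for this example; the trait methods
// are the ones defined above.
fn emit_call_site<C, A>(ctx: &mut C, abi: &A, args: &[Reg], rets: &[Writable<Reg>])
where
    A: ABICall,
    C: LowerCtx<I = A::I>,
{
    // Pre-adjust the stack (e.g., reserve outgoing argument space).
    for inst in abi.gen_stack_pre_adjust() {
        ctx.emit(inst);
    }
    // Copy each argument value into its ABI location.
    for (i, arg) in args.iter().enumerate() {
        ctx.emit(abi.gen_copy_reg_to_arg(i, *arg));
    }
    // The call itself carries the use/def sets for args, retvals, and clobbers.
    for inst in abi.gen_call() {
        ctx.emit(inst);
    }
    // Copy return values out, then undo the stack adjustment.
    for (i, ret) in rets.iter().enumerate() {
        ctx.emit(abi.gen_copy_retval_to_reg(i, *ret));
    }
    for inst in abi.gen_stack_post_adjust() {
        ctx.emit(inst);
    }
}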
+ RegInfo { + banks: &[], + classes: &[], + } + } + + fn legal_encodings<'a>( + &'a self, + _func: &'a ir::Function, + _inst: &'a ir::InstructionData, + _ctrl_typevar: ir::Type, + ) -> Encodings<'a> { + panic!("Should not be called when new-style backend is available!") + } + + fn encode( + &self, + _func: &ir::Function, + _inst: &ir::InstructionData, + _ctrl_typevar: ir::Type, + ) -> Result { + panic!("Should not be called when new-style backend is available!") + } + + fn encoding_info(&self) -> EncInfo { + panic!("Should not be called when new-style backend is available!") + } + + fn legalize_signature(&self, _sig: &mut Cow, _current: bool) { + panic!("Should not be called when new-style backend is available!") + } + + fn regclass_for_abi_type(&self, _ty: ir::Type) -> RegClass { + panic!("Should not be called when new-style backend is available!") + } + + fn allocatable_registers(&self, _func: &ir::Function) -> RegisterSet { + panic!("Should not be called when new-style backend is available!") + } + + fn prologue_epilogue(&self, _func: &mut ir::Function) -> CodegenResult<()> { + panic!("Should not be called when new-style backend is available!") + } + + #[cfg(feature = "testing_hooks")] + fn emit_inst( + &self, + _func: &ir::Function, + _inst: ir::Inst, + _divert: &mut RegDiversions, + _sink: &mut dyn binemit::CodeSink, + ) { + panic!("Should not be called when new-style backend is available!") + } + + /// Emit a whole function into memory. + fn emit_function_to_memory(&self, _func: &ir::Function, _sink: &mut binemit::MemoryCodeSink) { + panic!("Should not be called when new-style backend is available!") + } + + fn get_mach_backend(&self) -> Option<&dyn MachBackend> { + Some(&*self.backend) + } + + fn unsigned_add_overflow_condition(&self) -> ir::condcodes::IntCC { + self.backend.unsigned_add_overflow_condition() + } + + fn unsigned_sub_overflow_condition(&self) -> ir::condcodes::IntCC { + self.backend.unsigned_sub_overflow_condition() + } +} diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs new file mode 100644 index 000000000000..847f2a6b663c --- /dev/null +++ b/cranelift/codegen/src/machinst/blockorder.rs @@ -0,0 +1,59 @@ +//! Computation of basic block order in emitted code. + +use crate::machinst::*; +use regalloc::{BlockIx, Function}; + +/// Simple reverse postorder-based block order emission. +/// +/// TODO: use a proper algorithm, such as the bottom-up straight-line-section +/// construction algorithm. +struct BlockRPO { + visited: Vec, + postorder: Vec, + deferred_last: Option, +} + +impl BlockRPO { + fn new(vcode: &VCode) -> BlockRPO { + BlockRPO { + visited: vec![false; vcode.num_blocks()], + postorder: vec![], + deferred_last: None, + } + } + + fn visit(&mut self, vcode: &VCode, block: BlockIndex) { + self.visited[block as usize] = true; + for succ in vcode.succs(block) { + if !self.visited[*succ as usize] { + self.visit(vcode, *succ); + } + } + + for i in vcode.block_insns(BlockIx::new(block)) { + if vcode.get_insn(i).is_epilogue_placeholder() { + debug_assert!(self.deferred_last.is_none()); + self.deferred_last = Some(block); + return; + } + } + + self.postorder.push(block); + } + + fn rpo(self) -> Vec { + let mut rpo = self.postorder; + rpo.reverse(); + if let Some(block) = self.deferred_last { + rpo.push(block); + } + rpo + } +} + +/// Compute the final block order. 
+pub fn compute_final_block_order(vcode: &VCode) -> Vec { + let mut rpo = BlockRPO::new(vcode); + rpo.visit(vcode, vcode.entry()); + rpo.rpo() +} diff --git a/cranelift/codegen/src/machinst/compile.rs b/cranelift/codegen/src/machinst/compile.rs new file mode 100644 index 000000000000..eda3955f88d3 --- /dev/null +++ b/cranelift/codegen/src/machinst/compile.rs @@ -0,0 +1,63 @@ +//! Compilation backend pipeline: optimized IR to VCode / binemit. + +use crate::ir::Function; +use crate::machinst::*; +use crate::settings; +use crate::timing; + +use log::debug; +use regalloc::{allocate_registers, RegAllocAlgorithm}; + +/// Compile the given function down to VCode with allocated registers, ready +/// for binary emission. +pub fn compile( + f: &Function, + b: &B, + abi: Box>, + flags: &settings::Flags, +) -> VCode +where + B::MInst: ShowWithRRU, +{ + // This lowers the CL IR. + let mut vcode = Lower::new(f, abi).lower(b); + + let universe = &B::MInst::reg_universe(); + + debug!("vcode from lowering: \n{}", vcode.show_rru(Some(universe))); + + // Perform register allocation. + // TODO: select register allocation algorithm from flags. + let algorithm = RegAllocAlgorithm::Backtracking; + let result = { + let _tt = timing::regalloc(); + allocate_registers( + &mut vcode, algorithm, universe, /*request_block_annotations=*/ false, + ) + .map_err(|err| { + debug!( + "Register allocation error for vcode\n{}\nError: {:?}", + vcode.show_rru(Some(universe)), + err + ); + err + }) + .expect("register allocation") + }; + + // Reorder vcode into final order and copy out final instruction sequence + // all at once. This also inserts prologues/epilogues. + vcode.replace_insns_from_regalloc(result, flags); + + vcode.remove_redundant_branches(); + + // Do final passes over code to finalize branches. + vcode.finalize_branches(); + + debug!( + "vcode after regalloc: final version:\n{}", + vcode.show_rru(Some(universe)) + ); + + vcode +} diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs new file mode 100644 index 000000000000..0d8fb1ff0e0e --- /dev/null +++ b/cranelift/codegen/src/machinst/lower.rs @@ -0,0 +1,720 @@ +//! This module implements lowering (instruction selection) from Cranelift IR +//! to machine instructions with virtual registers. This is *almost* the final +//! machine code, except for register allocation. + +use crate::entity::SecondaryMap; +use crate::inst_predicates::has_side_effect; +use crate::ir::instructions::BranchInfo; +use crate::ir::{ + Block, ExternalName, Function, GlobalValueData, Inst, InstructionData, MemFlags, Opcode, + Signature, SourceLoc, Type, Value, ValueDef, +}; +use crate::machinst::{ABIBody, BlockIndex, VCode, VCodeBuilder, VCodeInst}; +use crate::num_uses::NumUses; + +use regalloc::{Reg, RegClass, Set, VirtualReg, Writable}; + +use alloc::boxed::Box; +use alloc::vec::Vec; +use log::debug; +use smallvec::SmallVec; +use std::collections::VecDeque; + +/// A context that machine-specific lowering code can use to emit lowered instructions. This is the +/// view of the machine-independent per-function lowering context that is seen by the machine +/// backend. +pub trait LowerCtx { + /// The instruction type for which this lowering framework is instantiated. + type I; + + /// Get the instdata for a given IR instruction. + fn data(&self, ir_inst: Inst) -> &InstructionData; + /// Get the controlling type for a polymorphic IR instruction. + fn ty(&self, ir_inst: Inst) -> Type; + /// Emit a machine instruction. 
+ fn emit(&mut self, mach_inst: Self::I); + /// Indicate that an IR instruction has been merged, and so one of its + /// uses is gone (replaced by uses of the instruction's inputs). This + /// helps the lowering algorithm to perform on-the-fly DCE, skipping over + /// unused instructions (such as immediates incorporated directly). + fn merged(&mut self, from_inst: Inst); + /// Get the producing instruction, if any, and output number, for the `idx`th input to the + /// given IR instruction + fn input_inst(&self, ir_inst: Inst, idx: usize) -> Option<(Inst, usize)>; + /// Map a Value to its associated writable (probably virtual) Reg. + fn value_to_writable_reg(&self, val: Value) -> Writable; + /// Map a Value to its associated (probably virtual) Reg. + fn value_to_reg(&self, val: Value) -> Reg; + /// Get the `idx`th input to the given IR instruction as a virtual register. + fn input(&self, ir_inst: Inst, idx: usize) -> Reg; + /// Get the `idx`th output of the given IR instruction as a virtual register. + fn output(&self, ir_inst: Inst, idx: usize) -> Writable; + /// Get the number of inputs to the given IR instruction. + fn num_inputs(&self, ir_inst: Inst) -> usize; + /// Get the number of outputs to the given IR instruction. + fn num_outputs(&self, ir_inst: Inst) -> usize; + /// Get the type for an instruction's input. + fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type; + /// Get the type for an instruction's output. + fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type; + /// Get a new temp. + fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable; + /// Get the number of block params. + fn num_bb_params(&self, bb: Block) -> usize; + /// Get the register for a block param. + fn bb_param(&self, bb: Block, idx: usize) -> Reg; + /// Get the register for a return value. + fn retval(&self, idx: usize) -> Writable; + /// Get the target for a call instruction, as an `ExternalName`. + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<&'b ExternalName>; + /// Get the signature for a call or call-indirect instruction. + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature>; + /// Get the symbol name and offset for a symbol_value instruction. + fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, i64)>; + /// Returns the memory flags of a given memory access. + fn memflags(&self, ir_inst: Inst) -> Option; + /// Get the source location for a given instruction. + fn srcloc(&self, ir_inst: Inst) -> SourceLoc; +} + +/// A machine backend. +pub trait LowerBackend { + /// The machine instruction type. + type MInst: VCodeInst; + + /// Lower a single instruction. Instructions are lowered in reverse order. + /// This function need not handle branches; those are always passed to + /// `lower_branch_group` below. + fn lower>(&self, ctx: &mut C, inst: Inst); + + /// Lower a block-terminating group of branches (which together can be seen as one + /// N-way branch), given a vcode BlockIndex for each target. + fn lower_branch_group>( + &self, + ctx: &mut C, + insts: &[Inst], + targets: &[BlockIndex], + fallthrough: Option, + ); +} + +/// Machine-independent lowering driver / machine-instruction container. Maintains a correspondence +/// from original Inst to MachInsts. +pub struct Lower<'a, I: VCodeInst> { + /// The function to lower. + f: &'a Function, + + /// Lowered machine instructions. + vcode: VCodeBuilder, + + /// Number of active uses (minus `dec_use()` calls by backend) of each instruction. 
+ num_uses: SecondaryMap, + + /// Mapping from `Value` (SSA value in IR) to virtual register. + value_regs: SecondaryMap, + + /// Return-value vregs. + retval_regs: Vec, + + /// Next virtual register number to allocate. + next_vreg: u32, +} + +fn alloc_vreg( + value_regs: &mut SecondaryMap, + regclass: RegClass, + value: Value, + next_vreg: &mut u32, +) -> VirtualReg { + if value_regs[value].get_index() == 0 { + // default value in map. + let v = *next_vreg; + *next_vreg += 1; + value_regs[value] = Reg::new_virtual(regclass, v); + } + value_regs[value].as_virtual_reg().unwrap() +} + +enum GenerateReturn { + Yes, + No, +} + +impl<'a, I: VCodeInst> Lower<'a, I> { + /// Prepare a new lowering context for the given IR function. + pub fn new(f: &'a Function, abi: Box>) -> Lower<'a, I> { + let mut vcode = VCodeBuilder::new(abi); + + let num_uses = NumUses::compute(f).take_uses(); + + let mut next_vreg: u32 = 1; + + // Default register should never be seen, but the `value_regs` map needs a default and we + // don't want to push `Option` everywhere. All values will be assigned registers by the + // loops over block parameters and instruction results below. + // + // We do not use vreg 0 so that we can detect any unassigned register that leaks through. + let default_register = Reg::new_virtual(RegClass::I32, 0); + let mut value_regs = SecondaryMap::with_default(default_register); + + // Assign a vreg to each value. + for bb in f.layout.blocks() { + for param in f.dfg.block_params(bb) { + let vreg = alloc_vreg( + &mut value_regs, + I::rc_for_type(f.dfg.value_type(*param)), + *param, + &mut next_vreg, + ); + vcode.set_vreg_type(vreg, f.dfg.value_type(*param)); + } + for inst in f.layout.block_insts(bb) { + for result in f.dfg.inst_results(inst) { + let vreg = alloc_vreg( + &mut value_regs, + I::rc_for_type(f.dfg.value_type(*result)), + *result, + &mut next_vreg, + ); + vcode.set_vreg_type(vreg, f.dfg.value_type(*result)); + } + } + } + + // Assign a vreg to each return value. 
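// Illustrative sketch (not part of the patch): the "vreg 0 means unassigned" sentinel
// used by `alloc_vreg` above, shown with a plain HashMap standing in for the
// `SecondaryMap` of value-to-register assignments. All names here are illustrative.
fn sentinel_demo() {
    use std::collections::HashMap;
    fn get_or_alloc(map: &mut HashMap<u32, u32>, value: u32, next: &mut u32) -> u32 {
        let slot = map.entry(value).or_insert(0); // 0 plays the "no vreg assigned yet" role
        if *slot == 0 {
            *slot = *next; // first sighting: hand out a fresh vreg (numbering starts at 1)
            *next += 1;
        }
        *slot
    }
    let mut value_regs = HashMap::new();
    let mut next_vreg = 1u32;
    assert_eq!(get_or_alloc(&mut value_regs, 7, &mut next_vreg), 1);
    assert_eq!(get_or_alloc(&mut value_regs, 7, &mut next_vreg), 1); // stable on repeat queries
    assert_eq!(get_or_alloc(&mut value_regs, 9, &mut next_vreg), 2);
}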
+ let mut retval_regs = vec![]; + for ret in &f.signature.returns { + let v = next_vreg; + next_vreg += 1; + let regclass = I::rc_for_type(ret.value_type); + let vreg = Reg::new_virtual(regclass, v); + retval_regs.push(vreg); + vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ret.value_type); + } + + Lower { + f, + vcode, + num_uses, + value_regs, + retval_regs, + next_vreg, + } + } + + fn gen_arg_setup(&mut self) { + if let Some(entry_bb) = self.f.layout.entry_block() { + debug!( + "gen_arg_setup: entry BB {} args are:\n{:?}", + entry_bb, + self.f.dfg.block_params(entry_bb) + ); + for (i, param) in self.f.dfg.block_params(entry_bb).iter().enumerate() { + let reg = Writable::from_reg(self.value_regs[*param]); + let insn = self.vcode.abi().gen_copy_arg_to_reg(i, reg); + self.vcode.push(insn); + } + } + } + + fn gen_retval_setup(&mut self, gen_ret_inst: GenerateReturn) { + for (i, reg) in self.retval_regs.iter().enumerate() { + let insn = self.vcode.abi().gen_copy_reg_to_retval(i, *reg); + self.vcode.push(insn); + } + let inst = match gen_ret_inst { + GenerateReturn::Yes => self.vcode.abi().gen_ret(), + GenerateReturn::No => self.vcode.abi().gen_epilogue_placeholder(), + }; + self.vcode.push(inst); + } + + fn find_reachable_bbs(&self) -> SmallVec<[Block; 16]> { + if let Some(entry) = self.f.layout.entry_block() { + let mut ret = SmallVec::new(); + let mut queue = VecDeque::new(); + let mut visited = SecondaryMap::with_default(false); + queue.push_back(entry); + visited[entry] = true; + while !queue.is_empty() { + let b = queue.pop_front().unwrap(); + ret.push(b); + let mut succs: SmallVec<[Block; 16]> = SmallVec::new(); + for inst in self.f.layout.block_insts(b) { + if self.f.dfg[inst].opcode().is_branch() { + visit_branch_targets(self.f, b, inst, |succ| { + succs.push(succ); + }); + } + } + for succ in succs.into_iter() { + if !visited[succ] { + queue.push_back(succ); + visited[succ] = true; + } + } + } + + ret + } else { + SmallVec::new() + } + } + + /// Lower the function. + pub fn lower>(mut self, backend: &B) -> VCode { + // Find all reachable blocks. + let bbs = self.find_reachable_bbs(); + + // This records a Block-to-BlockIndex map so that branch targets can be resolved. + let mut next_bindex = self.vcode.init_bb_map(&bbs[..]); + + // Allocate a separate BlockIndex for each control-flow instruction so that we can create + // the edge blocks later. Each entry for a control-flow inst is the edge block; the list + // has (control flow inst, edge block, orig block) tuples. + let mut edge_blocks_by_inst: SecondaryMap> = + SecondaryMap::with_default(vec![]); + let mut edge_blocks: Vec<(Inst, BlockIndex, Block)> = vec![]; + + debug!("about to lower function: {:?}", self.f); + debug!("bb map: {:?}", self.vcode.blocks_by_bb()); + + // Work backward (reverse block order, reverse through each block), skipping insns with zero + // uses. + for bb in bbs.iter().rev() { + for inst in self.f.layout.block_insts(*bb) { + let op = self.f.dfg[inst].opcode(); + if op.is_branch() { + // Find the original target. + let mut add_succ = |next_bb| { + let edge_block = next_bindex; + next_bindex += 1; + edge_blocks_by_inst[inst].push(edge_block); + edge_blocks.push((inst, edge_block, next_bb)); + }; + visit_branch_targets(self.f, *bb, inst, |succ| { + add_succ(succ); + }); + } + } + } + + for bb in bbs.iter() { + debug!("lowering bb: {}", bb); + + // If this is a return block, produce the return value setup. 
N.B.: this comes + // *before* the below because it must occur *after* any other instructions, and + // instructions are lowered in reverse order. + let last_insn = self.f.layout.block_insts(*bb).last().unwrap(); + let last_insn_opcode = self.f.dfg[last_insn].opcode(); + if last_insn_opcode.is_return() { + let gen_ret = if last_insn_opcode == Opcode::Return { + GenerateReturn::Yes + } else { + debug_assert!(last_insn_opcode == Opcode::FallthroughReturn); + GenerateReturn::No + }; + self.gen_retval_setup(gen_ret); + self.vcode.end_ir_inst(); + } + + // Find the branches at the end first, and process those, if any. + let mut branches: SmallVec<[Inst; 2]> = SmallVec::new(); + let mut targets: SmallVec<[BlockIndex; 2]> = SmallVec::new(); + + for inst in self.f.layout.block_insts(*bb).rev() { + debug!("lower: inst {}", inst); + if edge_blocks_by_inst[inst].len() > 0 { + branches.push(inst); + for target in edge_blocks_by_inst[inst].iter().rev().cloned() { + targets.push(target); + } + } else { + // We've reached the end of the branches -- process all as a group, first. + if branches.len() > 0 { + let fallthrough = self.f.layout.next_block(*bb); + let fallthrough = fallthrough.map(|bb| self.vcode.bb_to_bindex(bb)); + branches.reverse(); + targets.reverse(); + debug!( + "lower_branch_group: targets = {:?} branches = {:?}", + targets, branches + ); + backend.lower_branch_group( + &mut self, + &branches[..], + &targets[..], + fallthrough, + ); + self.vcode.end_ir_inst(); + branches.clear(); + targets.clear(); + } + + // Only codegen an instruction if it either has a side + // effect, or has at least one use of one of its results. + let num_uses = self.num_uses[inst]; + let side_effect = has_side_effect(self.f, inst); + if side_effect || num_uses > 0 { + backend.lower(&mut self, inst); + self.vcode.end_ir_inst(); + } else { + // If we're skipping the instruction, we need to dec-ref + // its arguments. + for arg in self.f.dfg.inst_args(inst) { + let val = self.f.dfg.resolve_aliases(*arg); + match self.f.dfg.value_def(val) { + ValueDef::Result(src_inst, _) => { + self.dec_use(src_inst); + } + _ => {} + } + } + } + } + } + + // There are possibly some branches left if the block contained only branches. + if branches.len() > 0 { + let fallthrough = self.f.layout.next_block(*bb); + let fallthrough = fallthrough.map(|bb| self.vcode.bb_to_bindex(bb)); + branches.reverse(); + targets.reverse(); + debug!( + "lower_branch_group: targets = {:?} branches = {:?}", + targets, branches + ); + backend.lower_branch_group(&mut self, &branches[..], &targets[..], fallthrough); + self.vcode.end_ir_inst(); + branches.clear(); + targets.clear(); + } + + // If this is the entry block, produce the argument setup. + if Some(*bb) == self.f.layout.entry_block() { + self.gen_arg_setup(); + self.vcode.end_ir_inst(); + } + + let vcode_bb = self.vcode.end_bb(); + debug!("finished building bb: BlockIndex {}", vcode_bb); + debug!("bb_to_bindex map says: {}", self.vcode.bb_to_bindex(*bb)); + assert!(vcode_bb == self.vcode.bb_to_bindex(*bb)); + if Some(*bb) == self.f.layout.entry_block() { + self.vcode.set_entry(vcode_bb); + } + } + + // Now create the edge blocks, with phi lowering (block parameter copies). + for (inst, edge_block, orig_block) in edge_blocks.into_iter() { + debug!( + "creating edge block: inst {}, edge_block {}, orig_block {}", + inst, edge_block, orig_block + ); + + // Create a temporary for each block parameter. 
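// Illustrative sketch (not part of the patch): the copies emitted below implement a
// *parallel* move of jump arguments into block parameters. Performing them one at a
// time is only safe when no destination register is also a source; otherwise a value
// can be clobbered before it is read, which is why the overlap check and the
// temp-register fallback below exist. A scalar illustration of the hazard:
fn parallel_move_hazard_demo() {
    let (a, b) = (1, 2);
    // Desired parallel assignment: (a, b) <- (b, a).
    // Naive sequential copies clobber a value: `a = b` makes a == 2, and the
    // following `b = a` then copies that 2 back, losing the original `a`.
    // Copying through temporaries (what the fallback path does) is safe:
    let (ta, tb) = (b, a);
    let (a, b) = (ta, tb);
    assert_eq!((a, b), (2, 1));
}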
+ let phi_classes: Vec<(Type, RegClass)> = self + .f + .dfg + .block_params(orig_block) + .iter() + .map(|p| self.f.dfg.value_type(*p)) + .map(|ty| (ty, I::rc_for_type(ty))) + .collect(); + + // FIXME sewardj 2020Feb29: use SmallVec + let mut src_regs = vec![]; + let mut dst_regs = vec![]; + + // Create all of the phi uses (reads) from jump args to temps. + + // Round up all the source and destination regs + for (i, arg) in self.f.dfg.inst_variable_args(inst).iter().enumerate() { + let arg = self.f.dfg.resolve_aliases(*arg); + debug!("jump arg {} is {}", i, arg); + src_regs.push(self.value_regs[arg]); + } + for (i, param) in self.f.dfg.block_params(orig_block).iter().enumerate() { + debug!("bb arg {} is {}", i, param); + dst_regs.push(Writable::from_reg(self.value_regs[*param])); + } + debug_assert!(src_regs.len() == dst_regs.len()); + debug_assert!(phi_classes.len() == dst_regs.len()); + + // If, as is mostly the case, the source and destination register + // sets are non overlapping, then we can copy directly, so as to + // save the register allocator work. + if !Set::::from_vec(src_regs.clone()).intersects(&Set::::from_vec( + dst_regs.iter().map(|r| r.to_reg()).collect(), + )) { + for (dst_reg, (src_reg, (ty, _))) in + dst_regs.iter().zip(src_regs.iter().zip(phi_classes)) + { + self.vcode.push(I::gen_move(*dst_reg, *src_reg, ty)); + } + } else { + // There's some overlap, so play safe and copy via temps. + + let tmp_regs: Vec> = phi_classes + .iter() + .map(|&(ty, rc)| self.tmp(rc, ty)) // borrows `self` mutably. + .collect(); + + debug!("phi_temps = {:?}", tmp_regs); + debug_assert!(tmp_regs.len() == src_regs.len()); + + for (tmp_reg, (src_reg, &(ty, _))) in + tmp_regs.iter().zip(src_regs.iter().zip(phi_classes.iter())) + { + self.vcode.push(I::gen_move(*tmp_reg, *src_reg, ty)); + } + for (dst_reg, (tmp_reg, &(ty, _))) in + dst_regs.iter().zip(tmp_regs.iter().zip(phi_classes.iter())) + { + self.vcode.push(I::gen_move(*dst_reg, tmp_reg.to_reg(), ty)); + } + } + + // Create the unconditional jump to the original target block. + self.vcode + .push(I::gen_jump(self.vcode.bb_to_bindex(orig_block))); + + // End the IR inst and block. (We lower this as if it were one IR instruction so that + // we can emit machine instructions in forward order.) + self.vcode.end_ir_inst(); + let blocknum = self.vcode.end_bb(); + assert!(blocknum == edge_block); + } + + // Now that we've emitted all instructions into the VCodeBuilder, let's build the VCode. + self.vcode.build() + } + + /// Reduce the use-count of an IR instruction. Use this when, e.g., isel incorporates the + /// computation of an input instruction directly, so that input instruction has one + /// fewer use. + fn dec_use(&mut self, ir_inst: Inst) { + assert!(self.num_uses[ir_inst] > 0); + self.num_uses[ir_inst] -= 1; + debug!( + "incref: ir_inst {} now has {} uses", + ir_inst, self.num_uses[ir_inst] + ); + } + + /// Increase the use-count of an IR instruction. Use this when, e.g., isel incorporates + /// the computation of an input instruction directly, so that input instruction's + /// inputs are now used directly by the merged instruction. + fn inc_use(&mut self, ir_inst: Inst) { + self.num_uses[ir_inst] += 1; + debug!( + "decref: ir_inst {} now has {} uses", + ir_inst, self.num_uses[ir_inst] + ); + } +} + +impl<'a, I: VCodeInst> LowerCtx for Lower<'a, I> { + type I = I; + + /// Get the instdata for a given IR instruction. 
+ fn data(&self, ir_inst: Inst) -> &InstructionData { + &self.f.dfg[ir_inst] + } + + /// Get the controlling type for a polymorphic IR instruction. + fn ty(&self, ir_inst: Inst) -> Type { + self.f.dfg.ctrl_typevar(ir_inst) + } + + /// Emit a machine instruction. + fn emit(&mut self, mach_inst: I) { + self.vcode.push(mach_inst); + } + + /// Indicate that a merge has occurred. + fn merged(&mut self, from_inst: Inst) { + debug!("merged: inst {}", from_inst); + // First, inc-ref all inputs of `from_inst`, because they are now used + // directly by `into_inst`. + for arg in self.f.dfg.inst_args(from_inst) { + let arg = self.f.dfg.resolve_aliases(*arg); + match self.f.dfg.value_def(arg) { + ValueDef::Result(src_inst, _) => { + debug!(" -> inc-reffing src inst {}", src_inst); + self.inc_use(src_inst); + } + _ => {} + } + } + // Then, dec-ref the merged instruction itself. It still retains references + // to its arguments (inc-ref'd above). If its refcount has reached zero, + // it will be skipped during emission and its args will be dec-ref'd at that + // time. + self.dec_use(from_inst); + } + + /// Get the producing instruction, if any, and output number, for the `idx`th input to the + /// given IR instruction. + fn input_inst(&self, ir_inst: Inst, idx: usize) -> Option<(Inst, usize)> { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + match self.f.dfg.value_def(val) { + ValueDef::Result(src_inst, result_idx) => Some((src_inst, result_idx)), + _ => None, + } + } + + /// Map a Value to its associated writable (probably virtual) Reg. + fn value_to_writable_reg(&self, val: Value) -> Writable { + let val = self.f.dfg.resolve_aliases(val); + Writable::from_reg(self.value_regs[val]) + } + + /// Map a Value to its associated (probably virtual) Reg. + fn value_to_reg(&self, val: Value) -> Reg { + let val = self.f.dfg.resolve_aliases(val); + self.value_regs[val] + } + + /// Get the `idx`th input to the given IR instruction as a virtual register. + fn input(&self, ir_inst: Inst, idx: usize) -> Reg { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.value_to_reg(val) + } + + /// Get the `idx`th output of the given IR instruction as a virtual register. + fn output(&self, ir_inst: Inst, idx: usize) -> Writable { + let val = self.f.dfg.inst_results(ir_inst)[idx]; + self.value_to_writable_reg(val) + } + + /// Get a new temp. + fn tmp(&mut self, rc: RegClass, ty: Type) -> Writable { + let v = self.next_vreg; + self.next_vreg += 1; + let vreg = Reg::new_virtual(rc, v); + self.vcode.set_vreg_type(vreg.as_virtual_reg().unwrap(), ty); + Writable::from_reg(vreg) + } + + /// Get the number of inputs for the given IR instruction. + fn num_inputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_args(ir_inst).len() + } + + /// Get the number of outputs for the given IR instruction. + fn num_outputs(&self, ir_inst: Inst) -> usize { + self.f.dfg.inst_results(ir_inst).len() + } + + /// Get the type for an instruction's input. + fn input_ty(&self, ir_inst: Inst, idx: usize) -> Type { + let val = self.f.dfg.inst_args(ir_inst)[idx]; + let val = self.f.dfg.resolve_aliases(val); + self.f.dfg.value_type(val) + } + + /// Get the type for an instruction's output. + fn output_ty(&self, ir_inst: Inst, idx: usize) -> Type { + self.f.dfg.value_type(self.f.dfg.inst_results(ir_inst)[idx]) + } + + /// Get the number of block params. 
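
The `merged` / `dec_use` / `inc_use` machinery is what lets a backend fold a producer (say, an `iconst`) into the instruction that consumes it and then skip the producer entirely once its use count reaches zero. A sketch of the shape of such a lowering, against a stripped-down stand-in for the context (toy types, not the real `LowerCtx` trait):

```rust
// Toy stand-ins: integers for registers, strings for emitted instructions.
struct ToyCtx {
    inputs: Vec<u32>,         // virtual regs holding the IR instruction's inputs
    output: u32,              // virtual reg for its single result
    emitted: Vec<String>,
    merged_inputs: Vec<usize>, // indices of inputs whose producers were absorbed
}

impl ToyCtx {
    fn input(&self, idx: usize) -> u32 { self.inputs[idx] }
    fn output(&self) -> u32 { self.output }
    fn emit(&mut self, s: String) { self.emitted.push(s); }
    fn merged(&mut self, idx: usize) { self.merged_inputs.push(idx); }
}

/// Lower `out = add(in0, in1)` where `in1` is known to come from a constant;
/// fold the constant into an immediate form and record the merge.
fn lower_add_imm(ctx: &mut ToyCtx, imm: i64) {
    let rd = ctx.output();
    let rn = ctx.input(0);
    ctx.emit(format!("add x{}, x{}, #{}", rd, rn, imm));
    ctx.merged(1); // the constant feeding input 1 no longer needs its own code
}

fn main() {
    let mut ctx = ToyCtx { inputs: vec![3, 4], output: 5, emitted: vec![], merged_inputs: vec![] };
    lower_add_imm(&mut ctx, 42);
    println!("{:?} (merged inputs: {:?})", ctx.emitted, ctx.merged_inputs);
}
```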
+ fn num_bb_params(&self, bb: Block) -> usize { + self.f.dfg.block_params(bb).len() + } + + /// Get the register for a block param. + fn bb_param(&self, bb: Block, idx: usize) -> Reg { + let val = self.f.dfg.block_params(bb)[idx]; + self.value_regs[val] + } + + /// Get the register for a return value. + fn retval(&self, idx: usize) -> Writable { + Writable::from_reg(self.retval_regs[idx]) + } + + /// Get the target for a call instruction, as an `ExternalName`. + fn call_target<'b>(&'b self, ir_inst: Inst) -> Option<&'b ExternalName> { + match &self.f.dfg[ir_inst] { + &InstructionData::Call { func_ref, .. } + | &InstructionData::FuncAddr { func_ref, .. } => { + let funcdata = &self.f.dfg.ext_funcs[func_ref]; + Some(&funcdata.name) + } + _ => None, + } + } + /// Get the signature for a call or call-indirect instruction. + fn call_sig<'b>(&'b self, ir_inst: Inst) -> Option<&'b Signature> { + match &self.f.dfg[ir_inst] { + &InstructionData::Call { func_ref, .. } => { + let funcdata = &self.f.dfg.ext_funcs[func_ref]; + Some(&self.f.dfg.signatures[funcdata.signature]) + } + &InstructionData::CallIndirect { sig_ref, .. } => Some(&self.f.dfg.signatures[sig_ref]), + _ => None, + } + } + + /// Get the symbol name and offset for a symbol_value instruction. + fn symbol_value<'b>(&'b self, ir_inst: Inst) -> Option<(&'b ExternalName, i64)> { + match &self.f.dfg[ir_inst] { + &InstructionData::UnaryGlobalValue { global_value, .. } => { + let gvdata = &self.f.global_values[global_value]; + match gvdata { + &GlobalValueData::Symbol { + ref name, + ref offset, + .. + } => { + let offset = offset.bits(); + Some((name, offset)) + } + _ => None, + } + } + _ => None, + } + } + + /// Returns the memory flags of a given memory access. + fn memflags(&self, ir_inst: Inst) -> Option { + match &self.f.dfg[ir_inst] { + &InstructionData::Load { flags, .. } + | &InstructionData::LoadComplex { flags, .. } + | &InstructionData::Store { flags, .. } + | &InstructionData::StoreComplex { flags, .. } => Some(flags), + _ => None, + } + } + + /// Get the source location for a given instruction. + fn srcloc(&self, ir_inst: Inst) -> SourceLoc { + self.f.srclocs[ir_inst] + } +} + +fn visit_branch_targets(f: &Function, block: Block, inst: Inst, mut visit: F) { + if f.dfg[inst].opcode() == Opcode::Fallthrough { + visit(f.layout.next_block(block).unwrap()); + } else { + match f.dfg[inst].analyze_branch(&f.dfg.value_lists) { + BranchInfo::NotABranch => {} + BranchInfo::SingleDest(dest, _) => { + visit(dest); + } + BranchInfo::Table(table, maybe_dest) => { + if let Some(dest) = maybe_dest { + visit(dest); + } + for &dest in f.jump_tables[table].as_slice() { + visit(dest); + } + } + } + } +} diff --git a/cranelift/codegen/src/machinst/mod.rs b/cranelift/codegen/src/machinst/mod.rs new file mode 100644 index 000000000000..844d0d1a4f48 --- /dev/null +++ b/cranelift/codegen/src/machinst/mod.rs @@ -0,0 +1,280 @@ +//! This module exposes the machine-specific backend definition pieces. +//! +//! The MachInst infrastructure is the compiler backend, from CLIF +//! (ir::Function) to machine code. The purpose of this infrastructure is, at a +//! high level, to do instruction selection/lowering (to machine instructions), +//! register allocation, and then perform all the fixups to branches, constant +//! data references, etc., needed to actually generate machine code. +//! +//! The container for machine instructions, at various stages of construction, +//! is the `VCode` struct. We refer to a sequence of machine instructions organized +//! 
into basic blocks as "vcode". This is short for "virtual-register code", though +//! it's a bit of a misnomer because near the end of the pipeline, vcode has all +//! real registers. Nevertheless, the name is catchy and we like it. +//! +//! The compilation pipeline, from an `ir::Function` (already optimized as much as +//! you like by machine-independent optimization passes) onward, is as follows. +//! (N.B.: though we show the VCode separately at each stage, the passes +//! mutate the VCode in place; these are not separate copies of the code.) +//! +//! ```plain +//! +//! ir::Function (SSA IR, machine-independent opcodes) +//! | +//! | [lower] +//! | +//! VCode (machine instructions: +//! | - mostly virtual registers. +//! | - cond branches in two-target form. +//! | - branch targets are block indices. +//! | - in-memory constants held by insns, +//! | with unknown offsets. +//! | - critical edges (actually all edges) +//! | are split.) +//! | [regalloc] +//! | +//! VCode (machine instructions: +//! | - all real registers. +//! | - new instruction sequence returned +//! | out-of-band in RegAllocResult. +//! | - instruction sequence has spills, +//! | reloads, and moves inserted. +//! | - other invariants same as above.) +//! | +//! | [preamble/postamble] +//! | +//! VCode (machine instructions: +//! | - stack-frame size known. +//! | - out-of-band instruction sequence +//! | has preamble prepended to entry +//! | block, and postamble injected before +//! | every return instruction. +//! | - all symbolic stack references to +//! | stackslots and spillslots are resolved +//! | to concrete FP-offset mem addresses.) +//! | [block/insn ordering] +//! | +//! VCode (machine instructions: +//! | - vcode.final_block_order is filled in. +//! | - new insn sequence from regalloc is +//! | placed back into vcode and block +//! | boundaries are updated.) +//! | [redundant branch/block +//! | removal] +//! | +//! VCode (machine instructions: +//! | - all blocks that were just an +//! | unconditional branch are removed.) +//! | +//! | [branch finalization +//! | (fallthroughs)] +//! | +//! VCode (machine instructions: +//! | - all branches are in lowered one- +//! | target form, but targets are still +//! | block indices.) +//! | +//! | [branch finalization +//! | (offsets)] +//! | +//! VCode (machine instructions: +//! | - all branch offsets from start of +//! | function are known, and all branches +//! | have resolved-offset targets.) +//! | +//! | [MemArg finalization] +//! | +//! VCode (machine instructions: +//! | - all MemArg references to the constant +//! | pool are replaced with offsets. +//! | - all constant-pool data is collected +//! | in the VCode.) +//! | +//! | [binary emission] +//! | +//! Vec (machine code!) +//! +//! 
``` + +use crate::binemit::{CodeInfo, CodeOffset}; +use crate::entity::SecondaryMap; +use crate::ir::condcodes::IntCC; +use crate::ir::{Function, Type}; +use crate::result::CodegenResult; +use crate::settings::Flags; +use alloc::boxed::Box; +use alloc::vec::Vec; +use core::fmt::Debug; +use regalloc::Map as RegallocMap; +use regalloc::RegUsageCollector; +use regalloc::{RealReg, RealRegUniverse, Reg, RegClass, SpillSlot, VirtualReg, Writable}; +use std::string::String; +use target_lexicon::Triple; + +pub mod lower; +pub use lower::*; +pub mod vcode; +pub use vcode::*; +pub mod compile; +pub use compile::*; +pub mod blockorder; +pub use blockorder::*; +pub mod abi; +pub use abi::*; +pub mod pretty_print; +pub use pretty_print::*; +pub mod sections; +pub use sections::*; +pub mod adapter; +pub use adapter::*; + +/// A machine instruction. +pub trait MachInst: Clone + Debug { + /// Return the registers referenced by this machine instruction along with + /// the modes of reference (use, def, modify). + fn get_regs(&self, collector: &mut RegUsageCollector); + + /// Map virtual registers to physical registers using the given virt->phys + /// maps corresponding to the program points prior to, and after, this instruction. + fn map_regs( + &mut self, + pre_map: &RegallocMap, + post_map: &RegallocMap, + ); + + /// If this is a simple move, return the (source, destination) tuple of registers. + fn is_move(&self) -> Option<(Writable, Reg)>; + + /// Is this a terminator (branch or ret)? If so, return its type + /// (ret/uncond/cond) and target if applicable. + fn is_term<'a>(&'a self) -> MachTerminator<'a>; + + /// Returns true if the instruction is an epilogue placeholder. + fn is_epilogue_placeholder(&self) -> bool; + + /// Generate a move. + fn gen_move(to_reg: Writable, from_reg: Reg, ty: Type) -> Self; + + /// Generate a zero-length no-op. + fn gen_zero_len_nop() -> Self; + + /// Possibly operate on a value directly in a spill-slot rather than a + /// register. Useful if the machine has register-memory instruction forms + /// (e.g., add directly from or directly to memory), like x86. + fn maybe_direct_reload(&self, reg: VirtualReg, slot: SpillSlot) -> Option; + + /// Determine a register class to store the given CraneLift type. + fn rc_for_type(ty: Type) -> RegClass; + + /// Generate a jump to another target. Used during lowering of + /// control flow. + fn gen_jump(target: BlockIndex) -> Self; + + /// Generate a NOP. The `preferred_size` parameter allows the caller to + /// request a NOP of that size, or as close to it as possible. The machine + /// backend may return a NOP whose binary encoding is smaller than the + /// preferred size, but must not return a NOP that is larger. However, + /// the instruction must have a nonzero size. + fn gen_nop(preferred_size: usize) -> Self; + + /// Rewrite block targets using the block-target map. + fn with_block_rewrites(&mut self, block_target_map: &[BlockIndex]); + + /// Finalize branches once the block order (fallthrough) is known. + fn with_fallthrough_block(&mut self, fallthrough_block: Option); + + /// Update instruction once block offsets are known. These offsets are + /// relative to the beginning of the function. `targets` is indexed by + /// BlockIndex. + fn with_block_offsets(&mut self, my_offset: CodeOffset, targets: &[CodeOffset]); + + /// Get the register universe for this backend. + fn reg_universe() -> RealRegUniverse; + + /// Align a basic block offset (from start of function). By default, no + /// alignment occurs. 
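
Several of the `MachInst` hooks above exist only so that two-target conditional branches can be collapsed late, once block layout is known (`with_fallthrough_block`, `with_block_offsets`). The core decision, sketched with a hypothetical toy branch type rather than a real `MachInst` implementor:

```rust
/// A toy two-target conditional branch, before finalization.
struct CondBr {
    taken: u32,
    not_taken: u32,
}

/// Mimic `with_fallthrough_block`: once the next block in the layout is
/// known, keep only the branches that are actually needed.
fn finalize(br: &CondBr, fallthrough: Option<u32>) -> Vec<String> {
    if fallthrough == Some(br.not_taken) {
        // Not-taken path falls through; a single conditional branch suffices.
        vec![format!("b.cond block{}", br.taken)]
    } else if fallthrough == Some(br.taken) {
        // Invert the condition so the taken path becomes the fallthrough.
        vec![format!("b.inv_cond block{}", br.not_taken)]
    } else {
        // Neither target follows: conditional branch plus unconditional jump.
        vec![
            format!("b.cond block{}", br.taken),
            format!("b block{}", br.not_taken),
        ]
    }
}

fn main() {
    let br = CondBr { taken: 2, not_taken: 3 };
    println!("{:?}", finalize(&br, Some(3)));
    println!("{:?}", finalize(&br, Some(2)));
    println!("{:?}", finalize(&br, Some(7)));
}
```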
+ fn align_basic_block(offset: CodeOffset) -> CodeOffset { + offset + } +} + +/// Describes a block terminator (not call) in the vcode, when its branches +/// have not yet been finalized (so a branch may have two targets). +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum MachTerminator<'a> { + /// Not a terminator. + None, + /// A return instruction. + Ret, + /// An unconditional branch to another block. + Uncond(BlockIndex), + /// A conditional branch to one of two other blocks. + Cond(BlockIndex, BlockIndex), + /// An indirect branch with known possible targets. + Indirect(&'a [BlockIndex]), +} + +/// A trait describing the ability to encode a MachInst into binary machine code. +pub trait MachInstEmit { + /// Emit the instruction. + fn emit(&self, code: &mut O); +} + +/// The result of a `MachBackend::compile_function()` call. Contains machine +/// code (as bytes) and a disassembly, if requested. +pub struct MachCompileResult { + /// Machine code. + pub sections: MachSections, + /// Size of stack frame, in bytes. + pub frame_size: u32, + /// Disassembly, if requested. + pub disasm: Option, +} + +impl MachCompileResult { + /// Get a `CodeInfo` describing section sizes from this compilation result. + pub fn code_info(&self) -> CodeInfo { + let code_size = self.sections.total_size(); + CodeInfo { + code_size, + jumptables_size: 0, + rodata_size: 0, + total_size: code_size, + } + } +} + +/// Top-level machine backend trait, which wraps all monomorphized code and +/// allows a virtual call from the machine-independent `Function::compile()`. +pub trait MachBackend { + /// Compile the given function. + fn compile_function( + &self, + func: &Function, + want_disasm: bool, + ) -> CodegenResult; + + /// Return flags for this backend. + fn flags(&self) -> &Flags; + + /// Return triple for this backend. + fn triple(&self) -> Triple; + + /// Return name for this backend. + fn name(&self) -> &'static str; + + /// Return the register universe for this backend. + fn reg_universe(&self) -> RealRegUniverse; + + /// Machine-specific condcode info needed by TargetIsa. + fn unsigned_add_overflow_condition(&self) -> IntCC { + // TODO: this is what x86 specifies. Is this right for arm64? + IntCC::UnsignedLessThan + } + + /// Machine-specific condcode info needed by TargetIsa. + fn unsigned_sub_overflow_condition(&self) -> IntCC { + // TODO: this is what x86 specifies. Is this right for arm64? + IntCC::UnsignedLessThan + } +} diff --git a/cranelift/codegen/src/machinst/pretty_print.rs b/cranelift/codegen/src/machinst/pretty_print.rs new file mode 100644 index 000000000000..40e7c1b84298 --- /dev/null +++ b/cranelift/codegen/src/machinst/pretty_print.rs @@ -0,0 +1,66 @@ +//! Pretty-printing for machine code (virtual-registerized or final). + +use regalloc::{RealRegUniverse, Reg, Writable}; + +use std::fmt::Debug; +use std::hash::Hash; +use std::string::{String, ToString}; + +// FIXME: Should this go into regalloc.rs instead? + +/// A trait for printing instruction bits and pieces, with the the ability to +/// take a contextualising RealRegUniverse that is used to give proper names to +/// registers. +pub trait ShowWithRRU { + /// Return a string that shows the implementing object in context of the + /// given `RealRegUniverse`, if provided. + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String; + + /// The same as |show_rru|, but with an optional hint giving a size in + /// bytes. 
Its interpretation is object-dependent, and it is intended to + /// pass around enough information to facilitate printing sub-parts of + /// real registers correctly. Objects may ignore size hints that are + /// irrelevant to them. + fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, _size: u8) -> String { + // Default implementation is to ignore the hint. + self.show_rru(mb_rru) + } +} + +impl ShowWithRRU for Reg { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + if self.is_real() { + if let Some(rru) = mb_rru { + let reg_ix = self.get_index(); + if reg_ix < rru.regs.len() { + return rru.regs[reg_ix].1.to_string(); + } else { + // We have a real reg which isn't listed in the universe. + // Per the regalloc.rs interface requirements, this is + // Totally Not Allowed. Print it generically anyway, so + // we have something to debug. + return format!("!!{:?}!!", self); + } + } + } + // The reg is virtual, or we have no universe. Be generic. + format!("%{:?}", self) + } + + fn show_rru_sized(&self, _mb_rru: Option<&RealRegUniverse>, _size: u8) -> String { + // For the specific case of Reg, we demand not to have a size hint, + // since interpretation of the size is target specific, but this code + // is used by all targets. + panic!("Reg::show_rru_sized: impossible to implement"); + } +} + +impl ShowWithRRU for Writable { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + self.to_reg().show_rru(mb_rru) + } + + fn show_rru_sized(&self, mb_rru: Option<&RealRegUniverse>, size: u8) -> String { + self.to_reg().show_rru_sized(mb_rru, size) + } +} diff --git a/cranelift/codegen/src/machinst/sections.rs b/cranelift/codegen/src/machinst/sections.rs new file mode 100644 index 000000000000..247adf5cef48 --- /dev/null +++ b/cranelift/codegen/src/machinst/sections.rs @@ -0,0 +1,354 @@ +//! In-memory representation of compiled machine code, in multiple sections +//! (text, constant pool / rodata, etc). Emission occurs into multiple sections +//! simultaneously, so we buffer the result in memory and hand off to the +//! caller at the end of compilation. + +use crate::binemit::{Addend, CodeOffset, CodeSink, Reloc}; +use crate::ir::{ExternalName, Opcode, SourceLoc, TrapCode}; + +use alloc::vec::Vec; + +/// A collection of sections with defined start-offsets. +pub struct MachSections { + /// Sections, in offset order. + pub sections: Vec, +} + +impl MachSections { + /// New, empty set of sections. + pub fn new() -> MachSections { + MachSections { sections: vec![] } + } + + /// Add a section with a known offset and size. Returns the index. + pub fn add_section(&mut self, start: CodeOffset, length: CodeOffset) -> usize { + let idx = self.sections.len(); + self.sections.push(MachSection::new(start, length)); + idx + } + + /// Mutably borrow the given section by index. + pub fn get_section<'a>(&'a mut self, idx: usize) -> &'a mut MachSection { + &mut self.sections[idx] + } + + /// Get mutable borrows of two sections simultaneously. Used during + /// instruction emission to provide references to the .text and .rodata + /// (constant pool) sections. + pub fn two_sections<'a>( + &'a mut self, + idx1: usize, + idx2: usize, + ) -> (&'a mut MachSection, &'a mut MachSection) { + assert!(idx1 < idx2); + assert!(idx1 < self.sections.len()); + assert!(idx2 < self.sections.len()); + let (first, rest) = self.sections.split_at_mut(idx2); + (&mut first[idx1], &mut rest[0]) + } + + /// Emit this set of sections to a set of sinks for the code, + /// relocations, traps, and stackmap. 
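
`two_sections` relies on `split_at_mut` to hand out two disjoint mutable borrows from the one `Vec` of sections, which is why it insists on `idx1 < idx2`. The same pattern in isolation:

```rust
/// Return mutable references to two distinct elements of a slice,
/// requiring idx1 < idx2 so a single split suffices.
fn two_mut<T>(items: &mut [T], idx1: usize, idx2: usize) -> (&mut T, &mut T) {
    assert!(idx1 < idx2 && idx2 < items.len());
    let (first, rest) = items.split_at_mut(idx2);
    (&mut first[idx1], &mut rest[0])
}

fn main() {
    let mut sections = vec![vec![0u8; 4], vec![0u8; 8], vec![0u8; 2]];
    let (text, rodata) = two_mut(&mut sections, 0, 1);
    text.push(0x90);
    rodata.push(0xff);
    println!("{:?}", sections);
}
```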
+ pub fn emit(&self, sink: &mut CS) { + // N.B.: we emit every section into the .text section as far as + // the `CodeSink` is concerned; we do not bother to segregate + // the contents into the actual program text, the jumptable and the + // rodata (constant pool). This allows us to generate code assuming + // that these will not be relocated relative to each other, and avoids + // having to designate each section as belonging in one of the three + // fixed categories defined by `CodeSink`. If this becomes a problem + // later (e.g. because of memory permissions or similar), we can + // add this designation and segregate the output; take care, however, + // to add the appropriate relocations in this case. + + for section in &self.sections { + if section.data.len() > 0 { + while sink.offset() < section.start_offset { + sink.put1(0); + } + section.emit(sink); + } + } + sink.begin_jumptables(); + sink.begin_rodata(); + sink.end_codegen(); + } + + /// Get the total required size for these sections. + pub fn total_size(&self) -> CodeOffset { + if self.sections.len() == 0 { + 0 + } else { + // Find the last non-empty section. + self.sections + .iter() + .rev() + .find(|s| s.data.len() > 0) + .map(|s| s.cur_offset_from_start()) + .unwrap_or(0) + } + } +} + +/// An abstraction over MachSection and MachSectionSize: some +/// receiver of section data. +pub trait MachSectionOutput { + /// Get the current offset from the start of all sections. + fn cur_offset_from_start(&self) -> CodeOffset; + + /// Get the start offset of this section. + fn start_offset(&self) -> CodeOffset; + + /// Add 1 byte to the section. + fn put1(&mut self, _: u8); + + /// Add 2 bytes to the section. + fn put2(&mut self, value: u16) { + let [b0, b1] = value.to_le_bytes(); + self.put1(b0); + self.put1(b1); + } + + /// Add 4 bytes to the section. + fn put4(&mut self, value: u32) { + let [b0, b1, b2, b3] = value.to_le_bytes(); + self.put1(b0); + self.put1(b1); + self.put1(b2); + self.put1(b3); + } + + /// Add 8 bytes to the section. + fn put8(&mut self, value: u64) { + let [b0, b1, b2, b3, b4, b5, b6, b7] = value.to_le_bytes(); + self.put1(b0); + self.put1(b1); + self.put1(b2); + self.put1(b3); + self.put1(b4); + self.put1(b5); + self.put1(b6); + self.put1(b7); + } + + /// Add a slice of bytes to the section. + fn put_data(&mut self, data: &[u8]); + + /// Add a relocation at the current offset. + fn add_reloc(&mut self, loc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend); + + /// Add a trap record at the current offset. + fn add_trap(&mut self, loc: SourceLoc, code: TrapCode); + + /// Add a call return address record at the current offset. + fn add_call_site(&mut self, loc: SourceLoc, opcode: Opcode); + + /// Align up to the given alignment. + fn align_to(&mut self, align_to: CodeOffset) { + assert!(align_to.is_power_of_two()); + while self.cur_offset_from_start() & (align_to - 1) != 0 { + self.put1(0); + } + } +} + +/// A section of output to be emitted to a CodeSink / RelocSink in bulk. +/// Multiple sections may be created with known start offsets in advance; the +/// usual use-case is to create the .text (code) and .rodata (constant pool) at +/// once, after computing the length of the code, so that constant references +/// can use known offsets as instructions are emitted. +pub struct MachSection { + /// The starting offset of this section. + pub start_offset: CodeOffset, + /// The limit of this section, defined by the start of the next section. 
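
The `MachSectionOutput` trait only asks implementors for `put1` and an offset; the multi-byte writers and `align_to` are derived from those, in little-endian order. A minimal `Vec<u8>`-backed sink (hypothetical, with relocations, traps and call sites omitted) shows the behaviour:

```rust
trait ByteSink {
    fn offset(&self) -> u32;
    fn put1(&mut self, b: u8);

    fn put4(&mut self, value: u32) {
        for b in value.to_le_bytes() {
            self.put1(b);
        }
    }

    fn align_to(&mut self, align: u32) {
        assert!(align.is_power_of_two());
        while self.offset() & (align - 1) != 0 {
            self.put1(0); // zero padding, as in the real trait
        }
    }
}

struct VecSink(Vec<u8>);

impl ByteSink for VecSink {
    fn offset(&self) -> u32 {
        self.0.len() as u32
    }
    fn put1(&mut self, b: u8) {
        self.0.push(b);
    }
}

fn main() {
    let mut sink = VecSink(Vec::new());
    sink.put1(0xaa);
    sink.align_to(4);       // pads with three zero bytes
    sink.put4(0xdead_beef); // emitted as ef be ad de (little-endian)
    println!("{:02x?}", sink.0);
}
```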
+ pub length_limit: CodeOffset, + /// The section contents, as raw bytes. + pub data: Vec, + /// Any relocations referring to this section. + pub relocs: Vec, + /// Any trap records referring to this section. + pub traps: Vec, + /// Any call site record referring to this section. + pub call_sites: Vec, +} + +impl MachSection { + /// Create a new section, known to start at `start_offset` and with a size limited to `length_limit`. + pub fn new(start_offset: CodeOffset, length_limit: CodeOffset) -> MachSection { + MachSection { + start_offset, + length_limit, + data: vec![], + relocs: vec![], + traps: vec![], + call_sites: vec![], + } + } + + /// Emit this section to the CodeSink and other associated sinks. The + /// current offset of the CodeSink must match the starting offset of this + /// section. + pub fn emit(&self, sink: &mut CS) { + assert!(sink.offset() == self.start_offset); + + let mut next_reloc = 0; + let mut next_trap = 0; + let mut next_call_site = 0; + for (idx, byte) in self.data.iter().enumerate() { + if next_reloc < self.relocs.len() { + let reloc = &self.relocs[next_reloc]; + if reloc.offset == idx as CodeOffset { + sink.reloc_external(reloc.srcloc, reloc.kind, &reloc.name, reloc.addend); + next_reloc += 1; + } + } + if next_trap < self.traps.len() { + let trap = &self.traps[next_trap]; + if trap.offset == idx as CodeOffset { + sink.trap(trap.code, trap.srcloc); + next_trap += 1; + } + } + if next_call_site < self.call_sites.len() { + let call_site = &self.call_sites[next_call_site]; + if call_site.ret_addr == idx as CodeOffset { + sink.add_call_site(call_site.opcode, call_site.srcloc); + next_call_site += 1; + } + } + sink.put1(*byte); + } + } +} + +impl MachSectionOutput for MachSection { + fn cur_offset_from_start(&self) -> CodeOffset { + self.start_offset + self.data.len() as CodeOffset + } + + fn start_offset(&self) -> CodeOffset { + self.start_offset + } + + fn put1(&mut self, value: u8) { + assert!(((self.data.len() + 1) as CodeOffset) <= self.length_limit); + self.data.push(value); + } + + fn put_data(&mut self, data: &[u8]) { + assert!(((self.data.len() + data.len()) as CodeOffset) <= self.length_limit); + self.data.extend_from_slice(data); + } + + fn add_reloc(&mut self, srcloc: SourceLoc, kind: Reloc, name: &ExternalName, addend: Addend) { + let name = name.clone(); + self.relocs.push(MachReloc { + offset: self.data.len() as CodeOffset, + srcloc, + kind, + name, + addend, + }); + } + + fn add_trap(&mut self, srcloc: SourceLoc, code: TrapCode) { + self.traps.push(MachTrap { + offset: self.data.len() as CodeOffset, + srcloc, + code, + }); + } + + fn add_call_site(&mut self, srcloc: SourceLoc, opcode: Opcode) { + self.call_sites.push(MachCallSite { + ret_addr: self.data.len() as CodeOffset, + srcloc, + opcode, + }); + } +} + +/// A MachSectionOutput implementation that records only size. +pub struct MachSectionSize { + /// The starting offset of this section. + pub start_offset: CodeOffset, + /// The current offset of this section. + pub offset: CodeOffset, +} + +impl MachSectionSize { + /// Create a new size-counting dummy section. + pub fn new(start_offset: CodeOffset) -> MachSectionSize { + MachSectionSize { + start_offset, + offset: start_offset, + } + } + + /// Return the size this section would take if emitted with a real sink. 
+ pub fn size(&self) -> CodeOffset { + self.offset - self.start_offset + } +} + +impl MachSectionOutput for MachSectionSize { + fn cur_offset_from_start(&self) -> CodeOffset { + // All size-counting sections conceptually start at offset 0; this doesn't + // matter when counting code size. + self.offset + } + + fn start_offset(&self) -> CodeOffset { + self.start_offset + } + + fn put1(&mut self, _: u8) { + self.offset += 1; + } + + fn put_data(&mut self, data: &[u8]) { + self.offset += data.len() as CodeOffset; + } + + fn add_reloc(&mut self, _: SourceLoc, _: Reloc, _: &ExternalName, _: Addend) {} + + fn add_trap(&mut self, _: SourceLoc, _: TrapCode) {} + + fn add_call_site(&mut self, _: SourceLoc, _: Opcode) {} +} + +/// A relocation resulting from a compilation. +pub struct MachReloc { + /// The offset at which the relocation applies, *relative to the + /// containing section*. + pub offset: CodeOffset, + /// The original source location. + pub srcloc: SourceLoc, + /// The kind of relocation. + pub kind: Reloc, + /// The external symbol / name to which this relocation refers. + pub name: ExternalName, + /// The addend to add to the symbol value. + pub addend: i64, +} + +/// A trap record resulting from a compilation. +pub struct MachTrap { + /// The offset at which the trap instruction occurs, *relative to the + /// containing section*. + pub offset: CodeOffset, + /// The original source location. + pub srcloc: SourceLoc, + /// The trap code. + pub code: TrapCode, +} + +/// A call site record resulting from a compilation. +pub struct MachCallSite { + /// The offset of the call's return address, *relative to the containing section*. + pub ret_addr: CodeOffset, + /// The original source location. + pub srcloc: SourceLoc, + /// The call's opcode. + pub opcode: Opcode, +} diff --git a/cranelift/codegen/src/machinst/vcode.rs b/cranelift/codegen/src/machinst/vcode.rs new file mode 100644 index 000000000000..6e3adea53aec --- /dev/null +++ b/cranelift/codegen/src/machinst/vcode.rs @@ -0,0 +1,730 @@ +//! This implements the VCode container: a CFG of Insts that have been lowered. +//! +//! VCode is virtual-register code. An instruction in VCode is almost a machine +//! instruction; however, its register slots can refer to virtual registers in +//! addition to real machine registers. +//! +//! VCode is structured with traditional basic blocks, and +//! each block must be terminated by an unconditional branch (one target), a +//! conditional branch (two targets), or a return (no targets). Note that this +//! slightly differs from the machine code of most ISAs: in most ISAs, a +//! conditional branch has one target (and the not-taken case falls through). +//! However, we expect that machine backends will elide branches to the following +//! block (i.e., zero-offset jumps), and will be able to codegen a branch-cond / +//! branch-uncond pair if *both* targets are not fallthrough. This allows us to +//! play with layout prior to final binary emission, as well, if we want. +//! +//! See the main module comment in `mod.rs` for more details on the VCode-based +//! backend pipeline. + +use crate::ir; +use crate::machinst::*; +use crate::settings; + +use regalloc::Function as RegallocFunction; +use regalloc::Set as RegallocSet; +use regalloc::{BlockIx, InstIx, Range, RegAllocResult, RegClass, RegUsageCollector}; + +use alloc::boxed::Box; +use alloc::vec::Vec; +use log::debug; +use smallvec::SmallVec; +use std::fmt; +use std::iter; +use std::string::String; + +/// Index referring to an instruction in VCode. 
+pub type InsnIndex = u32;
+/// Index referring to a basic block in VCode.
+pub type BlockIndex = u32;
+
+/// VCodeInst wraps all requirements for a MachInst to be in VCode: it must be
+/// a `MachInst` and it must be able to emit itself at least to a `SizeCodeSink`.
+pub trait VCodeInst: MachInst + MachInstEmit<MachSection> + MachInstEmit<MachSectionSize> {}
+impl<I: MachInst + MachInstEmit<MachSection> + MachInstEmit<MachSectionSize>> VCodeInst for I {}
+
+/// A function in "VCode" (virtualized-register code) form, after lowering.
+/// This is essentially a standard CFG of basic blocks, where each basic block
+/// consists of lowered instructions produced by the machine-specific backend.
+pub struct VCode<I: VCodeInst> {
+ /// Function liveins.
+ liveins: RegallocSet<RealReg>,
+
+ /// Function liveouts.
+ liveouts: RegallocSet<RealReg>,
+
+ /// VReg IR-level types.
+ vreg_types: Vec<Type>,
+
+ /// Lowered machine instructions in order corresponding to the original IR.
+ insts: Vec<I>,
+
+ /// Entry block.
+ entry: BlockIndex,
+
+ /// Block instruction indices.
+ block_ranges: Vec<(InsnIndex, InsnIndex)>,
+
+ /// Block successors: index range in the successor-list below.
+ block_succ_range: Vec<(usize, usize)>,
+
+ /// Block successor lists, concatenated into one Vec. The `block_succ_range`
+ /// list of tuples above gives (start, end) ranges within this list that
+ /// correspond to each basic block's successors.
+ block_succs: Vec<BlockIndex>,
+
+ /// Block indices by IR block.
+ block_by_bb: SecondaryMap<ir::Block, BlockIndex>,
+
+ /// IR block for each VCode Block. The length of this Vec will likely be
+ /// less than the total number of Blocks, because new Blocks (for edge
+ /// splits, for example) are appended during lowering.
+ bb_by_block: Vec<ir::Block>,
+
+ /// Order of block IDs in final generated code.
+ final_block_order: Vec<BlockIndex>,
+
+ /// Final block offsets. Computed during branch finalization and used
+ /// during emission.
+ final_block_offsets: Vec<CodeOffset>,
+
+ /// Size of code, accounting for block layout / alignment.
+ code_size: CodeOffset,
+
+ /// ABI object.
+ abi: Box<dyn ABIBody<I>>,
+}
+
+/// A builder for a VCode function body. This builder is designed for the
+/// lowering approach that we take: we traverse basic blocks in forward
+/// (original IR) order, but within each basic block, we generate code from
+/// bottom to top; and within each IR instruction that we visit in this reverse
+/// order, we emit machine instructions in *forward* order again.
+///
+/// Hence, to produce the final instructions in proper order, we perform two
+/// swaps. First, the machine instructions (`I` instances) are produced in
+/// forward order for an individual IR instruction. Then these are *reversed*
+/// and concatenated to `bb_insns` at the end of the IR instruction lowering.
+/// The `bb_insns` vec will thus contain all machine instructions for a basic
+/// block, in reverse order. Finally, when we're done with a basic block, we
+/// reverse the whole block's vec of instructions again, and concatenate onto
+/// the VCode's insts.
+pub struct VCodeBuilder<I: VCodeInst> {
+ /// In-progress VCode.
+ vcode: VCode<I>,
+
+ /// Current basic block instructions, in reverse order (because blocks are
+ /// built bottom-to-top).
+ bb_insns: SmallVec<[I; 32]>,
+
+ /// Current IR-inst instructions, in forward order.
+ ir_inst_insns: SmallVec<[I; 4]>,
+
+ /// Start of succs for the current block in the concatenated succs list.
+ succ_start: usize,
+}
+
+impl<I: VCodeInst> VCodeBuilder<I> {
+ /// Create a new VCodeBuilder.
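
The double reversal described in the `VCodeBuilder` comment is easiest to see with strings standing in for machine instructions (a sketch of the bookkeeping only, not the real builder):

```rust
/// Collect instructions the way VCodeBuilder does: per-IR-inst groups arrive
/// in forward order while the block is walked bottom-to-top, and the block is
/// flipped once more when it ends.
struct ToyBuilder {
    block: Vec<&'static str>,   // reverse order while the block is open
    ir_inst: Vec<&'static str>, // forward order for the current IR inst
    out: Vec<&'static str>,     // final forward order
}

impl ToyBuilder {
    fn push(&mut self, insn: &'static str) {
        self.ir_inst.push(insn);
    }
    fn end_ir_inst(&mut self) {
        while let Some(i) = self.ir_inst.pop() {
            self.block.push(i);
        }
    }
    fn end_bb(&mut self) {
        while let Some(i) = self.block.pop() {
            self.out.push(i);
        }
    }
}

fn main() {
    let mut b = ToyBuilder { block: vec![], ir_inst: vec![], out: vec![] };
    // Lower the block bottom-up: the `ret` is visited first, the `add` last.
    b.push("ret");
    b.end_ir_inst();
    b.push("add x0, x0, x1");
    b.end_ir_inst();
    b.end_bb();
    // The output is in forward machine-code order again.
    assert_eq!(b.out, vec!["add x0, x0, x1", "ret"]);
    println!("{:?}", b.out);
}
```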
+ pub fn new(abi: Box>) -> VCodeBuilder { + let vcode = VCode::new(abi); + VCodeBuilder { + vcode, + bb_insns: SmallVec::new(), + ir_inst_insns: SmallVec::new(), + succ_start: 0, + } + } + + /// Access the ABI object. + pub fn abi(&mut self) -> &mut dyn ABIBody { + &mut *self.vcode.abi + } + + /// Set the type of a VReg. + pub fn set_vreg_type(&mut self, vreg: VirtualReg, ty: Type) { + while self.vcode.vreg_types.len() <= vreg.get_index() { + self.vcode.vreg_types.push(ir::types::I8); // Default type. + } + self.vcode.vreg_types[vreg.get_index()] = ty; + } + + /// Return the underlying bb-to-BlockIndex map. + pub fn blocks_by_bb(&self) -> &SecondaryMap { + &self.vcode.block_by_bb + } + + /// Initialize the bb-to-BlockIndex map. Returns the first free + /// BlockIndex. + pub fn init_bb_map(&mut self, blocks: &[ir::Block]) -> BlockIndex { + let mut bindex: BlockIndex = 0; + for bb in blocks.iter() { + self.vcode.block_by_bb[*bb] = bindex; + self.vcode.bb_by_block.push(*bb); + bindex += 1; + } + bindex + } + + /// Get the BlockIndex for an IR block. + pub fn bb_to_bindex(&self, bb: ir::Block) -> BlockIndex { + self.vcode.block_by_bb[bb] + } + + /// Set the current block as the entry block. + pub fn set_entry(&mut self, block: BlockIndex) { + self.vcode.entry = block; + } + + /// End the current IR instruction. Must be called after pushing any + /// instructions and prior to ending the basic block. + pub fn end_ir_inst(&mut self) { + while let Some(i) = self.ir_inst_insns.pop() { + self.bb_insns.push(i); + } + } + + /// End the current basic block. Must be called after emitting vcode insts + /// for IR insts and prior to ending the function (building the VCode). + pub fn end_bb(&mut self) -> BlockIndex { + assert!(self.ir_inst_insns.is_empty()); + let block_num = self.vcode.block_ranges.len() as BlockIndex; + // Push the instructions. + let start_idx = self.vcode.insts.len() as InsnIndex; + while let Some(i) = self.bb_insns.pop() { + self.vcode.insts.push(i); + } + let end_idx = self.vcode.insts.len() as InsnIndex; + // Add the instruction index range to the list of blocks. + self.vcode.block_ranges.push((start_idx, end_idx)); + // End the successors list. + let succ_end = self.vcode.block_succs.len(); + self.vcode + .block_succ_range + .push((self.succ_start, succ_end)); + self.succ_start = succ_end; + + block_num + } + + /// Push an instruction for the current BB and current IR inst within the BB. + pub fn push(&mut self, insn: I) { + match insn.is_term() { + MachTerminator::None | MachTerminator::Ret => {} + MachTerminator::Uncond(target) => { + self.vcode.block_succs.push(target); + } + MachTerminator::Cond(true_branch, false_branch) => { + self.vcode.block_succs.push(true_branch); + self.vcode.block_succs.push(false_branch); + } + MachTerminator::Indirect(targets) => { + for target in targets { + self.vcode.block_succs.push(*target); + } + } + } + self.ir_inst_insns.push(insn); + } + + /// Build the final VCode. 
+ pub fn build(self) -> VCode { + assert!(self.ir_inst_insns.is_empty()); + assert!(self.bb_insns.is_empty()); + self.vcode + } +} + +fn block_ranges(indices: &[InstIx], len: usize) -> Vec<(usize, usize)> { + let v = indices + .iter() + .map(|iix| iix.get() as usize) + .chain(iter::once(len)) + .collect::>(); + v.windows(2).map(|p| (p[0], p[1])).collect() +} + +fn is_redundant_move(insn: &I) -> bool { + if let Some((to, from)) = insn.is_move() { + to.to_reg() == from + } else { + false + } +} + +fn is_trivial_jump_block(vcode: &VCode, block: BlockIndex) -> Option { + let range = vcode.block_insns(BlockIx::new(block)); + + debug!( + "is_trivial_jump_block: block {} has len {}", + block, + range.len() + ); + + if range.len() != 1 { + return None; + } + let insn = range.first(); + + debug!( + " -> only insn is: {:?} with terminator {:?}", + vcode.get_insn(insn), + vcode.get_insn(insn).is_term() + ); + + match vcode.get_insn(insn).is_term() { + MachTerminator::Uncond(target) => Some(target), + _ => None, + } +} + +impl VCode { + /// New empty VCode. + fn new(abi: Box>) -> VCode { + VCode { + liveins: abi.liveins(), + liveouts: abi.liveouts(), + vreg_types: vec![], + insts: vec![], + entry: 0, + block_ranges: vec![], + block_succ_range: vec![], + block_succs: vec![], + block_by_bb: SecondaryMap::with_default(0), + bb_by_block: vec![], + final_block_order: vec![], + final_block_offsets: vec![], + code_size: 0, + abi, + } + } + + /// Get the IR-level type of a VReg. + pub fn vreg_type(&self, vreg: VirtualReg) -> Type { + self.vreg_types[vreg.get_index()] + } + + /// Get the entry block. + pub fn entry(&self) -> BlockIndex { + self.entry + } + + /// Get the number of blocks. Block indices will be in the range `0 .. + /// (self.num_blocks() - 1)`. + pub fn num_blocks(&self) -> usize { + self.block_ranges.len() + } + + /// Stack frame size for the full function's body. + pub fn frame_size(&self) -> u32 { + self.abi.frame_size() + } + + /// Get the successors for a block. + pub fn succs(&self, block: BlockIndex) -> &[BlockIndex] { + let (start, end) = self.block_succ_range[block as usize]; + &self.block_succs[start..end] + } + + /// Take the results of register allocation, with a sequence of + /// instructions including spliced fill/reload/move instructions, and replace + /// the VCode with them. + pub fn replace_insns_from_regalloc( + &mut self, + result: RegAllocResult, + flags: &settings::Flags, + ) { + self.final_block_order = compute_final_block_order(self); + + // Record the spillslot count and clobbered registers for the ABI/stack + // setup code. + self.abi.set_num_spillslots(result.num_spill_slots as usize); + self.abi + .set_clobbered(result.clobbered_registers.map(|r| Writable::from_reg(*r))); + + // We want to move instructions over in final block order, using the new + // block-start map given by the regalloc. + let block_ranges: Vec<(usize, usize)> = + block_ranges(result.target_map.elems(), result.insns.len()); + let mut final_insns = vec![]; + let mut final_block_ranges = vec![(0, 0); self.num_blocks()]; + + for block in &self.final_block_order { + let (start, end) = block_ranges[*block as usize]; + let final_start = final_insns.len() as InsnIndex; + + if *block == self.entry { + // Start with the prologue. + final_insns.extend(self.abi.gen_prologue(flags).into_iter()); + } + + for i in start..end { + let insn = &result.insns[i]; + + // Elide redundant moves at this point (we only know what is + // redundant once registers are allocated). 
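
The `block_ranges` helper turns the regalloc's per-block start indices into half-open `(start, end)` ranges by appending the total instruction count and pairing adjacent entries. The same computation on plain integers:

```rust
/// Turn block start indices plus a total instruction count into
/// half-open (start, end) ranges, one per block.
fn block_ranges(starts: &[usize], len: usize) -> Vec<(usize, usize)> {
    let v: Vec<usize> = starts.iter().copied().chain(std::iter::once(len)).collect();
    v.windows(2).map(|p| (p[0], p[1])).collect()
}

fn main() {
    // Three blocks starting at instructions 0, 4 and 9, out of 12 total.
    assert_eq!(block_ranges(&[0, 4, 9], 12), vec![(0, 4), (4, 9), (9, 12)]);
    println!("{:?}", block_ranges(&[0, 4, 9], 12));
}
```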
+ if is_redundant_move(insn) { + continue; + } + + // Whenever encountering a return instruction, replace it + // with the epilogue. + let is_ret = insn.is_term() == MachTerminator::Ret; + if is_ret { + final_insns.extend(self.abi.gen_epilogue(flags).into_iter()); + } else { + final_insns.push(insn.clone()); + } + } + + let final_end = final_insns.len() as InsnIndex; + final_block_ranges[*block as usize] = (final_start, final_end); + } + + self.insts = final_insns; + self.block_ranges = final_block_ranges; + } + + /// Removes redundant branches, rewriting targets to point directly to the + /// ultimate block at the end of a chain of trivial one-target jumps. + pub fn remove_redundant_branches(&mut self) { + // For each block, compute the actual target block, looking through up to one + // block with single-target jumps (this will remove empty edge blocks inserted + // by phi-lowering). + let block_rewrites: Vec = (0..self.num_blocks() as u32) + .map(|bix| is_trivial_jump_block(self, bix).unwrap_or(bix)) + .collect(); + let mut refcounts: Vec = vec![0; self.num_blocks()]; + + debug!( + "remove_redundant_branches: block_rewrites = {:?}", + block_rewrites + ); + + refcounts[self.entry as usize] = 1; + + for block in 0..self.num_blocks() as u32 { + for insn in self.block_insns(BlockIx::new(block)) { + self.get_insn_mut(insn) + .with_block_rewrites(&block_rewrites[..]); + match self.get_insn(insn).is_term() { + MachTerminator::Uncond(bix) => { + refcounts[bix as usize] += 1; + } + MachTerminator::Cond(bix1, bix2) => { + refcounts[bix1 as usize] += 1; + refcounts[bix2 as usize] += 1; + } + MachTerminator::Indirect(blocks) => { + for block in blocks { + refcounts[*block as usize] += 1; + } + } + _ => {} + } + } + } + + let deleted: Vec = refcounts.iter().map(|r| *r == 0).collect(); + + let block_order = std::mem::replace(&mut self.final_block_order, vec![]); + self.final_block_order = block_order + .into_iter() + .filter(|b| !deleted[*b as usize]) + .collect(); + + // Rewrite successor information based on the block-rewrite map. + for succ in &mut self.block_succs { + let new_succ = block_rewrites[*succ as usize]; + *succ = new_succ; + } + } + + /// Mutate branch instructions to (i) lower two-way condbrs to one-way, + /// depending on fallthrough; and (ii) use concrete offsets. + pub fn finalize_branches(&mut self) + where + I: MachInstEmit, + { + // Compute fallthrough block, indexed by block. + let num_final_blocks = self.final_block_order.len(); + let mut block_fallthrough: Vec> = vec![None; self.num_blocks()]; + for i in 0..(num_final_blocks - 1) { + let from = self.final_block_order[i]; + let to = self.final_block_order[i + 1]; + block_fallthrough[from as usize] = Some(to); + } + + // Pass over VCode instructions and finalize two-way branches into + // one-way branches with fallthrough. + for block in 0..self.num_blocks() { + let next_block = block_fallthrough[block]; + let (start, end) = self.block_ranges[block]; + + for iix in start..end { + let insn = &mut self.insts[iix as usize]; + insn.with_fallthrough_block(next_block); + } + } + + // Compute block offsets. 
+ let mut code_section = MachSectionSize::new(0); + let mut block_offsets = vec![0; self.num_blocks()]; + for &block in &self.final_block_order { + code_section.offset = I::align_basic_block(code_section.offset); + block_offsets[block as usize] = code_section.offset; + let (start, end) = self.block_ranges[block as usize]; + for iix in start..end { + self.insts[iix as usize].emit(&mut code_section); + } + } + + // We now have the section layout. + self.final_block_offsets = block_offsets; + self.code_size = code_section.size(); + + // Update branches with known block offsets. This looks like the + // traversal above, but (i) does not update block_offsets, rather uses + // it (so forward references are now possible), and (ii) mutates the + // instructions. + let mut code_section = MachSectionSize::new(0); + for &block in &self.final_block_order { + code_section.offset = I::align_basic_block(code_section.offset); + let (start, end) = self.block_ranges[block as usize]; + for iix in start..end { + self.insts[iix as usize] + .with_block_offsets(code_section.offset, &self.final_block_offsets[..]); + self.insts[iix as usize].emit(&mut code_section); + } + } + } + + /// Emit the instructions to a list of sections. + pub fn emit(&self) -> MachSections + where + I: MachInstEmit, + { + let mut sections = MachSections::new(); + let code_idx = sections.add_section(0, self.code_size); + let code_section = sections.get_section(code_idx); + + for &block in &self.final_block_order { + let new_offset = I::align_basic_block(code_section.cur_offset_from_start()); + while new_offset > code_section.cur_offset_from_start() { + // Pad with NOPs up to the aligned block offset. + let nop = I::gen_nop((new_offset - code_section.cur_offset_from_start()) as usize); + nop.emit(code_section); + } + assert_eq!(code_section.cur_offset_from_start(), new_offset); + + let (start, end) = self.block_ranges[block as usize]; + for iix in start..end { + self.insts[iix as usize].emit(code_section); + } + } + + sections + } + + /// Get the IR block for a BlockIndex, if one exists. + pub fn bindex_to_bb(&self, block: BlockIndex) -> Option { + if (block as usize) < self.bb_by_block.len() { + Some(self.bb_by_block[block as usize]) + } else { + None + } + } +} + +impl RegallocFunction for VCode { + type Inst = I; + + fn insns(&self) -> &[I] { + &self.insts[..] + } + + fn insns_mut(&mut self) -> &mut [I] { + &mut self.insts[..] 
+ } + + fn get_insn(&self, insn: InstIx) -> &I { + &self.insts[insn.get() as usize] + } + + fn get_insn_mut(&mut self, insn: InstIx) -> &mut I { + &mut self.insts[insn.get() as usize] + } + + fn blocks(&self) -> Range { + Range::new(BlockIx::new(0), self.block_ranges.len()) + } + + fn entry_block(&self) -> BlockIx { + BlockIx::new(self.entry) + } + + fn block_insns(&self, block: BlockIx) -> Range { + let (start, end) = self.block_ranges[block.get() as usize]; + Range::new(InstIx::new(start), (end - start) as usize) + } + + fn block_succs(&self, block: BlockIx) -> Vec { + let (start, end) = self.block_succ_range[block.get() as usize]; + self.block_succs[start..end] + .iter() + .cloned() + .map(BlockIx::new) + .collect() + } + + fn is_ret(&self, insn: InstIx) -> bool { + match self.insts[insn.get() as usize].is_term() { + MachTerminator::Ret => true, + _ => false, + } + } + + fn get_regs(insn: &I, collector: &mut RegUsageCollector) { + insn.get_regs(collector) + } + + fn map_regs( + insn: &mut I, + pre_map: &RegallocMap, + post_map: &RegallocMap, + ) { + insn.map_regs(pre_map, post_map); + } + + fn is_move(&self, insn: &I) -> Option<(Writable, Reg)> { + insn.is_move() + } + + fn get_spillslot_size(&self, regclass: RegClass, vreg: VirtualReg) -> u32 { + let ty = self.vreg_type(vreg); + self.abi.get_spillslot_size(regclass, ty) + } + + fn gen_spill(&self, to_slot: SpillSlot, from_reg: RealReg, vreg: VirtualReg) -> I { + let ty = self.vreg_type(vreg); + self.abi.gen_spill(to_slot, from_reg, ty) + } + + fn gen_reload(&self, to_reg: Writable, from_slot: SpillSlot, vreg: VirtualReg) -> I { + let ty = self.vreg_type(vreg); + self.abi.gen_reload(to_reg, from_slot, ty) + } + + fn gen_move(&self, to_reg: Writable, from_reg: RealReg, vreg: VirtualReg) -> I { + let ty = self.vreg_type(vreg); + I::gen_move(to_reg.map(|r| r.to_reg()), from_reg.to_reg(), ty) + } + + fn gen_zero_len_nop(&self) -> I { + I::gen_zero_len_nop() + } + + fn maybe_direct_reload(&self, insn: &I, reg: VirtualReg, slot: SpillSlot) -> Option { + insn.maybe_direct_reload(reg, slot) + } + + fn func_liveins(&self) -> RegallocSet { + self.liveins.clone() + } + + fn func_liveouts(&self) -> RegallocSet { + self.liveouts.clone() + } +} + +impl fmt::Debug for VCode { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + writeln!(f, "VCode_Debug {{")?; + writeln!(f, " Entry block: {}", self.entry)?; + writeln!(f, " Final block order: {:?}", self.final_block_order)?; + + for block in 0..self.num_blocks() { + writeln!(f, "Block {}:", block,)?; + for succ in self.succs(block as BlockIndex) { + writeln!(f, " (successor: Block {})", succ)?; + } + let (start, end) = self.block_ranges[block]; + writeln!(f, " (instruction range: {} .. {})", start, end)?; + for inst in start..end { + writeln!(f, " Inst {}: {:?}", inst, self.insts[inst as usize])?; + } + } + + writeln!(f, "}}")?; + Ok(()) + } +} + +/// Pretty-printing with `RealRegUniverse` context. +impl ShowWithRRU for VCode { + fn show_rru(&self, mb_rru: Option<&RealRegUniverse>) -> String { + use std::fmt::Write; + + // Calculate an order in which to display the blocks. This is the same + // as final_block_order, but also includes blocks which are in the + // representation but not in final_block_order. + let mut display_order = Vec::::new(); + // First display blocks in `final_block_order` + for bix in &self.final_block_order { + assert!((*bix as usize) < self.num_blocks()); + display_order.push(*bix as usize); + } + // Now also take care of those not listed in `final_block_order`. 
+ // This is quadratic, but it's also debug-only code. + for bix in 0..self.num_blocks() { + if display_order.contains(&bix) { + continue; + } + display_order.push(bix); + } + + let mut s = String::new(); + write!(&mut s, "VCode_ShowWithRRU {{{{\n").unwrap(); + write!(&mut s, " Entry block: {}\n", self.entry).unwrap(); + write!( + &mut s, + " Final block order: {:?}\n", + self.final_block_order + ) + .unwrap(); + + for i in 0..self.num_blocks() { + let block = display_order[i]; + + let omitted = if !self.final_block_order.is_empty() && i >= self.final_block_order.len() + { + "** OMITTED **" + } else { + "" + }; + + write!(&mut s, "Block {}: {}\n", block, omitted).unwrap(); + if let Some(bb) = self.bindex_to_bb(block as BlockIndex) { + write!(&mut s, " (original IR block: {})\n", bb).unwrap(); + } + for succ in self.succs(block as BlockIndex) { + write!(&mut s, " (successor: Block {})\n", succ).unwrap(); + } + let (start, end) = self.block_ranges[block]; + write!(&mut s, " (instruction range: {} .. {})\n", start, end).unwrap(); + for inst in start..end { + write!( + &mut s, + " Inst {}: {}\n", + inst, + self.insts[inst as usize].show_rru(mb_rru) + ) + .unwrap(); + } + } + + write!(&mut s, "}}}}\n").unwrap(); + + s + } +} diff --git a/cranelift/codegen/src/num_uses.rs b/cranelift/codegen/src/num_uses.rs new file mode 100644 index 000000000000..fd6eee8ec152 --- /dev/null +++ b/cranelift/codegen/src/num_uses.rs @@ -0,0 +1,52 @@ +//! A pass that computes the number of uses of any given instruction. + +use crate::entity::SecondaryMap; +use crate::ir::dfg::ValueDef; +use crate::ir::Value; +use crate::ir::{DataFlowGraph, Function, Inst}; + +/// Auxiliary data structure that counts the number of uses of any given +/// instruction in a Function. This is used during instruction selection +/// to essentially do incremental DCE: when an instruction is no longer +/// needed because its computation has been isel'd into another machine +/// instruction at every use site, we can skip it. +#[derive(Clone, Debug)] +pub struct NumUses { + uses: SecondaryMap, +} + +impl NumUses { + fn new() -> NumUses { + NumUses { + uses: SecondaryMap::with_default(0), + } + } + + /// Compute the NumUses analysis result for a function. + pub fn compute(func: &Function) -> NumUses { + let mut uses = NumUses::new(); + for bb in func.layout.blocks() { + for inst in func.layout.block_insts(bb) { + for arg in func.dfg.inst_args(inst) { + let v = func.dfg.resolve_aliases(*arg); + uses.add_value(&func.dfg, v); + } + } + } + uses + } + + fn add_value(&mut self, dfg: &DataFlowGraph, v: Value) { + match dfg.value_def(v) { + ValueDef::Result(inst, _) => { + self.uses[inst] += 1; + } + _ => {} + } + } + + /// Take the complete uses map, consuming this analysis result. 
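
`NumUses` is a straightforward counting pass: walk every instruction's arguments and credit one use to each producing instruction, so lowering can later skip pure instructions whose count has dropped to zero. The idea on a toy IR (plain indices instead of real `Inst` entities):

```rust
use std::collections::HashMap;

/// Count, for each producing instruction, how many times its result is used.
/// Toy IR: `args_of[i]` lists, for instruction i, the producers of its args.
fn count_uses(args_of: &[Vec<usize>]) -> HashMap<usize, u32> {
    let mut uses = HashMap::new();
    for args in args_of {
        for &producer in args {
            *uses.entry(producer).or_insert(0) += 1;
        }
    }
    uses
}

fn main() {
    // inst0: iconst; inst1: iconst; inst2: iadd(inst0, inst1); inst3: store(inst2)
    let args_of = vec![vec![], vec![], vec![0, 1], vec![2]];
    let uses = count_uses(&args_of);
    // Anything with zero uses and no side effect could be skipped during lowering.
    assert_eq!(uses.get(&2), Some(&1));
    println!("{:?}", uses);
}
```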
+ pub fn take_uses(self) -> SecondaryMap { + self.uses + } +} diff --git a/cranelift/codegen/src/postopt.rs b/cranelift/codegen/src/postopt.rs index 42121817d5f7..9e2179982d3f 100644 --- a/cranelift/codegen/src/postopt.rs +++ b/cranelift/codegen/src/postopt.rs @@ -360,10 +360,11 @@ fn optimize_complex_addresses(pos: &mut EncCursor, inst: Inst, isa: &dyn TargetI pub fn do_postopt(func: &mut Function, isa: &dyn TargetIsa) { let _tt = timing::postopt(); let mut pos = EncCursor::new(func, isa); + let is_mach_backend = isa.get_mach_backend().is_some(); while let Some(_block) = pos.next_block() { let mut last_flags_clobber = None; while let Some(inst) = pos.next_inst() { - if isa.uses_cpu_flags() { + if !is_mach_backend && isa.uses_cpu_flags() { // Optimize instructions to make use of flags. optimize_cpu_flags(&mut pos, inst, last_flags_clobber, isa); diff --git a/cranelift/codegen/src/verifier/flags.rs b/cranelift/codegen/src/verifier/flags.rs index 1a20303d20c3..e4cfc8046220 100644 --- a/cranelift/codegen/src/verifier/flags.rs +++ b/cranelift/codegen/src/verifier/flags.rs @@ -28,10 +28,15 @@ pub fn verify_flags( errors: &mut VerifierErrors, ) -> VerifierStepResult<()> { let _tt = timing::verify_flags(); + let encinfo = if isa.is_none() || isa.unwrap().get_mach_backend().is_some() { + None + } else { + Some(isa.unwrap().encoding_info()) + }; let mut verifier = FlagsVerifier { func, cfg, - encinfo: isa.map(|isa| isa.encoding_info()), + encinfo, livein: SecondaryMap::new(), }; verifier.check(errors) diff --git a/cranelift/filetests/Cargo.toml b/cranelift/filetests/Cargo.toml index 35b46b87c85d..3bf3090507ae 100644 --- a/cranelift/filetests/Cargo.toml +++ b/cranelift/filetests/Cargo.toml @@ -22,3 +22,4 @@ memmap = "0.7.0" num_cpus = "1.8.0" region = "2.1.2" byteorder = { version = "1.3.2", default-features = false } +target-lexicon = "0.10" diff --git a/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif b/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif new file mode 100644 index 000000000000..1f6dcf6b8206 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/arithmetic.clif @@ -0,0 +1,243 @@ +test vcode +target aarch64 + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = iadd.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = isub.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = imul.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: madd x0, x0, x1, xzr +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = umulhi.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: umulh x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = smulhi.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: smulh x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = sdiv.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sdiv x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 2 + v2 = sdiv.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x1, #2 +; nextln: sdiv x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = udiv.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: udiv x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 2 + v2 = udiv.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x1, #2 +; nextln: udiv x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = srem.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sdiv x2, x0, x1 +; nextln: msub x0, x2, x1, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = urem.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: udiv x2, x0, x1 +; nextln: msub x0, x2, x1, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = band.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: and x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = bor.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: orr x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = bxor.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: eor x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = band_not.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: bic x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = bor_not.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: orn x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = bxor_not.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: eon x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = bnot.i64 v0 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: orn x0, xzr, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/basic1.clif b/cranelift/filetests/filetests/vcode/aarch64/basic1.clif new file mode 100644 index 000000000000..b5ec1ae16075 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/basic1.clif @@ -0,0 +1,14 @@ +test vcode +target aarch64 + +function %f(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + ; check: stp fp, lr, [sp, #-16]! + ; check: mov fp, sp + v2 = iadd v0, v1 + ; check: add w0, w0, w1 + return v2 + ; check: mov sp, fp + ; check: ldp fp, lr, [sp], #16 + ; check: ret +} diff --git a/cranelift/filetests/filetests/vcode/aarch64/bitops.clif b/cranelift/filetests/filetests/vcode/aarch64/bitops.clif new file mode 100644 index 000000000000..8f5e81d32241 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/bitops.clif @@ -0,0 +1,158 @@ +test vcode +target aarch64 + +function %a(i32) -> i32 { +block0(v0: i32): + v1 = bitrev v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: rbit w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %a(i64) -> i64 { +block0(v0: i64): + v1 = bitrev v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: rbit x0, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %b(i32) -> i32 { +block0(v0: i32): + v1 = clz v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: clz w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %b(i64) -> i64 { +block0(v0: i64): + v1 = clz v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: clz x0, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %c(i32) -> i32 { +block0(v0: i32): + v1 = cls v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: cls w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %c(i64) -> i64 { +block0(v0: i64): + v1 = cls v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: cls x0, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %d(i32) -> i32 { +block0(v0: i32): + v1 = ctz v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: rbit w0, w0 +; nextln: clz w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %d(i64) -> i64 { +block0(v0: i64): + v1 = ctz v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: rbit x0, x0 +; nextln: clz x0, x0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %d(i64) -> i64 { +block0(v0: i64): + v1 = popcnt v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: lsr x1, x0, #1 +; nextln: and x1, x1, #6148914691236517205 +; nextln: sub x1, x0, x1 +; nextln: and x0, x1, #3689348814741910323 +; nextln: lsr x1, x1, #2 +; nextln: and x1, x1, #3689348814741910323 +; nextln: add x0, x1, x0 +; nextln: add x0, x0, x0, LSR 4 +; nextln: and x0, x0, #1085102592571150095 +; nextln: add x0, x0, x0, LSL 8 +; nextln: add x0, x0, x0, LSL 16 +; nextln: add x0, x0, x0, LSL 32 +; nextln: lsr x0, x0, #56 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %d(i32) -> i32 { +block0(v0: i32): + v1 = popcnt v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsr w1, w0, #1 +; nextln: and x1, x1, #6148914691236517205 +; nextln: sub x1, x0, x1 +; nextln: and x0, x1, #3689348814741910323 +; nextln: lsr x1, x1, #2 +; nextln: and x1, x1, #3689348814741910323 +; nextln: add x0, x1, x0 +; nextln: add x0, x0, x0, LSR 4 +; nextln: and x0, x0, #1085102592571150095 +; nextln: add x0, x0, x0, LSL 8 +; nextln: add x0, x0, x0, LSL 16 +; nextln: add x0, x0, x0, LSL 32 +; nextln: lsr x0, x0, #56 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/call-indirect.clif b/cranelift/filetests/filetests/vcode/aarch64/call-indirect.clif new file mode 100644 index 000000000000..c5e8ea059667 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/call-indirect.clif @@ -0,0 +1,16 @@ +test vcode +target aarch64 + +function %f(i64, i64) -> i64 { + sig0 = (i64) -> i64 +block0(v0: i64, v1: i64): + v2 = call_indirect.i64 sig0, v1(v0) + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: blr x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/call.clif b/cranelift/filetests/filetests/vcode/aarch64/call.clif new file mode 100644 index 000000000000..1429dceed6f0 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/call.clif @@ -0,0 +1,17 @@ +test vcode +target aarch64 + +function %f(i64) -> i64 { + fn0 = %g(i64) -> i64 + +block0(v0: i64): + v1 = call fn0(v0) + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: bl 0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/condbr.clif b/cranelift/filetests/filetests/vcode/aarch64/condbr.clif new file mode 100644 index 000000000000..596557d8e07f --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/condbr.clif @@ -0,0 +1,66 @@ +test vcode +target aarch64 + +function %f(i64, i64) -> b1 { +block0(v0: i64, v1: i64): + v2 = icmp eq v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: subs xzr, x0, x1 +; nextln: cset x0, eq +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = ifcmp v0, v1 + brif eq v2, block1 + jump block2 + +block1: + v4 = iconst.i64 1 + return v4 + +block2: + v5 = iconst.i64 2 + return v5 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: subs xzr, x0, x1 +; nextln: b.eq 20 +; check: Block 2: +; check: movz x0, #2 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret +; check: Block 1: +; check: movz x0, #1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = ifcmp v0, v1 + brif eq v2, block1 + jump block1 + +block1: + v4 = iconst.i64 1 + return v4 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: subs xzr, x0, x1 +; check: Block 1: +; check: movz x0, #1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/condops.clif b/cranelift/filetests/filetests/vcode/aarch64/condops.clif new file mode 100644 index 000000000000..e489836527df --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/condops.clif @@ -0,0 +1,43 @@ +test vcode +target aarch64 + +function %f(i8, i64, i64) -> i64 { +block0(v0: i8, v1: i64, v2: i64): + v3 = iconst.i8 42 + v4 = ifcmp v0, v3 + v5 = selectif.i64 eq v4, v1, v2 + return v5 +} + +; check: subs wzr +; check: csel x0, $(=x[0-9]+, x[0-9]+), eq + +function %g(i8) -> b1 { +block0(v0: i8): + v3 = iconst.i8 42 + v4 = ifcmp v0, v3 + v5 = trueif eq v4 + return v5 +} + +; check: subs wzr +; check: cset x0, eq + +function %h(i8, i8, i8) -> i8 { +block0(v0: i8, v1: i8, v2: i8): + v3 = bitselect.i8 v0, v1, v2 + return v3 +} + +; check: and +; nextln: bic +; nextln: orr + +function %i(b1, i8, i8) -> i8 { +block0(v0: b1, v1: i8, v2: i8): + v3 = select.i8 v0, v1, v2 + return v3 +} + +; check: subs wzr +; nextln: csel diff --git a/cranelift/filetests/filetests/vcode/aarch64/constants.clif b/cranelift/filetests/filetests/vcode/aarch64/constants.clif new file mode 100644 index 000000000000..67667d59c1f2 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/constants.clif @@ -0,0 +1,176 @@ +test vcode +target aarch64 + +function %f() -> i64 { +block0: + v0 = iconst.i64 0 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffff + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #65535 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffff0000 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #65535, LSL #16 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffff00000000 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #65535, LSL #32 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffff000000000000 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #65535, LSL #48 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffffffffffffffff + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movn x0, #0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffffffffffff0000 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: movn x0, #65535 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffffffff0000ffff + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movn x0, #65535, LSL #16 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xffff0000ffffffff + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movn x0, #65535, LSL #32 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0x0000ffffffffffff + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movn x0, #65535, LSL #48 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0xf34bf0a31212003a ; random digits + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #58 +; nextln: movk x0, #4626, LSL #16 +; nextln: movk x0, #61603, LSL #32 +; nextln: movk x0, #62283, LSL #48 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0x12e900001ef40000 ; random digits with 2 clear half words + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movz x0, #7924, LSL #16 +; nextln: movk x0, #4841, LSL #48 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f() -> i64 { +block0: + v0 = iconst.i64 0x12e9ffff1ef4ffff ; random digits with 2 full half words + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: movn x0, #57611, LSL #16 +; nextln: movk x0, #4841, LSL #48 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/extend-op.clif b/cranelift/filetests/filetests/vcode/aarch64/extend-op.clif new file mode 100644 index 000000000000..6194dd563f81 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/extend-op.clif @@ -0,0 +1,18 @@ +test vcode +target aarch64 + +function %f(i8) -> i64 { +block0(v0: i8): + v1 = sextend.i64 v0 + v2 = iconst.i64 42 + v3 = iadd.i64 v2, v1 + return v3 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: movz x1, #42 +; nextln: add x0, x1, x0, SXTB +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif b/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif new file mode 100644 index 000000000000..0789173acbfa --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/jumptable.clif @@ -0,0 +1,44 @@ +test vcode +target aarch64 + +function %f(i64) -> i64 { + jt0 = jump_table [block1, block2, block3] + +block0(v0: i64): + br_table v0, block4, jt0 + +block1: + v1 = iconst.i64 1 + jump block5(v1) + +block2: + v2 = iconst.i64 2 + jump block5(v2) + +block3: + v3 = iconst.i64 3 + jump block5(v3) + +block4: + v4 = iconst.i64 4 + jump block5(v4) + +block5(v5: i64): + v6 = iadd.i64 v0, v5 + return v6 +} + +; check: subs wzr, w0, #3 +; nextln: b.hs +; nextln: adr x2, pc+16 ; ldrsw x1, [x2, x0, LSL 2] ; add x2, x2, x1 ; br x2 ; jt_entries + +; check: movz x1, #3 +; nextln: b + +; check: movz x1, #2 +; nextln: b + +; check: movz x1, #1 + +; check: add x0, x0, x1 + diff --git a/cranelift/filetests/filetests/vcode/aarch64/narrow-arithmetic.clif b/cranelift/filetests/filetests/vcode/aarch64/narrow-arithmetic.clif new file mode 100644 index 000000000000..d11fc224176c --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/narrow-arithmetic.clif @@ -0,0 +1,69 @@ +test vcode +target aarch64 + +function %add8(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = iadd.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %add16(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = iadd.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %add32(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = iadd.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %add32_8(i32, i8) -> i32 { +block0(v0: i32, v1: i8): + v2 = sextend.i32 v1 + v3 = iadd.i32 v0, v2 + return v3 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add w0, w0, w1, SXTB +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %add64_32(i64, i32) -> i64 { +block0(v0: i64, v1: i32): + v2 = sextend.i64 v1 + v3 = iadd.i64 v0, v2 + return v3 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add x0, x0, x1, SXTW +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif b/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif new file mode 100644 index 000000000000..60b45cc07aeb --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/saturating-ops.clif @@ -0,0 +1,36 @@ +test vcode +target aarch64 + +function %uaddsat64(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = uadd_sat.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: mov v0.d[0], x0 +; nextln: mov v1.d[0], x1 +; nextln: uqadd d0, d0, d1 +; nextln: mov x0, v0.d[0] +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %uaddsat8(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = uadd_sat.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb x0, w0 +; nextln: uxtb x1, w1 +; nextln: mov v0.d[0], x0 +; nextln: mov v1.d[0], x1 +; nextln: uqadd d0, d0, d1 +; nextln: mov x0, v0.d[0] +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/shift-op.clif b/cranelift/filetests/filetests/vcode/aarch64/shift-op.clif new file mode 100644 index 000000000000..12984620a1e9 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/shift-op.clif @@ -0,0 +1,17 @@ +test vcode +target aarch64 + +function %f(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i64 3 + v2 = ishl.i64 v0, v1 + v3 = iadd.i64 v0, v2 + return v3 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: add x0, x0, x0, LSL 3 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif b/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif new file mode 100644 index 000000000000..b865cc29027b --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/shift-rotate.clif @@ -0,0 +1,440 @@ +test vcode +target aarch64 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ROR, variable +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f0(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = rotr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ror x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f1(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = rotr.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ror w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f2(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = rotr.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth w0, w0 +; nextln: sub w2, w1, #16 +; nextln: sub w2, wzr, w2 +; nextln: lsr w1, w0, w1 +; nextln: lsl w0, w0, w2 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f3(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = rotr.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: sub w2, w1, #8 +; nextln: sub w2, wzr, w2 +; nextln: lsr w1, w0, w1 +; nextln: lsl w0, w0, w2 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ROL, variable +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f4(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = rotl.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sub w2, w1, #64 +; nextln: sub w2, wzr, w2 +; nextln: lsl x1, x0, x1 +; nextln: lsr x0, x0, x2 +; nextln: orr x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f5(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = rotl.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: sub w2, w1, #32 +; nextln: sub w2, wzr, w2 +; nextln: lsl w1, w0, w1 +; nextln: lsr w0, w0, w2 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f6(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = rotl.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth w0, w0 +; nextln: sub w2, w1, #16 +; nextln: sub w2, wzr, w2 +; nextln: lsl w1, w0, w1 +; nextln: lsr w0, w0, w2 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f7(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = rotl.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: sub w2, w1, #8 +; nextln: sub w2, wzr, w2 +; nextln: lsl w1, w0, w1 +; nextln: lsr w0, w0, w2 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; LSR, variable +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f8(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = ushr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsr x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f9(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = ushr.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f10(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = ushr.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth w0, w0 +; nextln: lsr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f11(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = ushr.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: lsr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; LSL, variable +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f12(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = ishl.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f13(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = ishl.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f14(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = ishl.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f15(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = ishl.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ASR, variable +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f16(i64, i64) -> i64 { +block0(v0: i64, v1: i64): + v2 = sshr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: asr x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f17(i32, i32) -> i32 { +block0(v0: i32, v1: i32): + v2 = sshr.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: asr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f18(i16, i16) -> i16 { +block0(v0: i16, v1: i16): + v2 = sshr.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxth w0, w0 +; nextln: asr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f19(i8, i8) -> i8 { +block0(v0: i8, v1: i8): + v2 = sshr.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxtb w0, w0 +; nextln: asr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; immediate forms +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +function %f20(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 17 + v2 = rotr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ror x0, x0, #17 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f21(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 17 + v2 = rotl.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl x1, x0, #17 +; nextln: lsr x0, x0, #47 +; nextln: orr x0, x0, x1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f22(i32) -> i32 { +block0(v0: i32): + v1 = iconst.i32 17 + v2 = rotl.i32 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsl w1, w0, #17 +; nextln: lsr w0, w0, #15 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f23(i16) -> i16 { +block0(v0: i16): + v1 = iconst.i32 10 + v2 = rotl.i16 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth w0, w0 +; nextln: lsl w1, w0, #10 +; nextln: lsr w0, w0, #6 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f24(i8) -> i8 { +block0(v0: i8): + v1 = iconst.i32 3 + v2 = rotl.i8 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: lsl w1, w0, #3 +; nextln: lsr w0, w0, #5 +; nextln: orr w0, w0, w1 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f25(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 17 + v2 = ushr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: lsr x0, x0, #17 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f26(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 17 + v2 = sshr.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: asr x0, x0, #17 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f27(i64) -> i64 { +block0(v0: i64): + v1 = iconst.i32 17 + v2 = ishl.i64 v0, v1 + return v2 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp +; nextln: lsl x0, x0, #17 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/symbol-value.clif b/cranelift/filetests/filetests/vcode/aarch64/symbol-value.clif new file mode 100644 index 000000000000..01c0a8a46b19 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/symbol-value.clif @@ -0,0 +1,17 @@ +test vcode +target aarch64 + +function %f() -> i64 { + gv0 = symbol %my_global + +block0: + v0 = symbol_value.i64 gv0 + return v0 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: ldr x0, 8 ; b 12 ; data +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret diff --git a/cranelift/filetests/filetests/vcode/aarch64/traps.clif b/cranelift/filetests/filetests/vcode/aarch64/traps.clif new file mode 100644 index 000000000000..b4c4be344b31 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/traps.clif @@ -0,0 +1,29 @@ +test vcode +target aarch64 + +function %f() { +block0: + trap user0 +} + +; check: udf + +function %g(i64) { +block0(v0: i64): + v1 = iconst.i64 42 + v2 = ifcmp v0, v1 + trapif eq v2, user0 + return +} + +; check: subs xzr, x0, #42 +; nextln: b.ne 8 +; nextln: udf + +function %h() { +block0: + debugtrap + return +} + +; check: brk #0 diff --git a/cranelift/filetests/filetests/vcode/aarch64/uextend-sextend.clif b/cranelift/filetests/filetests/vcode/aarch64/uextend-sextend.clif new file mode 100644 index 000000000000..86084ff0cc57 --- /dev/null +++ b/cranelift/filetests/filetests/vcode/aarch64/uextend-sextend.clif @@ -0,0 +1,158 @@ +test vcode +target aarch64 + +function %f_u_8_64(i8) -> i64 { +block0(v0: i8): + v1 = uextend.i64 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb x0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_u_8_32(i8) -> i32 { +block0(v0: i8): + v1 = uextend.i32 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_u_8_16(i8) -> i16 { +block0(v0: i8): + v1 = uextend.i16 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxtb w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_s_8_64(i8) -> i64 { +block0(v0: i8): + v1 = sextend.i64 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxtb x0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_s_8_32(i8) -> i32 { +block0(v0: i8): + v1 = sextend.i32 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxtb w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_s_8_16(i8) -> i16 { +block0(v0: i8): + v1 = sextend.i16 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: sxtb w0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_u_16_64(i16) -> i64 { +block0(v0: i16): + v1 = uextend.i64 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! +; nextln: mov fp, sp +; nextln: uxth x0, w0 +; nextln: mov sp, fp +; nextln: ldp fp, lr, [sp], #16 +; nextln: ret + +function %f_u_16_32(i16) -> i32 { +block0(v0: i16): + v1 = uextend.i32 v0 + return v1 +} + +; check: stp fp, lr, [sp, #-16]! 
+; nextln: mov fp, sp
+; nextln: uxth w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f_s_16_64(i16) -> i64 {
+block0(v0: i16):
+    v1 = sextend.i64 v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxth x0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f_s_16_32(i16) -> i32 {
+block0(v0: i16):
+    v1 = sextend.i32 v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxth w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f_u_32_64(i32) -> i64 {
+block0(v0: i32):
+    v1 = uextend.i64 v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: mov w0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
+
+function %f_s_32_64(i32) -> i64 {
+block0(v0: i32):
+    v1 = sextend.i64 v0
+    return v1
+}
+
+; check: stp fp, lr, [sp, #-16]!
+; nextln: mov fp, sp
+; nextln: sxtw x0, w0
+; nextln: mov sp, fp
+; nextln: ldp fp, lr, [sp], #16
+; nextln: ret
diff --git a/cranelift/filetests/src/lib.rs b/cranelift/filetests/src/lib.rs
index 0d3b12e45890..0fee249f12c4 100644
--- a/cranelift/filetests/src/lib.rs
+++ b/cranelift/filetests/src/lib.rs
@@ -56,6 +56,7 @@ mod test_shrink;
 mod test_simple_gvn;
 mod test_simple_preopt;
 mod test_unwind;
+mod test_vcode;
 mod test_verifier;
 
 /// The result of running the test in a file.
@@ -134,6 +135,7 @@ fn new_subtest(parsed: &TestCommand) -> subtest::SubtestResult<Box<dyn SubTest>> {
         "run" => test_run::subtest(parsed),
         "shrink" => test_shrink::subtest(parsed),
         "simple-gvn" => test_simple_gvn::subtest(parsed),
+        "vcode" => test_vcode::subtest(parsed),
         "verifier" => test_verifier::subtest(parsed),
         "preopt" => test_preopt::subtest(parsed),
         "safepoint" => test_safepoint::subtest(parsed),
diff --git a/cranelift/filetests/src/test_vcode.rs b/cranelift/filetests/src/test_vcode.rs
new file mode 100644
index 000000000000..93bce57a59e7
--- /dev/null
+++ b/cranelift/filetests/src/test_vcode.rs
@@ -0,0 +1,67 @@
+use crate::subtest::{run_filecheck, Context, SubTest, SubtestResult};
+use cranelift_codegen::ir::Function;
+use cranelift_codegen::isa::lookup;
+use cranelift_codegen::settings;
+use cranelift_codegen::Context as CodegenContext;
+use cranelift_reader::{TestCommand, TestOption};
+
+use log::info;
+use std::borrow::Cow;
+use std::string::String;
+
+struct TestVCode {
+    arch: String,
+}
+
+pub fn subtest(parsed: &TestCommand) -> SubtestResult<Box<dyn SubTest>> {
+    assert_eq!(parsed.command, "vcode");
+
+    let mut arch = "arm64".to_string();
+    for option in &parsed.options {
+        match option {
+            TestOption::Value(k, v) if k == &"arch" => {
+                arch = v.to_string();
+            }
+            _ => {}
+        }
+    }
+
+    Ok(Box::new(TestVCode { arch }))
+}
+
+impl SubTest for TestVCode {
+    fn name(&self) -> &'static str {
+        "vcode"
+    }
+
+    fn is_mutating(&self) -> bool {
+        true
+    }
+
+    fn needs_isa(&self) -> bool {
+        true
+    }
+
+    fn run(&self, func: Cow<Function>, context: &Context) -> SubtestResult<()> {
+        let triple = context.isa.unwrap().triple().clone();
+        let func = func.into_owned();
+
+        let mut isa = lookup(triple)
+            .map_err(|_| format!("Could not look up backend for arch '{}'", self.arch))?
+ .finish(settings::Flags::new(settings::builder())); + + let mut codectx = CodegenContext::for_function(func); + codectx.set_disasm(true); + + codectx + .compile(&mut *isa) + .map_err(|e| format!("Could not compile with arch '{}': {:?}", self.arch, e))?; + + let result = codectx.mach_compile_result.take().unwrap(); + let text = result.disasm.unwrap(); + + info!("text input to filecheck is:\n{}\n", text); + + run_filecheck(&text, context) + } +} diff --git a/cranelift/src/compile.rs b/cranelift/src/compile.rs index 7d888f311325..4d7111887606 100644 --- a/cranelift/src/compile.rs +++ b/cranelift/src/compile.rs @@ -49,42 +49,42 @@ fn handle_module( // If we have an isa from the command-line, use that. Otherwise if the // file contains a unique isa, use that. - let isa = if let Some(isa) = fisa.isa { - isa - } else if let Some(isa) = test_file.isa_spec.unique_isa() { - isa - } else { + let isa = fisa.isa.or(test_file.isa_spec.unique_isa()); + + if isa.is_none() { return Err(String::from("compilation requires a target isa")); }; for (func, _) in test_file.functions { - let mut context = Context::new(); - context.func = func; - let mut relocs = PrintRelocs::new(flag_print); let mut traps = PrintTraps::new(flag_print); let mut stackmaps = PrintStackmaps::new(flag_print); - let mut mem = vec![]; - // Compile and encode the result to machine code. - let code_info = context - .compile_and_emit(isa, &mut mem, &mut relocs, &mut traps, &mut stackmaps) - .map_err(|err| pretty_error(&context.func, Some(isa), err))?; + if let Some(isa) = isa { + let mut context = Context::new(); + context.func = func; + let mut mem = vec![]; - if flag_print { - println!("{}", context.func.display(isa)); - } + // Compile and encode the result to machine code. + let code_info = context + .compile_and_emit(isa, &mut mem, &mut relocs, &mut traps, &mut stackmaps) + .map_err(|err| pretty_error(&context.func, Some(isa), err))?; + + if flag_print { + println!("{}", context.func.display(isa)); + } - if flag_disasm { - print_all( - isa, - &mem, - code_info.code_size, - code_info.jumptables_size + code_info.rodata_size, - &relocs, - &traps, - &stackmaps, - )?; + if flag_disasm { + print_all( + isa, + &mem, + code_info.code_size, + code_info.jumptables_size + code_info.rodata_size, + &relocs, + &traps, + &stackmaps, + )?; + } } } diff --git a/crates/jit/src/link.rs b/crates/jit/src/link.rs index c8313b5d8603..824c35ced6a9 100644 --- a/crates/jit/src/link.rs +++ b/crates/jit/src/link.rs @@ -2,7 +2,7 @@ use crate::Compilation; use cranelift_codegen::binemit::Reloc; -use std::ptr::write_unaligned; +use std::ptr::{read_unaligned, write_unaligned}; use wasmtime_environ::{Module, Relocation, RelocationTarget}; use wasmtime_runtime::libcalls; use wasmtime_runtime::VMFunctionBody; @@ -101,6 +101,23 @@ fn apply_reloc( Reloc::X86PCRelRodata4 => { // ignore } + Reloc::Arm64Call => unsafe { + let reloc_address = body.add(r.offset as usize) as usize; + let reloc_addend = r.addend as isize; + let reloc_delta = (target_func_address as u64).wrapping_sub(reloc_address as u64); + // TODO: come up with a PLT-like solution for longer calls. We can't extend the + // code segment at this point, but we could conservatively allocate space at the + // end of the function during codegen, a fixed amount per call, to allow for + // potential branch islands. 
+ assert!((reloc_delta as i64) < (1 << 27)); + assert!((reloc_delta as i64) >= -(1 << 27)); + let reloc_delta = reloc_delta as u32; + let reloc_delta = reloc_delta.wrapping_add(reloc_addend as u32); + let delta_bits = reloc_delta >> 2; + let insn = read_unaligned(reloc_address as *const u32); + let new_insn = (insn & 0xfc00_0000) | (delta_bits & 0x03ff_ffff); + write_unaligned(reloc_address as *mut u32, new_insn); + }, _ => panic!("unsupported reloc kind"), } } @@ -108,14 +125,11 @@ fn apply_reloc( // A declaration for the stack probe function in Rust's standard library, for // catching callstack overflow. cfg_if::cfg_if! { - if #[cfg(any( - target_arch="aarch64", - all( + if #[cfg(all( target_os = "windows", target_env = "msvc", target_pointer_width = "64" - ) - ))] { + ))] { extern "C" { pub fn __chkstk(); } @@ -128,6 +142,13 @@ cfg_if::cfg_if! { pub fn ___chkstk(); } const PROBESTACK: unsafe extern "C" fn() = ___chkstk; + } else if #[cfg(not(any(target_arch = "x86_64", target_arch = "x86")))] { + // As per + // https://github.com/rust-lang/compiler-builtins/blob/cae3e6ea23739166504f9f9fb50ec070097979d4/src/probestack.rs#L39, + // LLVM only has stack-probe support on x86-64 and x86. Thus, on any other CPU + // architecture, we simply use an empty stack-probe function. + extern "C" fn empty_probestack() {} + const PROBESTACK: unsafe extern "C" fn() = empty_probestack; } else { extern "C" { pub fn __rust_probestack(); diff --git a/crates/runtime/src/helpers.c b/crates/runtime/src/helpers.c index 213f34e5938d..6436922243da 100644 --- a/crates/runtime/src/helpers.c +++ b/crates/runtime/src/helpers.c @@ -26,3 +26,12 @@ void* GetPcFromUContext(ucontext_t *cx) { return (void*) cx->uc_mcontext->__ss.__rip; } #endif + +#if defined(__linux__) && defined(__aarch64__) +#include + +void* GetPcFromUContext(ucontext_t *cx) { + return (void*) cx->uc_mcontext.pc; +} + +#endif // __linux__ && __aarch64__ diff --git a/crates/runtime/src/traphandlers.rs b/crates/runtime/src/traphandlers.rs index 107f3e82f087..b37cd46e712e 100644 --- a/crates/runtime/src/traphandlers.rs +++ b/crates/runtime/src/traphandlers.rs @@ -156,6 +156,12 @@ cfg_if::cfg_if! { if #[cfg(all(target_os = "linux", target_arch = "x86_64"))] { let cx = &*(cx as *const libc::ucontext_t); cx.uc_mcontext.gregs[libc::REG_RIP as usize] as *const u8 + } else if #[cfg(all(target_os = "linux", target_arch = "aarch64"))] { + // libc doesn't seem to support Linux/aarch64 at the moment? + extern "C" { + fn GetPcFromUContext(cx: *mut libc::c_void) -> *const u8; + } + GetPcFromUContext(cx) } else if #[cfg(target_os = "macos")] { // FIXME(rust-lang/libc#1702) - once that lands and is // released we should inline the definition here
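The `Reloc::Arm64Call` arm above rewrites the 26-bit, word-scaled immediate of an AArch64 `bl` instruction in place, which is why the delta must fit in a signed 28-bit byte range (+/-128 MiB). The following is a minimal standalone sketch of that bit manipulation only; the helper name and example values are illustrative, not part of the patch, and addend handling is omitted.

/// Splice a byte offset `delta` into the imm26 field of an AArch64 `bl`/`b`
/// instruction word, mirroring the arithmetic in the `Reloc::Arm64Call` arm.
fn patch_arm64_call(insn: u32, delta: i64) -> u32 {
    // imm26 is scaled by 4 at execution time, so the reachable range is +/-128 MiB.
    assert!(delta >= -(1 << 27) && delta < (1 << 27));
    assert!(delta % 4 == 0, "branch targets are word-aligned");
    // Divide by 4 and keep the low 26 bits of the (sign-extended) result.
    let imm26 = ((delta >> 2) as u32) & 0x03ff_ffff;
    // Preserve the top 6 opcode bits, replace the immediate field.
    (insn & 0xfc00_0000) | imm26
}

fn main() {
    let bl = 0x9400_0000u32; // architectural encoding of `bl #0`, before relocation
    assert_eq!(patch_arm64_call(bl, 0x1000), 0x9400_0400); // bl #+0x1000
    assert_eq!(patch_arm64_call(bl, -4), 0x97ff_ffff); // bl #-4
    println!("patched: {:#010x}", patch_arm64_call(bl, 0x1000));
}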