From 693e22c4553fac074d383eb5c4e6bd0ba7cbe916 Mon Sep 17 00:00:00 2001
From: dheaton-arm <Damian.Heaton@arm.com>
Date: Thu, 4 Aug 2022 14:49:49 +0100
Subject: [PATCH 1/2] Port `Fcopysign`..`FcvtToSintSat` to ISLE (AArch64)

Ported the existing implementations of the following opcodes to ISLE on
AArch64:
- `Fcopysign`
  - Also introduced missing support for `fcopysign` on vector values, as
    per the docs.
  - This introduces the vector encoding for the `SLI` machine
    instruction.
- `FcvtToUint`
- `FcvtToSint`
- `FcvtFromUint`
- `FcvtFromSint`
- `FcvtToUintSat`
- `FcvtToSintSat`

Copyright (c) 2022 Arm Limited
---
 cranelift/codegen/src/isa/aarch64/inst.isle   | 170 ++++++++
 .../codegen/src/isa/aarch64/inst/emit.rs      |  44 ++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  20 +-
 cranelift/codegen/src/isa/aarch64/lower.isle  | 113 +++++
 cranelift/codegen/src/isa/aarch64/lower.rs    |  11 -
 .../codegen/src/isa/aarch64/lower/isle.rs     | 205 ++++++++-
 .../codegen/src/isa/aarch64/lower_inst.rs     | 406 +-----------------
 .../filetests/isa/aarch64/fcvt-small.clif     |  56 +--
 .../filetests/isa/aarch64/floating-point.clif | 235 +++++-----
 .../runtests/simd-fcopysign-64bit.clif        |  37 ++
 .../filetests/runtests/simd-fcopysign.clif    |  63 +++
 cranelift/interpreter/src/step.rs             |  14 +-
 12 files changed, 826 insertions(+), 548 deletions(-)
 create mode 100644 cranelift/filetests/filetests/runtests/simd-fcopysign-64bit.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-fcopysign.clif

diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index 9c10e40002be..f52b90f23a3c 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -619,6 +619,14 @@
         (size VectorSize)
         (imm u8))
 
+       ;; Destructive vector shift by immediate.
+       (VecShiftImmMod
+        (op VecShiftImmModOp)
+        (rd WritableReg)
+        (rn Reg)
+        (size VectorSize)
+        (imm u8))
+
        ;; Vector extract - create a new vector, being the concatenation of the lowest `imm4` bytes
        ;; of `rm` followed by the uppermost `16 - imm4` bytes of `rn`.
        (VecExtract
@@ -1315,6 +1323,13 @@
     (Sshr)
 ))
 
+;; Destructive shift-by-immediate operation on each lane of a vector.
+(type VecShiftImmModOp
+  (enum
+    ;; Shift left and insert
+    (Sli)
+))
+
 ;; Atomic read-modify-write operations with acquire-release semantics
 (type AtomicRMWOp
   (enum
@@ -1386,6 +1401,24 @@
 (decl u64_into_imm_logic (Type u64) ImmLogic)
 (extern constructor u64_into_imm_logic u64_into_imm_logic)
 
+(decl min_fp_value (bool u8 u8) Reg)
+(extern constructor min_fp_value min_fp_value)
+
+(decl max_fp_value (bool u8 u8) Reg)
+(extern constructor max_fp_value max_fp_value)
+
+(decl min_fp_value_sat (bool u8 u8) Reg)
+(extern constructor min_fp_value_sat min_fp_value_sat)
+
+(decl max_fp_value_sat (bool u8 u8) Reg)
+(extern constructor max_fp_value_sat max_fp_value_sat)
+
+(decl fpu_op_ri_ushr (u8 u8) FPUOpRI)
+(extern constructor fpu_op_ri_ushr fpu_op_ri_ushr)
+
+(decl fpu_op_ri_sli (u8 u8) FPUOpRI)
+(extern constructor fpu_op_ri_sli fpu_op_ri_sli)
+
 (decl imm12_from_negated_u64 (Imm12) u64)
 (extern extractor imm12_from_negated_u64 imm12_from_negated_u64)
 
@@ -1533,6 +1566,12 @@
             (_2 Unit (emit (MInst.VecRRRMod op dst src2 src3 size))))
         dst))
 
+(decl fpu_rri (FPUOpRI Reg) Reg)
+(rule (fpu_rri op src)
+      (let ((dst WritableReg (temp_writable_reg $F64))
+            (_ Unit (emit (MInst.FpuRRI op dst src))))
+        dst))
+
 ;; Helper for emitting `MInst.FpuRRR` instructions.
 (decl fpu_rrr (FPUOp2 Reg Reg ScalarSize) Reg)
 (rule (fpu_rrr op src1 src2 size)
@@ -2611,3 +2650,134 @@
       ;; to clobber LR.
       (let ((_ Unit (emit (MInst.Xpaclri))))
            (mov_preg (preg_link))))
+
+;; Helper for getting the maximum shift amount (bit width - 1) for a scalar float type.
+
+(decl max_shift (Type) u8)
+(rule (max_shift $F64) 63)
+(rule (max_shift $F32) 31)
+
+;; Helper for generating `fcopysign` instruction sequences.
+
+(decl fcopy_sign (Reg Reg Type) Reg)
+(rule (fcopy_sign x y (ty_scalar_float ty))
+      (let ((dst WritableReg (temp_writable_reg $F64))
+            (_ Unit (emit (MInst.FpuMove64 dst x)))
+            (tmp Reg (fpu_rri (fpu_op_ri_ushr (ty_bits ty) (max_shift ty)) y))
+            (_ Unit (emit (MInst.FpuRRI (fpu_op_ri_sli (ty_bits ty) (max_shift ty)) dst tmp))))
+       dst))
+(rule (fcopy_sign x y ty @ (multi_lane _ _))
+      (let ((dst WritableReg (temp_writable_reg $I8X16))
+            (_ Unit (emit (MInst.FpuMove128 dst x)))
+            (tmp Reg (vec_shift_imm (VecShiftImmOp.Ushr) (max_shift (lane_type ty)) y (vector_size ty)))
+            (_ Unit (emit (MInst.VecShiftImmMod (VecShiftImmModOp.Sli) dst tmp (vector_size ty) (max_shift (lane_type ty))))))
+       dst))
+
+;; Helpers for generating `MInst.FpuToInt` instructions.
+
+(decl fpu_to_int_nan_check () ConsumesFlags)
+(rule (fpu_to_int_nan_check)
+      (ConsumesFlags.ConsumesFlagsSideEffect
+      (MInst.TrapIf (cond_br_cond (Cond.Vs))
+                    (trap_code_bad_conversion_to_integer))))
+
+;; Emits the appropriate flag-reading trap for an underflow check,
+;; accepting a boolean (whether the output integer type is signed),
+;; the input (float) type, and the output (integer) type.
+(decl fpu_to_int_underflow_check (bool Type Type) ConsumesFlags)
+(rule (fpu_to_int_underflow_check $true $F32 (fits_in_16 out_ty))
+      (ConsumesFlags.ConsumesFlagsSideEffect
+      (MInst.TrapIf (cond_br_cond
+                  (Cond.Le))
+                  (trap_code_integer_overflow))))
+(rule (fpu_to_int_underflow_check $true $F64 (fits_in_32 out_ty))
+      (ConsumesFlags.ConsumesFlagsSideEffect
+      (MInst.TrapIf (cond_br_cond
+                  (Cond.Le))
+                  (trap_code_integer_overflow))))
+(rule -1 (fpu_to_int_underflow_check $true _in_ty _out_ty)
+      (ConsumesFlags.ConsumesFlagsSideEffect
+      (MInst.TrapIf (cond_br_cond
+                  (Cond.Lt))
+                  (trap_code_integer_overflow))))
+(rule (fpu_to_int_underflow_check $false _in_ty _out_ty)
+      (ConsumesFlags.ConsumesFlagsSideEffect
+      (MInst.TrapIf (cond_br_cond
+                  (Cond.Le))
+                  (trap_code_integer_overflow))))
+
+(decl fpu_to_int_overflow_check () ConsumesFlags)
+(rule (fpu_to_int_overflow_check)
+      (ConsumesFlags.ConsumesFlagsSideEffect
+      (MInst.TrapIf (cond_br_cond
+                  (Cond.Ge))
+                  (trap_code_integer_overflow))))
+
+;; Emits the appropriate instruction sequence to convert a
+;; floating-point value to an integer, trapping if the value
+;; is a NaN or does not fit in the target type.
+;; Accepts the specific conversion op, the source register,
+;; whether the output integer is signed, and finally the input
+;; (float) and output (integer) types.
+(decl fpu_to_int_cvt (FpuToIntOp Reg bool Type Type) Reg)
+(rule (fpu_to_int_cvt op src signed in_ty out_ty)
+      (let ((size ScalarSize (scalar_size in_ty))
+            (in_bits u8 (ty_bits in_ty))
+            (out_bits u8 (ty_bits out_ty))
+            (_ InstOutput (side_effect (with_flags_side_effect
+                   (fpu_cmp size src src)
+                   (fpu_to_int_nan_check))))
+            (min Reg (min_fp_value signed in_bits out_bits))
+            (_ InstOutput (side_effect (with_flags_side_effect
+                   (fpu_cmp size src min)
+                   (fpu_to_int_underflow_check signed in_ty out_ty))))
+            (max Reg (max_fp_value signed in_bits out_bits))
+            (_ InstOutput (side_effect (with_flags_side_effect
+                   (fpu_cmp size src max)
+                   (fpu_to_int_overflow_check)))))
+       (fpu_to_int op src)))
+
+;; Emits the appropriate instruction sequence to convert a
+;; floating-point value to an integer, saturating if the value
+;; does not fit in the target type.
+;; Accepts the specific conversion op, the source register,
+;; whether the output integer is signed, and finally the input
+;; (float) and output (integer) types.
+(decl fpu_to_int_cvt_sat (FpuToIntOp Reg bool Type Type) Reg)
+(rule (fpu_to_int_cvt_sat op src $true in_ty out_ty)
+      (let ((size ScalarSize (scalar_size in_ty))
+            (in_bits u8 (ty_bits in_ty))
+            (out_bits u8 (ty_bits out_ty))
+            (max Reg (max_fp_value_sat $true in_bits out_bits))
+            (tmp Reg (fpu_rrr (FPUOp2.Min) src max size))
+            (min Reg (min_fp_value_sat $true in_bits out_bits))
+            (tmp Reg (fpu_rrr (FPUOp2.Max) tmp min size))
+            (zero Reg (constant_f128 0))
+            (tmp ValueRegs (with_flags (fpu_cmp size src src)
+                    (fpu_csel in_ty (Cond.Ne) zero tmp))))
+       (fpu_to_int op (value_regs_get tmp 0))))
+(rule (fpu_to_int_cvt_sat op src $false in_ty out_ty)
+      (let ((size ScalarSize (scalar_size in_ty))
+            (in_bits u8 (ty_bits in_ty))
+            (out_bits u8 (ty_bits out_ty))
+            (max Reg (max_fp_value_sat $false in_bits out_bits))
+            (tmp Reg (fpu_rrr (FPUOp2.Min) src max size))
+            (min Reg (min_fp_value_sat $false in_bits out_bits))
+            (tmp Reg (fpu_rrr (FPUOp2.Max) tmp min size))
+            (tmp ValueRegs (with_flags (fpu_cmp size src src)
+                    (fpu_csel in_ty (Cond.Ne) min tmp))))
+       (fpu_to_int op (value_regs_get tmp 0))))
+
+(decl fpu_to_int (FpuToIntOp Reg) Reg)
+(rule (fpu_to_int op src)
+      (let ((dst WritableReg (temp_writable_reg $I64))
+            (_ Unit (emit (MInst.FpuToInt op dst src))))
+       dst))
+
+;; Helper for generating `MInst.IntToFpu` instructions.
+
+(decl int_to_fpu (IntToFpuOp Reg) Reg)
+(rule (int_to_fpu op src)
+      (let ((dst WritableReg (temp_writable_reg $I8X16))
+            (_ Unit (emit (MInst.IntToFpu op dst src))))
+       dst))
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 6d9c323d4f7d..ec6117c008a5 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -2033,6 +2033,50 @@ impl MachInstEmit for Inst {
                 let rd_enc = machreg_to_vec(rd.to_reg());
                 sink.put4(template | (immh_immb << 16) | (rn_enc << 5) | rd_enc);
             }
+            &Inst::VecShiftImmMod {
+                op,
+                rd,
+                rn,
+                size,
+                imm,
+            } => {
+                let rd = allocs.next_writable(rd);
+                let rn = allocs.next(rn);
+                let (is_shr, mut template) = match op {
+                    VecShiftImmModOp::Sli => (false, 0b_001_011110_0000_000_010101_00000_00000_u32),
+                };
+                if size.is_128bits() {
+                    template |= 0b1 << 30;
+                }
+                let imm = imm as u32;
+                // Deal with the somewhat strange encoding scheme for, and limits on,
+                // the shift amount.
+                let immh_immb = match (size.lane_size(), is_shr) {
+                    (ScalarSize::Size64, true) if imm >= 1 && imm <= 64 => {
+                        0b_1000_000_u32 | (64 - imm)
+                    }
+                    (ScalarSize::Size32, true) if imm >= 1 && imm <= 32 => {
+                        0b_0100_000_u32 | (32 - imm)
+                    }
+                    (ScalarSize::Size16, true) if imm >= 1 && imm <= 16 => {
+                        0b_0010_000_u32 | (16 - imm)
+                    }
+                    (ScalarSize::Size8, true) if imm >= 1 && imm <= 8 => {
+                        0b_0001_000_u32 | (8 - imm)
+                    }
+                    (ScalarSize::Size64, false) if imm <= 63 => 0b_1000_000_u32 | imm,
+                    (ScalarSize::Size32, false) if imm <= 31 => 0b_0100_000_u32 | imm,
+                    (ScalarSize::Size16, false) if imm <= 15 => 0b_0010_000_u32 | imm,
+                    (ScalarSize::Size8, false) if imm <= 7 => 0b_0001_000_u32 | imm,
+                    _ => panic!(
+                        "aarch64: Inst::VecShiftImmMod: emit: invalid op/size/imm {:?}, {:?}, {:?}",
+                        op, size, imm
+                    ),
+                };
+                let rn_enc = machreg_to_vec(rn);
+                let rd_enc = machreg_to_vec(rd.to_reg());
+                sink.put4(template | (immh_immb << 16) | (rn_enc << 5) | rd_enc);
+            }
             &Inst::VecExtract { rd, rn, rm, imm4 } => {
                 let rd = allocs.next_writable(rd);
                 let rn = allocs.next(rn);
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index eac2d1bb2356..84292a43bc7b 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -39,7 +39,7 @@ pub use crate::isa::aarch64::lower::isle::generated_code::{
     ALUOp, ALUOp3, APIKey, AtomicRMWLoopOp, AtomicRMWOp, BitOp, FPUOp1, FPUOp2, FPUOp3,
     FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUModOp, VecALUOp,
     VecExtendOp, VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp,
-    VecRRRLongOp, VecShiftImmOp,
+    VecRRRLongOp, VecShiftImmModOp, VecShiftImmOp,
 };
 
 /// A floating-point unit (FPU) operation with two args, a register and an immediate.
@@ -767,6 +767,10 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
             collector.reg_def(rd);
             collector.reg_use(rn);
         }
+        &Inst::VecShiftImmMod { rd, rn, .. } => {
+            collector.reg_mod(rd);
+            collector.reg_use(rn);
+        }
         &Inst::VecExtract { rd, rn, rm, .. } => {
             collector.reg_def(rd);
             collector.reg_use(rn);
@@ -2371,6 +2375,20 @@ impl Inst {
                 let rn = pretty_print_vreg_vector(rn, size, allocs);
                 format!("{} {}, {}, #{}", op, rd, rn, imm)
             }
+            &Inst::VecShiftImmMod {
+                op,
+                rd,
+                rn,
+                size,
+                imm,
+            } => {
+                let op = match op {
+                    VecShiftImmModOp::Sli => "sli",
+                };
+                let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
+                let rn = pretty_print_vreg_vector(rn, size, allocs);
+                format!("{} {}, {}, #{}", op, rd, rn, imm)
+            }
             &Inst::VecExtract { rd, rn, rm, imm4 } => {
                 let rd = pretty_print_vreg_vector(rd.to_reg(), VectorSize::Size8x16, allocs);
                 let rn = pretty_print_vreg_vector(rn, VectorSize::Size8x16, allocs);
diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
index 1bcd269d65c9..5457039059c1 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -406,6 +406,119 @@
 (rule (lower (has_type (ty_scalar_float ty) (fma x y z)))
       (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))
 
+;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type ty (fcopysign x y)))
+      (fcopy_sign x y ty))
+
+;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint x @ (value_type $F32))))
+      (fpu_to_int_cvt (FpuToIntOp.F32ToU32) x $false $F32 out_ty))
+
+(rule (lower (has_type $I64 (fcvt_to_uint x @ (value_type $F32))))
+      (fpu_to_int_cvt (FpuToIntOp.F32ToU64) x $false $F32 $I64))
+
+(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint x @ (value_type $F64))))
+      (fpu_to_int_cvt (FpuToIntOp.F64ToU32) x $false $F64 out_ty))
+
+(rule (lower (has_type $I64 (fcvt_to_uint x @ (value_type $F64))))
+      (fpu_to_int_cvt (FpuToIntOp.F64ToU64) x $false $F64 $I64))
+
+;;;; Rules for `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint x @ (value_type $F32))))
+      (fpu_to_int_cvt (FpuToIntOp.F32ToI32) x $true $F32 out_ty))
+
+(rule (lower (has_type $I64 (fcvt_to_sint x @ (value_type $F32))))
+      (fpu_to_int_cvt (FpuToIntOp.F32ToI64) x $true $F32 $I64))
+
+(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint x @ (value_type $F64))))
+      (fpu_to_int_cvt (FpuToIntOp.F64ToI32) x $true $F64 out_ty))
+
+(rule (lower (has_type $I64 (fcvt_to_sint x @ (value_type $F64))))
+      (fpu_to_int_cvt (FpuToIntOp.F64ToI64) x $true $F64 $I64))
+
+;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_from_uint x @ (value_type (multi_lane 32 _)))))
+      (vec_misc (VecMisc2.Ucvtf) x (vector_size ty)))
+
+(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_from_uint x @ (value_type (multi_lane 64 _)))))
+      (vec_misc (VecMisc2.Ucvtf) x (vector_size ty)))
+
+(rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type (fits_in_32 _)))))
+      (int_to_fpu (IntToFpuOp.U32ToF32) (put_in_reg_zext32 x)))
+
+(rule (lower (has_type $F64 (fcvt_from_uint x @ (value_type (fits_in_32 _)))))
+      (int_to_fpu (IntToFpuOp.U32ToF64) (put_in_reg_zext32 x)))
+
+(rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type $I64))))
+      (int_to_fpu (IntToFpuOp.U64ToF32) x))
+
+(rule (lower (has_type $F64 (fcvt_from_uint x @ (value_type $I64))))
+      (int_to_fpu (IntToFpuOp.U64ToF64) x))
+
+;;;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_from_sint x @ (value_type (multi_lane 32 _)))))
+      (vec_misc (VecMisc2.Scvtf) x (vector_size ty)))
+
+(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_from_sint x @ (value_type (multi_lane 64 _)))))
+      (vec_misc (VecMisc2.Scvtf) x (vector_size ty)))
+
+(rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type (fits_in_32 _)))))
+      (int_to_fpu (IntToFpuOp.I32ToF32) (put_in_reg_sext32 x)))
+
+(rule (lower (has_type $F64 (fcvt_from_sint x @ (value_type (fits_in_32 _)))))
+      (int_to_fpu (IntToFpuOp.I32ToF64) (put_in_reg_sext32 x)))
+
+(rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type $I64))))
+      (int_to_fpu (IntToFpuOp.I64ToF32) x))
+
+(rule (lower (has_type $F64 (fcvt_from_sint x @ (value_type $I64))))
+      (int_to_fpu (IntToFpuOp.I64ToF64) x))
+
+;;;; Rules for `fcvt_to_uint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_to_uint_sat x @ (value_type (multi_lane 32 _)))))
+      (vec_misc (VecMisc2.Fcvtzu) x (vector_size ty)))
+
+(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_to_uint_sat x @ (value_type (multi_lane 64 _)))))
+      (vec_misc (VecMisc2.Fcvtzu) x (vector_size ty)))
+
+(rule (lower (has_type $I32 (fcvt_to_uint_sat x @ (value_type $F32))))
+      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToU32) x $false $F32 $I32))
+
+(rule (lower (has_type $I64 (fcvt_to_uint_sat x @ (value_type $F32))))
+      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToU64) x $false $F32 $I64))
+
+(rule (lower (has_type $I32 (fcvt_to_uint_sat x @ (value_type $F64))))
+      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToU32) x $false $F64 $I32))
+
+(rule (lower (has_type $I64 (fcvt_to_uint_sat x @ (value_type $F64))))
+      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToU64) x $false $F64 $I64))
+
+;;;; Rules for `fcvt_to_sint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_to_sint_sat x @ (value_type (multi_lane 32 _)))))
+      (vec_misc (VecMisc2.Fcvtzs) x (vector_size ty)))
+
+(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_to_sint_sat x @ (value_type (multi_lane 64 _)))))
+      (vec_misc (VecMisc2.Fcvtzs) x (vector_size ty)))
+
+(rule (lower (has_type $I32 (fcvt_to_sint_sat x @ (value_type $F32))))
+      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToI32) x $true $F32 $I32))
+
+(rule (lower (has_type $I64 (fcvt_to_sint_sat x @ (value_type $F32))))
+      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToI64) x $true $F32 $I64))
+
+(rule (lower (has_type $I32 (fcvt_to_sint_sat x @ (value_type $F64))))
+      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToI32) x $true $F64 $I32))
+
+(rule (lower (has_type $I64 (fcvt_to_sint_sat x @ (value_type $F64))))
+      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToI64) x $true $F64 $I64))
+
 ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 ;; `i64` and smaller
diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs
index 5235db60b43d..3ec6bf3bbe99 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower.rs
@@ -1065,17 +1065,6 @@ pub(crate) fn condcode_is_signed(cc: IntCC) -> bool {
 //=============================================================================
 // Helpers for instruction lowering.
 
-pub(crate) fn choose_32_64<T: Copy>(ty: Type, op32: T, op64: T) -> T {
-    let bits = ty_bits(ty);
-    if bits <= 32 {
-        op32
-    } else if bits == 64 {
-        op64
-    } else {
-        panic!("choose_32_64 on > 64 bits!")
-    }
-}
-
 /// Checks for an instance of `op` feeding the given input.
 pub(crate) fn maybe_input_insn(
     c: &mut Lower<Inst>,
diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
index 35bbefec8776..1e190a3fc765 100644
--- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -5,12 +5,13 @@ pub mod generated_code;
 
 // Types that the generated ISLE code uses via `use super::*`.
 use super::{
-    insn_inputs, lower_constant_f128, lower_constant_f64, writable_zero_reg, zero_reg, AMode,
-    ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp,
-    FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC, JTSequenceInfo, MachLabel,
-    MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize, PairAMode, Reg, ScalarSize,
-    ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
+    insn_inputs, lower_constant_f128, lower_constant_f32, lower_constant_f64, writable_zero_reg,
+    zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond,
+    CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC,
+    JTSequenceInfo, MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize,
+    PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
 };
+use crate::isa::aarch64::inst::{FPULeftShiftImm, FPURightShiftImm};
 use crate::isa::aarch64::lower::{lower_address, lower_splat_const};
 use crate::isa::aarch64::settings::Flags as IsaFlags;
 use crate::machinst::{isle::*, InputSourceInst};
@@ -519,4 +520,198 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6>
     fn preg_link(&mut self) -> PReg {
         super::regs::link_reg().to_real_reg().unwrap().into()
     }
+
+    fn min_fp_value(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
+        let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
+
+        if in_bits == 32 {
+            // From float32.
+            let min = match (signed, out_bits) {
+                (true, 8) => i8::MIN as f32 - 1.,
+                (true, 16) => i16::MIN as f32 - 1.,
+                (true, 32) => i32::MIN as f32, // I32_MIN - 1 isn't precisely representable as a f32.
+                (true, 64) => i64::MIN as f32, // I64_MIN - 1 isn't precisely representable as a f32.
+
+                (false, _) => -1.,
+                _ => unimplemented!(
+                    "unexpected {} output size of {} bits for 32-bit input",
+                    if signed { "signed" } else { "unsigned" },
+                    out_bits
+                ),
+            };
+
+            lower_constant_f32(self.lower_ctx, tmp, min);
+        } else if in_bits == 64 {
+            // From float64.
+            let min = match (signed, out_bits) {
+                (true, 8) => i8::MIN as f64 - 1.,
+                (true, 16) => i16::MIN as f64 - 1.,
+                (true, 32) => i32::MIN as f64 - 1.,
+                (true, 64) => i64::MIN as f64,
+
+                (false, _) => -1.,
+                _ => unimplemented!(
+                    "unexpected {} output size of {} bits for 64-bit input",
+                    if signed { "signed" } else { "unsigned" },
+                    out_bits
+                ),
+            };
+
+            lower_constant_f64(self.lower_ctx, tmp, min);
+        } else {
+            unimplemented!(
+                "unexpected input size for min_fp_value: {} (signed: {}, output size: {})",
+                in_bits,
+                signed,
+                out_bits
+            );
+        }
+
+        tmp.to_reg()
+    }
+
+    fn max_fp_value(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
+        let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
+
+        if in_bits == 32 {
+            // From float32.
+            let max = match (signed, out_bits) {
+                (true, 8) => i8::MAX as f32 + 1.,
+                (true, 16) => i16::MAX as f32 + 1.,
+                (true, 32) => (i32::MAX as u64 + 1) as f32,
+                (true, 64) => (i64::MAX as u64 + 1) as f32,
+
+                (false, 8) => u8::MAX as f32 + 1.,
+                (false, 16) => u16::MAX as f32 + 1.,
+                (false, 32) => (u32::MAX as u64 + 1) as f32,
+                (false, 64) => (u64::MAX as u128 + 1) as f32,
+                _ => unimplemented!(
+                    "unexpected {} output size of {} bits for 32-bit input",
+                    if signed { "signed" } else { "unsigned" },
+                    out_bits
+                ),
+            };
+
+            lower_constant_f32(self.lower_ctx, tmp, max);
+        } else if in_bits == 64 {
+            // From float64.
+            let max = match (signed, out_bits) {
+                (true, 8) => i8::MAX as f64 + 1.,
+                (true, 16) => i16::MAX as f64 + 1.,
+                (true, 32) => i32::MAX as f64 + 1.,
+                (true, 64) => (i64::MAX as u64 + 1) as f64,
+
+                (false, 8) => u8::MAX as f64 + 1.,
+                (false, 16) => u16::MAX as f64 + 1.,
+                (false, 32) => u32::MAX as f64 + 1.,
+                (false, 64) => (u64::MAX as u128 + 1) as f64,
+                _ => unimplemented!(
+                    "unexpected {} output size of {} bits for 64-bit input",
+                    if signed { "signed" } else { "unsigned" },
+                    out_bits
+                ),
+            };
+
+            lower_constant_f64(self.lower_ctx, tmp, max);
+        } else {
+            unimplemented!(
+                "unexpected input size for max_fp_value: {} (signed: {}, output size: {})",
+                in_bits,
+                signed,
+                out_bits
+            );
+        }
+
+        tmp.to_reg()
+    }
+
+    fn min_fp_value_sat(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
+        let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
+
+        let min: f64 = match (out_bits, signed) {
+            (32, true) => i32::MIN as f64,
+            (32, false) => 0.0,
+            (64, true) => i64::MIN as f64,
+            (64, false) => 0.0,
+            _ => unimplemented!(
+                "unexpected {} output size of {} bits",
+                if signed { "signed" } else { "unsigned" },
+                out_bits
+            ),
+        };
+
+        if in_bits == 32 {
+            lower_constant_f32(self.lower_ctx, tmp, min as f32)
+        } else if in_bits == 64 {
+            lower_constant_f64(self.lower_ctx, tmp, min)
+        } else {
+            unimplemented!(
+                "unexpected input size for min_fp_value_sat: {} (signed: {}, output size: {})",
+                in_bits,
+                signed,
+                out_bits
+            );
+        }
+
+        tmp.to_reg()
+    }
+
+    fn max_fp_value_sat(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
+        let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
+
+        let max = match (out_bits, signed) {
+            (32, true) => i32::MAX as f64,
+            (32, false) => u32::MAX as f64,
+            (64, true) => i64::MAX as f64,
+            (64, false) => u64::MAX as f64,
+            _ => unimplemented!(
+                "unexpected {} output size of {} bits",
+                if signed { "signed" } else { "unsigned" },
+                out_bits
+            ),
+        };
+
+        if in_bits == 32 {
+            lower_constant_f32(self.lower_ctx, tmp, max as f32)
+        } else if in_bits == 64 {
+            lower_constant_f64(self.lower_ctx, tmp, max)
+        } else {
+            unimplemented!(
+                "unexpected input size for max_fp_value_sat: {} (signed: {}, output size: {})",
+                in_bits,
+                signed,
+                out_bits
+            );
+        }
+
+        tmp.to_reg()
+    }
+
+    fn fpu_op_ri_ushr(&mut self, ty_bits: u8, shift: u8) -> FPUOpRI {
+        if ty_bits == 32 {
+            FPUOpRI::UShr32(FPURightShiftImm::maybe_from_u8(shift, ty_bits).unwrap())
+        } else if ty_bits == 64 {
+            FPUOpRI::UShr64(FPURightShiftImm::maybe_from_u8(shift, ty_bits).unwrap())
+        } else {
+            unimplemented!(
+                "unexpected input size for fpu_op_ri_ushr: {} (shift: {})",
+                ty_bits,
+                shift
+            );
+        }
+    }
+
+    fn fpu_op_ri_sli(&mut self, ty_bits: u8, shift: u8) -> FPUOpRI {
+        if ty_bits == 32 {
+            FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(shift, ty_bits).unwrap())
+        } else if ty_bits == 64 {
+            FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(shift, ty_bits).unwrap())
+        } else {
+            unimplemented!(
+                "unexpected input size for fpu_op_ri_sli: {} (shift: {})",
+                ty_bits,
+                shift
+            );
+        }
+    }
 }
diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
index af5cf88a43f3..10f8f3516df0 100644
--- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs
@@ -2,10 +2,9 @@
 
 use super::lower::*;
 use crate::binemit::CodeOffset;
-use crate::ir::condcodes::FloatCC;
 use crate::ir::types::*;
 use crate::ir::Inst as IRInst;
-use crate::ir::{InstructionData, Opcode, TrapCode};
+use crate::ir::{InstructionData, Opcode};
 use crate::isa::aarch64::abi::*;
 use crate::isa::aarch64::inst::*;
 use crate::isa::aarch64::settings as aarch64_settings;
@@ -978,408 +977,13 @@ pub(crate) fn lower_insn_to_regs(
 
         Opcode::Fma => implemented_in_isle(ctx),
 
-        Opcode::Fcopysign => {
-            // Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence:
-            //
-            // This is a scalar Fcopysign.
-            // This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit.
-            // In the latter case it still sets all bits except the lowest 32 to 0.
-            //
-            //  mov vd, vn
-            //  ushr vtmp, vm, #63 / #31
-            //  sli vd, vtmp, #63 / #31
+        Opcode::Fcopysign => implemented_in_isle(ctx),
 
-            let ty = ctx.output_ty(insn, 0);
+        Opcode::FcvtToUint | Opcode::FcvtToSint => implemented_in_isle(ctx),
 
-            if ty != F32 && ty != F64 {
-                return Err(CodegenError::Unsupported(format!(
-                    "Fcopysign: Unsupported type: {:?}",
-                    ty
-                )));
-            }
-
-            let bits = ty_bits(ty) as u8;
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-            let tmp = ctx.alloc_tmp(F64).only_reg().unwrap();
-
-            // Copy LHS to rd.
-            ctx.emit(Inst::gen_move(rd, rn, ty));
-
-            // Copy the sign bit to the lowest bit in tmp.
-            let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
-            ctx.emit(Inst::FpuRRI {
-                fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)),
-                rd: tmp,
-                rn: rm,
-            });
-
-            // Insert the bit from tmp into the sign bit of rd.
-            let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap();
-            ctx.emit(Inst::FpuRRI {
-                fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)),
-                rd,
-                rn: tmp.to_reg(),
-            });
-        }
-
-        Opcode::FcvtToUint | Opcode::FcvtToSint => {
-            let input_ty = ctx.input_ty(insn, 0);
-            let in_bits = ty_bits(input_ty);
-            let output_ty = ty.unwrap();
-            let out_bits = ty_bits(output_ty);
-            let signed = op == Opcode::FcvtToSint;
-            let op = match (signed, in_bits, out_bits) {
-                (false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32,
-                (true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32,
-                (false, 32, 64) => FpuToIntOp::F32ToU64,
-                (true, 32, 64) => FpuToIntOp::F32ToI64,
-                (false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32,
-                (true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32,
-                (false, 64, 64) => FpuToIntOp::F64ToU64,
-                (true, 64, 64) => FpuToIntOp::F64ToI64,
-                _ => {
-                    return Err(CodegenError::Unsupported(format!(
-                        "{}: Unsupported types: {:?} -> {:?}",
-                        op, input_ty, output_ty
-                    )))
-                }
-            };
-
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-            // First, check the output: it's important to carry the NaN conversion before the
-            // in-bounds conversion, per wasm semantics.
-
-            // Check that the input is not a NaN.
-            ctx.emit(Inst::FpuCmp {
-                size: ScalarSize::from_ty(input_ty),
-                rn,
-                rm: rn,
-            });
-            let trap_code = TrapCode::BadConversionToInteger;
-            ctx.emit(Inst::TrapIf {
-                trap_code,
-                kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)),
-            });
-
-            let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap();
-
-            // Check that the input is in range, with "truncate towards zero" semantics. This means
-            // we allow values that are slightly out of range:
-            // - for signed conversions, we allow values strictly greater than INT_MIN-1 (when this
-            // can be represented), and strictly less than INT_MAX+1 (when this can be
-            // represented).
-            // - for unsigned conversions, we allow values strictly greater than -1, and strictly
-            // less than UINT_MAX+1 (when this can be represented).
-
-            if in_bits == 32 {
-                // From float32.
-                let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
-                    (true, 8) => (
-                        i8::min_value() as f32 - 1.,
-                        FloatCC::GreaterThan,
-                        i8::max_value() as f32 + 1.,
-                    ),
-                    (true, 16) => (
-                        i16::min_value() as f32 - 1.,
-                        FloatCC::GreaterThan,
-                        i16::max_value() as f32 + 1.,
-                    ),
-                    (true, 32) => (
-                        i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as a f32.
-                        FloatCC::GreaterThanOrEqual,
-                        i32::max_value() as f32 + 1.,
-                    ),
-                    (true, 64) => (
-                        i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as a f32.
-                        FloatCC::GreaterThanOrEqual,
-                        i64::max_value() as f32 + 1.,
-                    ),
-                    (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.),
-                    (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.),
-                    (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.),
-                    (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.),
-                    _ => unreachable!(),
-                };
-
-                // >= low_bound
-                lower_constant_f32(ctx, tmp, low_bound);
-                ctx.emit(Inst::FpuCmp {
-                    size: ScalarSize::Size32,
-                    rn,
-                    rm: tmp.to_reg(),
-                });
-                let trap_code = TrapCode::IntegerOverflow;
-                ctx.emit(Inst::TrapIf {
-                    trap_code,
-                    kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
-                });
-
-                // <= high_bound
-                lower_constant_f32(ctx, tmp, high_bound);
-                ctx.emit(Inst::FpuCmp {
-                    size: ScalarSize::Size32,
-                    rn,
-                    rm: tmp.to_reg(),
-                });
-                let trap_code = TrapCode::IntegerOverflow;
-                ctx.emit(Inst::TrapIf {
-                    trap_code,
-                    kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
-                });
-            } else {
-                // From float64.
-                let (low_bound, low_cond, high_bound) = match (signed, out_bits) {
-                    (true, 8) => (
-                        i8::min_value() as f64 - 1.,
-                        FloatCC::GreaterThan,
-                        i8::max_value() as f64 + 1.,
-                    ),
-                    (true, 16) => (
-                        i16::min_value() as f64 - 1.,
-                        FloatCC::GreaterThan,
-                        i16::max_value() as f64 + 1.,
-                    ),
-                    (true, 32) => (
-                        i32::min_value() as f64 - 1.,
-                        FloatCC::GreaterThan,
-                        i32::max_value() as f64 + 1.,
-                    ),
-                    (true, 64) => (
-                        i64::min_value() as f64, // I64_MIN - 1 is not precisely representable as an i64.
-                        FloatCC::GreaterThanOrEqual,
-                        i64::max_value() as f64 + 1.,
-                    ),
-                    (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.),
-                    (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.),
-                    (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.),
-                    (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.),
-                    _ => unreachable!(),
-                };
-
-                // >= low_bound
-                lower_constant_f64(ctx, tmp, low_bound);
-                ctx.emit(Inst::FpuCmp {
-                    size: ScalarSize::Size64,
-                    rn,
-                    rm: tmp.to_reg(),
-                });
-                let trap_code = TrapCode::IntegerOverflow;
-                ctx.emit(Inst::TrapIf {
-                    trap_code,
-                    kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()),
-                });
-
-                // <= high_bound
-                lower_constant_f64(ctx, tmp, high_bound);
-                ctx.emit(Inst::FpuCmp {
-                    size: ScalarSize::Size64,
-                    rn,
-                    rm: tmp.to_reg(),
-                });
-                let trap_code = TrapCode::IntegerOverflow;
-                ctx.emit(Inst::TrapIf {
-                    trap_code,
-                    kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()),
-                });
-            };
-
-            // Do the conversion.
-            ctx.emit(Inst::FpuToInt { op, rd, rn });
-        }
-
-        Opcode::FcvtFromUint | Opcode::FcvtFromSint => {
-            let input_ty = ctx.input_ty(insn, 0);
-            let ty = ty.unwrap();
-            let signed = op == Opcode::FcvtFromSint;
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
+        Opcode::FcvtFromUint | Opcode::FcvtFromSint => implemented_in_isle(ctx),
 
-            if ty.is_vector() {
-                if input_ty.lane_bits() != ty.lane_bits() {
-                    return Err(CodegenError::Unsupported(format!(
-                        "{}: Unsupported types: {:?} -> {:?}",
-                        op, input_ty, ty
-                    )));
-                }
-
-                let op = if signed {
-                    VecMisc2::Scvtf
-                } else {
-                    VecMisc2::Ucvtf
-                };
-                let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-
-                ctx.emit(Inst::VecMisc {
-                    op,
-                    rd,
-                    rn,
-                    size: VectorSize::from_ty(ty),
-                });
-            } else {
-                let in_bits = ty_bits(input_ty);
-                let out_bits = ty_bits(ty);
-                let op = match (signed, in_bits, out_bits) {
-                    (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32,
-                    (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32,
-                    (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64,
-                    (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64,
-                    (false, 64, 32) => IntToFpuOp::U64ToF32,
-                    (true, 64, 32) => IntToFpuOp::I64ToF32,
-                    (false, 64, 64) => IntToFpuOp::U64ToF64,
-                    (true, 64, 64) => IntToFpuOp::I64ToF64,
-                    _ => {
-                        return Err(CodegenError::Unsupported(format!(
-                            "{}: Unsupported types: {:?} -> {:?}",
-                            op, input_ty, ty
-                        )))
-                    }
-                };
-                let narrow_mode = match (signed, in_bits) {
-                    (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32,
-                    (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32,
-                    (false, 64) => NarrowValueMode::ZeroExtend64,
-                    (true, 64) => NarrowValueMode::SignExtend64,
-                    _ => unreachable!(),
-                };
-                let rn = put_input_in_reg(ctx, inputs[0], narrow_mode);
-                ctx.emit(Inst::IntToFpu { op, rd, rn });
-            }
-        }
-
-        Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => {
-            let in_ty = ctx.input_ty(insn, 0);
-            let ty = ty.unwrap();
-            let out_signed = op == Opcode::FcvtToSintSat;
-            let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None);
-            let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap();
-
-            if ty.is_vector() {
-                if in_ty.lane_bits() != ty.lane_bits() {
-                    return Err(CodegenError::Unsupported(format!(
-                        "{}: Unsupported types: {:?} -> {:?}",
-                        op, in_ty, ty
-                    )));
-                }
-
-                let op = if out_signed {
-                    VecMisc2::Fcvtzs
-                } else {
-                    VecMisc2::Fcvtzu
-                };
-
-                ctx.emit(Inst::VecMisc {
-                    op,
-                    rd,
-                    rn,
-                    size: VectorSize::from_ty(ty),
-                });
-            } else {
-                let in_bits = ty_bits(in_ty);
-                let out_bits = ty_bits(ty);
-                // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX
-                // FMIN Vtmp2, Vin, Vtmp1
-                // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN
-                // FMAX Vtmp2, Vtmp2, Vtmp1
-                // (if signed) FIMM Vtmp1, 0
-                // FCMP Vin, Vin
-                // FCSEL Vtmp2, Vtmp1, Vtmp2, NE  // on NaN, select 0
-                // convert Rout, Vtmp2
-
-                assert!(in_ty.is_float() && (in_bits == 32 || in_bits == 64));
-                assert!(out_bits == 32 || out_bits == 64);
-
-                let min: f64 = match (out_bits, out_signed) {
-                    (32, true) => std::i32::MIN as f64,
-                    (32, false) => 0.0,
-                    (64, true) => std::i64::MIN as f64,
-                    (64, false) => 0.0,
-                    _ => unreachable!(),
-                };
-
-                let max = match (out_bits, out_signed) {
-                    (32, true) => std::i32::MAX as f64,
-                    (32, false) => std::u32::MAX as f64,
-                    (64, true) => std::i64::MAX as f64,
-                    (64, false) => std::u64::MAX as f64,
-                    _ => unreachable!(),
-                };
-
-                let rtmp1 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
-                let rtmp2 = ctx.alloc_tmp(in_ty).only_reg().unwrap();
-
-                if in_bits == 32 {
-                    lower_constant_f32(ctx, rtmp1, max as f32);
-                } else {
-                    lower_constant_f64(ctx, rtmp1, max);
-                }
-                ctx.emit(Inst::FpuRRR {
-                    fpu_op: FPUOp2::Min,
-                    size: ScalarSize::from_ty(in_ty),
-                    rd: rtmp2,
-                    rn,
-                    rm: rtmp1.to_reg(),
-                });
-                if in_bits == 32 {
-                    lower_constant_f32(ctx, rtmp1, min as f32);
-                } else {
-                    lower_constant_f64(ctx, rtmp1, min);
-                }
-                ctx.emit(Inst::FpuRRR {
-                    fpu_op: FPUOp2::Max,
-                    size: ScalarSize::from_ty(in_ty),
-                    rd: rtmp2,
-                    rn: rtmp2.to_reg(),
-                    rm: rtmp1.to_reg(),
-                });
-                if out_signed {
-                    if in_bits == 32 {
-                        lower_constant_f32(ctx, rtmp1, 0.0);
-                    } else {
-                        lower_constant_f64(ctx, rtmp1, 0.0);
-                    }
-                }
-                ctx.emit(Inst::FpuCmp {
-                    size: ScalarSize::from_ty(in_ty),
-                    rn,
-                    rm: rn,
-                });
-                if in_bits == 32 {
-                    ctx.emit(Inst::FpuCSel32 {
-                        rd: rtmp2,
-                        rn: rtmp1.to_reg(),
-                        rm: rtmp2.to_reg(),
-                        cond: Cond::Ne,
-                    });
-                } else {
-                    ctx.emit(Inst::FpuCSel64 {
-                        rd: rtmp2,
-                        rn: rtmp1.to_reg(),
-                        rm: rtmp2.to_reg(),
-                        cond: Cond::Ne,
-                    });
-                }
-
-                let cvt = match (in_bits, out_bits, out_signed) {
-                    (32, 32, false) => FpuToIntOp::F32ToU32,
-                    (32, 32, true) => FpuToIntOp::F32ToI32,
-                    (32, 64, false) => FpuToIntOp::F32ToU64,
-                    (32, 64, true) => FpuToIntOp::F32ToI64,
-                    (64, 32, false) => FpuToIntOp::F64ToU32,
-                    (64, 32, true) => FpuToIntOp::F64ToI32,
-                    (64, 64, false) => FpuToIntOp::F64ToU64,
-                    (64, 64, true) => FpuToIntOp::F64ToI64,
-                    _ => unreachable!(),
-                };
-                ctx.emit(Inst::FpuToInt {
-                    op: cvt,
-                    rd,
-                    rn: rtmp2.to_reg(),
-                });
-            }
-        }
+        Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => implemented_in_isle(ctx),
 
         Opcode::IaddIfcout => {
             // This is a two-output instruction that is needed for the
diff --git a/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif b/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif
index 0755c94feba6..8dbaf3e1c802 100644
--- a/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif
@@ -9,8 +9,8 @@ block0(v0: i8):
 }
 
 ; block0:
-;   uxtb w4, w0
-;   ucvtf s0, w4
+;   uxtb w3, w0
+;   ucvtf s0, w3
 ;   ret
 
 function u0:0(i8) -> f64 {
@@ -20,8 +20,8 @@ block0(v0: i8):
 }
 
 ; block0:
-;   uxtb w4, w0
-;   ucvtf d0, w4
+;   uxtb w3, w0
+;   ucvtf d0, w3
 ;   ret
 
 function u0:0(i16) -> f32 {
@@ -31,8 +31,8 @@ block0(v0: i16):
 }
 
 ; block0:
-;   uxth w4, w0
-;   ucvtf s0, w4
+;   uxth w3, w0
+;   ucvtf s0, w3
 ;   ret
 
 function u0:0(i16) -> f64 {
@@ -42,8 +42,8 @@ block0(v0: i16):
 }
 
 ; block0:
-;   uxth w4, w0
-;   ucvtf d0, w4
+;   uxth w3, w0
+;   ucvtf d0, w3
 ;   ret
 
 function u0:0(f32) -> i8 {
@@ -55,13 +55,13 @@ block0(v0: f32):
 ; block0:
 ;   fcmp s0, s0
 ;   b.vc 8 ; udf
-;   fmov s6, #-1
-;   fcmp s0, s6
+;   fmov s5, #-1
+;   fcmp s0, s5
 ;   b.gt 8 ; udf
 ;   movz x10, #17280, LSL #16
-;   fmov s6, w10
-;   fcmp s0, s6
-;   b.mi 8 ; udf
+;   fmov s18, w10
+;   fcmp s0, s18
+;   b.lt 8 ; udf
 ;   fcvtzu w0, s0
 ;   ret
 
@@ -74,13 +74,13 @@ block0(v0: f64):
 ; block0:
 ;   fcmp d0, d0
 ;   b.vc 8 ; udf
-;   fmov d6, #-1
-;   fcmp d0, d6
+;   fmov d5, #-1
+;   fcmp d0, d5
 ;   b.gt 8 ; udf
 ;   movz x10, #16496, LSL #48
-;   fmov d6, x10
-;   fcmp d0, d6
-;   b.mi 8 ; udf
+;   fmov d18, x10
+;   fcmp d0, d18
+;   b.lt 8 ; udf
 ;   fcvtzu w0, d0
 ;   ret
 
@@ -93,13 +93,13 @@ block0(v0: f32):
 ; block0:
 ;   fcmp s0, s0
 ;   b.vc 8 ; udf
-;   fmov s6, #-1
-;   fcmp s0, s6
+;   fmov s5, #-1
+;   fcmp s0, s5
 ;   b.gt 8 ; udf
 ;   movz x10, #18304, LSL #16
-;   fmov s6, w10
-;   fcmp s0, s6
-;   b.mi 8 ; udf
+;   fmov s18, w10
+;   fcmp s0, s18
+;   b.lt 8 ; udf
 ;   fcvtzu w0, s0
 ;   ret
 
@@ -112,13 +112,13 @@ block0(v0: f64):
 ; block0:
 ;   fcmp d0, d0
 ;   b.vc 8 ; udf
-;   fmov d6, #-1
-;   fcmp d0, d6
+;   fmov d5, #-1
+;   fcmp d0, d5
 ;   b.gt 8 ; udf
 ;   movz x10, #16624, LSL #48
-;   fmov d6, x10
-;   fcmp d0, d6
-;   b.mi 8 ; udf
+;   fmov d18, x10
+;   fcmp d0, d18
+;   b.lt 8 ; udf
 ;   fcvtzu w0, d0
 ;   ret
 
diff --git a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
index fc7df58b2fd9..16f38886a239 100644
--- a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
+++ b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif
@@ -333,13 +333,13 @@ block0(v0: f32):
 ; block0:
 ;   fcmp s0, s0
 ;   b.vc 8 ; udf
-;   fmov s6, #-1
-;   fcmp s0, s6
+;   fmov s5, #-1
+;   fcmp s0, s5
 ;   b.gt 8 ; udf
 ;   movz x10, #20352, LSL #16
-;   fmov s6, w10
-;   fcmp s0, s6
-;   b.mi 8 ; udf
+;   fmov s18, w10
+;   fcmp s0, s18
+;   b.lt 8 ; udf
 ;   fcvtzu w0, s0
 ;   ret
 
@@ -352,14 +352,14 @@ block0(v0: f32):
 ; block0:
 ;   fcmp s0, s0
 ;   b.vc 8 ; udf
-;   movz x7, #52992, LSL #16
-;   fmov s7, w7
-;   fcmp s0, s7
+;   movz x6, #52992, LSL #16
+;   fmov s6, w6
+;   fcmp s0, s6
 ;   b.ge 8 ; udf
 ;   movz x12, #20224, LSL #16
-;   fmov s7, w12
-;   fcmp s0, s7
-;   b.mi 8 ; udf
+;   fmov s20, w12
+;   fcmp s0, s20
+;   b.lt 8 ; udf
 ;   fcvtzs w0, s0
 ;   ret
 
@@ -372,13 +372,13 @@ block0(v0: f32):
 ; block0:
 ;   fcmp s0, s0
 ;   b.vc 8 ; udf
-;   fmov s6, #-1
-;   fcmp s0, s6
+;   fmov s5, #-1
+;   fcmp s0, s5
 ;   b.gt 8 ; udf
 ;   movz x10, #24448, LSL #16
-;   fmov s6, w10
-;   fcmp s0, s6
-;   b.mi 8 ; udf
+;   fmov s18, w10
+;   fcmp s0, s18
+;   b.lt 8 ; udf
 ;   fcvtzu x0, s0
 ;   ret
 
@@ -391,14 +391,14 @@ block0(v0: f32):
 ; block0:
 ;   fcmp s0, s0
 ;   b.vc 8 ; udf
-;   movz x7, #57088, LSL #16
-;   fmov s7, w7
-;   fcmp s0, s7
+;   movz x6, #57088, LSL #16
+;   fmov s6, w6
+;   fcmp s0, s6
 ;   b.ge 8 ; udf
 ;   movz x12, #24320, LSL #16
-;   fmov s7, w12
-;   fcmp s0, s7
-;   b.mi 8 ; udf
+;   fmov s20, w12
+;   fcmp s0, s20
+;   b.lt 8 ; udf
 ;   fcvtzs x0, s0
 ;   ret
 
@@ -411,13 +411,13 @@ block0(v0: f64):
 ; block0:
 ;   fcmp d0, d0
 ;   b.vc 8 ; udf
-;   fmov d6, #-1
-;   fcmp d0, d6
+;   fmov d5, #-1
+;   fcmp d0, d5
 ;   b.gt 8 ; udf
 ;   movz x10, #16880, LSL #48
-;   fmov d6, x10
-;   fcmp d0, d6
-;   b.mi 8 ; udf
+;   fmov d18, x10
+;   fcmp d0, d18
+;   b.lt 8 ; udf
 ;   fcvtzu w0, d0
 ;   ret
 
@@ -430,13 +430,13 @@ block0(v0: f64):
 ; block0:
 ;   fcmp d0, d0
 ;   b.vc 8 ; udf
-;   ldr d6, pc+8 ; b 12 ; data.f64 -2147483649
-;   fcmp d0, d6
+;   ldr d5, pc+8 ; b 12 ; data.f64 -2147483649
+;   fcmp d0, d5
 ;   b.gt 8 ; udf
 ;   movz x10, #16864, LSL #48
-;   fmov d6, x10
-;   fcmp d0, d6
-;   b.mi 8 ; udf
+;   fmov d18, x10
+;   fcmp d0, d18
+;   b.lt 8 ; udf
 ;   fcvtzs w0, d0
 ;   ret
 
@@ -449,13 +449,13 @@ block0(v0: f64):
 ; block0:
 ;   fcmp d0, d0
 ;   b.vc 8 ; udf
-;   fmov d6, #-1
-;   fcmp d0, d6
+;   fmov d5, #-1
+;   fcmp d0, d5
 ;   b.gt 8 ; udf
 ;   movz x10, #17392, LSL #48
-;   fmov d6, x10
-;   fcmp d0, d6
-;   b.mi 8 ; udf
+;   fmov d18, x10
+;   fcmp d0, d18
+;   b.lt 8 ; udf
 ;   fcvtzu x0, d0
 ;   ret
 
@@ -468,14 +468,14 @@ block0(v0: f64):
 ; block0:
 ;   fcmp d0, d0
 ;   b.vc 8 ; udf
-;   movz x7, #50144, LSL #48
-;   fmov d7, x7
-;   fcmp d0, d7
+;   movz x6, #50144, LSL #48
+;   fmov d6, x6
+;   fcmp d0, d6
 ;   b.ge 8 ; udf
 ;   movz x12, #17376, LSL #48
-;   fmov d7, x12
-;   fcmp d0, d7
-;   b.mi 8 ; udf
+;   fmov d20, x12
+;   fcmp d0, d20
+;   b.lt 8 ; udf
 ;   fcvtzs x0, d0
 ;   ret
 
@@ -566,14 +566,14 @@ block0(v0: f32):
 }
 
 ; block0:
-;   movz x6, #20352, LSL #16
-;   fmov s5, w6
-;   fmin s7, s0, s5
-;   movi v5.2s, #0
-;   fmax s7, s7, s5
+;   movz x4, #20352, LSL #16
+;   fmov s4, w4
+;   fmin s7, s0, s4
+;   movi v17.2s, #0
+;   fmax s19, s7, s17
 ;   fcmp s0, s0
-;   fcsel s7, s5, s7, ne
-;   fcvtzu w0, s7
+;   fcsel s22, s17, s19, ne
+;   fcvtzu w0, s22
 ;   ret
 
 function %f50(f32) -> i32 {
@@ -583,16 +583,16 @@ block0(v0: f32):
 }
 
 ; block0:
-;   movz x6, #20224, LSL #16
-;   fmov s5, w6
-;   fmin s7, s0, s5
+;   movz x4, #20224, LSL #16
+;   fmov s4, w4
+;   fmin s7, s0, s4
 ;   movz x10, #52992, LSL #16
-;   fmov s5, w10
-;   fmax s7, s7, s5
-;   movi v5.2s, #0
+;   fmov s18, w10
+;   fmax s21, s7, s18
+;   movi v23.16b, #0
 ;   fcmp s0, s0
-;   fcsel s7, s5, s7, ne
-;   fcvtzs w0, s7
+;   fcsel s26, s23, s21, ne
+;   fcvtzs w0, s26
 ;   ret
 
 function %f51(f32) -> i64 {
@@ -602,14 +602,14 @@ block0(v0: f32):
 }
 
 ; block0:
-;   movz x6, #24448, LSL #16
-;   fmov s5, w6
-;   fmin s7, s0, s5
-;   movi v5.2s, #0
-;   fmax s7, s7, s5
+;   movz x4, #24448, LSL #16
+;   fmov s4, w4
+;   fmin s7, s0, s4
+;   movi v17.2s, #0
+;   fmax s19, s7, s17
 ;   fcmp s0, s0
-;   fcsel s7, s5, s7, ne
-;   fcvtzu x0, s7
+;   fcsel s22, s17, s19, ne
+;   fcvtzu x0, s22
 ;   ret
 
 function %f52(f32) -> i64 {
@@ -619,16 +619,16 @@ block0(v0: f32):
 }
 
 ; block0:
-;   movz x6, #24320, LSL #16
-;   fmov s5, w6
-;   fmin s7, s0, s5
+;   movz x4, #24320, LSL #16
+;   fmov s4, w4
+;   fmin s7, s0, s4
 ;   movz x10, #57088, LSL #16
-;   fmov s5, w10
-;   fmax s7, s7, s5
-;   movi v5.2s, #0
+;   fmov s18, w10
+;   fmax s21, s7, s18
+;   movi v23.16b, #0
 ;   fcmp s0, s0
-;   fcsel s7, s5, s7, ne
-;   fcvtzs x0, s7
+;   fcsel s26, s23, s21, ne
+;   fcvtzs x0, s26
 ;   ret
 
 function %f53(f64) -> i32 {
@@ -638,13 +638,13 @@ block0(v0: f64):
 }
 
 ; block0:
-;   ldr d4, pc+8 ; b 12 ; data.f64 4294967295
-;   fmin d6, d0, d4
-;   movi v4.2s, #0
-;   fmax d6, d6, d4
+;   ldr d3, pc+8 ; b 12 ; data.f64 4294967295
+;   fmin d5, d0, d3
+;   movi v7.2s, #0
+;   fmax d17, d5, d7
 ;   fcmp d0, d0
-;   fcsel d6, d4, d6, ne
-;   fcvtzu w0, d6
+;   fcsel d20, d7, d17, ne
+;   fcvtzu w0, d20
 ;   ret
 
 function %f54(f64) -> i32 {
@@ -654,15 +654,15 @@ block0(v0: f64):
 }
 
 ; block0:
-;   ldr d4, pc+8 ; b 12 ; data.f64 2147483647
-;   fmin d6, d0, d4
+;   ldr d3, pc+8 ; b 12 ; data.f64 2147483647
+;   fmin d5, d0, d3
 ;   movz x8, #49632, LSL #48
-;   fmov d4, x8
-;   fmax d6, d6, d4
-;   movi v4.2s, #0
+;   fmov d16, x8
+;   fmax d19, d5, d16
+;   movi v21.16b, #0
 ;   fcmp d0, d0
-;   fcsel d6, d4, d6, ne
-;   fcvtzs w0, d6
+;   fcsel d24, d21, d19, ne
+;   fcvtzs w0, d24
 ;   ret
 
 function %f55(f64) -> i64 {
@@ -672,14 +672,14 @@ block0(v0: f64):
 }
 
 ; block0:
-;   movz x6, #17392, LSL #48
-;   fmov d5, x6
-;   fmin d7, d0, d5
-;   movi v5.2s, #0
-;   fmax d7, d7, d5
+;   movz x4, #17392, LSL #48
+;   fmov d4, x4
+;   fmin d7, d0, d4
+;   movi v17.2s, #0
+;   fmax d19, d7, d17
 ;   fcmp d0, d0
-;   fcsel d7, d5, d7, ne
-;   fcvtzu x0, d7
+;   fcsel d22, d17, d19, ne
+;   fcvtzu x0, d22
 ;   ret
 
 function %f56(f64) -> i64 {
@@ -689,16 +689,16 @@ block0(v0: f64):
 }
 
 ; block0:
-;   movz x6, #17376, LSL #48
-;   fmov d5, x6
-;   fmin d7, d0, d5
+;   movz x4, #17376, LSL #48
+;   fmov d4, x4
+;   fmin d7, d0, d4
 ;   movz x10, #50144, LSL #48
-;   fmov d5, x10
-;   fmax d7, d7, d5
-;   movi v5.2s, #0
+;   fmov d18, x10
+;   fmax d21, d7, d18
+;   movi v23.16b, #0
 ;   fcmp d0, d0
-;   fcsel d7, d5, d7, ne
-;   fcvtzs x0, d7
+;   fcsel d26, d23, d21, ne
+;   fcvtzs x0, d26
 ;   ret
 
 function %f57(f32x2) -> f32x2 {
@@ -946,3 +946,36 @@ block0(v0: f64x2, v1: f64x2, v2: f64x2):
 ;   mov v0.16b, v2.16b
 ;   fmla v0.2d, v17.2d, v1.2d
 ;   ret
+
+function %f81(f32x2, f32x2) -> f32x2 {
+block0(v0: f32x2, v1: f32x2):
+  v2 = fcopysign v0, v1
+  return v2
+}
+
+; block0:
+;   ushr v7.2s, v1.2s, #31
+;   sli v0.2s, v7.2s, #31
+;   ret
+
+function %f82(f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4):
+  v2 = fcopysign v0, v1
+  return v2
+}
+
+; block0:
+;   ushr v7.4s, v1.4s, #31
+;   sli v0.4s, v7.4s, #31
+;   ret
+
+function %f83(f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2):
+  v2 = fcopysign v0, v1
+  return v2
+}
+
+; block0:
+;   ushr v7.2d, v1.2d, #63
+;   sli v0.2d, v7.2d, #63
+;   ret
diff --git a/cranelift/filetests/filetests/runtests/simd-fcopysign-64bit.clif b/cranelift/filetests/filetests/runtests/simd-fcopysign-64bit.clif
new file mode 100644
index 000000000000..253e4e74d6e8
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-fcopysign-64bit.clif
@@ -0,0 +1,37 @@
+test interpret
+test run
+target aarch64
+; x86_64 and s390x do not support 64-bit vectors in `fcopysign`.
+
+function %fcopysign_f32x2(f32x2, f32x2) -> f32x2 {
+block0(v0: f32x2, v1: f32x2):
+    v2 = fcopysign v0, v1
+    return v2
+}
+; run: %fcopysign_f32x2([0x9.0 -0x9.0], [0x9.0 0x9.0]) == [0x9.0 0x9.0]
+; run: %fcopysign_f32x2([0x9.0 -0x9.0], [-0x9.0 -0x9.0]) == [-0x9.0 -0x9.0]
+; run: %fcopysign_f32x2([0x0.0 -0x0.0], [-0x0.0 0x0.0]) == [-0x0.0 0x0.0]
+
+; F32 Inf
+; run: %fcopysign_f32x2([Inf -Inf], [Inf Inf]) == [Inf Inf]
+; run: %fcopysign_f32x2([Inf -Inf], [-Inf -Inf]) == [-Inf -Inf]
+
+; F32 Epsilon  / Max / Min Positive
+; run: %fcopysign_f32x2([0x1.000000p-23 -0x1.000000p-23], [-0x0.0 0x0.0]) == [-0x1.000000p-23 0x1.000000p-23]
+; run: %fcopysign_f32x2([0x1.fffffep127 -0x1.fffffep127], [-0x0.0 0x0.0]) == [-0x1.fffffep127 0x1.fffffep127]
+; run: %fcopysign_f32x2([0x1.000000p-126 -0x1.000000p-126], [-0x0.0 0x0.0]) == [-0x1.000000p-126 0x1.000000p-126]
+
+; F32 Subnormals
+; run: %fcopysign_f32x2([0x0.800000p-126 -0x0.800000p-126], [-0x0.0 0x0.0]) == [-0x0.800000p-126 0x0.800000p-126]
+; run: %fcopysign_f32x2([0x0.000002p-126 -0x0.000002p-126], [-0x0.0 0x0.0]) == [-0x0.000002p-126 0x0.000002p-126]
+
+; F32 NaN's
+; Unlike other operations, fcopysign is guaranteed to affect only the sign bit.
+; run: %fcopysign_f32x2([0x0.0 0x3.0], [-NaN +sNaN:0x1]) == [-0x0.0 0x3.0]
+; run: %fcopysign_f32x2([Inf +NaN], [-NaN -NaN]) == [-Inf -NaN]
+; run: %fcopysign_f32x2([-NaN +NaN:0x0], [+NaN -NaN]) == [+NaN -NaN:0x0]
+; run: %fcopysign_f32x2([+NaN:0x1 +NaN:0x300001], [-NaN -NaN]) == [-NaN:0x1 -NaN:0x300001]
+; run: %fcopysign_f32x2([-NaN:0x0 -NaN:0x1], [+NaN +NaN]) == [+NaN:0x0 +NaN:0x1]
+; run: %fcopysign_f32x2([-NaN:0x300001 +sNaN:0x1], [+NaN -NaN]) == [+NaN:0x300001 -sNaN:0x1]
+; run: %fcopysign_f32x2([-sNaN:0x1 +sNaN:0x200001], [+NaN -NaN]) == [+sNaN:0x1 -sNaN:0x200001]
+; run: %fcopysign_f32x2([-sNaN:0x200001 -sNaN:0x200001], [+NaN +NaN]) == [+sNaN:0x200001 +sNaN:0x200001]
diff --git a/cranelift/filetests/filetests/runtests/simd-fcopysign.clif b/cranelift/filetests/filetests/runtests/simd-fcopysign.clif
new file mode 100644
index 000000000000..331301038785
--- /dev/null
+++ b/cranelift/filetests/filetests/runtests/simd-fcopysign.clif
@@ -0,0 +1,63 @@
+test interpret
+test run
+target s390x
+target aarch64
+; x86_64 does not support SIMD fcopysign.
+
+function %fcopysign_f32x4(f32x4, f32x4) -> f32x4 {
+block0(v0: f32x4, v1: f32x4):
+    v2 = fcopysign v0, v1
+    return v2
+}
+; run: %fcopysign_f32x4([0x9.0 -0x9.0 0x9.0 -0x9.0], [0x9.0 0x9.0 -0x9.0 -0x9.0]) == [0x9.0 0x9.0 -0x9.0 -0x9.0]
+; run: %fcopysign_f32x4([0x0.0 -0x0.0 0x0.0 -0x0.0], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x0.0 0x0.0 -0x0.0 0x0.0]
+
+; F32 Inf
+; run: %fcopysign_f32x4([Inf -Inf Inf -Inf], [Inf Inf -Inf -Inf]) == [Inf Inf -Inf -Inf]
+
+; F32 Epsilon / Max / Min Positive
+; run: %fcopysign_f32x4([0x1.000000p-23 -0x1.000000p-23 0x1.fffffep127 -0x1.fffffep127], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x1.000000p-23 0x1.000000p-23 -0x1.fffffep127 0x1.fffffep127]
+; run: %fcopysign_f32x4([0x1.000000p-126 -0x1.000000p-126 0x1.000000p-126 -0x1.000000p-126], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x1.000000p-126 0x1.000000p-126 -0x1.000000p-126 0x1.000000p-126]
+
+; F32 Subnormals
+; run: %fcopysign_f32x4([0x0.800000p-126 -0x0.800000p-126 0x0.000002p-126 -0x0.000002p-126], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x0.800000p-126 0x0.800000p-126 -0x0.000002p-126 0x0.000002p-126]
+
+; F32 NaN's
+; Unlike with other operations, fcopysign is guaranteed to only affect the sign bit
+; run: %fcopysign_f32x4([0x0.0 0x3.0 Inf +NaN], [-NaN +sNaN:0x1 -NaN -NaN]) == [-0x0.0 0x3.0 -Inf -NaN]
+; run: %fcopysign_f32x4([-NaN +NaN:0x0 +NaN:0x1 +NaN:0x300001], [+NaN -NaN -NaN -NaN]) == [+NaN -NaN:0x0 -NaN:0x1 -NaN:0x300001]
+; run: %fcopysign_f32x4([-NaN:0x0 -NaN:0x1 -NaN:0x300001 +sNaN:0x1], [+NaN +NaN +NaN -NaN]) == [+NaN:0x0 +NaN:0x1 +NaN:0x300001 -sNaN:0x1]
+; run: %fcopysign_f32x4([-sNaN:0x1 +sNaN:0x200001 -sNaN:0x200001 -sNaN:0x200001], [+NaN -NaN +NaN +NaN]) == [+sNaN:0x1 -sNaN:0x200001 +sNaN:0x200001 +sNaN:0x200001]
+
+function %fcopysign_f64x2(f64x2, f64x2) -> f64x2 {
+block0(v0: f64x2, v1: f64x2):
+    v2 = fcopysign v0, v1
+    return v2
+}
+; run: %fcopysign_f64x2([0x9.0 -0x9.0], [0x9.0 0x9.0]) == [0x9.0 0x9.0]
+; run: %fcopysign_f64x2([0x9.0 -0x9.0], [-0x9.0 -0x9.0]) == [-0x9.0 -0x9.0]
+; run: %fcopysign_f64x2([0x0.0 -0x0.0], [-0x0.0 0x0.0]) == [-0x0.0 0x0.0]
+
+; F64 Inf
+; run: %fcopysign_f64x2([Inf -Inf], [Inf Inf]) == [Inf Inf]
+; run: %fcopysign_f64x2([Inf -Inf], [-Inf -Inf]) == [-Inf -Inf]
+
+; F64 Epsilon / Max / Min Positive
+; run: %fcopysign_f64x2([0x1.0000000000000p-52 -0x1.0000000000000p-52], [-0x0.0 0x0.0]) == [-0x1.0000000000000p-52 0x1.0000000000000p-52]
+; run: %fcopysign_f64x2([0x1.fffffffffffffp1023 -0x1.fffffffffffffp1023], [-0x0.0 0x0.0]) == [-0x1.fffffffffffffp1023 0x1.fffffffffffffp1023]
+; run: %fcopysign_f64x2([0x1.0000000000000p-1022 -0x1.0000000000000p-1022], [-0x0.0 0x0.0]) == [-0x1.0000000000000p-1022 0x1.0000000000000p-1022]
+
+; F64 Subnormals
+; run: %fcopysign_f64x2([0x0.8000000000000p-1022 -0x0.8000000000000p-1022], [-0x0.0 0x0.0]) == [-0x0.8000000000000p-1022 0x0.8000000000000p-1022]
+; run: %fcopysign_f64x2([0x0.0000000000001p-1022 -0x0.0000000000001p-1022], [-0x0.0 0x0.0]) == [-0x0.0000000000001p-1022 0x0.0000000000001p-1022]
+
+; F64 NaN's
+; Unlike with other operations, fcopysign is guaranteed to only affect the sign bit
+; run: %fcopysign_f64x2([0x0.0 0x3.0], [-NaN +sNaN:0x1]) == [-0x0.0 0x3.0]
+; run: %fcopysign_f64x2([Inf +NaN], [-NaN -NaN]) == [-Inf -NaN]
+; run: %fcopysign_f64x2([-NaN +NaN:0x0], [+NaN -NaN]) == [+NaN -NaN:0x0]
+; run: %fcopysign_f64x2([+NaN:0x1 +NaN:0x4000000000001], [-NaN -NaN]) == [-NaN:0x1 -NaN:0x4000000000001]
+; run: %fcopysign_f64x2([-NaN:0x0 -NaN:0x1], [+NaN +NaN]) == [+NaN:0x0 +NaN:0x1]
+; run: %fcopysign_f64x2([-NaN:0x4000000000001 +sNaN:0x1], [+NaN -NaN]) == [+NaN:0x4000000000001 -sNaN:0x1]
+; run: %fcopysign_f64x2([-sNaN:0x1 +sNaN:0x4000000000001], [+NaN -NaN]) == [+sNaN:0x1 -sNaN:0x4000000000001]
+; run: %fcopysign_f64x2([-sNaN:0x4000000000001 -sNaN:0x4000000000001], [+NaN +NaN]) == [+sNaN:0x4000000000001 +sNaN:0x4000000000001]
diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs
index 8b3860acdce4..c30dd92b6408 100644
--- a/cranelift/interpreter/src/step.rs
+++ b/cranelift/interpreter/src/step.rs
@@ -808,7 +808,19 @@ where
         }
         Opcode::Fneg => assign(Value::neg(arg(0)?)?),
         Opcode::Fabs => assign(Value::abs(arg(0)?)?),
-        Opcode::Fcopysign => binary(Value::copysign, arg(0)?, arg(1)?)?,
+        Opcode::Fcopysign => {
+            let arg0 = extractlanes(&arg(0)?, ctrl_ty)?;
+            let arg1 = extractlanes(&arg(1)?, ctrl_ty)?;
+
+            assign(vectorizelanes(
+                &arg0
+                    .into_iter()
+                    .zip(arg1.into_iter())
+                    .map(|(x, y)| V::copysign(x, y))
+                    .collect::<ValueResult<SimdVec<V>>>()?,
+                ctrl_ty,
+            )?)
+        }
         Opcode::Fmin => assign(match (arg(0)?, arg(1)?) {
             (a, _) if a.is_nan()? => a,
             (_, b) if b.is_nan()? => b,

From 74fee50460ec78bcfcc4b8bca5a89084c610ebe6 Mon Sep 17 00:00:00 2001
From: dheaton-arm <Damian.Heaton@arm.com>
Date: Wed, 24 Aug 2022 11:37:53 +0100
Subject: [PATCH 2/2] Document helpers and abstract conversion checks

---
 cranelift/codegen/src/isa/aarch64/inst.isle | 127 +++++++++++++-------
 1 file changed, 82 insertions(+), 45 deletions(-)

diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index f52b90f23a3c..db111ba12e47 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -1401,21 +1401,45 @@
 (decl u64_into_imm_logic (Type u64) ImmLogic)
 (extern constructor u64_into_imm_logic u64_into_imm_logic)
 
+;; Calculate the minimum floating-point bound for a conversion from floating
+;; point to an integer type.
+;; Accepts whether the output is signed, the size of the input
+;; floating point type in bits, and the size of the output integer type
+;; in bits.
 (decl min_fp_value (bool u8 u8) Reg)
 (extern constructor min_fp_value min_fp_value)
 
+;; Calculate the maximum floating-point bound for a conversion from floating
+;; point to an integer type.
+;; Accepts whether the output is signed, the size of the input
+;; floating point type in bits, and the size of the output integer type
+;; in bits.
 (decl max_fp_value (bool u8 u8) Reg)
 (extern constructor max_fp_value max_fp_value)
 
+;; Calculate the minimum acceptable floating-point value for a conversion from
+;; floating point to an integer type.
+;; Accepts whether the output is signed, the size of the input
+;; floating point type in bits, and the size of the output integer type
+;; in bits.
 (decl min_fp_value_sat (bool u8 u8) Reg)
 (extern constructor min_fp_value_sat min_fp_value_sat)
 
+;; Calculate the maximum acceptable floating-point value for a conversion from
+;; floating point to an integer type.
+;; Accepts whether the output is signed, the size of the input
+;; floating point type in bits, and the size of the output integer type
+;; in bits.
 (decl max_fp_value_sat (bool u8 u8) Reg)
 (extern constructor max_fp_value_sat max_fp_value_sat)
 
+;; Constructs an FPUOpRI.Ushr* given the size in bits of the value (or lane)
+;; and the amount to shift by.
 (decl fpu_op_ri_ushr (u8 u8) FPUOpRI)
 (extern constructor fpu_op_ri_ushr fpu_op_ri_ushr)
 
+;; Constructs an FPUOpRI.Sli* given the size in bits of the value (or lane)
+;; and the amount to shift by.
 (decl fpu_op_ri_sli (u8 u8) FPUOpRI)
 (extern constructor fpu_op_ri_sli fpu_op_ri_sli)
 
@@ -2675,43 +2699,62 @@
 
 ;; Helpers for generating `MInst.FpuToInt` instructions.
 
-(decl fpu_to_int_nan_check () ConsumesFlags)
-(rule (fpu_to_int_nan_check)
-      (ConsumesFlags.ConsumesFlagsSideEffect
-      (MInst.TrapIf (cond_br_cond (Cond.Vs))
-                    (trap_code_bad_conversion_to_integer))))
-
-;; Emits the appropriate flag-reading op for an underflow check,
+(decl fpu_to_int_nan_check (ScalarSize Reg) Reg)
+(rule (fpu_to_int_nan_check size src)
+      (let ((r ValueRegs
+                  (with_flags (fpu_cmp size src src)
+                   (ConsumesFlags.ConsumesFlagsReturnsReg
+                    (MInst.TrapIf (cond_br_cond (Cond.Vs))
+                        (trap_code_bad_conversion_to_integer))
+                    src))))
+       (value_regs_get r 0)))
+
+;; Checks that the value is not less than the minimum bound,
 ;; accepting a boolean (whether the type is signed), input type,
-;; and output type.
-(decl fpu_to_int_underflow_check (bool Type Type) ConsumesFlags)
-(rule (fpu_to_int_underflow_check $true $F32 (fits_in_16 out_ty))
-      (ConsumesFlags.ConsumesFlagsSideEffect
-      (MInst.TrapIf (cond_br_cond
-                  (Cond.Le))
-                  (trap_code_integer_overflow))))
-(rule (fpu_to_int_underflow_check $true $F64 (fits_in_32 out_ty))
-      (ConsumesFlags.ConsumesFlagsSideEffect
-      (MInst.TrapIf (cond_br_cond
-                  (Cond.Le))
-                  (trap_code_integer_overflow))))
-(rule -1 (fpu_to_int_underflow_check $true _in_ty _out_ty)
-      (ConsumesFlags.ConsumesFlagsSideEffect
-      (MInst.TrapIf (cond_br_cond
-                  (Cond.Lt))
-                  (trap_code_integer_overflow))))
-(rule (fpu_to_int_underflow_check $false _in_ty _out_ty)
-      (ConsumesFlags.ConsumesFlagsSideEffect
-      (MInst.TrapIf (cond_br_cond
-                  (Cond.Le))
-                  (trap_code_integer_overflow))))
-
-(decl fpu_to_int_overflow_check () ConsumesFlags)
-(rule (fpu_to_int_overflow_check)
-      (ConsumesFlags.ConsumesFlagsSideEffect
-      (MInst.TrapIf (cond_br_cond
-                  (Cond.Ge))
-                  (trap_code_integer_overflow))))
+;; output type, and registers containing the source and minimum bound.
+(decl fpu_to_int_underflow_check (bool Type Type Reg Reg) Reg)
+(rule (fpu_to_int_underflow_check $true $F32 (fits_in_16 out_ty) src min)
+      (let ((r ValueRegs
+                  (with_flags (fpu_cmp (ScalarSize.Size32) src min)
+                   (ConsumesFlags.ConsumesFlagsReturnsReg
+                    (MInst.TrapIf (cond_br_cond (Cond.Le))
+                        (trap_code_integer_overflow))
+                    src))))
+       (value_regs_get r 0)))
+(rule (fpu_to_int_underflow_check $true $F64 (fits_in_32 out_ty) src min)
+      (let ((r ValueRegs
+                  (with_flags (fpu_cmp (ScalarSize.Size64) src min)
+                   (ConsumesFlags.ConsumesFlagsReturnsReg
+                    (MInst.TrapIf (cond_br_cond (Cond.Le))
+                        (trap_code_integer_overflow))
+                    src))))
+       (value_regs_get r 0)))
+(rule -1 (fpu_to_int_underflow_check $true in_ty _out_ty src min)
+      (let ((r ValueRegs
+                  (with_flags (fpu_cmp (scalar_size in_ty) src min)
+                   (ConsumesFlags.ConsumesFlagsReturnsReg
+                    (MInst.TrapIf (cond_br_cond (Cond.Lt))
+                        (trap_code_integer_overflow))
+                    src))))
+       (value_regs_get r 0)))
+(rule (fpu_to_int_underflow_check $false in_ty _out_ty src min)
+      (let ((r ValueRegs
+                  (with_flags (fpu_cmp (scalar_size in_ty) src min)
+                   (ConsumesFlags.ConsumesFlagsReturnsReg
+                    (MInst.TrapIf (cond_br_cond (Cond.Le))
+                        (trap_code_integer_overflow))
+                    src))))
+       (value_regs_get r 0)))
+
+(decl fpu_to_int_overflow_check (ScalarSize Reg Reg) Reg)
+(rule (fpu_to_int_overflow_check size src max)
+      (let ((r ValueRegs
+                  (with_flags (fpu_cmp size src max)
+                   (ConsumesFlags.ConsumesFlagsReturnsReg
+                    (MInst.TrapIf (cond_br_cond (Cond.Ge))
+                        (trap_code_integer_overflow))
+                    src))))
+       (value_regs_get r 0)))
 
 ;; Emits the appropriate instruction sequence to convert a
 ;; floating-point value to an integer, trapping if the value
@@ -2724,17 +2767,11 @@
       (let ((size ScalarSize (scalar_size in_ty))
             (in_bits u8 (ty_bits in_ty))
             (out_bits u8 (ty_bits out_ty))
-            (_ InstOutput (side_effect (with_flags_side_effect
-                   (fpu_cmp size src src)
-                   (fpu_to_int_nan_check))))
+            (src Reg (fpu_to_int_nan_check size src))
             (min Reg (min_fp_value signed in_bits out_bits))
-            (_ InstOutput (side_effect (with_flags_side_effect
-                   (fpu_cmp size src min)
-                   (fpu_to_int_underflow_check signed in_ty out_ty))))
+            (src Reg (fpu_to_int_underflow_check signed in_ty out_ty src min))
             (max Reg (max_fp_value signed in_bits out_bits))
-            (_ InstOutput (side_effect (with_flags_side_effect
-                   (fpu_cmp size src max)
-                   (fpu_to_int_overflow_check)))))
+            (src Reg (fpu_to_int_overflow_check size src max)))
        (fpu_to_int op src)))
 
 ;; Emits the appropriate instruction sequence to convert a