From 94bcbe844612306c136193f41f5cbdeec45a42ec Mon Sep 17 00:00:00 2001
From: Damian Heaton <87125748+dheaton-arm@users.noreply.github.com>
Date: Wed, 24 Aug 2022 18:37:14 +0100
Subject: [PATCH] Port `Fcopysign`..`FcvtToSintSat` to ISLE (AArch64) (#4753)

* Port `Fcopysign`..`FcvtToSintSat` to ISLE (AArch64)

Ported the existing implementations of the following opcodes to ISLE on
AArch64:
- `Fcopysign`
  - Also introduced missing support for `fcopysign` on vector values,
    as per the docs.
    - This introduces the vector encoding for the `SLI` machine
      instruction.
- `FcvtToUint`
- `FcvtToSint`
- `FcvtFromUint`
- `FcvtFromSint`
- `FcvtToUintSat`
- `FcvtToSintSat`

Copyright (c) 2022 Arm Limited

* Document helpers and abstract conversion checks
---
 cranelift/codegen/src/isa/aarch64/inst.isle   | 207 +++++++++
 .../codegen/src/isa/aarch64/inst/emit.rs      |  44 ++
 cranelift/codegen/src/isa/aarch64/inst/mod.rs |  20 +-
 cranelift/codegen/src/isa/aarch64/lower.isle  | 113 +++++
 cranelift/codegen/src/isa/aarch64/lower.rs    |  11 -
 .../codegen/src/isa/aarch64/lower/isle.rs     | 205 ++++++++-
 .../codegen/src/isa/aarch64/lower_inst.rs     | 406 +-----------------
 .../filetests/isa/aarch64/fcvt-small.clif     |  56 +--
 .../filetests/isa/aarch64/floating-point.clif | 235 +++++-----
 .../runtests/simd-fcopysign-64bit.clif        |  37 ++
 .../filetests/runtests/simd-fcopysign.clif    |  63 +++
 cranelift/interpreter/src/step.rs             |  14 +-
 12 files changed, 863 insertions(+), 548 deletions(-)
 create mode 100644 cranelift/filetests/filetests/runtests/simd-fcopysign-64bit.clif
 create mode 100644 cranelift/filetests/filetests/runtests/simd-fcopysign.clif

diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index 9c10e40002be..db111ba12e47 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -619,6 +619,14 @@
     (size VectorSize)
     (imm u8))
 
+  ;; Destructive vector shift by immediate.
+  (VecShiftImmMod
+    (op VecShiftImmModOp)
+    (rd WritableReg)
+    (rn Reg)
+    (size VectorSize)
+    (imm u8))
+
   ;; Vector extract - create a new vector, being the concatenation of the lowest `imm4` bytes
   ;; of `rm` followed by the uppermost `16 - imm4` bytes of `rn`.
   (VecExtract
@@ -1315,6 +1323,13 @@
   (Sshr)
 ))
 
+;; Destructive shift-by-immediate operation on each lane of a vector.
+(type VecShiftImmModOp
+  (enum
+    ;; Shift left and insert
+    (Sli)
+))
+
 ;; Atomic read-modify-write operations with acquire-release semantics
 (type AtomicRMWOp
   (enum
@@ -1386,6 +1401,48 @@
 (decl u64_into_imm_logic (Type u64) ImmLogic)
 (extern constructor u64_into_imm_logic u64_into_imm_logic)
 
+;; Calculate the minimum floating-point bound for a conversion from a
+;; floating-point type to an integer type.
+;; Accepts whether the output is signed, the size of the input
+;; floating point type in bits, and the size of the output integer type
+;; in bits.
+(decl min_fp_value (bool u8 u8) Reg)
+(extern constructor min_fp_value min_fp_value)
+
+;; Calculate the maximum floating-point bound for a conversion from a
+;; floating-point type to an integer type.
+;; Accepts whether the output is signed, the size of the input
+;; floating point type in bits, and the size of the output integer type
+;; in bits.
+(decl max_fp_value (bool u8 u8) Reg)
+(extern constructor max_fp_value max_fp_value)
+
+;; Calculate the minimum acceptable floating-point value for a saturating
+;; conversion from a floating-point type to an integer type.
+;; Accepts whether the output is signed, the size of the input
+;; floating point type in bits, and the size of the output integer type
+;; in bits.
+(decl min_fp_value_sat (bool u8 u8) Reg)
+(extern constructor min_fp_value_sat min_fp_value_sat)
+
+;; Calculate the maximum acceptable floating-point value for a saturating
+;; conversion from a floating-point type to an integer type.
+;; Accepts whether the output is signed, the size of the input
+;; floating point type in bits, and the size of the output integer type
+;; in bits.
+(decl max_fp_value_sat (bool u8 u8) Reg)
+(extern constructor max_fp_value_sat max_fp_value_sat)
+
+;; Constructs an FPUOpRI.Ushr* given the size in bits of the value (or lane)
+;; and the amount to shift by.
+(decl fpu_op_ri_ushr (u8 u8) FPUOpRI)
+(extern constructor fpu_op_ri_ushr fpu_op_ri_ushr)
+
+;; Constructs an FPUOpRI.Sli* given the size in bits of the value (or lane)
+;; and the amount to shift by.
+(decl fpu_op_ri_sli (u8 u8) FPUOpRI)
+(extern constructor fpu_op_ri_sli fpu_op_ri_sli)
+
 (decl imm12_from_negated_u64 (Imm12) u64)
 (extern extractor imm12_from_negated_u64 imm12_from_negated_u64)
@@ -1533,6 +1590,12 @@
       (_2 Unit (emit (MInst.VecRRRMod op dst src2 src3 size))))
     dst))
 
+(decl fpu_rri (FPUOpRI Reg) Reg)
+(rule (fpu_rri op src)
+  (let ((dst WritableReg (temp_writable_reg $F64))
+        (_ Unit (emit (MInst.FpuRRI op dst src))))
+    dst))
+
 ;; Helper for emitting `MInst.FpuRRR` instructions.
 (decl fpu_rrr (FPUOp2 Reg Reg ScalarSize) Reg)
 (rule (fpu_rrr op src1 src2 size)
@@ -2611,3 +2674,147 @@
 ;; to clobber LR.
   (let ((_ Unit (emit (MInst.Xpaclri))))
     (mov_preg (preg_link))))
+
+;; Helper for getting the maximum shift amount for a type.
+
+(decl max_shift (Type) u8)
+(rule (max_shift $F64) 63)
+(rule (max_shift $F32) 31)
+
+;; Helper for generating `fcopysign` instruction sequences.
+
+(decl fcopy_sign (Reg Reg Type) Reg)
+(rule (fcopy_sign x y (ty_scalar_float ty))
+  (let ((dst WritableReg (temp_writable_reg $F64))
+        (_ Unit (emit (MInst.FpuMove64 dst x)))
+        (tmp Reg (fpu_rri (fpu_op_ri_ushr (ty_bits ty) (max_shift ty)) y))
+        (_ Unit (emit (MInst.FpuRRI (fpu_op_ri_sli (ty_bits ty) (max_shift ty)) dst tmp))))
+    dst))
+(rule (fcopy_sign x y ty @ (multi_lane _ _))
+  (let ((dst WritableReg (temp_writable_reg $I8X16))
+        (_ Unit (emit (MInst.FpuMove128 dst x)))
+        (tmp Reg (vec_shift_imm (VecShiftImmOp.Ushr) (max_shift (lane_type ty)) y (vector_size ty)))
+        (_ Unit (emit (MInst.VecShiftImmMod (VecShiftImmModOp.Sli) dst tmp (vector_size ty) (max_shift (lane_type ty))))))
+    dst))
+
+;; Helpers for generating `MInst.FpuToInt` instructions.
+
+(decl fpu_to_int_nan_check (ScalarSize Reg) Reg)
+(rule (fpu_to_int_nan_check size src)
+  (let ((r ValueRegs
+           (with_flags (fpu_cmp size src src)
+                       (ConsumesFlags.ConsumesFlagsReturnsReg
+                         (MInst.TrapIf (cond_br_cond (Cond.Vs))
+                                       (trap_code_bad_conversion_to_integer))
+                         src))))
+    (value_regs_get r 0)))
+
+;; Checks that the value is not less than the minimum bound,
+;; accepting a boolean (whether the type is signed), input type,
+;; output type, and registers containing the source and minimum bound.
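+;;
+;; Note the asymmetry in the rules below: for signed conversions where
+;; `INT_MIN - 1` is exactly representable in the source floating-point
+;; type, `min` holds `INT_MIN - 1` and we trap on `src <= min`
+;; (`Cond.Le`); where it is not (32- and 64-bit outputs from `$F32`, and
+;; a 64-bit output from `$F64`), `min` holds `INT_MIN` itself and the
+;; check relaxes to a strict `src < min` (`Cond.Lt`). Unsigned
+;; conversions compare against `-1.0` with `Cond.Le`, so values in
+;; `(-1.0, 0.0)` still truncate to zero.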
+(decl fpu_to_int_underflow_check (bool Type Type Reg Reg) Reg) +(rule (fpu_to_int_underflow_check $true $F32 (fits_in_16 out_ty) src min) + (let ((r ValueRegs + (with_flags (fpu_cmp (ScalarSize.Size32) src min) + (ConsumesFlags.ConsumesFlagsReturnsReg + (MInst.TrapIf (cond_br_cond (Cond.Le)) + (trap_code_integer_overflow)) + src)))) + (value_regs_get r 0))) +(rule (fpu_to_int_underflow_check $true $F64 (fits_in_32 out_ty) src min) + (let ((r ValueRegs + (with_flags (fpu_cmp (ScalarSize.Size64) src min) + (ConsumesFlags.ConsumesFlagsReturnsReg + (MInst.TrapIf (cond_br_cond (Cond.Le)) + (trap_code_integer_overflow)) + src)))) + (value_regs_get r 0))) +(rule -1 (fpu_to_int_underflow_check $true in_ty _out_ty src min) + (let ((r ValueRegs + (with_flags (fpu_cmp (scalar_size in_ty) src min) + (ConsumesFlags.ConsumesFlagsReturnsReg + (MInst.TrapIf (cond_br_cond (Cond.Lt)) + (trap_code_integer_overflow)) + src)))) + (value_regs_get r 0))) +(rule (fpu_to_int_underflow_check $false in_ty _out_ty src min) + (let ((r ValueRegs + (with_flags (fpu_cmp (scalar_size in_ty) src min) + (ConsumesFlags.ConsumesFlagsReturnsReg + (MInst.TrapIf (cond_br_cond (Cond.Le)) + (trap_code_integer_overflow)) + src)))) + (value_regs_get r 0))) + +(decl fpu_to_int_overflow_check (ScalarSize Reg Reg) Reg) +(rule (fpu_to_int_overflow_check size src max) + (let ((r ValueRegs + (with_flags (fpu_cmp size src max) + (ConsumesFlags.ConsumesFlagsReturnsReg + (MInst.TrapIf (cond_br_cond (Cond.Ge)) + (trap_code_integer_overflow)) + src)))) + (value_regs_get r 0))) + +;; Emits the appropriate instruction sequence to convert a +;; floating-point value to an integer, trapping if the value +;; is a NaN or does not fit in the target type. +;; Accepts the specific conversion op, the source register, +;; whether the input is signed, and finally the input and output +;; types. +(decl fpu_to_int_cvt (FpuToIntOp Reg bool Type Type) Reg) +(rule (fpu_to_int_cvt op src signed in_ty out_ty) + (let ((size ScalarSize (scalar_size in_ty)) + (in_bits u8 (ty_bits in_ty)) + (out_bits u8 (ty_bits out_ty)) + (src Reg (fpu_to_int_nan_check size src)) + (min Reg (min_fp_value signed in_bits out_bits)) + (src Reg (fpu_to_int_underflow_check signed in_ty out_ty src min)) + (max Reg (max_fp_value signed in_bits out_bits)) + (src Reg (fpu_to_int_overflow_check size src max))) + (fpu_to_int op src))) + +;; Emits the appropriate instruction sequence to convert a +;; floating-point value to an integer, saturating if the value +;; does not fit in the target type. +;; Accepts the specific conversion op, the source register, +;; whether the input is signed, and finally the input and output +;; types. 
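+;;
+;; The emitted sequence clamps the input into the representable range
+;; with `FMIN`/`FMAX` against the saturating bounds, then uses
+;; `FCMP`/`FCSEL` to substitute zero for NaN inputs (for unsigned
+;; conversions the minimum bound is already zero and is reused) before
+;; the final `FCVTZ{S,U}`:
+;;
+;;   fmin  tmp, src, max
+;;   fmax  tmp, tmp, min
+;;   fcmp  src, src
+;;   fcsel tmp, zero, tmp, ne  ; a NaN compares unordered, so NE selects zero
+;;   fcvtz{s,u} dst, tmp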
+(decl fpu_to_int_cvt_sat (FpuToIntOp Reg bool Type Type) Reg) +(rule (fpu_to_int_cvt_sat op src $true in_ty out_ty) + (let ((size ScalarSize (scalar_size in_ty)) + (in_bits u8 (ty_bits in_ty)) + (out_bits u8 (ty_bits out_ty)) + (max Reg (max_fp_value_sat $true in_bits out_bits)) + (tmp Reg (fpu_rrr (FPUOp2.Min) src max size)) + (min Reg (min_fp_value_sat $true in_bits out_bits)) + (tmp Reg (fpu_rrr (FPUOp2.Max) tmp min size)) + (zero Reg (constant_f128 0)) + (tmp ValueRegs (with_flags (fpu_cmp size src src) + (fpu_csel in_ty (Cond.Ne) zero tmp)))) + (fpu_to_int op (value_regs_get tmp 0)))) +(rule (fpu_to_int_cvt_sat op src $false in_ty out_ty) + (let ((size ScalarSize (scalar_size in_ty)) + (in_bits u8 (ty_bits in_ty)) + (out_bits u8 (ty_bits out_ty)) + (max Reg (max_fp_value_sat $false in_bits out_bits)) + (tmp Reg (fpu_rrr (FPUOp2.Min) src max size)) + (min Reg (min_fp_value_sat $false in_bits out_bits)) + (tmp Reg (fpu_rrr (FPUOp2.Max) tmp min size)) + (tmp ValueRegs (with_flags (fpu_cmp size src src) + (fpu_csel in_ty (Cond.Ne) min tmp)))) + (fpu_to_int op (value_regs_get tmp 0)))) + +(decl fpu_to_int (FpuToIntOp Reg) Reg) +(rule (fpu_to_int op src) + (let ((dst WritableReg (temp_writable_reg $I64)) + (_ Unit (emit (MInst.FpuToInt op dst src)))) + dst)) + +;; Helper for generating `MInst.IntToFpu` instructions. + +(decl int_to_fpu (IntToFpuOp Reg) Reg) +(rule (int_to_fpu op src) + (let ((dst WritableReg (temp_writable_reg $I8X16)) + (_ Unit (emit (MInst.IntToFpu op dst src)))) + dst)) diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs index 6d9c323d4f7d..ec6117c008a5 100644 --- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs +++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs @@ -2033,6 +2033,50 @@ impl MachInstEmit for Inst { let rd_enc = machreg_to_vec(rd.to_reg()); sink.put4(template | (immh_immb << 16) | (rn_enc << 5) | rd_enc); } + &Inst::VecShiftImmMod { + op, + rd, + rn, + size, + imm, + } => { + let rd = allocs.next_writable(rd); + let rn = allocs.next(rn); + let (is_shr, mut template) = match op { + VecShiftImmModOp::Sli => (false, 0b_001_011110_0000_000_010101_00000_00000_u32), + }; + if size.is_128bits() { + template |= 0b1 << 30; + } + let imm = imm as u32; + // Deal with the somewhat strange encoding scheme for, and limits on, + // the shift amount. 
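+                //
+                // The seven-bit `immh:immb` field encodes both the lane size and
+                // the shift amount: the position of the leading one bit in `immh`
+                // selects the lane size, right shifts are encoded as
+                // `(2 * lane_bits) - imm`, and left shifts as `lane_bits + imm`.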
+                let immh_immb = match (size.lane_size(), is_shr) {
+                    (ScalarSize::Size64, true) if imm >= 1 && imm <= 64 => {
+                        0b_1000_000_u32 | (64 - imm)
+                    }
+                    (ScalarSize::Size32, true) if imm >= 1 && imm <= 32 => {
+                        0b_0100_000_u32 | (32 - imm)
+                    }
+                    (ScalarSize::Size16, true) if imm >= 1 && imm <= 16 => {
+                        0b_0010_000_u32 | (16 - imm)
+                    }
+                    (ScalarSize::Size8, true) if imm >= 1 && imm <= 8 => {
+                        0b_0001_000_u32 | (8 - imm)
+                    }
+                    (ScalarSize::Size64, false) if imm <= 63 => 0b_1000_000_u32 | imm,
+                    (ScalarSize::Size32, false) if imm <= 31 => 0b_0100_000_u32 | imm,
+                    (ScalarSize::Size16, false) if imm <= 15 => 0b_0010_000_u32 | imm,
+                    (ScalarSize::Size8, false) if imm <= 7 => 0b_0001_000_u32 | imm,
+                    _ => panic!(
+                        "aarch64: Inst::VecShiftImmMod: emit: invalid op/size/imm {:?}, {:?}, {:?}",
+                        op, size, imm
+                    ),
+                };
+                let rn_enc = machreg_to_vec(rn);
+                let rd_enc = machreg_to_vec(rd.to_reg());
+                sink.put4(template | (immh_immb << 16) | (rn_enc << 5) | rd_enc);
+            }
             &Inst::VecExtract { rd, rn, rm, imm4 } => {
                 let rd = allocs.next_writable(rd);
                 let rn = allocs.next(rn);
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index eac2d1bb2356..84292a43bc7b 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -39,7 +39,7 @@ pub use crate::isa::aarch64::lower::isle::generated_code::{
     ALUOp, ALUOp3, APIKey, AtomicRMWLoopOp, AtomicRMWOp, BitOp, FPUOp1, FPUOp2, FPUOp3,
     FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUModOp, VecALUOp,
     VecExtendOp, VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp,
-    VecRRRLongOp, VecShiftImmOp,
+    VecRRRLongOp, VecShiftImmModOp, VecShiftImmOp,
 };
 
 /// A floating-point unit (FPU) operation with two args, a register and an immediate.
@@ -767,6 +767,10 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
             collector.reg_def(rd);
             collector.reg_use(rn);
         }
+        &Inst::VecShiftImmMod { rd, rn, .. } => {
+            collector.reg_mod(rd);
+            collector.reg_use(rn);
+        }
         &Inst::VecExtract { rd, rn, rm, ..
} => { collector.reg_def(rd); collector.reg_use(rn); @@ -2371,6 +2375,20 @@ impl Inst { let rn = pretty_print_vreg_vector(rn, size, allocs); format!("{} {}, {}, #{}", op, rd, rn, imm) } + &Inst::VecShiftImmMod { + op, + rd, + rn, + size, + imm, + } => { + let op = match op { + VecShiftImmModOp::Sli => "sli", + }; + let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs); + let rn = pretty_print_vreg_vector(rn, size, allocs); + format!("{} {}, {}, #{}", op, rd, rn, imm) + } &Inst::VecExtract { rd, rn, rm, imm4 } => { let rd = pretty_print_vreg_vector(rd.to_reg(), VectorSize::Size8x16, allocs); let rn = pretty_print_vreg_vector(rn, VectorSize::Size8x16, allocs); diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle index 1bcd269d65c9..5457039059c1 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.isle +++ b/cranelift/codegen/src/isa/aarch64/lower.isle @@ -406,6 +406,119 @@ (rule (lower (has_type (ty_scalar_float ty) (fma x y z))) (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z)) +;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty (fcopysign x y))) + (fcopy_sign x y ty)) + +;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint x @ (value_type $F32)))) + (fpu_to_int_cvt (FpuToIntOp.F32ToU32) x $false $F32 out_ty)) + +(rule (lower (has_type $I64 (fcvt_to_uint x @ (value_type $F32)))) + (fpu_to_int_cvt (FpuToIntOp.F32ToU64) x $false $F32 $I64)) + +(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint x @ (value_type $F64)))) + (fpu_to_int_cvt (FpuToIntOp.F64ToU32) x $false $F64 out_ty)) + +(rule (lower (has_type $I64 (fcvt_to_uint x @ (value_type $F64)))) + (fpu_to_int_cvt (FpuToIntOp.F64ToU64) x $false $F64 $I64)) + +;;;; Rules for `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint x @ (value_type $F32)))) + (fpu_to_int_cvt (FpuToIntOp.F32ToI32) x $true $F32 out_ty)) + +(rule (lower (has_type $I64 (fcvt_to_sint x @ (value_type $F32)))) + (fpu_to_int_cvt (FpuToIntOp.F32ToI64) x $true $F32 $I64)) + +(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint x @ (value_type $F64)))) + (fpu_to_int_cvt (FpuToIntOp.F64ToI32) x $true $F64 out_ty)) + +(rule (lower (has_type $I64 (fcvt_to_sint x @ (value_type $F64)))) + (fpu_to_int_cvt (FpuToIntOp.F64ToI64) x $true $F64 $I64)) + +;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_from_uint x @ (value_type (multi_lane 32 _))))) + (vec_misc (VecMisc2.Ucvtf) x (vector_size ty))) + +(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_from_uint x @ (value_type (multi_lane 64 _))))) + (vec_misc (VecMisc2.Ucvtf) x (vector_size ty))) + +(rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type (fits_in_32 _))))) + (int_to_fpu (IntToFpuOp.U32ToF32) (put_in_reg_zext32 x))) + +(rule (lower (has_type $F64 (fcvt_from_uint x @ (value_type (fits_in_32 _))))) + (int_to_fpu (IntToFpuOp.U32ToF64) (put_in_reg_zext32 x))) + +(rule (lower (has_type $F32 (fcvt_from_uint x @ (value_type $I64)))) + (int_to_fpu (IntToFpuOp.U64ToF32) x)) + +(rule (lower (has_type $F64 (fcvt_from_uint x @ (value_type $I64)))) + (int_to_fpu (IntToFpuOp.U64ToF64) x)) + +;;;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_from_sint x @ (value_type 
(multi_lane 32 _))))) + (vec_misc (VecMisc2.Scvtf) x (vector_size ty))) + +(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_from_sint x @ (value_type (multi_lane 64 _))))) + (vec_misc (VecMisc2.Scvtf) x (vector_size ty))) + +(rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type (fits_in_32 _))))) + (int_to_fpu (IntToFpuOp.I32ToF32) (put_in_reg_sext32 x))) + +(rule (lower (has_type $F64 (fcvt_from_sint x @ (value_type (fits_in_32 _))))) + (int_to_fpu (IntToFpuOp.I32ToF64) (put_in_reg_sext32 x))) + +(rule (lower (has_type $F32 (fcvt_from_sint x @ (value_type $I64)))) + (int_to_fpu (IntToFpuOp.I64ToF32) x)) + +(rule (lower (has_type $F64 (fcvt_from_sint x @ (value_type $I64)))) + (int_to_fpu (IntToFpuOp.I64ToF64) x)) + +;;;; Rules for `fcvt_to_uint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_to_uint_sat x @ (value_type (multi_lane 32 _))))) + (vec_misc (VecMisc2.Fcvtzu) x (vector_size ty))) + +(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_to_uint_sat x @ (value_type (multi_lane 64 _))))) + (vec_misc (VecMisc2.Fcvtzu) x (vector_size ty))) + +(rule (lower (has_type $I32 (fcvt_to_uint_sat x @ (value_type $F32)))) + (fpu_to_int_cvt_sat (FpuToIntOp.F32ToU32) x $false $F32 $I32)) + +(rule (lower (has_type $I64 (fcvt_to_uint_sat x @ (value_type $F32)))) + (fpu_to_int_cvt_sat (FpuToIntOp.F32ToU64) x $false $F32 $I64)) + +(rule (lower (has_type $I32 (fcvt_to_uint_sat x @ (value_type $F64)))) + (fpu_to_int_cvt_sat (FpuToIntOp.F64ToU32) x $false $F64 $I32)) + +(rule (lower (has_type $I64 (fcvt_to_uint_sat x @ (value_type $F64)))) + (fpu_to_int_cvt_sat (FpuToIntOp.F64ToU64) x $false $F64 $I64)) + +;;;; Rules for `fcvt_to_sint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(rule (lower (has_type ty @ (multi_lane 32 _) (fcvt_to_sint_sat x @ (value_type (multi_lane 32 _))))) + (vec_misc (VecMisc2.Fcvtzs) x (vector_size ty))) + +(rule (lower (has_type ty @ (multi_lane 64 _) (fcvt_to_sint_sat x @ (value_type (multi_lane 64 _))))) + (vec_misc (VecMisc2.Fcvtzs) x (vector_size ty))) + +(rule (lower (has_type $I32 (fcvt_to_sint_sat x @ (value_type $F32)))) + (fpu_to_int_cvt_sat (FpuToIntOp.F32ToI32) x $true $F32 $I32)) + +(rule (lower (has_type $I64 (fcvt_to_sint_sat x @ (value_type $F32)))) + (fpu_to_int_cvt_sat (FpuToIntOp.F32ToI64) x $true $F32 $I64)) + +(rule (lower (has_type $I32 (fcvt_to_sint_sat x @ (value_type $F64)))) + (fpu_to_int_cvt_sat (FpuToIntOp.F64ToI32) x $true $F64 $I32)) + +(rule (lower (has_type $I64 (fcvt_to_sint_sat x @ (value_type $F64)))) + (fpu_to_int_cvt_sat (FpuToIntOp.F64ToI64) x $true $F64 $I64)) + ;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; `i64` and smaller diff --git a/cranelift/codegen/src/isa/aarch64/lower.rs b/cranelift/codegen/src/isa/aarch64/lower.rs index 5235db60b43d..3ec6bf3bbe99 100644 --- a/cranelift/codegen/src/isa/aarch64/lower.rs +++ b/cranelift/codegen/src/isa/aarch64/lower.rs @@ -1065,17 +1065,6 @@ pub(crate) fn condcode_is_signed(cc: IntCC) -> bool { //============================================================================= // Helpers for instruction lowering. -pub(crate) fn choose_32_64(ty: Type, op32: T, op64: T) -> T { - let bits = ty_bits(ty); - if bits <= 32 { - op32 - } else if bits == 64 { - op64 - } else { - panic!("choose_32_64 on > 64 bits!") - } -} - /// Checks for an instance of `op` feeding the given input. 
 pub(crate) fn maybe_input_insn(
     c: &mut Lower<Inst>,
diff --git a/cranelift/codegen/src/isa/aarch64/lower/isle.rs b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
index 35bbefec8776..1e190a3fc765 100644
--- a/cranelift/codegen/src/isa/aarch64/lower/isle.rs
+++ b/cranelift/codegen/src/isa/aarch64/lower/isle.rs
@@ -5,12 +5,13 @@ pub mod generated_code;
 
 // Types that the generated ISLE code uses via `use super::*`.
 use super::{
-    insn_inputs, lower_constant_f128, lower_constant_f64, writable_zero_reg, zero_reg, AMode,
-    ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond, CondBrKind, ExtendOp,
-    FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC, JTSequenceInfo, MachLabel,
-    MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize, PairAMode, Reg, ScalarSize,
-    ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
+    insn_inputs, lower_constant_f128, lower_constant_f32, lower_constant_f64, writable_zero_reg,
+    zero_reg, AMode, ASIMDFPModImm, ASIMDMovModImm, BranchTarget, CallIndInfo, CallInfo, Cond,
+    CondBrKind, ExtendOp, FPUOpRI, FloatCC, Imm12, ImmLogic, ImmShift, Inst as MInst, IntCC,
+    JTSequenceInfo, MachLabel, MoveWideConst, MoveWideOp, NarrowValueMode, Opcode, OperandSize,
+    PairAMode, Reg, ScalarSize, ShiftOpAndAmt, UImm5, VecMisc2, VectorSize, NZCV,
 };
+use crate::isa::aarch64::inst::{FPULeftShiftImm, FPURightShiftImm};
 use crate::isa::aarch64::lower::{lower_address, lower_splat_const};
 use crate::isa::aarch64::settings::Flags as IsaFlags;
 use crate::machinst::{isle::*, InputSourceInst};
@@ -519,4 +520,198 @@ impl generated_code::Context for IsleContext<'_, '_, MInst, Flags, IsaFlags, 6>
     fn preg_link(&mut self) -> PReg {
         super::regs::link_reg().to_real_reg().unwrap().into()
     }
+
+    fn min_fp_value(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
+        let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
+
+        if in_bits == 32 {
+            // From float32.
+            let min = match (signed, out_bits) {
+                (true, 8) => i8::MIN as f32 - 1.,
+                (true, 16) => i16::MIN as f32 - 1.,
+                (true, 32) => i32::MIN as f32, // I32_MIN - 1 isn't precisely representable as a f32.
+                (true, 64) => i64::MIN as f32, // I64_MIN - 1 isn't precisely representable as a f32.
+
+                (false, _) => -1.,
+                _ => unimplemented!(
+                    "unexpected {} output size of {} bits for 32-bit input",
+                    if signed { "signed" } else { "unsigned" },
+                    out_bits
+                ),
+            };
+
+            lower_constant_f32(self.lower_ctx, tmp, min);
+        } else if in_bits == 64 {
+            // From float64.
+            let min = match (signed, out_bits) {
+                (true, 8) => i8::MIN as f64 - 1.,
+                (true, 16) => i16::MIN as f64 - 1.,
+                (true, 32) => i32::MIN as f64 - 1.,
+                (true, 64) => i64::MIN as f64,
+
+                (false, _) => -1.,
+                _ => unimplemented!(
+                    "unexpected {} output size of {} bits for 64-bit input",
+                    if signed { "signed" } else { "unsigned" },
+                    out_bits
+                ),
+            };
+
+            lower_constant_f64(self.lower_ctx, tmp, min);
+        } else {
+            unimplemented!(
+                "unexpected input size for min_fp_value: {} (signed: {}, output size: {})",
+                in_bits,
+                signed,
+                out_bits
+            );
+        }
+
+        tmp.to_reg()
+    }
+
+    fn max_fp_value(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg {
+        let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap();
+
+        if in_bits == 32 {
+            // From float32.
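+            // INT_MAX itself is generally not exactly representable (e.g.
+            // i32::MAX rounds up when cast to f32), but INT_MAX + 1 is a power
+            // of two and therefore always is; the overflow check in ISLE traps
+            // on `src >= max`.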
+ let max = match (signed, out_bits) { + (true, 8) => i8::MAX as f32 + 1., + (true, 16) => i16::MAX as f32 + 1., + (true, 32) => (i32::MAX as u64 + 1) as f32, + (true, 64) => (i64::MAX as u64 + 1) as f32, + + (false, 8) => u8::MAX as f32 + 1., + (false, 16) => u16::MAX as f32 + 1., + (false, 32) => (u32::MAX as u64 + 1) as f32, + (false, 64) => (u64::MAX as u128 + 1) as f32, + _ => unimplemented!( + "unexpected {} output size of {} bits for 32-bit input", + if signed { "signed" } else { "unsigned" }, + out_bits + ), + }; + + lower_constant_f32(self.lower_ctx, tmp, max); + } else if in_bits == 64 { + // From float64. + let max = match (signed, out_bits) { + (true, 8) => i8::MAX as f64 + 1., + (true, 16) => i16::MAX as f64 + 1., + (true, 32) => i32::MAX as f64 + 1., + (true, 64) => (i64::MAX as u64 + 1) as f64, + + (false, 8) => u8::MAX as f64 + 1., + (false, 16) => u16::MAX as f64 + 1., + (false, 32) => u32::MAX as f64 + 1., + (false, 64) => (u64::MAX as u128 + 1) as f64, + _ => unimplemented!( + "unexpected {} output size of {} bits for 64-bit input", + if signed { "signed" } else { "unsigned" }, + out_bits + ), + }; + + lower_constant_f64(self.lower_ctx, tmp, max); + } else { + unimplemented!( + "unexpected input size for max_fp_value: {} (signed: {}, output size: {})", + in_bits, + signed, + out_bits + ); + } + + tmp.to_reg() + } + + fn min_fp_value_sat(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg { + let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap(); + + let min: f64 = match (out_bits, signed) { + (32, true) => i32::MIN as f64, + (32, false) => 0.0, + (64, true) => i64::MIN as f64, + (64, false) => 0.0, + _ => unimplemented!( + "unexpected {} output size of {} bits", + if signed { "signed" } else { "unsigned" }, + out_bits + ), + }; + + if in_bits == 32 { + lower_constant_f32(self.lower_ctx, tmp, min as f32) + } else if in_bits == 64 { + lower_constant_f64(self.lower_ctx, tmp, min) + } else { + unimplemented!( + "unexpected input size for min_fp_value_sat: {} (signed: {}, output size: {})", + in_bits, + signed, + out_bits + ); + } + + tmp.to_reg() + } + + fn max_fp_value_sat(&mut self, signed: bool, in_bits: u8, out_bits: u8) -> Reg { + let tmp = self.lower_ctx.alloc_tmp(I8X16).only_reg().unwrap(); + + let max = match (out_bits, signed) { + (32, true) => i32::MAX as f64, + (32, false) => u32::MAX as f64, + (64, true) => i64::MAX as f64, + (64, false) => u64::MAX as f64, + _ => unimplemented!( + "unexpected {} output size of {} bits", + if signed { "signed" } else { "unsigned" }, + out_bits + ), + }; + + if in_bits == 32 { + lower_constant_f32(self.lower_ctx, tmp, max as f32) + } else if in_bits == 64 { + lower_constant_f64(self.lower_ctx, tmp, max) + } else { + unimplemented!( + "unexpected input size for max_fp_value_sat: {} (signed: {}, output size: {})", + in_bits, + signed, + out_bits + ); + } + + tmp.to_reg() + } + + fn fpu_op_ri_ushr(&mut self, ty_bits: u8, shift: u8) -> FPUOpRI { + if ty_bits == 32 { + FPUOpRI::UShr32(FPURightShiftImm::maybe_from_u8(shift, ty_bits).unwrap()) + } else if ty_bits == 64 { + FPUOpRI::UShr64(FPURightShiftImm::maybe_from_u8(shift, ty_bits).unwrap()) + } else { + unimplemented!( + "unexpected input size for fpu_op_ri_ushr: {} (shift: {})", + ty_bits, + shift + ); + } + } + + fn fpu_op_ri_sli(&mut self, ty_bits: u8, shift: u8) -> FPUOpRI { + if ty_bits == 32 { + FPUOpRI::Sli32(FPULeftShiftImm::maybe_from_u8(shift, ty_bits).unwrap()) + } else if ty_bits == 64 { + FPUOpRI::Sli64(FPULeftShiftImm::maybe_from_u8(shift, 
ty_bits).unwrap()) + } else { + unimplemented!( + "unexpected input size for fpu_op_ri_sli: {} (shift: {})", + ty_bits, + shift + ); + } + } } diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index af5cf88a43f3..10f8f3516df0 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -2,10 +2,9 @@ use super::lower::*; use crate::binemit::CodeOffset; -use crate::ir::condcodes::FloatCC; use crate::ir::types::*; use crate::ir::Inst as IRInst; -use crate::ir::{InstructionData, Opcode, TrapCode}; +use crate::ir::{InstructionData, Opcode}; use crate::isa::aarch64::abi::*; use crate::isa::aarch64::inst::*; use crate::isa::aarch64::settings as aarch64_settings; @@ -978,408 +977,13 @@ pub(crate) fn lower_insn_to_regs( Opcode::Fma => implemented_in_isle(ctx), - Opcode::Fcopysign => { - // Copy the sign bit from inputs[1] to inputs[0]. We use the following sequence: - // - // This is a scalar Fcopysign. - // This uses scalar NEON operations for 64-bit and vector operations (2S) for 32-bit. - // In the latter case it still sets all bits except the lowest 32 to 0. - // - // mov vd, vn - // ushr vtmp, vm, #63 / #31 - // sli vd, vtmp, #63 / #31 + Opcode::Fcopysign => implemented_in_isle(ctx), - let ty = ctx.output_ty(insn, 0); + Opcode::FcvtToUint | Opcode::FcvtToSint => implemented_in_isle(ctx), - if ty != F32 && ty != F64 { - return Err(CodegenError::Unsupported(format!( - "Fcopysign: Unsupported type: {:?}", - ty - ))); - } - - let bits = ty_bits(ty) as u8; - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rm = put_input_in_reg(ctx, inputs[1], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - let tmp = ctx.alloc_tmp(F64).only_reg().unwrap(); - - // Copy LHS to rd. - ctx.emit(Inst::gen_move(rd, rn, ty)); - - // Copy the sign bit to the lowest bit in tmp. - let imm = FPURightShiftImm::maybe_from_u8(bits - 1, bits).unwrap(); - ctx.emit(Inst::FpuRRI { - fpu_op: choose_32_64(ty, FPUOpRI::UShr32(imm), FPUOpRI::UShr64(imm)), - rd: tmp, - rn: rm, - }); - - // Insert the bit from tmp into the sign bit of rd. 
- let imm = FPULeftShiftImm::maybe_from_u8(bits - 1, bits).unwrap(); - ctx.emit(Inst::FpuRRI { - fpu_op: choose_32_64(ty, FPUOpRI::Sli32(imm), FPUOpRI::Sli64(imm)), - rd, - rn: tmp.to_reg(), - }); - } - - Opcode::FcvtToUint | Opcode::FcvtToSint => { - let input_ty = ctx.input_ty(insn, 0); - let in_bits = ty_bits(input_ty); - let output_ty = ty.unwrap(); - let out_bits = ty_bits(output_ty); - let signed = op == Opcode::FcvtToSint; - let op = match (signed, in_bits, out_bits) { - (false, 32, 8) | (false, 32, 16) | (false, 32, 32) => FpuToIntOp::F32ToU32, - (true, 32, 8) | (true, 32, 16) | (true, 32, 32) => FpuToIntOp::F32ToI32, - (false, 32, 64) => FpuToIntOp::F32ToU64, - (true, 32, 64) => FpuToIntOp::F32ToI64, - (false, 64, 8) | (false, 64, 16) | (false, 64, 32) => FpuToIntOp::F64ToU32, - (true, 64, 8) | (true, 64, 16) | (true, 64, 32) => FpuToIntOp::F64ToI32, - (false, 64, 64) => FpuToIntOp::F64ToU64, - (true, 64, 64) => FpuToIntOp::F64ToI64, - _ => { - return Err(CodegenError::Unsupported(format!( - "{}: Unsupported types: {:?} -> {:?}", - op, input_ty, output_ty - ))) - } - }; - - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - // First, check the output: it's important to carry the NaN conversion before the - // in-bounds conversion, per wasm semantics. - - // Check that the input is not a NaN. - ctx.emit(Inst::FpuCmp { - size: ScalarSize::from_ty(input_ty), - rn, - rm: rn, - }); - let trap_code = TrapCode::BadConversionToInteger; - ctx.emit(Inst::TrapIf { - trap_code, - kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::Unordered)), - }); - - let tmp = ctx.alloc_tmp(I8X16).only_reg().unwrap(); - - // Check that the input is in range, with "truncate towards zero" semantics. This means - // we allow values that are slightly out of range: - // - for signed conversions, we allow values strictly greater than INT_MIN-1 (when this - // can be represented), and strictly less than INT_MAX+1 (when this can be - // represented). - // - for unsigned conversions, we allow values strictly greater than -1, and strictly - // less than UINT_MAX+1 (when this can be represented). - - if in_bits == 32 { - // From float32. - let (low_bound, low_cond, high_bound) = match (signed, out_bits) { - (true, 8) => ( - i8::min_value() as f32 - 1., - FloatCC::GreaterThan, - i8::max_value() as f32 + 1., - ), - (true, 16) => ( - i16::min_value() as f32 - 1., - FloatCC::GreaterThan, - i16::max_value() as f32 + 1., - ), - (true, 32) => ( - i32::min_value() as f32, // I32_MIN - 1 isn't precisely representable as a f32. - FloatCC::GreaterThanOrEqual, - i32::max_value() as f32 + 1., - ), - (true, 64) => ( - i64::min_value() as f32, // I64_MIN - 1 isn't precisely representable as a f32. 
- FloatCC::GreaterThanOrEqual, - i64::max_value() as f32 + 1., - ), - (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f32 + 1.), - (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f32 + 1.), - (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f32 + 1.), - (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f32 + 1.), - _ => unreachable!(), - }; - - // >= low_bound - lower_constant_f32(ctx, tmp, low_bound); - ctx.emit(Inst::FpuCmp { - size: ScalarSize::Size32, - rn, - rm: tmp.to_reg(), - }); - let trap_code = TrapCode::IntegerOverflow; - ctx.emit(Inst::TrapIf { - trap_code, - kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()), - }); - - // <= high_bound - lower_constant_f32(ctx, tmp, high_bound); - ctx.emit(Inst::FpuCmp { - size: ScalarSize::Size32, - rn, - rm: tmp.to_reg(), - }); - let trap_code = TrapCode::IntegerOverflow; - ctx.emit(Inst::TrapIf { - trap_code, - kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()), - }); - } else { - // From float64. - let (low_bound, low_cond, high_bound) = match (signed, out_bits) { - (true, 8) => ( - i8::min_value() as f64 - 1., - FloatCC::GreaterThan, - i8::max_value() as f64 + 1., - ), - (true, 16) => ( - i16::min_value() as f64 - 1., - FloatCC::GreaterThan, - i16::max_value() as f64 + 1., - ), - (true, 32) => ( - i32::min_value() as f64 - 1., - FloatCC::GreaterThan, - i32::max_value() as f64 + 1., - ), - (true, 64) => ( - i64::min_value() as f64, // I64_MIN - 1 is not precisely representable as an i64. - FloatCC::GreaterThanOrEqual, - i64::max_value() as f64 + 1., - ), - (false, 8) => (-1., FloatCC::GreaterThan, u8::max_value() as f64 + 1.), - (false, 16) => (-1., FloatCC::GreaterThan, u16::max_value() as f64 + 1.), - (false, 32) => (-1., FloatCC::GreaterThan, u32::max_value() as f64 + 1.), - (false, 64) => (-1., FloatCC::GreaterThan, u64::max_value() as f64 + 1.), - _ => unreachable!(), - }; - - // >= low_bound - lower_constant_f64(ctx, tmp, low_bound); - ctx.emit(Inst::FpuCmp { - size: ScalarSize::Size64, - rn, - rm: tmp.to_reg(), - }); - let trap_code = TrapCode::IntegerOverflow; - ctx.emit(Inst::TrapIf { - trap_code, - kind: CondBrKind::Cond(lower_fp_condcode(low_cond).invert()), - }); - - // <= high_bound - lower_constant_f64(ctx, tmp, high_bound); - ctx.emit(Inst::FpuCmp { - size: ScalarSize::Size64, - rn, - rm: tmp.to_reg(), - }); - let trap_code = TrapCode::IntegerOverflow; - ctx.emit(Inst::TrapIf { - trap_code, - kind: CondBrKind::Cond(lower_fp_condcode(FloatCC::LessThan).invert()), - }); - }; - - // Do the conversion. 
- ctx.emit(Inst::FpuToInt { op, rd, rn }); - } - - Opcode::FcvtFromUint | Opcode::FcvtFromSint => { - let input_ty = ctx.input_ty(insn, 0); - let ty = ty.unwrap(); - let signed = op == Opcode::FcvtFromSint; - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + Opcode::FcvtFromUint | Opcode::FcvtFromSint => implemented_in_isle(ctx), - if ty.is_vector() { - if input_ty.lane_bits() != ty.lane_bits() { - return Err(CodegenError::Unsupported(format!( - "{}: Unsupported types: {:?} -> {:?}", - op, input_ty, ty - ))); - } - - let op = if signed { - VecMisc2::Scvtf - } else { - VecMisc2::Ucvtf - }; - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - - ctx.emit(Inst::VecMisc { - op, - rd, - rn, - size: VectorSize::from_ty(ty), - }); - } else { - let in_bits = ty_bits(input_ty); - let out_bits = ty_bits(ty); - let op = match (signed, in_bits, out_bits) { - (false, 8, 32) | (false, 16, 32) | (false, 32, 32) => IntToFpuOp::U32ToF32, - (true, 8, 32) | (true, 16, 32) | (true, 32, 32) => IntToFpuOp::I32ToF32, - (false, 8, 64) | (false, 16, 64) | (false, 32, 64) => IntToFpuOp::U32ToF64, - (true, 8, 64) | (true, 16, 64) | (true, 32, 64) => IntToFpuOp::I32ToF64, - (false, 64, 32) => IntToFpuOp::U64ToF32, - (true, 64, 32) => IntToFpuOp::I64ToF32, - (false, 64, 64) => IntToFpuOp::U64ToF64, - (true, 64, 64) => IntToFpuOp::I64ToF64, - _ => { - return Err(CodegenError::Unsupported(format!( - "{}: Unsupported types: {:?} -> {:?}", - op, input_ty, ty - ))) - } - }; - let narrow_mode = match (signed, in_bits) { - (false, 8) | (false, 16) | (false, 32) => NarrowValueMode::ZeroExtend32, - (true, 8) | (true, 16) | (true, 32) => NarrowValueMode::SignExtend32, - (false, 64) => NarrowValueMode::ZeroExtend64, - (true, 64) => NarrowValueMode::SignExtend64, - _ => unreachable!(), - }; - let rn = put_input_in_reg(ctx, inputs[0], narrow_mode); - ctx.emit(Inst::IntToFpu { op, rd, rn }); - } - } - - Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => { - let in_ty = ctx.input_ty(insn, 0); - let ty = ty.unwrap(); - let out_signed = op == Opcode::FcvtToSintSat; - let rn = put_input_in_reg(ctx, inputs[0], NarrowValueMode::None); - let rd = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); - - if ty.is_vector() { - if in_ty.lane_bits() != ty.lane_bits() { - return Err(CodegenError::Unsupported(format!( - "{}: Unsupported types: {:?} -> {:?}", - op, in_ty, ty - ))); - } - - let op = if out_signed { - VecMisc2::Fcvtzs - } else { - VecMisc2::Fcvtzu - }; - - ctx.emit(Inst::VecMisc { - op, - rd, - rn, - size: VectorSize::from_ty(ty), - }); - } else { - let in_bits = ty_bits(in_ty); - let out_bits = ty_bits(ty); - // FIMM Vtmp1, u32::MAX or u64::MAX or i32::MAX or i64::MAX - // FMIN Vtmp2, Vin, Vtmp1 - // FIMM Vtmp1, 0 or 0 or i32::MIN or i64::MIN - // FMAX Vtmp2, Vtmp2, Vtmp1 - // (if signed) FIMM Vtmp1, 0 - // FCMP Vin, Vin - // FCSEL Vtmp2, Vtmp1, Vtmp2, NE // on NaN, select 0 - // convert Rout, Vtmp2 - - assert!(in_ty.is_float() && (in_bits == 32 || in_bits == 64)); - assert!(out_bits == 32 || out_bits == 64); - - let min: f64 = match (out_bits, out_signed) { - (32, true) => std::i32::MIN as f64, - (32, false) => 0.0, - (64, true) => std::i64::MIN as f64, - (64, false) => 0.0, - _ => unreachable!(), - }; - - let max = match (out_bits, out_signed) { - (32, true) => std::i32::MAX as f64, - (32, false) => std::u32::MAX as f64, - (64, true) => std::i64::MAX as f64, - (64, false) => std::u64::MAX as f64, - _ => unreachable!(), - }; - - let rtmp1 = ctx.alloc_tmp(in_ty).only_reg().unwrap(); - let rtmp2 = 
ctx.alloc_tmp(in_ty).only_reg().unwrap(); - - if in_bits == 32 { - lower_constant_f32(ctx, rtmp1, max as f32); - } else { - lower_constant_f64(ctx, rtmp1, max); - } - ctx.emit(Inst::FpuRRR { - fpu_op: FPUOp2::Min, - size: ScalarSize::from_ty(in_ty), - rd: rtmp2, - rn, - rm: rtmp1.to_reg(), - }); - if in_bits == 32 { - lower_constant_f32(ctx, rtmp1, min as f32); - } else { - lower_constant_f64(ctx, rtmp1, min); - } - ctx.emit(Inst::FpuRRR { - fpu_op: FPUOp2::Max, - size: ScalarSize::from_ty(in_ty), - rd: rtmp2, - rn: rtmp2.to_reg(), - rm: rtmp1.to_reg(), - }); - if out_signed { - if in_bits == 32 { - lower_constant_f32(ctx, rtmp1, 0.0); - } else { - lower_constant_f64(ctx, rtmp1, 0.0); - } - } - ctx.emit(Inst::FpuCmp { - size: ScalarSize::from_ty(in_ty), - rn, - rm: rn, - }); - if in_bits == 32 { - ctx.emit(Inst::FpuCSel32 { - rd: rtmp2, - rn: rtmp1.to_reg(), - rm: rtmp2.to_reg(), - cond: Cond::Ne, - }); - } else { - ctx.emit(Inst::FpuCSel64 { - rd: rtmp2, - rn: rtmp1.to_reg(), - rm: rtmp2.to_reg(), - cond: Cond::Ne, - }); - } - - let cvt = match (in_bits, out_bits, out_signed) { - (32, 32, false) => FpuToIntOp::F32ToU32, - (32, 32, true) => FpuToIntOp::F32ToI32, - (32, 64, false) => FpuToIntOp::F32ToU64, - (32, 64, true) => FpuToIntOp::F32ToI64, - (64, 32, false) => FpuToIntOp::F64ToU32, - (64, 32, true) => FpuToIntOp::F64ToI32, - (64, 64, false) => FpuToIntOp::F64ToU64, - (64, 64, true) => FpuToIntOp::F64ToI64, - _ => unreachable!(), - }; - ctx.emit(Inst::FpuToInt { - op: cvt, - rd, - rn: rtmp2.to_reg(), - }); - } - } + Opcode::FcvtToUintSat | Opcode::FcvtToSintSat => implemented_in_isle(ctx), Opcode::IaddIfcout => { // This is a two-output instruction that is needed for the diff --git a/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif b/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif index 0755c94feba6..8dbaf3e1c802 100644 --- a/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif +++ b/cranelift/filetests/filetests/isa/aarch64/fcvt-small.clif @@ -9,8 +9,8 @@ block0(v0: i8): } ; block0: -; uxtb w4, w0 -; ucvtf s0, w4 +; uxtb w3, w0 +; ucvtf s0, w3 ; ret function u0:0(i8) -> f64 { @@ -20,8 +20,8 @@ block0(v0: i8): } ; block0: -; uxtb w4, w0 -; ucvtf d0, w4 +; uxtb w3, w0 +; ucvtf d0, w3 ; ret function u0:0(i16) -> f32 { @@ -31,8 +31,8 @@ block0(v0: i16): } ; block0: -; uxth w4, w0 -; ucvtf s0, w4 +; uxth w3, w0 +; ucvtf s0, w3 ; ret function u0:0(i16) -> f64 { @@ -42,8 +42,8 @@ block0(v0: i16): } ; block0: -; uxth w4, w0 -; ucvtf d0, w4 +; uxth w3, w0 +; ucvtf d0, w3 ; ret function u0:0(f32) -> i8 { @@ -55,13 +55,13 @@ block0(v0: f32): ; block0: ; fcmp s0, s0 ; b.vc 8 ; udf -; fmov s6, #-1 -; fcmp s0, s6 +; fmov s5, #-1 +; fcmp s0, s5 ; b.gt 8 ; udf ; movz x10, #17280, LSL #16 -; fmov s6, w10 -; fcmp s0, s6 -; b.mi 8 ; udf +; fmov s18, w10 +; fcmp s0, s18 +; b.lt 8 ; udf ; fcvtzu w0, s0 ; ret @@ -74,13 +74,13 @@ block0(v0: f64): ; block0: ; fcmp d0, d0 ; b.vc 8 ; udf -; fmov d6, #-1 -; fcmp d0, d6 +; fmov d5, #-1 +; fcmp d0, d5 ; b.gt 8 ; udf ; movz x10, #16496, LSL #48 -; fmov d6, x10 -; fcmp d0, d6 -; b.mi 8 ; udf +; fmov d18, x10 +; fcmp d0, d18 +; b.lt 8 ; udf ; fcvtzu w0, d0 ; ret @@ -93,13 +93,13 @@ block0(v0: f32): ; block0: ; fcmp s0, s0 ; b.vc 8 ; udf -; fmov s6, #-1 -; fcmp s0, s6 +; fmov s5, #-1 +; fcmp s0, s5 ; b.gt 8 ; udf ; movz x10, #18304, LSL #16 -; fmov s6, w10 -; fcmp s0, s6 -; b.mi 8 ; udf +; fmov s18, w10 +; fcmp s0, s18 +; b.lt 8 ; udf ; fcvtzu w0, s0 ; ret @@ -112,13 +112,13 @@ block0(v0: f64): ; block0: ; fcmp d0, d0 ; b.vc 8 ; udf -; fmov 
d6, #-1 -; fcmp d0, d6 +; fmov d5, #-1 +; fcmp d0, d5 ; b.gt 8 ; udf ; movz x10, #16624, LSL #48 -; fmov d6, x10 -; fcmp d0, d6 -; b.mi 8 ; udf +; fmov d18, x10 +; fcmp d0, d18 +; b.lt 8 ; udf ; fcvtzu w0, d0 ; ret diff --git a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif index fc7df58b2fd9..16f38886a239 100644 --- a/cranelift/filetests/filetests/isa/aarch64/floating-point.clif +++ b/cranelift/filetests/filetests/isa/aarch64/floating-point.clif @@ -333,13 +333,13 @@ block0(v0: f32): ; block0: ; fcmp s0, s0 ; b.vc 8 ; udf -; fmov s6, #-1 -; fcmp s0, s6 +; fmov s5, #-1 +; fcmp s0, s5 ; b.gt 8 ; udf ; movz x10, #20352, LSL #16 -; fmov s6, w10 -; fcmp s0, s6 -; b.mi 8 ; udf +; fmov s18, w10 +; fcmp s0, s18 +; b.lt 8 ; udf ; fcvtzu w0, s0 ; ret @@ -352,14 +352,14 @@ block0(v0: f32): ; block0: ; fcmp s0, s0 ; b.vc 8 ; udf -; movz x7, #52992, LSL #16 -; fmov s7, w7 -; fcmp s0, s7 +; movz x6, #52992, LSL #16 +; fmov s6, w6 +; fcmp s0, s6 ; b.ge 8 ; udf ; movz x12, #20224, LSL #16 -; fmov s7, w12 -; fcmp s0, s7 -; b.mi 8 ; udf +; fmov s20, w12 +; fcmp s0, s20 +; b.lt 8 ; udf ; fcvtzs w0, s0 ; ret @@ -372,13 +372,13 @@ block0(v0: f32): ; block0: ; fcmp s0, s0 ; b.vc 8 ; udf -; fmov s6, #-1 -; fcmp s0, s6 +; fmov s5, #-1 +; fcmp s0, s5 ; b.gt 8 ; udf ; movz x10, #24448, LSL #16 -; fmov s6, w10 -; fcmp s0, s6 -; b.mi 8 ; udf +; fmov s18, w10 +; fcmp s0, s18 +; b.lt 8 ; udf ; fcvtzu x0, s0 ; ret @@ -391,14 +391,14 @@ block0(v0: f32): ; block0: ; fcmp s0, s0 ; b.vc 8 ; udf -; movz x7, #57088, LSL #16 -; fmov s7, w7 -; fcmp s0, s7 +; movz x6, #57088, LSL #16 +; fmov s6, w6 +; fcmp s0, s6 ; b.ge 8 ; udf ; movz x12, #24320, LSL #16 -; fmov s7, w12 -; fcmp s0, s7 -; b.mi 8 ; udf +; fmov s20, w12 +; fcmp s0, s20 +; b.lt 8 ; udf ; fcvtzs x0, s0 ; ret @@ -411,13 +411,13 @@ block0(v0: f64): ; block0: ; fcmp d0, d0 ; b.vc 8 ; udf -; fmov d6, #-1 -; fcmp d0, d6 +; fmov d5, #-1 +; fcmp d0, d5 ; b.gt 8 ; udf ; movz x10, #16880, LSL #48 -; fmov d6, x10 -; fcmp d0, d6 -; b.mi 8 ; udf +; fmov d18, x10 +; fcmp d0, d18 +; b.lt 8 ; udf ; fcvtzu w0, d0 ; ret @@ -430,13 +430,13 @@ block0(v0: f64): ; block0: ; fcmp d0, d0 ; b.vc 8 ; udf -; ldr d6, pc+8 ; b 12 ; data.f64 -2147483649 -; fcmp d0, d6 +; ldr d5, pc+8 ; b 12 ; data.f64 -2147483649 +; fcmp d0, d5 ; b.gt 8 ; udf ; movz x10, #16864, LSL #48 -; fmov d6, x10 -; fcmp d0, d6 -; b.mi 8 ; udf +; fmov d18, x10 +; fcmp d0, d18 +; b.lt 8 ; udf ; fcvtzs w0, d0 ; ret @@ -449,13 +449,13 @@ block0(v0: f64): ; block0: ; fcmp d0, d0 ; b.vc 8 ; udf -; fmov d6, #-1 -; fcmp d0, d6 +; fmov d5, #-1 +; fcmp d0, d5 ; b.gt 8 ; udf ; movz x10, #17392, LSL #48 -; fmov d6, x10 -; fcmp d0, d6 -; b.mi 8 ; udf +; fmov d18, x10 +; fcmp d0, d18 +; b.lt 8 ; udf ; fcvtzu x0, d0 ; ret @@ -468,14 +468,14 @@ block0(v0: f64): ; block0: ; fcmp d0, d0 ; b.vc 8 ; udf -; movz x7, #50144, LSL #48 -; fmov d7, x7 -; fcmp d0, d7 +; movz x6, #50144, LSL #48 +; fmov d6, x6 +; fcmp d0, d6 ; b.ge 8 ; udf ; movz x12, #17376, LSL #48 -; fmov d7, x12 -; fcmp d0, d7 -; b.mi 8 ; udf +; fmov d20, x12 +; fcmp d0, d20 +; b.lt 8 ; udf ; fcvtzs x0, d0 ; ret @@ -566,14 +566,14 @@ block0(v0: f32): } ; block0: -; movz x6, #20352, LSL #16 -; fmov s5, w6 -; fmin s7, s0, s5 -; movi v5.2s, #0 -; fmax s7, s7, s5 +; movz x4, #20352, LSL #16 +; fmov s4, w4 +; fmin s7, s0, s4 +; movi v17.2s, #0 +; fmax s19, s7, s17 ; fcmp s0, s0 -; fcsel s7, s5, s7, ne -; fcvtzu w0, s7 +; fcsel s22, s17, s19, ne +; fcvtzu w0, s22 ; ret function %f50(f32) -> i32 { @@ 
-583,16 +583,16 @@ block0(v0: f32): } ; block0: -; movz x6, #20224, LSL #16 -; fmov s5, w6 -; fmin s7, s0, s5 +; movz x4, #20224, LSL #16 +; fmov s4, w4 +; fmin s7, s0, s4 ; movz x10, #52992, LSL #16 -; fmov s5, w10 -; fmax s7, s7, s5 -; movi v5.2s, #0 +; fmov s18, w10 +; fmax s21, s7, s18 +; movi v23.16b, #0 ; fcmp s0, s0 -; fcsel s7, s5, s7, ne -; fcvtzs w0, s7 +; fcsel s26, s23, s21, ne +; fcvtzs w0, s26 ; ret function %f51(f32) -> i64 { @@ -602,14 +602,14 @@ block0(v0: f32): } ; block0: -; movz x6, #24448, LSL #16 -; fmov s5, w6 -; fmin s7, s0, s5 -; movi v5.2s, #0 -; fmax s7, s7, s5 +; movz x4, #24448, LSL #16 +; fmov s4, w4 +; fmin s7, s0, s4 +; movi v17.2s, #0 +; fmax s19, s7, s17 ; fcmp s0, s0 -; fcsel s7, s5, s7, ne -; fcvtzu x0, s7 +; fcsel s22, s17, s19, ne +; fcvtzu x0, s22 ; ret function %f52(f32) -> i64 { @@ -619,16 +619,16 @@ block0(v0: f32): } ; block0: -; movz x6, #24320, LSL #16 -; fmov s5, w6 -; fmin s7, s0, s5 +; movz x4, #24320, LSL #16 +; fmov s4, w4 +; fmin s7, s0, s4 ; movz x10, #57088, LSL #16 -; fmov s5, w10 -; fmax s7, s7, s5 -; movi v5.2s, #0 +; fmov s18, w10 +; fmax s21, s7, s18 +; movi v23.16b, #0 ; fcmp s0, s0 -; fcsel s7, s5, s7, ne -; fcvtzs x0, s7 +; fcsel s26, s23, s21, ne +; fcvtzs x0, s26 ; ret function %f53(f64) -> i32 { @@ -638,13 +638,13 @@ block0(v0: f64): } ; block0: -; ldr d4, pc+8 ; b 12 ; data.f64 4294967295 -; fmin d6, d0, d4 -; movi v4.2s, #0 -; fmax d6, d6, d4 +; ldr d3, pc+8 ; b 12 ; data.f64 4294967295 +; fmin d5, d0, d3 +; movi v7.2s, #0 +; fmax d17, d5, d7 ; fcmp d0, d0 -; fcsel d6, d4, d6, ne -; fcvtzu w0, d6 +; fcsel d20, d7, d17, ne +; fcvtzu w0, d20 ; ret function %f54(f64) -> i32 { @@ -654,15 +654,15 @@ block0(v0: f64): } ; block0: -; ldr d4, pc+8 ; b 12 ; data.f64 2147483647 -; fmin d6, d0, d4 +; ldr d3, pc+8 ; b 12 ; data.f64 2147483647 +; fmin d5, d0, d3 ; movz x8, #49632, LSL #48 -; fmov d4, x8 -; fmax d6, d6, d4 -; movi v4.2s, #0 +; fmov d16, x8 +; fmax d19, d5, d16 +; movi v21.16b, #0 ; fcmp d0, d0 -; fcsel d6, d4, d6, ne -; fcvtzs w0, d6 +; fcsel d24, d21, d19, ne +; fcvtzs w0, d24 ; ret function %f55(f64) -> i64 { @@ -672,14 +672,14 @@ block0(v0: f64): } ; block0: -; movz x6, #17392, LSL #48 -; fmov d5, x6 -; fmin d7, d0, d5 -; movi v5.2s, #0 -; fmax d7, d7, d5 +; movz x4, #17392, LSL #48 +; fmov d4, x4 +; fmin d7, d0, d4 +; movi v17.2s, #0 +; fmax d19, d7, d17 ; fcmp d0, d0 -; fcsel d7, d5, d7, ne -; fcvtzu x0, d7 +; fcsel d22, d17, d19, ne +; fcvtzu x0, d22 ; ret function %f56(f64) -> i64 { @@ -689,16 +689,16 @@ block0(v0: f64): } ; block0: -; movz x6, #17376, LSL #48 -; fmov d5, x6 -; fmin d7, d0, d5 +; movz x4, #17376, LSL #48 +; fmov d4, x4 +; fmin d7, d0, d4 ; movz x10, #50144, LSL #48 -; fmov d5, x10 -; fmax d7, d7, d5 -; movi v5.2s, #0 +; fmov d18, x10 +; fmax d21, d7, d18 +; movi v23.16b, #0 ; fcmp d0, d0 -; fcsel d7, d5, d7, ne -; fcvtzs x0, d7 +; fcsel d26, d23, d21, ne +; fcvtzs x0, d26 ; ret function %f57(f32x2) -> f32x2 { @@ -946,3 +946,36 @@ block0(v0: f64x2, v1: f64x2, v2: f64x2): ; mov v0.16b, v2.16b ; fmla v0.2d, v17.2d, v1.2d ; ret + +function %f81(f32x2, f32x2) -> f32x2 { +block0(v0: f32x2, v1: f32x2): + v2 = fcopysign v0, v1 + return v2 +} + +; block0: +; ushr v7.2s, v1.2s, #31 +; sli v0.2s, v7.2s, #31 +; ret + +function %f82(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcopysign v0, v1 + return v2 +} + +; block0: +; ushr v7.4s, v1.4s, #31 +; sli v0.4s, v7.4s, #31 +; ret + +function %f83(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcopysign v0, v1 + return v2 +} + +; 
block0: +; ushr v7.2d, v1.2d, #63 +; sli v0.2d, v7.2d, #63 +; ret diff --git a/cranelift/filetests/filetests/runtests/simd-fcopysign-64bit.clif b/cranelift/filetests/filetests/runtests/simd-fcopysign-64bit.clif new file mode 100644 index 000000000000..253e4e74d6e8 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-fcopysign-64bit.clif @@ -0,0 +1,37 @@ +test interpret +test run +target aarch64 +; x86_64 and s390x do not support 64-bit vectors in `fcopysign`. + +function %fcopysign_f32x2(f32x2, f32x2) -> f32x2 { +block0(v0: f32x2, v1: f32x2): + v2 = fcopysign v0, v1 + return v2 +} +; run: %fcopysign_f32x2([0x9.0 -0x9.0], [0x9.0 0x9.0]) == [0x9.0 0x9.0] +; run: %fcopysign_f32x2([0x9.0 -0x9.0], [-0x9.0 -0x9.0]) == [-0x9.0 -0x9.0] +; run: %fcopysign_f32x2([0x0.0 -0x0.0], [-0x0.0 0x0.0]) == [-0x0.0 0x0.0] + +; F32 Inf +; run: %fcopysign_f32x2([Inf -Inf], [Inf Inf]) == [Inf Inf] +; run: %fcopysign_f32x2([Inf -Inf], [-Inf -Inf]) == [-Inf -Inf] + +; F32 Epsilon / Max / Min Positive +; run: %fcopysign_f32x2([0x1.000000p-23 -0x1.000000p-23], [-0x0.0 0x0.0]) == [-0x1.000000p-23 0x1.000000p-23] +; run: %fcopysign_f32x2([0x1.fffffep127 -0x1.fffffep127], [-0x0.0 0x0.0]) == [-0x1.fffffep127 0x1.fffffep127] +; run: %fcopysign_f32x2([0x1.000000p-126 -0x1.000000p-126], [-0x0.0 0x0.0]) == [-0x1.000000p-126 0x1.000000p-126] + +; F32 Subnormals +; run: %fcopysign_f32x2([0x0.800000p-126 -0x0.800000p-126], [-0x0.0 0x0.0]) == [-0x0.800000p-126 0x0.800000p-126] +; run: %fcopysign_f32x2([0x0.000002p-126 -0x0.000002p-126], [-0x0.0 0x0.0]) == [-0x0.000002p-126 0x0.000002p-126] + +; F32 NaN's +; Unlike with other operations fcopysign is guaranteed to only affect the sign bit +; run: %fcopysign_f32x2([0x0.0 0x3.0], [-NaN +sNaN:0x1]) == [-0x0.0 0x3.0] +; run: %fcopysign_f32x2([Inf +NaN], [-NaN -NaN]) == [-Inf -NaN] +; run: %fcopysign_f32x2([-NaN +NaN:0x0], [+NaN -NaN]) == [+NaN -NaN:0x0] +; run: %fcopysign_f32x2([+NaN:0x1 +NaN:0x300001], [-NaN -NaN]) == [-NaN:0x1 -NaN:0x300001] +; run: %fcopysign_f32x2([-NaN:0x0 -NaN:0x1], [+NaN +NaN]) == [+NaN:0x0 +NaN:0x1] +; run: %fcopysign_f32x2([-NaN:0x300001 +sNaN:0x1], [+NaN -NaN]) == [+NaN:0x300001 -sNaN:0x1] +; run: %fcopysign_f32x2([-sNaN:0x1 +sNaN:0x200001], [+NaN -NaN]) == [+sNaN:0x1 -sNaN:0x200001] +; run: %fcopysign_f32x2([-sNaN:0x200001 -sNaN:0x200001], [+NaN +NaN]) == [+sNaN:0x200001 +sNaN:0x200001] diff --git a/cranelift/filetests/filetests/runtests/simd-fcopysign.clif b/cranelift/filetests/filetests/runtests/simd-fcopysign.clif new file mode 100644 index 000000000000..331301038785 --- /dev/null +++ b/cranelift/filetests/filetests/runtests/simd-fcopysign.clif @@ -0,0 +1,63 @@ +test interpret +test run +target s390x +target aarch64 +; x86_64 does not support SIMD fcopysign. 
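+; On AArch64 these lower to a `ushr`/`sli` pair, so only the sign bit of
+; each lane is copied.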
+ +function %fcopysign_f32x4(f32x4, f32x4) -> f32x4 { +block0(v0: f32x4, v1: f32x4): + v2 = fcopysign v0, v1 + return v2 +} +; run: %fcopysign_f32x4([0x9.0 -0x9.0 0x9.0 -0x9.0], [0x9.0 0x9.0 -0x9.0 -0x9.0]) == [0x9.0 0x9.0 -0x9.0 -0x9.0] +; run: %fcopysign_f32x4([0x0.0 -0x0.0 0x0.0 -0x0.0], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x0.0 0x0.0 -0x0.0 0x0.0] + +; F32 Inf +; run: %fcopysign_f32x4([Inf -Inf Inf -Inf], [Inf Inf -Inf -Inf]) == [Inf Inf -Inf -Inf] + +; F32 Epsilon / Max / Min Positive +; run: %fcopysign_f32x4([0x1.000000p-23 -0x1.000000p-23 0x1.fffffep127 -0x1.fffffep127], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x1.000000p-23 0x1.000000p-23 -0x1.fffffep127 0x1.fffffep127] +; run: %fcopysign_f32x4([0x1.000000p-126 -0x1.000000p-126 0x1.000000p-126 -0x1.000000p-126], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x1.000000p-126 0x1.000000p-126 -0x1.000000p-126 0x1.000000p-126] + +; F32 Subnormals +; run: %fcopysign_f32x4([0x0.800000p-126 -0x0.800000p-126 0x0.000002p-126 -0x0.000002p-126], [-0x0.0 0x0.0 -0x0.0 0x0.0]) == [-0x0.800000p-126 0x0.800000p-126 -0x0.000002p-126 0x0.000002p-126] + +; F32 NaN's +; Unlike with other operations fcopysign is guaranteed to only affect the sign bit +; run: %fcopysign_f32x4([0x0.0 0x3.0 Inf +NaN], [-NaN +sNaN:0x1 -NaN -NaN]) == [-0x0.0 0x3.0 -Inf -NaN] +; run: %fcopysign_f32x4([-NaN +NaN:0x0 +NaN:0x1 +NaN:0x300001], [+NaN -NaN -NaN -NaN]) == [+NaN -NaN:0x0 -NaN:0x1 -NaN:0x300001] +; run: %fcopysign_f32x4([-NaN:0x0 -NaN:0x1 -NaN:0x300001 +sNaN:0x1], [+NaN +NaN +NaN -NaN]) == [+NaN:0x0 +NaN:0x1 +NaN:0x300001 -sNaN:0x1] +; run: %fcopysign_f32x4([-sNaN:0x1 +sNaN:0x200001 -sNaN:0x200001 -sNaN:0x200001], [+NaN -NaN +NaN +NaN]) == [+sNaN:0x1 -sNaN:0x200001 +sNaN:0x200001 +sNaN:0x200001] + +function %fcopysign_f64x2(f64x2, f64x2) -> f64x2 { +block0(v0: f64x2, v1: f64x2): + v2 = fcopysign v0, v1 + return v2 +} +; run: %fcopysign_f64x2([0x9.0 -0x9.0], [0x9.0 0x9.0]) == [0x9.0 0x9.0] +; run: %fcopysign_f64x2([0x9.0 -0x9.0], [-0x9.0 -0x9.0]) == [-0x9.0 -0x9.0] +; run: %fcopysign_f64x2([0x0.0 -0x0.0], [-0x0.0 0x0.0]) == [-0x0.0 0x0.0] + +; F64 Inf +; run: %fcopysign_f64x2([Inf -Inf], [Inf Inf]) == [Inf Inf] +; run: %fcopysign_f64x2([Inf -Inf], [-Inf -Inf]) == [-Inf -Inf] + +; F64 Epsilon / Max / Min Positive +; run: %fcopysign_f64x2([0x1.0000000000000p-52 -0x1.0000000000000p-52], [-0x0.0 0x0.0]) == [-0x1.0000000000000p-52 0x1.0000000000000p-52] +; run: %fcopysign_f64x2([0x1.fffffffffffffp1023 -0x1.fffffffffffffp1023], [-0x0.0 0x0.0]) == [-0x1.fffffffffffffp1023 0x1.fffffffffffffp1023] +; run: %fcopysign_f64x2([0x1.0000000000000p-1022 -0x1.0000000000000p-1022], [-0x0.0 0x0.0]) == [-0x1.0000000000000p-1022 0x1.0000000000000p-1022] + +; F64 Subnormals +; run: %fcopysign_f64x2([0x0.8000000000000p-1022 -0x0.8000000000000p-1022], [-0x0.0 0x0.0]) == [-0x0.8000000000000p-1022 0x0.8000000000000p-1022] +; run: %fcopysign_f64x2([0x0.0000000000001p-1022 -0x0.0000000000001p-1022], [-0x0.0 0x0.0]) == [-0x0.0000000000001p-1022 0x0.0000000000001p-1022] + +; F64 NaN's +; Unlike with other operations fcopysign is guaranteed to only affect the sign bit +; run: %fcopysign_f64x2([0x0.0 0x3.0], [-NaN +sNaN:0x1]) == [-0x0.0 0x3.0] +; run: %fcopysign_f64x2([Inf +NaN], [-NaN -NaN]) == [-Inf -NaN] +; run: %fcopysign_f64x2([-NaN +NaN:0x0], [+NaN -NaN]) == [+NaN -NaN:0x0] +; run: %fcopysign_f64x2([+NaN:0x1 +NaN:0x4000000000001], [-NaN -NaN]) == [-NaN:0x1 -NaN:0x4000000000001] +; run: %fcopysign_f64x2([-NaN:0x0 -NaN:0x1], [+NaN +NaN]) == [+NaN:0x0 +NaN:0x1] +; run: %fcopysign_f64x2([-NaN:0x4000000000001 
+sNaN:0x1], [+NaN -NaN]) == [+NaN:0x4000000000001 -sNaN:0x1]
+; run: %fcopysign_f64x2([-sNaN:0x1 +sNaN:0x4000000000001], [+NaN -NaN]) == [+sNaN:0x1 -sNaN:0x4000000000001]
+; run: %fcopysign_f64x2([-sNaN:0x4000000000001 -sNaN:0x4000000000001], [+NaN +NaN]) == [+sNaN:0x4000000000001 +sNaN:0x4000000000001]
diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs
index 8b3860acdce4..c30dd92b6408 100644
--- a/cranelift/interpreter/src/step.rs
+++ b/cranelift/interpreter/src/step.rs
@@ -808,7 +808,19 @@ where
         }
         Opcode::Fneg => assign(Value::neg(arg(0)?)?),
         Opcode::Fabs => assign(Value::abs(arg(0)?)?),
-        Opcode::Fcopysign => binary(Value::copysign, arg(0)?, arg(1)?)?,
+        Opcode::Fcopysign => {
+            let arg0 = extractlanes(&arg(0)?, ctrl_ty)?;
+            let arg1 = extractlanes(&arg(1)?, ctrl_ty)?;
+
+            assign(vectorizelanes(
+                &arg0
+                    .into_iter()
+                    .zip(arg1.into_iter())
+                    .map(|(x, y)| V::copysign(x, y))
+                    .collect::<ValueResult<SimdVec<V>>>()?,
+                ctrl_ty,
+            )?)
+        }
         Opcode::Fmin => assign(match (arg(0)?, arg(1)?) {
            (a, _) if a.is_nan()? => a,
            (_, b) if b.is_nan()? => b,