cranelift/codegen/src/isa/aarch64/lower.isle

;; aarch64 instruction selection and CLIF-to-MachInst lowering.

;; The main lowering constructor term: takes a clif `Inst` and returns the
;; register(s) within which the lowered instruction's result values live.
;;
;; The term is declared `partial`, so it is allowed to have no matching rule
;; for a given `Inst`.
(decl partial lower (Inst) InstOutput)

;; Variant of the main lowering constructor term, which receives an
;; additional argument (a vector of branch targets to be used) for
;; implementing branches.
;;
;; For two-branch instructions, the first target is `taken` and the second
;; is `not_taken`, even if it is a Fallthrough instruction: because we reorder
;; blocks while we lower, the fallthrough in the new order is not (necessarily)
;; the same as the fallthrough in CLIF. So, we use the explicitly-provided
;; target.
;;
;; Like `lower`, this term is `partial`; it produces `Unit` rather than result
;; registers.
(decl partial lower_branch (Inst MachLabelSlice) Unit)

;;;; Rules for `iconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Hand the constant's `u64` payload to `imm`, requesting `ImmExtend.Zero`,
;; to materialize it in the instruction's result type.
(rule (lower (has_type const_ty (iconst (u64_from_imm64 bits))))
      (imm const_ty (ImmExtend.Zero) bits))

;;;; Rules for `null` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `null` is lowered through `imm` as the constant 0 of the result type.
(rule (lower (has_type ref_ty (null)))
      (imm ref_ty (ImmExtend.Zero) 0))

;;;; Rules for `f32const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Pass the `u32` produced by `u32_from_ieee32` straight to `constant_f32`.
(rule (lower (f32const (u32_from_ieee32 bits)))
      (constant_f32 bits))

;;;; Rules for `f64const` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Pass the `u64` produced by `u64_from_ieee64` straight to `constant_f64`.
(rule (lower (f64const (u64_from_ieee64 bits)))
      (constant_f64 bits))

;;;; Rules for `nop` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `nop` emits nothing here; its lowering simply yields `invalid_reg`.
(rule (lower (nop)) (invalid_reg))

;;;; Rules for `iadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller

;; Fallback (lowest priority of the scalar rules): register + register.
(rule -1 (lower (has_type (fits_in_64 ty) (iadd lhs rhs)))
      (add ty lhs rhs))

;; Either operand may be a constant that fits a 12-bit immediate; the other
;; operand then becomes the register input of the immediate form.
(rule 4 (lower (has_type (fits_in_64 ty) (iadd lhs (imm12_from_value imm12_y))))
      (add_imm ty lhs imm12_y))

(rule 5 (lower (has_type (fits_in_64 ty) (iadd (imm12_from_value imm12_x) rhs)))
      (add_imm ty rhs imm12_x))

;; Likewise, but when it is the *negated* constant that fits in 12 bits: the
;; addition is rewritten as a subtraction of that negated immediate.
(rule 2 (lower (has_type (fits_in_64 ty) (iadd lhs rhs)))
      (if-let neg_imm (imm12_from_negated_value rhs))
      (sub_imm ty lhs neg_imm))

(rule 3 (lower (has_type (fits_in_64 ty) (iadd lhs rhs)))
      (if-let neg_imm (imm12_from_negated_value lhs))
      (sub_imm ty rhs neg_imm))

;; When one operand is itself an extension, the extending operation is folded
;; into the add; the other operand is the plain register input.
(rule 0 (lower (has_type (fits_in_64 ty) (iadd lhs (extended_value_from_value ext))))
      (add_extend ty lhs ext))

(rule 1 (lower (has_type (fits_in_64 ty) (iadd (extended_value_from_value ext) rhs)))
      (add_extend ty rhs ext))

;; When one operand is a left shift by a constant that `lshl_from_imm64`
;; accepts for `ty`, the shift is folded into the add as well.
(rule 7 (lower (has_type (fits_in_64 ty)
                       (iadd lhs (ishl shiftee (iconst shamt)))))
      (if-let amt (lshl_from_imm64 ty shamt))
      (add_shift ty lhs shiftee amt))

(rule 6 (lower (has_type (fits_in_64 ty)
                       (iadd (ishl shiftee (iconst shamt)) rhs)))
      (if-let amt (lshl_from_imm64 ty shamt))
      (add_shift ty rhs shiftee amt))

;; `addend + (m1 * m2)`, with the multiply on either side, becomes one `madd`.
(rule 7 (lower (has_type (fits_in_64 ty) (iadd addend (imul m1 m2))))
      (madd ty m1 m2 addend))

(rule 6 (lower (has_type (fits_in_64 ty) (iadd (imul m1 m2) addend)))
      (madd ty m1 m2 addend))

;; `minuend - (m1 * m2)` becomes one `msub`. Only the right-hand operand is
;; matched as a multiply.
(rule (lower (has_type (fits_in_64 ty) (isub minuend (imul m1 m2))))
      (msub ty m1 m2 minuend))

;; vectors: lane-wise add, with the vector size derived from the result type.

(rule -2 (lower (has_type vec_ty @ (multi_lane _ _) (iadd lhs rhs)))
      (add_vec lhs rhs (vector_size vec_ty)))

;; `i128`
(rule -3 (lower (has_type $I128 (iadd lhs rhs)))
      (let
          ;; Registers 0 and 1 of `lhs` (low and high halves).
          ((lhs_pair ValueRegs lhs)
           (lhs_lo Reg (value_regs_get lhs_pair 0))
           (lhs_hi Reg (value_regs_get lhs_pair 1))

           ;; Registers 0 and 1 of `rhs` (low and high halves).
           (rhs_pair ValueRegs rhs)
           (rhs_lo Reg (value_regs_get rhs_pair 0))
           (rhs_hi Reg (value_regs_get rhs_pair 1)))
        ;; A flag-setting add (`adds`) of the low halves is paired with an
        ;; `adc` of the high halves; together they form the low/high result.
        (with_flags
          (add_with_flags_paired $I64 lhs_lo rhs_lo)
          (adc_paired $I64 lhs_hi rhs_hi))))

;;;; Rules for `shuffle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; A shuffle that broadcasts one lane of the first input to every destination
;; lane is a `dup`. Only lanes of the first operand are recognized for now, so
;; the second operand is ignored; rules selecting from it could be added later
;; if needed.
(rule 6 (lower (shuffle src _ (shuffle_dup8_from_imm lane)))
        (vec_dup_from_fpu src (VectorSize.Size8x16) lane))
(rule 5 (lower (shuffle src _ (shuffle_dup16_from_imm lane)))
        (vec_dup_from_fpu src (VectorSize.Size16x8) lane))
(rule 4 (lower (shuffle src _ (shuffle_dup32_from_imm lane)))
        (vec_dup_from_fpu src (VectorSize.Size32x4) lane))
(rule 3 (lower (shuffle src _ (shuffle_dup64_from_imm lane)))
        (vec_dup_from_fpu src (VectorSize.Size64x2) lane))

;; If the `Immediate` specified to the extractor looks like a duplication of the
;; `n`th lane of the first vector of size K-byte lanes, then each extractor
;; returns the `n` value as a `u8` to be used as part of a `vec_dup_from_fpu`
;; instruction. Note that there's a different extractor for each bit-width of
;; lane.
;;
;; All four are `extern` extractors: only their signatures are declared here
;; and the matching logic is implemented outside this file.
(decl shuffle_dup8_from_imm (u8) Immediate)
(extern extractor shuffle_dup8_from_imm shuffle_dup8_from_imm)
(decl shuffle_dup16_from_imm (u8) Immediate)
(extern extractor shuffle_dup16_from_imm shuffle_dup16_from_imm)
(decl shuffle_dup32_from_imm (u8) Immediate)
(extern extractor shuffle_dup32_from_imm shuffle_dup32_from_imm)
(decl shuffle_dup64_from_imm (u8) Immediate)
(extern extractor shuffle_dup64_from_imm shuffle_dup64_from_imm)

;; When the shuffle looks like "concatenate the two inputs and shift right by
;; `n` bytes" (i.e. the mask is a run of consecutive byte indices starting at
;; `n`), that's an `ext` instruction.
(rule 2 (lower (shuffle first second (vec_extract_imm4_from_immediate byte_off)))
        (vec_extract first second byte_off))

;; Attempts to extract `n` from the specified shuffle `Immediate` where each
;; byte of the `Immediate` is a consecutive sequence starting from `n`. This
;; value of `n` is used as part of the `vec_extract` instruction which extracts
;; consecutive bytes from two vectors into one final vector, offset by `n`
;; bytes.
;;
;; Declared here as an `extern` extractor; the implementation lives outside
;; this file.
(decl vec_extract_imm4_from_immediate (u8) Immediate)
(extern extractor vec_extract_imm4_from_immediate vec_extract_imm4_from_immediate)

;; Rules for the `uzp1` and `uzp2` instructions which gather even-numbered lanes
;; or odd-numbered lanes
;;
;; Each mask is matched exactly. Reading a mask from its least-significant byte
;; upward, byte indices 0x00-0x0f refer to `a` and 0x10-0x1f refer to `b`. The
;; `uzp1` masks list the even-numbered lanes (0, 2, ...) of `a` followed by
;; those of `b`; the `uzp2` masks list the odd-numbered lanes (1, 3, ...) in
;; the same order. One pair of rules exists per lane width (8/16/32/64 bits).
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1e1c_1a18_1614_1210_0e0c_0a08_0604_0200)))
      (vec_uzp1 a b (VectorSize.Size8x16)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1d_1b19_1715_1311_0f0d_0b09_0705_0301)))
      (vec_uzp2 a b (VectorSize.Size8x16)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1d1c_1918_1514_1110_0d0c_0908_0504_0100)))
      (vec_uzp1 a b (VectorSize.Size16x8)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e_1b1a_1716_1312_0f0e_0b0a_0706_0302)))
      (vec_uzp2 a b (VectorSize.Size16x8)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1b1a1918_13121110_0b0a0908_03020100)))
      (vec_uzp1 a b (VectorSize.Size32x4)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_17161514_0f0e0d0c_07060504)))
      (vec_uzp2 a b (VectorSize.Size32x4)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1716151413121110_0706050403020100)))
      (vec_uzp1 a b (VectorSize.Size64x2)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c1b1a1918_0f0e0d0c0b0a0908)))
      (vec_uzp2 a b (VectorSize.Size64x2)))

;; Rules for the `zip1` and `zip2` instructions which interleave lanes in the
;; low or high halves of the two input vectors.
;;
;; Reading each mask from its least-significant byte upward, the `zip1` masks
;; alternate lane 0 of `a`, lane 0 of `b`, lane 1 of `a`, lane 1 of `b`, ...
;; over the low half of each input (byte indices 0x00-0x07 and 0x10-0x17); the
;; `zip2` masks do the same over the high halves (0x08-0x0f and 0x18-0x1f).
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1707_1606_1505_1404_1303_1202_1101_1000)))
      (vec_zip1 a b (VectorSize.Size8x16)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f0f_1e0e_1d0d_1c0c_1b0b_1a0a_1909_1808)))
      (vec_zip2 a b (VectorSize.Size8x16)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1716_0706_1514_0504_1312_0302_1110_0100)))
      (vec_zip1 a b (VectorSize.Size16x8)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1d1c_0d0c_1b1a_0b0a_1918_0908)))
      (vec_zip2 a b (VectorSize.Size16x8)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x17161514_07060504_13121110_03020100)))
      (vec_zip1 a b (VectorSize.Size32x4)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_1b1a1918_0b0a0908)))
      (vec_zip2 a b (VectorSize.Size32x4)))
;; Note that zip1/zip2 for i64x2 vectors is omitted since it's already covered
;; by the i64x2 cases of uzp1/uzp2 above where both zip and uzp have the same
;; semantics for 64-bit lanes.

;; Rules for the `trn1` and `trn2` instructions which interleave the
;; even-numbered (`trn1`) or odd-numbered (`trn2`) lanes of the two input
;; vectors.
;;
;; Reading each mask from its least-significant byte upward, the `trn1` masks
;; alternate lane 0 of `a`, lane 0 of `b`, lane 2 of `a`, lane 2 of `b`, ...;
;; the `trn2` masks do the same starting from lane 1 (lanes 1, 3, ...).
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1e0e_1c0c_1a0a_1808_1606_1404_1202_1000)))
      (vec_trn1 a b (VectorSize.Size8x16)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f0f_1d0d_1b0b_1909_1707_1505_1303_1101)))
      (vec_trn2 a b (VectorSize.Size8x16)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1d1c_0d0c_1918_0908_1514_0504_1110_0100)))
      (vec_trn1 a b (VectorSize.Size16x8)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e_0f0e_1b1a_0b0a_1716_0706_1312_0302)))
      (vec_trn2 a b (VectorSize.Size16x8)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1b1a1918_0b0a0908_13121110_03020100)))
      (vec_trn1 a b (VectorSize.Size32x4)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x1f1e1d1c_0f0e0d0c_17161514_07060504)))
      (vec_trn2 a b (VectorSize.Size32x4)))
;; Note that trn1/trn2 for i64x2 vectors is omitted since it's already covered
;; by the i64x2 cases of uzp1/uzp2 above where both trn and uzp have the same
;; semantics for 64-bit lanes.

;; Rules for the `rev{16,32,64}` instructions where reversals happen at either
;; the byte level, the 16-bit level, or 32-bit level. Note that all of these
;; patterns only match reversals in the first operand, but they can
;; theoretically be extended if necessary to reversals in the second operand.
;;
;; Every byte index in these masks is below 0x10, i.e. only bytes of `a` are
;; selected, which is why `b` does not appear on the right-hand sides. The
;; `revN` name gives the width of the container being reversed and the
;; `VectorSize` gives the width of the elements reordered within it; e.g. the
;; first mask (0x01, 0x00, 0x03, 0x02, ... from the least-significant byte)
;; swaps the two bytes of every 16-bit unit.
(rule 1 (lower (shuffle a b (u128_from_immediate 0x0e0f_0c0d_0a0b_0809_0607_0405_0203_0001)))
      (rev16 a (VectorSize.Size8x16)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x0c0d0e0f_08090a0b_04050607_00010203)))
      (rev32 a (VectorSize.Size8x16)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x0d0c0f0e_09080b0a_05040706_01000302)))
      (rev32 a (VectorSize.Size16x8)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x08090a0b0c0d0e0f_0001020304050607)))
      (rev64 a (VectorSize.Size8x16)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x09080b0a0d0c0f0e_0100030205040706)))
      (rev64 a (VectorSize.Size16x8)))
(rule 1 (lower (shuffle a b (u128_from_immediate 0x0b0a09080f0e0d0c_0302010007060504)))
      (rev64 a (VectorSize.Size32x4)))

;; Catch-all for any other mask (default priority, below every specialized
;; shuffle rule above): materialize the 128-bit mask as a constant and pass it
;; to `vec_tbl2` together with both inputs.
(rule (lower (has_type out_ty (shuffle lhs rhs (u128_from_immediate sel))))
      (let ((sel_reg Reg (constant_f128 sel)))
       (vec_tbl2 lhs rhs sel_reg out_ty)))

;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `swizzle` maps directly onto `vec_tbl` with its two operands in order.
;; NOTE(review): `vec_i128_ty` looks like an ordinary (unused) variable binding
;; rather than a type constraint, so this rule would match any result type --
;; confirm that is intended.
(rule (lower (has_type vec_i128_ty (swizzle src lane_idx)))
      (vec_tbl src lane_idx))

;;;; Rules for `isplit` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; An `i128` already lives in two registers; `isplit` just returns register 0
;; and register 1 as its two outputs.
(rule (lower (isplit wide @ (value_type $I128)))
      (let ((halves ValueRegs wide)
            (lo ValueRegs (value_regs_get halves 0))
            (hi ValueRegs (value_regs_get halves 1)))
        (output_pair lo hi)))

;;;; Rules for `iconcat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The inverse of `isplit`: pair the two inputs up as the registers of one
;; `i128` value.
(rule (lower (has_type $I128 (iconcat low_half high_half)))
      (output (value_regs low_half high_half)))

;;;; Rules for `scalar_to_vector` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Floating-point scalars go through `fpu_extend` with the scalar's size.
(rule (lower (has_type $F32X4 (scalar_to_vector scalar)))
      (fpu_extend scalar (ScalarSize.Size32)))

(rule (lower (has_type $F64X2 (scalar_to_vector scalar)))
      (fpu_extend scalar (ScalarSize.Size64)))

;; Integer scalars are moved into the vector register file with `mov_to_fpu`;
;; inputs of 32 bits or fewer are first zero-extended to 32 bits.
(rule -1 (lower (scalar_to_vector scalar @ (value_type $I64)))
      (mov_to_fpu scalar (ScalarSize.Size64)))

(rule -2 (lower (scalar_to_vector scalar @ (value_type (int_fits_in_32 _))))
      (mov_to_fpu (put_in_reg_zext32 scalar) (ScalarSize.Size32)))

;;;; Rules for `vall_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 64x2 lanes:
;;
;;   cmeq vtmp.2d, vm.2d, #0
;;   addp dtmp, vtmp.2d
;;   fcmp dtmp, dtmp
;;   cset xd, eq
;;
;; After the ADDP the temporary is 0 when every input lane is true (i.e.
;; non-zero); otherwise it is -1 or -2 as an integer, which is a NaN bit
;; pattern. Since NaNs are the only floating-point values that compare unequal
;; to themselves, comparing the temporary with itself and testing `eq` yields
;; the answer.
(rule (lower (vall_true vec @ (value_type (multi_lane 64 2))))
      (let ((zero_lanes Reg (cmeq0 vec (VectorSize.Size64x2)))
            (folded Reg (addp zero_lanes zero_lanes (VectorSize.Size64x2))))
       (with_flags (fpu_cmp (ScalarSize.Size64) folded folded)
                   (materialize_bool_result (Cond.Eq)))))

;; 32x2 lanes: both lanes are moved into one 64-bit general-purpose register.
;; The zero register is first compared against that value shifted by 32, and
;; the result is then chained into a 32-bit conditional compare of the same
;; register against 0.
;; NOTE(review): presumably the shifted compare tests the upper lane and the
;; `ccmp` the lower lane -- confirm against `cmp_rr_shift` / `ccmp_imm`.
(rule (lower (vall_true vec @ (value_type (multi_lane 32 2))))
      (let ((both_lanes Reg (mov_from_vec vec 0 (ScalarSize.Size64))))
       (with_flags (cmp_rr_shift (OperandSize.Size64) (zero_reg) both_lanes 32)
                   (ccmp_imm
                    (OperandSize.Size32)
                    both_lanes
                    (u8_into_uimm5 0)
                    (nzcv $false $true $false $false)
                    (Cond.Ne)))))

;; Remaining vectors with lanes of at most 32 bits (excluding 32x2, handled
;; above): reduce with uminv to a scalar, then compare that against zero.
;;
;;   uminv bn, vm.16b
;;   mov xm, vn.d[0]
;;   cmp xm, #0
;;   cset xm, ne
(rule -1 (lower (vall_true vec @ (value_type (lane_fits_in_32 ty))))
      (if (not_vec32x2 ty))
      (let ((lane_min Reg (vec_lanes (VecLanesOp.Uminv) vec (vector_size ty)))
            (min_bits Reg (mov_from_vec lane_min 0 (ScalarSize.Size64))))
       (with_flags (cmp_imm (OperandSize.Size64) min_bits (u8_into_imm12 0))
                   (materialize_bool_result (Cond.Ne)))))

;;;; Rules for `vany_true` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `vanytrue` produces the flags for the input vector; the boolean result is
;; materialized from the `ne` condition.
(rule (lower (vany_true vec @ (value_type vec_ty)))
      (with_flags (vanytrue vec vec_ty)
                  (materialize_bool_result (Cond.Ne))))

;;;; Rules for `iadd_pairwise` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The four special cases below fire only when both operands widen the *same*
;; vector (low half on the left, high half on the right); the whole pattern is
;; then a single widening pairwise add.

;; `i16x8.extadd_pairwise_i8x16_s` (wasm)
(rule (lower (has_type $I16X8 (iadd_pairwise (swiden_low v) (swiden_high v))))
      (saddlp8 v))

;; `i32x4.extadd_pairwise_i16x8_s` (wasm)
(rule (lower (has_type $I32X4 (iadd_pairwise (swiden_low v) (swiden_high v))))
      (saddlp16 v))

;; `i16x8.extadd_pairwise_i8x16_u` (wasm)
(rule (lower (has_type $I16X8 (iadd_pairwise (uwiden_low v) (uwiden_high v))))
      (uaddlp8 v))

;; `i32x4.extadd_pairwise_i16x8_u` (wasm)
(rule (lower (has_type $I32X4 (iadd_pairwise (uwiden_low v) (uwiden_high v))))
      (uaddlp16 v))

;; General case: plain `addp` on the two operands.
(rule -1 (lower (has_type vec_ty (iadd_pairwise lhs rhs)))
      (addp lhs rhs (vector_size vec_ty)))

;;;; Rules for `iabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; vectors
(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (iabs v)))
      (vec_abs v (vector_size vec_ty)))

;; `i64`
(rule 2 (lower (has_type $I64 (iabs v)))
      (abs (OperandSize.Size64) v))

;; 32 bits and narrower: sign-extend to 32 bits first, then a 32-bit `abs`.
(rule 1 (lower (has_type (fits_in_32 _) (iabs v)))
      (abs (OperandSize.Size32) (put_in_reg_sext32 v)))

;; `i128`, following the `rustc` implementation:
;; - build a mask that is all 1s if the input is negative and all 0s otherwise
;;   (arithmetic shift of the high half by 63);
;; - xor both halves with the mask, then subtract the mask from the result
;;   (flag-setting subtract on the low half, subtract-with-carry on the high).
;; - for a non-negative `x` the xor leaves `x` unchanged and the mask is 0, so
;;   the result is `x - 0`;
;; - for a negative `x` the xor gives `~x` and the mask is -1, so the result is
;;   `~x - (-1) = ~x + 1`, which is exactly `abs(x)`.
(rule (lower (has_type $I128 (iabs v)))
      (let ((parts ValueRegs v)
            (lo Reg (value_regs_get parts 0))
            (hi Reg (value_regs_get parts 1))
            (sign_mask Reg (asr_imm $I64 hi (imm_shift_from_u8 63)))
            (flipped_hi Reg (eor $I64 hi sign_mask))
            (flipped_lo Reg (eor $I64 lo sign_mask))
            (sub_lo ProducesFlags (sub_with_flags_paired $I64 flipped_lo sign_mask))
            (sub_hi ConsumesFlags (sbc_paired $I64 flipped_hi sign_mask)))
       (with_flags sub_lo sub_hi)))

;;;; Rules for `avg_round` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64x2` is computed by hand as
;;   (lhs >> 1) + (rhs >> 1) + ((lhs | rhs) & 1)
;; using unsigned shifts.
(rule (lower (has_type $I64X2 (avg_round lhs rhs)))
      (let ((ones Reg (splat_const 1 (VectorSize.Size64x2)))
            (either Reg (orr_vec lhs rhs (VectorSize.Size64x2)))
            (round_bit Reg (and_vec either ones (VectorSize.Size64x2)))
            (lhs_half Reg (ushr_vec_imm lhs 1 (VectorSize.Size64x2)))
            (rhs_half Reg (ushr_vec_imm rhs 1 (VectorSize.Size64x2)))
            (halves_sum Reg (add_vec lhs_half rhs_half (VectorSize.Size64x2))))
       (add_vec round_bit halves_sum (VectorSize.Size64x2))))

;; Lanes of 32 bits or fewer map directly onto `urhadd`.
(rule -1 (lower (has_type (lane_fits_in_32 vec_ty) (avg_round lhs rhs)))
      (vec_rrr (VecALUOp.Urhadd) lhs rhs (vector_size vec_ty)))

;;;; Rules for `sqmul_round_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Maps directly onto the vector `sqrdmulh` operation.
(rule (lower (has_type vec_ty @ (multi_lane _ _) (sqmul_round_sat lhs rhs)))
      (vec_rrr (VecALUOp.Sqrdmulh) lhs rhs (vector_size vec_ty)))

;;;; Rules for `fadd` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Each binary floating-point operation in this group has two rules: a vector
;; form (`vec_rrr`, sized from the result type) and a scalar form (`fpu_rrr`,
;; sized from the scalar float type).
(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (fadd lhs rhs)))
      (vec_rrr (VecALUOp.Fadd) lhs rhs (vector_size vec_ty)))

(rule (lower (has_type (ty_scalar_float flt_ty) (fadd lhs rhs)))
      (fpu_rrr (FPUOp2.Add) lhs rhs (scalar_size flt_ty)))

;;;; Rules for `fsub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (fsub lhs rhs)))
      (vec_rrr (VecALUOp.Fsub) lhs rhs (vector_size vec_ty)))

(rule (lower (has_type (ty_scalar_float flt_ty) (fsub lhs rhs)))
      (fpu_rrr (FPUOp2.Sub) lhs rhs (scalar_size flt_ty)))

;;;; Rules for `fmul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (fmul lhs rhs)))
      (vec_rrr (VecALUOp.Fmul) lhs rhs (vector_size vec_ty)))

(rule (lower (has_type (ty_scalar_float flt_ty) (fmul lhs rhs)))
      (fpu_rrr (FPUOp2.Mul) lhs rhs (scalar_size flt_ty)))

;;;; Rules for `fdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (fdiv lhs rhs)))
      (vec_rrr (VecALUOp.Fdiv) lhs rhs (vector_size vec_ty)))

(rule (lower (has_type (ty_scalar_float flt_ty) (fdiv lhs rhs)))
      (fpu_rrr (FPUOp2.Div) lhs rhs (scalar_size flt_ty)))

;;;; Rules for `fmin` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (fmin lhs rhs)))
      (vec_rrr (VecALUOp.Fmin) lhs rhs (vector_size vec_ty)))

(rule (lower (has_type (ty_scalar_float flt_ty) (fmin lhs rhs)))
      (fpu_rrr (FPUOp2.Min) lhs rhs (scalar_size flt_ty)))

;;;; Rules for `fmax` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (fmax lhs rhs)))
      (vec_rrr (VecALUOp.Fmax) lhs rhs (vector_size vec_ty)))

(rule (lower (has_type (ty_scalar_float flt_ty) (fmax lhs rhs)))
      (fpu_rrr (FPUOp2.Max) lhs rhs (scalar_size flt_ty)))

;;;; Rules for `sqrt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Each unary floating-point operation in this group has a vector form
;; (`vec_misc`) and a scalar form (`fpu_rr`).
(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (sqrt operand)))
      (vec_misc (VecMisc2.Fsqrt) operand (vector_size vec_ty)))

(rule (lower (has_type (ty_scalar_float flt_ty) (sqrt operand)))
      (fpu_rr (FPUOp1.Sqrt) operand (scalar_size flt_ty)))

;;;; Rules for `fneg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (fneg operand)))
      (vec_misc (VecMisc2.Fneg) operand (vector_size vec_ty)))

(rule (lower (has_type (ty_scalar_float flt_ty) (fneg operand)))
      (fpu_rr (FPUOp1.Neg) operand (scalar_size flt_ty)))

;;;; Rules for `fabs` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (fabs operand)))
      (vec_misc (VecMisc2.Fabs) operand (vector_size vec_ty)))

(rule (lower (has_type (ty_scalar_float flt_ty) (fabs operand)))
      (fpu_rr (FPUOp1.Abs) operand (scalar_size flt_ty)))

;;;; Rules for `fpromote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; F32 -> F64: the size argument is that of the *source* operand.
(rule (lower (has_type $F64 (fpromote narrow)))
      (fpu_rr (FPUOp1.Cvt32To64) narrow (ScalarSize.Size32)))

;;;; Rules for `fdemote` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; F64 -> F32: again sized by the source operand.
(rule (lower (has_type $F32 (fdemote wide)))
      (fpu_rr (FPUOp1.Cvt64To32) wide (ScalarSize.Size64)))

;;;; Rules for `ceil` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Each rounding operation has a vector form (`vec_misc` with the matching
;; `frint*` op) and one scalar rule per float width (`fpu_round` with the
;; matching `FpuRoundMode`).
(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (ceil operand)))
      (vec_misc (VecMisc2.Frintp) operand (vector_size vec_ty)))

(rule (lower (has_type $F32 (ceil operand)))
      (fpu_round (FpuRoundMode.Plus32) operand))

(rule (lower (has_type $F64 (ceil operand)))
      (fpu_round (FpuRoundMode.Plus64) operand))

;;;; Rules for `floor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (floor operand)))
      (vec_misc (VecMisc2.Frintm) operand (vector_size vec_ty)))

(rule (lower (has_type $F32 (floor operand)))
      (fpu_round (FpuRoundMode.Minus32) operand))

(rule (lower (has_type $F64 (floor operand)))
      (fpu_round (FpuRoundMode.Minus64) operand))

;;;; Rules for `trunc` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (trunc operand)))
      (vec_misc (VecMisc2.Frintz) operand (vector_size vec_ty)))

(rule (lower (has_type $F32 (trunc operand)))
      (fpu_round (FpuRoundMode.Zero32) operand))

(rule (lower (has_type $F64 (trunc operand)))
      (fpu_round (FpuRoundMode.Zero64) operand))

;;;; Rules for `nearest` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule -1 (lower (has_type vec_ty @ (multi_lane _ _) (nearest operand)))
      (vec_misc (VecMisc2.Frintn) operand (vector_size vec_ty)))

(rule (lower (has_type $F32 (nearest operand)))
      (fpu_round (FpuRoundMode.Nearest32) operand))

(rule (lower (has_type $F64 (nearest operand)))
      (fpu_round (FpuRoundMode.Nearest64) operand))

;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Result of looking through an optional `fneg`: `negate` is 1 when an `fneg`
;; was peeled off and 0 otherwise; `value` is the operand underneath.
(type IsFneg (enum (Result (negate u64) (value Value))))

;; Peel a single `fneg` off a value if there is one.
(decl pure is_fneg (Value) IsFneg)
(rule 1 (is_fneg (fneg inner)) (IsFneg.Result 1 inner))
(rule 0 (is_fneg other) (IsFneg.Result 0 other))

;; Accessor for the `negate` field.
(decl pure is_fneg_neg (IsFneg) u64)
(rule (is_fneg_neg (IsFneg.Result flag _)) flag)

;; Accessor for the `value` field.
(decl pure get_fneg_value (IsFneg) Value)
(rule (get_fneg_value (IsFneg.Result _ val)) val)

;; Emit the scalar fused op: selector 0 picks `MAdd`, selector 1 picks `MSub`.
(decl fmadd_series (Type u64 Value Value Value) InstOutput)
(rule 0 (fmadd_series (ty_scalar_float ty) 0 a b c)
        (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) a b c))
(rule 0 (fmadd_series (ty_scalar_float ty) 1 a b c)
        (fpu_rrrr (FPUOp3.MSub) (scalar_size ty) a b c))

;; Scalar `fma`: strip an `fneg` from each multiplicand and xor the two
;; `negate` flags, so exactly one negation selects `MSub` while zero or two
;; negations select `MAdd`.
(rule (lower (has_type (ty_scalar_float ty) (fma x_src y_src z)))
  (let ((x_info IsFneg (is_fneg x_src))
        (y_info IsFneg (is_fneg y_src)))
    (fmadd_series ty
                  (u64_xor (is_fneg_neg x_info) (is_fneg_neg y_info))
                  (get_fneg_value x_info)
                  (get_fneg_value y_info)
                  z)))

;; Vector `fma` is handed to `lower_fmla`, starting from the `Fmla` operation.
(rule 1 (lower (has_type vec_ty @ (multi_lane _ _) (fma m1 m2 acc)))
        (lower_fmla (VecALUModOp.Fmla) m1 m2 acc (vector_size vec_ty)))

;; Lowers a vector fused-multiply-add, trying the various forms of the
;; instruction so as to cover as much of what AArch64 offers as possible.
;; Arguments are the operation, the two multiplicands, the addend and the
;; vector size.
(decl lower_fmla (VecALUModOp Value Value Value VectorSize) Reg)

;; Base case: emit the requested op as-is.
(rule (lower_fmla op m1 m2 acc size)
      (vec_rrr_mod op acc m1 m2 size))

;; If one of the multiplicands is a splat, the by-element form can be used
;; instead, with element index 0.
(rule 1 (lower_fmla op (splat scalar) m2 acc size)
        (vec_fmla_elem op acc m2 scalar size 0))
(rule 2 (lower_fmla op m1 (splat scalar) acc size)
        (vec_fmla_elem op acc m1 scalar size 0))

;; If one of the multiplicands is a shuffle that broadcasts a single element
;; of a vector, the by-element form applies just like for splat, using the
;; broadcast lane as the element index.
;;
;; Cranelift's shuffle always has i8x16 inputs and outputs, so a `bitcast` is
;; matched explicitly: that is the main way a shuffle result reaches this
;; instruction.
(rule 3 (lower_fmla op (bitcast _ (shuffle src src (shuffle32_from_imm lane lane lane lane))) m2 acc size @ (VectorSize.Size32x4))
        (if-let $true (u64_lt lane 4))
        (vec_fmla_elem op acc m2 src size lane))
(rule 4 (lower_fmla op m1 (bitcast _ (shuffle src src (shuffle32_from_imm lane lane lane lane))) acc size @ (VectorSize.Size32x4))
        (if-let $true (u64_lt lane 4))
        (vec_fmla_elem op acc m1 src size lane))
(rule 3 (lower_fmla op (bitcast _ (shuffle src src (shuffle64_from_imm lane lane))) m2 acc size @ (VectorSize.Size64x2))
        (if-let $true (u64_lt lane 2))
        (vec_fmla_elem op acc m2 src size lane))
(rule 4 (lower_fmla op m1 (bitcast _ (shuffle src src (shuffle64_from_imm lane lane))) acc size @ (VectorSize.Size64x2))
        (if-let $true (u64_lt lane 2))
        (vec_fmla_elem op acc m1 src size lane))

;; If one of the multiplicands is an `fneg`, peel it away, flip the operation
;; with `neg_fmla`, and recurse to generate the actual instruction.
;;
;; These are the highest-priority `lower_fmla` rules so that as many `fneg`
;; operations as possible are peeled before any other form is chosen.
(rule 5 (lower_fmla op (fneg m1) m2 acc size)
        (lower_fmla (neg_fmla op) m1 m2 acc size))
(rule 6 (lower_fmla op m1 (fneg m2) acc size)
        (lower_fmla (neg_fmla op) m1 m2 acc size))

;; Swap `Fmla` and `Fmls`.
(decl neg_fmla (VecALUModOp) VecALUModOp)
(rule (neg_fmla (VecALUModOp.Fmla)) (VecALUModOp.Fmls))
(rule (neg_fmla (VecALUModOp.Fmls)) (VecALUModOp.Fmla))

;;;; Rules for `fcopysign` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; All types are delegated to the `fcopy_sign` helper.
(rule (lower (has_type out_ty (fcopysign magnitude sign_src)))
      (fcopy_sign magnitude sign_src out_ty))

;;;; Rules for `fcvt_to_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; One rule per (source float type, result width) pair. Results of 32 bits or
;; fewer use the `...ToU32` op and pass the actual result type along; `i64`
;; results use the `...ToU64` op.
;; NOTE(review): the `$false` argument is presumably the "signed" flag of
;; `fpu_to_int_cvt` -- confirm against its declaration.
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint src @ (value_type $F32))))
      (fpu_to_int_cvt (FpuToIntOp.F32ToU32) src $false $F32 out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_uint src @ (value_type $F32))))
      (fpu_to_int_cvt (FpuToIntOp.F32ToU64) src $false $F32 $I64))

(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint src @ (value_type $F64))))
      (fpu_to_int_cvt (FpuToIntOp.F64ToU32) src $false $F64 out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_uint src @ (value_type $F64))))
      (fpu_to_int_cvt (FpuToIntOp.F64ToU64) src $false $F64 $I64))

;;;; Rules for `fcvt_to_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Same shape as `fcvt_to_uint`, using the `...ToI32` / `...ToI64` ops and
;; passing `$true` where the unsigned rules pass `$false`.
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint src @ (value_type $F32))))
      (fpu_to_int_cvt (FpuToIntOp.F32ToI32) src $true $F32 out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_sint src @ (value_type $F32))))
      (fpu_to_int_cvt (FpuToIntOp.F32ToI64) src $true $F32 $I64))

(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint src @ (value_type $F64))))
      (fpu_to_int_cvt (FpuToIntOp.F64ToI32) src $true $F64 out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_sint src @ (value_type $F64))))
      (fpu_to_int_cvt (FpuToIntOp.F64ToI64) src $true $F64 $I64))

;;;; Rules for `fcvt_from_uint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vectors: `ucvtf`, only when source and result have the same lane width
;; (32 -> 32 or 64 -> 64).
(rule -1 (lower (has_type vec_ty @ (multi_lane 32 _) (fcvt_from_uint src @ (value_type (multi_lane 32 _)))))
      (vec_misc (VecMisc2.Ucvtf) src (vector_size vec_ty)))

(rule -1 (lower (has_type vec_ty @ (multi_lane 64 _) (fcvt_from_uint src @ (value_type (multi_lane 64 _)))))
      (vec_misc (VecMisc2.Ucvtf) src (vector_size vec_ty)))

;; Scalars of 32 bits or fewer are zero-extended to 32 bits and converted with
;; the `U32To*` ops.
(rule (lower (has_type $F32 (fcvt_from_uint src @ (value_type (fits_in_32 _)))))
      (int_to_fpu (IntToFpuOp.U32ToF32) (put_in_reg_zext32 src)))

(rule (lower (has_type $F64 (fcvt_from_uint src @ (value_type (fits_in_32 _)))))
      (int_to_fpu (IntToFpuOp.U32ToF64) (put_in_reg_zext32 src)))

;; `i64` scalars are converted directly with the `U64To*` ops.
(rule 1 (lower (has_type $F32 (fcvt_from_uint src @ (value_type $I64))))
      (int_to_fpu (IntToFpuOp.U64ToF32) src))

(rule 1 (lower (has_type $F64 (fcvt_from_uint src @ (value_type $I64))))
      (int_to_fpu (IntToFpuOp.U64ToF64) src))

;;;; Rules for `fcvt_from_sint` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vectors: `scvtf`, only when source and result have the same lane width
;; (32 -> 32 or 64 -> 64).
(rule -1 (lower (has_type vec_ty @ (multi_lane 32 _) (fcvt_from_sint src @ (value_type (multi_lane 32 _)))))
      (vec_misc (VecMisc2.Scvtf) src (vector_size vec_ty)))

(rule -1 (lower (has_type vec_ty @ (multi_lane 64 _) (fcvt_from_sint src @ (value_type (multi_lane 64 _)))))
      (vec_misc (VecMisc2.Scvtf) src (vector_size vec_ty)))

;; Scalars of 32 bits or fewer are sign-extended to 32 bits and converted with
;; the `I32To*` ops.
(rule (lower (has_type $F32 (fcvt_from_sint src @ (value_type (fits_in_32 _)))))
      (int_to_fpu (IntToFpuOp.I32ToF32) (put_in_reg_sext32 src)))

(rule (lower (has_type $F64 (fcvt_from_sint src @ (value_type (fits_in_32 _)))))
      (int_to_fpu (IntToFpuOp.I32ToF64) (put_in_reg_sext32 src)))

;; `i64` scalars are converted directly with the `I64To*` ops.
(rule 1 (lower (has_type $F32 (fcvt_from_sint src @ (value_type $I64))))
      (int_to_fpu (IntToFpuOp.I64ToF32) src))

(rule 1 (lower (has_type $F64 (fcvt_from_sint src @ (value_type $I64))))
      (int_to_fpu (IntToFpuOp.I64ToF64) src))

;;;; Rules for `fcvt_to_uint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vectors: `fcvtzu`, only when source and result have the same lane width.
(rule -1 (lower (has_type vec_ty @ (multi_lane 32 _) (fcvt_to_uint_sat src @ (value_type (multi_lane 32 _)))))
      (vec_misc (VecMisc2.Fcvtzu) src (vector_size vec_ty)))

(rule -1 (lower (has_type vec_ty @ (multi_lane 64 _) (fcvt_to_uint_sat src @ (value_type (multi_lane 64 _)))))
      (vec_misc (VecMisc2.Fcvtzu) src (vector_size vec_ty)))

;; Scalars: one rule per (source float type, result width) pair, all going
;; through `fpu_to_int_cvt_sat` with `$false` and the result type.
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint_sat src @ (value_type $F32))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToU32) src $false out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_uint_sat src @ (value_type $F32))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToU64) src $false $I64))

(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_uint_sat src @ (value_type $F64))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToU32) src $false out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_uint_sat src @ (value_type $F64))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToU64) src $false $I64))

;;;; Rules for `fcvt_to_sint_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vectors: `fcvtzs`, only when source and result have the same lane width.
(rule -1 (lower (has_type vec_ty @ (multi_lane 32 _) (fcvt_to_sint_sat src @ (value_type (multi_lane 32 _)))))
      (vec_misc (VecMisc2.Fcvtzs) src (vector_size vec_ty)))

(rule -1 (lower (has_type vec_ty @ (multi_lane 64 _) (fcvt_to_sint_sat src @ (value_type (multi_lane 64 _)))))
      (vec_misc (VecMisc2.Fcvtzs) src (vector_size vec_ty)))

;; Scalars: one rule per (source float type, result width) pair, all going
;; through `fpu_to_int_cvt_sat` with `$true` and the result type.
(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint_sat src @ (value_type $F32))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToI32) src $true out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_sint_sat src @ (value_type $F32))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F32ToI64) src $true $I64))

(rule (lower (has_type (fits_in_32 out_ty) (fcvt_to_sint_sat src @ (value_type $F64))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToI32) src $true out_ty))

(rule 1 (lower (has_type $I64 (fcvt_to_sint_sat src @ (value_type $F64))))
      (fpu_to_int_cvt_sat (FpuToIntOp.F64ToI64) src $true $I64))

;;;; Rules for `isub` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller

;; Base case, simply subtracting things in registers.
(rule -4 (lower (has_type (fits_in_64 ty) (isub x y)))
      (sub ty x y))

;; Special case for when one operand is an immediate that fits in 12 bits.
(rule 0 (lower (has_type (fits_in_64 ty) (isub x (imm12_from_value y))))
      (sub_imm ty x y))

;; Same as the previous special case, except we can switch the subtraction to an
;; addition if the negated immediate fits in 12 bits.
(rule 2 (lower (has_type (fits_in_64 ty) (isub x y)))
      (if-let imm12_neg (imm12_from_negated_value y))
      (add_imm ty x imm12_neg))

;; The subtrahend is an extended register: the extending operation is folded
;; into the `sub` itself.
(rule 1 (lower (has_type (fits_in_64 ty) (isub lhs (extended_value_from_value rhs_ext))))
      (sub_extend ty lhs rhs_ext))

;; The subtrahend is another register shifted left by a constant amount: the
;; shift is folded into the `sub`.
(rule -3 (lower (has_type (fits_in_64 ty)
                          (isub lhs (ishl shifted (iconst shift_imm)))))
      (if-let shift_amt (lshl_from_imm64 ty shift_imm))
      (sub_shift ty lhs shifted shift_amt))

;; Vector subtraction.
(rule -2 (lower (has_type vec_ty @ (multi_lane _ _) (isub lhs rhs)))
      (sub_vec lhs rhs (vector_size vec_ty)))

;; `i128` subtraction.
(rule -1 (lower (has_type $I128 (isub lhs rhs)))
      (sub_i128 lhs rhs))

;;;; Rules for `uadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unsigned saturating add on 128-bit vectors.
(rule (lower (has_type (ty_vec128 vec_ty) (uadd_sat lhs rhs)))
      (uqadd lhs rhs (vector_size vec_ty)))

;;;; Rules for `sadd_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Signed saturating add on 128-bit vectors.
(rule (lower (has_type (ty_vec128 vec_ty) (sadd_sat lhs rhs)))
      (sqadd lhs rhs (vector_size vec_ty)))

;;;; Rules for `usub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unsigned saturating subtract on 128-bit vectors.
(rule (lower (has_type (ty_vec128 vec_ty) (usub_sat lhs rhs)))
      (uqsub lhs rhs (vector_size vec_ty)))

;;;; Rules for `ssub_sat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Signed saturating subtract on 128-bit vectors.
(rule (lower (has_type (ty_vec128 vec_ty) (ssub_sat lhs rhs)))
      (sqsub lhs rhs (vector_size vec_ty)))

;;;; Rules for `ineg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller: negation is a subtraction from the zero register.
(rule 1 (lower (has_type (fits_in_64 ty) (ineg src)))
      (sub ty (zero_reg) src))

;; `i128`: a 128-bit subtraction from an all-zero register pair.
(rule 2 (lower (has_type $I128 (ineg src)))
      (sub_i128 (value_regs_zero) src))

;; 128-bit vectors: lane-wise negate.
(rule (lower (has_type (ty_vec128 vec_ty) (ineg src)))
      (neg src (vector_size vec_ty)))

;;;; Rules for `imul` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller: a multiply-add whose addend is the zero register.
(rule -3 (lower (has_type (fits_in_64 ty) (imul lhs rhs)))
      (madd ty lhs rhs (zero_reg)))

;; `i128`.
;;
;; Both operands are split into 64-bit halves (register 0 is the low half,
;; register 1 the high half). The 128-bit product is then:
;;
;;   dst_lo = x_lo * y_lo
;;   dst_hi = umulhi(x_lo, y_lo) + (x_lo * y_hi) + (x_hi * y_lo)
;;
;; which is emitted as the following sequence:
;;
;;   umulh   dst_hi, x_lo, y_lo
;;   madd    dst_hi, x_lo, y_hi, dst_hi
;;   madd    dst_hi, x_hi, y_lo, dst_hi
;;   madd    dst_lo, x_lo, y_lo, zero
(rule -1 (lower (has_type $I128 (imul x y)))
      (let ((lhs ValueRegs x)
            (lhs_lo Reg (value_regs_get lhs 0))
            (lhs_hi Reg (value_regs_get lhs 1))
            (rhs ValueRegs y)
            (rhs_lo Reg (value_regs_get rhs 0))
            (rhs_hi Reg (value_regs_get rhs 1))
            ;; High half, accumulated one term at a time.
            (hi_carry Reg (umulh $I64 lhs_lo rhs_lo))
            (hi_partial Reg (madd $I64 lhs_lo rhs_hi hi_carry))
            (prod_hi Reg (madd $I64 lhs_hi rhs_lo hi_partial))
            ;; Low half.
            (prod_lo Reg (madd $I64 lhs_lo rhs_lo (zero_reg))))
        (value_regs prod_lo prod_hi)))

;; 128-bit vectors other than i64x2 (i.e. i8x16, i16x8 and i32x4): a single
;; lane-wise multiply.
(rule -2 (lower (has_type (ty_vec128 vec_ty @ (not_i64x2)) (imul lhs rhs)))
      (mul lhs rhs (vector_size vec_ty)))

;; Special lowering for i64x2.
;;
;; This I64X2 multiplication is performed with several 32-bit
;; operations.
;;
;; 64-bit numbers x and y, can be represented as:
;;   x = a + 2^32(b)
;;   y = c + 2^32(d)
;;
;; A 64-bit multiplication is:
;;   x * y = ac + 2^32(ad + bc) + 2^64(bd)
;; note: `2^64(bd)` can be ignored, the value is too large to fit in
;; 64 bits.
;;
;; This sequence implements a I64X2 multiply, where the registers
;; `rn` and `rm` are split up into 32-bit components:
;;   rn = |d|c|b|a|
;;   rm = |h|g|f|e|
;;
;;   rn * rm = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
;;
;;  The sequence is:
;;  rev64 rd.4s, rm.4s
;;  mul rd.4s, rd.4s, rn.4s
;;  xtn tmp1.2s, rn.2d
;;  addp rd.4s, rd.4s, rd.4s
;;  xtn tmp2.2s, rm.2d
;;  shll rd.2d, rd.2s, #32
;;  umlal rd.2d, tmp2.2s, tmp1.2s
;;
;; The `let` bindings below appear in the same order as the instructions in
;; the sequence above; the lane diagrams in the comments list lanes from the
;; most significant to the least significant.
(rule -1 (lower (has_type $I64X2 (imul x y)))
      (let ((rn Reg x)
            (rm Reg y)
            ;; Reverse the 32-bit elements in the 64-bit words.
            ;;   rd = |g|h|e|f|
            (rev Reg (rev64 rm (VectorSize.Size32x4)))

            ;; Calculate the high half components.
            ;;   rd = |dg|ch|be|af|
            ;;
            ;; Note that this 32-bit multiply of the high half
            ;; discards the bits that would overflow, same as
            ;; if 64-bit operations were used. Also the Shll
            ;; below would shift out the overflow bits anyway.
            (mul Reg (mul rev rn (VectorSize.Size32x4)))

            ;; Extract the low half components of rn.
            ;;   tmp1 = |c|a|
            (tmp1 Reg (xtn rn (ScalarSize.Size32)))

            ;; Sum the respective high half components. Both `addp` inputs are
            ;; the same register, so the pairwise sums appear twice.
            ;;   rd = |dg+ch|be+af|dg+ch|be+af|
            (sum Reg (addp mul mul (VectorSize.Size32x4)))

            ;; Extract the low half components of rm.
            ;;   tmp2 = |g|e|
            (tmp2 Reg (xtn rm (ScalarSize.Size32)))

            ;; Shift the high half components, into the high half.
            ;;   rd = |dg+ch << 32|be+af << 32|
            (shift Reg (shll32 sum $false))

            ;; Multiply the low components together, and accumulate with the high
            ;; half.
            ;;   rd = |rd[1] + cg|rd[0] + ae|
            (result Reg (umlal32 shift tmp2 tmp1 $false)))
        result))

;; Widening `i8x16` -> `i16x8` multiplies. When both operands of the `imul` are
;; the same kind of widening of an `i8x16` vector, the widening is folded into
;; a single long multiply. The trailing boolean is `$false` for the `*_low`
;; forms and `$true` for the `*_high` forms.

;; `i16x8.extmul_low_i8x16_s`.
(rule (lower (has_type $I16X8
                       (imul (swiden_low lhs @ (value_type $I8X16))
                             (swiden_low rhs @ (value_type $I8X16)))))
      (smull8 lhs rhs $false))

;; `i16x8.extmul_high_i8x16_s`.
(rule (lower (has_type $I16X8
                       (imul (swiden_high lhs @ (value_type $I8X16))
                             (swiden_high rhs @ (value_type $I8X16)))))
      (smull8 lhs rhs $true))

;; `i16x8.extmul_low_i8x16_u`.
(rule (lower (has_type $I16X8
                       (imul (uwiden_low lhs @ (value_type $I8X16))
                             (uwiden_low rhs @ (value_type $I8X16)))))
      (umull8 lhs rhs $false))

;; `i16x8.extmul_high_i8x16_u`.
(rule (lower (has_type $I16X8
                       (imul (uwiden_high lhs @ (value_type $I8X16))
                             (uwiden_high rhs @ (value_type $I8X16)))))
      (umull8 lhs rhs $true))

;; Widening `i16x8` -> `i32x4` multiplies. When both operands of the `imul` are
;; the same kind of widening of an `i16x8` vector, the widening is folded into
;; a single long multiply. The trailing boolean is `$false` for the `*_low`
;; forms and `$true` for the `*_high` forms.

;; `i32x4.extmul_low_i16x8_s`.
(rule (lower (has_type $I32X4
                       (imul (swiden_low lhs @ (value_type $I16X8))
                             (swiden_low rhs @ (value_type $I16X8)))))
      (smull16 lhs rhs $false))

;; `i32x4.extmul_high_i16x8_s`.
(rule (lower (has_type $I32X4
                       (imul (swiden_high lhs @ (value_type $I16X8))
                             (swiden_high rhs @ (value_type $I16X8)))))
      (smull16 lhs rhs $true))

;; `i32x4.extmul_low_i16x8_u`.
(rule (lower (has_type $I32X4
                       (imul (uwiden_low lhs @ (value_type $I16X8))
                             (uwiden_low rhs @ (value_type $I16X8)))))
      (umull16 lhs rhs $false))

;; `i32x4.extmul_high_i16x8_u`.
(rule (lower (has_type $I32X4
                       (imul (uwiden_high lhs @ (value_type $I16X8))
                             (uwiden_high rhs @ (value_type $I16X8)))))
      (umull16 lhs rhs $true))

;; Widening `i32x4` -> `i64x2` multiplies. When both operands of the `imul` are
;; the same kind of widening of an `i32x4` vector, the widening is folded into
;; a single long multiply. The trailing boolean is `$false` for the `*_low`
;; forms and `$true` for the `*_high` forms.

;; `i64x2.extmul_low_i32x4_s`.
(rule (lower (has_type $I64X2
                       (imul (swiden_low lhs @ (value_type $I32X4))
                             (swiden_low rhs @ (value_type $I32X4)))))
      (smull32 lhs rhs $false))

;; `i64x2.extmul_high_i32x4_s`.
(rule (lower (has_type $I64X2
                       (imul (swiden_high lhs @ (value_type $I32X4))
                             (swiden_high rhs @ (value_type $I32X4)))))
      (smull32 lhs rhs $true))

;; `i64x2.extmul_low_i32x4_u`.
(rule (lower (has_type $I64X2
                       (imul (uwiden_low lhs @ (value_type $I32X4))
                             (uwiden_low rhs @ (value_type $I32X4)))))
      (umull32 lhs rhs $false))

;; `i64x2.extmul_high_i32x4_u`.
(rule (lower (has_type $I64X2
                       (imul (uwiden_high lhs @ (value_type $I32X4))
                             (uwiden_high rhs @ (value_type $I32X4)))))
      (umull32 lhs rhs $true))

;;;; Rules for `smulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 64-bit operands are lowered directly to `smulh`.
(rule 1 (lower (has_type $I64 (smulhi lhs rhs)))
      (smulh $I64 lhs rhs))

;; 32 bits and narrower: sign-extend both operands to 64 bits, compute the
;; 64-bit product (a multiply-add with a zero addend), then arithmetically
;; shift it right by the bit width of the type.
(rule (lower (has_type (fits_in_32 ty) (smulhi lhs rhs)))
      (let ((lhs64 Reg (put_in_reg_sext64 lhs))
            (rhs64 Reg (put_in_reg_sext64 rhs))
            (product Reg (madd $I64 lhs64 rhs64 (zero_reg))))
        (asr_imm $I64 product (imm_shift_from_u8 (ty_bits ty)))))

;;;; Rules for `umulhi` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 64-bit operands are lowered directly to `umulh`.
(rule 1 (lower (has_type $I64 (umulhi lhs rhs)))
      (umulh $I64 lhs rhs))

;; 32 bits and narrower: zero-extend both operands to 64 bits, compute the
;; 64-bit product (a multiply-add with a zero addend), then logically shift it
;; right by the bit width of the type.
(rule (lower (has_type (fits_in_32 ty) (umulhi lhs rhs)))
      (let ((lhs64 Reg (put_in_reg_zext64 lhs))
            (rhs64 Reg (put_in_reg_zext64 rhs))
            (product Reg (madd $I64 lhs64 rhs64 (zero_reg))))
        (value_reg (lsr_imm $I64 product (imm_shift_from_u8 (ty_bits ty))))))

;;;; Rules for `udiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: Add UDiv32 to implement 32-bit directly, rather
;; than extending the input.
;;
;; aarch64's `udiv` doesn't trap, so the zero check that CLIF's `udiv`
;; semantics require is performed manually when the divisor is placed in a
;; register. Both operands are zero-extended to 64 bits and a 64-bit division
;; is emitted.
(rule (lower (has_type (fits_in_64 ty) (udiv dividend divisor)))
      (a64_udiv $I64
                (put_in_reg_zext64 dividend)
                (put_nonzero_in_reg_zext64 divisor)))

;; Places a `Value` into a `Reg`, zero-extended to 64 bits, and validates that
;; it is nonzero.
(decl put_nonzero_in_reg_zext64 (Value) Reg)

;; General case: materialize the value and emit the zero-divisor trap check.
(rule -1 (put_nonzero_in_reg_zext64 divisor)
      (trap_if_zero_divisor (put_in_reg_zext64 divisor)))

;; A constant that is known to be nonzero is simply materialized as an
;; immediate, without any check.
(rule (put_nonzero_in_reg_zext64 (and (value_type ty)
                                      (iconst (nonzero_u64_from_imm64 nonzero))))
      (imm ty (ImmExtend.Zero) nonzero))

;;;; Rules for `sdiv` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; TODO: Add SDiv32 to implement 32-bit directly, rather
;; than extending the input.
;;
;; Note that the `sdiv` instruction does not trap on divide by zero or on
;; overflow, so the checks need to be manually inserted. The sequence of
;; checks here should look like:
;;
;;   cbnz rm, #8
;;   udf ; divide by zero
;;   cmn rm, 1
;;   ccmp rn, 1, #nzcv, eq
;;   b.vc #8
;;   udf ; signed overflow
;;
;; TODO: if `y` is -1 then a check that `x` is not INT_MIN is all that's
;; necessary, but right now `y` is checked to not be -1 as well.
(rule (lower (has_type (fits_in_64 ty) (sdiv dividend divisor)))
      (let ((dividend64 Reg (put_in_reg_sext64 dividend))
            (divisor64 Reg (put_nonzero_in_reg_sext64 divisor))
            (checked_dividend Reg (trap_if_div_overflow ty dividend64 divisor64)))
        (a64_sdiv $I64 checked_dividend divisor64)))

;; Division by a constant that `safe_divisor_from_imm64` accepts: the checks
;; would always pass, so none are emitted.
(rule 1 (lower (has_type (fits_in_64 ty) (sdiv dividend (iconst divisor_imm))))
      (if-let safe_divisor (safe_divisor_from_imm64 ty divisor_imm))
      (a64_sdiv $I64
                (put_in_reg_sext64 dividend)
                (imm ty (ImmExtend.Sign) safe_divisor)))

;; Places a `Value` into a `Reg`, sign-extended to 64 bits, and validates that
;; it is nonzero.
(decl put_nonzero_in_reg_sext64 (Value) Reg)

;; General case: materialize the value and emit the zero-divisor trap check.
(rule -1 (put_nonzero_in_reg_sext64 divisor)
      (trap_if_zero_divisor (put_in_reg_sext64 divisor)))

;; A constant that is known to be nonzero is materialized as a sign-extended
;; immediate and the zero check is skipped.
(rule (put_nonzero_in_reg_sext64 (and (value_type ty)
                                      (iconst (nonzero_u64_from_imm64 nonzero))))
      (imm ty (ImmExtend.Sign) nonzero))

;;;; Rules for `urem` and `srem` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The remainder `x % y` is computed from the quotient:
;;
;;   tmp = x / y
;;   result = x - (tmp*y)
;;
;; Reusing 'result' for tmp gives the following sequence:
;;
;;   cbnz y, #8         ; branch over trap
;;   udf                ; divide by zero
;;   div rd, x, y       ; rd = x / y
;;   msub rd, rd, y, x  ; rd = x - rd * y

;; Unsigned: operands are zero-extended to 64 bits.
(rule (lower (has_type (fits_in_64 ty) (urem x y)))
      (let ((dividend Reg (put_in_reg_zext64 x))
            (divisor Reg (put_nonzero_in_reg_zext64 y))
            (quotient Reg (a64_udiv $I64 dividend divisor)))
        (msub $I64 quotient divisor dividend)))

;; Signed: operands are sign-extended to 64 bits.
(rule (lower (has_type (fits_in_64 ty) (srem x y)))
      (let ((dividend Reg (put_in_reg_sext64 x))
            (divisor Reg (put_nonzero_in_reg_sext64 y))
            (quotient Reg (a64_sdiv $I64 dividend divisor)))
        (msub $I64 quotient divisor dividend)))

;;; Rules for integer min/max: umin, smin, umax, smax ;;;;;;;;;;;;;;;;;;;;;;;;;

;; `i64` and smaller.

;; Helper that compares two values and selects one of them with the given
;; condition:
;;
;;   cmp     $x, $y
;;   csel    .., $x, $y, $cc
;;
;; The `bool` argument says whether narrow operands are extended as signed
;; (`$true`) or unsigned (`$false`) values; the general rule ignores it.
(decl cmp_and_choose (Type Cond bool Value Value) ValueRegs)
(rule (cmp_and_choose (fits_in_64 ty) cc _ x y)
      (let ((lhs Reg (put_in_reg x))
            (rhs Reg (put_in_reg y)))
        (with_flags_reg (cmp (operand_size ty) lhs rhs)
                        (csel cc lhs rhs))))

;; `i16` and `i8` min/max require the operands to be extended first (sign- or
;; zero-extension, as requested by `signed`), as the comparison operates on
;; (at least) 32 bits.
(rule 1 (cmp_and_choose (fits_in_16 ty) cc signed x y)
      (let ((lhs Reg (extend (put_in_reg x) signed (ty_bits ty) 32))
            (rhs Reg (extend (put_in_reg y) signed (ty_bits ty) 32)))
        (with_flags_reg (cmp (operand_size ty) lhs rhs)
                        (csel cc lhs rhs))))

;; Scalar integer min/max: each instruction maps onto `cmp_and_choose` with
;; the matching condition code and signedness flag.
(rule 2 (lower (has_type (and (fits_in_64 ty) (ty_int _)) (umin lhs rhs)))
      (cmp_and_choose ty (Cond.Lo) $false lhs rhs))

(rule 2 (lower (has_type (and (fits_in_64 ty) (ty_int _)) (smin lhs rhs)))
      (cmp_and_choose ty (Cond.Lt) $true lhs rhs))

(rule 2 (lower (has_type (and (fits_in_64 ty) (ty_int _)) (umax lhs rhs)))
      (cmp_and_choose ty (Cond.Hi) $false lhs rhs))

(rule 2 (lower (has_type (and (fits_in_64 ty) (ty_int _)) (smax lhs rhs)))
      (cmp_and_choose ty (Cond.Gt) $true lhs rhs))

;; Vector types.
;;
;; Every type other than i64x2 maps onto the corresponding vector ALU op. For
;; i64x2 a lane-wise comparison produces a mask that is passed to `bsl`
;; together with both operands.

(rule (lower (has_type vec_ty @ (not_i64x2) (smin lhs rhs)))
      (vec_rrr (VecALUOp.Smin) lhs rhs (vector_size vec_ty)))

(rule 1 (lower (has_type $I64X2 (smin lhs rhs)))
      (let ((mask Reg (vec_rrr (VecALUOp.Cmgt) rhs lhs (VectorSize.Size64x2))))
        (bsl $I64X2 mask lhs rhs)))

(rule (lower (has_type vec_ty @ (not_i64x2) (umin lhs rhs)))
      (vec_rrr (VecALUOp.Umin) lhs rhs (vector_size vec_ty)))

(rule 1 (lower (has_type $I64X2 (umin lhs rhs)))
      (let ((mask Reg (vec_rrr (VecALUOp.Cmhi) rhs lhs (VectorSize.Size64x2))))
        (bsl $I64X2 mask lhs rhs)))

(rule (lower (has_type vec_ty @ (not_i64x2) (smax lhs rhs)))
      (vec_rrr (VecALUOp.Smax) lhs rhs (vector_size vec_ty)))

(rule 1 (lower (has_type $I64X2 (smax lhs rhs)))
      (let ((mask Reg (vec_rrr (VecALUOp.Cmgt) lhs rhs (VectorSize.Size64x2))))
        (bsl $I64X2 mask lhs rhs)))

(rule (lower (has_type vec_ty @ (not_i64x2) (umax lhs rhs)))
      (vec_rrr (VecALUOp.Umax) lhs rhs (vector_size vec_ty)))

(rule 1 (lower (has_type $I64X2 (umax lhs rhs)))
      (let ((mask Reg (vec_rrr (VecALUOp.Cmhi) lhs rhs (VectorSize.Size64x2))))
        (bsl $I64X2 mask lhs rhs)))

;;;; Rules for `uextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Fallback for outputs that fit in a single register: an explicit
;; zero-extension from the input width to the output width.
(rule -2 (lower (has_type (fits_in_64 out_ty) (uextend src @ (value_type in_ty))))
      (extend src $false (ty_bits in_ty) (ty_bits out_ty)))

;; Extraction of a vector lane automatically extends as necessary, so the lane
;; move alone is enough and no explicit extending instruction is emitted.
(rule 1 (lower (has_type (fits_in_64 out_ty)
                         (uextend (extractlane vec @ (value_type vec_ty)
                                               (u8_from_uimm8 lane_idx)))))
      (mov_from_vec (put_in_reg vec) lane_idx (lane_size vec_ty)))

;; Atomic loads also zero their upper bits automatically, so when the load can
;; be sunk into this instruction the `uextend` is effectively skipped.
(rule 1 (lower (has_type (fits_in_64 out_ty)
                         (uextend src @ (and (value_type in_ty) (atomic_load flags _)))))
      (if-let atomic (is_sinkable_inst src))
      (load_acquire in_ty flags (sink_atomic_load atomic)))

;; Extension to `i128`: the low half is the input zero-extended to 64 bits and
;; the high half is a zero constant.
(rule -1 (lower (has_type $I128 (uextend src)))
      (value_regs (put_in_reg_zext64 src)
                  (imm $I64 (ImmExtend.Zero) 0)))

;; As with the narrower outputs, extracting a vector lane already
;; zero-extends, so extending a lane to `i128` only needs the lane move for the
;; low half plus a zero constant for the high half.
(rule (lower (has_type $I128
                       (uextend (extractlane vec @ (value_type vec_ty)
                                             (u8_from_uimm8 lane_idx)))))
      (value_regs (mov_from_vec (put_in_reg vec) lane_idx (lane_size vec_ty))
                  (imm $I64 (ImmExtend.Zero) 0)))

;; A zero-extension of a sinkable `load` is encoded in the load itself: the
;; load is sunk into this instruction and replaced by a zero-extending load of
;; the input type.
(rule (lower (has_type (fits_in_64 _)
                       (uextend src @ (has_type in_ty (load flags addr off)))))
      (if-let load_inst (is_sinkable_inst src))
      (let ((_ Unit (sink_inst load_inst)))
        (aarch64_uload in_ty (amode in_ty addr off) flags)))

;; Picks the zero-extending load matching the width of the loaded type.
(decl aarch64_uload (Type AMode MemFlags) Reg)
(rule (aarch64_uload $I8 mem flags) (aarch64_uload8 mem flags))
(rule (aarch64_uload $I16 mem flags) (aarch64_uload16 mem flags))
(rule (aarch64_uload $I32 mem flags) (aarch64_uload32 mem flags))

;;;; Rules for `sextend` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Fallback for outputs that fit in a single register: an explicit
;; sign-extension from the input width to the output width.
(rule -4 (lower (has_type (fits_in_64 out_ty) (sextend src @ (value_type in_ty))))
      (extend src $true (ty_bits in_ty) (ty_bits out_ty)))

;; Extraction of a vector lane automatically extends as necessary, so the
;; signed lane move alone is enough and no explicit extending instruction is
;; emitted.
(rule -3 (lower (has_type (fits_in_64 out_ty)
                          (sextend (extractlane vec @ (value_type vec_ty)
                                                (u8_from_uimm8 lane_idx)))))
      (mov_from_vec_signed (put_in_reg vec)
                           lane_idx
                           (vector_size vec_ty)
                           (size_from_ty out_ty)))

;; Extension to `i128`: the low half is the input sign-extended to 64 bits and
;; the high half is the low half arithmetically shifted right by 63, i.e. the
;; sign bit replicated across all 64 bits.
(rule -2 (lower (has_type $I128 (sextend src)))
      (let ((low Reg (put_in_reg_sext64 src))
            (high Reg (asr_imm $I64 low (imm_shift_from_u8 63))))
        (value_regs low high)))

;; When the input is an extracted vector lane, the signed lane move produces
;; the sign-extended 64-bit low half directly; the high half is again the
;; replicated sign bit.
;;
;; Note that `mov_from_vec_signed` doesn't exist for i64x2, so that's
;; specifically excluded here.
(rule (lower (has_type $I128
                       (sextend (extractlane vec @ (value_type vec_ty @ (not_i64x2))
                                             (u8_from_uimm8 lane_idx)))))
      (let ((low Reg (mov_from_vec_signed (put_in_reg vec)
                                          lane_idx
                                          (vector_size vec_ty)
                                          (size_from_ty $I64)))
            (high Reg (asr_imm $I64 low (imm_shift_from_u8 63))))
        (value_regs low high)))

;; Extension from an extraction of i64x2 into i128: the lane is moved out with
;; a plain 64-bit move and the high half is the replicated sign bit.
(rule -1 (lower (has_type $I128
                          (sextend (extractlane vec @ (value_type $I64X2)
                                                (u8_from_uimm8 lane_idx)))))
      (let ((low Reg (mov_from_vec (put_in_reg vec)
                                   lane_idx
                                   (ScalarSize.Size64)))
            (high Reg (asr_imm $I64 low (imm_shift_from_u8 63))))
        (value_regs low high)))

;; A sign-extension of a sinkable `load` is encoded in the load itself: the
;; load is sunk into this instruction and replaced by a sign-extending load of
;; the input type.
(rule (lower (has_type (fits_in_64 _)
                       (sextend src @ (has_type in_ty (load flags addr off)))))
      (if-let load_inst (is_sinkable_inst src))
      (let ((_ Unit (sink_inst load_inst)))
        (aarch64_sload in_ty (amode in_ty addr off) flags)))

;; Picks the sign-extending load matching the width of the loaded type.
(decl aarch64_sload (Type AMode MemFlags) Reg)
(rule (aarch64_sload $I8 mem flags) (aarch64_sload8 mem flags))
(rule (aarch64_sload $I16 mem flags) (aarch64_sload16 mem flags))
(rule (aarch64_sload $I32 mem flags) (aarch64_sload32 mem flags))

;;;; Rules for `bnot` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Base case: an `orn` between the zero register and the operand, i.e.
;;
;;      NOT rd, rm ==> ORR_NOT rd, zero, rm
(rule -1 (lower (has_type (fits_in_64 ty) (bnot src)))
      (orr_not ty (zero_reg) src))

;; A `bnot` of a value shifted left by a constant folds the shift into the
;; instruction via `orr_not_shift`.
(rule 1 (lower (has_type (fits_in_64 ty)
                         (bnot (ishl shifted (iconst shift_imm)))))
      (if-let shift_amt (lshl_from_imm64 ty shift_imm))
      (orr_not_shift ty (zero_reg) shifted shift_amt))

;; `i128`: each 64-bit half is inverted independently.
(rule (lower (has_type $I128 (bnot src)))
      (let ((halves ValueRegs src)
            (lo Reg (value_regs_get halves 0))
            (hi Reg (value_regs_get halves 1))
            (inverted_lo Reg (orr_not $I64 (zero_reg) lo))
            (inverted_hi Reg (orr_not $I64 (zero_reg) hi)))
        (value_regs inverted_lo inverted_hi)))

;; 128-bit vectors.
(rule -2 (lower (has_type (ty_vec128 vec_ty) (bnot src)))
      (not src (vector_size vec_ty)))

;; A `bnot` of a `bxor` is fused into a single `EorNot` operation.
(rule 2 (lower (has_type (fits_in_64 ty) (bnot (bxor lhs rhs))))
      (alu_rs_imm_logic (ALUOp.EorNot) ty lhs rhs))
(rule 3 (lower (has_type $I128 (bnot (bxor lhs rhs))))
      (i128_alu_bitop (ALUOp.EorNot) $I64 lhs rhs))

;;;; Rules for `band` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalars that fit in one register.
(rule -1 (lower (has_type (fits_in_64 ty) (band lhs rhs)))
      (alu_rs_imm_logic_commutative (ALUOp.And) ty lhs rhs))

;; `i128`.
(rule (lower (has_type $I128 (band lhs rhs)))
      (i128_alu_bitop (ALUOp.And) $I64 lhs rhs))

;; 128-bit vectors.
(rule -2 (lower (has_type (ty_vec128 vec_ty) (band lhs rhs)))
      (and_vec lhs rhs (vector_size vec_ty)))

;; Specialized lowerings for `(band x (bnot y))`, a form that is also produced
;; when Cranelift's `band_not` instruction is legalized into the simpler forms
;; early on. The `bnot` may be on either side; the inverted operand is always
;; passed last.

(rule 1 (lower (has_type (fits_in_64 ty) (band plain (bnot inverted))))
      (alu_rs_imm_logic (ALUOp.AndNot) ty plain inverted))
(rule 2 (lower (has_type (fits_in_64 ty) (band (bnot inverted) plain)))
      (alu_rs_imm_logic (ALUOp.AndNot) ty plain inverted))

(rule 3 (lower (has_type $I128 (band plain (bnot inverted))))
      (i128_alu_bitop (ALUOp.AndNot) $I64 plain inverted))
(rule 4 (lower (has_type $I128 (band (bnot inverted) plain)))
      (i128_alu_bitop (ALUOp.AndNot) $I64 plain inverted))

(rule 5 (lower (has_type (ty_vec128 vec_ty) (band plain (bnot inverted))))
      (bic_vec plain inverted (vector_size vec_ty)))
(rule 6 (lower (has_type (ty_vec128 vec_ty) (band (bnot inverted) plain)))
      (bic_vec plain inverted (vector_size vec_ty)))

;;;; Rules for `bor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalars that fit in one register.
(rule -1 (lower (has_type (fits_in_64 ty) (bor lhs rhs)))
      (alu_rs_imm_logic_commutative (ALUOp.Orr) ty lhs rhs))

;; `i128`.
(rule (lower (has_type $I128 (bor lhs rhs)))
      (i128_alu_bitop (ALUOp.Orr) $I64 lhs rhs))

;; 128-bit vectors.
(rule -2 (lower (has_type (ty_vec128 vec_ty) (bor lhs rhs)))
      (orr_vec lhs rhs (vector_size vec_ty)))

;; Specialized lowerings for `(bor x (bnot y))`, a form that is also produced
;; when Cranelift's `bor_not` instruction is legalized into the simpler forms
;; early on. The `bnot` may be on either side; the inverted operand is always
;; passed last.

(rule 1 (lower (has_type (fits_in_64 ty) (bor plain (bnot inverted))))
      (alu_rs_imm_logic (ALUOp.OrrNot) ty plain inverted))
(rule 2 (lower (has_type (fits_in_64 ty) (bor (bnot inverted) plain)))
      (alu_rs_imm_logic (ALUOp.OrrNot) ty plain inverted))

(rule 3 (lower (has_type $I128 (bor plain (bnot inverted))))
      (i128_alu_bitop (ALUOp.OrrNot) $I64 plain inverted))
(rule 4 (lower (has_type $I128 (bor (bnot inverted) plain)))
      (i128_alu_bitop (ALUOp.OrrNot) $I64 plain inverted))

;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalars that fit in one register.
(rule -1 (lower (has_type (fits_in_64 ty) (bxor lhs rhs)))
      (alu_rs_imm_logic_commutative (ALUOp.Eor) ty lhs rhs))

;; `i128`.
(rule (lower (has_type $I128 (bxor lhs rhs)))
      (i128_alu_bitop (ALUOp.Eor) $I64 lhs rhs))

;; 128-bit vectors.
(rule -2 (lower (has_type (ty_vec128 vec_ty) (bxor lhs rhs)))
      (eor_vec lhs rhs (vector_size vec_ty)))

;; Specialized lowerings for `(bxor x (bnot y))`, a form that is also produced
;; when Cranelift's `bxor_not` instruction is legalized into the simpler forms
;; early on. The `bnot` may be on either side; the inverted operand is always
;; passed last.

(rule 1 (lower (has_type (fits_in_64 ty) (bxor plain (bnot inverted))))
      (alu_rs_imm_logic (ALUOp.EorNot) ty plain inverted))
(rule 2 (lower (has_type (fits_in_64 ty) (bxor (bnot inverted) plain)))
      (alu_rs_imm_logic (ALUOp.EorNot) ty plain inverted))

(rule 3 (lower (has_type $I128 (bxor plain (bnot inverted))))
      (i128_alu_bitop (ALUOp.EorNot) $I64 plain inverted))
(rule 4 (lower (has_type $I128 (bxor (bnot inverted) plain)))
      (i128_alu_bitop (ALUOp.EorNot) $I64 plain inverted))

;;;; Rules for `ishl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; i8/i16/i32: delegate to `do_shift` with the logical-shift-left opcode.
(rule -1 (lower (has_type (fits_in_32 ty) (ishl src amt)))
      (do_shift (ALUOp.Lsl) ty src amt))

;; i64.
(rule (lower (has_type $I64 (ishl src amt)))
      (do_shift (ALUOp.Lsl) $I64 src amt))

;; i128: only the first register of the shift amount is used.
(rule (lower (has_type $I128 (ishl src amt)))
      (lower_shl128 src (value_regs_get amt 0)))

;; Helper that shifts a 128-bit value, held in a pair of 64-bit registers
;; (register 0 = low half, register 1 = high half), left by the amount in
;; `amt`. The emitted sequence is:
;;
;;     lsl     lo_lshift, src_lo, amt
;;     lsl     hi_lshift, src_hi, amt
;;     mvn     inv_amt, amt
;;     lsr     lo_rshift, src_lo, #1
;;     lsr     lo_rshift, lo_rshift, inv_amt
;;     orr     maybe_hi, hi_lshift, lo_rshift
;;     tst     amt, #0x40
;;     csel    dst_hi, lo_lshift, maybe_hi, ne
;;     csel    dst_lo, xzr, lo_lshift, ne
;;
;; Note that in the rule below the select producing `dst_lo` is listed before
;; the one producing `dst_hi`, the reverse of the listing above.
(decl lower_shl128 (ValueRegs Reg) ValueRegs)
(rule (lower_shl128 src amt)
      (let ((src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            (lo_lshift Reg (lsl $I64 src_lo amt))
            (hi_lshift Reg (lsl $I64 src_hi amt))
            (inv_amt Reg (orr_not $I32 (zero_reg) amt))
            ;; Bits of the low half that cross into the high half: a shift
            ;; right by one followed by a shift right by the inverted amount.
            (lo_rshift Reg (lsr $I64 (lsr_imm $I64 src_lo (imm_shift_from_u8 1))
                                inv_amt))
          (maybe_hi Reg (orr $I64 hi_lshift lo_rshift))
        )
        ;; Test bit 6 (value 64) of the amount and pick the final halves.
        (with_flags
         (tst_imm $I64 amt (u64_into_imm_logic $I64 64))
         (consumes_flags_concat
          (csel (Cond.Ne) (zero_reg) lo_lshift)
          (csel (Cond.Ne) lo_lshift maybe_hi)))))

;; Vector shifts.
;;
;; Variable amount: the amount is masked with `shift_mask`, duplicated into
;; every lane and applied with a register shift.
(rule -3 (lower (has_type (ty_vec128 vec_ty) (ishl src amt)))
      (let ((lanes VectorSize (vector_size vec_ty))
            (wrapped_amt Reg (and_imm $I32 amt (shift_mask vec_ty)))
            (amt_vec Reg (vec_dup wrapped_amt lanes)))
        (sshl src amt_vec lanes)))

;; Constant amount: the masked immediate is encoded in the shift itself.
(rule -2 (lower (has_type (ty_vec128 vec_ty) (ishl src (iconst (u64_from_imm64 amt_imm)))))
      (ushl_vec_imm src (shift_masked_imm vec_ty amt_imm) (vector_size vec_ty)))

;; Externally-implemented pure helper taking a type and a constant shift
;; amount and returning the `u8` immediate used by the vector shift rules.
(decl pure shift_masked_imm (Type u64) u8)
(extern constructor shift_masked_imm shift_masked_imm)

;; Emits a shift operation with the given opcode and output type: the `Reg`
;; argument is shifted by the `Value` argument.
;;
;; The clif semantics of masking the shift amount are handled here wherever
;; that is necessary.
(decl do_shift (ALUOp Type Reg Value) Reg)

;; 8/16-bit shift base case.
;;
;; CLIF shifts "wrap" when the amount is larger than the size of the type, so
;; that an i8 << 8 is equivalent to i8 << 0.
;;
;; For i32 and i64 this matches what the aarch64 spec does, but for the
;; smaller types (i16, i8) it has to be done manually, so the shift amount is
;; wrapped with an AND instruction first.
(rule -1 (do_shift op (fits_in_16 ty) src amt)
      (let ((amt_reg Reg (value_regs_get amt 0))
            (wrapped_amt Reg (and_imm $I32 amt_reg (shift_mask ty))))
        (alu_rrr op $I32 src wrapped_amt)))

;; Externally-implemented helper returning the logical immediate used to mask
;; shift amounts for the given type.
(decl shift_mask (Type) ImmLogic)
(extern constructor shift_mask shift_mask)

;; 32/64-bit shift base cases.
(rule (do_shift op $I32 src amt)
      (alu_rrr op $I32 src (value_regs_get amt 0)))
(rule (do_shift op $I64 src amt)
      (alu_rrr op $I64 src (value_regs_get amt 0)))

;; Shifting by a constant that fits into an `ImmShift`.
;;
;; This rule explicitly has a higher priority than the others to ensure it's
;; attempted first, otherwise the type-based filters on the previous rules
;; seem to take priority over this rule.
(rule 1 (do_shift op ty src (iconst amt_imm))
      (if-let shift (imm_shift_from_imm64 ty amt_imm))
      (alu_rr_imm_shift op ty src shift))

;;;; Rules for `ushr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; i8/i16/i32: the input is zero-extended to 32 bits before the logical shift
;; right.
(rule -1 (lower (has_type (fits_in_32 ty) (ushr src amt)))
      (do_shift (ALUOp.Lsr) ty (put_in_reg_zext32 src) amt))

;; i64.
(rule (lower (has_type $I64 (ushr src amt)))
      (do_shift (ALUOp.Lsr) $I64 (put_in_reg_zext64 src) amt))

;; i128: only the first register of the shift amount is used.
(rule (lower (has_type $I128 (ushr src amt)))
      (lower_ushr128 src (value_regs_get amt 0)))

;; Vector shifts.
;;
;; Variable amount: the masked amount is negated, duplicated into every lane
;; and applied with `ushl`.
(rule -4 (lower (has_type (ty_vec128 vec_ty) (ushr src amt)))
      (let ((lanes VectorSize (vector_size vec_ty))
            (wrapped_amt Reg (and_imm $I32 amt (shift_mask vec_ty)))
            (negated_amt Reg (sub $I64 (zero_reg) wrapped_amt))
            (amt_vec Reg (vec_dup negated_amt lanes)))
        (ushl src amt_vec lanes)))

;; Constant amount: the masked immediate is encoded in the shift itself.
(rule -3 (lower (has_type (ty_vec128 vec_ty) (ushr src (iconst (u64_from_imm64 amt_imm)))))
      (ushr_vec_imm src (shift_masked_imm vec_ty amt_imm) (vector_size vec_ty)))

;; A 0-width constant shift can't be emitted, so it's special cased to pass
;; the input through as-is since a 0-shift doesn't modify the input anyway.
(rule -2 (lower (has_type (ty_vec128 vec_ty) (ushr src (iconst (u64_from_imm64 amt_imm)))))
      (if-let 0 (shift_masked_imm vec_ty amt_imm))
      src)

;; Helper that logically shifts a 128-bit value, held in a pair of 64-bit
;; registers (register 0 = low half, register 1 = high half), right by the
;; amount in `amt`. The emitted sequence is:
;;
;;     lsr       lo_rshift, src_lo, amt
;;     lsr       hi_rshift, src_hi, amt
;;     mvn       inv_amt, amt
;;     lsl       hi_lshift, src_hi, #1
;;     lsl       hi_lshift, hi_lshift, inv_amt
;;     tst       amt, #0x40
;;     orr       maybe_lo, lo_rshift, hi_lshift
;;     csel      dst_hi, xzr, hi_rshift, ne
;;     csel      dst_lo, hi_rshift, maybe_lo, ne
;;
;; Note that in the rule below `maybe_lo` is computed before the `tst`, and the
;; select producing `dst_lo` is listed before the one producing `dst_hi`.
(decl lower_ushr128 (ValueRegs Reg) ValueRegs)
(rule (lower_ushr128 src amt)
      (let ((src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            (lo_rshift Reg (lsr $I64 src_lo amt))
            (hi_rshift Reg (lsr $I64 src_hi amt))

            (inv_amt Reg (orr_not $I32 (zero_reg) amt))
            ;; Bits of the high half that cross into the low half: a shift
            ;; left by one followed by a shift left by the inverted amount.
            (hi_lshift Reg (lsl $I64 (lsl_imm $I64 src_hi (imm_shift_from_u8 1))
                                inv_amt))
          (maybe_lo Reg (orr $I64 lo_rshift hi_lshift))
        )
        ;; Test bit 6 (value 64) of the amount and pick the final halves.
        (with_flags
         (tst_imm $I64 amt (u64_into_imm_logic $I64 64))
         (consumes_flags_concat
          (csel (Cond.Ne) hi_rshift maybe_lo)
          (csel (Cond.Ne) (zero_reg) hi_rshift)))))

;;;; Rules for `sshr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; i8/i16/i32: the input is sign-extended to 32 bits before the arithmetic
;; shift right.
(rule -4 (lower (has_type (fits_in_32 ty) (sshr src amt)))
      (do_shift (ALUOp.Asr) ty (put_in_reg_sext32 src) amt))

;; i64.
(rule (lower (has_type $I64 (sshr src amt)))
      (do_shift (ALUOp.Asr) $I64 (put_in_reg_sext64 src) amt))

;; i128: only the first register of the shift amount is used.
(rule (lower (has_type $I128 (sshr src amt)))
      (lower_sshr128 src (value_regs_get amt 0)))

;; Vector shifts.
;;
;; Variable amount: right shifts are implemented with a negative left shift,
;; so the masked amount is negated, duplicated into every lane and applied
;; with `sshl`.
(rule -3 (lower (has_type (ty_vec128 vec_ty) (sshr src amt)))
      (let ((lanes VectorSize (vector_size vec_ty))
            (wrapped_amt Reg (and_imm $I32 amt (shift_mask vec_ty)))
            (negated_amt Reg (sub $I64 (zero_reg) wrapped_amt))
            (amt_vec Reg (vec_dup negated_amt lanes)))
        (sshl src amt_vec lanes)))

;; Constant amount: the masked immediate is encoded in the shift itself.
(rule -2 (lower (has_type (ty_vec128 vec_ty) (sshr src (iconst (u64_from_imm64 amt_imm)))))
      (sshr_vec_imm src (shift_masked_imm vec_ty amt_imm) (vector_size vec_ty)))

;; A 0-width constant shift can't be emitted, so it's special cased to pass
;; the input through as-is since a 0-shift doesn't modify the input anyway.
(rule -1 (lower (has_type (ty_vec128 vec_ty) (sshr src (iconst (u64_from_imm64 amt_imm)))))
      (if-let 0 (shift_masked_imm vec_ty amt_imm))
      src)

;; Helper that arithmetically shifts a 128-bit value, held in a pair of 64-bit
;; registers (register 0 = low half, register 1 = high half), right by the
;; amount in `amt`. The emitted sequence is:
;;
;;     lsr       lo_rshift, src_lo, amt
;;     asr       hi_rshift, src_hi, amt
;;     mvn       inv_amt, amt
;;     lsl       hi_lshift, src_hi, #1
;;     lsl       hi_lshift, hi_lshift, inv_amt
;;     asr       hi_sign, src_hi, #63
;;     orr       maybe_lo, lo_rshift, hi_lshift
;;     tst       amt, #0x40
;;     csel      dst_hi, hi_sign, hi_rshift, ne
;;     csel      dst_lo, hi_rshift, maybe_lo, ne
;;
;; Note that in the rule below the select producing `dst_lo` is listed before
;; the one producing `dst_hi`, the reverse of the listing above.
(decl lower_sshr128 (ValueRegs Reg) ValueRegs)
(rule (lower_sshr128 src amt)
      (let ((src_lo Reg (value_regs_get src 0))
            (src_hi Reg (value_regs_get src 1))
            (lo_rshift Reg (lsr $I64 src_lo amt))
            (hi_rshift Reg (asr $I64 src_hi amt))

            (inv_amt Reg (orr_not $I32 (zero_reg) amt))
            ;; Bits of the high half that cross into the low half: a shift
            ;; left by one followed by a shift left by the inverted amount.
            (hi_lshift Reg (lsl $I64 (lsl_imm $I64 src_hi (imm_shift_from_u8 1))
                                inv_amt))
          ;; The sign bit of the high half replicated across 64 bits.
          (hi_sign Reg (asr_imm $I64 src_hi (imm_shift_from_u8 63)))
          (maybe_lo Reg (orr $I64 lo_rshift hi_lshift))
        )
        ;; Test bit 6 (value 64) of the amount and pick the final halves.
        (with_flags
         (tst_imm $I64 amt (u64_into_imm_logic $I64 64))
         (consumes_flags_concat
          (csel (Cond.Ne) hi_rshift maybe_lo)
          (csel (Cond.Ne) hi_sign hi_rshift)))))

;;;; Rules for `rotl` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 8/16-bit `rotl` by a register amount: subtract the amount from zero and
;; reuse the narrow rotate-right helper on the operand fetched through
;; `put_in_reg_zext32`.
(rule -2 (lower (has_type (fits_in_16 ty) (rotl val amt)))
      (let ((amt_reg Reg (value_regs_get amt 0))
            (neg_amt Reg (sub $I32 (zero_reg) amt_reg)))
        (small_rotr ty (put_in_reg_zext32 val) neg_amt)))

;; 8/16-bit `rotl` by a constant amount: same idea, with the negation done on
;; the immediate by `negate_imm_shift`.
(rule -1 (lower (has_type (fits_in_16 ty) (rotl val (iconst k))))
      (if-let shift (imm_shift_from_imm64 ty k))
      (small_rotr_imm ty
                      (put_in_reg_zext32 val)
                      (negate_imm_shift ty shift)))

;; aarch64 doesn't have a left-rotate instruction, but a left rotation of K
;; places is effectively a right rotation of N - K places, if N is the integer's
;; bit size. We implement left rotations with this trick.
;;
;; Note that when negating the rotation amount here the upper bits are ignored
;; by the rotr instruction, meaning that we'll still left-rotate by the desired
;; amount.

;; 32-bit `rotl` by a register amount: rotate right by (0 - amount).
(rule (lower (has_type $I32 (rotl val amt)))
      (let ((amt_reg Reg (value_regs_get amt 0))
            (neg_amt Reg (sub $I32 (zero_reg) amt_reg)))
        (a64_rotr $I32 val neg_amt)))

;; 64-bit `rotl` by a register amount: rotate right by (0 - amount).
(rule (lower (has_type $I64 (rotl val amt)))
      (let ((amt_reg Reg (value_regs_get amt 0))
            (neg_amt Reg (sub $I64 (zero_reg) amt_reg)))
        (a64_rotr $I64 val neg_amt)))

;; 32-bit `rotl` by a constant: rotate right by the negated immediate.
(rule 1 (lower (has_type $I32 (rotl val (iconst k))))
      (if-let shift (imm_shift_from_imm64 $I32 k))
      (a64_rotr_imm $I32 val (negate_imm_shift $I32 shift)))

;; 64-bit `rotl` by a constant: rotate right by the negated immediate.
(rule 1 (lower (has_type $I64 (rotl val (iconst k))))
      (if-let shift (imm_shift_from_imm64 $I64 k))
      (a64_rotr_imm $I64 val (negate_imm_shift $I64 shift)))

;; Externally implemented constructor: takes a type and an `ImmShift` and
;; returns another `ImmShift`. The `rotl`-by-immediate rules use it to turn a
;; left-rotation amount into the amount handed to the rotate-right helpers.
;; NOTE(review): presumably negates the amount modulo the bit width of the
;; given type -- confirm against the extern implementation.
(decl negate_imm_shift (Type ImmShift) ImmShift)
(extern constructor negate_imm_shift negate_imm_shift)

;; 128-bit `rotl`: OR together a 128-bit left shift by `amt` and a 128-bit
;; logical right shift by `128 - amt`, half by half.
;;
;; TODO: much better codegen is possible with a constant amount.
(rule (lower (has_type $I128 (rotl x y)))
      (let ((src ValueRegs x)
            (amt_reg Reg (value_regs_get y 0))
            (inv_amt Reg (sub $I64 (imm $I64 (ImmExtend.Zero) 128) amt_reg))
            (shl_part ValueRegs (lower_shl128 src amt_reg))
            (shr_part ValueRegs (lower_ushr128 src inv_amt)))
        (value_regs
          (orr $I64 (value_regs_get shl_part 0) (value_regs_get shr_part 0))
          (orr $I64 (value_regs_get shl_part 1) (value_regs_get shr_part 1)))))

;;;; Rules for `rotr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 8/16-bit `rotr` by a register amount goes through the `small_rotr` helper.
(rule -3 (lower (has_type (fits_in_16 ty) (rotr val amt)))
      (small_rotr ty
                  (put_in_reg_zext32 val)
                  (value_regs_get amt 0)))

;; 32-bit `rotr` by a register amount.
(rule -1 (lower (has_type $I32 (rotr val amt)))
      (a64_rotr $I32 val (value_regs_get amt 0)))

;; 64-bit `rotr` by a register amount.
(rule -1 (lower (has_type $I64 (rotr val amt)))
      (a64_rotr $I64 val (value_regs_get amt 0)))

;; 8/16-bit `rotr` by a constant that `imm_shift_from_imm64` accepts.
(rule -2 (lower (has_type (fits_in_16 ty) (rotr val (iconst k))))
      (if-let shift (imm_shift_from_imm64 ty k))
      (small_rotr_imm ty (put_in_reg_zext32 val) shift))

;; 32-bit `rotr` by a constant that `imm_shift_from_imm64` accepts.
(rule (lower (has_type $I32 (rotr val (iconst k))))
      (if-let shift (imm_shift_from_imm64 $I32 k))
      (a64_rotr_imm $I32 val shift))

;; 64-bit `rotr` by a constant that `imm_shift_from_imm64` accepts.
(rule (lower (has_type $I64 (rotr val (iconst k))))
      (if-let shift (imm_shift_from_imm64 $I64 k))
      (a64_rotr_imm $I64 val shift))

;; A rotate-right on a type narrower than 32 bits is assembled from two shifts
;; and an OR:
;;
;;    rotr rd, val, amt
;;
;;       =>
;;
;;    and amt_mod, amt, <bitwidth - 1>
;;    sub amt_less_width, amt_mod, <bitwidth>
;;    sub left_amt, zero, amt_less_width  ; neg
;;    lsr low_bits, val, amt_mod
;;    lsl high_bits, val, left_amt
;;    orr rd, high_bits, low_bits
(decl small_rotr (Type Reg Reg) Reg)
(rule (small_rotr ty val amt)
      (let ((amt_mod Reg (and_imm $I32 amt (rotr_mask ty)))
            (amt_less_width Reg (sub_imm $I32
                                         amt_mod
                                         (u8_into_imm12 (ty_bits ty))))
            (left_amt Reg (sub $I32 (zero_reg) amt_less_width))
            (low_bits Reg (lsr $I32 val amt_mod))
            (high_bits Reg (lsl $I32 val left_amt)))
        (orr $I32 high_bits low_bits)))

;; Externally implemented constructor yielding the `ImmLogic` mask that
;; `small_rotr` ANDs with the rotation amount.
;; NOTE(review): the sequence documented on `small_rotr` shows this mask as
;; `<bitwidth - 1>`; confirm against the extern implementation.
(decl rotr_mask (Type) ImmLogic)
(extern constructor rotr_mask rotr_mask)

;; With a constant amount the narrow rotate-right needs no masking or negation
;; at run time:
;;
;;    rotr rd, val, #amt
;;
;;       =>
;;
;;    lsr low_bits, val, #<amt>
;;    lsl high_bits, val, <bitwidth - amt>
;;    orr rd, high_bits, low_bits
(decl small_rotr_imm (Type Reg ImmShift) Reg)
(rule (small_rotr_imm ty val amt)
      (let ((low_bits Reg (lsr_imm $I32 val amt))
            (high_bits Reg (lsl_imm $I32
                                    val
                                    (rotr_opposite_amount ty amt))))
        (orr $I32 high_bits low_bits)))

;; Externally implemented constructor giving the left-shift immediate that
;; `small_rotr_imm` pairs with a right shift by the given `ImmShift`.
;; NOTE(review): the sequence documented on `small_rotr_imm` shows this as
;; `<bitwidth - amt>`; confirm against the extern implementation.
(decl rotr_opposite_amount (Type ImmShift) ImmShift)
(extern constructor rotr_opposite_amount rotr_opposite_amount)

;; 128-bit `rotr`: OR together a 128-bit logical right shift by `amt` and a
;; 128-bit left shift by `128 - amt`, half by half.
;;
;; TODO: much better codegen is possible with a constant amount.
(rule (lower (has_type $I128 (rotr x y)))
      (let ((src ValueRegs x)
            (amt_reg Reg (value_regs_get y 0))
            (inv_amt Reg (sub $I64 (imm $I64 (ImmExtend.Zero) 128) amt_reg))
            (shr_part ValueRegs (lower_ushr128 src amt_reg))
            (shl_part ValueRegs (lower_shl128 src inv_amt))
            (out_hi Reg (orr $I64
                             (value_regs_get shr_part 1)
                             (value_regs_get shl_part 1)))
            (out_lo Reg (orr $I64
                             (value_regs_get shr_part 0)
                             (value_regs_get shl_part 0))))
        (value_regs out_lo out_hi)))

;;;; Rules for `bitrev` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; A 32-bit `rbit` leaves a reversed 8-bit value in the highest 8 bits of the
;; register, so it is shifted right by 24 to land back in the low bits.
(rule (lower (has_type $I8 (bitrev val)))
      (lsr_imm $I32
               (rbit $I32 val)
               (imm_shift_from_u8 24)))

;; Likewise a reversed 16-bit value ends up in the highest 16 bits and is
;; shifted right by 16.
(rule (lower (has_type $I16 (bitrev val)))
      (lsr_imm $I32
               (rbit $I32 val)
               (imm_shift_from_u8 16)))

;; 128-bit: reverse each 64-bit half and swap the halves.
(rule (lower (has_type $I128 (bitrev x)))
      (let ((halves ValueRegs x)
            (rev_of_lo Reg (rbit $I64 (value_regs_get halves 0)))
            (rev_of_hi Reg (rbit $I64 (value_regs_get halves 1))))
        (value_regs rev_of_hi rev_of_lo)))

;; Everything else maps straight onto `rbit` at the value's own type.
(rule -1 (lower (has_type ty (bitrev val)))
      (rbit ty val))


;;;; Rules for `clz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; i8: count on the operand fetched through `put_in_reg_zext32`, then subtract
;; 24 from the 32-bit count.
(rule (lower (has_type $I8 (clz val)))
      (sub_imm $I32
               (a64_clz $I32 (put_in_reg_zext32 val))
               (u8_into_imm12 24)))

;; i16: same, subtracting 16.
(rule (lower (has_type $I16 (clz val)))
      (sub_imm $I32
               (a64_clz $I32 (put_in_reg_zext32 val))
               (u8_into_imm12 16)))

;; i128 is handled by the `lower_clz128` helper.
(rule (lower (has_type $I128 (clz val)))
      (lower_clz128 val))

;; Remaining types use `a64_clz` directly.
(rule -1 (lower (has_type ty (clz val)))
      (a64_clz ty val))

;; 128-bit count-leading-zeros. The low half's count is added in only when the
;; high half's count is 64: shifting the high count right by 6 gives 1 for a
;; count of 64 and 0 for any smaller count, and that is used as the multiplier
;; in the `madd`.
;;
;; clz hi_zeros, hi
;; clz lo_zeros, lo
;; lsr hi_is_64, hi_zeros, #6
;; madd dst_lo, lo_zeros, hi_is_64, hi_zeros
;; mov  dst_hi, 0
(decl lower_clz128 (ValueRegs) ValueRegs)
(rule (lower_clz128 val)
      (let ((hi_zeros Reg (a64_clz $I64 (value_regs_get val 1)))
            (lo_zeros Reg (a64_clz $I64 (value_regs_get val 0)))
            (hi_is_64 Reg (lsr_imm $I64 hi_zeros (imm_shift_from_u8 6))))
        (value_regs
          (madd $I64 lo_zeros hi_is_64 hi_zeros)
          (imm $I64 (ImmExtend.Zero) 0))))

;;;; Rules for `ctz` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Every `ctz` lowering reverses the bits and then counts leading zeros: the
;; trailing zeros of a value are the leading zeros of its bit-reversal.

;; i8: after the 32-bit `rbit`, bit 23 (0x800000) is ORed in before counting.
(rule (lower (has_type $I8 (ctz val)))
      (a64_clz $I32
               (orr_imm $I32
                        (rbit $I32 val)
                        (u64_into_imm_logic $I32 0x800000))))

;; i16: same, with bit 15 (0x8000) ORed in.
(rule (lower (has_type $I16 (ctz val)))
      (a64_clz $I32
               (orr_imm $I32
                        (rbit $I32 val)
                        (u64_into_imm_logic $I32 0x8000))))

;; i128: reverse both halves, swap them, and reuse the 128-bit `clz` helper.
(rule (lower (has_type $I128 (ctz x)))
      (let ((halves ValueRegs x)
            (rev_of_lo Reg (rbit $I64 (value_regs_get halves 0)))
            (rev_of_hi Reg (rbit $I64 (value_regs_get halves 1))))
        (lower_clz128 (value_regs rev_of_hi rev_of_lo))))

;; Remaining types: `rbit` followed by `clz` at the value's own type.
(rule -1 (lower (has_type ty (ctz val)))
      (a64_clz ty (rbit ty val)))

;;;; Rules for `cls` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; i8: count on the operand fetched through `put_in_reg_sext32`, then subtract
;; 24 from the 32-bit count.
(rule (lower (has_type $I8 (cls val)))
      (sub_imm $I32
               (a64_cls $I32 (put_in_reg_sext32 val))
               (u8_into_imm12 24)))

;; i16: same, subtracting 16.
(rule (lower (has_type $I16 (cls val)))
      (sub_imm $I32
               (a64_cls $I32 (put_in_reg_sext32 val))
               (u8_into_imm12 16)))

;; 128-bit `cls`. The result's high half is always zero; its low half is
;; `hi_cls`, plus a contribution from the low half only when `hi_cls == 63`.
;;
;; cls lo_cls, lo
;; cls hi_cls, hi
;; eon sign_eq_eon, hi, lo
;; lsr sign_eq, sign_eq_eon, #63
;; madd lo_sign_bits, lo_cls, sign_eq, sign_eq
;; cmp hi_cls, #63
;; csel maybe_lo, lo_sign_bits, xzr, eq
;; add  out_lo, maybe_lo, hi_cls
;; mov  out_hi, 0
(rule (lower (has_type $I128 (cls x)))
      (let ((val ValueRegs x)
            (lo Reg (value_regs_get val 0))
            (hi Reg (value_regs_get val 1))
            (lo_cls Reg (a64_cls $I64 lo))
            (hi_cls Reg (a64_cls $I64 hi))
            ;; Bit 63 of `eon hi, lo` is 1 exactly when the top bits of the two
            ;; halves agree; shifting right by 63 isolates it as 0 or 1.
            (sign_eq_eon Reg (eon $I64 hi lo))
            (sign_eq Reg (lsr_imm $I64 sign_eq_eon (imm_shift_from_u8 63)))
            ;; lo_cls * sign_eq + sign_eq: `lo_cls + 1` when the top bits agree,
            ;; 0 otherwise.
            (lo_sign_bits Reg (madd $I64 lo_cls sign_eq sign_eq))
            ;; The low half only contributes when `hi_cls` equals 63; otherwise
            ;; zero is selected.
            (maybe_lo Reg (with_flags_reg
                           (cmp64_imm hi_cls (u8_into_imm12 63))
                           (csel (Cond.Eq) lo_sign_bits (zero_reg)))))
        (value_regs (add $I64 maybe_lo hi_cls) (imm $I64 (ImmExtend.Zero) 0))))

;; Remaining types use `a64_cls` directly at the value's own type.
(rule -1 (lower (has_type ty (cls val)))
      (a64_cls ty val))

;;;; Rules for `bswap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; i16, i32 and i64 each map onto the `rev` helper of the matching width.
(rule (lower (has_type $I16 (bswap val)))
      (a64_rev16 $I16 val))

(rule (lower (has_type $I32 (bswap val)))
      (a64_rev32 $I32 val))

(rule (lower (has_type $I64 (bswap val)))
      (a64_rev64 $I64 val))

;; i128: byte-reverse each 64-bit half and swap the halves, so the reversed
;; high half becomes the low result.
(rule (lower (has_type $I128 (bswap val)))
      (value_regs
       (a64_rev64 $I64 (value_regs_get val 1))
       (a64_rev64 $I64 (value_regs_get val 0))))

;;;; Rules for `bmask` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `bmask` is delegated to `lower_bmask`, which receives both the result type
;; and the operand type. Per the original note, the value is tested against
;; zero and the result is produced with `csetm`.
(rule (lower (has_type dst_ty (bmask val @ (value_type src_ty))))
      (lower_bmask dst_ty src_ty val))

;;;; Rules for `popcnt` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalar `popcnt` goes through the vector unit: the value is moved into a
;; vector register, `cnt` counts the set bits of every byte, the per-byte
;; counts are summed, and the total is moved back into a normal register.
;;
;; The general sequence emitted here is
;;
;;     fmov tmp, in_lo
;;     if ty == i128:
;;         mov tmp.d[1], in_hi
;;
;;     cnt tmp.16b, tmp.16b / cnt tmp.8b, tmp.8b
;;     addv tmp, tmp.16b / addv tmp, tmp.8b / addp tmp.8b, tmp.8b, tmp.8b / (no instruction for 8-bit inputs)
;;
;;     umov out_lo, tmp.b[0]
;;     if ty == i128:
;;         mov out_hi, 0

;; i8: a single byte count, so no summing step is needed.
(rule (lower (has_type $I8 (popcnt val)))
      (let ((in_vec Reg (mov_to_fpu val (ScalarSize.Size32)))
            (byte_counts Reg (vec_cnt in_vec (VectorSize.Size8x8))))
        (mov_from_vec byte_counts 0 (ScalarSize.Size8))))

;; i16: note that this uses `addp` instead of `addv` as it's usually cheaper.
(rule (lower (has_type $I16 (popcnt val)))
      (let ((in_vec Reg (mov_to_fpu val (ScalarSize.Size32)))
            (byte_counts Reg (vec_cnt in_vec (VectorSize.Size8x8)))
            (total Reg (addp byte_counts byte_counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (ScalarSize.Size8))))

;; i32: sum the byte counts with `addv`.
(rule (lower (has_type $I32 (popcnt val)))
      (let ((in_vec Reg (mov_to_fpu val (ScalarSize.Size32)))
            (byte_counts Reg (vec_cnt in_vec (VectorSize.Size8x8)))
            (total Reg (addv byte_counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (ScalarSize.Size8))))

;; i64: as i32, but the value is moved over as a 64-bit scalar.
(rule (lower (has_type $I64 (popcnt val)))
      (let ((in_vec Reg (mov_to_fpu val (ScalarSize.Size64)))
            (byte_counts Reg (vec_cnt in_vec (VectorSize.Size8x8)))
            (total Reg (addv byte_counts (VectorSize.Size8x8))))
        (mov_from_vec total 0 (ScalarSize.Size8))))

;; i128 `popcnt`: the low half is moved into a vector register, the high half
;; is inserted as 64-bit lane 1, all 16 bytes are counted and summed, and the
;; result's high half is a constant zero.
(rule (lower (has_type $I128 (popcnt x)))
      (let ((halves ValueRegs x)
            (lo_in_vec Reg (mov_to_fpu (value_regs_get halves 0) (ScalarSize.Size64)))
            (both_in_vec Reg (mov_to_vec lo_in_vec
                                         (value_regs_get halves 1)
                                         1
                                         (VectorSize.Size64x2)))
            (byte_counts Reg (vec_cnt both_in_vec (VectorSize.Size8x16)))
            (total Reg (addv byte_counts (VectorSize.Size8x16))))
        (value_regs
          (mov_from_vec total 0 (ScalarSize.Size8))
          (imm $I64 (ImmExtend.Zero) 0))))

;; i8x16 `popcnt` is a single `cnt` over all sixteen byte lanes.
(rule (lower (has_type $I8X16 (popcnt vec_val)))
      (vec_cnt vec_val (VectorSize.Size8x16)))

;;;; Rules for `bitselect` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Scalar `bitselect`: (if_set AND mask) OR (if_clear AND-NOT mask), using
;; `and_reg`, `bic` and `orr`.
(rule (lower (has_type ty (bitselect mask if_set if_clear)))
      (if (ty_int_ref_scalar_64 ty))
      (let ((picked Reg (and_reg ty if_set mask))
            (kept Reg (bic ty if_clear mask)))
        (orr ty picked kept)))

;; 128-bit vector `bitselect` maps onto `bsl`.
(rule 1 (lower (has_type (ty_vec128 ty) (bitselect mask if_set if_clear)))
        (bsl ty mask if_set if_clear))

;;;; Rules for `ireduce` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; T -> I{64,32,16,8}: narrowing needs no instruction. Values are always
;; stored with their high bits undefined, so register 0 of the source is
;; simply passed through as the result.
(rule (lower (has_type ty (ireduce wide)))
    (if (ty_int_ref_scalar_64 ty))
    (value_regs_get wide 0))

;;;; Rules for `fcmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector `fcmp` where one side is a zero value.

;; not-equal condition, zero on the right: invert the result of `fcmeq0`.
(rule 4 (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) lhs rhs)))
      (if (zero_value rhs))
      (let ((operand Reg lhs)
            (lanes VectorSize (vector_size ty)))
          (value_reg (not (fcmeq0 operand lanes) lanes))))

;; Any other condition accepted by `fcmp_zero_cond`, zero on the right.
(rule 3 (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) lhs rhs)))
      (if (zero_value rhs))
      (let ((operand Reg lhs)
            (lanes VectorSize (vector_size ty)))
          (value_reg (float_cmp_zero cond operand lanes))))

;; not-equal condition, zero on the left: invert the result of `fcmeq0`.
(rule 2 (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond_not_eq cond) lhs rhs)))
      (if (zero_value lhs))
      (let ((operand Reg rhs)
            (lanes VectorSize (vector_size ty)))
          (value_reg (not (fcmeq0 operand lanes) lanes))))

;; Any other condition accepted by `fcmp_zero_cond`, zero on the left: use the
;; swapped-operand helper.
(rule 1 (lower (has_type ty @ (multi_lane _ _) (fcmp (fcmp_zero_cond cond) lhs rhs)))
      (if (zero_value lhs))
      (let ((operand Reg rhs)
            (lanes VectorSize (vector_size ty)))
          (value_reg (float_cmp_zero_swap cond operand lanes))))

;; Scalar float compare: `fpu_cmp` produces the flags and the boolean is
;; materialized from the translated condition code.
(rule 0 (lower (has_type result_ty
              (fcmp cc lhs @ (value_type (ty_scalar_float float_ty)) rhs)))
      (with_flags (fpu_cmp (scalar_size float_ty) lhs rhs)
                  (materialize_bool_result (fp_cond_code cc))))

;; Vector float compare without a zero operand: generic `vec_cmp`.
(rule -1 (lower (has_type result_ty (fcmp cc lhs @ (value_type vec_ty) rhs)))
      (if (ty_vector_float vec_ty))
      (vec_cmp lhs rhs vec_ty (fp_cond_code cc)))

;;;; Rules for `icmp` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Vector `icmp` where one side is a zero value.

;; not-equal condition, zero on the right: invert the result of `cmeq0`.
(rule 3 (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond_not_eq cond) lhs rhs)))
      (if (zero_value rhs))
      (let ((operand Reg lhs)
            (lanes VectorSize (vector_size ty)))
          (value_reg (not (cmeq0 operand lanes) lanes))))

;; Any other condition accepted by `icmp_zero_cond`, zero on the right.
(rule 2 (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond cond) lhs rhs)))
      (if (zero_value rhs))
      (let ((operand Reg lhs)
            (lanes VectorSize (vector_size ty)))
          (value_reg (int_cmp_zero cond operand lanes))))

;; not-equal condition, zero on the left: invert the result of `cmeq0`.
(rule 1 (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond_not_eq cond) lhs rhs)))
      (if (zero_value lhs))
      (let ((operand Reg rhs)
            (lanes VectorSize (vector_size ty)))
          (value_reg (not (cmeq0 operand lanes) lanes))))

;; Any other condition accepted by `icmp_zero_cond`, zero on the left: use the
;; swapped-operand helper.
(rule 0 (lower (has_type ty @ (multi_lane _ _) (icmp (icmp_zero_cond cond) lhs rhs)))
      (if (zero_value lhs))
      (let ((operand Reg rhs)
            (lanes VectorSize (vector_size ty)))
          (value_reg (int_cmp_zero_swap cond operand lanes))))

;; All remaining `icmp`s go through `lower_icmp_into_reg`, which is given the
;; operand type and `$I8` as the result type.
(rule -1 (lower (icmp cc lhs @ (value_type operand_ty) rhs))
      (lower_icmp_into_reg cc lhs rhs operand_ty $I8))

;;;; Rules for `trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; An unconditional trap is a `udf` carrying the trap code, emitted as a side
;; effect.
(rule (lower (trap code))
      (side_effect (udf code)))

;;;; Rules for `select` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `select` whose condition is an `icmp` (possibly behind a `uextend`): the
;; compare is lowered straight into flags and `lower_select` consumes them with
;; the resulting condition code.
(rule (lower (has_type ty
                       (select (maybe_uextend (icmp cc
                                                    lhs @ (value_type cmp_ty)
                                                    rhs))
                               if_true
                               if_false)))
      (let ((flags_cc FlagsAndCC (lower_icmp_into_flags cc lhs rhs cmp_ty)))
       (lower_select (flags_and_cc_flags flags_cc)
                     (cond_code (flags_and_cc_cc flags_cc))
                     ty
                     if_true
                     if_false)))

;; `select` whose condition is an `fcmp` (possibly behind a `uextend`): the
;; flags come from `fpu_cmp`.
(rule (lower (has_type ty
                       (select (maybe_uextend (fcmp cc lhs @ (value_type cmp_ty) rhs))
                               if_true
                               if_false)))
      (let ((fp_cc Cond (fp_cond_code cc)))
       (lower_select
        (fpu_cmp (scalar_size cmp_ty) lhs rhs)
        fp_cc ty if_true if_false)))

;; `select` on an i8 condition value: test the low 8 bits (mask 255) and pick
;; on `ne`.
(rule -1 (lower (has_type ty (select c @ (value_type $I8) if_true if_false)))
      (let ((c_reg Reg c))
       (lower_select
         (tst_imm $I32 c_reg (u64_into_imm_logic $I32 255))
         (Cond.Ne) ty if_true if_false)))

;; Condition of at most 32 bits: compare the value fetched through
;; `put_in_reg_zext32` against the zero register.
(rule -2 (lower (has_type ty (select c @ (value_type (fits_in_32 _)) if_true if_false)))
      (let ((c_reg Reg (put_in_reg_zext32 c)))
       (lower_select
        (cmp (OperandSize.Size32) c_reg (zero_reg))
        (Cond.Ne) ty if_true if_false)))

;; Condition of at most 64 bits: same with `put_in_reg_zext64` and a 64-bit
;; compare.
(rule -3 (lower (has_type ty (select c @ (value_type (fits_in_64 _)) if_true if_false)))
      (let ((c_reg Reg (put_in_reg_zext64 c)))
       (lower_select
        (cmp (OperandSize.Size64) c_reg (zero_reg))
        (Cond.Ne) ty if_true if_false)))

;; i128 condition: OR the two halves together and compare the result against
;; the zero register.
(rule -4 (lower (has_type ty (select c @ (value_type $I128) if_true if_false)))
      (let ((c_regs ValueRegs (put_in_regs c))
            (half_lo Reg (value_regs_get c_regs 0))
            (half_hi Reg (value_regs_get c_regs 1))
            (any_bits Reg (orr $I64 half_lo half_hi)))
        (lower_select
         (cmp (OperandSize.Size64) any_bits (zero_reg))
         (Cond.Ne) ty if_true if_false)))

;;;; Rules for `select_spectre_guard` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `select_spectre_guard` fed by an `icmp` (possibly behind a `uextend`): lower
;; the compare into flags, do the select, then emit a `csdb` as a side effect
;; before handing back the selected value.
(rule (lower (has_type ty
                       (select_spectre_guard (maybe_uextend (icmp cc lhs @ (value_type cmp_ty) rhs))
                                             on_true
                                             on_false)))
      (let ((flags_cc FlagsAndCC (lower_icmp_into_flags cc lhs rhs cmp_ty))
            (selected ValueRegs (lower_select
                                 (flags_and_cc_flags flags_cc)
                                 (cond_code (flags_and_cc_cc flags_cc))
                                 ty
                                 on_true
                                 on_false))
            (_ InstOutput (side_effect (csdb))))
       selected))

;; `select_spectre_guard` on a plain condition value of at most 64 bits:
;; compare the value fetched through `put_in_reg_zext64` against the zero
;; register and select on `ne`.
(rule -1 (lower (has_type ty (select_spectre_guard c @ (value_type (fits_in_64 _)) on_true on_false)))
      (let ((c_reg Reg (put_in_reg_zext64 c)))
       (lower_select
        (cmp (OperandSize.Size64) c_reg (zero_reg))
        (Cond.Ne) ty on_true on_false)))

;; i128 condition: OR the two halves together and compare the result against
;; the zero register.
(rule -2 (lower (has_type ty (select_spectre_guard c @ (value_type $I128) on_true on_false)))
      (let ((c_regs ValueRegs (put_in_regs c))
            (half_lo Reg (value_regs_get c_regs 0))
            (half_hi Reg (value_regs_get c_regs 1))
            (any_bits Reg (orr $I64 half_lo half_hi)))
        (lower_select
         (cmp (OperandSize.Size64) any_bits (zero_reg))
         (Cond.Ne) ty on_true on_false)))

;;;; Rules for `vconst` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 128-bit vector constant: built with `constant_f128` from the 128-bit
;; payload.
(rule (lower (has_type (ty_vec128 _) (vconst (u128_from_constant bits))))
      (constant_f128 bits))

;; 64-bit vector constant: built with `constant_f64` from the 64-bit payload.
(rule 1 (lower (has_type ty (vconst (u64_from_constant bits))))
      (if (ty_vec64 ty))
      (constant_f64 bits))

;;;; Rules for `splat` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Integer/reference scalar: duplicate it into every lane with `vec_dup`.
(rule -1 (lower (has_type ty (splat scalar @ (value_type scalar_ty))))
      (if (ty_int_ref_scalar_64 scalar_ty))
      (vec_dup scalar (vector_size ty)))

;; Float scalar: duplicate lane 0 with `vec_dup_from_fpu`.
(rule -2 (lower (has_type ty (splat scalar @ (value_type (ty_scalar_float _)))))
      (vec_dup_from_fpu scalar (vector_size ty) 0))

;; Constant operands (f32, f64, integer, and a reduced integer constant) are
;; all handed to `splat_const` as raw bits.
(rule (lower (has_type ty (splat (f32const (u32_from_ieee32 bits)))))
      (splat_const bits (vector_size ty)))

(rule (lower (has_type ty (splat (f64const (u64_from_ieee64 bits)))))
      (splat_const bits (vector_size ty)))

(rule (lower (has_type ty (splat (iconst (u64_from_imm64 bits)))))
      (splat_const bits (vector_size ty)))

(rule (lower (has_type ty (splat (ireduce (iconst (u64_from_imm64 bits))))))
      (splat_const bits (vector_size ty)))

;; Splat of a sinkable load: sink the load into an address and use `ld1r`
;; with the load's own flags.
(rule (lower (has_type ty (splat loaded @ (load mflags _ _))))
      (if-let sunk (is_sinkable_inst loaded))
      (let ((base Reg (sink_load_into_addr (lane_type ty) sunk)))
            (ld1r base (vector_size ty) mflags)))

;;;; Rules for `AtomicLoad` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Atomic loads of any type accepted by `valid_atomic_transaction` become a
;; `load_acquire`.
(rule (lower (has_type (valid_atomic_transaction ty) (atomic_load mflags ptr)))
      (load_acquire ty mflags ptr))


;;;; Rules for `AtomicStore` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Atomic stores of any type accepted by `valid_atomic_transaction` become a
;; `store_release`, emitted as a side effect.
(rule (lower (atomic_store mflags
                data @ (value_type (valid_atomic_transaction ty))
                ptr))
      (side_effect (store_release ty mflags data ptr)))

;;;; Rules for `AtomicRMW` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; With LSE available (`use_lse`), most read-modify-write operations map onto a
;; single `lse_atomic_rmw`. These rules sit at priority 1, above the loop-based
;; fallbacks.

;; Operations with a direct LSE counterpart.
(rule 1 (lower (and (use_lse)
                  (has_type (valid_atomic_transaction ty)
                      (atomic_rmw mflags (AtomicRmwOp.Add) ptr operand))))
      (lse_atomic_rmw (AtomicRMWOp.Add) ptr operand ty mflags))
(rule 1 (lower (and (use_lse)
                  (has_type (valid_atomic_transaction ty)
                      (atomic_rmw mflags (AtomicRmwOp.Xor) ptr operand))))
      (lse_atomic_rmw (AtomicRMWOp.Eor) ptr operand ty mflags))
(rule 1 (lower (and (use_lse)
                  (has_type (valid_atomic_transaction ty)
                      (atomic_rmw mflags (AtomicRmwOp.Or) ptr operand))))
      (lse_atomic_rmw (AtomicRMWOp.Set) ptr operand ty mflags))
(rule 1 (lower (and (use_lse)
                  (has_type (valid_atomic_transaction ty)
                      (atomic_rmw mflags (AtomicRmwOp.Smax) ptr operand))))
      (lse_atomic_rmw (AtomicRMWOp.Smax) ptr operand ty mflags))
(rule 1 (lower (and (use_lse)
                  (has_type (valid_atomic_transaction ty)
                      (atomic_rmw mflags (AtomicRmwOp.Smin) ptr operand))))
      (lse_atomic_rmw (AtomicRMWOp.Smin) ptr operand ty mflags))
(rule 1 (lower (and (use_lse)
                  (has_type (valid_atomic_transaction ty)
                      (atomic_rmw mflags (AtomicRmwOp.Umax) ptr operand))))
      (lse_atomic_rmw (AtomicRMWOp.Umax) ptr operand ty mflags))
(rule 1 (lower (and (use_lse)
                  (has_type (valid_atomic_transaction ty)
                      (atomic_rmw mflags (AtomicRmwOp.Umin) ptr operand))))
      (lse_atomic_rmw (AtomicRMWOp.Umin) ptr operand ty mflags))

;; `Sub` is an LSE `Add` of the operand subtracted from zero.
(rule 1 (lower (and (use_lse)
                  (has_type (valid_atomic_transaction ty)
                      (atomic_rmw mflags (AtomicRmwOp.Sub) ptr operand))))
      (lse_atomic_rmw (AtomicRMWOp.Add) ptr (sub ty (zero_reg) operand) ty mflags))

;; `And` is an LSE `Clr` of the operand passed through `eon` with the zero
;; register.
(rule 1 (lower (and (use_lse)
                  (has_type (valid_atomic_transaction ty)
                      (atomic_rmw mflags (AtomicRmwOp.And) ptr operand))))
      (lse_atomic_rmw (AtomicRMWOp.Clr) ptr (eon ty operand (zero_reg)) ty mflags))


;; Fallback read-modify-write lowerings: each CLIF `AtomicRmwOp` is handed to
;; `atomic_rmw_loop` with the matching `AtomicRMWLoopOp` (`Or` is spelled `Orr`
;; and `Xor` is spelled `Eor` on the loop side).
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.Add) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Add) ptr operand ty mflags))
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.Sub) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Sub) ptr operand ty mflags))
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.And) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.And) ptr operand ty mflags))
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.Nand) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Nand) ptr operand ty mflags))
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.Or) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Orr) ptr operand ty mflags))
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.Xor) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Eor) ptr operand ty mflags))
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.Smin) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Smin) ptr operand ty mflags))
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.Smax) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Smax) ptr operand ty mflags))
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.Umin) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Umin) ptr operand ty mflags))
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.Umax) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Umax) ptr operand ty mflags))
(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_rmw mflags (AtomicRmwOp.Xchg) ptr operand)))
      (atomic_rmw_loop (AtomicRMWLoopOp.Xchg) ptr operand ty mflags))

;;;; Rules for `AtomicCAS` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Compare-and-swap: with LSE available (`use_lse`) the priority-1 rule uses
;; `lse_atomic_cas`; otherwise the default-priority rule falls back to
;; `atomic_cas_loop`.
(rule 1 (lower (and (use_lse)
                  (has_type (valid_atomic_transaction ty)
                  (atomic_cas mflags ptr expected replacement))))
      (lse_atomic_cas ptr expected replacement ty mflags))

(rule (lower (has_type (valid_atomic_transaction ty)
             (atomic_cas mflags ptr expected replacement)))
      (atomic_cas_loop ptr expected replacement ty mflags))

;;;; Rules for 'fvdemote' ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; `fvdemote` is an `fcvtn` with a 32-bit scalar size.
(rule (lower (fvdemote vec_val))
      (fcvtn vec_val (ScalarSize.Size32)))


;;;; Rules for `snarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 128-bit result with a zero second operand: a single `sqxtn` of the first
;; operand is enough.
(rule 1 (lower (has_type (ty_vec128_int ty) (snarrow lo_src hi_src)))
      (if (zero_value hi_src))
      (sqxtn lo_src (lane_size ty)))

;; 64-bit result: `mov_vec_elem` combines both operands into one register
;; (64-bit lane 0 of the second into lane 1 of the first), then one `sqxtn`
;; narrows it.
(rule 2 (lower (has_type (ty_vec64_int ty) (snarrow lo_src hi_src)))
      (let ((combined Reg (mov_vec_elem lo_src hi_src 1 0 (VectorSize.Size64x2))))
            (sqxtn combined (lane_size ty))))

;; General 128-bit result: `sqxtn` narrows the first operand, then `sqxtn2`
;; takes that partial result together with the second operand.
(rule 0 (lower (has_type (ty_vec128_int ty) (snarrow lo_src hi_src)))
      (let ((narrowed_lo Reg (sqxtn lo_src (lane_size ty)))
            (narrowed_all Reg (sqxtn2 narrowed_lo hi_src (lane_size ty))))
        narrowed_all))


;;;; Rules for `unarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; 128-bit result with a zero second operand: a single `sqxtun` of the first
;; operand is enough.
(rule 1 (lower (has_type (ty_vec128_int ty) (unarrow lo_src hi_src)))
      (if (zero_value hi_src))
      (sqxtun lo_src (lane_size ty)))

;; 64-bit result: `mov_vec_elem` combines both operands into one register
;; (64-bit lane 0 of the second into lane 1 of the first), then one `sqxtun`
;; narrows it.
(rule 2 (lower (has_type (ty_vec64_int ty) (unarrow lo_src hi_src)))
      (let ((combined Reg (mov_vec_elem lo_src hi_src 1 0 (VectorSize.Size64x2))))
            (sqxtun combined (lane_size ty))))

;; General 128-bit result: `sqxtun` narrows the first operand, then `sqxtun2`
;; takes that partial result together with the second operand.
(rule 0 (lower (has_type (ty_vec128_int ty) (unarrow lo_src hi_src)))
      (let ((narrowed_lo Reg (sqxtun lo_src (lane_size ty)))
            (narrowed_all Reg (sqxtun2 narrowed_lo hi_src (lane_size ty))))
        narrowed_all))


;;;; Rules for `uunarrow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 128-bit result with a zero second operand: a single `uqxtn` of the first
;; operand is enough.
(rule 1 (lower (has_type (ty_vec128_int ty) (uunarrow lo_src hi_src)))
      (if (zero_value hi_src))
      (uqxtn lo_src (lane_size ty)))

;; 64-bit result: `mov_vec_elem` combines both operands into one register
;; (64-bit lane 0 of the second into lane 1 of the first), then one `uqxtn`
;; narrows it.
(rule 2 (lower (has_type (ty_vec64_int ty) (uunarrow lo_src hi_src)))
      (let ((combined Reg (mov_vec_elem lo_src hi_src 1 0 (VectorSize.Size64x2))))
            (uqxtn combined (lane_size ty))))

;; General 128-bit result: `uqxtn` narrows the first operand, then `uqxtn2`
;; takes that partial result together with the second operand.
(rule 0 (lower (has_type (ty_vec128_int ty) (uunarrow lo_src hi_src)))
      (let ((narrowed_lo Reg (uqxtn lo_src (lane_size ty)))
            (narrowed_all Reg (uqxtn2 narrowed_lo hi_src (lane_size ty))))
        narrowed_all))

;;;; Rules for `swiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Signed widening of the low half: `Sxtl` with the high-half flag cleared.
(rule (lower (has_type ty (swiden_low vec_val)))
      (vec_extend (VecExtendOp.Sxtl)
                  vec_val
                  $false
                  (lane_size ty)))

;;;; Rules for `swiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 128-bit result: `Sxtl` with the high-half flag set.
(rule 1 (lower (has_type (ty_vec128 ty) (swiden_high vec_val)))
      (vec_extend (VecExtendOp.Sxtl)
                  vec_val
                  $true
                  (lane_size ty)))

;; 64-bit result: move 32-bit lane 1 of the input out with `fpu_move_from_vec`
;; first, then widen that with the high-half flag cleared.
(rule (lower (has_type ty (swiden_high vec_val)))
      (if (ty_vec64 ty))
      (let ((upper Reg (fpu_move_from_vec vec_val 1 (VectorSize.Size32x2))))
       (vec_extend (VecExtendOp.Sxtl) upper $false (lane_size ty))))

;;;; Rules for `uwiden_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unsigned widening of the low half: `Uxtl` with the high-half flag cleared.
(rule (lower (has_type ty (uwiden_low vec_val)))
      (vec_extend (VecExtendOp.Uxtl)
                  vec_val
                  $false
                  (lane_size ty)))

;;;; Rules for `uwiden_high` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; 128-bit result: `Uxtl` with the high-half flag set.
(rule 1 (lower (has_type (ty_vec128 ty) (uwiden_high vec_val)))
      (vec_extend (VecExtendOp.Uxtl)
                  vec_val
                  $true
                  (lane_size ty)))

;; 64-bit result: move 32-bit lane 1 of the input out with `fpu_move_from_vec`
;; first, then widen that with the high-half flag cleared.
(rule (lower (has_type ty (uwiden_high vec_val)))
      (if (ty_vec64 ty))
      (let ((upper Reg (fpu_move_from_vec vec_val 1 (VectorSize.Size32x2))))
       (vec_extend (VecExtendOp.Uxtl) upper $false (lane_size ty))))

;;;; Rules for `Fence` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; A CLIF `fence` is emitted as `aarch64_fence`, purely for its side effect.
(rule (lower (fence)) (side_effect (aarch64_fence)))

;;;; Rules for `IsNull` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `is_null`: compare the value against the immediate 0 and materialize the
;; `eq` result as a boolean.
(rule (lower (is_null val @ (value_type ty)))
      (with_flags
        (cmp_imm (operand_size ty) val (u8_into_imm12 0))
        (materialize_bool_result (Cond.Eq))))

;;;; Rules for `IsInvalid` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `is_invalid`: `cmn` against the immediate 1 (i.e. test `val + 1`) and
;; materialize the `eq` result as a boolean.
(rule (lower (is_invalid val @ (value_type ty)))
      (with_flags
        (cmn_imm (operand_size ty) val (u8_into_imm12 1))
        (materialize_bool_result (Cond.Eq))))

;;;; Rules for `Debugtrap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `debugtrap` is emitted as a `brk`, purely for its side effect.
(rule (lower (debugtrap)) (side_effect (brk)))

;;;; Rules for `func_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The address of a function is loaded from its boxed external name, with a
;; zero offset.
(rule (lower (func_addr (func_ref_data _ name _)))
      (load_ext_name (box_external_name name) 0))

;;;; Rules for `symbol_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; A symbol's value is loaded from its boxed external name, plus the offset
;; carried in the symbol data.
(rule (lower (symbol_value (symbol_value_data name _ off)))
      (load_ext_name (box_external_name name) off))

;;; Rules for `get_{frame,stack}_pointer` and `get_return_address` ;;;;;;;;;;;;;

;; Frame pointer, stack pointer and return address each map onto the
;; corresponding `aarch64_*` helper.
(rule (lower (get_frame_pointer)) (aarch64_fp))

(rule (lower (get_stack_pointer)) (aarch64_sp))

(rule (lower (get_return_address)) (aarch64_link))

;;;; Rules for calls ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Direct call: the signature, external name and distance come from the
;; function reference data.
(rule (lower (call (func_ref_data sig name dist) args))
      (gen_call sig name dist args))

;; Indirect call through a callee value.
(rule (lower (call_indirect sig callee args))
      (gen_call_indirect sig callee args))

;;;; Rules for `return` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; N.B.: this rule only forwards the return values to `lower_return`; the Ret
;; instruction itself is generated by the ABI.
(rule (lower (return rets))
      (lower_return rets))

;;;; Rules for `return_call` and `return_call_indirect` ;;;;;;;;;;;;;;;;;;;;;;;;

;; Direct tail call: the signature, external name and distance come from the
;; function reference data.
(rule (lower (return_call (func_ref_data sig name dist) call_args))
      (gen_return_call sig name dist call_args))

;; Indirect tail call through a callee value.
(rule (lower (return_call_indirect sig target call_args))
      (gen_return_call_indirect sig target call_args))

;;;; Rules for loads ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Plain `load`, dispatched on the result type. Each rule builds an address
;; mode from the base and offset and picks the load helper of matching width.

;; Integer and reference types.
(rule (lower (has_type $I8 (load mflags base off)))
      (aarch64_uload8 (amode $I8 base off) mflags))
(rule (lower (has_type $I16 (load mflags base off)))
      (aarch64_uload16 (amode $I16 base off) mflags))
(rule (lower (has_type $I32 (load mflags base off)))
      (aarch64_uload32 (amode $I32 base off) mflags))
(rule (lower (has_type $I64 (load mflags base off)))
      (aarch64_uload64 (amode $I64 base off) mflags))
(rule (lower (has_type $R64 (load mflags base off)))
      (aarch64_uload64 (amode $I64 base off) mflags))

;; Scalar floats.
(rule (lower (has_type $F32 (load mflags base off)))
      (aarch64_fpuload32 (amode $F32 base off) mflags))
(rule (lower (has_type $F64 (load mflags base off)))
      (aarch64_fpuload64 (amode $F64 base off) mflags))

;; i128 uses a 64-bit pair load with a pair address mode.
(rule (lower (has_type $I128 (load mflags base off)))
      (aarch64_loadp64 (pair_amode base off) mflags))

;; Fixed and dynamic vectors: 64-bit vectors use the 64-bit FPU load, 128-bit
;; vectors the 128-bit one.
(rule -1 (lower (has_type (ty_vec64 _) (load mflags base off)))
      (aarch64_fpuload64 (amode $F64 base off) mflags))
(rule -3 (lower (has_type (ty_vec128 _) (load mflags base off)))
      (aarch64_fpuload128 (amode $I8X16 base off) mflags))
(rule -2 (lower (has_type (ty_dyn_vec64 _) (load mflags base off)))
      (aarch64_fpuload64 (amode $F64 base off) mflags))
(rule -4 (lower (has_type (ty_dyn_vec128 _) (load mflags base off)))
      (aarch64_fpuload128 (amode $I8X16 base off) mflags))

;; Explicitly-extending narrow loads: each CLIF opcode maps onto the
;; `aarch64_{u,s}load{8,16,32}` helper of the same signedness and width.
(rule (lower (uload8 mflags addr off))
      (aarch64_uload8 (amode $I8 addr off) mflags))

(rule (lower (sload8 mflags addr off))
      (aarch64_sload8 (amode $I8 addr off) mflags))

(rule (lower (uload16 mflags addr off))
      (aarch64_uload16 (amode $I16 addr off) mflags))

(rule (lower (sload16 mflags addr off))
      (aarch64_sload16 (amode $I16 addr off) mflags))

(rule (lower (uload32 mflags addr off))
      (aarch64_uload32 (amode $I32 addr off) mflags))

(rule (lower (sload32 mflags addr off))
      (aarch64_sload32 (amode $I32 addr off) mflags))

;; Widening vector loads: load 64 bits with the FPU load, then widen with
;; `vec_extend` using `Sxtl` (signed variants) or `Uxtl` (unsigned variants).
;; The final `ScalarSize` argument is twice the source lane width.

;; 8x8 -> 16-bit lanes.
(rule (lower (sload8x8 mflags addr off))
      (vec_extend (VecExtendOp.Sxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) mflags)
                  $false
                  (ScalarSize.Size16)))

(rule (lower (uload8x8 mflags addr off))
      (vec_extend (VecExtendOp.Uxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) mflags)
                  $false
                  (ScalarSize.Size16)))

;; 16x4 -> 32-bit lanes.
(rule (lower (sload16x4 mflags addr off))
      (vec_extend (VecExtendOp.Sxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) mflags)
                  $false
                  (ScalarSize.Size32)))

(rule (lower (uload16x4 mflags addr off))
      (vec_extend (VecExtendOp.Uxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) mflags)
                  $false
                  (ScalarSize.Size32)))

;; 32x2 -> 64-bit lanes.
(rule (lower (sload32x2 mflags addr off))
      (vec_extend (VecExtendOp.Sxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) mflags)
                  $false
                  (ScalarSize.Size64)))

(rule (lower (uload32x2 mflags addr off))
      (vec_extend (VecExtendOp.Uxtl)
                  (aarch64_fpuload64 (amode $F64 addr off) mflags)
                  $false
                  (ScalarSize.Size64)))

;;;; Rules for stores ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Integer and reference stores, selected by the type of the stored value.
(rule (lower (store mflags src @ (value_type $I8) addr off))
      (side_effect (aarch64_store8 (amode $I8 addr off) mflags src)))

(rule (lower (store mflags src @ (value_type $I16) addr off))
      (side_effect (aarch64_store16 (amode $I16 addr off) mflags src)))

(rule (lower (store mflags src @ (value_type $I32) addr off))
      (side_effect (aarch64_store32 (amode $I32 addr off) mflags src)))

(rule (lower (store mflags src @ (value_type $I64) addr off))
      (side_effect (aarch64_store64 (amode $I64 addr off) mflags src)))

;; `r64` is stored exactly like `i64` (note the `$I64` addressing mode type).
(rule (lower (store mflags src @ (value_type $R64) addr off))
      (side_effect (aarch64_store64 (amode $I64 addr off) mflags src)))

;; Narrowing stores: the opcode, not the value's type, selects the store width.
(rule (lower (istore8 mflags src addr off))
      (side_effect (aarch64_store8 (amode $I8 addr off) mflags src)))

(rule (lower (istore16 mflags src addr off))
      (side_effect (aarch64_store16 (amode $I16 addr off) mflags src)))

(rule (lower (istore32 mflags src addr off))
      (side_effect (aarch64_store32 (amode $I32 addr off) mflags src)))

;; Scalar float stores go through the FPU store of the matching width.
(rule (lower (store mflags src @ (value_type $F32) addr off))
      (side_effect (aarch64_fpustore32 (amode $F32 addr off) mflags src)))

(rule (lower (store mflags src @ (value_type $F64) addr off))
      (side_effect (aarch64_fpustore64 (amode $F64 addr off) mflags src)))

;; `i128` is stored as a pair: register 0 of the value first, register 1
;; second.
(rule (lower (store mflags src @ (value_type $I128) addr off))
      (side_effect
       (aarch64_storep64 (pair_amode addr off)
                         mflags
                         (value_regs_get src 0)
                         (value_regs_get src 1))))

;; Fixed-size and dynamic vector stores: 64-bit vectors use the 64-bit FPU
;; store, 128-bit vectors the 128-bit FPU store. Each rule has its own
;; negative priority, below the scalar store rules.
(rule -1 (lower (store mflags src @ (value_type (ty_vec64 _)) addr off))
      (side_effect (aarch64_fpustore64 (amode $F64 addr off) mflags src)))

(rule -3 (lower (store mflags src @ (value_type (ty_vec128 _)) addr off))
      (side_effect (aarch64_fpustore128 (amode $I8X16 addr off) mflags src)))

(rule -2 (lower (store mflags src @ (value_type (ty_dyn_vec64 _)) addr off))
      (side_effect (aarch64_fpustore64 (amode $F64 addr off) mflags src)))

(rule -4 (lower (store mflags src @ (value_type (ty_dyn_vec128 _)) addr off))
      (side_effect (aarch64_fpustore128 (amode $I8X16 addr off) mflags src)))

;;; Rules for `{get,set}_pinned_reg` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Reading the pinned register is a move out of the pinned physical register.
(rule (lower (get_pinned_reg))
      (mov_from_preg (preg_pinned)))

;; Writing it is a pure side effect; no result value is produced.
(rule (lower (set_pinned_reg new_val))
      (side_effect (write_pinned_reg new_val)))

;;; Rules for `bitcast` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; SIMD&FP <=> SIMD&FP: the input value is returned as-is.
(rule 5 (lower (has_type (ty_float_or_vec _)
                         (bitcast _ src @ (value_type (ty_float_or_vec _)))))
      src)

;; GPR => SIMD&FP: move the integer/reference scalar into an FPU register,
;; sized by the input type.
(rule 4 (lower (has_type (ty_float_or_vec _)
                         (bitcast _ src @ (value_type src_ty))))
      (if (ty_int_ref_scalar_64 src_ty))
      (mov_to_fpu src (scalar_size src_ty)))

;; SIMD&FP => GPR: take lane 0 of the (at most 64-bit) input, sized by the
;; output type.
(rule 3 (lower (has_type dst_ty
                         (bitcast _ src @ (value_type (fits_in_64 (ty_float_or_vec _))))))
      (if (ty_int_ref_scalar_64 dst_ty))
      (mov_from_vec src 0 (scalar_size dst_ty)))

;; Bitcasts between `r{32,64}` and `i{32,64}` need to be a copy to avoid
;; conflicting regalloc constraints on reference type values that both need to
;; be in some register but also some safepoint stack slot at the same time.
;;
;; This rule handles the reference => integer direction; the copy is typed
;; with the destination type.
(rule 2 (lower (has_type to_ty (bitcast _ src @ (value_type from_ty))))
      (if (ty_int_ref_scalar_64 from_ty))
      (if (ty_int_ref_scalar_64 to_ty))
      (if-let $true (is_ref_type from_ty))
      (if-let $false (is_ref_type to_ty))
      (copy_reg to_ty src))
;; Integer => reference direction of the `r{32,64}` <=> `i{32,64}` bitcast.
;; As in the reference => integer direction, the value is copied rather than
;; passed through. The copy must be typed with the *destination* type so that
;; the result is created as `dst_ty` (the reference type) and not as the
;; integer source type.
(rule 2 (lower (has_type dst_ty (bitcast _ x @ (value_type src_ty))))
      (if (ty_int_ref_scalar_64 src_ty))
      (if (ty_int_ref_scalar_64 dst_ty))
      (if-let $false (is_ref_type src_ty))
      (if-let $true (is_ref_type dst_ty))
      (copy_reg dst_ty x))

;; GPR <=> GPR: when both sides are integer/reference scalars of at most 64
;; bits, the input value is returned as-is.
(rule 1 (lower (has_type dst_ty (bitcast _ src @ (value_type src_ty))))
      (if (ty_int_ref_scalar_64 dst_ty))
      (if (ty_int_ref_scalar_64 src_ty))
      src)

;; `i128` => `i128` is likewise a pass-through.
(rule 0 (lower (has_type $I128 (bitcast _ src @ (value_type $I128))))
      src)

;;; Rules for `extractlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Extracting lane 0 as a scalar float passes the vector value through
;; unchanged; upper bits are undefined when a narrower type is in a wider
;; register.
(rule 2 (lower (has_type (ty_scalar_float _)
                         (extractlane vec (u8_from_uimm8 0))))
      vec)

;; Integer lanes are moved out of the vector, sized by the result type.
(rule 0 (lower (has_type (ty_int lane_ty)
                         (extractlane vec (u8_from_uimm8 idx))))
      (mov_from_vec vec idx (scalar_size lane_ty)))

;; Any other float lane is moved within the FPU register file, sized by the
;; vector's type.
(rule 1 (lower (has_type (ty_scalar_float _)
                         (extractlane vec @ (value_type vec_ty)
                                      (u8_from_uimm8 idx))))
      (fpu_move_from_vec vec idx (vector_size vec_ty)))

;;; Rules for `insertlane` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Integer element: move the scalar into lane `idx` of the vector.
(rule 1 (lower (insertlane dst_vec @ (value_type vec_ty)
                           elem @ (value_type (ty_int _))
                           (u8_from_uimm8 idx)))
      (mov_to_vec dst_vec elem idx (vector_size vec_ty)))

;; Float element: copy element 0 of `elem` into lane `idx` of the vector.
(rule (lower (insertlane dst_vec @ (value_type vec_ty)
                         elem @ (value_type (ty_scalar_float _))
                         (u8_from_uimm8 idx)))
      (mov_vec_elem dst_vec elem idx 0 (vector_size vec_ty)))

;;; Rules for `stack_addr` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `stack_addr` forwards the slot and offset to `compute_stack_addr`.
(rule (lower (stack_addr slot slot_offset))
      (compute_stack_addr slot slot_offset))

;;; Rules for `vhigh_bits` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; All three sequences use one integer temporary and two vector
;; temporaries.  The shift is done early so as to give the register
;; allocator the possibility of using the same reg for `tmp_v1` and
;; `src_v` in the case that this is the last use of `src_v`.  See
;; https://github.com/WebAssembly/simd/pull/201 for the background and
;; derivation of these sequences. Alternative sequences are discussed
;; in https://github.com/bytecodealliance/wasmtime/issues/2296,
;; although they are not used here.

;; `vhigh_bits` on `i8x16`: collect the MSB of each of the 16 byte lanes into
;; a scalar that is read out of lane 0 as a 16-bit value.
(rule (lower (vhigh_bits vec @ (value_type $I8X16)))
      (let (
            ;; Replicate the MSB of each of the 16 byte lanes across
            ;; the whole lane (sshr is an arithmetic right shift).
            (shifted Reg (sshr_vec_imm vec 7 (VectorSize.Size8x16)))
            ;; Bitwise-and with a mask
            ;; `0x80402010_08040201_80402010_08040201` to get the bit
            ;; in the proper location for each group of 8 lanes.
            (anded Reg (and_vec shifted (constant_f128 0x80402010_08040201_80402010_08040201) (VectorSize.Size8x16)))
            ;; Produce a version of `anded` with upper 8 lanes and
            ;; lower 8 lanes swapped.
            (anded_swapped Reg (vec_extract anded anded 8))
            ;; Zip together the two; with the above this produces the lane permutation:
            ;; 15 7 14 6 13 5 12 4 11 3 10 2 9 1 8 0
            (zipped Reg (zip1 anded anded_swapped (VectorSize.Size8x16)))
            ;; Add 16-bit lanes together ("add across vector"), so we
            ;; get, in the low 16 bits, 15+14+...+8 in the high byte
            ;; and 7+6+...+0 in the low byte. This effectively puts
            ;; the 16 MSBs together, giving our results.
            ;;
            ;; N.B.: `Size16x8` is not a typo!
            (result Reg (addv zipped (VectorSize.Size16x8))))
        ;; Read the combined 16-bit mask out of lane 0.
        (mov_from_vec result 0 (ScalarSize.Size16))))

;; `vhigh_bits` on `i16x8`: collect the MSB of each of the 8 lanes into a
;; scalar that is read out of lane 0 as a 16-bit value.
(rule (lower (vhigh_bits vec @ (value_type $I16X8)))
      (let (
            ;; Replicate the MSB of each of the 8 16-bit lanes across
            ;; the whole lane (sshr is an arithmetic right shift).
            (shifted Reg (sshr_vec_imm vec 15 (VectorSize.Size16x8)))
            ;; Bitwise-and with a mask
            ;; `0x0080_0040_0020_0010_0008_0004_0002_0001` to get the
            ;; bit in the proper location for each of the 8 lanes.
            (anded Reg (and_vec shifted (constant_f128 0x0080_0040_0020_0010_0008_0004_0002_0001) (VectorSize.Size16x8)))
            ;; Add lanes together to get the 8 MSBs in the low byte.
            (result Reg (addv anded (VectorSize.Size16x8))))
        ;; Read the combined mask out of lane 0.
        (mov_from_vec result 0 (ScalarSize.Size16))))

;; `vhigh_bits` on `i32x4`: collect the MSB of each of the 4 lanes into a
;; scalar that is read out of lane 0 as a 32-bit value.
(rule (lower (vhigh_bits vec @ (value_type $I32X4)))
      (let (
            ;; Replicate the MSB of each of the 4 32-bit lanes across
            ;; the whole lane (sshr is an arithmetic right shift).
            (shifted Reg (sshr_vec_imm vec 31 (VectorSize.Size32x4)))
            ;; Bitwise-and with a mask
            ;; `0x00000008_00000004_00000002_00000001` to get the bit
            ;; in the proper location for each of the 4 lanes.
            (anded Reg (and_vec shifted (constant_f128 0x00000008_00000004_00000002_00000001) (VectorSize.Size32x4)))
            ;; Add lanes together to get the 4 MSBs in the low byte.
            (result Reg (addv anded (VectorSize.Size32x4))))
        ;; Read the combined mask out of lane 0.
        (mov_from_vec result 0 (ScalarSize.Size32))))

;; `vhigh_bits` on `i64x2`: done entirely with integer instructions.
(rule (lower (vhigh_bits vec @ (value_type $I64X2)))
      (let (
            ;; Move each 64-bit lane out of the vector, then logically
            ;; shift it right by 63 so that only the lane's MSB remains,
            ;; in bit 0. The second pair of bindings deliberately
            ;; shadows the first pair.
            (upper_msb Reg (mov_from_vec vec 1 (ScalarSize.Size64)))
            (lower_msb Reg (mov_from_vec vec 0 (ScalarSize.Size64)))
            (upper_msb Reg (lsr_imm $I64 upper_msb (imm_shift_from_u8 63)))
            (lower_msb Reg (lsr_imm $I64 lower_msb (imm_shift_from_u8 63))))
        ;; Add the lower lane's MSB to the upper lane's MSB shifted left
        ;; by one, so the upper lane's MSB ends up in bit 1.
        (add_shift $I64 lower_msb upper_msb (lshl_from_u64 $I64 1))))

;;; Rules for `uadd_overflow_trap` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Flag-setting add of `lhs` and `rhs`, followed by a trap with `trap_code`
;; if the add overflowed.
(rule (lower (has_type (fits_in_64 ty) (uadd_overflow_trap lhs rhs trap_code)))
      (trap_if_overflow (add_with_flags_paired ty lhs rhs) trap_code))

;;;; Helpers for `*_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Put a narrow value into a register, sign- or zero-extending it according
;; to the given `ArgumentExtension`.
(decl put_in_reg_ext32 (Value ArgumentExtension) Reg)

;; Signed extension.
(rule (put_in_reg_ext32 v (ArgumentExtension.Sext))
      (put_in_reg_sext32 v))

;; Unsigned extension.
(rule (put_in_reg_ext32 v (ArgumentExtension.Uext))
      (put_in_reg_zext32 v))

;; Overflow detection for narrow values: emit a normal op with both arguments
;; zero/sign extended, then check whether the output equals itself
;; zero/sign extended from the narrower width. The result is the pair
;; (value, overflow flag).
(decl overflow_op_small (Type Value Value ArgumentExtension ALUOp) InstOutput)
(rule (overflow_op_small ty lhs rhs ext_kind op)
      (let ((ext_op ExtendOp (lower_extend_op ty ext_kind))
            ;; Rather than emitting two separate `{u,s}xt{b,h}`
            ;; instructions, only `lhs` is extended by its own instruction;
            ;; `rhs` is extended as part of the ALU op:
            ;;
            ;;   {u,s}xt{b,h}  lhs_ext, lhs
            ;;   alu_op        res, lhs_ext, rhs, {u,s}xt{b,h}
            ;;   cmp           res, res, {u,s}xt{b,h}
            ;;   cset          overflowed, ne
            (lhs_ext Reg (put_in_reg_ext32 lhs ext_kind))
            (res Reg (alu_rrr_extend op ty lhs_ext rhs ext_op))
            (overflowed Reg (with_flags_reg
                              (cmp_extend (OperandSize.Size32) res res ext_op)
                              (cset (Cond.Ne)))))
        (output_pair (value_reg res)
                     (value_reg overflowed))))

;; Overflow detection for register-sized ops: a flag-setting op followed by a
;; `cset`, with no further masking.
;;
;;   op    res, lhs, rhs
;;   cset  overflowed, overflow_cond
;;
;; Conditions callers are expected to pass:
;;   Hs: carry set, i.e. unsigned overflow;
;;   Vs: signed over-/underflow;
;;   Lo: carry clear, i.e. no unsigned overflow. (Subtraction is implemented
;;       as an add of the two's complement value on aarch64, so there is a
;;       sub-overflow exactly when that add does not overflow.)
(decl overflow_op_normal (Type Value Value ALUOp Cond) InstOutput)
(rule (overflow_op_normal ty lhs rhs op overflow_cond)
      (let ((res_and_flag ValueRegs
              (with_flags (alu_rrr_with_flags_paired ty lhs rhs op)
                          (cset_paired overflow_cond))))
        (output_pair (value_regs_get res_and_flag 0)
                     (value_regs_get res_and_flag 1))))

;; For 128bit integers emit a three-instruction flag chain, for example
;; adds+adcs+cset: `alu_op1` is applied to the low halves and produces flags,
;; `alu_op2` is applied to the high halves and both consumes and produces
;; flags, and a final `cset` on `cond` yields the overflow bit.
(decl overflow_op_128 (Value Value ALUOp ALUOp Cond) InstOutput)
(rule (overflow_op_128 x y alu_op1 alu_op2 cond)
      (let
          ;; Get the high/low registers for `x`.
          ((x_regs ValueRegs x)
           (x_lo Reg (value_regs_get x_regs 0))
           (x_hi Reg (value_regs_get x_regs 1))

           ;; Get the high/low registers for `y`.
           (y_regs ValueRegs y)
           (y_lo Reg (value_regs_get y_regs 0))
           (y_hi Reg (value_regs_get y_regs 1)))
        ;; cannot use the with_flags helper here but it should be fine right now
        (let
            ;; Low halves first (flag producer), then high halves (flag
            ;; consumer and producer), then the `cset` (flag consumer).
            ((lo_inst ProducesFlags (alu_rrr_with_flags_paired $I64 x_lo y_lo alu_op1))
             (hi_inst ConsumesAndProducesFlags (alu_rrr_with_flags_chained $I64 x_hi y_hi alu_op2))
             (of_inst ConsumesFlags (cset_paired cond))

             ;; Emit the three instructions as one chained flags sequence.
             (result MultiReg (with_flags_chained lo_inst hi_inst of_inst)))
            ;; Repackage the chained results as a (register pair, single
            ;; register) output.
            (multi_reg_to_pair_and_single result)))
)

;;;; Rules for `uadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; For values smaller than a register, do a normal `add` with both arguments
;; zero extended, then check whether the output equals itself zero extended.
(rule 1 (lower (has_type (fits_in_16 ty) (uadd_overflow lhs rhs)))
      (overflow_op_small ty lhs rhs (ArgumentExtension.Uext) (ALUOp.Add)))

;; For register sized adds, emit adds+cset without further masking.
(rule 2 (lower (has_type (ty_32_or_64 ty) (uadd_overflow lhs rhs)))
      (overflow_op_normal ty lhs rhs (ALUOp.AddS) (Cond.Hs)))

;; For 128bit integers, emit adds+adcs+cset.
(rule 0 (lower (has_type $I128 (uadd_overflow lhs rhs)))
      (overflow_op_128 lhs rhs (ALUOp.AddS) (ALUOp.AdcS) (Cond.Hs)))

;;;; Rules for `sadd_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Narrow types:
;;   sxt{b,h} lhs_ext, lhs
;;   add      out, lhs_ext, rhs, sxt{b,h}
;;   cmp      out, out, sxt{b,h}
;;   cset     of, ne
(rule 1 (lower (has_type (fits_in_16 ty) (sadd_overflow lhs rhs)))
      (overflow_op_small ty lhs rhs (ArgumentExtension.Sext) (ALUOp.Add)))

;; Register-sized types:
;;   adds lhs, rhs
;;   cset of, vs
(rule 2 (lower (has_type (ty_32_or_64 ty) (sadd_overflow lhs rhs)))
      (overflow_op_normal ty lhs rhs (ALUOp.AddS) (Cond.Vs)))

;; 128-bit:
;;   adds lhs_lo, rhs_lo
;;   adcs lhs_hi, rhs_hi
;;   cset of, vs
(rule 0 (lower (has_type $I128 (sadd_overflow lhs rhs)))
      (overflow_op_128 lhs rhs (ALUOp.AddS) (ALUOp.AdcS) (Cond.Vs)))

;;;; Rules for `usub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Narrow types:
;;   uxt{b,h} lhs_ext, lhs
;;   sub      out, lhs_ext, rhs, uxt{b,h}
;;   cmp      out, out, uxt{b,h}
;;   cset     of, ne
(rule 1 (lower (has_type (fits_in_16 ty) (usub_overflow lhs rhs)))
      (overflow_op_small ty lhs rhs (ArgumentExtension.Uext) (ALUOp.Sub)))

;; Register-sized types:
;;   subs lhs, rhs
;;   cset of, lo
(rule 2 (lower (has_type (ty_32_or_64 ty) (usub_overflow lhs rhs)))
      (overflow_op_normal ty lhs rhs (ALUOp.SubS) (Cond.Lo)))

;; 128-bit:
;;   subs lhs_lo, rhs_lo
;;   sbcs lhs_hi, rhs_hi
;;   cset of, lo
(rule 0 (lower (has_type $I128 (usub_overflow lhs rhs)))
      (overflow_op_128 lhs rhs (ALUOp.SubS) (ALUOp.SbcS) (Cond.Lo)))

;;;; Rules for `ssub_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Narrow types:
;;   sxt{b,h} lhs_ext, lhs
;;   sub      out, lhs_ext, rhs, sxt{b,h}
;;   cmp      out, out, sxt{b,h}
;;   cset     of, ne
(rule 1 (lower (has_type (fits_in_16 ty) (ssub_overflow lhs rhs)))
      (overflow_op_small ty lhs rhs (ArgumentExtension.Sext) (ALUOp.Sub)))

;; Register-sized types:
;;   subs lhs, rhs
;;   cset of, vs
(rule 2 (lower (has_type (ty_32_or_64 ty) (ssub_overflow lhs rhs)))
      (overflow_op_normal ty lhs rhs (ALUOp.SubS) (Cond.Vs)))

;; 128-bit:
;;   subs lhs_lo, rhs_lo
;;   sbcs lhs_hi, rhs_hi
;;   cset of, vs
(rule 0 (lower (has_type $I128 (ssub_overflow lhs rhs)))
      (overflow_op_128 lhs rhs (ALUOp.SubS) (ALUOp.SbcS) (Cond.Vs)))

;;;; Rules for `umul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Narrow types: multiply the zero-extended operands, then compare the
;; product against itself zero-extended from the narrow width.
;;
;;   uxt{b,h} lhs_z, lhs
;;   uxt{b,h} rhs_z, rhs
;;   mul      product, lhs_z, rhs_z
;;   cmp      product, product, uxt{b,h}
;;   cset     overflowed, ne
(rule 1 (lower (has_type (fits_in_16 ty) (umul_overflow lhs rhs)))
      (let ((ext_op ExtendOp (lower_extend_op ty (ArgumentExtension.Uext)))
            (lhs_z Reg (put_in_reg_zext32 lhs))
            (rhs_z Reg (put_in_reg_zext32 rhs))
            ;; `madd` with a zero addend is a plain multiply.
            (product Reg (madd ty lhs_z rhs_z (zero_reg)))
            (overflowed Reg (with_flags_reg
                              (cmp_extend (OperandSize.Size32) product product ext_op)
                              (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))

;; 32-bit: form the product with `umaddl` and a zero addend, then compare it
;; against its own low 32 bits zero-extended.
;;
;;   umull product, lhs, rhs
;;   cmp   product, product, uxtw
;;   cset  overflowed, ne
(rule 2 (lower (has_type $I32 (umul_overflow lhs rhs)))
      (let ((product Reg (umaddl lhs rhs (zero_reg)))
            (overflowed Reg (with_flags_reg
                              (cmp_extend (OperandSize.Size64) product product (ExtendOp.UXTW))
                              (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))

;; 64-bit: the low half of the product is the result; overflow occurred iff
;; the high half (from `umulh`) is non-zero.
;;
;;   mul   product, lhs, rhs
;;   umulh high, lhs, rhs
;;   cmp   high, #0
;;   cset  overflowed, ne
(rule 2 (lower (has_type $I64 (umul_overflow lhs rhs)))
      (let ((product Reg (madd $I64 lhs rhs (zero_reg)))
            (high Reg (umulh $I64 lhs rhs))
            (overflowed Reg (with_flags_reg
                              (cmp64_imm high (u8_into_imm12 0))
                              (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))

;;;; Rules for `smul_overflow` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Narrow types: multiply the sign-extended operands, then compare the
;; product against itself sign-extended from the narrow width.
;;
;;   sxt{b,h} lhs_s, lhs
;;   sxt{b,h} rhs_s, rhs
;;   mul      product, lhs_s, rhs_s
;;   cmp      product, product, sxt{b,h}
;;   cset     overflowed, ne
(rule 1 (lower (has_type (fits_in_16 ty) (smul_overflow lhs rhs)))
      (let ((ext_op ExtendOp (lower_extend_op ty (ArgumentExtension.Sext)))
            (lhs_s Reg (put_in_reg_sext32 lhs))
            (rhs_s Reg (put_in_reg_sext32 rhs))
            ;; `madd` with a zero addend is a plain multiply.
            (product Reg (madd ty lhs_s rhs_s (zero_reg)))
            (overflowed Reg (with_flags_reg
                              (cmp_extend (OperandSize.Size32) product product ext_op)
                              (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))

;; 32-bit: form the product with `smaddl` and a zero addend, then compare it
;; against its own low 32 bits sign-extended.
;;
;;   smull product, lhs, rhs
;;   cmp   product, product, sxtw
;;   cset  overflowed, ne
(rule 2 (lower (has_type $I32 (smul_overflow lhs rhs)))
      (let ((product Reg (smaddl lhs rhs (zero_reg)))
            (overflowed Reg (with_flags_reg
                              (cmp_extend (OperandSize.Size64) product product (ExtendOp.SXTW))
                              (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))

;; 64-bit: the low half of the product is the result; overflow occurred iff
;; the high half (from `smulh`) differs from the low half arithmetically
;; shifted right by 63.
;;
;;   mul   product, lhs, rhs
;;   smulh high, lhs, rhs
;;   cmp   high, product, ASR #63
;;   cset  overflowed, ne
(rule 2 (lower (has_type $I64 (smul_overflow lhs rhs)))
      (let ((product Reg (madd $I64 lhs rhs (zero_reg)))
            (high Reg (smulh $I64 lhs rhs))
            (overflowed Reg (with_flags_reg
                              (cmp_rr_shift_asr (OperandSize.Size64) high product 63)
                              (cset (Cond.Ne)))))
        (output_pair (value_reg product)
                     (value_reg overflowed))))

;;; Rules for `tls_value` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The TLS model selects which helper computes the symbol's address; only
;; the symbol's name is used.

;; ELF general-dynamic model.
(rule (lower (has_type (tls_model (TlsModel.ElfGd))
                       (tls_value (symbol_value_data sym _ _))))
      (elf_tls_get_addr sym))

;; Mach-O model.
(rule (lower (has_type (tls_model (TlsModel.Macho))
                       (tls_value (symbol_value_data sym _ _))))
      (macho_tls_get_addr sym))

;;; Rules for `fvpromote_low` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Lowered to the `Fcvtl32` long vector op, with its final flag set to
;; `$false`.
(rule (lower (fvpromote_low src))
      (vec_rr_long (VecRRLongOp.Fcvtl32) src $false))

;;; Rules for `brif` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; `brif` following `icmp` (possibly behind a `uextend`): lower the compare
;; straight into flags and branch on the resulting condition code, without
;; materializing the boolean.
(rule (lower_branch (brif (maybe_uextend (icmp cc lhs @ (value_type ty) rhs)) _ _)
                    (two_targets taken not_taken))
      (let ((flags_cc FlagsAndCC (lower_icmp_into_flags cc lhs rhs ty))
            (br_cond Cond (cond_code (flags_and_cc_cc flags_cc))))
        (emit_side_effect
         (with_flags_side_effect
          (flags_and_cc_flags flags_cc)
          (cond_br taken not_taken (cond_br_cond br_cond))))))

;; `brif` following `fcmp` on scalar floats (possibly behind a `uextend`):
;; emit the FPU compare and branch on the matching floating-point condition.
(rule (lower_branch (brif (maybe_uextend (fcmp cc lhs @ (value_type (ty_scalar_float ty)) rhs)) _ _)
                    (two_targets taken not_taken))
      (let ((br_cond Cond (fp_cond_code cc)))
        (emit_side_effect
         (with_flags_side_effect
          (fpu_cmp (scalar_size ty) lhs rhs)
          (cond_br taken not_taken (cond_br_cond br_cond))))))

;; standard `brif`: branch to `taken` if the condition value is non-zero.
;; These rules have negative priority, so the more specific `brif` rules in
;; this file are preferred when they match.

;; `i128` condition: OR the two 64-bit halves together and test the result
;; for non-zero.
(rule -1 (lower_branch (brif c @ (value_type $I128) _ _) (two_targets taken not_taken))
      ;; `flags` is computed from the original `Value` `c`; the next binding
      ;; then shadows `c` with its registers.
      (let ((flags ProducesFlags (flags_to_producesflags c))
            (c ValueRegs (put_in_regs c))
            (c_lo Reg (value_regs_get c 0))
            (c_hi Reg (value_regs_get c 1))
            (rt Reg (orr $I64 c_lo c_hi)))
       (emit_side_effect
        (with_flags_side_effect flags
         (cond_br taken not_taken (cond_br_not_zero rt))))))
;; Integer/reference scalar condition of at most 64 bits: zero-extend to 64
;; bits and test for non-zero.
(rule -2 (lower_branch (brif c @ (value_type ty) _ _) (two_targets taken not_taken))
      (if (ty_int_ref_scalar_64 ty))
      (let ((flags ProducesFlags (flags_to_producesflags c))
            (rt Reg (put_in_reg_zext64 c)))
       (emit_side_effect
        (with_flags_side_effect flags
         (cond_br taken not_taken (cond_br_not_zero rt))))))

;; Special lowering for `tbnz` - "Test bit and Branch if Nonzero": a `brif`
;; directly on `band val, const`, taken only when
;; `test_and_compare_bit_const` yields a bit index for the constant.
(rule 1 (lower_branch (brif (band val @ (value_type ty) (u64_from_iconst mask)) _ _)
                      (two_targets taken not_taken))
      (if-let bit_idx (test_and_compare_bit_const ty mask))
      (emit_side_effect (tbnz taken not_taken val bit_idx)))

;; Special lowering for `tbz` - "Test bit and Branch if Zero": a `brif` on
;; `icmp eq (band val, const), 0`, taken only when
;; `test_and_compare_bit_const` yields a bit index for the constant.
(rule 1 (lower_branch
         (brif (icmp (IntCC.Equal)
                     (band val @ (value_type (fits_in_64 ty)) (u64_from_iconst mask))
                     (u64_from_iconst 0))
               _ _)
         (two_targets taken not_taken))
      (if-let bit_idx (test_and_compare_bit_const ty mask))
      (emit_side_effect (tbz taken not_taken val bit_idx)))

;; Pure, partial external helper used by the `tbnz`/`tbz` branch lowerings:
;; given a type and a 64-bit constant it may yield a `u8` that those rules
;; pass on as the bit to test. Being partial, it can fail, in which case the
;; rule using it does not apply.
;; NOTE(review): presumably it succeeds only when the constant has exactly
;; one bit set within the type's width -- confirm against the external
;; implementation.
(decl pure partial test_and_compare_bit_const (Type u64) u8)
(extern constructor test_and_compare_bit_const test_and_compare_bit_const)

;;; Rules for `jump` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; An unconditional jump to its single target.
(rule (lower_branch (jump _) (single_target dest))
      (emit_side_effect (aarch64_jump dest)))

;;; Rules for `br_table` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The branch targets arrive as the default target followed by the list of
;; jump table targets. An island is emitted first, sized by
;; `targets_jt_space`; the index is then zero-extended and passed to
;; `br_table_impl` together with the table size.
(rule (lower_branch (br_table index _) (jump_table_targets default_target table))
      (let ((table_len u32 (jump_table_size table))
            (_ InstOutput (side_effect (emit_island (targets_jt_space table))))
            (index_reg Reg (put_in_reg_zext32 index)))
        (br_table_impl (u32_as_u64 table_len) index_reg default_target table)))