diff --git a/cranelift/codegen/src/isa/aarch64/inst.isle b/cranelift/codegen/src/isa/aarch64/inst.isle
index b5826a226ea9..f45caf58b594 100644
--- a/cranelift/codegen/src/isa/aarch64/inst.isle
+++ b/cranelift/codegen/src/isa/aarch64/inst.isle
@@ -576,6 +576,14 @@
         (rm Reg)
         (size VectorSize))
 
+       ;; A vector ALU op modifying a source register.
+       (VecRRRMod
+        (alu_op VecALUModOp)
+        (rd WritableReg)
+        (rn Reg)
+        (rm Reg)
+        (size VectorSize))
+
        ;; Vector two register miscellaneous instruction.
        (VecMisc
         (op VecMisc2)
@@ -1108,10 +1116,6 @@
     (Orr)
     ;; Bitwise exclusive or
     (Eor)
-    ;; Bitwise select
-    ;; This opcode should only be used with the `vec_rrr_inplace`
-    ;; constructor.
-    (Bsl)
     ;; Unsigned maximum pairwise
     (Umaxp)
     ;; Add
@@ -1146,10 +1150,6 @@
     (Fmin)
     ;; Floating-point multiply
     (Fmul)
-    ;; Floating-point fused multiply-add vectors
-    ;; This opcode should only be used with the `vec_rrr_inplace`
-    ;; constructor.
-    (Fmla)
     ;; Add pairwise
     (Addp)
     ;; Zip vectors (primary) [meaning, high halves]
@@ -1158,6 +1158,15 @@
     (Sqrdmulh)
 ))
 
+;; A Vector ALU operation which modifies a source register.
+(type VecALUModOp
+  (enum
+    ;; Bitwise select
+    (Bsl)
+    ;; Floating-point fused multiply-add vectors
+    (Fmla)
+))
+
 ;; A Vector miscellaneous operation with two registers.
 (type VecMisc2
   (enum
@@ -1508,11 +1517,11 @@
 
-;; Helper for emitting `MInst.VecRRR` instructions which use three registers,
+;; Helper for emitting `MInst.VecRRRMod` instructions which use three registers,
 ;; one of which is both source and output.
-(decl vec_rrr_inplace (VecALUOp Reg Reg Reg VectorSize) Reg)
-(rule (vec_rrr_inplace op src1 src2 src3 size)
+(decl vec_rrr_mod (VecALUModOp Reg Reg Reg VectorSize) Reg)
+(rule (vec_rrr_mod op src1 src2 src3 size)
       (let ((dst WritableReg (temp_writable_reg $I8X16))
             (_1 Unit (emit (MInst.FpuMove128 dst src1)))
-            (_2 Unit (emit (MInst.VecRRR op dst src2 src3 size))))
+            (_2 Unit (emit (MInst.VecRRRMod op dst src2 src3 size))))
         dst))
 
 ;; Helper for emitting `MInst.FpuRRR` instructions.
@@ -2198,10 +2207,7 @@
 
 (decl bsl (Type Reg Reg Reg) Reg)
 (rule (bsl ty c x y)
-      (let ((dst WritableReg (temp_writable_reg ty))
-            (_ Unit (emit (MInst.FpuMove128 dst c)))
-            (_ Unit (emit (MInst.VecRRR (VecALUOp.Bsl) dst x y (vector_size ty)))))
-        dst))
+      (vec_rrr_mod (VecALUModOp.Bsl) c x y (vector_size ty)))
 
 ;; Helper for generating a `udf` instruction.
 
diff --git a/cranelift/codegen/src/isa/aarch64/inst/args.rs b/cranelift/codegen/src/isa/aarch64/inst/args.rs
index 7ce8a048d183..0d307ba7c923 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/args.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/args.rs
@@ -752,6 +752,16 @@ impl VectorSize {
 
         (q, size)
     }
+
+    /// Return the size bit used by some floating-point SIMD instruction
+    /// encodings: 0 for 32-bit lanes, 1 for 64-bit lanes. Panics otherwise.
+    pub fn enc_float_size(&self) -> u32 {
+        match self.lane_size() {
+            ScalarSize::Size32 => 0b0,
+            ScalarSize::Size64 => 0b1,
+            size => panic!("Unsupported floating-point size for vector op: {:?}", size),
+        }
+    }
 }
 
 pub(crate) fn dynamic_to_fixed(ty: Type) -> Type {
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit.rs b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
index 4ebf4de99449..d8a0f805aca6 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit.rs
@@ -2543,17 +2543,9 @@ impl MachInstEmit for Inst {
                     | VecALUOp::Fdiv
                     | VecALUOp::Fmax
                     | VecALUOp::Fmin
-                    | VecALUOp::Fmul
-                    | VecALUOp::Fmla => true,
+                    | VecALUOp::Fmul => true,
                     _ => false,
                 };
-                let enc_float_size = match (is_float, size) {
-                    (true, VectorSize::Size32x2) => 0b0,
-                    (true, VectorSize::Size32x4) => 0b0,
-                    (true, VectorSize::Size64x2) => 0b1,
-                    (true, _) => unimplemented!(),
-                    _ => 0,
-                };
 
                 let (top11, bit15_10) = match alu_op {
                     VecALUOp::Sqadd => (0b000_01110_00_1 | enc_size << 1, 0b000011),
@@ -2574,7 +2566,6 @@ impl MachInstEmit for Inst {
                     VecALUOp::Bic => (0b000_01110_01_1, 0b000111),
                     VecALUOp::Orr => (0b000_01110_10_1, 0b000111),
                     VecALUOp::Eor => (0b001_01110_00_1, 0b000111),
-                    VecALUOp::Bsl => (0b001_01110_01_1, 0b000111),
                     VecALUOp::Umaxp => {
                         debug_assert_ne!(size, VectorSize::Size64x2);
 
@@ -2619,7 +2610,6 @@ impl MachInstEmit for Inst {
                     VecALUOp::Fmax => (0b000_01110_00_1, 0b111101),
                     VecALUOp::Fmin => (0b000_01110_10_1, 0b111101),
                     VecALUOp::Fmul => (0b001_01110_00_1, 0b110111),
-                    VecALUOp::Fmla => (0b000_01110_00_1, 0b110011),
                     VecALUOp::Addp => (0b000_01110_00_1 | enc_size << 1, 0b101111),
                     VecALUOp::Zip1 => (0b01001110_00_0 | enc_size << 1, 0b001110),
                     VecALUOp::Sqrdmulh => {
@@ -2632,12 +2622,32 @@ impl MachInstEmit for Inst {
                     }
                 };
                 let top11 = if is_float {
-                    top11 | enc_float_size << 1
+                    top11 | size.enc_float_size() << 1
                 } else {
                     top11
                 };
                 sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
             }
+            &Inst::VecRRRMod {
+                rd,
+                rn,
+                rm,
+                alu_op,
+                size,
+            } => {
+                let rd = allocs.next_writable(rd);
+                let rn = allocs.next(rn);
+                let rm = allocs.next(rm);
+                let (q, _enc_size) = size.enc_size();
+
+                let (top11, bit15_10) = match alu_op {
+                    VecALUModOp::Bsl => (0b001_01110_01_1, 0b000111),
+                    VecALUModOp::Fmla => {
+                        (0b000_01110_00_1 | (size.enc_float_size() << 1), 0b110011)
+                    }
+                };
+                sink.put4(enc_vec_rrr(top11 | q << 9, rm, bit15_10, rn, rd));
+            }
             &Inst::VecLoadReplicate {
                 rd,
                 rn,
diff --git a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
index 01d3e0fe48b5..86b6a543f525 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
@@ -3383,8 +3383,8 @@ fn test_aarch64_binemit() {
     ));
 
     insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Bsl,
+        Inst::VecRRRMod {
+            alu_op: VecALUModOp::Bsl,
             rd: writable_vreg(8),
             rn: vreg(9),
             rm: vreg(1),
@@ -4055,8 +4055,8 @@ fn test_aarch64_binemit() {
     ));
 
     insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Fmla,
+        Inst::VecRRRMod {
+            alu_op: VecALUModOp::Fmla,
             rd: writable_vreg(2),
             rn: vreg(0),
             rm: vreg(5),
@@ -4067,8 +4067,8 @@ fn test_aarch64_binemit() {
     ));
 
     insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Fmla,
+        Inst::VecRRRMod {
+            alu_op: VecALUModOp::Fmla,
             rd: writable_vreg(2),
             rn: vreg(0),
             rm: vreg(5),
@@ -4079,8 +4079,8 @@ fn test_aarch64_binemit() {
     ));
 
     insns.push((
-        Inst::VecRRR {
-            alu_op: VecALUOp::Fmla,
+        Inst::VecRRRMod {
+            alu_op: VecALUModOp::Fmla,
             rd: writable_vreg(2),
             rn: vreg(0),
             rm: vreg(5),
diff --git a/cranelift/codegen/src/isa/aarch64/inst/mod.rs b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
index a35e97e1c59a..6e45beb66b39 100644
--- a/cranelift/codegen/src/isa/aarch64/inst/mod.rs
+++ b/cranelift/codegen/src/isa/aarch64/inst/mod.rs
@@ -37,9 +37,9 @@ mod emit_tests;
 
 pub use crate::isa::aarch64::lower::isle::generated_code::{
     ALUOp, ALUOp3, APIKey, AtomicRMWLoopOp, AtomicRMWOp, BitOp, FPUOp1, FPUOp2, FPUOp3,
-    FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUOp, VecExtendOp,
-    VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp, VecRRRLongOp,
-    VecShiftImmOp,
+    FpuRoundMode, FpuToIntOp, IntToFpuOp, MInst as Inst, MoveWideOp, VecALUModOp, VecALUOp,
+    VecExtendOp, VecLanesOp, VecMisc2, VecPairOp, VecRRLongOp, VecRRNarrowOp, VecRRPairLongOp,
+    VecRRRLongOp, VecShiftImmOp,
 };
 
 /// A floating-point unit (FPU) operation with two args, a register and an immediate.
@@ -957,14 +957,13 @@ fn aarch64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
             collector.reg_def(rd);
             collector.reg_use(rn);
         }
-        &Inst::VecRRR {
-            alu_op, rd, rn, rm, ..
-        } => {
-            if alu_op == VecALUOp::Bsl || alu_op == VecALUOp::Fmla {
-                collector.reg_mod(rd);
-            } else {
-                collector.reg_def(rd);
-            }
+        &Inst::VecRRR { rd, rn, rm, .. } => {
+            collector.reg_def(rd);
+            collector.reg_use(rn);
+            collector.reg_use(rm);
+        }
+        &Inst::VecRRRMod { rd, rn, rm, .. } => {
+            collector.reg_mod(rd);
             collector.reg_use(rn);
             collector.reg_use(rm);
         }
@@ -2208,7 +2207,6 @@ impl Inst {
                     VecALUOp::Bic => ("bic", VectorSize::Size8x16),
                     VecALUOp::Orr => ("orr", VectorSize::Size8x16),
                     VecALUOp::Eor => ("eor", VectorSize::Size8x16),
-                    VecALUOp::Bsl => ("bsl", VectorSize::Size8x16),
                     VecALUOp::Umaxp => ("umaxp", size),
                     VecALUOp::Add => ("add", size),
                     VecALUOp::Sub => ("sub", size),
@@ -2226,7 +2224,6 @@ impl Inst {
                     VecALUOp::Fmax => ("fmax", size),
                     VecALUOp::Fmin => ("fmin", size),
                     VecALUOp::Fmul => ("fmul", size),
-                    VecALUOp::Fmla => ("fmla", size),
                     VecALUOp::Addp => ("addp", size),
                     VecALUOp::Zip1 => ("zip1", size),
                     VecALUOp::Sqrdmulh => ("sqrdmulh", size),
@@ -2236,6 +2233,22 @@ impl Inst {
                 let rm = pretty_print_vreg_vector(rm, size, allocs);
                 format!("{} {}, {}, {}", op, rd, rn, rm)
             }
+            &Inst::VecRRRMod {
+                rd,
+                rn,
+                rm,
+                alu_op,
+                size,
+            } => {
+                let (op, size) = match alu_op {
+                    VecALUModOp::Bsl => ("bsl", VectorSize::Size8x16),
+                    VecALUModOp::Fmla => ("fmla", size),
+                };
+                let rd = pretty_print_vreg_vector(rd.to_reg(), size, allocs);
+                let rn = pretty_print_vreg_vector(rn, size, allocs);
+                let rm = pretty_print_vreg_vector(rm, size, allocs);
+                format!("{} {}, {}, {}", op, rd, rn, rm)
+            }
             &Inst::VecRRRLong {
                 rd,
                 rn,
diff --git a/cranelift/codegen/src/isa/aarch64/lower.isle b/cranelift/codegen/src/isa/aarch64/lower.isle
index deeac5193840..293cd9bc0f13 100644
--- a/cranelift/codegen/src/isa/aarch64/lower.isle
+++ b/cranelift/codegen/src/isa/aarch64/lower.isle
@@ -380,7 +380,7 @@
 ;;;; Rules for `fma` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (rule (lower (has_type ty @ (multi_lane _ _) (fma x y z)))
-      (vec_rrr_inplace (VecALUOp.Fmla) z x y (vector_size ty)))
+      (vec_rrr_mod (VecALUModOp.Fmla) z x y (vector_size ty)))
 
 (rule (lower (has_type (ty_scalar_float ty) (fma x y z)))
       (fpu_rrrr (FPUOp3.MAdd) (scalar_size ty) x y z))