diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs index 8afe4e400b2b..afceebff31a1 100644 --- a/cranelift/codegen/meta/src/isa/x86/encodings.rs +++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs @@ -1635,10 +1635,12 @@ fn define_simd( let usub_sat = shared.by_name("usub_sat"); let vconst = shared.by_name("vconst"); let vselect = shared.by_name("vselect"); + let x86_cvtt2si = x86.by_name("x86_cvtt2si"); let x86_insertps = x86.by_name("x86_insertps"); let x86_movlhps = x86.by_name("x86_movlhps"); let x86_movsd = x86.by_name("x86_movsd"); let x86_packss = x86.by_name("x86_packss"); + let x86_pblendw = x86.by_name("x86_pblendw"); let x86_pextr = x86.by_name("x86_pextr"); let x86_pinsr = x86.by_name("x86_pinsr"); let x86_pmaxs = x86.by_name("x86_pmaxs"); @@ -1655,10 +1657,12 @@ fn define_simd( let x86_ptest = x86.by_name("x86_ptest"); let x86_punpckh = x86.by_name("x86_punpckh"); let x86_punpckl = x86.by_name("x86_punpckl"); + let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps"); // Shorthands for recipes. let rec_blend = r.template("blend"); let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128"); + let rec_evex_reg_rm_128 = r.template("evex_reg_rm_128"); let rec_f_ib = r.template("f_ib"); let rec_fa = r.template("fa"); let rec_fa_ib = r.template("fa_ib"); @@ -1702,6 +1706,7 @@ fn define_simd( let use_sse41_simd = settings.predicate_by_name("use_sse41_simd"); let use_sse42_simd = settings.predicate_by_name("use_sse42_simd"); let use_avx512dq_simd = settings.predicate_by_name("use_avx512dq_simd"); + let use_avx512vl_simd = settings.predicate_by_name("use_avx512vl_simd"); // SIMD vector size: eventually multiple vector sizes may be supported but for now only // SSE-sized vectors are available. @@ -1741,6 +1746,13 @@ fn define_simd( e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd)); } + // PBLENDW, select lanes using a u8 immediate. 
+ for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) { + let instruction = x86_pblendw.bind(vector(ty, sse_vector_size)); + let template = rec_fa_ib.opcodes(&PBLENDW); + e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd)); + } + // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according // to the Intel manual: "When the destination operand is an XMM register, the source operand is // written to the low doubleword of the register and the register is zero-extended to 128 bits." @@ -1885,6 +1897,19 @@ fn define_simd( .bind(vector(F32, sse_vector_size)) .bind(vector(I32, sse_vector_size)); e.enc_both(fcvt_from_sint_32, rec_furm.opcodes(&CVTDQ2PS)); + + e.enc_32_64_maybe_isap( + x86_vcvtudq2ps, + rec_evex_reg_rm_128.opcodes(&VCVTUDQ2PS), + Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F + ); + + e.enc_both_inferred( + x86_cvtt2si + .bind(vector(I32, sse_vector_size)) + .bind(vector(F32, sse_vector_size)), + rec_furm.opcodes(&CVTTPS2DQ), + ); } // SIMD vconst for special cases (all zeroes, all ones) diff --git a/cranelift/codegen/meta/src/isa/x86/instructions.rs b/cranelift/codegen/meta/src/isa/x86/instructions.rs index 53d91ca8614d..4afbc8874702 100644 --- a/cranelift/codegen/meta/src/isa/x86/instructions.rs +++ b/cranelift/codegen/meta/src/isa/x86/instructions.rs @@ -145,6 +145,37 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let f32x4 = &TypeVar::new( + "f32x4", + "A floating point number", + TypeSetBuilder::new() + .floats(32..32) + .simd_lanes(4..4) + .build(), + ); + let i32x4 = &TypeVar::new( + "i32x4", + "An integer type with the same number of lanes", + TypeSetBuilder::new().ints(32..32).simd_lanes(4..4).build(), + ); + let x = &Operand::new("x", i32x4); + let a = &Operand::new("a", f32x4); + + ig.push( + Inst::new( + "x86_vcvtudq2ps", + r#" + Convert unsigned integer to floating point. 
+ + Convert packed doubleword unsigned integers to packed single-precision floating-point + values. This instruction does not trap. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + let x = &Operand::new("x", Float); let a = &Operand::new("a", Float); let y = &Operand::new("y", Float); @@ -302,6 +333,20 @@ pub(crate) fn define( .operands_out(vec![a]), ); + let mask = &Operand::new("mask", uimm8).with_doc("mask to select lanes from b"); + ig.push( + Inst::new( + "x86_pblendw", + r#" + Blend packed words using an immediate mask. Each bit of the 8-bit immediate corresponds to a + lane in ``b``: if the bit is set, the lane is copied into ``a``. + "#, + &formats.ternary_imm8, + ) + .operands_in(vec![a, b, mask]) + .operands_out(vec![a]), + ); + let Idx = &Operand::new("Idx", uimm8).with_doc("Lane index"); let x = &Operand::new("x", TxN); let a = &Operand::new("a", &TxN.lane_of()); diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index 6e5c791b7984..130205fee083 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -380,6 +380,8 @@ fn define_simd( let bxor = insts.by_name("bxor"); let extractlane = insts.by_name("extractlane"); let fcmp = insts.by_name("fcmp"); + let fcvt_from_uint = insts.by_name("fcvt_from_uint"); + let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat"); let fabs = insts.by_name("fabs"); let fneg = insts.by_name("fneg"); let iadd_imm = insts.by_name("iadd_imm"); @@ -787,7 +789,8 @@ fn define_simd( narrow.custom_legalize(ineg, "convert_ineg"); narrow.custom_legalize(ushr, "convert_ushr"); narrow.custom_legalize(ishl, "convert_ishl"); + narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector"); - // This lives in the expand group to avoid conflicting with, e.g., i128 legalizations. 
narrow_avx.custom_legalize(imul, "convert_i64x2_imul"); + narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector"); } diff --git a/cranelift/codegen/meta/src/isa/x86/mod.rs b/cranelift/codegen/meta/src/isa/x86/mod.rs index 2e9305e9f730..8d2e33be732c 100644 --- a/cranelift/codegen/meta/src/isa/x86/mod.rs +++ b/cranelift/codegen/meta/src/isa/x86/mod.rs @@ -48,6 +48,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { x86_32.legalize_type(F32, x86_expand); x86_32.legalize_type(F64, x86_expand); x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx); + x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx); x86_64.legalize_monomorphic(expand_flags); x86_64.legalize_default(x86_narrow); @@ -60,6 +61,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { x86_64.legalize_type(F32, x86_expand); x86_64.legalize_type(F64, x86_expand); x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx); + x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx); let recipes = recipes::define(shared_defs, &settings, ®s); diff --git a/cranelift/codegen/meta/src/isa/x86/opcodes.rs b/cranelift/codegen/meta/src/isa/x86/opcodes.rs index c1d4fa0ef5a2..b4218dbf5229 100644 --- a/cranelift/codegen/meta/src/isa/x86/opcodes.rs +++ b/cranelift/codegen/meta/src/isa/x86/opcodes.rs @@ -103,6 +103,10 @@ pub static CVTSI2SS: [u8; 3] = [0xf3, 0x0f, 0x2a]; /// float-point value. pub static CVTSS2SD: [u8; 3] = [0xf3, 0x0f, 0x5a]; +/// Convert four packed single-precision floating-point values from xmm2/mem to four packed signed +/// doubleword values in xmm1 using truncation (SSE2). +pub static CVTTPS2DQ: [u8; 3] = [0xf3, 0x0f, 0x5b]; + /// Convert with truncation scalar double-precision floating-point value to signed /// integer. 
pub static CVTTSD2SI: [u8; 3] = [0xf2, 0x0f, 0x2c]; @@ -347,6 +351,10 @@ pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3]; /// in XMM0 and store the values into xmm1 (SSE4.1). pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10]; +/// Select words from xmm1 and xmm2/m128 from mask specified in imm8 and store the values into xmm1 +/// (SSE4.1). +pub static PBLENDW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0e]; + /// Compare packed data for equal (SSE2). pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74]; @@ -665,6 +673,10 @@ pub static UCOMISS: [u8; 2] = [0x0f, 0x2e]; /// Raise invalid opcode instruction. pub static UNDEFINED2: [u8; 2] = [0x0f, 0x0b]; +/// Convert four packed unsigned doubleword integers from xmm2/m128/m32bcst to packed +/// single-precision floating-point values in xmm1 with writemask k1 (AVX512VL, AVX512F). +pub static VCVTUDQ2PS: [u8; 3] = [0xf2, 0x0f, 0x7a]; + /// imm{16,32} XOR r/m{16,32,64}, possibly sign-extended. pub static XOR_IMM: [u8; 1] = [0x81]; diff --git a/cranelift/codegen/meta/src/isa/x86/recipes.rs b/cranelift/codegen/meta/src/isa/x86/recipes.rs index 0cfd83d3734a..74645d0b5922 100644 --- a/cranelift/codegen/meta/src/isa/x86/recipes.rs +++ b/cranelift/codegen/meta/src/isa/x86/recipes.rs @@ -3417,5 +3417,23 @@ pub(crate) fn define<'shared>( regs).rex_kind(RecipePrefixKind::Evex) ); + recipes.add_template( + Template::new( + EncodingRecipeBuilder::new("evex_reg_rm_128", &formats.unary, 1) + .operands_in(vec![fpr]) + .operands_out(vec![fpr]) + .emit( + r#" + // instruction encoding operands: reg (op1, w), rm (op2, r) + // this maps to: out_reg0, in_reg0 + let context = EvexContext::Other { length: EvexVectorLength::V128 }; + let masking = EvexMasking::None; + put_evex(bits, out_reg0, 0, in_reg0, context, masking, sink); // params: reg, vvvv, rm + modrm_rr(in_reg0, out_reg0, sink); // params: rm, reg + "#, + ), + regs).rex_kind(RecipePrefixKind::Evex) + ); + recipes } diff --git a/cranelift/codegen/meta/src/isa/x86/settings.rs 
b/cranelift/codegen/meta/src/isa/x86/settings.rs index 0ef36b668673..8848a9143902 100644 --- a/cranelift/codegen/meta/src/isa/x86/settings.rs +++ b/cranelift/codegen/meta/src/isa/x86/settings.rs @@ -9,6 +9,21 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup { false, ); + settings.add_bool( + "assert_no_nans", + "If set, Cranelift will assume that floating-point operations will not produce \ + NaNs; in certain cases, Cranelift can use this information to produce faster code.", + false, + ); + + settings.add_bool( + "assert_in_bounds", + "If set, Cranelift will assume that operations with bounds are in bounds (e.g. \ + float/integers conversions, dynamic lane indices); in certain cases, Cranelift can use \ + this information to produce faster code.", + false, + ); + // CPUID.01H:ECX let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false); let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false); @@ -23,7 +38,12 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup { ); let has_avx512vl = settings.add_bool( "has_avx512vl", - "AVX512DQ: CPUID.07H:EBX.AVX512VL[bit 31]", + "AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]", + false, + ); + let has_avx512f = settings.add_bool( + "has_avx512f", + "AVX512F: CPUID.07H:EBX.AVX512F[bit 16]", false, ); let has_popcnt = settings.add_bool("has_popcnt", "POPCNT: CPUID.01H:ECX.POPCNT[bit 23]", false); @@ -76,6 +96,10 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup { "use_avx512vl_simd", predicate!(shared_enable_simd && has_avx512vl), ); + settings.add_predicate( + "use_avx512f_simd", + predicate!(shared_enable_simd && has_avx512f), + ); settings.add_predicate("use_popcnt", predicate!(has_popcnt && has_sse42)); settings.add_predicate("use_bmi1", predicate!(has_bmi1)); diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index daeb0c33c350..6a5a3d3491bb 100644 --- 
a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -1896,6 +1896,7 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::X86Pop | Opcode::X86Bsr | Opcode::X86Bsf + | Opcode::X86Pblendw | Opcode::X86Pshufd | Opcode::X86Pshufb | Opcode::X86Pextr @@ -1916,6 +1917,7 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::X86Packss | Opcode::X86Punpckh | Opcode::X86Punpckl + | Opcode::X86Vcvtudq2ps | Opcode::X86ElfTlsGetAddr | Opcode::X86MachoTlsGetAddr => { panic!("x86-specific opcode in supposedly arch-neutral IR!"); diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs index d9f8f87f9ad1..b347c546ef28 100644 --- a/cranelift/codegen/src/isa/x86/enc_tables.rs +++ b/cranelift/codegen/src/isa/x86/enc_tables.rs @@ -597,6 +597,9 @@ fn expand_minmax( /// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to /// i64 with a pattern, the rest needs more code. +/// +/// Note that this is the scalar implementation; for the vector implementation see +/// [expand_fcvt_from_uint_vector]. fn expand_fcvt_from_uint( inst: ir::Inst, func: &mut ir::Function, @@ -678,6 +681,55 @@ fn expand_fcvt_from_uint( cfg.recompute_block(pos.func, done); } +/// To convert packed unsigned integers to their float equivalents, we must legalize to a special +/// AVX512 instruction or use a long sequence of instructions. This logic is separate from +/// [expand_fcvt_from_uint] above (the scalar version), only due to how the transform groups are +/// set up; TODO if we change the SIMD legalization groups, then this logic could be merged into +/// [expand_fcvt_from_uint] (see https://github.com/bytecodealliance/wasmtime/issues/1745). 
+fn expand_fcvt_from_uint_vector( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtFromUint, + arg, + } = pos.func.dfg[inst] + { + let controlling_type = pos.func.dfg.ctrl_typevar(inst); + if controlling_type == F32X4 { + debug_assert_eq!(pos.func.dfg.value_type(arg), I32X4); + let x86_isa = isa + .as_any() + .downcast_ref::() + .expect("the target ISA must be x86 at this point"); + if x86_isa.isa_flags.use_avx512vl_simd() || x86_isa.isa_flags.use_avx512f_simd() { + // If we have certain AVX512 features, we can lower this instruction simply. + pos.func.dfg.replace(inst).x86_vcvtudq2ps(arg); + } else { + // Otherwise, we default to a very lengthy SSE4.1-compatible sequence. + let bitcast_arg = pos.ins().raw_bitcast(I16X8, arg); + let zero_constant = pos.func.dfg.constants.insert(vec![0; 16].into()); + let zero = pos.ins().vconst(I16X8, zero_constant); + let low = pos.ins().x86_pblendw(zero, bitcast_arg, 0x55); + let bitcast_low = pos.ins().raw_bitcast(I32X4, low); + let high = pos.ins().isub(arg, bitcast_low); + let convert_low = pos.ins().fcvt_from_sint(F32X4, bitcast_low); + let shift_high = pos.ins().ushr_imm(high, 1); + let convert_high = pos.ins().fcvt_from_sint(F32X4, shift_high); + let double_high = pos.ins().fadd(convert_high, convert_high); + pos.func.dfg.replace(inst).fadd(double_high, convert_low); + } + } else { + unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None)) + } + } +} + fn expand_fcvt_to_sint( inst: ir::Inst, func: &mut ir::Function, @@ -910,6 +962,58 @@ fn expand_fcvt_to_sint_sat( cfg.recompute_block(pos.func, done_block); } +/// This legalization converts a vector of 32-bit floating point lanes to signed integer lanes +/// using CVTTPS2DQ (see encoding of `x86_cvtt2si`). 
If user-defined flags `assert_no_nans` and +/// `assert_in_bounds` are set, only CVTTPS2DQ is emitted; otherwise, a longer sequence of NaN +/// quieting and lane saturation is emitted. This logic is separate from [expand_fcvt_to_sint_sat] +/// above (the scalar version), only due to how the transform groups are set up; TODO if we change +/// the SIMD legalization groups, then this logic could be merged into [expand_fcvt_to_sint_sat] +/// (see https://github.com/bytecodealliance/wasmtime/issues/1745). +fn expand_fcvt_to_sint_sat_vector( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Unary { + opcode: ir::Opcode::FcvtToSintSat, + arg, + } = pos.func.dfg[inst] + { + let controlling_type = pos.func.dfg.ctrl_typevar(inst); + if controlling_type == I32X4 { + debug_assert_eq!(pos.func.dfg.value_type(arg), F32X4); + let x86_isa = isa + .as_any() + .downcast_ref::() + .expect("the target ISA must be x86 at this point"); + if x86_isa.isa_flags.assert_no_nans() && x86_isa.isa_flags.assert_in_bounds() { + // When no NaNs are possible and we know the results do not need saturation, we can + // emit faster code. + pos.func.dfg.replace(inst).x86_cvtt2si(F32X4, arg); + } else { + // Otherwise, we must both quiet any NaNs--setting that lane to 0--and saturate any + // lanes that might overflow during conversion to the highest/lowest integer + // allowed in that lane. 
+ let ones_constant = pos.func.dfg.constants.insert(vec![0xff; 16].into()); + let ones = pos.ins().vconst(F32X4, ones_constant); + let arg1 = pos.ins().band(arg, ones); + let tmp1 = pos.ins().bxor(ones, arg); + let arg2 = pos.ins().x86_cvtt2si(I32X4, arg1); + let tmp2 = pos.ins().raw_bitcast(I32X4, tmp1); + let tmp3 = pos.ins().band(tmp2, arg2); + let tmp4 = pos.ins().sshr_imm(tmp3, 31); + pos.func.dfg.replace(inst).bxor(arg2, tmp4); + } + } else { + unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None)) + } + } +} + fn expand_fcvt_to_uint( inst: ir::Inst, func: &mut ir::Function, diff --git a/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-binemit.clif new file mode 100644 index 000000000000..37abef0e61c9 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-binemit.clif @@ -0,0 +1,9 @@ +test binemit +set enable_simd +target x86_64 has_avx512vl=true + +function %fcvt_from_uint(i32x4) { +block0(v0: i32x4 [%xmm2]): +[-, %xmm6] v1 = x86_vcvtudq2ps v0 ; bin: 62 f1 7f 08 7a f2 + return +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif new file mode 100644 index 000000000000..78dc1cf2200e --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/simd-avx512-conversion-legalize.clif @@ -0,0 +1,10 @@ +test legalizer +set enable_simd +target x86_64 skylake has_avx512f=true + +function %fcvt_from_uint(i32x4) -> f32x4 { +block0(v0:i32x4): + v1 = fcvt_from_uint.f32x4 v0 + ; check: v1 = x86_vcvtudq2ps v0 + return v1 +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize-fast.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize-fast.clif new file mode 100644 index 000000000000..96e5c9e8deb1 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize-fast.clif @@ 
-0,0 +1,12 @@ +test legalizer +set enable_simd +target x86_64 skylake assert_no_nans=true assert_in_bounds=true + +; When we assert that no NaNs are possible and the conversion will not overflow, we can emit a shorter legalization +; sequence. +function %fcvt_to_sint_sat(f32x4) -> i32x4 { +block0(v0:f32x4): + v1 = fcvt_to_sint_sat.i32x4 v0 + ; check: v1 = x86_cvtt2si.i32x4 v0 + return v1 +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif new file mode 100644 index 000000000000..fc98fff15831 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-legalize.clif @@ -0,0 +1,33 @@ +test legalizer +set enable_simd +target x86_64 skylake + +function %fcvt_from_uint(i32x4) -> f32x4 { +block0(v0:i32x4): + v1 = fcvt_from_uint.f32x4 v0 + ; check: v2 = raw_bitcast.i16x8 v0 + ; nextln: v3 = vconst.i16x8 const0 + ; nextln: v4 = x86_pblendw v3, v2, 85 + ; nextln: v5 = raw_bitcast.i32x4 v4 + ; nextln: v6 = isub v0, v5 + ; nextln: v7 = fcvt_from_sint.f32x4 v5 + ; nextln: v8 = ushr_imm v6, 1 + ; nextln: v9 = fcvt_from_sint.f32x4 v8 + ; nextln: v10 = fadd v9, v9 + ; nextln: v1 = fadd v10, v7 + return v1 +} + +function %fcvt_to_sint_sat(f32x4) -> i32x4 { +block0(v0:f32x4): + v1 = fcvt_to_sint_sat.i32x4 v0 + ; check: v2 = vconst.f32x4 const0 + ; nextln: v3 = band v0, v2 + ; nextln: v4 = bxor v2, v0 + ; nextln: v5 = x86_cvtt2si.i32x4 v3 + ; nextln: v6 = raw_bitcast.i32x4 v4 + ; nextln: v7 = band v6, v5 + ; nextln: v8 = sshr_imm v7, 31 + ; nextln: v1 = bxor v5, v8 + return v1 +} diff --git a/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif b/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif index 3484818aa3db..ee45da3d4b42 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-conversion-run.clif @@ -13,3 +13,17 @@ block0: return v4 } ; run + +function 
%fcvt_from_uint(i32x4) -> f32x4 { +block0(v0:i32x4): + v1 = fcvt_from_uint.f32x4 v0 + return v1 +} +; run: %fcvt_from_uint([0 0 0 0]) == [0x0.0 0x0.0 0x0.0 0x0.0] + +function %fcvt_to_sint_sat(f32x4) -> i32x4 { +block0(v0:f32x4): + v1 = fcvt_to_sint_sat.i32x4 v0 + return v1 +} +; run: %fcvt_to_sint_sat([0x0.0 -0x1.0 0x1.0 0x1.0p100]) == [0 -1 1 0x7FFFFFFF] diff --git a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif index e5eea1f6372a..24bc8cfa2409 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-lane-access-binemit.clif @@ -96,6 +96,14 @@ block0: return } +;; blend + +function %pblendw(b16x8, b16x8) { +block0(v0: b16x8 [%xmm10], v1: b16x8 [%xmm2]): +[-, %xmm10] v2 = x86_pblendw v0, v1, 0x55 ; bin: 66 44 0f 3a 0e d2 55 + return +} + ;; pack/unpack function %unpack_high_i8x16(i8x16, i8x16) { diff --git a/cranelift/native/src/lib.rs b/cranelift/native/src/lib.rs index b45dab8dd5c5..903fbb3522ba 100644 --- a/cranelift/native/src/lib.rs +++ b/cranelift/native/src/lib.rs @@ -91,6 +91,9 @@ fn parse_x86_cpuid(isa_builder: &mut isa::Builder) -> Result<(), &'static str> { if info.has_avx512vl() { isa_builder.enable("has_avx512vl").unwrap(); } + if info.has_avx512f() { + isa_builder.enable("has_avx512f").unwrap(); + } } if let Some(info) = cpuid.get_extended_function_info() { if info.has_lzcnt() { diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index d11d1fb442dc..b6980cbc9f95 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1544,9 +1544,15 @@ pub fn translate_operator( let a = pop1_with_bitcast(state, I32X4, builder); state.push1(builder.ins().fcvt_from_sint(F32X4, a)) } - Operator::I32x4TruncSatF32x4S - | Operator::I32x4TruncSatF32x4U - | Operator::F32x4ConvertI32x4U + Operator::F32x4ConvertI32x4U => { + let a = 
pop1_with_bitcast(state, I32X4, builder); + state.push1(builder.ins().fcvt_from_uint(F32X4, a)) + } + Operator::I32x4TruncSatF32x4S => { + let a = pop1_with_bitcast(state, F32X4, builder); + state.push1(builder.ins().fcvt_to_sint_sat(I32X4, a)) + } + Operator::I32x4TruncSatF32x4U | Operator::I8x16Abs | Operator::I16x8Abs | Operator::I32x4Abs