Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement fcvt_from_uint (i32x4 -> f32x4) for x86 #1765

Merged
merged 5 commits into from
Jun 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1639,6 +1639,7 @@ fn define_simd(
let x86_movlhps = x86.by_name("x86_movlhps");
let x86_movsd = x86.by_name("x86_movsd");
let x86_packss = x86.by_name("x86_packss");
let x86_pblendw = x86.by_name("x86_pblendw");
let x86_pextr = x86.by_name("x86_pextr");
let x86_pinsr = x86.by_name("x86_pinsr");
let x86_pmaxs = x86.by_name("x86_pmaxs");
Expand All @@ -1655,10 +1656,12 @@ fn define_simd(
let x86_ptest = x86.by_name("x86_ptest");
let x86_punpckh = x86.by_name("x86_punpckh");
let x86_punpckl = x86.by_name("x86_punpckl");
let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps");

// Shorthands for recipes.
let rec_blend = r.template("blend");
let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
let rec_evex_reg_rm_128 = r.template("evex_reg_rm_128");
let rec_f_ib = r.template("f_ib");
let rec_fa = r.template("fa");
let rec_fa_ib = r.template("fa_ib");
Expand Down Expand Up @@ -1702,6 +1705,7 @@ fn define_simd(
let use_sse41_simd = settings.predicate_by_name("use_sse41_simd");
let use_sse42_simd = settings.predicate_by_name("use_sse42_simd");
let use_avx512dq_simd = settings.predicate_by_name("use_avx512dq_simd");
let use_avx512vl_simd = settings.predicate_by_name("use_avx512vl_simd");

// SIMD vector size: eventually multiple vector sizes may be supported but for now only
// SSE-sized vectors are available.
Expand Down Expand Up @@ -1741,6 +1745,13 @@ fn define_simd(
e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
}

// PBLENDW, select lanes using a u8 immediate.
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
let instruction = x86_pblendw.bind(vector(ty, sse_vector_size));
let template = rec_fa_ib.opcodes(&PBLENDW);
e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
}

// SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
// to the Intel manual: "When the destination operand is an XMM register, the source operand is
// written to the low doubleword of the register and the register is zero-extended to 128 bits."
Expand Down Expand Up @@ -1885,6 +1896,12 @@ fn define_simd(
.bind(vector(F32, sse_vector_size))
.bind(vector(I32, sse_vector_size));
e.enc_both(fcvt_from_sint_32, rec_furm.opcodes(&CVTDQ2PS));

e.enc_32_64_maybe_isap(
x86_vcvtudq2ps,
rec_evex_reg_rm_128.opcodes(&VCVTUDQ2PS),
Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F
);
}

// SIMD vconst for special cases (all zeroes, all ones)
Expand Down
45 changes: 45 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/instructions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,37 @@ pub(crate) fn define(
.operands_out(vec![a]),
);

let f32x4 = &TypeVar::new(
"f32x4",
"A floating point number",
TypeSetBuilder::new()
.floats(32..32)
.simd_lanes(4..4)
.build(),
);
let i32x4 = &TypeVar::new(
"i32x4",
"An integer type with the same number of lanes",
TypeSetBuilder::new().ints(32..32).simd_lanes(4..4).build(),
);
let x = &Operand::new("x", i32x4);
let a = &Operand::new("a", f32x4);

ig.push(
Inst::new(
"x86_vcvtudq2ps",
r#"
Convert unsigned integer to floating point.

Convert packed doubleword unsigned integers to packed single-precision floating-point
values. This instruction does not trap.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![a]),
);

let x = &Operand::new("x", Float);
let a = &Operand::new("a", Float);
let y = &Operand::new("y", Float);
Expand Down Expand Up @@ -302,6 +333,20 @@ pub(crate) fn define(
.operands_out(vec![a]),
);

let mask = &Operand::new("mask", uimm8).with_doc("mask to select lanes from b");
ig.push(
Inst::new(
"x86_pblendw",
r#"
Blend packed words using an immediate mask. Each bit of the 8-bit immediate corresponds to a
lane in ``b``: if the bit is set, the lane is copied into ``a``.
"#,
&formats.ternary_imm8,
)
.operands_in(vec![a, b, mask])
.operands_out(vec![a]),
);

let Idx = &Operand::new("Idx", uimm8).with_doc("Lane index");
let x = &Operand::new("x", TxN);
let a = &Operand::new("a", &TxN.lane_of());
Expand Down
3 changes: 2 additions & 1 deletion cranelift/codegen/meta/src/isa/x86/legalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ fn define_simd(
let bxor = insts.by_name("bxor");
let extractlane = insts.by_name("extractlane");
let fcmp = insts.by_name("fcmp");
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
let fabs = insts.by_name("fabs");
let fneg = insts.by_name("fneg");
let iadd_imm = insts.by_name("iadd_imm");
Expand Down Expand Up @@ -788,6 +789,6 @@ fn define_simd(
narrow.custom_legalize(ushr, "convert_ushr");
narrow.custom_legalize(ishl, "convert_ishl");

// This lives in the expand group to avoid conflicting with, e.g., i128 legalizations.
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
}
2 changes: 2 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_32.legalize_type(F32, x86_expand);
x86_32.legalize_type(F64, x86_expand);
x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);

x86_64.legalize_monomorphic(expand_flags);
x86_64.legalize_default(x86_narrow);
Expand All @@ -60,6 +61,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_64.legalize_type(F32, x86_expand);
x86_64.legalize_type(F64, x86_expand);
x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);

let recipes = recipes::define(shared_defs, &settings, &regs);

Expand Down
10 changes: 10 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/opcodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,10 @@ pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3];
/// in XMM0 and store the values into xmm1 (SSE4.1).
pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10];

/// Select words from xmm1 and xmm2/m128 according to the mask specified in imm8 and store the
/// values into xmm1 (SSE4.1).
pub static PBLENDW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0e];

/// Compare packed data for equal (SSE2).
pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74];

Expand Down Expand Up @@ -665,6 +669,12 @@ pub static UCOMISS: [u8; 2] = [0x0f, 0x2e];
/// Raise invalid opcode instruction.
pub static UNDEFINED2: [u8; 2] = [0x0f, 0x0b];

/// Convert four packed unsigned doubleword integers from xmm2/m128/m32bcst to packed
/// single-precision floating-point values in xmm1 with writemask k1. Rounding behavior
/// is controlled by MXCSR but can be overridden by EVEX.L'L in static rounding mode
/// (AVX512VL, AVX512F).
pub static VCVTUDQ2PS: [u8; 3] = [0xf2, 0x0f, 0x7a];
abrown marked this conversation as resolved.
Show resolved Hide resolved

/// imm{16,32} XOR r/m{16,32,64}, possibly sign-extended.
pub static XOR_IMM: [u8; 1] = [0x81];

Expand Down
18 changes: 18 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/recipes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3417,5 +3417,23 @@ pub(crate) fn define<'shared>(
regs).rex_kind(RecipePrefixKind::Evex)
);

recipes.add_template(
Template::new(
EncodingRecipeBuilder::new("evex_reg_rm_128", &formats.unary, 1)
.operands_in(vec![fpr])
.operands_out(vec![fpr])
.emit(
r#"
// instruction encoding operands: reg (op1, w), rm (op2, r)
// this maps to: out_reg0, in_reg0
let context = EvexContext::Other { length: EvexVectorLength::V128 };
let masking = EvexMasking::None;
put_evex(bits, out_reg0, 0, in_reg0, context, masking, sink); // params: reg, vvvv, rm
modrm_rr(in_reg0, out_reg0, sink); // params: rm, reg
"#,
),
regs).rex_kind(RecipePrefixKind::Evex)
);

recipes
}
11 changes: 10 additions & 1 deletion cranelift/codegen/meta/src/isa/x86/settings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
);
let has_avx512vl = settings.add_bool(
"has_avx512vl",
"AVX512DQ: CPUID.07H:EBX.AVX512VL[bit 31]",
"AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]",
false,
);
let has_avx512f = settings.add_bool(
"has_avx512f",
"AVX512F: CPUID.07H:EBX.AVX512F[bit 16]",
false,
);
let has_popcnt = settings.add_bool("has_popcnt", "POPCNT: CPUID.01H:ECX.POPCNT[bit 23]", false);
Expand Down Expand Up @@ -76,6 +81,10 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
"use_avx512vl_simd",
predicate!(shared_enable_simd && has_avx512vl),
);
settings.add_predicate(
"use_avx512f_simd",
predicate!(shared_enable_simd && has_avx512f),
);

settings.add_predicate("use_popcnt", predicate!(has_popcnt && has_sse42));
settings.add_predicate("use_bmi1", predicate!(has_bmi1));
Expand Down
2 changes: 2 additions & 0 deletions cranelift/codegen/src/isa/aarch64/lower_inst.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2046,6 +2046,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::X86Pop
| Opcode::X86Bsr
| Opcode::X86Bsf
| Opcode::X86Pblendw
| Opcode::X86Pshufd
| Opcode::X86Pshufb
| Opcode::X86Pextr
Expand All @@ -2066,6 +2067,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::X86Packss
| Opcode::X86Punpckh
| Opcode::X86Punpckl
| Opcode::X86Vcvtudq2ps
| Opcode::X86ElfTlsGetAddr
| Opcode::X86MachoTlsGetAddr => {
panic!("x86-specific opcode in supposedly arch-neutral IR!");
Expand Down
53 changes: 53 additions & 0 deletions cranelift/codegen/src/isa/x86/enc_tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,9 @@ fn expand_minmax(

/// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to
/// i64 with a pattern, the rest needs more code.
///
/// Note that this is the scalar implementation; for the vector implementation see
/// [expand_fcvt_from_uint_vector].
fn expand_fcvt_from_uint(
inst: ir::Inst,
func: &mut ir::Function,
Expand Down Expand Up @@ -679,6 +682,56 @@ fn expand_fcvt_from_uint(
cfg.recompute_block(pos.func, done);
}

/// Legalize a vector `fcvt_from_uint` (currently only `i32x4` -> `f32x4`).
///
/// To convert packed unsigned integers to their float equivalents, we must legalize to a special
/// AVX512 instruction (using MXCSR rounding) or use a long sequence of instructions. This logic is
/// separate from [expand_fcvt_from_uint] above (the scalar version), only due to how the transform
/// groups are set up; TODO if we change the SIMD legalization groups, then this logic could be
/// merged into [expand_fcvt_from_uint] (see https://github.com/bytecodealliance/wasmtime/issues/1745).
///
/// Instructions that are not a unary `fcvt_from_uint` are left untouched. A `fcvt_from_uint` whose
/// controlling type is anything other than `F32X4` hits `unimplemented!`.
fn expand_fcvt_from_uint_vector(
    inst: ir::Inst,
    func: &mut ir::Function,
    _cfg: &mut ControlFlowGraph,
    isa: &dyn TargetIsa,
) {
    let mut pos = FuncCursor::new(func).at_inst(inst);
    pos.use_srcloc(inst);

    if let ir::InstructionData::Unary {
        opcode: ir::Opcode::FcvtFromUint,
        arg,
    } = pos.func.dfg[inst]
    {
        let controlling_type = pos.func.dfg.ctrl_typevar(inst);
        if controlling_type == F32X4 {
            debug_assert_eq!(pos.func.dfg.value_type(arg), I32X4);
            let x86_isa = isa
                .as_any()
                .downcast_ref::<isa::x86::Isa>()
                .expect("the target ISA must be x86 at this point");
            // The 128-bit EVEX form of VCVTUDQ2PS requires AVX512F *and* AVX512VL. Requiring both
            // here also guarantees that the `use_avx512vl_simd` predicate guarding the encoding of
            // `x86_vcvtudq2ps` holds, so we never emit an instruction that cannot be encoded (as
            // would happen if only AVX512F were enabled).
            if x86_isa.isa_flags.use_avx512vl_simd() && x86_isa.isa_flags.use_avx512f_simd() {
                // With the required AVX512 features, we can lower this instruction simply.
                pos.func.dfg.replace(inst).x86_vcvtudq2ps(arg);
            } else {
                // Otherwise, we default to a very lengthy SSE4.1-compatible sequence: PXOR,
                // PBLENDW, PSUB, CVTDQ2PS, PSRLD, CVTDQ2PS, ADDPS, ADDPS.
                //
                // The idea is to split every 32-bit lane into its low and high 16 bits so that
                // each part can go through the *signed* conversion without being misread as
                // negative.
                let bitcast_arg = pos.ins().raw_bitcast(I16X8, arg);
                let zero_constant = pos.func.dfg.constants.insert(vec![0; 16].into());
                let zero = pos.ins().vconst(I16X8, zero_constant);
                // Mask 0x55 takes the even 16-bit words from `arg` and the odd words from zero,
                // i.e. it keeps only the low half of every 32-bit lane.
                let low = pos.ins().x86_pblendw(zero, bitcast_arg, 0x55);
                let bitcast_low = pos.ins().raw_bitcast(I32X4, low);
                // Subtracting the low halves leaves only the high 16 bits of each lane.
                let high = pos.ins().isub(arg, bitcast_low);
                let convert_low = pos.ins().fcvt_from_sint(F32X4, bitcast_low);
                // Halve the high part so the sign bit is clear before the signed conversion...
                let shift_high = pos.ins().ushr_imm(high, 1);
                let convert_high = pos.ins().fcvt_from_sint(F32X4, shift_high);
                // ...then double it again in floating point and add the low part back in.
                let double_high = pos.ins().fadd(convert_high, convert_high);
                pos.func.dfg.replace(inst).fadd(double_high, convert_low);
            }
        } else {
            unimplemented!("cannot legalize {}", pos.func.dfg.display_inst(inst, None))
        }
    }
}

fn expand_fcvt_to_sint(
inst: ir::Inst,
func: &mut ir::Function,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
test binemit
set enable_simd
target x86_64 has_avx512vl=true

; Verify the emitted bytes for the 128-bit EVEX-encoded x86_vcvtudq2ps, converting the i32x4 in
; %xmm2 into %xmm6.
function %fcvt_from_uint(i32x4) {
block0(v0: i32x4 [%xmm2]):
[-, %xmm6] v1 = x86_vcvtudq2ps v0 ; bin: 62 f1 7f 08 7a f2
return
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
test legalizer
set enable_simd
target x86_64 skylake has_avx512f=true has_avx512vl=true

; The 128-bit form of VCVTUDQ2PS needs both AVX512F and AVX512VL, so both are enabled here; with
; them, the unsigned vector conversion should legalize to the single x86-specific instruction.
function %fcvt_from_uint(i32x4) -> f32x4 {
block0(v0:i32x4):
v1 = fcvt_from_uint.f32x4 v0
; check: v1 = x86_vcvtudq2ps v0
return v1
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
test legalizer
set enable_simd
target x86_64 skylake

; No AVX512 settings are enabled on the target line above, so the unsigned vector conversion is
; expected to legalize to the longer sequence below: the low 16 bits of each lane are isolated with
; x86_pblendw, the remaining high part is halved, both parts go through the signed conversion, and
; the high part is doubled before the two are summed.
function %fcvt_from_uint(i32x4) -> f32x4 {
block0(v0:i32x4):
v1 = fcvt_from_uint.f32x4 v0
; check: v2 = raw_bitcast.i16x8 v0
; nextln: v3 = vconst.i16x8 const0
; nextln: v4 = x86_pblendw v3, v2, 85
; nextln: v5 = raw_bitcast.i32x4 v4
; nextln: v6 = isub v0, v5
; nextln: v7 = fcvt_from_sint.f32x4 v5
; nextln: v8 = ushr_imm v6, 1
; nextln: v9 = fcvt_from_sint.f32x4 v8
; nextln: v10 = fadd v9, v9
; nextln: v1 = fadd v10, v7
return v1
}
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,10 @@ block0:
return v4
}
; run

; Unsigned i32x4 to f32x4 conversion; the invocation below only covers the all-zero input.
function %fcvt_from_uint(i32x4) -> f32x4 {
block0(v0:i32x4):
v1 = fcvt_from_uint.f32x4 v0
return v1
}
; run: %fcvt_from_uint([0 0 0 0]) == [0x0.0 0x0.0 0x0.0 0x0.0]
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,14 @@ block0:
return
}

;; blend

; Verify the emitted bytes for x86_pblendw with immediate mask 0x55; the destination is tied to the
; first operand (%xmm10).
function %pblendw(b16x8, b16x8) {
block0(v0: b16x8 [%xmm10], v1: b16x8 [%xmm2]):
[-, %xmm10] v2 = x86_pblendw v0, v1, 0x55 ; bin: 66 44 0f 3a 0e d2 55
return
}

;; pack/unpack

function %unpack_high_i8x16(i8x16, i8x16) {
Expand Down
3 changes: 3 additions & 0 deletions cranelift/native/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ fn parse_x86_cpuid(isa_builder: &mut isa::Builder) -> Result<(), &'static str> {
if info.has_avx512vl() {
isa_builder.enable("has_avx512vl").unwrap();
}
if info.has_avx512f() {
isa_builder.enable("has_avx512f").unwrap();
}
}
if let Some(info) = cpuid.get_extended_function_info() {
if info.has_lzcnt() {
Expand Down
5 changes: 4 additions & 1 deletion cranelift/wasm/src/code_translator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1544,9 +1544,12 @@ pub fn translate_operator<FE: FuncEnvironment + ?Sized>(
let a = pop1_with_bitcast(state, I32X4, builder);
state.push1(builder.ins().fcvt_from_sint(F32X4, a))
}
Operator::F32x4ConvertI32x4U => {
let a = pop1_with_bitcast(state, I32X4, builder);
state.push1(builder.ins().fcvt_from_uint(F32X4, a))
}
Operator::I32x4TruncSatF32x4S
| Operator::I32x4TruncSatF32x4U
| Operator::F32x4ConvertI32x4U
| Operator::I8x16Abs
| Operator::I16x8Abs
| Operator::I32x4Abs
Expand Down