Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement fcvt_to_sint_sat (f32x4 -> i32x4) for x86 #1820

Closed
wants to merge 10 commits into from
25 changes: 25 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1635,10 +1635,12 @@ fn define_simd(
let usub_sat = shared.by_name("usub_sat");
let vconst = shared.by_name("vconst");
let vselect = shared.by_name("vselect");
let x86_cvtt2si = x86.by_name("x86_cvtt2si");
let x86_insertps = x86.by_name("x86_insertps");
let x86_movlhps = x86.by_name("x86_movlhps");
let x86_movsd = x86.by_name("x86_movsd");
let x86_packss = x86.by_name("x86_packss");
let x86_pblendw = x86.by_name("x86_pblendw");
let x86_pextr = x86.by_name("x86_pextr");
let x86_pinsr = x86.by_name("x86_pinsr");
let x86_pmaxs = x86.by_name("x86_pmaxs");
Expand All @@ -1655,10 +1657,12 @@ fn define_simd(
let x86_ptest = x86.by_name("x86_ptest");
let x86_punpckh = x86.by_name("x86_punpckh");
let x86_punpckl = x86.by_name("x86_punpckl");
let x86_vcvtudq2ps = x86.by_name("x86_vcvtudq2ps");

// Shorthands for recipes.
let rec_blend = r.template("blend");
let rec_evex_reg_vvvv_rm_128 = r.template("evex_reg_vvvv_rm_128");
let rec_evex_reg_rm_128 = r.template("evex_reg_rm_128");
let rec_f_ib = r.template("f_ib");
let rec_fa = r.template("fa");
let rec_fa_ib = r.template("fa_ib");
Expand Down Expand Up @@ -1702,6 +1706,7 @@ fn define_simd(
let use_sse41_simd = settings.predicate_by_name("use_sse41_simd");
let use_sse42_simd = settings.predicate_by_name("use_sse42_simd");
let use_avx512dq_simd = settings.predicate_by_name("use_avx512dq_simd");
let use_avx512vl_simd = settings.predicate_by_name("use_avx512vl_simd");

// SIMD vector size: eventually multiple vector sizes may be supported but for now only
// SSE-sized vectors are available.
Expand Down Expand Up @@ -1741,6 +1746,13 @@ fn define_simd(
e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
}

// PBLENDW, select lanes using a u8 immediate.
for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 16) {
let instruction = x86_pblendw.bind(vector(ty, sse_vector_size));
let template = rec_fa_ib.opcodes(&PBLENDW);
e.enc_both_inferred_maybe_isap(instruction, template, Some(use_sse41_simd));
}

// SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
// to the Intel manual: "When the destination operand is an XMM register, the source operand is
// written to the low doubleword of the register and the register is zero-extended to 128 bits."
Expand Down Expand Up @@ -1885,6 +1897,19 @@ fn define_simd(
.bind(vector(F32, sse_vector_size))
.bind(vector(I32, sse_vector_size));
e.enc_both(fcvt_from_sint_32, rec_furm.opcodes(&CVTDQ2PS));

e.enc_32_64_maybe_isap(
x86_vcvtudq2ps,
rec_evex_reg_rm_128.opcodes(&VCVTUDQ2PS),
Some(use_avx512vl_simd), // TODO need an OR predicate to join with AVX512F
);

e.enc_both_inferred(
x86_cvtt2si
.bind(vector(I32, sse_vector_size))
.bind(vector(F32, sse_vector_size)),
rec_furm.opcodes(&CVTTPS2DQ),
);
}

// SIMD vconst for special cases (all zeroes, all ones)
Expand Down
45 changes: 45 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/instructions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,37 @@ pub(crate) fn define(
.operands_out(vec![a]),
);

let f32x4 = &TypeVar::new(
"f32x4",
"A floating point number",
TypeSetBuilder::new()
.floats(32..32)
.simd_lanes(4..4)
.build(),
);
let i32x4 = &TypeVar::new(
"i32x4",
"An integer type with the same number of lanes",
TypeSetBuilder::new().ints(32..32).simd_lanes(4..4).build(),
);
let x = &Operand::new("x", i32x4);
let a = &Operand::new("a", f32x4);

ig.push(
Inst::new(
"x86_vcvtudq2ps",
r#"
Convert unsigned integer to floating point.

Convert packed doubleword unsigned integers to packed single-precision floating-point
values. This instruction does not trap.
"#,
&formats.unary,
)
.operands_in(vec![x])
.operands_out(vec![a]),
);

let x = &Operand::new("x", Float);
let a = &Operand::new("a", Float);
let y = &Operand::new("y", Float);
Expand Down Expand Up @@ -302,6 +333,20 @@ pub(crate) fn define(
.operands_out(vec![a]),
);

let mask = &Operand::new("mask", uimm8).with_doc("mask to select lanes from b");
ig.push(
Inst::new(
"x86_pblendw",
r#"
Blend packed words using an immediate mask. Each bit of the 8-bit immediate corresponds to a
lane in ``b``: if the bit is set, the lane is copied into ``a``.
"#,
&formats.ternary_imm8,
)
.operands_in(vec![a, b, mask])
.operands_out(vec![a]),
);

let Idx = &Operand::new("Idx", uimm8).with_doc("Lane index");
let x = &Operand::new("x", TxN);
let a = &Operand::new("a", &TxN.lane_of());
Expand Down
5 changes: 4 additions & 1 deletion cranelift/codegen/meta/src/isa/x86/legalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,8 @@ fn define_simd(
let bxor = insts.by_name("bxor");
let extractlane = insts.by_name("extractlane");
let fcmp = insts.by_name("fcmp");
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
let fabs = insts.by_name("fabs");
let fneg = insts.by_name("fneg");
let iadd_imm = insts.by_name("iadd_imm");
Expand Down Expand Up @@ -787,7 +789,8 @@ fn define_simd(
narrow.custom_legalize(ineg, "convert_ineg");
narrow.custom_legalize(ushr, "convert_ushr");
narrow.custom_legalize(ishl, "convert_ishl");
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");

// This lives in the expand group to avoid conflicting with, e.g., i128 legalizations.
narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
}
2 changes: 2 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_32.legalize_type(F32, x86_expand);
x86_32.legalize_type(F64, x86_expand);
x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
x86_32.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);

x86_64.legalize_monomorphic(expand_flags);
x86_64.legalize_default(x86_narrow);
Expand All @@ -60,6 +61,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
x86_64.legalize_type(F32, x86_expand);
x86_64.legalize_type(F64, x86_expand);
x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
x86_64.legalize_value_type(VectorType::new(F32.into(), 4), x86_narrow_avx);

let recipes = recipes::define(shared_defs, &settings, &regs);

Expand Down
12 changes: 12 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/opcodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ pub static CVTSI2SS: [u8; 3] = [0xf3, 0x0f, 0x2a];
/// float-point value.
pub static CVTSS2SD: [u8; 3] = [0xf3, 0x0f, 0x5a];

/// Convert four packed single-precision floating-point values from xmm2/mem to four packed signed
/// doubleword values in xmm1 using truncation (SSE2).
pub static CVTTPS2DQ: [u8; 3] = [0xf3, 0x0f, 0x5b];

/// Convert with truncation scalar double-precision floating-point value to signed
/// integer.
pub static CVTTSD2SI: [u8; 3] = [0xf2, 0x0f, 0x2c];
Expand Down Expand Up @@ -347,6 +351,10 @@ pub static PAVGW: [u8; 3] = [0x66, 0x0f, 0xE3];
/// in XMM0 and store the values into xmm1 (SSE4.1).
pub static PBLENDVB: [u8; 4] = [0x66, 0x0f, 0x38, 0x10];

/// Select words from xmm1 and xmm2/m128 from mask specified in imm8 and store the values into xmm1
/// (SSE4.1).
pub static PBLENDW: [u8; 4] = [0x66, 0x0f, 0x3a, 0x0e];

/// Compare packed data for equal (SSE2).
pub static PCMPEQB: [u8; 3] = [0x66, 0x0f, 0x74];

Expand Down Expand Up @@ -665,6 +673,10 @@ pub static UCOMISS: [u8; 2] = [0x0f, 0x2e];
/// Raise invalid opcode instruction.
pub static UNDEFINED2: [u8; 2] = [0x0f, 0x0b];

/// Convert four packed unsigned doubleword integers from xmm2/m128/m32bcst to packed
/// single-precision floating-point values in xmm1 with writemask k1 (AVX512VL, AVX512F).
pub static VCVTUDQ2PS: [u8; 3] = [0xf2, 0x0f, 0x7a];

/// imm{16,32} XOR r/m{16,32,64}, possibly sign-extended.
pub static XOR_IMM: [u8; 1] = [0x81];

Expand Down
18 changes: 18 additions & 0 deletions cranelift/codegen/meta/src/isa/x86/recipes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3417,5 +3417,23 @@ pub(crate) fn define<'shared>(
regs).rex_kind(RecipePrefixKind::Evex)
);

recipes.add_template(
Template::new(
EncodingRecipeBuilder::new("evex_reg_rm_128", &formats.unary, 1)
.operands_in(vec![fpr])
.operands_out(vec![fpr])
.emit(
r#"
// instruction encoding operands: reg (op1, w), rm (op2, r)
// this maps to: out_reg0, in_reg0
let context = EvexContext::Other { length: EvexVectorLength::V128 };
let masking = EvexMasking::None;
put_evex(bits, out_reg0, 0, in_reg0, context, masking, sink); // params: reg, vvvv, rm
modrm_rr(in_reg0, out_reg0, sink); // params: rm, reg
"#,
),
regs).rex_kind(RecipePrefixKind::Evex)
);

recipes
}
26 changes: 25 additions & 1 deletion cranelift/codegen/meta/src/isa/x86/settings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,21 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
false,
);

settings.add_bool(
"assert_no_nans",
"If set, Cranelift will assume that floating-point operations will not produce \
NaNs; in certain cases, Cranelift can use this information to produce faster code.",
false,
);

settings.add_bool(
"assert_in_bounds",
"If set, Cranelift will assume that operations with bounds are in bounds (e.g. \
float/integers conversions, dynamic lane indices); in certain cases, Cranelift can use \
this information to produce faster code.",
false,
);

// CPUID.01H:ECX
let has_sse3 = settings.add_bool("has_sse3", "SSE3: CPUID.01H:ECX.SSE3[bit 0]", false);
let has_ssse3 = settings.add_bool("has_ssse3", "SSSE3: CPUID.01H:ECX.SSSE3[bit 9]", false);
Expand All @@ -23,7 +38,12 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
);
let has_avx512vl = settings.add_bool(
"has_avx512vl",
"AVX512DQ: CPUID.07H:EBX.AVX512VL[bit 31]",
"AVX512VL: CPUID.07H:EBX.AVX512VL[bit 31]",
false,
);
let has_avx512f = settings.add_bool(
"has_avx512f",
"AVX512F: CPUID.07H:EBX.AVX512F[bit 16]",
false,
);
let has_popcnt = settings.add_bool("has_popcnt", "POPCNT: CPUID.01H:ECX.POPCNT[bit 23]", false);
Expand Down Expand Up @@ -76,6 +96,10 @@ pub(crate) fn define(shared: &SettingGroup) -> SettingGroup {
"use_avx512vl_simd",
predicate!(shared_enable_simd && has_avx512vl),
);
settings.add_predicate(
"use_avx512f_simd",
predicate!(shared_enable_simd && has_avx512f),
);

settings.add_predicate("use_popcnt", predicate!(has_popcnt && has_sse42));
settings.add_predicate("use_bmi1", predicate!(has_bmi1));
Expand Down
2 changes: 2 additions & 0 deletions cranelift/codegen/src/isa/aarch64/lower_inst.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1896,6 +1896,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::X86Pop
| Opcode::X86Bsr
| Opcode::X86Bsf
| Opcode::X86Pblendw
| Opcode::X86Pshufd
| Opcode::X86Pshufb
| Opcode::X86Pextr
Expand All @@ -1916,6 +1917,7 @@ pub(crate) fn lower_insn_to_regs<C: LowerCtx<I = Inst>>(
| Opcode::X86Packss
| Opcode::X86Punpckh
| Opcode::X86Punpckl
| Opcode::X86Vcvtudq2ps
| Opcode::X86ElfTlsGetAddr
| Opcode::X86MachoTlsGetAddr => {
panic!("x86-specific opcode in supposedly arch-neutral IR!");
Expand Down
Loading