diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs index 5d398f8e0df4..8e351a20e493 100644 --- a/cranelift/codegen/meta/src/isa/x86/legalize.rs +++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs @@ -28,6 +28,17 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct .isa("x86") .chain_with(shared.transform_groups.by_name("narrow_flags").id); + let mut narrow_avx = TransformGroupBuilder::new( + "x86_narrow_avx", + r#" + Legalize instructions by narrowing with CPU feature checks. + + This special case converts using x86 AVX instructions where available."#, + ) + .isa("x86"); + // We cannot chain with the x86_narrow group until this group is built, see bottom of this + // function for where this is chained. + let mut widen = TransformGroupBuilder::new( "x86_widen", r#" @@ -343,10 +354,13 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct widen.custom_legalize(ineg, "convert_ineg"); // To reduce compilation times, separate out large blocks of legalizations by theme. 
- define_simd(shared, x86_instructions, &mut narrow, &mut expand); + define_simd(shared, x86_instructions, &mut narrow, &mut narrow_avx); expand.build_and_add_to(&mut shared.transform_groups); - narrow.build_and_add_to(&mut shared.transform_groups); + let narrow_id = narrow.build_and_add_to(&mut shared.transform_groups); + narrow_avx + .chain_with(narrow_id) + .build_and_add_to(&mut shared.transform_groups); widen.build_and_add_to(&mut shared.transform_groups); } @@ -354,7 +368,7 @@ fn define_simd( shared: &mut SharedDefinitions, x86_instructions: &InstructionGroup, narrow: &mut TransformGroupBuilder, - expand: &mut TransformGroupBuilder, + narrow_avx: &mut TransformGroupBuilder, ) { let insts = &shared.instructions; let band = insts.by_name("band"); @@ -755,12 +769,6 @@ fn define_simd( ); } - // SIMD imul - { - let imul = imul.bind(vector(I64, sse_vector_size)); - narrow.legalize(def!(c = imul(a, b)), vec![def!(c = x86_pmullq(a, b))]); - } - narrow.custom_legalize(shuffle, "convert_shuffle"); narrow.custom_legalize(extractlane, "convert_extractlane"); narrow.custom_legalize(insertlane, "convert_insertlane"); @@ -768,5 +776,6 @@ fn define_simd( narrow.custom_legalize(ushr, "convert_ushr"); narrow.custom_legalize(ishl, "convert_ishl"); - narrow.build_and_add_to(&mut shared.transform_groups); + // This lives in the narrow_avx group to avoid conflicting with, e.g., the i128 legalizations in the narrow group. 
+ narrow_avx.custom_legalize(imul, "convert_i64x2_imul"); } diff --git a/cranelift/codegen/meta/src/isa/x86/mod.rs b/cranelift/codegen/meta/src/isa/x86/mod.rs index 3b4848b16626..2e9305e9f730 100644 --- a/cranelift/codegen/meta/src/isa/x86/mod.rs +++ b/cranelift/codegen/meta/src/isa/x86/mod.rs @@ -1,6 +1,6 @@ use crate::cdsl::cpu_modes::CpuMode; use crate::cdsl::isa::TargetIsa; -use crate::cdsl::types::ReferenceType; +use crate::cdsl::types::{ReferenceType, VectorType}; use crate::shared::types::Bool::B1; use crate::shared::types::Float::{F32, F64}; @@ -35,6 +35,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { let expand_flags = shared_defs.transform_groups.by_name("expand_flags"); let x86_widen = shared_defs.transform_groups.by_name("x86_widen"); let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow"); + let x86_narrow_avx = shared_defs.transform_groups.by_name("x86_narrow_avx"); let x86_expand = shared_defs.transform_groups.by_name("x86_expand"); x86_32.legalize_monomorphic(expand_flags); @@ -46,6 +47,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { x86_32.legalize_value_type(ReferenceType(R32), x86_expand); x86_32.legalize_type(F32, x86_expand); x86_32.legalize_type(F64, x86_expand); + x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx); x86_64.legalize_monomorphic(expand_flags); x86_64.legalize_default(x86_narrow); @@ -57,6 +59,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa { x86_64.legalize_value_type(ReferenceType(R64), x86_expand); x86_64.legalize_type(F32, x86_expand); x86_64.legalize_type(F64, x86_expand); + x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx); let recipes = recipes::define(shared_defs, &settings, ®s); diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs index c00ca973575b..728b4fe15bc9 100644 --- a/cranelift/codegen/src/isa/x86/enc_tables.rs +++ 
b/cranelift/codegen/src/isa/x86/enc_tables.rs @@ -1499,6 +1499,47 @@ fn convert_ishl( } } +/// Convert an imul.i64x2 to a valid code sequence on x86, first with AVX512 and then with SSE2. +fn convert_i64x2_imul( + inst: ir::Inst, + func: &mut ir::Function, + _cfg: &mut ControlFlowGraph, + isa: &dyn TargetIsa, +) { + let mut pos = FuncCursor::new(func).at_inst(inst); + pos.use_srcloc(inst); + + if let ir::InstructionData::Binary { + opcode: ir::Opcode::Imul, + args: [arg0, arg1], + } = pos.func.dfg[inst] + { + let ty = pos.func.dfg.ctrl_typevar(inst); + if ty == I64X2 { + let x86_isa = isa + .as_any() + .downcast_ref::<isa::x86::Isa>() + .expect("the target ISA must be x86 at this point"); + if x86_isa.isa_flags.use_avx512dq_simd() || x86_isa.isa_flags.use_avx512vl_simd() { + // If we have certain AVX512 features, we can lower this instruction simply. + pos.func.dfg.replace(inst).x86_pmullq(arg0, arg1); + } else { + // Otherwise, we default to a very lengthy SSE2-compatible sequence. + let high0 = pos.ins().ushr_imm(arg0, 32); + let mul0 = pos.ins().x86_pmuludq(high0, arg1); + let high1 = pos.ins().ushr_imm(arg1, 32); + let mul1 = pos.ins().x86_pmuludq(high1, arg0); + let addhigh = pos.ins().iadd(mul0, mul1); + let high = pos.ins().ishl_imm(addhigh, 32); + let low = pos.ins().x86_pmuludq(arg0, arg1); + pos.func.dfg.replace(inst).iadd(low, high); + } + } else { + // Other imul variants should be encodable. 
+ } + } +} + fn expand_tls_value( inst: ir::Inst, func: &mut ir::Function, diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-legalize.clif index f9984cdd9c58..39814b37bbe2 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-legalize.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-legalize.clif @@ -70,9 +70,16 @@ block0: return } -function %imul(i64x2, i64x2) { +function %imul_i64x2(i64x2, i64x2) { block0(v0:i64x2, v1:i64x2): v2 = imul v0, v1 - ; check: v2 = x86_pmullq v0, v1 + ; check: v3 = ushr_imm v0, 32 + ; nextln: v4 = x86_pmuludq v3, v1 + ; nextln: v5 = ushr_imm v1, 32 + ; nextln: v6 = x86_pmuludq v5, v0 + ; nextln: v7 = iadd v4, v6 + ; nextln: v8 = ishl_imm v7, 32 + ; nextln: v9 = x86_pmuludq v0, v1 + ; nextln: v2 = iadd v9, v8 return } diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif index 30ce4f710399..103a0785471c 100644 --- a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif +++ b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif @@ -49,6 +49,13 @@ block0: } ; run +function %imul_i64x2(i64x2, i64x2) -> i64x2 { +block0(v0: i64x2, v1: i64x2): + v2 = imul v0, v1 + return v2 +} +; run: %imul_i64x2([0 2], [0 2]) == [0 4] + function %imul_i32x4() -> b1 { block0: v0 = vconst.i32x4 [-1 0 1 0x80_00_00_01] diff --git a/cranelift/filetests/filetests/isa/x86/simd-avx512-arithmetic-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-avx512-arithmetic-legalize.clif new file mode 100644 index 000000000000..0c7dafcf0223 --- /dev/null +++ b/cranelift/filetests/filetests/isa/x86/simd-avx512-arithmetic-legalize.clif @@ -0,0 +1,10 @@ +test legalizer +set enable_simd +target x86_64 skylake has_avx512dq=true + +function %imul_i64x2(i64x2, i64x2) { +block0(v0:i64x2, v1:i64x2): + v2 = imul v0, v1 + ; check: v2 = x86_pmullq v0, v1 + return 
+}