Add x86 legalization for imul.i64x2 for non-AVX CPUs

The `convert_i64x2_imul` custom legalization checks the ISA flags for AVX512DQ or AVX512VL support and legalizes `imul.i64x2` to an `x86_pmullq` in this case; if not, it uses a lengthy SSE2-compatible instruction sequence.
bytecodealliance · May 27, 2020 · 6ab817e · 6ab817e
1 parent a1d2acf
commit 6ab817e
Show file tree

Hide file tree

Showing 6 changed files with 90 additions and 13 deletions.
diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -28,6 +28,17 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     .isa("x86")
     .chain_with(shared.transform_groups.by_name("narrow_flags").id);
 
+    let mut narrow_avx = TransformGroupBuilder::new(
+        "x86_narrow_avx",
+        r#"
+    Legalize instructions by narrowing with CPU feature checks.
+
+    This special case converts using x86 AVX instructions where available."#,
+    )
+    .isa("x86");
+    // We cannot chain with the x86_narrow group until that group has been built; see the bottom
+    // of this function for where the chaining happens.
+
     let mut widen = TransformGroupBuilder::new(
         "x86_widen",
         r#"
@@ -343,18 +354,21 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     widen.custom_legalize(ineg, "convert_ineg");
 
     // To reduce compilation times, separate out large blocks of legalizations by theme.
-    define_simd(shared, x86_instructions, &mut narrow, &mut expand);
+    define_simd(shared, x86_instructions, &mut narrow, &mut narrow_avx);
 
     expand.build_and_add_to(&mut shared.transform_groups);
-    narrow.build_and_add_to(&mut shared.transform_groups);
+    let narrow_id = narrow.build_and_add_to(&mut shared.transform_groups);
+    narrow_avx
+        .chain_with(narrow_id)
+        .build_and_add_to(&mut shared.transform_groups);
     widen.build_and_add_to(&mut shared.transform_groups);
 }
 
 fn define_simd(
     shared: &mut SharedDefinitions,
     x86_instructions: &InstructionGroup,
     narrow: &mut TransformGroupBuilder,
-    expand: &mut TransformGroupBuilder,
+    narrow_avx: &mut TransformGroupBuilder,
 ) {
     let insts = &shared.instructions;
     let band = insts.by_name("band");
@@ -755,18 +769,13 @@ fn define_simd(
         );
     }
 
-    // SIMD imul
-    {
-        let imul = imul.bind(vector(I64, sse_vector_size));
-        narrow.legalize(def!(c = imul(a, b)), vec![def!(c = x86_pmullq(a, b))]);
-    }
-
     narrow.custom_legalize(shuffle, "convert_shuffle");
     narrow.custom_legalize(extractlane, "convert_extractlane");
     narrow.custom_legalize(insertlane, "convert_insertlane");
     narrow.custom_legalize(ineg, "convert_ineg");
     narrow.custom_legalize(ushr, "convert_ushr");
     narrow.custom_legalize(ishl, "convert_ishl");
 
-    narrow.build_and_add_to(&mut shared.transform_groups);
+    // This lives in the separate narrow_avx group to avoid conflicting with, e.g., i128 legalizations.
+    narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
 }
diff --git a/cranelift/codegen/meta/src/isa/x86/mod.rs b/cranelift/codegen/meta/src/isa/x86/mod.rs
@@ -1,6 +1,6 @@
 use crate::cdsl::cpu_modes::CpuMode;
 use crate::cdsl::isa::TargetIsa;
-use crate::cdsl::types::ReferenceType;
+use crate::cdsl::types::{ReferenceType, VectorType};
 
 use crate::shared::types::Bool::B1;
 use crate::shared::types::Float::{F32, F64};
@@ -35,6 +35,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
     let expand_flags = shared_defs.transform_groups.by_name("expand_flags");
     let x86_widen = shared_defs.transform_groups.by_name("x86_widen");
     let x86_narrow = shared_defs.transform_groups.by_name("x86_narrow");
+    let x86_narrow_avx = shared_defs.transform_groups.by_name("x86_narrow_avx");
     let x86_expand = shared_defs.transform_groups.by_name("x86_expand");
 
     x86_32.legalize_monomorphic(expand_flags);
@@ -46,6 +47,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
     x86_32.legalize_value_type(ReferenceType(R32), x86_expand);
     x86_32.legalize_type(F32, x86_expand);
     x86_32.legalize_type(F64, x86_expand);
+    x86_32.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
 
     x86_64.legalize_monomorphic(expand_flags);
     x86_64.legalize_default(x86_narrow);
@@ -57,6 +59,7 @@ pub(crate) fn define(shared_defs: &mut SharedDefinitions) -> TargetIsa {
     x86_64.legalize_value_type(ReferenceType(R64), x86_expand);
     x86_64.legalize_type(F32, x86_expand);
     x86_64.legalize_type(F64, x86_expand);
+    x86_64.legalize_value_type(VectorType::new(I64.into(), 2), x86_narrow_avx);
 
     let recipes = recipes::define(shared_defs, &settings, &regs);
 

diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -1499,6 +1499,47 @@ fn convert_ishl(
     }
 }
 
+/// Legalize `imul.i64x2` for x86: emit one `x86_pmullq` if the AVX512DQ or AVX512VL flag is set, else an SSE2-compatible sequence.
+fn convert_i64x2_imul(
+    inst: ir::Inst,
+    func: &mut ir::Function,
+    _cfg: &mut ControlFlowGraph, // not used by this legalization
+    isa: &dyn TargetIsa,
+) {
+    let mut pos = FuncCursor::new(func).at_inst(inst);
+    pos.use_srcloc(inst); // new instructions reuse the source location of `inst`
+
+    if let ir::InstructionData::Binary {
+        opcode: ir::Opcode::Imul,
+        args: [arg0, arg1],
+    } = pos.func.dfg[inst]
+    {
+        let ty = pos.func.dfg.ctrl_typevar(inst);
+        if ty == I64X2 {
+            let x86_isa = isa
+                .as_any()
+                .downcast_ref::<isa::x86::Isa>()
+                .expect("the target ISA must be x86 at this point");
+            if x86_isa.isa_flags.use_avx512dq_simd() || x86_isa.isa_flags.use_avx512vl_simd() {
+                // NOTE(review): Intel lists 128-bit VPMULLQ as needing AVX512DQ *and* AVX512VL; confirm `||` is intended.
+                pos.func.dfg.replace(inst).x86_pmullq(arg0, arg1); // single-instruction lowering
+            } else {
+                // No suitable AVX512 flag: build the product from three x86_pmuludq multiplies (presumably 32x32->64 per lane).
+                let high0 = pos.ins().ushr_imm(arg0, 32); // arg0 lanes shifted right by 32
+                let mul0 = pos.ins().x86_pmuludq(high0, arg1); // first cross term
+                let high1 = pos.ins().ushr_imm(arg1, 32); // arg1 lanes shifted right by 32
+                let mul1 = pos.ins().x86_pmuludq(high1, arg0); // second cross term
+                let addhigh = pos.ins().iadd(mul0, mul1); // sum of the two cross terms
+                let high = pos.ins().ishl_imm(addhigh, 32); // shift the sum left by 32 within each lane
+                let low = pos.ins().x86_pmuludq(arg0, arg1); // multiply of the unshifted operands
+                pos.func.dfg.replace(inst).iadd(low, high); // `inst` itself becomes the final add
+            }
+        } else {
+            // Other imul types are left unchanged here; they should be encodable as-is.
+        }
+    }
+}
+
 fn expand_tls_value(
     inst: ir::Inst,
     func: &mut ir::Function,

diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-legalize.clif
@@ -70,9 +70,16 @@ block0:
     return
 }
 
-function %imul(i64x2, i64x2) {
+function %imul_i64x2(i64x2, i64x2) {
 block0(v0:i64x2, v1:i64x2):
     v2 = imul v0, v1
-    ; check: v2 = x86_pmullq v0, v1
+    ; check: v3 = ushr_imm v0, 32
+    ; nextln: v4 = x86_pmuludq v3, v1
+    ; nextln: v5 = ushr_imm v1, 32
+    ; nextln: v6 = x86_pmuludq v5, v0
+    ; nextln: v7 = iadd v4, v6
+    ; nextln: v8 = ishl_imm v7, 32
+    ; nextln: v9 = x86_pmuludq v0, v1
+    ; nextln: v2 = iadd v9, v8
     return
 }
diff --git a/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif b/cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif
@@ -49,6 +49,13 @@ block0:
 }
 ; run
 
+function %imul_i64x2(i64x2, i64x2) -> i64x2 {
+block0(v0: i64x2, v1: i64x2):
+    v2 = imul v0, v1
+    return v2
+}
+; run: %imul_i64x2([0 2], [0 2]) == [0 4]
+
 function %imul_i32x4() -> b1 {
 block0:
     v0 = vconst.i32x4 [-1 0 1 0x80_00_00_01]

diff --git a/cranelift/filetests/filetests/isa/x86/simd-avx512-arithmetic-legalize.clif b/cranelift/filetests/filetests/isa/x86/simd-avx512-arithmetic-legalize.clif
@@ -0,0 +1,10 @@
+test legalizer
+set enable_simd
+target x86_64 skylake has_avx512dq=true
+
+function %imul_i64x2(i64x2, i64x2) {
+block0(v0:i64x2, v1:i64x2):
+    v2 = imul v0, v1
+    ; check: v2 = x86_pmullq v0, v1
+    return
+}