From 4ec16fa057774dcd0d82dce05f8160689f0fe050 Mon Sep 17 00:00:00 2001
From: whitequark
Date: Tue, 28 Apr 2020 01:32:02 +0000
Subject: [PATCH] Legalize 64 bit shifts on x86_32 using PSLLQ/PSRLQ.

Co-authored-by: iximeow
---
 .../codegen/meta/src/isa/x86/encodings.rs   |  7 ++-
 .../codegen/meta/src/isa/x86/legalize.rs    | 29 +++++++++++
 cranelift/codegen/src/isa/x86/enc_tables.rs | 47 +++++++++++++++++
 .../isa/x86/legalize-x86_32-shifts.clif     | 51 +++++++++++++++++++
 4 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 cranelift/filetests/filetests/isa/x86/legalize-x86_32-shifts.clif

diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 937df6830ea8..7863e2bd85f8 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1493,8 +1493,13 @@ fn define_alu(
     for &(inst, rrr) in &[(rotl, 0), (rotr, 1), (ishl, 4), (ushr, 5), (sshr, 7)] {
         // Cannot use enc_i32_i64 for this pattern because instructions require
         // to bind any.
+        e.enc32(inst.bind(I32).bind(I8), rec_rc.opcodes(&ROTATE_CL).rrr(rrr));
         e.enc32(
-            inst.bind(I32).bind(Any),
+            inst.bind(I32).bind(I16),
+            rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
+        );
+        e.enc32(
+            inst.bind(I32).bind(I32),
             rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
         );
         e.enc64(
diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index f38e4249bfa4..3b073c1fa643 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -37,6 +37,8 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     let imul = insts.by_name("imul");
     let ineg = insts.by_name("ineg");
     let isub = insts.by_name("isub");
+    let ishl = insts.by_name("ishl");
+    let ireduce = insts.by_name("ireduce");
     let popcnt = insts.by_name("popcnt");
     let sdiv = insts.by_name("sdiv");
     let selectif = insts.by_name("selectif");
@@ -45,6 +47,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     let tls_value = insts.by_name("tls_value");
     let udiv = insts.by_name("udiv");
     let umulhi = insts.by_name("umulhi");
+    let ushr = insts.by_name("ushr");
     let ushr_imm = insts.by_name("ushr_imm");
     let urem = insts.by_name("urem");

@@ -55,6 +58,32 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct

     let imm = &shared.imm;

+    // Shift by a 64-bit amount is equivalent to a shift by that amount mod 32, so we can reduce
+    // the size of the shift amount. This is useful for x86_32, where an I64 shift amount is
+    // not encodable.
+    let a = var("a");
+    let x = var("x");
+    let y = var("y");
+    let z = var("z");
+
+    for &ty in &[I8, I16, I32] {
+        let ishl_by_i64 = ishl.bind(ty).bind(I64);
+        let ireduce = ireduce.bind(I32);
+        group.legalize(
+            def!(a = ishl_by_i64(x, y)),
+            vec![def!(z = ireduce(y)), def!(a = ishl(x, z))],
+        );
+    }
+
+    for &ty in &[I8, I16, I32] {
+        let ushr_by_i64 = ushr.bind(ty).bind(I64);
+        let ireduce = ireduce.bind(I32);
+        group.legalize(
+            def!(a = ushr_by_i64(x, y)),
+            vec![def!(z = ireduce(y)), def!(a = ushr(x, z))],
+        );
+    }
+
     // Division and remainder.
     //
     // The srem expansion requires custom code because srem INT_MIN, -1 is not
diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
index 9b44568454c0..c00ca973575b 100644
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -1318,6 +1318,39 @@ fn convert_ineg(
     }
 }

+fn expand_dword_to_xmm<'f>(
+    pos: &mut FuncCursor<'_>,
+    arg: ir::Value,
+    arg_type: ir::Type,
+) -> ir::Value {
+    if arg_type == I64 {
+        let (arg_lo, arg_hi) = pos.ins().isplit(arg);
+        let arg = pos.ins().scalar_to_vector(I32X4, arg_lo);
+        let arg = pos.ins().insertlane(arg, 1, arg_hi);
+        let arg = pos.ins().raw_bitcast(I64X2, arg);
+        arg
+    } else {
+        pos.ins().bitcast(I64X2, arg)
+    }
+}
+
+fn contract_dword_from_xmm<'f>(
+    pos: &mut FuncCursor<'f>,
+    inst: ir::Inst,
+    ret: ir::Value,
+    ret_type: ir::Type,
+) {
+    if ret_type == I64 {
+        let ret = pos.ins().raw_bitcast(I32X4, ret);
+        let ret_lo = pos.ins().extractlane(ret, 0);
+        let ret_hi = pos.ins().extractlane(ret, 1);
+        pos.func.dfg.replace(inst).iconcat(ret_lo, ret_hi);
+    } else {
+        let ret = pos.ins().extractlane(ret, 0);
+        pos.func.dfg.replace(inst).ireduce(ret_type, ret);
+    }
+}
+
 // Masks for i8x16 unsigned right shift.
 static USHR_MASKS: [u8; 128] = [
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -1379,7 +1412,14 @@ fn convert_ushr(
         } else if arg0_type.is_vector() {
             // x86 has encodings for these shifts.
             pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index);
+        } else if arg0_type == I64 {
+            // 64 bit shifts need to be legalized on x86_32.
+            let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type);
+            let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type);
+            let shifted = pos.ins().x86_psrl(value, amount);
+            contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type);
         } else {
+            // Everything else should be already legal.
             unreachable!()
         }
     }
@@ -1446,7 +1486,14 @@ fn convert_ishl(
         } else if arg0_type.is_vector() {
             // x86 has encodings for these shifts.
             pos.func.dfg.replace(inst).x86_psll(arg0, shift_index);
+        } else if arg0_type == I64 {
+            // 64 bit shifts need to be legalized on x86_32.
+            let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type);
+            let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type);
+            let shifted = pos.ins().x86_psll(value, amount);
+            contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type);
         } else {
+            // Everything else should be already legal.
             unreachable!()
         }
     }
diff --git a/cranelift/filetests/filetests/isa/x86/legalize-x86_32-shifts.clif b/cranelift/filetests/filetests/isa/x86/legalize-x86_32-shifts.clif
new file mode 100644
index 000000000000..bbcbf7091de0
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/legalize-x86_32-shifts.clif
@@ -0,0 +1,51 @@
+test compile
+set enable_simd
+target i686 haswell
+
+function u0:1(i32) -> i64 system_v {
+block1(v0: i32):
+    v1 = load.i64 notrap aligned v0+0
+    v2 = load.i32 notrap aligned v0+16
+    v3 = ishl v1, v2
+    return v3
+}
+
+function u0:2(i32) -> i64 system_v {
+block1(v0: i32):
+    v1 = load.i64 notrap aligned v0+0
+    v2 = load.i64 notrap aligned v0+16
+    v3 = ishl v1, v2
+    return v3
+}
+
+function u0:3(i32) -> i32 system_v {
+block1(v0: i32):
+    v1 = load.i32 notrap aligned v0+0
+    v2 = load.i64 notrap aligned v0+16
+    v3 = ishl v1, v2
+    return v3
+}
+
+function u0:4(i32) -> i64 system_v {
+block1(v0: i32):
+    v1 = load.i64 notrap aligned v0+0
+    v2 = load.i32 notrap aligned v0+16
+    v3 = ushr v1, v2
+    return v3
+}
+
+function u0:5(i32) -> i64 system_v {
+block1(v0: i32):
+    v1 = load.i64 notrap aligned v0+0
+    v2 = load.i64 notrap aligned v0+16
+    v3 = ushr v1, v2
+    return v3
+}
+
+function u0:6(i32) -> i32 system_v {
+block1(v0: i32):
+    v1 = load.i32 notrap aligned v0+0
+    v2 = load.i64 notrap aligned v0+16
+    v3 = ushr v1, v2
+    return v3
+}
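The new rules in legalize.rs rest on a simple equivalence: truncating the shift amount from i64 to i32 cannot change the result, because a shift of an i8/i16/i32 value only looks at the amount modulo the width of the shifted value. A minimal Rust sketch of that equivalence, illustrative only and not part of the patch (the helper name is made up):

// Illustrative sketch: shows why the `ireduce` of an i64 shift amount down to
// i32 is safe for a 32-bit (or narrower) shifted value.
fn shl32_wide_amount(x: u32, amount: u64) -> u32 {
    // Truncating the amount (the `ireduce` step) keeps its value modulo 32,
    // which is all a 32-bit shift looks at.
    let narrow = amount as u32;
    // `wrapping_shl` masks the amount to 0..=31, mirroring the masking
    // semantics of the IR shift.
    x.wrapping_shl(narrow)
}

fn main() {
    // Same result whether the amount is truncated first or not.
    assert_eq!(shl32_wide_amount(0x1234_5678, 4), 0x1234_5678u32.wrapping_shl(4));
    assert_eq!(shl32_wide_amount(0x1234_5678, 0x1_0000_0004), 0x1234_5678u32.wrapping_shl(4));
}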
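The enc_tables.rs side handles the remaining case, shifting an i64 value, which does not fit in a single general-purpose register on x86_32: the value and the amount are moved into an XMM register (expand_dword_to_xmm), shifted with x86_psll/x86_psrl (encoded as PSLLQ/PSRLQ), and moved back out (contract_dword_from_xmm). The sketch below shows the same dataflow with SSE2 intrinsics; it is written for x86_64 only so it can be compiled and run as-is, and the function name is made up:

// Illustrative sketch (not part of the patch): a 64-bit logical left shift done
// entirely in an XMM register, the way the new legalization lowers it.
#[cfg(target_arch = "x86_64")]
fn shl64_via_psllq(value: u64, amount: u64) -> u64 {
    // SSE2 is part of the x86_64 baseline, so these intrinsics are always available here.
    use core::arch::x86_64::*;
    unsafe {
        // Move the value and the shift amount into XMM registers
        // (the role played by expand_dword_to_xmm).
        let v = _mm_set_epi64x(0, value as i64);
        let n = _mm_set_epi64x(0, amount as i64);
        // PSLLQ: shift the 64-bit lane left by the amount in the low quadword
        // of `n`; amounts of 64 or more produce zero.
        let shifted = _mm_sll_epi64(v, n);
        // Move the result back to a general-purpose register
        // (the role played by contract_dword_from_xmm).
        _mm_cvtsi128_si64(shifted) as u64
    }
}

#[cfg(target_arch = "x86_64")]
fn main() {
    assert_eq!(shl64_via_psllq(0x0000_0000_dead_beef, 32), 0xdead_beef_0000_0000);
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}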