From 4ec16fa057774dcd0d82dce05f8160689f0fe050 Mon Sep 17 00:00:00 2001
From: whitequark
Date: Tue, 28 Apr 2020 01:32:02 +0000
Subject: [PATCH] Legalize 64 bit shifts on x86_32 using PSLLQ/PSRLQ.

Co-authored-by: iximeow
---
 .../codegen/meta/src/isa/x86/encodings.rs   |  7 ++-
 .../codegen/meta/src/isa/x86/legalize.rs    | 29 +++++++++++
 cranelift/codegen/src/isa/x86/enc_tables.rs | 47 +++++++++++++++++
 .../isa/x86/legalize-x86_32-shifts.clif     | 51 +++++++++++++++++++
 4 files changed, 133 insertions(+), 1 deletion(-)
 create mode 100644 cranelift/filetests/filetests/isa/x86/legalize-x86_32-shifts.clif

diff --git a/cranelift/codegen/meta/src/isa/x86/encodings.rs b/cranelift/codegen/meta/src/isa/x86/encodings.rs
index 937df6830ea8..7863e2bd85f8 100644
--- a/cranelift/codegen/meta/src/isa/x86/encodings.rs
+++ b/cranelift/codegen/meta/src/isa/x86/encodings.rs
@@ -1493,8 +1493,13 @@ fn define_alu(
     for &(inst, rrr) in &[(rotl, 0), (rotr, 1), (ishl, 4), (ushr, 5), (sshr, 7)] {
         // Cannot use enc_i32_i64 for this pattern because instructions require
         // to bind any.
+        e.enc32(inst.bind(I32).bind(I8), rec_rc.opcodes(&ROTATE_CL).rrr(rrr));
         e.enc32(
-            inst.bind(I32).bind(Any),
+            inst.bind(I32).bind(I16),
+            rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
+        );
+        e.enc32(
+            inst.bind(I32).bind(I32),
             rec_rc.opcodes(&ROTATE_CL).rrr(rrr),
         );
         e.enc64(
diff --git a/cranelift/codegen/meta/src/isa/x86/legalize.rs b/cranelift/codegen/meta/src/isa/x86/legalize.rs
index f38e4249bfa4..3b073c1fa643 100644
--- a/cranelift/codegen/meta/src/isa/x86/legalize.rs
+++ b/cranelift/codegen/meta/src/isa/x86/legalize.rs
@@ -37,6 +37,8 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     let imul = insts.by_name("imul");
     let ineg = insts.by_name("ineg");
     let isub = insts.by_name("isub");
+    let ishl = insts.by_name("ishl");
+    let ireduce = insts.by_name("ireduce");
     let popcnt = insts.by_name("popcnt");
     let sdiv = insts.by_name("sdiv");
     let selectif = insts.by_name("selectif");
@@ -45,6 +47,7 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct
     let tls_value = insts.by_name("tls_value");
     let udiv = insts.by_name("udiv");
     let umulhi = insts.by_name("umulhi");
+    let ushr = insts.by_name("ushr");
     let ushr_imm = insts.by_name("ushr_imm");
     let urem = insts.by_name("urem");

@@ -55,6 +58,32 @@ pub(crate) fn define(shared: &mut SharedDefinitions, x86_instructions: &Instruct

     let imm = &shared.imm;

+    // Shift by a 64-bit amount is equivalent to a shift by that amount mod 32, so we can reduce
+    // the size of the shift amount. This is useful for x86_32, where an I64 shift amount is
+    // not encodable.
+    let a = var("a");
+    let x = var("x");
+    let y = var("y");
+    let z = var("z");
+
+    for &ty in &[I8, I16, I32] {
+        let ishl_by_i64 = ishl.bind(ty).bind(I64);
+        let ireduce = ireduce.bind(I32);
+        group.legalize(
+            def!(a = ishl_by_i64(x, y)),
+            vec![def!(z = ireduce(y)), def!(a = ishl(x, z))],
+        );
+    }
+
+    for &ty in &[I8, I16, I32] {
+        let ushr_by_i64 = ushr.bind(ty).bind(I64);
+        let ireduce = ireduce.bind(I32);
+        group.legalize(
+            def!(a = ushr_by_i64(x, y)),
+            vec![def!(z = ireduce(y)), def!(a = ushr(x, z))],
+        );
+    }
+
     // Division and remainder.
     //
     // The srem expansion requires custom code because srem INT_MIN, -1 is not
diff --git a/cranelift/codegen/src/isa/x86/enc_tables.rs b/cranelift/codegen/src/isa/x86/enc_tables.rs
index 9b44568454c0..c00ca973575b 100644
--- a/cranelift/codegen/src/isa/x86/enc_tables.rs
+++ b/cranelift/codegen/src/isa/x86/enc_tables.rs
@@ -1318,6 +1318,39 @@ fn convert_ineg(
     }
 }

+fn expand_dword_to_xmm<'f>(
+    pos: &mut FuncCursor<'_>,
+    arg: ir::Value,
+    arg_type: ir::Type,
+) -> ir::Value {
+    if arg_type == I64 {
+        let (arg_lo, arg_hi) = pos.ins().isplit(arg);
+        let arg = pos.ins().scalar_to_vector(I32X4, arg_lo);
+        let arg = pos.ins().insertlane(arg, 1, arg_hi);
+        let arg = pos.ins().raw_bitcast(I64X2, arg);
+        arg
+    } else {
+        pos.ins().bitcast(I64X2, arg)
+    }
+}
+
+fn contract_dword_from_xmm<'f>(
+    pos: &mut FuncCursor<'f>,
+    inst: ir::Inst,
+    ret: ir::Value,
+    ret_type: ir::Type,
+) {
+    if ret_type == I64 {
+        let ret = pos.ins().raw_bitcast(I32X4, ret);
+        let ret_lo = pos.ins().extractlane(ret, 0);
+        let ret_hi = pos.ins().extractlane(ret, 1);
+        pos.func.dfg.replace(inst).iconcat(ret_lo, ret_hi);
+    } else {
+        let ret = pos.ins().extractlane(ret, 0);
+        pos.func.dfg.replace(inst).ireduce(ret_type, ret);
+    }
+}
+
 // Masks for i8x16 unsigned right shift.
 static USHR_MASKS: [u8; 128] = [
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -1379,7 +1412,14 @@ fn convert_ushr(
         } else if arg0_type.is_vector() {
             // x86 has encodings for these shifts.
             pos.func.dfg.replace(inst).x86_psrl(arg0, shift_index);
+        } else if arg0_type == I64 {
+            // 64 bit shifts need to be legalized on x86_32.
+            let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type);
+            let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type);
+            let shifted = pos.ins().x86_psrl(value, amount);
+            contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type);
         } else {
+            // Everything else should be already legal.
             unreachable!()
         }
     }
@@ -1446,7 +1486,14 @@ fn convert_ishl(
         } else if arg0_type.is_vector() {
             // x86 has encodings for these shifts.
             pos.func.dfg.replace(inst).x86_psll(arg0, shift_index);
+        } else if arg0_type == I64 {
+            // 64 bit shifts need to be legalized on x86_32.
+            let value = expand_dword_to_xmm(&mut pos, arg0, arg0_type);
+            let amount = expand_dword_to_xmm(&mut pos, arg1, arg1_type);
+            let shifted = pos.ins().x86_psll(value, amount);
+            contract_dword_from_xmm(&mut pos, inst, shifted, arg0_type);
         } else {
+            // Everything else should be already legal.
             unreachable!()
         }
     }
diff --git a/cranelift/filetests/filetests/isa/x86/legalize-x86_32-shifts.clif b/cranelift/filetests/filetests/isa/x86/legalize-x86_32-shifts.clif
new file mode 100644
index 000000000000..bbcbf7091de0
--- /dev/null
+++ b/cranelift/filetests/filetests/isa/x86/legalize-x86_32-shifts.clif
@@ -0,0 +1,51 @@
+test compile
+set enable_simd
+target i686 haswell
+
+function u0:1(i32) -> i64 system_v {
+block1(v0: i32):
+    v1 = load.i64 notrap aligned v0+0
+    v2 = load.i32 notrap aligned v0+16
+    v3 = ishl v1, v2
+    return v3
+}
+
+function u0:2(i32) -> i64 system_v {
+block1(v0: i32):
+    v1 = load.i64 notrap aligned v0+0
+    v2 = load.i64 notrap aligned v0+16
+    v3 = ishl v1, v2
+    return v3
+}
+
+function u0:3(i32) -> i32 system_v {
+block1(v0: i32):
+    v1 = load.i32 notrap aligned v0+0
+    v2 = load.i64 notrap aligned v0+16
+    v3 = ishl v1, v2
+    return v3
+}
+
+function u0:4(i32) -> i64 system_v {
+block1(v0: i32):
+    v1 = load.i64 notrap aligned v0+0
+    v2 = load.i32 notrap aligned v0+16
+    v3 = ushr v1, v2
+    return v3
+}
+
+function u0:5(i32) -> i64 system_v {
+block1(v0: i32):
+    v1 = load.i64 notrap aligned v0+0
+    v2 = load.i64 notrap aligned v0+16
+    v3 = ushr v1, v2
+    return v3
+}
+
+function u0:6(i32) -> i32 system_v {
+block1(v0: i32):
+    v1 = load.i32 notrap aligned v0+0
+    v2 = load.i64 notrap aligned v0+16
+    v3 = ushr v1, v2
+    return v3
+}
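The new rules in legalize.rs rest on a simple equivalence: truncating the shift amount from i64 to i32 cannot change the result, because a shift of an i8/i16/i32 value only looks at the amount modulo the width of the shifted value. A minimal Rust sketch of that equivalence, illustrative only and not part of the patch (the helper name is made up):

// Illustrative sketch: shows why the `ireduce` of an i64 shift amount down to
// i32 is safe for a 32-bit (or narrower) shifted value.
fn shl32_wide_amount(x: u32, amount: u64) -> u32 {
    // Truncating the amount (the `ireduce` step) keeps its value modulo 32,
    // which is all a 32-bit shift looks at.
    let narrow = amount as u32;
    // `wrapping_shl` masks the amount to 0..=31, mirroring the masking
    // semantics of the IR shift.
    x.wrapping_shl(narrow)
}

fn main() {
    // Same result whether the amount is truncated first or not.
    assert_eq!(shl32_wide_amount(0x1234_5678, 4), 0x1234_5678u32.wrapping_shl(4));
    assert_eq!(shl32_wide_amount(0x1234_5678, 0x1_0000_0004), 0x1234_5678u32.wrapping_shl(4));
}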
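The enc_tables.rs side handles the remaining case, shifting an i64 value, which does not fit in a single general-purpose register on x86_32: the value and the amount are moved into an XMM register (expand_dword_to_xmm), shifted with x86_psll/x86_psrl (encoded as PSLLQ/PSRLQ), and moved back out (contract_dword_from_xmm). The sketch below shows the same dataflow with SSE2 intrinsics; it is written for x86_64 only so it can be compiled and run as-is, and the function name is made up:

// Illustrative sketch (not part of the patch): a 64-bit logical left shift done
// entirely in an XMM register, the way the new legalization lowers it.
#[cfg(target_arch = "x86_64")]
fn shl64_via_psllq(value: u64, amount: u64) -> u64 {
    // SSE2 is part of the x86_64 baseline, so these intrinsics are always available here.
    use core::arch::x86_64::*;
    unsafe {
        // Move the value and the shift amount into XMM registers
        // (the role played by expand_dword_to_xmm).
        let v = _mm_set_epi64x(0, value as i64);
        let n = _mm_set_epi64x(0, amount as i64);
        // PSLLQ: shift the 64-bit lane left by the amount in the low quadword
        // of `n`; amounts of 64 or more produce zero.
        let shifted = _mm_sll_epi64(v, n);
        // Move the result back to a general-purpose register
        // (the role played by contract_dword_from_xmm).
        _mm_cvtsi128_si64(shifted) as u64
    }
}

#[cfg(target_arch = "x86_64")]
fn main() {
    assert_eq!(shl64_via_psllq(0x0000_0000_dead_beef, 32), 0xdead_beef_0000_0000);
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}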