Skip to content

Commit

Permalink
x64 backend: implement 128-bit ops and misc fixes.
Browse files Browse the repository at this point in the history
This implements all of the ops on I128 that are implemented by the
legacy x86 backend, and includes all that are required by at least one
major use-case (cg_clif rustc backend).

The sequences are open-coded where necessary; for e.g. the bit
operations, this can be somewhat complex, but these sequences have been
tested carefully. This PR also includes a drive-by fix of clz/ctz for 8-
and 16-bit cases where they were incorrect previously.

Also includes ridealong fixes developed while bringing up cg_clif
support, because they are difficult to completely separate due to
other refactors that occurred in this PR:

- fix REX prefix logic for some 8-bit instructions.

  When using an 8-bit register in 64-bit mode on x86-64, the REX prefix
  semantics are somewhat subtle: without the REX prefix, register numbers
  4--7 correspond to the second-to-lowest byte of the first four registers
  (AH, CH, DH, BH), whereas with the REX prefix, these register numbers
  correspond to the usual encoding (SPL, BPL, SIL, DIL). We could always
  emit a REX byte for instructions with 8-bit cases (this is harmless even
  if unneeded), but this would unnecessarily inflate code size; instead,
  the usual approach is to emit it only for these registers.

  This logic was present in some cases but missing for some other
  instructions: divide, not, negate, shifts.

  Fixes #2508.

- avoid unaligned SSE loads on some f64 ops.

  The implementations of several FP ops, such as fabs/fneg, used SSE
  instructions. This is not a problem per se, except that load-op merging
  did not take *alignment* into account. Specifically, if an op on an f64
  loaded from memory happened to merge that load, and the instruction into
  which it was merged was an SSE instruction, then the SSE instruction
  imposes stricter (128-bit) alignment requirements than the load.f64 did.

  This PR simply forces any instruction lowerings that could use SSE
  instructions to implement non-SIMD operations to take inputs in
  registers only, and avoid load-op merging.

  Fixes #2507.

- two bugfixes exposed by cg_clif: urem/srem.i8, select.b1.

  - urem/srem.i8: the 8-bit form of the DIV instruction on x86-64 places
    the remainder in AH, not RDX, different from all the other width-forms
    of this instruction.

  - select.b1: we were not recognizing selects of boolean values as
    integer-typed operations, so we were generating XMM moves instead (!).
  • Loading branch information
cfallin committed Jan 4, 2021
1 parent 616811b commit 8b38e20
Show file tree
Hide file tree
Showing 12 changed files with 3,273 additions and 652 deletions.
103 changes: 49 additions & 54 deletions cranelift/codegen/src/isa/x64/abi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,42 +138,62 @@ impl ABIMachineSpec for X64ABIMachineSpec {
),
}

let intreg = in_int_reg(param.value_type);
let vecreg = in_vec_reg(param.value_type);
debug_assert!(intreg || vecreg);
debug_assert!(!(intreg && vecreg));

let (next_reg, candidate) = if intreg {
let candidate = match args_or_rets {
ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr),
ArgsOrRets::Rets => get_intreg_for_retval_systemv(&call_conv, next_gpr, i),
};
debug_assert!(candidate
.map(|r| r.get_class() == RegClass::I64)
.unwrap_or(true));
(&mut next_gpr, candidate)
} else {
let candidate = match args_or_rets {
ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg),
ArgsOrRets::Rets => get_fltreg_for_retval_systemv(&call_conv, next_vreg, i),
};
debug_assert!(candidate
.map(|r| r.get_class() == RegClass::V128)
.unwrap_or(true));
(&mut next_vreg, candidate)
};

if let Some(param) = try_fill_baldrdash_reg(call_conv, param) {
assert!(intreg);
ret.push(param);
} else if let Some(reg) = candidate {
continue;
}

// Find regclass(es) of the register(s) used to store a value of this type.
let (rcs, _) = Inst::rc_for_type(param.value_type)?;
let intreg = rcs[0] == RegClass::I64;
let num_regs = rcs.len();
assert!(num_regs <= 2);
if num_regs == 2 {
assert_eq!(rcs[0], rcs[1]);
}

let mut regs: SmallVec<[RealReg; 2]> = smallvec![];
for j in 0..num_regs {
let nextreg = if intreg {
match args_or_rets {
ArgsOrRets::Args => get_intreg_for_arg_systemv(&call_conv, next_gpr + j),
ArgsOrRets::Rets => {
get_intreg_for_retval_systemv(&call_conv, next_gpr + j, i + j)
}
}
} else {
match args_or_rets {
ArgsOrRets::Args => get_fltreg_for_arg_systemv(&call_conv, next_vreg + j),
ArgsOrRets::Rets => {
get_fltreg_for_retval_systemv(&call_conv, next_vreg + j, i + j)
}
}
};
if let Some(reg) = nextreg {
regs.push(reg.to_real_reg());
} else {
regs.clear();
break;
}
}

if regs.len() > 0 {
let regs = match num_regs {
1 => ValueRegs::one(regs[0]),
2 => ValueRegs::two(regs[0], regs[1]),
_ => panic!("More than two registers unexpected"),
};
ret.push(ABIArg::Reg(
ValueRegs::one(reg.to_real_reg()),
regs,
param.value_type,
param.extension,
param.purpose,
));
*next_reg += 1;
if intreg {
next_gpr += num_regs;
} else {
next_vreg += num_regs;
}
} else {
// Compute size. Every arg takes a minimum slot of 8 bytes. (16-byte
// stack alignment happens separately after all args.)
Expand Down Expand Up @@ -658,31 +678,6 @@ impl From<StackAMode> for SyntheticAmode {
}
}

/// Returns `true` when `ty` is one of the scalar integer, boolean, or 64-bit
/// reference types listed below, and `false` for every other type.
///
/// # Panics
///
/// Panics if `ty` is `types::R32`.
fn in_int_reg(ty: types::Type) -> bool {
    match ty {
        // Reject 32-bit references up front.
        types::R32 => panic!("unexpected 32-bits refs on x64!"),
        // Integers.
        types::I8 | types::I16 | types::I32 | types::I64 => true,
        // Booleans.
        types::B1 | types::B8 | types::B16 | types::B32 | types::B64 => true,
        // 64-bit references.
        types::R64 => true,
        _ => false,
    }
}

/// Returns `true` when `ty` is `F32`, `F64`, or any type for which
/// `is_vector()` holds; `false` otherwise.
fn in_vec_reg(ty: types::Type) -> bool {
    let is_scalar_float = ty == types::F32 || ty == types::F64;
    is_scalar_float || ty.is_vector()
}

fn get_intreg_for_arg_systemv(call_conv: &CallConv, idx: usize) -> Option<Reg> {
match call_conv {
CallConv::Fast
Expand Down
26 changes: 24 additions & 2 deletions cranelift/codegen/src/isa/x64/inst/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use std::string::String;

/// A possible addressing mode (amode) that can be used in instructions.
/// These denote a 64-bit value only.
#[derive(Clone, Debug)]
#[derive(Clone, Copy, Debug)]
pub enum Amode {
/// Immediate sign-extended and a Register.
ImmReg {
Expand Down Expand Up @@ -346,23 +346,35 @@ impl PrettyPrintSized for RegMem {
/// Opcode selector for an ALU operation. The `Debug` impl in this file renders
/// each variant as a lowercase mnemonic (for example `Add` as "add" and `Mul`
/// as "imul").
// NOTE(review): the `RmiR` in the name presumably stands for a reg/mem/imm
// source with a register destination -- confirm against the instruction
// definitions that use this opcode.
#[derive(Copy, Clone, PartialEq)]
pub enum AluRmiROpcode {
/// Printed as "add".
Add,
/// Printed as "adc" (the x86 add-with-carry mnemonic).
Adc,
/// Printed as "sub".
Sub,
/// Printed as "sbb" (the x86 subtract-with-borrow mnemonic).
Sbb,
/// Printed as "and".
And,
/// Printed as "or".
Or,
/// Printed as "xor".
Xor,
/// The signless, non-extending (N x N -> N, for N in {32,64}) variant.
/// Printed as "imul".
Mul,
/// 8-bit form of And. Handled separately as we don't have full 8-bit op
/// support (we just use wider instructions). Used only with some sequences
/// with SETcc. Printed as "and", the same as `And`.
And8,
/// 8-bit form of Or. Printed as "or", the same as `Or`.
Or8,
}

impl fmt::Debug for AluRmiROpcode {
/// Writes the lowercase mnemonic for this opcode. The 8-bit variants share
/// the mnemonic of their wider counterparts.
fn fmt(&self, fmt: &mut fmt::Formatter) -> fmt::Result {
    fmt.write_str(match self {
        AluRmiROpcode::Add => "add",
        AluRmiROpcode::Adc => "adc",
        AluRmiROpcode::Sub => "sub",
        AluRmiROpcode::Sbb => "sbb",
        AluRmiROpcode::And | AluRmiROpcode::And8 => "and",
        AluRmiROpcode::Or | AluRmiROpcode::Or8 => "or",
        AluRmiROpcode::Xor => "xor",
        AluRmiROpcode::Mul => "imul",
    })
}
Expand All @@ -374,6 +386,16 @@ impl fmt::Display for AluRmiROpcode {
}
}

impl AluRmiROpcode {
    /// Returns `true` only for the special-cased 8-bit ALU ops (`And8` and
    /// `Or8`); every other opcode yields `false`.
    pub fn is_8bit(self) -> bool {
        matches!(self, AluRmiROpcode::And8 | AluRmiROpcode::Or8)
    }
}

#[derive(Clone, PartialEq)]
pub enum UnaryRmROpcode {
/// Bit-scan reverse.
Expand Down Expand Up @@ -1002,7 +1024,7 @@ impl fmt::Display for ExtMode {
}

/// These indicate the form of a scalar shift/rotate: left, signed right, unsigned right.
#[derive(Clone)]
#[derive(Clone, Copy)]
pub enum ShiftKind {
ShiftLeft,
/// Inserts zeros in the most significant bits.
Expand Down
Loading

0 comments on commit 8b38e20

Please sign in to comment.