Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Legalize fmin/fmax with NaN quieting semantics #1905

Merged
merged 2 commits into from
Jun 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -197,9 +197,6 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool {
("simd", _) if target.contains("aarch64") => return true,

("simd", "simd_conversions") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S
("simd", "simd_f32x4") => return true, // FIXME expected V128(F32x4([CanonicalNan, CanonicalNan, Value(Float32 { bits: 0 }), Value(Float32 { bits: 0 })])), got V128(18428729675200069632)
("simd", "simd_f64x2") => return true, // FIXME expected V128(F64x2([Value(Float64 { bits: 9221120237041090560 }), Value(Float64 { bits: 0 })])), got V128(0)
("simd", "simd_f64x2_arith") => return true, // FIXME expected V128(F64x2([Value(Float64 { bits: 9221120237041090560 }), Value(Float64 { bits: 13835058055282163712 })])), got V128(255211775190703847615975447847722024960)
("simd", "simd_load") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S
("simd", "simd_splat") => return true, // FIXME Unsupported feature: proposed SIMD operator I32x4TruncSatF32x4S

Expand Down
12 changes: 6 additions & 6 deletions cranelift/codegen/meta/src/isa/x86/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1596,8 +1596,6 @@ fn define_simd(
let fdiv = shared.by_name("fdiv");
let fill = shared.by_name("fill");
let fill_nop = shared.by_name("fill_nop");
let fmax = shared.by_name("fmax");
let fmin = shared.by_name("fmin");
let fmul = shared.by_name("fmul");
let fsub = shared.by_name("fsub");
let iadd = shared.by_name("iadd");
Expand Down Expand Up @@ -1637,6 +1635,8 @@ fn define_simd(
let vselect = shared.by_name("vselect");
let x86_cvtt2si = x86.by_name("x86_cvtt2si");
let x86_insertps = x86.by_name("x86_insertps");
let x86_fmax = x86.by_name("x86_fmax");
let x86_fmin = x86.by_name("x86_fmin");
let x86_movlhps = x86.by_name("x86_movlhps");
let x86_movsd = x86.by_name("x86_movsd");
let x86_packss = x86.by_name("x86_packss");
Expand Down Expand Up @@ -2276,10 +2276,10 @@ fn define_simd(
(F64, fmul, &MULPD[..]),
(F32, fdiv, &DIVPS[..]),
(F64, fdiv, &DIVPD[..]),
(F32, fmin, &MINPS[..]),
(F64, fmin, &MINPD[..]),
(F32, fmax, &MAXPS[..]),
(F64, fmax, &MAXPD[..]),
(F32, x86_fmin, &MINPS[..]),
(F64, x86_fmin, &MINPD[..]),
(F32, x86_fmax, &MAXPS[..]),
(F64, x86_fmax, &MAXPD[..]),
] {
let inst = inst.bind(vector(*ty, sse_vector_size));
e.enc_both_inferred(inst, rec_fa.opcodes(opcodes));
Expand Down
6 changes: 5 additions & 1 deletion cranelift/codegen/meta/src/isa/x86/legalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -379,10 +379,12 @@ fn define_simd(
let bnot = insts.by_name("bnot");
let bxor = insts.by_name("bxor");
let extractlane = insts.by_name("extractlane");
let fabs = insts.by_name("fabs");
let fcmp = insts.by_name("fcmp");
let fcvt_from_uint = insts.by_name("fcvt_from_uint");
let fcvt_to_sint_sat = insts.by_name("fcvt_to_sint_sat");
let fabs = insts.by_name("fabs");
let fmax = insts.by_name("fmax");
let fmin = insts.by_name("fmin");
let fneg = insts.by_name("fneg");
let iadd_imm = insts.by_name("iadd_imm");
let icmp = insts.by_name("icmp");
Expand Down Expand Up @@ -790,6 +792,8 @@ fn define_simd(
narrow.custom_legalize(ushr, "convert_ushr");
narrow.custom_legalize(ishl, "convert_ishl");
narrow.custom_legalize(fcvt_to_sint_sat, "expand_fcvt_to_sint_sat_vector");
narrow.custom_legalize(fmin, "expand_minmax_vector");
narrow.custom_legalize(fmax, "expand_minmax_vector");
abrown marked this conversation as resolved.
Show resolved Hide resolved

narrow_avx.custom_legalize(imul, "convert_i64x2_imul");
narrow_avx.custom_legalize(fcvt_from_uint, "expand_fcvt_from_uint_vector");
Expand Down
94 changes: 94 additions & 0 deletions cranelift/codegen/src/isa/x86/enc_tables.rs
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,100 @@ fn expand_minmax(
cfg.recompute_block(pos.func, done);
}

/// This legalization converts a minimum/maximum operation into a sequence that matches the
/// non-x86-friendly WebAssembly semantics of NaN handling. This logic is kept separate from
/// [expand_minmax] above (the scalar version) for code clarity.
///
/// The emitted sequence replaces `inst` (an `fmin`/`fmax` on a 128-bit float vector) with a
/// series of instructions that (a) compute the min/max in both operand orders to work around
/// x86's "return the second operand" behavior for NaN and +0/-0 inputs, and (b) quiet any NaN
/// lanes by clearing their payload bits, producing a canonical (negative) NaN.
fn expand_minmax_vector(
    inst: ir::Inst,
    func: &mut ir::Function,
    _cfg: &mut ControlFlowGraph,
    _isa: &dyn TargetIsa,
) {
    let ty = func.dfg.ctrl_typevar(inst);
    debug_assert!(ty.is_vector());
    // Decode the instruction: extract both operands, pick the x86-specific opcode to lower
    // to, and remember whether this is a max (the two cases need different bit tricks below).
    let (x, y, x86_opcode, is_max) = match func.dfg[inst] {
        ir::InstructionData::Binary {
            opcode: ir::Opcode::Fmin,
            args,
        } => (args[0], args[1], ir::Opcode::X86Fmin, false),
        ir::InstructionData::Binary {
            opcode: ir::Opcode::Fmax,
            args,
        } => (args[0], args[1], ir::Opcode::X86Fmax, true),
        _ => panic!("Expected fmin/fmax: {}", func.dfg.display_inst(inst, None)),
    };

    let mut pos = FuncCursor::new(func).at_inst(inst);
    pos.use_srcloc(inst);

    // This sequence is complex due to how x86 handles NaNs and +0/-0. If x86 finds a NaN in
    // either lane it returns the second operand; likewise, if both operands are in {+0.0, -0.0}
    // it returns the second operand. To match the behavior of "return the minimum of the
    // operands or a canonical NaN if either operand is NaN," we must compare in both
    // directions.
    let (forward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, x, y);
    let forward = dfg.first_result(forward_inst);
    let (backward_inst, dfg) = pos.ins().Binary(x86_opcode, ty, y, x);
    let backward = dfg.first_result(backward_inst);

    let (value, mask) = if is_max {
        // For maximum:
        // Find any differences between the forward and backward `max` operation.
        let difference = pos.ins().bxor(forward, backward);
        // Merge in the differences.
        let propagate_nans_and_plus_zero = pos.ins().bor(backward, difference);
        let value = pos.ins().fsub(propagate_nans_and_plus_zero, difference);
        // Discover which lanes have NaNs in them.
        let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, difference, value);
        (value, find_nan_lanes_mask)
    } else {
        // For minimum:
        // If either lane is a NaN, we want to use these bits, not the second operand bits.
        let propagate_nans = pos.ins().bor(backward, forward);
        // Find which lanes contain a NaN with an unordered comparison, filling the mask with
        // 1s.
        let find_nan_lanes_mask = pos.ins().fcmp(FloatCC::Unordered, forward, propagate_nans);
        let bitcast_find_nan_lanes_mask = pos.ins().raw_bitcast(ty, find_nan_lanes_mask);
        // Then flood the value lane with all 1s if that lane is a NaN. This causes all NaNs
        // along this code path to be quieted and negative: after the upcoming shift and and_not,
        // all upper bits (sign, exponent, and payload MSB) will be 1s.
        let tmp = pos.ins().bor(propagate_nans, bitcast_find_nan_lanes_mask);
        (tmp, bitcast_find_nan_lanes_mask)
    };

    // During this lowering we will need to know how many bits to shift by and what type to
    // convert to when using an integer shift. Recall that an IEEE754 number looks like:
    // `[sign bit] [exponent bits] [significand bits]`
    // A quiet NaN has all exponent bits set to 1 and the most significant bit of the
    // significand set to 1; a signaling NaN has the same exponent but the MSB of the
    // significand is set to 0. The payload of the NaN is the remaining significand bits, and
    // WebAssembly assumes a canonical NaN is quiet and has 0s in its payload. To compute this
    // canonical NaN, we create a mask for the top 10 bits on F32X4 (1 sign + 8 exp. + 1 MSB
    // sig.) and the top 13 bits on F64X2 (1 sign + 11 exp. + 1 MSB sig.). This means that all
    // NaNs produced with the mask will be negative (`-NaN`) which is allowed by the sign
    // non-determinism in the spec: https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0
    let (shift_by, ty_as_int) = match ty {
        F32X4 => (10, I32X4),
        F64X2 => (13, I64X2),
        _ => unimplemented!("this legalization only understands 128-bit floating point types"),
    };

    // In order to clear the NaN payload for canonical NaNs, we shift right the NaN lanes (all
    // 1s) leaving 0s in the top bits. Remember that non-NaN lanes are all 0s so this has
    // little effect.
    let mask_as_int = pos.ins().raw_bitcast(ty_as_int, mask);
    let shift_mask = pos.ins().ushr_imm(mask_as_int, shift_by);
    let shift_mask_as_float = pos.ins().raw_bitcast(ty, shift_mask);

    // Finally, we replace the value with `value & ~shift_mask`. For non-NaN lanes, this is
    // equivalent to `... & 1111...` but for NaN lanes this will only have 1s in the top bits,
    // clearing the payload.
    pos.func
        .dfg
        .replace(inst)
        .band_not(value, shift_mask_as_float);
}

/// x86 has no unsigned-to-float conversions. We handle the easy case of zero-extending i32 to
/// i64 with a pattern, the rest needs more code.
///
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ block0(v0: f32x4 [%xmm3], v1: f32x4 [%xmm5]):
[-, %xmm3] v3 = fsub v0, v1 ; bin: 0f 5c dd
[-, %xmm3] v4 = fmul v0, v1 ; bin: 0f 59 dd
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 0f 5e dd
[-, %xmm3] v6 = fmin v0, v1 ; bin: 0f 5d dd
[-, %xmm3] v7 = fmax v0, v1 ; bin: 0f 5f dd
[-, %xmm3] v6 = x86_fmin v0, v1 ; bin: 0f 5d dd
[-, %xmm3] v7 = x86_fmax v0, v1 ; bin: 0f 5f dd
[-, %xmm3] v8 = sqrt v0 ; bin: 0f 51 db
return
}
Expand All @@ -70,8 +70,8 @@ block0(v0: f32x4 [%xmm3], v1: f32x4 [%xmm10]):
[-, %xmm3] v3 = fsub v0, v1 ; bin: 41 0f 5c da
[-, %xmm3] v4 = fmul v0, v1 ; bin: 41 0f 59 da
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 41 0f 5e da
[-, %xmm3] v6 = fmin v0, v1 ; bin: 41 0f 5d da
[-, %xmm3] v7 = fmax v0, v1 ; bin: 41 0f 5f da
[-, %xmm3] v6 = x86_fmin v0, v1 ; bin: 41 0f 5d da
[-, %xmm3] v7 = x86_fmax v0, v1 ; bin: 41 0f 5f da
[-, %xmm3] v8 = sqrt v1 ; bin: 41 0f 51 da
return
}
Expand All @@ -82,8 +82,8 @@ block0(v0: f64x2 [%xmm3], v1: f64x2 [%xmm5]):
[-, %xmm3] v3 = fsub v0, v1 ; bin: 66 0f 5c dd
[-, %xmm3] v4 = fmul v0, v1 ; bin: 66 0f 59 dd
[-, %xmm3] v5 = fdiv v0, v1 ; bin: 66 0f 5e dd
[-, %xmm3] v6 = fmin v0, v1 ; bin: 66 0f 5d dd
[-, %xmm3] v7 = fmax v0, v1 ; bin: 66 0f 5f dd
[-, %xmm3] v6 = x86_fmin v0, v1 ; bin: 66 0f 5d dd
[-, %xmm3] v7 = x86_fmax v0, v1 ; bin: 66 0f 5f dd
[-, %xmm3] v8 = sqrt v0 ; bin: 66 0f 51 db
return
}
Expand All @@ -94,8 +94,8 @@ block0(v0: f64x2 [%xmm11], v1: f64x2 [%xmm13]):
[-, %xmm11] v3 = fsub v0, v1 ; bin: 66 45 0f 5c dd
[-, %xmm11] v4 = fmul v0, v1 ; bin: 66 45 0f 59 dd
[-, %xmm11] v5 = fdiv v0, v1 ; bin: 66 45 0f 5e dd
[-, %xmm11] v6 = fmin v0, v1 ; bin: 66 45 0f 5d dd
[-, %xmm11] v7 = fmax v0, v1 ; bin: 66 45 0f 5f dd
[-, %xmm11] v6 = x86_fmin v0, v1 ; bin: 66 45 0f 5d dd
[-, %xmm11] v7 = x86_fmax v0, v1 ; bin: 66 45 0f 5f dd
[-, %xmm11] v8 = sqrt v0 ; bin: 66 45 0f 51 db
return
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,3 +83,35 @@ block0(v0:i64x2, v1:i64x2):
; nextln: v2 = iadd v9, v8
return
}

function %fmin_f32x4(f32x4, f32x4) {
block0(v0:f32x4, v1:f32x4):
v2 = fmin v0, v1
; check: v3 = x86_fmin v0, v1
; nextln: v4 = x86_fmin v1, v0
; nextln: v5 = bor v4, v3
; nextln: v6 = fcmp uno v3, v5
; nextln: v7 = raw_bitcast.f32x4 v6
; nextln: v8 = bor v5, v7
; nextln: v9 = raw_bitcast.i32x4 v7
; nextln: v10 = ushr_imm v9, 10
; nextln: v11 = raw_bitcast.f32x4 v10
; nextln: v2 = band_not v8, v11
return
}

function %fmax_f64x2(f64x2, f64x2) {
block0(v0:f64x2, v1:f64x2):
v2 = fmax v0, v1
; check: v3 = x86_fmax v0, v1
; nextln: v4 = x86_fmax v1, v0
; nextln: v5 = bxor v3, v4
; nextln: v6 = bor v4, v5
; nextln: v7 = fsub v6, v5
; nextln: v8 = fcmp uno v5, v7
; nextln: v9 = raw_bitcast.i64x2 v8
; nextln: v10 = ushr_imm v9, 13
; nextln: v11 = raw_bitcast.f64x2 v10
; nextln: v2 = band_not v7, v11
return
}
42 changes: 21 additions & 21 deletions cranelift/filetests/filetests/isa/x86/simd-arithmetic-run.clif
Original file line number Diff line number Diff line change
Expand Up @@ -192,31 +192,31 @@ block0:
}
; run

function %fmax_f64x2() -> b1 {
block0:
v0 = vconst.f64x2 [-0.0 -0x1.0]
v1 = vconst.f64x2 [+0.0 +0x1.0]

function %fmax_f64x2(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = fmax v0, v1
v3 = fcmp eq v2, v1
v4 = vall_true v3

return v4
return v2
}
; run

function %fmin_f64x2() -> b1 {
block0:
v0 = vconst.f64x2 [-0x1.0 -0x1.0]
v1 = vconst.f64x2 [+0.0 +0x1.0]

; note below how NaNs are quieted but (unlike fmin) retain their sign: this discrepancy is allowed by non-determinism
; in the spec, see https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0.
; run: %fmax_f64x2([-0x0.0 -0x1.0], [+0x0.0 0x1.0]) == [+0x0.0 0x1.0]
; run: %fmax_f64x2([-NaN NaN], [0x0.0 0x100.0]) == [-NaN NaN]
abrown marked this conversation as resolved.
Show resolved Hide resolved
; run: %fmax_f64x2([NaN 0.0], [0.0 0.0]) == [NaN 0.0]
; run: %fmax_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0]
; run: %fmax_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [NaN 0.0]

function %fmin_f64x2(f64x2, f64x2) -> f64x2 {
block0(v0: f64x2, v1: f64x2):
v2 = fmin v0, v1
v3 = fcmp eq v2, v0
v4 = vall_true v3

return v4
return v2
}
; run
; note below how NaNs are quieted and negative: this is due to non-determinism in the spec for NaNs, see
; https://webassembly.github.io/spec/core/bikeshed/index.html#nan-propagation%E2%91%A0.
; run: %fmin_f64x2([-0x0.0 -0x1.0], [+0x0.0 0x1.0]) == [-0x0.0 -0x1.0]
; run: %fmin_f64x2([-NaN 0x100.0], [0.0 NaN]) == [-NaN -NaN]
; run: %fmin_f64x2([NaN 0.0], [0.0 0.0]) == [-NaN 0.0]
; run: %fmin_f64x2([-NaN 0.0], [0x1.0 0.0]) == [-NaN 0.0]
; run: %fmin_f64x2([NaN:0x42 0.0], [0x1.0 0.0]) == [-NaN 0.0]

function %fneg_f64x2() -> b1 {
block0:
Expand Down