Skip to content

Commit

Permalink
Bug 1724201 - Relaxed SIMD FMA/FMS for x86 and arm64. r=yury
Browse files Browse the repository at this point in the history
Implement the fused multiply-add and fused multiply-sub relaxed SIMD
operations.

See WebAssembly/relaxed-simd#27 for proposed
spec of these operations.

There's no wat support for this yet - it will come separately - so
the test cases are a little rudimentary for now.  More tests will
appear later.

Differential Revision: https://phabricator.services.mozilla.com/D121870
  • Loading branch information
Lars T Hansen committed Aug 13, 2021
1 parent 4b46866 commit e63f63b
Show file tree
Hide file tree
Showing 14 changed files with 248 additions and 5 deletions.
4 changes: 4 additions & 0 deletions js/src/jit-test/lib/wasm-binary.js
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,10 @@ const F64x2PMinCode = 0xf6;
const F64x2PMaxCode = 0xf7;
const V128Load32ZeroCode = 0xfc;
const V128Load64ZeroCode = 0xfd;
// Relaxed SIMD multiply-add (Fma) / multiply-subtract (Fms) opcodes, one pair
// for f32x4 and one pair for f64x2.  These follow the SIMD prefix byte.
const F32x4RelaxedFmaCode = 0xaf;
const F32x4RelaxedFmsCode = 0xb0;
const F64x2RelaxedFmaCode = 0xcf;
const F64x2RelaxedFmsCode = 0xd0;

// SIMD wormhole opcodes.
const WORMHOLE_SELFTEST = 0;
Expand Down
43 changes: 42 additions & 1 deletion js/src/jit-test/tests/wasm/simd/experimental.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
// |jit-test| --wasm-relaxed-simd; skip-if: !wasmSimdEnabled()

// Experimental opcodes. We have no text parsing support for these yet. The
// tests will be cleaned up and moved into ad-hack.js if the opcodes are
// adopted.
Expand Down Expand Up @@ -61,4 +63,43 @@ function V128StoreExpr(addr, v) {
SimdPrefix, V128StoreCode, 4, varU32(0)];
}

// (Currently no tests here but there were some in the past and there will be more in the future.)
// FMA/FMS, https://github.com/WebAssembly/relaxed-simd/issues/27

// Reference result for one lane of multiply-add: a plus the product of x and
// y, computed with ordinary JS number arithmetic.
function fma(a, x, y) {
    const product = x * y;
    return a + product;
}
// Reference result for one lane of multiply-subtract: a minus the product of
// x and y, computed with ordinary JS number arithmetic.
function fms(a, x, y) {
    const product = x * y;
    return a - product;
}

// Per-lane operands for the tests below.  The f* arrays have four entries
// (used with the F32x4 opcodes) and the d* arrays have two (used with the
// F64x2 opcodes).  "as" is the accumulator operand, "xs" and "ys" are the
// two factors: the expected lane i is operator(as[i], xs[i], ys[i]).
var fas = [0, 100, 500, 700];
var fxs = [10, 20, 30, 40];
var fys = [-2, -3, -4, -5];
var das = [0, 100];
var dxs = [10, 20];
var dys = [-2, -3];

// For each relaxed FMA/FMS opcode: build a module whose single exported
// function "run" pushes three v128 operands (accumulator, then the two
// factors), applies the opcode, and stores the result at address 0.  The
// operands are written into the exported memory beforehand and the stored
// result is compared against the JS reference computation.
for ( let [opcode, as, xs, ys, operator] of [[F32x4RelaxedFmaCode, fas, fxs, fys, fma],
                                             [F32x4RelaxedFmsCode, fas, fxs, fys, fms],
                                             [F64x2RelaxedFmaCode, das, dxs, dys, fma],
                                             [F64x2RelaxedFmsCode, das, dxs, dys, fms]] ) {
    // Number of lanes: 4 for the f32x4 inputs, 2 for the f64x2 inputs.
    const k = xs.length;
    // Expected lanes according to the scalar reference function.
    const ans = iota(k).map((i) => operator(as[i], xs[i], ys[i]));

    const ins = wasmEval(moduleWithSections([
        sigSection([v2vSig]),
        declSection([0]),
        memorySection(1),
        exportSection([{funcIndex: 0, name: "run"},
                       {memIndex: 0, name: "mem"}]),
        bodySection([
            funcBody({locals:[],
                      body: [...V128StoreExpr(0, [...V128Load(16),
                                                  ...V128Load(32),
                                                  ...V128Load(48),
                                                  SimdPrefix, varU32(opcode)])]})])]));

    // View memory with the lane type.  One 16-byte vector is k elements of
    // that type, so element index j*k lines up with the j'th vector slot
    // (presumably byte address 16*j as used by V128Load above -- the helpers
    // are defined elsewhere).
    const mem = new (k == 4 ? Float32Array : Float64Array)(ins.exports.mem.buffer);
    set(mem, k, as);
    set(mem, 2*k, xs);
    set(mem, 3*k, ys);
    ins.exports.run();
    const result = get(mem, 0, k);
    assertSame(result, ans);
}
14 changes: 14 additions & 0 deletions js/src/jit/MacroAssembler.h
Original file line number Diff line number Diff line change
Expand Up @@ -3455,6 +3455,20 @@ class MacroAssembler : public MacroAssemblerSpecific {
inline void nearestFloat64x2(FloatRegister src, FloatRegister dest)
DEFINED_ON(x86_shared, arm64);

// Floating multiply-accumulate: srcDest [+-]= src1 * src2
//
// The fma* variants add the lane-wise product of src1 and src2 to srcDest;
// the fms* variants subtract that product from srcDest.  srcDest is both the
// accumulator input and the result register.  The Float32x4 forms operate on
// four f32 lanes, the Float64x2 forms on two f64 lanes.
//
// NOTE(review): whether the product is rounded before the add/subtract
// looks platform-dependent (these back the "relaxed" SIMD operations) --
// confirm against the per-platform implementations.

inline void fmaFloat32x4(FloatRegister src1, FloatRegister src2,
FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);

inline void fmsFloat32x4(FloatRegister src1, FloatRegister src2,
FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);

inline void fmaFloat64x2(FloatRegister src1, FloatRegister src2,
FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);

inline void fmsFloat64x2(FloatRegister src1, FloatRegister src2,
FloatRegister srcDest) DEFINED_ON(x86_shared, arm64);

public:
// ========================================================================
// Truncate floating point.
Expand Down
16 changes: 16 additions & 0 deletions js/src/jit/arm64/CodeGenerator-arm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2976,6 +2976,22 @@ void CodeGenerator::visitWasmTernarySimd128(LWasmTernarySimd128* ins) {
masm.bitwiseSelectSimd128(lhs, rhs, controlDest);
break;
}
case wasm::SimdOp::F32x4RelaxedFma:
masm.fmaFloat32x4(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
ToFloatRegister(ins->v0()));
break;
case wasm::SimdOp::F32x4RelaxedFms:
masm.fmsFloat32x4(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
ToFloatRegister(ins->v0()));
break;
case wasm::SimdOp::F64x2RelaxedFma:
masm.fmaFloat64x2(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
ToFloatRegister(ins->v0()));
break;
case wasm::SimdOp::F64x2RelaxedFms:
masm.fmsFloat64x2(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
ToFloatRegister(ins->v0()));
break;
default:
MOZ_CRASH("NYI");
}
Expand Down
10 changes: 10 additions & 0 deletions js/src/jit/arm64/Lowering-arm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -994,6 +994,16 @@ void LIRGenerator::visitWasmTernarySimd128(MWasmTernarySimd128* ins) {
defineReuseInput(lir, ins, LWasmTernarySimd128::V2);
break;
}
case wasm::SimdOp::F32x4RelaxedFma:
case wasm::SimdOp::F32x4RelaxedFms:
case wasm::SimdOp::F64x2RelaxedFma:
case wasm::SimdOp::F64x2RelaxedFms: {
auto* lir = new (alloc())
LWasmTernarySimd128(ins->simdOp(), useRegisterAtStart(ins->v0()),
useRegister(ins->v1()), useRegister(ins->v2()));
defineReuseInput(lir, ins, LWasmTernarySimd128::V0);
break;
}
default:
MOZ_CRASH("NYI");
}
Expand Down
22 changes: 22 additions & 0 deletions js/src/jit/arm64/MacroAssembler-arm64-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -3827,6 +3827,28 @@ void MacroAssembler::nearestFloat64x2(FloatRegister src, FloatRegister dest) {
Frintn(Simd2D(dest), Simd2D(src));
}

// Floating multiply-accumulate: srcDest [+-]= src1 * src2

void MacroAssembler::fmaFloat32x4(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // View each operand as 4 x f32 and accumulate in place with FMLA:
  // srcDest += src1 * src2.
  const auto accumulator = Simd4S(srcDest);
  const auto factorA = Simd4S(src1);
  const auto factorB = Simd4S(src2);
  Fmla(accumulator, factorA, factorB);
}

void MacroAssembler::fmsFloat32x4(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // View each operand as 4 x f32 and subtract the product in place with
  // FMLS: srcDest -= src1 * src2.
  const auto accumulator = Simd4S(srcDest);
  const auto factorA = Simd4S(src1);
  const auto factorB = Simd4S(src2);
  Fmls(accumulator, factorA, factorB);
}

void MacroAssembler::fmaFloat64x2(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // View each operand as 2 x f64 and accumulate in place with FMLA:
  // srcDest += src1 * src2.
  const auto accumulator = Simd2D(srcDest);
  const auto factorA = Simd2D(src1);
  const auto factorB = Simd2D(src2);
  Fmla(accumulator, factorA, factorB);
}

void MacroAssembler::fmsFloat64x2(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // View each operand as 2 x f64 and subtract the product in place with
  // FMLS: srcDest -= src1 * src2.
  const auto accumulator = Simd2D(srcDest);
  const auto factorA = Simd2D(src1);
  const auto factorB = Simd2D(src2);
  Fmls(accumulator, factorA, factorB);
}

//}}} check_macroassembler_style
// ===============================================================

Expand Down
16 changes: 16 additions & 0 deletions js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2270,6 +2270,22 @@ void CodeGenerator::visitWasmTernarySimd128(LWasmTernarySimd128* ins) {
masm.bitwiseSelectSimd128(control, lhsDest, rhs, lhsDest, temp);
break;
}
case wasm::SimdOp::F32x4RelaxedFma:
masm.fmaFloat32x4(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
ToFloatRegister(ins->v0()));
break;
case wasm::SimdOp::F32x4RelaxedFms:
masm.fmsFloat32x4(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
ToFloatRegister(ins->v0()));
break;
case wasm::SimdOp::F64x2RelaxedFma:
masm.fmaFloat64x2(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
ToFloatRegister(ins->v0()));
break;
case wasm::SimdOp::F64x2RelaxedFms:
masm.fmsFloat64x2(ToFloatRegister(ins->v1()), ToFloatRegister(ins->v2()),
ToFloatRegister(ins->v0()));
break;
default:
MOZ_CRASH("NYI");
}
Expand Down
10 changes: 10 additions & 0 deletions js/src/jit/x86-shared/Lowering-x86-shared.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -832,6 +832,16 @@ void LIRGenerator::visitWasmTernarySimd128(MWasmTernarySimd128* ins) {
defineReuseInput(lir, ins, LWasmTernarySimd128::V0);
break;
}
case wasm::SimdOp::F32x4RelaxedFma:
case wasm::SimdOp::F32x4RelaxedFms:
case wasm::SimdOp::F64x2RelaxedFma:
case wasm::SimdOp::F64x2RelaxedFms: {
auto* lir = new (alloc())
LWasmTernarySimd128(ins->simdOp(), useRegisterAtStart(ins->v0()),
useRegister(ins->v1()), useRegister(ins->v2()));
defineReuseInput(lir, ins, LWasmTernarySimd128::V0);
break;
}
default:
MOZ_CRASH("NYI");
}
Expand Down
35 changes: 35 additions & 0 deletions js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -2837,6 +2837,41 @@ void MacroAssembler::unsignedWidenHighInt32x4(FloatRegister src,
vpmovzxdq(Operand(dest), dest);
}

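// Floating multiply-accumulate: srcDest [+-]= src1 * src2
// The x86 FMA instructions are a separate, VEX-encoded CPU feature that is
// not supported here yet, so these are emitted as a multiply into the scratch
// register followed by a separate add/subtract.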

void MacroAssembler::fmaFloat32x4(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // srcDest += src1 * src2 on four f32 lanes, emitted as a separate multiply
  // and add.  The temporary holds a whole 128-bit vector, so claim the
  // SIMD-typed scratch register rather than the scalar Float32 one.
  ScratchSimd128Scope scratch(*this);
  moveSimd128(src1, scratch);      // scratch = src1
  mulFloat32x4(src2, scratch);     // scratch *= src2
  addFloat32x4(scratch, srcDest);  // srcDest += scratch
}

void MacroAssembler::fmsFloat32x4(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // srcDest -= src1 * src2 on four f32 lanes, emitted as a separate multiply
  // and subtract.  The temporary holds a whole 128-bit vector, so claim the
  // SIMD-typed scratch register rather than the scalar Float32 one.
  ScratchSimd128Scope scratch(*this);
  moveSimd128(src1, scratch);      // scratch = src1
  mulFloat32x4(src2, scratch);     // scratch *= src2
  subFloat32x4(scratch, srcDest);  // srcDest -= scratch
}

void MacroAssembler::fmaFloat64x2(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // srcDest += src1 * src2 on two f64 lanes, emitted as a separate multiply
  // and add.  The temporary holds a whole 128-bit vector, so claim the
  // SIMD-typed scratch register rather than the scalar Float32 one.
  ScratchSimd128Scope scratch(*this);
  moveSimd128(src1, scratch);      // scratch = src1
  mulFloat64x2(src2, scratch);     // scratch *= src2
  addFloat64x2(scratch, srcDest);  // srcDest += scratch
}

void MacroAssembler::fmsFloat64x2(FloatRegister src1, FloatRegister src2,
                                  FloatRegister srcDest) {
  // srcDest -= src1 * src2 on two f64 lanes, emitted as a separate multiply
  // and subtract.  The temporary holds a whole 128-bit vector, so claim the
  // SIMD-typed scratch register rather than the scalar Float32 one.
  ScratchSimd128Scope scratch(*this);
  moveSimd128(src1, scratch);      // scratch = src1
  mulFloat64x2(src2, scratch);     // scratch *= src2
  subFloat64x2(scratch, srcDest);  // srcDest -= scratch
}

// ========================================================================
// Truncate floating point.

Expand Down
45 changes: 45 additions & 0 deletions js/src/wasm/WasmBaselineCompile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15240,6 +15240,28 @@ static void BitselectV128(MacroAssembler& masm, RegV128 rhs, RegV128 control,
}
# endif

# ifdef ENABLE_WASM_RELAXED_SIMD
// Baseline emitter for F32x4RelaxedFma: forwards to the MacroAssembler with
// rs1 and rs2 as the two factors and rsd as the accumulator/result register.
static void RelaxedFmaF32x4(MacroAssembler& masm, RegV128 rs1, RegV128 rs2,
RegV128 rsd) {
masm.fmaFloat32x4(rs1, rs2, rsd);
}

// Baseline emitter for F32x4RelaxedFms: forwards to the MacroAssembler with
// rs1 and rs2 as the two factors and rsd as the accumulator/result register.
static void RelaxedFmsF32x4(MacroAssembler& masm, RegV128 rs1, RegV128 rs2,
RegV128 rsd) {
masm.fmsFloat32x4(rs1, rs2, rsd);
}

// Baseline emitter for F64x2RelaxedFma: forwards to the MacroAssembler with
// rs1 and rs2 as the two factors and rsd as the accumulator/result register.
static void RelaxedFmaF64x2(MacroAssembler& masm, RegV128 rs1, RegV128 rs2,
RegV128 rsd) {
masm.fmaFloat64x2(rs1, rs2, rsd);
}

// Baseline emitter for F64x2RelaxedFms: forwards to the MacroAssembler with
// rs1 and rs2 as the two factors and rsd as the accumulator/result register.
static void RelaxedFmsF64x2(MacroAssembler& masm, RegV128 rs1, RegV128 rs2,
RegV128 rsd) {
masm.fmsFloat64x2(rs1, rs2, rsd);
}
# endif

void BaseCompiler::emitVectorAndNot() {
// We want x & ~y but the available operation is ~x & y, so reverse the
// operands.
Expand Down Expand Up @@ -16950,6 +16972,29 @@ bool BaseCompiler::emitBody() {
CHECK_NEXT(emitStoreLane(4));
case uint32_t(SimdOp::V128Store64Lane):
CHECK_NEXT(emitStoreLane(8));
# ifdef ENABLE_WASM_RELAXED_SIMD
case uint32_t(SimdOp::F32x4RelaxedFma):
if (!moduleEnv_.v128RelaxedEnabled()) {
return iter_.unrecognizedOpcode(&op);
}
CHECK_NEXT(dispatchTernary1(RelaxedFmaF32x4, ValType::V128));
case uint32_t(SimdOp::F32x4RelaxedFms):
if (!moduleEnv_.v128RelaxedEnabled()) {
return iter_.unrecognizedOpcode(&op);
}
CHECK_NEXT(dispatchTernary1(RelaxedFmsF32x4, ValType::V128));
case uint32_t(SimdOp::F64x2RelaxedFma):
if (!moduleEnv_.v128RelaxedEnabled()) {
return iter_.unrecognizedOpcode(&op);
}
CHECK_NEXT(dispatchTernary1(RelaxedFmaF64x2, ValType::V128));
case uint32_t(SimdOp::F64x2RelaxedFms):
if (!moduleEnv_.v128RelaxedEnabled()) {
return iter_.unrecognizedOpcode(&op);
}
CHECK_NEXT(dispatchTernary1(RelaxedFmsF64x2, ValType::V128));
break;
# endif
default:
break;
} // switch (op.b1)
Expand Down
8 changes: 4 additions & 4 deletions js/src/wasm/WasmConstants.h
Original file line number Diff line number Diff line change
Expand Up @@ -676,8 +676,8 @@ enum class SimdOp {
I32x4ShrS = 0xac,
I32x4ShrU = 0xad,
I32x4Add = 0xae,
// AddSatS = 0xaf
// AddSatU = 0xb0
F32x4RelaxedFma = 0xaf,
F32x4RelaxedFms = 0xb0,
I32x4Sub = 0xb1,
// SubSatS = 0xb2
// SubSatU = 0xb3
Expand Down Expand Up @@ -708,8 +708,8 @@ enum class SimdOp {
I64x2ShrS = 0xcc,
I64x2ShrU = 0xcd,
I64x2Add = 0xce,
// Unused = 0xcf
// Unused = 0xd0
F64x2RelaxedFma = 0xcf,
F64x2RelaxedFms = 0xd0,
I64x2Sub = 0xd1,
// Unused = 0xd2
// Unused = 0xd3
Expand Down
12 changes: 12 additions & 0 deletions js/src/wasm/WasmIonCompile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5384,6 +5384,18 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
CHECK(EmitStoreLaneSimd128(f, 4));
case uint32_t(SimdOp::V128Store64Lane):
CHECK(EmitStoreLaneSimd128(f, 8));
# ifdef ENABLE_WASM_RELAXED_SIMD
case uint32_t(SimdOp::F32x4RelaxedFma):
case uint32_t(SimdOp::F32x4RelaxedFms):
case uint32_t(SimdOp::F64x2RelaxedFma):
case uint32_t(SimdOp::F64x2RelaxedFms): {
if (!f.moduleEnv().v128RelaxedEnabled()) {
return f.iter().unrecognizedOpcode(&op);
}
CHECK(EmitTernarySimd128(f, SimdOp(op.b1)));
}
# endif

default:
return f.iter().unrecognizedOpcode(&op);
} // switch (op.b1)
Expand Down
5 changes: 5 additions & 0 deletions js/src/wasm/WasmOpIter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,11 @@ OpKind wasm::Classify(OpBytes op) {
case SimdOp::V128Store32Lane:
case SimdOp::V128Store64Lane:
WASM_SIMD_OP(OpKind::StoreLane);
case SimdOp::F32x4RelaxedFma:
case SimdOp::F32x4RelaxedFms:
case SimdOp::F64x2RelaxedFma:
case SimdOp::F64x2RelaxedFms:
WASM_SIMD_OP(OpKind::Ternary);
# ifdef ENABLE_WASM_SIMD_WORMHOLE
case SimdOp::MozWHSELFTEST:
case SimdOp::MozWHPMADDUBSW:
Expand Down
13 changes: 13 additions & 0 deletions js/src/wasm/WasmValidate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1007,6 +1007,19 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
CHECK(iter.readStoreLane(8, &addr, &noIndex, &nothing));
}

# ifdef ENABLE_WASM_RELAXED_SIMD
case uint32_t(SimdOp::F32x4RelaxedFma):
case uint32_t(SimdOp::F32x4RelaxedFms):
case uint32_t(SimdOp::F64x2RelaxedFma):
case uint32_t(SimdOp::F64x2RelaxedFms): {
if (!env.v128RelaxedEnabled()) {
return iter.unrecognizedOpcode(&op);
}
CHECK(
iter.readTernary(ValType::V128, &nothing, &nothing, &nothing));
}
# endif

default:
return iter.unrecognizedOpcode(&op);
}
Expand Down

0 comments on commit e63f63b

Please sign in to comment.