Skip to content

Commit

Permalink
feat(simd): add new fns, switch to f32x4 namespaced ops, update readme
Browse files Browse the repository at this point in the history
  • Loading branch information
postspectacular committed Oct 20, 2019
1 parent 8b7287e commit 4023a8f
Show file tree
Hide file tree
Showing 13 changed files with 268 additions and 81 deletions.
33 changes: 23 additions & 10 deletions packages/simd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,23 @@ See
[/assembly](https://github.com/thi-ng/umbrella/tree/feature/simd/packages/simd/assembly)
for sources:

- `dot2_f32_aos()`
- `dot4_f32_aos()`
- `dot4_f32_soa()`
- `madd4_f32()`
- `maddn4_f32()`
- `mul_m23v2_aos()`
- `mul_m44v4_aos()`
- `add4_f32`
- `div4_f32` (*)
- `dot2_f32_aos` (2x vec2 per iteration)
- `dot4_f32_aos`
- `dot4_f32_soa`
- `invsqrt4_f32` (*)
- `madd4_f32`
- `maddn4_f32`
- `mul4_f32`
- `mul_m23v2_aos`
- `mul_m23v2_aos_single` (2x vec2 per iteration)
- `mul_m44v4_aos`
- `mul_m44v4_aos_single`
- `sqrt4_f32` (*)
- `sub4_f32`

(*) Missing native implementation, waiting on...

Also see [src/api.ts](https://github.com/thi-ng/umbrella/tree/feature/simd/packages/simd/src/api.ts) for documentation about the exposed TS/JS API...

Expand All @@ -58,7 +68,9 @@ yarn add @thi.ng/simd

## Usage examples

The [WebAssembly SIMD spec](https://github.com/WebAssembly/simd) is still WIP and (at the time of writing) only partially implemented.
The [WebAssembly SIMD spec](https://github.com/WebAssembly/simd) is
still WIP and (at the time of writing) only partially implemented and
hidden behind feature flags.

- NodeJS (v12.10+): `node --experimental-wasm-simd`
- Chrome: Enable SIMD support via [chrome://flags](chrome://flags)
Expand All @@ -68,6 +80,7 @@ import { init } from "@thi.ng/simd";

// the WASM module doesn't define its own memory, so it must be provided by the user
// the returned object contains all available vector functions & memory views
// (an error will be thrown if WASM isn't available or SIMD unsupported)
const simd = init(new WebAssembly.Memory({ initial: 1 }));

// input data: 3x vec4 buffers
Expand All @@ -78,8 +91,8 @@ const out = simd.f32.subarray(16, 18);
a.set([1, 2, 3, 4])
b.set([10, 20, 30, 40, 40, 30, 20, 10]);

// compute dot products
// by using 0 as stride for A, all dot products are using [1,2,3,4] for A
// compute dot products: dot(A[i], B[i])
// by using 0 as stride for A, all dot products are using the same vec
simd.dot4_f32_aos(
out.byteOffset, // output addr / pointer
a.byteOffset, // vector A addr
Expand Down
21 changes: 21 additions & 0 deletions packages/simd/assembly/add.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Lane-wise addition of 128-bit f32x4 vectors. For each of the `num`
 * iterations one vector is loaded from `a` and one from `b`, their sum
 * is stored at the current output address and all three addresses are
 * advanced by their respective strides.
 *
 * Strides are multiplied by 4 before being applied to the addresses,
 * i.e. they are given in 4-byte (single f32) units rather than bytes.
 *
 * @param out - address of the first result vector
 * @param a - address of the first left-hand operand
 * @param b - address of the first right-hand operand
 * @param num - number of vectors to process
 * @param so - stride between consecutive results (default: 4)
 * @param sa - stride between consecutive `a` vectors (default: 4)
 * @param sb - stride between consecutive `b` vectors (default: 4)
 * @returns the unmodified `out` address
 */
export function add4_f32(
    out: usize,
    a: usize,
    b: usize,
    num: usize,
    so: usize = 4,
    sa: usize = 4,
    sb: usize = 4
): usize {
    // strides as byte offsets (x4)
    const outStep = so << 2;
    const aStep = sa << 2;
    const bStep = sb << 2;
    let dest = out;
    for (let i: usize = 0; i < num; i++) {
        const sum = f32x4.add(v128.load(a), v128.load(b));
        v128.store(dest, sum);
        dest += outStep;
        a += aStep;
        b += bStep;
    }
    return out;
}
21 changes: 21 additions & 0 deletions packages/simd/assembly/div.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Lane-wise division of 128-bit f32x4 vectors. For each of the `num`
 * iterations one vector is loaded from `a` and one from `b`, the
 * quotient `a / b` is stored at the current output address and all
 * three addresses are advanced by their respective strides.
 *
 * Strides are multiplied by 4 before being applied to the addresses,
 * i.e. they are given in 4-byte (single f32) units rather than bytes.
 *
 * @param out - address of the first result vector
 * @param a - address of the first dividend vector
 * @param b - address of the first divisor vector
 * @param num - number of vectors to process
 * @param so - stride between consecutive results
 * @param sa - stride between consecutive `a` vectors
 * @param sb - stride between consecutive `b` vectors
 * @returns the unmodified `out` address
 */
export function div4_f32(
    out: usize,
    a: usize,
    b: usize,
    num: usize,
    so: usize,
    sa: usize,
    sb: usize
): usize {
    // strides as byte offsets (x4)
    const outStep = so << 2;
    const aStep = sa << 2;
    const bStep = sb << 2;
    let dest = out;
    for (let i: usize = 0; i < num; i++) {
        const quotient = f32x4.div(v128.load(a), v128.load(b));
        v128.store(dest, quotient);
        dest += outStep;
        a += aStep;
        b += bStep;
    }
    return out;
}
31 changes: 14 additions & 17 deletions packages/simd/assembly/dot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ export function dot2_f32_aos(
so <<= 2;
num >>= 1;
for (; num-- > 0; ) {
let m = v128.mul<f32>(v128.load(a), v128.load(b));
m = v128.add<f32>(m, v128.shuffle<f32>(m, m, 1, 0, 3, 2));
store<f32>(out, v128.extract_lane<f32>(m, 0));
store<f32>(out + so, v128.extract_lane<f32>(m, 2));
let m = f32x4.mul(v128.load(a), v128.load(b));
m = f32x4.add(m, v128.shuffle<f32>(m, m, 1, 0, 3, 2));
f32.store(out, f32x4.extract_lane(m, 0));
f32.store(out + so, f32x4.extract_lane(m, 2));
out += so2;
a += 16;
b += 16;
Expand Down Expand Up @@ -64,12 +64,9 @@ export function dot4_f32_aos(
sb <<= 2;
// a1*b1 + a2*b2 + a3*b3 + a4*b4
for (; num-- > 0; ) {
let m = v128.mul<f32>(v128.load(a), v128.load(b));
m = v128.add<f32>(m, v128.shuffle<f32>(m, m, 2, 3, 0, 1));
store<f32>(
out,
v128.extract_lane<f32>(m, 0) + v128.extract_lane<f32>(m, 1)
);
let m = f32x4.mul(v128.load(a), v128.load(b));
m = f32x4.add(m, v128.shuffle<f32>(m, m, 2, 3, 0, 1));
f32.store(out, f32x4.extract_lane(m, 0) + f32x4.extract_lane(m, 1));
out += so;
a += sa;
b += sb;
Expand All @@ -96,15 +93,15 @@ export function dot4_f32_soa(
for (; num-- > 0; ) {
v128.store(
out,
v128.add<f32>(
v128.add<f32>(
v128.add<f32>(
v128.mul<f32>(v128.load(a), v128.load(b)),
v128.mul<f32>(v128.load(a + sa), v128.load(b + sb))
f32x4.add(
f32x4.add(
f32x4.add(
f32x4.mul(v128.load(a), v128.load(b)),
f32x4.mul(v128.load(a + sa), v128.load(b + sb))
),
v128.mul<f32>(v128.load(a + sa2), v128.load(b + sb2))
f32x4.mul(v128.load(a + sa2), v128.load(b + sb2))
),
v128.mul<f32>(v128.load(a + sa3), v128.load(b + sb3))
f32x4.mul(v128.load(a + sa3), v128.load(b + sb3))
)
);
out += 16;
Expand Down
10 changes: 10 additions & 0 deletions packages/simd/assembly/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
export * from "./add";

// TODO waiting for native impl
// export * from "./div";

export * from "./dot";
export * from "./madd";
export * from "./maddn";
export * from "./mul";
export * from "./mulv";
export * from "./sub";

// TODO waiting for native impl
// export * from "./sqrt";
5 changes: 1 addition & 4 deletions packages/simd/assembly/madd.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,7 @@ export function madd4_f32(
for (; num-- > 0; ) {
v128.store(
out,
v128.add<f32>(
v128.mul<f32>(v128.load(a), v128.load(b)),
v128.load(c)
)
f32x4.add(f32x4.mul(v128.load(a), v128.load(b)), v128.load(c))
);
out += so;
a += sa;
Expand Down
5 changes: 1 addition & 4 deletions packages/simd/assembly/maddn.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@ export function maddn4_f32(
sc <<= 2;
const vb = v128.splat<f32>(b);
for (; num-- > 0; ) {
v128.store(
out,
v128.add<f32>(v128.mul<f32>(v128.load(a), vb), v128.load(c))
);
v128.store(out, f32x4.add(f32x4.mul(v128.load(a), vb), v128.load(c)));
out += so;
a += sa;
c += sc;
Expand Down
21 changes: 21 additions & 0 deletions packages/simd/assembly/mul.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Lane-wise multiplication of 128-bit f32x4 vectors. For each of the
 * `num` iterations one vector is loaded from `a` and one from `b`,
 * their product is stored at the current output address and all three
 * addresses are advanced by their respective strides.
 *
 * Strides are multiplied by 4 before being applied to the addresses,
 * i.e. they are given in 4-byte (single f32) units rather than bytes.
 *
 * @param out - address of the first result vector
 * @param a - address of the first left-hand operand
 * @param b - address of the first right-hand operand
 * @param num - number of vectors to process
 * @param so - stride between consecutive results
 * @param sa - stride between consecutive `a` vectors
 * @param sb - stride between consecutive `b` vectors
 * @returns the unmodified `out` address
 */
export function mul4_f32(
    out: usize,
    a: usize,
    b: usize,
    num: usize,
    so: usize,
    sa: usize,
    sb: usize
): usize {
    // strides as byte offsets (x4)
    const outStep = so << 2;
    const aStep = sa << 2;
    const bStep = sb << 2;
    let dest = out;
    for (let i: usize = 0; i < num; i++) {
        const product = f32x4.mul(v128.load(a), v128.load(b));
        v128.store(dest, product);
        dest += outStep;
        a += aStep;
        b += bStep;
    }
    return out;
}
33 changes: 15 additions & 18 deletions packages/simd/assembly/mulv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ export function mul_m23v2_aos(
const v = v128.load(vec);
v128.store(
out,
v128.add<f32>(
v128.add<f32>(
v128.mul<f32>(v128.shuffle<f32>(v, v, 0, 0, 2, 2), m1),
v128.mul<f32>(v128.shuffle<f32>(v, v, 1, 1, 3, 3), m2)
f32x4.add(
f32x4.add(
f32x4.mul(v128.shuffle<f32>(v, v, 0, 0, 2, 2), m1),
f32x4.mul(v128.shuffle<f32>(v, v, 1, 1, 3, 3), m2)
),
m3
)
Expand All @@ -45,13 +45,13 @@ export function mul_m23v2_aos_single(
const v = v128.load(vec);
v128.store(
out,
v128.add<f32>(
v128.add<f32>(
v128.mul<f32>(
f32x4.add(
f32x4.add(
f32x4.mul(
v128.shuffle<f32>(v, v, 0, 0, 2, 2),
v128.shuffle<f32>(m, m, 0, 1, 0, 1)
),
v128.mul<f32>(
f32x4.mul(
v128.shuffle<f32>(v, v, 1, 1, 3, 3),
v128.shuffle<f32>(m, m, 2, 3, 2, 3)
)
Expand Down Expand Up @@ -90,23 +90,20 @@ export function mul_m44v4_aos_single(
const v = v128.load(vec);
v128.store(
out,
v128.add<f32>(
v128.add<f32>(
v128.mul<f32>(
v128.shuffle<f32>(v, v, 0, 0, 0, 0),
v128.load(mat)
),
v128.mul<f32>(
f32x4.add(
f32x4.add(
f32x4.mul(v128.shuffle<f32>(v, v, 0, 0, 0, 0), v128.load(mat)),
f32x4.mul(
v128.shuffle<f32>(v, v, 1, 1, 1, 1),
v128.load(mat, 16)
)
),
v128.add<f32>(
v128.mul<f32>(
f32x4.add(
f32x4.mul(
v128.shuffle<f32>(v, v, 2, 2, 2, 2),
v128.load(mat, 32)
),
v128.mul<f32>(
f32x4.mul(
v128.shuffle<f32>(v, v, 3, 3, 3, 3),
v128.load(mat, 48)
)
Expand Down
36 changes: 36 additions & 0 deletions packages/simd/assembly/sqrt.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/**
 * Lane-wise square root of 128-bit f32x4 vectors. For each of the
 * `num` iterations one vector is loaded from `a`, its square root is
 * stored at the current output address and both addresses are advanced
 * by their respective strides.
 *
 * Strides are multiplied by 4 before being applied to the addresses,
 * i.e. they are given in 4-byte (single f32) units rather than bytes.
 *
 * @param out - address of the first result vector
 * @param a - address of the first input vector
 * @param num - number of vectors to process
 * @param so - stride between consecutive results
 * @param sa - stride between consecutive input vectors
 * @returns the unmodified `out` address
 */
export function sqrt4_f32(
    out: usize,
    a: usize,
    num: usize,
    so: usize,
    sa: usize
): usize {
    // strides as byte offsets (x4)
    const outStep = so << 2;
    const aStep = sa << 2;
    let dest = out;
    for (let i: usize = 0; i < num; i++) {
        const root = f32x4.sqrt(v128.load(a));
        v128.store(dest, root);
        dest += outStep;
        a += aStep;
    }
    return out;
}

/**
 * Lane-wise reciprocal square root (`1 / sqrt(x)`) of 128-bit f32x4
 * vectors. For each of the `num` iterations one vector is loaded from
 * `a`, the result is stored at the current output address and both
 * addresses are advanced by their respective strides.
 *
 * Strides are multiplied by 4 before being applied to the addresses,
 * i.e. they are given in 4-byte (single f32) units rather than bytes.
 *
 * @param out - address of the first result vector
 * @param a - address of the first input vector
 * @param num - number of vectors to process
 * @param so - stride between consecutive results
 * @param sa - stride between consecutive input vectors
 * @returns the unmodified `out` address
 */
export function invsqrt4_f32(
    out: usize,
    a: usize,
    num: usize,
    so: usize,
    sa: usize
): usize {
    // strides as byte offsets (x4)
    const outStep = so << 2;
    const aStep = sa << 2;
    // numerator vector, created once outside the loop
    const ones = f32x4.splat(1);
    let dest = out;
    for (let i: usize = 0; i < num; i++) {
        const root = f32x4.sqrt(v128.load(a));
        v128.store(dest, f32x4.div(ones, root));
        dest += outStep;
        a += aStep;
    }
    return out;
}
21 changes: 21 additions & 0 deletions packages/simd/assembly/sub.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/**
 * Lane-wise subtraction of 128-bit f32x4 vectors. For each of the
 * `num` iterations one vector is loaded from `a` and one from `b`, the
 * difference `a - b` is stored at the current output address and all
 * three addresses are advanced by their respective strides.
 *
 * Strides are multiplied by 4 before being applied to the addresses,
 * i.e. they are given in 4-byte (single f32) units rather than bytes.
 *
 * @param out - address of the first result vector
 * @param a - address of the first minuend vector
 * @param b - address of the first subtrahend vector
 * @param num - number of vectors to process
 * @param so - stride between consecutive results
 * @param sa - stride between consecutive `a` vectors
 * @param sb - stride between consecutive `b` vectors
 * @returns the unmodified `out` address
 */
export function sub4_f32(
    out: usize,
    a: usize,
    b: usize,
    num: usize,
    so: usize,
    sa: usize,
    sb: usize
): usize {
    // strides as byte offsets (x4)
    const outStep = so << 2;
    const aStep = sa << 2;
    const bStep = sb << 2;
    let dest = out;
    for (let i: usize = 0; i < num; i++) {
        const difference = f32x4.sub(v128.load(a), v128.load(b));
        v128.store(dest, difference);
        dest += outStep;
        a += aStep;
        b += bStep;
    }
    return out;
}
Loading

0 comments on commit 4023a8f

Please sign in to comment.