Skip to content

Commit

Permalink
refactor(simd): extract SIMD API, update readme
Browse files Browse the repository at this point in the history
  • Loading branch information
postspectacular committed Oct 20, 2019
1 parent d266fec commit 8b7287e
Show file tree
Hide file tree
Showing 3 changed files with 214 additions and 155 deletions.
117 changes: 87 additions & 30 deletions packages/simd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ This project is part of the
<!-- TOC depthFrom:2 depthTo:3 -->

- [About](#about)
- [Available functions](#available-functions)
- [Status](#status)
- [Installation](#installation)
- [Dependencies](#dependencies)
- [Usage examples](#usage-examples)
Expand All @@ -20,9 +22,30 @@ This project is part of the

## About

[WebAssembly SIMD](https://github.com/WebAssembly/simd) vector
operations for batch processing, written in
[AssemblyScript](https://docs.assemblyscript.org/).

## Available functions

See
[/assembly](https://github.com/thi-ng/umbrella/tree/feature/simd/packages/simd/assembly)
for sources:

- `dot2_f32_aos()`
- `dot4_f32_aos()`
- `dot4_f32_soa()`
- `madd4_f32()`
- `maddn4_f32()`
- `mul_m23v2_aos()`
- `mul_m44v4_aos()`

Also see [src/api.ts](https://github.com/thi-ng/umbrella/tree/feature/simd/packages/simd/src/api.ts) for documentation of the exposed TS/JS API.

## Status

ALPHA - unreleased

## Installation

```bash
Expand All @@ -35,38 +58,72 @@ yarn add @thi.ng/simd

## Usage examples

The [WebAssembly SIMD spec](https://github.com/WebAssembly/simd) is still a work in progress and, at the time of writing, only partially implemented. SIMD support currently has to be enabled explicitly:

- NodeJS (v12.10+): `node --experimental-wasm-simd`
- Chrome: Enable SIMD support via [chrome://flags](chrome://flags)

```ts
import { init } from "@thi.ng/simd";

(async () => {
const simd = await init("simd.wasm", new WebAssembly.Memory({ initial: 1 })); })();
const f32 = new Float32Array(simd.memory.buffer);
// input data: 3x vec4
f32.set([
1, 2, 3, 4,
10, 20, 30, 40,
40, 30, 20, 10
]);

// compute dot products
simd.dot4(
48, // output addr / pointer (bytes)
0, // vector A addr (bytes)
16, // vector B addr (bytes)
2, // number of vectors to process
1, // output stride (floats)
0, // A stride (floats)
4 // B stride (floats)
);
// by using 0 as stride for A, all dot products are using [1,2,3,4] for A

// result for dot(a0, b0)
f32[48 >> 2];
// 300

// result for dot(a0, b1)
f32[(48 >> 2) + 1];
// 200
// The WASM module does not define its own memory, so the caller must provide it.
// The returned object contains all available vector functions & typed-array
// views of that memory.
const simd = init(new WebAssembly.Memory({ initial: 1 }));

// Float32 views into the WASM memory (indices are in floats, not bytes):
// - `a`:   1x vec4 (floats 0..3)
// - `b`:   room for 3x vec4 (floats 4..15); only the first two are written below
// - `out`: 2 floats (floats 16..17), one per dot product
const a = simd.f32.subarray(0, 4);
const b = simd.f32.subarray(4, 16);
const out = simd.f32.subarray(16, 18);

a.set([1, 2, 3, 4])
b.set([10, 20, 30, 40, 40, 30, 20, 10]);

// Compute the dot products.
// Addresses are passed as byte offsets into WASM memory, strides as float counts.
// By using 0 as the stride for A, every dot product uses [1,2,3,4] for A.
simd.dot4_f32_aos(
out.byteOffset, // output address / pointer
a.byteOffset, // vector A address
b.byteOffset, // vector B address
2, // number of vectors to process
1, // output stride (floats)
0, // A stride (floats)
4 // B stride (floats)
);

// results for [dot(a0, b0), dot(a0, b1)]
out
// [300, 200]

// mat4 * vec4 matrix-vector multiplies
// NOTE: these views cover the same memory as `a`, `b` and `out` above
// (floats 0..15 and 16..23), so setting them overwrites the previous example's data.
const mat = simd.f32.subarray(0, 16);
const points = simd.f32.subarray(16, 24);

// mat4 (column major): scale by (10, 20, 30), then translate by (100, 200, 300)
mat.set([
10, 0, 0, 0,
0, 20, 0, 0,
0, 0, 30, 0,
100, 200, 300, 1
]);

// array of 2x vec4 (w = 1)
points.set([
1, 2, 3, 1,
4, 5, 6, 1,
]);

// Output and input address are identical, i.e. the points are transformed in place.
simd.mul_m44v4_aos(
points.byteOffset, // output address / pointer
mat.byteOffset, // mat4 address
points.byteOffset, // vec4 address
2, // number of vectors to process
4, // output stride (floats)
4 // vector stride (floats)
);

// transformed points
points
// [110, 240, 390, 1, 140, 300, 480, 1]
```

## Authors
Expand Down
126 changes: 126 additions & 0 deletions packages/simd/src/api.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
export interface SIMD {
    /** The `WebAssembly.Memory` instance that was handed to `init()`. */
    memory: WebAssembly.Memory;
    /** View of the WASM memory as 64-bit floats. */
    f64: Float64Array;
    /** View of the WASM memory as 32-bit floats. */
    f32: Float32Array;
    /** View of the WASM memory as unsigned 32-bit integers. */
    u32: Uint32Array;
    /** View of the WASM memory as signed 32-bit integers. */
    i32: Int32Array;
    /** View of the WASM memory as unsigned 16-bit integers. */
    u16: Uint16Array;
    /** View of the WASM memory as signed 16-bit integers. */
    i16: Int16Array;
    /** View of the WASM memory as unsigned 8-bit integers. */
    u8: Uint8Array;
    /** View of the WASM memory as signed 8-bit integers. */
    i8: Int8Array;

    /**
     * 2D dot products of two densely packed vec2 AOS buffers.
     *
     * Results are written to `out`. Each iteration produces two
     * results, so `num` must be even; otherwise the last vector is not
     * processed. Use `so = 1` for a packed result buffer.
     *
     * Alignment: `a` and `b` should be aligned to 16, `out` to
     * multiples of 4.
     *
     * @param out - pointer to the result buffer
     * @param a - pointer to the first vec2 buffer
     * @param b - pointer to the second vec2 buffer
     * @param num - number of vectors (must be even)
     * @param so - output stride
     */
    dot2_f32_aos(
        out: number,
        a: number,
        b: number,
        num: number,
        so: number
    ): number;

    /**
     * 4D dot products of two vec4 AOS buffers.
     *
     * Results are written to `out`; use `so = 1` for a packed result
     * buffer. `sa` and `sb` are the strides (in floats) between
     * consecutive vectors of the respective buffer and should be
     * multiples of 4.
     *
     * @param out - pointer to the result buffer
     * @param a - pointer to the first vec4 buffer
     * @param b - pointer to the second vec4 buffer
     * @param num - number of vectors
     * @param so - output stride
     * @param sa - stride of `a` (floats)
     * @param sb - stride of `b` (floats)
     */
    dot4_f32_aos(
        out: number,
        a: number,
        b: number,
        num: number,
        so: number,
        sa: number,
        sb: number
    ): number;

    /**
     * 4D dot products of two vec4 SOA buffers.
     *
     * `sa` and `sb` are the element strides (in floats) of the
     * respective vectors and should be multiples of 4. Results are
     * always written to `out` in a packed layout. Four vectors are
     * processed per iteration, so `num` should be a multiple of 4 as
     * well.
     *
     * @param out - pointer to the (packed) result buffer
     * @param a - pointer to the first vec4 buffer
     * @param b - pointer to the second vec4 buffer
     * @param num - number of vectors (multiple of 4)
     * @param sa - element stride of `a` (floats)
     * @param sb - element stride of `b` (floats)
     */
    dot4_f32_soa(
        out: number,
        a: number,
        b: number,
        num: number,
        sa: number,
        sb: number
    ): number;

    /**
     * Componentwise multiply-add (`a * b + c`) over three vec4 buffers,
     * with results written to `out`.
     *
     * Works with both AOS and SOA layouts, provided all buffers share
     * the same layout. All strides must be multiples of 4 and all
     * pointers should be aligned to multiples of 16.
     *
     * @param out - pointer to the result buffer
     * @param a - pointer to buffer A
     * @param b - pointer to buffer B
     * @param c - pointer to buffer C
     * @param num - number of vec4
     * @param so - out element stride
     * @param sa - A element stride
     * @param sb - B element stride
     * @param sc - C element stride
     * @returns the `out` pointer
     */
    madd4_f32(
        out: number,
        a: number,
        b: number,
        c: number,
        num: number,
        so: number,
        sa: number,
        sb: number,
        sc: number
    ): number;

    /**
     * NOTE(review): not documented in this file. Judging by the name
     * and the absence of an `sb` stride, this looks like the
     * `madd4_f32` variant in which `b` is a single scalar rather than
     * a buffer - confirm against the AssemblyScript source.
     */
    maddn4_f32(
        out: number,
        a: number,
        b: number,
        c: number,
        num: number,
        so: number,
        sa: number,
        sc: number
    ): number;

    /**
     * NOTE(review): not documented in this file. Presumably the 2D
     * counterpart of `mul_m44v4_aos` (2x3 matrix applied to a vec2 AOS
     * buffer) with the same parameter meaning - confirm against the
     * AssemblyScript source.
     */
    mul_m23v2_aos(
        out: number,
        mat: number,
        vec: number,
        num: number,
        so: number,
        sv: number
    ): number;

    /**
     * NOTE(review): not documented in this file. Presumably the
     * single-vector form of `mul_m23v2_aos` - confirm.
     */
    mul_m23v2_aos_single(out: number, mat: number, vec: number): number;

    /**
     * mat4 * vec4 matrix-vector multiplication for a vec4 AOS buffer.
     *
     * The package readme uses a column-major matrix and passes the
     * same address for `out` and `vec` to transform points in place.
     *
     * @param out - output address
     * @param mat - mat4 address
     * @param vec - vec4 buffer address
     * @param num - number of vectors to process
     * @param so - output stride (floats)
     * @param sv - vector stride (floats)
     */
    mul_m44v4_aos(
        out: number,
        mat: number,
        vec: number,
        num: number,
        so: number,
        sv: number
    ): number;

    /**
     * NOTE(review): not documented in this file. Presumably the
     * single-vector form of `mul_m44v4_aos` - confirm.
     */
    mul_m44v4_aos_single(out: number, mat: number, vec: number): number;
}
Loading

0 comments on commit 8b7287e

Please sign in to comment.