diff --git a/packages/simd/README.md b/packages/simd/README.md index 964cee7d6a..3d75cdf82c 100644 --- a/packages/simd/README.md +++ b/packages/simd/README.md @@ -10,6 +10,8 @@ This project is part of the - [About](#about) +- [Available functions](#available-functions) +- [Status](#status) - [Installation](#installation) - [Dependencies](#dependencies) - [Usage examples](#usage-examples) @@ -20,9 +22,30 @@ This project is part of the ## About -WASM based SIMD vector operations for batch processing, written in +[WebAssembly SIMD](https://github.com/WebAssembly/simd) vector +operations for batch processing, written in [AssemblyScript](https://docs.assemblyscript.org/). +## Available functions + +See +[/assembly](https://github.com/thi-ng/umbrella/tree/feature/simd/packages/simd/assembly) +for sources: + +- `dot2_f32_aos()` +- `dot4_f32_aos()` +- `dot4_f32_soa()` +- `madd4_f32()` +- `maddn4_f32()` +- `mul_m23v2_aos()` +- `mul_m44v4_aos()` + +Also see [src/api.ts](https://github.com/thi-ng/umbrella/tree/feature/simd/packages/simd/src/api.ts) for documentation about the exposed TS/JS API... + +## Status + +ALPHA - unreleased + ## Installation ```bash @@ -35,38 +58,72 @@ yarn add @thi.ng/simd ## Usage examples +The [WebAssembly SIMD spec](https://github.com/WebAssembly/simd) is still WIP and (at the time of writing) only partially implemented. 
+ +- NodeJS (v12.10+): `node --experimental-wasm-simd` +- Chrome: Enable SIMD support via [chrome://flags](chrome://flags) + ```ts import { init } from "@thi.ng/simd"; -(async () => { - const simd = await init("simd.wasm", new WebAssembly.Memory({ initial: 1 })); })(); - const f32 = new Float32Array(simd.memory.buffer); - // input data: 3x vec4 - f32.set([ - 1, 2, 3, 4, - 10, 20, 30, 40, - 40, 30, 20, 10 - ]); - - // compute dot products - simd.dot4( - 48, // output addr / pointer (bytes) - 0, // vector A addr (bytes) - 16, // vector B addr (bytes) - 2, // number of vectors to process - 1, // output stride (floats) - 0, // A stride (floats) - 4 // B stride (floats) - ); - // by using 0 as stride for A, all dot products are using [1,2,3,4] for A - - // result for dot(a0, b0) - f32[48 >> 2]; - // 300 - - // result for dot(a0, b1) - f32[(48 >> 2) + 1]; - // 200 +// the WASM module doesn't define its own memory, so it must be provided by the user +// the returned object contains all available vector functions & memory views +const simd = init(new WebAssembly.Memory({ initial: 1 })); + +// input data: 3x vec4 buffers +const a = simd.f32.subarray(0, 4); +const b = simd.f32.subarray(4, 16); +const out = simd.f32.subarray(16, 18); + +a.set([1, 2, 3, 4]) +b.set([10, 20, 30, 40, 40, 30, 20, 10]); + +// compute dot products +// by using 0 as stride for A, all dot products are using [1,2,3,4] for A +simd.dot4_f32_aos( + out.byteOffset, // output addr / pointer + a.byteOffset, // vector A addr + b.byteOffset, // vector B addr + 2, // number of vectors to process + 1, // output stride (floats) + 0, // A stride (floats) + 4 // B stride (floats) +); + +// results for [dot(a0, b0), dot(a0, b1)] +out +// [300, 200] + +// mat4 * vec4 matrix-vector multiplies +const mat = simd.f32.subarray(0, 16); +const points = simd.f32.subarray(16, 24); + +// mat4 (col major) +mat.set([ + 10, 0, 0, 0, + 0, 20, 0, 0, + 0, 0, 30, 0, + 100, 200, 300, 1 +]); + +// vec4 array +points.set([ + 1, 2, 3, 1, + 
4, 5, 6, 1, +]); + +simd.mul_m44v4_aos( + points.byteOffset, // output addr / pointer + mat.byteOffset, // mat4 addr + points.byteOffset, // vec4 addr + 2, // number of vectors to process + 4, // output stride (float) + 4 // vec stride (float) +); + +// transformed points +points +// [110, 240, 390, 1, 140, 300, 480, 1] ``` ## Authors diff --git a/packages/simd/src/api.ts b/packages/simd/src/api.ts new file mode 100644 index 0000000000..e4f39918ea --- /dev/null +++ b/packages/simd/src/api.ts @@ -0,0 +1,126 @@ +export interface SIMD { + /** + * WASM memory instance given to `init()`. + */ + memory: WebAssembly.Memory; + /** + * Float64 view of WASM memory. + */ + f64: Float64Array; + /** + * Float32 view of WASM memory. + */ + f32: Float32Array; + /** + * Uint32 view of WASM memory. + */ + u32: Uint32Array; + /** + * Int32 view of WASM memory. + */ + i32: Int32Array; + /** + * Uint16 view of WASM memory. + */ + u16: Uint16Array; + /** + * Int16 view of WASM memory. + */ + i16: Int16Array; + /** + * Uint8 view of WASM memory. + */ + u8: Uint8Array; + /** + * Int8 view of WASM memory. + */ + i8: Int8Array; + + /** + * Takes two densely packed vec2 AOS buffers `a` and `b`, computes + * their 2D dot products and stores results in `out`. Computes two + * results per iteration, hence `num` must be an even number or else + * the last vector will not be processed. `so` should be 1 for + * packed result buffer. + * + * `a` & `b` should be aligned to 16, `out` to multiples of 4. + * + * @param out + * @param a + * @param b + * @param num + * @param so + */ + // prettier-ignore + dot2_f32_aos(out: number, a: number, b: number, num: number, so: number): number; + + /** + * Takes two vec4 AOS buffers, computes their dot products and + * stores results in `out`. `so` should be 1 for packed result + * buffer. `sa` and `sb` indicate the stride lengths (in floats) + * between each vector in each respective buffer and should be a + * multiple of 4. 
+ * + * @param out + * @param a + * @param b + * @param num + * @param so + * @param sa + * @param sb + */ + // prettier-ignore + dot4_f32_aos(out: number, a: number, b: number, num: number, so: number, sa: number, sb: number): number; + + /** + * Takes two vec4 SOA buffers and computes their 4D dot products and + * writes results to `out`. `sa` and `sb` indicate the element + * stride size (in floats) of the respective vectors (should be + * multiple of 4). The results are always stored in a packed layout. + * Processes 4 vectors per iteration, hence `num` should be a + * multiple of 4 too. + * + * @param out + * @param a + * @param b + * @param num + * @param sa + * @param sb + */ + // prettier-ignore + dot4_f32_soa(out: number, a: number, b: number, num: number, sa: number, sb: number): number; + + /** + * Takes three vec4 buffers, computes componentwise `a * b + c` and + * stores results in `out`. Both AOS / SOA layouts are supported, as + * long as all buffers are using the same layout. + * + * All strides must be multiples of 4. All pointers should be + * aligned to multiples of 16. Returns `out` pointer. 
+ * + * @param out + * @param a + * @param b + * @param c + * @param num number of vec4 + * @param so out element stride + * @param sa A element stride + * @param sb B element stride + * @param sc C element stride + */ + // prettier-ignore + madd4_f32(out: number, a: number, b: number, c: number, num: number, so: number, sa: number, sb: number, sc: number): number; + + // prettier-ignore + maddn4_f32(out: number, a: number, b: number, c: number, num: number, so: number, sa: number, sc: number): number; + + // prettier-ignore + mul_m23v2_aos(out: number, mat: number, vec: number, num: number, so: number, sv: number): number; + + mul_m23v2_aos_single(out: number, mat: number, vec: number): number; + + // prettier-ignore + mul_m44v4_aos(out: number, mat: number, vec: number, num: number, so: number, sv: number): number; + + mul_m44v4_aos_single(out: number, mat: number, vec: number): number; +} diff --git a/packages/simd/src/index.ts b/packages/simd/src/index.ts index 2d45db7150..dc79b9cf1b 100644 --- a/packages/simd/src/index.ts +++ b/packages/simd/src/index.ts @@ -1,131 +1,7 @@ import { base64Decode } from "@thi.ng/transducers-binary"; +import { SIMD } from "./api"; import { BINARY } from "./binary"; -export interface SIMD { - /** - * WASM memory instance given to `init()`. - */ - memory: WebAssembly.Memory; - /** - * Float64 view of WASM memory. - */ - f64: Float64Array; - /** - * Float32 view of WASM memory. - */ - f32: Float32Array; - /** - * Uint32 view of WASM memory. - */ - u32: Uint32Array; - /** - * Int32 view of WASM memory. - */ - i32: Int32Array; - /** - * Uint16 of WASM memory. - */ - u16: Uint16Array; - /** - * Int16 view of WASM memory. - */ - i16: Int16Array; - /** - * Uint8 view of WASM memory. - */ - u8: Uint8Array; - /** - * Int8 view of WASM memory. - */ - i8: Int8Array; - - /** - * Takes two densely packed vec2 AOS buffers `a` and `b`, computes their - * 2D dot products and stores results in `out`. 
Computes two results per - * iteration, hence `num` must be an even number or else the last vector - * will not be processed. `so` should be 1 for packed result buffer. - * - * `a` & `b` should be aligned to 16, `out` to multiples of 4. - * - * @param out - * @param a - * @param b - * @param num - * @param so - */ - // prettier-ignore - dot2_f32_aos(out: number, a: number, b: number, num: number, so: number): number; - - /** - * Takes two vec4 AOS buffers, computes their dot products and stores - * results in `out`. `so` should be 1 for packed result buffer. `sa` and - * `sb` indicate the stride lengths (in floats) between each vector in - * each respective buffer and should be a multiple of 4. - * - * @param out - * @param a - * @param b - * @param num - * @param so - * @param sa - * @param sb - */ - // prettier-ignore - dot4_f32_aos(out: number, a: number, b: number, num: number, so: number, sa: number, sb: number): number; - - /** - * Takes two vec4 SOA buffers and computes their 4D dot products and - * writes results to `out`. `sa` and `sb` indicate the element - * stride size (in floats) of the respective vectors (should be - * multiple of 4). The results are always stored in a packed layout. - * Processes 4 vectors per iteration, hence `num` should be a - * multiple of 4 too. - * - * @param out - * @param a - * @param b - * @param num - * @param sa - * @param sb - */ - // prettier-ignore - dot4_f32_soa(out: number, a: number, b: number, num: number, sa: number, sb: number): number; - - /** - * Takes three vec4 buffers, computes componentwise `a * b + c` and stores - * results in `out`. Both AOS / SOA layouts are supported, as long as - * all buffers are using the same layout. - * - * All strides must by multiples of 4. All pointers should be aligned to - * multiples of 16. Returns `out` pointer. 
- * - * @param out - * @param a - * @param b - * @param c - * @param num number of vec4 - * @param so out element stride - * @param sa A element stride - * @param sb B element stride - * @param sc C element stride - */ - // prettier-ignore - madd4_f32(out: number, a: number, b: number, c: number, num: number, so: number, sa: number, sb: number, sc: number): number; - - // prettier-ignore - maddn4_f32(out: number, a: number, b: number, c: number, num: number, so: number, sa: number, sc: number): number; - - // prettier-ignore - mul_m23v2_aos(out: number, mat: number, vec: number, num: number, so: number, sv: number): number; - - mul_m23v2_aos_single(out: number, mat: number, vec: number): number; - - // prettier-ignore - mul_m44v4_aos(out: number, mat: number, vec: number, num: number, so: number, sv: number): number; - - mul_m44v4_aos_single(out: number, mat: number, vec: number): number; -} - export const init = (memory: WebAssembly.Memory): SIMD | undefined => { try { const buf = memory.buffer;