Skip to content

Commit

Permalink
feat(simd): add hadd* inline fns, update dot, normalize, mulv, sum
Browse files Browse the repository at this point in the history
  • Loading branch information
postspectacular committed Oct 21, 2019
1 parent 0e0dfde commit a1011ea
Show file tree
Hide file tree
Showing 8 changed files with 99 additions and 43 deletions.
17 changes: 10 additions & 7 deletions packages/simd/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ for sources:
- `addn4_f32`
- `clamp4_f32`
- `clampn4_f32`
- `div4_f32` (*)
- `divn4_f32` (*)
- `dot2_f32_aos` (2x vec2 per iteration)
- `div4_f32` (!)
- `divn4_f32` (!)
- `dot2_f32_aos` (2)
- `dot4_f32_aos`
- `dot4_f32_soa`
- `invsqrt4_f32` (*)
- `invsqrt4_f32` (!)
- `madd4_f32`
- `maddn4_f32`
- `max4_f32`
Expand All @@ -51,16 +51,19 @@ for sources:
- `msubn4_f32`
- `mul4_f32`
- `muln4_f32`
- `mul_m23v2_aos` (2x vec2 per iteration)
- `mul_m23v2_aos` (2)
- `mul_m44v4_aos`
- `neg4_f32`
- `normalize2_f32_aos` (2)
- `normalize4_f32_aos`
- `sqrt4_f32` (*)
- `sqrt4_f32` (!)
- `sub4_f32`
- `subn4_f32`
- `sum4_f32`

(*) Missing native implementation, waiting on...
(!) Missing native implementation, waiting on...

(2) 2x vec2 per iteration

Also see
[src/api.ts](https://github.com/thi-ng/umbrella/tree/feature/simd/packages/simd/src/api.ts)
Expand Down
24 changes: 10 additions & 14 deletions packages/simd/assembly/dot.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { hadd2_f32, hadd4_f32 } from "./hadd";

/**
* Takes two densely packed vec2 AOS buffers `a` and `b`, computes their
* 2D dot products and stores results in `out`. Computes two results per
Expand All @@ -16,19 +18,15 @@ export function dot2_f32_aos(
out: usize,
a: usize,
b: usize,
num: usize,
so: usize
num: usize
): usize {
const res = out;
const so2 = so << 3;
so <<= 2;
num >>= 1;
for (; num-- > 0; ) {
let m = f32x4.mul(v128.load(a), v128.load(b));
m = f32x4.add(m, v128.shuffle<f32>(m, m, 1, 0, 3, 2));
const m = hadd2_f32(f32x4.mul(v128.load(a), v128.load(b)));
f32.store(out, f32x4.extract_lane(m, 0));
f32.store(out + so, f32x4.extract_lane(m, 2));
out += so2;
f32.store(out, f32x4.extract_lane(m, 2), 4);
out += 8;
a += 16;
b += 16;
}
Expand All @@ -37,9 +35,9 @@ export function dot2_f32_aos(

/**
* Takes two vec4 AOS buffers, computes their dot products and stores
* results in `out`. `so` should be 1 for packed result buffer. `sa` and
* `sb` indicate the stride lengths (in floats) between each vector in
* each respective buffer and should be a multiple of 4.
* results in `out`. `so` should be 1 for a packed result buffer. `sa`
* and `sb` indicate the stride lengths (in floats) between each vector
* in each respective buffer and should be a multiple of 4.
*
* @param out
* @param a
Expand All @@ -64,9 +62,7 @@ export function dot4_f32_aos(
sb <<= 2;
// a1*b1 + a2*b2 + a3*b3 + a4*b4
for (; num-- > 0; ) {
let m = f32x4.mul(v128.load(a), v128.load(b));
m = f32x4.add(m, v128.shuffle<f32>(m, m, 2, 3, 0, 1));
f32.store(out, f32x4.extract_lane(m, 0) + f32x4.extract_lane(m, 1));
f32.store(out, hadd4_f32(f32x4.mul(v128.load(a), v128.load(b))));
out += so;
a += sa;
b += sb;
Expand Down
29 changes: 29 additions & 0 deletions packages/simd/assembly/hadd.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/**
 * Pairwise horizontal add of the four f32 lanes of `v`. Each lane is
 * summed with its neighbor inside the same lane pair, and the sum is
 * written to both lanes of that pair:
 *
 * ```
 * [a, b, c, d] => [a+b, a+b, c+d, c+d]
 * ```
 *
 * @param v - vector whose lanes are interpreted as 4x f32
 * @returns vector holding the two pair sums, each duplicated
 */
// @ts-ignore: decorator
@inline
export function hadd2_f32(v: v128): v128 {
    // swap the lanes within each pair: [b, a, d, c]
    const swapped = v128.shuffle<f32>(v, v, 1, 0, 3, 2);
    return f32x4.add(v, swapped);
}

/**
 * Full horizontal add of the four f32 lanes of `v`, reduced to a single
 * scalar. The lanes are combined in this order:
 *
 * ```
 * [a, b, c, d] => (a + c) + (b + d)
 * ```
 *
 * @param v - vector whose lanes are interpreted as 4x f32
 * @returns sum of all four lanes
 */
// @ts-ignore: decorator
@inline
export function hadd4_f32(v: v128): f32 {
    // add the upper half onto the lower half: [a+c, b+d, c+a, d+b]
    const folded = f32x4.add(v, v128.shuffle<f32>(v, v, 2, 3, 0, 1));
    const lo = f32x4.extract_lane(folded, 0);
    const hi = f32x4.extract_lane(folded, 1);
    return lo + hi;
}
10 changes: 3 additions & 7 deletions packages/simd/assembly/mulv.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,9 @@ export function mul_m23v2_aos(
out: usize,
mat: usize,
vec: usize,
num: usize,
so: usize,
sa: usize
num: usize
): usize {
const res = out;
so <<= 2;
sa <<= 2;
num >>= 1;
const m0 = v128.load(mat);
const m1 = v128.shuffle<f32>(m0, m0, 0, 1, 0, 1);
Expand All @@ -28,8 +24,8 @@ export function mul_m23v2_aos(
m3
)
);
out += so;
vec += sa;
out += 16;
vec += 16;
}
return res;
}
Expand Down
38 changes: 34 additions & 4 deletions packages/simd/assembly/normalize.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,35 @@
import { hadd2_f32, hadd4_f32 } from "./hadd";

/**
 * Takes a densely packed vec2 AOS buffer `a`, rescales each vector to
 * length `norm` and writes the results to `out`. Two vectors (16 bytes)
 * are processed per iteration, so for an odd `num` the last vector is
 * not processed.
 *
 * A vector whose squared magnitude is not greater than `f32.EPSILON` is
 * multiplied by 1, i.e. copied unchanged.
 *
 * @param out - address of the result buffer
 * @param a - address of the source buffer
 * @param num - number of vec2 items
 * @param norm - target length
 * @returns the original `out` address
 */
export function normalize2_f32_aos(
    out: usize,
    a: usize,
    num: usize,
    norm: f32
): usize {
    const res = out;
    let pairs = num >> 1;
    while (pairs-- > 0) {
        const v = v128.load(a);
        // squared magnitudes: [x1*x1+y1*y1, (same), x2*x2+y2*y2, (same)]
        const magSq = hadd2_f32(f32x4.mul(v, v));
        const m1 = f32x4.extract_lane(magSq, 0);
        const m2 = f32x4.extract_lane(magSq, 2);
        // per-vector scale factors, falling back to 1 for (near) zero length
        const s1: f32 = m1 > f32.EPSILON ? norm / sqrt<f32>(m1) : 1;
        const s2: f32 = m2 > f32.EPSILON ? norm / sqrt<f32>(m2) : 1;
        let scale = f32x4.replace_lane(magSq, 0, s1);
        scale = f32x4.replace_lane(scale, 2, s2);
        // broadcast each factor over both components of its vector
        scale = v128.shuffle<f32>(scale, scale, 0, 0, 2, 2);
        v128.store(out, f32x4.mul(v, scale));
        out += 16;
        a += 16;
    }
    return res;
}

export function normalize4_f32_aos(
out: usize,
a: usize,
Expand All @@ -11,13 +43,11 @@ export function normalize4_f32_aos(
const res = out;
for (; num-- > 0; ) {
const v = v128.load(a);
let m = f32x4.mul(v, v);
m = f32x4.add(m, v128.shuffle<f32>(m, m, 2, 3, 0, 1));
const mag = f32x4.extract_lane(m, 0) + f32x4.extract_lane(m, 1);
const mag = hadd4_f32(f32x4.mul(v, v));
v128.store(
out,
mag > f32.EPSILON
? f32x4.mul(v, f32x4.splat(sqrt<f32>(norm / mag)))
? f32x4.mul(v, f32x4.splat(norm / sqrt<f32>(mag)))
: v
);
out += so;
Expand Down
7 changes: 4 additions & 3 deletions packages/simd/assembly/sum.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
export function sum4_f32(a: usize, num: usize, sa: usize): f64 {
import { hadd4_f32 } from "./hadd";

export function sum4_f32(a: usize, num: usize, sa: usize): f32 {
sa <<= 2;
let acc = f32x4.splat(0);
for (; num-- > 0; ) {
acc = f32x4.add(acc, v128.load(a));
a += sa;
}
acc = f32x4.add(acc, v128.shuffle<f32>(acc, acc, 2, 3, 0, 1));
return f32x4.extract_lane(acc, 0) + f32x4.extract_lane(acc, 1);
return hadd4_f32(acc);
}
13 changes: 7 additions & 6 deletions packages/simd/src/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,17 @@ export interface SIMD {
* Takes two densely packed vec2 AOS buffers `a` and `b`, computes
* their 2D dot products and stores results in `out`. Computes two
* results per iteration, hence `num` must be an even number or else
* the last vector will not be processed. `so` should be 1 for
* packed result buffer.
* the last vector will not be processed.
*
* `a` & `b` should be aligned to 16, `out` to multiples of 4.
* `a` & `b` should be aligned to 16.
*
* @param out
* @param a
* @param b
* @param num
* @param so
*/
// prettier-ignore
dot2_f32_aos(out: number, a: number, b: number, num: number, so: number): number;
dot2_f32_aos(out: number, a: number, b: number, num: number): number;

/**
* Takes two vec4 AOS buffers, computes their dot products and
Expand Down Expand Up @@ -125,7 +123,7 @@ export interface SIMD {
muln4_f32(out: number, a: number, n: number, num: number, so: number, sa: number): number;

// prettier-ignore
mul_m23v2_aos(out: number, mat: number, vec: number, num: number, so: number, sv: number): number;
mul_m23v2_aos(out: number, mat: number, vec: number, num: number): number;

// prettier-ignore
mul_m44v4_aos(out: number, mat: number, vec: number, num: number, so: number, sv: number): number;
Expand All @@ -139,6 +137,9 @@ export interface SIMD {
// prettier-ignore
neg4_f32(out: number, a: number, num: number, so: number, sa: number): number;

// prettier-ignore
normalize2_f32_aos(out: number, a: number, num: number, norm: number): number;

// prettier-ignore
normalize4_f32(out: number, a: number, num: number, norm: number, so: number, sa: number): number;

Expand Down
4 changes: 2 additions & 2 deletions packages/simd/test/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ simd.f32.set([
// b
10, 20, 30, 40
]);
simd.dot2_f32_aos(1024, 0, 16, 2, 1);
simd.dot2_f32_aos(1024, 0, 16, 2);
assertEqual(res_f32(1024, 2), [50, 250]);

// dot4_aos
Expand Down Expand Up @@ -96,7 +96,7 @@ simd.f32.set([
// vec4
1, 2, 3, 4
]);
simd.mul_m23v2_aos(1024, 0, 32, 2, 2, 4);
simd.mul_m23v2_aos(1024, 0, 32, 2);
assertEqual(res_f32(1024, 4), [110, 240, 130, 280]);

// mul_m44v4_aos
Expand Down

0 comments on commit a1011ea

Please sign in to comment.