From 9db24b9be61285d2706ea1ee35bb4772dd14f565 Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Sat, 21 Jan 2023 12:42:10 +0100 Subject: [PATCH 1/2] fse: Optimize table building MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Skipping the loop body when v == 0 helps endzerobits and normcount2. Not writing to s.symbolLen in every iteration helps the other benchmarks. name old speed new speed delta Compress/gettysburg-8 181MB/s ± 1% 183MB/s ± 0% +1.15% (p=0.002 n=10+8) Compress/digits-8 241MB/s ± 0% 241MB/s ± 1% ~ (p=0.434 n=9+10) Compress/twain-8 218MB/s ± 0% 218MB/s ± 0% ~ (p=0.755 n=10+10) Compress/low-ent-8 239MB/s ± 0% 239MB/s ± 1% ~ (p=0.853 n=10+10) Compress/superlow-ent-8 208MB/s ± 1% 208MB/s ± 0% ~ (p=0.408 n=9+7) Compress/endzerobits-8 11.5MB/s ± 1% 13.3MB/s ± 1% +16.35% (p=0.000 n=10+9) Compress/pngdata.001-8 224MB/s ± 0% 224MB/s ± 1% +0.38% (p=0.004 n=8+10) Compress/normcount2-8 35.7MB/s ± 1% 36.6MB/s ± 1% +2.66% (p=0.000 n=10+9) --- fse/compress.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fse/compress.go b/fse/compress.go index 6f341914c6..d1dc92756d 100644 --- a/fse/compress.go +++ b/fse/compress.go @@ -459,15 +459,17 @@ func (s *Scratch) countSimple(in []byte) (max int) { for _, v := range in { s.count[v]++ } - m := uint32(0) + m, symlen := uint32(0), s.symbolLen for i, v := range s.count[:] { + if v == 0 { + continue + } if v > m { m = v } - if v > 0 { - s.symbolLen = uint16(i) + 1 - } + symlen = uint16(i) + 1 } + s.symbolLen = symlen return int(m) } From f2ae4c35f463d34570eef63de914476622babca9 Mon Sep 17 00:00:00 2001 From: greatroar <61184462+greatroar@users.noreply.github.com> Date: Sat, 21 Jan 2023 14:24:08 +0100 Subject: [PATCH 2/2] fse: Skip bounds checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit each occurrence of v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1] now incurs three bounds checks instead of four. I haven't found a way to eliminate the remaining three. name old speed new speed delta Compress/gettysburg-8 183MB/s ± 0% 189MB/s ± 0% +3.32% (p=0.000 n=8+9) Compress/digits-8 241MB/s ± 1% 251MB/s ± 1% +4.14% (p=0.000 n=10+9) Compress/twain-8 218MB/s ± 0% 228MB/s ± 0% +4.36% (p=0.000 n=10+10) Compress/low-ent-8 239MB/s ± 1% 244MB/s ± 1% +1.90% (p=0.000 n=10+10) Compress/superlow-ent-8 208MB/s ± 0% 210MB/s ± 0% +0.89% (p=0.000 n=7+8) Compress/endzerobits-8 13.3MB/s ± 1% 13.4MB/s ± 1% +0.40% (p=0.019 n=9+10) Compress/pngdata.001-8 224MB/s ± 1% 225MB/s ± 1% +0.41% (p=0.006 n=10+9) Compress/normcount2-8 36.6MB/s ± 1% 36.4MB/s ± 1% -0.62% (p=0.012 n=9+10) --- fse/compress.go | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fse/compress.go b/fse/compress.go index d1dc92756d..dac97e58a2 100644 --- a/fse/compress.go +++ b/fse/compress.go @@ -146,54 +146,51 @@ func (s *Scratch) compress(src []byte) error { c1.encodeZero(tt[src[ip-2]]) ip -= 2 } + src = src[:ip] // Main compression loop. switch { case !s.zeroBits && s.actualTableLog <= 8: // We can encode 4 symbols without requiring a flush. // We do not need to check if any output is 0 bits. - for ip >= 4 { + for ; len(src) >= 4; src = src[:len(src)-4] { s.bw.flush32() - v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] + v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1] c2.encode(tt[v0]) c1.encode(tt[v1]) c2.encode(tt[v2]) c1.encode(tt[v3]) - ip -= 4 } case !s.zeroBits: // We do not need to check if any output is 0 bits. - for ip >= 4 { + for ; len(src) >= 4; src = src[:len(src)-4] { s.bw.flush32() - v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] + v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1] c2.encode(tt[v0]) c1.encode(tt[v1]) s.bw.flush32() c2.encode(tt[v2]) c1.encode(tt[v3]) - ip -= 4 } case s.actualTableLog <= 8: // We can encode 4 symbols without requiring a flush - for ip >= 4 { + for ; len(src) >= 4; src = src[:len(src)-4] { s.bw.flush32() - v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] + v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1] c2.encodeZero(tt[v0]) c1.encodeZero(tt[v1]) c2.encodeZero(tt[v2]) c1.encodeZero(tt[v3]) - ip -= 4 } default: - for ip >= 4 { + for ; len(src) >= 4; src = src[:len(src)-4] { s.bw.flush32() - v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1] + v3, v2, v1, v0 := src[len(src)-4], src[len(src)-3], src[len(src)-2], src[len(src)-1] c2.encodeZero(tt[v0]) c1.encodeZero(tt[v1]) s.bw.flush32() c2.encodeZero(tt[v2]) c1.encodeZero(tt[v3]) - ip -= 4 } }