Skip to content

Commit

Permalink
huff0: Improve 4X decompression speed 5-10% (#437)
Browse files: browse the repository at this point in the history.
Improve Huffman 4X decompression speed when the table log is <= 8.

```
λ benchcmp before.txt after.txt
benchmark                                            old ns/op     new ns/op     delta
BenchmarkDecompress4XNoTable/digits-32               167490        158439        -5.40%
BenchmarkDecompress4XNoTable/gettysburg-32           2762          2782          +0.72%
BenchmarkDecompress4XNoTable/twain-32                578974        584448        +0.95%
BenchmarkDecompress4XNoTable/low-ent.10k-32          57714         54112         -6.24%
BenchmarkDecompress4XNoTable/superlow-ent-10k-32     15440         14349         -7.07%
BenchmarkDecompress4XNoTable/case1-32                232           215           -7.28%
BenchmarkDecompress4XNoTable/case2-32                181           172           -4.97%
BenchmarkDecompress4XNoTable/case3-32                187           178           -5.28%
BenchmarkDecompress4XNoTable/pngdata.001-32          78498         79716         +1.55%
BenchmarkDecompress4XNoTable/normcount2-32           299           270           -9.60%
BenchmarkDecompress4XTable/digits-32                 167728        158709        -5.38%
BenchmarkDecompress4XTable/gettysburg-32             3993          3956          -0.93%
BenchmarkDecompress4XTable/twain-32                  586985        584482        -0.43%
BenchmarkDecompress4XTable/low-ent.10k-32            58317         54652         -6.28%
BenchmarkDecompress4XTable/superlow-ent-10k-32       15895         14903         -6.24%
BenchmarkDecompress4XTable/case1-32                  2030          1996          -1.67%
BenchmarkDecompress4XTable/case2-32                  1986          1956          -1.51%
BenchmarkDecompress4XTable/case3-32                  2004          1980          -1.20%
BenchmarkDecompress4XTable/pngdata.001-32            81922         81627         -0.36%
BenchmarkDecompress4XTable/normcount2-32             1351          1339          -0.89%
```
  • Loading branch information
klauspost authored Jan 8, 2022
1 parent dff5f6b commit f59d5b1
Show file tree
Hide file tree
Showing 2 changed files with 236 additions and 172 deletions.
206 changes: 113 additions & 93 deletions huff0/decompress.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ type dEntrySingle struct {

// double-symbols decoding
type dEntryDouble struct {
seq uint16
seq [4]byte
nBits uint8
len uint8
}
Expand Down Expand Up @@ -914,7 +914,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
out := dst
dstEvery := (dstSize + 3) / 4

shift := (8 - d.actualTableLog) & 7
shift := (56 + (8 - d.actualTableLog)) & 63

const tlSize = 1 << 8
single := d.dt.single[:tlSize]
Expand All @@ -935,79 +935,91 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
// Interleave 2 decodes.
const stream = 0
const stream2 = 1
br[stream].fillFast()
br[stream2].fillFast()

v := single[br[stream].peekByteFast()>>shift].entry
br1 := &br[stream]
br2 := &br[stream2]
br1.fillFast()
br2.fillFast()

v := single[uint8(br1.value>>shift)].entry
v2 := single[uint8(br2.value>>shift)].entry
br1.bitsRead += uint8(v)
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
buf[off+bufoff*stream] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 := single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
br1.bitsRead += uint8(v)
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
buf[off+bufoff*stream+1] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
br1.bitsRead += uint8(v)
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
buf[off+bufoff*stream+2] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
buf[off+bufoff*stream+3] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
br1.bitsRead += uint8(v)
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
buf[off+bufoff*stream+3] = uint8(v >> 8)
}

{
const stream = 2
const stream2 = 3
br[stream].fillFast()
br[stream2].fillFast()

v := single[br[stream].peekByteFast()>>shift].entry
br1 := &br[stream]
br2 := &br[stream2]
br1.fillFast()
br2.fillFast()

v := single[uint8(br1.value>>shift)].entry
v2 := single[uint8(br2.value>>shift)].entry
br1.bitsRead += uint8(v)
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
buf[off+bufoff*stream] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 := single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
br1.bitsRead += uint8(v)
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
buf[off+bufoff*stream+1] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
br1.bitsRead += uint8(v)
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
buf[off+bufoff*stream+2] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
buf[off+bufoff*stream+3] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
v = single[uint8(br1.value>>shift)].entry
v2 = single[uint8(br2.value>>shift)].entry
br1.bitsRead += uint8(v)
br1.value <<= v & 63
br2.bitsRead += uint8(v2)
br2.value <<= v2 & 63
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
buf[off+bufoff*stream+3] = uint8(v >> 8)
}

off += 4
Expand Down Expand Up @@ -1073,7 +1085,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
}

// Read value and increment offset.
v := single[br.peekByteFast()>>shift].entry
v := single[uint8(br.value>>shift)].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= int(nBits)
Expand Down Expand Up @@ -1121,7 +1133,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
out := dst
dstEvery := (dstSize + 3) / 4

const shift = 0
const shift = 56
const tlSize = 1 << 8
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
Expand All @@ -1145,37 +1157,41 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
br[stream].fillFast()
br[stream2].fillFast()

v := single[br[stream].peekByteFast()>>shift].entry
v := single[uint8(br[stream].value>>shift)].entry
v2 := single[uint8(br[stream2].value>>shift)].entry
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
buf[off+bufoff*stream] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 := single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
buf[off+bufoff*stream+1] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
buf[off+bufoff*stream+2] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
buf[off+bufoff*stream+3] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
}

{
Expand All @@ -1184,37 +1200,41 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
br[stream].fillFast()
br[stream2].fillFast()

v := single[br[stream].peekByteFast()>>shift].entry
v := single[uint8(br[stream].value>>shift)].entry
v2 := single[uint8(br[stream2].value>>shift)].entry
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
buf[off+bufoff*stream] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 := single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
buf[off+bufoff*stream+1] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
buf[off+bufoff*stream+2] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
buf[off+bufoff*stream+3] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
}

off += 4
Expand Down Expand Up @@ -1280,7 +1300,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
}

// Read value and increment offset.
v := single[br.peekByteFast()>>shift].entry
v := single[br.peekByteFast()].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= int(nBits)
Expand Down
Loading

0 comments on commit f59d5b1

Please sign in to comment.