From a4773d2d37a07bbe04b14f997d386b4fc9f69d80 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 30 Nov 2021 09:08:33 -0800 Subject: [PATCH] deflate: Speed up huffman encoding (#455) Huffman only: 320.36 -> 414.67MB/s (github-june-2days-2019.json) 237.41-> 293.24 MB/s (github-ranks-backup.bin) Level 1: 278.94 -> 308.47 MB/s (github-june-2days-2019.json) 232.93 -> 261.16MB/s (github-ranks-backup.bin) --- flate/huffman_bit_writer.go | 47 ++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go index 3ad5e98072..fda85c2571 100644 --- a/flate/huffman_bit_writer.go +++ b/flate/huffman_bit_writer.go @@ -222,7 +222,7 @@ func (w *huffmanBitWriter) write(b []byte) { } func (w *huffmanBitWriter) writeBits(b int32, nb uint16) { - w.bits |= uint64(b) << w.nbits + w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { w.writeOutBits() @@ -423,7 +423,7 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { func (w *huffmanBitWriter) writeCode(c hcode) { // The function does not get inlined if we "& 63" the shift. - w.bits |= uint64(c.code) << w.nbits + w.bits |= uint64(c.code) << (w.nbits & 63) w.nbits += c.len if w.nbits >= 48 { w.writeOutBits() @@ -768,7 +768,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) if t < matchType { //w.writeCode(lits[t.literal()]) c := lits[t.literal()] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -796,7 +796,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } else { // inlined c := lengths[lengthCode&31] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -819,7 +819,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) if extraLengthBits > 0 { //w.writeBits(extraLength, extraLengthBits) extraLength := int32(length - lengthBase[lengthCode&31]) - bits |= uint64(extraLength) << nbits + bits |= uint64(extraLength) << (nbits & 63) nbits += extraLengthBits if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -846,7 +846,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } else { // inlined c := offs[offsetCode] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -867,7 +867,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) offsetComb := offsetCombined[offsetCode] if offsetComb > 1<<16 { //w.writeBits(extraOffset, extraOffsetBits) - bits |= uint64(offset&matchOffsetOnlyMask-(offsetComb&0xffff)) << nbits + bits |= uint64(offset&matchOffsetOnlyMask-(offsetComb&0xffff)) << (nbits & 63) nbits += uint16(offsetComb >> 16) if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -996,10 +996,41 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { encoding := w.literalEncoding.codes[:256] // Go 1.16 LOVES having these on stack. At least 1.5x the speed. bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + + // Unroll, write 3 codes/loop. + // Fastest number of unrolls. + for len(input) > 3 { + // We must have at least 48 bits free. + if nbits >= 8 { + n := nbits >> 3 + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + bits >>= (n * 8) & 63 + nbits -= n * 8 + nbytes += uint8(n) + } + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + a, b := encoding[input[0]], encoding[input[1]] + bits |= uint64(a.code) << (nbits & 63) + bits |= uint64(b.code) << ((nbits + a.len) & 63) + c := encoding[input[2]] + nbits += b.len + a.len + bits |= uint64(c.code) << (nbits & 63) + nbits += c.len + input = input[3:] + } + + // Remaining... for _, t := range input { // Bitwriting inlined, ~30% speedup c := encoding[t] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if debugDeflate { count += int(c.len)