From eca5b78396af8e6b47bda85b717bec3e8737b13f Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 30 Nov 2021 17:29:52 +0100 Subject: [PATCH 1/3] Unroll Huffman only writer 320.36 -> 365.47 MB/s (github-june-2days-2019.json) 237.41 -> 266.14 MB/s (github-ranks-backup.bin) --- flate/huffman_bit_writer.go | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go index 3ad5e98072..327921c81f 100644 --- a/flate/huffman_bit_writer.go +++ b/flate/huffman_bit_writer.go @@ -996,6 +996,37 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { encoding := w.literalEncoding.codes[:256] // Go 1.16 LOVES having these on stack. At least 1.5x the speed. bits, nbits, nbytes := w.bits, w.nbits, w.nbytes + + // Unroll, write 3 codes/loop. + // Fastest number of unrolls. + for len(input) > 3 { + // We must have at least 48 bits free. + if nbits >= 8 { + n := nbits >> 3 + binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) + bits >>= (n * 8) & 63 + nbits -= n * 8 + nbytes += uint8(n) + } + if nbytes >= bufferFlushSize { + if w.err != nil { + nbytes = 0 + return + } + _, w.err = w.writer.Write(w.bytes[:nbytes]) + nbytes = 0 + } + a, b := encoding[input[0]], encoding[input[1]] + bits |= uint64(a.code) << nbits + bits |= uint64(b.code) << (nbits + a.len) + c := encoding[input[2]] + nbits += b.len + a.len + bits |= uint64(c.code) << nbits + nbits += c.len + input = input[3:] + } + + // Remaining... for _, t := range input { // Bitwriting inlined, ~30% speedup c := encoding[t] From b1241e691a2d441511e07509126f3c4165c2bb4d Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 30 Nov 2021 17:36:19 +0100 Subject: [PATCH 2/3] AND when shifting.
--- flate/huffman_bit_writer.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go index 327921c81f..2c64a3cb3c 100644 --- a/flate/huffman_bit_writer.go +++ b/flate/huffman_bit_writer.go @@ -1017,11 +1017,11 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { nbytes = 0 } a, b := encoding[input[0]], encoding[input[1]] - bits |= uint64(a.code) << nbits - bits |= uint64(b.code) << (nbits + a.len) + bits |= uint64(a.code) << (nbits & 63) + bits |= uint64(b.code) << ((nbits + a.len) & 63) c := encoding[input[2]] nbits += b.len + a.len - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len input = input[3:] } @@ -1030,7 +1030,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) { for _, t := range input { // Bitwriting inlined, ~30% speedup c := encoding[t] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if debugDeflate { count += int(c.len) From 9b2b407bd701493e38e045ee48e9833e1862addc Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 30 Nov 2021 17:41:14 +0100 Subject: [PATCH 3/3] AND all variable shifts... --- flate/huffman_bit_writer.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go index 2c64a3cb3c..fda85c2571 100644 --- a/flate/huffman_bit_writer.go +++ b/flate/huffman_bit_writer.go @@ -222,7 +222,7 @@ func (w *huffmanBitWriter) write(b []byte) { } func (w *huffmanBitWriter) writeBits(b int32, nb uint16) { - w.bits |= uint64(b) << w.nbits + w.bits |= uint64(b) << (w.nbits & 63) w.nbits += nb if w.nbits >= 48 { w.writeOutBits() @@ -423,7 +423,7 @@ func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) { func (w *huffmanBitWriter) writeCode(c hcode) { // The function does not get inlined if we "& 63" the shift. 
- w.bits |= uint64(c.code) << w.nbits + w.bits |= uint64(c.code) << (w.nbits & 63) w.nbits += c.len if w.nbits >= 48 { w.writeOutBits() @@ -768,7 +768,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) if t < matchType { //w.writeCode(lits[t.literal()]) c := lits[t.literal()] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -796,7 +796,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } else { // inlined c := lengths[lengthCode&31] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -819,7 +819,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) if extraLengthBits > 0 { //w.writeBits(extraLength, extraLengthBits) extraLength := int32(length - lengthBase[lengthCode&31]) - bits |= uint64(extraLength) << nbits + bits |= uint64(extraLength) << (nbits & 63) nbits += extraLengthBits if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -846,7 +846,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) } else { // inlined c := offs[offsetCode] - bits |= uint64(c.code) << nbits + bits |= uint64(c.code) << (nbits & 63) nbits += c.len if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits) @@ -867,7 +867,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) offsetComb := offsetCombined[offsetCode] if offsetComb > 1<<16 { //w.writeBits(extraOffset, extraOffsetBits) - bits |= uint64(offset&matchOffsetOnlyMask-(offsetComb&0xffff)) << nbits + bits |= uint64(offset&matchOffsetOnlyMask-(offsetComb&0xffff)) << (nbits & 63) nbits += uint16(offsetComb >> 16) if nbits >= 48 { binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)