klauspost · klauspost · Dec 28, 2021 · Dec 21, 2021 · Dec 22, 2021 · Dec 22, 2021
diff --git a/flate/deflate.go b/flate/deflate.go
@@ -41,9 +41,11 @@ const (
 	maxMatchLength   = 258 // The longest match for the compressor
 	minOffsetSize    = 1   // The shortest offset that makes any sense
 
-	// The maximum number of tokens we put into a single flat block, just too
-	// stop things from getting too large.
-	maxFlateBlockTokens = 1 << 14
+	// The maximum number of tokens we will encode at a time.
+	// Smaller sizes usually create less optimal blocks.
+	// Bigger can make context switching slow.
+	// We use this for levels 7-9, so we make it big.
+	maxFlateBlockTokens = 1 << 15
 	maxStoreBlockSize   = 65535
 	hashBits            = 17 // After 17 performance degrades
 	hashSize            = 1 << hashBits
@@ -74,7 +76,7 @@ var levels = []compressionLevel{
 	{0, 0, 0, 0, 0, 6},
 	// Levels 7-9 use increasingly more lazy matching
 	// and increasingly stringent conditions for "good enough".
-	{8, 12, 24, 24, skipNever, 7},
+	{6, 10, 12, 16, skipNever, 7},
 	{10, 24, 32, 64, skipNever, 8},
 	{32, 258, 258, 1024, skipNever, 9},
 }
@@ -175,7 +177,8 @@ func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error {
 			window = d.window[d.blockStart:index]
 		}
 		d.blockStart = index
-		d.w.writeBlock(tok, eof, window)
+		//d.w.writeBlock(tok, eof, window)
+		d.w.writeBlockDynamic(tok, eof, window, d.sync)
 		return d.w.err
 	}
 	return nil
@@ -301,7 +304,7 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead, bpb int) (lengt
 		if wEnd == win[i+length] {
 			n := matchLen(win[i:i+minMatchLook], wPos)
 			if n > length {
-				newGain := n*bpb - bits.Len32(uint32(pos-i)) - 1
+				newGain := n*bpb - bits.Len32(uint32(pos-i))
 				if newGain > cGain {
 					length = n
 					offset = pos - i
@@ -541,20 +544,27 @@ func (d *compressor) deflateLazy() {
 
 				// If we have a long run of no matches, skip additional bytes
 				// Resets when s.ii overflows after 64KB.
-				if s.ii > uint16(d.nice) {
-					n := int(s.ii >> 5)
+				if n := int(s.ii) - d.chain; n > 0 {
+					n = 1 + int(n>>6)
 					for j := 0; j < n; j++ {
 						if s.index >= d.windowEnd-1 {
 							break
 						}
-
 						d.tokens.AddLiteral(d.window[s.index-1])
 						if d.tokens.n == maxFlateBlockTokens {
 							if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
 								return
 							}
 							d.tokens.Reset()
 						}
+						// Index the skipped position in the hash chain.
+						if s.index < s.maxInsertIndex {
+							h := hash4(d.window[s.index:])
+							ch := s.hashHead[h]
+							s.chainHead = int(ch)
+							s.hashPrev[s.index&windowMask] = ch
+							s.hashHead[h] = uint32(s.index + s.hashOffset)
+						}
 						s.index++
 					}
 					// Flush last byte
@@ -697,13 +707,13 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		level = 5
 		fallthrough
 	case level >= 1 && level <= 6:
-		d.w.logNewTablePenalty = 8
+		d.w.logNewTablePenalty = 7
 		d.fast = newFastEnc(level)
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeFast
 	case 7 <= level && level <= 9:
-		d.w.logNewTablePenalty = 10
+		d.w.logNewTablePenalty = 8
 		d.state = &advancedState{}
 		d.compressionLevel = levels[level]
 		d.initDeflate()

diff --git a/flate/flate_test.go b/flate/flate_test.go
@@ -163,7 +163,7 @@ func TestRegressions(t *testing.T) {
 					t.Error(err)
 				}
 				if !bytes.Equal(data1, data2) {
-					fmt.Printf("want:%x\ngot: %x\n", data1, data2)
+					//fmt.Printf("want:%x\ngot: %x\n", data1, data2)
 					t.Error("not equal")
 				}
 			})

diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go
@@ -155,37 +155,33 @@ func (w *huffmanBitWriter) reset(writer io.Writer) {
 	w.lastHuffMan = false
 }
 
-func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) {
-	offsets, lits = true, true
+func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
 	a := t.offHist[:offsetCodeCount]
-	b := w.offsetFreq[:len(a)]
-	for i := range a {
-		if b[i] == 0 && a[i] != 0 {
-			offsets = false
-			break
+	b := w.offsetEncoding.codes
+	b = b[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
 
 	a = t.extraHist[:literalCount-256]
-	b = w.literalFreq[256:literalCount]
+	b = w.literalEncoding.codes[256:literalCount]
 	b = b[:len(a)]
-	for i := range a {
-		if b[i] == 0 && a[i] != 0 {
-			lits = false
-			break
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
-	if lits {
-		a = t.litHist[:]
-		b = w.literalFreq[:len(a)]
-		for i := range a {
-			if b[i] == 0 && a[i] != 0 {
-				lits = false
-				break
-			}
+
+	a = t.litHist[:256]
+	b = w.literalEncoding.codes[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
-	return
+	return true
 }
 
 func (w *huffmanBitWriter) flush() {
@@ -566,7 +562,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 		w.lastHeader = 0
 	}
 	numLiterals, numOffsets := w.indexTokens(tokens, false)
-	w.generate(tokens)
+	w.generate()
 	var extraBits int
 	storedSize, storable := w.storedSize(input)
 	if storable {
@@ -595,7 +591,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 	}
 
 	// Stored bytes?
-	if storable && storedSize < size {
+	if storable && storedSize <= size {
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
@@ -634,22 +630,39 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		w.lastHeader = 0
 		w.lastHuffMan = false
 	}
-	if !sync {
-		tokens.Fill()
+
+	// fillReuse enables filling of empty values.
+	// This will make encodings always reusable without testing.
+	// However, this does not appear to be beneficial in most cases.
+	const fillReuse = false
+
+	// Check if we can reuse...
+	if !fillReuse && w.lastHeader > 0 && !w.canReuse(tokens) {
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
 	}
+
 	numLiterals, numOffsets := w.indexTokens(tokens, !sync)
+	extraBits := 0
+	ssize, storable := w.storedSize(input)
+
+	const usePrefs = true
+	if storable || w.lastHeader > 0 {
+		extraBits = w.extraBitSize()
+	}
 
 	var size int
+
 	// Check if we should reuse.
 	if w.lastHeader > 0 {
 		// Estimate size for using a new table.
 		// Use the previous header size as the best estimate.
 		newSize := w.lastHeader + tokens.EstimatedBits()
-		newSize += newSize >> w.logNewTablePenalty
+		newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty
 
 		// The estimated size is calculated as an optimal table.
 		// We add a penalty to make it more realistic and re-use a bit more.
-		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + w.extraBitSize()
+		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits
 
 		// Check if a new table is better.
 		if newSize < reuseSize {
@@ -660,35 +673,79 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		} else {
 			size = reuseSize
 		}
+
+		if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size {
+			// Check if we get a reasonable size decrease.
+			if storable && ssize <= size {
+				w.writeStoredHeader(len(input), eof)
+				w.writeBytes(input)
+				return
+			}
+			w.writeFixedHeader(eof)
+			if !sync {
+				tokens.AddEOB()
+			}
+			w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+			return
+		}
 		// Check if we get a reasonable size decrease.
-		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		if storable && ssize <= size {
 			w.writeStoredHeader(len(input), eof)
 			w.writeBytes(input)
-			w.lastHeader = 0
 			return
 		}
 	}
 
 	// We want a new block/table
 	if w.lastHeader == 0 {
-		w.generate(tokens)
+		if fillReuse && !sync {
+			w.fillTokens()
+			numLiterals, numOffsets = maxNumLit, maxNumDist
+		} else {
+			w.literalFreq[endBlockMarker] = 1
+		}
+
+		w.generate()
 		// Generate codegen and codegenFrequencies, which indicates how to encode
 		// the literalEncoding and the offsetEncoding.
 		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
 		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+
 		var numCodegens int
-		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize())
-		// Store bytes, if we don't get a reasonable improvement.
-		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		if fillReuse && !sync {
+			// Reindex for accurate size...
+			w.indexTokens(tokens, true)
+		}
+		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
+
+		// Store predefined, if we don't get a reasonable improvement.
+		if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size {
+			// Store bytes, if we don't get an improvement.
+			if storable && ssize <= preSize {
+				w.writeStoredHeader(len(input), eof)
+				w.writeBytes(input)
+				return
+			}
+			w.writeFixedHeader(eof)
+			if !sync {
+				tokens.AddEOB()
+			}
+			w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+			return
+		}
+
+		if storable && ssize <= size {
+			// Store bytes, if we don't get an improvement.
 			w.writeStoredHeader(len(input), eof)
 			w.writeBytes(input)
-			w.lastHeader = 0
 			return
 		}
 
 		// Write Huffman table.
 		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
-		w.lastHeader, _ = w.headerSize()
+		if !sync {
+			w.lastHeader, _ = w.headerSize()
+		}
 		w.lastHuffMan = false
 	}
 
@@ -699,6 +756,19 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 	w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes)
 }
 
+func (w *huffmanBitWriter) fillTokens() {
+	for i, v := range w.literalFreq[:literalCount] {
+		if v == 0 {
+			w.literalFreq[i] = 1
+		}
+	}
+	for i, v := range w.offsetFreq[:offsetCodeCount] {
+		if v == 0 {
+			w.offsetFreq[i] = 1
+		}
+	}
+}
+
 // indexTokens indexes a slice of tokens, and updates
 // literalFreq and offsetFreq, and generates literalEncoding
 // and offsetEncoding.
@@ -733,7 +803,7 @@ func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, num
 	return
 }
 
-func (w *huffmanBitWriter) generate(t *tokens) {
+func (w *huffmanBitWriter) generate() {
 	w.literalEncoding.generate(w.literalFreq[:literalCount], 15)
 	w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15)
 }
@@ -867,7 +937,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		offsetComb := offsetCombined[offsetCode]
 		if offsetComb > 1<<16 {
 			//w.writeBits(extraOffset, extraOffsetBits)
-			bits |= uint64(offset&matchOffsetOnlyMask-(offsetComb&0xffff)) << (nbits & 63)
+			bits |= uint64(offset-(offsetComb&0xffff)) << (nbits & 63)
 			nbits += uint16(offsetComb >> 16)
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)

diff --git a/flate/inflate.go b/flate/inflate.go
@@ -328,11 +328,17 @@ func (f *decompressor) nextBlock() {
 	switch typ {
 	case 0:
 		f.dataBlock()
+		if debugDecode {
+			fmt.Println("stored block")
+		}
 	case 1:
 		// compressed, fixed Huffman tables
 		f.hl = &fixedHuffmanDecoder
 		f.hd = nil
 		f.huffmanBlockDecoder()()
+		if debugDecode {
+			fmt.Println("predefinied huffman block")
+		}
 	case 2:
 		// compressed, dynamic Huffman tables
 		if f.err = f.readHuffman(); f.err != nil {
@@ -341,6 +347,9 @@ func (f *decompressor) nextBlock() {
 		f.hl = &f.h1
 		f.hd = &f.h2
 		f.huffmanBlockDecoder()()
+		if debugDecode {
+			fmt.Println("dynamic huffman block")
+		}
 	default:
 		// 3 is reserved.
 		if debugDecode {

diff --git a/flate/testdata/huffman-null-max.dyn.expect b/flate/testdata/huffman-null-max.dyn.expect
diff --git a/flate/testdata/huffman-null-max.dyn.expect-noinput b/flate/testdata/huffman-null-max.dyn.expect-noinput
diff --git a/flate/testdata/huffman-pi.dyn.expect b/flate/testdata/huffman-pi.dyn.expect
diff --git a/flate/testdata/huffman-pi.dyn.expect-noinput b/flate/testdata/huffman-pi.dyn.expect-noinput
diff --git a/flate/testdata/huffman-rand-1k.dyn.expect-noinput b/flate/testdata/huffman-rand-1k.dyn.expect-noinput
diff --git a/flate/testdata/huffman-rand-limit.dyn.expect b/flate/testdata/huffman-rand-limit.dyn.expect
diff --git a/flate/testdata/huffman-rand-limit.dyn.expect-noinput b/flate/testdata/huffman-rand-limit.dyn.expect-noinput
diff --git a/flate/testdata/huffman-rand-limit.sync.expect b/flate/testdata/huffman-rand-limit.sync.expect
diff --git a/flate/testdata/huffman-rand-limit.sync.expect-noinput b/flate/testdata/huffman-rand-limit.sync.expect-noinput
diff --git a/flate/testdata/huffman-shifts.dyn.expect b/flate/testdata/huffman-shifts.dyn.expect
diff --git a/flate/testdata/huffman-shifts.dyn.expect-noinput b/flate/testdata/huffman-shifts.dyn.expect-noinput
diff --git a/flate/testdata/huffman-text-shift.dyn.expect b/flate/testdata/huffman-text-shift.dyn.expect
diff --git a/flate/testdata/huffman-text-shift.dyn.expect-noinput b/flate/testdata/huffman-text-shift.dyn.expect-noinput
@@ -1,2 +1 @@
-�`�@�����5R|@ו1C�ᚄ4ϒ��|ʂ���������������.��zgEN�L������E#2¬EQ<��D��8.IDHÂ�D�@.E^������� @"�Ҡ ����`�M
-KS4��*�n%P�1n��AA�`�OS^.���a�JUx�x�2�s��4�%yW��X+&F�$I�&�)�Igd<l9��7�TC���Y�mE�+T"�d��e�!�eˇ���1闍Ș�+�<
+��J�0���!�<(l�P*(�:�s�覂�Д4aI�%|SV��xO&�U>�7�C�qM�u��d29ߨ�x�s�޷�$�� Qi^�t�wU�͊�;���C����C�Zf��ȥ6�J���v�Զ�Xdp�j�(����]��^v8:K��dH�@�>.��3�SAJ��.3�{�;��5F吒oJ�Y6ϯ�˛��������l�_���?�����8�d

diff --git a/flate/testdata/huffman-text.dyn.expect b/flate/testdata/huffman-text.dyn.expect
diff --git a/flate/testdata/huffman-text.dyn.expect-noinput b/flate/testdata/huffman-text.dyn.expect-noinput
@@ -1,3 +1,4 @@
-�`�J�|�ஏb���F��=M/MX�+�K������������ˊ�;��޹���`�.�&;$
-���A	A�:��F8T�	h� ͍�˘�P� �"PI&@��	lG p`7�Td�x���D�GA^k�, � �OA�U�!���AV�J��QV�2,��ށ���j(,;]X�`��
-��*xqF_��2>n^��A��Um�� �Œ���2>�T���	g�O�� ���U��+�����d��5ʕ�d��6_�i�2�
+���J�0���r=�`K��2Aasē)�H�����Iɟb�]�k��y�{h�0E{6�6�[��c��db�;��"%�#u����["�llB
+%*�
+�&��H�S�v����a���h�9��'B62CI���C��G6�����t���g�R]��K�m�!Č*�ꚺx5[��g�QF�ء��?>�)�
+7���풳�^�w;�$�d��2E�^��/έ{�-���x�6S�.9���

diff --git a/flate/testdata/huffman-zero.dyn.expect b/flate/testdata/huffman-zero.dyn.expect
diff --git a/flate/testdata/huffman-zero.dyn.expect-noinput b/flate/testdata/huffman-zero.dyn.expect-noinput
diff --git a/flate/testdata/huffman-zero.sync.expect b/flate/testdata/huffman-zero.sync.expect
diff --git a/flate/testdata/huffman-zero.sync.expect-noinput b/flate/testdata/huffman-zero.sync.expect-noinput
diff --git a/flate/testdata/null-long-match.dyn.expect-noinput b/flate/testdata/null-long-match.dyn.expect-noinput