diff --git a/flate/deflate.go b/flate/deflate.go
index 18ec6bdd4e..b27f5a93bc 100644
--- a/flate/deflate.go
+++ b/flate/deflate.go
@@ -41,9 +41,11 @@ const (
 	maxMatchLength   = 258 // The longest match for the compressor
 	minOffsetSize    = 1   // The shortest offset that makes any sense
 
-	// The maximum number of tokens we put into a single flat block, just too
-	// stop things from getting too large.
-	maxFlateBlockTokens = 1 << 14
+	// The maximum number of tokens we will encode at a time.
+	// Smaller sizes usually create less optimal blocks.
+	// Bigger can make context switching slow.
+	// We use this for levels 7-9, so we make it big.
+	maxFlateBlockTokens = 1 << 15
 	maxStoreBlockSize   = 65535
 	hashBits            = 17 // After 17 performance degrades
 	hashSize            = 1 << hashBits
@@ -74,7 +76,7 @@ var levels = []compressionLevel{
 	{0, 0, 0, 0, 0, 6},
 	// Levels 7-9 use increasingly more lazy matching
 	// and increasingly stringent conditions for "good enough".
-	{8, 12, 24, 24, skipNever, 7},
+	{6, 10, 12, 16, skipNever, 7},
 	{10, 24, 32, 64, skipNever, 8},
 	{32, 258, 258, 1024, skipNever, 9},
 }
@@ -175,7 +177,8 @@ func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error {
 			window = d.window[d.blockStart:index]
 		}
 		d.blockStart = index
-		d.w.writeBlock(tok, eof, window)
+		//d.w.writeBlock(tok, eof, window)
+		d.w.writeBlockDynamic(tok, eof, window, d.sync)
 		return d.w.err
 	}
 	return nil
@@ -301,7 +304,7 @@ func (d *compressor) findMatch(pos int, prevHead int, lookahead, bpb int) (lengt
 		if wEnd == win[i+length] {
 			n := matchLen(win[i:i+minMatchLook], wPos)
 			if n > length {
-				newGain := n*bpb - bits.Len32(uint32(pos-i)) - 1
+				newGain := n*bpb - bits.Len32(uint32(pos-i))
 				if newGain > cGain {
 					length = n
 					offset = pos - i
@@ -541,13 +544,12 @@ func (d *compressor) deflateLazy() {
 
 				// If we have a long run of no matches, skip additional bytes
 				// Resets when s.ii overflows after 64KB.
-				if s.ii > uint16(d.nice) {
-					n := int(s.ii >> 5)
+				if n := int(s.ii) - d.chain; n > 0 {
+					n = 1 + int(n>>6)
 					for j := 0; j < n; j++ {
 						if s.index >= d.windowEnd-1 {
 							break
 						}
-
 						d.tokens.AddLiteral(d.window[s.index-1])
 						if d.tokens.n == maxFlateBlockTokens {
 							if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
@@ -555,6 +557,14 @@ func (d *compressor) deflateLazy() {
 							}
 							d.tokens.Reset()
 						}
+						// Insert the current position into the hash tables.
+						if s.index < s.maxInsertIndex {
+							h := hash4(d.window[s.index:])
+							ch := s.hashHead[h]
+							s.chainHead = int(ch)
+							s.hashPrev[s.index&windowMask] = ch
+							s.hashHead[h] = uint32(s.index + s.hashOffset)
+						}
 						s.index++
 					}
 					// Flush last byte
@@ -697,13 +707,13 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		level = 5
 		fallthrough
 	case level >= 1 && level <= 6:
-		d.w.logNewTablePenalty = 8
+		d.w.logNewTablePenalty = 7
 		d.fast = newFastEnc(level)
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeFast
 	case 7 <= level && level <= 9:
-		d.w.logNewTablePenalty = 10
+		d.w.logNewTablePenalty = 8
 		d.state = &advancedState{}
 		d.compressionLevel = levels[level]
 		d.initDeflate()
diff --git a/flate/flate_test.go b/flate/flate_test.go
index 648d9796b8..182909ab0e 100644
--- a/flate/flate_test.go
+++ b/flate/flate_test.go
@@ -163,7 +163,7 @@ func TestRegressions(t *testing.T) {
 					t.Error(err)
 				}
 				if !bytes.Equal(data1, data2) {
-					fmt.Printf("want:%x\ngot: %x\n", data1, data2)
+					//fmt.Printf("want:%x\ngot: %x\n", data1, data2)
 					t.Error("not equal")
 				}
 			})
diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go
index fda85c2571..fb1701eecc 100644
--- a/flate/huffman_bit_writer.go
+++ b/flate/huffman_bit_writer.go
@@ -155,37 +155,33 @@ func (w *huffmanBitWriter) reset(writer io.Writer) {
 	w.lastHuffMan = false
 }
 
-func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) {
-	offsets, lits = true, true
+func (w *huffmanBitWriter) canReuse(t *tokens) (ok bool) {
 	a := t.offHist[:offsetCodeCount]
-	b := w.offsetFreq[:len(a)]
-	for i := range a {
-		if b[i] == 0 && a[i] != 0 {
-			offsets = false
-			break
+	b := w.offsetEncoding.codes
+	b = b[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
 
 	a = t.extraHist[:literalCount-256]
-	b = w.literalFreq[256:literalCount]
+	b = w.literalEncoding.codes[256:literalCount]
 	b = b[:len(a)]
-	for i := range a {
-		if b[i] == 0 && a[i] != 0 {
-			lits = false
-			break
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
-	if lits {
-		a = t.litHist[:]
-		b = w.literalFreq[:len(a)]
-		for i := range a {
-			if b[i] == 0 && a[i] != 0 {
-				lits = false
-				break
-			}
+
+	a = t.litHist[:256]
+	b = w.literalEncoding.codes[:len(a)]
+	for i, v := range a {
+		if v != 0 && b[i].len == 0 {
+			return false
 		}
 	}
-	return
+	return true
 }
 
 func (w *huffmanBitWriter) flush() {
@@ -566,7 +562,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 		w.lastHeader = 0
 	}
 	numLiterals, numOffsets := w.indexTokens(tokens, false)
-	w.generate(tokens)
+	w.generate()
 	var extraBits int
 	storedSize, storable := w.storedSize(input)
 	if storable {
@@ -595,7 +591,7 @@ func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
 	}
 
 	// Stored bytes?
-	if storable && storedSize < size {
+	if storable && storedSize <= size {
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
@@ -634,22 +630,39 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		w.lastHeader = 0
 		w.lastHuffMan = false
 	}
-	if !sync {
-		tokens.Fill()
+
+	// fillReuse enables filling of empty values.
+	// This will make encodings always reusable without testing.
+	// However, this does not appear to be beneficial in most cases.
+	const fillReuse = false
+
+	// Check if we can reuse...
+	if !fillReuse && w.lastHeader > 0 && !w.canReuse(tokens) {
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
 	}
+
 	numLiterals, numOffsets := w.indexTokens(tokens, !sync)
+	extraBits := 0
+	ssize, storable := w.storedSize(input)
+
+	const usePrefs = true
+	if storable || w.lastHeader > 0 {
+		extraBits = w.extraBitSize()
+	}
 
 	var size int
+
 	// Check if we should reuse.
 	if w.lastHeader > 0 {
 		// Estimate size for using a new table.
 		// Use the previous header size as the best estimate.
 		newSize := w.lastHeader + tokens.EstimatedBits()
-		newSize += newSize >> w.logNewTablePenalty
+		newSize += int(w.literalEncoding.codes[endBlockMarker].len) + newSize>>w.logNewTablePenalty
 
 		// The estimated size is calculated as an optimal table.
 		// We add a penalty to make it more realistic and re-use a bit more.
-		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + w.extraBitSize()
+		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + extraBits
 
 		// Check if a new table is better.
 		if newSize < reuseSize {
@@ -660,35 +673,79 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		} else {
 			size = reuseSize
 		}
+
+		if preSize := w.fixedSize(extraBits) + 7; usePrefs && preSize < size {
+			// Check if we get a reasonable size decrease.
+			if storable && ssize <= size {
+				w.writeStoredHeader(len(input), eof)
+				w.writeBytes(input)
+				return
+			}
+			w.writeFixedHeader(eof)
+			if !sync {
+				tokens.AddEOB()
+			}
+			w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+			return
+		}
 		// Check if we get a reasonable size decrease.
-		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		if storable && ssize <= size {
 			w.writeStoredHeader(len(input), eof)
 			w.writeBytes(input)
-			w.lastHeader = 0
 			return
 		}
 	}
 
 	// We want a new block/table
 	if w.lastHeader == 0 {
-		w.generate(tokens)
+		if fillReuse && !sync {
+			w.fillTokens()
+			numLiterals, numOffsets = maxNumLit, maxNumDist
+		} else {
+			w.literalFreq[endBlockMarker] = 1
+		}
+
+		w.generate()
 		// Generate codegen and codegenFrequencies, which indicates how to encode
 		// the literalEncoding and the offsetEncoding.
 		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
 		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+
 		var numCodegens int
-		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize())
-		// Store bytes, if we don't get a reasonable improvement.
-		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+		if fillReuse && !sync {
+			// Reindex for accurate size...
+			w.indexTokens(tokens, true)
+		}
+		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
+
+		// Store predefined, if we don't get a reasonable improvement.
+		if preSize := w.fixedSize(extraBits); usePrefs && preSize <= size {
+			// Store bytes, if we don't get an improvement.
+			if storable && ssize <= preSize {
+				w.writeStoredHeader(len(input), eof)
+				w.writeBytes(input)
+				return
+			}
+			w.writeFixedHeader(eof)
+			if !sync {
+				tokens.AddEOB()
+			}
+			w.writeTokens(tokens.Slice(), fixedLiteralEncoding.codes, fixedOffsetEncoding.codes)
+			return
+		}
+
+		if storable && ssize <= size {
+			// Store bytes, if we don't get an improvement.
 			w.writeStoredHeader(len(input), eof)
 			w.writeBytes(input)
-			w.lastHeader = 0
 			return
 		}
 
 		// Write Huffman table.
 		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
-		w.lastHeader, _ = w.headerSize()
+		if !sync {
+			w.lastHeader, _ = w.headerSize()
+		}
 		w.lastHuffMan = false
 	}
 
@@ -699,6 +756,19 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 	w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes)
 }
 
+func (w *huffmanBitWriter) fillTokens() {
+	for i, v := range w.literalFreq[:literalCount] {
+		if v == 0 {
+			w.literalFreq[i] = 1
+		}
+	}
+	for i, v := range w.offsetFreq[:offsetCodeCount] {
+		if v == 0 {
+			w.offsetFreq[i] = 1
+		}
+	}
+}
+
 // indexTokens indexes a slice of tokens, and updates
 // literalFreq and offsetFreq, and generates literalEncoding
 // and offsetEncoding.
@@ -733,7 +803,7 @@ func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, num
 	return
 }
 
-func (w *huffmanBitWriter) generate(t *tokens) {
+func (w *huffmanBitWriter) generate() {
 	w.literalEncoding.generate(w.literalFreq[:literalCount], 15)
 	w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15)
 }
@@ -867,7 +937,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		offsetComb := offsetCombined[offsetCode]
 		if offsetComb > 1<<16 {
 			//w.writeBits(extraOffset, extraOffsetBits)
-			bits |= uint64(offset&matchOffsetOnlyMask-(offsetComb&0xffff)) << (nbits & 63)
+			bits |= uint64(offset-(offsetComb&0xffff)) << (nbits & 63)
 			nbits += uint16(offsetComb >> 16)
 			if nbits >= 48 {
 				binary.LittleEndian.PutUint64(w.bytes[nbytes:], bits)
diff --git a/flate/inflate.go b/flate/inflate.go
index d1edb356c4..d5f62f6a2c 100644
--- a/flate/inflate.go
+++ b/flate/inflate.go
@@ -328,11 +328,17 @@ func (f *decompressor) nextBlock() {
 	switch typ {
 	case 0:
 		f.dataBlock()
+		if debugDecode {
+			fmt.Println("stored block")
+		}
 	case 1:
 		// compressed, fixed Huffman tables
 		f.hl = &fixedHuffmanDecoder
 		f.hd = nil
 		f.huffmanBlockDecoder()()
+		if debugDecode {
+			fmt.Println("predefinied huffman block")
+		}
 	case 2:
 		// compressed, dynamic Huffman tables
 		if f.err = f.readHuffman(); f.err != nil {
@@ -341,6 +347,9 @@ func (f *decompressor) nextBlock() {
 		f.hl = &f.h1
 		f.hd = &f.h2
 		f.huffmanBlockDecoder()()
+		if debugDecode {
+			fmt.Println("dynamic huffman block")
+		}
 	default:
 		// 3 is reserved.
 		if debugDecode {
diff --git a/flate/testdata/huffman-null-max.dyn.expect b/flate/testdata/huffman-null-max.dyn.expect
index 0a3c71ceb3..f4e27a8146 100644
Binary files a/flate/testdata/huffman-null-max.dyn.expect and b/flate/testdata/huffman-null-max.dyn.expect differ
diff --git a/flate/testdata/huffman-null-max.dyn.expect-noinput b/flate/testdata/huffman-null-max.dyn.expect-noinput
index 0a3c71ceb3..f4e27a8146 100644
Binary files a/flate/testdata/huffman-null-max.dyn.expect-noinput and b/flate/testdata/huffman-null-max.dyn.expect-noinput differ
diff --git a/flate/testdata/huffman-pi.dyn.expect b/flate/testdata/huffman-pi.dyn.expect
index 11756feafb..66c76ceb23 100644
Binary files a/flate/testdata/huffman-pi.dyn.expect and b/flate/testdata/huffman-pi.dyn.expect differ
diff --git a/flate/testdata/huffman-pi.dyn.expect-noinput b/flate/testdata/huffman-pi.dyn.expect-noinput
index 11756feafb..66c76ceb23 100644
Binary files a/flate/testdata/huffman-pi.dyn.expect-noinput and b/flate/testdata/huffman-pi.dyn.expect-noinput differ
diff --git a/flate/testdata/huffman-rand-1k.dyn.expect-noinput b/flate/testdata/huffman-rand-1k.dyn.expect-noinput
index 5162399686..e45583ee31 100644
Binary files a/flate/testdata/huffman-rand-1k.dyn.expect-noinput and b/flate/testdata/huffman-rand-1k.dyn.expect-noinput differ
diff --git a/flate/testdata/huffman-rand-limit.dyn.expect b/flate/testdata/huffman-rand-limit.dyn.expect
index 57e59322e9..881e59c9ab 100644
Binary files a/flate/testdata/huffman-rand-limit.dyn.expect and b/flate/testdata/huffman-rand-limit.dyn.expect differ
diff --git a/flate/testdata/huffman-rand-limit.dyn.expect-noinput b/flate/testdata/huffman-rand-limit.dyn.expect-noinput
index 008b9afee9..881e59c9ab 100644
Binary files a/flate/testdata/huffman-rand-limit.dyn.expect-noinput and b/flate/testdata/huffman-rand-limit.dyn.expect-noinput differ
diff --git a/flate/testdata/huffman-rand-limit.sync.expect b/flate/testdata/huffman-rand-limit.sync.expect
index 2d6527934e..881e59c9ab 100644
Binary files a/flate/testdata/huffman-rand-limit.sync.expect and b/flate/testdata/huffman-rand-limit.sync.expect differ
diff --git a/flate/testdata/huffman-rand-limit.sync.expect-noinput b/flate/testdata/huffman-rand-limit.sync.expect-noinput
index 2d6527934e..881e59c9ab 100644
Binary files a/flate/testdata/huffman-rand-limit.sync.expect-noinput and b/flate/testdata/huffman-rand-limit.sync.expect-noinput differ
diff --git a/flate/testdata/huffman-shifts.dyn.expect b/flate/testdata/huffman-shifts.dyn.expect
index 2f4fd17add..9ad731f3cf 100644
Binary files a/flate/testdata/huffman-shifts.dyn.expect and b/flate/testdata/huffman-shifts.dyn.expect differ
diff --git a/flate/testdata/huffman-shifts.dyn.expect-noinput b/flate/testdata/huffman-shifts.dyn.expect-noinput
index 2f4fd17add..9ad731f3cf 100644
Binary files a/flate/testdata/huffman-shifts.dyn.expect-noinput and b/flate/testdata/huffman-shifts.dyn.expect-noinput differ
diff --git a/flate/testdata/huffman-text-shift.dyn.expect b/flate/testdata/huffman-text-shift.dyn.expect
index 3a4dcc4cab..486bdf6f69 100644
Binary files a/flate/testdata/huffman-text-shift.dyn.expect and b/flate/testdata/huffman-text-shift.dyn.expect differ
diff --git a/flate/testdata/huffman-text-shift.dyn.expect-noinput b/flate/testdata/huffman-text-shift.dyn.expect-noinput
index 29788aa0a8..486bdf6f69 100644
--- a/flate/testdata/huffman-text-shift.dyn.expect-noinput
+++ b/flate/testdata/huffman-text-shift.dyn.expect-noinput
@@ -1,2 +1 @@
-ì`Ó@‡‚»¬…5R|@×•1Cºáš„4Ï’ÿò|Ê‚»»»»»»»»»»»´¸»ó.÷îzgEN‡L²ûøä‘E#2Â¬EQ<Æ»D–¦8.IDHÃ‚€Dà@.E^÷³û›ˆ®¸ @"òÒ  ‘ƒ­¶`»M
-KS4åÂ*€n%P1n¤AAÐ`¬OS^.Ìûªa†JUxxç2›s˜4å%yWËç‹X+&FÖ$I…&¨)ŠIgd<l9ˆ7ÒTCš²øYýmEŠ+T"¸d©Òeì!¡eË‡ëõÒ1é—È˜‰+Ä<
\ No newline at end of file
+ìÝJó0Çñ¿È!ž<(léP*(éº:§sþè¦‚ˆÐ”4aI¾%|SVÅÛxO&ÝU>Ÿ7¯CÄqMçu­ðd29ß¨¤xžsÈÞ·§$Ž· Qi^¨t­wUÙÍŠù;Ó£…C…Â•‚«´CµZfùºÈ¥6œJ±«­výÔ¶XdpÂj¿(´¾œÚ]ôö^v8:K’ÓdHÉ@Ž>.¦À3SAJëÆ.3¶{®;þâ5Få’oJÒY6Ï¯×Ë›ÛÕÝúþáñ©ØlŸ_÷Ýç?öÿ÷çè8¾d
\ No newline at end of file
diff --git a/flate/testdata/huffman-text.dyn.expect b/flate/testdata/huffman-text.dyn.expect
index 1fb84b322c..b9cc20d0eb 100644
Binary files a/flate/testdata/huffman-text.dyn.expect and b/flate/testdata/huffman-text.dyn.expect differ
diff --git a/flate/testdata/huffman-text.dyn.expect-noinput b/flate/testdata/huffman-text.dyn.expect-noinput
index 6ef6dd44dd..b9cc20d0eb 100644
--- a/flate/testdata/huffman-text.dyn.expect-noinput
+++ b/flate/testdata/huffman-text.dyn.expect-noinput
@@ -1,3 +1,4 @@
-ì`ÓJô|à®b¬éâFîÃ=M/MXš+¹K¡¸»»»»»»»»»»ËŠ»;çïÞ¹û™Í`Å.&;$
-üý³A	A:•°F8Tð¢	hˆ ÍìË˜ÍP– À"PI&@°®	lG p`7ÒTd›xÈœÏD¨GA^kŠ, •  OAàU°!±®ÚAVJŠ²QVÇ2,ãâ…ÞÀÉËj(,;]X£`ÀÄ
-Šº*xqF_¨Ç2>n^€“AÆÊUmŠü ³Å’ÑâË2>¢T‡ì€	gÕO‘Ñ é¢èäU“ª+ŠÉÉ×ádÕà5Ê•×dŠŒ6_–iÀ2
\ No newline at end of file
+ìÝßJó0Æñãr=ê`KÇû2AasÄ“)ˆHšþ²„¥IÉŸbï]Ökùòyž{hÂ0E{6ÿ6›[¼ÂcÀ¾dbØ;‡™"%Š#u‚³¦Á["llB
+%*‚
+Á&œÃHÑS‡v‚Äýéaòäh¾9«È'B62CI– Cñ¬G6„ç§Ãñåt„¶ŽgœR]ä™ÐKë¯mû!ÄŒ*¤êšºx5[½Äg‹QF´Ø¡—ª?>Û)Ó
+7Ûíÿí’³…^áw;„$‘d¦º2Eë^úµ/Î­{ù-¬¯æ©x6SÝ.9ûåì
\ No newline at end of file
diff --git a/flate/testdata/huffman-zero.dyn.expect b/flate/testdata/huffman-zero.dyn.expect
index 230433ca0c..dbe401c54c 100644
Binary files a/flate/testdata/huffman-zero.dyn.expect and b/flate/testdata/huffman-zero.dyn.expect differ
diff --git a/flate/testdata/huffman-zero.dyn.expect-noinput b/flate/testdata/huffman-zero.dyn.expect-noinput
index cefc1d3f66..dbe401c54c 100644
Binary files a/flate/testdata/huffman-zero.dyn.expect-noinput and b/flate/testdata/huffman-zero.dyn.expect-noinput differ
diff --git a/flate/testdata/huffman-zero.sync.expect b/flate/testdata/huffman-zero.sync.expect
index 830348a79a..dbe401c54c 100644
Binary files a/flate/testdata/huffman-zero.sync.expect and b/flate/testdata/huffman-zero.sync.expect differ
diff --git a/flate/testdata/huffman-zero.sync.expect-noinput b/flate/testdata/huffman-zero.sync.expect-noinput
index 830348a79a..dbe401c54c 100644
Binary files a/flate/testdata/huffman-zero.sync.expect-noinput and b/flate/testdata/huffman-zero.sync.expect-noinput differ
diff --git a/flate/testdata/null-long-match.dyn.expect-noinput b/flate/testdata/null-long-match.dyn.expect-noinput
index 14167a3344..62d55e6b83 100644
Binary files a/flate/testdata/null-long-match.dyn.expect-noinput and b/flate/testdata/null-long-match.dyn.expect-noinput differ