diff --git a/flate/deflate.go b/flate/deflate.go
index 25dbe3e15f..3f428d8b52 100644
--- a/flate/deflate.go
+++ b/flate/deflate.go
@@ -645,15 +645,15 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).store
 	case level == ConstantCompression:
-		d.w.logNewTablePenalty = 4
-		d.window = make([]byte, maxStoreBlockSize)
+		d.w.logNewTablePenalty = 8
+		d.window = make([]byte, 32<<10)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeHuff
 	case level == DefaultCompression:
 		level = 5
 		fallthrough
 	case level >= 1 && level <= 6:
-		d.w.logNewTablePenalty = 6
+		d.w.logNewTablePenalty = 8
 		d.fast = newFastEnc(level)
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillBlock
diff --git a/flate/fast_encoder.go b/flate/fast_encoder.go
index 4a73e1bdd3..f2477abfcb 100644
--- a/flate/fast_encoder.go
+++ b/flate/fast_encoder.go
@@ -6,6 +6,7 @@
 package flate
 
 import (
+	"encoding/binary"
 	"fmt"
 	"math/bits"
 )
@@ -65,26 +66,15 @@ func load32(b []byte, i int) uint32 {
 }
 
 func load64(b []byte, i int) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
 
 func load3232(b []byte, i int32) uint32 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:4]
-	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+	return binary.LittleEndian.Uint32(b[i:])
 }
 
 func load6432(b []byte, i int32) uint64 {
-	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
-	b = b[i:]
-	b = b[:8]
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
-		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+	return binary.LittleEndian.Uint64(b[i:])
 }
 
 func hash(u uint32) uint32 {
@@ -225,9 +215,9 @@ func (e *fastGen) Reset() {
 func matchLen(a, b []byte) int {
 	b = b[:len(a)]
 	var checked int
-	if len(a) > 4 {
+	if len(a) >= 4 {
 		// Try 4 bytes first
-		if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
+		if diff := binary.LittleEndian.Uint32(a) ^ binary.LittleEndian.Uint32(b); diff != 0 {
 			return bits.TrailingZeros32(diff) >> 3
 		}
 		// Switch to 8 byte matching.
@@ -236,7 +226,7 @@ func matchLen(a, b []byte) int {
 		b = b[4:]
 		for len(a) >= 8 {
 			b = b[:len(a)]
-			if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
+			if diff := binary.LittleEndian.Uint64(a) ^ binary.LittleEndian.Uint64(b); diff != 0 {
 				return checked + (bits.TrailingZeros64(diff) >> 3)
 			}
 			checked += 8
diff --git a/flate/huffman_bit_writer.go b/flate/huffman_bit_writer.go
index 208d66711d..db54be1398 100644
--- a/flate/huffman_bit_writer.go
+++ b/flate/huffman_bit_writer.go
@@ -5,6 +5,7 @@
 package flate
 
 import (
+	"encoding/binary"
 	"io"
 )
 
@@ -206,7 +207,7 @@ func (w *huffmanBitWriter) write(b []byte) {
 }
 
 func (w *huffmanBitWriter) writeBits(b int32, nb uint16) {
-	w.bits |= uint64(b) << (w.nbits & reg16SizeMask64)
+	w.bits |= uint64(b) << w.nbits
 	w.nbits += nb
 	if w.nbits >= 48 {
 		w.writeOutBits()
@@ -420,13 +421,11 @@ func (w *huffmanBitWriter) writeOutBits() {
 	w.bits >>= 48
 	w.nbits -= 48
 	n := w.nbytes
-	w.bytes[n] = byte(bits)
-	w.bytes[n+1] = byte(bits >> 8)
-	w.bytes[n+2] = byte(bits >> 16)
-	w.bytes[n+3] = byte(bits >> 24)
-	w.bytes[n+4] = byte(bits >> 32)
-	w.bytes[n+5] = byte(bits >> 40)
+
+	// We over-write, but faster...
+	binary.LittleEndian.PutUint64(w.bytes[n:], bits)
 	n += 6
+
 	if n >= bufferFlushSize {
 		if w.err != nil {
 			n = 0
@@ -435,6 +434,7 @@ func (w *huffmanBitWriter) writeOutBits() {
 		w.write(w.bytes[:n])
 		n = 0
 	}
+
 	w.nbytes = n
 }
 
@@ -759,7 +759,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		} else {
 			// inlined
 			c := lengths[lengthCode&31]
-			w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64)
+			w.bits |= uint64(c.code) << w.nbits
 			w.nbits += c.len
 			if w.nbits >= 48 {
 				w.writeOutBits()
@@ -779,7 +779,7 @@ func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode)
 		} else {
 			// inlined
 			c := offs[offsetCode&31]
-			w.bits |= uint64(c.code) << (w.nbits & reg16SizeMask64)
+			w.bits |= uint64(c.code) << w.nbits
 			w.nbits += c.len
 			if w.nbits >= 48 {
 				w.writeOutBits()
@@ -830,8 +830,8 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	// Assume header is around 70 bytes:
 	// https://stackoverflow.com/a/25454430
 	const guessHeaderSizeBits = 70 * 8
-	estBits, estExtra := histogramSize(input, w.literalFreq[:], !eof && !sync)
-	estBits += w.lastHeader + 15
+	estBits := histogramSize(input, w.literalFreq[:], !eof && !sync)
+	estBits += w.lastHeader + len(input)/32
 	if w.lastHeader == 0 {
 		estBits += guessHeaderSizeBits
 	}
@@ -845,9 +845,9 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 		return
 	}
 
+	reuseSize := 0
 	if w.lastHeader > 0 {
-		reuseSize := w.literalEncoding.bitLength(w.literalFreq[:256])
-		estBits += estExtra
+		reuseSize = w.literalEncoding.bitLength(w.literalFreq[:256])
 
 		if estBits < reuseSize {
 			// We owe an EOB
@@ -859,6 +859,10 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	const numLiterals = endBlockMarker + 1
 	const numOffsets = 1
 	if w.lastHeader == 0 {
+		if !eof && !sync {
+			// Generate a slightly suboptimal tree that can be used for all.
+			fillHist(w.literalFreq[:numLiterals])
+		}
 		w.literalFreq[endBlockMarker] = 1
 		w.literalEncoding.generate(w.literalFreq[:numLiterals], 15)
 
@@ -878,19 +882,14 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	for _, t := range input {
 		// Bitwriting inlined, ~30% speedup
 		c := encoding[t]
-		w.bits |= uint64(c.code) << ((w.nbits) & reg16SizeMask64)
+		w.bits |= uint64(c.code) << w.nbits
 		w.nbits += c.len
 		if w.nbits >= 48 {
 			bits := w.bits
 			w.bits >>= 48
 			w.nbits -= 48
 			n := w.nbytes
-			w.bytes[n] = byte(bits)
-			w.bytes[n+1] = byte(bits >> 8)
-			w.bytes[n+2] = byte(bits >> 16)
-			w.bytes[n+3] = byte(bits >> 24)
-			w.bytes[n+4] = byte(bits >> 32)
-			w.bytes[n+5] = byte(bits >> 40)
+			binary.LittleEndian.PutUint64(w.bytes[n:], bits)
 			n += 6
 			if n >= bufferFlushSize {
 				if w.err != nil {
diff --git a/flate/huffman_code.go b/flate/huffman_code.go
index 4c39a30187..0d3445a1cc 100644
--- a/flate/huffman_code.go
+++ b/flate/huffman_code.go
@@ -122,6 +122,16 @@ func (h *huffmanEncoder) bitLength(freq []uint16) int {
 	return total
 }
 
+func (h *huffmanEncoder) bitLengthRaw(b []byte) int {
+	var total int
+	for _, f := range b {
+		if f != 0 {
+			total += int(h.codes[f].len)
+		}
+	}
+	return total
+}
+
 // Return the number of literals assigned to each bit size in the Huffman encoding
 //
 // This method is only called when list.length >= 3
@@ -327,37 +337,40 @@ func atLeastOne(v float32) float32 {
 	return v
 }
 
+// Unassigned values are assigned '1' in the histogram.
+func fillHist(b []uint16) {
+	for i, v := range b {
+		if v == 0 {
+			b[i] = 1
+		}
+	}
+}
+
 // histogramSize accumulates a histogram of b in h.
 // An estimated size in bits is returned.
-// Unassigned values are assigned '1' in the histogram.
 // len(h) must be >= 256, and h's elements must be all zeroes.
-func histogramSize(b []byte, h []uint16, fill bool) (int, int) {
+func histogramSize(b []byte, h []uint16, fill bool) (bits int) {
 	h = h[:256]
 	for _, t := range b {
 		h[t]++
 	}
-	invTotal := 1.0 / float32(len(b))
-	shannon := float32(0.0)
-	var extra float32
+	total := len(b)
 	if fill {
-		oneBits := atLeastOne(-mFastLog2(invTotal))
-		for i, v := range h[:] {
-			if v > 0 {
-				n := float32(v)
-				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
-			} else {
-				h[i] = 1
-				extra += oneBits
+		for _, v := range h {
+			if v == 0 {
+				total++
 			}
 		}
-	} else {
-		for _, v := range h[:] {
-			if v > 0 {
-				n := float32(v)
-				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
-			}
+	}
+
+	invTotal := 1.0 / float32(total)
+	shannon := float32(0.0)
+	for _, v := range h {
+		if v > 0 {
+			n := float32(v)
+			shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
 		}
 	}
 
-	return int(shannon + 0.99), int(extra + 0.99)
+	return int(shannon + 0.99)
 }
diff --git a/flate/testdata/huffman-rand-limit.golden b/flate/testdata/huffman-rand-limit.golden
index 7ef6745049..57e59322e9 100644
Binary files a/flate/testdata/huffman-rand-limit.golden and b/flate/testdata/huffman-rand-limit.golden differ
diff --git a/flate/testdata/huffman-text-shift.golden b/flate/testdata/huffman-text-shift.golden
index 80531ad983..3a4dcc4cab 100644
Binary files a/flate/testdata/huffman-text-shift.golden and b/flate/testdata/huffman-text-shift.golden differ