diff --git a/.gitignore b/.gitignore
index 48f061587f1..4896ebb4d63 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@
 /cmd/tempo-cli/tempo-cli
 /example/docker-compose/example-data/tempo
 .DS_Store
+/tempodb/encoding/benchmark_block
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2af41ef8479..807722c9ac7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 * [CHANGE] Fixed ingester latency spikes on read [#461](https://github.com/grafana/tempo/pull/461)
 * [CHANGE] Ingester cut blocks based on size instead of trace count.  Replace ingester `traces_per_block` setting with `max_block_bytes`. This is a **breaking change**. [#474](https://github.com/grafana/tempo/issues/474)
 * [CHANGE] Refactor cache section in tempodb. This is a **breaking change** b/c the cache config section has changed. [#485](https://github.com/grafana/tempo/pull/485)
+* [FEATURE] Added block compression.  This is a **breaking change** b/c the `bloom_filter_false_positive` and `index_downsample` settings moved from `storage.trace.wal` to `storage.trace.block`. [#504](https://github.com/grafana/tempo/pull/504)
 * [ENHANCEMENT] Serve config at the "/config" endpoint. [#446](https://github.com/grafana/tempo/pull/446)
 * [ENHANCEMENT] Switch blocklist polling and retention to different concurrency mechanism, add configuration options. [#475](https://github.com/grafana/tempo/issues/475)
 * [ENHANCEMENT] Add S3 options region and forcepathstyle [#431](https://github.com/grafana/tempo/issues/431)
diff --git a/cmd/tempo-cli/cmd-list-block.go b/cmd/tempo-cli/cmd-list-block.go
index 3f356cd3b3c..5761692bbcc 100644
--- a/cmd/tempo-cli/cmd-list-block.go
+++ b/cmd/tempo-cli/cmd-list-block.go
@@ -64,12 +64,12 @@ func dumpBlock(r tempodb_backend.Reader, c tempodb_backend.Compactor, tenantID s
 			Version:  unifiedMeta.version,
 			TenantID: tenantID,
 			BlockID:  id,
-		})
+		}, r)
 		if err != nil {
 			return err
 		}
 
-		iter, err := block.Iterator(10*1024*1024, r)
+		iter, err := block.Iterator(10 * 1024 * 1024)
 		if err != nil {
 			return err
 		}
diff --git a/example/docker-compose/etc/tempo-azure.yaml b/example/docker-compose/etc/tempo-azure.yaml
index 56eecb02468..c33a0785f07 100644
--- a/example/docker-compose/etc/tempo-azure.yaml
+++ b/example/docker-compose/etc/tempo-azure.yaml
@@ -33,10 +33,12 @@ compactor:
 storage:
   trace:
     backend: azure                     # backend configuration to use
-    wal:
-      path: /tmp/tempo/wal             # where to store the the wal locally
+    block:
       bloom_filter_false_positive: .05 # bloom filter false positive rate.  lower values create larger filters but fewer false positives
       index_downsample: 10             # number of traces per index record
+      encoding: lz4-64k                # block encoding/compression.  options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd
+    wal:
+      path: /tmp/tempo/wal             # where to store the wal locally
     azure:
       container-name: tempo                    # how to store data in azure
       endpoint-suffix: azurite:10000
diff --git a/example/docker-compose/etc/tempo-gcs-fake.yaml b/example/docker-compose/etc/tempo-gcs-fake.yaml
index 609ca486477..4ec6f139468 100644
--- a/example/docker-compose/etc/tempo-gcs-fake.yaml
+++ b/example/docker-compose/etc/tempo-gcs-fake.yaml
@@ -34,10 +34,12 @@ compactor:
 storage:
   trace:
     backend: gcs                       # backend configuration to use
-    wal:
-      path: /tmp/tempo/wal             # where to store the the wal locally
+    block:
       bloom_filter_false_positive: .05 # bloom filter false positive rate.  lower values create larger filters but fewer false positives
       index_downsample: 10             # number of traces per index record
+      encoding: lz4-64k                # block encoding/compression.  options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd
+    wal:
+      path: /tmp/tempo/wal             # where to store the wal locally
     gcs:
       bucket_name: tempo
       endpoint: https://gcs:4443/storage/v1/
diff --git a/example/docker-compose/etc/tempo-local.yaml b/example/docker-compose/etc/tempo-local.yaml
index 1623f9a16f6..6d95cebaa54 100644
--- a/example/docker-compose/etc/tempo-local.yaml
+++ b/example/docker-compose/etc/tempo-local.yaml
@@ -33,10 +33,12 @@ compactor:
 storage:
   trace:
     backend: local                     # backend configuration to use
-    wal:
-      path: /tmp/tempo/wal             # where to store the the wal locally
+    block:
       bloom_filter_false_positive: .05 # bloom filter false positive rate.  lower values create larger filters but fewer false positives
       index_downsample: 10             # number of traces per index record
+      encoding: lz4-64k                # block encoding/compression.  options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd
+    wal:
+      path: /tmp/tempo/wal             # where to store the wal locally
     local:
       path: /tmp/tempo/blocks
     pool:
diff --git a/example/docker-compose/etc/tempo-s3-minio.yaml b/example/docker-compose/etc/tempo-s3-minio.yaml
index 4823f471c92..7a33b5ea668 100644
--- a/example/docker-compose/etc/tempo-s3-minio.yaml
+++ b/example/docker-compose/etc/tempo-s3-minio.yaml
@@ -34,10 +34,12 @@ compactor:
 storage:
   trace:
     backend: s3                        # backend configuration to use
-    wal:
-      path: /tmp/tempo/wal             # where to store the the wal locally
+    block:
       bloom_filter_false_positive: .05 # bloom filter false positive rate.  lower values create larger filters but fewer false positives
       index_downsample: 10             # number of traces per index record
+      encoding: lz4-64k                # block encoding/compression.  options: none, gzip, lz4-64k, lz4-256k, lz4-1M, lz4, snappy, zstd
+    wal:
+      path: /tmp/tempo/wal             # where to store the wal locally
     s3:
       bucket: tempo                    # how to store data in s3
       endpoint: minio:9000
diff --git a/go.mod b/go.mod
index d82f9537005..e325f0c0da3 100644
--- a/go.mod
+++ b/go.mod
@@ -13,6 +13,7 @@ require (
 	github.com/gogo/status v1.0.3
 	github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b
 	github.com/golang/protobuf v1.4.3
+	github.com/golang/snappy v0.0.2
 	github.com/google/uuid v1.1.1
 	github.com/gorilla/mux v1.7.4
 	github.com/grafana/loki v1.3.0
@@ -20,10 +21,12 @@ require (
 	github.com/hashicorp/go-hclog v0.14.0
 	github.com/jaegertracing/jaeger v1.18.2-0.20200707061226-97d2319ff2be
 	github.com/jsternberg/zap-logfmt v1.0.0
+	github.com/klauspost/compress v1.10.5
 	github.com/minio/minio-go/v7 v7.0.5
 	github.com/olekukonko/tablewriter v0.0.2
 	github.com/open-telemetry/opentelemetry-proto v0.4.0
 	github.com/opentracing/opentracing-go v1.2.0
+	github.com/pierrec/lz4/v4 v4.1.3
 	github.com/pkg/errors v0.9.1
 	github.com/prometheus/client_golang v1.8.0
 	github.com/prometheus/client_model v0.2.0
diff --git a/go.sum b/go.sum
index 6ca1b94321d..79568c41d29 100644
--- a/go.sum
+++ b/go.sum
@@ -1233,6 +1233,7 @@ github.com/klauspost/compress v1.4.0/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0
 github.com/klauspost/compress v1.9.4/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
 github.com/klauspost/compress v1.9.5/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
 github.com/klauspost/compress v1.10.4/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
+github.com/klauspost/compress v1.10.5 h1:7q6vHIqubShURwQz8cQK6yIe/xC3IF0Vm7TGfqjewrc=
 github.com/klauspost/compress v1.10.5/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
 github.com/klauspost/cpuid v0.0.0-20170728055534-ae7887de9fa5/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
 github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
@@ -1576,7 +1577,10 @@ github.com/pierrec/lz4 v0.0.0-20190327172049-315a67e90e41/go.mod h1:3/3N9NVKO0je
 github.com/pierrec/lz4 v1.0.2-0.20190131084431-473cd7ce01a1/go.mod h1:3/3N9NVKO0jef7pBehbT1qWhCMrIgbYNnFAZCqQ5LRc=
 github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
 github.com/pierrec/lz4 v2.3.1-0.20191115212037-9085dacd1e1e+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
+github.com/pierrec/lz4 v2.4.1+incompatible h1:mFe7ttWaflA46Mhqh+jUfjp2qTbPYxLB2/OyBppH9dg=
 github.com/pierrec/lz4 v2.4.1+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY=
+github.com/pierrec/lz4/v4 v4.1.3 h1:/dvQpkb0o1pVlSgKNQqfkavlnXaIK+hJ0LXsKRUN9D4=
+github.com/pierrec/lz4/v4 v4.1.3/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4=
 github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.8.1-0.20171018195549-f15c970de5b7/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
diff --git a/modules/ingester/ingester.go b/modules/ingester/ingester.go
index 16bd76d3758..d4e12f00d53 100644
--- a/modules/ingester/ingester.go
+++ b/modules/ingester/ingester.go
@@ -227,7 +227,7 @@ func (i *Ingester) getOrCreateInstance(instanceID string) (*instance, error) {
 	inst, ok = i.instances[instanceID]
 	if !ok {
 		var err error
-		inst, err = newInstance(instanceID, i.limiter, i.store.WAL())
+		inst, err = newInstance(instanceID, i.limiter, i.store)
 		if err != nil {
 			return nil, err
 		}
diff --git a/modules/ingester/ingester_test.go b/modules/ingester/ingester_test.go
index cf6d847a744..f9481a4accc 100644
--- a/modules/ingester/ingester_test.go
+++ b/modules/ingester/ingester_test.go
@@ -14,6 +14,7 @@ import (
 	"github.com/go-kit/kit/log"
 	"github.com/golang/protobuf/proto"
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
 	"github.com/weaveworks/common/user"
 
 	"github.com/grafana/tempo/modules/overrides"
@@ -21,7 +22,9 @@ import (
 	"github.com/grafana/tempo/pkg/tempopb"
 	"github.com/grafana/tempo/pkg/util/test"
 	"github.com/grafana/tempo/tempodb"
+	"github.com/grafana/tempo/tempodb/backend"
 	"github.com/grafana/tempo/tempodb/backend/local"
+	"github.com/grafana/tempo/tempodb/encoding"
 	"github.com/grafana/tempo/tempodb/wal"
 )
 
@@ -179,7 +182,7 @@ func TestFlush(t *testing.T) {
 func defaultIngester(t *testing.T, tmpDir string) (*Ingester, []*tempopb.Trace, [][]byte) {
 	ingesterConfig := defaultIngesterTestConfig()
 	limits, err := overrides.NewOverrides(defaultLimitsTestConfig())
-	assert.NoError(t, err, "unexpected error creating overrides")
+	require.NoError(t, err, "unexpected error creating overrides")
 
 	s, err := storage.NewStore(storage.Config{
 		Trace: tempodb.Config{
@@ -187,20 +190,23 @@ func defaultIngester(t *testing.T, tmpDir string) (*Ingester, []*tempopb.Trace,
 			Local: &local.Config{
 				Path: tmpDir,
 			},
-			WAL: &wal.Config{
-				Filepath:        tmpDir,
+			Block: &encoding.BlockConfig{
 				IndexDownsample: 2,
 				BloomFP:         .01,
+				Encoding:        backend.EncLZ4_1M,
+			},
+			WAL: &wal.Config{
+				Filepath: tmpDir,
 			},
 		},
 	}, log.NewNopLogger())
-	assert.NoError(t, err, "unexpected error store")
+	require.NoError(t, err, "unexpected error store")
 
 	ingester, err := New(ingesterConfig, s, limits)
-	assert.NoError(t, err, "unexpected error creating ingester")
+	require.NoError(t, err, "unexpected error creating ingester")
 
 	err = ingester.starting(context.Background())
-	assert.NoError(t, err, "unexpected error starting ingester")
+	require.NoError(t, err, "unexpected error starting ingester")
 
 	// make some fake traceIDs/requests
 	traces := make([]*tempopb.Trace, 0)
@@ -209,7 +215,7 @@ func defaultIngester(t *testing.T, tmpDir string) (*Ingester, []*tempopb.Trace,
 	for i := 0; i < 10; i++ {
 		id := make([]byte, 16)
 		_, err = rand.Read(id)
-		assert.NoError(t, err)
+		require.NoError(t, err)
 
 		traces = append(traces, test.MakeTrace(10, id))
 		traceIDs = append(traceIDs, id)
@@ -222,7 +228,7 @@ func defaultIngester(t *testing.T, tmpDir string) (*Ingester, []*tempopb.Trace,
 				&tempopb.PushRequest{
 					Batch: batch,
 				})
-			assert.NoError(t, err, "unexpected error pushing")
+			require.NoError(t, err, "unexpected error pushing")
 		}
 	}
 
diff --git a/modules/ingester/instance.go b/modules/ingester/instance.go
index 44c90a7b308..85339ae0942 100644
--- a/modules/ingester/instance.go
+++ b/modules/ingester/instance.go
@@ -20,6 +20,7 @@ import (
 
 	"github.com/grafana/tempo/pkg/tempopb"
 	"github.com/grafana/tempo/pkg/util"
+	"github.com/grafana/tempo/tempodb"
 	"github.com/grafana/tempo/tempodb/encoding"
 	"github.com/grafana/tempo/tempodb/encoding/common"
 	"github.com/grafana/tempo/tempodb/wal"
@@ -63,12 +64,12 @@ type instance struct {
 	tracesCreatedTotal prometheus.Counter
 	bytesWrittenTotal  prometheus.Counter
 	limiter            *Limiter
-	wal                *wal.WAL
+	writer             tempodb.Writer
 
 	hash hash.Hash32
 }
 
-func newInstance(instanceID string, limiter *Limiter, wal *wal.WAL) (*instance, error) {
+func newInstance(instanceID string, limiter *Limiter, writer tempodb.Writer) (*instance, error) {
 	i := &instance{
 		traces: map[uint32]*trace{},
 
@@ -76,7 +77,7 @@ func newInstance(instanceID string, limiter *Limiter, wal *wal.WAL) (*instance,
 		tracesCreatedTotal: metricTracesCreatedTotal.WithLabelValues(instanceID),
 		bytesWrittenTotal:  metricBytesWrittenTotal.WithLabelValues(instanceID),
 		limiter:            limiter,
-		wal:                wal,
+		writer:             writer,
 
 		hash: fnv.New32(),
 	}
@@ -153,7 +154,7 @@ func (i *instance) CutBlockIfReady(maxBlockLifetime time.Duration, maxBlockBytes
 
 		// todo : this should be a queue of blocks to complete with workers
 		go func() {
-			completeBlock, err := i.completingBlock.Complete(i.wal, i)
+			completeBlock, err := i.writer.CompleteBlock(i.completingBlock, i)
 			i.blocksMtx.Lock()
 			defer i.blocksMtx.Unlock()
 
@@ -307,7 +308,7 @@ func (i *instance) tokenForTraceID(id []byte) uint32 {
 // resetHeadBlock() should be called under lock
 func (i *instance) resetHeadBlock() error {
 	var err error
-	i.headBlock, err = i.wal.NewBlock(uuid.New(), i.instanceID)
+	i.headBlock, err = i.writer.WAL().NewBlock(uuid.New(), i.instanceID)
 	i.lastBlockCut = time.Now()
 	return err
 }
diff --git a/modules/ingester/instance_test.go b/modules/ingester/instance_test.go
index cdf3412d11c..ca511fd3770 100644
--- a/modules/ingester/instance_test.go
+++ b/modules/ingester/instance_test.go
@@ -9,9 +9,15 @@ import (
 	"testing"
 	"time"
 
+	"github.com/go-kit/kit/log"
 	"github.com/grafana/tempo/modules/overrides"
+	"github.com/grafana/tempo/modules/storage"
 	"github.com/grafana/tempo/pkg/tempopb"
 	"github.com/grafana/tempo/pkg/util/test"
+	"github.com/grafana/tempo/tempodb"
+	"github.com/grafana/tempo/tempodb/backend"
+	"github.com/grafana/tempo/tempodb/backend/local"
+	"github.com/grafana/tempo/tempodb/encoding"
 	"github.com/grafana/tempo/tempodb/wal"
 
 	"github.com/stretchr/testify/assert"
@@ -36,11 +42,9 @@ func TestInstance(t *testing.T) {
 	defer os.RemoveAll(tempDir)
 
 	ingester, _, _ := defaultIngester(t, tempDir)
-	wal := ingester.store.WAL()
-
 	request := test.MakeRequest(10, []byte{})
 
-	i, err := newInstance("fake", limiter, wal)
+	i, err := newInstance("fake", limiter, ingester.store)
 	assert.NoError(t, err, "unexpected error creating new instance")
 	err = i.Push(context.Background(), request)
 	assert.NoError(t, err)
@@ -89,12 +93,10 @@ func TestInstanceFind(t *testing.T) {
 	defer os.RemoveAll(tempDir)
 
 	ingester, _, _ := defaultIngester(t, tempDir)
-	wal := ingester.store.WAL()
-
 	request := test.MakeRequest(10, []byte{})
 	traceID := test.MustTraceID(request)
 
-	i, err := newInstance("fake", limiter, wal)
+	i, err := newInstance("fake", limiter, ingester.store)
 	assert.NoError(t, err, "unexpected error creating new instance")
 	err = i.Push(context.Background(), request)
 	assert.NoError(t, err)
@@ -128,9 +130,8 @@ func TestInstanceDoesNotRace(t *testing.T) {
 	defer os.RemoveAll(tempDir)
 
 	ingester, _, _ := defaultIngester(t, tempDir)
-	wal := ingester.store.WAL()
 
-	i, err := newInstance("fake", limiter, wal)
+	i, err := newInstance("fake", limiter, ingester.store)
 	assert.NoError(t, err, "unexpected error creating new instance")
 
 	end := make(chan struct{})
@@ -197,9 +198,8 @@ func TestInstanceLimits(t *testing.T) {
 	defer os.RemoveAll(tempDir)
 
 	ingester, _, _ := defaultIngester(t, tempDir)
-	wal := ingester.store.WAL()
 
-	i, err := newInstance("fake", limiter, wal)
+	i, err := newInstance("fake", limiter, ingester.store)
 	assert.NoError(t, err, "unexpected error creating new instance")
 
 	type push struct {
@@ -427,19 +427,30 @@ func TestInstanceCutBlockIfReady(t *testing.T) {
 	}
 }
 
-func defaultInstance(t assert.TestingT, tempDir string) *instance {
+func defaultInstance(t assert.TestingT, tmpDir string) *instance {
 	limits, err := overrides.NewOverrides(overrides.Limits{})
 	assert.NoError(t, err, "unexpected error creating limits")
 	limiter := NewLimiter(limits, &ringCountMock{count: 1}, 1)
 
-	wal, err := wal.New(&wal.Config{
-		Filepath:        tempDir,
-		IndexDownsample: 2,
-		BloomFP:         .01,
-	})
-	assert.NoError(t, err, "unexpected error creating wal")
+	s, err := storage.NewStore(storage.Config{
+		Trace: tempodb.Config{
+			Backend: "local",
+			Local: &local.Config{
+				Path: tmpDir,
+			},
+			Block: &encoding.BlockConfig{
+				IndexDownsample: 2,
+				BloomFP:         .01,
+				Encoding:        backend.EncLZ4_1M,
+			},
+			WAL: &wal.Config{
+				Filepath: tmpDir,
+			},
+		},
+	}, log.NewNopLogger())
+	assert.NoError(t, err, "unexpected error creating store")
 
-	instance, err := newInstance("fake", limiter, wal)
+	instance, err := newInstance("fake", limiter, s)
 	assert.NoError(t, err, "unexpected error creating new instance")
 
 	return instance
diff --git a/modules/querier/querier.go b/modules/querier/querier.go
index 8d1ef103c0b..90c89790ead 100644
--- a/modules/querier/querier.go
+++ b/modules/querier/querier.go
@@ -29,18 +29,6 @@ import (
 )
 
 var (
-	metricQueryReads = promauto.NewHistogramVec(prometheus.HistogramOpts{
-		Namespace: "tempo",
-		Name:      "query_reads",
-		Help:      "count of reads",
-		Buckets:   prometheus.ExponentialBuckets(1, 2, 10),
-	}, []string{"layer"})
-	metricQueryBytesRead = promauto.NewHistogramVec(prometheus.HistogramOpts{
-		Namespace: "tempo",
-		Name:      "query_bytes_read",
-		Help:      "bytes read",
-		Buckets:   prometheus.ExponentialBuckets(1024*1024, 2, 8),
-	}, []string{"layer"})
 	metricIngesterClients = promauto.NewGauge(prometheus.GaugeOpts{
 		Namespace: "tempo",
 		Name:      "querier_ingester_clients",
@@ -195,19 +183,12 @@ func (q *Querier) FindTraceByID(ctx context.Context, req *tempopb.TraceByIDReque
 	}
 
 	span.LogFields(ot_log.String("msg", "searching store"))
-	partialTraces, metrics, err := q.store.Find(opentracing.ContextWithSpan(ctx, span), userID, req.TraceID, req.BlockStart, req.BlockEnd)
+	partialTraces, err := q.store.Find(opentracing.ContextWithSpan(ctx, span), userID, req.TraceID, req.BlockStart, req.BlockEnd)
 	if err != nil {
 		return nil, errors.Wrap(err, "error querying store in Querier.FindTraceByID")
 	}
 
 	span.LogFields(ot_log.String("msg", "done searching store"))
-	metricQueryReads.WithLabelValues("bloom").Observe(float64(metrics.BloomFilterReads.Load()))
-	metricQueryBytesRead.WithLabelValues("bloom").Observe(float64(metrics.BloomFilterBytesRead.Load()))
-	metricQueryReads.WithLabelValues("index").Observe(float64(metrics.IndexReads.Load()))
-	metricQueryBytesRead.WithLabelValues("index").Observe(float64(metrics.IndexBytesRead.Load()))
-	metricQueryReads.WithLabelValues("block").Observe(float64(metrics.BlockReads.Load()))
-	metricQueryBytesRead.WithLabelValues("block").Observe(float64(metrics.BlockBytesRead.Load()))
-
 	// combine partialTraces with completeTrace
 	for _, partialTrace := range partialTraces {
 		storeTrace := &tempopb.Trace{}
diff --git a/modules/querier/querier_test.go b/modules/querier/querier_test.go
index 6b67582cab9..a5d23c4c6d3 100644
--- a/modules/querier/querier_test.go
+++ b/modules/querier/querier_test.go
@@ -20,7 +20,9 @@ import (
 	"github.com/grafana/tempo/pkg/util"
 	"github.com/grafana/tempo/pkg/util/test"
 	"github.com/grafana/tempo/tempodb"
+	"github.com/grafana/tempo/tempodb/backend"
 	"github.com/grafana/tempo/tempodb/backend/local"
+	"github.com/grafana/tempo/tempodb/encoding"
 	"github.com/grafana/tempo/tempodb/pool"
 	"github.com/grafana/tempo/tempodb/wal"
 )
@@ -51,11 +53,14 @@ func TestReturnAllHits(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
+		Block: &encoding.BlockConfig{
+			Encoding:        backend.EncNone,
 			IndexDownsample: 10,
 			BloomFP:         .05,
 		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
+		},
 		BlocklistPoll: 50 * time.Millisecond,
 	}, log.NewNopLogger())
 	assert.NoError(t, err, "unexpected error creating tempodb")
@@ -85,7 +90,7 @@ func TestReturnAllHits(t *testing.T) {
 		err = head.Write(testTraceID, bReq)
 		assert.NoError(t, err, "unexpected error writing req")
 
-		complete, err := head.Complete(wal, &mockSharder{})
+		complete, err := w.CompleteBlock(head, &mockSharder{})
 		assert.NoError(t, err)
 
 		err = w.WriteBlock(context.Background(), complete)
@@ -96,7 +101,7 @@ func TestReturnAllHits(t *testing.T) {
 	time.Sleep(100 * time.Millisecond)
 
 	// find should return both now
-	foundBytes, _, err := r.Find(context.Background(), util.FakeTenantID, testTraceID, tempodb.BlockIDMin, tempodb.BlockIDMax)
+	foundBytes, err := r.Find(context.Background(), util.FakeTenantID, testTraceID, tempodb.BlockIDMin, tempodb.BlockIDMax)
 	assert.NoError(t, err)
 	require.Len(t, foundBytes, 2)
 
diff --git a/modules/storage/config.go b/modules/storage/config.go
index 9c14de678bc..a2e19decbee 100644
--- a/modules/storage/config.go
+++ b/modules/storage/config.go
@@ -6,10 +6,12 @@ import (
 
 	"github.com/grafana/tempo/pkg/util"
 	"github.com/grafana/tempo/tempodb"
+	"github.com/grafana/tempo/tempodb/backend"
 	"github.com/grafana/tempo/tempodb/backend/azure"
 	"github.com/grafana/tempo/tempodb/backend/gcs"
 	"github.com/grafana/tempo/tempodb/backend/local"
 	"github.com/grafana/tempo/tempodb/backend/s3"
+	"github.com/grafana/tempo/tempodb/encoding"
 	"github.com/grafana/tempo/tempodb/pool"
 	"github.com/grafana/tempo/tempodb/wal"
 )
@@ -31,8 +33,11 @@ func (cfg *Config) RegisterFlagsAndApplyDefaults(prefix string, f *flag.FlagSet)
 
 	cfg.Trace.WAL = &wal.Config{}
 	f.StringVar(&cfg.Trace.WAL.Filepath, util.PrefixConfig(prefix, "trace.wal.path"), "/var/tempo/wal", "Path at which store WAL blocks.")
-	f.Float64Var(&cfg.Trace.WAL.BloomFP, util.PrefixConfig(prefix, "trace.wal.bloom-filter-false-positive"), .05, "Bloom False Positive.")
-	f.IntVar(&cfg.Trace.WAL.IndexDownsample, util.PrefixConfig(prefix, "trace.wal.index-downsample"), 100, "Number of traces per index record.")
+
+	cfg.Trace.Block = &encoding.BlockConfig{}
+	f.Float64Var(&cfg.Trace.Block.BloomFP, util.PrefixConfig(prefix, "trace.block.bloom-filter-false-positive"), .05, "Bloom False Positive.")
+	f.IntVar(&cfg.Trace.Block.IndexDownsample, util.PrefixConfig(prefix, "trace.block.index-downsample"), 100, "Number of traces per index record.")
+	cfg.Trace.Block.Encoding = backend.EncLZ4_256k
 
 	cfg.Trace.Azure = &azure.Config{}
 	f.StringVar(&cfg.Trace.Azure.StorageAccountName.Value, util.PrefixConfig(prefix, "trace.azure.storage-account-name"), "", "Azure storage account name.")
diff --git a/operations/tempo-mixin/out/tempo-operational.json b/operations/tempo-mixin/out/tempo-operational.json
index 99abc3d1835..a7757317ce8 100644
--- a/operations/tempo-mixin/out/tempo-operational.json
+++ b/operations/tempo-mixin/out/tempo-operational.json
@@ -27,7 +27,8 @@
  "editable": true,
  "gnetId": null,
  "graphTooltip": 1,
- "iteration": 1609831418010,
+ "id": 115,
+ "iteration": 1612469301683,
  "links": [
 
  ],
@@ -92,7 +93,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -201,7 +202,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -310,7 +311,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -323,7 +324,7 @@
    "targets": [
     {
      "expr": "go_goroutines{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"$namespace/$component\"}",
-     "legendFormat": "{{job}}",
+     "legendFormat": "{{pod}}",
      "refId": "A"
     }
    ],
@@ -418,7 +419,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -528,7 +529,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -540,7 +541,7 @@
    "steppedLine": false,
    "targets": [
     {
-     "expr": "container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$component.*\"}",
+     "expr": "container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$component.*\", container!=\"POD\"}",
      "interval": "",
      "legendFormat": "{{pod}}",
      "refId": "A"
@@ -637,7 +638,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -754,7 +755,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -862,7 +863,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -987,7 +988,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -1096,7 +1097,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -1214,7 +1215,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": true,
    "renderer": "flot",
@@ -1334,7 +1335,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -1444,7 +1445,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": true,
    "renderer": "flot",
@@ -1565,7 +1566,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -1680,7 +1681,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -1806,7 +1807,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -1916,7 +1917,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -2037,7 +2038,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -2158,7 +2159,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -2238,127 +2239,6 @@
   {
    "aliasColors": {
 
-   },
-   "bars": false,
-   "dashLength": 10,
-   "dashes": false,
-   "datasource": "$ds",
-   "fieldConfig": {
-    "defaults": {
-     "custom": {
-
-     }
-    },
-    "overrides": [
-
-    ]
-   },
-   "fill": 1,
-   "fillGradient": 0,
-   "gridPos": {
-    "h": 5,
-    "w": 3,
-    "x": 20,
-    "y": 13
-   },
-   "hiddenSeries": false,
-   "id": 13,
-   "legend": {
-    "avg": false,
-    "current": false,
-    "max": false,
-    "min": false,
-    "show": false,
-    "total": false,
-    "values": false
-   },
-   "lines": true,
-   "linewidth": 1,
-   "nullPointMode": "null",
-   "options": {
-    "alertThreshold": true
-   },
-   "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
-   "pointradius": 2,
-   "points": false,
-   "renderer": "flot",
-   "seriesOverrides": [
-
-   ],
-   "spaceLength": 10,
-   "stack": false,
-   "steppedLine": false,
-   "targets": [
-    {
-     "expr": "histogram_quantile(.99, sum(rate(tempo_query_bytes_read_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (le))",
-     "interval": "",
-     "legendFormat": ".99",
-     "refId": "A"
-    },
-    {
-     "expr": "histogram_quantile(.9, sum(rate(tempo_query_bytes_read_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (le))",
-     "interval": "",
-     "legendFormat": ".9",
-     "refId": "B"
-    },
-    {
-     "expr": "histogram_quantile(.5, sum(rate(tempo_query_bytes_read_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (le))",
-     "interval": "",
-     "legendFormat": ".5",
-     "refId": "C"
-    }
-   ],
-   "thresholds": [
-
-   ],
-   "timeFrom": null,
-   "timeRegions": [
-
-   ],
-   "timeShift": null,
-   "title": "Query  Bytes Read",
-   "tooltip": {
-    "shared": true,
-    "sort": 0,
-    "value_type": "individual"
-   },
-   "type": "graph",
-   "xaxis": {
-    "buckets": null,
-    "mode": "time",
-    "name": null,
-    "show": true,
-    "values": [
-
-    ]
-   },
-   "yaxes": [
-    {
-     "format": "decbytes",
-     "label": null,
-     "logBase": 1,
-     "max": null,
-     "min": null,
-     "show": true
-    },
-    {
-     "format": "short",
-     "label": null,
-     "logBase": 1,
-     "max": null,
-     "min": null,
-     "show": true
-    }
-   ],
-   "yaxis": {
-    "align": false,
-    "alignLevel": null
-   }
-  },
-  {
-   "aliasColors": {
-
    },
    "bars": false,
    "dashLength": 10,
@@ -2400,7 +2280,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -2477,226 +2357,6 @@
     "alignLevel": null
    }
   },
-  {
-   "aliasColors": {
-
-   },
-   "bars": false,
-   "dashLength": 10,
-   "dashes": false,
-   "datasource": "$ds",
-   "fieldConfig": {
-    "defaults": {
-     "custom": {
-
-     }
-    },
-    "overrides": [
-
-    ]
-   },
-   "fill": 1,
-   "fillGradient": 0,
-   "gridPos": {
-    "h": 5,
-    "w": 3,
-    "x": 16,
-    "y": 18
-   },
-   "hiddenSeries": false,
-   "id": 15,
-   "legend": {
-    "avg": false,
-    "current": false,
-    "max": false,
-    "min": false,
-    "show": false,
-    "total": false,
-    "values": false
-   },
-   "lines": true,
-   "linewidth": 1,
-   "nullPointMode": "null",
-   "options": {
-    "alertThreshold": true
-   },
-   "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
-   "pointradius": 2,
-   "points": false,
-   "renderer": "flot",
-   "seriesOverrides": [
-
-   ],
-   "spaceLength": 10,
-   "stack": false,
-   "steppedLine": false,
-   "targets": [
-    {
-     "expr": "histogram_quantile(.5, sum(rate(tempo_query_reads_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (layer, le))",
-     "hide": false,
-     "interval": "",
-     "legendFormat": "{{layer}}",
-     "refId": "A"
-    }
-   ],
-   "thresholds": [
-
-   ],
-   "timeFrom": null,
-   "timeRegions": [
-
-   ],
-   "timeShift": null,
-   "title": "p50 Reads",
-   "tooltip": {
-    "shared": true,
-    "sort": 0,
-    "value_type": "individual"
-   },
-   "type": "graph",
-   "xaxis": {
-    "buckets": null,
-    "mode": "time",
-    "name": null,
-    "show": true,
-    "values": [
-
-    ]
-   },
-   "yaxes": [
-    {
-     "format": "short",
-     "label": null,
-     "logBase": 1,
-     "max": null,
-     "min": null,
-     "show": true
-    },
-    {
-     "format": "short",
-     "label": null,
-     "logBase": 1,
-     "max": null,
-     "min": null,
-     "show": true
-    }
-   ],
-   "yaxis": {
-    "align": false,
-    "alignLevel": null
-   }
-  },
-  {
-   "aliasColors": {
-
-   },
-   "bars": false,
-   "dashLength": 10,
-   "dashes": false,
-   "datasource": "$ds",
-   "fieldConfig": {
-    "defaults": {
-     "custom": {
-
-     }
-    },
-    "overrides": [
-
-    ]
-   },
-   "fill": 1,
-   "fillGradient": 0,
-   "gridPos": {
-    "h": 5,
-    "w": 3,
-    "x": 19,
-    "y": 18
-   },
-   "hiddenSeries": false,
-   "id": 14,
-   "legend": {
-    "avg": false,
-    "current": false,
-    "max": false,
-    "min": false,
-    "show": false,
-    "total": false,
-    "values": false
-   },
-   "lines": true,
-   "linewidth": 1,
-   "nullPointMode": "null",
-   "options": {
-    "alertThreshold": true
-   },
-   "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
-   "pointradius": 2,
-   "points": false,
-   "renderer": "flot",
-   "seriesOverrides": [
-
-   ],
-   "spaceLength": 10,
-   "stack": false,
-   "steppedLine": false,
-   "targets": [
-    {
-     "expr": "histogram_quantile(.99, sum(rate(tempo_query_reads_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (layer, le))",
-     "hide": false,
-     "interval": "",
-     "legendFormat": "{{layer}}",
-     "refId": "A"
-    }
-   ],
-   "thresholds": [
-
-   ],
-   "timeFrom": null,
-   "timeRegions": [
-
-   ],
-   "timeShift": null,
-   "title": "p99 Reads",
-   "tooltip": {
-    "shared": true,
-    "sort": 0,
-    "value_type": "individual"
-   },
-   "type": "graph",
-   "xaxis": {
-    "buckets": null,
-    "mode": "time",
-    "name": null,
-    "show": true,
-    "values": [
-
-    ]
-   },
-   "yaxes": [
-    {
-     "format": "short",
-     "label": null,
-     "logBase": 1,
-     "max": null,
-     "min": null,
-     "show": true
-    },
-    {
-     "format": "short",
-     "label": null,
-     "logBase": 1,
-     "max": null,
-     "min": null,
-     "show": true
-    }
-   ],
-   "yaxis": {
-    "align": false,
-    "alignLevel": null
-   }
-  },
   {
    "collapsed": false,
    "datasource": null,
@@ -2757,7 +2417,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -2866,7 +2526,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -2975,7 +2635,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -3084,7 +2744,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -3193,7 +2853,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -3302,7 +2962,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": true,
    "renderer": "flot",
@@ -3423,7 +3083,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -3532,7 +3192,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -3647,7 +3307,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -3768,7 +3428,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -3908,7 +3568,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -4020,7 +3680,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -4858,7 +4518,7 @@
        },
        {
         "color": "red",
-        "value": 0.05
+        "value": 0.050000000000000003
        }
       ]
      },
@@ -4961,7 +4621,7 @@
 
     }
    },
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "targets": [
     {
      "expr": "sum(rate(tempo_vulture_trace_error_total{cluster=\"$cluster\", namespace=\"$namespace\", error=\"notfound\"}[1h])) by (secondsago) / sum(rate(tempo_vulture_trace_total{cluster=\"$cluster\", namespace=\"$namespace\"}[1h])) by (secondsago)",
@@ -4997,7 +4657,7 @@
        },
        {
         "color": "red",
-        "value": 0.05
+        "value": 0.050000000000000003
        }
       ]
      },
@@ -5100,7 +4760,7 @@
 
     }
    },
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "targets": [
     {
      "expr": "sum(rate(tempo_vulture_trace_error_total{cluster=\"$cluster\", namespace=\"$namespace\", error=\"missingspans\"}[1h])) by (secondsago) / sum(rate(tempo_vulture_trace_total{cluster=\"$cluster\", namespace=\"$namespace\"}[1h])) by (secondsago)",
@@ -5174,7 +4834,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -5286,7 +4946,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -5395,7 +5055,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -5504,7 +5164,7 @@
     "alertThreshold": true
    },
    "percentage": false,
-   "pluginVersion": "7.4.0-9155pre",
+   "pluginVersion": "7.5.0-11433pre",
    "pointradius": 2,
    "points": false,
    "renderer": "flot",
@@ -5767,4 +5427,4 @@
  "title": "Tempo Operational",
  "uid": "AOFXrRUZz",
  "version": 1
-}
\ No newline at end of file
+}
diff --git a/operations/tempo-mixin/out/tempo-reads.json b/operations/tempo-mixin/out/tempo-reads.json
index 593558103d2..b815b54d6bb 100644
--- a/operations/tempo-mixin/out/tempo-reads.json
+++ b/operations/tempo-mixin/out/tempo-reads.json
@@ -1895,4 +1895,4 @@
  "title": "Tempo / Reads",
  "uid": "",
  "version": 0
-}
\ No newline at end of file
+}
diff --git a/operations/tempo-mixin/out/tempo-resources.json b/operations/tempo-mixin/out/tempo-resources.json
index c8ca9b8d469..406acc9b3b9 100644
--- a/operations/tempo-mixin/out/tempo-resources.json
+++ b/operations/tempo-mixin/out/tempo-resources.json
@@ -1933,4 +1933,4 @@
  "title": "Tempo / Resources",
  "uid": "",
  "version": 0
-}
\ No newline at end of file
+}
diff --git a/operations/tempo-mixin/out/tempo-writes.json b/operations/tempo-mixin/out/tempo-writes.json
index 4a243dab73c..f155150b3a6 100644
--- a/operations/tempo-mixin/out/tempo-writes.json
+++ b/operations/tempo-mixin/out/tempo-writes.json
@@ -1507,4 +1507,4 @@
  "title": "Tempo / Writes",
  "uid": "",
  "version": 0
-}
\ No newline at end of file
+}
diff --git a/operations/tempo-mixin/tempo-operational.json b/operations/tempo-mixin/tempo-operational.json
index b6abacc7fac..17db9360100 100644
--- a/operations/tempo-mixin/tempo-operational.json
+++ b/operations/tempo-mixin/tempo-operational.json
@@ -25,7 +25,8 @@
   "editable": true,
   "gnetId": null,
   "graphTooltip": 1,
-  "iteration": 1609831418010,
+  "id": 115,
+  "iteration": 1612469301683,
   "links": [],
   "panels": [
     {
@@ -80,7 +81,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -175,7 +176,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -270,7 +271,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -364,7 +365,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -460,7 +461,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -555,7 +556,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -658,7 +659,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -752,7 +753,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -861,7 +862,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -956,7 +957,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -1060,7 +1061,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": true,
       "renderer": "flot",
@@ -1166,7 +1167,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -1262,7 +1263,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": true,
       "renderer": "flot",
@@ -1369,7 +1370,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -1470,7 +1471,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -1580,7 +1581,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -1676,7 +1677,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -1783,7 +1784,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -1890,7 +1891,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -1959,113 +1960,6 @@
         "alignLevel": null
       }
     },
-    {
-      "aliasColors": {},
-      "bars": false,
-      "dashLength": 10,
-      "dashes": false,
-      "datasource": "$ds",
-      "fieldConfig": {
-        "defaults": {
-          "custom": {}
-        },
-        "overrides": []
-      },
-      "fill": 1,
-      "fillGradient": 0,
-      "gridPos": {
-        "h": 5,
-        "w": 3,
-        "x": 20,
-        "y": 13
-      },
-      "hiddenSeries": false,
-      "id": 13,
-      "legend": {
-        "avg": false,
-        "current": false,
-        "max": false,
-        "min": false,
-        "show": false,
-        "total": false,
-        "values": false
-      },
-      "lines": true,
-      "linewidth": 1,
-      "nullPointMode": "null",
-      "options": {
-        "alertThreshold": true
-      },
-      "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
-      "pointradius": 2,
-      "points": false,
-      "renderer": "flot",
-      "seriesOverrides": [],
-      "spaceLength": 10,
-      "stack": false,
-      "steppedLine": false,
-      "targets": [
-        {
-          "expr": "histogram_quantile(.99, sum(rate(tempo_query_bytes_read_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (le))",
-          "interval": "",
-          "legendFormat": ".99",
-          "refId": "A"
-        },
-        {
-          "expr": "histogram_quantile(.9, sum(rate(tempo_query_bytes_read_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (le))",
-          "interval": "",
-          "legendFormat": ".9",
-          "refId": "B"
-        },
-        {
-          "expr": "histogram_quantile(.5, sum(rate(tempo_query_bytes_read_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (le))",
-          "interval": "",
-          "legendFormat": ".5",
-          "refId": "C"
-        }
-      ],
-      "thresholds": [],
-      "timeFrom": null,
-      "timeRegions": [],
-      "timeShift": null,
-      "title": "Query  Bytes Read",
-      "tooltip": {
-        "shared": true,
-        "sort": 0,
-        "value_type": "individual"
-      },
-      "type": "graph",
-      "xaxis": {
-        "buckets": null,
-        "mode": "time",
-        "name": null,
-        "show": true,
-        "values": []
-      },
-      "yaxes": [
-        {
-          "format": "decbytes",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        },
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        }
-      ],
-      "yaxis": {
-        "align": false,
-        "alignLevel": null
-      }
-    },
     {
       "aliasColors": {},
       "bars": false,
@@ -2104,7 +1998,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -2173,198 +2067,6 @@
         "alignLevel": null
       }
     },
-    {
-      "aliasColors": {},
-      "bars": false,
-      "dashLength": 10,
-      "dashes": false,
-      "datasource": "$ds",
-      "fieldConfig": {
-        "defaults": {
-          "custom": {}
-        },
-        "overrides": []
-      },
-      "fill": 1,
-      "fillGradient": 0,
-      "gridPos": {
-        "h": 5,
-        "w": 3,
-        "x": 16,
-        "y": 18
-      },
-      "hiddenSeries": false,
-      "id": 15,
-      "legend": {
-        "avg": false,
-        "current": false,
-        "max": false,
-        "min": false,
-        "show": false,
-        "total": false,
-        "values": false
-      },
-      "lines": true,
-      "linewidth": 1,
-      "nullPointMode": "null",
-      "options": {
-        "alertThreshold": true
-      },
-      "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
-      "pointradius": 2,
-      "points": false,
-      "renderer": "flot",
-      "seriesOverrides": [],
-      "spaceLength": 10,
-      "stack": false,
-      "steppedLine": false,
-      "targets": [
-        {
-          "expr": "histogram_quantile(.5, sum(rate(tempo_query_reads_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (layer, le))",
-          "hide": false,
-          "interval": "",
-          "legendFormat": "{{layer}}",
-          "refId": "A"
-        }
-      ],
-      "thresholds": [],
-      "timeFrom": null,
-      "timeRegions": [],
-      "timeShift": null,
-      "title": "p50 Reads",
-      "tooltip": {
-        "shared": true,
-        "sort": 0,
-        "value_type": "individual"
-      },
-      "type": "graph",
-      "xaxis": {
-        "buckets": null,
-        "mode": "time",
-        "name": null,
-        "show": true,
-        "values": []
-      },
-      "yaxes": [
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        },
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        }
-      ],
-      "yaxis": {
-        "align": false,
-        "alignLevel": null
-      }
-    },
-    {
-      "aliasColors": {},
-      "bars": false,
-      "dashLength": 10,
-      "dashes": false,
-      "datasource": "$ds",
-      "fieldConfig": {
-        "defaults": {
-          "custom": {}
-        },
-        "overrides": []
-      },
-      "fill": 1,
-      "fillGradient": 0,
-      "gridPos": {
-        "h": 5,
-        "w": 3,
-        "x": 19,
-        "y": 18
-      },
-      "hiddenSeries": false,
-      "id": 14,
-      "legend": {
-        "avg": false,
-        "current": false,
-        "max": false,
-        "min": false,
-        "show": false,
-        "total": false,
-        "values": false
-      },
-      "lines": true,
-      "linewidth": 1,
-      "nullPointMode": "null",
-      "options": {
-        "alertThreshold": true
-      },
-      "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
-      "pointradius": 2,
-      "points": false,
-      "renderer": "flot",
-      "seriesOverrides": [],
-      "spaceLength": 10,
-      "stack": false,
-      "steppedLine": false,
-      "targets": [
-        {
-          "expr": "histogram_quantile(.99, sum(rate(tempo_query_reads_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])) by (layer, le))",
-          "hide": false,
-          "interval": "",
-          "legendFormat": "{{layer}}",
-          "refId": "A"
-        }
-      ],
-      "thresholds": [],
-      "timeFrom": null,
-      "timeRegions": [],
-      "timeShift": null,
-      "title": "p99 Reads",
-      "tooltip": {
-        "shared": true,
-        "sort": 0,
-        "value_type": "individual"
-      },
-      "type": "graph",
-      "xaxis": {
-        "buckets": null,
-        "mode": "time",
-        "name": null,
-        "show": true,
-        "values": []
-      },
-      "yaxes": [
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        },
-        {
-          "format": "short",
-          "label": null,
-          "logBase": 1,
-          "max": null,
-          "min": null,
-          "show": true
-        }
-      ],
-      "yaxis": {
-        "align": false,
-        "alignLevel": null
-      }
-    },
     {
       "collapsed": false,
       "datasource": null,
@@ -2417,7 +2119,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -2512,7 +2214,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -2607,7 +2309,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -2702,7 +2404,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -2797,7 +2499,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -2892,7 +2594,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": true,
       "renderer": "flot",
@@ -2999,7 +2701,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -3094,7 +2796,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -3195,7 +2897,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -3302,7 +3004,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -3424,7 +3126,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -3520,7 +3222,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -4349,7 +4051,7 @@
         "showUnfilled": true,
         "text": {}
       },
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "targets": [
         {
           "expr": "sum(rate(tempo_vulture_trace_error_total{cluster=\"$cluster\", namespace=\"$namespace\", error=\"notfound\"}[1h])) by (secondsago) / sum(rate(tempo_vulture_trace_total{cluster=\"$cluster\", namespace=\"$namespace\"}[1h])) by (secondsago)",
@@ -4482,7 +4184,7 @@
         "showUnfilled": true,
         "text": {}
       },
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "targets": [
         {
           "expr": "sum(rate(tempo_vulture_trace_error_total{cluster=\"$cluster\", namespace=\"$namespace\", error=\"missingspans\"}[1h])) by (secondsago) / sum(rate(tempo_vulture_trace_total{cluster=\"$cluster\", namespace=\"$namespace\"}[1h])) by (secondsago)",
@@ -4548,7 +4250,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -4646,7 +4348,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -4741,7 +4443,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -4836,7 +4538,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0-9155pre",
+      "pluginVersion": "7.5.0-11433pre",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
diff --git a/tempodb/backend/block_meta.go b/tempodb/backend/block_meta.go
index c60048fe4e1..b5bfd329d22 100644
--- a/tempodb/backend/block_meta.go
+++ b/tempodb/backend/block_meta.go
@@ -23,18 +23,20 @@ type BlockMeta struct {
 	EndTime         time.Time `json:"endTime"`
 	TotalObjects    int       `json:"totalObjects"`
 	CompactionLevel uint8     `json:"compactionLevel"`
+	Encoding        Encoding  `json:"encoding"`
 }
 
-func NewBlockMeta(tenantID string, blockID uuid.UUID) *BlockMeta {
+func NewBlockMeta(tenantID string, blockID uuid.UUID, version string, encoding Encoding) *BlockMeta {
 	now := time.Now()
 	b := &BlockMeta{
-		Version:   "v0", // jpe - think about this
+		Version:   version,
 		BlockID:   blockID,
 		MinID:     []byte{},
 		MaxID:     []byte{},
 		TenantID:  tenantID,
 		StartTime: now,
 		EndTime:   now,
+		Encoding:  encoding,
 	}
 
 	return b
diff --git a/tempodb/backend/block_meta_test.go b/tempodb/backend/block_meta_test.go
index 78168a33c9f..54d41417f4f 100644
--- a/tempodb/backend/block_meta_test.go
+++ b/tempodb/backend/block_meta_test.go
@@ -14,11 +14,16 @@ const (
 )
 
 func TestBlockMeta(t *testing.T) {
+	testVersion := "blerg"
+	testEncoding := EncLZ4_256k
+
 	id := uuid.New()
-	b := NewBlockMeta(testTenantID, id)
+	b := NewBlockMeta(testTenantID, id, testVersion, testEncoding)
 
 	assert.Equal(t, id, b.BlockID)
 	assert.Equal(t, testTenantID, b.TenantID)
+	assert.Equal(t, testVersion, b.Version)
+	assert.Equal(t, testEncoding, b.Encoding)
 
 	randID1 := make([]byte, 10)
 	randID2 := make([]byte, 10)
@@ -32,6 +37,5 @@ func TestBlockMeta(t *testing.T) {
 	b.ObjectAdded(randID2)
 	assert.True(t, b.EndTime.After(b.StartTime))
 	assert.Equal(t, 1, bytes.Compare(b.MaxID, b.MinID))
-
 	assert.Equal(t, 2, b.TotalObjects)
 }
diff --git a/tempodb/backend/encoding.go b/tempodb/backend/encoding.go
new file mode 100644
index 00000000000..e482674ed90
--- /dev/null
+++ b/tempodb/backend/encoding.go
@@ -0,0 +1,124 @@
+package backend
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"strings"
+)
+
+// Encoding is the identifier for a chunk encoding.
+type Encoding byte
+
+// The different available encodings.
+// Make sure to preserve the order, as these numeric values are written to the chunks!
+const (
+	EncNone Encoding = iota
+	EncGZIP
+	EncLZ4_64k
+	EncLZ4_256k
+	EncLZ4_1M
+	EncLZ4_4M
+	EncSnappy
+	EncZstd
+)
+
+// SupportedEncoding is a slice of all supported encodings.
+var SupportedEncoding = []Encoding{
+	EncNone,
+	EncGZIP,
+	EncLZ4_64k,
+	EncLZ4_256k,
+	EncLZ4_1M,
+	EncLZ4_4M,
+	EncSnappy,
+	EncZstd,
+}
+
+func (e Encoding) String() string {
+	switch e {
+	case EncNone:
+		return "none"
+	case EncGZIP:
+		return "gzip"
+	case EncLZ4_64k:
+		return "lz4-64k"
+	case EncLZ4_256k:
+		return "lz4-256k"
+	case EncLZ4_1M:
+		return "lz4-1M"
+	case EncLZ4_4M:
+		return "lz4"
+	case EncSnappy:
+		return "snappy"
+	case EncZstd:
+		return "zstd"
+	default:
+		return "unsupported"
+	}
+}
+
+// UnmarshalYAML implements the Unmarshaler interface of the yaml pkg.
+func (e *Encoding) UnmarshalYAML(unmarshal func(interface{}) error) error {
+	var encString string
+	err := unmarshal(&encString)
+	if err != nil {
+		return err
+	}
+
+	*e, err = ParseEncoding(encString)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// MarshalYAML implements the Marshaler interface of the yaml pkg.
+func (e Encoding) MarshalYAML() (interface{}, error) {
+	return e.String(), nil
+}
+
+// UnmarshalJSON implements the Unmarshaler interface of the json pkg.
+func (e *Encoding) UnmarshalJSON(b []byte) error {
+	var encString string
+	err := json.Unmarshal(b, &encString)
+	if err != nil {
+		return err
+	}
+
+	*e, err = ParseEncoding(encString)
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// MarshalJSON implements the Marshaler interface of the json pkg.
+func (e Encoding) MarshalJSON() ([]byte, error) {
+	buffer := bytes.NewBufferString("\"" + e.String() + "\"")
+	return buffer.Bytes(), nil
+}
+
+// ParseEncoding parses a chunk encoding (compression algorithm) by its name.
+func ParseEncoding(enc string) (Encoding, error) {
+	for _, e := range SupportedEncoding {
+		if strings.EqualFold(e.String(), enc) {
+			return e, nil
+		}
+	}
+	return 0, fmt.Errorf("invalid encoding: %s, supported: %s", enc, SupportedEncodingString())
+}
+
+// SupportedEncodingString returns a comma-separated list of the supported encoding names.
+func SupportedEncodingString() string {
+	var sb strings.Builder
+	for i := range SupportedEncoding {
+		sb.WriteString(SupportedEncoding[i].String())
+		if i != len(SupportedEncoding)-1 {
+			sb.WriteString(", ")
+		}
+	}
+	return sb.String()
+}
diff --git a/tempodb/backend/encoding_test.go b/tempodb/backend/encoding_test.go
new file mode 100644
index 00000000000..5a01d940c5b
--- /dev/null
+++ b/tempodb/backend/encoding_test.go
@@ -0,0 +1,45 @@
+package backend
+
+import (
+	"encoding/json"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"gopkg.in/yaml.v3"
+)
+
+type marshalTest struct {
+	Test Encoding // the value round-tripped through yaml/json by the tests in this file
+}
+
+func TestUnmarshalMarshalYaml(t *testing.T) {
+	for _, enc := range SupportedEncoding {
+		expected := marshalTest{
+			Test: enc,
+		}
+		actual := marshalTest{}
+
+		buff, err := yaml.Marshal(expected)
+		assert.NoError(t, err)
+		err = yaml.Unmarshal(buff, &actual)
+		assert.NoError(t, err)
+
+		assert.Equal(t, expected, actual)
+	}
+}
+
+func TestUnmarshalMarshalJson(t *testing.T) {
+	for _, enc := range SupportedEncoding {
+		expected := marshalTest{
+			Test: enc,
+		}
+		actual := marshalTest{}
+
+		buff, err := json.Marshal(expected)
+		assert.NoError(t, err)
+		err = json.Unmarshal(buff, &actual)
+		assert.NoError(t, err)
+
+		assert.Equal(t, expected, actual)
+	}
+}
diff --git a/tempodb/backend/gcs/gcs.go b/tempodb/backend/gcs/gcs.go
index 90297a2738d..32871d53426 100644
--- a/tempodb/backend/gcs/gcs.go
+++ b/tempodb/backend/gcs/gcs.go
@@ -132,6 +132,10 @@ func (rw *readerWriter) Append(ctx context.Context, name string, blockID uuid.UU
 
 // CloseAppend implements backend.Writer
 func (rw *readerWriter) CloseAppend(_ context.Context, tracker backend.AppendTracker) error {
+	if tracker == nil {
+		return nil
+	}
+
 	w := tracker.(*storage.Writer)
 	return w.Close()
 }
diff --git a/tempodb/backend/local/local.go b/tempodb/backend/local/local.go
index f0022f9fea3..1c71bc2d254 100644
--- a/tempodb/backend/local/local.go
+++ b/tempodb/backend/local/local.go
@@ -112,6 +112,10 @@ func (rw *readerWriter) Append(ctx context.Context, name string, blockID uuid.UU
 
 // CloseAppend implements backend.Writer
 func (rw *readerWriter) CloseAppend(ctx context.Context, tracker backend.AppendTracker) error {
+	if tracker == nil {
+		return nil
+	}
+
 	var dst *os.File = tracker.(*os.File)
 	return dst.Close()
 }
diff --git a/tempodb/backend/readerat.go b/tempodb/backend/readerat.go
new file mode 100644
index 00000000000..527ff40b8b8
--- /dev/null
+++ b/tempodb/backend/readerat.go
@@ -0,0 +1,26 @@
+package backend
+
+import "context"
+
+// ReaderAt is a shim that allows a backend.Reader to be used as an io.ReaderAt
+type ReaderAt struct {
+	meta *BlockMeta // supplies the BlockID and TenantID passed to ReadRange
+	name string     // object name passed to ReadRange
+	r    Reader     // backend the ranged reads are issued against
+}
+
+// NewReaderAt creates a ReaderAt over the object called name in the block described by meta
+func NewReaderAt(meta *BlockMeta, name string, r Reader) *ReaderAt {
+	return &ReaderAt{meta: meta, name: name, r: r}
+}
+
+// ReadAt implements io.ReaderAt.  It asks the backend to fill p starting at off.  A failed
+// read reports 0 bytes so callers never treat p as valid; success assumes p was filled.
+func (b *ReaderAt) ReadAt(p []byte, off int64) (int, error) {
+	// todo:  how to preserve context?  len(p) on success is still cheating
+	err := b.r.ReadRange(context.Background(), b.name, b.meta.BlockID, b.meta.TenantID, uint64(off), p)
+	if err != nil {
+		return 0, err
+	}
+	return len(p), nil
+}
diff --git a/tempodb/backend/s3/s3.go b/tempodb/backend/s3/s3.go
index 4e3b7cb378b..875067ae86e 100644
--- a/tempodb/backend/s3/s3.go
+++ b/tempodb/backend/s3/s3.go
@@ -226,6 +226,10 @@ func (rw *readerWriter) Append(ctx context.Context, name string, blockID uuid.UU
 
 // CloseAppend implements backend.Writer
 func (rw *readerWriter) CloseAppend(ctx context.Context, tracker backend.AppendTracker) error {
+	if tracker == nil {
+		return nil
+	}
+
 	a := tracker.(appendTracker)
 	completeParts := make([]minio.CompletePart, 0)
 	for _, p := range a.parts {
diff --git a/tempodb/compactor.go b/tempodb/compactor.go
index 115b103313a..8b5d45df033 100644
--- a/tempodb/compactor.go
+++ b/tempodb/compactor.go
@@ -134,12 +134,12 @@ func (rw *readerWriter) compact(blockMetas []*backend.BlockMeta, tenantID string
 		level.Info(rw.logger).Log("msg", "compacting block", "block", fmt.Sprintf("%+v", blockMeta))
 		totalRecords += blockMeta.TotalObjects
 
-		block, err := encoding.NewBackendBlock(blockMeta)
+		block, err := encoding.NewBackendBlock(blockMeta, rw.r)
 		if err != nil {
 			return err
 		}
 
-		iter, err := block.Iterator(rw.compactorCfg.ChunkSizeBytes, rw.r)
+		iter, err := block.Iterator(rw.compactorCfg.ChunkSizeBytes)
 		if err != nil {
 			return err
 		}
@@ -188,7 +188,7 @@ func (rw *readerWriter) compact(blockMetas []*backend.BlockMeta, tenantID string
 
 		// make a new block if necessary
 		if currentBlock == nil {
-			currentBlock, err = encoding.NewCompactorBlock(uuid.New(), tenantID, rw.cfg.WAL.BloomFP, rw.cfg.WAL.IndexDownsample, blockMetas, recordsPerBlock)
+			currentBlock, err = encoding.NewCompactorBlock(rw.cfg.Block, uuid.New(), tenantID, blockMetas, recordsPerBlock)
 			if err != nil {
 				return errors.Wrap(err, "error making new compacted block")
 			}
@@ -242,12 +242,12 @@ func (rw *readerWriter) compact(blockMetas []*backend.BlockMeta, tenantID string
 func appendBlock(rw *readerWriter, tracker backend.AppendTracker, block *encoding.CompactorBlock) (backend.AppendTracker, error) {
 	compactionLevelLabel := strconv.Itoa(int(block.BlockMeta().CompactionLevel - 1))
 	metricCompactionObjectsWritten.WithLabelValues(compactionLevelLabel).Add(float64(block.CurrentBufferedObjects()))
-	metricCompactionBytesWritten.WithLabelValues(compactionLevelLabel).Add(float64(block.CurrentBufferLength()))
 
-	tracker, err := block.FlushBuffer(context.TODO(), tracker, rw.w)
+	tracker, bytesFlushed, err := block.FlushBuffer(context.TODO(), tracker, rw.w)
 	if err != nil {
 		return nil, err
 	}
+	metricCompactionBytesWritten.WithLabelValues(compactionLevelLabel).Add(float64(bytesFlushed))
 
 	return tracker, nil
 }
@@ -255,15 +255,12 @@ func appendBlock(rw *readerWriter, tracker backend.AppendTracker, block *encodin
 func finishBlock(rw *readerWriter, tracker backend.AppendTracker, block *encoding.CompactorBlock) error {
 	level.Info(rw.logger).Log("msg", "writing compacted block", "block", fmt.Sprintf("%+v", block.BlockMeta()))
 
-	tracker, err := appendBlock(rw, tracker, block)
-	if err != nil {
-		return err
-	}
-
-	err = block.Complete(context.TODO(), tracker, rw.w)
+	bytesFlushed, err := block.Complete(context.TODO(), tracker, rw.w)
 	if err != nil {
 		return err
 	}
+	compactionLevelLabel := strconv.Itoa(int(block.BlockMeta().CompactionLevel - 1))
+	metricCompactionBytesWritten.WithLabelValues(compactionLevelLabel).Add(float64(bytesFlushed))
 
 	return nil
 }
diff --git a/tempodb/compactor_bookmark_test.go b/tempodb/compactor_bookmark_test.go
index 0b19e5b046f..9287b87f106 100644
--- a/tempodb/compactor_bookmark_test.go
+++ b/tempodb/compactor_bookmark_test.go
@@ -14,6 +14,7 @@ import (
 	"github.com/golang/protobuf/proto"
 	"github.com/google/uuid"
 	"github.com/grafana/tempo/pkg/util/test"
+	"github.com/grafana/tempo/tempodb/backend"
 	"github.com/grafana/tempo/tempodb/backend/local"
 	"github.com/grafana/tempo/tempodb/encoding"
 	"github.com/grafana/tempo/tempodb/wal"
@@ -30,10 +31,13 @@ func TestCurrentClear(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
+		Block: &encoding.BlockConfig{
 			IndexDownsample: 17,
 			BloomFP:         .01,
+			Encoding:        backend.EncGZIP,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -64,16 +68,16 @@ func TestCurrentClear(t *testing.T) {
 		assert.NoError(t, err, "unexpected error writing req")
 	}
 
-	complete, err := head.Complete(wal, &mockSharder{})
+	complete, err := w.CompleteBlock(head, &mockSharder{})
 	assert.NoError(t, err)
 
 	err = w.WriteBlock(context.Background(), complete)
 	assert.NoError(t, err)
 
 	rw := r.(*readerWriter)
-	block, err := encoding.NewBackendBlock(complete.BlockMeta())
+	block, err := encoding.NewBackendBlock(complete.BlockMeta(), rw.r)
 	assert.NoError(t, err)
-	iter, err := block.Iterator(10, rw.r)
+	iter, err := block.Iterator(10)
 	assert.NoError(t, err)
 	bm := newBookmark(iter)
 
diff --git a/tempodb/compactor_test.go b/tempodb/compactor_test.go
index e2f2cbf0e63..de3c90a22f9 100644
--- a/tempodb/compactor_test.go
+++ b/tempodb/compactor_test.go
@@ -18,6 +18,7 @@ import (
 	"github.com/grafana/tempo/pkg/util/test"
 	"github.com/grafana/tempo/tempodb/backend"
 	"github.com/grafana/tempo/tempodb/backend/local"
+	"github.com/grafana/tempo/tempodb/encoding"
 	"github.com/grafana/tempo/tempodb/pool"
 	"github.com/grafana/tempo/tempodb/wal"
 )
@@ -51,10 +52,13 @@ func TestCompaction(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
+		Block: &encoding.BlockConfig{
 			IndexDownsample: 11,
 			BloomFP:         .01,
+			Encoding:        backend.EncLZ4_4M,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -99,7 +103,7 @@ func TestCompaction(t *testing.T) {
 		allReqs = append(allReqs, reqs...)
 		allIds = append(allIds, ids...)
 
-		complete, err := head.Complete(wal, &mockSharder{})
+		complete, err := w.CompleteBlock(head, &mockSharder{})
 		assert.NoError(t, err)
 
 		err = w.WriteBlock(context.Background(), complete)
@@ -149,7 +153,7 @@ func TestCompaction(t *testing.T) {
 
 	// now see if we can find our ids
 	for i, id := range allIds {
-		b, _, err := rw.Find(context.Background(), testTenantID, id, BlockIDMin, BlockIDMax)
+		b, err := rw.Find(context.Background(), testTenantID, id, BlockIDMin, BlockIDMax)
 		assert.NoError(t, err)
 
 		out := &tempopb.PushRequest{}
@@ -174,10 +178,13 @@ func TestSameIDCompaction(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
-			IndexDownsample: rand.Int()%20 + 1,
+		Block: &encoding.BlockConfig{
+			IndexDownsample: 11,
 			BloomFP:         .01,
+			Encoding:        backend.EncSnappy,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -206,7 +213,7 @@ func TestSameIDCompaction(t *testing.T) {
 		err = head.Write(id, rec)
 		assert.NoError(t, err, "unexpected error writing req")
 
-		complete, err := head.Complete(wal, &mockSharder{})
+		complete, err := w.CompleteBlock(head, &mockSharder{})
 		assert.NoError(t, err)
 
 		err = w.WriteBlock(context.Background(), complete)
@@ -258,10 +265,13 @@ func TestCompactionUpdatesBlocklist(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
-			IndexDownsample: rand.Int()%20 + 1,
+		Block: &encoding.BlockConfig{
+			IndexDownsample: 11,
 			BloomFP:         .01,
+			Encoding:        backend.EncNone,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -298,7 +308,7 @@ func TestCompactionUpdatesBlocklist(t *testing.T) {
 	// Make sure all expected traces are found.
 	for i := 0; i < blockCount; i++ {
 		for j := 0; j < recordCount; j++ {
-			trace, _, err := rw.Find(context.TODO(), testTenantID, makeTraceID(i, j), BlockIDMin, BlockIDMax)
+			trace, err := rw.Find(context.TODO(), testTenantID, makeTraceID(i, j), BlockIDMin, BlockIDMax)
 			assert.NotNil(t, trace)
 			assert.Greater(t, len(trace), 0)
 			assert.NoError(t, err)
@@ -320,10 +330,13 @@ func TestCompactionMetrics(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
-			IndexDownsample: rand.Int()%20 + 1,
+		Block: &encoding.BlockConfig{
+			IndexDownsample: 11,
 			BloomFP:         .01,
+			Encoding:        backend.EncNone,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -391,10 +404,13 @@ func TestCompactionIteratesThroughTenants(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
-			IndexDownsample: rand.Int()%20 + 1,
+		Block: &encoding.BlockConfig{
+			IndexDownsample: 11,
 			BloomFP:         .01,
+			Encoding:        backend.EncLZ4_64k,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -444,7 +460,7 @@ func cutTestBlocks(t *testing.T, w Writer, tenantID string, blockCount int, reco
 			assert.NoError(t, err, "unexpected error writing rec")
 		}
 
-		complete, err := head.Complete(wal, &mockSharder{})
+		complete, err := w.CompleteBlock(head, &mockSharder{})
 		assert.NoError(t, err)
 
 		err = w.WriteBlock(context.Background(), complete)
diff --git a/tempodb/config.go b/tempodb/config.go
index b4dc3c438b0..7d731554f00 100644
--- a/tempodb/config.go
+++ b/tempodb/config.go
@@ -1,6 +1,8 @@
 package tempodb
 
 import (
+	"errors"
+	"fmt"
 	"time"
 
 	"github.com/grafana/tempo/tempodb/backend/azure"
@@ -9,6 +11,7 @@ import (
 	"github.com/grafana/tempo/tempodb/backend/gcs"
 	"github.com/grafana/tempo/tempodb/backend/local"
 	"github.com/grafana/tempo/tempodb/backend/s3"
+	"github.com/grafana/tempo/tempodb/encoding"
 	"github.com/grafana/tempo/tempodb/pool"
 	"github.com/grafana/tempo/tempodb/wal"
 )
@@ -16,23 +19,29 @@ import (
 const DefaultBlocklistPollConcurrency = uint(50)
 const DefaultRetentionConcurrency = uint(10)
 
+// Config holds the entirety of tempodb configuration
 type Config struct {
+	Pool  *pool.Config          `yaml:"pool,omitempty"`
+	WAL   *wal.Config           `yaml:"wal"`
+	Block *encoding.BlockConfig `yaml:"block"`
+
+	BlocklistPoll            time.Duration `yaml:"blocklist_poll"`
+	BlocklistPollConcurrency uint          `yaml:"blocklist_poll_concurrency"`
+
+	// backends
 	Backend string        `yaml:"backend"`
 	Local   *local.Config `yaml:"local"`
 	GCS     *gcs.Config   `yaml:"gcs"`
 	S3      *s3.Config    `yaml:"s3"`
 	Azure   *azure.Config `yaml:"azure"`
-	Pool    *pool.Config  `yaml:"pool,omitempty"`
-	WAL     *wal.Config   `yaml:"wal"`
 
+	// caches
 	Cache     string            `yaml:"cache"`
 	Memcached *memcached.Config `yaml:"memcached"`
 	Redis     *redis.Config     `yaml:"redis"`
-
-	BlocklistPoll            time.Duration `yaml:"blocklist_poll"`
-	BlocklistPollConcurrency uint          `yaml:"blocklist_poll_concurrency"`
 }
 
+// CompactorConfig contains compaction configuration options
 type CompactorConfig struct {
 	ChunkSizeBytes          uint32        `yaml:"chunk_size_bytes"` // todo: do we need this?
 	FlushSizeBytes          uint32        `yaml:"flush_size_bytes"`
@@ -42,3 +51,20 @@ type CompactorConfig struct {
 	CompactedBlockRetention time.Duration `yaml:"compacted_block_retention"`
 	RetentionConcurrency    uint          `yaml:"retention_concurrency"`
 }
+
+// validateConfig checks that the wal and block sections are present and that
+// the block section passes encoding.ValidateConfig.
+func validateConfig(cfg *Config) error {
+	switch {
+	case cfg.WAL == nil:
+		return errors.New("wal config should be non-nil")
+	case cfg.Block == nil:
+		return errors.New("block config should be non-nil")
+	}
+
+	if err := encoding.ValidateConfig(cfg.Block); err != nil {
+		return fmt.Errorf("block config validation failed: %w", err)
+	}
+
+	return nil
+}
diff --git a/tempodb/encoding/backend_block.go b/tempodb/encoding/backend_block.go
new file mode 100644
index 00000000000..1939465b644
--- /dev/null
+++ b/tempodb/encoding/backend_block.go
@@ -0,0 +1,123 @@
+package encoding
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+
+	"github.com/opentracing/opentracing-go"
+	willf_bloom "github.com/willf/bloom"
+
+	"github.com/grafana/tempo/tempodb/backend"
+	"github.com/grafana/tempo/tempodb/encoding/common"
+)
+
+// BackendBlock represents a block already in the backend.
+type BackendBlock struct {
+	encoding versionedEncoding // supplies object names and builds the index/page readers, finder and iterator
+
+	meta   *backend.BlockMeta // identifies the block (BlockID, TenantID, Version, Encoding)
+	reader backend.Reader     // backend the bloom, index and objects are read from
+}
+
+// NewBackendBlock returns a BackendBlock for the given backend.BlockMeta
+//  It is version aware.
+func NewBackendBlock(meta *backend.BlockMeta, r backend.Reader) (*BackendBlock, error) {
+	var encoding versionedEncoding
+
+	switch meta.Version {
+	case "v0":
+		encoding = v0Encoding{}
+	case "v1":
+		encoding = v1Encoding{}
+	default:
+		return nil, fmt.Errorf("%s is not a valid block version", meta.Version)
+	}
+
+	return &BackendBlock{
+		encoding: encoding,
+		meta:     meta,
+		reader:   r,
+	}, nil
+}
+
+// Find searches a block for the ID and returns an object if found.  It returns (nil, nil) when the bloom filter rejects the id.
+func (b *BackendBlock) Find(ctx context.Context, id common.ID) ([]byte, error) {
+	var err error // shared with the deferred closure below so a failure tags the span
+	span, ctx := opentracing.StartSpanFromContext(ctx, "BackendBlock.Find")
+	defer func() {
+		if err != nil {
+			span.SetTag("error", true)
+		}
+		span.Finish()
+	}()
+
+	span.SetTag("block", b.meta.BlockID.String())
+
+	shardKey := common.ShardKeyForTraceID(id) // selects which bloom shard to read for this id
+	blockID := b.meta.BlockID
+	tenantID := b.meta.TenantID
+
+	bloomBytes, err := b.reader.Read(ctx, b.encoding.nameBloom(shardKey), blockID, tenantID)
+	if err != nil {
+		return nil, fmt.Errorf("error retrieving bloom (%s, %s): %w", b.meta.TenantID, b.meta.BlockID, err)
+	}
+
+	filter := &willf_bloom.BloomFilter{}
+	_, err = filter.ReadFrom(bytes.NewReader(bloomBytes))
+	if err != nil {
+		return nil, fmt.Errorf("error parsing bloom (%s, %s): %w", b.meta.TenantID, b.meta.BlockID, err)
+	}
+
+	if !filter.Test(id) { // bloom rejects the id: skip the index and object reads entirely
+		return nil, nil
+	}
+
+	indexBytes, err := b.reader.Read(ctx, b.encoding.nameIndex(), blockID, tenantID)
+	if err != nil {
+		return nil, fmt.Errorf("error reading index (%s, %s): %w", b.meta.TenantID, b.meta.BlockID, err)
+	}
+
+	indexReader, err := b.encoding.newIndexReader(indexBytes)
+	if err != nil {
+		return nil, fmt.Errorf("error building index reader (%s, %s): %w", b.meta.TenantID, b.meta.BlockID, err)
+	}
+
+	ra := backend.NewReaderAt(b.meta, b.encoding.nameObjects(), b.reader) // ranged reads over the block's objects
+	pageReader, err := b.encoding.newPageReader(ra, b.meta.Encoding)
+	if err != nil {
+		return nil, fmt.Errorf("error building page reader (%s, %s): %w", b.meta.TenantID, b.meta.BlockID, err)
+	}
+
+	// passing nil for objectCombiner here.  this is fine b/c a backend block should never have dupes
+	finder := b.encoding.newPagedFinder(indexReader, pageReader, nil)
+	objectBytes, err := finder.Find(id)
+
+	if err != nil {
+		return nil, fmt.Errorf("error using pageFinder (%s, %s): %w", b.meta.TenantID, b.meta.BlockID, err)
+	}
+
+	return objectBytes, nil
+}
+
+// Iterator reads the block's index from the backend and returns a common.Iterator over the objects in the block
+func (b *BackendBlock) Iterator(chunkSizeBytes uint32) (common.Iterator, error) {
+	tenantID, blockID := b.meta.TenantID, b.meta.BlockID
+	index, err := b.reader.Read(context.TODO(), b.encoding.nameIndex(), blockID, tenantID)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read index (%s, %s): %w", tenantID, blockID, err)
+	}
+
+	objects := backend.NewReaderAt(b.meta, b.encoding.nameObjects(), b.reader)
+	pages, err := b.encoding.newPageReader(objects, b.meta.Encoding)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create pageReader (%s, %s): %w", tenantID, blockID, err)
+	}
+
+	iter, err := b.encoding.newPagedIterator(chunkSizeBytes, index, pages)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create iterator (%s, %s): %w", tenantID, blockID, err)
+	}
+
+	return iter, nil
+}
diff --git a/tempodb/encoding/v0/backend_block_test.go b/tempodb/encoding/backend_block_test.go
similarity index 79%
rename from tempodb/encoding/v0/backend_block_test.go
rename to tempodb/encoding/backend_block_test.go
index 95807fe917f..534058fd7a6 100644
--- a/tempodb/encoding/v0/backend_block_test.go
+++ b/tempodb/encoding/backend_block_test.go
@@ -1,143 +1,25 @@
-package v0
+package encoding
 
 import (
-	"bytes"
 	"context"
-	"errors"
-	"strings"
 	"testing"
 
 	"github.com/google/uuid"
 	"github.com/grafana/tempo/tempodb/backend"
 	"github.com/grafana/tempo/tempodb/backend/local"
-	"github.com/grafana/tempo/tempodb/backend/util"
-	"github.com/grafana/tempo/tempodb/encoding/common"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
-	"github.com/willf/bloom"
 )
 
-func TestBackendBlock(t *testing.T) {
-	id := []byte{0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01}
-	object := []byte{0x01, 0x02}
-
-	bloom := bloom.New(1, 1)
-	bloom.Add(id)
-
-	bloomBuffer := bytes.Buffer{}
-	_, err := bloom.WriteTo(&bloomBuffer)
-	require.NoError(t, err)
-
-	objectBuffer := bytes.Buffer{}
-	_, err = marshalObjectToWriter(id, object, &objectBuffer)
-	require.NoError(t, err)
-
-	record := newRecord()
-	record.ID = id
-	record.Length = uint32(objectBuffer.Len())
-	indexBytes, err := marshalRecords([]*common.Record{record})
-	require.NoError(t, err)
-
-	tests := []struct {
-		name               string
-		id                 []byte
-		readerError        error
-		readerBloom        []byte
-		readerIndex        []byte
-		readerRange        []byte
-		expected           []byte
-		expectedBloomReads int32
-		expectedBloomBytes int32
-		expectedIndexReads int32
-		expectedIndexBytes int32
-		expectedBlockReads int32
-		expectedBlockBytes int32
-	}{
-		{
-			name:        "error",
-			id:          id,
-			readerError: errors.New("wups"),
-		},
-		{
-			name:               "bloom passes",
-			id:                 id,
-			readerBloom:        bloomBuffer.Bytes(),
-			expectedBloomReads: 1,
-			expectedBloomBytes: int32(bloomBuffer.Len()),
-			expectedIndexReads: 1,
-		},
-		{
-			name:               "index passes",
-			id:                 id,
-			readerBloom:        bloomBuffer.Bytes(),
-			readerIndex:        indexBytes,
-			expectedBloomReads: 1,
-			expectedBloomBytes: int32(bloomBuffer.Len()),
-			expectedIndexReads: 1,
-			expectedIndexBytes: int32(len(indexBytes)),
-			expectedBlockReads: 1,
-			expectedBlockBytes: int32(objectBuffer.Len()),
-		},
-		{
-			name:               "obj found",
-			id:                 id,
-			readerBloom:        bloomBuffer.Bytes(),
-			readerIndex:        indexBytes,
-			readerRange:        objectBuffer.Bytes(),
-			expected:           object,
-			expectedBloomReads: 1,
-			expectedBloomBytes: int32(bloomBuffer.Len()),
-			expectedIndexReads: 1,
-			expectedIndexBytes: int32(len(indexBytes)),
-			expectedBlockReads: 1,
-			expectedBlockBytes: int32(objectBuffer.Len()),
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			fn := func(name string, blockID uuid.UUID, tenantID string) ([]byte, error) {
-				if tt.readerError != nil {
-					return nil, tt.readerError
-				}
-
-				if strings.Contains(name, nameBloomPrefix) {
-					return tt.readerBloom, nil
-				}
-
-				return tt.readerIndex, nil
-			}
-
-			mockR := &util.MockReader{
-				ReadFn: fn,
-				Range:  tt.readerRange,
-			}
-
-			findMetrics := common.NewFindMetrics()
-			block := NewBackendBlock(&backend.BlockMeta{})
-			actual, err := block.Find(context.Background(), mockR, tt.id, &findMetrics)
-
-			assert.True(t, errors.Is(err, tt.readerError))
-			assert.Equal(t, tt.expected, actual)
-			assert.Equal(t, tt.expectedBloomReads, findMetrics.BloomFilterReads.Load())
-			assert.Equal(t, tt.expectedBloomBytes, findMetrics.BloomFilterBytesRead.Load())
-			assert.Equal(t, tt.expectedIndexReads, findMetrics.IndexReads.Load())
-			assert.Equal(t, tt.expectedIndexBytes, findMetrics.IndexBytesRead.Load())
-			assert.Equal(t, tt.expectedBlockReads, findMetrics.BlockReads.Load())
-			assert.Equal(t, tt.expectedBlockBytes, findMetrics.BlockBytesRead.Load())
-		})
-	}
-
-}
-
 func TestV0Block(t *testing.T) {
 	r, _, _, err := local.New(&local.Config{
-		Path: "./v0_test",
+		Path: "./v0test",
 	})
 	require.NoError(t, err, "error creating backend")
 
-	meta := backend.NewBlockMeta("fake", uuid.MustParse("00f5a116-639e-4880-bbe7-be9b0c828033"))
-	backendBlock := NewBackendBlock(meta)
+	meta := backend.NewBlockMeta("fake", uuid.MustParse("00f5a116-639e-4880-bbe7-be9b0c828033"), "v0", backend.EncNone)
+	backendBlock, err := NewBackendBlock(meta, r)
+	require.NoError(t, err, "error creating backendblock")
 
 	// known ids and objs written
 	ids := [][]byte{
@@ -167,16 +49,15 @@ func TestV0Block(t *testing.T) {
 	}
 
 	// test Find
-	m := common.NewFindMetrics()
 	for i, id := range ids {
-		foundBytes, err := backendBlock.Find(context.Background(), r, id, &m)
+		foundBytes, err := backendBlock.Find(context.Background(), id)
 		assert.NoError(t, err)
 
 		assert.Equal(t, reqs[i], foundBytes)
 	}
 
 	// test Iterator
-	iterator, err := backendBlock.Iterator(10, r)
+	iterator, err := backendBlock.Iterator(10)
 	require.NoError(t, err, "error getting iterator")
 	i := 0
 	for {
diff --git a/tempodb/encoding/common/index_reader.go b/tempodb/encoding/common/index_reader.go
new file mode 100644
index 00000000000..eba268d095a
--- /dev/null
+++ b/tempodb/encoding/common/index_reader.go
@@ -0,0 +1,31 @@
+package common
+
+import (
+	"bytes"
+	"sort"
+)
+
+// Records is a slice of *Record.  Its At and Find methods implement IndexReader
+type Records []*Record
+
+// At implements IndexReader.  It returns the record at index i, or nil when i is out of range
+func (r Records) At(i int) *Record {
+	if 0 <= i && i < len(r) {
+		return r[i]
+	}
+
+	return nil
+}
+
+// Find implements IndexReader.  It binary searches for the first record whose ID is >= id (r is expected to be sorted by ID) and returns (nil, -1) if none
+func (r Records) Find(id ID) (*Record, int) {
+	atOrAfter := func(idx int) bool {
+		return bytes.Compare(id, r[idx].ID) <= 0
+	}
+	pos := sort.Search(len(r), atOrAfter)
+	if pos < 0 || pos >= len(r) {
+		return nil, -1
+	}
+
+	return r[pos], pos
+}
diff --git a/tempodb/encoding/common/types.go b/tempodb/encoding/common/types.go
index 6ad019bd731..18af2775852 100644
--- a/tempodb/encoding/common/types.go
+++ b/tempodb/encoding/common/types.go
@@ -1,7 +1,5 @@
 package common
 
-import "go.uber.org/atomic"
-
 // This file contains types that need to be referenced by both the ./encoding and ./encoding/vX packages.
 // It primarily exists here to break dependency loops.
 
@@ -28,7 +26,7 @@ type Finder interface {
 // Appender is capable of tracking objects and ids that are added to it
 type Appender interface {
 	Append(ID, []byte) error
-	Complete()
+	Complete() error
 	Records() []*Record
 	Length() int
 	DataLength() uint64
@@ -39,24 +37,21 @@ type ObjectCombiner interface {
 	Combine(objA []byte, objB []byte) []byte
 }
 
-// FindMetrics is a threadsafe struct for tracking metrics related to a parallelized query
-type FindMetrics struct {
-	BloomFilterReads     *atomic.Int32
-	BloomFilterBytesRead *atomic.Int32
-	IndexReads           *atomic.Int32
-	IndexBytesRead       *atomic.Int32
-	BlockReads           *atomic.Int32
-	BlockBytesRead       *atomic.Int32
+// PageReader returns a slice of pages in the encoding/v0 format referenced by
+// the slice of *Records passed in.  The length of the returned slice is guaranteed
+// to be equal to the length of the provided records unless error is non nil.
+// PageReader is the primary abstraction point for supporting multiple data
+// formats.
+type PageReader interface {
+	Read([]*Record) ([][]byte, error)
 }
 
-// NewFindMetrics constructs a FindMetrics
-func NewFindMetrics() FindMetrics {
-	return FindMetrics{
-		BloomFilterReads:     atomic.NewInt32(0),
-		BloomFilterBytesRead: atomic.NewInt32(0),
-		IndexReads:           atomic.NewInt32(0),
-		IndexBytesRead:       atomic.NewInt32(0),
-		BlockReads:           atomic.NewInt32(0),
-		BlockBytesRead:       atomic.NewInt32(0),
-	}
+// IndexReader is used to abstract away the details of an index.  Currently
+// only used in the paged finder, it could eventually provide a way to
+// support multiple index formats.
+// IndexReader is the primary abstraction point for supporting multiple index
+// formats.
+type IndexReader interface {
+	At(i int) *Record
+	Find(id ID) (*Record, int)
 }
diff --git a/tempodb/encoding/compactor_block.go b/tempodb/encoding/compactor_block.go
index 9117044eb1e..611523fd282 100644
--- a/tempodb/encoding/compactor_block.go
+++ b/tempodb/encoding/compactor_block.go
@@ -11,6 +11,8 @@ import (
 )
 
 type CompactorBlock struct {
+	encoding versionedEncoding
+
 	compactedMeta *backend.BlockMeta
 	inMetas       []*backend.BlockMeta
 
@@ -21,7 +23,8 @@ type CompactorBlock struct {
 	appender        common.Appender
 }
 
-func NewCompactorBlock(id uuid.UUID, tenantID string, bloomFP float64, indexDownsample int, metas []*backend.BlockMeta, estimatedObjects int) (*CompactorBlock, error) {
+// NewCompactorBlock creates a CompactorBlock with the given id for tenantID from the input block metas.  It errors if metas is empty.
+func NewCompactorBlock(cfg *BlockConfig, id uuid.UUID, tenantID string, metas []*backend.BlockMeta, estimatedObjects int) (*CompactorBlock, error) {
 	if len(metas) == 0 {
 		return nil, fmt.Errorf("empty block meta list")
 	}
@@ -31,13 +34,18 @@ func NewCompactorBlock(id uuid.UUID, tenantID string, bloomFP float64, indexDown
 	}
 
 	c := &CompactorBlock{
-		compactedMeta: backend.NewBlockMeta(tenantID, id),
-		bloom:         common.NewWithEstimates(uint(estimatedObjects), bloomFP),
+		encoding:      latestEncoding(),
+		compactedMeta: backend.NewBlockMeta(tenantID, id, currentVersion, cfg.Encoding),
+		bloom:         common.NewWithEstimates(uint(estimatedObjects), cfg.BloomFP),
 		inMetas:       metas,
 	}
 
+	var err error
 	c.appendBuffer = &bytes.Buffer{}
-	c.appender = newBufferedAppender(c.appendBuffer, indexDownsample, estimatedObjects)
+	c.appender, err = c.encoding.newBufferedAppender(c.appendBuffer, cfg.Encoding, cfg.IndexDownsample, estimatedObjects)
+	if err != nil {
+		return nil, fmt.Errorf("failed to created appender: %w", err)
+	}
 
 	return c, nil
 }
@@ -66,32 +74,46 @@ func (c *CompactorBlock) Length() int {
 }
 
 // FlushBuffer flushes any existing objects to the backend
-func (c *CompactorBlock) FlushBuffer(ctx context.Context, tracker backend.AppendTracker, w backend.Writer) (backend.AppendTracker, error) {
+func (c *CompactorBlock) FlushBuffer(ctx context.Context, tracker backend.AppendTracker, w backend.Writer) (backend.AppendTracker, int, error) {
+	if c.appender.Length() == 0 {
+		return tracker, 0, nil
+	}
+
 	meta := c.BlockMeta()
-	tracker, err := appendBlockData(ctx, w, meta, tracker, c.appendBuffer.Bytes())
+	tracker, err := c.encoding.appendBlockData(ctx, w, meta, tracker, c.appendBuffer.Bytes())
 	if err != nil {
-		return nil, err
+		return nil, 0, err
 	}
 
+	bytesFlushed := c.appendBuffer.Len()
 	c.appendBuffer.Reset()
 	c.bufferedObjects = 0
 
-	return tracker, nil
+	return tracker, bytesFlushed, nil
 }
 
 // Complete finishes writes the compactor metadata and closes all buffers and appenders
-func (c *CompactorBlock) Complete(ctx context.Context, tracker backend.AppendTracker, w backend.Writer) error {
-	c.appender.Complete()
+func (c *CompactorBlock) Complete(ctx context.Context, tracker backend.AppendTracker, w backend.Writer) (int, error) {
+	err := c.appender.Complete()
+	if err != nil {
+		return 0, err
+	}
+
+	// one final flush
+	tracker, bytesFlushed, err := c.FlushBuffer(ctx, tracker, w)
+	if err != nil {
+		return 0, err
+	}
 
 	records := c.appender.Records()
 	meta := c.BlockMeta()
 
-	err := writeBlockMeta(ctx, w, meta, records, c.bloom)
+	err = c.encoding.writeBlockMeta(ctx, w, meta, records, c.bloom)
 	if err != nil {
-		return err
+		return 0, err
 	}
 
-	return w.CloseAppend(ctx, tracker)
+	return bytesFlushed, w.CloseAppend(ctx, tracker)
 }
 
 func (c *CompactorBlock) BlockMeta() *backend.BlockMeta {
diff --git a/tempodb/encoding/compactor_block_test.go b/tempodb/encoding/compactor_block_test.go
index f9ad37a0e58..7e61574cb3a 100644
--- a/tempodb/encoding/compactor_block_test.go
+++ b/tempodb/encoding/compactor_block_test.go
@@ -19,13 +19,12 @@ const (
 )
 
 func TestCompactorBlockError(t *testing.T) {
-	_, err := NewCompactorBlock(uuid.New(), "", 0, 0, nil, 0)
+	_, err := NewCompactorBlock(nil, uuid.New(), "", nil, 0)
 	assert.Error(t, err)
 }
 
 func TestCompactorBlockAddObject(t *testing.T) {
 	indexDownsample := 3
-	bloomFP := .01
 
 	metas := []*backend.BlockMeta{
 		{
@@ -39,7 +38,11 @@ func TestCompactorBlockAddObject(t *testing.T) {
 	}
 
 	numObjects := (rand.Int() % 20) + 1
-	cb, err := NewCompactorBlock(uuid.New(), testTenantID, bloomFP, indexDownsample, metas, numObjects)
+	cb, err := NewCompactorBlock(&BlockConfig{
+		BloomFP:         .01,
+		IndexDownsample: indexDownsample,
+		Encoding:        backend.EncGZIP,
+	}, uuid.New(), testTenantID, metas, numObjects)
 	assert.NoError(t, err)
 
 	var minID common.ID
@@ -67,8 +70,8 @@ func TestCompactorBlockAddObject(t *testing.T) {
 			maxID = id
 		}
 	}
-	cb.appender.Complete()
-
+	err = cb.appender.Complete()
+	assert.NoError(t, err)
 	assert.Equal(t, numObjects, cb.Length())
 
 	// test meta
diff --git a/tempodb/encoding/complete_block.go b/tempodb/encoding/complete_block.go
index 0949f74cba6..9fb8df6c91f 100644
--- a/tempodb/encoding/complete_block.go
+++ b/tempodb/encoding/complete_block.go
@@ -18,6 +18,8 @@ import (
 // A CompleteBlock also knows the filepath of the append wal file it was cut from.  It is responsible for
 // cleaning this block up once it has been flushed to the backend.
 type CompleteBlock struct {
+	encoding versionedEncoding
+
 	meta    *backend.BlockMeta
 	bloom   *common.ShardedBloomFilter
 	records []*common.Record
@@ -31,10 +33,11 @@ type CompleteBlock struct {
 }
 
 // NewCompleteBlock creates a new block and takes _ALL_ the parameters necessary to build the ordered, deduped file on disk
-func NewCompleteBlock(originatingMeta *backend.BlockMeta, iterator common.Iterator, bloomFP float64, estimatedObjects int, indexDownsample int, filepath string, walFilename string) (*CompleteBlock, error) {
+func NewCompleteBlock(cfg *BlockConfig, originatingMeta *backend.BlockMeta, iterator common.Iterator, estimatedObjects int, filepath string, walFilename string) (*CompleteBlock, error) {
 	c := &CompleteBlock{
-		meta:        backend.NewBlockMeta(originatingMeta.TenantID, uuid.New()),
-		bloom:       common.NewWithEstimates(uint(estimatedObjects), bloomFP),
+		encoding:    latestEncoding(),
+		meta:        backend.NewBlockMeta(originatingMeta.TenantID, uuid.New(), currentVersion, cfg.Encoding),
+		bloom:       common.NewWithEstimates(uint(estimatedObjects), cfg.BloomFP),
 		records:     make([]*common.Record, 0),
 		filepath:    filepath,
 		walFilename: walFilename,
@@ -50,7 +53,10 @@ func NewCompleteBlock(originatingMeta *backend.BlockMeta, iterator common.Iterat
 		return nil, err
 	}
 
-	appender := newBufferedAppender(appendFile, indexDownsample, estimatedObjects)
+	appender, err := c.encoding.newBufferedAppender(appendFile, cfg.Encoding, cfg.IndexDownsample, estimatedObjects)
+	if err != nil {
+		return nil, err
+	}
 	for {
 		bytesID, bytesObject, err := iterator.Next()
 		if bytesID == nil {
@@ -73,7 +79,10 @@ func NewCompleteBlock(originatingMeta *backend.BlockMeta, iterator common.Iterat
 			return nil, err
 		}
 	}
-	appender.Complete()
+	err = appender.Complete()
+	if err != nil {
+		return nil, err
+	}
 	appendFile.Close()
 	c.records = appender.Records()
 	c.meta.StartTime = originatingMeta.StartTime
@@ -101,12 +110,12 @@ func (c *CompleteBlock) Write(ctx context.Context, w backend.Writer) error {
 		return err
 	}
 
-	err = writeBlockData(ctx, w, c.meta, src, fileStat.Size())
+	err = c.encoding.writeBlockData(ctx, w, c.meta, src, fileStat.Size())
 	if err != nil {
 		return err
 	}
 
-	err = writeBlockMeta(ctx, w, c.meta, c.records, c.bloom)
+	err = c.encoding.writeBlockMeta(ctx, w, c.meta, c.records, c.bloom)
 	if err != nil {
 		return err
 	}
@@ -133,8 +142,12 @@ func (c *CompleteBlock) Find(id common.ID, combiner common.ObjectCombiner) ([]by
 		return nil, err
 	}
 
-	finder := newDedupingFinder(c.records, file, combiner)
+	pageReader, err := c.encoding.newPageReader(file, c.meta.Encoding)
+	if err != nil {
+		return nil, err
+	}
 
+	finder := c.encoding.newPagedFinder(common.Records(c.records), pageReader, combiner)
 	return finder.Find(id)
 }
 
diff --git a/tempodb/encoding/complete_block_test.go b/tempodb/encoding/complete_block_test.go
index 7bd33cc1740..0a54e0cd56e 100644
--- a/tempodb/encoding/complete_block_test.go
+++ b/tempodb/encoding/complete_block_test.go
@@ -4,6 +4,8 @@ import (
 	"bufio"
 	"bytes"
 	"context"
+	"fmt"
+	"io"
 	"io/ioutil"
 	"math/rand"
 	"os"
@@ -19,6 +21,7 @@ import (
 	"github.com/grafana/tempo/tempodb/backend/local"
 	"github.com/grafana/tempo/tempodb/encoding/common"
 	v0 "github.com/grafana/tempo/tempodb/encoding/v0"
+	v1 "github.com/grafana/tempo/tempodb/encoding/v1"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 )
@@ -39,7 +42,11 @@ func TestCompleteBlock(t *testing.T) {
 	defer os.RemoveAll(tempDir)
 	require.NoError(t, err, "unexpected error creating temp dir")
 
-	block, ids, reqs := completeBlock(t, tempDir)
+	block, ids, reqs := completeBlock(t, &BlockConfig{
+		IndexDownsample: 13,
+		BloomFP:         .01,
+		Encoding:        backend.EncGZIP,
+	}, tempDir)
 
 	// test Find
 	for i, id := range ids {
@@ -61,12 +68,24 @@ func TestCompleteBlock(t *testing.T) {
 	}
 }
 
-func TestCompleteBlockToBackendBlock(t *testing.T) {
+// TestCompleteBlockAll round trips a block per supported encoding, as subtests so a failure names its encoding.
+func TestCompleteBlockAll(t *testing.T) {
+	for _, enc := range backend.SupportedEncoding {
+		cfg := &BlockConfig{
+			IndexDownsample: 13,
+			BloomFP:         .01,
+			Encoding:        enc,
+		}
+		t.Run(fmt.Sprintf("%v", enc), func(t *testing.T) { testCompleteBlockToBackendBlock(t, cfg) })
+	}
+}
+
+func testCompleteBlockToBackendBlock(t *testing.T, cfg *BlockConfig) {
 	tempDir, err := ioutil.TempDir("/tmp", "")
 	defer os.RemoveAll(tempDir)
 	require.NoError(t, err, "unexpected error creating temp dir")
 
-	block, ids, reqs := completeBlock(t, tempDir)
+	block, ids, reqs := completeBlock(t, cfg, tempDir)
 
 	backendTmpDir, err := ioutil.TempDir("/tmp", "")
 	defer os.RemoveAll(backendTmpDir)
@@ -88,13 +107,12 @@ func TestCompleteBlockToBackendBlock(t *testing.T) {
 	meta, err := r.BlockMeta(context.Background(), uuids[0], testTenantID)
 	require.NoError(t, err, "error getting meta")
 
-	backendBlock, err := NewBackendBlock(meta)
+	backendBlock, err := NewBackendBlock(meta, r)
 	require.NoError(t, err, "error creating block")
 
-	m := common.NewFindMetrics()
 	// test Find
 	for i, id := range ids {
-		foundBytes, err := backendBlock.Find(context.Background(), r, id, &m)
+		foundBytes, err := backendBlock.Find(context.Background(), id)
 		assert.NoError(t, err)
 
 		assert.Equal(t, reqs[i], foundBytes)
@@ -107,7 +125,7 @@ func TestCompleteBlockToBackendBlock(t *testing.T) {
 	}
 	sort.Slice(ids, func(i int, j int) bool { return bytes.Compare(ids[i], ids[j]) == -1 })
 
-	iterator, err := backendBlock.Iterator(10, r)
+	iterator, err := backendBlock.Iterator(10 * 1024 * 1024)
 	require.NoError(t, err, "error getting iterator")
 	i := 0
 	for {
@@ -124,10 +142,9 @@ func TestCompleteBlockToBackendBlock(t *testing.T) {
 	assert.Equal(t, len(ids), i)
 }
 
-func completeBlock(t *testing.T, tempDir string) (*CompleteBlock, [][]byte, [][]byte) {
+func completeBlock(t *testing.T, cfg *BlockConfig, tempDir string) (*CompleteBlock, [][]byte, [][]byte) {
 	rand.Seed(time.Now().Unix())
 
-	indexDownsample := 13
 	buffer := &bytes.Buffer{}
 	writer := bufio.NewWriter(buffer)
 	appender := v0.NewAppender(writer)
@@ -155,20 +172,21 @@ func completeBlock(t *testing.T, tempDir string) (*CompleteBlock, [][]byte, [][]
 			minID = id
 		}
 	}
-	appender.Complete()
-	err := writer.Flush()
+	err := appender.Complete()
+	require.NoError(t, err)
+	err = writer.Flush()
 	require.NoError(t, err, "unexpected error flushing writer")
 
-	originatingMeta := backend.NewBlockMeta(testTenantID, uuid.New())
+	originatingMeta := backend.NewBlockMeta(testTenantID, uuid.New(), "should_be_ignored", backend.EncGZIP)
 	originatingMeta.StartTime = time.Now().Add(-5 * time.Minute)
 	originatingMeta.EndTime = time.Now().Add(5 * time.Minute)
 
 	iterator := v0.NewRecordIterator(appender.Records(), bytes.NewReader(buffer.Bytes()))
-	block, err := NewCompleteBlock(originatingMeta, iterator, .01, numMsgs, indexDownsample, tempDir, "")
+	block, err := NewCompleteBlock(cfg, originatingMeta, iterator, numMsgs, tempDir, "")
 	require.NoError(t, err, "unexpected error completing block")
 
 	// test downsample config
-	require.Equal(t, numMsgs/indexDownsample+1, len(block.records))
+	require.Equal(t, numMsgs/cfg.IndexDownsample+1, len(block.records))
 	require.True(t, block.FlushedTime().IsZero())
 
 	require.True(t, bytes.Equal(block.meta.MinID, minID))
@@ -179,3 +197,100 @@ func completeBlock(t *testing.T, tempDir string) (*CompleteBlock, [][]byte, [][]
 
 	return block, ids, reqs
 }
+
+const benchDownsample = 200
+
+func BenchmarkWriteGzip(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncGZIP, benchDownsample, false)
+}
+func BenchmarkWriteSnappy(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncSnappy, benchDownsample, false)
+}
+func BenchmarkWriteLZ4256(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncLZ4_256k, benchDownsample, false)
+}
+func BenchmarkWriteLZ41M(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncLZ4_1M, benchDownsample, false)
+}
+func BenchmarkWriteNone(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncNone, benchDownsample, false)
+}
+
+func BenchmarkWriteZstd(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncZstd, benchDownsample, false)
+}
+
+func BenchmarkReadGzip(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncGZIP, benchDownsample, true)
+}
+func BenchmarkReadSnappy(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncSnappy, benchDownsample, true)
+}
+func BenchmarkReadLZ4256(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncLZ4_256k, benchDownsample, true)
+}
+func BenchmarkReadLZ41M(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncLZ4_1M, benchDownsample, true)
+}
+func BenchmarkReadNone(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncNone, benchDownsample, true)
+}
+
+func BenchmarkReadZstd(b *testing.B) {
+	benchmarkCompressBlock(b, backend.EncZstd, benchDownsample, true)
+}
+
+// benchmarkCompressBlock re-encodes the block at ./benchmark_block/fake/<guid> (download one from your backend first).
+//nolint:unparam
+func benchmarkCompressBlock(b *testing.B, encoding backend.Encoding, indexDownsample int, benchRead bool) {
+	tempDir, err := ioutil.TempDir("/tmp", "")
+	require.NoError(b, err, "unexpected error creating temp dir")
+	defer os.RemoveAll(tempDir)
+
+	r, _, _, err := local.New(&local.Config{
+		Path: "./benchmark_block",
+	})
+	require.NoError(b, err, "error creating backend")
+
+	backendBlock, err := NewBackendBlock(backend.NewBlockMeta("fake", uuid.MustParse("9f15417a-1242-40e4-9de3-a057d3b176c1"), "v0", backend.EncNone), r)
+	require.NoError(b, err, "error creating backend block")
+
+	iterator, err := backendBlock.Iterator(10 * 1024 * 1024)
+	require.NoError(b, err, "error creating iterator")
+
+	if !benchRead {
+		b.ResetTimer()
+	}
+
+	originatingMeta := backend.NewBlockMeta(testTenantID, uuid.New(), "should_be_ignored", backend.EncGZIP)
+	cb, err := NewCompleteBlock(&BlockConfig{
+		IndexDownsample: indexDownsample,
+		BloomFP:         .05,
+		Encoding:        encoding,
+	}, originatingMeta, iterator, 10000, tempDir, "")
+	require.NoError(b, err, "error creating block")
+	require.NotEmpty(b, cb.records, "block has no records")
+	lastRecord := cb.records[len(cb.records)-1]
+	fmt.Println("size: ", lastRecord.Start+uint64(lastRecord.Length))
+
+	if !benchRead {
+		return
+	}
+	// from here on only reading the freshly written block back is timed
+	b.ResetTimer()
+	file, err := os.Open(cb.fullFilename())
+	require.NoError(b, err)
+	defer file.Close()
+	pr, err := v1.NewPageReader(file, encoding)
+	require.NoError(b, err)
+	iterator = v1.NewPagedIterator(10*1024*1024, common.Records(cb.records), pr)
+	for {
+		id, _, err := iterator.Next()
+		if err != io.EOF {
+			require.NoError(b, err)
+		}
+		if id == nil {
+			break
+		}
+	}
+}
diff --git a/tempodb/encoding/config.go b/tempodb/encoding/config.go
new file mode 100644
index 00000000000..53606edee0f
--- /dev/null
+++ b/tempodb/encoding/config.go
@@ -0,0 +1,27 @@
+package encoding
+
+import (
+	"fmt"
+
+	"github.com/grafana/tempo/tempodb/backend"
+)
+
+// BlockConfig holds configuration options for newly created blocks
+type BlockConfig struct {
+	IndexDownsample int              `yaml:"index_downsample"`            // index downsample handed to the buffered appender; zero is rejected by ValidateConfig
+	BloomFP         float64          `yaml:"bloom_filter_false_positive"` // bloom filter false positive rate; must be greater than zero
+	Encoding        backend.Encoding `yaml:"encoding"`                    // encoding recorded in the block meta and used when writing block data
+}
+
+// ValidateConfig returns nil if the config is valid and an error describing the first invalid field otherwise
+func ValidateConfig(b *BlockConfig) error {
+	// a negative downsample is no more meaningful than zero, so reject both
+	if b.IndexDownsample <= 0 {
+		return fmt.Errorf("positive index downsample required, got %d", b.IndexDownsample)
+	}
+	// the bloom filter false positive rate has to be greater than zero
+	if b.BloomFP <= 0.0 {
+		return fmt.Errorf("invalid bloom filter fp rate %v", b.BloomFP)
+	}
+	return nil
+}
diff --git a/tempodb/encoding/v0/appender.go b/tempodb/encoding/v0/appender.go
index f9587f73421..91362f50a43 100644
--- a/tempodb/encoding/v0/appender.go
+++ b/tempodb/encoding/v0/appender.go
@@ -25,7 +25,7 @@ func NewAppender(writer io.Writer) common.Appender {
 // Append appends the id/object to the writer.  Note that the caller is giving up ownership of the two byte arrays backing the slices.
 //   Copies should be made and passed in if this is a problem
 func (a *appender) Append(id common.ID, b []byte) error {
-	length, err := marshalObjectToWriter(id, b, a.writer)
+	length, err := MarshalObjectToWriter(id, b, a.writer)
 	if err != nil {
 		return err
 	}
@@ -57,6 +57,6 @@ func (a *appender) DataLength() uint64 {
 	return a.currentOffset
 }
 
-func (a *appender) Complete() {
-
+func (a *appender) Complete() error {
+	return nil
 }
diff --git a/tempodb/encoding/v0/appender_buffered.go b/tempodb/encoding/v0/appender_buffered.go
index f4227f9915e..f22f53e6385 100644
--- a/tempodb/encoding/v0/appender_buffered.go
+++ b/tempodb/encoding/v0/appender_buffered.go
@@ -29,7 +29,7 @@ func NewBufferedAppender(writer io.Writer, indexDownsample int, totalObjectsEsti
 // Append appends the id/object to the writer.  Note that the caller is giving up ownership of the two byte arrays backing the slices.
 //   Copies should be made and passed in if this is a problem
 func (a *bufferedAppender) Append(id common.ID, b []byte) error {
-	length, err := marshalObjectToWriter(id, b, a.writer)
+	length, err := MarshalObjectToWriter(id, b, a.writer)
 	if err != nil {
 		return err
 	}
@@ -65,10 +65,12 @@ func (a *bufferedAppender) DataLength() uint64 {
 	return a.currentOffset
 }
 
-func (a *bufferedAppender) Complete() {
+func (a *bufferedAppender) Complete() error {
 	if a.currentRecord == nil {
-		return
+		return nil
 	}
 	a.records = append(a.records, a.currentRecord)
 	a.currentRecord = nil
+
+	return nil
 }
diff --git a/tempodb/encoding/v0/backend_block.go b/tempodb/encoding/v0/backend_block.go
deleted file mode 100644
index 6b6f6248c4b..00000000000
--- a/tempodb/encoding/v0/backend_block.go
+++ /dev/null
@@ -1,105 +0,0 @@
-package v0
-
-import (
-	"bytes"
-	"context"
-	"fmt"
-
-	"github.com/opentracing/opentracing-go"
-	willf_bloom "github.com/willf/bloom"
-
-	"github.com/grafana/tempo/tempodb/backend"
-	"github.com/grafana/tempo/tempodb/encoding/common"
-)
-
-type BackendBlock struct {
-	meta *backend.BlockMeta
-}
-
-// NewBackendBlock returns a block used for finding traces in the backend
-func NewBackendBlock(meta *backend.BlockMeta) *BackendBlock {
-	return &BackendBlock{
-		meta: meta,
-	}
-}
-
-// Find searches a block for the ID and returns an object if found.
-func (b *BackendBlock) Find(ctx context.Context, r backend.Reader, id common.ID, metrics *common.FindMetrics) ([]byte, error) {
-	var err error
-	span, ctx := opentracing.StartSpanFromContext(ctx, "BackendBlock.Find")
-	defer func() {
-		if err != nil {
-			span.SetTag("error", true)
-		}
-		span.Finish()
-	}()
-
-	span.SetTag("block", b.meta.BlockID.String())
-
-	shardKey := common.ShardKeyForTraceID(id)
-	blockID := b.meta.BlockID
-	tenantID := b.meta.TenantID
-
-	bloomBytes, err := r.Read(ctx, bloomName(shardKey), blockID, tenantID)
-	if err != nil {
-		return nil, fmt.Errorf("error retrieving bloom %w", err)
-	}
-
-	filter := &willf_bloom.BloomFilter{}
-	_, err = filter.ReadFrom(bytes.NewReader(bloomBytes))
-	if err != nil {
-		return nil, fmt.Errorf("error parsing bloom %w", err)
-	}
-
-	metrics.BloomFilterReads.Inc()
-	metrics.BloomFilterBytesRead.Add(int32(len(bloomBytes)))
-	if !filter.Test(id) {
-		return nil, nil
-	}
-
-	indexBytes, err := r.Read(ctx, nameIndex, blockID, tenantID)
-	metrics.IndexReads.Inc()
-	metrics.IndexBytesRead.Add(int32(len(indexBytes)))
-	if err != nil {
-		return nil, fmt.Errorf("error reading index %w", err)
-	}
-
-	record, err := findRecord(id, indexBytes) // todo: replace with backend.Finder
-	if err != nil {
-		return nil, fmt.Errorf("error finding record %w", err)
-	}
-
-	if record == nil {
-		return nil, nil
-	}
-
-	objectBytes := make([]byte, record.Length)
-	err = r.ReadRange(ctx, nameObjects, blockID, tenantID, record.Start, objectBytes)
-	metrics.BlockReads.Inc()
-	metrics.BlockBytesRead.Add(int32(len(objectBytes)))
-	if err != nil {
-		return nil, fmt.Errorf("error reading object %w", err)
-	}
-
-	iter := NewIterator(bytes.NewReader(objectBytes))
-	var foundObject []byte
-	for {
-		iterID, iterObject, err := iter.Next()
-		if iterID == nil {
-			break
-		}
-		if err != nil {
-			return nil, err
-		}
-		if bytes.Equal(iterID, id) {
-			foundObject = iterObject
-			break
-		}
-	}
-	return foundObject, nil
-}
-
-// Iterator searches a block for the ID and returns an object if found.
-func (b *BackendBlock) Iterator(chunkSizeBytes uint32, r backend.Reader) (common.Iterator, error) {
-	return NewBackendIterator(b.meta.TenantID, b.meta.BlockID, chunkSizeBytes, r)
-}
diff --git a/tempodb/encoding/v0/block.go b/tempodb/encoding/v0/block.go
index 3fb05e164f8..77ddc1469f2 100644
--- a/tempodb/encoding/v0/block.go
+++ b/tempodb/encoding/v0/block.go
@@ -11,12 +11,15 @@ import (
 )
 
 const (
-	nameObjects     = "data"
-	nameIndex       = "index"
+	// NameObjects names the backend data object
+	NameObjects = "data"
+	// NameIndex names the backend index object
+	NameIndex       = "index"
 	nameBloomPrefix = "bloom-"
 )
 
-func bloomName(shard int) string {
+// BloomName returns the backend bloom name for the given shard
+func BloomName(shard int) string {
 	return nameBloomPrefix + strconv.Itoa(shard)
 }
 
@@ -33,14 +36,14 @@ func WriteBlockMeta(ctx context.Context, w backend.Writer, meta *backend.BlockMe
 	}
 
 	// index
-	err = w.Write(ctx, nameIndex, meta.BlockID, meta.TenantID, index)
+	err = w.Write(ctx, NameIndex, meta.BlockID, meta.TenantID, index)
 	if err != nil {
 		return fmt.Errorf("unexpected error writing index %w", err)
 	}
 
 	// bloom
 	for i, bloom := range blooms {
-		err := w.Write(ctx, bloomName(i), meta.BlockID, meta.TenantID, bloom)
+		err := w.Write(ctx, BloomName(i), meta.BlockID, meta.TenantID, bloom)
 		if err != nil {
 			return fmt.Errorf("unexpected error writing bloom-%d %w", i, err)
 		}
@@ -57,10 +60,10 @@ func WriteBlockMeta(ctx context.Context, w backend.Writer, meta *backend.BlockMe
 
 // WriteBlockData writes the data object from an io.Reader to the backend.Writer
 func WriteBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, r io.Reader, size int64) error {
-	return w.WriteReader(ctx, nameObjects, meta.BlockID, meta.TenantID, r, size)
+	return w.WriteReader(ctx, NameObjects, meta.BlockID, meta.TenantID, r, size)
 }
 
 // AppendBlockData appends the bytes passed to the block data
 func AppendBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, tracker backend.AppendTracker, buffer []byte) (backend.AppendTracker, error) {
-	return w.Append(ctx, nameObjects, meta.BlockID, meta.TenantID, tracker, buffer)
+	return w.Append(ctx, NameObjects, meta.BlockID, meta.TenantID, tracker, buffer)
 }
diff --git a/tempodb/encoding/v0/finder_deduping.go b/tempodb/encoding/v0/finder_deduping.go
deleted file mode 100644
index ca63572fa76..00000000000
--- a/tempodb/encoding/v0/finder_deduping.go
+++ /dev/null
@@ -1,90 +0,0 @@
-package v0
-
-import (
-	"bytes"
-	"io"
-	"sort"
-
-	"github.com/grafana/tempo/tempodb/encoding/common"
-)
-
-type dedupingFinder struct {
-	ra            io.ReaderAt
-	sortedRecords []*common.Record
-	combiner      common.ObjectCombiner
-}
-
-// NewDedupingFinder returns a dedupingFinder. This finder is used for searching
-//  a set of records and returning an object. If a set of consecutive records has
-//  matching ids they will be combined using the ObjectCombiner.
-func NewDedupingFinder(sortedRecords []*common.Record, ra io.ReaderAt, combiner common.ObjectCombiner) common.Finder {
-	return &dedupingFinder{
-		ra:            ra,
-		sortedRecords: sortedRecords,
-		combiner:      combiner,
-	}
-}
-
-func (f *dedupingFinder) Find(id common.ID) ([]byte, error) {
-	i := sort.Search(len(f.sortedRecords), func(idx int) bool {
-		return bytes.Compare(f.sortedRecords[idx].ID, id) >= 0
-	})
-
-	if i < 0 || i >= len(f.sortedRecords) {
-		return nil, nil
-	}
-
-	var bytesFound []byte
-
-	for {
-		record := f.sortedRecords[i]
-
-		bytesOne, err := f.findOne(id, record)
-		if err != nil {
-			return nil, err
-		}
-
-		bytesFound = f.combiner.Combine(bytesFound, bytesOne)
-
-		// we need to check the next record to see if it also matches our id
-		i++
-		if i >= len(f.sortedRecords) {
-			break
-		}
-
-		if !bytes.Equal(f.sortedRecords[i].ID, id) {
-			break
-		}
-	}
-
-	return bytesFound, nil
-}
-
-func (f *dedupingFinder) findOne(id common.ID, record *common.Record) ([]byte, error) {
-	buff := make([]byte, record.Length)
-	_, err := f.ra.ReadAt(buff, int64(record.Start))
-	if err != nil {
-		return nil, err
-	}
-
-	iter := NewIterator(bytes.NewReader(buff))
-	iter, err = NewDedupingIterator(iter, f.combiner)
-	if err != nil {
-		return nil, err
-	}
-
-	for {
-		foundID, b, err := iter.Next()
-		if foundID == nil {
-			break
-		}
-		if err != nil {
-			return nil, err
-		}
-		if bytes.Equal(foundID, id) {
-			return b, nil
-		}
-	}
-
-	return nil, nil
-}
diff --git a/tempodb/encoding/v0/finder_paged.go b/tempodb/encoding/v0/finder_paged.go
new file mode 100644
index 00000000000..d276d48b9ce
--- /dev/null
+++ b/tempodb/encoding/v0/finder_paged.go
@@ -0,0 +1,93 @@
+package v0
+
+import (
+	"bytes"
+	"errors"
+
+	"github.com/grafana/tempo/tempodb/encoding/common"
+)
+
+type pagedFinder struct {
+	r        common.PageReader
+	index    common.IndexReader
+	combiner common.ObjectCombiner
+}
+
+// NewPagedFinder returns a pagedFinder. This finder is used for searching a set of records and
+//  returning an object. If a set of consecutive records has matching ids they will be combined
+//  using the ObjectCombiner. If combiner is nil the first match is returned without combining.
+func NewPagedFinder(index common.IndexReader, r common.PageReader, combiner common.ObjectCombiner) common.Finder {
+	return &pagedFinder{
+		r:        r,
+		index:    index,
+		combiner: combiner,
+	}
+}
+
+// Find returns the object stored under id, or nil when nothing is found. It starts at the record
+// the index returns for id. With a nil combiner the match from that record's page is returned
+// as is; otherwise the matches from that record and from every directly following record whose
+// ID equals id are merged with the combiner.
+func (f *pagedFinder) Find(id common.ID) ([]byte, error) {
+	record, pos := f.index.Find(id)
+	if record == nil {
+		return nil, nil
+	}
+
+	if f.combiner == nil {
+		found, err := f.findOne(id, record)
+		if err != nil {
+			return nil, err
+		}
+		return found, nil
+	}
+
+	var combined []byte
+	for {
+		found, err := f.findOne(id, record)
+		if err != nil {
+			return nil, err
+		}
+		combined = f.combiner.Combine(combined, found)
+
+		// we need to check the next record to see if it also matches our id
+		pos++
+		record = f.index.At(pos)
+		if record == nil || !bytes.Equal(record.ID, id) {
+			return combined, nil
+		}
+	}
+}
+
+// findOne reads the page described by record and scans it for id, returning nil, nil when the page does not hold it.
+func (f *pagedFinder) findOne(id common.ID, record *common.Record) ([]byte, error) {
+	pages, err := f.r.Read([]*common.Record{record})
+	switch {
+	case err != nil:
+		return nil, err
+	case len(pages) == 0:
+		return nil, errors.New("unexpected 0 length pages in findOne")
+	}
+
+	objects := NewIterator(bytes.NewReader(pages[0]))
+	if f.combiner != nil {
+		objects, err = NewDedupingIterator(objects, f.combiner)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	for {
+		candidateID, candidate, err := objects.Next()
+		// a nil id ends the scan; it is checked before err, so an error that comes with a nil id is not returned
+		if candidateID == nil {
+			return nil, nil
+		}
+		if err != nil {
+			return nil, err
+		}
+		if bytes.Equal(candidateID, id) {
+			return candidate, nil
+		}
+	}
+}
diff --git a/tempodb/encoding/v0/index_reader.go b/tempodb/encoding/v0/index_reader.go
new file mode 100644
index 00000000000..7b3d8cc28ee
--- /dev/null
+++ b/tempodb/encoding/v0/index_reader.go
@@ -0,0 +1,56 @@
+package v0
+
+import (
+	"bytes"
+	"fmt"
+	"sort"
+
+	"github.com/grafana/tempo/tempodb/encoding/common"
+)
+
+type readerBytes struct {
+	index []byte
+}
+
+// NewIndexReader returns an index reader for a byte slice of marshalled
+// ordered records. The slice is kept as is, not copied. An error is
+// returned when the length of the slice is not a whole multiple of
+// recordLength.
+func NewIndexReader(index []byte) (common.IndexReader, error) {
+	if len(index)%recordLength != 0 {
+		return nil, fmt.Errorf("records are an unexpected number of bytes %d", len(index))
+	}
+
+	reader := &readerBytes{index: index}
+	return reader, nil
+}
+
+// At returns the record at position i of the marshalled index, or nil when i is out of range.
+func (r *readerBytes) At(i int) *common.Record {
+	if count := len(r.index) / recordLength; i < 0 || i >= count {
+		return nil
+	}
+	offset := i * recordLength
+	return unmarshalRecord(r.index[offset : offset+recordLength])
+}
+
+// Find binary searches the marshalled records for the first record whose ID is
+// greater than or equal to id and returns it together with its position; the
+// returned ID may therefore be larger than id. It returns nil, -1 when no
+// record qualifies.
+func (r *readerBytes) Find(id common.ID) (*common.Record, int) {
+	total := recordCount(r.index)
+	recordAt := func(n int) *common.Record {
+		return unmarshalRecord(r.index[n*recordLength : (n+1)*recordLength])
+	}
+
+	// sort.Search answers total when the predicate holds for no record
+	pos := sort.Search(total, func(n int) bool {
+		return bytes.Compare(recordAt(n).ID, id) >= 0
+	})
+	if pos < 0 || pos >= total {
+		return nil, -1
+	}
+
+	return recordAt(pos), pos
+}
diff --git a/tempodb/encoding/v0/index_reader_test.go b/tempodb/encoding/v0/index_reader_test.go
new file mode 100644
index 00000000000..414f61a5e9a
--- /dev/null
+++ b/tempodb/encoding/v0/index_reader_test.go
@@ -0,0 +1,90 @@
+package v0
+
+import (
+	"testing"
+
+	"github.com/grafana/tempo/tempodb/encoding/common"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestIndexReader checks NewIndexReader, At and Find against an empty index, a truncated index and three marshalled records.
+func TestIndexReader(t *testing.T) {
+	record1 := &common.Record{
+		ID:     []byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+		Start:  0,
+		Length: 1,
+	}
+	record2 := &common.Record{
+		ID:     []byte{0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+		Start:  1,
+		Length: 2,
+	}
+	record3 := &common.Record{
+		ID:     []byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+		Start:  2,
+		Length: 3,
+	}
+
+	recordBytes, err := marshalRecords([]*common.Record{record1, record2, record3})
+	require.NoError(t, err)
+
+	tests := []struct {
+		recordBytes       []byte
+		expectedError     bool
+		at                int
+		expectedAt        *common.Record
+		find              common.ID
+		expectedFind      *common.Record
+		expectedFindIndex int
+	}{
+		{
+			recordBytes:       []byte{},
+			expectedFindIndex: -1,
+		},
+		{
+			recordBytes:   []byte{0x01},
+			expectedError: true,
+		},
+		{
+			recordBytes:       []byte{},
+			at:                12,
+			expectedFindIndex: -1,
+		},
+		{
+			recordBytes:  recordBytes,
+			at:           0,
+			expectedAt:   record1,
+			find:         []byte{0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+			expectedFind: record1,
+		},
+		{
+			recordBytes:       recordBytes,
+			at:                1,
+			expectedAt:        record2,
+			find:              []byte{0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+			expectedFind:      record2,
+			expectedFindIndex: 1,
+		},
+		{
+			recordBytes:       recordBytes,
+			at:                2,
+			expectedAt:        record3,
+			find:              []byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+			expectedFind:      record3,
+			expectedFindIndex: 2,
+		},
+	}
+	for _, tc := range tests {
+		reader, err := NewIndexReader(tc.recordBytes)
+		if tc.expectedError {
+			assert.Error(t, err)
+			continue
+		}
+		require.NoError(t, err)
+		assert.Equal(t, tc.expectedAt, reader.At(tc.at))
+		actualFind, actualIndex := reader.Find(tc.find)
+		assert.Equal(t, tc.expectedFind, actualFind)
+		assert.Equal(t, tc.expectedFindIndex, actualIndex)
+	}
+}
diff --git a/tempodb/encoding/v0/iterator_backend.go b/tempodb/encoding/v0/iterator_backend.go
deleted file mode 100644
index b871d4b6570..00000000000
--- a/tempodb/encoding/v0/iterator_backend.go
+++ /dev/null
@@ -1,98 +0,0 @@
-package v0
-
-import (
-	"context"
-	"io"
-	"math"
-
-	"github.com/google/uuid"
-	"github.com/grafana/tempo/tempodb/backend"
-	"github.com/grafana/tempo/tempodb/encoding/common"
-	"github.com/pkg/errors"
-)
-
-type backendIterator struct {
-	tenantID string
-	blockID  uuid.UUID
-	r        backend.Reader
-
-	indexBuffer         []byte
-	objectsBuffer       []byte
-	activeObjectsBuffer []byte
-}
-
-// NewBackendIterator returns a backendIterator.  This iterator is used to iterate
-//  through objects stored in object storage.
-func NewBackendIterator(tenantID string, blockID uuid.UUID, chunkSizeBytes uint32, reader backend.Reader) (common.Iterator, error) {
-	index, err := reader.Read(context.TODO(), nameIndex, blockID, tenantID)
-	if err != nil {
-		return nil, err
-	}
-
-	return &backendIterator{
-		tenantID:      tenantID,
-		blockID:       blockID,
-		r:             reader,
-		indexBuffer:   index,
-		objectsBuffer: make([]byte, chunkSizeBytes),
-	}, err
-}
-
-// For performance reasons the ID and object slices returned from this method are owned by
-// the iterator.  If you have need to keep these values for longer than a single iteration
-// you need to make a copy of them.
-func (i *backendIterator) Next() (common.ID, []byte, error) {
-	var err error
-	var id common.ID
-	var object []byte
-
-	i.activeObjectsBuffer, id, object, err = unmarshalAndAdvanceBuffer(i.activeObjectsBuffer)
-	if err != nil && err != io.EOF {
-		return nil, nil, errors.Wrap(err, "error iterating through object in backend")
-	} else if err != io.EOF {
-		return id, object, nil
-	}
-
-	// objects reader was empty, check the index
-	// if no index left, EOF
-	if len(i.indexBuffer) == 0 {
-		return nil, nil, io.EOF
-	}
-
-	// pull next n bytes into objects
-	var start uint64
-	var length uint32
-
-	start = math.MaxUint64
-	for len(i.indexBuffer) > 0 {
-		record := unmarshalRecord(i.indexBuffer[:recordLength])
-
-		// see if we can fit this record in.  we have to get at least one record in
-		if length+record.Length > uint32(len(i.objectsBuffer)) && start != math.MaxUint64 {
-			break
-		}
-		// advance index buffer
-		i.indexBuffer = i.indexBuffer[recordLength:]
-
-		if start == math.MaxUint64 {
-			start = record.Start
-		}
-		length += record.Length
-	}
-	if length > uint32(len(i.objectsBuffer)) {
-		i.objectsBuffer = make([]byte, length)
-	}
-	i.activeObjectsBuffer = i.objectsBuffer[:length]
-	err = i.r.ReadRange(context.TODO(), nameObjects, i.blockID, i.tenantID, start, i.activeObjectsBuffer)
-	if err != nil {
-		return nil, nil, errors.Wrap(err, "error iterating through object in backend")
-	}
-
-	// attempt to get next object from objects
-	i.activeObjectsBuffer, id, object, err = unmarshalAndAdvanceBuffer(i.activeObjectsBuffer)
-	if err != nil {
-		return nil, nil, errors.Wrap(err, "error iterating through object in backend")
-	}
-
-	return id, object, nil
-}
diff --git a/tempodb/encoding/v0/iterator_paged.go b/tempodb/encoding/v0/iterator_paged.go
new file mode 100644
index 00000000000..0d3c4ecfe4e
--- /dev/null
+++ b/tempodb/encoding/v0/iterator_paged.go
@@ -0,0 +1,95 @@
+package v0
+
+import (
+	"io"
+
+	"github.com/grafana/tempo/tempodb/encoding/common"
+	"github.com/pkg/errors"
+)
+
+type pagedIterator struct {
+	pageReader   common.PageReader  // fetches the pages for a batch of records
+	indexReader  common.IndexReader // source of records, addressed by position
+	currentIndex int                // position of the next record to request from indexReader
+
+	chunkSizeBytes uint32   // soft cap on the bytes requested per pageReader.Read
+	pages          [][]byte // pages already fetched but not yet started
+	activePage     []byte   // unread remainder of the page currently being iterated
+}
+
+// NewPagedIterator returns a pagedIterator.  It walks the records exposed by indexReader and
+//  fetches the matching pages through pageReader in batches of roughly chunkSizeBytes.
+func NewPagedIterator(chunkSizeBytes uint32, indexReader common.IndexReader, pageReader common.PageReader) common.Iterator {
+	return &pagedIterator{
+		pageReader:     pageReader,
+		indexReader:    indexReader,
+		chunkSizeBytes: chunkSizeBytes,
+	}
+}
+
+// Next returns the next id/object pair, fetching more pages when needed; io.EOF signals the end.
+// For performance reasons the returned slices are owned by the iterator.  If you have need to
+// keep these values for longer than a single iteration you need to make a copy of them.
+func (i *pagedIterator) Next() (common.ID, []byte, error) {
+	var err error
+	var id common.ID
+	var object []byte
+
+	// if the current page is empty advance to the next one
+	if len(i.activePage) == 0 && len(i.pages) > 0 {
+		i.activePage = i.pages[0]
+		i.pages = i.pages[1:] // advance pages
+	}
+
+	i.activePage, id, object, err = unmarshalAndAdvanceBuffer(i.activePage)
+	if err != nil && err != io.EOF {
+		return nil, nil, errors.Wrap(err, "error iterating through object in backend")
+	} else if err != io.EOF {
+		return id, object, nil
+	}
+
+	// the buffered pages are exhausted, check the index
+	// if no index left, EOF
+	currentRecord := i.indexReader.At(i.currentIndex)
+	if currentRecord == nil {
+		return nil, nil, io.EOF
+	}
+
+	// batch up consecutive records until the next one would exceed chunkSizeBytes
+	var length uint32
+	records := make([]*common.Record, 0, 5) // 5?  why not?
+	for currentRecord != nil {
+		// see if we can fit this record in.  we have to get at least one record in,
+		// even when that single record is larger than chunkSizeBytes
+		if length+currentRecord.Length > i.chunkSizeBytes && len(records) != 0 {
+			break
+		}
+
+		// add currentRecord to the batch
+		records = append(records, currentRecord)
+		length += currentRecord.Length
+
+		// get next
+		i.currentIndex++
+		currentRecord = i.indexReader.At(i.currentIndex)
+	}
+
+	i.pages, err = i.pageReader.Read(records)
+	if err != nil {
+		return nil, nil, errors.Wrap(err, "error iterating through object in backend")
+	}
+	if len(i.pages) == 0 {
+		return nil, nil, errors.New("unexpected 0 length pages in pagedIterator") // err is nil here; errors.Wrap(nil, ...) would return nil
+	}
+
+	i.activePage = i.pages[0]
+	i.pages = i.pages[1:] // advance pages
+
+	// attempt to get next object from objects
+	i.activePage, id, object, err = unmarshalAndAdvanceBuffer(i.activePage)
+	if err != nil {
+		return nil, nil, errors.Wrap(err, "error iterating through object in backend")
+	}
+
+	return id, object, nil
+}
diff --git a/tempodb/encoding/v0/object.go b/tempodb/encoding/v0/object.go
index 54dcb48c3e2..9832a9cb681 100644
--- a/tempodb/encoding/v0/object.go
+++ b/tempodb/encoding/v0/object.go
@@ -17,7 +17,7 @@ const (
 	| total length | id length | id | object bytes |
 */
 
-func marshalObjectToWriter(id common.ID, b []byte, w io.Writer) (int, error) {
+func MarshalObjectToWriter(id common.ID, b []byte, w io.Writer) (int, error) {
 	idLength := len(id)
 	totalLength := len(b) + idLength + uint32Size*2
 
diff --git a/tempodb/encoding/v0/object_test.go b/tempodb/encoding/v0/object_test.go
index c8f1caa86be..9d3b34d7efd 100644
--- a/tempodb/encoding/v0/object_test.go
+++ b/tempodb/encoding/v0/object_test.go
@@ -19,7 +19,7 @@ func TestMarshalUnmarshal(t *testing.T) {
 	bReq, err := proto.Marshal(req)
 	assert.NoError(t, err)
 
-	_, err = marshalObjectToWriter(id, bReq, buffer)
+	_, err = MarshalObjectToWriter(id, bReq, buffer)
 	assert.NoError(t, err)
 
 	outID, outObject, err := unmarshalObjectFromReader(buffer)
@@ -46,7 +46,7 @@ func TestMarshalUnmarshalFromBuffer(t *testing.T) {
 		bReq, err := proto.Marshal(req)
 		assert.NoError(t, err)
 
-		_, err = marshalObjectToWriter(id, bReq, buffer)
+		_, err = MarshalObjectToWriter(id, bReq, buffer)
 		assert.NoError(t, err)
 	}
 
diff --git a/tempodb/encoding/v0/page_reader.go b/tempodb/encoding/v0/page_reader.go
new file mode 100644
index 00000000000..1eb37e938c7
--- /dev/null
+++ b/tempodb/encoding/v0/page_reader.go
@@ -0,0 +1,64 @@
+package v0
+
+import (
+	"fmt"
+	"io"
+
+	"github.com/grafana/tempo/tempodb/encoding/common"
+)
+
+type pageReader struct {
+	r io.ReaderAt // backing store the page bytes are read from
+}
+
+// NewPageReader returns a new v0 pageReader.  A v0 pageReader
+// is basically a no-op.  It retrieves the requested byte
+// ranges from r and returns them as is.
+// A page's "format" is a contiguous collection of objects
+// | -- object -- | -- object -- | ...
+func NewPageReader(r io.ReaderAt) common.PageReader {
+	return &pageReader{
+		r: r,
+	}
+}
+
+// Read returns the pages requested in the passed records, one slice per record.
+// Multiple records must be ordered and contiguous; an error is returned otherwise
+// or when the read fails.  Passing no records returns (nil, nil).
+func (r *pageReader) Read(records []*common.Record) ([][]byte, error) {
+	if len(records) == 0 {
+		return nil, nil
+	}
+
+	start := records[0].Start
+	length := uint32(0)
+	for _, record := range records {
+		length += record.Length
+	}
+
+	contiguousPages := make([]byte, length)
+	n, err := r.r.ReadAt(contiguousPages, int64(start))
+	if err != nil && !(err == io.EOF && n == len(contiguousPages)) { // io.ReaderAt may report EOF alongside a complete read
+		return nil, err
+	}
+
+	slicePages := make([][]byte, 0, len(records))
+	cursor := uint32(0)
+	previousEnd := uint64(0)
+	for idx, record := range records {
+		end := cursor + record.Length
+		if end > uint32(len(contiguousPages)) {
+			return nil, fmt.Errorf("record out of bounds while reading pages: %d, %d, %d, %d", cursor, record.Length, end, len(contiguousPages))
+		}
+
+		if idx > 0 && previousEnd != record.Start { // every record after the first must begin where the previous one ended
+			return nil, fmt.Errorf("non-contiguous pages requested from pageReader: %d, %+v", previousEnd, record)
+		}
+
+		slicePages = append(slicePages, contiguousPages[cursor:end])
+		cursor += record.Length
+		previousEnd = record.Start + uint64(record.Length)
+	}
+
+	return slicePages, nil
+}
diff --git a/tempodb/encoding/v0/page_reader_test.go b/tempodb/encoding/v0/page_reader_test.go
new file mode 100644
index 00000000000..9f904834d2f
--- /dev/null
+++ b/tempodb/encoding/v0/page_reader_test.go
@@ -0,0 +1,135 @@
+package v0
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/grafana/tempo/tempodb/encoding/common"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestPageReader(t *testing.T) {
+	// table-driven: each case gives the backing bytes, the records to read and the expected pages or error
+	tests := []struct {
+		readerBytes   []byte
+		records       []*common.Record
+		expectedBytes [][]byte
+		expectedError bool
+	}{
+		{}, // no records: nil pages and no error are expected
+		{ // a record read from an empty reader fails
+			records: []*common.Record{
+				{
+					Start:  0,
+					Length: 1,
+				},
+			},
+			expectedError: true,
+		},
+		{ // single record at the start of the bytes
+			readerBytes: []byte{0x01, 0x02},
+			records: []*common.Record{
+				{
+					Start:  0,
+					Length: 1,
+				},
+			},
+			expectedBytes: [][]byte{
+				{0x01},
+			},
+		},
+		{ // single record at a non-zero offset
+			readerBytes: []byte{0x01, 0x02},
+			records: []*common.Record{
+				{
+					Start:  1,
+					Length: 1,
+				},
+			},
+			expectedBytes: [][]byte{
+				{0x02},
+			},
+		},
+		{ // two contiguous records come back as two pages
+			readerBytes: []byte{0x01, 0x02},
+			records: []*common.Record{
+				{
+					Start:  0,
+					Length: 1,
+				},
+				{
+					Start:  1,
+					Length: 1,
+				},
+			},
+			expectedBytes: [][]byte{
+				{0x01},
+				{0x02},
+			},
+		},
+		{ // record longer than the available bytes
+			readerBytes: []byte{0x01, 0x02},
+			records: []*common.Record{
+				{
+					Start:  0,
+					Length: 5,
+				},
+			},
+			expectedError: true,
+		},
+		{ // record starting past the end of the bytes
+			readerBytes: []byte{0x01, 0x02},
+			records: []*common.Record{
+				{
+					Start:  5,
+					Length: 5,
+				},
+			},
+			expectedError: true,
+		},
+		{ // contiguous records that begin at a non-zero offset
+			readerBytes: []byte{0x01, 0x02, 0x03},
+			records: []*common.Record{
+				{
+					Start:  1,
+					Length: 1,
+				},
+				{
+					Start:  2,
+					Length: 1,
+				},
+			},
+			expectedBytes: [][]byte{
+				{0x02},
+				{0x03},
+			},
+		},
+		{ // a gap between records is rejected
+			readerBytes: []byte{0x01, 0x02, 0x03},
+			records: []*common.Record{
+				{
+					Start:  0,
+					Length: 1,
+				},
+				{
+					Start:  2,
+					Length: 1,
+				},
+			},
+			expectedError: true,
+		},
+	}
+
+	for _, tc := range tests {
+		reader := NewPageReader(bytes.NewReader(tc.readerBytes))
+		actual, err := reader.Read(tc.records)
+
+		if tc.expectedError {
+			assert.Error(t, err)
+			continue
+		}
+
+		assert.NoError(t, err)
+		assert.Equal(t, tc.expectedBytes, actual)
+	}
+}
diff --git a/tempodb/encoding/v0/record.go b/tempodb/encoding/v0/record.go
index 621820b06d1..d7a5d85c152 100644
--- a/tempodb/encoding/v0/record.go
+++ b/tempodb/encoding/v0/record.go
@@ -73,33 +73,6 @@ func unmarshalRecords(recordBytes []byte) ([]*common.Record, error) {
 	return records, nil
 }
 
-// binary search the bytes.  records are not compressed and ordered
-func findRecord(id common.ID, recordBytes []byte) (*common.Record, error) {
-	mod := len(recordBytes) % recordLength
-	if mod != 0 {
-		return nil, fmt.Errorf("records are an unexpected number of bytes %d", mod)
-	}
-
-	numRecords := recordCount(recordBytes)
-	var record *common.Record
-
-	i := sort.Search(numRecords, func(i int) bool {
-		buff := recordBytes[i*recordLength : (i+1)*recordLength]
-		record = unmarshalRecord(buff)
-
-		return bytes.Compare(record.ID, id) >= 0
-	})
-
-	if i >= 0 && i < numRecords {
-		buff := recordBytes[i*recordLength : (i+1)*recordLength]
-		record = unmarshalRecord(buff)
-
-		return record, nil
-	}
-
-	return nil, nil
-}
-
 func recordCount(b []byte) int {
 	return len(b) / recordLength
 }
diff --git a/tempodb/encoding/v0/record_test.go b/tempodb/encoding/v0/record_test.go
index dbf4d5a6da8..be8264fea56 100644
--- a/tempodb/encoding/v0/record_test.go
+++ b/tempodb/encoding/v0/record_test.go
@@ -43,7 +43,7 @@ func TestMarshalUnmarshalRecords(t *testing.T) {
 	assert.Equal(t, expected, actual)
 }
 
-func TestFindRecord(t *testing.T) {
+/*func TestFindRecord(t *testing.T) { // jpe - move to index_reader_test.go
 	numRecords := 10
 	expected := make([]*common.Record, 0, numRecords)
 
@@ -66,7 +66,7 @@ func TestFindRecord(t *testing.T) {
 		assert.NoError(t, err, "unexpected error finding records")
 		assert.Equal(t, r, found)
 	}
-}
+}*/
 
 func TestSortRecord(t *testing.T) {
 	numRecords := 10
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-0 b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-0
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-0
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-0
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-1 b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-1
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-1
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-1
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-2 b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-2
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-2
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-2
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-3 b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-3
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-3
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-3
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-4 b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-4
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-4
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-4
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-5 b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-5
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-5
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-5
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-6 b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-6
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-6
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-6
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-7 b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-7
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-7
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-7
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-8 b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-8
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-8
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-8
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-9 b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-9
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-9
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/bloom-9
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/data b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/data
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/data
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/data
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/index b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/index
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/index
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/index
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/meta.json b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/meta.json
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/meta.json
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/meta.json
diff --git a/tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/readme.md b/tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/readme.md
similarity index 100%
rename from tempodb/encoding/v0/v0_test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/readme.md
rename to tempodb/encoding/v0test/fake/00f5a116-639e-4880-bbe7-be9b0c828033/readme.md
diff --git a/tempodb/encoding/v1/appender_buffered.go b/tempodb/encoding/v1/appender_buffered.go
new file mode 100644
index 00000000000..d5d74c15ee9
--- /dev/null
+++ b/tempodb/encoding/v1/appender_buffered.go
@@ -0,0 +1,137 @@
+package v1
+
+import (
+	"bytes"
+	"io"
+
+	"github.com/grafana/tempo/tempodb/backend"
+	"github.com/grafana/tempo/tempodb/encoding/common"
+	v0 "github.com/grafana/tempo/tempodb/encoding/v0"
+)
+
+// meteredWriter counts the bytes actually accepted by the wrapped writer.  The compression
+// writers report the number of bytes before compression, so this sits beneath them to learn
+// how many bytes really reached the block.  Write adds the wrapped writer's returned count.
+type meteredWriter struct {
+	wrappedWriter io.Writer
+	bytesWritten  int
+}
+
+func (m *meteredWriter) Write(p []byte) (n int, err error) {
+	n, err = m.wrappedWriter.Write(p)
+	m.bytesWritten += n // count what was accepted, which is less than len(p) on a short write
+	return n, err
+}
+
+// bufferedAppender buffers marshalled v0 objects in memory and writes them out as one
+//  compressed page every indexDownsample objects, keeping one index record per page.
+//  Used by CompleteBlock/CompactorBlock.
+type bufferedAppender struct {
+	v0Buffer     *bytes.Buffer
+	outputWriter *meteredWriter
+	pool         WriterPool
+	records      []*common.Record
+
+	totalObjects    int
+	currentOffset   uint64
+	currentRecord   *common.Record
+	indexDownsample int
+}
+
+// NewBufferedAppender returns a bufferedAppender.  This appender writes compressed pages to
+//  the provided writer and builds a downsampled records slice.  indexDownsample is used as a divisor and must not be 0.
+func NewBufferedAppender(writer io.Writer, encoding backend.Encoding, indexDownsample int, totalObjectsEstimate int) (common.Appender, error) {
+	pool, err := getWriterPool(encoding)
+	if err != nil {
+		return nil, err
+	}
+
+	return &bufferedAppender{
+		v0Buffer:        &bytes.Buffer{},
+		indexDownsample: indexDownsample,
+		records:         make([]*common.Record, 0, totalObjectsEstimate/indexDownsample+1),
+
+		outputWriter: &meteredWriter{
+			wrappedWriter: writer,
+		},
+		pool: pool,
+	}, nil
+}
+
+// Append marshals the id/object pair into the in-memory page buffer and, once another
+//   indexDownsample objects have accumulated, flushes them as a compressed page.
+//   Note that the caller is giving up ownership of the two byte arrays backing the slices.
+//   Copies should be made and passed in if this is a problem
+func (a *bufferedAppender) Append(id common.ID, b []byte) error {
+	if _, marshalErr := v0.MarshalObjectToWriter(id, b, a.v0Buffer); marshalErr != nil {
+		return marshalErr
+	}
+
+	// lazily open the record for the page currently being built
+	if a.currentRecord == nil {
+		a.currentRecord = &common.Record{
+			Start: a.currentOffset,
+		}
+	}
+	// the record carries the id of the last object appended to its page
+	a.currentRecord.ID = id
+	a.totalObjects++
+
+	// keep buffering until the page holds its full share of objects
+	if a.totalObjects%a.indexDownsample != 0 {
+		return nil
+	}
+	return a.flush()
+}
+
+func (a *bufferedAppender) Records() []*common.Record {
+	return a.records // one record is appended per flushed page
+}
+
+func (a *bufferedAppender) Length() int {
+	return a.totalObjects // number of objects passed to Append so far
+}
+
+func (a *bufferedAppender) DataLength() uint64 {
+	return a.currentOffset // bytes written by completed flushes; objects still buffered are not included
+}
+
+// Complete compresses and flushes any objects still buffered as a final page.
+func (a *bufferedAppender) Complete() error {
+	err := a.flush()
+	if err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// flush compresses the buffered objects into one page, writes it out and records it in the index.
+func (a *bufferedAppender) flush() error {
+	if a.currentRecord == nil {
+		return nil // nothing appended since the last flush
+	}
+
+	compressedWriter := a.pool.GetWriter(a.outputWriter)
+	if _, err := compressedWriter.Write(a.v0Buffer.Bytes()); err != nil {
+		return err
+	}
+	// Close flushes whatever the compressor still buffers, so a failure here means a truncated page
+	if err := compressedWriter.Close(); err != nil {
+		return err
+	}
+
+	// now clear our v0 buffer so we can start the new block page
+	a.v0Buffer.Reset()
+	a.pool.PutWriter(compressedWriter)
+
+	// outputWriter has metered the compressed size of the page just written
+	a.currentOffset += uint64(a.outputWriter.bytesWritten)
+	a.currentRecord.Length += uint32(a.outputWriter.bytesWritten)
+	a.outputWriter.bytesWritten = 0
+
+	// update index
+	a.records = append(a.records, a.currentRecord)
+	a.currentRecord = nil
+	return nil
+}
diff --git a/tempodb/encoding/v1/block.go b/tempodb/encoding/v1/block.go
new file mode 100644
index 00000000000..943449ee392
--- /dev/null
+++ b/tempodb/encoding/v1/block.go
@@ -0,0 +1,43 @@
+package v1
+
+import (
+	"context"
+	"io"
+
+	"github.com/grafana/tempo/tempodb/backend"
+	"github.com/grafana/tempo/tempodb/encoding/common"
+	v0 "github.com/grafana/tempo/tempodb/encoding/v0"
+)
+
+// These methods control the layout of the block in the backend.  Nothing has changed between v0 and v1 here
+// so we will just passthrough
+
+// NameObjects returns v0.NameObjects; the v1 block layout keeps the v0 name
+func NameObjects() string {
+	return v0.NameObjects
+}
+
+// NameIndex returns v0.NameIndex; the v1 block layout keeps the v0 name
+func NameIndex() string {
+	return v0.NameIndex
+}
+
+// BloomName returns the v0 bloom name for the given shard; v1 keeps the v0 naming
+func BloomName(shard int) string {
+	return v0.BloomName(shard)
+}
+
+// WriteBlockMeta writes the bloom filter, meta and index to the passed in backend.Writer by delegating to v0.WriteBlockMeta
+func WriteBlockMeta(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, records []*common.Record, b *common.ShardedBloomFilter) error {
+	return v0.WriteBlockMeta(ctx, w, meta, records, b)
+}
+
+// WriteBlockData writes the data object from an io.Reader to the backend.Writer by delegating to v0.WriteBlockData
+func WriteBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, r io.Reader, size int64) error {
+	return v0.WriteBlockData(ctx, w, meta, r, size)
+}
+
+// AppendBlockData appends the bytes passed to the block data by delegating to v0.AppendBlockData
+func AppendBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, tracker backend.AppendTracker, buffer []byte) (backend.AppendTracker, error) {
+	return v0.AppendBlockData(ctx, w, meta, tracker, buffer)
+}
diff --git a/tempodb/encoding/v1/finder_paged.go b/tempodb/encoding/v1/finder_paged.go
new file mode 100644
index 00000000000..da2043ebe4b
--- /dev/null
+++ b/tempodb/encoding/v1/finder_paged.go
@@ -0,0 +1,12 @@
+package v1
+
+import (
+	"github.com/grafana/tempo/tempodb/encoding/common"
+	v0 "github.com/grafana/tempo/tempodb/encoding/v0"
+)
+
+// NewPagedFinder returns the finder built by v0.NewPagedFinder.  There are no changes
+// to logic from the v0 finder and all compression changes are handled in the pageReader
+func NewPagedFinder(index common.IndexReader, r common.PageReader, combiner common.ObjectCombiner) common.Finder {
+	return v0.NewPagedFinder(index, r, combiner)
+}
diff --git a/tempodb/encoding/v1/index_reader.go b/tempodb/encoding/v1/index_reader.go
new file mode 100644
index 00000000000..d037a3c63ed
--- /dev/null
+++ b/tempodb/encoding/v1/index_reader.go
@@ -0,0 +1,13 @@
+package v1
+
+import (
+	"github.com/grafana/tempo/tempodb/encoding/common"
+	v0 "github.com/grafana/tempo/tempodb/encoding/v0"
+)
+
+// NewIndexReader returns an index reader for a byte slice of marshalled
+// ordered records by delegating to v0.NewIndexReader.
+// The index has not changed between v0 and v1.
+func NewIndexReader(index []byte) (common.IndexReader, error) {
+	return v0.NewIndexReader(index)
+}
diff --git a/tempodb/encoding/v1/iterator_paged.go b/tempodb/encoding/v1/iterator_paged.go
new file mode 100644
index 00000000000..67fa675e7e1
--- /dev/null
+++ b/tempodb/encoding/v1/iterator_paged.go
@@ -0,0 +1,12 @@
+package v1
+
+import (
+	"github.com/grafana/tempo/tempodb/encoding/common"
+	v0 "github.com/grafana/tempo/tempodb/encoding/v0"
+)
+
+// NewPagedIterator returns the iterator built by v0.NewPagedIterator.  There are no changes
+// to logic from the v0 iterator and all compression changes are handled in the pageReader
+func NewPagedIterator(chunkSizeBytes uint32, indexReader common.IndexReader, pageReader common.PageReader) common.Iterator {
+	return v0.NewPagedIterator(chunkSizeBytes, indexReader, pageReader)
+}
diff --git a/tempodb/encoding/v1/page_reader.go b/tempodb/encoding/v1/page_reader.go
new file mode 100644
index 00000000000..cadb8345c77
--- /dev/null
+++ b/tempodb/encoding/v1/page_reader.go
@@ -0,0 +1,56 @@
+package v1
+
+import (
+	"bytes"
+	"io"
+	"io/ioutil"
+
+	"github.com/grafana/tempo/tempodb/backend"
+	"github.com/grafana/tempo/tempodb/encoding/common"
+	v0 "github.com/grafana/tempo/tempodb/encoding/v0"
+)
+
+type pageReader struct {
+	v0PageReader common.PageReader // fetches the raw, still compressed pages
+
+	pool ReaderPool // supplies the decompressing readers for the configured encoding
+}
+
+// NewPageReader constructs a v1 PageReader that decompresses the pages read from r using the passed encoding
+func NewPageReader(r io.ReaderAt, encoding backend.Encoding) (common.PageReader, error) {
+	pool, err := getReaderPool(encoding)
+	if err != nil {
+		return nil, err // no reader pool exists for this encoding
+	}
+
+	return &pageReader{
+		v0PageReader: v0.NewPageReader(r),
+		pool:         pool,
+	}, nil
+}
+
+// Read fetches the pages described by the passed records through the v0
+// reader and returns each one decompressed.  Multiple records are assumed
+// to be ordered and contiguous
+func (r *pageReader) Read(records []*common.Record) ([][]byte, error) {
+	rawPages, err := r.v0PageReader.Read(records)
+	if err != nil {
+		return nil, err
+	}
+
+	// decompress page by page, handing every reader back to the pool once it is drained
+	pages := make([][]byte, len(rawPages))
+	for idx, raw := range rawPages {
+		decompressor := r.pool.GetReader(bytes.NewReader(raw))
+
+		decompressed, readErr := ioutil.ReadAll(decompressor)
+		r.pool.PutReader(decompressor)
+		if readErr != nil {
+			return nil, readErr
+		}
+
+		pages[idx] = decompressed
+	}
+
+	return pages, nil
+}
diff --git a/tempodb/encoding/v1/page_reader_test.go b/tempodb/encoding/v1/page_reader_test.go
new file mode 100644
index 00000000000..c29fe9b8616
--- /dev/null
+++ b/tempodb/encoding/v1/page_reader_test.go
@@ -0,0 +1,57 @@
+package v1
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/grafana/tempo/tempodb/backend"
+	"github.com/grafana/tempo/tempodb/encoding/common"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestPageReader round-trips a payload through every supported encoding: the bytes are
+// compressed with the matching writer pool and must come back unchanged from the page reader.
+func TestPageReader(t *testing.T) {
+	tests := []struct {
+		readerBytes []byte
+	}{
+		{
+			readerBytes: []byte{0x01, 0x02},
+		},
+	}
+
+	for _, tc := range tests {
+		for _, enc := range backend.SupportedEncoding {
+			t.Run(enc.String(), func(t *testing.T) {
+				writerPool, err := getWriterPool(enc)
+				require.NoError(t, err)
+
+				compressed := &bytes.Buffer{}
+				metered := &meteredWriter{
+					wrappedWriter: compressed,
+				}
+				compressor := writerPool.GetWriter(metered)
+
+				_, err = compressor.Write(tc.readerBytes)
+				require.NoError(t, err)
+				err = compressor.Close()
+				require.NoError(t, err)
+
+				compressedBytes := compressed.Bytes()
+				pages, err := NewPageReader(bytes.NewReader(compressedBytes), enc)
+				require.NoError(t, err)
+
+				actual, err := pages.Read([]*common.Record{
+					{
+						Start:  0,
+						Length: uint32(metered.bytesWritten),
+					},
+				})
+				assert.NoError(t, err)
+				assert.Len(t, actual, 1)
+				assert.Equal(t, tc.readerBytes, actual[0])
+			})
+		}
+	}
+}
diff --git a/tempodb/encoding/v1/pool.go b/tempodb/encoding/v1/pool.go
new file mode 100644
index 00000000000..8446dd542b4
--- /dev/null
+++ b/tempodb/encoding/v1/pool.go
@@ -0,0 +1,333 @@
+package v1
+
+import (
+	"fmt"
+	"io"
+	"sync"
+
+	"github.com/golang/snappy"
+	"github.com/grafana/tempo/tempodb/backend"
+	"github.com/klauspost/compress/gzip"
+	"github.com/klauspost/compress/zstd"
+	"github.com/pierrec/lz4/v4"
+	"github.com/prometheus/prometheus/pkg/pool"
+)
+
+const maxEncoding = backend.EncZstd // tests use maxEncoding + 1 as an encoding that has no pool
+
+// WriterPool is a pool of compressing io.WriteClosers for a single encoding.
+// This is used for every page written to avoid unnecessary allocations.
+type WriterPool interface {
+	GetWriter(io.Writer) io.WriteCloser
+	PutWriter(io.WriteCloser)
+	Encoding() backend.Encoding
+}
+
+// ReaderPool is similar to WriterPool but hands out decompressing readers for reading pages.
+type ReaderPool interface {
+	GetReader(io.Reader) io.Reader
+	PutReader(io.Reader)
+	Encoding() backend.Encoding
+}
+
+var (
+	// Gzip is the gnu zip compression pool
+	Gzip = GzipPool{level: gzip.DefaultCompression}
+	// Lz4_64k is the lz4 compression pool, with 64k buffer size
+	Lz4_64k = LZ4Pool{bufferSize: 1 << 16}
+	// Lz4_256k uses 256k buffer
+	Lz4_256k = LZ4Pool{bufferSize: 1 << 18}
+	// Lz4_1M uses 1M buffer
+	Lz4_1M = LZ4Pool{bufferSize: 1 << 20}
+	// Lz4_4M uses 4M buffer
+	Lz4_4M = LZ4Pool{bufferSize: 1 << 22}
+	// Snappy is the snappy compression pool
+	Snappy SnappyPool
+	// Noop is the no compression pool
+	Noop NoopPool
+	// Zstd is the zstd compression pool
+	Zstd = ZstdPool{}
+
+	// BytesBufferPool is a bucketed pool of byte slices.
+	// Buckets [0.5KB,1KB,2KB,4KB,8KB]
+	BytesBufferPool = pool.New(1<<9, 1<<13, 2, func(size int) interface{} { return make([]byte, 0, size) })
+)
+
+func getWriterPool(enc backend.Encoding) (WriterPool, error) {
+	r, err := getReaderPool(enc) // readers and writers for an encoding live in the same pool value
+	if err != nil {
+		return nil, err
+	}
+
+	return r.(WriterPool), nil // unchecked: panics if a pool returned above does not also implement WriterPool
+}
+
+func getReaderPool(enc backend.Encoding) (ReaderPool, error) {
+	switch enc { // each encoding maps to one shared package-level pool
+	case backend.EncNone:
+		return &Noop, nil
+	case backend.EncGZIP:
+		return &Gzip, nil
+	case backend.EncLZ4_64k:
+		return &Lz4_64k, nil
+	case backend.EncLZ4_256k:
+		return &Lz4_256k, nil
+	case backend.EncLZ4_1M:
+		return &Lz4_1M, nil
+	case backend.EncLZ4_4M:
+		return &Lz4_4M, nil
+	case backend.EncSnappy:
+		return &Snappy, nil
+	case backend.EncZstd:
+		return &Zstd, nil
+	default: // any encoding without a pool is rejected
+		return nil, fmt.Errorf("Unknown pool encoding %d", enc)
+	}
+}
+
+// GzipPool is a gnu zip compression pool
+type GzipPool struct {
+	readers sync.Pool
+	writers sync.Pool
+	level   int
+}
+
+// Encoding implements WriterPool and ReaderPool
+func (pool *GzipPool) Encoding() backend.Encoding {
+	return backend.EncGZIP
+}
+
+// GetReader returns a pooled or new gzip reader set to read from src.  It panics if the reader cannot be initialised from src.
+func (pool *GzipPool) GetReader(src io.Reader) io.Reader {
+	if r := pool.readers.Get(); r != nil {
+		reader := r.(*gzip.Reader)
+		err := reader.Reset(src)
+		if err != nil {
+			panic(err)
+		}
+		return reader
+	}
+	reader, err := gzip.NewReader(src)
+	if err != nil {
+		panic(err)
+	}
+	return reader
+}
+
+// PutReader places a reader obtained from GetReader back in the pool
+func (pool *GzipPool) PutReader(reader io.Reader) {
+	pool.readers.Put(reader)
+}
+
+// GetWriter returns a pooled or new gzip writer set to write to dst.  A zero level falls back to gzip.DefaultCompression.
+func (pool *GzipPool) GetWriter(dst io.Writer) io.WriteCloser {
+	if w := pool.writers.Get(); w != nil {
+		writer := w.(*gzip.Writer)
+		writer.Reset(dst)
+		return writer
+	}
+
+	level := pool.level
+	if level == 0 {
+		level = gzip.DefaultCompression
+	}
+	w, err := gzip.NewWriterLevel(dst, level)
+	if err != nil {
+		panic(err) // never happens, error is only returned on wrong compression level.
+	}
+	return w
+}
+
+// PutWriter places a writer obtained from GetWriter back in the pool
+func (pool *GzipPool) PutWriter(writer io.WriteCloser) {
+	pool.writers.Put(writer)
+}
+
+// LZ4Pool is an lz4 compression pool whose writers use a fixed block size
+type LZ4Pool struct {
+	readers    sync.Pool
+	writers    sync.Pool
+	bufferSize uint32 // available values: 1<<16 (64k), 1<<18 (256k), 1<<20 (1M), 1<<22 (4M). Defaults to 4MB, if not set.
+}
+
+// Encoding implements WriterPool and ReaderPool.  It returns backend.EncNone for an unrecognised bufferSize.
+func (pool *LZ4Pool) Encoding() backend.Encoding {
+	switch pool.bufferSize {
+	case 1 << 16:
+		return backend.EncLZ4_64k
+	case 1 << 18:
+		return backend.EncLZ4_256k
+	case 1 << 20:
+		return backend.EncLZ4_1M
+	case 1 << 22:
+		return backend.EncLZ4_4M
+	}
+
+	return backend.EncNone
+}
+
+// GetReader returns a pooled or new lz4 reader set to read from src
+func (pool *LZ4Pool) GetReader(src io.Reader) io.Reader {
+	var r *lz4.Reader
+	if pooled := pool.readers.Get(); pooled != nil {
+		r = pooled.(*lz4.Reader)
+		r.Reset(src)
+	} else {
+		r = lz4.NewReader(src)
+	}
+	return r
+}
+
+// PutReader places a reader obtained from GetReader back in the pool
+func (pool *LZ4Pool) PutReader(reader io.Reader) {
+	pool.readers.Put(reader)
+}
+
+// GetWriter returns a pooled or new lz4 writer set to write to dst.  It panics if the writer options are rejected.
+func (pool *LZ4Pool) GetWriter(dst io.Writer) io.WriteCloser {
+	var w *lz4.Writer
+	if fromPool := pool.writers.Get(); fromPool != nil {
+		w = fromPool.(*lz4.Writer)
+		w.Reset(dst)
+	} else {
+		w = lz4.NewWriter(dst)
+	}
+	err := w.Apply(
+		lz4.ChecksumOption(false),
+		lz4.BlockSizeOption(lz4.BlockSize(pool.bufferSize)),
+		lz4.CompressionLevelOption(lz4.Fast),
+	)
+	if err != nil {
+		panic(err)
+	}
+	return w
+}
+
+// PutWriter places a writer obtained from GetWriter back in the pool
+func (pool *LZ4Pool) PutWriter(writer io.WriteCloser) {
+	pool.writers.Put(writer)
+}
+
+// SnappyPool is a snappy compression pool
+type SnappyPool struct {
+	readers sync.Pool
+	writers sync.Pool
+}
+
+// Encoding implements WriterPool and ReaderPool
+func (pool *SnappyPool) Encoding() backend.Encoding {
+	return backend.EncSnappy
+}
+
+// GetReader returns a pooled or new snappy reader set to read from src
+func (pool *SnappyPool) GetReader(src io.Reader) io.Reader {
+	if r := pool.readers.Get(); r != nil {
+		reader := r.(*snappy.Reader)
+		reader.Reset(src)
+		return reader
+	}
+	return snappy.NewReader(src)
+}
+
+// PutReader places a reader obtained from GetReader back in the pool
+func (pool *SnappyPool) PutReader(reader io.Reader) {
+	pool.readers.Put(reader)
+}
+
+// GetWriter returns a pooled or new buffered snappy writer set to write to dst
+func (pool *SnappyPool) GetWriter(dst io.Writer) io.WriteCloser {
+	if w := pool.writers.Get(); w != nil {
+		writer := w.(*snappy.Writer)
+		writer.Reset(dst)
+		return writer
+	}
+	return snappy.NewBufferedWriter(dst)
+}
+
+// PutWriter places a writer obtained from GetWriter back in the pool
+func (pool *SnappyPool) PutWriter(writer io.WriteCloser) {
+	pool.writers.Put(writer)
+}
+
+// NoopPool applies no compression; readers and writers pass the bytes through untouched
+type NoopPool struct{}
+
+// Encoding implements WriterPool and ReaderPool
+func (pool *NoopPool) Encoding() backend.Encoding {
+	return backend.EncNone
+}
+
+// GetReader returns src unchanged
+func (pool *NoopPool) GetReader(src io.Reader) io.Reader {
+	return src
+}
+
+// PutReader does nothing; there is nothing to pool
+func (pool *NoopPool) PutReader(reader io.Reader) {}
+
+type noopCloser struct {
+	io.Writer
+}
+
+func (noopCloser) Close() error { return nil }
+
+// GetWriter wraps dst so that it satisfies io.WriteCloser; Close on the result is a no-op
+func (pool *NoopPool) GetWriter(dst io.Writer) io.WriteCloser {
+	return noopCloser{dst}
+}
+
+// PutWriter does nothing; there is nothing to pool
+func (pool *NoopPool) PutWriter(writer io.WriteCloser) {}
+
+// ZstdPool is a zstd compression pool
+type ZstdPool struct {
+	readers sync.Pool
+	writers sync.Pool
+}
+
+// Encoding implements WriterPool and ReaderPool
+func (pool *ZstdPool) Encoding() backend.Encoding {
+	return backend.EncZstd
+}
+
+// GetReader returns a pooled or new zstd decoder set to read from src.  It panics if the decoder cannot be initialised from src.
+func (pool *ZstdPool) GetReader(src io.Reader) io.Reader {
+	if r := pool.readers.Get(); r != nil {
+		reader := r.(*zstd.Decoder)
+		err := reader.Reset(src)
+		if err != nil {
+			panic(err)
+		}
+		return reader
+	}
+	reader, err := zstd.NewReader(src)
+	if err != nil {
+		panic(err)
+	}
+	return reader
+}
+
+// PutReader places a reader obtained from GetReader back in the pool
+func (pool *ZstdPool) PutReader(reader io.Reader) {
+	pool.readers.Put(reader)
+}
+
+// GetWriter returns a pooled or new zstd encoder set to write to dst
+func (pool *ZstdPool) GetWriter(dst io.Writer) io.WriteCloser {
+	if w := pool.writers.Get(); w != nil {
+		writer := w.(*zstd.Encoder)
+		writer.Reset(dst)
+		return writer
+	}
+
+	w, err := zstd.NewWriter(dst)
+	if err != nil {
+		panic(err) // not expected: no encoder options are passed here (NOTE(review): confirm zstd.NewWriter cannot fail without options)
+	}
+	return w
+}
+
+// PutWriter places a writer obtained from GetWriter back in the pool
+func (pool *ZstdPool) PutWriter(writer io.WriteCloser) {
+	pool.writers.Put(writer)
+}
diff --git a/tempodb/encoding/v1/pool_test.go b/tempodb/encoding/v1/pool_test.go
new file mode 100644
index 00000000000..190ec3ab5b8
--- /dev/null
+++ b/tempodb/encoding/v1/pool_test.go
@@ -0,0 +1,33 @@
+package v1
+
+import (
+	"fmt"
+	"testing"
+
+	"github.com/grafana/tempo/tempodb/backend"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestGetPool(t *testing.T) {
+	for _, enc := range backend.SupportedEncoding {
+		t.Run(fmt.Sprintf("testing %s", enc), func(t *testing.T) {
+			rPool, err := getReaderPool(enc)
+			assert.NotNil(t, rPool)
+			assert.NoError(t, err)
+			assert.Equal(t, enc, rPool.Encoding())
+
+			wPool, err := getWriterPool(enc)
+			assert.NotNil(t, wPool)
+			assert.NoError(t, err)
+			assert.Equal(t, enc, wPool.Encoding())
+		})
+	}
+
+	rPool, err := getReaderPool(maxEncoding + 1) // expected to be rejected: nil pool plus an error
+	assert.Nil(t, rPool)
+	assert.Error(t, err)
+
+	wPool, err := getWriterPool(maxEncoding + 1) // same expectation for the writer lookup
+	assert.Nil(t, wPool)
+	assert.Error(t, err)
+}
diff --git a/tempodb/encoding/versioned.go b/tempodb/encoding/versioned.go
index ee0e636cb01..826b86111c9 100644
--- a/tempodb/encoding/versioned.go
+++ b/tempodb/encoding/versioned.go
@@ -2,56 +2,121 @@ package encoding
 
 import (
 	"context"
-	"fmt"
 	"io"
 
 	"github.com/grafana/tempo/tempodb/backend"
 	"github.com/grafana/tempo/tempodb/encoding/common"
 	v0 "github.com/grafana/tempo/tempodb/encoding/v0"
+	v1 "github.com/grafana/tempo/tempodb/encoding/v1"
 )
 
-// Current contains a string indicating the most recent block version
-const Current = "v0"
+const currentVersion = "v1"
 
-// BackendBlock defines an object that can be used to interact with a block in object storage
-type BackendBlock interface {
-	// Find searches for a given ID and returns the object if exists
-	Find(ctx context.Context, r backend.Reader, id common.ID, metrics *common.FindMetrics) ([]byte, error)
-	// Iterator returns an iterator that can be used to examine every object in the block
-	Iterator(chunkSizeBytes uint32, r backend.Reader) (common.Iterator, error)
-}
+// versionedEncoding groups the operations whose implementation differs between
+// block versions.  The interface is broader than necessary and could be
+// tightened up to just a few methods.
+type versionedEncoding interface {
+	newBufferedAppender(writer io.Writer, encoding backend.Encoding, indexDownsample int, totalObjectsEstimate int) (common.Appender, error)
+	newPagedFinder(indexReader common.IndexReader, pageReader common.PageReader, combiner common.ObjectCombiner) common.Finder
+	newPagedIterator(chunkSizeBytes uint32, indexBytes []byte, pageReader common.PageReader) (common.Iterator, error)
 
-// newBackendBlock returns a BackendBlock for the given backend.BlockMeta
-//  It is version aware.
-func NewBackendBlock(meta *backend.BlockMeta) (BackendBlock, error) {
-	if meta.Version == "v0" {
-		return v0.NewBackendBlock(meta), nil
-	}
+	writeBlockMeta(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, records []*common.Record, b *common.ShardedBloomFilter) error
+	writeBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, r io.Reader, size int64) error
+	appendBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, tracker backend.AppendTracker, buffer []byte) (backend.AppendTracker, error)
 
-	return nil, fmt.Errorf("%s is not a valid block version", meta.Version)
+	newPageReader(ra io.ReaderAt, encoding backend.Encoding) (common.PageReader, error)
+	newIndexReader(indexBytes []byte) (common.IndexReader, error)
+
+	nameIndex() string
+	nameObjects() string
+	nameBloom(shard int) string
 }
 
-// newBufferedAppender returns the most recent Appender
-func newBufferedAppender(writer io.Writer, indexDownsample int, totalObjectsEstimate int) common.Appender {
-	return v0.NewBufferedAppender(writer, indexDownsample, totalObjectsEstimate)
+// latestEncoding returns the newest encoding (currently v1); it is used by Compactor and Complete block
+func latestEncoding() versionedEncoding {
+	return v1Encoding{}
 }
 
-// newDedupingFinder returns the most recent Finder
-func newDedupingFinder(sortedRecords []*common.Record, ra io.ReaderAt, combiner common.ObjectCombiner) common.Finder {
-	return v0.NewDedupingFinder(sortedRecords, ra, combiner)
+// v0Encoding adapts package v0 to versionedEncoding; the requested backend.Encoding is ignored.
+type v0Encoding struct{}
+
+func (v v0Encoding) newBufferedAppender(writer io.Writer, _ backend.Encoding, indexDownsample int, totalObjectsEstimate int) (common.Appender, error) {
+	return v0.NewBufferedAppender(writer, indexDownsample, totalObjectsEstimate), nil
 }
+func (v v0Encoding) newPagedIterator(chunkSizeBytes uint32, indexBytes []byte, pageReader common.PageReader) (common.Iterator, error) {
+	reader, err := v0.NewIndexReader(indexBytes)
+	if err != nil {
+		return nil, err
+	}
 
-// writeBlockMeta calls the most recent WriteBlockMeta
-func writeBlockMeta(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, records []*common.Record, b *common.ShardedBloomFilter) error {
+	return v0.NewPagedIterator(chunkSizeBytes, reader, pageReader), nil
+}
+func (v v0Encoding) newPagedFinder(indexReader common.IndexReader, pageReader common.PageReader, combiner common.ObjectCombiner) common.Finder {
+	return v0.NewPagedFinder(indexReader, pageReader, combiner)
+}
+func (v v0Encoding) newIndexReader(indexBytes []byte) (common.IndexReader, error) {
+	return v0.NewIndexReader(indexBytes)
+}
+func (v v0Encoding) newPageReader(ra io.ReaderAt, encoding backend.Encoding) (common.PageReader, error) {
+	return v0.NewPageReader(ra), nil
+}
+func (v v0Encoding) writeBlockMeta(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, records []*common.Record, b *common.ShardedBloomFilter) error {
 	return v0.WriteBlockMeta(ctx, w, meta, records, b)
 }
-
-// writeBlockData calls the most recent WriteBlockData
-func writeBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, r io.Reader, size int64) error {
+func (v v0Encoding) writeBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, r io.Reader, size int64) error {
 	return v0.WriteBlockData(ctx, w, meta, r, size)
 }
-
-// appendBlockData calls the most recent AppendBlockData
-func appendBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, tracker backend.AppendTracker, buffer []byte) (backend.AppendTracker, error) {
+func (v v0Encoding) appendBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, tracker backend.AppendTracker, buffer []byte) (backend.AppendTracker, error) {
 	return v0.AppendBlockData(ctx, w, meta, tracker, buffer)
 }
+func (v v0Encoding) nameObjects() string {
+	return v0.NameObjects
+}
+func (v v0Encoding) nameIndex() string {
+	return v0.NameIndex
+}
+func (v v0Encoding) nameBloom(shard int) string {
+	return v0.BloomName(shard)
+}
+
+// v1Encoding adapts package v1 to versionedEncoding, forwarding the backend.Encoding to v1.
+type v1Encoding struct{}
+
+func (v v1Encoding) newBufferedAppender(writer io.Writer, encoding backend.Encoding, indexDownsample int, totalObjectsEstimate int) (common.Appender, error) {
+	return v1.NewBufferedAppender(writer, encoding, indexDownsample, totalObjectsEstimate)
+}
+func (v v1Encoding) newPagedIterator(chunkSizeBytes uint32, indexBytes []byte, pageReader common.PageReader) (common.Iterator, error) {
+	reader, err := v1.NewIndexReader(indexBytes)
+	if err != nil {
+		return nil, err
+	}
+
+	return v1.NewPagedIterator(chunkSizeBytes, reader, pageReader), nil
+}
+func (v v1Encoding) newPagedFinder(indexReader common.IndexReader, pageReader common.PageReader, combiner common.ObjectCombiner) common.Finder {
+	return v1.NewPagedFinder(indexReader, pageReader, combiner)
+}
+func (v v1Encoding) newPageReader(ra io.ReaderAt, encoding backend.Encoding) (common.PageReader, error) {
+	return v1.NewPageReader(ra, encoding)
+}
+func (v v1Encoding) newIndexReader(indexBytes []byte) (common.IndexReader, error) {
+	return v1.NewIndexReader(indexBytes)
+}
+func (v v1Encoding) writeBlockMeta(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, records []*common.Record, b *common.ShardedBloomFilter) error {
+	return v1.WriteBlockMeta(ctx, w, meta, records, b)
+}
+func (v v1Encoding) writeBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, r io.Reader, size int64) error {
+	return v1.WriteBlockData(ctx, w, meta, r, size)
+}
+func (v v1Encoding) appendBlockData(ctx context.Context, w backend.Writer, meta *backend.BlockMeta, tracker backend.AppendTracker, buffer []byte) (backend.AppendTracker, error) {
+	return v1.AppendBlockData(ctx, w, meta, tracker, buffer)
+}
+func (v v1Encoding) nameObjects() string {
+	return v1.NameObjects()
+}
+func (v v1Encoding) nameIndex() string {
+	return v1.NameIndex()
+}
+func (v v1Encoding) nameBloom(shard int) string {
+	return v1.BloomName(shard)
+}
diff --git a/tempodb/retention_test.go b/tempodb/retention_test.go
index e84b1aa528c..ea449ea9cc5 100644
--- a/tempodb/retention_test.go
+++ b/tempodb/retention_test.go
@@ -12,7 +12,9 @@ import (
 	"github.com/google/uuid"
 	"github.com/stretchr/testify/assert"
 
+	"github.com/grafana/tempo/tempodb/backend"
 	"github.com/grafana/tempo/tempodb/backend/local"
+	"github.com/grafana/tempo/tempodb/encoding"
 	"github.com/grafana/tempo/tempodb/wal"
 )
 
@@ -26,10 +28,13 @@ func TestRetention(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
+		Block: &encoding.BlockConfig{
 			IndexDownsample: 17,
 			BloomFP:         .01,
+			Encoding:        backend.EncLZ4_256k,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -50,7 +55,7 @@ func TestRetention(t *testing.T) {
 	head, err := wal.NewBlock(blockID, testTenantID)
 	assert.NoError(t, err)
 
-	complete, err := head.Complete(wal, &mockSharder{})
+	complete, err := w.CompleteBlock(head, &mockSharder{})
 	assert.NoError(t, err)
 	blockID = complete.BlockMeta().BlockID
 
diff --git a/tempodb/tempodb.go b/tempodb/tempodb.go
index 4b7283f619b..f6c2e0ab439 100644
--- a/tempodb/tempodb.go
+++ b/tempodb/tempodb.go
@@ -81,11 +81,12 @@ var (
 
 type Writer interface {
 	WriteBlock(ctx context.Context, block WriteableBlock) error
+	CompleteBlock(block *wal.AppendBlock, combiner common.ObjectCombiner) (*encoding.CompleteBlock, error)
 	WAL() *wal.WAL
 }
 
 type Reader interface {
-	Find(ctx context.Context, tenantID string, id common.ID, blockStart string, blockEnd string) ([][]byte, common.FindMetrics, error)
+	Find(ctx context.Context, tenantID string, id common.ID, blockStart string, blockEnd string) ([][]byte, error)
 	Shutdown()
 }
 
@@ -121,12 +122,17 @@ type readerWriter struct {
 	compactorTenantOffset uint
 }
 
+// New creates a new tempodb
 func New(cfg *Config, logger log.Logger) (Reader, Writer, Compactor, error) {
-	var err error
 	var r backend.Reader
 	var w backend.Writer
 	var c backend.Compactor
 
+	err := validateConfig(cfg)
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("invalid config while creating tempodb: %w", err)
+	}
+
 	switch cfg.Backend {
 	case "local":
 		r, w, c, err = local.New(cfg.Local)
@@ -189,13 +195,15 @@ func (rw *readerWriter) WriteBlock(ctx context.Context, c WriteableBlock) error
 	return c.Write(ctx, rw.w)
 }
 
+func (rw *readerWriter) CompleteBlock(block *wal.AppendBlock, combiner common.ObjectCombiner) (*encoding.CompleteBlock, error) {
+	return block.Complete(rw.cfg.Block, rw.wal, combiner)
+}
+
 func (rw *readerWriter) WAL() *wal.WAL {
 	return rw.wal
 }
 
-func (rw *readerWriter) Find(ctx context.Context, tenantID string, id common.ID, blockStart string, blockEnd string) ([][]byte, common.FindMetrics, error) {
-	metrics := common.NewFindMetrics()
-
+func (rw *readerWriter) Find(ctx context.Context, tenantID string, id common.ID, blockStart string, blockEnd string) ([][]byte, error) {
 	// tracing instrumentation
 	logger := util.WithContext(ctx, util.Logger)
 	span, ctx := opentracing.StartSpanFromContext(ctx, "store.Find")
@@ -203,19 +211,19 @@ func (rw *readerWriter) Find(ctx context.Context, tenantID string, id common.ID,
 
 	blockStartUUID, err := uuid.Parse(blockStart)
 	if err != nil {
-		return nil, metrics, err
+		return nil, err
 	}
 	blockStartBytes, err := blockStartUUID.MarshalBinary()
 	if err != nil {
-		return nil, metrics, err
+		return nil, err
 	}
 	blockEndUUID, err := uuid.Parse(blockEnd)
 	if err != nil {
-		return nil, metrics, err
+		return nil, err
 	}
 	blockEndBytes, err := blockEndUUID.MarshalBinary()
 	if err != nil {
-		return nil, metrics, err
+		return nil, err
 	}
 
 	rw.blockListsMtx.Lock()
@@ -237,17 +245,17 @@ func (rw *readerWriter) Find(ctx context.Context, tenantID string, id common.ID,
 
 	// deliberately placed outside the blocklist mtx unlock
 	if !found {
-		return nil, metrics, nil
+		return nil, nil
 	}
 
 	partialTraces, err := rw.pool.RunJobs(ctx, copiedBlocklist, func(ctx context.Context, payload interface{}) ([]byte, error) {
 		meta := payload.(*backend.BlockMeta)
-		block, err := encoding.NewBackendBlock(meta)
+		block, err := encoding.NewBackendBlock(meta, rw.r)
 		if err != nil {
 			return nil, err
 		}
 
-		foundObject, err := block.Find(ctx, rw.r, id, &metrics)
+		foundObject, err := block.Find(ctx, id)
 		if err != nil {
 			return nil, err
 		}
@@ -262,7 +270,7 @@ func (rw *readerWriter) Find(ctx context.Context, tenantID string, id common.ID,
 		return foundObject, nil
 	})
 
-	return partialTraces, metrics, err
+	return partialTraces, err
 }
 
 func (rw *readerWriter) Shutdown() {
diff --git a/tempodb/tempodb_test.go b/tempodb/tempodb_test.go
index 615f37ecc65..d318f6891d2 100644
--- a/tempodb/tempodb_test.go
+++ b/tempodb/tempodb_test.go
@@ -17,6 +17,7 @@ import (
 	"github.com/grafana/tempo/pkg/util/test"
 	"github.com/grafana/tempo/tempodb/backend"
 	"github.com/grafana/tempo/tempodb/backend/local"
+	"github.com/grafana/tempo/tempodb/encoding"
 	"github.com/grafana/tempo/tempodb/wal"
 	"github.com/stretchr/testify/assert"
 )
@@ -36,10 +37,13 @@ func TestDB(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
+		Block: &encoding.BlockConfig{
 			IndexDownsample: 17,
 			BloomFP:         .01,
+			Encoding:        backend.EncGZIP,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -76,7 +80,7 @@ func TestDB(t *testing.T) {
 		assert.NoError(t, err, "unexpected error writing req")
 	}
 
-	complete, err := head.Complete(wal, &mockSharder{})
+	complete, err := w.CompleteBlock(head, &mockSharder{})
 	assert.NoError(t, err)
 
 	err = w.WriteBlock(context.Background(), complete)
@@ -87,7 +91,7 @@ func TestDB(t *testing.T) {
 
 	// read
 	for i, id := range ids {
-		bFound, _, err := r.Find(context.Background(), testTenantID, id, BlockIDMin, BlockIDMax)
+		bFound, err := r.Find(context.Background(), testTenantID, id, BlockIDMin, BlockIDMax)
 		assert.NoError(t, err)
 
 		out := &tempopb.PushRequest{}
@@ -112,10 +116,13 @@ func TestBlockSharding(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
+		Block: &encoding.BlockConfig{
 			IndexDownsample: 17,
 			BloomFP:         .01,
+			Encoding:        backend.EncLZ4_256k,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -139,7 +146,7 @@ func TestBlockSharding(t *testing.T) {
 	assert.NoError(t, err, "unexpected error writing req")
 
 	// write block to backend
-	complete, err := head.Complete(wal, &mockSharder{})
+	complete, err := w.CompleteBlock(head, &mockSharder{})
 	assert.NoError(t, err)
 
 	err = w.WriteBlock(context.Background(), complete)
@@ -157,7 +164,7 @@ func TestBlockSharding(t *testing.T) {
 	// check if it respects the blockstart/blockend params - case1: hit
 	blockStart := uuid.MustParse(BlockIDMin).String()
 	blockEnd := uuid.MustParse(BlockIDMax).String()
-	bFound, _, err := r.Find(context.Background(), testTenantID, id, blockStart, blockEnd)
+	bFound, err := r.Find(context.Background(), testTenantID, id, blockStart, blockEnd)
 	assert.NoError(t, err)
 	assert.Greater(t, len(bFound), 0)
 
@@ -169,7 +176,7 @@ func TestBlockSharding(t *testing.T) {
 	// check if it respects the blockstart/blockend params - case2: miss
 	blockStart = uuid.MustParse(BlockIDMin).String()
 	blockEnd = uuid.MustParse(BlockIDMin).String()
-	bFound, _, err = r.Find(context.Background(), testTenantID, id, blockStart, blockEnd)
+	bFound, err = r.Find(context.Background(), testTenantID, id, blockStart, blockEnd)
 	assert.NoError(t, err)
 	assert.Len(t, bFound, 0)
 }
@@ -184,16 +191,19 @@ func TestNilOnUnknownTenantID(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
+		Block: &encoding.BlockConfig{
 			IndexDownsample: 17,
 			BloomFP:         .01,
+			Encoding:        backend.EncLZ4_256k,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
 	assert.NoError(t, err)
 
-	buff, _, err := r.Find(context.Background(), "unknown", []byte{0x01}, BlockIDMin, BlockIDMax)
+	buff, err := r.Find(context.Background(), "unknown", []byte{0x01}, BlockIDMin, BlockIDMax)
 	assert.Nil(t, buff)
 	assert.Nil(t, err)
 }
@@ -208,10 +218,13 @@ func TestBlockCleanup(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
+		Block: &encoding.BlockConfig{
 			IndexDownsample: 17,
 			BloomFP:         .01,
+			Encoding:        backend.EncLZ4_256k,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -232,7 +245,7 @@ func TestBlockCleanup(t *testing.T) {
 	head, err := wal.NewBlock(blockID, testTenantID)
 	assert.NoError(t, err)
 
-	complete, err := head.Complete(wal, &mockSharder{})
+	complete, err := w.CompleteBlock(head, &mockSharder{})
 	assert.NoError(t, err)
 
 	err = w.WriteBlock(context.Background(), complete)
@@ -287,10 +300,13 @@ func TestCleanMissingTenants(t *testing.T) {
 				Local: &local.Config{
 					Path: path.Join("/tmp", "traces"),
 				},
-				WAL: &wal.Config{
-					Filepath:        path.Join("/tmp", "wal"),
+				Block: &encoding.BlockConfig{
 					IndexDownsample: 17,
 					BloomFP:         .01,
+					Encoding:        backend.EncLZ4_256k,
+				},
+				WAL: &wal.Config{
+					Filepath: path.Join("/tmp", "wal"),
 				},
 				BlocklistPoll: 0,
 			}, log.NewNopLogger())
@@ -344,10 +360,13 @@ func TestUpdateBlocklist(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
+		Block: &encoding.BlockConfig{
 			IndexDownsample: 17,
 			BloomFP:         .01,
+			Encoding:        backend.EncLZ4_256k,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
@@ -528,10 +547,13 @@ func TestUpdateBlocklistCompacted(t *testing.T) {
 		Local: &local.Config{
 			Path: path.Join(tempDir, "traces"),
 		},
-		WAL: &wal.Config{
-			Filepath:        path.Join(tempDir, "wal"),
+		Block: &encoding.BlockConfig{
 			IndexDownsample: 17,
 			BloomFP:         .01,
+			Encoding:        backend.EncLZ4_256k,
+		},
+		WAL: &wal.Config{
+			Filepath: path.Join(tempDir, "wal"),
 		},
 		BlocklistPoll: 0,
 	}, log.NewNopLogger())
diff --git a/tempodb/wal/append_block.go b/tempodb/wal/append_block.go
index 95080d88a62..b577a7bea5f 100644
--- a/tempodb/wal/append_block.go
+++ b/tempodb/wal/append_block.go
@@ -10,6 +10,12 @@ import (
 	v0 "github.com/grafana/tempo/tempodb/encoding/v0"
 )
 
+// Placeholder version and encoding for WAL blocks.  Append blocks are not
+// versioned and do not support different encodings, so these values should
+// never be used; being dummy values, they help detect leaks elsewhere.
+const appendBlockVersion = "append"
+const appendBlockEncoding = backend.EncNone
+
 // AppendBlock is a block that is actively used to append new objects to.  It stores all data in the appendFile
 // in the order it was received and an in memory sorted index.
 type AppendBlock struct {
@@ -22,7 +28,7 @@ type AppendBlock struct {
 func newAppendBlock(id uuid.UUID, tenantID string, filepath string) (*AppendBlock, error) {
 	h := &AppendBlock{
 		block: block{
-			meta:     backend.NewBlockMeta(tenantID, id),
+			meta:     backend.NewBlockMeta(tenantID, id, appendBlockVersion, appendBlockEncoding),
 			filepath: filepath,
 		},
 	}
@@ -61,7 +67,7 @@ func (h *AppendBlock) DataLength() uint64 {
 // includes an on disk file containing all objects in order.
 // Note that calling this method leaves the original file on disk.  This file is still considered to be part of the WAL
 // until Write() is successfully called on the CompleteBlock.
-func (h *AppendBlock) Complete(w *WAL, combiner common.ObjectCombiner) (*encoding.CompleteBlock, error) {
+func (h *AppendBlock) Complete(cfg *encoding.BlockConfig, w *WAL, combiner common.ObjectCombiner) (*encoding.CompleteBlock, error) {
 	if h.appendFile != nil {
 		err := h.appendFile.Close()
 		if err != nil {
@@ -81,7 +87,7 @@ func (h *AppendBlock) Complete(w *WAL, combiner common.ObjectCombiner) (*encodin
 		return nil, err
 	}
 
-	orderedBlock, err := encoding.NewCompleteBlock(h.meta, iterator, w.c.BloomFP, len(records), w.c.IndexDownsample, w.c.CompletedFilepath, h.fullFilename())
+	orderedBlock, err := encoding.NewCompleteBlock(cfg, h.meta, iterator, len(records), w.c.CompletedFilepath, h.fullFilename())
 	if err != nil {
 		return nil, err
 	}
@@ -96,7 +102,7 @@ func (h *AppendBlock) Find(id common.ID, combiner common.ObjectCombiner) ([]byte
 		return nil, err
 	}
 
-	finder := v0.NewDedupingFinder(records, file, combiner)
+	finder := v0.NewPagedFinder(common.Records(records), v0.NewPageReader(file), combiner)
 
 	return finder.Find(id)
 }
diff --git a/tempodb/wal/wal.go b/tempodb/wal/wal.go
index bdd2f1fc25f..6f4978e5633 100644
--- a/tempodb/wal/wal.go
+++ b/tempodb/wal/wal.go
@@ -22,8 +22,6 @@ type WAL struct {
 type Config struct {
 	Filepath          string `yaml:"path"`
 	CompletedFilepath string
-	IndexDownsample   int     `yaml:"index_downsample"`
-	BloomFP           float64 `yaml:"bloom_filter_false_positive"`
 }
 
 func New(c *Config) (*WAL, error) {
@@ -31,14 +29,6 @@ func New(c *Config) (*WAL, error) {
 		return nil, fmt.Errorf("please provide a path for the WAL")
 	}
 
-	if c.IndexDownsample == 0 {
-		return nil, fmt.Errorf("Non-zero index downsample required")
-	}
-
-	if c.BloomFP <= 0.0 {
-		return nil, fmt.Errorf("invalid bloom filter fp rate %v", c.BloomFP)
-	}
-
 	// make folder
 	err := os.MkdirAll(c.Filepath, os.ModePerm)
 	if err != nil {
@@ -84,7 +74,7 @@ func (w *WAL) AllBlocks() ([]*ReplayBlock, error) {
 
 		blocks = append(blocks, &ReplayBlock{
 			block: block{
-				meta:     backend.NewBlockMeta(tenantID, blockID),
+				meta:     backend.NewBlockMeta(tenantID, blockID, appendBlockVersion, appendBlockEncoding),
 				filepath: w.c.Filepath,
 			},
 		})
diff --git a/tempodb/wal/wal_test.go b/tempodb/wal/wal_test.go
index 5db54838b11..e7cd11a301a 100644
--- a/tempodb/wal/wal_test.go
+++ b/tempodb/wal/wal_test.go
@@ -13,6 +13,8 @@ import (
 
 	"github.com/grafana/tempo/pkg/tempopb"
 	"github.com/grafana/tempo/pkg/util/test"
+	"github.com/grafana/tempo/tempodb/backend"
+	"github.com/grafana/tempo/tempodb/encoding"
 	v0 "github.com/grafana/tempo/tempodb/encoding/v0"
 )
 
@@ -37,9 +39,7 @@ func TestCreateBlock(t *testing.T) {
 	assert.NoError(t, err, "unexpected error creating temp dir")
 
 	wal, err := New(&Config{
-		Filepath:        tempDir,
-		IndexDownsample: 2,
-		BloomFP:         0.1,
+		Filepath: tempDir,
 	})
 	assert.NoError(t, err, "unexpected error creating temp wal")
 
@@ -61,9 +61,7 @@ func TestReadWrite(t *testing.T) {
 	assert.NoError(t, err, "unexpected error creating temp dir")
 
 	wal, err := New(&Config{
-		Filepath:        tempDir,
-		IndexDownsample: 2,
-		BloomFP:         0.1,
+		Filepath: tempDir,
 	})
 	assert.NoError(t, err, "unexpected error creating temp wal")
 
@@ -93,9 +91,7 @@ func TestAppend(t *testing.T) {
 	assert.NoError(t, err, "unexpected error creating temp dir")
 
 	wal, err := New(&Config{
-		Filepath:        tempDir,
-		IndexDownsample: 2,
-		BloomFP:         0.1,
+		Filepath: tempDir,
 	})
 	assert.NoError(t, err, "unexpected error creating temp wal")
 
@@ -143,11 +139,8 @@ func TestAppendBlockComplete(t *testing.T) {
 	defer os.RemoveAll(tempDir)
 	assert.NoError(t, err, "unexpected error creating temp dir")
 
-	indexDownsample := 13
 	wal, err := New(&Config{
-		Filepath:        tempDir,
-		IndexDownsample: indexDownsample,
-		BloomFP:         .01,
+		Filepath: tempDir,
 	})
 	assert.NoError(t, err, "unexpected error creating temp wal")
 
@@ -171,7 +164,11 @@ func TestAppendBlockComplete(t *testing.T) {
 		assert.NoError(t, err, "unexpected error writing req")
 	}
 
-	complete, err := block.Complete(wal, &mockCombiner{})
+	complete, err := block.Complete(&encoding.BlockConfig{
+		IndexDownsample: 13,
+		BloomFP:         .01,
+		Encoding:        backend.EncGZIP,
+	}, wal, &mockCombiner{})
 	assert.NoError(t, err, "unexpected error completing block")
 
 	for i, id := range ids {
@@ -198,9 +195,7 @@ func TestWorkDir(t *testing.T) {
 	assert.NoError(t, err, "unexpected error creating testfile")
 
 	_, err = New(&Config{
-		Filepath:        tempDir,
-		IndexDownsample: 2,
-		BloomFP:         0.1,
+		Filepath: tempDir,
 	})
 	assert.NoError(t, err, "unexpected error creating temp wal")
 
@@ -218,9 +213,7 @@ func BenchmarkWriteRead(b *testing.B) {
 	defer os.RemoveAll(tempDir)
 
 	wal, _ := New(&Config{
-		Filepath:        tempDir,
-		IndexDownsample: 2,
-		BloomFP:         0.1,
+		Filepath: tempDir,
 	})
 
 	blockID := uuid.New()
diff --git a/vendor/github.com/klauspost/compress/LICENSE b/vendor/github.com/klauspost/compress/LICENSE
new file mode 100644
index 00000000000..1eb75ef68e4
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2012 The Go Authors. All rights reserved.
+Copyright (c) 2019 Klaus Post. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/compress/flate/deflate.go b/vendor/github.com/klauspost/compress/flate/deflate.go
new file mode 100644
index 00000000000..25dbe3e15f4
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@@ -0,0 +1,821 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright (c) 2015 Klaus Post
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"fmt"
+	"io"
+	"math"
+)
+
+const (
+	NoCompression      = 0
+	BestSpeed          = 1
+	BestCompression    = 9
+	DefaultCompression = -1
+
+	// HuffmanOnly disables Lempel-Ziv match searching and only performs Huffman
+	// entropy encoding. This mode is useful in compressing data that has
+	// already been compressed with an LZ style algorithm (e.g. Snappy or LZ4)
+	// that lacks an entropy encoder. Compression gains are achieved when
+	// certain bytes in the input stream occur more frequently than others.
+	//
+	// Note that HuffmanOnly produces a compressed output that is
+	// RFC 1951 compliant. That is, any valid DEFLATE decompressor will
+	// continue to be able to decompress this output.
+	HuffmanOnly         = -2
+	ConstantCompression = HuffmanOnly // compatibility alias.
+
+	logWindowSize    = 15
+	windowSize       = 1 << logWindowSize
+	windowMask       = windowSize - 1
+	logMaxOffsetSize = 15  // Standard DEFLATE
+	minMatchLength   = 4   // The smallest match that the compressor looks for
+	maxMatchLength   = 258 // The longest match for the compressor
+	minOffsetSize    = 1   // The shortest offset that makes any sense
+
+	// The maximum number of tokens we put into a single flate block, just to
+	// stop things from getting too large.
+	maxFlateBlockTokens = 1 << 14
+	maxStoreBlockSize   = 65535
+	hashBits            = 17 // After 17 performance degrades
+	hashSize            = 1 << hashBits
+	hashMask            = (1 << hashBits) - 1
+	hashShift           = (hashBits + minMatchLength - 1) / minMatchLength
+	maxHashOffset       = 1 << 24
+
+	skipNever = math.MaxInt32
+
+	debugDeflate = false
+)
+
+type compressionLevel struct {
+	good, lazy, nice, chain, fastSkipHashing, level int
+}
+
+// Compression levels have been rebalanced from zlib deflate defaults
+// to give a bigger spread in speed and compression.
+// See https://blog.klauspost.com/rebalancing-deflate-compression-levels/
+var levels = []compressionLevel{
+	{}, // 0
+	// Level 1-6 uses specialized algorithm - values not used
+	{0, 0, 0, 0, 0, 1},
+	{0, 0, 0, 0, 0, 2},
+	{0, 0, 0, 0, 0, 3},
+	{0, 0, 0, 0, 0, 4},
+	{0, 0, 0, 0, 0, 5},
+	{0, 0, 0, 0, 0, 6},
+	// Levels 7-9 use increasingly more lazy matching
+	// and increasingly stringent conditions for "good enough".
+	{8, 8, 24, 16, skipNever, 7},
+	{10, 16, 24, 64, skipNever, 8},
+	{32, 258, 258, 4096, skipNever, 9},
+}
+
+// advancedState contains state for the advanced levels, with bigger hash tables, etc.
+type advancedState struct {
+	// deflate state
+	length         int
+	offset         int
+	maxInsertIndex int
+
+	// Input hash chains
+	// hashHead[hashValue] contains the largest inputIndex with the specified hash value
+	// If hashHead[hashValue] is within the current window, then
+	// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
+	// with the same hash value.
+	chainHead  int
+	hashHead   [hashSize]uint32
+	hashPrev   [windowSize]uint32
+	hashOffset int
+
+	// input window: unprocessed data is window[index:windowEnd]
+	index     int
+	hashMatch [maxMatchLength + minMatchLength]uint32
+
+	hash uint32
+	ii   uint16 // position of last match, intended to overflow to reset.
+}
+
+type compressor struct {
+	compressionLevel
+
+	w *huffmanBitWriter
+
+	// compression algorithm
+	fill func(*compressor, []byte) int // copy data to window
+	step func(*compressor)             // process window
+
+	window     []byte
+	windowEnd  int
+	blockStart int // window index where current tokens start
+	err        error
+
+	// queued output tokens
+	tokens tokens
+	fast   fastEnc
+	state  *advancedState
+
+	sync          bool // requesting flush
+	byteAvailable bool // if true, still need to process window[index-1].
+}
+
+func (d *compressor) fillDeflate(b []byte) int {
+	s := d.state
+	if s.index >= 2*windowSize-(minMatchLength+maxMatchLength) {
+		// shift the window by windowSize
+		copy(d.window[:], d.window[windowSize:2*windowSize])
+		s.index -= windowSize
+		d.windowEnd -= windowSize
+		if d.blockStart >= windowSize {
+			d.blockStart -= windowSize
+		} else {
+			d.blockStart = math.MaxInt32
+		}
+		s.hashOffset += windowSize
+		if s.hashOffset > maxHashOffset {
+			delta := s.hashOffset - 1
+			s.hashOffset -= delta
+			s.chainHead -= delta
+			// Iterate over slices instead of arrays to avoid copying
+			// the entire table onto the stack (Issue #18625).
+			for i, v := range s.hashPrev[:] {
+				if int(v) > delta {
+					s.hashPrev[i] = uint32(int(v) - delta)
+				} else {
+					s.hashPrev[i] = 0
+				}
+			}
+			for i, v := range s.hashHead[:] {
+				if int(v) > delta {
+					s.hashHead[i] = uint32(int(v) - delta)
+				} else {
+					s.hashHead[i] = 0
+				}
+			}
+		}
+	}
+	n := copy(d.window[d.windowEnd:], b)
+	d.windowEnd += n
+	return n
+}
+
+func (d *compressor) writeBlock(tok *tokens, index int, eof bool) error {
+	if index > 0 || eof {
+		var window []byte
+		if d.blockStart <= index {
+			window = d.window[d.blockStart:index]
+		}
+		d.blockStart = index
+		d.w.writeBlock(tok, eof, window)
+		return d.w.err
+	}
+	return nil
+}
+
+// writeBlockSkip writes the current block and uses the number of tokens
+// to determine if the block should be stored on no matches, or
+// only huffman encoded.
+func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error {
+	if index > 0 || eof {
+		if d.blockStart <= index {
+			window := d.window[d.blockStart:index]
+			// If we removed less than a 64th of all literals
+			// we huffman compress the block.
+			if int(tok.n) > len(window)-int(tok.n>>6) {
+				d.w.writeBlockHuff(eof, window, d.sync)
+			} else {
+				// Write a dynamic huffman block.
+				d.w.writeBlockDynamic(tok, eof, window, d.sync)
+			}
+		} else {
+			d.w.writeBlock(tok, eof, nil)
+		}
+		d.blockStart = index
+		return d.w.err
+	}
+	return nil
+}
+
+// fillWindow will fill the current window with the supplied
+// dictionary and calculate all hashes.
+// This is much faster than doing a full encode.
+// Should only be used after a start/reset.
+func (d *compressor) fillWindow(b []byte) {
+	// Do not fill window if we are in store-only or huffman mode.
+	if d.level <= 0 {
+		return
+	}
+	if d.fast != nil {
+		// encode the last data, but discard the result
+		if len(b) > maxMatchOffset {
+			b = b[len(b)-maxMatchOffset:]
+		}
+		d.fast.Encode(&d.tokens, b)
+		d.tokens.Reset()
+		return
+	}
+	s := d.state
+	// If we are given too much, cut it.
+	if len(b) > windowSize {
+		b = b[len(b)-windowSize:]
+	}
+	// Add all to window.
+	n := copy(d.window[d.windowEnd:], b)
+
+	// Calculate 256 hashes at a time (more L1 cache hits)
+	loops := (n + 256 - minMatchLength) / 256
+	for j := 0; j < loops; j++ {
+		startindex := j * 256
+		end := startindex + 256 + minMatchLength - 1
+		if end > n {
+			end = n
+		}
+		tocheck := d.window[startindex:end]
+		dstSize := len(tocheck) - minMatchLength + 1
+
+		if dstSize <= 0 {
+			continue
+		}
+
+		dst := s.hashMatch[:dstSize]
+		bulkHash4(tocheck, dst)
+		var newH uint32
+		for i, val := range dst {
+			di := i + startindex
+			newH = val & hashMask
+			// Get previous value with the same hash.
+			// Our chain should point to the previous value.
+			s.hashPrev[di&windowMask] = s.hashHead[newH]
+			// Set the head of the hash chain to us.
+			s.hashHead[newH] = uint32(di + s.hashOffset)
+		}
+		s.hash = newH
+	}
+	// Update window information.
+	d.windowEnd += n
+	s.index = n
+}
+
+// findMatch tries to find a match starting at pos whose length is greater than prevLength.
+// We look at no more than d.chain candidates before giving up.
+// Called with pos = s.index, prevHead = s.chainHead-s.hashOffset, prevLength = minMatchLength-1, lookahead = d.windowEnd-s.index.
+func (d *compressor) findMatch(pos int, prevHead int, prevLength int, lookahead int) (length, offset int, ok bool) {
+	minMatchLook := maxMatchLength
+	if lookahead < minMatchLook {
+		minMatchLook = lookahead
+	}
+
+	win := d.window[0 : pos+minMatchLook]
+
+	// We quit when we get a match that is at least nice bytes long
+	nice := len(win) - pos
+	if d.nice < nice {
+		nice = d.nice
+	}
+
+	// If we've got a match that's good enough, only look in 1/4 the chain.
+	tries := d.chain
+	length = prevLength
+	if length >= d.good {
+		tries >>= 2
+	}
+
+	wEnd := win[pos+length]
+	wPos := win[pos:]
+	minIndex := pos - windowSize
+
+	for i := prevHead; tries > 0; tries-- {
+		if wEnd == win[i+length] {
+			n := matchLen(win[i:i+minMatchLook], wPos)
+
+			if n > length && (n > minMatchLength || pos-i <= 4096) {
+				length = n
+				offset = pos - i
+				ok = true
+				if n >= nice {
+					// The match is good enough that we don't try to find a better one.
+					break
+				}
+				wEnd = win[pos+n]
+			}
+		}
+		if i == minIndex {
+			// hashPrev[i & windowMask] has already been overwritten, so stop now.
+			break
+		}
+		i = int(d.state.hashPrev[i&windowMask]) - d.state.hashOffset
+		if i < minIndex || i < 0 {
+			break
+		}
+	}
+	return
+}
+
+func (d *compressor) writeStoredBlock(buf []byte) error {
+	if d.w.writeStoredHeader(len(buf), false); d.w.err != nil {
+		return d.w.err
+	}
+	d.w.writeBytes(buf)
+	return d.w.err
+}
+
+// hash4 returns a hash representation of the first 4 bytes
+// of the supplied slice.
+// The caller must ensure that len(b) >= 4.
+func hash4(b []byte) uint32 {
+	b = b[:4]
+	return hash4u(uint32(b[3])|uint32(b[2])<<8|uint32(b[1])<<16|uint32(b[0])<<24, hashBits)
+}
+
+// bulkHash4 will compute hashes using the same
+// algorithm as hash4
+func bulkHash4(b []byte, dst []uint32) {
+	if len(b) < 4 {
+		return
+	}
+	hb := uint32(b[3]) | uint32(b[2])<<8 | uint32(b[1])<<16 | uint32(b[0])<<24
+	dst[0] = hash4u(hb, hashBits)
+	end := len(b) - 4 + 1
+	for i := 1; i < end; i++ {
+		hb = (hb << 8) | uint32(b[i+3])
+		dst[i] = hash4u(hb, hashBits)
+	}
+}
+
+func (d *compressor) initDeflate() {
+	d.window = make([]byte, 2*windowSize)
+	d.byteAvailable = false
+	d.err = nil
+	if d.state == nil {
+		return
+	}
+	s := d.state
+	s.index = 0
+	s.hashOffset = 1
+	s.length = minMatchLength - 1
+	s.offset = 0
+	s.hash = 0
+	s.chainHead = -1
+}
+
+// deflateLazy is the same as deflate, but with d.fastSkipHashing == skipNever,
+// meaning it always has lazy matching on.
+func (d *compressor) deflateLazy() {
+	s := d.state
+	// Sanity enables additional runtime tests.
+	// It's intended to be used during development
+	// to supplement the currently ad-hoc unit tests.
+	const sanity = debugDeflate
+
+	if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync {
+		return
+	}
+
+	s.maxInsertIndex = d.windowEnd - (minMatchLength - 1)
+	if s.index < s.maxInsertIndex {
+		s.hash = hash4(d.window[s.index : s.index+minMatchLength])
+	}
+
+	for {
+		if sanity && s.index > d.windowEnd {
+			panic("index > windowEnd")
+		}
+		lookahead := d.windowEnd - s.index
+		if lookahead < minMatchLength+maxMatchLength {
+			if !d.sync {
+				return
+			}
+			if sanity && s.index > d.windowEnd {
+				panic("index > windowEnd")
+			}
+			if lookahead == 0 {
+				// Flush current output block if any.
+				if d.byteAvailable {
+					// There is still one pending token that needs to be flushed
+					d.tokens.AddLiteral(d.window[s.index-1])
+					d.byteAvailable = false
+				}
+				if d.tokens.n > 0 {
+					if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+						return
+					}
+					d.tokens.Reset()
+				}
+				return
+			}
+		}
+		if s.index < s.maxInsertIndex {
+			// Update the hash
+			s.hash = hash4(d.window[s.index : s.index+minMatchLength])
+			ch := s.hashHead[s.hash&hashMask]
+			s.chainHead = int(ch)
+			s.hashPrev[s.index&windowMask] = ch
+			s.hashHead[s.hash&hashMask] = uint32(s.index + s.hashOffset)
+		}
+		prevLength := s.length
+		prevOffset := s.offset
+		s.length = minMatchLength - 1
+		s.offset = 0
+		minIndex := s.index - windowSize
+		if minIndex < 0 {
+			minIndex = 0
+		}
+
+		if s.chainHead-s.hashOffset >= minIndex && lookahead > prevLength && prevLength < d.lazy {
+			if newLength, newOffset, ok := d.findMatch(s.index, s.chainHead-s.hashOffset, minMatchLength-1, lookahead); ok {
+				s.length = newLength
+				s.offset = newOffset
+			}
+		}
+		if prevLength >= minMatchLength && s.length <= prevLength {
+			// There was a match at the previous step, and the current match is
+			// not better. Output the previous match.
+			d.tokens.AddMatch(uint32(prevLength-3), uint32(prevOffset-minOffsetSize))
+
+			// Insert in the hash table all strings up to the end of the match.
+			// index and index-1 are already inserted. If there is not enough
+			// lookahead, the last two strings are not inserted into the hash
+			// table.
+			var newIndex int
+			newIndex = s.index + prevLength - 1
+			// Calculate missing hashes
+			end := newIndex
+			if end > s.maxInsertIndex {
+				end = s.maxInsertIndex
+			}
+			end += minMatchLength - 1
+			startindex := s.index + 1
+			if startindex > s.maxInsertIndex {
+				startindex = s.maxInsertIndex
+			}
+			tocheck := d.window[startindex:end]
+			dstSize := len(tocheck) - minMatchLength + 1
+			if dstSize > 0 {
+				dst := s.hashMatch[:dstSize]
+				bulkHash4(tocheck, dst)
+				var newH uint32
+				for i, val := range dst {
+					di := i + startindex
+					newH = val & hashMask
+					// Get previous value with the same hash.
+					// Our chain should point to the previous value.
+					s.hashPrev[di&windowMask] = s.hashHead[newH]
+					// Set the head of the hash chain to us.
+					s.hashHead[newH] = uint32(di + s.hashOffset)
+				}
+				s.hash = newH
+			}
+
+			s.index = newIndex
+			d.byteAvailable = false
+			s.length = minMatchLength - 1
+			if d.tokens.n == maxFlateBlockTokens {
+				// The block includes the current character
+				if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+					return
+				}
+				d.tokens.Reset()
+			}
+		} else {
+			// Reset, if we got a match this run.
+			if s.length >= minMatchLength {
+				s.ii = 0
+			}
+			// We have a byte waiting. Emit it.
+			if d.byteAvailable {
+				s.ii++
+				d.tokens.AddLiteral(d.window[s.index-1])
+				if d.tokens.n == maxFlateBlockTokens {
+					if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+						return
+					}
+					d.tokens.Reset()
+				}
+				s.index++
+
+				// If we have a long run of no matches, skip additional bytes
+				// Resets when s.ii overflows after 64KB.
+				if s.ii > 31 {
+					n := int(s.ii >> 5)
+					for j := 0; j < n; j++ {
+						if s.index >= d.windowEnd-1 {
+							break
+						}
+
+						d.tokens.AddLiteral(d.window[s.index-1])
+						if d.tokens.n == maxFlateBlockTokens {
+							if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+								return
+							}
+							d.tokens.Reset()
+						}
+						s.index++
+					}
+					// Flush last byte
+					d.tokens.AddLiteral(d.window[s.index-1])
+					d.byteAvailable = false
+					// s.length = minMatchLength - 1 // not needed, since s.ii is reset above, so it should never be > minMatchLength
+					if d.tokens.n == maxFlateBlockTokens {
+						if d.err = d.writeBlock(&d.tokens, s.index, false); d.err != nil {
+							return
+						}
+						d.tokens.Reset()
+					}
+				}
+			} else {
+				s.index++
+				d.byteAvailable = true
+			}
+		}
+	}
+}
+
+func (d *compressor) store() {
+	if d.windowEnd > 0 && (d.windowEnd == maxStoreBlockSize || d.sync) {
+		d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+		d.windowEnd = 0
+	}
+}
+
+// fillBlock will fill the buffer with data for the store, huffman-only and fast compression modes.
+// The number of bytes copied is returned.
+func (d *compressor) fillBlock(b []byte) int {
+	n := copy(d.window[d.windowEnd:], b)
+	d.windowEnd += n
+	return n
+}
+
+// storeHuff will compress and store the currently added data,
+// if enough has been accumulated or we are at the end of the stream.
+// Any error that occurred will be in d.err
+func (d *compressor) storeHuff() {
+	if d.windowEnd < len(d.window) && !d.sync || d.windowEnd == 0 {
+		return
+	}
+	d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync)
+	d.err = d.w.err
+	d.windowEnd = 0
+}
+
+// storeFast will compress and store the currently added data,
+// if enough has been accumulated or we are at the end of the stream.
+// Any error that occurred will be in d.err
+func (d *compressor) storeFast() {
+	// We only compress if we have maxStoreBlockSize.
+	if d.windowEnd < len(d.window) {
+		if !d.sync {
+			return
+		}
+		// Handle extremely small sizes.
+		if d.windowEnd < 128 {
+			if d.windowEnd == 0 {
+				return
+			}
+			if d.windowEnd <= 32 {
+				d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+			} else {
+				d.w.writeBlockHuff(false, d.window[:d.windowEnd], true)
+				d.err = d.w.err
+			}
+			d.tokens.Reset()
+			d.windowEnd = 0
+			d.fast.Reset()
+			return
+		}
+	}
+
+	d.fast.Encode(&d.tokens, d.window[:d.windowEnd])
+	// If we made zero matches, store the block as is.
+	if d.tokens.n == 0 {
+		d.err = d.writeStoredBlock(d.window[:d.windowEnd])
+		// If we removed less than 1/16th, huffman compress the block.
+	} else if int(d.tokens.n) > d.windowEnd-(d.windowEnd>>4) {
+		d.w.writeBlockHuff(false, d.window[:d.windowEnd], d.sync)
+		d.err = d.w.err
+	} else {
+		d.w.writeBlockDynamic(&d.tokens, false, d.window[:d.windowEnd], d.sync)
+		d.err = d.w.err
+	}
+	d.tokens.Reset()
+	d.windowEnd = 0
+}
+
+// write will add the input bytes to the stream.
+// Unless an error occurs all bytes will be consumed.
+func (d *compressor) write(b []byte) (n int, err error) {
+	if d.err != nil {
+		return 0, d.err
+	}
+	n = len(b)
+	for len(b) > 0 {
+		d.step(d)
+		b = b[d.fill(d, b):]
+		if d.err != nil {
+			return 0, d.err
+		}
+	}
+	return n, d.err
+}
+
+func (d *compressor) syncFlush() error {
+	d.sync = true
+	if d.err != nil {
+		return d.err
+	}
+	d.step(d)
+	if d.err == nil {
+		d.w.writeStoredHeader(0, false)
+		d.w.flush()
+		d.err = d.w.err
+	}
+	d.sync = false
+	return d.err
+}
+
+func (d *compressor) init(w io.Writer, level int) (err error) {
+	d.w = newHuffmanBitWriter(w)
+
+	switch {
+	case level == NoCompression:
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).store
+	case level == ConstantCompression:
+		d.w.logNewTablePenalty = 4
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).storeHuff
+	case level == DefaultCompression:
+		level = 5
+		fallthrough
+	case level >= 1 && level <= 6:
+		d.w.logNewTablePenalty = 6
+		d.fast = newFastEnc(level)
+		d.window = make([]byte, maxStoreBlockSize)
+		d.fill = (*compressor).fillBlock
+		d.step = (*compressor).storeFast
+	case 7 <= level && level <= 9:
+		d.w.logNewTablePenalty = 10
+		d.state = &advancedState{}
+		d.compressionLevel = levels[level]
+		d.initDeflate()
+		d.fill = (*compressor).fillDeflate
+		d.step = (*compressor).deflateLazy
+	default:
+		return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level)
+	}
+	d.level = level
+	return nil
+}
+
+// reset the state of the compressor.
+func (d *compressor) reset(w io.Writer) {
+	d.w.reset(w)
+	d.sync = false
+	d.err = nil
+	// We only need to reset a few things for Snappy.
+	if d.fast != nil {
+		d.fast.Reset()
+		d.windowEnd = 0
+		d.tokens.Reset()
+		return
+	}
+	switch d.compressionLevel.chain {
+	case 0:
+		// level was NoCompression or ConstantCompression.
+		d.windowEnd = 0
+	default:
+		s := d.state
+		s.chainHead = -1
+		for i := range s.hashHead {
+			s.hashHead[i] = 0
+		}
+		for i := range s.hashPrev {
+			s.hashPrev[i] = 0
+		}
+		s.hashOffset = 1
+		s.index, d.windowEnd = 0, 0
+		d.blockStart, d.byteAvailable = 0, false
+		d.tokens.Reset()
+		s.length = minMatchLength - 1
+		s.offset = 0
+		s.hash = 0
+		s.ii = 0
+		s.maxInsertIndex = 0
+	}
+}
+
+func (d *compressor) close() error {
+	if d.err != nil {
+		return d.err
+	}
+	d.sync = true
+	d.step(d)
+	if d.err != nil {
+		return d.err
+	}
+	if d.w.writeStoredHeader(0, true); d.w.err != nil {
+		return d.w.err
+	}
+	d.w.flush()
+	d.w.reset(nil)
+	return d.w.err
+}
+
+// NewWriter returns a new Writer compressing data at the given level.
+// Following zlib, levels range from 1 (BestSpeed) to 9 (BestCompression);
+// higher levels typically run slower but compress more.
+// Level 0 (NoCompression) does not attempt any compression; it only adds the
+// necessary DEFLATE framing.
+// Level -1 (DefaultCompression) uses the default compression level.
+// Level -2 (ConstantCompression) will use Huffman compression only, giving
+// a very fast compression for all types of input, but sacrificing considerable
+// compression efficiency.
+//
+// If level is in the range [-2, 9] then the error returned will be nil.
+// Otherwise the error returned will be non-nil.
+func NewWriter(w io.Writer, level int) (*Writer, error) {
+	var dw Writer
+	if err := dw.d.init(w, level); err != nil {
+		return nil, err
+	}
+	return &dw, nil
+}
+
+// NewWriterDict is like NewWriter but initializes the new
+// Writer with a preset dictionary.  The returned Writer behaves
+// as if the dictionary had been written to it without producing
+// any compressed output.  The compressed data written to w
+// can only be decompressed by a Reader initialized with the
+// same dictionary.
+func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
+	zw, err := NewWriter(w, level)
+	if err != nil {
+		return nil, err
+	}
+	zw.d.fillWindow(dict)
+	zw.dict = append(zw.dict, dict...) // duplicate dictionary for Reset method.
+	return zw, err
+}
+
+// A Writer takes data written to it and writes the compressed
+// form of that data to an underlying writer (see NewWriter).
+type Writer struct {
+	d    compressor
+	dict []byte
+}
+
+// Write writes data to w, which will eventually write the
+// compressed form of data to its underlying writer.
+func (w *Writer) Write(data []byte) (n int, err error) {
+	return w.d.write(data)
+}
+
+// Flush flushes any pending data to the underlying writer.
+// It is useful mainly in compressed network protocols, to ensure that
+// a remote reader has enough data to reconstruct a packet.
+// Flush does not return until the data has been written.
+// Calling Flush when there is no pending data still causes the Writer
+// to emit a sync marker of at least 4 bytes.
+// If the underlying writer returns an error, Flush returns that error.
+//
+// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
+func (w *Writer) Flush() error {
+	// For more about flushing:
+	// http://www.bolet.org/~pornin/deflate-flush.html
+	return w.d.syncFlush()
+}
+
+// Close flushes and closes the writer.
+func (w *Writer) Close() error {
+	return w.d.close()
+}
+
+// Reset discards the writer's state and makes it equivalent to
+// the result of NewWriter or NewWriterDict called with dst
+// and w's level and dictionary.
+func (w *Writer) Reset(dst io.Writer) {
+	if len(w.dict) > 0 {
+		// w was created with NewWriterDict
+		w.d.reset(dst)
+		if dst != nil {
+			w.d.fillWindow(w.dict)
+		}
+	} else {
+		// w was created with NewWriter
+		w.d.reset(dst)
+	}
+}
+
+// ResetDict discards the writer's state and makes it equivalent to
+// the result of NewWriter or NewWriterDict called with dst
+// and w's level, but sets a specific dictionary.
+func (w *Writer) ResetDict(dst io.Writer, dict []byte) {
+	w.dict = dict
+	w.d.reset(dst)
+	w.d.fillWindow(w.dict)
+}
diff --git a/vendor/github.com/klauspost/compress/flate/dict_decoder.go b/vendor/github.com/klauspost/compress/flate/dict_decoder.go
new file mode 100644
index 00000000000..71c75a065ea
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/dict_decoder.go
@@ -0,0 +1,184 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// dictDecoder implements the LZ77 sliding dictionary as used in decompression.
+// LZ77 decompresses data through sequences of two forms of commands:
+//
+//	* Literal insertions: Runs of one or more symbols are inserted into the data
+//	stream as is. This is accomplished through the writeByte method for a
+//	single symbol, or combinations of writeSlice/writeMark for multiple symbols.
+//	Any valid stream must start with a literal insertion if no preset dictionary
+//	is used.
+//
+//	* Backward copies: Runs of one or more symbols are copied from previously
+//	emitted data. Backward copies come as the tuple (dist, length) where dist
+//	determines how far back in the stream to copy from and length determines how
+//	many bytes to copy. Note that it is valid for the length to be greater than
+//	the distance. Since LZ77 uses forward copies, that situation is used to
+//	perform a form of run-length encoding on repeated runs of symbols.
+//	The writeCopy and tryWriteCopy are used to implement this command.
+//
+// For performance reasons, this implementation performs little to no sanity
+// checks about the arguments. As such, the invariants documented for each
+// method call must be respected.
+type dictDecoder struct {
+	hist []byte // Sliding window history
+
+	// Invariant: 0 <= rdPos <= wrPos <= len(hist)
+	wrPos int  // Current output position in buffer
+	rdPos int  // Have emitted hist[:rdPos] already
+	full  bool // Has a full window length been written yet?
+}
+
+// init initializes dictDecoder to have a sliding window dictionary of the given
+// size. If a preset dict is provided, it will initialize the dictionary with
+// the contents of dict.
+func (dd *dictDecoder) init(size int, dict []byte) {
+	*dd = dictDecoder{hist: dd.hist}
+
+	if cap(dd.hist) < size {
+		dd.hist = make([]byte, size)
+	}
+	dd.hist = dd.hist[:size]
+
+	if len(dict) > len(dd.hist) {
+		dict = dict[len(dict)-len(dd.hist):]
+	}
+	dd.wrPos = copy(dd.hist, dict)
+	if dd.wrPos == len(dd.hist) {
+		dd.wrPos = 0
+		dd.full = true
+	}
+	dd.rdPos = dd.wrPos
+}
+
+// histSize reports the total amount of historical data in the dictionary.
+func (dd *dictDecoder) histSize() int {
+	if dd.full {
+		return len(dd.hist)
+	}
+	return dd.wrPos
+}
+
+// availRead reports the number of bytes that can be flushed by readFlush.
+func (dd *dictDecoder) availRead() int {
+	return dd.wrPos - dd.rdPos
+}
+
+// availWrite reports the available amount of output buffer space.
+func (dd *dictDecoder) availWrite() int {
+	return len(dd.hist) - dd.wrPos
+}
+
+// writeSlice returns a slice of the available buffer to write data to.
+//
+// This invariant will be kept: len(s) <= availWrite()
+func (dd *dictDecoder) writeSlice() []byte {
+	return dd.hist[dd.wrPos:]
+}
+
+// writeMark advances the writer pointer by cnt.
+//
+// This invariant must be kept: 0 <= cnt <= availWrite()
+func (dd *dictDecoder) writeMark(cnt int) {
+	dd.wrPos += cnt
+}
+
+// writeByte writes a single byte to the dictionary.
+//
+// This invariant must be kept: 0 < availWrite()
+func (dd *dictDecoder) writeByte(c byte) {
+	dd.hist[dd.wrPos] = c
+	dd.wrPos++
+}
+
+// writeCopy copies a string at a given (dist, length) to the output.
+// This returns the number of bytes copied and may be less than the requested
+// length if the available space in the output buffer is too small.
+//
+// This invariant must be kept: 0 < dist <= histSize()
+func (dd *dictDecoder) writeCopy(dist, length int) int {
+	dstBase := dd.wrPos
+	dstPos := dstBase
+	srcPos := dstPos - dist
+	endPos := dstPos + length
+	if endPos > len(dd.hist) {
+		endPos = len(dd.hist)
+	}
+
+	// Copy non-overlapping section after destination position.
+	//
+	// This section is non-overlapping in that the copy length for this section
+	// is always less than or equal to the backwards distance. This can occur
+	// if a distance refers to data that wraps-around in the buffer.
+	// Thus, a backwards copy is performed here; that is, the exact bytes in
+	// the source prior to the copy are placed in the destination.
+	if srcPos < 0 {
+		srcPos += len(dd.hist)
+		dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:])
+		srcPos = 0
+	}
+
+	// Copy possibly overlapping section before destination position.
+	//
+	// This section can overlap if the copy length for this section is larger
+	// than the backwards distance. This is allowed by LZ77 so that repeated
+	// strings can be succinctly represented using (dist, length) pairs.
+	// Thus, a forwards copy is performed here; that is, the bytes copied are
+	// possibly dependent on the resulting bytes in the destination as the copy
+	// progresses along. This is functionally equivalent to the following:
+	//
+	//	for i := 0; i < endPos-dstPos; i++ {
+	//		dd.hist[dstPos+i] = dd.hist[srcPos+i]
+	//	}
+	//	dstPos = endPos
+	//
+	for dstPos < endPos {
+		dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos])
+	}
+
+	dd.wrPos = dstPos
+	return dstPos - dstBase
+}
+
+// tryWriteCopy tries to copy a string at a given (distance, length) to the
+// output. This specialized version is optimized for short distances.
+//
+// This method is designed to be inlined for performance reasons.
+//
+// This invariant must be kept: 0 < dist <= histSize()
+func (dd *dictDecoder) tryWriteCopy(dist, length int) int {
+	dstPos := dd.wrPos
+	endPos := dstPos + length
+	if dstPos < dist || endPos > len(dd.hist) {
+		return 0
+	}
+	dstBase := dstPos
+	srcPos := dstPos - dist
+
+	// Copy possibly overlapping section before destination position.
+loop:
+	dstPos += copy(dd.hist[dstPos:endPos], dd.hist[srcPos:dstPos])
+	if dstPos < endPos {
+		goto loop // Avoid for-loop so that this function can be inlined
+	}
+
+	dd.wrPos = dstPos
+	return dstPos - dstBase
+}
+
+// readFlush returns a slice of the historical buffer that is ready to be
+// emitted to the user. The data returned by readFlush must be fully consumed
+// before calling any other dictDecoder methods.
+func (dd *dictDecoder) readFlush() []byte {
+	toRead := dd.hist[dd.rdPos:dd.wrPos]
+	dd.rdPos = dd.wrPos
+	if dd.wrPos == len(dd.hist) {
+		dd.wrPos, dd.rdPos = 0, 0
+		dd.full = true
+	}
+	return toRead
+}
diff --git a/vendor/github.com/klauspost/compress/flate/fast_encoder.go b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
new file mode 100644
index 00000000000..6d4c1e98bc5
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@@ -0,0 +1,254 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Modified for deflate by Klaus Post (c) 2015.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"fmt"
+	"math/bits"
+)
+
+type fastEnc interface {
+	Encode(dst *tokens, src []byte)
+	Reset()
+}
+
+func newFastEnc(level int) fastEnc {
+	switch level {
+	case 1:
+		return &fastEncL1{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 2:
+		return &fastEncL2{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 3:
+		return &fastEncL3{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 4:
+		return &fastEncL4{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 5:
+		return &fastEncL5{fastGen: fastGen{cur: maxStoreBlockSize}}
+	case 6:
+		return &fastEncL6{fastGen: fastGen{cur: maxStoreBlockSize}}
+	default:
+		panic("invalid level specified")
+	}
+}
+
+const (
+	tableBits       = 15             // Bits used in the table
+	tableSize       = 1 << tableBits // Size of the table
+	tableShift      = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
+	baseMatchOffset = 1              // The smallest match offset
+	baseMatchLength = 3              // The smallest match length per the RFC section 3.2.5
+	maxMatchOffset  = 1 << 15        // The largest match offset
+
+	bTableBits   = 17                                               // Bits used in the big tables
+	bTableSize   = 1 << bTableBits                                  // Size of the table
+	allocHistory = maxStoreBlockSize * 10                           // Size to preallocate for history.
+	bufferReset  = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this.
+)
+
+const (
+	prime3bytes = 506832829
+	prime4bytes = 2654435761
+	prime5bytes = 889523592379
+	prime6bytes = 227718039650203
+	prime7bytes = 58295818150454627
+	prime8bytes = 0xcf1bbcdcb7a56463
+)
+
+func load32(b []byte, i int) uint32 {
+	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:4]
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load64(b []byte, i int) uint64 {
+	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:8]
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func load3232(b []byte, i int32) uint32 {
+	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:4]
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load6432(b []byte, i int32) uint64 {
+	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:8]
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func hash(u uint32) uint32 {
+	return (u * 0x1e35a7bd) >> tableShift
+}
+
+type tableEntry struct {
+	offset int32
+}
+
+// fastGen maintains the table for matches,
+// and the previous byte block for level 2.
+// This is the generic implementation.
+type fastGen struct {
+	hist []byte
+	cur  int32
+}
+
+func (e *fastGen) addBlock(src []byte) int32 {
+	// check if we have space already
+	if len(e.hist)+len(src) > cap(e.hist) {
+		if cap(e.hist) == 0 {
+			e.hist = make([]byte, 0, allocHistory)
+		} else {
+			if cap(e.hist) < maxMatchOffset*2 {
+				panic("unexpected buffer size")
+			}
+			// Move down
+			offset := int32(len(e.hist)) - maxMatchOffset
+			copy(e.hist[0:maxMatchOffset], e.hist[offset:])
+			e.cur += offset
+			e.hist = e.hist[:maxMatchOffset]
+		}
+	}
+	s := int32(len(e.hist))
+	e.hist = append(e.hist, src...)
+	return s
+}
+
+// hash4u returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4u(u uint32, h uint8) uint32 {
+	return (u * prime4bytes) >> ((32 - h) & 31)
+}
+
+type tableEntryPrev struct {
+	Cur  tableEntry
+	Prev tableEntry
+}
+
+// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4x64(u uint64, h uint8) uint32 {
+	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
+}
+
+// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash7(u uint64, h uint8) uint32 {
+	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
+}
+
+// hash8 returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash8(u uint64, h uint8) uint32 {
+	return uint32((u * prime8bytes) >> ((64 - h) & 63))
+}
+
+// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash6(u uint64, h uint8) uint32 {
+	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
+}
+
+// matchlen will return the match length between offsets s and t in src.
+// The maximum length returned is maxMatchLength - 4.
+// It is assumed that s > t, that t >=0 and s < len(src).
+func (e *fastGen) matchlen(s, t int32, src []byte) int32 {
+	if debugDecode {
+		if t >= s {
+			panic(fmt.Sprint("t >=s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > maxMatchOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
+		}
+	}
+	s1 := int(s) + maxMatchLength - 4
+	if s1 > len(src) {
+		s1 = len(src)
+	}
+
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:s1], src[t:]))
+}
+
+// matchlenLong will return the match length between offsets s and t in src.
+// It is assumed that s > t, that t >=0 and s < len(src).
+func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {
+	if debugDecode {
+		if t >= s {
+			panic(fmt.Sprint("t >=s:", t, s))
+		}
+		if int(s) >= len(src) {
+			panic(fmt.Sprint("s >= len(src):", s, len(src)))
+		}
+		if t < 0 {
+			panic(fmt.Sprint("t < 0:", t))
+		}
+		if s-t > maxMatchOffset {
+			panic(fmt.Sprint(s, "-", t, "(", s-t, ") > maxMatchLength (", maxMatchOffset, ")"))
+		}
+	}
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:], src[t:]))
+}
+
+// Reset the encoding table.
+func (e *fastGen) Reset() {
+	if cap(e.hist) < allocHistory {
+		e.hist = make([]byte, 0, allocHistory)
+	}
+	// We offset current position so everything will be out of reach.
+	// If we are above the buffer reset it will be cleared anyway since len(hist) == 0.
+	if e.cur <= bufferReset {
+		e.cur += maxMatchOffset + int32(len(e.hist))
+	}
+	e.hist = e.hist[:0]
+}
+
+// matchLen returns the number of leading bytes that a and b have in common.
+// 'a' must be the shorter of the two.
+func matchLen(a, b []byte) int {
+	b = b[:len(a)]
+	var checked int
+	if len(a) > 4 {
+		// Try 4 bytes first
+		if diff := load32(a, 0) ^ load32(b, 0); diff != 0 {
+			return bits.TrailingZeros32(diff) >> 3
+		}
+		// Switch to 8 byte matching.
+		checked = 4
+		a = a[4:]
+		b = b[4:]
+		for len(a) >= 8 {
+			b = b[:len(a)]
+			if diff := load64(a, 0) ^ load64(b, 0); diff != 0 {
+				return checked + (bits.TrailingZeros64(diff) >> 3)
+			}
+			checked += 8
+			a = a[8:]
+			b = b[8:]
+		}
+	}
+	b = b[:len(a)]
+	for i := range a {
+		if a[i] != b[i] {
+			return int(i) + checked
+		}
+	}
+	return len(a) + checked
+}
diff --git a/vendor/github.com/klauspost/compress/flate/gen_inflate.go b/vendor/github.com/klauspost/compress/flate/gen_inflate.go
new file mode 100644
index 00000000000..c74a95fe7f6
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/gen_inflate.go
@@ -0,0 +1,274 @@
+// +build generate
+
+//go:generate go run $GOFILE && gofmt -w inflate_gen.go
+
+package main
+
+import (
+	"os"
+	"strings"
+)
+
+func main() {
+	f, err := os.Create("inflate_gen.go")
+	if err != nil {
+		panic(err)
+	}
+	defer f.Close()
+	types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader"}
+	names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader"}
+	imports := []string{"bytes", "bufio", "io", "strings", "math/bits"}
+	f.WriteString(`// Code generated by go generate gen_inflate.go. DO NOT EDIT.
+
+package flate
+
+import (
+`)
+
+	for _, imp := range imports {
+		f.WriteString("\t\"" + imp + "\"\n")
+	}
+	f.WriteString(")\n\n")
+
+	template := `
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) $FUNCNAME$() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.($TYPE$)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).$FUNCNAME$
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).$FUNCNAME$ // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+`
+	for i, t := range types {
+		s := strings.Replace(template, "$FUNCNAME$", "huffman"+names[i], -1)
+		s = strings.Replace(s, "$TYPE$", t, -1)
+		f.WriteString(s)
+	}
+	f.WriteString("func (f *decompressor) huffmanBlockDecoder() func() {\n")
+	f.WriteString("\tswitch f.r.(type) {\n")
+	for i, t := range types {
+		f.WriteString("\t\tcase " + t + ":\n")
+		f.WriteString("\t\t\treturn f.huffman" + names[i] + "\n")
+	}
+	f.WriteString("\t\tdefault:\n")
+	f.WriteString("\t\t\treturn f.huffmanBlockGeneric")
+	f.WriteString("\t}\n}\n")
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
new file mode 100644
index 00000000000..53fe1d06e25
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@@ -0,0 +1,911 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"io"
+)
+
+const (
+	// The largest offset code.
+	offsetCodeCount = 30
+
+	// The special code used to mark the end of a block.
+	endBlockMarker = 256
+
+	// The first length code.
+	lengthCodesStart = 257
+
+	// The number of codegen codes.
+	codegenCodeCount = 19
+	badCode          = 255
+
+	// bufferFlushSize indicates the buffer size
+	// after which bytes are flushed to the writer.
+	// Should preferably be a multiple of 6, since
+	// we accumulate 6 bytes between writes to the buffer.
+	bufferFlushSize = 240
+
+	// bufferSize is the actual output byte buffer size.
+	// It must have additional headroom for a flush
+	// which can contain up to 8 bytes.
+	bufferSize = bufferFlushSize + 8
+)
+
+// The number of extra bits needed by length code X - lengthCodesStart.
+var lengthExtraBits = [32]int8{
+	/* 257 */ 0, 0, 0,
+	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
+	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
+	/* 280 */ 4, 5, 5, 5, 5, 0,
+}
+
+// The length indicated by length code X - lengthCodesStart.
+var lengthBase = [32]uint8{
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
+	12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
+	64, 80, 96, 112, 128, 160, 192, 224, 255,
+}
+
+// offset code word extra bits.
+var offsetExtraBits = [64]int8{
+	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
+	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
+	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+	/* extended window */
+	14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
+}
+
+var offsetBase = [64]uint32{
+	/* normal deflate */
+	0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
+	0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
+	0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
+	0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
+	0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
+	0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
+
+	/* extended window */
+	0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
+	0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
+	0x100000, 0x180000, 0x200000, 0x300000,
+}
+
+// The odd order in which the codegen code sizes are written.
+var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
+
+type huffmanBitWriter struct {
+	// writer is the underlying writer.
+	// Do not use it directly; use the write method, which ensures
+	// that Write errors are sticky.
+	writer io.Writer
+
+	// Data waiting to be written is bytes[0:nbytes]
+	// and then the low nbits of bits.
+	bits            uint64
+	nbits           uint16
+	nbytes          uint8
+	literalEncoding *huffmanEncoder
+	offsetEncoding  *huffmanEncoder
+	codegenEncoding *huffmanEncoder
+	err             error
+	lastHeader      int
+	// Set between 0 (reused block can be up to 2x the size)
+	logNewTablePenalty uint
+	lastHuffMan        bool
+	bytes              [256]byte
+	literalFreq        [lengthCodesStart + 32]uint16
+	offsetFreq         [32]uint16
+	codegenFreq        [codegenCodeCount]uint16
+
+	// codegen must have an extra space for the final symbol.
+	codegen [literalCount + offsetCodeCount + 1]uint8
+}
+
+// Huffman reuse.
+//
+// The huffmanBitWriter supports reusing huffman tables and thereby combining block sections.
+//
+// This is controlled by several variables:
+//
+// If lastHeader is non-zero the Huffman table can be reused.
+// This also indicates that a Huffman table has been generated that can output all
+// possible symbols.
+// It also indicates that an EOB has not yet been emitted, so if a new table is generated
+// an EOB with the previous table must be written.
+//
+// If lastHuffMan is set, a table for outputting literals has been generated and offsets are invalid.
+//
+// An incoming block estimates the output size of using a 'fresh' table by calculating the
+// optimal size and adding a penalty in 'logNewTablePenalty'.
+// A Huffman table is not optimal, which is why we add a penalty, and generating a new table
+// is slower both for compression and decompression.
+
+func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
+	return &huffmanBitWriter{
+		writer:          w,
+		literalEncoding: newHuffmanEncoder(literalCount),
+		codegenEncoding: newHuffmanEncoder(codegenCodeCount),
+		offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
+	}
+}
+
+func (w *huffmanBitWriter) reset(writer io.Writer) {
+	w.writer = writer
+	w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil
+	w.lastHeader = 0
+	w.lastHuffMan = false
+}
+
+func (w *huffmanBitWriter) canReuse(t *tokens) (offsets, lits bool) {
+	offsets, lits = true, true
+	a := t.offHist[:offsetCodeCount]
+	b := w.offsetFreq[:len(a)]
+	for i := range a {
+		if b[i] == 0 && a[i] != 0 {
+			offsets = false
+			break
+		}
+	}
+
+	a = t.extraHist[:literalCount-256]
+	b = w.literalFreq[256:literalCount]
+	b = b[:len(a)]
+	for i := range a {
+		if b[i] == 0 && a[i] != 0 {
+			lits = false
+			break
+		}
+	}
+	if lits {
+		a = t.litHist[:]
+		b = w.literalFreq[:len(a)]
+		for i := range a {
+			if b[i] == 0 && a[i] != 0 {
+				lits = false
+				break
+			}
+		}
+	}
+	return
+}
+
+func (w *huffmanBitWriter) flush() {
+	if w.err != nil {
+		w.nbits = 0
+		return
+	}
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+	n := w.nbytes
+	for w.nbits != 0 {
+		w.bytes[n] = byte(w.bits)
+		w.bits >>= 8
+		if w.nbits > 8 { // Avoid underflow
+			w.nbits -= 8
+		} else {
+			w.nbits = 0
+		}
+		n++
+	}
+	w.bits = 0
+	w.write(w.bytes[:n])
+	w.nbytes = 0
+}
+
+func (w *huffmanBitWriter) write(b []byte) {
+	if w.err != nil {
+		return
+	}
+	_, w.err = w.writer.Write(b)
+}
+
+func (w *huffmanBitWriter) writeBits(b int32, nb uint16) {
+	w.bits |= uint64(b) << (w.nbits & 63)
+	w.nbits += nb
+	if w.nbits >= 48 {
+		w.writeOutBits()
+	}
+}
+
+func (w *huffmanBitWriter) writeBytes(bytes []byte) {
+	if w.err != nil {
+		return
+	}
+	n := w.nbytes
+	if w.nbits&7 != 0 {
+		w.err = InternalError("writeBytes with unfinished bits")
+		return
+	}
+	for w.nbits != 0 {
+		w.bytes[n] = byte(w.bits)
+		w.bits >>= 8
+		w.nbits -= 8
+		n++
+	}
+	if n != 0 {
+		w.write(w.bytes[:n])
+	}
+	w.nbytes = 0
+	w.write(bytes)
+}
+
+// RFC 1951 3.2.7 specifies a special run-length encoding for specifying
+// the literal and offset lengths arrays (which are concatenated into a single
+// array).  This method generates that run-length encoding.
+//
+// The result is written into the codegen array, and the frequency
+// of each code is written into the codegenFreq array.
+// Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
+// information. Code badCode is an end marker.
+//
+//  numLiterals      The number of literals in literalEncoding
+//  numOffsets       The number of offsets in offsetEncoding
+//  litenc, offenc   The literal and offset encoder to use
+func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int, litEnc, offEnc *huffmanEncoder) {
+	for i := range w.codegenFreq {
+		w.codegenFreq[i] = 0
+	}
+	// Note that we are using codegen both as a temporary variable for holding
+	// a copy of the frequencies, and as the place where we put the result.
+	// This is fine because the output is always shorter than the input used
+	// so far.
+	codegen := w.codegen[:] // cache
+	// Copy the concatenated code sizes to codegen. Put a marker at the end.
+	cgnl := codegen[:numLiterals]
+	for i := range cgnl {
+		cgnl[i] = uint8(litEnc.codes[i].len)
+	}
+
+	cgnl = codegen[numLiterals : numLiterals+numOffsets]
+	for i := range cgnl {
+		cgnl[i] = uint8(offEnc.codes[i].len)
+	}
+	codegen[numLiterals+numOffsets] = badCode
+
+	size := codegen[0]
+	count := 1
+	outIndex := 0
+	for inIndex := 1; size != badCode; inIndex++ {
+		// INVARIANT: We have seen "count" copies of size that have not yet
+		// had output generated for them.
+		nextSize := codegen[inIndex]
+		if nextSize == size {
+			count++
+			continue
+		}
+		// We need to generate codegen indicating "count" of size.
+		if size != 0 {
+			codegen[outIndex] = size
+			outIndex++
+			w.codegenFreq[size]++
+			count--
+			for count >= 3 {
+				n := 6
+				if n > count {
+					n = count
+				}
+				codegen[outIndex] = 16
+				outIndex++
+				codegen[outIndex] = uint8(n - 3)
+				outIndex++
+				w.codegenFreq[16]++
+				count -= n
+			}
+		} else {
+			for count >= 11 {
+				n := 138
+				if n > count {
+					n = count
+				}
+				codegen[outIndex] = 18
+				outIndex++
+				codegen[outIndex] = uint8(n - 11)
+				outIndex++
+				w.codegenFreq[18]++
+				count -= n
+			}
+			if count >= 3 {
+				// count >= 3 && count <= 10
+				codegen[outIndex] = 17
+				outIndex++
+				codegen[outIndex] = uint8(count - 3)
+				outIndex++
+				w.codegenFreq[17]++
+				count = 0
+			}
+		}
+		count--
+		for ; count >= 0; count-- {
+			codegen[outIndex] = size
+			outIndex++
+			w.codegenFreq[size]++
+		}
+		// Set up invariant for next time through the loop.
+		size = nextSize
+		count = 1
+	}
+	// Marker indicating the end of the codegen.
+	codegen[outIndex] = badCode
+}
+
+func (w *huffmanBitWriter) codegens() int {
+	numCodegens := len(w.codegenFreq)
+	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
+		numCodegens--
+	}
+	return numCodegens
+}
+
+func (w *huffmanBitWriter) headerSize() (size, numCodegens int) {
+	numCodegens = len(w.codegenFreq)
+	for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
+		numCodegens--
+	}
+	return 3 + 5 + 5 + 4 + (3 * numCodegens) +
+		w.codegenEncoding.bitLength(w.codegenFreq[:]) +
+		int(w.codegenFreq[16])*2 +
+		int(w.codegenFreq[17])*3 +
+		int(w.codegenFreq[18])*7, numCodegens
+}
+
+// dynamicReuseSize returns the size in bits of the data encoded with litEnc and offEnc, without a table header.
+func (w *huffmanBitWriter) dynamicReuseSize(litEnc, offEnc *huffmanEncoder) (size int) {
+	size = litEnc.bitLength(w.literalFreq[:]) +
+		offEnc.bitLength(w.offsetFreq[:])
+	return size
+}
+
+// dynamicSize returns the size of dynamically encoded data in bits.
+func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
+	header, numCodegens := w.headerSize()
+	size = header +
+		litEnc.bitLength(w.literalFreq[:]) +
+		offEnc.bitLength(w.offsetFreq[:]) +
+		extraBits
+	return size, numCodegens
+}
+
+// extraBitSize will return the number of bits that will be written
+// as "extra" bits on matches.
+func (w *huffmanBitWriter) extraBitSize() int {
+	total := 0
+	for i, n := range w.literalFreq[257:literalCount] {
+		total += int(n) * int(lengthExtraBits[i&31])
+	}
+	for i, n := range w.offsetFreq[:offsetCodeCount] {
+		total += int(n) * int(offsetExtraBits[i&31])
+	}
+	return total
+}
+
+// fixedSize returns the size of fixed Huffman encoded data in bits.
+func (w *huffmanBitWriter) fixedSize(extraBits int) int {
+	return 3 +
+		fixedLiteralEncoding.bitLength(w.literalFreq[:]) +
+		fixedOffsetEncoding.bitLength(w.offsetFreq[:]) +
+		extraBits
+}
+
+// storedSize calculates the stored size, including header.
+// The function returns the size in bits and whether the block
+// fits inside a single block.
+func (w *huffmanBitWriter) storedSize(in []byte) (int, bool) {
+	if in == nil {
+		return 0, false
+	}
+	if len(in) <= maxStoreBlockSize {
+		return (len(in) + 5) * 8, true
+	}
+	return 0, false
+}
+
+func (w *huffmanBitWriter) writeCode(c hcode) {
+	// The function does not get inlined if we "& 63" the shift.
+	w.bits |= uint64(c.code) << w.nbits
+	w.nbits += c.len
+	if w.nbits >= 48 {
+		w.writeOutBits()
+	}
+}
+
+// writeOutBits will write bits to the buffer.
+func (w *huffmanBitWriter) writeOutBits() {
+	bits := w.bits
+	w.bits >>= 48
+	w.nbits -= 48
+	n := w.nbytes
+	w.bytes[n] = byte(bits)
+	w.bytes[n+1] = byte(bits >> 8)
+	w.bytes[n+2] = byte(bits >> 16)
+	w.bytes[n+3] = byte(bits >> 24)
+	w.bytes[n+4] = byte(bits >> 32)
+	w.bytes[n+5] = byte(bits >> 40)
+	n += 6
+	if n >= bufferFlushSize {
+		if w.err != nil {
+			n = 0
+			return
+		}
+		w.write(w.bytes[:n])
+		n = 0
+	}
+	w.nbytes = n
+}
+
+// Write the header of a dynamic Huffman block to the output stream.
+//
+//  numLiterals  The number of literals specified in codegen
+//  numOffsets   The number of offsets specified in codegen
+//  numCodegens  The number of codegens used in codegen
+func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {
+	if w.err != nil {
+		return
+	}
+	var firstBits int32 = 4
+	if isEof {
+		firstBits = 5
+	}
+	w.writeBits(firstBits, 3)
+	w.writeBits(int32(numLiterals-257), 5)
+	w.writeBits(int32(numOffsets-1), 5)
+	w.writeBits(int32(numCodegens-4), 4)
+
+	for i := 0; i < numCodegens; i++ {
+		value := uint(w.codegenEncoding.codes[codegenOrder[i]].len)
+		w.writeBits(int32(value), 3)
+	}
+
+	i := 0
+	for {
+		var codeWord = uint32(w.codegen[i])
+		i++
+		if codeWord == badCode {
+			break
+		}
+		w.writeCode(w.codegenEncoding.codes[codeWord])
+
+		switch codeWord {
+		case 16:
+			w.writeBits(int32(w.codegen[i]), 2)
+			i++
+		case 17:
+			w.writeBits(int32(w.codegen[i]), 3)
+			i++
+		case 18:
+			w.writeBits(int32(w.codegen[i]), 7)
+			i++
+		}
+	}
+}
+
+// writeStoredHeader will write a stored header.
+// If the stored block is only used for EOF,
+// it is replaced with a fixed huffman block.
+func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
+	if w.err != nil {
+		return
+	}
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+
+	// To write EOF, use a fixed encoding block. 10 bits instead of 5 bytes.
+	if length == 0 && isEof {
+		w.writeFixedHeader(isEof)
+		// EOB: 7 bits, value: 0
+		w.writeBits(0, 7)
+		w.flush()
+		return
+	}
+
+	var flag int32
+	if isEof {
+		flag = 1
+	}
+	w.writeBits(flag, 3)
+	w.flush()
+	w.writeBits(int32(length), 16)
+	w.writeBits(int32(^uint16(length)), 16)
+}
+
+func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
+	if w.err != nil {
+		return
+	}
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+
+	// Indicate that we are a fixed Huffman block
+	var value int32 = 2
+	if isEof {
+		value = 3
+	}
+	w.writeBits(value, 3)
+}
+
+// writeBlock will write a block of tokens with the smallest encoding.
+// The original input can be supplied, and if the huffman encoded data
+// is larger than the original bytes, the data will be written as a
+// stored block.
+// If the input is nil, the tokens will always be Huffman encoded.
+func (w *huffmanBitWriter) writeBlock(tokens *tokens, eof bool, input []byte) {
+	if w.err != nil {
+		return
+	}
+
+	tokens.AddEOB()
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
+	numLiterals, numOffsets := w.indexTokens(tokens, false)
+	w.generate(tokens)
+	var extraBits int
+	storedSize, storable := w.storedSize(input)
+	if storable {
+		extraBits = w.extraBitSize()
+	}
+
+	// Figure out smallest code.
+	// Fixed Huffman baseline.
+	var literalEncoding = fixedLiteralEncoding
+	var offsetEncoding = fixedOffsetEncoding
+	var size = w.fixedSize(extraBits)
+
+	// Dynamic Huffman?
+	var numCodegens int
+
+	// Generate codegen and codegenFrequencies, which indicates how to encode
+	// the literalEncoding and the offsetEncoding.
+	w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
+	w.codegenEncoding.generate(w.codegenFreq[:], 7)
+	dynamicSize, numCodegens := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extraBits)
+
+	if dynamicSize < size {
+		size = dynamicSize
+		literalEncoding = w.literalEncoding
+		offsetEncoding = w.offsetEncoding
+	}
+
+	// Stored bytes?
+	if storable && storedSize < size {
+		w.writeStoredHeader(len(input), eof)
+		w.writeBytes(input)
+		return
+	}
+
+	// Huffman.
+	if literalEncoding == fixedLiteralEncoding {
+		w.writeFixedHeader(eof)
+	} else {
+		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+	}
+
+	// Write the tokens.
+	w.writeTokens(tokens.Slice(), literalEncoding.codes, offsetEncoding.codes)
+}
+
+// writeBlockDynamic encodes a block of tokens using a dynamic Huffman table,
+// reusing the previous block's table when that is estimated to be smaller.
+// This should be used if the symbols have a disproportionate histogram.
+// If input is storable and its stored size is below the estimated encoded
+// size plus 1/16th (size + size>>4), the block is written stored instead.
+func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []byte, sync bool) {
+	if w.err != nil {
+		return
+	}
+
+	sync = sync || eof // eof implies sync
+	if sync {
+		tokens.AddEOB()
+	}
+
+	// A table from a Huffman-only block is not reused, nor is any table at eof.
+	if (w.lastHuffMan || eof) && w.lastHeader > 0 {
+		// Close the open block with the EOB we owe and forget its header.
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+		w.lastHuffMan = false
+	}
+	if !sync {
+		tokens.Fill()
+	}
+	numLiterals, numOffsets := w.indexTokens(tokens, !sync)
+
+	var size int
+	// Check if we should reuse.
+	if w.lastHeader > 0 {
+		// Estimate size for using a new table.
+		// Use the previous header size as the best estimate.
+		newSize := w.lastHeader + tokens.EstimatedBits()
+		newSize += newSize >> w.logNewTablePenalty // penalty on the new-table estimate, favoring reuse
+
+		// Size when the encodings already in use are kept (dynamicReuseSize),
+		// plus the extra bits of the tokens.
+		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + w.extraBitSize()
+
+		// Check if a new table is better.
+		if newSize < reuseSize {
+			// Write the EOB we owe.
+			w.writeCode(w.literalEncoding.codes[endBlockMarker])
+			size = newSize
+			w.lastHeader = 0 // forces the "new table" branch below
+		} else {
+			size = reuseSize
+		}
+		// Check if we get a reasonable size decrease.
+		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			w.lastHeader = 0
+			return
+		}
+	}
+
+	// We want a new block/table
+	if w.lastHeader == 0 {
+		w.generate(tokens)
+		// Generate codegen and codegenFrequencies, which indicates how to encode
+		// the literalEncoding and the offsetEncoding.
+		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
+		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+		var numCodegens int
+		size, numCodegens = w.dynamicSize(w.literalEncoding, w.offsetEncoding, w.extraBitSize())
+		// Store bytes, if we don't get a reasonable improvement.
+		if ssize, storable := w.storedSize(input); storable && ssize < (size+size>>4) {
+			w.writeStoredHeader(len(input), eof)
+			w.writeBytes(input)
+			w.lastHeader = 0
+			return
+		}
+
+		// Write Huffman table.
+		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+		w.lastHeader, _ = w.headerSize() // remembered as the header estimate for the next block
+		w.lastHuffMan = false
+	}
+
+	if sync {
+		w.lastHeader = 0 // tokens end with an EOB, so the next block cannot reuse this table
+	}
+	// Write the tokens.
+	w.writeTokens(tokens.Slice(), w.literalEncoding.codes, w.offsetEncoding.codes)
+}
+
+// indexTokens copies the histograms of t into literalFreq and offsetFreq.
+// It does not generate literalEncoding or offsetEncoding; see generate.
+// It returns (0, 0) when t.n == 0, (maxNumLit, maxNumDist) when filled is set,
+// and otherwise the index past the last non-zero literal and offset frequency.
+func (w *huffmanBitWriter) indexTokens(t *tokens, filled bool) (numLiterals, numOffsets int) {
+	copy(w.literalFreq[:], t.litHist[:])
+	copy(w.literalFreq[256:], t.extraHist[:])
+	copy(w.offsetFreq[:], t.offHist[:offsetCodeCount])
+
+	if t.n == 0 {
+		return
+	}
+	if filled {
+		return maxNumLit, maxNumDist
+	}
+	// get the number of literals
+	numLiterals = len(w.literalFreq)
+	for w.literalFreq[numLiterals-1] == 0 { // NOTE(review): no lower bound; assumes a non-zero literal frequency exists — confirm
+		numLiterals--
+	}
+	// get the number of offsets
+	numOffsets = len(w.offsetFreq)
+	for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
+		numOffsets--
+	}
+	if numOffsets == 0 {
+		// We haven't found a single match. If we want to go with the dynamic encoding,
+		// we should count at least one offset to be sure that the offset huffman tree could be encoded.
+		w.offsetFreq[0] = 1
+		numOffsets = 1
+	}
+	return
+}
+
+func (w *huffmanBitWriter) generate(t *tokens) { // t is unused; frequencies are read from w.literalFreq and w.offsetFreq
+	w.literalEncoding.generate(w.literalFreq[:literalCount], 15) // literal/length codes, at most 15 bits
+	w.offsetEncoding.generate(w.offsetFreq[:offsetCodeCount], 15) // offset codes, at most 15 bits
+}
+
+// writeTokens writes tokens to the output, using leCodes for literals and
+// lengths and oeCodes for offsets. It is a no-op if w.err is set or tokens is empty.
+func (w *huffmanBitWriter) writeTokens(tokens []token, leCodes, oeCodes []hcode) {
+	if w.err != nil {
+		return
+	}
+	if len(tokens) == 0 {
+		return
+	}
+
+	// Only last token should be endBlockMarker.
+	var deferEOB bool
+	if tokens[len(tokens)-1] == endBlockMarker {
+		tokens = tokens[:len(tokens)-1]
+		deferEOB = true // the marker is written after the loop from leCodes
+	}
+
+	// Create slices up to the next power of two to avoid bounds checks.
+	lits := leCodes[:256]
+	offs := oeCodes[:32]
+	lengths := leCodes[lengthCodesStart:]
+	lengths = lengths[:32]
+	for _, t := range tokens {
+		if t < matchType { // values below matchType are plain literals
+			w.writeCode(lits[t.literal()])
+			continue
+		}
+
+		// Write the length
+		length := t.length()
+		lengthCode := lengthCode(length)
+		if false {
+			w.writeCode(lengths[lengthCode&31])
+		} else {
+			// inlined
+			c := lengths[lengthCode&31]
+			w.bits |= uint64(c.code) << (w.nbits & 63)
+			w.nbits += c.len
+			if w.nbits >= 48 {
+				w.writeOutBits()
+			}
+		}
+
+		extraLengthBits := uint16(lengthExtraBits[lengthCode&31])
+		if extraLengthBits > 0 {
+			extraLength := int32(length - lengthBase[lengthCode&31])
+			w.writeBits(extraLength, extraLengthBits)
+		}
+		// Write the offset
+		offset := t.offset()
+		offsetCode := offsetCode(offset)
+		if false {
+			w.writeCode(offs[offsetCode&31])
+		} else {
+			// inlined
+			c := offs[offsetCode&31]
+			w.bits |= uint64(c.code) << (w.nbits & 63)
+			w.nbits += c.len
+			if w.nbits >= 48 {
+				w.writeOutBits()
+			}
+		}
+		extraOffsetBits := uint16(offsetExtraBits[offsetCode&63])
+		if extraOffsetBits > 0 {
+			extraOffset := int32(offset - offsetBase[offsetCode&63])
+			w.writeBits(extraOffset, extraOffsetBits)
+		}
+	}
+	if deferEOB {
+		w.writeCode(leCodes[endBlockMarker])
+	}
+}
+
+// huffOffset is a static offset encoder used for Huffman-only encoding.
+// It is built once in init from a histogram whose only non-zero entry is offset 0.
+var huffOffset *huffmanEncoder
+
+func init() {
+	w := newHuffmanBitWriter(nil) // used only for its zeroed offsetFreq table
+	w.offsetFreq[0] = 1
+	huffOffset = newHuffmanEncoder(offsetCodeCount)
+	huffOffset.generate(w.offsetFreq[:offsetCodeCount], 15)
+}
+
+// writeBlockHuff encodes a block of bytes either as Huffman-coded
+// literals (no matches) or, if the estimate shows too little gain
+// from compression, as stored (uncompressed) bytes.
+func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
+	if w.err != nil {
+		return
+	}
+
+	// Clear histogram
+	for i := range w.literalFreq[:] {
+		w.literalFreq[i] = 0
+	}
+	if !w.lastHuffMan {
+		for i := range w.offsetFreq[:] {
+			w.offsetFreq[i] = 0
+		}
+	}
+
+	// Add everything as literals
+	// We have to estimate the header size.
+	// Assume header is around 70 bytes:
+	// https://stackoverflow.com/a/25454430
+	const guessHeaderSizeBits = 70 * 8
+	estBits, estExtra := histogramSize(input, w.literalFreq[:], !eof && !sync) // fill the histogram only when the table may be reused
+	estBits += w.lastHeader + 15
+	if w.lastHeader == 0 {
+		estBits += guessHeaderSizeBits
+	}
+	estBits += estBits >> w.logNewTablePenalty
+
+	// Store bytes, if we don't get a reasonable improvement.
+	ssize, storable := w.storedSize(input)
+	if storable && ssize < estBits {
+		w.writeStoredHeader(len(input), eof)
+		w.writeBytes(input)
+		return
+	}
+
+	if w.lastHeader > 0 {
+		reuseSize := w.literalEncoding.bitLength(w.literalFreq[:256]) // cost of input under the table already in use
+		estBits += estExtra
+
+		if estBits < reuseSize {
+			// We owe an EOB
+			w.writeCode(w.literalEncoding.codes[endBlockMarker])
+			w.lastHeader = 0
+		}
+	}
+
+	const numLiterals = endBlockMarker + 1
+	const numOffsets = 1
+	if w.lastHeader == 0 {
+		w.literalFreq[endBlockMarker] = 1
+		w.literalEncoding.generate(w.literalFreq[:numLiterals], 15)
+
+		// Generate codegen and codegenFrequencies, which indicates how to encode
+		// the literalEncoding and the offsetEncoding.
+		w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
+		w.codegenEncoding.generate(w.codegenFreq[:], 7)
+		numCodegens := w.codegens()
+
+		// Huffman.
+		w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
+		w.lastHuffMan = true
+		w.lastHeader, _ = w.headerSize()
+	}
+
+	encoding := w.literalEncoding.codes[:257]
+	for _, t := range input {
+		// Bitwriting inlined, ~30% speedup
+		c := encoding[t]
+		w.bits |= uint64(c.code) << ((w.nbits) & 63)
+		w.nbits += c.len
+		if w.nbits >= 48 {
+			bits := w.bits
+			w.bits >>= 48
+			w.nbits -= 48
+			n := w.nbytes
+			w.bytes[n] = byte(bits)
+			w.bytes[n+1] = byte(bits >> 8)
+			w.bytes[n+2] = byte(bits >> 16)
+			w.bytes[n+3] = byte(bits >> 24)
+			w.bytes[n+4] = byte(bits >> 32)
+			w.bytes[n+5] = byte(bits >> 40)
+			n += 6
+			if n >= bufferFlushSize {
+				if w.err != nil {
+					n = 0 // NOTE(review): dead store; n is local and w.nbytes is not updated before returning
+					return
+				}
+				w.write(w.bytes[:n])
+				n = 0
+			}
+			w.nbytes = n
+		}
+	}
+	if eof || sync {
+		w.writeCode(encoding[endBlockMarker])
+		w.lastHeader = 0
+		w.lastHuffMan = false
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_code.go b/vendor/github.com/klauspost/compress/flate/huffman_code.go
new file mode 100644
index 00000000000..4c39a301871
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@@ -0,0 +1,363 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"math"
+	"math/bits"
+)
+
+const (
+	maxBitsLimit = 16 // bitCounts panics unless maxBits < maxBitsLimit; also sizes its level arrays
+	// number of valid literals
+	literalCount = 286
+)
+
+// hcode is a Huffman code: a bit pattern (code) and its length in bits (len).
+type hcode struct {
+	code, len uint16 // generated codes are stored bit-reversed via reverseBits
+}
+
+type huffmanEncoder struct {
+	codes     []hcode       // code for each literal, indexed by literal value
+	freqcache []literalNode // scratch buffer allocated lazily by generate
+	bitCount  [17]int32     // backing storage for the slice returned by bitCounts
+}
+
+type literalNode struct {
+	literal uint16 // symbol value; math.MaxUint16 marks the sentinel from maxNode
+	freq    uint16 // occurrence count of the symbol
+}
+
+// A levelInfo describes the state of the constructed tree for a given depth.
+type levelInfo struct {
+	// Our level; bitCounts also uses it to index the neighbouring levels.
+	level int32
+
+	// The frequency of the last node at this level
+	lastFreq int32
+
+	// The frequency of the next character to add to this level
+	nextCharFreq int32
+
+	// The frequency of the next pair (from level below) to add to this level.
+	// Only valid if the "needed" value of the next lower level is 0.
+	nextPairFreq int32
+
+	// The number of chains remaining to generate for this level before moving
+	// up to the next level
+	needed int32
+}
+
+// set stores code and length in h; code is stored as given (not bit-reversed here).
+func (h *hcode) set(code uint16, length uint16) {
+	h.len = length
+	h.code = code
+}
+
+func reverseBits(number uint16, bitLength byte) uint16 { // reverses the low bitLength bits of number (bitLength 1..16)
+	return bits.Reverse16(number << ((16 - bitLength) & 15)) // left-align the bits, then reverse the whole word
+}
+
+func maxNode() literalNode { return literalNode{math.MaxUint16, math.MaxUint16} } // sentinel appended to the list by bitCounts
+
+func newHuffmanEncoder(size int) *huffmanEncoder { // returns an encoder with size zeroed codes
+	// Make capacity to next power of two.
+	c := uint(bits.Len32(uint32(size - 1))) // number of bits needed to represent size-1
+	return &huffmanEncoder{codes: make([]hcode, size, 1<<c)}
+}
+
+// generateFixedLiteralEncoding returns the encoder for the fixed literal/length table.
+func generateFixedLiteralEncoding() *huffmanEncoder {
+	h := newHuffmanEncoder(literalCount)
+	codes := h.codes
+	var ch uint16
+	for ch = 0; ch < literalCount; ch++ {
+		var bits uint16
+		var size uint16
+		switch {
+		case ch < 144:
+			// size 8, 00110000 .. 10111111
+			bits = ch + 48
+			size = 8
+		case ch < 256:
+			// size 9, 110010000 .. 111111111
+			bits = ch + 400 - 144
+			size = 9
+		case ch < 280:
+			// size 7, 0000000 .. 0010111
+			bits = ch - 256
+			size = 7
+		default:
+			// size 8, 11000000 .. 11000111
+			bits = ch + 192 - 280
+			size = 8
+		}
+		codes[ch] = hcode{code: reverseBits(bits, byte(size)), len: size} // stored bit-reversed
+	}
+	return h
+}
+
+func generateFixedOffsetEncoding() *huffmanEncoder { // returns the encoder for the fixed offset table
+	h := newHuffmanEncoder(30)
+	codes := h.codes
+	for ch := range codes {
+		codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5} // every offset code is 5 bits long
+	}
+	return h
+}
+
+var fixedLiteralEncoding = generateFixedLiteralEncoding() // built once at package initialization
+var fixedOffsetEncoding = generateFixedOffsetEncoding() // built once at package initialization
+
+func (h *huffmanEncoder) bitLength(freq []uint16) int { // returns the sum of freq[i] * len(code i) in bits
+	var total int
+	for i, f := range freq {
+		if f != 0 { // symbols that never occur contribute nothing
+			total += int(f) * int(h.codes[i].len)
+		}
+	}
+	return total
+}
+
+// bitCounts returns the number of literals assigned to each bit size in the Huffman encoding.
+//
+// This method is only called when len(list) >= 3.
+// The cases of 0, 1, and 2 literals are handled by special case code in generate.
+//
+// list        An array of the literals with non-zero frequencies
+//             and their associated frequencies. The array is in order of increasing
+//             frequency. A sentinel (maxNode) is stored at list[len(list)], so the
+//             slice needs at least one element of spare capacity.
+// maxBits     The maximum number of bits that should be used to encode any literal.
+//             Must be less than maxBitsLimit (16); otherwise bitCounts panics.
+// return      A slice of h.bitCount in which element i is the number of literals
+//             that should be encoded in i bits.
+func (h *huffmanEncoder) bitCounts(list []literalNode, maxBits int32) []int32 {
+	if maxBits >= maxBitsLimit {
+		panic("flate: maxBits too large")
+	}
+	n := int32(len(list))
+	list = list[0 : n+1] // extend by one element to hold the sentinel
+	list[n] = maxNode()
+
+	// The tree can't have greater depth than n - 1, no matter what. This
+	// saves a little bit of work in some small cases
+	if maxBits > n-1 {
+		maxBits = n - 1
+	}
+
+	// Create information about each of the levels.
+	// A bogus "Level 0" whose sole purpose is so that
+	// level1.prev.needed==0.  This makes level1.nextPairFreq
+	// be a legitimate value that never gets chosen.
+	var levels [maxBitsLimit]levelInfo
+	// leafCounts[i] counts the number of literals at the left
+	// of ancestors of the rightmost node at level i.
+	// leafCounts[i][j] is the number of literals at the left
+	// of the level j ancestor.
+	var leafCounts [maxBitsLimit][maxBitsLimit]int32
+
+	for level := int32(1); level <= maxBits; level++ {
+		// For every level, the first two items are the first two characters.
+		// We initialize the levels as if we had already figured this out.
+		levels[level] = levelInfo{
+			level:        level,
+			lastFreq:     int32(list[1].freq),
+			nextCharFreq: int32(list[2].freq),
+			nextPairFreq: int32(list[0].freq) + int32(list[1].freq),
+		}
+		leafCounts[level][level] = 2
+		if level == 1 {
+			levels[level].nextPairFreq = math.MaxInt32 // level 1 has no level below to pair from
+		}
+	}
+
+	// We need a total of 2*n - 2 items at top level and have already generated 2.
+	levels[maxBits].needed = 2*n - 4
+
+	level := maxBits
+	for {
+		l := &levels[level]
+		if l.nextPairFreq == math.MaxInt32 && l.nextCharFreq == math.MaxInt32 {
+			// We've run out of both leafs and pairs.
+			// End all calculations for this level.
+			// To make sure we never come back to this level or any lower level,
+			// set nextPairFreq impossibly large.
+			l.needed = 0
+			levels[level+1].nextPairFreq = math.MaxInt32
+			level++
+			continue
+		}
+
+		prevFreq := l.lastFreq
+		if l.nextCharFreq < l.nextPairFreq {
+			// The next item on this row is a leaf node.
+			n := leafCounts[level][level] + 1
+			l.lastFreq = l.nextCharFreq
+			// Lower leafCounts are the same of the previous node.
+			leafCounts[level][level] = n
+			e := list[n]
+			if e.literal < math.MaxUint16 { // not the sentinel
+				l.nextCharFreq = int32(e.freq)
+			} else {
+				l.nextCharFreq = math.MaxInt32
+			}
+		} else {
+			// The next item on this row is a pair from the previous row.
+			// nextPairFreq isn't valid until we generate two
+			// more values in the level below
+			l.lastFreq = l.nextPairFreq
+			// Take leaf counts from the lower level, except counts[level] remains the same.
+			copy(leafCounts[level][:level], leafCounts[level-1][:level])
+			levels[l.level-1].needed = 2
+		}
+
+		if l.needed--; l.needed == 0 {
+			// We've done everything we need to do for this level.
+			// Continue calculating one level up. Fill in nextPairFreq
+			// of that level with the sum of the two nodes we've just calculated on
+			// this level.
+			if l.level == maxBits {
+				// All done!
+				break
+			}
+			levels[l.level+1].nextPairFreq = prevFreq + l.lastFreq
+			level++
+		} else {
+			// If we stole from below, move down temporarily to replenish it.
+			for levels[level-1].needed > 0 {
+				level--
+			}
+		}
+	}
+
+	// Something is wrong if at the end, the top level is null or hasn't used
+	// all of the leaves.
+	if leafCounts[maxBits][maxBits] != n {
+		panic("leafCounts[maxBits][maxBits] != n")
+	}
+
+	bitCount := h.bitCount[:maxBits+1] // result aliases h.bitCount and is overwritten by the next call
+	bits := 1
+	counts := &leafCounts[maxBits]
+	for level := maxBits; level > 0; level-- {
+		// chain.leafCount gives the number of literals requiring at least "bits"
+		// bits to encode.
+		bitCount[bits] = counts[level] - counts[level-1]
+		bits++
+	}
+	return bitCount
+}
+
+// assignEncodingAndSize looks at the leaves and assigns them a bit count and
+// an encoding as specified in RFC 1951 3.2.2. list must be sorted by frequency.
+func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalNode) {
+	code := uint16(0)
+	for n, bits := range bitCount {
+		code <<= 1
+		if n == 0 || bits == 0 {
+			continue
+		}
+		// The literals list[len(list)-bits] .. list[len(list)-1]
+		// are encoded using n bits, and get the values
+		// code, code + 1, ....  The code values are
+		// assigned in literal order (not frequency order).
+		chunk := list[len(list)-int(bits):]
+
+		sortByLiteral(chunk)
+		for _, node := range chunk {
+			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)}
+			code++
+		}
+		list = list[0 : len(list)-int(bits)] // drop the literals that were just assigned
+	}
+}
+
+// generate updates h.codes to be the minimum code for the specified frequency count.
+//
+// freq     An array of frequencies, in which freq[i] gives the frequency of literal i.
+// maxBits  The maximum number of bits to use for any literal.
+func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
+	if h.freqcache == nil {
+		// Allocate a reusable buffer with the longest possible frequency table.
+		// Possible lengths are codegenCodeCount, offsetCodeCount and literalCount.
+		// The largest of these is literalCount, so we allocate for that case.
+		h.freqcache = make([]literalNode, literalCount+1)
+	}
+	list := h.freqcache[:len(freq)+1] // one spare slot for the sentinel written by bitCounts
+	// Number of non-zero literals
+	count := 0
+	// Set list to be the set of all non-zero literals and their frequencies
+	for i, f := range freq {
+		if f != 0 {
+			list[count] = literalNode{uint16(i), f}
+			count++
+		} else {
+			list[count] = literalNode{}
+			h.codes[i].len = 0 // unused literals get a zero-length code
+		}
+	}
+	list[len(freq)] = literalNode{}
+
+	list = list[:count]
+	if count <= 2 {
+		// Handle the small cases here, because they are awkward for the general case code. With
+		// two or fewer literals, everything has bit length 1.
+		for i, node := range list {
+			// "list" is in order of increasing literal value.
+			h.codes[node.literal].set(uint16(i), 1)
+		}
+		return
+	}
+	sortByFreq(list)
+
+	// Get the number of literals for each bit count
+	bitCount := h.bitCounts(list, maxBits)
+	// And do the assignment
+	h.assignEncodingAndSize(bitCount, list)
+}
+
+func atLeastOne(v float32) float32 { // clamps v to a minimum of 1
+	if v < 1 {
+		return 1
+	}
+	return v
+}
+
+// histogramSize accumulates a histogram of b in h and returns an estimated
+// size in bits, plus the estimated extra bits for filled-in values.
+// If fill is set, unassigned values are assigned '1' in the histogram.
+// len(h) must be >= 256, and h's elements must be all zeroes.
+func histogramSize(b []byte, h []uint16, fill bool) (int, int) {
+	h = h[:256]
+	for _, t := range b {
+		h[t]++
+	}
+	invTotal := 1.0 / float32(len(b)) // NOTE(review): +Inf when len(b) == 0 — confirm callers never pass empty input
+	shannon := float32(0.0)
+	var extra float32
+	if fill {
+		oneBits := atLeastOne(-mFastLog2(invTotal)) // estimated cost of a symbol that occurs once
+		for i, v := range h[:] {
+			if v > 0 {
+				n := float32(v)
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
+			} else {
+				h[i] = 1
+				extra += oneBits
+			}
+		}
+	} else {
+		for _, v := range h[:] {
+			if v > 0 {
+				n := float32(v)
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
+			}
+		}
+	}
+
+	return int(shannon + 0.99), int(extra + 0.99) // adding 0.99 before truncation approximates rounding up
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go b/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go
new file mode 100644
index 00000000000..20778029900
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go
@@ -0,0 +1,178 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// sortByFreq sorts data in place by increasing freq,
+// breaking ties by increasing literal. It operates directly on the slice
+// rather than through an interface. The sort is not guaranteed to be stable.
+func sortByFreq(data []literalNode) {
+	n := len(data)
+	quickSortByFreq(data, 0, n, maxDepth(n))
+}
+
+func quickSortByFreq(data []literalNode, a, b, maxDepth int) { // sorts data[a:b] by (freq, literal)
+	for b-a > 12 { // Use ShellSort for slices <= 12 elements
+		if maxDepth == 0 {
+			heapSort(data, a, b) // NOTE(review): heapSort uses siftDown, which compares literal only, not (freq, literal) — confirm
+			return
+		}
+		maxDepth--
+		mlo, mhi := doPivotByFreq(data, a, b)
+		// Avoiding recursion on the larger subproblem guarantees
+		// a stack depth of at most lg(b-a).
+		if mlo-a < b-mhi {
+			quickSortByFreq(data, a, mlo, maxDepth)
+			a = mhi // i.e., quickSortByFreq(data, mhi, b)
+		} else {
+			quickSortByFreq(data, mhi, b, maxDepth)
+			b = mlo // i.e., quickSortByFreq(data, a, mlo)
+		}
+	}
+	if b-a > 1 {
+		// Do ShellSort pass with gap 6
+		// It could be written in this simplified form cause b-a <= 12
+		for i := a + 6; i < b; i++ {
+			if data[i].freq == data[i-6].freq && data[i].literal < data[i-6].literal || data[i].freq < data[i-6].freq {
+				data[i], data[i-6] = data[i-6], data[i]
+			}
+		}
+		insertionSortByFreq(data, a, b)
+	}
+}
+
+// siftDownByFreq implements the heap property on data[lo, hi), ordered by (freq, literal).
+// first is an offset into the array where the root of the heap lies.
+func siftDownByFreq(data []literalNode, lo, hi, first int) { // NOTE(review): no caller is visible in the sort files here — confirm it is used
+	root := lo
+	for {
+		child := 2*root + 1
+		if child >= hi {
+			break
+		}
+		if child+1 < hi && (data[first+child].freq == data[first+child+1].freq && data[first+child].literal < data[first+child+1].literal || data[first+child].freq < data[first+child+1].freq) {
+			child++ // pick the larger of the two children
+		}
+		if data[first+root].freq == data[first+child].freq && data[first+root].literal > data[first+child].literal || data[first+root].freq > data[first+child].freq {
+			return
+		}
+		data[first+root], data[first+child] = data[first+child], data[first+root]
+		root = child
+	}
+}
+func doPivotByFreq(data []literalNode, lo, hi int) (midlo, midhi int) { // partitions data[lo:hi] by (freq, literal) around a chosen pivot
+	m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow.
+	if hi-lo > 40 {
+		// Tukey's ``Ninther,'' median of three medians of three.
+		s := (hi - lo) / 8
+		medianOfThreeSortByFreq(data, lo, lo+s, lo+2*s)
+		medianOfThreeSortByFreq(data, m, m-s, m+s)
+		medianOfThreeSortByFreq(data, hi-1, hi-1-s, hi-1-2*s)
+	}
+	medianOfThreeSortByFreq(data, lo, m, hi-1)
+
+	// Invariants are:
+	//	data[lo] = pivot (set up by the median-of-three calls above)
+	//	data[lo < i < a] < pivot
+	//	data[a <= i < b] <= pivot
+	//	data[b <= i < c] unexamined
+	//	data[c <= i < hi-1] > pivot
+	//	data[hi-1] >= pivot
+	pivot := lo
+	a, c := lo+1, hi-1
+
+	for ; a < c && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ {
+	}
+	b := a
+	for {
+		for ; b < c && (data[pivot].freq == data[b].freq && data[pivot].literal > data[b].literal || data[pivot].freq > data[b].freq); b++ { // data[b] <= pivot
+		}
+		for ; b < c && (data[pivot].freq == data[c-1].freq && data[pivot].literal < data[c-1].literal || data[pivot].freq < data[c-1].freq); c-- { // data[c-1] > pivot
+		}
+		if b >= c {
+			break
+		}
+		// data[b] > pivot; data[c-1] <= pivot
+		data[b], data[c-1] = data[c-1], data[b]
+		b++
+		c--
+	}
+	// If hi-c<3 then there are duplicates (by property of median of nine).
+	// Let's be a bit more conservative, and set border to 5.
+	protect := hi-c < 5
+	if !protect && hi-c < (hi-lo)/4 {
+		// Lets test some points for equality to pivot
+		// NOTE(review): the strict '>' below cannot hold if data[hi-1] >= pivot as stated above — confirm
+		dups := 0
+		if data[pivot].freq == data[hi-1].freq && data[pivot].literal > data[hi-1].literal || data[pivot].freq > data[hi-1].freq { // data[hi-1] = pivot
+			data[c], data[hi-1] = data[hi-1], data[c]
+			c++
+			dups++
+		}
+		if data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq { // data[b-1] = pivot
+			b--
+			dups++
+		}
+		// m-lo = (hi-lo)/2 > 6
+		// b-lo > (hi-lo)*3/4-1 > 8
+		// ==> m < b ==> data[m] <= pivot
+		if data[m].freq == data[pivot].freq && data[m].literal > data[pivot].literal || data[m].freq > data[pivot].freq { // data[m] = pivot
+			data[m], data[b-1] = data[b-1], data[m]
+			b--
+			dups++
+		}
+		// if at least 2 points are equal to pivot, assume skewed distribution
+		protect = dups > 1
+	}
+	if protect {
+		// Protect against a lot of duplicates
+		// Add invariant:
+		//	data[a <= i < b] unexamined
+		//	data[b <= i < c] = pivot
+		for {
+			for ; a < b && (data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq); b-- { // data[b] == pivot
+			}
+			for ; a < b && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { // data[a] < pivot
+			}
+			if a >= b {
+				break
+			}
+			// data[a] == pivot; data[b-1] < pivot
+			data[a], data[b-1] = data[b-1], data[a]
+			a++
+			b--
+		}
+	}
+	// Swap pivot into middle
+	data[pivot], data[b-1] = data[b-1], data[pivot]
+	return b - 1, c
+}
+
+// insertionSortByFreq sorts data[a:b] by (freq, literal) using insertion sort.
+func insertionSortByFreq(data []literalNode, a, b int) {
+	for i := a + 1; i < b; i++ {
+		for j := i; j > a && (data[j].freq == data[j-1].freq && data[j].literal < data[j-1].literal || data[j].freq < data[j-1].freq); j-- {
+			data[j], data[j-1] = data[j-1], data[j] // move data[j] left while it orders before its neighbour
+		}
+	}
+}
+
+// quickSortByFreq, loosely following Bentley and McIlroy,
+// ``Engineering a Sort Function,'' SP&E November 1993.
+
+// medianOfThreeSortByFreq orders data[m0], data[m1], data[m2] by (freq, literal), leaving the median in data[m1].
+func medianOfThreeSortByFreq(data []literalNode, m1, m0, m2 int) { // note the parameter order: m1 comes first
+	// sort 3 elements
+	if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq {
+		data[m1], data[m0] = data[m0], data[m1]
+	}
+	// data[m0] <= data[m1]
+	if data[m2].freq == data[m1].freq && data[m2].literal < data[m1].literal || data[m2].freq < data[m1].freq {
+		data[m2], data[m1] = data[m1], data[m2]
+		// data[m0] <= data[m2] && data[m1] < data[m2]
+		if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq {
+			data[m1], data[m0] = data[m0], data[m1]
+		}
+	}
+	// now data[m0] <= data[m1] <= data[m2]
+}
diff --git a/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go b/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go
new file mode 100644
index 00000000000..93f1aea109e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go
@@ -0,0 +1,201 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// sortByLiteral sorts data in place by increasing literal value.
+// It operates directly on the slice rather than through an interface.
+// The sort is not guaranteed to be stable.
+func sortByLiteral(data []literalNode) {
+	n := len(data)
+	quickSort(data, 0, n, maxDepth(n))
+}
+
+func quickSort(data []literalNode, a, b, maxDepth int) { // sorts data[a:b] by literal
+	for b-a > 12 { // Use ShellSort for slices <= 12 elements
+		if maxDepth == 0 {
+			heapSort(data, a, b) // depth budget exhausted: fall back to heap sort
+			return
+		}
+		maxDepth--
+		mlo, mhi := doPivot(data, a, b)
+		// Avoiding recursion on the larger subproblem guarantees
+		// a stack depth of at most lg(b-a).
+		if mlo-a < b-mhi {
+			quickSort(data, a, mlo, maxDepth)
+			a = mhi // i.e., quickSort(data, mhi, b)
+		} else {
+			quickSort(data, mhi, b, maxDepth)
+			b = mlo // i.e., quickSort(data, a, mlo)
+		}
+	}
+	if b-a > 1 {
+		// Do ShellSort pass with gap 6
+		// It could be written in this simplified form cause b-a <= 12
+		for i := a + 6; i < b; i++ {
+			if data[i].literal < data[i-6].literal {
+				data[i], data[i-6] = data[i-6], data[i]
+			}
+		}
+		insertionSort(data, a, b)
+	}
+}
+func heapSort(data []literalNode, a, b int) { // sorts data[a:b] by literal using siftDown
+	first := a
+	lo := 0
+	hi := b - a
+
+	// Build heap with greatest element at top.
+	for i := (hi - 1) / 2; i >= 0; i-- {
+		siftDown(data, i, hi, first)
+	}
+
+	// Pop elements, largest first, into end of data.
+	for i := hi - 1; i >= 0; i-- {
+		data[first], data[first+i] = data[first+i], data[first]
+		siftDown(data, lo, i, first)
+	}
+}
+
+// siftDown implements the heap property on data[lo, hi), ordered by literal.
+// first is an offset into the array where the root of the heap lies.
+func siftDown(data []literalNode, lo, hi, first int) {
+	root := lo
+	for {
+		child := 2*root + 1
+		if child >= hi {
+			break
+		}
+		if child+1 < hi && data[first+child].literal < data[first+child+1].literal {
+			child++ // pick the larger of the two children
+		}
+		if data[first+root].literal > data[first+child].literal {
+			return
+		}
+		data[first+root], data[first+child] = data[first+child], data[first+root]
+		root = child
+	}
+}
+func doPivot(data []literalNode, lo, hi int) (midlo, midhi int) { // partitions data[lo:hi] by literal around a chosen pivot
+	m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow.
+	if hi-lo > 40 {
+		// Tukey's ``Ninther,'' median of three medians of three.
+		s := (hi - lo) / 8
+		medianOfThree(data, lo, lo+s, lo+2*s)
+		medianOfThree(data, m, m-s, m+s)
+		medianOfThree(data, hi-1, hi-1-s, hi-1-2*s)
+	}
+	medianOfThree(data, lo, m, hi-1)
+
+	// Invariants are:
+	//	data[lo] = pivot (set up by the median-of-three calls above)
+	//	data[lo < i < a] < pivot
+	//	data[a <= i < b] <= pivot
+	//	data[b <= i < c] unexamined
+	//	data[c <= i < hi-1] > pivot
+	//	data[hi-1] >= pivot
+	pivot := lo
+	a, c := lo+1, hi-1
+
+	for ; a < c && data[a].literal < data[pivot].literal; a++ {
+	}
+	b := a
+	for {
+		for ; b < c && data[pivot].literal > data[b].literal; b++ { // data[b] <= pivot
+		}
+		for ; b < c && data[pivot].literal < data[c-1].literal; c-- { // data[c-1] > pivot
+		}
+		if b >= c {
+			break
+		}
+		// data[b] > pivot; data[c-1] <= pivot
+		data[b], data[c-1] = data[c-1], data[b]
+		b++
+		c--
+	}
+	// If hi-c<3 then there are duplicates (by property of median of nine).
+	// Let's be a bit more conservative, and set border to 5.
+	protect := hi-c < 5
+	if !protect && hi-c < (hi-lo)/4 {
+		// Lets test some points for equality to pivot
+		// NOTE(review): the strict '>' below cannot hold if data[hi-1] >= pivot as stated above — confirm
+		dups := 0
+		if data[pivot].literal > data[hi-1].literal { // data[hi-1] = pivot
+			data[c], data[hi-1] = data[hi-1], data[c]
+			c++
+			dups++
+		}
+		if data[b-1].literal > data[pivot].literal { // data[b-1] = pivot
+			b--
+			dups++
+		}
+		// m-lo = (hi-lo)/2 > 6
+		// b-lo > (hi-lo)*3/4-1 > 8
+		// ==> m < b ==> data[m] <= pivot
+		if data[m].literal > data[pivot].literal { // data[m] = pivot
+			data[m], data[b-1] = data[b-1], data[m]
+			b--
+			dups++
+		}
+		// if at least 2 points are equal to pivot, assume skewed distribution
+		protect = dups > 1
+	}
+	if protect {
+		// Protect against a lot of duplicates
+		// Add invariant:
+		//	data[a <= i < b] unexamined
+		//	data[b <= i < c] = pivot
+		for {
+			for ; a < b && data[b-1].literal > data[pivot].literal; b-- { // data[b] == pivot
+			}
+			for ; a < b && data[a].literal < data[pivot].literal; a++ { // data[a] < pivot
+			}
+			if a >= b {
+				break
+			}
+			// data[a] == pivot; data[b-1] < pivot
+			data[a], data[b-1] = data[b-1], data[a]
+			a++
+			b--
+		}
+	}
+	// Swap pivot into middle
+	data[pivot], data[b-1] = data[b-1], data[pivot]
+	return b - 1, c
+}
+
+// insertionSort sorts data[a:b] by literal using insertion sort.
+func insertionSort(data []literalNode, a, b int) {
+	for i := a + 1; i < b; i++ {
+		for j := i; j > a && data[j].literal < data[j-1].literal; j-- {
+			data[j], data[j-1] = data[j-1], data[j] // move data[j] left while it is smaller than its neighbour
+		}
+	}
+}
+
+// maxDepth returns a threshold at which quicksort should switch
+// to heapsort. It returns 2*ceil(lg(n+1)), i.e. twice the bit length of n.
+func maxDepth(n int) int {
+	var depth int
+	for i := n; i > 0; i >>= 1 { // counts the bits needed to represent n
+		depth++
+	}
+	return depth * 2
+}
+
+// medianOfThree orders data[m0], data[m1], data[m2] by literal, leaving the median in data[m1].
+func medianOfThree(data []literalNode, m1, m0, m2 int) { // note the parameter order: m1 comes first
+	// sort 3 elements
+	if data[m1].literal < data[m0].literal {
+		data[m1], data[m0] = data[m0], data[m1]
+	}
+	// data[m0] <= data[m1]
+	if data[m2].literal < data[m1].literal {
+		data[m2], data[m1] = data[m1], data[m2]
+		// data[m0] <= data[m2] && data[m1] < data[m2]
+		if data[m1].literal < data[m0].literal {
+			data[m1], data[m0] = data[m0], data[m1]
+		}
+	}
+	// now data[m0] <= data[m1] <= data[m2]
+}
diff --git a/vendor/github.com/klauspost/compress/flate/inflate.go b/vendor/github.com/klauspost/compress/flate/inflate.go
new file mode 100644
index 00000000000..3e4259f157b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@@ -0,0 +1,1001 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package flate implements the DEFLATE compressed data format, described in
+// RFC 1951.  The gzip and zlib packages implement access to DEFLATE-based file
+// formats.
+package flate
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"math/bits"
+	"strconv"
+	"sync"
+)
+
+const (
+	maxCodeLen     = 16 // size of the per-length tables indexed by code length
+	maxCodeLenMask = 15 // mask applied to a code length before indexing them
+	// The next three numbers come from the RFC section 3.2.7, with the
+	// additional proviso in section 3.2.5 which implies that distance codes
+	// 30 and 31 should never occur in compressed data.
+	maxNumLit  = 286
+	maxNumDist = 30
+	numCodes   = 19 // number of codes in Huffman meta-code
+
+	debugDecode = false // when true, decode failures are printed with fmt.Println
+)
+
+// Initialize the fixedHuffmanDecoder only once upon first use.
+var fixedOnce sync.Once
+var fixedHuffmanDecoder huffmanDecoder
+
+// CorruptInputError is the input offset before which corrupt data was detected.
+type CorruptInputError int64
+
+func (e CorruptInputError) Error() string {
+	return string(strconv.AppendInt([]byte("flate: corrupt input before offset "), int64(e), 10))
+}
+
+// InternalError describes a failure caused by the flate code itself.
+type InternalError string
+
+func (e InternalError) Error() string { return string("flate: internal error: " + e) }
+
+// ReadError pairs an error from the underlying Read with the input offset.
+//
+// Deprecated: No longer returned.
+type ReadError struct {
+	Offset int64 // input byte offset at which the read failed
+	Err    error // error reported by the underlying Read
+}
+
+func (e *ReadError) Error() string {
+	return string(strconv.AppendInt([]byte("flate: read error at offset "), e.Offset, 10)) + ": " + e.Err.Error()
+}
+
+// WriteError pairs an error from the underlying Write with the output offset.
+//
+// Deprecated: No longer returned.
+type WriteError struct {
+	Offset int64 // output byte offset at which the write failed
+	Err    error // error reported by the underlying Write
+}
+
+func (e *WriteError) Error() string {
+	return string(strconv.AppendInt([]byte("flate: write error at offset "), e.Offset, 10)) + ": " + e.Err.Error()
+}
+
+// Resetter resets a ReadCloser returned by NewReader or NewReaderDict
+// to switch to a new underlying Reader. This permits reusing a ReadCloser
+// instead of allocating a new one.
+type Resetter interface {
+	// Reset discards any buffered data and resets the Resetter as if it was
+	// newly initialized with the given reader and dictionary.
+	Reset(r io.Reader, dict []byte) error
+}
+
+// The data structure for decoding Huffman tables is based on that of
+// zlib. There is a lookup table of a fixed bit width (huffmanChunkBits).
+// For codes smaller than the table width, there are multiple entries
+// (each combination of trailing bits has the same value). For codes
+// larger than the table width, the table contains a link to an overflow
+// table. The width of each entry in the link table is the maximum code
+// size minus the chunk width.
+//
+// Note that you can do a lookup in the table even without all bits
+// filled. Since the extra bits are zero, and the DEFLATE Huffman codes
+// have the property that shorter codes come before longer ones, the
+// bit length estimate in the result is a lower bound on the actual
+// number of bits.
+//
+// See the following:
+//	http://www.gzip.org/algorithm.txt
+
+// chunk & 15 is number of bits
+// chunk >> 4 is value, including table link
+
+const (
+	huffmanChunkBits  = 9                     // index width, in bits, of the primary lookup table
+	huffmanNumChunks  = 1 << huffmanChunkBits // number of entries in the primary lookup table
+	huffmanCountMask  = 15                    // chunk & huffmanCountMask is the number of bits
+	huffmanValueShift = 4                     // chunk >> huffmanValueShift is the value or table link
+)
+
+type huffmanDecoder struct {
+	maxRead  int                       // the maximum number of bits we can read and not overread
+	chunks   *[huffmanNumChunks]uint16 // primary lookup table, indexed by the low huffmanChunkBits input bits
+	links    [][]uint16                // overflow tables for codes longer than huffmanChunkBits
+	linkMask uint32                    // mask applied to the input bits that index a link table
+}
+
+// init builds the Huffman decoding tables of h from lengths, the code length
+// of each symbol (0 = unused). It returns false if the lengths do not form a
+// complete tree (i.e., one neither over-subscribed nor under-subscribed); the
+// exceptions are a degenerate tree with a single symbol of length 1 and an
+// empty tree, both of which are accepted.
+func (h *huffmanDecoder) init(lengths []int) bool {
+	// Sanity enables additional runtime tests during Huffman
+	// table construction. It's intended to be used during
+	// development to supplement the currently ad-hoc unit tests.
+	const sanity = false
+
+	if h.chunks == nil {
+		h.chunks = &[huffmanNumChunks]uint16{}
+	}
+	if h.maxRead != 0 {
+		*h = huffmanDecoder{chunks: h.chunks, links: h.links} // reset state, keep the allocated tables
+	}
+
+	// Count number of codes of each length,
+	// compute maxRead and max length.
+	var count [maxCodeLen]int
+	var min, max int
+	for _, n := range lengths {
+		if n == 0 {
+			continue
+		}
+		if min == 0 || n < min {
+			min = n
+		}
+		if n > max {
+			max = n
+		}
+		count[n&maxCodeLenMask]++
+	}
+
+	// Empty tree. The decompressor.huffSym function will fail later if the tree
+	// is used. Technically, an empty tree is only valid for the HDIST tree and
+	// not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
+	// is guaranteed to fail since it will attempt to use the tree to decode the
+	// codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
+	// guaranteed to fail later since the compressed data section must be
+	// composed of at least one symbol (the end-of-block marker).
+	if max == 0 {
+		return true
+	}
+
+	code := 0
+	var nextcode [maxCodeLen]int
+	for i := min; i <= max; i++ {
+		code <<= 1
+		nextcode[i&maxCodeLenMask] = code
+		code += count[i&maxCodeLenMask]
+	}
+
+	// Check that the coding is complete (i.e., that we've
+	// assigned all 2-to-the-max possible bit sequences).
+	// Exception: To be compatible with zlib, we also need to
+	// accept degenerate single-code codings. See also
+	// TestDegenerateHuffmanCoding.
+	if code != 1<<uint(max) && !(code == 1 && max == 1) {
+		if debugDecode {
+			fmt.Println("coding failed, code, max:", code, max, code == 1<<uint(max), code == 1 && max == 1, "(one should be true)")
+		}
+		return false
+	}
+
+	h.maxRead = min
+	chunks := h.chunks[:]
+	for i := range chunks {
+		chunks[i] = 0
+	}
+
+	if max > huffmanChunkBits {
+		numLinks := 1 << (uint(max) - huffmanChunkBits)
+		h.linkMask = uint32(numLinks - 1)
+
+		// create link tables
+		link := nextcode[huffmanChunkBits+1] >> 1
+		if cap(h.links) < huffmanNumChunks-link {
+			h.links = make([][]uint16, huffmanNumChunks-link)
+		} else {
+			h.links = h.links[:huffmanNumChunks-link]
+		}
+		for j := uint(link); j < huffmanNumChunks; j++ {
+			reverse := int(bits.Reverse16(uint16(j)))
+			reverse >>= uint(16 - huffmanChunkBits)
+			off := j - uint(link)
+			if sanity && h.chunks[reverse] != 0 {
+				panic("impossible: overwriting existing chunk")
+			}
+			// A bit count above huffmanChunkBits marks the chunk as a link entry.
+			h.chunks[reverse] = uint16(off<<huffmanValueShift | (huffmanChunkBits + 1))
+			if cap(h.links[off]) < numLinks {
+				h.links[off] = make([]uint16, numLinks)
+			} else {
+				links := h.links[off][:0]
+				h.links[off] = links[:numLinks]
+			}
+		}
+	} else {
+		h.links = h.links[:0]
+	}
+
+	for i, n := range lengths {
+		if n == 0 {
+			continue
+		}
+		code := nextcode[n]
+		nextcode[n]++
+		chunk := uint16(i<<huffmanValueShift | n)
+		reverse := int(bits.Reverse16(uint16(code))) // tables are indexed by the bit-reversed code
+		reverse >>= uint(16 - n)
+		if n <= huffmanChunkBits {
+			for off := reverse; off < len(h.chunks); off += 1 << uint(n) {
+				// We should never need to overwrite
+				// an existing chunk. Also, 0 is
+				// never a valid chunk, because the
+				// lower 4 "count" bits should be
+				// between 1 and 15.
+				if sanity && h.chunks[off] != 0 {
+					panic("impossible: overwriting existing chunk")
+				}
+				h.chunks[off] = chunk
+			}
+		} else {
+			j := reverse & (huffmanNumChunks - 1)
+			if sanity && h.chunks[j]&huffmanCountMask != huffmanChunkBits+1 {
+				// Longer codes should have been
+				// associated with a link table above.
+				panic("impossible: not an indirect chunk")
+			}
+			value := h.chunks[j] >> huffmanValueShift
+			linktab := h.links[value]
+			reverse >>= huffmanChunkBits
+			for off := reverse; off < len(linktab); off += 1 << uint(n-huffmanChunkBits) {
+				if sanity && linktab[off] != 0 {
+					panic("impossible: overwriting existing chunk")
+				}
+				linktab[off] = chunk
+			}
+		}
+	}
+
+	if sanity {
+		// Above we've sanity checked that we never overwrote
+		// an existing entry. Here we additionally check that
+		// we filled the tables completely.
+		for i, chunk := range h.chunks {
+			if chunk == 0 {
+				// As an exception, in the degenerate
+				// single-code case, we allow odd
+				// chunks to be missing.
+				if code == 1 && i%2 == 1 {
+					continue
+				}
+				panic("impossible: missing chunk")
+			}
+		}
+		for _, linktab := range h.links {
+			for _, chunk := range linktab {
+				if chunk == 0 {
+					panic("impossible: missing chunk")
+				}
+			}
+		}
+	}
+
+	return true
+}
+
+// Reader is the actual read interface needed by NewReader.
+// If the passed in io.Reader does not also have ReadByte,
+// NewReader will introduce its own buffering.
+type Reader interface {
+	io.Reader
+	io.ByteReader
+}
+
+// decompressor holds the state of an in-progress decompression.
+type decompressor struct {
+	// Input source.
+	r       Reader
+	roffset int64 // input byte offset, reported in CorruptInputError
+
+	// Huffman decoders for literal/length, distance.
+	h1, h2 huffmanDecoder
+
+	// Length arrays used to define Huffman codes.
+	bits     *[maxNumLit + maxNumDist]int
+	codebits *[numCodes]int
+
+	// Output history, buffer.
+	dict dictDecoder
+
+	// Next step in the decompression,
+	// and decompression state.
+	step      func(*decompressor)
+	stepState int
+	err       error
+	toRead    []byte
+	hl, hd    *huffmanDecoder
+	copyLen   int
+	copyDist  int
+
+	// Temporary buffer (avoids repeated allocation).
+	buf [4]byte
+
+	// Input bits; the next bit to consume is bit 0 of b.
+	b uint32
+
+	nb    uint // number of unconsumed bits held in b
+	final bool // true when the current block's header marked it as the last block
+}
+
+func (f *decompressor) nextBlock() {
+	for f.nb < 1+2 {
+		if f.err = f.moreBits(); f.err != nil {
+			return
+		}
+	}
+	f.final = f.b&1 == 1
+	f.b >>= 1
+	typ := f.b & 3
+	f.b >>= 2
+	f.nb -= 1 + 2
+	switch typ {
+	case 0:
+		f.dataBlock()
+	case 1:
+		// compressed, fixed Huffman tables
+		f.hl = &fixedHuffmanDecoder
+		f.hd = nil
+		f.huffmanBlockDecoder()()
+	case 2:
+		// compressed, dynamic Huffman tables
+		if f.err = f.readHuffman(); f.err != nil {
+			break
+		}
+		f.hl = &f.h1
+		f.hd = &f.h2
+		f.huffmanBlockDecoder()()
+	default:
+		// 3 is reserved.
+		if debugDecode {
+			fmt.Println("reserved data block encountered")
+		}
+		f.err = CorruptInputError(f.roffset)
+	}
+}
+
+// Read implements io.Reader: it runs decode steps until output is pending
+// and then copies as much of that output as fits into b.
+func (f *decompressor) Read(b []byte) (int, error) {
+	for len(f.toRead) == 0 {
+		if f.err != nil {
+			return 0, f.err
+		}
+		f.step(f)
+		if len(f.toRead) == 0 && f.err != nil {
+			f.toRead = f.dict.readFlush() // Flush what's left in case of error
+		}
+	}
+	copied := copy(b, f.toRead)
+	f.toRead = f.toRead[copied:]
+	if len(f.toRead) > 0 {
+		return copied, nil
+	}
+	return copied, f.err // output drained: surface the pending error, if any
+}
+
+// Support the io.WriteTo interface for io.Copy and friends.
+func (f *decompressor) WriteTo(w io.Writer) (int64, error) {
+	total := int64(0)
+	flushed := false
+	for {
+		if len(f.toRead) > 0 {
+			n, err := w.Write(f.toRead)
+			total += int64(n)
+			if err != nil {
+				f.err = err
+				return total, err
+			}
+			if n != len(f.toRead) {
+				return total, io.ErrShortWrite
+			}
+			f.toRead = f.toRead[:0]
+		}
+		if f.err != nil && flushed {
+			if f.err == io.EOF {
+				return total, nil
+			}
+			return total, f.err
+		}
+		if f.err == nil {
+			f.step(f)
+		}
+		if len(f.toRead) == 0 && f.err != nil && !flushed {
+			f.toRead = f.dict.readFlush() // Flush what's left in case of error
+			flushed = true
+		}
+	}
+}
+
+func (f *decompressor) Close() error {
+	if f.err != io.EOF {
+		return f.err
+	}
+	return nil // a stream that ended normally closes without error
+}
+
+// RFC 1951 section 3.2.7: compression with dynamic Huffman codes.
+
+// codeOrder is the order in which the code length code lengths are stored.
+var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
+
+func (f *decompressor) readHuffman() error {
+	// HLIT[5], HDIST[5], HCLEN[4].
+	for f.nb < 5+5+4 {
+		if err := f.moreBits(); err != nil {
+			return err
+		}
+	}
+	nlit := int(f.b&0x1F) + 257 // number of literal/length code lengths
+	if nlit > maxNumLit {
+		if debugDecode {
+			fmt.Println("nlit > maxNumLit", nlit)
+		}
+		return CorruptInputError(f.roffset)
+	}
+	f.b >>= 5
+	ndist := int(f.b&0x1F) + 1 // number of distance code lengths
+	if ndist > maxNumDist {
+		if debugDecode {
+			fmt.Println("ndist > maxNumDist", ndist)
+		}
+		return CorruptInputError(f.roffset)
+	}
+	f.b >>= 5
+	nclen := int(f.b&0xF) + 4 // number of code length code lengths
+	// numCodes is 19, so nclen is always valid.
+	f.b >>= 4
+	f.nb -= 5 + 5 + 4
+
+	// (HCLEN+4)*3 bits: code lengths in the magic codeOrder order.
+	for i := 0; i < nclen; i++ {
+		for f.nb < 3 {
+			if err := f.moreBits(); err != nil {
+				return err
+			}
+		}
+		f.codebits[codeOrder[i]] = int(f.b & 0x7)
+		f.b >>= 3
+		f.nb -= 3
+	}
+	for i := nclen; i < len(codeOrder); i++ {
+		f.codebits[codeOrder[i]] = 0
+	}
+	if !f.h1.init(f.codebits[0:]) {
+		if debugDecode {
+			fmt.Println("init codebits failed")
+		}
+		return CorruptInputError(f.roffset)
+	}
+
+	// HLIT + 257 code lengths, HDIST + 1 code lengths,
+	// using the code length Huffman code.
+	for i, n := 0, nlit+ndist; i < n; {
+		x, err := f.huffSym(&f.h1)
+		if err != nil {
+			return err
+		}
+		if x < 16 {
+			// Actual length.
+			f.bits[i] = x
+			i++
+			continue
+		}
+		// Repeat previous length or zero.
+		var rep int
+		var nb uint
+		var b int
+		switch x {
+		default:
+			return InternalError("unexpected length code")
+		case 16: // repeat the previous length 3-6 times
+			rep = 3
+			nb = 2
+			if i == 0 {
+				if debugDecode {
+					fmt.Println("i==0")
+				}
+				return CorruptInputError(f.roffset)
+			}
+			b = f.bits[i-1]
+		case 17: // repeat a zero length 3-10 times
+			rep = 3
+			nb = 3
+			b = 0
+		case 18: // repeat a zero length 11-138 times
+			rep = 11
+			nb = 7
+			b = 0
+		}
+		for f.nb < nb {
+			if err := f.moreBits(); err != nil {
+				if debugDecode {
+					fmt.Println("morebits:", err)
+				}
+				return err
+			}
+		}
+		rep += int(f.b & uint32(1<<nb-1))
+		f.b >>= nb
+		f.nb -= nb
+		if i+rep > n {
+			if debugDecode {
+				fmt.Println("i+rep > n", i, rep, n)
+			}
+			return CorruptInputError(f.roffset)
+		}
+		for j := 0; j < rep; j++ {
+			f.bits[i] = b
+			i++
+		}
+	}
+
+	if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) {
+		if debugDecode {
+			fmt.Println("init2 failed")
+		}
+		return CorruptInputError(f.roffset)
+	}
+
+	// As an optimization, we can initialize the maxRead bits to read at a time
+	// for the HLIT tree to the length of the EOB marker since we know that
+	// every block must terminate with one. This preserves the property that
+	// we never read any extra bytes after the end of the DEFLATE stream.
+	if f.h1.maxRead < f.bits[endBlockMarker] {
+		f.h1.maxRead = f.bits[endBlockMarker]
+	}
+	if !f.final {
+		// If not the final block, the smallest block possible is
+		// a predefined table, BTYPE=01, with a single EOB marker.
+		// This will take up 3 + 7 bits.
+		f.h1.maxRead += 10
+	}
+
+	return nil
+}
+
+// huffmanBlockGeneric decodes a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, the fixed distance
+// encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanBlockGeneric() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it to
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := f.r.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBlockGeneric
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = f.moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = f.moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = f.moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// dataBlock copies a single uncompressed data block from input to output.
+func (f *decompressor) dataBlock() {
+	// Uncompressed.
+	// Discard the bits left over in the current partial byte.
+	left := (f.nb) & 7
+	f.nb -= left
+	f.b >>= left
+
+	offBytes := f.nb >> 3 // whole bytes still held in f.b
+	// Unfilled values will be overwritten.
+	f.buf[0] = uint8(f.b)
+	f.buf[1] = uint8(f.b >> 8)
+	f.buf[2] = uint8(f.b >> 16)
+	f.buf[3] = uint8(f.b >> 24)
+
+	f.roffset += int64(offBytes)
+	f.nb, f.b = 0, 0
+
+	// Length then ones-complement of length.
+	nr, err := io.ReadFull(f.r, f.buf[offBytes:4])
+	f.roffset += int64(nr)
+	if err != nil {
+		f.err = noEOF(err)
+		return
+	}
+	n := uint16(f.buf[0]) | uint16(f.buf[1])<<8
+	nn := uint16(f.buf[2]) | uint16(f.buf[3])<<8
+	if nn != ^n {
+		if debugDecode {
+			ncomp := ^n
+			fmt.Println("uint16(nn) != uint16(^n)", nn, ncomp)
+		}
+		f.err = CorruptInputError(f.roffset)
+		return
+	}
+
+	if n == 0 {
+		f.toRead = f.dict.readFlush()
+		f.finishBlock()
+		return
+	}
+
+	f.copyLen = int(n)
+	f.copyData()
+}
+
+// copyData copies f.copyLen bytes from the underlying reader into f.dict.
+// It pauses for reads when f.dict is full.
+func (f *decompressor) copyData() {
+	dst := f.dict.writeSlice()
+	if f.copyLen < len(dst) {
+		dst = dst[:f.copyLen]
+	}
+
+	got, err := io.ReadFull(f.r, dst)
+	f.roffset += int64(got)
+	f.copyLen -= got
+	f.dict.writeMark(got)
+	if err != nil {
+		f.err = noEOF(err)
+		return
+	}
+
+	if f.dict.availWrite() != 0 && f.copyLen <= 0 {
+		f.finishBlock()
+		return
+	}
+	f.toRead = f.dict.readFlush()
+	f.step = (*decompressor).copyData
+}
+
+func (f *decompressor) finishBlock() {
+	f.step = (*decompressor).nextBlock
+	if f.final {
+		f.err = io.EOF // the final block is done: the stream ends here
+		if f.dict.availRead() > 0 {
+			f.toRead = f.dict.readFlush()
+		}
+	}
+}
+
+// noEOF maps io.EOF to io.ErrUnexpectedEOF and passes every other error through.
+func noEOF(e error) error {
+	if e != io.EOF {
+		return e
+	}
+	return io.ErrUnexpectedEOF
+}
+
+func (f *decompressor) moreBits() error {
+	next, err := f.r.ReadByte()
+	if err == nil {
+		f.roffset++
+		f.b |= uint32(next) << f.nb // new bits go above the f.nb bits already held
+		f.nb += 8
+		return nil
+	}
+	return noEOF(err)
+}
+
+// huffSym reads the next Huffman-encoded symbol from f according to h.
+func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
+	// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+	// with single element, huffSym must error on these two edge cases. In both
+	// cases, the chunks slice will be 0 for the invalid sequence, leading it to
+	// satisfy the n == 0 check below.
+	n := uint(h.maxRead)
+	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b,nb back to f on return.
+	nb, b := f.nb, f.b
+	for {
+		for nb < n {
+			c, err := f.r.ReadByte()
+			if err != nil {
+				f.b = b
+				f.nb = nb
+				return 0, noEOF(err)
+			}
+			f.roffset++
+			b |= uint32(c) << (nb & 31)
+			nb += 8
+		}
+		chunk := h.chunks[b&(huffmanNumChunks-1)]
+		n = uint(chunk & huffmanCountMask)
+		if n > huffmanChunkBits {
+			chunk = h.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&h.linkMask]
+			n = uint(chunk & huffmanCountMask)
+		}
+		if n <= nb {
+			if n == 0 {
+				f.b = b
+				f.nb = nb
+				if debugDecode {
+					fmt.Println("huffsym: n==0")
+				}
+				f.err = CorruptInputError(f.roffset)
+				return 0, f.err
+			}
+			f.b = b >> (n & 31)
+			f.nb = nb - n
+			return int(chunk >> huffmanValueShift), nil
+		}
+	}
+}
+
+func makeReader(r io.Reader) Reader {
+	byteReader, hasReadByte := r.(Reader)
+	if !hasReadByte {
+		return bufio.NewReader(r) // add buffering to obtain ReadByte
+	}
+	return byteReader
+}
+
+func fixedHuffmanDecoderInit() {
+	fixedOnce.Do(func() {
+		// Code lengths of the fixed Huffman code; see RFC section 3.2.6.
+		var lengths [288]int
+		for sym := range lengths {
+			switch {
+			case sym < 144:
+				lengths[sym] = 8
+			case sym < 256:
+				lengths[sym] = 9
+			case sym < 280:
+				lengths[sym] = 7
+			default:
+				lengths[sym] = 8
+			}
+		}
+		fixedHuffmanDecoder.init(lengths[:])
+	})
+}
+
+// Reset restarts f on r with dict, carrying over its tables and dict state.
+func (f *decompressor) Reset(r io.Reader, dict []byte) error {
+	next := decompressor{
+		r:    makeReader(r),
+		step: (*decompressor).nextBlock,
+	}
+	next.bits, next.codebits = f.bits, f.codebits
+	next.h1, next.h2 = f.h1, f.h2
+	next.dict = f.dict
+	*f = next
+	f.dict.init(maxMatchOffset, dict)
+	return nil
+}
+
+// NewReader returns a new ReadCloser that can be used
+// to read the uncompressed version of r.
+// If r does not also implement io.ByteReader,
+// the decompressor may read more data than necessary from r.
+// It is the caller's responsibility to call Close on the ReadCloser
+// when finished reading.
+//
+// The ReadCloser returned by NewReader also implements Resetter.
+func NewReader(r io.Reader) io.ReadCloser {
+	fixedHuffmanDecoderInit()
+
+	var f decompressor
+	f.r = makeReader(r)
+	f.bits = new([maxNumLit + maxNumDist]int)
+	f.codebits = new([numCodes]int)
+	f.step = (*decompressor).nextBlock
+	f.dict.init(maxMatchOffset, nil)
+	return &f
+}
+
+// NewReaderDict is like NewReader but initializes the reader
+// with a preset dictionary. The returned Reader behaves as if
+// the uncompressed data stream started with the given dictionary,
+// which has already been read. NewReaderDict is typically used
+// to read data compressed by NewWriterDict.
+//
+// The ReadCloser returned by NewReader also implements Resetter.
+func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
+	fixedHuffmanDecoderInit()
+
+	var f decompressor
+	f.r = makeReader(r)
+	f.bits = new([maxNumLit + maxNumDist]int)
+	f.codebits = new([numCodes]int)
+	f.step = (*decompressor).nextBlock
+	f.dict.init(maxMatchOffset, dict)
+	return &f
+}
diff --git a/vendor/github.com/klauspost/compress/flate/inflate_gen.go b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
new file mode 100644
index 00000000000..397dc1b1a13
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
@@ -0,0 +1,922 @@
+// Code generated by go generate gen_inflate.go. DO NOT EDIT.
+
+package flate
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"math/bits"
+	"strings"
+)
+
+// huffmanBytesBuffer decodes a single Huffman block from f; f.r must be a *bytes.Buffer.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, the fixed distance
+// encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanBytesBuffer() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bytes.Buffer)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it to
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBytesBuffer
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// huffmanBytesReader decodes a single Huffman block from f; f.r must be a *bytes.Reader.
+// f.hl and f.hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If f.hd == nil, the fixed
+// distance encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanBytesReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict        // Resume at copyHistory (set when the copy step had to flush the window)
+	)
+	fr := f.r.(*bytes.Reader) // unchecked assertion: panics if f.r has any other concrete type
+	moreBits := func() error { // appends one input byte above the f.nb bits already held in f.b
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState { // resume at the label recorded by the previous call
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with a single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// to satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)] // direct lookup on the low bits of b
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits { // longer than the direct table covers: follow the link table
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb { // enough bits buffered for this code; otherwise loop and read up to n bits
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256: // literal byte
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBytesReader
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256: // end-of-block symbol
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 { // add the n extra length bits to the base length
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil { // fixed block: the distance code is 5 bits, read bit-reversed
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 { // tryWriteCopy copied nothing: fall back to writeCopy
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// huffmanBufioReader decodes a single Huffman block from f; f.r must be a *bufio.Reader.
+// f.hl and f.hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If f.hd == nil, the fixed
+// distance encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanBufioReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict        // Resume at copyHistory (set when the copy step had to flush the window)
+	)
+	fr := f.r.(*bufio.Reader) // unchecked assertion: panics if f.r has any other concrete type
+	moreBits := func() error { // appends one input byte above the f.nb bits already held in f.b
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState { // resume at the label recorded by the previous call
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with a single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// to satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)] // direct lookup on the low bits of b
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits { // longer than the direct table covers: follow the link table
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb { // enough bits buffered for this code; otherwise loop and read up to n bits
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256: // literal byte
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBufioReader
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256: // end-of-block symbol
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 { // add the n extra length bits to the base length
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil { // fixed block: the distance code is 5 bits, read bit-reversed
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 { // tryWriteCopy copied nothing: fall back to writeCopy
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// huffmanStringsReader decodes a single Huffman block from f; f.r must be a *strings.Reader.
+// f.hl and f.hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If f.hd == nil, the fixed
+// distance encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanStringsReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict        // Resume at copyHistory (set when the copy step had to flush the window)
+	)
+	fr := f.r.(*strings.Reader) // unchecked assertion: panics if f.r has any other concrete type
+	moreBits := func() error { // appends one input byte above the f.nb bits already held in f.b
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState { // resume at the label recorded by the previous call
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with a single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// to satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)] // direct lookup on the low bits of b
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits { // longer than the direct table covers: follow the link table
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb { // enough bits buffered for this code; otherwise loop and read up to n bits
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256: // literal byte
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanStringsReader
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256: // end-of-block symbol
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 { // add the n extra length bits to the base length
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil { // fixed block: the distance code is 5 bits, read bit-reversed
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 { // tryWriteCopy copied nothing: fall back to writeCopy
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+func (f *decompressor) huffmanBlockDecoder() func() {
+	switch f.r.(type) { // return the decode method written for f.r's concrete reader type
+	case *bytes.Buffer:
+		return f.huffmanBytesBuffer
+	case *bytes.Reader:
+		return f.huffmanBytesReader
+	case *bufio.Reader:
+		return f.huffmanBufioReader
+	case *strings.Reader:
+		return f.huffmanStringsReader
+	default:
+		return f.huffmanBlockGeneric // every other reader type uses the generic decoder
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level1.go b/vendor/github.com/klauspost/compress/flate/level1.go
new file mode 100644
index 00000000000..1e5eea3968a
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level1.go
@@ -0,0 +1,179 @@
+package flate
+
+import "fmt"
+
+// fastEncL1 is the level 1 encoder state. It embeds fastGen (whose cur and
+// hist fields Encode reads) and adds the table of candidate match
+// offsets; Encode stores each offset biased by e.cur.
+type fastEncL1 struct {
+	fastGen
+	table [tableSize]tableEntry
+}
+
+// Encode is the level 1 encoder: it appends literal and match tokens for src to dst.
+func (e *fastEncL1) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src: from here on src is e.hist, and s (from addBlock) indexes into it.
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// cv holds the 4 bytes at s; nextEmit (above) is where in src the next emitLiteral should start from.
+	cv := load3232(src, s)
+
+	for {
+		const skipLog = 5 // the probe stride grows by 1 for every 2^skipLog bytes since nextEmit
+		const doEvery = 2 // minimum stride between the first probe of successive iterations
+
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hash(cv)
+			candidate = e.table[nextHash]
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+
+			now := load6432(src, nextS)
+			e.table[nextHash] = tableEntry{offset: s + e.cur}
+			nextHash = hash(uint32(now))
+
+			offset := s - (candidate.offset - e.cur)
+			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
+				break
+			}
+
+			// Do one right away...
+			cv = uint32(now)
+			s = nextS
+			nextS++
+			candidate = e.table[nextHash]
+			now >>= 8
+			e.table[nextHash] = tableEntry{offset: s + e.cur}
+
+			offset = s - (candidate.offset - e.cur)
+			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
+				break
+			}
+			cv = uint32(now)
+			s = nextS
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			t := candidate.offset - e.cur
+			l := e.matchlenLong(s+4, t+4, src) + 4
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				emitLiteral(dst, src[nextEmit:s])
+			}
+
+			// Save the match found
+			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+			if s >= sLimit {
+				// Index first pair after match end.
+				if int(s+l+4) < len(src) {
+					cv := load3232(src, s)
+					e.table[hash(cv)] = tableEntry{offset: s + e.cur}
+				}
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6432(src, s-2)
+			o := e.cur + s - 2
+			prevHash := hash(uint32(x))
+			e.table[prevHash] = tableEntry{offset: o}
+			x >>= 16
+			currHash := hash(uint32(x))
+			candidate = e.table[currHash]
+			e.table[currHash] = tableEntry{offset: o + 2}
+
+			offset := s - (candidate.offset - e.cur)
+			if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) {
+				cv = uint32(x >> 8)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level2.go b/vendor/github.com/klauspost/compress/flate/level2.go
new file mode 100644
index 00000000000..5b986a1944e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level2.go
@@ -0,0 +1,205 @@
+package flate
+
+import "fmt"
+
+// fastEncL2 is the level 2 encoder state. It embeds fastGen (whose cur and
+// hist fields Encode reads) and adds a bTableSize-entry table of candidate
+// match offsets; Encode stores each offset biased by e.cur.
+type fastEncL2 struct {
+	fastGen
+	table [bTableSize]tableEntry
+}
+
+// Encode uses a similar algorithm to level 1, but is capable
+// of matching across blocks giving better compression at a small slowdown.
+func (e *fastEncL2) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// This check isn't in the Snappy implementation, but there, the caller
+	// instead of the callee handles this case.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src: from here on src is e.hist, and s (from addBlock) indexes into it.
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// cv holds the 4 bytes at s; nextEmit (above) is where in src the next emitLiteral should start from.
+	cv := load3232(src, s)
+	for {
+		// When should we start skipping if we haven't found matches in a long while.
+		const skipLog = 5
+		const doEvery = 2
+
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hash4u(cv, bTableBits)
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = e.table[nextHash]
+			now := load6432(src, nextS)
+			e.table[nextHash] = tableEntry{offset: s + e.cur}
+			nextHash = hash4u(uint32(now), bTableBits)
+
+			offset := s - (candidate.offset - e.cur)
+			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
+				break
+			}
+
+			// Do one right away...
+			cv = uint32(now)
+			s = nextS
+			nextS++
+			candidate = e.table[nextHash]
+			now >>= 8
+			e.table[nextHash] = tableEntry{offset: s + e.cur}
+
+			offset = s - (candidate.offset - e.cur)
+			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+				break
+			}
+			cv = uint32(now)
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			t := candidate.offset - e.cur
+			l := e.matchlenLong(s+4, t+4, src) + 4
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				emitLiteral(dst, src[nextEmit:s])
+			}
+
+			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+
+			if s >= sLimit {
+				// Index first pair after match end.
+				if int(s+l+4) < len(src) {
+					cv := load3232(src, s)
+					e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur}
+				}
+				goto emitRemainder
+			}
+
+			// Index positions i, i+2 and i+4 for every 7th i from s-l+2 up to (but excluding) s-5.
+			for i := s - l + 2; i < s-5; i += 7 {
+				x := load6432(src, int32(i))
+				nextHash := hash4u(uint32(x), bTableBits)
+				e.table[nextHash] = tableEntry{offset: e.cur + i}
+				// Skip one
+				x >>= 16
+				nextHash = hash4u(uint32(x), bTableBits)
+				e.table[nextHash] = tableEntry{offset: e.cur + i + 2}
+				// Skip one
+				x >>= 16
+				nextHash = hash4u(uint32(x), bTableBits)
+				e.table[nextHash] = tableEntry{offset: e.cur + i + 4}
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 to s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6432(src, s-2)
+			o := e.cur + s - 2
+			prevHash := hash4u(uint32(x), bTableBits)
+			prevHash2 := hash4u(uint32(x>>8), bTableBits)
+			e.table[prevHash] = tableEntry{offset: o}
+			e.table[prevHash2] = tableEntry{offset: o + 1}
+			currHash := hash4u(uint32(x>>16), bTableBits)
+			candidate = e.table[currHash]
+			e.table[currHash] = tableEntry{offset: o + 2}
+
+			offset := s - (candidate.offset - e.cur)
+			if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) {
+				cv = uint32(x >> 24)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level3.go b/vendor/github.com/klauspost/compress/flate/level3.go
new file mode 100644
index 00000000000..c22b4244a5c
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level3.go
@@ -0,0 +1,229 @@
+package flate
+
+import "fmt"
+
+// fastEncL3 is the level 3 encoder state: fastGen plus a table whose slots each hold a Cur and a Prev candidate.
+type fastEncL3 struct {
+	fastGen
+	table [tableSize]tableEntryPrev
+}
+
+// Encode uses a similar algorithm to level 2, but checks up to two candidates (Cur and Prev) per hash slot.
+func (e *fastEncL3) Encode(dst *tokens, src []byte) {
+	const (
+		inputMargin            = 8 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+			}
+			if v.Prev.offset <= minOff {
+				v.Prev.offset = 0
+			} else {
+				v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+			}
+			e.table[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// Skip if too small.
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src: from here on src is e.hist, and s (from addBlock) indexes into it.
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// cv holds the 4 bytes at s; nextEmit (above) is where in src the next emitLiteral should start from.
+	cv := load3232(src, s)
+	for {
+		const skipLog = 6 // the probe stride grows by 1 for every 2^skipLog bytes since nextEmit
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hash(cv)
+			s = nextS
+			nextS = s + 1 + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidates := e.table[nextHash]
+			now := load3232(src, nextS)
+
+			// Safe offset distance until s + 4...
+			minOffset := e.cur + s - (maxMatchOffset - 4)
+			e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}}
+
+			// Check both candidates
+			candidate = candidates.Cur
+			if candidate.offset < minOffset {
+				cv = now
+				// Previous will also be invalid, we have nothing.
+				continue
+			}
+
+			if cv == load3232(src, candidate.offset-e.cur) {
+				if candidates.Prev.offset < minOffset || cv != load3232(src, candidates.Prev.offset-e.cur) {
+					break
+				}
+				// Both match and are valid, pick longest.
+				offset := s - (candidate.offset - e.cur)
+				o2 := s - (candidates.Prev.offset - e.cur)
+				l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:])
+				if l2 > l1 {
+					candidate = candidates.Prev
+				}
+				break
+			} else {
+				// We only check if value mismatches.
+				// Offset will always be invalid in other cases.
+				candidate = candidates.Prev
+				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) {
+					break
+				}
+			}
+			cv = now
+		}
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			// t is the candidate's position in src (its stored offset minus e.cur).
+			t := candidate.offset - e.cur
+			l := e.matchlenLong(s+4, t+4, src) + 4
+
+			// Extend backwards
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				emitLiteral(dst, src[nextEmit:s])
+			}
+
+			dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+
+			if s >= sLimit {
+				t += l
+				// Index the 4 bytes right after the candidate's matched span (t was advanced by l above).
+				if int(t+4) < len(src) && t > 0 {
+					cv := load3232(src, t)
+					nextHash := hash(cv)
+					e.table[nextHash] = tableEntryPrev{
+						Prev: e.table[nextHash].Cur,
+						Cur:  tableEntry{offset: e.cur + t},
+					}
+				}
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-3 to s.
+			x := load6432(src, s-3)
+			prevHash := hash(uint32(x))
+			e.table[prevHash] = tableEntryPrev{
+				Prev: e.table[prevHash].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 3},
+			}
+			x >>= 8
+			prevHash = hash(uint32(x))
+
+			e.table[prevHash] = tableEntryPrev{
+				Prev: e.table[prevHash].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 2},
+			}
+			x >>= 8
+			prevHash = hash(uint32(x))
+
+			e.table[prevHash] = tableEntryPrev{
+				Prev: e.table[prevHash].Cur,
+				Cur:  tableEntry{offset: e.cur + s - 1},
+			}
+			x >>= 8
+			currHash := hash(uint32(x))
+			candidates := e.table[currHash]
+			cv = uint32(x)
+			e.table[currHash] = tableEntryPrev{
+				Prev: candidates.Cur,
+				Cur:  tableEntry{offset: s + e.cur},
+			}
+
+			// Check both candidates
+			candidate = candidates.Cur
+			minOffset := e.cur + s - (maxMatchOffset - 4)
+
+			if candidate.offset > minOffset && cv != load3232(src, candidate.offset-e.cur) {
+				// We only check if value mismatches.
+				// Offset will always be invalid in other cases.
+				candidate = candidates.Prev
+				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) {
+					offset := s - (candidate.offset - e.cur)
+					if offset <= maxMatchOffset {
+						continue
+					}
+				}
+			}
+			cv = uint32(x >> 8)
+			s++
+			break
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level4.go b/vendor/github.com/klauspost/compress/flate/level4.go
new file mode 100644
index 00000000000..e62f0c02b1e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level4.go
@@ -0,0 +1,212 @@
+package flate
+
+import "fmt"
+
+type fastEncL4 struct { // encoder state used by fastEncL4.Encode
+	fastGen                      // embedded; Encode uses its cur, hist, addBlock and matchlenLong
+	table  [tableSize]tableEntry // candidates indexed by the short hash (hash4x64 / hash4u)
+	bTable [tableSize]tableEntry // candidates indexed by the long hash (hash7)
+}
+
+func (e *fastEncL4) Encode(dst *tokens, src []byte) { // Encode appends literal and match tokens for src to dst.
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+	// Protect against e.cur wraparound: rebase both tables once e.cur reaches bufferReset.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntry{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down every entry above minOff; entries at or below it are reset to 0.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.bTable[i].offset = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// Input shorter than minNonLiteralBlockSize is not searched for matches;
+	// (in the Snappy implementation the caller handles this case instead).
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table; only the count is recorded in dst.n.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src: from here on it is the history buffer and s indexes into it.
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit (set above) is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 6
+		const doEvery = 1
+
+		nextS := s
+		var t int32
+		for {
+			nextHashS := hash4x64(cv, tableBits)
+			nextHashL := hash7(cv, tableBits)
+
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate, then record position s in both tables.
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur}
+			e.table[nextHashS] = entry
+			e.bTable[nextHashL] = entry
+
+			t = lCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.offset-e.cur) {
+				// We got a long match. Use that.
+				break
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
+				// Found a 4 match via the short table; look up the long candidate for nextS.
+				lCandidate = e.bTable[hash7(next, tableBits)]
+
+				// If the next long is a candidate, check if we should use that instead...
+				lOff := nextS - (lCandidate.offset - e.cur)
+				if lOff < maxMatchOffset && load3232(src, lCandidate.offset-e.cur) == uint32(next) {
+					l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:])
+					if l2 > l1 {
+						s = nextS
+						t = lCandidate.offset - e.cur
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Extend the 4-byte match as long as possible.
+		l := e.matchlenLong(s+4, t+4, src) + 4
+
+		// Extend backwards while the preceding bytes also match, without going before nextEmit.
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			emitLiteral(dst, src[nextEmit:s])
+		}
+		if debugDeflate {
+			if t >= s {
+				panic("s-t")
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			// Index first pair after match end, provided 8 bytes can still be loaded at s.
+			if int(s+8) < len(src) {
+				cv := load6432(src, s)
+				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur}
+				e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur}
+			}
+			goto emitRemainder
+		}
+
+		// Store every 3rd hash in-between (long at i and i+1, short at i+1).
+		if true {
+			i := nextS
+			if i < s-1 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur}
+				t2 := tableEntry{offset: t.offset + 1}
+				e.bTable[hash7(cv, tableBits)] = t
+				e.bTable[hash7(cv>>8, tableBits)] = t2
+				e.table[hash4u(uint32(cv>>8), tableBits)] = t2
+
+				i += 3
+				for ; i < s-1; i += 3 {
+					cv := load6432(src, i)
+					t := tableEntry{offset: i + e.cur}
+					t2 := tableEntry{offset: t.offset + 1}
+					e.bTable[hash7(cv, tableBits)] = t
+					e.bTable[hash7(cv>>8, tableBits)] = t2
+					e.table[hash4u(uint32(cv>>8), tableBits)] = t2
+				}
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first store s-1 in both tables; cv then holds the bytes at s.
+		x := load6432(src, s-1)
+		o := e.cur + s - 1
+		prevHashS := hash4x64(x, tableBits)
+		prevHashL := hash7(x, tableBits)
+		e.table[prevHashS] = tableEntry{offset: o}
+		e.bTable[prevHashL] = tableEntry{offset: o}
+		cv = x >> 8
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals; dst.n stays 0.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level5.go b/vendor/github.com/klauspost/compress/flate/level5.go
new file mode 100644
index 00000000000..d513f1ffd37
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level5.go
@@ -0,0 +1,279 @@
+package flate
+
+import "fmt"
+
+type fastEncL5 struct { // encoder state used by fastEncL5.Encode
+	fastGen                          // embedded; Encode uses its cur, hist, addBlock, matchlen and matchlenLong
+	table  [tableSize]tableEntry     // candidates indexed by the short hash (hash4x64 / hash4u)
+	bTable [tableSize]tableEntryPrev // long-hash (hash7) buckets holding a current and a previous candidate
+}
+
+func (e *fastEncL5) Encode(dst *tokens, src []byte) { // Encode appends literal and match tokens for src to dst.
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound: rebase both tables once e.cur reaches bufferReset.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down every entry above minOff; entries at or below it are reset to 0.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+				v.Prev.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+				if v.Prev.offset <= minOff {
+					v.Prev.offset = 0
+				} else {
+					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+				}
+			}
+			e.bTable[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// Input shorter than minNonLiteralBlockSize is not searched for matches;
+	// (in the Snappy implementation the caller handles this case instead).
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table; only the count is recorded in dst.n.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src: from here on it is the history buffer and s indexes into it.
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit (set above) is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	for {
+		const skipLog = 6
+		const doEvery = 1
+
+		nextS := s
+		var l int32
+		var t int32
+		for {
+			nextHashS := hash4x64(cv, tableBits)
+			nextHashL := hash7(cv, tableBits)
+
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate, then record position s in both tables.
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur}
+			e.table[nextHashS] = entry
+			eLong := &e.bTable[nextHashL]
+			eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+			nextHashS = hash4x64(next, tableBits)
+			nextHashL = hash7(next, tableBits)
+
+			t = lCandidate.Cur.offset - e.cur
+			if s-t < maxMatchOffset {
+				if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+					t2 := lCandidate.Prev.offset - e.cur
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+						l = e.matchlen(s+4, t+4, src) + 4
+						ml1 := e.matchlen(s+4, t2+4, src) + 4
+						if ml1 > l {
+							t = t2
+							l = ml1
+							break
+						}
+					}
+					break
+				}
+				t = lCandidate.Prev.offset - e.cur
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+					break
+				}
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
+				// Found a 4 match via the short table.
+				l = e.matchlen(s+4, t+4, src) + 4
+				lCandidate = e.bTable[nextHashL]
+				// Store the next match
+
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+				eLong := &e.bTable[nextHashL]
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+				// If the next long is a candidate and gives a longer match, use that...
+				t2 := lCandidate.Cur.offset - e.cur
+				if nextS-t2 < maxMatchOffset {
+					if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+					// If the previous long is a candidate and gives a longer match, use that...
+					t2 = lCandidate.Prev.offset - e.cur
+					if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Extend the 4-byte match as long as possible (l == 0 means no length was measured above).
+		if l == 0 {
+			l = e.matchlenLong(s+4, t+4, src) + 4
+		} else if l == maxMatchLength {
+			l += e.matchlenLong(s+l, t+l, src)
+		}
+		// Extend backwards while the preceding bytes also match, without going before nextEmit.
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			emitLiteral(dst, src[nextEmit:s])
+		}
+		if debugDeflate {
+			if t >= s {
+				panic(fmt.Sprintln("s-t", s, t))
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", s-t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			goto emitRemainder
+		}
+
+		// Store every 3rd hash in-between.
+		if true {
+			const hashEvery = 3
+			i := s - l + 1
+			if i < s-1 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur}
+				e.table[hash4x64(cv, tableBits)] = t
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// Do a long at i+1
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				eLong = &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+
+				// We only have enough bits for a short entry at i+2
+				cv >>= 8
+				t = tableEntry{offset: t.offset + 1}
+				e.table[hash4x64(cv, tableBits)] = t
+
+				// Skip one - otherwise we risk hitting 's'
+				i += 4
+				for ; i < s-1; i += hashEvery {
+					cv := load6432(src, i)
+					t := tableEntry{offset: i + e.cur}
+					t2 := tableEntry{offset: t.offset + 1}
+					eLong := &e.bTable[hash7(cv, tableBits)]
+					eLong.Cur, eLong.Prev = t, eLong.Cur
+					e.table[hash4u(uint32(cv>>8), tableBits)] = t2
+				}
+			}
+		}
+
+		// We could immediately start working at s now, but to improve
+		// compression we first store s-1 in both tables; cv then holds the bytes at s.
+		x := load6432(src, s-1)
+		o := e.cur + s - 1
+		prevHashS := hash4x64(x, tableBits)
+		prevHashL := hash7(x, tableBits)
+		e.table[prevHashS] = tableEntry{offset: o}
+		eLong := &e.bTable[prevHashL]
+		eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur
+		cv = x >> 8
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals; dst.n stays 0.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/level6.go b/vendor/github.com/klauspost/compress/flate/level6.go
new file mode 100644
index 00000000000..a52c80ea456
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/level6.go
@@ -0,0 +1,282 @@
+package flate
+
+import "fmt"
+
+type fastEncL6 struct { // encoder state used by fastEncL6.Encode
+	fastGen                          // embedded; Encode uses its cur, hist, addBlock, matchlen and matchlenLong
+	table  [tableSize]tableEntry     // candidates indexed by the short hash (hash4x64)
+	bTable [tableSize]tableEntryPrev // long-hash (hash7) buckets holding a current and a previous candidate
+}
+
+func (e *fastEncL6) Encode(dst *tokens, src []byte) { // Encode appends literal and match tokens for src to dst.
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
+	// Protect against e.cur wraparound: rebase both tables once e.cur reaches bufferReset.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.bTable[:] {
+				e.bTable[i] = tableEntryPrev{}
+			}
+			e.cur = maxMatchOffset
+			break
+		}
+		// Shift down every entry above minOff; entries at or below it are reset to 0.
+		minOff := e.cur + int32(len(e.hist)) - maxMatchOffset
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v <= minOff {
+				v = 0
+			} else {
+				v = v - e.cur + maxMatchOffset
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.bTable[:] {
+			v := e.bTable[i]
+			if v.Cur.offset <= minOff {
+				v.Cur.offset = 0
+				v.Prev.offset = 0
+			} else {
+				v.Cur.offset = v.Cur.offset - e.cur + maxMatchOffset
+				if v.Prev.offset <= minOff {
+					v.Prev.offset = 0
+				} else {
+					v.Prev.offset = v.Prev.offset - e.cur + maxMatchOffset
+				}
+			}
+			e.bTable[i] = v
+		}
+		e.cur = maxMatchOffset
+	}
+
+	s := e.addBlock(src)
+
+	// Input shorter than minNonLiteralBlockSize is not searched for matches;
+	// (in the Snappy implementation the caller handles this case instead).
+	if len(src) < minNonLiteralBlockSize {
+		// We do not fill the token table; only the count is recorded in dst.n.
+		// This will be picked up by caller.
+		dst.n = uint16(len(src))
+		return
+	}
+
+	// Override src: from here on it is the history buffer and s indexes into it.
+	src = e.hist
+	nextEmit := s
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int32(len(src) - inputMargin)
+
+	// nextEmit (set above) is where in src the next emitLiteral should start from.
+	cv := load6432(src, s)
+	// repeat is the last match offset (s-t). NOTE(review): comment said "MUST be > 1", yet it starts at 1.
+	repeat := int32(1)
+	for {
+		const skipLog = 7
+		const doEvery = 1
+
+		nextS := s
+		var l int32
+		var t int32
+		for {
+			nextHashS := hash4x64(cv, tableBits)
+			nextHashL := hash7(cv, tableBits)
+			s = nextS
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			// Fetch a short+long candidate, then record position s in both tables.
+			sCandidate := e.table[nextHashS]
+			lCandidate := e.bTable[nextHashL]
+			next := load6432(src, nextS)
+			entry := tableEntry{offset: s + e.cur}
+			e.table[nextHashS] = entry
+			eLong := &e.bTable[nextHashL]
+			eLong.Cur, eLong.Prev = entry, eLong.Cur
+
+			// Calculate hashes of 'next'
+			nextHashS = hash4x64(next, tableBits)
+			nextHashL = hash7(next, tableBits)
+
+			t = lCandidate.Cur.offset - e.cur
+			if s-t < maxMatchOffset {
+				if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) {
+					// Long candidate matches at least 4 bytes.
+
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+					// Check the previous long candidate as well.
+					t2 := lCandidate.Prev.offset - e.cur
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+						l = e.matchlen(s+4, t+4, src) + 4
+						ml1 := e.matchlen(s+4, t2+4, src) + 4
+						if ml1 > l {
+							t = t2
+							l = ml1
+							break
+						}
+					}
+					break
+				}
+				// Current value did not match, but check if previous long value does.
+				t = lCandidate.Prev.offset - e.cur
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
+					// Store the next match
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+					eLong := &e.bTable[nextHashL]
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+					break
+				}
+			}
+
+			t = sCandidate.offset - e.cur
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
+				// Found a 4 match via the short table.
+				l = e.matchlen(s+4, t+4, src) + 4
+
+				// Look up next long candidate (at nextS)
+				lCandidate = e.bTable[nextHashL]
+
+				// Store the next match
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
+				eLong := &e.bTable[nextHashL]
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
+
+				// Check repeat at s + repOff, i.e. the previous match offset applied one byte ahead.
+				const repOff = 1
+				t2 := s - repeat + repOff
+				if load3232(src, t2) == uint32(cv>>(8*repOff)) {
+					ml := e.matchlen(s+4+repOff, t2+4, src) + 4
+					if ml > l {
+						t = t2
+						l = ml
+						s += repOff
+						// Not worth checking more.
+						break
+					}
+				}
+
+				// If the next long is a candidate and gives a longer match, use that...
+				t2 = lCandidate.Cur.offset - e.cur
+				if nextS-t2 < maxMatchOffset {
+					if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							// This is ok, but check previous as well.
+						}
+					}
+					// If the previous long is a candidate and gives a longer match, use that...
+					t2 = lCandidate.Prev.offset - e.cur
+					if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) {
+						ml := e.matchlen(nextS+4, t2+4, src) + 4
+						if ml > l {
+							t = t2
+							s = nextS
+							l = ml
+							break
+						}
+					}
+				}
+				break
+			}
+			cv = next
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+
+		// Extend the 4-byte match as long as possible (l == 0 means no length was measured above).
+		if l == 0 {
+			l = e.matchlenLong(s+4, t+4, src) + 4
+		} else if l == maxMatchLength {
+			l += e.matchlenLong(s+l, t+l, src)
+		}
+
+		// Extend backwards while the preceding bytes also match, without going before nextEmit.
+		for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+		if nextEmit < s {
+			emitLiteral(dst, src[nextEmit:s])
+		}
+		if false { // sanity checks, compiled out here (levels 4 and 5 gate them on debugDeflate)
+			if t >= s {
+				panic(fmt.Sprintln("s-t", s, t))
+			}
+			if (s - t) > maxMatchOffset {
+				panic(fmt.Sprintln("mmo", s-t))
+			}
+			if l < baseMatchLength {
+				panic("bml")
+			}
+		}
+
+		dst.AddMatchLong(l, uint32(s-t-baseMatchOffset))
+		repeat = s - t
+		s += l
+		nextEmit = s
+		if nextS >= s {
+			s = nextS + 1
+		}
+
+		if s >= sLimit {
+			// Index after match end: every second position while 8 bytes can still be loaded.
+			for i := nextS + 1; i < int32(len(src))-8; i += 2 {
+				cv := load6432(src, i)
+				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur}
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur
+			}
+			goto emitRemainder
+		}
+
+		// Store every long hash in-between and every second short.
+		if true {
+			for i := nextS + 1; i < s-1; i += 2 {
+				cv := load6432(src, i)
+				t := tableEntry{offset: i + e.cur}
+				t2 := tableEntry{offset: t.offset + 1}
+				eLong := &e.bTable[hash7(cv, tableBits)]
+				eLong2 := &e.bTable[hash7(cv>>8, tableBits)]
+				e.table[hash4x64(cv, tableBits)] = t
+				eLong.Cur, eLong.Prev = t, eLong.Cur
+				eLong2.Cur, eLong2.Prev = t2, eLong2.Cur
+			}
+		}
+
+		// No further table update is done here; load the 8 bytes at s and
+		// resume the search from there.
+		cv = load6432(src, s)
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals; dst.n stays 0.
+		if dst.n == 0 {
+			return
+		}
+
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/stateless.go b/vendor/github.com/klauspost/compress/flate/stateless.go
new file mode 100644
index 00000000000..53e89912463
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/stateless.go
@@ -0,0 +1,297 @@
+package flate
+
+import (
+	"io"
+	"math"
+	"sync"
+)
+
+const (
+	maxStatelessBlock = math.MaxInt16 // upper bound on dict+input bytes per statelessEnc call (it indexes with int16)
+	// The dictionary shares the maxStatelessBlock budget with the input, so limit it.
+	maxStatelessDict = 8 << 10
+
+	slTableBits  = 13                // log2 of the statelessEnc hash table size
+	slTableSize  = 1 << slTableBits  // number of entries in the statelessEnc hash table
+	slTableShift = 32 - slTableBits  // right shift applied by hashSL to keep slTableBits bits
+)
+
+type statelessWriter struct { // io.WriteCloser returned by NewStatelessWriter
+	dst    io.Writer // destination handed to StatelessDeflate on every Write/Close
+	closed bool      // set by Close, cleared by Reset
+}
+
+func (s *statelessWriter) Close() error { // Close emits the final block once; repeated calls return nil.
+	if s.closed {
+		return nil
+	}
+	s.closed = true
+	// Emit the EOF block (no input, eof=true).
+	return StatelessDeflate(s.dst, nil, true, nil)
+}
+
+func (s *statelessWriter) Write(p []byte) (n int, err error) { // Write compresses p on its own and writes it to s.dst.
+	err = StatelessDeflate(s.dst, p, false, nil) // eof=false, no dictionary
+	if err != nil {
+		return 0, err // on failure zero bytes are reported as written
+	}
+	return len(p), nil
+}
+
+func (s *statelessWriter) Reset(w io.Writer) { // Reset retargets the writer to w and reopens it.
+	s.dst = w
+	s.closed = false // allow Close to emit an EOF block again
+}
+
+// NewStatelessWriter returns a writer that compresses each Write call
+// independently, without maintaining any state between Write calls.
+// No memory is kept between Write calls,
+// but compression and speed will be suboptimal.
+// Because of this, the size of actual Write calls will affect output size.
+func NewStatelessWriter(dst io.Writer) io.WriteCloser {
+	return &statelessWriter{dst: dst}
+}
+
+// bitWriterPool holds *huffmanBitWriter values that StatelessDeflate reuses across calls.
+var bitWriterPool = sync.Pool{
+	New: func() interface{} {
+		return newHuffmanBitWriter(nil) // created without a destination; callers reset it before use
+	},
+}
+
+// StatelessDeflate compresses in directly to out without retaining state; eof marks the last block as final.
+// When returning everything will be flushed.
+// Up to 8KB of an optional dictionary can be given, which is presumed to precede the block.
+// Longer dictionaries will be truncated and will still produce valid output.
+// Sending nil dictionary is perfectly fine.
+func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
+	var dst tokens
+	bw := bitWriterPool.Get().(*huffmanBitWriter)
+	bw.reset(out)
+	defer func() {
+		// don't keep a reference to our output
+		bw.reset(nil)
+		bitWriterPool.Put(bw)
+	}()
+	if eof && len(in) == 0 {
+		// Just write an EOF block.
+		// Could be faster...
+		bw.writeStoredHeader(0, true)
+		bw.flush()
+		return bw.err
+	}
+
+	// Truncate dict to its last maxStatelessDict bytes.
+	if len(dict) > maxStatelessDict {
+		dict = dict[len(dict)-maxStatelessDict:]
+	}
+
+	for len(in) > 0 {
+		todo := in
+		if len(todo) > maxStatelessBlock-len(dict) {
+			todo = todo[:maxStatelessBlock-len(dict)]
+		}
+		in = in[len(todo):]
+		uncompressed := todo
+		if len(dict) > 0 {
+			// combine dict and source into one buffer, dict first
+			bufLen := len(todo) + len(dict)
+			combined := make([]byte, bufLen)
+			copy(combined, dict)
+			copy(combined[len(dict):], todo)
+			todo = combined
+		}
+		// Compress, starting after the dictionary prefix.
+		statelessEnc(&dst, todo, int16(len(dict)))
+		isEof := eof && len(in) == 0
+
+		if dst.n == 0 {
+			bw.writeStoredHeader(len(uncompressed), isEof)
+			if bw.err != nil {
+				return bw.err
+			}
+			bw.writeBytes(uncompressed)
+		} else if int(dst.n) > len(uncompressed)-len(uncompressed)>>4 {
+			// If we removed less than 1/16th, huffman compress the block.
+			bw.writeBlockHuff(isEof, uncompressed, len(in) == 0)
+		} else {
+			bw.writeBlockDynamic(&dst, isEof, uncompressed, len(in) == 0)
+		}
+		if len(in) > 0 {
+			// More input follows: keep the last maxStatelessDict bytes as the next dictionary.
+			dict = todo[len(todo)-maxStatelessDict:]
+			dst.Reset()
+		}
+		if bw.err != nil {
+			return bw.err
+		}
+	}
+	if !eof {
+		// Align, only a stored block can do that.
+		bw.writeStoredHeader(0, false)
+	}
+	bw.flush()
+	return bw.err
+}
+
+func hashSL(u uint32) uint32 { // hashSL maps a 4-byte value to a table index of slTableBits bits.
+	return (u * 0x1e35a7bd) >> slTableShift // multiply, then keep the top slTableBits bits
+}
+
+func load3216(b []byte, i int16) uint32 { // load3216 reads 4 bytes of b starting at index i.
+	// Reslice to exactly 4 bytes to help the compiler eliminate bounds checks so the read can be done in a single read.
+	b = b[i:]
+	b = b[:4]
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 // little-endian assembly
+}
+
+func load6416(b []byte, i int16) uint64 { // load6416 reads 8 bytes of b starting at index i, little-endian.
+	// Reslice to exactly 8 bytes to help the compiler eliminate bounds checks so the read can be done in a single read.
+	b = b[i:]
+	b = b[:8]
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func statelessEnc(dst *tokens, src []byte, startAt int16) { // statelessEnc tokenizes src[startAt:] into dst; src[:startAt] only seeds the table.
+	const (
+		inputMargin            = 12 - 1
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	type tableEntry struct {
+		offset int16
+	}
+
+	var table [slTableSize]tableEntry
+
+	// Too little input after startAt to search for matches
+	// (in the Snappy implementation the caller handles this case instead).
+	if len(src)-int(startAt) < minNonLiteralBlockSize {
+		// We do not fill the token table; dst.n is set to 0.
+		// This will be picked up by caller.
+		dst.n = 0
+		return
+	}
+	// Index every position before startAt using a rolling 4-byte value.
+	if startAt > 0 {
+		cv := load3232(src, 0)
+		for i := int16(0); i < startAt; i++ {
+			table[hashSL(cv)] = tableEntry{offset: i}
+			cv = (cv >> 8) | (uint32(src[i+4]) << 24)
+		}
+	}
+
+	s := startAt + 1
+	nextEmit := startAt
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := int16(len(src) - inputMargin)
+
+	// nextEmit (set above) is where in src the next emitLiteral should start from.
+	cv := load3216(src, s)
+
+	for {
+		const skipLog = 5
+		const doEvery = 2
+
+		nextS := s
+		var candidate tableEntry
+		for {
+			nextHash := hashSL(cv)
+			candidate = table[nextHash]
+			nextS = s + doEvery + (s-nextEmit)>>skipLog
+			if nextS > sLimit || nextS <= 0 { // nextS <= 0 also stops if the int16 arithmetic wrapped
+				goto emitRemainder
+			}
+
+			now := load6416(src, nextS)
+			table[nextHash] = tableEntry{offset: s}
+			nextHash = hashSL(uint32(now))
+
+			if cv == load3216(src, candidate.offset) {
+				table[nextHash] = tableEntry{offset: nextS}
+				break
+			}
+
+			// Do one right away, reusing the bytes already loaded at nextS.
+			cv = uint32(now)
+			s = nextS
+			nextS++
+			candidate = table[nextHash]
+			now >>= 8
+			table[nextHash] = tableEntry{offset: s}
+
+			if cv == load3216(src, candidate.offset) {
+				table[nextHash] = tableEntry{offset: nextS}
+				break
+			}
+			cv = uint32(now)
+			s = nextS
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+
+			// Extend the 4-byte match as long as possible.
+			t := candidate.offset
+			l := int16(matchLen(src[s+4:], src[t+4:]) + 4)
+
+			// Extend backwards while the preceding bytes also match, without going before nextEmit.
+			for t > 0 && s > nextEmit && src[t-1] == src[s-1] {
+				s--
+				t--
+				l++
+			}
+			if nextEmit < s {
+				emitLiteral(dst, src[nextEmit:s])
+			}
+
+			// Save the match found
+			dst.AddMatchLong(int32(l), uint32(s-t-baseMatchOffset))
+			s += l
+			nextEmit = s
+			if nextS >= s {
+				s = nextS + 1
+			}
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-2 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load6416(src, s-2)
+			o := s - 2
+			prevHash := hashSL(uint32(x))
+			table[prevHash] = tableEntry{offset: o}
+			x >>= 16
+			currHash := hashSL(uint32(x))
+			candidate = table[currHash]
+			table[currHash] = tableEntry{offset: o + 2}
+
+			if uint32(x) != load3216(src, candidate.offset) {
+				cv = uint32(x >> 8)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if int(nextEmit) < len(src) {
+		// If nothing was added, don't encode literals; dst.n stays 0.
+		if dst.n == 0 {
+			return
+		}
+		emitLiteral(dst, src[nextEmit:])
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/flate/token.go b/vendor/github.com/klauspost/compress/flate/token.go
new file mode 100644
index 00000000000..f9abf606d67
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/flate/token.go
@@ -0,0 +1,375 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"math"
+)
+
+const (
+	// 2 bits:   type   0 = literal  1=EOF  2=Match   3=Unused
+	// 8 bits:   xlength = length - MIN_MATCH_LENGTH
+	// 22 bits   xoffset = offset - MIN_OFFSET_SIZE, or literal
+	lengthShift = 22
+	offsetMask  = 1<<lengthShift - 1
+	typeMask    = 3 << 30
+	literalType = 0 << 30
+	matchType   = 1 << 30
+)
+
+// The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
+// is lengthCodes[length - MIN_MATCH_LENGTH]
+var lengthCodes = [256]uint8{
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
+	9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
+	13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
+	15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+	17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
+	18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
+	19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
+	20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+	21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
+	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+	22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+	23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 28,
+}
+
+// lengthCodes1 is length codes, but starting at 1.
+var lengthCodes1 = [256]uint8{
+	1, 2, 3, 4, 5, 6, 7, 8, 9, 9,
+	10, 10, 11, 11, 12, 12, 13, 13, 13, 13,
+	14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
+	16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
+	18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
+	19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
+	20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
+	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+	22, 22, 22, 22, 22, 22, 23, 23, 23, 23,
+	23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+	23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 29,
+}
+
+var offsetCodes = [256]uint32{
+	0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
+	8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
+	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+}
+
+// offsetCodes14 are offsetCodes, but with 14 added.
+var offsetCodes14 = [256]uint32{
+	14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+	22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+}
+
+type token uint32
+
+type tokens struct {
+	nLits     int
+	extraHist [32]uint16  // codes 256->maxnumlit
+	offHist   [32]uint16  // offset codes
+	litHist   [256]uint16 // codes 0->255
+	n         uint16      // Must be able to contain maxStoreBlockSize
+	tokens    [maxStoreBlockSize + 1]token
+}
+
+func (t *tokens) Reset() {
+	if t.n == 0 {
+		return
+	}
+	t.n = 0
+	t.nLits = 0
+	for i := range t.litHist[:] {
+		t.litHist[i] = 0
+	}
+	for i := range t.extraHist[:] {
+		t.extraHist[i] = 0
+	}
+	for i := range t.offHist[:] {
+		t.offHist[i] = 0
+	}
+}
+
+func (t *tokens) Fill() {
+	if t.n == 0 {
+		return
+	}
+	for i, v := range t.litHist[:] {
+		if v == 0 {
+			t.litHist[i] = 1
+			t.nLits++
+		}
+	}
+	for i, v := range t.extraHist[:literalCount-256] {
+		if v == 0 {
+			t.nLits++
+			t.extraHist[i] = 1
+		}
+	}
+	for i, v := range t.offHist[:offsetCodeCount] {
+		if v == 0 {
+			t.offHist[i] = 1
+		}
+	}
+}
+
+func indexTokens(in []token) tokens {
+	var t tokens
+	t.indexTokens(in)
+	return t
+}
+
+func (t *tokens) indexTokens(in []token) {
+	t.Reset()
+	for _, tok := range in {
+		if tok < matchType {
+			t.AddLiteral(tok.literal())
+			continue
+		}
+		t.AddMatch(uint32(tok.length()), tok.offset())
+	}
+}
+
+// emitLiteral appends each byte of lit to dst as a literal token and updates the literal histogram.
+func emitLiteral(dst *tokens, lit []byte) {
+	ol := int(dst.n)
+	for i, v := range lit {
+		dst.tokens[(i+ol)&maxStoreBlockSize] = token(v)
+		dst.litHist[v]++
+	}
+	dst.n += uint16(len(lit))
+	dst.nLits += len(lit)
+}
+
+func (t *tokens) AddLiteral(lit byte) {
+	t.tokens[t.n] = token(lit)
+	t.litHist[lit]++
+	t.n++
+	t.nLits++
+}
+
+// from https://stackoverflow.com/a/28730362
+func mFastLog2(val float32) float32 {
+	ux := int32(math.Float32bits(val))
+	log2 := (float32)(((ux >> 23) & 255) - 128)
+	ux &= -0x7f800001
+	ux += 127 << 23
+	uval := math.Float32frombits(uint32(ux))
+	log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759
+	return log2
+}
+
+// EstimatedBits returns an estimated minimum size, in bits, of an *optimal*
+// compression of the block. The estimate is the approximate Shannon entropy
+// of the token histograms plus the extra bits for length and offset codes.
+func (t *tokens) EstimatedBits() int {
+	shannon := float32(0)
+	bits := int(0)
+	nMatches := 0
+	if t.nLits > 0 {
+		invTotal := 1.0 / float32(t.nLits)
+		for _, v := range t.litHist[:] {
+			if v > 0 {
+				n := float32(v)
+				shannon += -mFastLog2(n*invTotal) * n
+			}
+		}
+		// Just add 15 for EOB
+		shannon += 15
+		for i, v := range t.extraHist[1 : literalCount-256] {
+			if v > 0 {
+				n := float32(v)
+				shannon += -mFastLog2(n*invTotal) * n
+				bits += int(lengthExtraBits[i&31]) * int(v)
+				nMatches += int(v)
+			}
+		}
+	}
+	if nMatches > 0 {
+		invTotal := 1.0 / float32(nMatches)
+		for i, v := range t.offHist[:offsetCodeCount] {
+			if v > 0 {
+				n := float32(v)
+				shannon += -mFastLog2(n*invTotal) * n
+				bits += int(offsetExtraBits[i&31]) * int(v)
+			}
+		}
+	}
+	return int(shannon) + bits
+}
+
+// AddMatch adds a match to the tokens.
+// This function is very sensitive to inlining and right on the border.
+func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
+	if debugDeflate {
+		if xlength >= maxMatchLength+baseMatchLength {
+			panic(fmt.Errorf("invalid length: %v", xlength))
+		}
+		if xoffset >= maxMatchOffset+baseMatchOffset {
+			panic(fmt.Errorf("invalid offset: %v", xoffset))
+		}
+	}
+	t.nLits++
+	lengthCode := lengthCodes1[uint8(xlength)] & 31
+	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
+	t.extraHist[lengthCode]++
+	t.offHist[offsetCode(xoffset)&31]++
+	t.n++
+}
+
+// AddMatchLong adds a match to the tokens, potentially longer than max match length.
+// Length should NOT have the base subtracted, only offset should.
+func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
+	if debugDeflate {
+		if xoffset >= maxMatchOffset+baseMatchOffset {
+			panic(fmt.Errorf("invalid offset: %v", xoffset))
+		}
+	}
+	oc := offsetCode(xoffset) & 31
+	for xlength > 0 {
+		xl := xlength
+		if xl > 258 {
+			// We need to have at least baseMatchLength left over for next loop.
+			xl = 258 - baseMatchLength
+		}
+		xlength -= xl
+		xl -= 3
+		t.nLits++
+		lengthCode := lengthCodes1[uint8(xl)] & 31
+		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
+		t.extraHist[lengthCode]++
+		t.offHist[oc]++
+		t.n++
+	}
+}
+
+func (t *tokens) AddEOB() {
+	t.tokens[t.n] = token(endBlockMarker)
+	t.extraHist[0]++
+	t.n++
+}
+
+func (t *tokens) Slice() []token {
+	return t.tokens[:t.n]
+}
+
+// VarInt returns the tokens as varint encoded bytes.
+func (t *tokens) VarInt() []byte {
+	var b = make([]byte, binary.MaxVarintLen32*int(t.n))
+	var off int
+	for _, v := range t.tokens[:t.n] {
+		off += binary.PutUvarint(b[off:], uint64(v))
+	}
+	return b[:off]
+}
+
+// FromVarInt restores t to the varint encoded tokens provided.
+// Any data in t is removed.
+func (t *tokens) FromVarInt(b []byte) error {
+	var buf = bytes.NewReader(b)
+	var toks []token
+	for {
+		r, err := binary.ReadUvarint(buf)
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			return err
+		}
+		toks = append(toks, token(r))
+	}
+	t.indexTokens(toks)
+	return nil
+}
+
+// Returns the type of a token
+func (t token) typ() uint32 { return uint32(t) & typeMask }
+
+// Returns the literal of a literal token
+func (t token) literal() uint8 { return uint8(t) }
+
+// Returns the extra offset of a match token
+func (t token) offset() uint32 { return uint32(t) & offsetMask }
+
+func (t token) length() uint8 { return uint8(t >> lengthShift) }
+
+// The code is never more than 8 bits, but is returned as uint32 for convenience.
+func lengthCode(len uint8) uint32 { return uint32(lengthCodes[len]) }
+
+// Returns the offset code corresponding to a specific offset
+func offsetCode(off uint32) uint32 {
+	if false {
+		if off < uint32(len(offsetCodes)) {
+			return offsetCodes[off&255]
+		} else if off>>7 < uint32(len(offsetCodes)) {
+			return offsetCodes[(off>>7)&255] + 14
+		} else {
+			return offsetCodes[(off>>14)&255] + 28
+		}
+	}
+	if off < uint32(len(offsetCodes)) {
+		return offsetCodes[uint8(off)]
+	}
+	return offsetCodes14[uint8(off>>7)]
+}
diff --git a/vendor/github.com/klauspost/compress/fse/README.md b/vendor/github.com/klauspost/compress/fse/README.md
new file mode 100644
index 00000000000..ea7324da671
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/fse/README.md
@@ -0,0 +1,79 @@
+# Finite State Entropy
+
+This package provides Finite State Entropy encoding and decoding.
+            
+Finite State Entropy (also referenced as [tANS](https://en.wikipedia.org/wiki/Asymmetric_numeral_systems#tANS)) 
+encoding provides a fast near-optimal symbol encoding/decoding
+for byte blocks as implemented in [zstandard](https://github.com/facebook/zstd).
+
+This can be used for compressing input with a lot of similar input values to the smallest number of bytes.
+This does not perform any multi-byte [dictionary coding](https://en.wikipedia.org/wiki/Dictionary_coder) as LZ coders do,
+but it can be used as a secondary step to compressors (like Snappy) that do not do entropy encoding. 
+
+* [Godoc documentation](https://godoc.org/github.com/klauspost/compress/fse)
+
+## News
+
+ * Feb 2018: First implementation released. Consider this beta software for now.
+
+# Usage
+
+This package provides a low level interface that allows compressing single independent blocks. 
+
+Each block is separate, and there are no built-in integrity checks. 
+This means that the caller should keep track of block sizes and also do checksums if needed.  
+
+Compressing a block is done via the [`Compress`](https://godoc.org/github.com/klauspost/compress/fse#Compress) function.
+You must provide input and will receive the output and maybe an error.
+
+These error values can be returned:
+
+| Error               | Description                                                                 |
+|---------------------|-----------------------------------------------------------------------------|
+| `<nil>`             | Everything ok, output is returned                                           |
+| `ErrIncompressible` | Returned when input is judged to be too hard to compress                    |
+| `ErrUseRLE`         | Returned from the compressor when the input is a single byte value repeated |
+| `(error)`           | An internal error occurred.                                                 |
+
+As can be seen above there are errors that will be returned even under normal operation so it is important to handle these.
+
+To reduce allocations you can provide a [`Scratch`](https://godoc.org/github.com/klauspost/compress/fse#Scratch) object 
+that can be re-used for successive calls. Both compression and decompression accept a `Scratch` object, and the same 
+object can be used for both.   
+
+Be aware that when re-using a `Scratch` object the *output* buffer is also re-used, so if you are still using the previous output
+you must set the `Out` field in the scratch to nil. The same buffer is used for compression and decompression output.
+
+Decompressing is done by calling the [`Decompress`](https://godoc.org/github.com/klauspost/compress/fse#Decompress) function.
+You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
+your input was likely corrupted. 
+
+It is important to note that a successful decoding does *not* mean your output matches your original input. 
+There are no integrity checks, so relying on errors from the decompressor does not assure your data is valid.
+
+For more detailed usage, see examples in the [godoc documentation](https://godoc.org/github.com/klauspost/compress/fse#pkg-examples).
+
+# Performance
+
+Many factors affect speed. Block sizes and compressibility of the material are primary factors.  
+All compression functions are currently only running on the calling goroutine so only one core will be used per block.  
+
+The compressor is significantly faster if symbols are kept as small as possible. The highest byte value of the input
+is used to reduce some of the processing, so if all your input is above byte value 64 for instance, it may be 
+beneficial to transpose all your input values down by 64.   
+
+With moderate block sizes around 64k, speeds are typically 200MB/s per core for compression and 
+around 300MB/s for decompression. 
+
+The same hardware typically does Huffman (deflate) encoding at 125MB/s and decompression at 100MB/s. 
+
+# Plans
+
+At one point, more internals will be exposed to facilitate more "expert" usage of the components. 
+
+A streaming interface is also likely to be implemented. Likely compatible with [FSE stream format](https://github.com/Cyan4973/FiniteStateEntropy/blob/dev/programs/fileio.c#L261).  
+
+# Contributing
+
+Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
+changes will likely not be accepted. If in doubt open an issue before writing the PR.  
\ No newline at end of file
diff --git a/vendor/github.com/klauspost/compress/fse/bitreader.go b/vendor/github.com/klauspost/compress/fse/bitreader.go
new file mode 100644
index 00000000000..b9db204f59d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/fse/bitreader.go
@@ -0,0 +1,107 @@
+// Copyright 2018 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
+
+package fse
+
+import (
+	"errors"
+	"io"
+)
+
+// bitReader reads a bitstream in reverse.
+// The last set bit indicates the start of the stream and is used
+// for aligning the input.
+type bitReader struct {
+	in       []byte
+	off      uint // next byte to read is at in[off - 1]
+	value    uint64
+	bitsRead uint8
+}
+
+// init initializes and resets the bit reader.
+func (b *bitReader) init(in []byte) error {
+	if len(in) < 1 {
+		return errors.New("corrupt stream: too short")
+	}
+	b.in = in
+	b.off = uint(len(in))
+	// The highest bit of the last byte indicates where to start
+	v := in[len(in)-1]
+	if v == 0 {
+		return errors.New("corrupt stream, did not find end of stream")
+	}
+	b.bitsRead = 64
+	b.value = 0
+	b.fill()
+	b.fill()
+	b.bitsRead += 8 - uint8(highBits(uint32(v)))
+	return nil
+}
+
+// getBits will return n bits. n can be 0.
+func (b *bitReader) getBits(n uint8) uint16 {
+	if n == 0 || b.bitsRead >= 64 {
+		return 0
+	}
+	return b.getBitsFast(n)
+}
+
+// getBitsFast requires that at least one bit is requested every time.
+// There are no checks if the buffer is filled.
+func (b *bitReader) getBitsFast(n uint8) uint16 {
+	const regMask = 64 - 1
+	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
+	b.bitsRead += n
+	return v
+}
+
+// fillFast() will make sure at least 32 bits are available.
+// There must be at least 4 bytes available.
+func (b *bitReader) fillFast() {
+	if b.bitsRead < 32 {
+		return
+	}
+	// Do single re-slice to avoid bounds checks.
+	v := b.in[b.off-4 : b.off]
+	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	b.value = (b.value << 32) | uint64(low)
+	b.bitsRead -= 32
+	b.off -= 4
+}
+
+// fill() will make sure at least 32 bits are available.
+func (b *bitReader) fill() {
+	if b.bitsRead < 32 {
+		return
+	}
+	if b.off > 4 {
+		v := b.in[b.off-4 : b.off]
+		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		b.value = (b.value << 32) | uint64(low)
+		b.bitsRead -= 32
+		b.off -= 4
+		return
+	}
+	for b.off > 0 {
+		b.value = (b.value << 8) | uint64(b.in[b.off-1])
+		b.bitsRead -= 8
+		b.off--
+	}
+}
+
+// finished returns true if all bits have been read from the bit stream.
+func (b *bitReader) finished() bool {
+	return b.off == 0 && b.bitsRead >= 64
+}
+
+// close releases the input and returns an error if out-of-buffer reads occurred.
+func (b *bitReader) close() error {
+	// Release reference.
+	b.in = nil
+	if b.bitsRead > 64 {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/fse/bitwriter.go b/vendor/github.com/klauspost/compress/fse/bitwriter.go
new file mode 100644
index 00000000000..43e463611b1
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/fse/bitwriter.go
@@ -0,0 +1,168 @@
+// Copyright 2018 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
+
+package fse
+
+import "fmt"
+
+// bitWriter will write bits.
+// First bit will be LSB of the first byte of output.
+type bitWriter struct {
+	bitContainer uint64
+	nBits        uint8
+	out          []byte
+}
+
+// bitMask16 holds bit masks for 0 to 16 bits. It is oversized so it can be indexed with bits&31 without a bounds check.
+var bitMask16 = [32]uint16{
+	0, 1, 3, 7, 0xF, 0x1F,
+	0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
+	0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF,
+	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+	0xFFFF, 0xFFFF} /* up to 16 bits */
+
+// addBits16NC will add up to 16 bits.
+// It will not check if there is space for them,
+// so the caller must ensure that it has flushed recently.
+func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
+	b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
+	b.nBits += bits
+}
+
+// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
+// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
+func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
+	b.bitContainer |= uint64(value) << (b.nBits & 63)
+	b.nBits += bits
+}
+
+// addBits16ZeroNC will add up to 16 bits.
+// It will not check if there is space for them,
+// so the caller must ensure that it has flushed recently.
+// This is fastest if bits can be zero.
+func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
+	if bits == 0 {
+		return
+	}
+	value <<= (16 - bits) & 15
+	value >>= (16 - bits) & 15
+	b.bitContainer |= uint64(value) << (b.nBits & 63)
+	b.nBits += bits
+}
+
+// flush will flush all pending full bytes.
+// There will be at least 56 bits available for writing when this has been called.
+// Using flush32 is faster, but leaves less space for writing.
+func (b *bitWriter) flush() {
+	v := b.nBits >> 3
+	switch v {
+	case 0:
+	case 1:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+		)
+	case 2:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+		)
+	case 3:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+		)
+	case 4:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+		)
+	case 5:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+		)
+	case 6:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+			byte(b.bitContainer>>40),
+		)
+	case 7:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+			byte(b.bitContainer>>40),
+			byte(b.bitContainer>>48),
+		)
+	case 8:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+			byte(b.bitContainer>>40),
+			byte(b.bitContainer>>48),
+			byte(b.bitContainer>>56),
+		)
+	default:
+		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
+	}
+	b.bitContainer >>= v << 3
+	b.nBits &= 7
+}
+
+// flush32 will flush out, so there are at least 32 bits available for writing.
+func (b *bitWriter) flush32() {
+	if b.nBits < 32 {
+		return
+	}
+	b.out = append(b.out,
+		byte(b.bitContainer),
+		byte(b.bitContainer>>8),
+		byte(b.bitContainer>>16),
+		byte(b.bitContainer>>24))
+	b.nBits -= 32
+	b.bitContainer >>= 32
+}
+
+// flushAlign will flush remaining full bytes and align to next byte boundary.
+func (b *bitWriter) flushAlign() {
+	nbBytes := (b.nBits + 7) >> 3
+	for i := uint8(0); i < nbBytes; i++ {
+		b.out = append(b.out, byte(b.bitContainer>>(i*8)))
+	}
+	b.nBits = 0
+	b.bitContainer = 0
+}
+
+// close will write the alignment bit and write the final byte(s)
+// to the output.
+func (b *bitWriter) close() error {
+	// End mark
+	b.addBits16Clean(1, 1)
+	// flush until next byte.
+	b.flushAlign()
+	return nil
+}
+
+// reset and continue writing by appending to out.
+func (b *bitWriter) reset(out []byte) {
+	b.bitContainer = 0
+	b.nBits = 0
+	b.out = out
+}
diff --git a/vendor/github.com/klauspost/compress/fse/bytereader.go b/vendor/github.com/klauspost/compress/fse/bytereader.go
new file mode 100644
index 00000000000..f228a46cdf6
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/fse/bytereader.go
@@ -0,0 +1,56 @@
+// Copyright 2018 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
+
+package fse
+
+// byteReader provides a byte reader that reads
+// little endian values from a byte stream.
+// The input stream is manually advanced.
+// The reader performs no bounds checks.
+type byteReader struct {
+	b   []byte
+	off int
+}
+
+// init will initialize the reader and set the input.
+func (b *byteReader) init(in []byte) {
+	b.b = in
+	b.off = 0
+}
+
+// advance the stream by n bytes.
+func (b *byteReader) advance(n uint) {
+	b.off += int(n)
+}
+
+// Int32 returns a little endian int32 starting at current offset.
+func (b byteReader) Int32() int32 {
+	b2 := b.b[b.off : b.off+4 : b.off+4]
+	v3 := int32(b2[3])
+	v2 := int32(b2[2])
+	v1 := int32(b2[1])
+	v0 := int32(b2[0])
+	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
+}
+
+// Uint32 returns a little endian uint32 starting at current offset.
+func (b byteReader) Uint32() uint32 {
+	b2 := b.b[b.off : b.off+4 : b.off+4]
+	v3 := uint32(b2[3])
+	v2 := uint32(b2[2])
+	v1 := uint32(b2[1])
+	v0 := uint32(b2[0])
+	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
+}
+
+// unread returns the unread portion of the input.
+func (b byteReader) unread() []byte {
+	return b.b[b.off:]
+}
+
+// remain will return the number of bytes remaining.
+func (b byteReader) remain() int {
+	return len(b.b) - b.off
+}
diff --git a/vendor/github.com/klauspost/compress/fse/compress.go b/vendor/github.com/klauspost/compress/fse/compress.go
new file mode 100644
index 00000000000..b69237c9b8f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/fse/compress.go
@@ -0,0 +1,684 @@
+// Copyright 2018 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
+
+package fse
+
+import (
+	"errors"
+	"fmt"
+)
+
+// Compress FSE-compresses the input bytes and returns the compressed block. Input must be < 2GB.
+// Provide a Scratch to reuse tables and buffers between calls; a nil Scratch is replaced by a new one in prepare.
+// The returned slice is s.Out, so it is overwritten by the next call that reuses the same Scratch.
+// ErrIncompressible is returned for inputs of 0-1 bytes, a too flat histogram, or output that is not smaller than the input.
+// ErrUseRLE is returned if the input is a single byte value repeated.
+func Compress(in []byte, s *Scratch) ([]byte, error) {
+	if len(in) <= 1 {
+		return nil, ErrIncompressible
+	}
+	if len(in) > (2<<30)-1 {
+		return nil, errors.New("input too big, must be < 2GB")
+	}
+	s, err := s.prepare(in)
+	if err != nil {
+		return nil, err
+	}
+
+	// Create histogram, unless the caller supplied one through HistogramFinished.
+	maxCount := s.maxCount
+	if maxCount == 0 {
+		maxCount = s.countSimple(in)
+	}
+	// Reset for next run: prepare zeroes the histogram when clearCount is set and maxCount is 0.
+	s.clearCount = true
+	s.maxCount = 0
+	if maxCount == len(in) {
+		// One symbol, use RLE
+		return nil, ErrUseRLE
+	}
+	if maxCount == 1 || maxCount < (len(in)>>7) {
+		// Each symbol present maximum once or too well distributed.
+		return nil, ErrIncompressible
+	}
+	s.optimalTableLog()
+	err = s.normalizeCount()
+	if err != nil {
+		return nil, err
+	}
+	err = s.writeCount()
+	if err != nil {
+		return nil, err
+	}
+
+	if false { // debug-only validation of the normalized table; never executed as written
+		err = s.validateNorm()
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	err = s.buildCTable()
+	if err != nil {
+		return nil, err
+	}
+	err = s.compress(in)
+	if err != nil {
+		return nil, err
+	}
+	s.Out = s.bw.out
+	// Check if we compressed: output that is not strictly smaller than the input is rejected.
+	if len(s.Out) >= len(in) {
+		return nil, ErrIncompressible
+	}
+	return s.Out, nil
+}
+
+// cState contains the compression state of one of the interleaved encoder streams.
+type cState struct {
+	bw         *bitWriter // output bit writer; compress hands the same writer to both states
+	stateTable []uint16   // next-state table, taken from cTable.stateTable
+	state      uint16     // current encoder state
+}
+
+// init sets the initial state from the transform of the first symbol to encode; no bits are written and tableLog is unused.
+func (c *cState) init(bw *bitWriter, ct *cTable, tableLog uint8, first symbolTransform) {
+	c.bw = bw
+	c.stateTable = ct.stateTable
+
+	nbBitsOut := (first.deltaNbBits + (1 << 15)) >> 16 // bit count rounded from the 16.16 style deltaNbBits
+	im := int32((nbBitsOut << 16) - first.deltaNbBits)
+	lu := (im >> nbBitsOut) + first.deltaFindState // index of the starting state in stateTable
+	c.state = c.stateTable[lu]
+	return
+}
+
+// encode emits nbBitsOut bits of the current state through addBits16NC and moves to the next state for symbolTT.
+func (c *cState) encode(symbolTT symbolTransform) {
+	nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16 // top 16 bits of the sum are the bit count
+	dstState := int32(c.state>>(nbBitsOut&15)) + symbolTT.deltaFindState
+	c.bw.addBits16NC(c.state, uint8(nbBitsOut))
+	c.state = c.stateTable[dstState]
+}
+
+// encodeZero is the variant of encode used when a symbol may need 0 output bits; it writes through addBits16ZeroNC.
+func (c *cState) encodeZero(symbolTT symbolTransform) {
+	nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16 // top 16 bits of the sum are the bit count
+	dstState := int32(c.state>>(nbBitsOut&15)) + symbolTT.deltaFindState
+	c.bw.addBits16ZeroNC(c.state, uint8(nbBitsOut))
+	c.state = c.stateTable[dstState]
+}
+
+// flush writes the final state as tableLog bits and flushes the bit writer; the decoder reads it back as its initial state.
+func (c *cState) flush(tableLog uint8) {
+	c.bw.flush32()
+	c.bw.addBits16NC(c.state, tableLog)
+	c.bw.flush()
+}
+
+// compress is the main compression loop: it encodes src from the last byte to the first using two interleaved states and returns the result of closing the bit writer.
+func (s *Scratch) compress(src []byte) error {
+	if len(src) <= 2 { // the setup below reads up to three trailing bytes
+		return errors.New("compress: src too small")
+	}
+	tt := s.ct.symbolTT[:256]
+	s.bw.reset(s.Out) // NOTE(review): presumably continues after the header already in s.Out - confirm in bitWriter.reset
+
+	// Our two states each encodes every second byte.
+	// Last byte encoded (first byte decoded) will always be encoded by c1.
+	var c1, c2 cState
+
+	// Encode so remaining size is divisible by 4.
+	ip := len(src)
+	if ip&1 == 1 {
+		c1.init(&s.bw, &s.ct, s.actualTableLog, tt[src[ip-1]])
+		c2.init(&s.bw, &s.ct, s.actualTableLog, tt[src[ip-2]])
+		c1.encodeZero(tt[src[ip-3]])
+		ip -= 3
+	} else {
+		c2.init(&s.bw, &s.ct, s.actualTableLog, tt[src[ip-1]])
+		c1.init(&s.bw, &s.ct, s.actualTableLog, tt[src[ip-2]])
+		ip -= 2
+	}
+	if ip&2 != 0 {
+		c2.encodeZero(tt[src[ip-1]])
+		c1.encodeZero(tt[src[ip-2]])
+		ip -= 2
+	}
+
+	// Main compression loop: four variants selected by zeroBits and by whether the table log is at most 8.
+	switch {
+	case !s.zeroBits && s.actualTableLog <= 8:
+		// We can encode 4 symbols without requiring a flush.
+		// We do not need to check if any output is 0 bits.
+		for ip >= 4 {
+			s.bw.flush32()
+			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			c2.encode(tt[v0])
+			c1.encode(tt[v1])
+			c2.encode(tt[v2])
+			c1.encode(tt[v3])
+			ip -= 4
+		}
+	case !s.zeroBits:
+		// We do not need to check if any output is 0 bits.
+		for ip >= 4 {
+			s.bw.flush32()
+			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			c2.encode(tt[v0])
+			c1.encode(tt[v1])
+			s.bw.flush32() // larger table logs: flush again after two symbols
+			c2.encode(tt[v2])
+			c1.encode(tt[v3])
+			ip -= 4
+		}
+	case s.actualTableLog <= 8:
+		// We can encode 4 symbols without requiring a flush
+		for ip >= 4 {
+			s.bw.flush32()
+			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			c2.encodeZero(tt[v0])
+			c1.encodeZero(tt[v1])
+			c2.encodeZero(tt[v2])
+			c1.encodeZero(tt[v3])
+			ip -= 4
+		}
+	default:
+		for ip >= 4 {
+			s.bw.flush32()
+			v3, v2, v1, v0 := src[ip-4], src[ip-3], src[ip-2], src[ip-1]
+			c2.encodeZero(tt[v0])
+			c1.encodeZero(tt[v1])
+			s.bw.flush32() // larger table logs: flush again after two symbols
+			c2.encodeZero(tt[v2])
+			c1.encodeZero(tt[v3])
+			ip -= 4
+		}
+	}
+
+	// Flush final state.
+	// Used to initialize state when decoding.
+	c2.flush(s.actualTableLog)
+	c1.flush(s.actualTableLog)
+
+	return s.bw.close()
+}
+
+// writeCount serializes the table log and the normalized counts in s.norm into s.Out as the block header.
+// This is read back by readNCount.
+func (s *Scratch) writeCount() error {
+	var (
+		tableLog  = s.actualTableLog
+		tableSize = 1 << tableLog
+		previous0 bool
+		charnum   uint16
+
+		maxHeaderSize = ((int(s.symbolLen) * int(tableLog)) >> 3) + 3
+
+		// Write Table Size: the first 4 bits hold tableLog-minTablelog.
+		bitStream = uint32(tableLog - minTablelog)
+		bitCount  = uint(4)
+		remaining = int16(tableSize + 1) /* +1 for extra accuracy */
+		threshold = int16(tableSize)
+		nbBits    = uint(tableLog + 1)
+	)
+	if cap(s.Out) < maxHeaderSize {
+		s.Out = make([]byte, 0, s.br.remain()+maxHeaderSize)
+	}
+	outP := uint(0)
+	out := s.Out[:maxHeaderSize]
+
+	// remaining starts at tableSize+1 and the loop stops when it reaches 1
+	for remaining > 1 {
+		if previous0 {
+			start := charnum
+			for s.norm[charnum] == 0 {
+				charnum++
+			}
+			for charnum >= start+24 { // every full run of 24 zero counts is written as 16 set bits
+				start += 24
+				bitStream += uint32(0xFFFF) << bitCount
+				out[outP] = byte(bitStream)
+				out[outP+1] = byte(bitStream >> 8)
+				outP += 2
+				bitStream >>= 16
+			}
+			for charnum >= start+3 { // every further run of 3 zero counts is written as the 2-bit value 3
+				start += 3
+				bitStream += 3 << bitCount
+				bitCount += 2
+			}
+			bitStream += uint32(charnum-start) << bitCount // the last 0-2 zero counts as a 2-bit value
+			bitCount += 2
+			if bitCount > 16 {
+				out[outP] = byte(bitStream)
+				out[outP+1] = byte(bitStream >> 8)
+				outP += 2
+				bitStream >>= 16
+				bitCount -= 16
+			}
+		}
+
+		count := s.norm[charnum]
+		charnum++
+		max := (2*threshold - 1) - remaining
+		if count < 0 {
+			remaining += count
+		} else {
+			remaining -= count
+		}
+		count++ // +1 for extra accuracy
+		if count >= threshold {
+			count += max // [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[
+		}
+		bitStream += uint32(count) << bitCount
+		bitCount += nbBits
+		if count < max {
+			bitCount--
+		}
+
+		previous0 = count == 1 // count was incremented above, so 1 means the stored norm was 0
+		if remaining < 1 {
+			return errors.New("internal error: remaining<1")
+		}
+		for remaining < threshold {
+			nbBits--
+			threshold >>= 1
+		}
+
+		if bitCount > 16 {
+			out[outP] = byte(bitStream)
+			out[outP+1] = byte(bitStream >> 8)
+			outP += 2
+			bitStream >>= 16
+			bitCount -= 16
+		}
+	}
+
+	out[outP] = byte(bitStream)
+	out[outP+1] = byte(bitStream >> 8)
+	outP += (bitCount + 7) / 8 // count only the bytes that actually hold pending bits
+
+	if uint16(charnum) > s.symbolLen {
+		return errors.New("internal error: charnum > s.symbolLen")
+	}
+	s.Out = out[:outP]
+	return nil
+}
+
+// symbolTransform contains the per-symbol values the encoder uses to derive the bit count and the next state.
+type symbolTransform struct {
+	deltaFindState int32  // added to the shifted state to index the state table
+	deltaNbBits    uint32 // added to the state; the top 16 bits of the sum are the number of bits to write
+}
+
+// String formats the transform for debugging: deltaNbBits as 8 hex digits, deltaFindState in decimal.
+func (s symbolTransform) String() string {
+	return fmt.Sprintf("dnbits: %08x, fs:%d", s.deltaNbBits, s.deltaFindState)
+}
+
+// cTable contains tables used for compression; tableSymbol and stateTable are also reused as scratch by the decoder.
+type cTable struct {
+	tableSymbol []byte            // symbol placed at each table position by the spread step
+	stateTable  []uint16          // next-state values, grouped by symbol
+	symbolTT    []symbolTransform // per-symbol transforms, indexed by byte value
+}
+
+// allocCtable will allocate tables needed for compression.
+// If existing tables are big enough, they are simply re-used (their old contents are not cleared).
+func (s *Scratch) allocCtable() {
+	tableSize := 1 << s.actualTableLog
+	// get tableSymbol that is big enough.
+	if cap(s.ct.tableSymbol) < int(tableSize) {
+		s.ct.tableSymbol = make([]byte, tableSize)
+	}
+	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
+
+	ctSize := tableSize
+	if cap(s.ct.stateTable) < ctSize {
+		s.ct.stateTable = make([]uint16, ctSize)
+	}
+	s.ct.stateTable = s.ct.stateTable[:ctSize]
+
+	if cap(s.ct.symbolTT) < 256 { // always one transform per possible byte value
+		s.ct.symbolTT = make([]symbolTransform, 256)
+	}
+	s.ct.symbolTT = s.ct.symbolTT[:256]
+}
+
+// buildCTable fills s.ct (tableSymbol, stateTable, symbolTT) from s.norm and sets s.zeroBits; it returns an error if the counts do not match the table size.
+func (s *Scratch) buildCTable() error {
+	tableSize := uint32(1 << s.actualTableLog)
+	highThreshold := tableSize - 1
+	var cumul [maxSymbolValue + 2]int16
+
+	s.allocCtable()
+	tableSymbol := s.ct.tableSymbol[:tableSize]
+	// symbol start positions; low probability (-1) symbols are placed from the top of the table downwards
+	{
+		cumul[0] = 0
+		for ui, v := range s.norm[:s.symbolLen-1] {
+			u := byte(ui) // one less than reference
+			if v == -1 {
+				// Low proba symbol
+				cumul[u+1] = cumul[u] + 1
+				tableSymbol[highThreshold] = u
+				highThreshold--
+			} else {
+				cumul[u+1] = cumul[u] + v
+			}
+		}
+		// Encode last symbol separately to avoid overflowing u
+		u := int(s.symbolLen - 1)
+		v := s.norm[s.symbolLen-1]
+		if v == -1 {
+			// Low proba symbol
+			cumul[u+1] = cumul[u] + 1
+			tableSymbol[highThreshold] = byte(u)
+			highThreshold--
+		} else {
+			cumul[u+1] = cumul[u] + v
+		}
+		if uint32(cumul[s.symbolLen]) != tableSize {
+			return fmt.Errorf("internal error: expected cumul[s.symbolLen] (%d) == tableSize (%d)", cumul[s.symbolLen], tableSize)
+		}
+		cumul[s.symbolLen] = int16(tableSize) + 1
+	}
+	// Spread symbols over the positions not taken by low probability symbols
+	s.zeroBits = false
+	{
+		step := tableStep(tableSize)
+		tableMask := tableSize - 1
+		var position uint32
+		// if any symbol > largeLimit, we may have 0 bits output.
+		largeLimit := int16(1 << (s.actualTableLog - 1))
+		for ui, v := range s.norm[:s.symbolLen] {
+			symbol := byte(ui)
+			if v > largeLimit {
+				s.zeroBits = true
+			}
+			for nbOccurrences := int16(0); nbOccurrences < v; nbOccurrences++ { // does not run for v <= 0
+				tableSymbol[position] = symbol
+				position = (position + step) & tableMask
+				for position > highThreshold {
+					position = (position + step) & tableMask
+				} /* Low proba area */
+			}
+		}
+
+		// Check if we have gone through all positions
+		if position != 0 {
+			return errors.New("position!=0")
+		}
+	}
+
+	// Build table
+	table := s.ct.stateTable
+	{
+		tsi := int(tableSize)
+		for u, v := range tableSymbol {
+			// TableU16 : sorted by symbol order; gives next state value
+			table[cumul[v]] = uint16(tsi + u)
+			cumul[v]++
+		}
+	}
+
+	// Build Symbol Transformation Table
+	{
+		total := int16(0)
+		symbolTT := s.ct.symbolTT[:s.symbolLen]
+		tableLog := s.actualTableLog
+		tl := (uint32(tableLog) << 16) - (1 << tableLog)
+		for i, v := range s.norm[:s.symbolLen] {
+			switch v {
+			case 0: // symbol not present: its transform entry is left untouched
+			case -1, 1:
+				symbolTT[i].deltaNbBits = tl
+				symbolTT[i].deltaFindState = int32(total - 1)
+				total++
+			default:
+				maxBitsOut := uint32(tableLog) - highBits(uint32(v-1))
+				minStatePlus := uint32(v) << maxBitsOut
+				symbolTT[i].deltaNbBits = (maxBitsOut << 16) - minStatePlus
+				symbolTT[i].deltaFindState = int32(total - v)
+				total += v
+			}
+		}
+		if total != int16(tableSize) {
+			return fmt.Errorf("total mismatch %d (got) != %d (want)", total, tableSize)
+		}
+	}
+	return nil
+}
+
+// countSimple adds the byte counts of in to s.count and sets s.symbolLen to one past the highest symbol with a non-zero count.
+// Returns the biggest count.
+// Does not update s.clearCount and does not zero s.count first.
+func (s *Scratch) countSimple(in []byte) (max int) {
+	for _, v := range in {
+		s.count[v]++
+	}
+	m := uint32(0)
+	for i, v := range s.count[:] {
+		if v > m {
+			m = v
+		}
+		if v > 0 {
+			s.symbolLen = uint16(i) + 1 // the last non-zero entry wins
+		}
+	}
+	return int(m)
+}
+
+// minTableLog provides the minimum logSize to safely represent a distribution: the smaller of a bound from the input length and a bound from the symbol count.
+func (s *Scratch) minTableLog() uint8 {
+	minBitsSrc := highBits(uint32(s.br.remain()-1)) + 1
+	minBitsSymbols := highBits(uint32(s.symbolLen-1)) + 2
+	if minBitsSrc < minBitsSymbols {
+		return uint8(minBitsSrc)
+	}
+	return uint8(minBitsSymbols)
+}
+
+// optimalTableLog picks s.actualTableLog from s.TableLog, the input size and the symbol count, clamped to [minTablelog, maxTableLog].
+func (s *Scratch) optimalTableLog() {
+	tableLog := s.TableLog
+	minBits := s.minTableLog()
+	maxBitsSrc := uint8(highBits(uint32(s.br.remain()-1))) - 2 // wraps to 254/255 for inputs of 4 bytes or less, which leaves tableLog unchanged below
+	if maxBitsSrc < tableLog {
+		// Accuracy can be reduced
+		tableLog = maxBitsSrc
+	}
+	if minBits > tableLog {
+		tableLog = minBits
+	}
+	// Need a minimum to safely represent all symbol values
+	if tableLog < minTablelog {
+		tableLog = minTablelog
+	}
+	if tableLog > maxTableLog {
+		tableLog = maxTableLog
+	}
+	s.actualTableLog = tableLog
+}
+
+var rtbTable = [...]uint32{0, 473195, 504333, 520860, 550000, 700000, 750000, 830000} // round-up thresholds used by normalizeCount, indexed by proba (0-7) and scaled by vStep
+
+// normalizeCount scales s.count into s.norm so that the total is equal to the table size;
+// counts at or below remain()>>tableLog are stored as -1, and normalizeCount2 is the fallback.
+func (s *Scratch) normalizeCount() error {
+	var (
+		tableLog          = s.actualTableLog
+		scale             = 62 - uint64(tableLog)
+		step              = (1 << 62) / uint64(s.br.remain())
+		vStep             = uint64(1) << (scale - 20)
+		stillToDistribute = int16(1 << tableLog)
+		largest           int
+		largestP          int16
+		lowThreshold      = (uint32)(s.br.remain() >> tableLog)
+	)
+
+	for i, cnt := range s.count[:s.symbolLen] {
+		// already handled
+		// if (count[s] == s.length) return 0;   /* rle special case */
+
+		if cnt == 0 {
+			s.norm[i] = 0
+			continue
+		}
+		if cnt <= lowThreshold {
+			s.norm[i] = -1
+			stillToDistribute--
+		} else {
+			proba := (int16)((uint64(cnt) * step) >> scale)
+			if proba < 8 {
+				restToBeat := vStep * uint64(rtbTable[proba])
+				v := uint64(cnt)*step - (uint64(proba) << scale) // fractional part left after truncation
+				if v > restToBeat {
+					proba++
+				}
+			}
+			if proba > largestP {
+				largestP = proba
+				largest = i
+			}
+			s.norm[i] = proba
+			stillToDistribute -= proba
+		}
+	}
+
+	if -stillToDistribute >= (s.norm[largest] >> 1) {
+		// corner case, need another normalization method
+		return s.normalizeCount2()
+	}
+	s.norm[largest] += stillToDistribute // the largest symbol absorbs the rounding difference
+	return nil
+}
+
+// normalizeCount2 is the secondary normalization method.
+// It is called by normalizeCount when the correction would be too large for the largest symbol to absorb.
+func (s *Scratch) normalizeCount2() error {
+	const notYetAssigned = -2
+	var (
+		distributed  uint32
+		total        = uint32(s.br.remain())
+		tableLog     = s.actualTableLog
+		lowThreshold = uint32(total >> tableLog)
+		lowOne       = uint32((total * 3) >> (tableLog + 1))
+	)
+	for i, cnt := range s.count[:s.symbolLen] {
+		if cnt == 0 {
+			s.norm[i] = 0
+			continue
+		}
+		if cnt <= lowThreshold {
+			s.norm[i] = -1
+			distributed++
+			total -= cnt
+			continue
+		}
+		if cnt <= lowOne {
+			s.norm[i] = 1
+			distributed++
+			total -= cnt
+			continue
+		}
+		s.norm[i] = notYetAssigned
+	}
+	toDistribute := (1 << tableLog) - distributed
+
+	if (total / toDistribute) > lowOne {
+		// risk of rounding to zero
+		lowOne = uint32((total * 3) / (toDistribute * 2))
+		for i, cnt := range s.count[:s.symbolLen] {
+			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
+				s.norm[i] = 1
+				distributed++
+				total -= cnt
+				continue
+			}
+		}
+		toDistribute = (1 << tableLog) - distributed
+	}
+	if distributed == uint32(s.symbolLen)+1 { // NOTE(review): distributed grows at most once per symbol, so this branch looks unreachable - confirm against upstream
+		// all values are pretty poor;
+		//   probably incompressible data (should have already been detected);
+		//   find max, then give all remaining points to max
+		var maxV int
+		var maxC uint32
+		for i, cnt := range s.count[:s.symbolLen] {
+			if cnt > maxC {
+				maxV = i
+				maxC = cnt
+			}
+		}
+		s.norm[maxV] += int16(toDistribute)
+		return nil
+	}
+
+	if total == 0 {
+		// all of the symbols were low enough for the lowOne or lowThreshold
+		for i := uint32(0); toDistribute > 0; i = (i + 1) % (uint32(s.symbolLen)) { // round-robin over symbols with a positive norm
+			if s.norm[i] > 0 {
+				toDistribute--
+				s.norm[i]++
+			}
+		}
+		return nil
+	}
+
+	var (
+		vStepLog = 62 - uint64(tableLog)
+		mid      = uint64((1 << (vStepLog - 1)) - 1)
+		rStep    = (((1 << vStepLog) * uint64(toDistribute)) + mid) / uint64(total) // scale on remaining
+		tmpTotal = mid
+	)
+	for i, cnt := range s.count[:s.symbolLen] {
+		if s.norm[i] == notYetAssigned {
+			var (
+				end    = tmpTotal + uint64(cnt)*rStep
+				sStart = uint32(tmpTotal >> vStepLog)
+				sEnd   = uint32(end >> vStepLog)
+				weight = sEnd - sStart
+			)
+			if weight < 1 {
+				return errors.New("weight < 1")
+			}
+			s.norm[i] = int16(weight)
+			tmpTotal = end
+		}
+	}
+	return nil
+}
+
+// validateNorm checks that the absolute values in s.norm sum to 1<<actualTableLog and that s.count is zero beyond symbolLen; on failure the table is printed to stdout.
+func (s *Scratch) validateNorm() (err error) {
+	var total int
+	for _, v := range s.norm[:s.symbolLen] {
+		if v >= 0 {
+			total += int(v)
+		} else {
+			total -= int(v) // negative entries count with their absolute value
+		}
+	}
+	defer func() {
+		if err == nil {
+			return
+		}
+		fmt.Printf("selected TableLog: %d, Symbol length: %d\n", s.actualTableLog, s.symbolLen)
+		for i, v := range s.norm[:s.symbolLen] {
+			fmt.Printf("%3d: %5d -> %4d \n", i, s.count[i], v)
+		}
+	}()
+	if total != (1 << s.actualTableLog) {
+		return fmt.Errorf("warning: Total == %d != %d", total, 1<<s.actualTableLog)
+	}
+	for i, v := range s.count[s.symbolLen:] {
+		if v != 0 {
+			return fmt.Errorf("warning: Found symbol out of range, %d after cut", i) // i is relative to symbolLen
+		}
+	}
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/fse/decompress.go b/vendor/github.com/klauspost/compress/fse/decompress.go
new file mode 100644
index 00000000000..413ec3b3cd8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/fse/decompress.go
@@ -0,0 +1,374 @@
+package fse
+
+import (
+	"errors"
+	"fmt"
+)
+
+const (
+	tablelogAbsoluteMax = 15 // largest table log that readNCount accepts from a header
+)
+
+// Decompress a block of data.
+// You can provide a scratch buffer to avoid allocations.
+// If nil is provided a temporary one will be allocated.
+// It is possible, but by no means guaranteed, that corrupt data will
+// return an error.
+// It is up to the caller to verify integrity of the returned data.
+// Use a predefined Scratch to set maximum acceptable output size (DecompressLimit).
+func Decompress(b []byte, s *Scratch) ([]byte, error) {
+	s, err := s.prepare(b)
+	if err != nil {
+		return nil, err
+	}
+	s.Out = s.Out[:0] // reuse the output buffer's capacity
+	err = s.readNCount()
+	if err != nil {
+		return nil, err
+	}
+	err = s.buildDtable()
+	if err != nil {
+		return nil, err
+	}
+	err = s.decompress()
+	if err != nil {
+		return nil, err
+	}
+
+	return s.Out, nil
+}
+
+// readNCount parses the header: it reads the table log and the normalized symbol counts into s.norm and sets s.actualTableLog and s.symbolLen so decoding tables can be constructed.
+func (s *Scratch) readNCount() error {
+	var (
+		charnum   uint16
+		previous0 bool
+		b         = &s.br
+	)
+	iend := b.remain()
+	if iend < 4 {
+		return errors.New("input too small")
+	}
+	bitStream := b.Uint32()
+	nbBits := uint((bitStream & 0xF) + minTablelog) // extract tableLog
+	if nbBits > tablelogAbsoluteMax {
+		return errors.New("tableLog too large")
+	}
+	bitStream >>= 4
+	bitCount := uint(4)
+
+	s.actualTableLog = uint8(nbBits)
+	remaining := int32((1 << nbBits) + 1)
+	threshold := int32(1 << nbBits)
+	gotTotal := int32(0)
+	nbBits++
+
+	for remaining > 1 {
+		if previous0 {
+			n0 := charnum
+			for (bitStream & 0xFFFF) == 0xFFFF { // 16 set bits: a run of 24 zero counts
+				n0 += 24
+				if b.off < iend-5 {
+					b.advance(2)
+					bitStream = b.Uint32() >> bitCount
+				} else {
+					bitStream >>= 16
+					bitCount += 16
+				}
+			}
+			for (bitStream & 3) == 3 { // 2-bit value 3: a run of 3 zero counts
+				n0 += 3
+				bitStream >>= 2
+				bitCount += 2
+			}
+			n0 += uint16(bitStream & 3)
+			bitCount += 2
+			if n0 > maxSymbolValue {
+				return errors.New("maxSymbolValue too small")
+			}
+			for charnum < n0 {
+				s.norm[charnum&0xff] = 0 // index is masked to stay inside the 256-entry s.norm
+				charnum++
+			}
+
+			if b.off <= iend-7 || b.off+int(bitCount>>3) <= iend-4 {
+				b.advance(bitCount >> 3)
+				bitCount &= 7
+				bitStream = b.Uint32() >> bitCount
+			} else {
+				bitStream >>= 2
+			}
+		}
+
+		max := (2*(threshold) - 1) - (remaining)
+		var count int32
+
+		if (int32(bitStream) & (threshold - 1)) < max {
+			count = int32(bitStream) & (threshold - 1)
+			bitCount += nbBits - 1
+		} else {
+			count = int32(bitStream) & (2*threshold - 1)
+			if count >= threshold {
+				count -= max
+			}
+			bitCount += nbBits
+		}
+
+		count-- // extra accuracy
+		if count < 0 {
+			// -1 means +1
+			remaining += count
+			gotTotal -= count
+		} else {
+			remaining -= count
+			gotTotal += count
+		}
+		s.norm[charnum&0xff] = int16(count) // index is masked to stay inside the 256-entry s.norm
+		charnum++
+		previous0 = count == 0
+		for remaining < threshold {
+			nbBits--
+			threshold >>= 1
+		}
+		if b.off <= iend-7 || b.off+int(bitCount>>3) <= iend-4 {
+			b.advance(bitCount >> 3)
+			bitCount &= 7
+		} else {
+			bitCount -= (uint)(8 * (len(b.b) - 4 - b.off)) // near the end: pin the window to the last 4 bytes
+			b.off = len(b.b) - 4
+		}
+		bitStream = b.Uint32() >> (bitCount & 31)
+	}
+	s.symbolLen = charnum
+
+	if s.symbolLen <= 1 {
+		return fmt.Errorf("symbolLen (%d) too small", s.symbolLen)
+	}
+	if s.symbolLen > maxSymbolValue+1 {
+		return fmt.Errorf("symbolLen (%d) too big", s.symbolLen)
+	}
+	if remaining != 1 {
+		return fmt.Errorf("corruption detected (remaining %d != 1)", remaining)
+	}
+	if bitCount > 32 {
+		return fmt.Errorf("corruption detected (bitCount %d > 32)", bitCount)
+	}
+	if gotTotal != 1<<s.actualTableLog {
+		return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
+	}
+	b.advance((bitCount + 7) >> 3) // skip the header bytes so unread() starts at the payload
+	return nil
+}
+
+// decSymbol contains information about a state entry,
+// including the state offset base, the output symbol and
+// the number of bits to read for the low part of the destination state.
+type decSymbol struct {
+	newState uint16 // base of the next state; the bits read from the stream are added to it
+	symbol   uint8  // symbol emitted for this state
+	nbBits   uint8  // number of bits to read when leaving this state
+}
+
+// allocDtable sizes s.decTable to 1<<actualTableLog entries and the two ct scratch slices to 256 entries, allocating only when capacity is too small.
+func (s *Scratch) allocDtable() {
+	tableSize := 1 << s.actualTableLog
+	if cap(s.decTable) < int(tableSize) {
+		s.decTable = make([]decSymbol, tableSize)
+	}
+	s.decTable = s.decTable[:tableSize]
+
+	if cap(s.ct.tableSymbol) < 256 { // used by decompress as its 256-byte staging buffer
+		s.ct.tableSymbol = make([]byte, 256)
+	}
+	s.ct.tableSymbol = s.ct.tableSymbol[:256]
+
+	if cap(s.ct.stateTable) < 256 { // used by buildDtable as its per-symbol next-state counters
+		s.ct.stateTable = make([]uint16, 256)
+	}
+	s.ct.stateTable = s.ct.stateTable[:256]
+}
+
+// buildDtable will build the decoding table s.decTable from s.norm and set s.zeroBits; it returns an error for inconsistent (corrupt) counts.
+func (s *Scratch) buildDtable() error {
+	tableSize := uint32(1 << s.actualTableLog)
+	highThreshold := tableSize - 1
+	s.allocDtable()
+	symbolNext := s.ct.stateTable[:256]
+
+	// Init, lay down lowprob symbols
+	s.zeroBits = false
+	{
+		largeLimit := int16(1 << (s.actualTableLog - 1))
+		for i, v := range s.norm[:s.symbolLen] {
+			if v == -1 {
+				s.decTable[highThreshold].symbol = uint8(i)
+				highThreshold--
+				symbolNext[i] = 1
+			} else {
+				if v >= largeLimit { // a symbol filling half the table or more
+					s.zeroBits = true
+				}
+				symbolNext[i] = uint16(v)
+			}
+		}
+	}
+	// Spread symbols
+	{
+		tableMask := tableSize - 1
+		step := tableStep(tableSize)
+		position := uint32(0)
+		for ss, v := range s.norm[:s.symbolLen] {
+			for i := 0; i < int(v); i++ { // does not run for v <= 0
+				s.decTable[position].symbol = uint8(ss)
+				position = (position + step) & tableMask
+				for position > highThreshold {
+					// lowprob area
+					position = (position + step) & tableMask
+				}
+			}
+		}
+		if position != 0 {
+			// position must reach all cells once, otherwise normalizedCounter is incorrect
+			return errors.New("corrupted input (position != 0)")
+		}
+	}
+
+	// Build Decoding table
+	{
+		tableSize := uint16(1 << s.actualTableLog)
+		for u, v := range s.decTable {
+			symbol := v.symbol
+			nextState := symbolNext[symbol]
+			symbolNext[symbol] = nextState + 1
+			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
+			s.decTable[u].nbBits = nBits
+			newState := (nextState << nBits) - tableSize
+			if newState >= tableSize {
+				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
+			}
+			if newState == uint16(u) && nBits == 0 {
+				// Reject a state that maps to itself while reading no bits.
+				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
+			}
+			s.decTable[u].newState = newState
+		}
+	}
+	return nil
+}
+
+// decompress decodes the bitstream that follows the header into s.Out using two interleaved decoders.
+// An error is returned if the buffer is over-read or if the output reaches s.DecompressLimit.
+func (s *Scratch) decompress() error {
+	br := &s.bits
+	br.init(s.br.unread())
+
+	var s1, s2 decoder
+	// Initialize and decode first state and symbol.
+	s1.init(br, s.decTable, s.actualTableLog)
+	s2.init(br, s.decTable, s.actualTableLog)
+
+	// Use temp table to avoid bound checks/append penalty.
+	var tmp = s.ct.tableSymbol[:256]
+	var off uint8 // wraps to 0 after 256 symbols, which triggers the copy of tmp into s.Out
+
+	// Main part
+	if !s.zeroBits {
+		for br.off >= 8 {
+			br.fillFast()
+			tmp[off+0] = s1.nextFast()
+			tmp[off+1] = s2.nextFast()
+			br.fillFast()
+			tmp[off+2] = s1.nextFast()
+			tmp[off+3] = s2.nextFast()
+			off += 4
+			// When off is 0, we have overflowed and should write.
+			if off == 0 {
+				s.Out = append(s.Out, tmp...)
+				if len(s.Out) >= s.DecompressLimit {
+					return fmt.Errorf("output size (%d) > DecompressLimit (%d)", len(s.Out), s.DecompressLimit)
+				}
+			}
+		}
+	} else {
+		for br.off >= 8 {
+			br.fillFast()
+			tmp[off+0] = s1.next()
+			tmp[off+1] = s2.next()
+			br.fillFast()
+			tmp[off+2] = s1.next()
+			tmp[off+3] = s2.next()
+			off += 4
+			if off == 0 {
+				s.Out = append(s.Out, tmp...)
+				// When off is 0, we have overflowed and should write.
+				if len(s.Out) >= s.DecompressLimit {
+					return fmt.Errorf("output size (%d) > DecompressLimit (%d)", len(s.Out), s.DecompressLimit)
+				}
+			}
+		}
+	}
+	s.Out = append(s.Out, tmp[:off]...) // flush the partially filled staging buffer
+
+	// Final bits, a bit more expensive check
+	for {
+		if s1.finished() {
+			s.Out = append(s.Out, s1.final(), s2.final())
+			break
+		}
+		br.fill()
+		s.Out = append(s.Out, s1.next())
+		if s2.finished() {
+			s.Out = append(s.Out, s2.final(), s1.final())
+			break
+		}
+		s.Out = append(s.Out, s2.next())
+		if len(s.Out) >= s.DecompressLimit {
+			return fmt.Errorf("output size (%d) > DecompressLimit (%d)", len(s.Out), s.DecompressLimit)
+		}
+	}
+	return br.close()
+}
+
+// decoder keeps track of the current state of one interleaved stream and updates it from the bitstream.
+type decoder struct {
+	state uint16      // index of the current entry in dt
+	br    *bitReader  // bit source; decompress gives both decoders the same reader
+	dt    []decSymbol // decoding table
+}
+
+// init binds the decoder to the bit reader and decoding table, then reads the initial state as tableLog bits from the stream.
+func (d *decoder) init(in *bitReader, dt []decSymbol, tableLog uint8) {
+	d.dt = dt
+	d.br = in
+	d.state = uint16(in.getBits(tableLog))
+}
+
+// next returns the symbol of the current state and moves to the next state (newState plus nbBits bits read from the stream).
+// At least tablelog bits must be available in the bit reader.
+func (d *decoder) next() uint8 {
+	n := &d.dt[d.state]
+	lowBits := d.br.getBits(n.nbBits)
+	d.state = n.newState + lowBits
+	return n.symbol
+}
+
+// finished returns true if all bits have been read from the bitstream
+// and the next state would require reading bits from the input (the current entry has nbBits > 0).
+func (d *decoder) finished() bool {
+	return d.br.finished() && d.dt[d.state].nbBits > 0
+}
+
+// final returns the symbol of the current state without reading bits or changing the state.
+func (d *decoder) final() uint8 {
+	return d.dt[d.state].symbol
+}
+
+// nextFast returns the symbol of the current state and moves to the next state, reading through getBitsFast.
+// This can only be used if no symbols are 0 bits.
+// At least tablelog bits must be available in the bit reader.
+func (d *decoder) nextFast() uint8 {
+	n := d.dt[d.state] // entry is copied by value here, unlike next
+	lowBits := d.br.getBitsFast(n.nbBits)
+	d.state = n.newState + lowBits
+	return n.symbol
+}
diff --git a/vendor/github.com/klauspost/compress/fse/fse.go b/vendor/github.com/klauspost/compress/fse/fse.go
new file mode 100644
index 00000000000..535cbadfdea
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/fse/fse.go
@@ -0,0 +1,144 @@
+// Copyright 2018 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
+
+// Package fse provides Finite State Entropy encoding and decoding.
+//
+// Finite State Entropy encoding provides a fast near-optimal symbol encoding/decoding
+// for byte blocks as implemented in zstd.
+//
+// See https://github.com/klauspost/compress/tree/master/fse for more information.
+package fse
+
+import (
+	"errors"
+	"fmt"
+	"math/bits"
+)
+
+const (
+	/*!MEMORY_USAGE :
+	 *  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+	 *  Increasing memory usage improves compression ratio
+	 *  Reduced memory usage can improve speed, due to cache effect
+	 *  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+	maxMemoryUsage     = 14
+	defaultMemoryUsage = 13
+
+	maxTableLog     = maxMemoryUsage - 2     // 12: largest value prepare accepts for Scratch.TableLog
+	maxTablesize    = 1 << maxTableLog       // 4096 entries
+	defaultTablelog = defaultMemoryUsage - 2 // 11: used when Scratch.TableLog is 0
+	minTablelog     = 5                      // smallest table log; the header stores tableLog-minTablelog in 4 bits
+	maxSymbolValue  = 255                    // symbols are byte values
+)
+
+var (
+	// ErrIncompressible is returned when input is judged to be too hard to compress, or when the output would not be smaller than the input.
+	ErrIncompressible = errors.New("input is not compressible")
+
+	// ErrUseRLE is returned from the compressor when the input is a single byte value repeated.
+	ErrUseRLE = errors.New("input is single value repeated")
+)
+
+// Scratch provides temporary storage for compression and decompression.
+type Scratch struct {
+	// Private
+	count    [maxSymbolValue + 1]uint32 // histogram, one counter per byte value
+	norm     [maxSymbolValue + 1]int16  // normalized counts; -1 marks a low probability symbol
+	br       byteReader                 // byte-level view of the input
+	bits     bitReader                  // bit reader used while decoding
+	bw       bitWriter                  // bit writer used while encoding
+	ct       cTable                     // Compression tables.
+	decTable []decSymbol                // Decompression table.
+	maxCount int                        // count of the most probable symbol
+
+	// Per block parameters.
+	// These can be used to override compression parameters of the block.
+	// Do not touch, unless you know what you are doing.
+
+	// Out is output buffer.
+	// If the scratch is re-used before the caller is done processing the output,
+	// set this field to nil.
+	// Otherwise the output buffer will be re-used for next Compression/Decompression step
+	// and allocation will be avoided.
+	Out []byte
+
+	// DecompressLimit limits the maximum decoded size acceptable.
+	// If > 0 decompression will stop when approximately this many bytes
+	// have been decoded.
+	// If 0, maximum size will be 2GB.
+	DecompressLimit int
+
+	symbolLen      uint16 // Length of active part of the symbol table.
+	actualTableLog uint8  // Selected tablelog.
+	zeroBits       bool   // set when a symbol fills about half the table or more, so 0-bit states can occur.
+	clearCount     bool   // zero the histogram on the next prepare (only done while maxCount is 0)
+
+	// MaxSymbolValue will override the maximum symbol value of the next block.
+	MaxSymbolValue uint8
+
+	// TableLog will attempt to override the tablelog for the next block.
+	TableLog uint8
+}
+
+// Histogram returns the internal histogram so the caller can populate it and skip that step in the compression.
+// It otherwise allows to inspect the histogram when compression is done.
+// To indicate that you have populated the histogram call HistogramFinished
+// with the value of the highest populated symbol, as well as the number of entries
+// in the most populated entry. These are accepted at face value.
+// The returned slice will always be length 256 and aliases the Scratch, it is not a copy.
+func (s *Scratch) Histogram() []uint32 {
+	return s.count[:]
+}
+
+// HistogramFinished can be called to indicate that the histogram has been populated.
+// maxSymbol is the index of the highest set symbol of the next data segment.
+// maxCount is the number of entries in the most populated entry; 0 makes Compress count the input itself.
+// These are accepted at face value.
+func (s *Scratch) HistogramFinished(maxSymbol uint8, maxCount int) {
+	s.maxCount = maxCount
+	s.symbolLen = uint16(maxSymbol) + 1
+	s.clearCount = maxCount != 0
+}
+
+// prepare applies defaults, validates TableLog, allocates s.Out if it has no capacity, optionally clears the histogram and points the byte reader at in; a nil receiver is replaced by a new Scratch, so callers must use the returned pointer.
+func (s *Scratch) prepare(in []byte) (*Scratch, error) {
+	if s == nil {
+		s = &Scratch{}
+	}
+	if s.MaxSymbolValue == 0 {
+		s.MaxSymbolValue = 255
+	}
+	if s.TableLog == 0 {
+		s.TableLog = defaultTablelog
+	}
+	if s.TableLog > maxTableLog {
+		return nil, fmt.Errorf("tableLog (%d) > maxTableLog (%d)", s.TableLog, maxTableLog)
+	}
+	if cap(s.Out) == 0 {
+		s.Out = make([]byte, 0, len(in))
+	}
+	if s.clearCount && s.maxCount == 0 { // a caller-supplied histogram (maxCount != 0) is kept
+		for i := range s.count {
+			s.count[i] = 0
+		}
+		s.clearCount = false
+	}
+	s.br.init(in)
+	if s.DecompressLimit == 0 {
+		// Max size 2GB.
+		s.DecompressLimit = (2 << 30) - 1
+	}
+
+	return s, nil
+}
+
+// tableStep returns the stride used to spread symbols over a table of tableSize entries: 5/8 of the size plus 3.
+func tableStep(tableSize uint32) uint32 {
+	return (tableSize >> 1) + (tableSize >> 3) + 3
+}
+
+func highBits(val uint32) (n uint32) { // zero-based index of the highest set bit of val
+	return uint32(bits.Len32(val) - 1) // for val == 0 the subtraction yields -1, which converts to MaxUint32
+}
diff --git a/vendor/github.com/klauspost/compress/gzip/gunzip.go b/vendor/github.com/klauspost/compress/gzip/gunzip.go
new file mode 100644
index 00000000000..568b5d4fb8b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/gzip/gunzip.go
@@ -0,0 +1,344 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package gzip implements reading and writing of gzip format compressed files,
+// as specified in RFC 1952.
+package gzip
+
+import (
+	"bufio"
+	"encoding/binary"
+	"errors"
+	"hash/crc32"
+	"io"
+	"time"
+
+	"github.com/klauspost/compress/flate"
+)
+
+const (
+	gzipID1     = 0x1f
+	gzipID2     = 0x8b
+	gzipDeflate = 8
+	flagText    = 1 << 0
+	flagHdrCrc  = 1 << 1
+	flagExtra   = 1 << 2
+	flagName    = 1 << 3
+	flagComment = 1 << 4
+)
+
+var (
+	// ErrChecksum is returned when reading GZIP data that has an invalid checksum.
+	ErrChecksum = errors.New("gzip: invalid checksum")
+	// ErrHeader is returned when reading GZIP data that has an invalid header.
+	ErrHeader = errors.New("gzip: invalid header")
+)
+
+var le = binary.LittleEndian
+
+// noEOF converts io.EOF to io.ErrUnexpectedEOF.
+func noEOF(err error) error {
+	if err == io.EOF {
+		return io.ErrUnexpectedEOF
+	}
+	return err
+}
+
+// The gzip file stores a header giving metadata about the compressed file.
+// That header is exposed as the fields of the Writer and Reader structs.
+//
+// Strings must be UTF-8 encoded and may only contain Unicode code points
+// U+0001 through U+00FF, due to limitations of the GZIP file format.
+type Header struct {
+	Comment string    // comment
+	Extra   []byte    // "extra data"
+	ModTime time.Time // modification time
+	Name    string    // file name
+	OS      byte      // operating system type
+}
+
+// A Reader is an io.Reader that can be read to retrieve
+// uncompressed data from a gzip-format compressed file.
+//
+// In general, a gzip file can be a concatenation of gzip files,
+// each with its own header. Reads from the Reader
+// return the concatenation of the uncompressed data of each.
+// Only the first header is recorded in the Reader fields.
+//
+// Gzip files store a length and checksum of the uncompressed data.
+// The Reader will return an ErrChecksum when Read
+// reaches the end of the uncompressed data if it does not
+// have the expected length or checksum. Clients should treat data
+// returned by Read as tentative until they receive the io.EOF
+// marking the end of the data.
+type Reader struct {
+	Header       // valid after NewReader or Reader.Reset
+	r            flate.Reader
+	decompressor io.ReadCloser
+	digest       uint32 // CRC-32, IEEE polynomial (section 8)
+	size         uint32 // Uncompressed size (section 2.3.1)
+	buf          [512]byte
+	err          error
+	multistream  bool
+}
+
+// NewReader creates a new Reader reading the given reader.
+// If r does not also implement io.ByteReader,
+// the decompressor may read more data than necessary from r.
+//
+// It is the caller's responsibility to call Close on the Reader when done.
+//
+// The Reader.Header fields will be valid in the Reader returned.
+func NewReader(r io.Reader) (*Reader, error) {
+	z := new(Reader)
+	if err := z.Reset(r); err != nil {
+		return nil, err
+	}
+	return z, nil
+}
+
+// Reset discards the Reader z's state and makes it equivalent to the
+// result of its original state from NewReader, but reading from r instead.
+// This permits reusing a Reader rather than allocating a new one.
+func (z *Reader) Reset(r io.Reader) error {
+	*z = Reader{
+		decompressor: z.decompressor,
+		multistream:  true,
+	}
+	if rr, ok := r.(flate.Reader); ok {
+		z.r = rr
+	} else {
+		z.r = bufio.NewReader(r)
+	}
+	z.Header, z.err = z.readHeader()
+	return z.err
+}
+
+// Multistream controls whether the reader supports multistream files.
+//
+// If enabled (the default), the Reader expects the input to be a sequence
+// of individually gzipped data streams, each with its own header and
+// trailer, ending at EOF. The effect is that the concatenation of a sequence
+// of gzipped files is treated as equivalent to the gzip of the concatenation
+// of the sequence. This is standard behavior for gzip readers.
+//
+// Calling Multistream(false) disables this behavior; disabling the behavior
+// can be useful when reading file formats that distinguish individual gzip
+// data streams or mix gzip data streams with other data streams.
+// In this mode, when the Reader reaches the end of the data stream,
+// Read returns io.EOF. If the underlying reader implements io.ByteReader,
+// it will be left positioned just after the gzip stream.
+// To start the next stream, call z.Reset(r) followed by z.Multistream(false).
+// If there is no next stream, z.Reset(r) will return io.EOF.
+func (z *Reader) Multistream(ok bool) {
+	z.multistream = ok
+}
+
+// readString reads a NUL-terminated string from z.r.
+// It treats the bytes read as being encoded as ISO 8859-1 (Latin-1) and
+// will output a string encoded using UTF-8.
+// This method always updates z.digest with the data read.
+func (z *Reader) readString() (string, error) {
+	var err error
+	needConv := false
+	for i := 0; ; i++ {
+		if i >= len(z.buf) {
+			return "", ErrHeader
+		}
+		z.buf[i], err = z.r.ReadByte()
+		if err != nil {
+			return "", err
+		}
+		if z.buf[i] > 0x7f {
+			needConv = true
+		}
+		if z.buf[i] == 0 {
+			// Digest covers the NUL terminator.
+			z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:i+1])
+
+			// Strings are ISO 8859-1, Latin-1 (RFC 1952, section 2.3.1).
+			if needConv {
+				s := make([]rune, 0, i)
+				for _, v := range z.buf[:i] {
+					s = append(s, rune(v))
+				}
+				return string(s), nil
+			}
+			return string(z.buf[:i]), nil
+		}
+	}
+}
+
+// readHeader reads the GZIP header according to section 2.3.1.
+// This method does not set z.err.
+func (z *Reader) readHeader() (hdr Header, err error) {
+	if _, err = io.ReadFull(z.r, z.buf[:10]); err != nil {
+		// RFC 1952, section 2.2, says the following:
+		//	A gzip file consists of a series of "members" (compressed data sets).
+		//
+		// Other than this, the specification does not clarify whether a
+		// "series" is defined as "one or more" or "zero or more". To err on the
+		// side of caution, Go interprets this to mean "zero or more".
+		// Thus, it is okay to return io.EOF here.
+		return hdr, err
+	}
+	if z.buf[0] != gzipID1 || z.buf[1] != gzipID2 || z.buf[2] != gzipDeflate {
+		return hdr, ErrHeader
+	}
+	flg := z.buf[3]
+	hdr.ModTime = time.Unix(int64(le.Uint32(z.buf[4:8])), 0)
+	// z.buf[8] is XFL and is currently ignored.
+	hdr.OS = z.buf[9]
+	z.digest = crc32.ChecksumIEEE(z.buf[:10])
+
+	if flg&flagExtra != 0 {
+		if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil {
+			return hdr, noEOF(err)
+		}
+		z.digest = crc32.Update(z.digest, crc32.IEEETable, z.buf[:2])
+		data := make([]byte, le.Uint16(z.buf[:2]))
+		if _, err = io.ReadFull(z.r, data); err != nil {
+			return hdr, noEOF(err)
+		}
+		z.digest = crc32.Update(z.digest, crc32.IEEETable, data)
+		hdr.Extra = data
+	}
+
+	var s string
+	if flg&flagName != 0 {
+		if s, err = z.readString(); err != nil {
+			return hdr, err
+		}
+		hdr.Name = s
+	}
+
+	if flg&flagComment != 0 {
+		if s, err = z.readString(); err != nil {
+			return hdr, err
+		}
+		hdr.Comment = s
+	}
+
+	if flg&flagHdrCrc != 0 {
+		if _, err = io.ReadFull(z.r, z.buf[:2]); err != nil {
+			return hdr, noEOF(err)
+		}
+		digest := le.Uint16(z.buf[:2])
+		if digest != uint16(z.digest) {
+			return hdr, ErrHeader
+		}
+	}
+
+	z.digest = 0
+	if z.decompressor == nil {
+		z.decompressor = flate.NewReader(z.r)
+	} else {
+		z.decompressor.(flate.Resetter).Reset(z.r, nil)
+	}
+	return hdr, nil
+}
+
+// Read implements io.Reader, reading uncompressed bytes from its underlying Reader.
+func (z *Reader) Read(p []byte) (n int, err error) {
+	if z.err != nil {
+		return 0, z.err
+	}
+
+	n, z.err = z.decompressor.Read(p)
+	z.digest = crc32.Update(z.digest, crc32.IEEETable, p[:n])
+	z.size += uint32(n)
+	if z.err != io.EOF {
+		// In the normal case we return here.
+		return n, z.err
+	}
+
+	// Finished file; check checksum and size.
+	if _, err := io.ReadFull(z.r, z.buf[:8]); err != nil {
+		z.err = noEOF(err)
+		return n, z.err
+	}
+	digest := le.Uint32(z.buf[:4])
+	size := le.Uint32(z.buf[4:8])
+	if digest != z.digest || size != z.size {
+		z.err = ErrChecksum
+		return n, z.err
+	}
+	z.digest, z.size = 0, 0
+
+	// File is ok; check if there is another.
+	if !z.multistream {
+		return n, io.EOF
+	}
+	z.err = nil // Remove io.EOF
+
+	if _, z.err = z.readHeader(); z.err != nil {
+		return n, z.err
+	}
+
+	// Read from next file, if necessary.
+	if n > 0 {
+		return n, nil
+	}
+	return z.Read(p)
+}
+
+// WriteTo implements the io.WriterTo interface for io.Copy and friends.
+func (z *Reader) WriteTo(w io.Writer) (int64, error) {
+	total := int64(0)
+	crcWriter := crc32.NewIEEE()
+	for {
+		if z.err != nil {
+			if z.err == io.EOF {
+				return total, nil
+			}
+			return total, z.err
+		}
+
+		// We write both to output and digest.
+		mw := io.MultiWriter(w, crcWriter)
+		n, err := z.decompressor.(io.WriterTo).WriteTo(mw)
+		total += n
+		z.size += uint32(n)
+		if err != nil {
+			z.err = err
+			return total, z.err
+		}
+
+		// Finished file; check checksum + size.
+		if _, err := io.ReadFull(z.r, z.buf[0:8]); err != nil {
+			if err == io.EOF {
+				err = io.ErrUnexpectedEOF
+			}
+			z.err = err
+			return total, err
+		}
+		z.digest = crcWriter.Sum32()
+		digest := le.Uint32(z.buf[:4])
+		size := le.Uint32(z.buf[4:8])
+		if digest != z.digest || size != z.size {
+			z.err = ErrChecksum
+			return total, z.err
+		}
+		z.digest, z.size = 0, 0
+
+		// File is ok; check if there is another.
+		if !z.multistream {
+			return total, nil
+		}
+		crcWriter.Reset()
+		z.err = nil // Remove io.EOF
+
+		if _, z.err = z.readHeader(); z.err != nil {
+			if z.err == io.EOF {
+				return total, nil
+			}
+			return total, z.err
+		}
+	}
+}
+
+// Close closes the Reader. It does not close the underlying io.Reader.
+// In order for the GZIP checksum to be verified, the reader must be
+// fully consumed until the io.EOF.
+func (z *Reader) Close() error { return z.decompressor.Close() }
diff --git a/vendor/github.com/klauspost/compress/gzip/gzip.go b/vendor/github.com/klauspost/compress/gzip/gzip.go
new file mode 100644
index 00000000000..26203851bdf
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/gzip/gzip.go
@@ -0,0 +1,269 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gzip
+
+import (
+	"errors"
+	"fmt"
+	"hash/crc32"
+	"io"
+
+	"github.com/klauspost/compress/flate"
+)
+
+// These constants are copied from the flate package, so that code that imports
+// "compress/gzip" does not also have to import "compress/flate".
+const (
+	NoCompression       = flate.NoCompression
+	BestSpeed           = flate.BestSpeed
+	BestCompression     = flate.BestCompression
+	DefaultCompression  = flate.DefaultCompression
+	ConstantCompression = flate.ConstantCompression
+	HuffmanOnly         = flate.HuffmanOnly
+
+	// StatelessCompression will do compression but without maintaining any state
+	// between Write calls.
+	// There will be no memory kept between Write calls,
+	// but compression and speed will be suboptimal.
+	// Because of this, the size of actual Write calls will affect output size.
+	StatelessCompression = -3
+)
+
+// A Writer is an io.WriteCloser.
+// Writes to a Writer are compressed and written to w.
+type Writer struct {
+	Header      // written at first call to Write, Flush, or Close
+	w           io.Writer
+	level       int
+	err         error
+	compressor  *flate.Writer
+	digest      uint32 // CRC-32, IEEE polynomial (section 8)
+	size        uint32 // Uncompressed size (section 2.3.1)
+	wroteHeader bool
+	closed      bool
+	buf         [10]byte
+}
+
+// NewWriter returns a new Writer.
+// Writes to the returned writer are compressed and written to w.
+//
+// It is the caller's responsibility to call Close on the WriteCloser when done.
+// Writes may be buffered and not flushed until Close.
+//
+// Callers that wish to set the fields in Writer.Header must do so before
+// the first call to Write, Flush, or Close.
+func NewWriter(w io.Writer) *Writer {
+	z, _ := NewWriterLevel(w, DefaultCompression)
+	return z
+}
+
+// NewWriterLevel is like NewWriter but specifies the compression level instead
+// of assuming DefaultCompression.
+//
+// The compression level can be any integer value from StatelessCompression
+// to BestCompression inclusive; levels outside that range are rejected.
+// The error returned will be nil if the level is valid.
+func NewWriterLevel(w io.Writer, level int) (*Writer, error) {
+	if level < StatelessCompression || level > BestCompression {
+		return nil, fmt.Errorf("gzip: invalid compression level: %d", level)
+	}
+	z := new(Writer)
+	z.init(w, level)
+	return z, nil
+}
+
+func (z *Writer) init(w io.Writer, level int) {
+	compressor := z.compressor
+	if level != StatelessCompression {
+		if compressor != nil {
+			compressor.Reset(w)
+		}
+	}
+
+	*z = Writer{
+		Header: Header{
+			OS: 255, // unknown
+		},
+		w:          w,
+		level:      level,
+		compressor: compressor,
+	}
+}
+
+// Reset discards the Writer z's state and makes it equivalent to the
+// result of its original state from NewWriter or NewWriterLevel, but
+// writing to w instead. This permits reusing a Writer rather than
+// allocating a new one.
+func (z *Writer) Reset(w io.Writer) {
+	z.init(w, z.level)
+}
+
+// writeBytes writes a length-prefixed byte slice to z.w.
+func (z *Writer) writeBytes(b []byte) error {
+	if len(b) > 0xffff {
+		return errors.New("gzip.Write: Extra data is too large")
+	}
+	le.PutUint16(z.buf[:2], uint16(len(b)))
+	_, err := z.w.Write(z.buf[:2])
+	if err != nil {
+		return err
+	}
+	_, err = z.w.Write(b)
+	return err
+}
+
+// writeString writes a UTF-8 string s in GZIP's format to z.w.
+// GZIP (RFC 1952) specifies that strings are NUL-terminated ISO 8859-1 (Latin-1).
+func (z *Writer) writeString(s string) (err error) {
+	// GZIP stores Latin-1 strings; error if non-Latin-1; convert if non-ASCII.
+	needconv := false
+	for _, v := range s {
+		if v == 0 || v > 0xff {
+			return errors.New("gzip.Write: non-Latin-1 header string")
+		}
+		if v > 0x7f {
+			needconv = true
+		}
+	}
+	if needconv {
+		b := make([]byte, 0, len(s))
+		for _, v := range s {
+			b = append(b, byte(v))
+		}
+		_, err = z.w.Write(b)
+	} else {
+		_, err = io.WriteString(z.w, s)
+	}
+	if err != nil {
+		return err
+	}
+	// GZIP strings are NUL-terminated.
+	z.buf[0] = 0
+	_, err = z.w.Write(z.buf[:1])
+	return err
+}
+
+// Write writes a compressed form of p to the underlying io.Writer. The
+// compressed bytes are not necessarily flushed until the Writer is closed.
+func (z *Writer) Write(p []byte) (int, error) {
+	if z.err != nil {
+		return 0, z.err
+	}
+	var n int
+	// Write the GZIP header lazily.
+	if !z.wroteHeader {
+		z.wroteHeader = true
+		z.buf[0] = gzipID1
+		z.buf[1] = gzipID2
+		z.buf[2] = gzipDeflate
+		z.buf[3] = 0
+		if z.Extra != nil {
+			z.buf[3] |= 0x04
+		}
+		if z.Name != "" {
+			z.buf[3] |= 0x08
+		}
+		if z.Comment != "" {
+			z.buf[3] |= 0x10
+		}
+		le.PutUint32(z.buf[4:8], uint32(z.ModTime.Unix()))
+		if z.level == BestCompression {
+			z.buf[8] = 2
+		} else if z.level == BestSpeed {
+			z.buf[8] = 4
+		} else {
+			z.buf[8] = 0
+		}
+		z.buf[9] = z.OS
+		n, z.err = z.w.Write(z.buf[:10])
+		if z.err != nil {
+			return n, z.err
+		}
+		if z.Extra != nil {
+			z.err = z.writeBytes(z.Extra)
+			if z.err != nil {
+				return n, z.err
+			}
+		}
+		if z.Name != "" {
+			z.err = z.writeString(z.Name)
+			if z.err != nil {
+				return n, z.err
+			}
+		}
+		if z.Comment != "" {
+			z.err = z.writeString(z.Comment)
+			if z.err != nil {
+				return n, z.err
+			}
+		}
+
+		if z.compressor == nil && z.level != StatelessCompression {
+			z.compressor, _ = flate.NewWriter(z.w, z.level)
+		}
+	}
+	z.size += uint32(len(p))
+	z.digest = crc32.Update(z.digest, crc32.IEEETable, p)
+	if z.level == StatelessCompression {
+		return len(p), flate.StatelessDeflate(z.w, p, false, nil)
+	}
+	n, z.err = z.compressor.Write(p)
+	return n, z.err
+}
+
+// Flush flushes any pending compressed data to the underlying writer.
+//
+// It is useful mainly in compressed network protocols, to ensure that
+// a remote reader has enough data to reconstruct a packet. Flush does
+// not return until the data has been written. If the underlying
+// writer returns an error, Flush returns that error.
+//
+// In the terminology of the zlib library, Flush is equivalent to Z_SYNC_FLUSH.
+func (z *Writer) Flush() error {
+	if z.err != nil {
+		return z.err
+	}
+	if z.closed || z.level == StatelessCompression {
+		return nil
+	}
+	if !z.wroteHeader {
+		z.Write(nil)
+		if z.err != nil {
+			return z.err
+		}
+	}
+	z.err = z.compressor.Flush()
+	return z.err
+}
+
+// Close closes the Writer, flushing any unwritten data to the underlying
+// io.Writer, but does not close the underlying io.Writer.
+func (z *Writer) Close() error {
+	if z.err != nil {
+		return z.err
+	}
+	if z.closed {
+		return nil
+	}
+	z.closed = true
+	if !z.wroteHeader {
+		z.Write(nil)
+		if z.err != nil {
+			return z.err
+		}
+	}
+	if z.level == StatelessCompression {
+		z.err = flate.StatelessDeflate(z.w, nil, true, nil)
+	} else {
+		z.err = z.compressor.Close()
+	}
+	if z.err != nil {
+		return z.err
+	}
+	le.PutUint32(z.buf[:4], z.digest)
+	le.PutUint32(z.buf[4:8], z.size)
+	_, z.err = z.w.Write(z.buf[:8])
+	return z.err
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/.gitignore b/vendor/github.com/klauspost/compress/huff0/.gitignore
new file mode 100644
index 00000000000..b3d262958f8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/.gitignore
@@ -0,0 +1 @@
+/huff0-fuzz.zip
diff --git a/vendor/github.com/klauspost/compress/huff0/README.md b/vendor/github.com/klauspost/compress/huff0/README.md
new file mode 100644
index 00000000000..0a8448ce9f9
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/README.md
@@ -0,0 +1,87 @@
+# Huff0 entropy compression
+
+This package provides Huff0 encoding and decoding as used in zstd.
+            
+[Huff0](https://github.com/Cyan4973/FiniteStateEntropy#new-generation-entropy-coders), 
+a Huffman codec designed for modern CPU, featuring OoO (Out of Order) operations on multiple ALU 
+(Arithmetic Logic Unit), achieving extremely fast compression and decompression speeds.
+
+This can be used for compressing input with a lot of similar input values to the smallest number of bytes.
+This does not perform any multi-byte [dictionary coding](https://en.wikipedia.org/wiki/Dictionary_coder) as LZ coders,
+but it can be used as a secondary step to compressors (like Snappy) that do not do entropy encoding.
+
+* [Godoc documentation](https://godoc.org/github.com/klauspost/compress/huff0)
+
+THIS PACKAGE IS NOT CONSIDERED STABLE AND API OR ENCODING MAY CHANGE IN THE FUTURE.
+
+## News
+
+ * Mar 2018: First implementation released. Consider this beta software for now.
+
+# Usage
+
+This package provides a low-level interface that allows compressing single independent blocks.
+
+Each block is separate, and there are no built-in integrity checks.
+This means that the caller should keep track of block sizes and also do checksums if needed.  
+
+Compressing a block is done via the [`Compress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Compress1X) and 
+[`Compress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Compress4X) functions.
+You must provide input and will receive the output and maybe an error.
+
+These error values can be returned:
+
+| Error               | Description                                                                 |
+|---------------------|-----------------------------------------------------------------------------|
+| `<nil>`             | Everything ok, output is returned                                           |
+| `ErrIncompressible` | Returned when input is judged to be too hard to compress                    |
+| `ErrUseRLE`         | Returned from the compressor when the input is a single byte value repeated |
+| `ErrTooBig`         | Returned if the input block exceeds the maximum allowed size (128 Kib)      |
+| `(error)`           | An internal error occurred.                                                 |
+
+
+As can be seen above, some of these errors will be returned even under normal operation, so it is important to handle them.
+
+To reduce allocations you can provide a [`Scratch`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch) object 
+that can be re-used for successive calls. Both compression and decompression accept a `Scratch` object, and the same 
+object can be used for both.   
+
+Be aware that when re-using a `Scratch` object the *output* buffer is also re-used, so if you are still using the
+previous output you must set the `Out` field in the scratch to nil. The same buffer is used for compression and decompression output.
+
+The `Scratch` object will retain state that allows re-using previous tables for encoding and decoding.
+
+## Tables and re-use
+
+Huff0 allows for reusing tables from the previous block to save space if that is expected to give better/faster results. 
+
+The Scratch object allows you to set a [`ReusePolicy`](https://godoc.org/github.com/klauspost/compress/huff0#ReusePolicy) 
+that controls this behaviour. See the documentation for details. This can be altered between each block.
+
+Do however note that this information is *not* stored in the output block and it is up to the users of the package to
+record whether [`ReadTable`](https://godoc.org/github.com/klauspost/compress/huff0#ReadTable) should be called,
+based on the boolean reported back from the CompressXX call. 
+
+If you want to store the table separate from the data, you can access them as `OutData` and `OutTable` on the 
+[`Scratch`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch) object.
+
+## Decompressing
+
+The first part of decoding is to initialize the decoding table through [`ReadTable`](https://godoc.org/github.com/klauspost/compress/huff0#ReadTable).
+This will initialize the decoding tables. 
+You can supply the complete block to `ReadTable` and it will return the data part of the block 
+which can be given to the decompressor. 
+
+Decompressing is done by calling the [`Decompress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress1X) 
+or [`Decompress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress4X) function.
+
+You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
+your input was likely corrupted. 
+
+It is important to note that a successful decoding does *not* mean your output matches your original input. 
+There are no integrity checks, so relying on errors from the decompressor does not assure your data is valid.
+
+# Contributing
+
+Contributions are always welcome. Be aware that adding public functions will require good justification and breaking 
+changes will likely not be accepted. If in doubt open an issue before writing the PR.
\ No newline at end of file
diff --git a/vendor/github.com/klauspost/compress/huff0/bitreader.go b/vendor/github.com/klauspost/compress/huff0/bitreader.go
new file mode 100644
index 00000000000..7d0903c7010
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/bitreader.go
@@ -0,0 +1,115 @@
+// Copyright 2018 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
+
+package huff0
+
+import (
+	"errors"
+	"io"
+)
+
+// bitReader reads a bitstream in reverse.
+// The last set bit indicates the start of the stream and is used
+// for aligning the input.
+type bitReader struct {
+	in       []byte
+	off      uint // next byte to read is at in[off - 1]
+	value    uint64
+	bitsRead uint8
+}
+
+// init initializes and resets the bit reader.
+func (b *bitReader) init(in []byte) error {
+	if len(in) < 1 {
+		return errors.New("corrupt stream: too short")
+	}
+	b.in = in
+	b.off = uint(len(in))
+	// The highest bit of the last byte indicates where to start
+	v := in[len(in)-1]
+	if v == 0 {
+		return errors.New("corrupt stream, did not find end of stream")
+	}
+	b.bitsRead = 64
+	b.value = 0
+	b.fill()
+	b.fill()
+	b.bitsRead += 8 - uint8(highBit32(uint32(v)))
+	return nil
+}
+
+// getBits will return n bits. n can be 0.
+func (b *bitReader) getBits(n uint8) uint16 {
+	if n == 0 || b.bitsRead >= 64 {
+		return 0
+	}
+	return b.getBitsFast(n)
+}
+
+// getBitsFast requires that at least one bit is requested every time.
+// There are no checks if the buffer is filled.
+func (b *bitReader) getBitsFast(n uint8) uint16 {
+	const regMask = 64 - 1
+	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
+	b.bitsRead += n
+	return v
+}
+
+// peekBitsFast requires that at least one bit is requested every time.
+// There are no checks if the buffer is filled.
+func (b *bitReader) peekBitsFast(n uint8) uint16 {
+	const regMask = 64 - 1
+	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
+	return v
+}
+
+// fillFast() will make sure at least 32 bits are available.
+// There must be at least 4 bytes available.
+func (b *bitReader) fillFast() {
+	if b.bitsRead < 32 {
+		return
+	}
+	// Do single re-slice to avoid bounds checks.
+	v := b.in[b.off-4 : b.off]
+	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	b.value = (b.value << 32) | uint64(low)
+	b.bitsRead -= 32
+	b.off -= 4
+}
+
+// fill() will make sure at least 32 bits are available.
+func (b *bitReader) fill() {
+	if b.bitsRead < 32 {
+		return
+	}
+	if b.off > 4 {
+		v := b.in[b.off-4 : b.off]
+		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		b.value = (b.value << 32) | uint64(low)
+		b.bitsRead -= 32
+		b.off -= 4
+		return
+	}
+	for b.off > 0 {
+		b.value = (b.value << 8) | uint64(b.in[b.off-1])
+		b.bitsRead -= 8
+		b.off--
+	}
+}
+
+// finished returns true if all bits have been read from the bit stream.
+func (b *bitReader) finished() bool {
+	return b.off == 0 && b.bitsRead >= 64
+}
+
+// close closes the bitstream and returns an error if out-of-buffer reads occurred.
+func (b *bitReader) close() error {
+	// Release reference.
+	b.in = nil
+	if b.bitsRead > 64 {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/bitwriter.go b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
new file mode 100644
index 00000000000..bda4021efd3
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/bitwriter.go
@@ -0,0 +1,197 @@
+// Copyright 2018 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
+
+package huff0
+
+import "fmt"
+
+// bitWriter will write bits.
+// First bit will be LSB of the first byte of output.
+type bitWriter struct {
+	bitContainer uint64
+	nBits        uint8
+	out          []byte
+}
+
+// bitMask16 holds bit masks. It is padded to 32 entries so indexing with bits&31 avoids a bounds check.
+var bitMask16 = [32]uint16{
+	0, 1, 3, 7, 0xF, 0x1F,
+	0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
+	0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF,
+	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+	0xFFFF, 0xFFFF} /* up to 16 bits */
+
+// addBits16NC will add up to 16 bits.
+// It will not check if there is space for them,
+// so the caller must ensure that it has flushed recently.
+func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
+	b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
+	b.nBits += bits
+}
+
+// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
+// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
+func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
+	b.bitContainer |= uint64(value) << (b.nBits & 63)
+	b.nBits += bits
+}
+
+// encSymbol will add up to 16 bits. value may not contain more set bits than indicated.
+// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
+func (b *bitWriter) encSymbol(ct cTable, symbol byte) {
+	enc := ct[symbol]
+	b.bitContainer |= uint64(enc.val) << (b.nBits & 63)
+	b.nBits += enc.nBits
+}
+
+// encTwoSymbols will add up to 32 bits. value may not contain more set bits than indicated.
+// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
+func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
+	encA := ct[av]
+	encB := ct[bv]
+	sh := b.nBits & 63
+	combined := uint64(encA.val) | (uint64(encB.val) << (encA.nBits & 63))
+	b.bitContainer |= combined << sh
+	b.nBits += encA.nBits + encB.nBits
+}
+
+// addBits16ZeroNC will add up to 16 bits.
+// It will not check if there is space for them,
+// so the caller must ensure that it has flushed recently.
+// This is fastest if bits can be zero.
+func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
+	if bits == 0 {
+		return
+	}
+	value <<= (16 - bits) & 15
+	value >>= (16 - bits) & 15
+	b.bitContainer |= uint64(value) << (b.nBits & 63)
+	b.nBits += bits
+}
+
+// flush will flush all pending full bytes.
+// There will be at least 56 bits available for writing when this has been called.
+// Using flush32 is faster, but leaves less space for writing.
+func (b *bitWriter) flush() {
+	v := b.nBits >> 3
+	switch v {
+	case 0:
+		return
+	case 1:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+		)
+		b.bitContainer >>= 1 << 3
+	case 2:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+		)
+		b.bitContainer >>= 2 << 3
+	case 3:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+		)
+		b.bitContainer >>= 3 << 3
+	case 4:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+		)
+		b.bitContainer >>= 4 << 3
+	case 5:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+		)
+		b.bitContainer >>= 5 << 3
+	case 6:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+			byte(b.bitContainer>>40),
+		)
+		b.bitContainer >>= 6 << 3
+	case 7:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+			byte(b.bitContainer>>40),
+			byte(b.bitContainer>>48),
+		)
+		b.bitContainer >>= 7 << 3
+	case 8:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+			byte(b.bitContainer>>40),
+			byte(b.bitContainer>>48),
+			byte(b.bitContainer>>56),
+		)
+		b.bitContainer = 0
+		b.nBits = 0
+		return
+	default:
+		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
+	}
+	b.nBits &= 7
+}
+
+// flush32 will flush out, so there are at least 32 bits available for writing.
+func (b *bitWriter) flush32() {
+	if b.nBits < 32 {
+		return
+	}
+	b.out = append(b.out,
+		byte(b.bitContainer),
+		byte(b.bitContainer>>8),
+		byte(b.bitContainer>>16),
+		byte(b.bitContainer>>24))
+	b.nBits -= 32
+	b.bitContainer >>= 32
+}
+
+// flushAlign will flush remaining full bytes and align to next byte boundary.
+func (b *bitWriter) flushAlign() {
+	nbBytes := (b.nBits + 7) >> 3
+	for i := uint8(0); i < nbBytes; i++ {
+		b.out = append(b.out, byte(b.bitContainer>>(i*8)))
+	}
+	b.nBits = 0
+	b.bitContainer = 0
+}
+
+// close will write the alignment bit and write the final byte(s)
+// to the output.
+func (b *bitWriter) close() error {
+	// End mark
+	b.addBits16Clean(1, 1)
+	// flush until next byte.
+	b.flushAlign()
+	return nil
+}
+
+// reset and continue writing by appending to out.
+func (b *bitWriter) reset(out []byte) {
+	b.bitContainer = 0
+	b.nBits = 0
+	b.out = out
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/bytereader.go b/vendor/github.com/klauspost/compress/huff0/bytereader.go
new file mode 100644
index 00000000000..50bcdf6ea99
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/bytereader.go
@@ -0,0 +1,54 @@
+// Copyright 2018 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
+
+package huff0
+
+// byteReader provides a byte reader that reads
+// little endian values from a byte stream.
+// The input stream is manually advanced.
+// The reader performs no bounds checks.
+type byteReader struct {
+	b   []byte
+	off int
+}
+
+// init will initialize the reader and set the input.
+func (b *byteReader) init(in []byte) {
+	b.b = in
+	b.off = 0
+}
+
+// advance the stream by n bytes.
+func (b *byteReader) advance(n uint) {
+	b.off += int(n)
+}
+
+// Int32 returns a little endian int32 starting at current offset.
+func (b byteReader) Int32() int32 {
+	v3 := int32(b.b[b.off+3])
+	v2 := int32(b.b[b.off+2])
+	v1 := int32(b.b[b.off+1])
+	v0 := int32(b.b[b.off])
+	return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
+}
+
+// Uint32 returns a little endian uint32 starting at current offset.
+func (b byteReader) Uint32() uint32 {
+	v3 := uint32(b.b[b.off+3])
+	v2 := uint32(b.b[b.off+2])
+	v1 := uint32(b.b[b.off+1])
+	v0 := uint32(b.b[b.off])
+	return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
+}
+
+// unread returns the unread portion of the input.
+func (b byteReader) unread() []byte {
+	return b.b[b.off:]
+}
+
+// remain will return the number of bytes remaining.
+func (b byteReader) remain() int {
+	return len(b.b) - b.off
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/compress.go b/vendor/github.com/klauspost/compress/huff0/compress.go
new file mode 100644
index 00000000000..0843cb014ff
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/compress.go
@@ -0,0 +1,651 @@
+package huff0
+
+import (
+	"fmt"
+	"runtime"
+	"sync"
+)
+
+// Compress1X will compress the input as a single stream.
+// The output can be decoded using Decompress1X.
+// Supply a Scratch object. The scratch object contains state about re-use,
+// so when sharing across independent encodes, be sure to set the re-use policy.
+func Compress1X(in []byte, s *Scratch) (out []byte, reUsed bool, err error) {
+	s, err = s.prepare(in)
+	if err != nil {
+		return nil, false, err
+	}
+	return compress(in, s, s.compress1X)
+}
+
+// Compress4X will compress the input. The input is split into 4 independent blocks
+// and compressed similar to Compress1X.
+// The output can be decoded using Decompress4X.
+// Supply a Scratch object. The scratch object contains state about re-use,
+// so when sharing across independent encodes, be sure to set the re-use policy.
+func Compress4X(in []byte, s *Scratch) (out []byte, reUsed bool, err error) {
+	s, err = s.prepare(in)
+	if err != nil {
+		return nil, false, err
+	}
+	if false {
+		// TODO: branch disabled on purpose; compress4Xp is only slightly faster.
+		const parallelThreshold = 8 << 10
+		if len(in) < parallelThreshold || runtime.GOMAXPROCS(0) == 1 {
+			return compress(in, s, s.compress4X)
+		}
+		return compress(in, s, s.compress4Xp)
+	}
+	return compress(in, s, s.compress4X)
+}
+
+func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)) (out []byte, reUsed bool, err error) {
+	// Nuke previous table if we cannot reuse anyway.
+	if s.Reuse == ReusePolicyNone {
+		s.prevTable = s.prevTable[:0]
+	}
+
+	// Create histogram, if none was provided.
+	maxCount := s.maxCount
+	var canReuse = false
+	if maxCount == 0 {
+		maxCount, canReuse = s.countSimple(in)
+	} else {
+		canReuse = s.canUseTable(s.prevTable)
+	}
+
+	// We want the output size to be less than this:
+	wantSize := len(in)
+	if s.WantLogLess > 0 {
+		wantSize -= wantSize >> s.WantLogLess
+	}
+
+	// Reset for next run.
+	s.clearCount = true
+	s.maxCount = 0
+	if maxCount >= len(in) {
+		if maxCount > len(in) {
+			return nil, false, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
+		}
+		if len(in) == 1 {
+			return nil, false, ErrIncompressible
+		}
+		// One symbol, use RLE
+		return nil, false, ErrUseRLE
+	}
+	if maxCount == 1 || maxCount < (len(in)>>7) {
+		// Each symbol present maximum once or too well distributed.
+		return nil, false, ErrIncompressible
+	}
+
+	if s.Reuse == ReusePolicyPrefer && canReuse {
+		keepTable := s.cTable
+		keepTL := s.actualTableLog
+		s.cTable = s.prevTable
+		s.actualTableLog = s.prevTableLog
+		s.Out, err = compressor(in)
+		s.cTable = keepTable
+		s.actualTableLog = keepTL
+		if err == nil && len(s.Out) < wantSize {
+			s.OutData = s.Out
+			return s.Out, true, nil
+		}
+		// Do not attempt to re-use later.
+		s.prevTable = s.prevTable[:0]
+	}
+
+	// Calculate new table.
+	err = s.buildCTable()
+	if err != nil {
+		return nil, false, err
+	}
+
+	if false && !s.canUseTable(s.cTable) {
+		panic("invalid table generated")
+	}
+
+	if s.Reuse == ReusePolicyAllow && canReuse {
+		hSize := len(s.Out)
+		oldSize := s.prevTable.estimateSize(s.count[:s.symbolLen])
+		newSize := s.cTable.estimateSize(s.count[:s.symbolLen])
+		if oldSize <= hSize+newSize || hSize+12 >= wantSize {
+			// Retain cTable even if we re-use.
+			keepTable := s.cTable
+			keepTL := s.actualTableLog
+
+			s.cTable = s.prevTable
+			s.actualTableLog = s.prevTableLog
+			s.Out, err = compressor(in)
+
+			// Restore ctable.
+			s.cTable = keepTable
+			s.actualTableLog = keepTL
+			if err != nil {
+				return nil, false, err
+			}
+			if len(s.Out) >= wantSize {
+				return nil, false, ErrIncompressible
+			}
+			s.OutData = s.Out
+			return s.Out, true, nil
+		}
+	}
+
+	// Use new table
+	err = s.cTable.write(s)
+	if err != nil {
+		s.OutTable = nil
+		return nil, false, err
+	}
+	s.OutTable = s.Out
+
+	// Compress using new table
+	s.Out, err = compressor(in)
+	if err != nil {
+		s.OutTable = nil
+		return nil, false, err
+	}
+	if len(s.Out) >= wantSize {
+		s.OutTable = nil
+		return nil, false, ErrIncompressible
+	}
+	// Move current table into previous.
+	s.prevTable, s.prevTableLog, s.cTable = s.cTable, s.actualTableLog, s.prevTable[:0]
+	s.OutData = s.Out[len(s.OutTable):]
+	return s.Out, false, nil
+}
+
+func (s *Scratch) compress1X(src []byte) ([]byte, error) {
+	return s.compress1xDo(s.Out, src)
+}
+
+func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) {
+	var bw = bitWriter{out: dst}
+
+	// n is len(src) rounded down to a multiple of 4.
+	n := len(src)
+	n -= n & 3
+	cTable := s.cTable[:256]
+
+	// Encode the trailing len(src)&3 bytes first, from the last byte backwards.
+	for i := len(src) & 3; i > 0; i-- {
+		bw.encSymbol(cTable, src[n+i-1])
+	}
+	n -= 4
+	if s.actualTableLog <= 8 {
+		for ; n >= 0; n -= 4 {
+			tmp := src[n : n+4]
+			// tmp should be len 4
+			bw.flush32()
+			bw.encTwoSymbols(cTable, tmp[3], tmp[2])
+			bw.encTwoSymbols(cTable, tmp[1], tmp[0])
+		}
+	} else {
+		for ; n >= 0; n -= 4 {
+			tmp := src[n : n+4]
+			// tmp should be len 4
+			bw.flush32()
+			bw.encTwoSymbols(cTable, tmp[3], tmp[2])
+			bw.flush32()
+			bw.encTwoSymbols(cTable, tmp[1], tmp[0])
+		}
+	}
+	err := bw.close()
+	return bw.out, err
+}
+
+var sixZeros [6]byte
+
+func (s *Scratch) compress4X(src []byte) ([]byte, error) {
+	if len(src) < 12 {
+		return nil, ErrIncompressible
+	}
+	segmentSize := (len(src) + 3) / 4
+
+	// Add a 6 byte placeholder for the three 16 bit stream lengths.
+	offsetIdx := len(s.Out)
+	s.Out = append(s.Out, sixZeros[:]...)
+
+	for i := 0; i < 4; i++ {
+		toDo := src
+		if len(toDo) > segmentSize {
+			toDo = toDo[:segmentSize]
+		}
+		src = src[len(toDo):]
+
+		var err error
+		idx := len(s.Out)
+		s.Out, err = s.compress1xDo(s.Out, toDo)
+		if err != nil {
+			return nil, err
+		}
+		// Store the compressed length as 16 bit little endian in the placeholder.
+		if i < 3 {
+			// Last length is not written.
+			length := len(s.Out) - idx
+			s.Out[i*2+offsetIdx] = byte(length)
+			s.Out[i*2+offsetIdx+1] = byte(length >> 8)
+		}
+	}
+
+	return s.Out, nil
+}
+
+// compress4Xp will compress 4 streams using separate goroutines.
+func (s *Scratch) compress4Xp(src []byte) ([]byte, error) {
+	if len(src) < 12 {
+		return nil, ErrIncompressible
+	}
+	// Add placeholder for output length
+	s.Out = s.Out[:6]
+
+	segmentSize := (len(src) + 3) / 4
+	var wg sync.WaitGroup
+	var errs [4]error
+	wg.Add(4)
+	for i := 0; i < 4; i++ {
+		toDo := src
+		if len(toDo) > segmentSize {
+			toDo = toDo[:segmentSize]
+		}
+		src = src[len(toDo):]
+
+		// Separate goroutine for each block.
+		go func(i int) {
+			s.tmpOut[i], errs[i] = s.compress1xDo(s.tmpOut[i][:0], toDo)
+			wg.Done()
+		}(i)
+	}
+	wg.Wait()
+	for i := 0; i < 4; i++ {
+		if errs[i] != nil {
+			return nil, errs[i]
+		}
+		o := s.tmpOut[i]
+		// Write compressed length as little endian before block.
+		if i < 3 {
+			// Last length is not written.
+			s.Out[i*2] = byte(len(o))
+			s.Out[i*2+1] = byte(len(o) >> 8)
+		}
+
+		// Write output.
+		s.Out = append(s.Out, o...)
+	}
+	return s.Out, nil
+}
+
+// countSimple will add the bytes of in to the histogram in s.count and update s.symbolLen.
+// Returns the biggest count and whether s.prevTable has a code for every symbol present.
+// Does not update s.clearCount.
+func (s *Scratch) countSimple(in []byte) (max int, reuse bool) {
+	reuse = true
+	for _, v := range in {
+		s.count[v]++
+	}
+	m := uint32(0)
+	if len(s.prevTable) > 0 {
+		for i, v := range s.count[:] {
+			if v > m {
+				m = v
+			}
+			if v > 0 {
+				s.symbolLen = uint16(i) + 1
+				if i >= len(s.prevTable) {
+					reuse = false
+				} else {
+					if s.prevTable[i].nBits == 0 {
+						reuse = false
+					}
+				}
+			}
+		}
+		return int(m), reuse
+	}
+	for i, v := range s.count[:] {
+		if v > m {
+			m = v
+		}
+		if v > 0 {
+			s.symbolLen = uint16(i) + 1
+		}
+	}
+	return int(m), false
+}
+
+func (s *Scratch) canUseTable(c cTable) bool {
+	if len(c) < int(s.symbolLen) {
+		return false
+	}
+	for i, v := range s.count[:s.symbolLen] {
+		if v != 0 && c[i].nBits == 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func (s *Scratch) validateTable(c cTable) bool {
+	if len(c) < int(s.symbolLen) {
+		return false
+	}
+	for i, v := range s.count[:s.symbolLen] {
+		if v != 0 {
+			if c[i].nBits == 0 {
+				return false
+			}
+			if c[i].nBits > s.actualTableLog {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// minTableLog provides the minimum logSize to safely represent a distribution.
+func (s *Scratch) minTableLog() uint8 {
+	minBitsSrc := highBit32(uint32(s.br.remain())) + 1
+	minBitsSymbols := highBit32(uint32(s.symbolLen-1)) + 2
+	if minBitsSrc < minBitsSymbols {
+		return uint8(minBitsSrc)
+	}
+	return uint8(minBitsSymbols)
+}
+
+// optimalTableLog calculates and sets the optimal tableLog in s.actualTableLog
+func (s *Scratch) optimalTableLog() {
+	tableLog := s.TableLog
+	minBits := s.minTableLog()
+	maxBitsSrc := uint8(highBit32(uint32(s.br.remain()-1))) - 1
+	if maxBitsSrc < tableLog {
+		// Accuracy can be reduced
+		tableLog = maxBitsSrc
+	}
+	if minBits > tableLog {
+		tableLog = minBits
+	}
+	// Need a minimum to safely represent all symbol values
+	if tableLog < minTablelog {
+		tableLog = minTablelog
+	}
+	if tableLog > tableLogMax {
+		tableLog = tableLogMax
+	}
+	s.actualTableLog = tableLog
+}
+
+type cTableEntry struct {
+	val   uint16
+	nBits uint8
+	// We have 8 bits extra
+}
+
+const huffNodesMask = huffNodesLen - 1
+
+func (s *Scratch) buildCTable() error {
+	s.optimalTableLog()
+	s.huffSort()
+	if cap(s.cTable) < maxSymbolValue+1 {
+		s.cTable = make([]cTableEntry, s.symbolLen, maxSymbolValue+1)
+	} else {
+		s.cTable = s.cTable[:s.symbolLen]
+		for i := range s.cTable {
+			s.cTable[i] = cTableEntry{}
+		}
+	}
+
+	var startNode = int16(s.symbolLen)
+	nonNullRank := s.symbolLen - 1
+
+	nodeNb := int16(startNode)
+	huffNode := s.nodes[1 : huffNodesLen+1]
+
+	// This overlays the slice above, but allows "-1" index lookups.
+	// Different from reference implementation.
+	huffNode0 := s.nodes[0 : huffNodesLen+1]
+
+	for huffNode[nonNullRank].count == 0 {
+		nonNullRank--
+	}
+
+	lowS := int16(nonNullRank)
+	nodeRoot := nodeNb + lowS - 1
+	lowN := nodeNb
+	huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count
+	huffNode[lowS].parent, huffNode[lowS-1].parent = uint16(nodeNb), uint16(nodeNb)
+	nodeNb++
+	lowS -= 2
+	for n := nodeNb; n <= nodeRoot; n++ {
+		huffNode[n].count = 1 << 30
+	}
+	// fake entry, strong barrier
+	huffNode0[0].count = 1 << 31
+
+	// create parents
+	for nodeNb <= nodeRoot {
+		var n1, n2 int16
+		if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
+			n1 = lowS
+			lowS--
+		} else {
+			n1 = lowN
+			lowN++
+		}
+		if huffNode0[lowS+1].count < huffNode0[lowN+1].count {
+			n2 = lowS
+			lowS--
+		} else {
+			n2 = lowN
+			lowN++
+		}
+
+		huffNode[nodeNb].count = huffNode0[n1+1].count + huffNode0[n2+1].count
+		huffNode0[n1+1].parent, huffNode0[n2+1].parent = uint16(nodeNb), uint16(nodeNb)
+		nodeNb++
+	}
+
+	// distribute weights (unlimited tree height)
+	huffNode[nodeRoot].nbBits = 0
+	for n := nodeRoot - 1; n >= startNode; n-- {
+		huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
+	}
+	for n := uint16(0); n <= nonNullRank; n++ {
+		huffNode[n].nbBits = huffNode[huffNode[n].parent].nbBits + 1
+	}
+	s.actualTableLog = s.setMaxHeight(int(nonNullRank))
+	maxNbBits := s.actualTableLog
+
+	// fill result into tree (val, nbBits)
+	if maxNbBits > tableLogMax {
+		return fmt.Errorf("internal error: maxNbBits (%d) > tableLogMax (%d)", maxNbBits, tableLogMax)
+	}
+	var nbPerRank [tableLogMax + 1]uint16
+	var valPerRank [16]uint16
+	for _, v := range huffNode[:nonNullRank+1] {
+		nbPerRank[v.nbBits]++
+	}
+	// determine starting value per rank
+	{
+		min := uint16(0)
+		for n := maxNbBits; n > 0; n-- {
+			// get starting value within each rank
+			valPerRank[n] = min
+			min += nbPerRank[n]
+			min >>= 1
+		}
+	}
+
+	// push nbBits per symbol, symbol order
+	for _, v := range huffNode[:nonNullRank+1] {
+		s.cTable[v.symbol].nBits = v.nbBits
+	}
+
+	// assign value within rank, symbol order
+	t := s.cTable[:s.symbolLen]
+	for n, val := range t {
+		nbits := val.nBits & 15
+		v := valPerRank[nbits]
+		t[n].val = v
+		valPerRank[nbits] = v + 1
+	}
+
+	return nil
+}
+
+// huffSort will sort symbols, decreasing order.
+func (s *Scratch) huffSort() {
+	type rankPos struct {
+		base    uint32
+		current uint32
+	}
+
+	// Extend s.nodes to full length (nothing is zeroed); sorting uses the view that skips slot 0.
+	nodes := s.nodes[:huffNodesLen+1]
+	s.nodes = nodes
+	nodes = nodes[1 : huffNodesLen+1]
+
+	// Sort into buckets based on length of symbol count.
+	var rank [32]rankPos
+	for _, v := range s.count[:s.symbolLen] {
+		r := highBit32(v+1) & 31
+		rank[r].base++
+	}
+	// maxBitLength is log2(BlockSizeMax) + 1
+	const maxBitLength = 18 + 1
+	for n := maxBitLength; n > 0; n-- {
+		rank[n-1].base += rank[n].base
+	}
+	for n := range rank[:maxBitLength] {
+		rank[n].current = rank[n].base
+	}
+	for n, c := range s.count[:s.symbolLen] {
+		r := (highBit32(c+1) + 1) & 31
+		pos := rank[r].current
+		rank[r].current++
+		prev := nodes[(pos-1)&huffNodesMask]
+		for pos > rank[r].base && c > prev.count {
+			nodes[pos&huffNodesMask] = prev
+			pos--
+			prev = nodes[(pos-1)&huffNodesMask]
+		}
+		nodes[pos&huffNodesMask] = nodeElt{count: c, symbol: byte(n)}
+	}
+	return
+}
+
+func (s *Scratch) setMaxHeight(lastNonNull int) uint8 {
+	maxNbBits := s.actualTableLog
+	huffNode := s.nodes[1 : huffNodesLen+1]
+	//huffNode = huffNode[: huffNodesLen]
+
+	largestBits := huffNode[lastNonNull].nbBits
+
+	// early exit : no elt > maxNbBits
+	if largestBits <= maxNbBits {
+		return largestBits
+	}
+	totalCost := int(0)
+	baseCost := int(1) << (largestBits - maxNbBits)
+	n := uint32(lastNonNull)
+
+	for huffNode[n].nbBits > maxNbBits {
+		totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits))
+		huffNode[n].nbBits = maxNbBits
+		n--
+	}
+	// n stops at huffNode[n].nbBits <= maxNbBits
+
+	for huffNode[n].nbBits == maxNbBits {
+		n--
+	}
+	// n end at index of smallest symbol using < maxNbBits
+
+	// renorm totalCost
+	totalCost >>= largestBits - maxNbBits /* note : totalCost is necessarily a multiple of baseCost */
+
+	// repay normalized cost
+	{
+		const noSymbol = 0xF0F0F0F0
+		var rankLast [tableLogMax + 2]uint32
+
+		for i := range rankLast[:] {
+			rankLast[i] = noSymbol
+		}
+
+		// Get pos of last (smallest) symbol per rank
+		{
+			currentNbBits := uint8(maxNbBits)
+			for pos := int(n); pos >= 0; pos-- {
+				if huffNode[pos].nbBits >= currentNbBits {
+					continue
+				}
+				currentNbBits = huffNode[pos].nbBits // < maxNbBits
+				rankLast[maxNbBits-currentNbBits] = uint32(pos)
+			}
+		}
+
+		for totalCost > 0 {
+			nBitsToDecrease := uint8(highBit32(uint32(totalCost))) + 1
+
+			for ; nBitsToDecrease > 1; nBitsToDecrease-- {
+				highPos := rankLast[nBitsToDecrease]
+				lowPos := rankLast[nBitsToDecrease-1]
+				if highPos == noSymbol {
+					continue
+				}
+				if lowPos == noSymbol {
+					break
+				}
+				highTotal := huffNode[highPos].count
+				lowTotal := 2 * huffNode[lowPos].count
+				if highTotal <= lowTotal {
+					break
+				}
+			}
+			// only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !)
+			// HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary
+			// FIXME: try to remove
+			for (nBitsToDecrease <= tableLogMax) && (rankLast[nBitsToDecrease] == noSymbol) {
+				nBitsToDecrease++
+			}
+			totalCost -= 1 << (nBitsToDecrease - 1)
+			if rankLast[nBitsToDecrease-1] == noSymbol {
+				// this rank is no longer empty
+				rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]
+			}
+			huffNode[rankLast[nBitsToDecrease]].nbBits++
+			if rankLast[nBitsToDecrease] == 0 {
+				/* special case, reached largest symbol */
+				rankLast[nBitsToDecrease] = noSymbol
+			} else {
+				rankLast[nBitsToDecrease]--
+				if huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease {
+					rankLast[nBitsToDecrease] = noSymbol /* this rank is now empty */
+				}
+			}
+		}
+
+		for totalCost < 0 { /* Sometimes, cost correction overshoot */
+			if rankLast[1] == noSymbol { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */
+				for huffNode[n].nbBits == maxNbBits {
+					n--
+				}
+				huffNode[n+1].nbBits--
+				rankLast[1] = n + 1
+				totalCost++
+				continue
+			}
+			huffNode[rankLast[1]+1].nbBits--
+			rankLast[1]++
+			totalCost++
+		}
+	}
+	return maxNbBits
+}
+
+type nodeElt struct {
+	count  uint32
+	parent uint16
+	symbol byte
+	nbBits uint8
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/decompress.go b/vendor/github.com/klauspost/compress/huff0/decompress.go
new file mode 100644
index 00000000000..97ae66a4ac7
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/decompress.go
@@ -0,0 +1,472 @@
+package huff0
+
+import (
+	"errors"
+	"fmt"
+	"io"
+
+	"github.com/klauspost/compress/fse"
+)
+
+type dTable struct {
+	single []dEntrySingle
+	double []dEntryDouble
+}
+
+// single-symbols decoding
+type dEntrySingle struct {
+	entry uint16
+}
+
+// double-symbols decoding
+type dEntryDouble struct {
+	seq   uint16
+	nBits uint8
+	len   uint8
+}
+
+// ReadTable will read a table from the input.
+// The size of the input may be larger than the table definition.
+// Any content remaining after the table definition will be returned.
+// If no Scratch is provided a new one is allocated.
+// The returned Scratch can be used for decoding input using this table.
+func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
+	s, err = s.prepare(in)
+	if err != nil {
+		return s, nil, err
+	}
+	if len(in) <= 1 {
+		return s, nil, errors.New("input too small for table")
+	}
+	iSize := in[0]
+	in = in[1:]
+	if iSize >= 128 {
+		// Uncompressed
+		oSize := iSize - 127
+		iSize = (oSize + 1) / 2
+		if int(iSize) > len(in) {
+			return s, nil, errors.New("input too small for table")
+		}
+		for n := uint8(0); n < oSize; n += 2 {
+			v := in[n/2]
+			s.huffWeight[n] = v >> 4
+			s.huffWeight[n+1] = v & 15
+		}
+		s.symbolLen = uint16(oSize)
+		in = in[iSize:]
+	} else {
+		if len(in) <= int(iSize) {
+			return s, nil, errors.New("input too small for table")
+		}
+		// FSE compressed weights
+		s.fse.DecompressLimit = 255
+		hw := s.huffWeight[:]
+		s.fse.Out = hw
+		b, err := fse.Decompress(in[:iSize], s.fse)
+		s.fse.Out = nil
+		if err != nil {
+			return s, nil, err
+		}
+		if len(b) > 255 {
+			return s, nil, errors.New("corrupt input: output table too large")
+		}
+		s.symbolLen = uint16(len(b))
+		in = in[iSize:]
+	}
+
+	// collect weight stats
+	var rankStats [16]uint32
+	weightTotal := uint32(0)
+	for _, v := range s.huffWeight[:s.symbolLen] {
+		if v > tableLogMax {
+			return s, nil, errors.New("corrupt input: weight too large")
+		}
+		v2 := v & 15
+		rankStats[v2]++
+		weightTotal += (1 << v2) >> 1
+	}
+	if weightTotal == 0 {
+		return s, nil, errors.New("corrupt input: weights zero")
+	}
+
+	// get last non-null symbol weight (implied, total must be 2^n)
+	{
+		tableLog := highBit32(weightTotal) + 1
+		if tableLog > tableLogMax {
+			return s, nil, errors.New("corrupt input: tableLog too big")
+		}
+		s.actualTableLog = uint8(tableLog)
+		// determine last weight
+		{
+			total := uint32(1) << tableLog
+			rest := total - weightTotal
+			verif := uint32(1) << highBit32(rest)
+			lastWeight := highBit32(rest) + 1
+			if verif != rest {
+				// last value must be a clean power of 2
+				return s, nil, errors.New("corrupt input: last value not power of two")
+			}
+			s.huffWeight[s.symbolLen] = uint8(lastWeight)
+			s.symbolLen++
+			rankStats[lastWeight]++
+		}
+	}
+
+	if (rankStats[1] < 2) || (rankStats[1]&1 != 0) {
+		// by construction : at least 2 elts of rank 1, must be even
+		return s, nil, errors.New("corrupt input: min elt size, even check failed ")
+	}
+
+	// TODO: Choose between single/double symbol decoding
+
+	// Calculate starting value for each rank
+	{
+		var nextRankStart uint32
+		for n := uint8(1); n < s.actualTableLog+1; n++ {
+			current := nextRankStart
+			nextRankStart += rankStats[n] << (n - 1)
+			rankStats[n] = current
+		}
+	}
+
+	// fill DTable (always full size)
+	tSize := 1 << tableLogMax
+	if len(s.dt.single) != tSize {
+		s.dt.single = make([]dEntrySingle, tSize)
+	}
+	for n, w := range s.huffWeight[:s.symbolLen] {
+		if w == 0 {
+			continue
+		}
+		length := (uint32(1) << w) >> 1
+		d := dEntrySingle{
+			entry: uint16(s.actualTableLog+1-w) | (uint16(n) << 8),
+		}
+		single := s.dt.single[rankStats[w] : rankStats[w]+length]
+		for i := range single {
+			single[i] = d
+		}
+		rankStats[w] += length
+	}
+	return s, in, nil
+}
+
+// Decompress1X will decompress a 1X encoded stream.
+// The length of the supplied input must match the end of a block exactly.
+// Before this is called, the table must be initialized with ReadTable unless
+// the encoder re-used the table.
+func (s *Scratch) Decompress1X(in []byte) (out []byte, err error) {
+	if len(s.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	var br bitReader
+	err = br.init(in)
+	if err != nil {
+		return nil, err
+	}
+	s.Out = s.Out[:0]
+
+	decode := func() byte {
+		val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
+		v := s.dt.single[val]
+		br.bitsRead += uint8(v.entry)
+		return uint8(v.entry >> 8)
+	}
+	hasDec := func(v dEntrySingle) byte {
+		br.bitsRead += uint8(v.entry)
+		return uint8(v.entry >> 8)
+	}
+
+	// Avoid bounds check by always having full sized table.
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	dt := s.dt.single[:tlSize]
+
+	// Use temp table to avoid bound checks/append penalty.
+	var tmp = s.huffWeight[:256]
+	var off uint8
+
+	for br.off >= 8 {
+		br.fillFast()
+		tmp[off+0] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
+		tmp[off+1] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
+		br.fillFast()
+		tmp[off+2] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
+		tmp[off+3] = hasDec(dt[br.peekBitsFast(s.actualTableLog)&tlMask])
+		off += 4
+		if off == 0 {
+			if len(s.Out)+256 > s.MaxDecodedSize {
+				br.close()
+				return nil, ErrMaxDecodedSizeExceeded
+			}
+			s.Out = append(s.Out, tmp...)
+		}
+	}
+
+	if len(s.Out)+int(off) > s.MaxDecodedSize {
+		br.close()
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	s.Out = append(s.Out, tmp[:off]...)
+
+	for !br.finished() {
+		br.fill()
+		if len(s.Out) >= s.MaxDecodedSize {
+			br.close()
+			return nil, ErrMaxDecodedSizeExceeded
+		}
+		s.Out = append(s.Out, decode())
+	}
+	return s.Out, br.close()
+}
+
+// Decompress4X will decompress a 4X encoded stream.
+// Before this is called, the table must be initialized with ReadTable unless
+// the encoder re-used the table.
+// The length of the supplied input must match the end of a block exactly.
+// The destination size of the uncompressed data must be known and provided.
+func (s *Scratch) Decompress4X(in []byte, dstSize int) (out []byte, err error) {
+	if len(s.dt.single) == 0 {
+		return nil, errors.New("no table loaded")
+	}
+	if len(in) < 6+(4*1) {
+		return nil, errors.New("input too small")
+	}
+	if dstSize > s.MaxDecodedSize {
+		return nil, ErrMaxDecodedSizeExceeded
+	}
+	// TODO: We do not detect when we overrun a buffer, except if the last one does.
+
+	var br [4]bitReader
+	start := 6
+	for i := 0; i < 3; i++ {
+		length := int(in[i*2]) | (int(in[i*2+1]) << 8)
+		if start+length >= len(in) {
+			return nil, errors.New("truncated input (or invalid offset)")
+		}
+		err = br[i].init(in[start : start+length])
+		if err != nil {
+			return nil, err
+		}
+		start += length
+	}
+	err = br[3].init(in[start:])
+	if err != nil {
+		return nil, err
+	}
+
+	// Prepare output
+	if cap(s.Out) < dstSize {
+		s.Out = make([]byte, 0, dstSize)
+	}
+	s.Out = s.Out[:dstSize]
+	// destination, offset to match first output
+	dstOut := s.Out
+	dstEvery := (dstSize + 3) / 4
+
+	const tlSize = 1 << tableLogMax
+	const tlMask = tlSize - 1
+	single := s.dt.single[:tlSize]
+
+	decode := func(br *bitReader) byte {
+		val := br.peekBitsFast(s.actualTableLog) /* note : actualTableLog >= 1 */
+		v := single[val&tlMask]
+		br.bitsRead += uint8(v.entry)
+		return uint8(v.entry >> 8)
+	}
+
+	// Use temp table to avoid bound checks/append penalty.
+	var tmp = s.huffWeight[:256]
+	var off uint8
+	var decoded int
+
+	// Decode 2 values from each decoder/loop.
+	const bufoff = 256 / 4
+bigloop:
+	for {
+		for i := range br {
+			br := &br[i]
+			if br.off < 4 {
+				break bigloop
+			}
+			br.fillFast()
+		}
+
+		{
+			const stream = 0
+			val := br[stream].peekBitsFast(s.actualTableLog)
+			v := single[val&tlMask]
+			br[stream].bitsRead += uint8(v.entry)
+
+			val2 := br[stream].peekBitsFast(s.actualTableLog)
+			v2 := single[val2&tlMask]
+			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
+			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
+			br[stream].bitsRead += uint8(v2.entry)
+		}
+
+		{
+			const stream = 1
+			val := br[stream].peekBitsFast(s.actualTableLog)
+			v := single[val&tlMask]
+			br[stream].bitsRead += uint8(v.entry)
+
+			val2 := br[stream].peekBitsFast(s.actualTableLog)
+			v2 := single[val2&tlMask]
+			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
+			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
+			br[stream].bitsRead += uint8(v2.entry)
+		}
+
+		{
+			const stream = 2
+			val := br[stream].peekBitsFast(s.actualTableLog)
+			v := single[val&tlMask]
+			br[stream].bitsRead += uint8(v.entry)
+
+			val2 := br[stream].peekBitsFast(s.actualTableLog)
+			v2 := single[val2&tlMask]
+			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
+			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
+			br[stream].bitsRead += uint8(v2.entry)
+		}
+
+		{
+			const stream = 3
+			val := br[stream].peekBitsFast(s.actualTableLog)
+			v := single[val&tlMask]
+			br[stream].bitsRead += uint8(v.entry)
+
+			val2 := br[stream].peekBitsFast(s.actualTableLog)
+			v2 := single[val2&tlMask]
+			tmp[off+bufoff*stream+1] = uint8(v2.entry >> 8)
+			tmp[off+bufoff*stream] = uint8(v.entry >> 8)
+			br[stream].bitsRead += uint8(v2.entry)
+		}
+
+		off += 2
+
+		if off == bufoff {
+			if bufoff > dstEvery {
+				return nil, errors.New("corruption detected: stream overrun 1")
+			}
+			copy(dstOut, tmp[:bufoff])
+			copy(dstOut[dstEvery:], tmp[bufoff:bufoff*2])
+			copy(dstOut[dstEvery*2:], tmp[bufoff*2:bufoff*3])
+			copy(dstOut[dstEvery*3:], tmp[bufoff*3:bufoff*4])
+			off = 0
+			dstOut = dstOut[bufoff:]
+			decoded += 256
+			// There must at least be 3 buffers left.
+			if len(dstOut) < dstEvery*3 {
+				return nil, errors.New("corruption detected: stream overrun 2")
+			}
+		}
+	}
+	if off > 0 {
+		ioff := int(off)
+		if len(dstOut) < dstEvery*3+ioff {
+			return nil, errors.New("corruption detected: stream overrun 3")
+		}
+		copy(dstOut, tmp[:off])
+		copy(dstOut[dstEvery:dstEvery+ioff], tmp[bufoff:bufoff*2])
+		copy(dstOut[dstEvery*2:dstEvery*2+ioff], tmp[bufoff*2:bufoff*3])
+		copy(dstOut[dstEvery*3:dstEvery*3+ioff], tmp[bufoff*3:bufoff*4])
+		decoded += int(off) * 4
+		dstOut = dstOut[off:]
+	}
+
+	// Decode remaining.
+	for i := range br {
+		offset := dstEvery * i
+		br := &br[i]
+		for !br.finished() {
+			br.fill()
+			if offset >= len(dstOut) {
+				return nil, errors.New("corruption detected: stream overrun 4")
+			}
+			dstOut[offset] = decode(br)
+			offset++
+		}
+		decoded += offset - dstEvery*i
+		err = br.close()
+		if err != nil {
+			return nil, err
+		}
+	}
+	if dstSize != decoded {
+		return nil, errors.New("corruption detected: short output block")
+	}
+	return s.Out, nil
+}
+
+// matches will compare a decoding table to a coding table.
+// Errors are written to the writer.
+// Nothing will be written if table is ok.
+func (s *Scratch) matches(ct cTable, w io.Writer) {
+	if s == nil || len(s.dt.single) == 0 {
+		return
+	}
+	dt := s.dt.single[:1<<s.actualTableLog]
+	tablelog := s.actualTableLog
+	ok := 0
+	broken := 0
+	for sym, enc := range ct {
+		errs := 0
+		broken++
+		if enc.nBits == 0 {
+			for _, dec := range dt {
+				if uint8(dec.entry>>8) == byte(sym) {
+					fmt.Fprintf(w, "symbol %x has decoder, but no encoder\n", sym)
+					errs++
+					break
+				}
+			}
+			if errs == 0 {
+				broken--
+			}
+			continue
+		}
+		// Unused bits in input
+		ub := tablelog - enc.nBits
+		top := enc.val << ub
+		// decoder looks at top bits.
+		dec := dt[top]
+		if uint8(dec.entry) != enc.nBits {
+			fmt.Fprintf(w, "symbol 0x%x bit size mismatch (enc: %d, dec:%d).\n", sym, enc.nBits, uint8(dec.entry))
+			errs++
+		}
+		if uint8(dec.entry>>8) != uint8(sym) {
+			fmt.Fprintf(w, "symbol 0x%x decoder output mismatch (enc: %d, dec:%d).\n", sym, sym, uint8(dec.entry>>8))
+			errs++
+		}
+		if errs > 0 {
+			fmt.Fprintf(w, "%d errros in base, stopping\n", errs)
+			continue
+		}
+		// Ensure that all combinations are covered.
+		for i := uint16(0); i < (1 << ub); i++ {
+			vval := top | i
+			dec := dt[vval]
+			if uint8(dec.entry) != enc.nBits {
+				fmt.Fprintf(w, "symbol 0x%x bit size mismatch (enc: %d, dec:%d).\n", vval, enc.nBits, uint8(dec.entry))
+				errs++
+			}
+			if uint8(dec.entry>>8) != uint8(sym) {
+				fmt.Fprintf(w, "symbol 0x%x decoder output mismatch (enc: %d, dec:%d).\n", vval, sym, uint8(dec.entry>>8))
+				errs++
+			}
+			if errs > 20 {
+				fmt.Fprintf(w, "%d errros, stopping\n", errs)
+				break
+			}
+		}
+		if errs == 0 {
+			ok++
+			broken--
+		}
+	}
+	if broken > 0 {
+		fmt.Fprintf(w, "%d broken, %d ok\n", broken, ok)
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/huff0/huff0.go b/vendor/github.com/klauspost/compress/huff0/huff0.go
new file mode 100644
index 00000000000..177d6c4ea0e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/huff0/huff0.go
@@ -0,0 +1,260 @@
+// Package huff0 provides fast huffman encoding as used in zstd.
+//
+// See README.md at https://github.com/klauspost/compress/tree/master/huff0 for details.
+package huff0
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"math/bits"
+
+	"github.com/klauspost/compress/fse"
+)
+
+const (
+	maxSymbolValue = 255 // default for Scratch.MaxSymbolValue; count and huffWeight hold maxSymbolValue+1 entries
+
+	// zstandard limits tablelog to 11, see:
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#huffman-tree-description
+	tableLogMax     = 11
+	tableLogDefault = 11  // used by prepare when Scratch.TableLog is 0
+	minTablelog     = 5   // smallest TableLog accepted by prepare
+	huffNodesLen    = 512 // prepare gives Scratch.nodes a capacity of huffNodesLen+1
+
+	// BlockSizeMax is the maximum uncompressed input size for a single block.
+	BlockSizeMax = 1<<18 - 1
+)
+
+var (
+	// ErrIncompressible is returned when input is judged to be too hard to compress.
+	ErrIncompressible = errors.New("input is not compressible")
+
+	// ErrUseRLE is returned from the compressor when the input is a single byte value repeated.
+	ErrUseRLE = errors.New("input is single value repeated")
+
+	// ErrTooBig is returned if the input is too large for a single block (see BlockSizeMax).
+	ErrTooBig = errors.New("input too big")
+
+	// ErrMaxDecodedSizeExceeded is returned by decoders if the output would exceed Scratch.MaxDecodedSize.
+	ErrMaxDecodedSizeExceeded = errors.New("maximum output size exceeded")
+)
+
+type ReusePolicy uint8 // ReusePolicy selects if and when a previous table may be re-used (see Scratch.Reuse).
+
+const (
+	// ReusePolicyAllow will allow reuse if it produces smaller output.
+	// It is the zero value and therefore the default policy.
+	ReusePolicyAllow ReusePolicy = iota
+
+	// ReusePolicyPrefer will re-use aggressively if possible.
+	// This will not check if a new table will produce smaller output,
+	// except if the current table is impossible to use or
+	// compressed output is bigger than input.
+	ReusePolicyPrefer
+
+	// ReusePolicyNone will disable re-use of tables.
+	// This is slightly faster than ReusePolicyAllow but may produce larger output.
+	ReusePolicyNone
+)
+
+type Scratch struct {
+	count [maxSymbolValue + 1]uint32 // per-symbol occurrence counts; read by minSize, zeroed by prepare when clearCount is set
+
+	// Per block parameters.
+	// These can be used to override compression parameters of the block.
+	// Do not touch, unless you know what you are doing.
+
+	// Out is the output buffer.
+	// If the scratch is re-used before the caller is done processing the output,
+	// set this field to nil.
+	// Otherwise the output buffer will be re-used for the next Compression/Decompression step
+	// and allocation will be avoided.
+	Out []byte
+
+	// OutTable will contain the table data only, if a new table has been generated.
+	// Slice of the returned data.
+	OutTable []byte
+
+	// OutData will contain the compressed data.
+	// Slice of the returned data.
+	OutData []byte
+
+	// MaxDecodedSize will set the maximum allowed output size.
+	// This value will automatically be set to BlockSizeMax if not set.
+	// Decoders will return ErrMaxDecodedSizeExceeded if this limit is exceeded.
+	MaxDecodedSize int
+
+	br byteReader // input reader; prepare initializes it with the block's input
+
+	// MaxSymbolValue will override the maximum symbol value of the next block.
+	MaxSymbolValue uint8
+
+	// TableLog will attempt to override the tablelog for the next block.
+	// Must be <= 11 and >= 5.
+	TableLog uint8
+
+	// Reuse will specify the reuse policy
+	Reuse ReusePolicy
+
+	// WantLogLess allows to specify a log 2 reduction that should at least be achieved,
+	// otherwise the block will be returned as incompressible.
+	// The reduction should then at least be (input size >> WantLogLess)
+	// If WantLogLess == 0 any improvement will do.
+	WantLogLess uint8
+
+	symbolLen      uint16 // Length of active part of the symbol table.
+	maxCount       int    // count of the most probable symbol
+	clearCount     bool   // clear count
+	actualTableLog uint8  // Selected tablelog.
+	prevTableLog   uint8  // Tablelog for previous table
+	prevTable      cTable // Table used for previous compression.
+	cTable         cTable // compression table
+	dt             dTable // decompression table
+	nodes          []nodeElt
+	tmpOut         [4][]byte
+	fse            *fse.Scratch
+	huffWeight     [maxSymbolValue + 1]byte
+}
+
+func (s *Scratch) prepare(in []byte) (*Scratch, error) { // validates in and the settings, fills defaults and resets per-block buffers
+	if len(in) > BlockSizeMax {
+		return nil, ErrTooBig
+	}
+	if s == nil {
+		s = &Scratch{} // a nil receiver is accepted; the fresh Scratch is what gets returned
+	}
+	if s.MaxSymbolValue == 0 {
+		s.MaxSymbolValue = maxSymbolValue
+	}
+	if s.TableLog == 0 {
+		s.TableLog = tableLogDefault
+	}
+	if s.TableLog > tableLogMax || s.TableLog < minTablelog {
+		return nil, fmt.Errorf(" invalid tableLog %d (%d -> %d)", s.TableLog, minTablelog, tableLogMax)
+	}
+	if s.MaxDecodedSize <= 0 || s.MaxDecodedSize > BlockSizeMax {
+		s.MaxDecodedSize = BlockSizeMax // unset or out-of-range limits fall back to the block maximum
+	}
+	if s.clearCount && s.maxCount == 0 {
+		for i := range s.count {
+			s.count[i] = 0
+		}
+		s.clearCount = false
+	}
+	if cap(s.Out) == 0 {
+		s.Out = make([]byte, 0, len(in))
+	}
+	s.Out = s.Out[:0] // keep the capacity, drop any previous contents
+
+	s.OutTable = nil
+	s.OutData = nil
+	if cap(s.nodes) < huffNodesLen+1 {
+		s.nodes = make([]nodeElt, 0, huffNodesLen+1)
+	}
+	s.nodes = s.nodes[:0]
+	if s.fse == nil {
+		s.fse = &fse.Scratch{}
+	}
+	s.br.init(in)
+
+	return s, nil
+}
+
+type cTable []cTableEntry // compression table, indexed by symbol value
+
+func (c cTable) write(s *Scratch) error { // appends the table's per-symbol weights to s.Out, FSE-compressed or raw
+	var (
+		// maps a code length in bits to its weight; filled in below
+		bitsToWeight [tableLogMax + 1]byte
+		huffLog      = s.actualTableLog
+		// the weight of the last symbol is not written, hence symbolLen-1 (shadows the package constant)
+		maxSymbolValue = uint8(s.symbolLen - 1)
+		huffWeight     = s.huffWeight[:256]
+	)
+	const (
+		maxFSETableLog = 6
+	)
+	// weight = huffLog + 1 - nBits for used code lengths; a length of 0 (unused symbol) keeps weight 0
+	bitsToWeight[0] = 0
+	for n := uint8(1); n < huffLog+1; n++ {
+		bitsToWeight[n] = huffLog + 1 - n
+	}
+
+	// Acquire histogram for FSE. Only 16 counters are cleared because weights are masked to 4 bits below.
+	hist := s.fse.Histogram()
+	hist = hist[:256]
+	for i := range hist[:16] {
+		hist[i] = 0
+	}
+	for n := uint8(0); n < maxSymbolValue; n++ {
+		v := bitsToWeight[c[n].nBits] & 15
+		huffWeight[n] = v
+		hist[v]++
+	}
+
+	// FSE compress if feasible (at least two weights to write).
+	if maxSymbolValue >= 2 {
+		huffMaxCnt := uint32(0)
+		huffMax := uint8(0)
+		for i, v := range hist[:16] {
+			if v == 0 {
+				continue
+			}
+			huffMax = byte(i) // ends up as the largest weight that occurs
+			if v > huffMaxCnt {
+				huffMaxCnt = v
+			}
+		}
+		s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
+		s.fse.TableLog = maxFSETableLog
+		b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
+		if err == nil && len(b) < int(s.symbolLen>>1) {
+			s.Out = append(s.Out, uint8(len(b))) // header byte is the compressed length, always below 128 here
+			s.Out = append(s.Out, b...)
+			return nil
+		}
+		// Unable to compress (RLE/uncompressible)
+	}
+	// write raw values as 4-bits (max : 15)
+	if maxSymbolValue > (256 - 128) {
+		// should not happen : likely means source cannot be compressed
+		return ErrIncompressible
+	}
+	op := s.Out
+	// Raw header: high bit set, low bits hold maxSymbolValue-1; weights follow packed two per byte.
+	op = append(op, 128|(maxSymbolValue-1))
+	// Zero the slot past the last weight: the loop reads huffWeight[n+1], which is that slot when the count is odd.
+	huffWeight[maxSymbolValue] = 0
+	for n := uint16(0); n < uint16(maxSymbolValue); n += 2 {
+		op = append(op, (huffWeight[n]<<4)|huffWeight[n+1])
+	}
+	s.Out = op
+	return nil
+}
+
+// estimateSize returns the estimated size in bytes of input with the symbol counts in hist
+// when encoded with table c: the sum of nBits*count per symbol, rounded up to whole bytes.
+func (c cTable) estimateSize(hist []uint32) int {
+	nbBits := uint32(7) // starting at 7 makes the final >>3 round up
+	for i, v := range c[:len(hist)] {
+		nbBits += uint32(v.nBits) * hist[i]
+	}
+	return int(nbBits >> 3)
+}
+
+// minSize returns the minimum possible size in bytes for total input symbols, using s.count and the shannon limit.
+func (s *Scratch) minSize(total int) int {
+	nbBits := float64(7) // 7 extra bits so the truncating >>3 below does not round down
+	fTotal := float64(total)
+	for _, v := range s.count[:s.symbolLen] {
+		n := float64(v)
+		if n > 0 { // symbols that never occur contribute nothing
+			nbBits += math.Log2(fTotal/n) * n
+		}
+	}
+	return int(nbBits) >> 3
+}
+
+func highBit32(val uint32) (n uint32) {
+	return uint32(bits.Len32(val) - 1) // zero-based index of the highest set bit; val == 0 wraps to 0xffffffff
+}
diff --git a/vendor/github.com/klauspost/compress/snappy/.gitignore b/vendor/github.com/klauspost/compress/snappy/.gitignore
new file mode 100644
index 00000000000..042091d9b3b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/.gitignore
@@ -0,0 +1,16 @@
+cmd/snappytool/snappytool
+testdata/bench
+
+# These explicitly listed benchmark data files are for an obsolete version of
+# snappy_test.go.
+testdata/alice29.txt
+testdata/asyoulik.txt
+testdata/fireworks.jpeg
+testdata/geo.protodata
+testdata/html
+testdata/html_x_4
+testdata/kppkn.gtb
+testdata/lcet10.txt
+testdata/paper-100k.pdf
+testdata/plrabn12.txt
+testdata/urls.10K
diff --git a/vendor/github.com/klauspost/compress/snappy/AUTHORS b/vendor/github.com/klauspost/compress/snappy/AUTHORS
new file mode 100644
index 00000000000..bcfa19520af
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/AUTHORS
@@ -0,0 +1,15 @@
+# This is the official list of Snappy-Go authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS files.
+# See the latter for an explanation.
+
+# Names should be added to this file as
+#	Name or Organization <email address>
+# The email address is not required for organizations.
+
+# Please keep the list sorted.
+
+Damian Gryski <dgryski@gmail.com>
+Google Inc.
+Jan Mercl <0xjnml@gmail.com>
+Rodolfo Carvalho <rhcarvalho@gmail.com>
+Sebastien Binet <seb.binet@gmail.com>
diff --git a/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS b/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS
new file mode 100644
index 00000000000..931ae31606f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/CONTRIBUTORS
@@ -0,0 +1,37 @@
+# This is the official list of people who can contribute
+# (and typically have contributed) code to the Snappy-Go repository.
+# The AUTHORS file lists the copyright holders; this file
+# lists people.  For example, Google employees are listed here
+# but not in AUTHORS, because Google holds the copyright.
+#
+# The submission process automatically checks to make sure
+# that people submitting code are listed in this file (by email address).
+#
+# Names should be added to this file only after verifying that
+# the individual or the individual's organization has agreed to
+# the appropriate Contributor License Agreement, found here:
+#
+#     http://code.google.com/legal/individual-cla-v1.0.html
+#     http://code.google.com/legal/corporate-cla-v1.0.html
+#
+# The agreement for individuals can be filled out on the web.
+#
+# When adding J Random Contributor's name to this file,
+# either J's name or J's organization's name should be
+# added to the AUTHORS file, depending on whether the
+# individual or corporate CLA was used.
+
+# Names should be added to this file like so:
+#     Name <email address>
+
+# Please keep the list sorted.
+
+Damian Gryski <dgryski@gmail.com>
+Jan Mercl <0xjnml@gmail.com>
+Kai Backman <kaib@golang.org>
+Marc-Antoine Ruel <maruel@chromium.org>
+Nigel Tao <nigeltao@golang.org>
+Rob Pike <r@golang.org>
+Rodolfo Carvalho <rhcarvalho@gmail.com>
+Russ Cox <rsc@golang.org>
+Sebastien Binet <seb.binet@gmail.com>
diff --git a/vendor/github.com/klauspost/compress/snappy/LICENSE b/vendor/github.com/klauspost/compress/snappy/LICENSE
new file mode 100644
index 00000000000..6050c10f4c8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/LICENSE
@@ -0,0 +1,27 @@
+Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/vendor/github.com/klauspost/compress/snappy/README b/vendor/github.com/klauspost/compress/snappy/README
new file mode 100644
index 00000000000..cea12879a0e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/README
@@ -0,0 +1,107 @@
+The Snappy compression format in the Go programming language.
+
+To download and install from source:
+$ go get github.com/golang/snappy
+
+Unless otherwise noted, the Snappy-Go source files are distributed
+under the BSD-style license found in the LICENSE file.
+
+
+
+Benchmarks.
+
+The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
+or so files, the same set used by the C++ Snappy code (github.com/google/snappy
+and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
+3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29:
+
+"go test -test.bench=."
+
+_UFlat0-8         2.19GB/s ± 0%  html
+_UFlat1-8         1.41GB/s ± 0%  urls
+_UFlat2-8         23.5GB/s ± 2%  jpg
+_UFlat3-8         1.91GB/s ± 0%  jpg_200
+_UFlat4-8         14.0GB/s ± 1%  pdf
+_UFlat5-8         1.97GB/s ± 0%  html4
+_UFlat6-8          814MB/s ± 0%  txt1
+_UFlat7-8          785MB/s ± 0%  txt2
+_UFlat8-8          857MB/s ± 0%  txt3
+_UFlat9-8          719MB/s ± 1%  txt4
+_UFlat10-8        2.84GB/s ± 0%  pb
+_UFlat11-8        1.05GB/s ± 0%  gaviota
+
+_ZFlat0-8         1.04GB/s ± 0%  html
+_ZFlat1-8          534MB/s ± 0%  urls
+_ZFlat2-8         15.7GB/s ± 1%  jpg
+_ZFlat3-8          740MB/s ± 3%  jpg_200
+_ZFlat4-8         9.20GB/s ± 1%  pdf
+_ZFlat5-8          991MB/s ± 0%  html4
+_ZFlat6-8          379MB/s ± 0%  txt1
+_ZFlat7-8          352MB/s ± 0%  txt2
+_ZFlat8-8          396MB/s ± 1%  txt3
+_ZFlat9-8          327MB/s ± 1%  txt4
+_ZFlat10-8        1.33GB/s ± 1%  pb
+_ZFlat11-8         605MB/s ± 1%  gaviota
+
+
+
+"go test -test.bench=. -tags=noasm"
+
+_UFlat0-8          621MB/s ± 2%  html
+_UFlat1-8          494MB/s ± 1%  urls
+_UFlat2-8         23.2GB/s ± 1%  jpg
+_UFlat3-8         1.12GB/s ± 1%  jpg_200
+_UFlat4-8         4.35GB/s ± 1%  pdf
+_UFlat5-8          609MB/s ± 0%  html4
+_UFlat6-8          296MB/s ± 0%  txt1
+_UFlat7-8          288MB/s ± 0%  txt2
+_UFlat8-8          309MB/s ± 1%  txt3
+_UFlat9-8          280MB/s ± 1%  txt4
+_UFlat10-8         753MB/s ± 0%  pb
+_UFlat11-8         400MB/s ± 0%  gaviota
+
+_ZFlat0-8          409MB/s ± 1%  html
+_ZFlat1-8          250MB/s ± 1%  urls
+_ZFlat2-8         12.3GB/s ± 1%  jpg
+_ZFlat3-8          132MB/s ± 0%  jpg_200
+_ZFlat4-8         2.92GB/s ± 0%  pdf
+_ZFlat5-8          405MB/s ± 1%  html4
+_ZFlat6-8          179MB/s ± 1%  txt1
+_ZFlat7-8          170MB/s ± 1%  txt2
+_ZFlat8-8          189MB/s ± 1%  txt3
+_ZFlat9-8          164MB/s ± 1%  txt4
+_ZFlat10-8         479MB/s ± 1%  pb
+_ZFlat11-8         270MB/s ± 1%  gaviota
+
+
+
+For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
+are the numbers from C++ Snappy's
+
+make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
+
+BM_UFlat/0     2.4GB/s  html
+BM_UFlat/1     1.4GB/s  urls
+BM_UFlat/2    21.8GB/s  jpg
+BM_UFlat/3     1.5GB/s  jpg_200
+BM_UFlat/4    13.3GB/s  pdf
+BM_UFlat/5     2.1GB/s  html4
+BM_UFlat/6     1.0GB/s  txt1
+BM_UFlat/7   959.4MB/s  txt2
+BM_UFlat/8     1.0GB/s  txt3
+BM_UFlat/9   864.5MB/s  txt4
+BM_UFlat/10    2.9GB/s  pb
+BM_UFlat/11    1.2GB/s  gaviota
+
+BM_ZFlat/0   944.3MB/s  html (22.31 %)
+BM_ZFlat/1   501.6MB/s  urls (47.78 %)
+BM_ZFlat/2    14.3GB/s  jpg (99.95 %)
+BM_ZFlat/3   538.3MB/s  jpg_200 (73.00 %)
+BM_ZFlat/4     8.3GB/s  pdf (83.30 %)
+BM_ZFlat/5   903.5MB/s  html4 (22.52 %)
+BM_ZFlat/6   336.0MB/s  txt1 (57.88 %)
+BM_ZFlat/7   312.3MB/s  txt2 (61.91 %)
+BM_ZFlat/8   353.1MB/s  txt3 (54.99 %)
+BM_ZFlat/9   289.9MB/s  txt4 (66.26 %)
+BM_ZFlat/10    1.2GB/s  pb (19.68 %)
+BM_ZFlat/11  527.4MB/s  gaviota (37.72 %)
diff --git a/vendor/github.com/klauspost/compress/snappy/decode.go b/vendor/github.com/klauspost/compress/snappy/decode.go
new file mode 100644
index 00000000000..72efb0353dd
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/decode.go
@@ -0,0 +1,237 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package snappy
+
+import (
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+var (
+	// ErrCorrupt reports that the input is invalid.
+	ErrCorrupt = errors.New("snappy: corrupt input")
+	// ErrTooLarge reports that the uncompressed length is too large.
+	ErrTooLarge = errors.New("snappy: decoded block is too large")
+	// ErrUnsupported reports that the input isn't supported.
+	ErrUnsupported = errors.New("snappy: unsupported input")
+
+	errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length") // returned by Decode for decodeErrCodeUnsupportedLiteralLength
+)
+
+// DecodedLen returns the length of the decoded block, as stated by the uvarint header at the start of src.
+func DecodedLen(src []byte) (int, error) {
+	v, _, err := decodedLen(src) // the header size is not needed here
+	return v, err
+}
+
+// decodedLen returns the length of the decoded block and the number of bytes
+// that the length header occupied. The header is a uvarint that must fit in 32 bits.
+func decodedLen(src []byte) (blockLen, headerLen int, err error) {
+	v, n := binary.Uvarint(src)
+	if n <= 0 || v > 0xffffffff { // n <= 0: Uvarint found no complete, non-overflowing value
+		return 0, 0, ErrCorrupt
+	}
+
+	const wordSize = 32 << (^uint(0) >> 32 & 1) // 32 or 64, matching the width of uint
+	if wordSize == 32 && v > 0x7fffffff { // would not fit in a 32-bit int
+		return 0, 0, ErrTooLarge
+	}
+	return int(v), n, nil
+}
+
+const (
+	decodeErrCodeCorrupt                  = 1 // presumably decode's code for corrupt input; Decode maps every code but 0 and 2 to ErrCorrupt
+	decodeErrCodeUnsupportedLiteralLength = 2 // mapped by Decode to errUnsupportedLiteralLength
+)
+
+// Decode returns the decoded form of src. The returned slice is a sub-slice
+// of dst if len(dst) is at least the decoded length (capacity beyond len is not used).
+// Otherwise, a newly allocated slice will be returned.
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+func Decode(dst, src []byte) ([]byte, error) {
+	dLen, s, err := decodedLen(src)
+	if err != nil {
+		return nil, err
+	}
+	if dLen <= len(dst) {
+		dst = dst[:dLen]
+	} else {
+		dst = make([]byte, dLen)
+	}
+	switch decode(dst, src[s:]) { // s skips the length header
+	case 0:
+		return dst, nil
+	case decodeErrCodeUnsupportedLiteralLength:
+		return nil, errUnsupportedLiteralLength
+	}
+	return nil, ErrCorrupt // every other result code is reported as corruption
+}
+
+// NewReader returns a new Reader that decompresses from r, using the framing
+// format described at the URL below. It allocates the Reader's decoded and chunk buffers up front.
+// https://github.com/google/snappy/blob/master/framing_format.txt
+func NewReader(r io.Reader) *Reader {
+	return &Reader{
+		r:       r,
+		decoded: make([]byte, maxBlockSize),
+		buf:     make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
+	}
+}
+
+// Reader is an io.Reader that can read Snappy-compressed bytes.
+type Reader struct {
+	r       io.Reader // underlying compressed stream
+	err     error     // first error seen; Read keeps returning it until Reset
+	decoded []byte    // uncompressed contents of the current chunk
+	buf     []byte    // scratch for chunk headers and chunk bodies
+	// decoded[i:j] contains decoded bytes that have not yet been passed on.
+	i, j       int
+	readHeader bool // set once the stream identifier chunk has been seen
+}
+
+// Reset discards any buffered data, resets all state, and switches the Snappy
+// reader to read from r. This permits reusing a Reader rather than allocating
+// a new one; the existing decoded and buf slices are kept.
+func (r *Reader) Reset(reader io.Reader) {
+	r.r = reader
+	r.err = nil
+	r.i = 0
+	r.j = 0
+	r.readHeader = false // the new stream must start with its own stream identifier
+}
+
+func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) { // fills p from r.r; on failure stores the error in r.err and returns false
+	if _, r.err = io.ReadFull(r.r, p); r.err != nil {
+		if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
+			r.err = ErrCorrupt // a partial read, or EOF where data is required, means a truncated stream
+		}
+		return false
+	}
+	return true
+}
+
+// Read satisfies the io.Reader interface. It returns bytes from at most one decoded chunk per call.
+func (r *Reader) Read(p []byte) (int, error) {
+	if r.err != nil {
+		return 0, r.err // errors are sticky until Reset
+	}
+	for {
+		if r.i < r.j { // hand out pending decoded bytes before reading another chunk
+			n := copy(p, r.decoded[r.i:r.j])
+			r.i += n
+			return n, nil
+		}
+		if !r.readFull(r.buf[:4], true) { // chunk header: 1 type byte + 3 length bytes; EOF here ends the stream cleanly
+			return 0, r.err
+		}
+		chunkType := r.buf[0]
+		if !r.readHeader {
+			if chunkType != chunkTypeStreamIdentifier { // the first chunk must be the stream identifier
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			r.readHeader = true
+		}
+		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16 // 24-bit little-endian length
+		if chunkLen > len(r.buf) {
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+
+		// The chunk types are specified at
+		// https://github.com/google/snappy/blob/master/framing_format.txt
+		switch chunkType {
+		case chunkTypeCompressedData:
+			// Section 4.2. Compressed data (chunk type 0x00).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			buf := r.buf[:chunkLen]
+			if !r.readFull(buf, false) {
+				return 0, r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			buf = buf[checksumSize:]
+
+			n, err := DecodedLen(buf)
+			if err != nil {
+				r.err = err
+				return 0, r.err
+			}
+			if n > len(r.decoded) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if _, err := Decode(r.decoded, buf); err != nil { // n <= len(r.decoded), so Decode writes into r.decoded
+				r.err = err
+				return 0, r.err
+			}
+			if crc(r.decoded[:n]) != checksum { // the checksum covers the uncompressed bytes
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeUncompressedData:
+			// Section 4.3. Uncompressed data (chunk type 0x01).
+			if chunkLen < checksumSize {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			buf := r.buf[:checksumSize]
+			if !r.readFull(buf, false) {
+				return 0, r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			// Read directly into r.decoded instead of via r.buf.
+			n := chunkLen - checksumSize
+			if n > len(r.decoded) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.readFull(r.decoded[:n], false) {
+				return 0, r.err
+			}
+			if crc(r.decoded[:n]) != checksum {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			r.i, r.j = 0, n
+			continue
+
+		case chunkTypeStreamIdentifier:
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(magicBody) {
+				r.err = ErrCorrupt
+				return 0, r.err
+			}
+			if !r.readFull(r.buf[:len(magicBody)], false) {
+				return 0, r.err
+			}
+			for i := 0; i < len(magicBody); i++ {
+				if r.buf[i] != magicBody[i] {
+					r.err = ErrCorrupt
+					return 0, r.err
+				}
+			}
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			r.err = ErrUnsupported
+			return 0, r.err
+		}
+		// Section 4.4 Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+		if !r.readFull(r.buf[:chunkLen], false) { // read and discard the chunk body
+			return 0, r.err
+		}
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.go b/vendor/github.com/klauspost/compress/snappy/decode_amd64.go
new file mode 100644
index 00000000000..fcd192b849e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/decode_amd64.go
@@ -0,0 +1,14 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package snappy
+
+// decode is implemented in decode_amd64.s and has the same semantics as in decode_other.go.
+//
+//go:noescape
+func decode(dst, src []byte) int
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_amd64.s b/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
new file mode 100644
index 00000000000..1c66e37234d
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/decode_amd64.s
@@ -0,0 +1,482 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The asm code generally follows the pure Go code in decode_other.go, except
+// where marked with a "!!!".
+
+// func decode(dst, src []byte) int
+//
+// All local variables fit into registers. The non-zero stack size is only to
+// spill registers and push args when issuing a CALL. The register allocation:
+//	- AX	scratch
+//	- BX	scratch
+//	- CX	length or x
+//	- DX	offset
+//	- SI	&src[s]
+//	- DI	&dst[d]
+//	+ R8	dst_base
+//	+ R9	dst_len
+//	+ R10	dst_base + dst_len
+//	+ R11	src_base
+//	+ R12	src_len
+//	+ R13	src_base + src_len
+//	- R14	used by doCopy
+//	- R15	used by doCopy
+//
+// The registers R8-R13 (marked with a "+") are set at the start of the
+// function, and after a CALL returns, and are not otherwise modified.
+//
+// The d variable is implicitly DI - R8,  and len(dst)-d is R10 - DI.
+// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
+TEXT ·decode(SB), NOSPLIT, $48-56
+	// Initialize SI, DI and R8-R13.
+	MOVQ dst_base+0(FP), R8
+	MOVQ dst_len+8(FP), R9
+	MOVQ R8, DI
+	MOVQ R8, R10
+	ADDQ R9, R10
+	MOVQ src_base+24(FP), R11
+	MOVQ src_len+32(FP), R12
+	MOVQ R11, SI
+	MOVQ R11, R13
+	ADDQ R12, R13
+
+loop:
+	// for s < len(src)
+	CMPQ SI, R13
+	JEQ  end
+
+	// CX = uint32(src[s])
+	//
+	// switch src[s] & 0x03
+	MOVBLZX (SI), CX
+	MOVL    CX, BX
+	ANDL    $3, BX
+	CMPL    BX, $1
+	JAE     tagCopy
+
+	// ----------------------------------------
+	// The code below handles literal tags.
+
+	// case tagLiteral:
+	// x := uint32(src[s] >> 2)
+	// switch
+	SHRL $2, CX
+	CMPL CX, $60
+	JAE  tagLit60Plus
+
+	// case x < 60:
+	// s++
+	INCQ SI
+
+doLit:
+	// This is the end of the inner "switch", when we have a literal tag.
+	//
+	// We assume that CX == x and x fits in a uint32, where x is the variable
+	// used in the pure Go decode_other.go code.
+
+	// length = int(x) + 1
+	//
+	// Unlike the pure Go code, we don't need to check if length <= 0 because
+	// CX can hold 64 bits, so the increment cannot overflow.
+	INCQ CX
+
+	// Prepare to check if copying length bytes will run past the end of dst or
+	// src.
+	//
+	// AX = len(dst) - d
+	// BX = len(src) - s
+	MOVQ R10, AX
+	SUBQ DI, AX
+	MOVQ R13, BX
+	SUBQ SI, BX
+
+	// !!! Try a faster technique for short (16 or fewer bytes) copies.
+	//
+	// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
+	//   goto callMemmove // Fall back on calling runtime·memmove.
+	// }
+	//
+	// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
+	// against 21 instead of 16, because it cannot assume that all of its input
+	// is contiguous in memory and so it needs to leave enough source bytes to
+	// read the next tag without refilling buffers, but Go's Decode assumes
+	// contiguousness (the src argument is a []byte).
+	CMPQ CX, $16
+	JGT  callMemmove
+	CMPQ AX, $16
+	JLT  callMemmove
+	CMPQ BX, $16
+	JLT  callMemmove
+
+	// !!! Implement the copy from src to dst as a 16-byte load and store.
+	// (Decode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only length bytes, but that's
+	// OK. If the input is a valid Snappy encoding then subsequent iterations
+	// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
+	// non-nil error), so the overrun will be ignored.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(SI), X0
+	MOVOU X0, 0(DI)
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+callMemmove:
+	// if length > len(dst)-d || length > len(src)-s { etc }
+	CMPQ CX, AX
+	JGT  errCorrupt
+	CMPQ CX, BX
+	JGT  errCorrupt
+
+	// copy(dst[d:], src[s:s+length])
+	//
+	// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
+	// DI, SI and CX as arguments. Coincidentally, we also need to spill those
+	// three registers to the stack, to save local variables across the CALL.
+	MOVQ DI, 0(SP)
+	MOVQ SI, 8(SP)
+	MOVQ CX, 16(SP)
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP)
+	CALL runtime·memmove(SB)
+
+	// Restore local variables: unspill registers from the stack and
+	// re-calculate R8-R13.
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+	MOVQ dst_base+0(FP), R8
+	MOVQ dst_len+8(FP), R9
+	MOVQ R8, R10
+	ADDQ R9, R10
+	MOVQ src_base+24(FP), R11
+	MOVQ src_len+32(FP), R12
+	MOVQ R11, R13
+	ADDQ R12, R13
+
+	// d += length
+	// s += length
+	ADDQ CX, DI
+	ADDQ CX, SI
+	JMP  loop
+
+tagLit60Plus:
+	// !!! This fragment does the
+	//
+	// s += x - 58; if uint(s) > uint(len(src)) { etc }
+	//
+	// checks. In the asm version, we code it once instead of once per switch case.
+	ADDQ CX, SI
+	SUBQ $58, SI
+	CMPQ SI, R13
+	JA   errCorrupt
+
+	// case x == 60:
+	CMPL CX, $61
+	JEQ  tagLit61
+	JA   tagLit62Plus
+
+	// x = uint32(src[s-1])
+	MOVBLZX -1(SI), CX
+	JMP     doLit
+
+tagLit61:
+	// case x == 61:
+	// x = uint32(src[s-2]) | uint32(src[s-1])<<8
+	MOVWLZX -2(SI), CX
+	JMP     doLit
+
+tagLit62Plus:
+	CMPL CX, $62
+	JA   tagLit63
+
+	// case x == 62:
+	// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+	MOVWLZX -3(SI), CX
+	MOVBLZX -1(SI), BX
+	SHLL    $16, BX
+	ORL     BX, CX
+	JMP     doLit
+
+tagLit63:
+	// case x == 63:
+	// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+	MOVL -4(SI), CX
+	JMP  doLit
+
+// The code above handles literal tags.
+// ----------------------------------------
+// The code below handles copy tags.
+
+tagCopy4:
+	// case tagCopy4:
+	// s += 5
+	ADDQ $5, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	CMPQ SI, R13
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-5])>>2
+	SHRQ $2, CX
+	INCQ CX
+
+	// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+	MOVLQZX -4(SI), DX
+	JMP     doCopy
+
+tagCopy2:
+	// case tagCopy2:
+	// s += 3
+	ADDQ $3, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	CMPQ SI, R13
+	JA   errCorrupt
+
+	// length = 1 + int(src[s-3])>>2
+	SHRQ $2, CX
+	INCQ CX
+
+	// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+	MOVWQZX -2(SI), DX
+	JMP     doCopy
+
+tagCopy:
+	// We have a copy tag. We assume that:
+	//	- BX == src[s] & 0x03
+	//	- CX == src[s]
+	CMPQ BX, $2
+	JEQ  tagCopy2
+	JA   tagCopy4
+
+	// case tagCopy1:
+	// s += 2
+	ADDQ $2, SI
+
+	// if uint(s) > uint(len(src)) { etc }
+	CMPQ SI, R13
+	JA   errCorrupt
+
+	// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+	MOVQ    CX, DX
+	ANDQ    $0xe0, DX
+	SHLQ    $3, DX
+	MOVBQZX -1(SI), BX
+	ORQ     BX, DX
+
+	// length = 4 + int(src[s-2])>>2&0x7
+	SHRQ $2, CX
+	ANDQ $7, CX
+	ADDQ $4, CX
+
+doCopy:
+	// This is the end of the outer "switch", when we have a copy tag.
+	//
+	// We assume that:
+	//	- CX == length && CX > 0
+	//	- DX == offset
+
+	// if offset <= 0 { etc }
+	CMPQ DX, $0
+	JLE  errCorrupt
+
+	// if d < offset { etc }
+	MOVQ DI, BX
+	SUBQ R8, BX
+	CMPQ BX, DX
+	JLT  errCorrupt
+
+	// if length > len(dst)-d { etc }
+	MOVQ R10, BX
+	SUBQ DI, BX
+	CMPQ CX, BX
+	JGT  errCorrupt
+
+	// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
+	//
+	// Set:
+	//	- R14 = len(dst)-d
+	//	- R15 = &dst[d-offset]
+	MOVQ R10, R14
+	SUBQ DI, R14
+	MOVQ DI, R15
+	SUBQ DX, R15
+
+	// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
+	//
+	// First, try using two 8-byte load/stores, similar to the doLit technique
+	// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
+	// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
+	// and not one 16-byte load/store, and the first store has to be before the
+	// second load, due to the overlap if offset is in the range [8, 16).
+	//
+	// if length > 16 || offset < 8 || len(dst)-d < 16 {
+	//   goto slowForwardCopy
+	// }
+	// copy 16 bytes
+	// d += length
+	CMPQ CX, $16
+	JGT  slowForwardCopy
+	CMPQ DX, $8
+	JLT  slowForwardCopy
+	CMPQ R14, $16
+	JLT  slowForwardCopy
+	MOVQ 0(R15), AX
+	MOVQ AX, 0(DI)
+	MOVQ 8(R15), BX
+	MOVQ BX, 8(DI)
+	ADDQ CX, DI
+	JMP  loop
+
+slowForwardCopy:
+	// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
+	// can still try 8-byte load stores, provided we can overrun up to 10 extra
+	// bytes. As above, the overrun will be fixed up by subsequent iterations
+	// of the outermost loop.
+	//
+	// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
+	// commentary says:
+	//
+	// ----
+	//
+	// The main part of this loop is a simple copy of eight bytes at a time
+	// until we've copied (at least) the requested amount of bytes.  However,
+	// if d and d-offset are less than eight bytes apart (indicating a
+	// repeating pattern of length < 8), we first need to expand the pattern in
+	// order to get the correct results. For instance, if the buffer looks like
+	// this, with the eight-byte <d-offset> and <d> patterns marked as
+	// intervals:
+	//
+	//    abxxxxxxxxxxxx
+	//    [------]           d-offset
+	//      [------]         d
+	//
+	// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
+	// once, after which we can move <d> two bytes without moving <d-offset>:
+	//
+	//    ababxxxxxxxxxx
+	//    [------]           d-offset
+	//        [------]       d
+	//
+	// and repeat the exercise until the two no longer overlap.
+	//
+	// This allows us to do very well in the special case of one single byte
+	// repeated many times, without taking a big hit for more general cases.
+	//
+	// The worst case of extra writing past the end of the match occurs when
+	// offset == 1 and length == 1; the last copy will read from byte positions
+	// [0..7] and write to [4..11], whereas it was only supposed to write to
+	// position 1. Thus, ten excess bytes.
+	//
+	// ----
+	//
+	// That "10 byte overrun" worst case is confirmed by Go's
+	// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
+	// and finishSlowForwardCopy algorithm.
+	//
+	// if length > len(dst)-d-10 {
+	//   goto verySlowForwardCopy
+	// }
+	SUBQ $10, R14
+	CMPQ CX, R14
+	JGT  verySlowForwardCopy
+
+makeOffsetAtLeast8:
+	// !!! As above, expand the pattern so that offset >= 8 and we can use
+	// 8-byte load/stores.
+	//
+	// for offset < 8 {
+	//   copy 8 bytes from dst[d-offset:] to dst[d:]
+	//   length -= offset
+	//   d      += offset
+	//   offset += offset
+	//   // The two previous lines together means that d-offset, and therefore
+	//   // R15, is unchanged.
+	// }
+	CMPQ DX, $8
+	JGE  fixUpSlowForwardCopy
+	MOVQ (R15), BX
+	MOVQ BX, (DI)
+	SUBQ DX, CX
+	ADDQ DX, DI
+	ADDQ DX, DX
+	JMP  makeOffsetAtLeast8
+
+fixUpSlowForwardCopy:
+	// !!! Add length (which might be negative now) to d (implied by DI being
+	// &dst[d]) so that d ends up at the right place when we jump back to the
+	// top of the loop. Before we do that, though, we save DI to AX so that, if
+	// length is positive, copying the remaining length bytes will write to the
+	// right place.
+	MOVQ DI, AX
+	ADDQ CX, DI
+
+finishSlowForwardCopy:
+	// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
+	// length means that we overrun, but as above, that will be fixed up by
+	// subsequent iterations of the outermost loop.
+	CMPQ CX, $0
+	JLE  loop
+	MOVQ (R15), BX
+	MOVQ BX, (AX)
+	ADDQ $8, R15
+	ADDQ $8, AX
+	SUBQ $8, CX
+	JMP  finishSlowForwardCopy
+
+verySlowForwardCopy:
+	// verySlowForwardCopy is a simple implementation of forward copy. In C
+	// parlance, this is a do/while loop instead of a while loop, since we know
+	// that length > 0. In Go syntax:
+	//
+	// for {
+	//   dst[d] = dst[d - offset]
+	//   d++
+	//   length--
+	//   if length == 0 {
+	//     break
+	//   }
+	// }
+	MOVB (R15), BX
+	MOVB BX, (DI)
+	INCQ R15
+	INCQ DI
+	DECQ CX
+	JNZ  verySlowForwardCopy
+	JMP  loop
+
+// The code above handles copy tags.
+// ----------------------------------------
+
+end:
+	// This is the end of the "for s < len(src)".
+	//
+	// if d != len(dst) { etc }
+	CMPQ DI, R10
+	JNE  errCorrupt
+
+	// return 0
+	MOVQ $0, ret+48(FP)
+	RET
+
+errCorrupt:
+	// return decodeErrCodeCorrupt
+	MOVQ $1, ret+48(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/snappy/decode_other.go b/vendor/github.com/klauspost/compress/snappy/decode_other.go
new file mode 100644
index 00000000000..94a96c5d7b8
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/decode_other.go
@@ -0,0 +1,115 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine !gc noasm
+
+package snappy
+
+// decode writes the decoding of src to dst. It assumes that the varint-encoded
+// length of the decompressed bytes has already been read, and that len(dst)
+// equals that length.
+//
+// It returns 0 on success or a decodeErrCodeXxx error code on failure.
+func decode(dst, src []byte) int {
+	var d, s, offset, length int // d and s are the current indexes into dst and src.
+	for s < len(src) {
+		switch src[s] & 0x03 { // The low two bits of the tag byte select the element type.
+		case tagLiteral:
+			x := uint32(src[s] >> 2) // Upper six bits: length-1 if below 60, otherwise an escape code.
+			switch {
+			case x < 60:
+				s++ // x already holds length-1; only the tag byte is consumed.
+			case x == 60:
+				s += 2
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-1]) // length-1 is the single byte after the tag.
+			case x == 61:
+				s += 3
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-2]) | uint32(src[s-1])<<8 // length-1 is two bytes, little-endian.
+			case x == 62:
+				s += 4
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16 // Three bytes, little-endian.
+			case x == 63:
+				s += 5
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					return decodeErrCodeCorrupt
+				}
+				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24 // Four bytes, little-endian.
+			}
+			length = int(x) + 1
+			if length <= 0 { // Only possible where int is 32 bits wide and int(x)+1 is not positive.
+				return decodeErrCodeUnsupportedLiteralLength
+			}
+			if length > len(dst)-d || length > len(src)-s {
+				return decodeErrCodeCorrupt
+			}
+			copy(dst[d:], src[s:s+length])
+			d += length
+			s += length
+			continue // Literal done; the code after the switch is for copy tags only.
+
+		case tagCopy1:
+			s += 2
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 4 + int(src[s-2])>>2&0x7 // Parsed as 4 + ((tag>>2)&0x7): three length bits.
+			offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
+
+		case tagCopy2:
+			s += 3
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-3])>>2 // Six length bits; the offset is the next two bytes, little-endian.
+			offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
+
+		case tagCopy4:
+			s += 5
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				return decodeErrCodeCorrupt
+			}
+			length = 1 + int(src[s-5])>>2 // Six length bits; the offset is the next four bytes, little-endian.
+			offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
+		}
+
+		if offset <= 0 || d < offset || length > len(dst)-d {
+			return decodeErrCodeCorrupt
+		}
+		// Copy from an earlier sub-slice of dst to a later sub-slice.
+		// If no overlap, use the built-in copy:
+		if offset > length {
+			copy(dst[d:d+length], dst[d-offset:])
+			d += length
+			continue
+		}
+
+		// Unlike the built-in copy function, this byte-by-byte copy always runs
+		// forwards, even if the slices overlap. Conceptually, this is:
+		//
+		// d += forwardCopy(dst[d:d+length], dst[d-offset:])
+		//
+		// We align the slices into a and b and show the compiler they are the same size.
+		// This allows the loop to run without bounds checks.
+		a := dst[d : d+length]
+		b := dst[d-offset:]
+		b = b[:len(a)]
+		for i := range a {
+			a[i] = b[i]
+		}
+		d += length
+	}
+	if d != len(dst) { // The decoded output must fill dst exactly.
+		return decodeErrCodeCorrupt
+	}
+	return 0
+}
diff --git a/vendor/github.com/klauspost/compress/snappy/encode.go b/vendor/github.com/klauspost/compress/snappy/encode.go
new file mode 100644
index 00000000000..8d393e904bb
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/encode.go
@@ -0,0 +1,285 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package snappy
+
+import (
+	"encoding/binary"
+	"errors"
+	"io"
+)
+
+// Encode returns the encoded form of src. The result is a sub-slice of dst if
+// len(dst) >= MaxEncodedLen(len(src)); otherwise a new slice is allocated. It
+// panics with ErrTooLarge when src is too large (MaxEncodedLen returns < 0).
+//
+// The dst and src must not overlap. It is valid to pass a nil dst.
+func Encode(dst, src []byte) []byte {
+	if n := MaxEncodedLen(len(src)); n < 0 {
+		panic(ErrTooLarge)
+	} else if len(dst) < n { // The test uses len(dst), not cap(dst).
+		dst = make([]byte, n)
+	}
+
+	// The block starts with the varint-encoded length of the decompressed bytes.
+	d := binary.PutUvarint(dst, uint64(len(src)))
+
+	for len(src) > 0 {
+		p := src
+		src = nil // Assume p is the final piece; reassigned just below when it is not.
+		if len(p) > maxBlockSize {
+			p, src = p[:maxBlockSize], p[maxBlockSize:]
+		}
+		if len(p) < minNonLiteralBlockSize { // Too short for encodeBlock; emit the piece as one literal.
+			d += emitLiteral(dst[d:], p)
+		} else {
+			d += encodeBlock(dst[d:], p)
+		}
+	}
+	return dst[:d]
+}
+
+// inputMargin is the minimum number of extra input bytes to keep, inside
+// encodeBlock's inner loop. On some architectures, this margin lets us
+// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
+// literals can be implemented as a single load to and store from a 16-byte
+// register. That literal's actual length can be as short as 1 byte, so this
+// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
+// the encoding loop will fix up the copy overrun, and this inputMargin ensures
+// that we don't overrun the dst and src buffers.
+const inputMargin = 16 - 1 // == 15; encode_amd64.s hard-codes this value when it computes sLimit.
+
+// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
+// could be encoded with a copy tag. This is the minimum with respect to the
+// algorithm used by encodeBlock, not a minimum enforced by the file format.
+//
+// The encoded output must start with at least a 1 byte literal, as there are
+// no previous bytes to copy. A minimal (1 byte) copy after that, generated
+// from an emitCopy call in encodeBlock's main loop, would require at least
+// another inputMargin bytes, for the reason above: we want any emitLiteral
+// calls inside encodeBlock's main loop to use the fast path if possible, which
+// requires being able to overrun by inputMargin bytes. Thus,
+// minNonLiteralBlockSize equals 1 + 1 + inputMargin.
+//
+// The C++ code doesn't use this exact threshold, but it could, as discussed at
+// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
+// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
+// optimization. It should not affect the encoded form. This is tested by
+// TestSameEncodingAsCppShortCopies.
+const minNonLiteralBlockSize = 1 + 1 + inputMargin // == 17; Encode emits shorter pieces as one literal.
+
+// MaxEncodedLen returns the maximum length of a snappy block, given its
+// uncompressed length.
+//
+// It will return a negative value if srcLen is negative or too large to encode.
+func MaxEncodedLen(srcLen int) int {
+	n := uint64(srcLen) // A negative srcLen converts to a value above 0xffffffff and is rejected below.
+	if n > 0xffffffff {
+		return -1
+	}
+	// Compressed data can be defined as:
+	//    compressed := item* literal*
+	//    item       := literal* copy
+	//
+	// The trailing literal sequence has a space blowup of at most 62/60
+	// since a literal of length 60 needs one tag byte + one extra byte
+	// for length information.
+	//
+	// Item blowup is trickier to measure. Suppose the "copy" op copies
+	// 4 bytes of data. Because of a special check in the encoding code,
+	// we produce a 4-byte copy only if the offset is < 65536. Therefore
+	// the copy op takes 3 bytes to encode, and this type of item leads
+	// to at most the 62/60 blowup for representing literals.
+	//
+	// Suppose the "copy" op copies 5 bytes of data. If the offset is big
+	// enough, it will take 5 bytes to encode the copy op. Therefore the
+	// worst case here is a one-byte literal followed by a five-byte copy.
+	// That is, 6 bytes of input turn into 7 bytes of "compressed" data.
+	//
+	// This last factor dominates the blowup, so the final estimate is:
+	n = 32 + n + n/6
+	if n > 0xffffffff {
+		return -1
+	}
+	return int(n) // NOTE(review): where int is 32 bits, n above 0x7fffffff also converts to a negative int.
+}
+
+var errClosed = errors.New("snappy: Writer is closed") // Stored in Writer.err by Close so later calls fail.
+
+// NewWriter returns a new Writer that compresses to w.
+//
+// The Writer returned does not buffer writes. There is no need to Flush or
+// Close such a Writer.
+//
+// Deprecated: the Writer returned is not suitable for many small writes, only
+// for few large writes. Use NewBufferedWriter instead, which is efficient
+// regardless of the frequency and shape of the writes, and remember to Close
+// that Writer when done.
+func NewWriter(w io.Writer) *Writer {
+	return &Writer{ // ibuf is left nil, which selects the unbuffered path in Write.
+		w:    w,
+		obuf: make([]byte, obufLen),
+	}
+}
+
+// NewBufferedWriter returns a new Writer that compresses to w, using the
+// framing format described at
+// https://github.com/google/snappy/blob/master/framing_format.txt
+//
+// The Writer returned buffers writes. Users must call Close to guarantee all
+// data has been forwarded to the underlying io.Writer. They may also call
+// Flush zero or more times before calling Close.
+func NewBufferedWriter(w io.Writer) *Writer {
+	return &Writer{
+		w:    w,
+		ibuf: make([]byte, 0, maxBlockSize), // Non-nil, so Write buffers up to maxBlockSize bytes.
+		obuf: make([]byte, obufLen),
+	}
+}
+
+// Writer is an io.Writer that can write Snappy-compressed bytes.
+type Writer struct {
+	w   io.Writer // w is the destination that receives the framed output.
+	err error     // err is sticky: once set, write and Flush return it without doing any work.
+
+	// ibuf is a buffer for the incoming (uncompressed) bytes.
+	//
+	// Its use is optional. For backwards compatibility, Writers created by the
+	// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
+	// therefore do not need to be Flush'ed or Close'd.
+	ibuf []byte
+
+	// obuf holds the stream header, the chunk header and the outgoing (compressed) bytes.
+	obuf []byte
+
+	// wroteStreamHeader is whether we have written the stream header.
+	wroteStreamHeader bool
+}
+
+// Reset discards the writer's state and switches the Snappy writer to write to
+// writer. This permits reusing a Writer rather than allocating a new one.
+func (w *Writer) Reset(writer io.Writer) {
+	w.w = writer
+	w.err = nil // This also clears errClosed, so a closed Writer becomes usable again.
+	if w.ibuf != nil {
+		w.ibuf = w.ibuf[:0] // Drop buffered bytes but keep the capacity; a nil ibuf stays nil.
+	}
+	w.wroteStreamHeader = false // The next write emits the stream header again.
+}
+
+// Write satisfies the io.Writer interface. It returns how many bytes of p were consumed.
+func (w *Writer) Write(p []byte) (nRet int, errRet error) {
+	if w.ibuf == nil {
+		// Do not buffer incoming bytes. This does not perform or compress well
+		// if the caller of Writer.Write writes many small slices. This
+		// behavior is therefore deprecated, but still supported for backwards
+		// compatibility with code that doesn't explicitly Flush or Close.
+		return w.write(p)
+	}
+
+	// The remainder of this method is based on bufio.Writer.Write from the
+	// standard library.
+
+	for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
+		var n int
+		if len(w.ibuf) == 0 {
+			// Large write, empty buffer.
+			// Write directly from p to avoid copy.
+			n, _ = w.write(p) // The error is also stored in w.err, which the loop condition checks.
+		} else {
+			n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
+			w.ibuf = w.ibuf[:len(w.ibuf)+n]
+			w.Flush() // Any failure is left in w.err and ends the loop.
+		}
+		nRet += n
+		p = p[n:]
+	}
+	if w.err != nil {
+		return nRet, w.err
+	}
+	n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p) // The rest of p fits in ibuf's free space here.
+	w.ibuf = w.ibuf[:len(w.ibuf)+n]
+	nRet += n
+	return nRet, nil
+}
+
+func (w *Writer) write(p []byte) (nRet int, errRet error) {
+	if w.err != nil {
+		return 0, w.err
+	}
+	for len(p) > 0 {
+		obufStart := len(magicChunk) // By default, skip the stream-header slot at the front of obuf.
+		if !w.wroteStreamHeader {
+			w.wroteStreamHeader = true
+			copy(w.obuf, magicChunk)
+			obufStart = 0 // First chunk of the stream: send the magic chunk as well.
+		}
+
+		var uncompressed []byte
+		if len(p) > maxBlockSize {
+			uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
+		} else {
+			uncompressed, p = p, nil
+		}
+		checksum := crc(uncompressed) // Computed over the uncompressed bytes for both chunk types.
+
+		// Compress the buffer, discarding the result if the improvement
+		// isn't at least 12.5%.
+		compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
+		chunkType := uint8(chunkTypeCompressedData)
+		chunkLen := 4 + len(compressed) // The chunk body is the 4 checksum bytes plus the data.
+		obufEnd := obufHeaderLen + len(compressed)
+		if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
+			chunkType = chunkTypeUncompressedData
+			chunkLen = 4 + len(uncompressed)
+			obufEnd = obufHeaderLen // Send only the headers from obuf; the raw bytes are written separately below.
+		}
+
+		// Fill in the per-chunk header that comes before the body.
+		w.obuf[len(magicChunk)+0] = chunkType
+		w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0) // Bytes 1-3: chunkLen, least significant byte first.
+		w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
+		w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
+		w.obuf[len(magicChunk)+4] = uint8(checksum >> 0) // Bytes 4-7: checksum, least significant byte first.
+		w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
+		w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
+		w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)
+
+		if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
+			w.err = err
+			return nRet, err
+		}
+		if chunkType == chunkTypeUncompressedData {
+			if _, err := w.w.Write(uncompressed); err != nil {
+				w.err = err
+				return nRet, err
+			}
+		}
+		nRet += len(uncompressed) // Counts input bytes consumed, not bytes sent to w.w.
+	}
+	return nRet, nil
+}
+
+// Flush flushes the Writer to its underlying io.Writer.
+func (w *Writer) Flush() error {
+	if w.err != nil {
+		return w.err
+	}
+	if len(w.ibuf) == 0 { // Nothing buffered; always true for a Writer whose ibuf is nil.
+		return nil
+	}
+	w.write(w.ibuf) // A failure is recorded in w.err, which is what gets returned below.
+	w.ibuf = w.ibuf[:0]
+	return w.err
+}
+
+// Close calls Flush and then marks the Writer closed; the underlying io.Writer is not closed.
+func (w *Writer) Close() error {
+	w.Flush() // The result is read back from w.err on the next line.
+	ret := w.err
+	if w.err == nil {
+		w.err = errClosed // Later write and Flush calls return errClosed.
+	}
+	return ret
+}
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_amd64.go b/vendor/github.com/klauspost/compress/snappy/encode_amd64.go
new file mode 100644
index 00000000000..150d91bc8be
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/encode_amd64.go
@@ -0,0 +1,29 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package snappy
+
+// emitLiteral (assembly, encode_amd64.s) has the same semantics as in encode_other.go.
+//
+//go:noescape
+func emitLiteral(dst, lit []byte) int
+
+// emitCopy (assembly, encode_amd64.s) has the same semantics as in encode_other.go.
+//
+//go:noescape
+func emitCopy(dst []byte, offset, length int) int
+
+// extendMatch (assembly, encode_amd64.s) has the same semantics as in encode_other.go.
+//
+//go:noescape
+func extendMatch(src []byte, i, j int) int
+
+// encodeBlock (assembly, encode_amd64.s) has the same semantics as in encode_other.go.
+//
+//go:noescape
+func encodeBlock(dst, src []byte) (d int)
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_amd64.s b/vendor/github.com/klauspost/compress/snappy/encode_amd64.s
new file mode 100644
index 00000000000..adfd979fe27
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/encode_amd64.s
@@ -0,0 +1,730 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
+// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
+// https://github.com/golang/snappy/issues/29
+//
+// As a workaround, the package was built with a known good assembler, and
+// those instructions were disassembled by "objdump -d" to yield the
+//	4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+// style comments, in AT&T asm syntax. Note that rsp here is a physical
+// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
+// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
+// fine on Go 1.6.
+
+// The asm code generally follows the pure Go code in encode_other.go, except
+// where marked with a "!!!".
+
+// ----------------------------------------------------------------------------
+
+// func emitLiteral(dst, lit []byte) int
+//
+// All local variables fit into registers. The register allocation:
+//	- AX	len(lit)
+//	- BX	n
+//	- DX	return value
+//	- DI	&dst[i]
+//	- R10	&lit[0]
+//
+// The 24 bytes of stack space hold the three 8-byte arguments of runtime·memmove.
+//
+// The unusual register allocation of local variables, such as R10 for the
+// source pointer, matches the allocation used at the call site in encodeBlock,
+// which makes it easier to manually inline this function.
+TEXT ·emitLiteral(SB), NOSPLIT, $24-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ lit_base+24(FP), R10
+	MOVQ lit_len+32(FP), AX
+	MOVQ AX, DX  // return value starts as len(lit); the header size is added below
+	MOVL AX, BX
+	SUBL $1, BX  // n = len(lit) - 1
+
+	CMPL BX, $60  // n < 60: n fits inside the tag byte
+	JLT  oneByte
+	CMPL BX, $256  // n < 256: n fits in one extra byte
+	JLT  twoBytes
+
+threeBytes:
+	MOVB $0xf4, 0(DI)  // tag 61<<2 with low bits 00 (literal): two length bytes follow
+	MOVW BX, 1(DI)  // n as a 16-bit little-endian value
+	ADDQ $3, DI
+	ADDQ $3, DX
+	JMP  memmove
+
+twoBytes:
+	MOVB $0xf0, 0(DI)  // tag 60<<2 with low bits 00 (literal): one length byte follows
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	ADDQ $2, DX
+	JMP  memmove
+
+oneByte:
+	SHLB $2, BX  // tag n<<2 with low bits 00 (literal)
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+	ADDQ $1, DX
+
+memmove:
+	MOVQ DX, ret+48(FP)  // store the result (header bytes + len(lit)) before the call
+
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	CALL runtime·memmove(SB)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func emitCopy(dst []byte, offset, length int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- AX	length
+//	- SI	&dst[0]
+//	- DI	&dst[i]
+//	- R11	offset
+//
+// The unusual register allocation of local variables, such as R11 for the
+// offset, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·emitCopy(SB), NOSPLIT, $0-48
+	MOVQ dst_base+0(FP), DI
+	MOVQ DI, SI  // keep &dst[0] so the byte count can be computed at the end
+	MOVQ offset+24(FP), R11
+	MOVQ length+32(FP), AX
+
+loop0:
+	// for length >= 68 { etc }
+	CMPL AX, $68
+	JLT  step1
+
+	// Emit a length 64 copy, encoded as 3 bytes: tag 0xfe == (64-1)<<2 | 2, then the 16-bit offset.
+	MOVB $0xfe, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, AX
+	JMP  loop0
+
+step1:
+	// if length > 64 { etc }, i.e. length is 65, 66 or 67 here.
+	CMPL AX, $64
+	JLE  step2
+
+	// Emit a length 60 copy, encoded as 3 bytes: tag 0xee == (60-1)<<2 | 2. This leaves 5, 6 or 7.
+	MOVB $0xee, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, AX
+
+step2:
+	// if length >= 12 || offset >= 2048 { goto step3 }
+	CMPL AX, $12
+	JGE  step3
+	CMPL R11, $2048
+	JGE  step3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(DI)  // second byte: low 8 bits of offset
+	SHRL $8, R11
+	SHLB $5, R11  // offset bits 8-10 now sit in bits 5-7
+	SUBB $4, AX
+	SHLB $2, AX  // length-4 now sits in bits 2-4
+	ORB  AX, R11
+	ORB  $1, R11  // low two bits 01: the 2-byte copy tag
+	MOVB R11, 0(DI)
+	ADDQ $2, DI
+
+	// Return the number of bytes written.
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+step3:
+	// Emit the remaining copy, encoded as 3 bytes: tag (length-1)<<2 | 2, then the 16-bit offset.
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+
+	// Return the number of bytes written.
+	SUBQ SI, DI
+	MOVQ DI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func extendMatch(src []byte, i, j int) int
+//
+// All local variables fit into registers. The register allocation:
+//	- DX	&src[0]
+//	- SI	&src[j]
+//	- R13	&src[len(src) - 8]
+//	- R14	&src[len(src)]
+//	- R15	&src[i]
+//
+// The unusual register allocation of local variables, such as R15 for a source
+// pointer, matches the allocation used at the call site in encodeBlock, which
+// makes it easier to manually inline this function.
+TEXT ·extendMatch(SB), NOSPLIT, $0-48
+	MOVQ src_base+0(FP), DX
+	MOVQ src_len+8(FP), R14
+	MOVQ i+24(FP), R15
+	MOVQ j+32(FP), SI
+	ADDQ DX, R14  // R14 = &src[len(src)]
+	ADDQ DX, R15  // R15 = &src[i]
+	ADDQ DX, SI  // SI = &src[j]
+	MOVQ R14, R13
+	SUBQ $8, R13  // R13 = &src[len(src) - 8]
+
+cmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   cmp1
+	MOVQ (R15), AX  // only SI is bounds-checked; presumably i < j at every call site (TODO confirm)
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  bsf
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  cmp8
+
+bsf:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX  // BX is non-zero here because the JNE above was taken
+	SHRQ $3, BX
+	ADDQ BX, SI
+
+	// Convert from &src[ret] to ret, the index in src of the first byte that differs.
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
+	RET
+
+cmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  extendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  extendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  cmp1
+
+extendMatchEnd:
+	// Convert from &src[ret] to ret: the index where matching stopped (a mismatch, or len(src)).
+	SUBQ DX, SI
+	MOVQ SI, ret+40(FP)
+	RET
+
+// ----------------------------------------------------------------------------
+
+// func encodeBlock(dst, src []byte) (d int)
+//
+// All local variables fit into registers, other than "var table". The register
+// allocation:
+//	- AX	.	.
+//	- BX	.	.
+//	- CX	56	shift (note that amd64 shifts by non-immediates must use CX).
+//	- DX	64	&src[0], tableSize
+//	- SI	72	&src[s]
+//	- DI	80	&dst[d]
+//	- R9	88	sLimit
+//	- R10	.	&src[nextEmit]
+//	- R11	96	prevHash, currHash, nextHash, offset
+//	- R12	104	&src[base], skip
+//	- R13	.	&src[nextS], &src[len(src) - 8]
+//	- R14	.	len(src), bytesBetweenHashLookups, &src[len(src)], x
+//	- R15	112	candidate
+//
+// The second column (56, 64, etc) is the stack offset to spill the registers
+// when calling other functions. We could pack this slightly tighter, but it's
+// simpler to have a dedicated spill map independent of the function called.
+//
+// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
+// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
+// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
+TEXT ·encodeBlock(SB), 0, $32888-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R14
+
+	// shift, tableSize := uint32(32-8), 1<<8
+	MOVQ $24, CX
+	MOVQ $256, DX
+
+calcShift:
+	// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+	//	shift--
+	// }
+	CMPQ DX, $16384
+	JGE  varTable
+	CMPQ DX, R14
+	JGE  varTable
+	SUBQ $1, CX
+	SHLQ $1, DX
+	JMP  calcShift
+
+varTable:
+	// var table [maxTableSize]uint16
+	//
+	// In the asm code, unlike the Go code, we can zero-initialize only the
+	// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
+	// writes 16 bytes, so we can do only tableSize/8 writes instead of the
+	// 2048 writes that would zero-initialize all of table's 32768 bytes.
+	SHRQ $3, DX
+	LEAQ table-32768(SP), BX
+	PXOR X0, X0
+
+memclr:
+	MOVOU X0, 0(BX)
+	ADDQ  $16, BX
+	SUBQ  $1, DX
+	JNZ   memclr
+
+	// !!! DX = &src[0]
+	MOVQ SI, DX
+
+	// sLimit := len(src) - inputMargin
+	MOVQ R14, R9
+	SUBQ $15, R9
+
+	// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
+	// change for the rest of the function.
+	MOVQ CX, 56(SP)
+	MOVQ DX, 64(SP)
+	MOVQ R9, 88(SP)
+
+	// nextEmit := 0
+	MOVQ DX, R10
+
+	// s := 1
+	ADDQ $1, SI
+
+	// nextHash := hash(load32(src, s), shift)
+	MOVL  0(SI), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+outer:
+	// for { etc }
+
+	// skip := 32
+	MOVQ $32, R12
+
+	// nextS := s
+	MOVQ SI, R13
+
+	// candidate := 0
+	MOVQ $0, R15
+
+inner0:
+	// for { etc }
+
+	// s := nextS
+	MOVQ R13, SI
+
+	// bytesBetweenHashLookups := skip >> 5
+	MOVQ R12, R14
+	SHRQ $5, R14
+
+	// nextS = s + bytesBetweenHashLookups
+	ADDQ R14, R13
+
+	// skip += bytesBetweenHashLookups
+	ADDQ R14, R12
+
+	// if nextS > sLimit { goto emitRemainder }
+	MOVQ R13, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JA   emitRemainder
+
+	// candidate = int(table[nextHash])
+	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
+	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+	BYTE $0x4e
+	BYTE $0x0f
+	BYTE $0xb7
+	BYTE $0x7c
+	BYTE $0x5c
+	BYTE $0x78
+
+	// table[nextHash] = uint16(s)
+	MOVQ SI, AX
+	SUBQ DX, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// nextHash = hash(load32(src, nextS), shift)
+	MOVL  0(R13), R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// if load32(src, s) != load32(src, candidate) { continue } break
+	MOVL 0(SI), AX
+	MOVL (DX)(R15*1), BX
+	CMPL AX, BX
+	JNE  inner0
+
+fourByteMatch:
+	// As per the encode_other.go code:
+	//
+	// A 4-byte match has been found. We'll later see etc.
+
+	// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
+	// on inputMargin in encode.go.
+	MOVQ SI, AX
+	SUBQ R10, AX
+	CMPQ AX, $16
+	JLE  emitLiteralFastPath
+
+	// ----------------------------------------
+	// Begin inline of the emitLiteral call.
+	//
+	// d += emitLiteral(dst[d:], src[nextEmit:s])
+
+	MOVL AX, BX
+	SUBL $1, BX
+
+	CMPL BX, $60
+	JLT  inlineEmitLiteralOneByte
+	CMPL BX, $256
+	JLT  inlineEmitLiteralTwoBytes
+
+inlineEmitLiteralThreeBytes:
+	MOVB $0xf4, 0(DI)
+	MOVW BX, 1(DI)
+	ADDQ $3, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralTwoBytes:
+	MOVB $0xf0, 0(DI)
+	MOVB BX, 1(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitLiteralMemmove
+
+inlineEmitLiteralOneByte:
+	SHLB $2, BX
+	MOVB BX, 0(DI)
+	ADDQ $1, DI
+
+inlineEmitLiteralMemmove:
+	// Spill local variables (registers) onto the stack; call; unspill.
+	//
+	// copy(dst[i:], lit)
+	//
+	// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
+	// DI, R10 and AX as arguments.
+	MOVQ DI, 0(SP)
+	MOVQ R10, 8(SP)
+	MOVQ AX, 16(SP)
+	ADDQ AX, DI              // Finish the "d +=" part of "d += emitLiteral(etc)".
+	MOVQ SI, 72(SP)
+	MOVQ DI, 80(SP)
+	MOVQ R15, 112(SP)
+	CALL runtime·memmove(SB)
+	MOVQ 56(SP), CX
+	MOVQ 64(SP), DX
+	MOVQ 72(SP), SI
+	MOVQ 80(SP), DI
+	MOVQ 88(SP), R9
+	MOVQ 112(SP), R15
+	JMP  inner1
+
+inlineEmitLiteralEnd:
+	// End inline of the emitLiteral call.
+	// ----------------------------------------
+
+emitLiteralFastPath:
+	// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
+	MOVB AX, BX
+	SUBB $1, BX
+	SHLB $2, BX
+	MOVB BX, (DI)
+	ADDQ $1, DI
+
+	// !!! Implement the copy from lit to dst as a 16-byte load and store.
+	// (Encode's documentation says that dst and src must not overlap.)
+	//
+	// This always copies 16 bytes, instead of only len(lit) bytes, but that's
+	// OK. Subsequent iterations will fix up the overrun.
+	//
+	// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
+	// 16-byte loads and stores. This technique probably wouldn't be as
+	// effective on architectures that are fussier about alignment.
+	MOVOU 0(R10), X0
+	MOVOU X0, 0(DI)
+	ADDQ  AX, DI
+
+inner1:
+	// for { etc }
+
+	// base := s
+	MOVQ SI, R12
+
+	// !!! offset := base - candidate
+	MOVQ R12, R11
+	SUBQ R15, R11
+	SUBQ DX, R11
+
+	// ----------------------------------------
+	// Begin inline of the extendMatch call.
+	//
+	// s = extendMatch(src, candidate+4, s+4)
+
+	// !!! R14 = &src[len(src)]
+	MOVQ src_len+32(FP), R14
+	ADDQ DX, R14
+
+	// !!! R13 = &src[len(src) - 8]
+	MOVQ R14, R13
+	SUBQ $8, R13
+
+	// !!! R15 = &src[candidate + 4]
+	ADDQ $4, R15
+	ADDQ DX, R15
+
+	// !!! s += 4
+	ADDQ $4, SI
+
+inlineExtendMatchCmp8:
+	// As long as we are 8 or more bytes before the end of src, we can load and
+	// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
+	CMPQ SI, R13
+	JA   inlineExtendMatchCmp1
+	MOVQ (R15), AX
+	MOVQ (SI), BX
+	CMPQ AX, BX
+	JNE  inlineExtendMatchBSF
+	ADDQ $8, R15
+	ADDQ $8, SI
+	JMP  inlineExtendMatchCmp8
+
+inlineExtendMatchBSF:
+	// If those 8 bytes were not equal, XOR the two 8 byte values, and return
+	// the index of the first byte that differs. The BSF instruction finds the
+	// least significant 1 bit, the amd64 architecture is little-endian, and
+	// the shift by 3 converts a bit index to a byte index.
+	XORQ AX, BX
+	BSFQ BX, BX
+	SHRQ $3, BX
+	ADDQ BX, SI
+	JMP  inlineExtendMatchEnd
+
+inlineExtendMatchCmp1:
+	// In src's tail, compare 1 byte at a time.
+	CMPQ SI, R14
+	JAE  inlineExtendMatchEnd
+	MOVB (R15), AX
+	MOVB (SI), BX
+	CMPB AX, BX
+	JNE  inlineExtendMatchEnd
+	ADDQ $1, R15
+	ADDQ $1, SI
+	JMP  inlineExtendMatchCmp1
+
+inlineExtendMatchEnd:
+	// End inline of the extendMatch call.
+	// ----------------------------------------
+
+	// ----------------------------------------
+	// Begin inline of the emitCopy call.
+	//
+	// d += emitCopy(dst[d:], base-candidate, s-base)
+
+	// !!! length := s - base
+	MOVQ SI, AX
+	SUBQ R12, AX
+
+inlineEmitCopyLoop0:
+	// for length >= 68 { etc }
+	CMPL AX, $68
+	JLT  inlineEmitCopyStep1
+
+	// Emit a length 64 copy, encoded as 3 bytes.
+	MOVB $0xfe, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $64, AX
+	JMP  inlineEmitCopyLoop0
+
+inlineEmitCopyStep1:
+	// if length > 64 { etc }
+	CMPL AX, $64
+	JLE  inlineEmitCopyStep2
+
+	// Emit a length 60 copy, encoded as 3 bytes.
+	MOVB $0xee, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+	SUBL $60, AX
+
+inlineEmitCopyStep2:
+	// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
+	CMPL AX, $12
+	JGE  inlineEmitCopyStep3
+	CMPL R11, $2048
+	JGE  inlineEmitCopyStep3
+
+	// Emit the remaining copy, encoded as 2 bytes.
+	MOVB R11, 1(DI)
+	SHRL $8, R11
+	SHLB $5, R11
+	SUBB $4, AX
+	SHLB $2, AX
+	ORB  AX, R11
+	ORB  $1, R11
+	MOVB R11, 0(DI)
+	ADDQ $2, DI
+	JMP  inlineEmitCopyEnd
+
+inlineEmitCopyStep3:
+	// Emit the remaining copy, encoded as 3 bytes.
+	SUBL $1, AX
+	SHLB $2, AX
+	ORB  $2, AX
+	MOVB AX, 0(DI)
+	MOVW R11, 1(DI)
+	ADDQ $3, DI
+
+inlineEmitCopyEnd:
+	// End inline of the emitCopy call.
+	// ----------------------------------------
+
+	// nextEmit = s
+	MOVQ SI, R10
+
+	// if s >= sLimit { goto emitRemainder }
+	MOVQ SI, AX
+	SUBQ DX, AX
+	CMPQ AX, R9
+	JAE  emitRemainder
+
+	// As per the encode_other.go code:
+	//
+	// We could immediately etc.
+
+	// x := load64(src, s-1)
+	MOVQ -1(SI), R14
+
+	// prevHash := hash(uint32(x>>0), shift)
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// table[prevHash] = uint16(s-1)
+	MOVQ SI, AX
+	SUBQ DX, AX
+	SUBQ $1, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// currHash := hash(uint32(x>>8), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// candidate = int(table[currHash])
+	// XXX: MOVWQZX table-32768(SP)(R11*2), R15
+	// XXX: 4e 0f b7 7c 5c 78       movzwq 0x78(%rsp,%r11,2),%r15
+	BYTE $0x4e
+	BYTE $0x0f
+	BYTE $0xb7
+	BYTE $0x7c
+	BYTE $0x5c
+	BYTE $0x78
+
+	// table[currHash] = uint16(s)
+	ADDQ $1, AX
+
+	// XXX: MOVW AX, table-32768(SP)(R11*2)
+	// XXX: 66 42 89 44 5c 78       mov    %ax,0x78(%rsp,%r11,2)
+	BYTE $0x66
+	BYTE $0x42
+	BYTE $0x89
+	BYTE $0x44
+	BYTE $0x5c
+	BYTE $0x78
+
+	// if uint32(x>>8) == load32(src, candidate) { continue }
+	MOVL (DX)(R15*1), BX
+	CMPL R14, BX
+	JEQ  inner1
+
+	// nextHash = hash(uint32(x>>16), shift)
+	SHRQ  $8, R14
+	MOVL  R14, R11
+	IMULL $0x1e35a7bd, R11
+	SHRL  CX, R11
+
+	// s++
+	ADDQ $1, SI
+
+	// break out of the inner1 for loop, i.e. continue the outer loop.
+	JMP outer
+
+emitRemainder:
+	// if nextEmit < len(src) { etc }
+	MOVQ src_len+32(FP), AX
+	ADDQ DX, AX
+	CMPQ R10, AX
+	JEQ  encodeBlockEnd
+
+	// d += emitLiteral(dst[d:], src[nextEmit:])
+	//
+	// Push args.
+	MOVQ DI, 0(SP)
+	MOVQ $0, 8(SP)   // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ $0, 16(SP)  // Unnecessary, as the callee ignores it, but conservative.
+	MOVQ R10, 24(SP)
+	SUBQ R10, AX
+	MOVQ AX, 32(SP)
+	MOVQ AX, 40(SP)  // Unnecessary, as the callee ignores it, but conservative.
+
+	// Spill local variables (registers) onto the stack; call; unspill.
+	MOVQ DI, 80(SP)
+	CALL ·emitLiteral(SB)
+	MOVQ 80(SP), DI
+
+	// Finish the "d +=" part of "d += emitLiteral(etc)".
+	ADDQ 48(SP), DI
+
+encodeBlockEnd:
+	MOVQ dst_base+0(FP), AX
+	SUBQ AX, DI
+	MOVQ DI, d+48(FP)
+	RET
diff --git a/vendor/github.com/klauspost/compress/snappy/encode_other.go b/vendor/github.com/klauspost/compress/snappy/encode_other.go
new file mode 100644
index 00000000000..dbcae905e6e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/encode_other.go
@@ -0,0 +1,238 @@
+// Copyright 2016 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64 appengine !gc noasm
+
+package snappy
+
+func load32(b []byte, i int) uint32 {
+	b = b[i : i+4 : len(b)] // Reslice to exactly 4 bytes so the compiler can drop bounds checks on the next line.
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 // little-endian: b[0] is the low byte
+}
+
+func load64(b []byte, i int) uint64 {
+	b = b[i : i+8 : len(b)] // Reslice to exactly 8 bytes so the compiler can drop bounds checks on the next line.
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 // little-endian: b[0] is the low byte
+}
+
+// emitLiteral writes a literal chunk (tag bytes followed by the bytes of lit)
+// to dst and returns the total number of bytes written.
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= len(lit) && len(lit) <= 65536
+func emitLiteral(dst, lit []byte) int {
+	i, n := 0, uint(len(lit)-1) // n is the encoded length (len(lit)-1); i counts the header bytes
+	switch {
+	case n < 60: // n fits in the upper 6 bits of the tag byte
+		dst[0] = uint8(n)<<2 | tagLiteral
+		i = 1
+	case n < 1<<8: // marker 60: n follows in one extra byte
+		dst[0] = 60<<2 | tagLiteral
+		dst[1] = uint8(n)
+		i = 2
+	default: // marker 61: n follows in two extra bytes, low byte first
+		dst[0] = 61<<2 | tagLiteral
+		dst[1] = uint8(n)
+		dst[2] = uint8(n >> 8)
+		i = 3
+	}
+	return i + copy(dst[i:], lit) // header bytes plus the literal bytes copied
+}
+
+// emitCopy writes a copy chunk and returns the number of bytes written.
+//
+// It assumes that:
+//	dst is long enough to hold the encoded bytes
+//	1 <= offset && offset <= 65535
+//	4 <= length && length <= 65535
+func emitCopy(dst []byte, offset, length int) int {
+	i := 0
+	// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
+	// threshold for this loop is a little higher (at 68 = 64 + 4), and the
+	// length emitted down below is a little lower (at 60 = 64 - 4), because
+	// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
+	// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
+	// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
+	// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
+	// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
+	// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
+	for length >= 68 {
+		// Emit a length 64 copy, encoded as 3 bytes.
+		dst[i+0] = 63<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 64
+	}
+	if length > 64 {
+		// Emit a length 60 copy, encoded as 3 bytes.
+		dst[i+0] = 59<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		i += 3
+		length -= 60
+	}
+	if length >= 12 || offset >= 2048 {
+		// Emit the remaining copy, encoded as 3 bytes.
+		dst[i+0] = uint8(length-1)<<2 | tagCopy2
+		dst[i+1] = uint8(offset)
+		dst[i+2] = uint8(offset >> 8)
+		return i + 3
+	}
+	// Emit the remaining copy, encoded as 2 bytes.
+	dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 // offset bits 8-10 | length-4 (3 bits) | tag
+	dst[i+1] = uint8(offset)
+	return i + 2
+}
+
+// extendMatch returns the largest k such that k <= len(src) and that
+// src[i:i+k-j] and src[j:k] have the same contents.
+//
+// It assumes that:
+//	0 <= i && i < j && j <= len(src)
+func extendMatch(src []byte, i, j int) int {
+	for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 { // advance both cursors while the bytes agree
+	}
+	return j
+}
+
+func hash(u, shift uint32) uint32 {
+	return (u * 0x1e35a7bd) >> shift // multiplicative hash: keeps the top (32 - shift) bits of the 32-bit product
+}
+
+// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
+// assumes that the varint-encoded length of the decompressed bytes has already
+// been written.
+//
+// It also assumes that:
+//	len(dst) >= MaxEncodedLen(len(src)) &&
+//	minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
+func encodeBlock(dst, src []byte) (d int) {
+	// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
+	// The table element type is uint16, as s < sLimit and sLimit < len(src)
+	// and len(src) <= maxBlockSize and maxBlockSize == 65536.
+	const (
+		maxTableSize = 1 << 14
+		// tableMask is redundant, but helps the compiler eliminate bounds
+		// checks.
+		tableMask = maxTableSize - 1
+	)
+	shift := uint32(32 - 8) // matches the smallest table size (1<<8); lowered once per doubling below
+	for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
+		shift--
+	}
+	// In Go, all array elements are zero-initialized, so there is no advantage
+	// to a smaller tableSize per se. However, it matches the C++ algorithm,
+	// and in the asm versions of this code, we can get away with zeroing only
+	// the first tableSize elements.
+	var table [maxTableSize]uint16
+
+	// sLimit is when to stop looking for offset/length copies. The inputMargin
+	// lets us use a fast path for emitLiteral in the main loop, while we are
+	// looking for copies.
+	sLimit := len(src) - inputMargin
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := 0
+
+	// The encoded form must start with a literal, as there are no previous
+	// bytes to copy, so we start looking for hash matches at s == 1.
+	s := 1
+	nextHash := hash(load32(src, s), shift)
+
+	for {
+		// Copied from the C++ snappy implementation:
+		//
+		// Heuristic match skipping: If 32 bytes are scanned with no matches
+		// found, start looking only at every other byte. If 32 more bytes are
+		// scanned (or skipped), look at every third byte, etc. When a match
+		// is found, immediately go back to looking at every byte. This is a
+		// small loss (~5% performance, ~0.1% density) for compressible data
+		// due to more bookkeeping, but for non-compressible data (such as
+		// JPEG) it's a huge win since the compressor quickly "realizes" the
+		// data is incompressible and doesn't bother looking for matches
+		// everywhere.
+		//
+		// The "skip" variable keeps track of how many bytes there are since
+		// the last match; dividing it by 32 (i.e. right-shifting by five) gives
+		// the number of bytes to move ahead for each iteration.
+		skip := 32
+
+		nextS := s
+		candidate := 0
+		for {
+			s = nextS
+			bytesBetweenHashLookups := skip >> 5
+			nextS = s + bytesBetweenHashLookups
+			skip += bytesBetweenHashLookups
+			if nextS > sLimit {
+				goto emitRemainder
+			}
+			candidate = int(table[nextHash&tableMask])
+			table[nextHash&tableMask] = uint16(s)
+			nextHash = hash(load32(src, nextS), shift)
+			if load32(src, s) == load32(src, candidate) {
+				break
+			}
+		}
+
+		// A 4-byte match has been found. We'll later see if more than 4 bytes
+		// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
+		// them as literal bytes.
+		d += emitLiteral(dst[d:], src[nextEmit:s])
+
+		// Call emitCopy, and then see if another emitCopy could be our next
+		// move. Repeat until we find no match for the input immediately after
+		// what was consumed by the last emitCopy call.
+		//
+		// If we exit this loop normally then we need to call emitLiteral next,
+		// though we don't yet know how big the literal will be. We handle that
+		// by proceeding to the next iteration of the main loop. We also can
+		// exit this loop via goto if we get close to exhausting the input.
+		for {
+			// Invariant: we have a 4-byte match at s, and no need to emit any
+			// literal bytes prior to s.
+			base := s
+
+			// Extend the 4-byte match as long as possible.
+			//
+			// This is an inlined version of:
+			//	s = extendMatch(src, candidate+4, s+4)
+			s += 4
+			for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
+			}
+
+			d += emitCopy(dst[d:], base-candidate, s-base)
+			nextEmit = s
+			if s >= sLimit {
+				goto emitRemainder
+			}
+
+			// We could immediately start working at s now, but to improve
+			// compression we first update the hash table at s-1 and at s. If
+			// another emitCopy is not our next move, also calculate nextHash
+			// at s+1. At least on GOARCH=amd64, these three hash calculations
+			// are faster as one load64 call (with some shifts) instead of
+			// three load32 calls.
+			x := load64(src, s-1)
+			prevHash := hash(uint32(x>>0), shift)
+			table[prevHash&tableMask] = uint16(s - 1)
+			currHash := hash(uint32(x>>8), shift)
+			candidate = int(table[currHash&tableMask])
+			table[currHash&tableMask] = uint16(s)
+			if uint32(x>>8) != load32(src, candidate) {
+				nextHash = hash(uint32(x>>16), shift)
+				s++
+				break
+			}
+		}
+	}
+
+emitRemainder:
+	if nextEmit < len(src) {
+		d += emitLiteral(dst[d:], src[nextEmit:])
+	}
+	return d
+}
diff --git a/vendor/github.com/klauspost/compress/snappy/runbench.cmd b/vendor/github.com/klauspost/compress/snappy/runbench.cmd
new file mode 100644
index 00000000000..d24eb4b47c3
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/runbench.cmd
@@ -0,0 +1,2 @@
+del old.txt
+go test -bench=. >>old.txt && go test -bench=. >>old.txt && go test -bench=. >>old.txt && benchstat -delta-test=ttest old.txt new.txt
diff --git a/vendor/github.com/klauspost/compress/snappy/snappy.go b/vendor/github.com/klauspost/compress/snappy/snappy.go
new file mode 100644
index 00000000000..74a36689e87
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/snappy/snappy.go
@@ -0,0 +1,98 @@
+// Copyright 2011 The Snappy-Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package snappy implements the Snappy compression format. It aims for very
+// high speeds and reasonable compression.
+//
+// There are actually two Snappy formats: block and stream. They are related,
+// but different: trying to decompress block-compressed data as a Snappy stream
+// will fail, and vice versa. The block format is the Decode and Encode
+// functions and the stream format is the Reader and Writer types.
+//
+// The block format, the more common case, is used when the complete size (the
+// number of bytes) of the original data is known upfront, at the time
+// compression starts. The stream format, also known as the framing format, is
+// for when that isn't always true.
+//
+// The canonical, C++ implementation is at https://github.com/google/snappy and
+// it only implements the block format.
+package snappy
+
+import (
+	"hash/crc32"
+)
+
+/*
+Each encoded block begins with the varint-encoded length of the decoded data,
+followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
+first byte of each chunk is broken into its 2 least and 6 most significant bits
+called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
+Zero means a literal tag. All other values mean a copy tag.
+
+For literal tags:
+  - If m < 60, the next 1 + m bytes are literal bytes.
+  - Otherwise, let n be the little-endian unsigned integer denoted by the next
+    m - 59 bytes. The next 1 + n bytes after that are literal bytes.
+
+For copy tags, length bytes are copied from offset bytes ago, in the style of
+Lempel-Ziv compression algorithms. In particular:
+  - For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
+    The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
+    of the offset. The next byte is bits 0-7 of the offset.
+  - For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
+    The length is 1 + m. The offset is the little-endian unsigned integer
+    denoted by the next 2 bytes.
+  - For l == 3, this tag is a legacy format that is no longer issued by most
+    encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
+    [1, 65). The length is 1 + m. The offset is the little-endian unsigned
+    integer denoted by the next 4 bytes.
+*/
+const (
+	tagLiteral = 0x00 // l == 0: literal chunk
+	tagCopy1   = 0x01 // l == 1: copy, offset in [0, 1<<11), length in [4, 12)
+	tagCopy2   = 0x02 // l == 2: copy, offset in [0, 1<<16), length in [1, 65)
+	tagCopy4   = 0x03 // l == 3: legacy copy, offset in [0, 1<<32), length in [1, 65)
+)
+
+const (
+	checksumSize    = 4
+	chunkHeaderSize = 4
+	magicChunk      = "\xff\x06\x00\x00" + magicBody // leading 0xff equals chunkTypeStreamIdentifier; 0x06 equals len(magicBody)
+	magicBody       = "sNaPpY"
+
+	// maxBlockSize is the maximum size of the input to encodeBlock. It is not
+	// part of the wire format per se, but some parts of the encoder assume
+	// that an offset fits into a uint16.
+	//
+	// Also, for the framing format (Writer type instead of Encode function),
+	// https://github.com/google/snappy/blob/master/framing_format.txt says
+	// that "the uncompressed data in a chunk must be no longer than 65536
+	// bytes".
+	maxBlockSize = 65536
+
+	// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
+	// hard coded to be a const instead of a variable, so that obufLen can also
+	// be a const. Their equivalence is confirmed by
+	// TestMaxEncodedLenOfMaxBlockSize.
+	maxEncodedLenOfMaxBlockSize = 76490
+
+	obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
+	obufLen       = obufHeaderLen + maxEncodedLenOfMaxBlockSize
+)
+
+const (
+	chunkTypeCompressedData   = 0x00
+	chunkTypeUncompressedData = 0x01
+	chunkTypePadding          = 0xfe
+	chunkTypeStreamIdentifier = 0xff // same value as the first byte of magicChunk
+)
+
+var crcTable = crc32.MakeTable(crc32.Castagnoli)
+
+// crc returns the checksum of b as specified in section 3 of
+// https://github.com/google/snappy/blob/master/framing_format.txt
+func crc(b []byte) uint32 {
+	c := crc32.Update(0, crcTable, b) // CRC-32 of b using crcTable (Castagnoli polynomial)
+	return uint32(c>>15|c<<17) + 0xa282ead8 // rotate right by 15 bits, then add a constant
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/README.md b/vendor/github.com/klauspost/compress/zstd/README.md
new file mode 100644
index 00000000000..bc977a30234
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/README.md
@@ -0,0 +1,393 @@
+# zstd 
+
+[Zstandard](https://facebook.github.io/zstd/) is a real-time compression algorithm, providing high compression ratios. 
+It offers a very wide range of compression / speed trade-off, while being backed by a very fast decoder.
+A high performance compression algorithm is implemented. For now focused on speed. 
+
+This package provides [compression](#Compressor) to and [decompression](#Decompressor) of Zstandard content. 
+Note that custom dictionaries are not supported yet, so if your code relies on that, 
+you cannot use the package as-is.
+
+This package is pure Go and without use of "unsafe". 
+If a significant speedup can be achieved using "unsafe", it may be added as an option later.
+
+The `zstd` package is provided as open source software using a Go standard license.
+
+Currently the package is heavily optimized for 64 bit processors and will be significantly slower on 32 bit processors.
+
+## Installation
+
+Install using `go get -u github.com/klauspost/compress`. The package is located in `github.com/klauspost/compress/zstd`.
+
+Godoc Documentation: https://godoc.org/github.com/klauspost/compress/zstd
+
+
+## Compressor
+
+### Status: 
+
+STABLE - there may always be subtle bugs, a wide variety of content has been tested and the library is actively 
+used by several projects. This library is being continuously [fuzz-tested](https://github.com/klauspost/compress-fuzz),
+kindly supplied by [fuzzit.dev](https://fuzzit.dev/).
+
+There may still be specific combinations of data types/size/settings that could lead to edge cases, 
+so as always, testing is recommended.  
+
+For now, a high speed (fastest) and medium-fast (default) compressor has been implemented. 
+
+The "Fastest" compression ratio is roughly equivalent to zstd level 1. 
+The "Default" compression ratio is roughly equivalent to zstd level 3 (default).
+
+In terms of speed, it is typically 2x as fast as the stdlib deflate/gzip in its fastest mode. 
+The compression ratio compared to stdlib is around level 3, but usually 3x as fast.
+
+Compared to cgo zstd, the speed is around level 3 (default), but compression slightly worse, between level 1&2.
+
+ 
+### Usage
+
+An Encoder can be used for either compressing a stream via the
+`io.WriteCloser` interface supported by the Encoder or as multiple independent
+tasks via the `EncodeAll` function.
+Smaller encodes are encouraged to use the EncodeAll function.
+Use `NewWriter` to create a new instance that can be used for both.
+
+To create a writer with default options, do it like this:
+
+```Go
+// Compress input to output.
+func Compress(in io.Reader, out io.Writer) error {
+    enc, err := zstd.NewWriter(out)
+    if err != nil {
+        return err
+    }
+    _, err = io.Copy(enc, in)
+    if err != nil {
+        enc.Close()
+        return err
+    }
+    return enc.Close()
+}
+```
+
+Now you can encode by writing data to `enc`. The output will be finished writing when `Close()` is called.
+Even if your encode fails, you should still call `Close()` to release any resources that may be held up.  
+
+The above is fine for big encodes. However, whenever possible try to *reuse* the writer.
+
+To reuse the encoder, you can use the `Reset(io.Writer)` function to change to another output. 
+This will allow the encoder to reuse all resources and avoid wasteful allocations. 
+
+Currently stream encoding has 'light' concurrency, meaning up to 2 goroutines can be working on part 
+of a stream. This is independent of the `WithEncoderConcurrency(n)`, but that is likely to change 
+in the future. So if you want to limit concurrency for future updates, specify the concurrency
+you would like.
+
+You can specify your desired compression level using `WithEncoderLevel()` option. Currently only pre-defined 
+compression settings can be specified.
+
+#### Future Compatibility Guarantees
+
+This will be an evolving project. When using this package it is important to note that both the compression efficiency and speed may change.
+
+The goal will be to keep the default efficiency at the default zstd (level 3). 
+However the encoding should never be assumed to remain the same, 
+and you should not use hashes of compressed output for similarity checks.
+
+The Encoder can be assumed to produce the same output from the exact same code version.
+However, there may be modes in the future that break this, 
+although they will not be enabled without an explicit option.   
+
+This encoder is not designed to (and will probably never) output the exact same bitstream as the reference encoder.
+
+Also note, that the cgo decompressor currently does not [report all errors on invalid input](https://github.com/DataDog/zstd/issues/59),
+[omits error checks](https://github.com/DataDog/zstd/issues/61), [ignores checksums](https://github.com/DataDog/zstd/issues/43) 
+and seems to ignore concatenated streams, even though [it is part of the spec](https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frames).
+
+#### Blocks
+
+For compressing small blocks, the returned encoder has a function called `EncodeAll(src, dst []byte) []byte`.
+
+`EncodeAll` will encode all input in src and append it to dst.
+This function can be called concurrently, but each call will only run on a single goroutine.
+
+Encoded blocks can be concatenated and the result will be the combined input stream.
+Data compressed with EncodeAll can be decoded with the Decoder, using either a stream or `DecodeAll`.
+
+Especially when encoding blocks you should take special care to reuse the encoder. 
+This will effectively make it run without allocations after a warmup period. 
+To make it run completely without allocations, supply a destination buffer with space for all content.   
+
+```Go
+import "github.com/klauspost/compress/zstd"
+
+// Create a writer that caches compressors.
+// For this operation type we supply a nil Writer.
+var encoder, _ = zstd.NewWriter(nil)
+
+// Compress a buffer. 
+// If you have a destination buffer, the allocation in the call can also be eliminated.
+func Compress(src []byte) []byte {
+    return encoder.EncodeAll(src, make([]byte, 0, len(src)))
+} 
+```
+
+You can control the maximum number of concurrent encodes using the `WithEncoderConcurrency(n)` 
+option when creating the writer.
+
+Using the Encoder for both a stream and individual blocks concurrently is safe. 
+
+### Performance
+
+I have collected some speed examples to compare speed and compression against other compressors.
+
+* `file` is the input file.
+* `out` is the compressor used. `zskp` is this package. `gzstd` is gzip standard library. `zstd` is the Datadog cgo library.
+* `level` is the compression level used. For `zskp` level 1 is "fastest", level 2 is "default".
+* `insize`/`outsize` is the input/output size.
+* `millis` is the number of milliseconds used for compression.
+* `mb/s` is megabytes (2^20 bytes) per second.
+
+```
+The test data for the Large Text Compression Benchmark is the first
+10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
+http://mattmahoney.net/dc/textdata.html
+
+file    out     level   insize  outsize     millis  mb/s
+enwik9  zskp    1   1000000000  343833033   5840    163.30
+enwik9  zskp    2   1000000000  317822183   8449    112.87
+enwik9  gzstd   1   1000000000  382578136   13627   69.98
+enwik9  gzstd   3   1000000000  349139651   22344   42.68
+enwik9  zstd    1   1000000000  357416379   4838    197.12
+enwik9  zstd    3   1000000000  313734522   7556    126.21
+
+GOB stream of binary data. Highly compressible.
+https://files.klauspost.com/compress/gob-stream.7z
+
+file        out level   insize      outsize     millis  mb/s
+gob-stream  zskp    1   1911399616  234981983   5100    357.42
+gob-stream  zskp    2   1911399616  208674003   6698    272.15
+gob-stream  gzstd   1   1911399616  357382641   14727   123.78
+gob-stream  gzstd   3   1911399616  327835097   17005   107.19
+gob-stream  zstd    1   1911399616  250787165   4075    447.22
+gob-stream  zstd    3   1911399616  208191888   5511    330.77
+
+Highly compressible JSON file. Similar to logs in a lot of ways.
+https://files.klauspost.com/compress/adresser.001.gz
+
+file            out level   insize      outsize     millis  mb/s
+adresser.001    zskp    1   1073741824  18510122    1477    692.83
+adresser.001    zskp    2   1073741824  19831697    1705    600.59
+adresser.001    gzstd   1   1073741824  47755503    3079    332.47
+adresser.001    gzstd   3   1073741824  40052381    3051    335.63
+adresser.001    zstd    1   1073741824  16135896    994     1030.18
+adresser.001    zstd    3   1073741824  17794465    905     1131.49
+
+VM Image, Linux mint with a few installed applications:
+https://files.klauspost.com/compress/rawstudio-mint14.7z
+
+file    out level   insize  outsize millis  mb/s
+rawstudio-mint14.tar    zskp    1   8558382592  3648168838  33398   244.38
+rawstudio-mint14.tar    zskp    2   8558382592  3376721436  50962   160.16
+rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  84712   96.35
+rawstudio-mint14.tar    gzstd   3   8558382592  3740711978  176344  46.28
+rawstudio-mint14.tar    zstd    1   8558382592  3607859742  27903   292.51
+rawstudio-mint14.tar    zstd    3   8558382592  3341710879  46700   174.77
+
+
+The test data is designed to test archivers in realistic backup scenarios.
+http://mattmahoney.net/dc/10gb.html
+
+file    out level   insize  outsize millis  mb/s
+10gb.tar    zskp    1   10065157632 4883149814  45715   209.97
+10gb.tar    zskp    2   10065157632 4638110010  60970   157.44
+10gb.tar    gzstd   1   10065157632 5198296126  97769   98.18
+10gb.tar    gzstd   3   10065157632 4932665487  313427  30.63
+10gb.tar    zstd    1   10065157632 4940796535  40391   237.65
+10gb.tar    zstd    3   10065157632 4638618579  52911   181.42
+
+Silesia Corpus:
+http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip
+
+file    out level   insize  outsize millis  mb/s
+silesia.tar zskp    1   211947520   73025800    1108    182.26
+silesia.tar zskp    2   211947520   67674684    1599    126.41
+silesia.tar gzstd   1   211947520   80007735    2515    80.37
+silesia.tar gzstd   3   211947520   73133380    4259    47.45
+silesia.tar zstd    1   211947520   73513991    933     216.64
+silesia.tar zstd    3   211947520   66793301    1377    146.79
+```
+
+### Converters
+
+As part of the development process a *Snappy* -> *Zstandard* converter was also built.
+
+This can convert a *framed* [Snappy Stream](https://godoc.org/github.com/golang/snappy#Writer) to a zstd stream. 
+Note that a single block is not framed.
+
+Conversion is done by converting the stream directly from Snappy without intermediate full decoding.
+Therefore the compression ratio is much less than what can be done by a full decompression
+and compression, and a faulty Snappy stream may lead to a faulty Zstandard stream without
+any errors being generated.
+No CRC value is being generated and not all CRC values of the Snappy stream are checked.
+However, it provides really fast re-compression of Snappy streams.
+
+
+```
+BenchmarkSnappy_ConvertSilesia-8           1  1156001600 ns/op   183.35 MB/s
+Snappy len 103008711 -> zstd len 82687318
+
+BenchmarkSnappy_Enwik9-8           1  6472998400 ns/op   154.49 MB/s
+Snappy len 508028601 -> zstd len 390921079
+```
+
+
+```Go
+    s := zstd.SnappyConverter{}
+    n, err = s.Convert(input, output)
+    if err == nil {
+        fmt.Println("Re-compressed stream to", n, "bytes")
+    }
+```
+
+The converter `s` can be reused to avoid allocations, even after errors.
+
+
+## Decompressor
+
+Status: STABLE - there may still be subtle bugs, but a wide variety of content has been tested.
+
+This library is being continuously [fuzz-tested](https://github.com/klauspost/compress-fuzz),
+kindly supplied by [fuzzit.dev](https://fuzzit.dev/). 
+The main purpose of the fuzz testing is to ensure that it is not possible to crash the decoder, 
+or run it past its limits with ANY input provided.  
+ 
+### Usage
+
+The package has been designed for two main usages, big streams of data and smaller in-memory buffers. 
+There are two main usages of the package for these. Both of them are accessed by creating a `Decoder`.
+
+For streaming use a simple setup could look like this:
+
+```Go
+import "github.com/klauspost/compress/zstd"
+
+func Decompress(in io.Reader, out io.Writer) error {
+    d, err := zstd.NewReader(in)
+    if err != nil {
+        return err
+    }
+    defer d.Close()
+    
+    // Copy content...
+    _, err = io.Copy(out, d)
+    return err
+}
+```
+
+It is important to use the "Close" function when you no longer need the Reader to stop running goroutines. 
+See "Allocation-less operation" below.
+
+For decoding buffers, it could look something like this:
+
+```Go
+import "github.com/klauspost/compress/zstd"
+
+// Create a reader that caches decompressors.
+// For this operation type we supply a nil Reader.
+var decoder, _ = zstd.NewReader(nil)
+
+// Decompress a buffer. We don't supply a destination buffer,
+// so it will be allocated by the decoder.
+func Decompress(src []byte) ([]byte, error) {
+    return decoder.DecodeAll(src, nil)
+} 
+```
+
+Both of these cases should provide the functionality needed. 
+The decoder can be used for *concurrent* decompression of multiple buffers. 
+It will only allow a certain number of concurrent operations to run. 
+To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.   
+
+### Allocation-less operation
+
+The decoder has been designed to operate without allocations after a warmup. 
+
+This means that you should *store* the decoder for best performance. 
+To re-use a stream decoder, use the `Reset(r io.Reader) error` to switch to another stream.
+A decoder can safely be re-used even if the previous stream failed.
+
+To release the resources, you must call the `Close()` function on a decoder.
+After this it can *no longer be reused*, but all running goroutines will be stopped.
+So you *must* use this if you will no longer need the Reader.
+
+For decompressing smaller buffers a single decoder can be used.
+When decoding buffers, you can supply a destination slice with length 0 and your expected capacity.
+In this case no unneeded allocations should be made. 
+
+### Concurrency
+
+The buffer decoder does everything on the same goroutine and does nothing concurrently.
+It can however decode several buffers concurrently. Use `WithDecoderConcurrency(n)` to limit that.
+
+The stream decoder operates on several goroutines:
+
+* One goroutine reads input and splits the input to several block decoders.
+* A number of decoders will decode blocks.
+* A goroutine coordinates these blocks and sends history from one to the next.
+
+So effectively this also means the decoder will "read ahead" and prepare data to always be available for output.
+
+Since "blocks" are quite dependent on the output of the previous block, stream decoding will only have limited concurrency.
+
+In practice this means that concurrency is often limited to utilizing about 2 cores effectively.
+ 
+ 
+### Benchmarks
+
+These are some examples of performance compared to [datadog cgo library](https://github.com/DataDog/zstd).
+
+The first two are streaming decodes and the last are smaller inputs. 
+ 
+```
+BenchmarkDecoderSilesia-8             20       642550210 ns/op   329.85 MB/s      3101 B/op        8 allocs/op
+BenchmarkDecoderSilesiaCgo-8         100       384930000 ns/op   550.61 MB/s    451878 B/op     9713 allocs/op
+
+BenchmarkDecoderEnwik9-2              10        3146000080 ns/op         317.86 MB/s        2649 B/op          9 allocs/op
+BenchmarkDecoderEnwik9Cgo-2           20        1905900000 ns/op         524.69 MB/s     1125120 B/op      45785 allocs/op
+
+BenchmarkDecoder_DecodeAll/z000000.zst-8               200     7049994 ns/op   138.26 MB/s        40 B/op        2 allocs/op
+BenchmarkDecoder_DecodeAll/z000001.zst-8            100000       19560 ns/op    97.49 MB/s        40 B/op        2 allocs/op
+BenchmarkDecoder_DecodeAll/z000002.zst-8              5000      297599 ns/op   236.99 MB/s        40 B/op        2 allocs/op
+BenchmarkDecoder_DecodeAll/z000003.zst-8              2000      725502 ns/op   141.17 MB/s        40 B/op        2 allocs/op
+BenchmarkDecoder_DecodeAll/z000004.zst-8            200000        9314 ns/op    54.54 MB/s        40 B/op        2 allocs/op
+BenchmarkDecoder_DecodeAll/z000005.zst-8             10000      137500 ns/op   104.72 MB/s        40 B/op        2 allocs/op
+BenchmarkDecoder_DecodeAll/z000006.zst-8               500     2316009 ns/op   206.06 MB/s        40 B/op        2 allocs/op
+BenchmarkDecoder_DecodeAll/z000007.zst-8             20000       64499 ns/op   344.90 MB/s        40 B/op        2 allocs/op
+BenchmarkDecoder_DecodeAll/z000008.zst-8             50000       24900 ns/op   219.56 MB/s        40 B/op        2 allocs/op
+BenchmarkDecoder_DecodeAll/z000009.zst-8              1000     2348999 ns/op   154.01 MB/s        40 B/op        2 allocs/op
+
+BenchmarkDecoder_DecodeAllCgo/z000000.zst-8            500     4268005 ns/op   228.38 MB/s   1228849 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllCgo/z000001.zst-8         100000       15250 ns/op   125.05 MB/s      2096 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllCgo/z000002.zst-8          10000      147399 ns/op   478.49 MB/s     73776 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllCgo/z000003.zst-8           5000      320798 ns/op   319.27 MB/s    139312 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllCgo/z000004.zst-8         200000       10004 ns/op    50.77 MB/s       560 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllCgo/z000005.zst-8          20000       73599 ns/op   195.64 MB/s     19120 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllCgo/z000006.zst-8           1000     1119003 ns/op   426.48 MB/s    557104 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllCgo/z000007.zst-8          20000      103450 ns/op   215.04 MB/s     71296 B/op        9 allocs/op
+BenchmarkDecoder_DecodeAllCgo/z000008.zst-8         100000       20130 ns/op   271.58 MB/s      6192 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllCgo/z000009.zst-8           2000     1123500 ns/op   322.00 MB/s    368688 B/op        3 allocs/op
+```
+
+This reflects the performance around May 2019, but this may be out of date.
+
+# Contributions
+
+Contributions are always welcome. 
+For new features/fixes, remember to add tests and for performance enhancements include benchmarks.
+
+For sending files for reproducing errors use a service like [goobox](https://goobox.io/#/upload) or similar to share your files.
+
+For general feedback and experience reports, feel free to open an issue or write me on [Twitter](https://twitter.com/sh0dan).
+
+This package includes the excellent [`github.com/cespare/xxhash`](https://github.com/cespare/xxhash) package Copyright (c) 2016 Caleb Spare.
diff --git a/vendor/github.com/klauspost/compress/zstd/bitreader.go b/vendor/github.com/klauspost/compress/zstd/bitreader.go
new file mode 100644
index 00000000000..15d79d439fa
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/bitreader.go
@@ -0,0 +1,121 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"errors"
+	"io"
+	"math/bits"
+)
+
+// bitReader reads a bitstream in reverse.
+// The last set bit indicates the start of the stream and is used
+// for aligning the input.
+type bitReader struct {
+	in       []byte
+	off      uint   // next byte to read is at in[off - 1]
+	value    uint64 // Maybe use [16]byte, but shifting is awkward.
+	bitsRead uint8
+}
+
+// init resets the bit reader to read in backwards; it errors if in is empty or lacks the end mark.
+func (b *bitReader) init(in []byte) error {
+	if len(in) < 1 {
+		return errors.New("corrupt stream: too short")
+	}
+	b.in = in
+	b.off = uint(len(in)) // reading starts at the end of the input.
+	// The highest set bit of the last byte indicates where to start.
+	v := in[len(in)-1]
+	if v == 0 {
+		return errors.New("corrupt stream, did not find end of stream")
+	}
+	b.bitsRead = 64 // mark the buffer as empty so that fill() loads data.
+	b.value = 0
+	b.fill() // two fills load up to 64 bits.
+	b.fill()
+	b.bitsRead += 8 - uint8(highBits(uint32(v))) // skip the zero padding and the end mark itself.
+	return nil
+}
+
+// getBits will return n bits. n can be 0.
+func (b *bitReader) getBits(n uint8) int {
+	if n == 0 /*|| b.bitsRead >= 64 */ {
+		return 0
+	}
+	return b.getBitsFast(n)
+}
+
+// getBitsFast requires that at least one bit is requested every time.
+// There are no checks if the buffer is filled.
+func (b *bitReader) getBitsFast(n uint8) int {
+	const regMask = 64 - 1
+	v := uint32((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
+	b.bitsRead += n
+	return int(v)
+}
+
+// fillFast() will make sure at least 32 bits are available.
+// There must be at least 4 bytes available.
+func (b *bitReader) fillFast() {
+	if b.bitsRead < 32 {
+		return
+	}
+	// Do single re-slice to avoid bounds checks.
+	v := b.in[b.off-4 : b.off]
+	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+	b.value = (b.value << 32) | uint64(low)
+	b.bitsRead -= 32
+	b.off -= 4
+}
+
+// fill will make sure at least 32 bits are available, as long as input remains.
+func (b *bitReader) fill() {
+	if b.bitsRead < 32 { // more than 32 unread bits are already buffered.
+		return
+	}
+	if b.off >= 4 { // fast path: load the next 4 bytes as one little-endian word.
+		v := b.in[b.off-4 : b.off]
+		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
+		b.value = (b.value << 32) | uint64(low)
+		b.bitsRead -= 32
+		b.off -= 4
+		return
+	}
+	for b.off > 0 { // fewer than 4 bytes left: load them one at a time.
+		b.value = (b.value << 8) | uint64(b.in[b.off-1])
+		b.bitsRead -= 8
+		b.off--
+	}
+}
+
+// finished returns true if all bits have been read from the bit stream.
+func (b *bitReader) finished() bool {
+	return b.off == 0 && b.bitsRead >= 64
+}
+
+// overread returns true if more bits have been requested than is on the stream.
+func (b *bitReader) overread() bool {
+	return b.bitsRead > 64
+}
+
+// remain returns the number of bits remaining.
+func (b *bitReader) remain() uint {
+	return b.off*8 + 64 - uint(b.bitsRead)
+}
+
+// close releases the input and returns io.ErrUnexpectedEOF if more bits were requested than the stream contained.
+func (b *bitReader) close() error {
+	// Release reference.
+	b.in = nil
+	if b.bitsRead > 64 {
+		return io.ErrUnexpectedEOF
+	}
+	return nil
+}
+
+func highBits(val uint32) (n uint32) {
+	return uint32(bits.Len32(val) - 1)
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/bitwriter.go b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
new file mode 100644
index 00000000000..303ae90f944
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/bitwriter.go
@@ -0,0 +1,169 @@
+// Copyright 2018 Klaus Post. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
+
+package zstd
+
+import "fmt"
+
+// bitWriter will write bits.
+// First bit will be LSB of the first byte of output.
+type bitWriter struct {
+	bitContainer uint64
+	nBits        uint8
+	out          []byte
+}
+
+// bitMask16 holds bitmasks for up to 16 bits. It has extra entries to avoid bounds checks.
+var bitMask16 = [32]uint16{
+	0, 1, 3, 7, 0xF, 0x1F,
+	0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
+	0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF,
+	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+	0xFFFF, 0xFFFF} /* up to 16 bits */
+
+var bitMask32 = [32]uint32{
+	0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
+	0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF,
+	0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF,
+	0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF,
+} // up to 32 bits
+
+// addBits16NC will add up to 16 bits.
+// It will not check if there is space for them,
+// so the caller must ensure that it has flushed recently.
+func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
+	b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
+	b.nBits += bits
+}
+
+// addBits32NC will add up to 32 bits.
+// It will not check if there is space for them,
+// so the caller must ensure that it has flushed recently.
+func (b *bitWriter) addBits32NC(value uint32, bits uint8) {
+	b.bitContainer |= uint64(value&bitMask32[bits&31]) << (b.nBits & 63)
+	b.nBits += bits
+}
+
+// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
+// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
+func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
+	b.bitContainer |= uint64(value) << (b.nBits & 63)
+	b.nBits += bits
+}
+
+// flush will flush all pending full bytes.
+// There will be at least 56 bits available for writing when this has been called.
+// Using flush32 is faster, but leaves less space for writing.
+func (b *bitWriter) flush() {
+	v := b.nBits >> 3
+	switch v {
+	case 0:
+	case 1:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+		)
+	case 2:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+		)
+	case 3:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+		)
+	case 4:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+		)
+	case 5:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+		)
+	case 6:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+			byte(b.bitContainer>>40),
+		)
+	case 7:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+			byte(b.bitContainer>>40),
+			byte(b.bitContainer>>48),
+		)
+	case 8:
+		b.out = append(b.out,
+			byte(b.bitContainer),
+			byte(b.bitContainer>>8),
+			byte(b.bitContainer>>16),
+			byte(b.bitContainer>>24),
+			byte(b.bitContainer>>32),
+			byte(b.bitContainer>>40),
+			byte(b.bitContainer>>48),
+			byte(b.bitContainer>>56),
+		)
+	default:
+		panic(fmt.Errorf("bits (%d) > 64", b.nBits))
+	}
+	b.bitContainer >>= v << 3
+	b.nBits &= 7
+}
+
+// flush32 will flush out, so there are at least 32 bits available for writing.
+func (b *bitWriter) flush32() {
+	if b.nBits < 32 {
+		return
+	}
+	b.out = append(b.out,
+		byte(b.bitContainer),
+		byte(b.bitContainer>>8),
+		byte(b.bitContainer>>16),
+		byte(b.bitContainer>>24))
+	b.nBits -= 32
+	b.bitContainer >>= 32
+}
+
+// flushAlign will flush remaining full bytes and align to next byte boundary.
+func (b *bitWriter) flushAlign() {
+	nbBytes := (b.nBits + 7) >> 3
+	for i := uint8(0); i < nbBytes; i++ {
+		b.out = append(b.out, byte(b.bitContainer>>(i*8)))
+	}
+	b.nBits = 0
+	b.bitContainer = 0
+}
+
+// close will write the alignment bit and write the final byte(s)
+// to the output.
+func (b *bitWriter) close() error {
+	// End mark
+	b.addBits16Clean(1, 1)
+	// flush until next byte.
+	b.flushAlign()
+	return nil
+}
+
+// reset and continue writing by appending to out.
+func (b *bitWriter) reset(out []byte) {
+	b.bitContainer = 0
+	b.nBits = 0
+	b.out = out
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/blockdec.go b/vendor/github.com/klauspost/compress/zstd/blockdec.go
new file mode 100644
index 00000000000..c2f855e75be
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/blockdec.go
@@ -0,0 +1,731 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"sync"
+
+	"github.com/klauspost/compress/huff0"
+	"github.com/klauspost/compress/zstd/internal/xxhash"
+)
+
+type blockType uint8
+
+//go:generate stringer -type=blockType,literalsBlockType,seqCompMode,tableIndex
+
+const (
+	blockTypeRaw blockType = iota
+	blockTypeRLE
+	blockTypeCompressed
+	blockTypeReserved
+)
+
+type literalsBlockType uint8
+
+const (
+	literalsBlockRaw literalsBlockType = iota
+	literalsBlockRLE
+	literalsBlockCompressed
+	literalsBlockTreeless
+)
+
+const (
+	// maxCompressedBlockSize is the biggest allowed compressed block size (128KB)
+	maxCompressedBlockSize = 128 << 10
+
+	// Maximum possible block size (all Raw+Uncompressed).
+	maxBlockSize = (1 << 21) - 1
+
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#literals_section_header
+	maxCompressedLiteralSize = 1 << 18
+	maxRLELiteralSize        = 1 << 20
+	maxMatchLen              = 131074
+	maxSequences             = 0x7f00 + 0xffff
+
+	// We support slightly less than the reference decoder to be able to
+	// use ints on 32 bit archs.
+	maxOffsetBits = 30
+)
+
+var (
+	huffDecoderPool = sync.Pool{New: func() interface{} {
+		return &huff0.Scratch{}
+	}}
+
+	fseDecoderPool = sync.Pool{New: func() interface{} {
+		return &fseDecoder{}
+	}}
+)
+
+type blockDec struct {
+	// Raw source data of the block.
+	data        []byte
+	dataStorage []byte
+
+	// Destination of the decoded data.
+	dst []byte
+
+	// Buffer for literals data.
+	literalBuf []byte
+
+	// Window size of the block.
+	WindowSize uint64
+
+	history     chan *history
+	input       chan struct{}
+	result      chan decodeOutput
+	sequenceBuf []seq
+	err         error
+	decWG       sync.WaitGroup
+
+	// Block is RLE, this is the size.
+	RLESize uint32
+	tmp     [4]byte
+
+	Type blockType
+
+	// Is this the last block of a frame?
+	Last bool
+
+	// Use less memory
+	lowMem bool
+}
+
+func (b *blockDec) String() string {
+	if b == nil {
+		return "<nil>" // A nil block is still printable.
+	}
+	return fmt.Sprintf("Stream Size: %d, Type: %v, Last: %t, Window: %d", len(b.data), b.Type, b.Last, b.WindowSize)
+}
+
+func newBlockDec(lowMem bool) *blockDec {
+	b := blockDec{
+		lowMem:  lowMem,
+		result:  make(chan decodeOutput, 1),
+		input:   make(chan struct{}, 1),
+		history: make(chan *history, 1),
+	}
+	b.decWG.Add(1)
+	go b.startDecoder()
+	return &b
+}
+
+// reset will reset the block.
+// Input must be a start of a block and will be at the end of the block when returned.
+func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
+	b.WindowSize = windowSize
+	tmp := br.readSmall(3)
+	if tmp == nil {
+		if debug {
+			println("Reading block header:", io.ErrUnexpectedEOF)
+		}
+		return io.ErrUnexpectedEOF
+	}
+	bh := uint32(tmp[0]) | (uint32(tmp[1]) << 8) | (uint32(tmp[2]) << 16)
+	b.Last = bh&1 != 0
+	b.Type = blockType((bh >> 1) & 3)
+	// find size.
+	cSize := int(bh >> 3)
+	maxSize := maxBlockSize
+	switch b.Type {
+	case blockTypeReserved:
+		return ErrReservedBlockType
+	case blockTypeRLE:
+		b.RLESize = uint32(cSize)
+		if b.lowMem {
+			maxSize = cSize
+		}
+		cSize = 1
+	case blockTypeCompressed:
+		if debug {
+			println("Data size on stream:", cSize)
+		}
+		b.RLESize = 0
+		maxSize = maxCompressedBlockSize
+		if windowSize < maxCompressedBlockSize && b.lowMem {
+			maxSize = int(windowSize)
+		}
+		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
+			if debug {
+				printf("compressed block too big: csize:%d block: %+v\n", uint64(cSize), b)
+			}
+			return ErrCompressedSizeTooBig
+		}
+	default:
+		b.RLESize = 0
+	}
+
+	// Read block data.
+	if cap(b.dataStorage) < cSize {
+		if b.lowMem {
+			b.dataStorage = make([]byte, 0, cSize)
+		} else {
+			b.dataStorage = make([]byte, 0, maxBlockSize)
+		}
+	}
+	if cap(b.dst) <= maxSize {
+		b.dst = make([]byte, 0, maxSize+1)
+	}
+	var err error
+	b.data, err = br.readBig(cSize, b.dataStorage)
+	if err != nil {
+		if debug {
+			println("Reading block:", err, "(", cSize, ")", len(b.data))
+			printf("%T", br)
+		}
+		return err
+	}
+	return nil
+}
+
+// sendErr will make the decoder return err as the result of this block.
+func (b *blockDec) sendErr(err error) {
+	b.Last = true
+	b.Type = blockTypeReserved // the reserved type is used for returning errors.
+	b.err = err
+	b.input <- struct{}{} // signal the decoder goroutine that the block is ready.
+}
+
+// Close will release resources.
+// Closed blockDec cannot be reset.
+func (b *blockDec) Close() {
+	close(b.input)
+	close(b.history)
+	close(b.result)
+	b.decWG.Wait()
+}
+
+// startDecoder will decode the block each time it receives a signal on input.
+// Output is sent on the result channel, separately from the history.
+func (b *blockDec) startDecoder() {
+	defer b.decWG.Done()
+	for range b.input {
+		//println("blockDec: Got block input")
+		switch b.Type {
+		case blockTypeRLE:
+			if cap(b.dst) < int(b.RLESize) {
+				if b.lowMem {
+					b.dst = make([]byte, b.RLESize)
+				} else {
+					b.dst = make([]byte, maxBlockSize)
+				}
+			}
+			o := decodeOutput{
+				d:   b,
+				b:   b.dst[:b.RLESize],
+				err: nil,
+			}
+			v := b.data[0]
+			for i := range o.b {
+				o.b[i] = v
+			}
+			hist := <-b.history
+			hist.append(o.b)
+			b.result <- o
+		case blockTypeRaw:
+			o := decodeOutput{
+				d:   b,
+				b:   b.data,
+				err: nil,
+			}
+			hist := <-b.history
+			hist.append(o.b)
+			b.result <- o
+		case blockTypeCompressed:
+			b.dst = b.dst[:0]
+			err := b.decodeCompressed(nil)
+			o := decodeOutput{
+				d:   b,
+				b:   b.dst,
+				err: err,
+			}
+			if debug {
+				println("Decompressed to", len(b.dst), "bytes, error:", err)
+			}
+			b.result <- o
+		case blockTypeReserved:
+			// Used for returning errors.
+			<-b.history
+			b.result <- decodeOutput{
+				d:   b,
+				b:   nil,
+				err: b.err,
+			}
+		default:
+			panic("Invalid block type")
+		}
+		if debug {
+			println("blockDec: Finished block")
+		}
+	}
+}
+
+// decodeBuf will decode the block synchronously using the supplied history.
+// The history is never fetched from the channel; it must be provided by the caller.
+func (b *blockDec) decodeBuf(hist *history) error {
+	switch b.Type {
+	case blockTypeRLE:
+		if cap(b.dst) < int(b.RLESize) {
+			if b.lowMem {
+				b.dst = make([]byte, b.RLESize)
+			} else {
+				b.dst = make([]byte, maxBlockSize)
+			}
+		}
+		b.dst = b.dst[:b.RLESize]
+		v := b.data[0]
+		for i := range b.dst {
+			b.dst[i] = v
+		}
+		hist.appendKeep(b.dst)
+		return nil
+	case blockTypeRaw:
+		hist.appendKeep(b.data)
+		return nil
+	case blockTypeCompressed:
+		saved := b.dst
+		b.dst = hist.b
+		hist.b = nil
+		err := b.decodeCompressed(hist)
+		if debug {
+			println("Decompressed to total", len(b.dst), "bytes, hash:", xxhash.Sum64(b.dst), "error:", err)
+		}
+		hist.b = b.dst
+		b.dst = saved
+		return err
+	case blockTypeReserved:
+		// Used for returning errors.
+		return b.err
+	default:
+		panic("Invalid block type")
+	}
+}
+
+// decodeCompressed will start decompressing a block.
+// If no history is supplied the decoder will decode as much as possible
+// before fetching from blockDec.history
+func (b *blockDec) decodeCompressed(hist *history) error {
+	in := b.data
+	delayedHistory := hist == nil
+
+	if delayedHistory {
+		// We must always grab history.
+		defer func() {
+			if hist == nil {
+				<-b.history
+			}
+		}()
+	}
+	// There must be at least one byte for Literals_Block_Type and one for Sequences_Section_Header
+	if len(in) < 2 {
+		return ErrBlockTooSmall
+	}
+	litType := literalsBlockType(in[0] & 3)
+	var litRegenSize int
+	var litCompSize int
+	sizeFormat := (in[0] >> 2) & 3
+	var fourStreams bool
+	switch litType {
+	case literalsBlockRaw, literalsBlockRLE:
+		switch sizeFormat {
+		case 0, 2:
+			// Regenerated_Size uses 5 bits (0-31). Literals_Section_Header uses 1 byte.
+			litRegenSize = int(in[0] >> 3)
+			in = in[1:]
+		case 1:
+			// Regenerated_Size uses 12 bits (0-4095). Literals_Section_Header uses 2 bytes.
+			litRegenSize = int(in[0]>>4) + (int(in[1]) << 4)
+			in = in[2:]
+		case 3:
+			//  Regenerated_Size uses 20 bits (0-1048575). Literals_Section_Header uses 3 bytes.
+			if len(in) < 3 {
+				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
+				return ErrBlockTooSmall
+			}
+			litRegenSize = int(in[0]>>4) + (int(in[1]) << 4) + (int(in[2]) << 12)
+			in = in[3:]
+		}
+	case literalsBlockCompressed, literalsBlockTreeless:
+		switch sizeFormat {
+		case 0, 1:
+			// Both Regenerated_Size and Compressed_Size use 10 bits (0-1023).
+			if len(in) < 3 {
+				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
+				return ErrBlockTooSmall
+			}
+			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12)
+			litRegenSize = int(n & 1023)
+			litCompSize = int(n >> 10)
+			fourStreams = sizeFormat == 1
+			in = in[3:]
+		case 2:
+			fourStreams = true
+			if len(in) < 4 {
+				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
+				return ErrBlockTooSmall
+			}
+			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20)
+			litRegenSize = int(n & 16383)
+			litCompSize = int(n >> 14)
+			in = in[4:]
+		case 3:
+			fourStreams = true
+			if len(in) < 5 {
+				println("too small: litType:", litType, " sizeFormat", sizeFormat, len(in))
+				return ErrBlockTooSmall
+			}
+			n := uint64(in[0]>>4) + (uint64(in[1]) << 4) + (uint64(in[2]) << 12) + (uint64(in[3]) << 20) + (uint64(in[4]) << 28)
+			litRegenSize = int(n & 262143)
+			litCompSize = int(n >> 18)
+			in = in[5:]
+		}
+	}
+	if debug {
+		println("literals type:", litType, "litRegenSize:", litRegenSize, "litCompSize:", litCompSize, "sizeFormat:", sizeFormat, "4X:", fourStreams)
+	}
+	var literals []byte
+	var huff *huff0.Scratch
+	switch litType {
+	case literalsBlockRaw:
+		if len(in) < litRegenSize {
+			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litRegenSize)
+			return ErrBlockTooSmall
+		}
+		literals = in[:litRegenSize]
+		in = in[litRegenSize:]
+		//printf("Found %d uncompressed literals\n", litRegenSize)
+	case literalsBlockRLE:
+		if len(in) < 1 {
+			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", 1)
+			return ErrBlockTooSmall
+		}
+		if cap(b.literalBuf) < litRegenSize {
+			if b.lowMem {
+				b.literalBuf = make([]byte, litRegenSize)
+			} else {
+				if litRegenSize > maxCompressedLiteralSize {
+					// Exceptional
+					b.literalBuf = make([]byte, litRegenSize)
+				} else {
+					b.literalBuf = make([]byte, litRegenSize, maxCompressedLiteralSize)
+
+				}
+			}
+		}
+		literals = b.literalBuf[:litRegenSize]
+		v := in[0]
+		for i := range literals {
+			literals[i] = v
+		}
+		in = in[1:]
+		if debug {
+			printf("Found %d RLE compressed literals\n", litRegenSize)
+		}
+	case literalsBlockTreeless:
+		if len(in) < litCompSize {
+			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
+			return ErrBlockTooSmall
+		}
+		// Store compressed literals, so we defer decoding until we get history.
+		literals = in[:litCompSize]
+		in = in[litCompSize:]
+		if debug {
+			printf("Found %d compressed literals\n", litCompSize)
+		}
+	case literalsBlockCompressed:
+		if len(in) < litCompSize {
+			println("too small: litType:", litType, " sizeFormat", sizeFormat, "remain:", len(in), "want:", litCompSize)
+			return ErrBlockTooSmall
+		}
+		literals = in[:litCompSize]
+		in = in[litCompSize:]
+		huff = huffDecoderPool.Get().(*huff0.Scratch)
+		var err error
+		// Ensure we have space to store it.
+		if cap(b.literalBuf) < litRegenSize {
+			if b.lowMem {
+				b.literalBuf = make([]byte, 0, litRegenSize)
+			} else {
+				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+			}
+		}
+		if huff == nil {
+			huff = &huff0.Scratch{}
+		}
+		huff.Out = b.literalBuf[:0]
+		huff, literals, err = huff0.ReadTable(literals, huff)
+		if err != nil {
+			println("reading huffman table:", err)
+			return err
+		}
+		// Use our out buffer.
+		huff.Out = b.literalBuf[:0]
+		huff.MaxDecodedSize = litRegenSize
+		if fourStreams {
+			literals, err = huff.Decompress4X(literals, litRegenSize)
+		} else {
+			literals, err = huff.Decompress1X(literals)
+		}
+		if err != nil {
+			println("decoding compressed literals:", err)
+			return err
+		}
+		// Make sure we don't leak our literals buffer
+		huff.Out = nil
+		if len(literals) != litRegenSize {
+			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+		}
+		if debug {
+			printf("Decompressed %d literals into %d bytes\n", litCompSize, litRegenSize)
+		}
+	}
+
+	// Decode Sequences
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequences-section
+	if len(in) < 1 {
+		return ErrBlockTooSmall
+	}
+	seqHeader := in[0]
+	nSeqs := 0
+	switch {
+	case seqHeader == 0:
+		in = in[1:]
+	case seqHeader < 128:
+		nSeqs = int(seqHeader)
+		in = in[1:]
+	case seqHeader < 255:
+		if len(in) < 2 {
+			return ErrBlockTooSmall
+		}
+		nSeqs = int(seqHeader-128)<<8 | int(in[1])
+		in = in[2:]
+	case seqHeader == 255:
+		if len(in) < 3 {
+			return ErrBlockTooSmall
+		}
+		nSeqs = 0x7f00 + int(in[1]) + (int(in[2]) << 8)
+		in = in[3:]
+	}
+	// Allocate sequences
+	if cap(b.sequenceBuf) < nSeqs {
+		if b.lowMem {
+			b.sequenceBuf = make([]seq, nSeqs)
+		} else {
+			// Allocate max
+			b.sequenceBuf = make([]seq, nSeqs, maxSequences)
+		}
+	} else {
+		// Reuse buffer
+		b.sequenceBuf = b.sequenceBuf[:nSeqs]
+	}
+	var seqs = &sequenceDecs{}
+	if nSeqs > 0 {
+		if len(in) < 1 {
+			return ErrBlockTooSmall
+		}
+		br := byteReader{b: in, off: 0}
+		compMode := br.Uint8()
+		br.advance(1)
+		if debug {
+			printf("Compression modes: 0b%b", compMode)
+		}
+		for i := uint(0); i < 3; i++ {
+			mode := seqCompMode((compMode >> (6 - i*2)) & 3)
+			if debug {
+				println("Table", tableIndex(i), "is", mode)
+			}
+			var seq *sequenceDec
+			switch tableIndex(i) {
+			case tableLiteralLengths:
+				seq = &seqs.litLengths
+			case tableOffsets:
+				seq = &seqs.offsets
+			case tableMatchLengths:
+				seq = &seqs.matchLengths
+			default:
+				panic("unknown table")
+			}
+			switch mode {
+			case compModePredefined:
+				seq.fse = &fsePredef[i]
+			case compModeRLE:
+				if br.remain() < 1 {
+					return ErrBlockTooSmall
+				}
+				v := br.Uint8()
+				br.advance(1)
+				dec := fseDecoderPool.Get().(*fseDecoder)
+				symb, err := decSymbolValue(v, symbolTableX[i])
+				if err != nil {
+					printf("RLE Transform table (%v) error: %v", tableIndex(i), err)
+					return err
+				}
+				dec.setRLE(symb)
+				seq.fse = dec
+				if debug {
+					printf("RLE set to %+v, code: %v", symb, v)
+				}
+			case compModeFSE:
+				println("Reading table for", tableIndex(i))
+				dec := fseDecoderPool.Get().(*fseDecoder)
+				err := dec.readNCount(&br, uint16(maxTableSymbol[i]))
+				if err != nil {
+					println("Read table error:", err)
+					return err
+				}
+				err = dec.transform(symbolTableX[i])
+				if err != nil {
+					println("Transform table error:", err)
+					return err
+				}
+				if debug {
+					println("Read table ok", "symbolLen:", dec.symbolLen)
+				}
+				seq.fse = dec
+			case compModeRepeat:
+				seq.repeat = true
+			}
+			if br.overread() {
+				return io.ErrUnexpectedEOF
+			}
+		}
+		in = br.unread()
+	}
+
+	// Wait for history.
+	// All time spent after this is critical since it is strictly sequential.
+	if hist == nil {
+		hist = <-b.history
+		if hist.error {
+			return ErrDecoderClosed
+		}
+	}
+
+	// Decode treeless literal block.
+	if litType == literalsBlockTreeless {
+		// TODO: We could send the history early WITHOUT the stream history.
+		//   This would allow decoding treeless literals before the byte history is available.
+		//   Silesia stats: Treeless 4393, with: 32775, total: 37168, 11% treeless.
+		//   So not much obvious gain here.
+
+		if hist.huffTree == nil {
+			return errors.New("literal block was treeless, but no history was defined")
+		}
+		// Ensure we have space to store it.
+		if cap(b.literalBuf) < litRegenSize {
+			if b.lowMem {
+				b.literalBuf = make([]byte, 0, litRegenSize)
+			} else {
+				b.literalBuf = make([]byte, 0, maxCompressedLiteralSize)
+			}
+		}
+		var err error
+		// Use our out buffer.
+		huff = hist.huffTree
+		huff.Out = b.literalBuf[:0]
+		huff.MaxDecodedSize = litRegenSize
+		if fourStreams {
+			literals, err = huff.Decompress4X(literals, litRegenSize)
+		} else {
+			literals, err = huff.Decompress1X(literals)
+		}
+		// Make sure we don't leak our literals buffer
+		huff.Out = nil
+		if err != nil {
+			println("decompressing literals:", err)
+			return err
+		}
+		if len(literals) != litRegenSize {
+			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
+		}
+	} else {
+		if hist.huffTree != nil && huff != nil {
+			huffDecoderPool.Put(hist.huffTree)
+			hist.huffTree = nil
+		}
+	}
+	if huff != nil {
+		huff.Out = nil
+		hist.huffTree = huff
+	}
+	if debug {
+		println("Final literals:", len(literals), "hash:", xxhash.Sum64(literals), "and", nSeqs, "sequences.")
+	}
+
+	if nSeqs == 0 {
+		// Decompressed content is defined entirely as Literals Section content.
+		b.dst = append(b.dst, literals...)
+		if delayedHistory {
+			hist.append(literals)
+		}
+		return nil
+	}
+
+	seqs, err := seqs.mergeHistory(&hist.decoders)
+	if err != nil {
+		return err
+	}
+	if debug {
+		println("History merged ok")
+	}
+	br := &bitReader{}
+	if err := br.init(in); err != nil {
+		return err
+	}
+
+	// TODO: Investigate if sending history without decoders is faster.
+	//   This would allow the sequences to be decoded async and only have to construct stream history.
+	//   If only recent offsets were not transferred, this would be an obvious win.
+	// 	 Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded.
+
+	if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
+		println("initializing sequences:", err)
+		return err
+	}
+	hbytes := hist.b
+	if len(hbytes) > hist.windowSize {
+		hbytes = hbytes[len(hbytes)-hist.windowSize:]
+	}
+	err = seqs.decode(nSeqs, br, hbytes)
+	if err != nil {
+		return err
+	}
+	if !br.finished() {
+		return fmt.Errorf("%d extra bits on block, should be 0", br.remain())
+	}
+
+	err = br.close()
+	if err != nil {
+		printf("Closing sequences: %v, %+v\n", err, *br)
+	}
+	if len(b.data) > maxCompressedBlockSize {
+		return fmt.Errorf("compressed block size too large (%d)", len(b.data))
+	}
+	// Set output and release references.
+	b.dst = seqs.out
+	seqs.out, seqs.literals, seqs.hist = nil, nil, nil
+
+	if !delayedHistory {
+		// If we don't have delayed history, no need to update.
+		hist.recentOffsets = seqs.prevOffset
+		return nil
+	}
+	if b.Last {
+		// if last block we don't care about history.
+		println("Last block, no history returned")
+		hist.b = hist.b[:0]
+		return nil
+	}
+	hist.append(b.dst)
+	hist.recentOffsets = seqs.prevOffset
+	if debug {
+		println("Finished block with literals:", len(literals), "and", nSeqs, "sequences.")
+	}
+
+	return nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/blockenc.go b/vendor/github.com/klauspost/compress/zstd/blockenc.go
new file mode 100644
index 00000000000..4f0eba22f08
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/blockenc.go
@@ -0,0 +1,837 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"math/bits"
+
+	"github.com/klauspost/compress/huff0"
+)
+
+type blockEnc struct {
+	size      int
+	literals  []byte
+	sequences []seq
+	coders    seqCoders
+	litEnc    *huff0.Scratch
+	wr        bitWriter
+
+	extraLits int
+	last      bool
+
+	output            []byte
+	recentOffsets     [3]uint32
+	prevRecentOffsets [3]uint32
+}
+
+// init should be used once the block has been created.
+// If called more than once, the effect is the same as calling reset.
+func (b *blockEnc) init() {
+	if cap(b.literals) < maxCompressedLiteralSize {
+		b.literals = make([]byte, 0, maxCompressedLiteralSize)
+	}
+	const defSeqs = 200
+	b.literals = b.literals[:0]
+	if cap(b.sequences) < defSeqs {
+		b.sequences = make([]seq, 0, defSeqs)
+	}
+	if cap(b.output) < maxCompressedBlockSize {
+		b.output = make([]byte, 0, maxCompressedBlockSize)
+	}
+	if b.coders.mlEnc == nil {
+		b.coders.mlEnc = &fseEncoder{}
+		b.coders.mlPrev = &fseEncoder{}
+		b.coders.ofEnc = &fseEncoder{}
+		b.coders.ofPrev = &fseEncoder{}
+		b.coders.llEnc = &fseEncoder{}
+		b.coders.llPrev = &fseEncoder{}
+	}
+	b.litEnc = &huff0.Scratch{WantLogLess: 4}
+	b.reset(nil)
+}
+
+// initNewEncode can be used to reset offsets and encoders to the initial state.
+func (b *blockEnc) initNewEncode() {
+	b.recentOffsets = [3]uint32{1, 4, 8}
+	b.litEnc.Reuse = huff0.ReusePolicyNone
+	b.coders.setPrev(nil, nil, nil)
+}
+
+// reset will reset the block for a new encode, but in the same stream,
+// meaning that state will be carried over, but the block content is reset.
+// If a previous block is provided, the recent offsets are carried over.
+func (b *blockEnc) reset(prev *blockEnc) {
+	b.extraLits = 0
+	b.literals = b.literals[:0]
+	b.size = 0
+	b.sequences = b.sequences[:0]
+	b.output = b.output[:0]
+	b.last = false
+	if prev != nil {
+		b.recentOffsets = prev.prevRecentOffsets
+	}
+}
+
+// swapEncoders will swap the encoder state of this block with that of prev:
+// the sequence coders are exchanged through coders.swap and the literal
+// encoders (litEnc) trade places. prev must not be nil.
+func (b *blockEnc) swapEncoders(prev *blockEnc) {
+	b.coders.swap(&prev.coders)
+	b.litEnc, prev.litEnc = prev.litEnc, b.litEnc
+}
+
+// blockHeader contains the information for a block header.
+type blockHeader uint32
+
+// setLast sets the 'last' indicator on a block.
+func (h *blockHeader) setLast(b bool) {
+	if b {
+		*h = *h | 1
+	} else {
+		const mask = (1 << 24) - 2
+		*h = *h & mask
+	}
+}
+
+// setSize will store the compressed size of a block.
+func (h *blockHeader) setSize(v uint32) {
+	const mask = 7
+	*h = (*h)&mask | blockHeader(v<<3)
+}
+
+// setType sets the block type.
+func (h *blockHeader) setType(t blockType) {
+	const mask = 1 | (((1 << 24) - 1) ^ 7)
+	*h = (*h & mask) | blockHeader(t<<1)
+}
+
+// appendTo will append the block header to a slice.
+func (h blockHeader) appendTo(b []byte) []byte {
+	return append(b, uint8(h), uint8(h>>8), uint8(h>>16))
+}
+
+// String returns a string representation of the block.
+func (h blockHeader) String() string {
+	return fmt.Sprintf("Type: %d, Size: %d, Last:%t", (h>>1)&3, h>>3, h&1 == 1)
+}
+
+// literalsHeader contains literals header information.
+type literalsHeader uint64
+
+// setType can be used to set the type of literal block.
+func (h *literalsHeader) setType(t literalsBlockType) {
+	const mask = math.MaxUint64 - 3
+	*h = (*h & mask) | literalsHeader(t)
+}
+
+// setSize can be used to set a single size, for uncompressed and RLE content.
+func (h *literalsHeader) setSize(regenLen int) {
+	inBits := bits.Len32(uint32(regenLen))
+	// Only retain 2 bits
+	const mask = 3
+	lh := uint64(*h & mask)
+	switch {
+	case inBits < 5:
+		lh |= (uint64(regenLen) << 3) | (1 << 60)
+		if debug {
+			got := int(lh>>3) & 0xff
+			if got != regenLen {
+				panic(fmt.Sprint("litRegenSize = ", regenLen, "(want) != ", got, "(got)"))
+			}
+		}
+	case inBits < 12:
+		lh |= (1 << 2) | (uint64(regenLen) << 4) | (2 << 60)
+	case inBits < 20:
+		lh |= (3 << 2) | (uint64(regenLen) << 4) | (3 << 60)
+	default:
+		panic(fmt.Errorf("internal error: block too big (%d)", regenLen))
+	}
+	*h = literalsHeader(lh)
+}
+
+// setSizes will set the size of a compressed literals section and the input length.
+func (h *literalsHeader) setSizes(compLen, inLen int, single bool) {
+	compBits, inBits := bits.Len32(uint32(compLen)), bits.Len32(uint32(inLen))
+	// Only retain 2 bits
+	const mask = 3
+	lh := uint64(*h & mask)
+	switch {
+	case compBits <= 10 && inBits <= 10:
+		if !single {
+			lh |= 1 << 2
+		}
+		lh |= (uint64(inLen) << 4) | (uint64(compLen) << (10 + 4)) | (3 << 60)
+		if debug {
+			const mmask = (1 << 24) - 1
+			n := (lh >> 4) & mmask
+			if int(n&1023) != inLen {
+				panic(fmt.Sprint("regensize:", int(n&1023), "!=", inLen, inBits))
+			}
+			if int(n>>10) != compLen {
+				panic(fmt.Sprint("compsize:", int(n>>10), "!=", compLen, compBits))
+			}
+		}
+	case compBits <= 14 && inBits <= 14:
+		lh |= (2 << 2) | (uint64(inLen) << 4) | (uint64(compLen) << (14 + 4)) | (4 << 60)
+		if single {
+			panic("single stream used with more than 10 bits length.")
+		}
+	case compBits <= 18 && inBits <= 18:
+		lh |= (3 << 2) | (uint64(inLen) << 4) | (uint64(compLen) << (18 + 4)) | (5 << 60)
+		if single {
+			panic("single stream used with more than 10 bits length.")
+		}
+	default:
+		panic("internal error: block too big")
+	}
+	*h = literalsHeader(lh)
+}
+
+// appendTo will append the literals header to a byte slice.
+func (h literalsHeader) appendTo(b []byte) []byte {
+	size := uint8(h >> 60)
+	switch size {
+	case 1:
+		b = append(b, uint8(h))
+	case 2:
+		b = append(b, uint8(h), uint8(h>>8))
+	case 3:
+		b = append(b, uint8(h), uint8(h>>8), uint8(h>>16))
+	case 4:
+		b = append(b, uint8(h), uint8(h>>8), uint8(h>>16), uint8(h>>24))
+	case 5:
+		b = append(b, uint8(h), uint8(h>>8), uint8(h>>16), uint8(h>>24), uint8(h>>32))
+	default:
+		panic(fmt.Errorf("internal error: literalsHeader has invalid size (%d)", size))
+	}
+	return b
+}
+
+// size returns the output size with currently set values.
+func (h literalsHeader) size() int {
+	return int(h >> 60)
+}
+
+func (h literalsHeader) String() string {
+	return fmt.Sprintf("Type: %d, SizeFormat: %d, Size: %d, Bytes:%d", literalsBlockType(h&3), (h>>2)&3, h&((1<<60)-1)>>4, h>>60) // keep %d: %x or %v on a literalsHeader operand would call String again
+}
+
+// pushOffsets will push the recent offsets to the backup store.
+func (b *blockEnc) pushOffsets() {
+	b.prevRecentOffsets = b.recentOffsets
+}
+
+// popOffsets will restore the recent offsets from the backup store.
+func (b *blockEnc) popOffsets() {
+	b.recentOffsets = b.prevRecentOffsets
+}
+
+// matchOffset will adjust recent offsets and return the adjusted one,
+// if it matches a previous offset.
+func (b *blockEnc) matchOffset(offset, lits uint32) uint32 {
+	// Check if offset is one of the recent offsets.
+	// Adjusts the output offset accordingly.
+	// Gives a tiny bit of compression, typically around 1%.
+	if true {
+		if lits > 0 {
+			switch offset {
+			case b.recentOffsets[0]:
+				offset = 1
+			case b.recentOffsets[1]:
+				b.recentOffsets[1] = b.recentOffsets[0]
+				b.recentOffsets[0] = offset
+				offset = 2
+			case b.recentOffsets[2]:
+				b.recentOffsets[2] = b.recentOffsets[1]
+				b.recentOffsets[1] = b.recentOffsets[0]
+				b.recentOffsets[0] = offset
+				offset = 3
+			default:
+				b.recentOffsets[2] = b.recentOffsets[1]
+				b.recentOffsets[1] = b.recentOffsets[0]
+				b.recentOffsets[0] = offset
+				offset += 3
+			}
+		} else {
+			switch offset {
+			case b.recentOffsets[1]:
+				b.recentOffsets[1] = b.recentOffsets[0]
+				b.recentOffsets[0] = offset
+				offset = 1
+			case b.recentOffsets[2]:
+				b.recentOffsets[2] = b.recentOffsets[1]
+				b.recentOffsets[1] = b.recentOffsets[0]
+				b.recentOffsets[0] = offset
+				offset = 2
+			case b.recentOffsets[0] - 1:
+				b.recentOffsets[2] = b.recentOffsets[1]
+				b.recentOffsets[1] = b.recentOffsets[0]
+				b.recentOffsets[0] = offset
+				offset = 3
+			default:
+				b.recentOffsets[2] = b.recentOffsets[1]
+				b.recentOffsets[1] = b.recentOffsets[0]
+				b.recentOffsets[0] = offset
+				offset += 3
+			}
+		}
+	} else {
+		offset += 3
+	}
+	return offset
+}
+
+// encodeRaw can be used to set the output to a raw representation of supplied bytes.
+func (b *blockEnc) encodeRaw(a []byte) {
+	var bh blockHeader
+	bh.setLast(b.last)
+	bh.setSize(uint32(len(a)))
+	bh.setType(blockTypeRaw)
+	b.output = bh.appendTo(b.output[:0])
+	b.output = append(b.output, a...)
+	if debug {
+		println("Adding RAW block, length", len(a))
+	}
+}
+
+// encodeRawTo will append a raw block (block header followed by the bytes of src) to dst and return the extended slice; b.output is not touched.
+func (b *blockEnc) encodeRawTo(dst, src []byte) []byte {
+	var bh blockHeader
+	bh.setLast(b.last)
+	bh.setSize(uint32(len(src)))
+	bh.setType(blockTypeRaw)
+	dst = bh.appendTo(dst)
+	dst = append(dst, src...)
+	if debug {
+		println("Adding RAW block, length", len(src))
+	}
+	return dst
+}
+
+// encodeLits can be used if the block is only litLen.
+func (b *blockEnc) encodeLits(raw bool) error {
+	var bh blockHeader
+	bh.setLast(b.last)
+	bh.setSize(uint32(len(b.literals)))
+
+	// Don't compress extremely small blocks
+	if len(b.literals) < 32 || raw {
+		if debug {
+			println("Adding RAW block, length", len(b.literals))
+		}
+		bh.setType(blockTypeRaw)
+		b.output = bh.appendTo(b.output)
+		b.output = append(b.output, b.literals...)
+		return nil
+	}
+
+	var (
+		out            []byte
+		reUsed, single bool
+		err            error
+	)
+	if len(b.literals) >= 1024 {
+		// Use 4 Streams.
+		out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
+	} else if len(b.literals) > 32 {
+		// Use 1 stream
+		single = true
+		out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc)
+	} else {
+		err = huff0.ErrIncompressible
+	}
+
+	switch err {
+	case huff0.ErrIncompressible:
+		if debug {
+			println("Adding RAW block, length", len(b.literals))
+		}
+		bh.setType(blockTypeRaw)
+		b.output = bh.appendTo(b.output)
+		b.output = append(b.output, b.literals...)
+		return nil
+	case huff0.ErrUseRLE:
+		if debug {
+			println("Adding RLE block, length", len(b.literals))
+		}
+		bh.setType(blockTypeRLE)
+		b.output = bh.appendTo(b.output)
+		b.output = append(b.output, b.literals[0])
+		return nil
+	default:
+		return err
+	case nil:
+	}
+	// Compressed...
+	// Now, allow reuse
+	b.litEnc.Reuse = huff0.ReusePolicyAllow
+	bh.setType(blockTypeCompressed)
+	var lh literalsHeader
+	if reUsed {
+		if debug {
+			println("Reused tree, compressed to", len(out))
+		}
+		lh.setType(literalsBlockTreeless)
+	} else {
+		if debug {
+			println("New tree, compressed to", len(out), "tree size:", len(b.litEnc.OutTable))
+		}
+		lh.setType(literalsBlockCompressed)
+	}
+	// Set sizes
+	lh.setSizes(len(out), len(b.literals), single)
+	bh.setSize(uint32(len(out) + lh.size() + 1))
+
+	// Write block headers.
+	b.output = bh.appendTo(b.output)
+	b.output = lh.appendTo(b.output)
+	// Add compressed data.
+	b.output = append(b.output, out...)
+	// No sequences.
+	b.output = append(b.output, 0)
+	return nil
+}
+
+// fuzzFseEncoder can be used to fuzz the FSE encoder. data is masked to 6 bit symbols in place; returns 1 if a count table was written and 0 if the input was rejected.
+func fuzzFseEncoder(data []byte) int {
+	if len(data) > maxSequences || len(data) < 2 {
+		return 0
+	}
+	enc := fseEncoder{}
+	hist := enc.Histogram()[:256]
+	maxSym := uint8(0)
+	for i, v := range data {
+		v = v & 63
+		data[i] = v
+		hist[v]++
+		if v > maxSym {
+			maxSym = v
+		}
+	}
+	if maxSym == 0 {
+		// All 0
+		return 0
+	}
+	maxCount := func(a []uint32) int {
+		var max uint32
+		for _, v := range a {
+			if v > max {
+				max = v
+			}
+		}
+		return int(max)
+	}
+	cnt := maxCount(hist[:maxSym+1]) // include maxSym itself (at most 63, so +1 cannot overflow), as genCodes does
+	if cnt == len(data) {
+		// RLE
+		return 0
+	}
+	enc.HistogramFinished(maxSym, cnt)
+	err := enc.normalizeCount(len(data))
+	if err != nil {
+		return 0
+	}
+	_, err = enc.writeCount(nil)
+	if err != nil {
+		panic(err)
+	}
+	return 1
+}
+
+// encode will encode the block and append the output in b.output.
+func (b *blockEnc) encode(raw bool) error {
+	if len(b.sequences) == 0 {
+		return b.encodeLits(raw)
+	}
+	// We want some difference
+	if len(b.literals) > (b.size - (b.size >> 5)) {
+		return errIncompressible
+	}
+
+	var bh blockHeader
+	var lh literalsHeader
+	bh.setLast(b.last)
+	bh.setType(blockTypeCompressed)
+	// Store offset of the block header. Needed when we know the size.
+	bhOffset := len(b.output)
+	b.output = bh.appendTo(b.output)
+
+	var (
+		out            []byte
+		reUsed, single bool
+		err            error
+	)
+	if len(b.literals) >= 1024 && !raw {
+		// Use 4 Streams.
+		out, reUsed, err = huff0.Compress4X(b.literals, b.litEnc)
+	} else if len(b.literals) > 32 && !raw {
+		// Use 1 stream
+		single = true
+		out, reUsed, err = huff0.Compress1X(b.literals, b.litEnc)
+	} else {
+		err = huff0.ErrIncompressible
+	}
+
+	switch err {
+	case huff0.ErrIncompressible:
+		lh.setType(literalsBlockRaw)
+		lh.setSize(len(b.literals))
+		b.output = lh.appendTo(b.output)
+		b.output = append(b.output, b.literals...)
+		if debug {
+			println("Adding literals RAW, length", len(b.literals))
+		}
+	case huff0.ErrUseRLE:
+		lh.setType(literalsBlockRLE)
+		lh.setSize(len(b.literals))
+		b.output = lh.appendTo(b.output)
+		b.output = append(b.output, b.literals[0])
+		if debug {
+			println("Adding literals RLE")
+		}
+	default:
+		if debug {
+			println("Adding literals ERROR:", err)
+		}
+		return err
+	case nil:
+		// Compressed litLen...
+		if reUsed {
+			if debug {
+				println("reused tree")
+			}
+			lh.setType(literalsBlockTreeless)
+		} else {
+			if debug {
+				println("new tree, size:", len(b.litEnc.OutTable))
+			}
+			lh.setType(literalsBlockCompressed)
+			if debug {
+				_, _, err := huff0.ReadTable(out, nil)
+				if err != nil {
+					panic(err)
+				}
+			}
+		}
+		lh.setSizes(len(out), len(b.literals), single)
+		if debug {
+			printf("Compressed %d literals to %d bytes", len(b.literals), len(out))
+			println("Adding literal header:", lh)
+		}
+		b.output = lh.appendTo(b.output)
+		b.output = append(b.output, out...)
+		b.litEnc.Reuse = huff0.ReusePolicyAllow
+		if debug {
+			println("Adding literals compressed")
+		}
+	}
+	// Sequence compression
+
+	// Write the number of sequences
+	switch {
+	case len(b.sequences) < 128:
+		b.output = append(b.output, uint8(len(b.sequences)))
+	case len(b.sequences) < 0x7f00: // TODO: this could be wrong
+		n := len(b.sequences)
+		b.output = append(b.output, 128+uint8(n>>8), uint8(n))
+	default:
+		n := len(b.sequences) - 0x7f00
+		b.output = append(b.output, 255, uint8(n), uint8(n>>8))
+	}
+	if debug {
+		println("Encoding", len(b.sequences), "sequences")
+	}
+	b.genCodes()
+	llEnc := b.coders.llEnc
+	ofEnc := b.coders.ofEnc
+	mlEnc := b.coders.mlEnc
+	err = llEnc.normalizeCount(len(b.sequences))
+	if err != nil {
+		return err
+	}
+	err = ofEnc.normalizeCount(len(b.sequences))
+	if err != nil {
+		return err
+	}
+	err = mlEnc.normalizeCount(len(b.sequences))
+	if err != nil {
+		return err
+	}
+
+	// Choose the best compression mode for each type.
+	// Will evaluate the new vs predefined and previous.
+	chooseComp := func(cur, prev, preDef *fseEncoder) (*fseEncoder, seqCompMode) {
+		// See if predefined/previous is better
+		hist := cur.count[:cur.symbolLen]
+		nSize := cur.approxSize(hist) + cur.maxHeaderSize()
+		predefSize := preDef.approxSize(hist)
+		prevSize := prev.approxSize(hist)
+
+		// Add a small penalty for new encoders.
+		// Don't bother with extremely small (<2 byte gains).
+		nSize = nSize + (nSize+2*8*16)>>4
+		switch {
+		case predefSize <= prevSize && predefSize <= nSize || forcePreDef:
+			if debug {
+				println("Using predefined", predefSize>>3, "<=", nSize>>3)
+			}
+			return preDef, compModePredefined
+		case prevSize <= nSize:
+			if debug {
+				println("Using previous", prevSize>>3, "<=", nSize>>3)
+			}
+			return prev, compModeRepeat
+		default:
+			if debug {
+				println("Using new, predef", predefSize>>3, ". previous:", prevSize>>3, ">", nSize>>3, "header max:", cur.maxHeaderSize()>>3, "bytes")
+				println("tl:", cur.actualTableLog, "symbolLen:", cur.symbolLen, "norm:", cur.norm[:cur.symbolLen], "hist", cur.count[:cur.symbolLen])
+			}
+			return cur, compModeFSE
+		}
+	}
+
+	// Write compression mode
+	var mode uint8
+	if llEnc.useRLE {
+		mode |= uint8(compModeRLE) << 6
+		llEnc.setRLE(b.sequences[0].llCode)
+		if debug {
+			println("llEnc.useRLE")
+		}
+	} else {
+		var m seqCompMode
+		llEnc, m = chooseComp(llEnc, b.coders.llPrev, &fsePredefEnc[tableLiteralLengths])
+		mode |= uint8(m) << 6
+	}
+	if ofEnc.useRLE {
+		mode |= uint8(compModeRLE) << 4
+		ofEnc.setRLE(b.sequences[0].ofCode)
+		if debug {
+			println("ofEnc.useRLE")
+		}
+	} else {
+		var m seqCompMode
+		ofEnc, m = chooseComp(ofEnc, b.coders.ofPrev, &fsePredefEnc[tableOffsets])
+		mode |= uint8(m) << 4
+	}
+
+	if mlEnc.useRLE {
+		mode |= uint8(compModeRLE) << 2
+		mlEnc.setRLE(b.sequences[0].mlCode)
+		if debug {
+			println("mlEnc.useRLE, code: ", b.sequences[0].mlCode, "value", b.sequences[0].matchLen)
+		}
+	} else {
+		var m seqCompMode
+		mlEnc, m = chooseComp(mlEnc, b.coders.mlPrev, &fsePredefEnc[tableMatchLengths])
+		mode |= uint8(m) << 2
+	}
+	b.output = append(b.output, mode)
+	if debug {
+		printf("Compression modes: 0b%b", mode)
+	}
+	b.output, err = llEnc.writeCount(b.output)
+	if err != nil {
+		return err
+	}
+	start := len(b.output)
+	b.output, err = ofEnc.writeCount(b.output)
+	if err != nil {
+		return err
+	}
+	if false {
+		println("block:", b.output[start:], "tablelog", ofEnc.actualTableLog, "maxcount:", ofEnc.maxCount)
+		fmt.Printf("selected TableLog: %d, Symbol length: %d\n", ofEnc.actualTableLog, ofEnc.symbolLen)
+		for i, v := range ofEnc.norm[:ofEnc.symbolLen] {
+			fmt.Printf("%3d: %5d -> %4d \n", i, ofEnc.count[i], v)
+		}
+	}
+	b.output, err = mlEnc.writeCount(b.output)
+	if err != nil {
+		return err
+	}
+
+	// Maybe in block?
+	wr := &b.wr
+	wr.reset(b.output)
+
+	var ll, of, ml cState
+
+	// Current sequence
+	seq := len(b.sequences) - 1
+	s := b.sequences[seq]
+	llEnc.setBits(llBitsTable[:])
+	mlEnc.setBits(mlBitsTable[:])
+	ofEnc.setBits(nil)
+
+	llTT, ofTT, mlTT := llEnc.ct.symbolTT[:256], ofEnc.ct.symbolTT[:256], mlEnc.ct.symbolTT[:256]
+
+	// We have 3 bounds checks here (and in the loop).
+	// Since we are iterating backwards it is kinda hard to avoid.
+	llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode]
+	ll.init(wr, &llEnc.ct, llB)
+	of.init(wr, &ofEnc.ct, ofB)
+	wr.flush32()
+	ml.init(wr, &mlEnc.ct, mlB)
+
+	// Each of these lookups also generates a bounds check.
+	wr.addBits32NC(s.litLen, llB.outBits)
+	wr.addBits32NC(s.matchLen, mlB.outBits)
+	wr.flush32()
+	wr.addBits32NC(s.offset, ofB.outBits)
+	if debugSequences {
+		println("Encoded seq", seq, s, "codes:", s.llCode, s.mlCode, s.ofCode, "states:", ll.state, ml.state, of.state, "bits:", llB, mlB, ofB)
+	}
+	seq--
+	if llEnc.maxBits+mlEnc.maxBits+ofEnc.maxBits <= 32 {
+		// No need to flush (common)
+		for seq >= 0 {
+			s = b.sequences[seq]
+			wr.flush32()
+			llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode]
+			// tabelog max is 8 for all.
+			of.encode(ofB)
+			ml.encode(mlB)
+			ll.encode(llB)
+			wr.flush32()
+
+			// We checked that all can stay within 32 bits
+			wr.addBits32NC(s.litLen, llB.outBits)
+			wr.addBits32NC(s.matchLen, mlB.outBits)
+			wr.addBits32NC(s.offset, ofB.outBits)
+
+			if debugSequences {
+				println("Encoded seq", seq, s)
+			}
+
+			seq--
+		}
+	} else {
+		for seq >= 0 {
+			s = b.sequences[seq]
+			wr.flush32()
+			llB, ofB, mlB := llTT[s.llCode], ofTT[s.ofCode], mlTT[s.mlCode]
+			// tabelog max is below 8 for each.
+			of.encode(ofB)
+			ml.encode(mlB)
+			ll.encode(llB)
+			wr.flush32()
+
+			// ml+ll = max 32 bits total
+			wr.addBits32NC(s.litLen, llB.outBits)
+			wr.addBits32NC(s.matchLen, mlB.outBits)
+			wr.flush32()
+			wr.addBits32NC(s.offset, ofB.outBits)
+
+			if debugSequences {
+				println("Encoded seq", seq, s)
+			}
+
+			seq--
+		}
+	}
+	ml.flush(mlEnc.actualTableLog)
+	of.flush(ofEnc.actualTableLog)
+	ll.flush(llEnc.actualTableLog)
+	err = wr.close()
+	if err != nil {
+		return err
+	}
+	b.output = wr.out
+
+	if len(b.output)-3-bhOffset >= b.size {
+		// Maybe even add a bigger margin.
+		b.litEnc.Reuse = huff0.ReusePolicyNone
+		return errIncompressible
+	}
+
+	// Size is output minus block header.
+	bh.setSize(uint32(len(b.output)-bhOffset) - 3)
+	if debug {
+		println("Rewriting block header", bh)
+	}
+	_ = bh.appendTo(b.output[bhOffset:bhOffset])
+	b.coders.setPrev(llEnc, mlEnc, ofEnc)
+	return nil
+}
+
+var errIncompressible = errors.New("incompressible")
+
+func (b *blockEnc) genCodes() {
+	if len(b.sequences) == 0 {
+		// nothing to do
+		return
+	}
+
+	if len(b.sequences) > math.MaxUint16 {
+		panic("can only encode up to 64K sequences")
+	}
+	// No bounds checks after here:
+	llH := b.coders.llEnc.Histogram()[:256]
+	ofH := b.coders.ofEnc.Histogram()[:256]
+	mlH := b.coders.mlEnc.Histogram()[:256]
+	for i := range llH {
+		llH[i] = 0
+	}
+	for i := range ofH {
+		ofH[i] = 0
+	}
+	for i := range mlH {
+		mlH[i] = 0
+	}
+
+	var llMax, ofMax, mlMax uint8
+	for i, seq := range b.sequences {
+		v := llCode(seq.litLen)
+		seq.llCode = v
+		llH[v]++
+		if v > llMax {
+			llMax = v
+		}
+
+		v = ofCode(seq.offset)
+		seq.ofCode = v
+		ofH[v]++
+		if v > ofMax {
+			ofMax = v
+		}
+
+		v = mlCode(seq.matchLen)
+		seq.mlCode = v
+		mlH[v]++
+		if v > mlMax {
+			mlMax = v
+			if debugAsserts && mlMax > maxMatchLengthSymbol {
+				panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d), matchlen: %d", mlMax, seq.matchLen))
+			}
+		}
+		b.sequences[i] = seq
+	}
+	maxCount := func(a []uint32) int {
+		var max uint32
+		for _, v := range a {
+			if v > max {
+				max = v
+			}
+		}
+		return int(max)
+	}
+	if debugAsserts && mlMax > maxMatchLengthSymbol {
+		panic(fmt.Errorf("mlMax > maxMatchLengthSymbol (%d)", mlMax))
+	}
+	if debugAsserts && ofMax > maxOffsetBits {
+		panic(fmt.Errorf("ofMax > maxOffsetBits (%d)", ofMax))
+	}
+	if debugAsserts && llMax > maxLiteralLengthSymbol {
+		panic(fmt.Errorf("llMax > maxLiteralLengthSymbol (%d)", llMax))
+	}
+
+	b.coders.mlEnc.HistogramFinished(mlMax, maxCount(mlH[:mlMax+1]))
+	b.coders.ofEnc.HistogramFinished(ofMax, maxCount(ofH[:ofMax+1]))
+	b.coders.llEnc.HistogramFinished(llMax, maxCount(llH[:llMax+1]))
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/blocktype_string.go b/vendor/github.com/klauspost/compress/zstd/blocktype_string.go
new file mode 100644
index 00000000000..01a01e486e1
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/blocktype_string.go
@@ -0,0 +1,85 @@
+// Code generated by "stringer -type=blockType,literalsBlockType,seqCompMode,tableIndex"; DO NOT EDIT.
+
+package zstd
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[blockTypeRaw-0]
+	_ = x[blockTypeRLE-1]
+	_ = x[blockTypeCompressed-2]
+	_ = x[blockTypeReserved-3]
+}
+
+const _blockType_name = "blockTypeRawblockTypeRLEblockTypeCompressedblockTypeReserved"
+
+var _blockType_index = [...]uint8{0, 12, 24, 43, 60}
+
+func (i blockType) String() string {
+	if i >= blockType(len(_blockType_index)-1) {
+		return "blockType(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _blockType_name[_blockType_index[i]:_blockType_index[i+1]]
+}
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[literalsBlockRaw-0]
+	_ = x[literalsBlockRLE-1]
+	_ = x[literalsBlockCompressed-2]
+	_ = x[literalsBlockTreeless-3]
+}
+
+const _literalsBlockType_name = "literalsBlockRawliteralsBlockRLEliteralsBlockCompressedliteralsBlockTreeless"
+
+var _literalsBlockType_index = [...]uint8{0, 16, 32, 55, 76}
+
+func (i literalsBlockType) String() string {
+	if i >= literalsBlockType(len(_literalsBlockType_index)-1) {
+		return "literalsBlockType(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _literalsBlockType_name[_literalsBlockType_index[i]:_literalsBlockType_index[i+1]]
+}
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[compModePredefined-0]
+	_ = x[compModeRLE-1]
+	_ = x[compModeFSE-2]
+	_ = x[compModeRepeat-3]
+}
+
+const _seqCompMode_name = "compModePredefinedcompModeRLEcompModeFSEcompModeRepeat"
+
+var _seqCompMode_index = [...]uint8{0, 18, 29, 40, 54}
+
+func (i seqCompMode) String() string {
+	if i >= seqCompMode(len(_seqCompMode_index)-1) {
+		return "seqCompMode(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _seqCompMode_name[_seqCompMode_index[i]:_seqCompMode_index[i+1]]
+}
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[tableLiteralLengths-0]
+	_ = x[tableOffsets-1]
+	_ = x[tableMatchLengths-2]
+}
+
+const _tableIndex_name = "tableLiteralLengthstableOffsetstableMatchLengths"
+
+var _tableIndex_index = [...]uint8{0, 19, 31, 48}
+
+func (i tableIndex) String() string {
+	if i >= tableIndex(len(_tableIndex_index)-1) {
+		return "tableIndex(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _tableIndex_name[_tableIndex_index[i]:_tableIndex_index[i+1]]
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/bytebuf.go b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
new file mode 100644
index 00000000000..658ef78380e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/bytebuf.go
@@ -0,0 +1,127 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"fmt"
+	"io"
+	"io/ioutil"
+)
+
+type byteBuffer interface {
+	// Read up to 8 bytes.
+	// Returns nil if no more input is available.
+	readSmall(n int) []byte
+
+	// Read >8 bytes.
+	// MAY use the destination slice.
+	readBig(n int, dst []byte) ([]byte, error)
+
+	// Read a single byte.
+	readByte() (byte, error)
+
+	// Skip n bytes.
+	skipN(n int) error
+}
+
+// in-memory buffer
+type byteBuf []byte
+
+func (b *byteBuf) readSmall(n int) []byte {
+	if debugAsserts && n > 8 {
+		panic(fmt.Errorf("small read > 8 (%d). use readBig", n))
+	}
+	bb := *b
+	if len(bb) < n {
+		return nil
+	}
+	r := bb[:n]
+	*b = bb[n:]
+	return r
+}
+
+func (b *byteBuf) readBig(n int, dst []byte) ([]byte, error) {
+	bb := *b
+	if len(bb) < n {
+		return nil, io.ErrUnexpectedEOF
+	}
+	r := bb[:n]
+	*b = bb[n:]
+	return r, nil
+}
+
+func (b *byteBuf) remain() []byte {
+	return *b
+}
+
+func (b *byteBuf) readByte() (byte, error) {
+	bb := *b
+	if len(bb) < 1 {
+		return 0, io.ErrUnexpectedEOF // empty buffer: fail like readBig/skipN do instead of reporting a zero byte as read
+	}
+	r := bb[0]
+	*b = bb[1:]
+	return r, nil
+}
+
+func (b *byteBuf) skipN(n int) error {
+	bb := *b
+	if len(bb) < n {
+		return io.ErrUnexpectedEOF
+	}
+	*b = bb[n:]
+	return nil
+}
+
+// wrapper around a reader.
+type readerWrapper struct {
+	r   io.Reader
+	tmp [8]byte
+}
+
+func (r *readerWrapper) readSmall(n int) []byte {
+	if debugAsserts && n > 8 {
+		panic(fmt.Errorf("small read > 8 (%d). use readBig", n))
+	}
+	n2, err := io.ReadFull(r.r, r.tmp[:n])
+	// We only really care about the actual bytes read.
+	if n2 != n {
+		if debug {
+			println("readSmall: got", n2, "want", n, "err", err)
+		}
+		return nil
+	}
+	return r.tmp[:n]
+}
+
+func (r *readerWrapper) readBig(n int, dst []byte) ([]byte, error) {
+	if cap(dst) < n {
+		dst = make([]byte, n)
+	}
+	n2, err := io.ReadFull(r.r, dst[:n])
+	if err == io.EOF && n > 0 {
+		err = io.ErrUnexpectedEOF
+	}
+	return dst[:n2], err
+}
+
+func (r *readerWrapper) readByte() (byte, error) {
+	n2, err := io.ReadFull(r.r, r.tmp[:1]) // unlike a bare Read: retries (0, nil) reads and keeps a byte delivered together with io.EOF
+	if err != nil {
+		return 0, err
+	}
+	if n2 != 1 {
+		return 0, io.ErrUnexpectedEOF
+	}
+	return r.tmp[0], nil
+}
+
+func (r *readerWrapper) skipN(n int) error {
+	n2, err := io.CopyN(ioutil.Discard, r.r, int64(n))
+	if n2 != int64(n) {
+		err = io.ErrUnexpectedEOF
+	}
+	return err
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/bytereader.go b/vendor/github.com/klauspost/compress/zstd/bytereader.go
new file mode 100644
index 00000000000..dc4378b6401
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/bytereader.go
@@ -0,0 +1,74 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+// byteReader provides a byte reader that reads
+// little endian values from a byte stream.
+// The input stream is manually advanced.
+// The reader performs no bounds checks.
+type byteReader struct {
+	b   []byte
+	off int
+}
+
+// init will initialize the reader and set the input.
+func (b *byteReader) init(in []byte) {
+	b.b = in
+	b.off = 0
+}
+
+// advance the stream by n bytes. No bounds check is done; use overread to detect advancing too far.
+func (b *byteReader) advance(n uint) {
+	b.off += int(n)
+}
+
+// overread returns whether we have advanced too far.
+func (b *byteReader) overread() bool {
+	return b.off > len(b.b)
+}
+
+// Int32 returns a little endian int32 starting at current offset.
+func (b byteReader) Int32() int32 {
+	b2 := b.b[b.off : b.off+4 : b.off+4]
+	v3 := int32(b2[3])
+	v2 := int32(b2[2])
+	v1 := int32(b2[1])
+	v0 := int32(b2[0])
+	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
+}
+
+// Uint8 returns the next byte
+func (b *byteReader) Uint8() uint8 {
+	v := b.b[b.off]
+	return v
+}
+
+// Uint32 returns a little endian uint32 starting at current offset.
+func (b byteReader) Uint32() uint32 {
+	if r := b.remain(); r < 4 {
+		// Very rare
+		v := uint32(0)
+		for i := 1; i <= r; i++ {
+			v = (v << 8) | uint32(b.b[len(b.b)-i])
+		}
+		return v
+	}
+	b2 := b.b[b.off : b.off+4 : b.off+4]
+	v3 := uint32(b2[3])
+	v2 := uint32(b2[2])
+	v1 := uint32(b2[1])
+	v0 := uint32(b2[0])
+	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
+}
+
+// unread returns the unread portion of the input.
+func (b byteReader) unread() []byte {
+	return b.b[b.off:]
+}
+
+// remain will return the number of bytes remaining.
+func (b byteReader) remain() int {
+	return len(b.b) - b.off
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder.go b/vendor/github.com/klauspost/compress/zstd/decoder.go
new file mode 100644
index 00000000000..2340255051e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/decoder.go
@@ -0,0 +1,518 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"bytes"
+	"errors"
+	"io"
+	"sync"
+)
+
+// Decoder provides decoding of zstandard streams.
+// The decoder has been designed to operate without allocations after a warmup.
+// This means that you should store the decoder for best performance.
+// To re-use a stream decoder, use the Reset(r io.Reader) error to switch to another stream.
+// A decoder can safely be re-used even if the previous stream failed.
+// To release the resources, you must call the Close() function on a decoder.
+type Decoder struct {
+	o decoderOptions
+
+	// Unreferenced block decoders, ready for use.
+	decoders chan *blockDec
+
+	// Unreferenced frame decoders, ready for use by DecodeAll.
+	frames chan *frameDec
+
+	// Streams ready to be decoded; nil until the first Reset.
+	stream chan decodeStream
+
+	// Current read position used for Reader functionality.
+	current decoderState
+
+	// Custom dictionaries
+	dicts map[uint32]struct{}
+
+	// streamWg tracks the stream decoder goroutine started by Reset; Close waits on it.
+	streamWg sync.WaitGroup
+}
+
+// decoderState is used for maintaining state when the decoder
+// is used for streaming.
+type decoderState struct {
+	// current block being written to stream (embedded: d, b and err).
+	decodeOutput
+
+	// output in order to be written to stream.
+	output chan decodeOutput
+
+	// cancel remaining output; closed by drainOutput.
+	cancel chan struct{}
+
+	flushed bool // true when no stream output is pending, so drainOutput has nothing to wait for
+}
+
+var (
+	// Check the interfaces we want to support.
+	_ = io.WriterTo(&Decoder{})
+	_ = io.Reader(&Decoder{})
+)
+
+// NewReader creates a new decoder.
+// A nil Reader can be provided in which case Reset can be used to start a decode.
+//
+// A Decoder can be used in two modes:
+//
+// 1) As a stream, or
+// 2) For stateless decoding using DecodeAll.
+//
+// Only a single stream can be decoded concurrently, but the same decoder
+// can run multiple concurrent stateless decodes. It is even possible to
+// use stateless decodes while a stream is being decoded.
+//
+// The Reset function can be used to initiate a new stream, which will considerably
+// reduce the allocations normally caused by NewReader.
+func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
+	initPredefined()
+	var d Decoder
+	d.o.setDefault()
+	for _, o := range opts {
+		err := o(&d.o)
+		if err != nil {
+			return nil, err
+		}
+	}
+	d.current.output = make(chan decodeOutput, d.o.concurrent)
+	d.current.flushed = true // no stream has been started, so there is nothing to drain
+
+	// Create one frame decoder and one block decoder per unit of concurrency.
+	d.decoders = make(chan *blockDec, d.o.concurrent)
+	d.frames = make(chan *frameDec, d.o.concurrent)
+	for i := 0; i < d.o.concurrent; i++ {
+		d.frames <- newFrameDec(d.o)
+		d.decoders <- newBlockDec(d.o.lowMem)
+	}
+
+	if r == nil {
+		return &d, nil
+	}
+	return &d, d.Reset(r) // the decoder is returned even when Reset reports an error
+}
+
+// Read bytes from the decompressed stream into p.
+// Returns the number of bytes written to p and any error that occurred; fails if no input was set via Reset.
+// When the stream is done, io.EOF will be returned.
+func (d *Decoder) Read(p []byte) (int, error) {
+	if d.stream == nil {
+		return 0, errors.New("no input has been initialized")
+	}
+	var n int
+	for {
+		if len(d.current.b) > 0 {
+			filled := copy(p, d.current.b)
+			p = p[filled:]
+			d.current.b = d.current.b[filled:]
+			n += filled
+		}
+		if len(p) == 0 {
+			break
+		}
+		if len(d.current.b) == 0 {
+			// We have an error and no more data
+			if d.current.err != nil {
+				break
+			}
+			if !d.nextBlock(n == 0) { // only block while nothing has been copied yet
+				return n, nil
+			}
+		}
+	}
+	if len(d.current.b) > 0 {
+		if debug {
+			println("returning", n, "still bytes left:", len(d.current.b))
+		}
+		// Only return error at end of block
+		return n, nil
+	}
+	if d.current.err != nil {
+		d.drainOutput() // stream ended or failed: cancel it and hand decoders back
+	}
+	if debug {
+		println("returning", n, d.current.err, len(d.decoders))
+	}
+	return n, d.current.err
+}
+
+// Reset will reset the decoder to the supplied stream after the current has finished processing.
+// Note that this functionality cannot be used after Close has been called; ErrDecoderClosed is returned then.
+func (d *Decoder) Reset(r io.Reader) error {
+	if d.current.err == ErrDecoderClosed {
+		return d.current.err
+	}
+	if r == nil {
+		return errors.New("nil Reader sent as input")
+	}
+
+	if d.stream == nil { // first use: start the stream decoder goroutine
+		d.stream = make(chan decodeStream, 1)
+		d.streamWg.Add(1)
+		go d.startStreamDecoder(d.stream)
+	}
+
+	d.drainOutput()
+
+	// If bytes buffer and < 1MB, do sync decoding anyway.
+	if bb, ok := r.(*bytes.Buffer); ok && bb.Len() < 1<<20 {
+		if debug {
+			println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
+		}
+		b := bb.Bytes()
+		var dst []byte
+		if cap(d.current.b) > 0 { // reuse the previous output buffer as the destination
+			dst = d.current.b
+		}
+
+		dst, err := d.DecodeAll(b, dst[:0])
+		if err == nil {
+			err = io.EOF
+		}
+		d.current.b = dst
+		d.current.err = err
+		d.current.flushed = true
+		if debug {
+			println("sync decode to ", len(dst), "bytes, err:", err)
+		}
+		return nil // a decode error is kept in d.current.err and is reported by Read/WriteTo
+	}
+
+	// Remove current block.
+	d.current.decodeOutput = decodeOutput{}
+	d.current.err = nil
+	d.current.cancel = make(chan struct{})
+	d.current.flushed = false
+	d.current.d = nil
+
+	d.stream <- decodeStream{
+		r:      r,
+		output: d.current.output,
+		cancel: d.current.cancel,
+	}
+	return nil
+}
+
+// drainOutput cancels the current stream, returns held block decoders and drains the output until errEndOfStream is received.
+func (d *Decoder) drainOutput() {
+	if d.current.cancel != nil {
+		println("cancelling current") // NOTE(review): presumably the package debug helper, not the builtin — confirm
+		close(d.current.cancel)
+		d.current.cancel = nil
+	}
+	if d.current.d != nil {
+		if debug {
+			printf("re-adding current decoder %p, decoders: %d", d.current.d, len(d.decoders))
+		}
+		d.decoders <- d.current.d
+		d.current.d = nil
+		d.current.b = nil
+	}
+	if d.current.output == nil || d.current.flushed {
+		println("current already flushed")
+		return
+	}
+	for {
+		select {
+		case v := <-d.current.output:
+			if v.d != nil { // outputs may carry a block decoder that must go back to the pool
+				if debug {
+					printf("re-adding decoder %p", v.d)
+				}
+				d.decoders <- v.d
+			}
+			if v.err == errEndOfStream {
+				println("current flushed")
+				d.current.flushed = true
+				return
+			}
+		}
+	}
+}
+
+// WriteTo writes data to w until there's no more data to write or when an error occurs.
+// The return value n is the number of bytes written; io.EOF from the stream is reported as a nil error.
+// Any error encountered during the write is also returned, including one on the final block.
+func (d *Decoder) WriteTo(w io.Writer) (int64, error) {
+	if d.stream == nil {
+		return 0, errors.New("no input has been initialized")
+	}
+	var n int64
+	for {
+		if len(d.current.b) > 0 {
+			n2, err2 := w.Write(d.current.b)
+			n += int64(n2)
+			if err2 != nil && (d.current.err == nil || d.current.err == io.EOF) {
+				d.current.err = err2 // a failed write must not be masked by a pending io.EOF
+				break
+			}
+		}
+		if d.current.err != nil {
+			break
+		}
+		d.nextBlock(true)
+	}
+	err := d.current.err
+	if err != nil {
+		d.drainOutput()
+	}
+	if err == io.EOF {
+		err = nil
+	}
+	return n, err
+}
+
+// DecodeAll allows stateless decoding of a blob of bytes.
+// Output will be appended to dst, so if the destination size is known
+// you can pre-allocate the destination slice to avoid allocations.
+// DecodeAll can be used concurrently. On error, the output decoded so far is returned with it.
+// The Decoder concurrency limits will be respected: the call waits for a free block and frame decoder.
+func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
+	if d.current.err == ErrDecoderClosed {
+		return dst, ErrDecoderClosed
+	}
+
+	// Grab a block decoder and frame decoder.
+	block, frame := <-d.decoders, <-d.frames
+	defer func() {
+		if debug {
+			printf("re-adding decoder: %p", block)
+		}
+		d.decoders <- block
+		frame.rawInput = nil
+		frame.bBuf = nil
+		d.frames <- frame
+	}()
+	frame.bBuf = input
+
+	for {
+		err := frame.reset(&frame.bBuf)
+		if err == io.EOF {
+			return dst, nil
+		}
+		if err != nil {
+			return dst, err
+		}
+		if frame.FrameContentSize > d.o.maxDecodedSize-uint64(len(dst)) {
+			return dst, ErrDecoderSizeExceeded
+		}
+		if frame.FrameContentSize > 0 && frame.FrameContentSize < 1<<30 {
+			// Never preallocate more than 1 GB up front.
+			if uint64(cap(dst)-len(dst)) < frame.FrameContentSize { // compare free capacity: output is appended after len(dst)
+				dst2 := make([]byte, len(dst), len(dst)+int(frame.FrameContentSize))
+				copy(dst2, dst)
+				dst = dst2
+			}
+		}
+		if cap(dst) == 0 {
+			// Allocate window size * 2 by default if nothing is provided and we didn't get frame content size.
+			size := frame.WindowSize * 2
+			// Cap to 1 MB.
+			if size > 1<<20 {
+				size = 1 << 20
+			}
+			dst = make([]byte, 0, size)
+		}
+
+		dst, err = frame.runDecoder(dst, block)
+		if err != nil {
+			return dst, err
+		}
+		if len(frame.bBuf) == 0 { // all input consumed; otherwise loop to decode the next frame
+			break
+		}
+	}
+	return dst, nil
+}
+
+// nextBlock fetches the next block from the output channel into d.current.
+// If an error occurs d.current.err will be set; an already stored error is kept and nothing is fetched.
+// Optionally the function can block for new output.
+// If non-blocking mode is used the returned boolean will be false
+// if no data was available without blocking.
+func (d *Decoder) nextBlock(blocking bool) (ok bool) {
+	if d.current.d != nil { // the previous block is done with its decoder: return it to the pool
+		if debug {
+			printf("re-adding current decoder %p", d.current.d)
+		}
+		d.decoders <- d.current.d
+		d.current.d = nil
+	}
+	if d.current.err != nil {
+		// Keep error state.
+		return blocking
+	}
+
+	if blocking {
+		d.current.decodeOutput = <-d.current.output
+	} else {
+		select {
+		case d.current.decodeOutput = <-d.current.output:
+		default:
+			return false
+		}
+	}
+	if debug {
+		println("got", len(d.current.b), "bytes, error:", d.current.err)
+	}
+	return true
+}
+
+// Close will release all resources. Calling it again is a no-op.
+// It is NOT possible to reuse the decoder after this; Reset and DecodeAll then return ErrDecoderClosed.
+func (d *Decoder) Close() {
+	if d.current.err == ErrDecoderClosed {
+		return
+	}
+	d.drainOutput()
+	if d.stream != nil {
+		close(d.stream) // ends the range loop in startStreamDecoder
+		d.streamWg.Wait()
+		d.stream = nil
+	}
+	if d.decoders != nil {
+		close(d.decoders)
+		for dec := range d.decoders { // close every pooled block decoder
+			dec.Close()
+		}
+		d.decoders = nil
+	}
+	if d.current.d != nil { // NOTE(review): drainOutput above already nils d.current.d; looks unreachable — confirm
+		d.current.d.Close()
+		d.current.d = nil
+	}
+	d.current.err = ErrDecoderClosed
+}
+
+// IOReadCloser returns the decoder as an io.ReadCloser for convenience.
+// Any changes to the decoder will be reflected, so the returned ReadCloser
+// can be reused along with the decoder. Closing it closes the decoder.
+// io.WriterTo is also supported by the returned ReadCloser.
+func (d *Decoder) IOReadCloser() io.ReadCloser {
+	return closeWrapper{d: d}
+}
+
+// closeWrapper wraps a Decoder so that its Close fits the io.Closer signature.
+type closeWrapper struct {
+	d *Decoder
+}
+
+// WriteTo forwards WriteTo calls to the decoder.
+func (c closeWrapper) WriteTo(w io.Writer) (n int64, err error) {
+	return c.d.WriteTo(w)
+}
+
+// Read forwards read calls to the decoder.
+func (c closeWrapper) Read(p []byte) (n int, err error) {
+	return c.d.Read(p)
+}
+
+// Close closes the decoder. The returned error is always nil.
+func (c closeWrapper) Close() error {
+	c.d.Close()
+	return nil
+}
+
+type decodeOutput struct {
+	d   *blockDec // block decoder to return to the pool afterwards; may be nil
+	b   []byte    // decoded bytes still to be delivered
+	err error     // io.EOF, errEndOfStream or a decoding error
+}
+
+type decodeStream struct {
+	r io.Reader // compressed input
+
+	// Blocks ready to be written to output.
+	output chan decodeOutput
+
+	// cancel reading from the input; closed by drainOutput.
+	cancel chan struct{}
+}
+
+// errEndOfStream indicates that everything from the stream was read.
+var errEndOfStream = errors.New("end-of-stream")
+
+// Create Decoder:
+// Spawn n block decoders. These accept tasks to decode a block.
+// Create goroutine that handles stream processing, this will send history to decoders as they are available.
+// Decoders update the history as they decode.
+// When a block is returned:
+// 		a) history is sent to the next decoder,
+// 		b) content written to CRC.
+// 		c) return data to WRITER.
+// 		d) wait for next block to return data.
+// Once WRITTEN, the block decoder is handed back to the pool (d.decoders) by the reader for re-use.
+func (d *Decoder) startStreamDecoder(inStream chan decodeStream) {
+	defer d.streamWg.Done()
+	frame := newFrameDec(d.o)
+	for stream := range inStream {
+		if debug {
+			println("got new stream")
+		}
+		br := readerWrapper{r: stream.r}
+	decodeStream:
+		for {
+			err := frame.reset(&br)
+			if debug && err != nil {
+				println("Frame decoder returned", err)
+			}
+			if err != nil {
+				stream.output <- decodeOutput{
+					err: err,
+				}
+				break // leaves the per-frame loop; the end-of-stream marker is still sent below
+			}
+			if debug {
+				println("starting frame decoder")
+			}
+
+			// This goroutine will forward history between frames.
+			frame.frameDone.Add(1)
+			frame.initAsync()
+
+			go frame.startDecoder(stream.output)
+		decodeFrame:
+			// Go through all blocks of the frame.
+			for {
+				dec := <-d.decoders // waits until a block decoder is free
+				select {
+				case <-stream.cancel:
+					if !frame.sendErr(dec, io.EOF) {
+						// To not let the decoder dangle, send it back.
+						stream.output <- decodeOutput{d: dec}
+					}
+					break decodeStream
+				default:
+				}
+				err := frame.next(dec)
+				switch err {
+				case io.EOF:
+					// End of current frame, no error
+					println("EOF on next block")
+					break decodeFrame
+				case nil:
+					continue
+				default:
+					println("block decoder returned", err)
+					break decodeStream
+				}
+			}
+			// All blocks have started decoding, check if there are more frames.
+			println("waiting for done")
+			frame.frameDone.Wait()
+			println("done waiting...")
+		}
+		frame.frameDone.Wait()
+		println("Sending EOS")
+		stream.output <- decodeOutput{err: errEndOfStream} // drainOutput stops draining at this marker
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/decoder_options.go b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
new file mode 100644
index 00000000000..2ac9cd2dd30
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/decoder_options.go
@@ -0,0 +1,68 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"errors"
+	"fmt"
+	"runtime"
+)
+
+// DOption is an option for creating a decoder; a non-nil error makes NewReader fail.
+type DOption func(*decoderOptions) error
+
+// decoderOptions retains accumulated state of multiple options.
+type decoderOptions struct {
+	lowMem         bool   // see WithDecoderLowmem
+	concurrent     int    // see WithDecoderConcurrency
+	maxDecodedSize uint64 // see WithDecoderMaxMemory
+}
+
+func (o *decoderOptions) setDefault() {
+	*o = decoderOptions{
+		// use less ram: true for now, but may change.
+		lowMem:     true,
+		concurrent: runtime.GOMAXPROCS(0), // an argument of 0 queries the setting without changing it
+	}
+	o.maxDecodedSize = 1 << 63 // the largest value WithDecoderMaxMemory accepts
+}
+
+// WithDecoderLowmem will set whether to use a lower amount of memory,
+// but possibly have to allocate more while running. The option itself never fails.
+func WithDecoderLowmem(b bool) DOption {
+	return func(o *decoderOptions) error { o.lowMem = b; return nil }
+}
+
+// WithDecoderConcurrency will set the concurrency,
+// meaning the maximum number of decoders to run concurrently.
+// The value supplied must be at least 1; smaller values make the option return an error.
+// By default this will be set to GOMAXPROCS.
+func WithDecoderConcurrency(n int) DOption {
+	return func(o *decoderOptions) error {
+		if n <= 0 {
+			return fmt.Errorf("Concurrency must be at least 1")
+		}
+		o.concurrent = n
+		return nil
+	}
+}
+
+// WithDecoderMaxMemory allows to set a maximum decoded size for in-memory
+// non-streaming operations or maximum window size for streaming operations.
+// This can be used to control memory usage of potentially hostile content.
+// For streaming operations, the maximum window size is capped at 1<<30 bytes.
+// Maximum and default is 1 << 63 bytes. Values of 0 or above 1 << 63 are rejected with an error.
+func WithDecoderMaxMemory(n uint64) DOption {
+	return func(o *decoderOptions) error {
+		if n == 0 {
+			return errors.New("WithDecoderMaxMemory must be at least 1")
+		}
+		if n > 1<<63 { // 1 << 63 itself is allowed, so the message says "at most"
+			return fmt.Errorf("WithDecoderMaxMemory must be at most 1 << 63")
+		}
+		o.maxDecodedSize = n
+		return nil
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_better.go b/vendor/github.com/klauspost/compress/zstd/enc_better.go
new file mode 100644
index 00000000000..c120d905481
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/enc_better.go
@@ -0,0 +1,518 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import "fmt"
+
+const (
+	betterLongTableBits = 19                       // Bits used in the long match table
+	betterLongTableSize = 1 << betterLongTableBits // Size of the table
+
+	// Note: Increasing the short table bits or making the hash shorter
+	// can actually lead to compression degradation since it will 'steal' more from the
+	// long match table and match offsets are quite big.
+	// This greatly depends on the type of input.
+	betterShortTableBits = 13                        // Bits used in the short match table
+	betterShortTableSize = 1 << betterShortTableBits // Size of the table
+)
+
+type prevEntry struct {
+	offset int32 // most recent position stored for this hash (position in hist plus e.cur)
+	prev   int32 // the offset this slot held before the most recent store
+}
+
+// betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
+// The long match table contains the previous entry with the same hash,
+// effectively making it a "chain" of length 2.
+// When we find a long match we choose between the two values and select the longest.
+// When we find a short match, after checking the long, we check if we can find a long at n+1
+// and that it is longer (lazy matching).
+type betterFastEncoder struct {
+	fastBase
+	table     [betterShortTableSize]tableEntry // short match table, indexed by hash5
+	longTable [betterLongTableSize]prevEntry   // long match table, indexed by hash8
+}
+
+// Encode adds src to the history and encodes it into blk as literals and match sequences.
+func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound. Every path through the body breaks, so this loop acts as an if.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = prevEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			v2 := e.longTable[i].prev
+			if v < minOff {
+				v = 0
+				v2 = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+				if v2 < minOff {
+					v2 = 0
+				} else {
+					v2 = v2 - e.cur + e.maxMatchOff
+				}
+			}
+			e.longTable[i] = prevEntry{
+				offset: v,
+				prev:   v2,
+			}
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize { // too small to search: emit the whole input as literals
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 9
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks
+		canRepeat := len(blk.sequences) > 2
+		var matched int32
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			off := s + e.cur
+			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
+			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					// Index match start+1 (long) -> s - 1
+					index0 := s + repOff
+					s += lenght + repOff
+
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					continue
+				}
+				const repOff2 = 1
+
+				// We deviate from the reference encoder and also check offset 2.
+				// Still slower and not much better, so disabled.
+				// repIndex = s - offset2 + repOff2
+				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) { // dead code: disabled by the leading false
+					// Consider history as well.
+					var seq seq
+					lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff2
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 2
+					seq.offset = 2
+					if debugSequences {
+						println("repeat sequence 2", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+
+					index0 := s + repOff2
+					s += lenght + repOff2
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+
+					// Index skipped...
+					for index0 < s-1 {
+						cv0 := load6432(src, index0)
+						cv1 := cv0 >> 8
+						h0 := hash8(cv0, betterLongTableBits)
+						off := index0 + e.cur
+						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+						index0 += 2
+					}
+					cv = load6432(src, s)
+					// Swap offsets
+					offset1, offset2 = offset2, offset1
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := candidateL.offset - e.cur
+			coffsetLP := candidateL.prev - e.cur
+
+			// Check if we have a long match.
+			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetL+8, src) + 8
+				t = coffsetL
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+
+				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+					// Found a long match, at least 8 bytes.
+					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
+					if prevMatch > matched {
+						matched = prevMatch
+						t = coffsetLP
+					}
+					if debugAsserts && s <= t {
+						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+					}
+					if debugAsserts && s-t > e.maxMatchOff {
+						panic("s - t >e.maxMatchOff")
+					}
+					if debugMatches {
+						println("long match")
+					}
+				}
+				break
+			}
+
+			// Check if we have a long match on prev.
+			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
+				// Found a long match, at least 8 bytes.
+				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
+				t = coffsetLP
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			coffsetS := candidateS.offset - e.cur
+
+			// Check if we have a short match.
+			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				matched = e.matchlen(s+4, coffsetS+4, src) + 4
+
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, betterLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = candidateL.offset - e.cur
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("long match (after short)")
+						}
+						break
+					}
+				}
+
+				// Check prev long...
+				coffsetL = candidateL.prev - e.cur
+				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
+					// Found a long match, at least 8 bytes.
+					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
+					if matchedNext > matched {
+						t = coffsetL
+						s += checkAt
+						matched = matchedNext
+						if debugMatches {
+							println("prev long match (after short)")
+						}
+						break
+					}
+				}
+				t = coffsetS
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the n-byte match as long as possible.
+		l := matched
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) -> s - 1
+		index0 := s - l + 1
+		for index0 < s-1 {
+			cv0 := load6432(src, index0)
+			cv1 := cv0 >> 8
+			h0 := hash8(cv0, betterLongTableBits)
+			off := index0 + e.cur
+			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
+			e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
+			index0 += 2
+		}
+
+		cv = load6432(src, s)
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, betterShortTableBits)
+			nextHashL := hash8(cv, betterLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
+			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) { // whatever follows the last sequence is emitted as trailing literals
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
+// EncodeNoHist will encode a block with no history and no following blocks.
+// This implementation has no dedicated no-history path: it simply delegates to Encode,
+// so the block is processed exactly as Encode would process it.
+func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	e.Encode(blk, src)
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_dfast.go b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
new file mode 100644
index 00000000000..5ebead9dc8e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/enc_dfast.go
@@ -0,0 +1,674 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import "fmt"
+
+const (
+	dFastLongTableBits = 17                      // Bits used in the long match table (passed to hash8)
+	dFastLongTableSize = 1 << dFastLongTableBits // Number of entries in the long match table
+	dFastLongTableMask = dFastLongTableSize - 1  // Mask for long table indices. Redundant, but can eliminate bounds checks.
+
+	dFastShortTableBits = tableBits                // Bits used in the short match table (same as the fast encoder's tableBits)
+	dFastShortTableSize = 1 << dFastShortTableBits // Number of entries in the short match table
+	dFastShortTableMask = dFastShortTableSize - 1  // Mask for short table indices. Redundant, but can eliminate bounds checks.
+)
+
+type doubleFastEncoder struct {
+	fastEncoder
+	longTable [dFastLongTableSize]tableEntry // long match candidates, indexed by hash8(cv, dFastLongTableBits); the embedded fastEncoder.table holds the short ones
+}
+
+// Encode mimics functionality in zstd_dfast.c. It passes src to addBlock, then matches over e.hist and appends literals and sequences to blk.
+func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound: once e.cur reaches bufferReset, the table offsets are cleared or rebased.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			for i := range e.longTable[:] {
+				e.longTable[i] = tableEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		for i := range e.longTable[:] {
+			v := e.longTable[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.longTable[i].offset = v
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src: from here on src is e.hist and s (returned by addBlock) indexes into it.
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 8
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		// We allow the encoder to optionally turn off repeat offsets across blocks: they are only tried once blk holds more than 2 sequences.
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.table[nextHashS] = entry
+
+			if canRepeat {
+				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+
+					seq.matchLen = uint32(lenght - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+					s += lenght + repOff
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, lenght)
+
+						}
+						break encodeLoop
+					}
+					cv = load6432(src, s)
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := s - (candidateL.offset - e.cur)
+			coffsetS := s - (candidateS.offset - e.cur)
+
+			// Check if we have a long match.
+			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+				// Found a long match, likely at least 8 bytes.
+				// Reference encoder checks all 8 bytes, we only check 4,
+				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+				t = candidateL.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			// Check if we have a short match.
+			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, dFastLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = s - (candidateL.offset - e.cur) + checkAt
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
+				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+					// Found a long match, likely at least 8 bytes.
+					// Reference encoder checks all 8 bytes, we only check 4,
+					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+					t = candidateL.offset - e.cur
+					s += checkAt
+					if debugMatches {
+						println("long match (after short)")
+					}
+					break
+				}
+
+				t = candidateS.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input. The step grows with the distance since the last emitted position.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		l := e.matchlen(s+4, t+4, src) + 4
+
+		// Extend backwards, but not past tMin, nextEmit or maxMatchLength.
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) and start+2 (short)
+		index0 := s - l + 1
+		// Index match end-2 (long) and end-1 (short)
+		index1 := s - 2
+
+		cv0 := load6432(src, index0)
+		cv1 := load6432(src, index1)
+		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
+		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
+		e.longTable[hash8(cv0, dFastLongTableBits)] = te0
+		e.longTable[hash8(cv1, dFastLongTableBits)] = te1
+		cv0 >>= 8
+		cv1 >>= 8
+		te0.offset++
+		te1.offset++
+		te0.val = uint32(cv0)
+		te1.val = uint32(cv1)
+		e.table[hash5(cv0, dFastShortTableBits)] = te0
+		e.table[hash5(cv1, dFastShortTableBits)] = te1
+
+		cv = load6432(src, s)
+
+		if !canRepeat {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it.
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			l := 4 + e.matchlen(s+4, o2+4, src)
+
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.table[nextHashS] = entry
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
+// EncodeNoHist will encode a block with no history and no following blocks.
+// Most notable difference is that src will not be copied for history and
+// we do not need to check for max match length. blk.recentOffsets is read but never written back.
+func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	const (
+		// Input margin is the number of bytes we read (8)
+		// and the maximum we will read ahead (2)
+		inputMargin            = 8 + 2
+		minNonLiteralBlockSize = 16
+	)
+
+	// Protect against e.cur wraparound. No offsets are rebased here; both tables are simply cleared.
+	if e.cur >= bufferReset {
+		for i := range e.table[:] {
+			e.table[i] = tableEntry{}
+		}
+		for i := range e.longTable[:] {
+			e.longTable[i] = tableEntry{}
+		}
+		e.cur = e.maxMatchOff
+	}
+
+	s := int32(0)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// src is not overridden here: matching runs directly on the caller's slice, starting at s = 0.
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 1.
+	const stepSize = 1
+
+	const kSearchStrength = 8
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		var t int32
+		for {
+
+			nextHashS := hash5(cv, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+			candidateL := e.longTable[nextHashL]
+			candidateS := e.table[nextHashS]
+
+			const repOff = 1
+			repIndex := s - offset1 + repOff
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.table[nextHashS] = entry
+
+			if len(blk.sequences) > 2 {
+				if load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
+					// Consider history as well.
+					var seq seq
+					//length := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
+					length := 4 + int32(matchLen(src[s+4+repOff:], src[repIndex+4:]))
+
+					seq.matchLen = uint32(length - zstdMinMatch)
+
+					// We might be able to match backwards.
+					// Extend as long as we can.
+					start := s + repOff
+					// We end the search early, so we don't risk 0 literals
+					// and have to do special offset treatment.
+					startLimit := nextEmit + 1
+
+					tMin := s - e.maxMatchOff
+					if tMin < 0 {
+						tMin = 0
+					}
+					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] {
+						repIndex--
+						start--
+						seq.matchLen++
+					}
+					addLiterals(&seq, start)
+
+					// rep 0
+					seq.offset = 1
+					if debugSequences {
+						println("repeat sequence", seq, "next s:", s)
+					}
+					blk.sequences = append(blk.sequences, seq)
+					s += length + repOff
+					nextEmit = s
+					if s >= sLimit {
+						if debug {
+							println("repeat ended", s, length)
+
+						}
+						break encodeLoop
+					}
+					cv = load6432(src, s)
+					continue
+				}
+			}
+			// Find the offsets of our two matches.
+			coffsetL := s - (candidateL.offset - e.cur)
+			coffsetS := s - (candidateS.offset - e.cur)
+
+			// Check if we have a long match.
+			if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+				// Found a long match, likely at least 8 bytes.
+				// Reference encoder checks all 8 bytes, we only check 4,
+				// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+				t = candidateL.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugMatches {
+					println("long match")
+				}
+				break
+			}
+
+			// Check if we have a short match.
+			if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
+				// found a regular match
+				// See if we can find a long match at s+1
+				const checkAt = 1
+				cv := load6432(src, s+checkAt)
+				nextHashL = hash8(cv, dFastLongTableBits)
+				candidateL = e.longTable[nextHashL]
+				coffsetL = s - (candidateL.offset - e.cur) + checkAt
+
+				// We can store it, since we have at least a 4 byte match.
+				e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)}
+				if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val {
+					// Found a long match, likely at least 8 bytes.
+					// Reference encoder checks all 8 bytes, we only check 4,
+					// but the likelihood of both the first 4 bytes and the hash matching should be enough.
+					t = candidateL.offset - e.cur
+					s += checkAt
+					if debugMatches {
+						println("long match (after short)")
+					}
+					break
+				}
+
+				t = candidateS.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				if debugMatches {
+					println("short match")
+				}
+				break
+			}
+
+			// No match found, move forward in input. The step grows with the distance since the last emitted position.
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+
+		// A 4-byte match has been found. Update recent offsets.
+		// We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		// Extend the 4-byte match as long as possible.
+		//l := e.matchlen(s+4, t+4, src) + 4
+		l := int32(matchLen(src[s+4:], src[t+4:])) + 4
+
+		// Extend backwards (no max match length check in this variant).
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+
+		// Index match start+1 (long) and start+2 (short)
+		index0 := s - l + 1
+		// Index match end-2 (long) and end-1 (short)
+		index1 := s - 2
+
+		cv0 := load6432(src, index0)
+		cv1 := load6432(src, index1)
+		te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)}
+		te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)}
+		e.longTable[hash8(cv0, dFastLongTableBits)] = te0
+		e.longTable[hash8(cv1, dFastLongTableBits)] = te1
+		cv0 >>= 8
+		cv1 >>= 8
+		te0.offset++
+		te1.offset++
+		te0.val = uint32(cv0)
+		te1.val = uint32(cv1)
+		e.table[hash5(cv0, dFastShortTableBits)] = te0
+		e.table[hash5(cv1, dFastShortTableBits)] = te1
+
+		cv = load6432(src, s)
+
+		if len(blk.sequences) <= 2 {
+			continue
+		}
+
+		// Check offset 2
+		for {
+			o2 := s - offset2
+			if load3232(src, o2) != uint32(cv) {
+				// Do regular search
+				break
+			}
+
+			// Store this, since we have it. NOTE(review): Encode hashes cv (not cv1>>8) for the short table at this point - confirm this is intended.
+			nextHashS := hash5(cv1>>8, dFastShortTableBits)
+			nextHashL := hash8(cv, dFastLongTableBits)
+
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			//l := 4 + e.matchlen(s+4, o2+4, src)
+			l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
+
+			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.longTable[nextHashL] = entry
+			e.table[nextHashS] = entry
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				// Finished
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_fast.go b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
new file mode 100644
index 00000000000..d1d3658e611
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/enc_fast.go
@@ -0,0 +1,744 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"fmt"
+	"math"
+	"math/bits"
+
+	"github.com/klauspost/compress/zstd/internal/xxhash"
+)
+
+const (
+	tableBits      = 15             // Bits used in the table
+	tableSize      = 1 << tableBits // Number of entries in the table
+	tableMask      = tableSize - 1  // Mask for table indices. Redundant, but can eliminate bounds checks.
+	maxMatchLength = 131074         // Upper bound the Encode loops check while extending a match backwards.
+)
+
+type tableEntry struct {
+	val    uint32 // uint32(cv) of the value loaded at the recorded position; compared against before accepting a candidate
+	offset int32  // recorded position, stored as s + e.cur
+}
+
+type fastBase struct {
+	// cur is the offset at the start of hist
+	cur int32
+	// maximum offset. Should be at least 2x block size.
+	maxMatchOff int32
+	hist        []byte         // history buffer; Encode matches over it after calling addBlock
+	crc         *xxhash.Digest // digest returned by CRC and summed by AppendCRC
+	tmp         [8]byte        // scratch space handed to crc.Sum in AppendCRC
+	blk         *blockEnc      // block returned by Block
+}
+
+type fastEncoder struct {
+	fastBase
+	table [tableSize]tableEntry // match candidates, indexed by hash6(cv, tableBits) in Encode
+}
+
+// CRC returns the underlying CRC writer, i.e. the xxhash digest stored in e.crc.
+func (e *fastBase) CRC() *xxhash.Digest {
+	return e.crc
+}
+
+// AppendCRC will append the CRC to the destination slice and return it. Four bytes are appended; e.tmp is used as scratch for the sum.
+func (e *fastBase) AppendCRC(dst []byte) []byte {
+	crc := e.crc.Sum(e.tmp[:0])
+	dst = append(dst, crc[7], crc[6], crc[5], crc[4]) // bytes 7 down to 4 of the sum, i.e. in reversed order
+	return dst
+}
+
+// WindowSize returns the window size of the encoder (e.maxMatchOff), or, when
+// 0 < size < e.maxMatchOff, a power-of-two window larger than size and at least 1024.
+func (e *fastBase) WindowSize(size int) int32 {
+	if size > 0 && size < int(e.maxMatchOff) {
+		b := int32(1) << uint(bits.Len(uint(size))) // smallest power of two strictly greater than size
+		// Keep minimum window.
+		if b < 1024 {
+			b = 1024
+		}
+		return b
+	}
+	return e.maxMatchOff
+}
+
+// Block returns the current block, i.e. the *blockEnc stored in e.blk.
+func (e *fastBase) Block() *blockEnc {
+	return e.blk
+}
+
+// Encode mimics functionality in zstd_fast.c. It passes src to addBlock, then matches over e.hist and appends literals and sequences to blk.
+func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
+	const (
+		inputMargin            = 8
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+
+	// Protect against e.cur wraparound: once e.cur reaches bufferReset, the table offsets are cleared or rebased.
+	for e.cur >= bufferReset {
+		if len(e.hist) == 0 {
+			for i := range e.table[:] {
+				e.table[i] = tableEntry{}
+			}
+			e.cur = e.maxMatchOff
+			break
+		}
+		// Shift down everything in the table that isn't already too far away.
+		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
+		for i := range e.table[:] {
+			v := e.table[i].offset
+			if v < minOff {
+				v = 0
+			} else {
+				v = v - e.cur + e.maxMatchOff
+			}
+			e.table[i].offset = v
+		}
+		e.cur = e.maxMatchOff
+		break
+	}
+
+	s := e.addBlock(src)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	// Override src: from here on src is e.hist and s (returned by addBlock) indexes into it.
+	src = e.hist
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 2.
+	const stepSize = 2
+
+	// TEMPLATE
+	const hashLog = tableBits
+	// seems global, but would be nice to tweak.
+	const kSearchStrength = 8
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		// t will contain the match offset when we find one.
+		// When exiting the search loop, we have already checked 4 bytes.
+		var t int32
+
+		// We will not use repeat offsets across blocks,
+		// which is done by not using them until blk holds more than 2 sequences.
+		canRepeat := len(blk.sequences) > 2
+
+		for {
+			if debugAsserts && canRepeat && offset1 == 0 {
+				panic("offset0 was 0")
+			}
+
+			nextHash := hash6(cv, hashLog)
+			nextHash2 := hash6(cv>>8, hashLog)
+			candidate := e.table[nextHash]
+			candidate2 := e.table[nextHash2]
+			repIndex := s - offset1 + 2
+
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)}
+
+			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
+				// Consider history as well.
+				var seq seq
+				var length int32
+				// length = 4 + e.matchlen(s+6, repIndex+4, src)
+				{
+					a := src[s+6:]
+					b := src[repIndex+4:]
+					endI := len(a) & (math.MaxInt32 - 7)
+					length = int32(endI) + 4
+					for i := 0; i < endI; i += 8 {
+						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+							break
+						}
+					}
+				}
+
+				seq.matchLen = uint32(length - zstdMinMatch)
+
+				// We might be able to match backwards.
+				// Extend as long as we can.
+				start := s + 2
+				// We end the search early, so we don't risk 0 literals
+				// and have to do special offset treatment.
+				startLimit := nextEmit + 1
+
+				sMin := s - e.maxMatchOff
+				if sMin < 0 {
+					sMin = 0
+				}
+				for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch {
+					repIndex--
+					start--
+					seq.matchLen++
+				}
+				addLiterals(&seq, start)
+
+				// rep 0
+				seq.offset = 1
+				if debugSequences {
+					println("repeat sequence", seq, "next s:", s)
+				}
+				blk.sequences = append(blk.sequences, seq)
+				s += length + 2
+				nextEmit = s
+				if s >= sLimit {
+					if debug {
+						println("repeat ended", s, length)
+
+					}
+					break encodeLoop
+				}
+				cv = load6432(src, s)
+				continue
+			}
+			coffset0 := s - (candidate.offset - e.cur)
+			coffset1 := s - (candidate2.offset - e.cur) + 1
+			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
+				// found a regular match
+				t = candidate.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				break
+			}
+
+			if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val {
+				// found a regular match at s+1
+				t = candidate2.offset - e.cur
+				s++
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				break
+			}
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+		// A 4-byte match has been found. We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		if debugAsserts && canRepeat && int(offset1) > len(src) {
+			panic("invalid offset")
+		}
+
+		// Extend the 4-byte match as long as possible.
+		//l := e.matchlen(s+4, t+4, src) + 4
+		var l int32
+		{
+			a := src[s+4:]
+			b := src[t+4:]
+			endI := len(a) & (math.MaxInt32 - 7)
+			l = int32(endI) + 4
+			for i := 0; i < endI; i += 8 {
+				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+					break
+				}
+			}
+		}
+
+		// Extend backwards, but not past tMin, nextEmit or maxMatchLength.
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence.
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		// Don't use repeat offsets
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+		cv = load6432(src, s)
+
+		// Check offset 2
+		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			//l := 4 + e.matchlen(s+4, o2+4, src)
+			var l int32
+			{
+				a := src[s+4:]
+				b := src[o2+4:]
+				endI := len(a) & (math.MaxInt32 - 7)
+				l = int32(endI) + 4
+				for i := 0; i < endI; i += 8 {
+					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+						break
+					}
+				}
+			}
+
+			// Store this, since we have it.
+			nextHash := hash6(cv, hashLog)
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				break encodeLoop
+			}
+			// Prepare next loop.
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	blk.recentOffsets[0] = uint32(offset1)
+	blk.recentOffsets[1] = uint32(offset2)
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
+// EncodeNoHist will encode a block with no history and no following blocks.
+// Most notable difference is that src will not be copied for history and
+// we do not need to check for max match length.
+func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
+	const (
+		inputMargin            = 8
+		minNonLiteralBlockSize = 1 + 1 + inputMargin
+	)
+	if debug {
+		if len(src) > maxBlockSize {
+			panic("src too big")
+		}
+	}
+	// Protect against e.cur wraparound.
+	if e.cur >= bufferReset {
+		for i := range e.table[:] {
+			e.table[i] = tableEntry{}
+		}
+		e.cur = e.maxMatchOff
+	}
+
+	s := int32(0)
+	blk.size = len(src)
+	if len(src) < minNonLiteralBlockSize {
+		blk.extraLits = len(src)
+		blk.literals = blk.literals[:len(src)]
+		copy(blk.literals, src)
+		return
+	}
+
+	sLimit := int32(len(src)) - inputMargin
+	// stepSize is the number of bytes to skip on every main loop iteration.
+	// It should be >= 2.
+	const stepSize = 2
+
+	// TEMPLATE
+	const hashLog = tableBits
+	// seems global, but would be nice to tweak.
+	const kSearchStrength = 8
+
+	// nextEmit is where in src the next emitLiteral should start from.
+	nextEmit := s
+	cv := load6432(src, s)
+
+	// Relative offsets
+	offset1 := int32(blk.recentOffsets[0])
+	offset2 := int32(blk.recentOffsets[1])
+
+	addLiterals := func(s *seq, until int32) {
+		if until == nextEmit {
+			return
+		}
+		blk.literals = append(blk.literals, src[nextEmit:until]...)
+		s.litLen = uint32(until - nextEmit)
+	}
+	if debug {
+		println("recent offsets:", blk.recentOffsets)
+	}
+
+encodeLoop:
+	for {
+		// t will contain the match offset when we find one.
+		// When existing the search loop, we have already checked 4 bytes.
+		var t int32
+
+		// We will not use repeat offsets across blocks.
+		// By not using them for the first 3 matches
+
+		for {
+			nextHash := hash6(cv, hashLog)
+			nextHash2 := hash6(cv>>8, hashLog)
+			candidate := e.table[nextHash]
+			candidate2 := e.table[nextHash2]
+			repIndex := s - offset1 + 2
+
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)}
+
+			if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) {
+				// Consider history as well.
+				var seq seq
+				// length := 4 + e.matchlen(s+6, repIndex+4, src)
+				// length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
+				var length int32
+				{
+					a := src[s+6:]
+					b := src[repIndex+4:]
+					endI := len(a) & (math.MaxInt32 - 7)
+					length = int32(endI) + 4
+					for i := 0; i < endI; i += 8 {
+						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+							break
+						}
+					}
+				}
+
+				seq.matchLen = uint32(length - zstdMinMatch)
+
+				// We might be able to match backwards.
+				// Extend as long as we can.
+				start := s + 2
+				// We end the search early, so we don't risk 0 literals
+				// and have to do special offset treatment.
+				startLimit := nextEmit + 1
+
+				sMin := s - e.maxMatchOff
+				if sMin < 0 {
+					sMin = 0
+				}
+				for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] {
+					repIndex--
+					start--
+					seq.matchLen++
+				}
+				addLiterals(&seq, start)
+
+				// rep 0
+				seq.offset = 1
+				if debugSequences {
+					println("repeat sequence", seq, "next s:", s)
+				}
+				blk.sequences = append(blk.sequences, seq)
+				s += length + 2
+				nextEmit = s
+				if s >= sLimit {
+					if debug {
+						println("repeat ended", s, length)
+
+					}
+					break encodeLoop
+				}
+				cv = load6432(src, s)
+				continue
+			}
+			coffset0 := s - (candidate.offset - e.cur)
+			coffset1 := s - (candidate2.offset - e.cur) + 1
+			if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val {
+				// found a regular match
+				t = candidate.offset - e.cur
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				break
+			}
+
+			if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val {
+				// found a regular match
+				t = candidate2.offset - e.cur
+				s++
+				if debugAsserts && s <= t {
+					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+				}
+				if debugAsserts && s-t > e.maxMatchOff {
+					panic("s - t >e.maxMatchOff")
+				}
+				if debugAsserts && t < 0 {
+					panic("t<0")
+				}
+				break
+			}
+			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
+			if s >= sLimit {
+				break encodeLoop
+			}
+			cv = load6432(src, s)
+		}
+		// A 4-byte match has been found. We'll later see if more than 4 bytes.
+		offset2 = offset1
+		offset1 = s - t
+
+		if debugAsserts && s <= t {
+			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
+		}
+
+		// Extend the 4-byte match as long as possible.
+		//l := e.matchlenNoHist(s+4, t+4, src) + 4
+		// l := int32(matchLen(src[s+4:], src[t+4:])) + 4
+		var l int32
+		{
+			a := src[s+4:]
+			b := src[t+4:]
+			endI := len(a) & (math.MaxInt32 - 7)
+			l = int32(endI) + 4
+			for i := 0; i < endI; i += 8 {
+				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+					break
+				}
+			}
+		}
+
+		// Extend backwards
+		tMin := s - e.maxMatchOff
+		if tMin < 0 {
+			tMin = 0
+		}
+		for t > tMin && s > nextEmit && src[t-1] == src[s-1] {
+			s--
+			t--
+			l++
+		}
+
+		// Write our sequence.
+		var seq seq
+		seq.litLen = uint32(s - nextEmit)
+		seq.matchLen = uint32(l - zstdMinMatch)
+		if seq.litLen > 0 {
+			blk.literals = append(blk.literals, src[nextEmit:s]...)
+		}
+		// Don't use repeat offsets
+		seq.offset = uint32(s-t) + 3
+		s += l
+		if debugSequences {
+			println("sequence", seq, "next s:", s)
+		}
+		blk.sequences = append(blk.sequences, seq)
+		nextEmit = s
+		if s >= sLimit {
+			break encodeLoop
+		}
+		cv = load6432(src, s)
+
+		// Check offset 2
+		if o2 := s - offset2; len(blk.sequences) > 2 && load3232(src, o2) == uint32(cv) {
+			// We have at least 4 byte match.
+			// No need to check backwards. We come straight from a match
+			//l := 4 + e.matchlenNoHist(s+4, o2+4, src)
+			// l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
+			var l int32
+			{
+				a := src[s+4:]
+				b := src[o2+4:]
+				endI := len(a) & (math.MaxInt32 - 7)
+				l = int32(endI) + 4
+				for i := 0; i < endI; i += 8 {
+					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
+						break
+					}
+				}
+			}
+
+			// Store this, since we have it.
+			nextHash := hash6(cv, hashLog)
+			e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+			seq.matchLen = uint32(l) - zstdMinMatch
+			seq.litLen = 0
+			// Since litlen is always 0, this is offset 1.
+			seq.offset = 1
+			s += l
+			nextEmit = s
+			if debugSequences {
+				println("sequence", seq, "next s:", s)
+			}
+			blk.sequences = append(blk.sequences, seq)
+
+			// Swap offset 1 and 2.
+			offset1, offset2 = offset2, offset1
+			if s >= sLimit {
+				break encodeLoop
+			}
+			// Prepare next loop.
+			cv = load6432(src, s)
+		}
+	}
+
+	if int(nextEmit) < len(src) {
+		blk.literals = append(blk.literals, src[nextEmit:]...)
+		blk.extraLits = len(src) - int(nextEmit)
+	}
+	if debug {
+		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
+	}
+}
+
+// addBlock appends src to the history buffer and returns the offset in e.hist at which src begins.
+func (e *fastBase) addBlock(src []byte) int32 {
+	if debugAsserts && e.cur > bufferReset {
+		panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset))
+	}
+	// Make room first when src does not fit in the spare capacity of the history.
+	switch needed := len(e.hist) + len(src); {
+	case needed <= cap(e.hist): // enough room already
+	case cap(e.hist) == 0:
+		// First use: allocate twice the window, but at least 1MB.
+		size := e.maxMatchOff * 2
+		if size < 1<<20 {
+			size = 1 << 20
+		}
+		e.hist = make([]byte, 0, size)
+	case cap(e.hist) < int(e.maxMatchOff*2):
+		panic("unexpected buffer size")
+	default:
+		// Keep only the newest maxMatchOff bytes, moved to the front, and advance cur to match.
+		dropped := int32(len(e.hist)) - e.maxMatchOff
+		copy(e.hist[0:e.maxMatchOff], e.hist[dropped:])
+		e.cur += dropped
+		e.hist = e.hist[:e.maxMatchOff]
+	}
+	start := int32(len(e.hist))
+	e.hist = append(e.hist, src...)
+	return start
+}
+
+// UseBlock replaces the encoder's current block with enc. enc is first reset
+// against the previous block (e.blk), which transfers its recent offsets.
+func (e *fastBase) UseBlock(enc *blockEnc) {
+	enc.reset(e.blk)
+	e.blk = enc
+}
+
+func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 {
+	// Extend the match to be as long as possible: hand src from s and src from t to matchLen.
+	return int32(matchLen(src[s:], src[t:]))
+}
+
+// matchlen returns the length reported by matchLen for the tails of src
+// starting at s and t. With debugAsserts enabled it first panics if s or t is
+// negative, if s-t exceeds maxMatchOff, or if more than maxCompressedBlockSize bytes follow s.
+func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
+	if debugAsserts {
+		if s < 0 {
+			panic(fmt.Sprintf("s (%d) < 0", s))
+		}
+		if t < 0 {
+			panic(fmt.Sprintf("t (%d) < 0", t))
+		}
+		if s-t > e.maxMatchOff {
+			panic(fmt.Sprintf("s (%d) - t (%d) > maxMatchOff (%d)", s, t, e.maxMatchOff))
+		}
+		if len(src)-int(s) > maxCompressedBlockSize {
+			panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
+		}
+	}
+
+	// Extend the match to be as long as possible.
+	return int32(matchLen(src[s:], src[t:]))
+}
+
+// Reset prepares the encoder for a new encode by resetting the block encoder,
+// the CRC digest and the history buffer.
+func (e *fastBase) Reset() {
+	if e.blk != nil {
+		e.blk.reset(nil)
+	} else {
+		e.blk = &blockEnc{}
+		e.blk.init()
+	}
+	e.blk.initNewEncode()
+	if e.crc != nil {
+		e.crc.Reset()
+	} else {
+		e.crc = xxhash.New()
+	}
+	if size := e.maxMatchOff * 2; cap(e.hist) < int(size) {
+		// Allocate twice the window, but never less than 1MB.
+		if size < 1<<20 {
+			size = 1 << 20
+		}
+		e.hist = make([]byte, 0, size)
+	}
+	// Advance the position past the window and the old history so that everything
+	// recorded so far is out of reach. At or above bufferReset, cur is not advanced here.
+	if e.cur < bufferReset {
+		e.cur += e.maxMatchOff + int32(len(e.hist))
+	}
+	e.hist = e.hist[:0]
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/enc_params.go b/vendor/github.com/klauspost/compress/zstd/enc_params.go
new file mode 100644
index 00000000000..d874116f715
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/enc_params.go
@@ -0,0 +1,157 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+/*
+// encParams are not really used, just here for reference.
+type encParams struct {
+	// largest match distance : larger == more compression, more memory needed during decompression
+	windowLog uint8
+
+	// fully searched segment : larger == more compression, slower, more memory (useless for fast)
+	chainLog uint8
+
+	//  dispatch table : larger == faster, more memory
+	hashLog uint8
+
+	// < nb of searches : larger == more compression, slower
+	searchLog uint8
+
+	// < match length searched : larger == faster decompression, sometimes less compression
+	minMatch uint8
+
+	// acceptable match size for optimal parser (only) : larger == more compression, slower
+	targetLength uint32
+
+	// see ZSTD_strategy definition above
+	strategy strategy
+}
+
+// strategy defines the algorithm to use when generating sequences.
+type strategy uint8
+
+const (
+	// Compression strategies, listed from fastest to strongest
+	strategyFast strategy = iota + 1
+	strategyDfast
+	strategyGreedy
+	strategyLazy
+	strategyLazy2
+	strategyBtlazy2
+	strategyBtopt
+	strategyBtultra
+	strategyBtultra2
+	// note : new strategies _might_ be added in the future.
+	//   Only the order (from fast to strong) is guaranteed
+
+)
+
+var defEncParams = [4][]encParams{
+	{ // "default" - for any srcSize > 256 KB
+		// W,  C,  H,  S,  L, TL, strat
+		{19, 12, 13, 1, 6, 1, strategyFast},       // base for negative levels
+		{19, 13, 14, 1, 7, 0, strategyFast},       // level  1
+		{20, 15, 16, 1, 6, 0, strategyFast},       // level  2
+		{21, 16, 17, 1, 5, 1, strategyDfast},      // level  3
+		{21, 18, 18, 1, 5, 1, strategyDfast},      // level  4
+		{21, 18, 19, 2, 5, 2, strategyGreedy},     // level  5
+		{21, 19, 19, 3, 5, 4, strategyGreedy},     // level  6
+		{21, 19, 19, 3, 5, 8, strategyLazy},       // level  7
+		{21, 19, 19, 3, 5, 16, strategyLazy2},     // level  8
+		{21, 19, 20, 4, 5, 16, strategyLazy2},     // level  9
+		{22, 20, 21, 4, 5, 16, strategyLazy2},     // level 10
+		{22, 21, 22, 4, 5, 16, strategyLazy2},     // level 11
+		{22, 21, 22, 5, 5, 16, strategyLazy2},     // level 12
+		{22, 21, 22, 5, 5, 32, strategyBtlazy2},   // level 13
+		{22, 22, 23, 5, 5, 32, strategyBtlazy2},   // level 14
+		{22, 23, 23, 6, 5, 32, strategyBtlazy2},   // level 15
+		{22, 22, 22, 5, 5, 48, strategyBtopt},     // level 16
+		{23, 23, 22, 5, 4, 64, strategyBtopt},     // level 17
+		{23, 23, 22, 6, 3, 64, strategyBtultra},   // level 18
+		{23, 24, 22, 7, 3, 256, strategyBtultra2}, // level 19
+		{25, 25, 23, 7, 3, 256, strategyBtultra2}, // level 20
+		{26, 26, 24, 7, 3, 512, strategyBtultra2}, // level 21
+		{27, 27, 25, 9, 3, 999, strategyBtultra2}, // level 22
+	},
+	{ // for srcSize <= 256 KB
+		// W,  C,  H,  S,  L,  T, strat
+		{18, 12, 13, 1, 5, 1, strategyFast},        // base for negative levels
+		{18, 13, 14, 1, 6, 0, strategyFast},        // level  1
+		{18, 14, 14, 1, 5, 1, strategyDfast},       // level  2
+		{18, 16, 16, 1, 4, 1, strategyDfast},       // level  3
+		{18, 16, 17, 2, 5, 2, strategyGreedy},      // level  4.
+		{18, 18, 18, 3, 5, 2, strategyGreedy},      // level  5.
+		{18, 18, 19, 3, 5, 4, strategyLazy},        // level  6.
+		{18, 18, 19, 4, 4, 4, strategyLazy},        // level  7
+		{18, 18, 19, 4, 4, 8, strategyLazy2},       // level  8
+		{18, 18, 19, 5, 4, 8, strategyLazy2},       // level  9
+		{18, 18, 19, 6, 4, 8, strategyLazy2},       // level 10
+		{18, 18, 19, 5, 4, 12, strategyBtlazy2},    // level 11.
+		{18, 19, 19, 7, 4, 12, strategyBtlazy2},    // level 12.
+		{18, 18, 19, 4, 4, 16, strategyBtopt},      // level 13
+		{18, 18, 19, 4, 3, 32, strategyBtopt},      // level 14.
+		{18, 18, 19, 6, 3, 128, strategyBtopt},     // level 15.
+		{18, 19, 19, 6, 3, 128, strategyBtultra},   // level 16.
+		{18, 19, 19, 8, 3, 256, strategyBtultra},   // level 17.
+		{18, 19, 19, 6, 3, 128, strategyBtultra2},  // level 18.
+		{18, 19, 19, 8, 3, 256, strategyBtultra2},  // level 19.
+		{18, 19, 19, 10, 3, 512, strategyBtultra2}, // level 20.
+		{18, 19, 19, 12, 3, 512, strategyBtultra2}, // level 21.
+		{18, 19, 19, 13, 3, 999, strategyBtultra2}, // level 22.
+	},
+	{ // for srcSize <= 128 KB
+		// W,  C,  H,  S,  L,  T, strat
+		{17, 12, 12, 1, 5, 1, strategyFast},        // base for negative levels
+		{17, 12, 13, 1, 6, 0, strategyFast},        // level  1
+		{17, 13, 15, 1, 5, 0, strategyFast},        // level  2
+		{17, 15, 16, 2, 5, 1, strategyDfast},       // level  3
+		{17, 17, 17, 2, 4, 1, strategyDfast},       // level  4
+		{17, 16, 17, 3, 4, 2, strategyGreedy},      // level  5
+		{17, 17, 17, 3, 4, 4, strategyLazy},        // level  6
+		{17, 17, 17, 3, 4, 8, strategyLazy2},       // level  7
+		{17, 17, 17, 4, 4, 8, strategyLazy2},       // level  8
+		{17, 17, 17, 5, 4, 8, strategyLazy2},       // level  9
+		{17, 17, 17, 6, 4, 8, strategyLazy2},       // level 10
+		{17, 17, 17, 5, 4, 8, strategyBtlazy2},     // level 11
+		{17, 18, 17, 7, 4, 12, strategyBtlazy2},    // level 12
+		{17, 18, 17, 3, 4, 12, strategyBtopt},      // level 13.
+		{17, 18, 17, 4, 3, 32, strategyBtopt},      // level 14.
+		{17, 18, 17, 6, 3, 256, strategyBtopt},     // level 15.
+		{17, 18, 17, 6, 3, 128, strategyBtultra},   // level 16.
+		{17, 18, 17, 8, 3, 256, strategyBtultra},   // level 17.
+		{17, 18, 17, 10, 3, 512, strategyBtultra},  // level 18.
+		{17, 18, 17, 5, 3, 256, strategyBtultra2},  // level 19.
+		{17, 18, 17, 7, 3, 512, strategyBtultra2},  // level 20.
+		{17, 18, 17, 9, 3, 512, strategyBtultra2},  // level 21.
+		{17, 18, 17, 11, 3, 999, strategyBtultra2}, // level 22.
+	},
+	{ // for srcSize <= 16 KB
+		// W,  C,  H,  S,  L,  T, strat
+		{14, 12, 13, 1, 5, 1, strategyFast},        // base for negative levels
+		{14, 14, 15, 1, 5, 0, strategyFast},        // level  1
+		{14, 14, 15, 1, 4, 0, strategyFast},        // level  2
+		{14, 14, 15, 2, 4, 1, strategyDfast},       // level  3
+		{14, 14, 14, 4, 4, 2, strategyGreedy},      // level  4
+		{14, 14, 14, 3, 4, 4, strategyLazy},        // level  5.
+		{14, 14, 14, 4, 4, 8, strategyLazy2},       // level  6
+		{14, 14, 14, 6, 4, 8, strategyLazy2},       // level  7
+		{14, 14, 14, 8, 4, 8, strategyLazy2},       // level  8.
+		{14, 15, 14, 5, 4, 8, strategyBtlazy2},     // level  9.
+		{14, 15, 14, 9, 4, 8, strategyBtlazy2},     // level 10.
+		{14, 15, 14, 3, 4, 12, strategyBtopt},      // level 11.
+		{14, 15, 14, 4, 3, 24, strategyBtopt},      // level 12.
+		{14, 15, 14, 5, 3, 32, strategyBtultra},    // level 13.
+		{14, 15, 15, 6, 3, 64, strategyBtultra},    // level 14.
+		{14, 15, 15, 7, 3, 256, strategyBtultra},   // level 15.
+		{14, 15, 15, 5, 3, 48, strategyBtultra2},   // level 16.
+		{14, 15, 15, 6, 3, 128, strategyBtultra2},  // level 17.
+		{14, 15, 15, 7, 3, 256, strategyBtultra2},  // level 18.
+		{14, 15, 15, 8, 3, 256, strategyBtultra2},  // level 19.
+		{14, 15, 15, 8, 3, 512, strategyBtultra2},  // level 20.
+		{14, 15, 15, 9, 3, 512, strategyBtultra2},  // level 21.
+		{14, 15, 15, 10, 3, 999, strategyBtultra2}, // level 22.
+	},
+}
+*/
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder.go b/vendor/github.com/klauspost/compress/zstd/encoder.go
new file mode 100644
index 00000000000..af4f00b7342
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/encoder.go
@@ -0,0 +1,555 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"crypto/rand"
+	"fmt"
+	"io"
+	rdebug "runtime/debug"
+	"sync"
+
+	"github.com/klauspost/compress/zstd/internal/xxhash"
+)
+
+// Encoder provides encoding to Zstandard.
+// An Encoder can be used for either compressing a stream via the
+// io.WriteCloser interface supported by the Encoder or as multiple independent
+// tasks via the EncodeAll function.
+// Smaller encodes are encouraged to use the EncodeAll function.
+// Use NewWriter to create a new instance.
+type Encoder struct {
+	o        encoderOptions // options collected by NewWriter
+	encoders chan encoder   // encoders handed out to EncodeAll calls; filled by initialize
+	state    encoderState   // state of the streaming API (Write, ReadFrom, Flush, Close)
+	init     sync.Once      // runs initialize on the first non-empty EncodeAll
+}
+
+type encoder interface { // implemented by the values encoderOptions.encoder returns
+	Encode(blk *blockEnc, src []byte)       // used for streamed blocks and for multi-block input in EncodeAll
+	EncodeNoHist(blk *blockEnc, src []byte) // used by EncodeAll when the input fits in a single block
+	Block() *blockEnc
+	CRC() *xxhash.Digest     // digest that is fed the uncompressed input when CRC is enabled
+	AppendCRC([]byte) []byte // its result is written after the last block when CRC is enabled
+	WindowSize(size int) int32
+	UseBlock(*blockEnc)
+	Reset()
+}
+
+type encoderState struct {
+	w                io.Writer // destination of the stream; set by Reset
+	filling          []byte    // input currently being buffered
+	current          []byte    // block most recently handed to the encoder
+	previous         []byte    // block before current; recycled as the next filling buffer
+	encoder          encoder
+	writing          *blockEnc // block most recently handed to the write goroutine
+	err              error     // error from encoding or synchronous writes; checked at the start of nextBlock
+	writeErr         error     // error recorded by the background write goroutine
+	nWritten         int64     // bytes written to w so far; Close uses it to size the padding
+	headerWritten    bool      // the frame header (or a complete frame) has been written
+	eofWritten       bool      // a block flagged as last has been produced
+	fullFrameWritten bool      // the whole stream was emitted as a single EncodeAll frame
+
+	// This waitgroup indicates an encode is running.
+	wg sync.WaitGroup
+	// This waitgroup indicates we have a block encoding/writing.
+	wWg sync.WaitGroup
+}
+
+// NewWriter will create a new Zstandard encoder.
+// If the encoder will be used for encoding blocks a nil writer can be used.
+func NewWriter(w io.Writer, opts ...EOption) (*Encoder, error) {
+	initPredefined()
+	var e Encoder
+	e.o.setDefault()
+	for _, o := range opts {
+		err := o(&e.o)
+		if err != nil {
+			return nil, err
+		}
+	}
+	if w != nil {
+		e.Reset(w)
+	}
+	return &e, nil
+}
+
+func (e *Encoder) initialize() { // fills the encoder pool; run once through e.init in EncodeAll
+	if e.o.concurrent == 0 { // options were never populated (presumably a zero-value Encoder): use defaults
+		e.o.setDefault()
+	}
+	e.encoders = make(chan encoder, e.o.concurrent)
+	for i := 0; i < e.o.concurrent; i++ { // one encoder per allowed concurrent EncodeAll
+		e.encoders <- e.o.encoder()
+	}
+}
+
+// Reset waits for any running encode and write to finish and then re-initializes
+// the stream state, so that new writes are encoded to w as a new, independent stream.
+func (e *Encoder) Reset(w io.Writer) {
+	st := &e.state
+	st.wg.Wait()
+	st.wWg.Wait()
+
+	// Allocate each block buffer on first use only; afterwards just truncate it.
+	for _, buf := range []*[]byte{&st.filling, &st.current, &st.previous} {
+		if cap(*buf) == 0 {
+			*buf = make([]byte, 0, e.o.blockSize)
+		}
+		*buf = (*buf)[:0]
+	}
+
+	// Create the encoder and the write block lazily, then reset both.
+	if st.encoder == nil {
+		st.encoder = e.o.encoder()
+	}
+	if st.writing == nil {
+		st.writing = &blockEnc{}
+		st.writing.init()
+	}
+	st.writing.initNewEncode()
+	st.encoder.Reset()
+
+	// Clear the per-stream bookkeeping and point the state at the new writer.
+	st.w = w
+	st.nWritten = 0
+	st.err = nil
+	st.writeErr = nil
+	st.headerWritten = false
+	st.eofWritten = false
+	st.fullFrameWritten = false
+}
+
+// Write buffers p and, every time a full block of e.o.blockSize bytes has
+// been collected, hands that block to nextBlock for compression and output.
+// It returns the number of bytes of p consumed and any error from nextBlock.
+// When done writing, use Close to flush the remaining output
+// and write CRC if requested.
+func (e *Encoder) Write(p []byte) (n int, err error) {
+	s := &e.state
+	for len(p) > 0 {
+		if len(p)+len(s.filling) < e.o.blockSize { // p does not complete a block: buffer it all and return
+			if e.o.crc {
+				_, _ = s.encoder.CRC().Write(p) // result deliberately discarded
+			}
+			s.filling = append(s.filling, p...)
+			return n + len(p), nil
+		}
+		add := p
+		if len(p)+len(s.filling) > e.o.blockSize {
+			add = add[:e.o.blockSize-len(s.filling)] // take only what is needed to fill the block
+		}
+		if e.o.crc {
+			_, _ = s.encoder.CRC().Write(add) // result deliberately discarded
+		}
+		s.filling = append(s.filling, add...)
+		p = p[len(add):]
+		n += len(add)
+		if len(s.filling) < e.o.blockSize {
+			return n, nil
+		}
+		err := e.nextBlock(false) // block is full: start compressing it
+		if err != nil {
+			return n, err
+		}
+		if debugAsserts && len(s.filling) > 0 { // nextBlock is expected to leave filling empty
+			panic(len(s.filling))
+		}
+	}
+	return n, nil
+}
+
+// nextBlock waits for the running encode, then starts compressing e.state.filling in a goroutine; final marks the last block.
+// If an error has occurred during encoding it will be returned.
+func (e *Encoder) nextBlock(final bool) error {
+	s := &e.state
+	// Wait for current block.
+	s.wg.Wait()
+	if s.err != nil {
+		return s.err
+	}
+	if len(s.filling) > e.o.blockSize {
+		return fmt.Errorf("block > maxStoreBlockSize")
+	}
+	if !s.headerWritten {
+		// If we have a single block encode, do a sync compression.
+		if final && len(s.filling) > 0 {
+			s.current = e.EncodeAll(s.filling, s.current[:0])
+			var n2 int
+			n2, s.err = s.w.Write(s.current)
+			if s.err != nil {
+				return s.err
+			}
+			s.nWritten += int64(n2)
+			s.current = s.current[:0]
+			s.filling = s.filling[:0]
+			s.headerWritten = true
+			s.fullFrameWritten = true // Close returns early when this is set
+			return nil
+		}
+
+		var tmp [maxHeaderSize]byte
+		fh := frameHeader{
+			ContentSize:   0,
+			WindowSize:    uint32(s.encoder.WindowSize(0)),
+			SingleSegment: false,
+			Checksum:      e.o.crc,
+			DictID:        0,
+		}
+		dst, err := fh.appendTo(tmp[:0])
+		if err != nil {
+			return err
+		}
+		s.headerWritten = true
+		s.wWg.Wait() // wait for the pending write goroutine before writing to s.w here
+		var n2 int
+		n2, s.err = s.w.Write(dst)
+		if s.err != nil {
+			return s.err
+		}
+		s.nWritten += int64(n2)
+	}
+	if s.eofWritten {
+		// Ensure we only write it once.
+		final = false
+	}
+
+	if len(s.filling) == 0 {
+		// Final block, but no data.
+		if final {
+			enc := s.encoder
+			blk := enc.Block()
+			blk.reset(nil)
+			blk.last = true
+			blk.encodeRaw(nil) // an empty raw block flagged as last
+			s.wWg.Wait()
+			_, s.err = s.w.Write(blk.output)
+			s.nWritten += int64(len(blk.output))
+			s.eofWritten = true
+		}
+		return s.err
+	}
+
+	// Move blocks forward: filling becomes current, current becomes previous, old previous is reused for filling.
+	s.filling, s.current, s.previous = s.previous[:0], s.filling, s.current
+	s.wg.Add(1)
+	go func(src []byte) {
+		if debug {
+			println("Adding block,", len(src), "bytes, final:", final)
+		}
+		defer func() {
+			if r := recover(); r != nil { // a panic while encoding is turned into s.err
+				s.err = fmt.Errorf("panic while encoding: %v", r)
+				rdebug.PrintStack()
+			}
+			s.wg.Done()
+		}()
+		enc := s.encoder
+		blk := enc.Block()
+		enc.Encode(blk, src)
+		blk.last = final
+		if final {
+			s.eofWritten = true
+		}
+		// Wait for pending writes.
+		s.wWg.Wait()
+		if s.writeErr != nil {
+			s.err = s.writeErr
+			return
+		}
+		// Transfer encoders from previous write block.
+		blk.swapEncoders(s.writing)
+		// Transfer recent offsets to next.
+		enc.UseBlock(s.writing)
+		s.writing = blk
+		s.wWg.Add(1)
+		go func() {
+			defer func() {
+				if r := recover(); r != nil { // a panic while writing is turned into s.writeErr
+					s.writeErr = fmt.Errorf("panic while encoding/writing: %v", r)
+					rdebug.PrintStack()
+				}
+				s.wWg.Done()
+			}()
+			err := errIncompressible
+			// If we got the exact same number of literals as input,
+			// assume the literals cannot be compressed.
+			if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
+				err = blk.encode(e.o.noEntropy)
+			}
+			switch err {
+			case errIncompressible:
+				if debug {
+					println("Storing incompressible block as raw")
+				}
+				blk.encodeRaw(src)
+				// In fast mode, we do not transfer offsets, so we don't have to deal with changing them.
+			case nil:
+			default:
+				s.writeErr = err
+				return
+			}
+			_, s.writeErr = s.w.Write(blk.output)
+			s.nWritten += int64(len(blk.output))
+		}()
+	}(s.current)
+	return nil
+}
+
+// ReadFrom reads data from r until EOF or error.
+// The return value n is the number of bytes read.
+// Any error except io.EOF encountered during the read is also returned.
+// On EOF the data read so far is submitted as the final block via nextBlock(true).
+// The Copy function uses ReaderFrom if available.
+func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) {
+	if debug {
+		println("Using ReadFrom")
+	}
+	// NOTE(review): filling is re-sliced to a full block and read into from its start, so input already queued by Write looks overwritten; confirm.
+	e.state.filling = e.state.filling[:e.o.blockSize]
+	src := e.state.filling
+	for {
+		n2, err := r.Read(src)
+		if e.o.crc {
+			_, _ = e.state.encoder.CRC().Write(src[:n2]) // result deliberately discarded
+		}
+		// src is now the unfilled part...
+		src = src[n2:]
+		n += int64(n2)
+		switch err {
+		case io.EOF:
+			e.state.filling = e.state.filling[:len(e.state.filling)-len(src)] // trim to the bytes actually read
+			if debug {
+				println("ReadFrom: got EOF final block:", len(e.state.filling))
+			}
+			return n, e.nextBlock(true)
+		default:
+			if debug {
+				println("ReadFrom: got error:", err)
+			}
+			e.state.err = err
+			return n, err
+		case nil:
+		}
+		if len(src) > 0 { // block not full yet: keep reading into the remainder
+			if debug {
+				println("ReadFrom: got space left in source:", len(src))
+			}
+			continue
+		}
+		err = e.nextBlock(false)
+		if err != nil {
+			return n, err
+		}
+		e.state.filling = e.state.filling[:e.o.blockSize] // nextBlock swapped buffers; expose a full block again
+		src = e.state.filling
+	}
+}
+
+// Flush compresses any buffered input as a non-final block and then blocks
+// until the pending encode and write have completed. It returns the encoding
+// error if one is recorded, otherwise the write error. Only use it when pushing queued data is critical.
+func (e *Encoder) Flush() error {
+	st := &e.state
+	if len(st.filling) != 0 {
+		if err := e.nextBlock(false); err != nil {
+			return err
+		}
+	}
+	// Wait for the encode and write stages before reading the error fields.
+	st.wg.Wait()
+	st.wWg.Wait()
+	if err := st.err; err != nil {
+		return err
+	}
+	return st.writeErr
+}
+
+// Close submits the remaining input as the final block, waits until everything
+// has been written, and then appends the CRC and padding when those are enabled.
+// The Encoder can still be re-used after calling this.
+func (e *Encoder) Close() error {
+	s := &e.state
+	if s.encoder == nil { // the stream state was never set up by Reset: nothing to close
+		return nil
+	}
+	err := e.nextBlock(true)
+	if err != nil {
+		return err
+	}
+	if e.state.fullFrameWritten { // nextBlock already emitted a complete frame through EncodeAll
+		return s.err
+	}
+	s.wg.Wait()
+	s.wWg.Wait()
+
+	if s.err != nil {
+		return s.err
+	}
+	if s.writeErr != nil {
+		return s.writeErr
+	}
+
+	// Write CRC
+	if e.o.crc && s.err == nil {
+		// heap alloc.
+		var tmp [4]byte
+		_, s.err = s.w.Write(s.encoder.AppendCRC(tmp[:0]))
+		s.nWritten += 4
+	}
+
+	// Add padding with content from crypto/rand.Reader
+	if s.err == nil && e.o.pad > 0 {
+		add := calcSkippableFrame(s.nWritten, int64(e.o.pad))
+		frame, err := skippableFrame(s.filling[:0], add, rand.Reader) // reuses the filling buffer as scratch space
+		if err != nil {
+			return err
+		}
+		_, s.err = s.w.Write(frame)
+	}
+	return s.err
+}
+
+// EncodeAll will encode all input in src and append it to dst.
+// This function can be called concurrently, but each call will only run on a single goroutine.
+// If empty input is given, nothing is returned, unless WithZeroFrames is specified.
+// Encoded blocks can be concatenated and the result will be the combined input stream.
+// Data compressed with EncodeAll can be decoded with the Decoder,
+// using either a stream or DecodeAll. Internal encoding errors cause a panic rather than an error return.
+func (e *Encoder) EncodeAll(src, dst []byte) []byte {
+	if len(src) == 0 {
+		if e.o.fullZero {
+			// Add frame header.
+			fh := frameHeader{
+				ContentSize:   0,
+				WindowSize:    MinWindowSize,
+				SingleSegment: true,
+				// Adding a checksum would be a waste of space.
+				Checksum: false,
+				DictID:   0,
+			}
+			dst, _ = fh.appendTo(dst)
+
+			// Write raw block as last one only.
+			var blk blockHeader
+			blk.setSize(0)
+			blk.setType(blockTypeRaw)
+			blk.setLast(true)
+			dst = blk.appendTo(dst)
+		}
+		return dst
+	}
+	e.init.Do(e.initialize)
+	enc := <-e.encoders // blocks until one of the pooled encoders is free
+	defer func() {
+		// Release encoder reference to last block.
+		enc.Reset()
+		e.encoders <- enc
+	}()
+	enc.Reset()
+	blk := enc.Block()
+	// Use single segments when above minimum window and below 1MB.
+	single := len(src) < 1<<20 && len(src) > MinWindowSize
+	if e.o.single != nil { // an explicit option overrides the size-based choice
+		single = *e.o.single
+	}
+	fh := frameHeader{
+		ContentSize:   uint64(len(src)),
+		WindowSize:    uint32(enc.WindowSize(len(src))),
+		SingleSegment: single,
+		Checksum:      e.o.crc,
+		DictID:        0,
+	}
+
+	// If less than 1MB, allocate a buffer up front.
+	if len(dst) == 0 && cap(dst) == 0 && len(src) < 1<<20 {
+		dst = make([]byte, 0, len(src))
+	}
+	dst, err := fh.appendTo(dst)
+	if err != nil {
+		panic(err)
+	}
+
+	if len(src) <= e.o.blockSize && len(src) <= maxBlockSize {
+		// Slightly faster with no history and everything in one block.
+		if e.o.crc {
+			_, _ = enc.CRC().Write(src) // result deliberately discarded
+		}
+		blk.reset(nil)
+		blk.last = true
+		enc.EncodeNoHist(blk, src)
+
+		// If we got the exact same number of literals as input,
+		// assume the literals cannot be compressed.
+		err := errIncompressible
+		oldout := blk.output // saved so the block's own output buffer can be restored below
+		if len(blk.literals) != len(src) || len(src) != e.o.blockSize {
+			// Output directly to dst
+			blk.output = dst
+			err = blk.encode(e.o.noEntropy)
+		}
+
+		switch err {
+		case errIncompressible:
+			if debug {
+				println("Storing incompressible block as raw")
+			}
+			dst = blk.encodeRawTo(dst, src)
+		case nil:
+			dst = blk.output
+		default:
+			panic(err)
+		}
+		blk.output = oldout
+	} else {
+		for len(src) > 0 { // encode src one blockSize-sized chunk at a time
+			todo := src
+			if len(todo) > e.o.blockSize {
+				todo = todo[:e.o.blockSize]
+			}
+			src = src[len(todo):]
+			if e.o.crc {
+				_, _ = enc.CRC().Write(todo) // result deliberately discarded
+			}
+			blk.reset(nil)
+			blk.pushOffsets() // paired with popOffsets below when the block ends up stored raw
+			enc.Encode(blk, todo)
+			if len(src) == 0 {
+				blk.last = true
+			}
+			err := errIncompressible
+			// If we got the exact same number of literals as input,
+			// assume the literals cannot be compressed.
+			if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize {
+				err = blk.encode(e.o.noEntropy)
+			}
+
+			switch err {
+			case errIncompressible:
+				if debug {
+					println("Storing incompressible block as raw")
+				}
+				dst = blk.encodeRawTo(dst, todo)
+				blk.popOffsets()
+			case nil:
+				dst = append(dst, blk.output...)
+			default:
+				panic(err)
+			}
+		}
+	}
+	if e.o.crc {
+		dst = enc.AppendCRC(dst)
+	}
+	// Add padding with content from crypto/rand.Reader
+	if e.o.pad > 0 {
+		add := calcSkippableFrame(int64(len(dst)), int64(e.o.pad))
+		dst, err = skippableFrame(dst, add, rand.Reader)
+		if err != nil {
+			panic(err)
+		}
+	}
+	return dst
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/encoder_options.go b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
new file mode 100644
index 00000000000..3fc03097a67
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/encoder_options.go
@@ -0,0 +1,249 @@
+package zstd
+
+import (
+	"errors"
+	"fmt"
+	"runtime"
+	"strings"
+)
+
+// EOption is an option for creating a encoder.
+type EOption func(*encoderOptions) error
+
+// encoderOptions retains accumulated state of multiple options.
+type encoderOptions struct {
+	concurrent   int          // number of encoders created for EncodeAll
+	level        EncoderLevel // selects the encoder implementation
+	single       *bool        // when non-nil, overrides the SingleSegment choice in EncodeAll
+	pad          int          // pad output to a multiple of this size; values <= 0 disable padding
+	blockSize    int          // maximum number of input bytes per block
+	windowSize   int          // maximum back-reference distance
+	crc          bool         // add a CRC to the output
+	fullZero     bool         // emit a frame even when EncodeAll gets empty input
+	noEntropy    bool         // passed through to blockEnc.encode
+	customWindow bool         // set once WithWindowSize has been applied
+}
+
+// setDefault overwrites o with the default encoder options.
+func (o *encoderOptions) setDefault() {
+	*o = encoderOptions{
+		concurrent: runtime.GOMAXPROCS(0),
+		level:      SpeedDefault,
+		blockSize:  1 << 16,
+		windowSize: 8 << 20,
+		crc:        true,
+		single:     nil,
+	}
+}
+
+// encoder returns a new encoder for the selected level, using the window size as its maximum match offset.
+func (o encoderOptions) encoder() encoder {
+	switch maxOff := int32(o.windowSize); o.level {
+	case SpeedFastest:
+		return &fastEncoder{fastBase: fastBase{maxMatchOff: maxOff}}
+	case SpeedDefault:
+		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: maxOff}}}
+	case SpeedBetterCompression:
+		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: maxOff}}
+	}
+	panic("unknown compression level")
+}
+
+// WithEncoderCRC controls whether a CRC value is added to the output.
+// When enabled, output will be 4 bytes larger.
+func WithEncoderCRC(b bool) EOption {
+	return func(o *encoderOptions) error { o.crc = b; return nil }
+}
+
+// WithEncoderConcurrency will set the concurrency,
+// meaning the maximum number of encoders to run concurrently.
+// The value supplied must be at least 1.
+// By default this will be set to GOMAXPROCS.
+func WithEncoderConcurrency(n int) EOption {
+	return func(o *encoderOptions) error {
+		if n <= 0 {
+			return fmt.Errorf("concurrency must be at least 1")
+		}
+		o.concurrent = n
+		return nil
+	}
+}
+
+// WithWindowSize sets the maximum allowed back-reference distance to n.
+// n must be a power of two between MinWindowSize and MaxWindowSize.
+// A larger value will enable better compression but allocate more memory and,
+// for above-default values, take considerably longer.
+// The default value is determined by the compression level.
+func WithWindowSize(n int) EOption {
+	return func(o *encoderOptions) error {
+		if n < MinWindowSize {
+			return fmt.Errorf("window size must be at least %d", MinWindowSize)
+		}
+		if n > MaxWindowSize {
+			return fmt.Errorf("window size must be at most %d", MaxWindowSize)
+		}
+		if n&(n-1) != 0 {
+			return errors.New("window size must be a power of 2")
+		}
+		o.windowSize, o.customWindow = n, true
+		// Shrink the block size when it would exceed the new window.
+		if o.blockSize > n {
+			o.blockSize = n
+		}
+		return nil
+	}
+}
+
+// WithEncoderPadding will add padding to all output so the size will be a multiple of n.
+// This can be used to obfuscate the exact output size or make blocks of a certain size.
+// The contents will be a skippable frame, so it will be invisible by the decoder.
+// n must be > 0 and <= 1GB, 1<<30 bytes. n == 1 disables padding, as every size is a multiple of 1.
+// The padded area will be filled with data from crypto/rand.Reader.
+// If `EncodeAll` is used with data already in the destination, the total size will be multiple of this.
+func WithEncoderPadding(n int) EOption {
+	return func(o *encoderOptions) error {
+		switch {
+		case n <= 0:
+			return fmt.Errorf("padding must be at least 1")
+		case n > 1<<30:
+			return fmt.Errorf("padding must less than 1GB (1<<30 bytes) ")
+		case n == 1:
+			// No need to waste our time: record "no padding" and return before o.pad = n can overwrite it.
+			o.pad = 0
+			return nil
+		}
+		o.pad = n
+		return nil
+	}
+}
+
+// EncoderLevel predefines encoder compression levels.
+// Only use the constants made available, since the actual mapping
+// of these values is very likely to change and your compression could change
+// unpredictably when upgrading the library.
+type EncoderLevel int
+
+const (
+	speedNotSet EncoderLevel = iota
+
+	// SpeedFastest will choose the fastest reasonable compression.
+	// This is roughly equivalent to the fastest Zstandard mode.
+	SpeedFastest
+
+	// SpeedDefault is the default "pretty fast" compression option.
+	// This is roughly equivalent to the default Zstandard mode (level 3).
+	SpeedDefault
+
+	// SpeedBetterCompression will yield better compression than the default.
+	// Currently it is about zstd level 7-8 with ~ 2x-3x the default CPU usage.
+	// By using this, notice that CPU usage may go up in the future.
+	SpeedBetterCompression
+
+	// speedLast should be kept as the last actual compression option.
+	// This is not for external usage, but is used to keep track of the valid options.
+	speedLast
+
+	// SpeedBestCompression will choose the best available compression option.
+	// For now this is not implemented separately; it is an alias of SpeedBetterCompression.
+	SpeedBestCompression = SpeedBetterCompression
+)
+
+// EncoderLevelFromString parses the name of a compression level, as produced by
+// EncoderLevel.String, ignoring case.
+// Unrecognized names yield (false, SpeedDefault).
+func EncoderLevelFromString(s string) (bool, EncoderLevel) {
+	for level := speedNotSet + 1; level < speedLast; level++ {
+		if name := level.String(); strings.EqualFold(name, s) {
+			return true, level
+		}
+	}
+	return false, SpeedDefault
+}
+
+// EncoderLevelFromZstd maps a zstd compression level to the encoder level with the
+// closest matching compression ratio: below 3 is SpeedFastest, 3 to 5 is
+// SpeedDefault and 6 or above is SpeedBetterCompression.
+// Many input values map to the same encoder level.
+func EncoderLevelFromZstd(level int) EncoderLevel {
+	if level < 3 {
+		return SpeedFastest
+	}
+	if level > 5 {
+		return SpeedBetterCompression
+	}
+	// Levels 3 through 5 remain.
+	return SpeedDefault
+}
+
+// String returns the name of the compression level, "invalid" if it is unknown.
+func (e EncoderLevel) String() string {
+	if e == SpeedFastest {
+		return "fastest"
+	}
+	if e == SpeedDefault {
+		return "default"
+	}
+	if e == SpeedBetterCompression {
+		return "better"
+	}
+	return "invalid"
+}
+
+// WithEncoderLevel specifies a predefined compression level. Unless a window size
+// was previously set with WithWindowSize, the level also selects the window size.
+func WithEncoderLevel(l EncoderLevel) EOption {
+	return func(opts *encoderOptions) error {
+		if l <= speedNotSet || l >= speedLast {
+			return fmt.Errorf("unknown encoder level")
+		}
+		opts.level = l
+		if opts.customWindow {
+			return nil
+		}
+		switch l {
+		case SpeedFastest:
+			opts.windowSize = 4 << 20
+		case SpeedDefault:
+			opts.windowSize = 8 << 20
+		case SpeedBetterCompression:
+			opts.windowSize = 16 << 20
+		}
+		return nil
+	}
+}
+
+// WithZeroFrames controls whether 0 length input is encoded as a full frame.
+// This can be needed for compatibility with other zstandard users,
+// but this package does not need it.
+func WithZeroFrames(b bool) EOption {
+	return func(opts *encoderOptions) error {
+		opts.fullZero = b
+		return nil
+	}
+}
+
+// WithNoEntropyCompression makes the encoder always skip entropy compression of
+// literals. This can help when content has matches but is unlikely to gain from
+// entropy coding. Usually the slight speed improvement is not worth enabling this.
+func WithNoEntropyCompression(b bool) EOption {
+	return func(opts *encoderOptions) error {
+		opts.noEntropy = b
+		return nil
+	}
+}
+
+// WithSingleSegment sets the "single segment" flag for frames written by EncodeAll.
+// With the flag set, data must be regenerated within a single continuous memory segment.
+// The Window_Descriptor byte is then skipped, but Frame_Content_Size is always present.
+// As a consequence, the decoder must allocate a memory segment at least as large as the content.
+// To protect decoders from unreasonable memory requirements,
+// a decoder may reject a compressed frame which requests more memory than its authorized range.
+// For broader compatibility, decoders are recommended to support memory sizes of at least 8 MB.
+// This is only a recommendation; each decoder may support higher or lower limits.
+// If this option is not given, block encodes choose the flag automatically from the input size.
+// This setting has no effect on streamed encodes.
+func WithSingleSegment(b bool) EOption {
+	return func(opts *encoderOptions) error {
+		opts.single = &b
+		return nil
+	}
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/framedec.go b/vendor/github.com/klauspost/compress/zstd/framedec.go
new file mode 100644
index 00000000000..780880ebe44
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/framedec.go
@@ -0,0 +1,495 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"bytes"
+	"encoding/hex"
+	"errors"
+	"hash"
+	"io"
+	"sync"
+
+	"github.com/klauspost/compress/zstd/internal/xxhash"
+)
+
+type frameDec struct { // frameDec holds the state for decoding one zstd frame.
+	o      decoderOptions // options given to newFrameDec
+	crc    hash.Hash64    // checksum state, allocated in reset when the frame carries a checksum
+	offset int64          // running output offset, only advanced when debug is enabled
+
+	WindowSize uint64 // window size from the frame header, set by reset
+
+	// maxWindowSize is the maximum window size to support.
+	// It should never be bigger than max-int.
+	maxWindowSize uint64
+
+	// In-order queue of blocks being decoded.
+	decoding chan *blockDec
+
+	// Frame history passed between blocks.
+	history history
+
+	rawInput byteBuffer // input the frame is read from, stored by reset
+
+	// Byte buffer that can be reused for small input blocks.
+	bBuf byteBuf
+
+	FrameContentSize uint64         // Frame_Content_Size header field, 0 if absent
+	frameDone        sync.WaitGroup // Done is called when startDecoder returns
+
+	DictionaryID  uint32 // Dictionary_ID header field, 0 if absent
+	HasCheckSum   bool   // frame header has the checksum flag (bit 2) set
+	SingleSegment bool   // frame header has the single segment flag (bit 5) set
+
+	// asyncRunning indicates whether the async routine processes input on 'decoding'.
+	asyncRunningMu sync.Mutex // guards asyncRunning
+	asyncRunning   bool
+}
+
+const (
+	// MinWindowSize is the minimum Window_Size, 1 KB.
+	MinWindowSize = 1 << 10
+	MaxWindowSize = 1 << 29 // MaxWindowSize is the largest supported window, 512 MB.
+)
+
+var (
+	frameMagic          = []byte{0x28, 0xb5, 0x2f, 0xfd} // magic number of a regular frame, in stream order
+	skippableFrameMagic = []byte{0x2a, 0x4d, 0x18}       // bytes 1-3 of a skippable frame magic; byte 0 is 0x50-0x5f
+)
+
+// newFrameDec returns a frame decoder configured with o.
+// The window limit is MaxWindowSize, lowered to o.maxDecodedSize
+// when that is smaller.
+func newFrameDec(o decoderOptions) *frameDec {
+	limit := uint64(MaxWindowSize)
+	if o.maxDecodedSize < limit {
+		limit = o.maxDecodedSize
+	}
+	return &frameDec{o: o, maxWindowSize: limit}
+}
+
+// reset reads the frame header from br, skipping any leading skippable frames,
+// and prepares d for block decoding.
+// If nothing can be read from the input, io.EOF is returned.
+// Any other error indicates that the stream contained data, but there was a problem.
+func (d *frameDec) reset(br byteBuffer) error {
+	d.HasCheckSum = false
+	d.WindowSize = 0
+	var b []byte
+	for {
+		b = br.readSmall(4)
+		if b == nil {
+			return io.EOF
+		}
+		if !bytes.Equal(b[1:4], skippableFrameMagic) || b[0]&0xf0 != 0x50 {
+			if debug {
+				println("Not skippable", hex.EncodeToString(b), hex.EncodeToString(skippableFrameMagic))
+			}
+			// Break if not skippable frame.
+			break
+		}
+		// Read the little-endian 32-bit size to skip.
+		b = br.readSmall(4)
+		if b == nil {
+			println("Reading Frame Size EOF")
+			return io.ErrUnexpectedEOF
+		}
+		n := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+		println("Skipping frame with", n, "bytes.")
+		err := br.skipN(int(n))
+		if err != nil {
+			if debug {
+				println("Reading discarded frame", err)
+			}
+			return err
+		}
+	}
+	if !bytes.Equal(b, frameMagic) {
+		println("Got magic numbers: ", b, "want:", frameMagic)
+		return ErrMagicMismatch
+	}
+
+	// Read Frame_Header_Descriptor
+	fhd, err := br.readByte()
+	if err != nil {
+		println("Reading Frame_Header_Descriptor", err)
+		return err
+	}
+	d.SingleSegment = fhd&(1<<5) != 0
+
+	if fhd&(1<<3) != 0 {
+		return errors.New("Reserved bit set on frame header")
+	}
+
+	// Read Window_Descriptor (only present when the single segment flag is clear)
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#window_descriptor
+	d.WindowSize = 0
+	if !d.SingleSegment {
+		wd, err := br.readByte()
+		if err != nil {
+			println("Reading Window_Descriptor", err)
+			return err
+		}
+		printf("raw: %x, mantissa: %d, exponent: %d\n", wd, wd&7, wd>>3)
+		windowLog := 10 + (wd >> 3)
+		windowBase := uint64(1) << windowLog
+		windowAdd := (windowBase / 8) * uint64(wd&0x7)
+		d.WindowSize = windowBase + windowAdd
+	}
+
+	// Read Dictionary_ID (the low two descriptor bits select a 0, 1, 2 or 4 byte field)
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
+	d.DictionaryID = 0
+	if size := fhd & 3; size != 0 {
+		if size == 3 {
+			size = 4
+		}
+		b = br.readSmall(int(size))
+		if b == nil {
+			if debug {
+				println("Reading Dictionary_ID", io.ErrUnexpectedEOF)
+			}
+			return io.ErrUnexpectedEOF
+		}
+		switch size {
+		case 1:
+			d.DictionaryID = uint32(b[0])
+		case 2:
+			d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8)
+		case 4:
+			d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+		}
+		if debug {
+			println("Dict size", size, "ID:", d.DictionaryID)
+		}
+		if d.DictionaryID != 0 {
+			return ErrUnknownDictionary
+		}
+	}
+
+	// Read Frame_Content_Size (the top two descriptor bits select the field size)
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#frame_content_size
+	var fcsSize int
+	v := fhd >> 6
+	switch v {
+	case 0:
+		if d.SingleSegment {
+			fcsSize = 1
+		}
+	default:
+		fcsSize = 1 << v
+	}
+	d.FrameContentSize = 0
+	if fcsSize > 0 {
+		b := br.readSmall(fcsSize)
+		if b == nil {
+			println("Reading Frame content", io.ErrUnexpectedEOF)
+			return io.ErrUnexpectedEOF
+		}
+		switch fcsSize {
+		case 1:
+			d.FrameContentSize = uint64(b[0])
+		case 2:
+			// When FCS_Field_Size is 2, the offset of 256 is added.
+			d.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) + 256
+		case 4:
+			d.FrameContentSize = uint64(b[0]) | (uint64(b[1]) << 8) | (uint64(b[2]) << 16) | (uint64(b[3]) << 24)
+		case 8:
+			d1 := uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+			d2 := uint32(b[4]) | (uint32(b[5]) << 8) | (uint32(b[6]) << 16) | (uint32(b[7]) << 24)
+			d.FrameContentSize = uint64(d1) | (uint64(d2) << 32)
+		}
+		if debug {
+			println("field size bits:", v, "fcsSize:", fcsSize, "FrameContentSize:", d.FrameContentSize, hex.EncodeToString(b[:fcsSize]), "singleseg:", d.SingleSegment, "window:", d.WindowSize)
+		}
+	}
+	// Bit 2 of the descriptor is the checksum flag. TODO: Move this to shared.
+	d.HasCheckSum = fhd&(1<<2) != 0
+	if d.HasCheckSum {
+		if d.crc == nil {
+			d.crc = xxhash.New()
+		}
+		d.crc.Reset()
+	}
+
+	if d.WindowSize == 0 && d.SingleSegment {
+		// No window descriptor was read: use the content size, raised to the minimum.
+		d.WindowSize = d.FrameContentSize
+		if d.WindowSize < MinWindowSize {
+			d.WindowSize = MinWindowSize
+		}
+	}
+
+	if d.WindowSize > d.maxWindowSize {
+		printf("window size %d > max %d\n", d.WindowSize, d.maxWindowSize)
+		return ErrWindowSizeExceeded
+	}
+	// The minimum Window_Size is 1 KB.
+	if d.WindowSize < MinWindowSize {
+		println("got window size: ", d.WindowSize)
+		return ErrWindowSizeTooSmall
+	}
+	d.history.windowSize = int(d.WindowSize)
+	if d.o.lowMem && d.history.windowSize < maxBlockSize {
+		d.history.maxSize = d.history.windowSize * 2
+	} else {
+		d.history.maxSize = d.history.windowSize + maxBlockSize
+	}
+	// history contains input - maybe we do something
+	d.rawInput = br
+	return nil
+}
+
+// next resets block from the frame input and signals block.input. While the async
+// decoder runs the block is queued on d.decoding; io.EOF follows the last block.
+func (d *frameDec) next(block *blockDec) error {
+	if debug {
+		printf("decoding new block %p:%p", block, block.data)
+	}
+	if err := block.reset(d.rawInput, d.WindowSize); err != nil {
+		println("block error:", err)
+		// Signal the frame decoder we have a problem.
+		d.sendErr(block, err)
+		return err
+	}
+	block.input <- struct{}{}
+	if debug {
+		println("next block:", block)
+	}
+	d.asyncRunningMu.Lock()
+	defer d.asyncRunningMu.Unlock()
+	if !d.asyncRunning {
+		return nil
+	}
+	// Read Last before queueing; io.EOF tells the caller the frame is done.
+	var result error
+	if block.Last {
+		result = io.EOF
+	}
+	d.decoding <- block
+	return result
+}
+
+// sendErr will queue an error block on the frame.
+// This will cause the frame decoder to return when it encounters the block.
+// Returns true if the block was queued, false if the async decoder is not running.
+func (d *frameDec) sendErr(block *blockDec, err error) bool {
+	d.asyncRunningMu.Lock()
+	defer d.asyncRunningMu.Unlock()
+	if !d.asyncRunning {
+		return false
+	}
+
+	println("sending error", err.Error())
+	block.sendErr(err)
+	d.decoding <- block
+	return true
+}
+
+// checkCRC reads 4 checksum bytes from the frame input and compares them to the
+// low 32 bits of the running hash, least significant byte first.
+// It returns nil when the frame has no checksum or the values match,
+// io.ErrUnexpectedEOF when the bytes cannot be read and ErrCRCMismatch otherwise.
+func (d *frameDec) checkCRC() error {
+	if !d.HasCheckSum {
+		return nil
+	}
+	sum := d.crc.Sum64()
+	// Only the low 4 bytes are compared, in file order.
+	have := [4]byte{
+		byte(sum), byte(sum >> 8),
+		byte(sum >> 16), byte(sum >> 24),
+	}
+
+	want := d.rawInput.readSmall(4)
+	if want == nil {
+		println("CRC missing?")
+		return io.ErrUnexpectedEOF
+	}
+
+	if bytes.Equal(have[:], want) {
+		if debug {
+			println("CRC ok", have[:])
+		}
+		return nil
+	}
+	if debug {
+		println("CRC Check Failed:", have[:], "!=", want)
+	}
+	return ErrCRCMismatch
+}
+
+// initAsync prepares the frame for async decoding: it sizes the history buffer
+// and the block queue, then flags the async decoder as running.
+func (d *frameDec) initAsync() {
+	hist := &d.history
+	if !d.o.lowMem && !d.SingleSegment {
+		hist.maxSize = hist.windowSize + maxBlockSize*5
+	}
+	// Re-alloc when too small or, in low memory mode, more than one block too large.
+	tooBig := d.o.lowMem && cap(hist.b) > hist.maxSize+maxBlockSize
+	if tooBig || cap(hist.b) < hist.maxSize {
+		hist.b = make([]byte, 0, hist.maxSize)
+	}
+	if cap(d.decoding) < d.o.concurrent {
+		d.decoding = make(chan *blockDec, d.o.concurrent)
+	}
+	if debug {
+		printf("history init. len: %d, cap: %d", len(hist.b), cap(hist.b))
+	}
+	d.asyncRunningMu.Lock()
+	d.asyncRunning = true
+	d.asyncRunningMu.Unlock()
+}
+
+// startDecoder will start decoding blocks and send their results on output.
+// The decoder will stop as soon as an error occurs or at end of frame.
+// On return, asyncRunning is cleared, any blocks still queued on d.decoding
+// are drained to output and d.frameDone.Done is called.
+func (d *frameDec) startDecoder(output chan decodeOutput) {
+	// TODO: Init to dictionary
+	d.history.reset()
+	written := int64(0)
+
+	defer func() {
+		d.asyncRunningMu.Lock()
+		d.asyncRunning = false
+		d.asyncRunningMu.Unlock()
+
+		// Drain the currently decoding.
+		d.history.error = true
+	flushdone:
+		for {
+			select {
+			case b := <-d.decoding:
+				b.history <- &d.history
+				output <- <-b.result
+			default:
+				break flushdone
+			}
+		}
+		println("frame decoder done, signalling done")
+		d.frameDone.Done()
+	}()
+	// Get decoder for first block.
+	block := <-d.decoding
+	block.history <- &d.history
+	for {
+		var next *blockDec
+		// Get result
+		r := <-block.result
+		if r.err != nil {
+			println("Result contained error", r.err)
+			output <- r
+			return
+		}
+		if debug {
+			println("got result, from ", d.offset, "to", d.offset+int64(len(r.b)))
+			d.offset += int64(len(r.b))
+		}
+		if !block.Last {
+			// Send history to next block
+			select {
+			case next = <-d.decoding:
+				if debug {
+					println("Sending ", len(d.history.b), "bytes as history")
+				}
+				next.history <- &d.history
+			default:
+				// Wait until we have sent the block, so
+				// other decoders can potentially get the decoder.
+				next = nil
+			}
+		}
+
+		// Add checksum, async to decoding.
+		if d.HasCheckSum {
+			n, err := d.crc.Write(r.b)
+			if err != nil {
+				r.err = err
+				if n != len(r.b) {
+					r.err = io.ErrShortWrite
+				}
+				output <- r
+				return
+			}
+		}
+		written += int64(len(r.b))
+		if d.SingleSegment && uint64(written) > d.FrameContentSize {
+			println("runDecoder: single segment and", uint64(written), ">", d.FrameContentSize)
+			r.err = ErrFrameSizeExceeded
+			output <- r
+			return
+		}
+		if block.Last {
+			r.err = d.checkCRC()
+			output <- r
+			return
+		}
+		output <- r
+		if next == nil {
+			// There was no decoder available, we wait for one now that we have sent to the writer.
+			if debug {
+				println("Sending ", len(d.history.b), " bytes as history")
+			}
+			next = <-d.decoding
+			next.history <- &d.history
+		}
+		block = next
+	}
+}
+
+// runDecoder synchronously decodes all blocks of the frame with dec, appending to dst, then checks the checksum.
+func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
+	// TODO: Init to dictionary
+	d.history.reset()
+	saved := d.history.b
+
+	// We use the history for output to avoid copying it.
+	d.history.b = dst
+	// Store input length, so we only check new data.
+	crcStart := len(dst)
+	var err error
+	for {
+		err = dec.reset(d.rawInput, d.WindowSize)
+		if err != nil {
+			break
+		}
+		if debug {
+			println("next block:", dec)
+		}
+		err = dec.decodeBuf(&d.history)
+		if err != nil || dec.Last {
+			break
+		}
+		if uint64(len(d.history.b)) > d.o.maxDecodedSize {
+			err = ErrDecoderSizeExceeded
+			break
+		}
+		if d.SingleSegment && uint64(len(d.history.b)) > d.o.maxDecodedSize { // NOTE(review): same test as above, looks unreachable — confirm
+			println("runDecoder: single segment and", uint64(len(d.history.b)), ">", d.o.maxDecodedSize)
+			err = ErrFrameSizeExceeded
+			break
+		}
+	}
+	dst = d.history.b
+	if err == nil {
+		if d.HasCheckSum {
+			var n int
+			n, err = d.crc.Write(dst[crcStart:])
+			if err == nil {
+				if n != len(dst)-crcStart {
+					err = io.ErrShortWrite
+				} else {
+					err = d.checkCRC()
+				}
+			}
+		}
+	}
+	d.history.b = saved // restore the history buffer that was swapped out for dst
+	return dst, err
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/frameenc.go b/vendor/github.com/klauspost/compress/zstd/frameenc.go
new file mode 100644
index 00000000000..4479cfe18b2
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/frameenc.go
@@ -0,0 +1,115 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"fmt"
+	"io"
+	"math"
+	"math/bits"
+)
+
+type frameHeader struct { // frameHeader describes the header written at the start of a frame.
+	ContentSize   uint64 // Frame_Content_Size; stored unless < 256 and not SingleSegment
+	WindowSize    uint32 // window size; written as a Window_Descriptor unless SingleSegment
+	SingleSegment bool   // sets the single segment flag (bit 5)
+	Checksum      bool   // sets the checksum flag (bit 2)
+	DictID        uint32 // Not stored.
+}
+
+const maxHeaderSize = 14 // most bytes appendTo adds: 4 magic + 1 descriptor + 1 window + 8 content size
+
+func (f frameHeader) appendTo(dst []byte) ([]byte, error) { // appendTo appends the encoded header to dst; the error is always nil.
+	dst = append(dst, frameMagic...)
+	var fhd uint8
+	if f.Checksum {
+		fhd |= 1 << 2
+	}
+	if f.SingleSegment {
+		fhd |= 1 << 5
+	}
+	var fcs uint8 // Frame_Content_Size flag: 0, 1, 2, 3 select a 0/1, 2, 4 or 8 byte field
+	if f.ContentSize >= 256 {
+		fcs++
+	}
+	if f.ContentSize >= 65536+256 {
+		fcs++
+	}
+	if f.ContentSize >= 0xffffffff {
+		fcs++
+	}
+	fhd |= fcs << 6
+
+	dst = append(dst, fhd)
+	if !f.SingleSegment {
+		const winLogMin = 10
+		windowLog := (bits.Len32(f.WindowSize-1) - winLogMin) << 3 // exponent in the upper 5 bits, mantissa left 0
+		dst = append(dst, uint8(windowLog))
+	}
+
+	switch fcs {
+	case 0:
+		if f.SingleSegment {
+			dst = append(dst, uint8(f.ContentSize))
+		}
+		// Unless SingleSegment is set, frame sizes < 256 are not stored.
+	case 1:
+		f.ContentSize -= 256 // the 2 byte field stores the size minus 256
+		dst = append(dst, uint8(f.ContentSize), uint8(f.ContentSize>>8))
+	case 2:
+		dst = append(dst, uint8(f.ContentSize), uint8(f.ContentSize>>8), uint8(f.ContentSize>>16), uint8(f.ContentSize>>24))
+	case 3:
+		dst = append(dst, uint8(f.ContentSize), uint8(f.ContentSize>>8), uint8(f.ContentSize>>16), uint8(f.ContentSize>>24),
+			uint8(f.ContentSize>>32), uint8(f.ContentSize>>40), uint8(f.ContentSize>>48), uint8(f.ContentSize>>56))
+	default:
+		panic("invalid fcs")
+	}
+	return dst, nil
+}
+
+// skippableFrameHeader is the header size: 4 magic bytes and a 4 byte length.
+const skippableFrameHeader = 4 + 4
+
+// calcSkippableFrame returns how many bytes must follow written bytes for the
+// total to be a multiple of wantMultiple: 0 if it already is, otherwise at least
+// skippableFrameHeader. It panics if written < 0 or wantMultiple <= 0.
+func calcSkippableFrame(written, wantMultiple int64) int {
+	switch {
+	case wantMultiple <= 0:
+		panic("wantMultiple <= 0")
+	case written < 0:
+		panic("written < 0")
+	}
+	rem := written % wantMultiple
+	if rem == 0 {
+		return 0
+	}
+	pad := wantMultiple - rem
+	for pad < skippableFrameHeader {
+		pad += wantMultiple
+	}
+	return int(pad)
+}
+
+// skippableFrame appends a skippable frame of total bytes, header included, to
+// dst and fills its payload from r. A total of 0 appends nothing; otherwise
+// total must be >= skippableFrameHeader and <= math.MaxUint32.
+func skippableFrame(dst []byte, total int, r io.Reader) ([]byte, error) {
+	switch {
+	case total == 0:
+		return dst, nil
+	case total < skippableFrameHeader:
+		return dst, fmt.Errorf("requested skippable frame (%d) < 8", total)
+	case int64(total) > math.MaxUint32:
+		return dst, fmt.Errorf("requested skippable frame (%d) > max uint32", total)
+	}
+	payload := uint32(total - skippableFrameHeader)
+	dst = append(dst, 0x50, 0x2a, 0x4d, 0x18,
+		uint8(payload), uint8(payload>>8), uint8(payload>>16), uint8(payload>>24))
+	body := len(dst)
+	dst = append(dst, make([]byte, payload)...)
+	_, err := io.ReadFull(r, dst[body:])
+	return dst, err
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_decoder.go b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
new file mode 100644
index 00000000000..e002be98b9b
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_decoder.go
@@ -0,0 +1,384 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"errors"
+	"fmt"
+)
+
+const (
+	tablelogAbsoluteMax = 9 // largest table log readNCount accepts
+)
+
+const (
+	/*!MEMORY_USAGE :
+	 *  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+	 *  Increasing memory usage improves compression ratio
+	 *  Reduced memory usage can improve speed, due to cache effect
+	 *  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+	maxMemoryUsage = 11
+
+	maxTableLog    = maxMemoryUsage - 2     // 9
+	maxTablesize   = 1 << maxTableLog       // number of entries in fseDecoder.dt
+	maxTableMask   = (1 << maxTableLog) - 1 // index mask for a table of maxTablesize entries
+	minTablelog    = 5                      // offset added to the 4-bit table log read by readNCount
+	maxSymbolValue = 255                    // largest symbol value
+)
+
+// fseDecoder holds an FSE decoding table together with the scratch space used to build it.
+type fseDecoder struct {
+	dt             [maxTablesize]decSymbol // Decompression table.
+	symbolLen      uint16                  // Length of active part of the symbol table.
+	actualTableLog uint8                   // Selected tablelog.
+	maxBits        uint8                   // Maximum number of additional bits
+
+	// used for table creation to avoid allocations.
+	stateTable [256]uint16               // next state per symbol while building (symbolNext in buildDtable)
+	norm       [maxSymbolValue + 1]int16 // normalized counts from readNCount; -1 marks a low probability symbol
+	preDefined bool                      // presumably marks a predefined table — not set in this part of the file; confirm
+}
+
+// tableStep returns the step used to spread symbols over a table of tableSize entries.
+func tableStep(tableSize uint32) uint32 {
+	return tableSize>>1 + tableSize>>3 + 3
+}
+
+// readNCount reads the normalized symbol counts from b into s.norm and builds the decoding table.
+func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
+	var (
+		charnum   uint16
+		previous0 bool
+	)
+	if b.remain() < 4 {
+		return errors.New("input too small")
+	}
+	bitStream := b.Uint32()
+	nbBits := uint((bitStream & 0xF) + minTablelog) // extract tableLog
+	if nbBits > tablelogAbsoluteMax {
+		println("Invalid tablelog:", nbBits)
+		return errors.New("tableLog too large")
+	}
+	bitStream >>= 4
+	bitCount := uint(4)
+
+	s.actualTableLog = uint8(nbBits)
+	remaining := int32((1 << nbBits) + 1)
+	threshold := int32(1 << nbBits)
+	gotTotal := int32(0)
+	nbBits++
+
+	for remaining > 1 && charnum <= maxSymbol {
+		if previous0 {
+			// The previous count was 0: a run length of further zero counts follows.
+			n0 := charnum
+			for (bitStream & 0xFFFF) == 0xFFFF {
+				//println("24 x 0")
+				n0 += 24
+				if r := b.remain(); r > 5 {
+					b.advance(2)
+					bitStream = b.Uint32() >> bitCount
+				} else {
+					// end of bit stream
+					bitStream >>= 16
+					bitCount += 16
+				}
+			}
+			//printf("bitstream: %d, 0b%b", bitStream&3, bitStream)
+			for (bitStream & 3) == 3 {
+				n0 += 3
+				bitStream >>= 2
+				bitCount += 2
+			}
+			n0 += uint16(bitStream & 3)
+			bitCount += 2
+
+			if n0 > maxSymbolValue {
+				return errors.New("maxSymbolValue too small")
+			}
+			//println("inserting ", n0-charnum, "zeroes from idx", charnum, "ending before", n0)
+			for charnum < n0 {
+				s.norm[uint8(charnum)] = 0
+				charnum++
+			}
+
+			if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
+				b.advance(bitCount >> 3)
+				bitCount &= 7
+				bitStream = b.Uint32() >> bitCount
+			} else {
+				bitStream >>= 2
+			}
+		}
+
+		max := (2*threshold - 1) - remaining
+		var count int32
+
+		if int32(bitStream)&(threshold-1) < max {
+			count = int32(bitStream) & (threshold - 1)
+			if debugAsserts && nbBits < 1 {
+				panic("nbBits underflow")
+			}
+			bitCount += nbBits - 1
+		} else {
+			count = int32(bitStream) & (2*threshold - 1)
+			if count >= threshold {
+				count -= max
+			}
+			bitCount += nbBits
+		}
+
+		// extra accuracy
+		count--
+		if count < 0 {
+			// -1 means +1
+			remaining += count
+			gotTotal -= count
+		} else {
+			remaining -= count
+			gotTotal += count
+		}
+		s.norm[charnum&0xff] = int16(count)
+		charnum++
+		previous0 = count == 0
+		for remaining < threshold {
+			nbBits--
+			threshold >>= 1
+		}
+
+		//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "remain:", b.remain())
+		if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
+			b.advance(bitCount >> 3)
+			bitCount &= 7
+		} else {
+			bitCount -= (uint)(8 * (len(b.b) - 4 - b.off))
+			b.off = len(b.b) - 4
+			//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "iend", iend)
+		}
+		bitStream = b.Uint32() >> (bitCount & 31)
+		//printf("bitstream is now: 0b%b", bitStream)
+	}
+	s.symbolLen = charnum
+	if s.symbolLen <= 1 {
+		return fmt.Errorf("symbolLen (%d) too small", s.symbolLen)
+	}
+	if s.symbolLen > maxSymbolValue+1 {
+		return fmt.Errorf("symbolLen (%d) too big", s.symbolLen)
+	}
+	if remaining != 1 {
+		return fmt.Errorf("corruption detected (remaining %d != 1)", remaining)
+	}
+	if bitCount > 32 {
+		return fmt.Errorf("corruption detected (bitCount %d > 32)", bitCount)
+	}
+	if gotTotal != 1<<s.actualTableLog {
+		return fmt.Errorf("corruption detected (total %d != %d)", gotTotal, 1<<s.actualTableLog)
+	}
+	b.advance((bitCount + 7) >> 3) // consume the partially used byte as well
+	// println(s.norm[:s.symbolLen], s.symbolLen)
+	return s.buildDtable()
+}
+
+// decSymbol contains information about a state entry,
+// including the state offset base, the output symbol and
+// the number of bits to read for the low part of the destination state.
+// Using a composite uint64 is faster than a struct with separate members.
+// Layout, from the least significant bit:
+// nbBits (8 bits), addBits (8 bits), newState (16 bits), baseline (32 bits).
+// The setters below replace one field and leave the others untouched.
+type decSymbol uint64
+
+// newDecSymbol packs the four fields into a decSymbol.
+func newDecSymbol(nbits, addBits uint8, newState uint16, baseline uint32) decSymbol {
+	return decSymbol(baseline)<<32 | decSymbol(newState)<<16 | decSymbol(addBits)<<8 | decSymbol(nbits)
+}
+
+// nbBits returns bits 0-7.
+func (d decSymbol) nbBits() uint8 { return uint8(d) }
+
+// addBits returns bits 8-15.
+func (d decSymbol) addBits() uint8 { return uint8(d >> 8) }
+
+// newState returns bits 16-31.
+func (d decSymbol) newState() uint16 { return uint16(d >> 16) }
+
+// baseline returns bits 32-63.
+func (d decSymbol) baseline() uint32 { return uint32(d >> 32) }
+
+// baselineInt returns bits 32-63 converted to int.
+func (d decSymbol) baselineInt() int { return int(d >> 32) }
+
+// set overwrites all four fields.
+func (d *decSymbol) set(nbits, addBits uint8, newState uint16, baseline uint32) {
+	*d = decSymbol(baseline)<<32 | decSymbol(newState)<<16 | decSymbol(addBits)<<8 | decSymbol(nbits)
+}
+
+// setNBits replaces bits 0-7.
+func (d *decSymbol) setNBits(nBits uint8) {
+	*d = *d&^0xff | decSymbol(nBits)
+}
+
+// setAddBits replaces bits 8-15.
+func (d *decSymbol) setAddBits(addBits uint8) {
+	*d = *d&^0xff00 | decSymbol(addBits)<<8
+}
+
+// setNewState replaces bits 16-31.
+func (d *decSymbol) setNewState(state uint16) {
+	*d = *d&^0xffff0000 | decSymbol(state)<<16
+}
+
+// setBaseline replaces bits 32-63.
+func (d *decSymbol) setBaseline(baseline uint32) {
+	*d = *d&0xffffffff | decSymbol(baseline)<<32
+}
+
+// setExt replaces addBits and baseline, keeping nbBits and newState.
+func (d *decSymbol) setExt(addBits uint8, baseline uint32) {
+	*d = *d&0xffff00ff | decSymbol(addBits)<<8 | decSymbol(baseline)<<32
+}
+
+// decSymbolValue returns the decSymbol for RLE symbol symb: the addBits and
+// baseLine of t[symb] with zero nbBits and newState. It fails if symb is outside t.
+func decSymbolValue(symb uint8, t []baseOffset) (decSymbol, error) {
+	if idx := int(symb); idx < len(t) {
+		return newDecSymbol(0, t[idx].addBits, 0, t[idx].baseLine), nil
+	}
+	return 0, fmt.Errorf("rle symbol %d >= max %d", symb, len(t))
+}
+
+// setRLE switches the decoder to RLE mode, with symbol as table entry 0 and a table log of 0.
+func (s *fseDecoder) setRLE(symbol decSymbol) {
+	s.dt[0] = symbol
+	s.maxBits = symbol.addBits()
+	s.actualTableLog = 0
+}
+
+// buildDtable builds the decoding table s.dt from the normalized counts in s.norm.
+func (s *fseDecoder) buildDtable() error {
+	tableSize := uint32(1 << s.actualTableLog)
+	highThreshold := tableSize - 1
+	symbolNext := s.stateTable[:256]
+
+	// Init, lay down lowprob symbols (count -1) from the end of the table downwards.
+	{
+		for i, v := range s.norm[:s.symbolLen] {
+			if v == -1 {
+				s.dt[highThreshold].setAddBits(uint8(i))
+				highThreshold--
+				symbolNext[i] = 1
+			} else {
+				symbolNext[i] = uint16(v)
+			}
+		}
+	}
+	// Spread symbols; the symbol is kept in the addBits field until transform runs.
+	{
+		tableMask := tableSize - 1
+		step := tableStep(tableSize)
+		position := uint32(0)
+		for ss, v := range s.norm[:s.symbolLen] {
+			for i := 0; i < int(v); i++ {
+				s.dt[position].setAddBits(uint8(ss))
+				position = (position + step) & tableMask
+				for position > highThreshold {
+					// lowprob area
+					position = (position + step) & tableMask
+				}
+			}
+		}
+		if position != 0 {
+			// position must reach all cells once, otherwise normalizedCounter is incorrect
+			return errors.New("corrupted input (position != 0)")
+		}
+	}
+
+	// Build Decoding table: fill in nbBits and newState for every entry.
+	{
+		tableSize := uint16(1 << s.actualTableLog)
+		for u, v := range s.dt[:tableSize] {
+			symbol := v.addBits()
+			nextState := symbolNext[symbol]
+			symbolNext[symbol] = nextState + 1
+			nBits := s.actualTableLog - byte(highBits(uint32(nextState)))
+			s.dt[u&maxTableMask].setNBits(nBits)
+			newState := (nextState << nBits) - tableSize
+			if newState > tableSize {
+				return fmt.Errorf("newState (%d) outside table size (%d)", newState, tableSize)
+			}
+			if newState == uint16(u) && nBits == 0 {
+				// Seems weird that this is possible with nbits > 0.
+				return fmt.Errorf("newState (%d) == oldState (%d) and no bits", newState, u)
+			}
+			s.dt[u&maxTableMask].setNewState(newState)
+		}
+	}
+	return nil
+}
+
+// transform rewrites every active table entry so it can be used for decoding
+// directly: the symbol held in addBits is replaced by the addBits and baseLine
+// that t defines for it. s.maxBits becomes the largest addBits stored.
+func (s *fseDecoder) transform(t []baseOffset) error {
+	entries := s.dt[:uint16(1<<s.actualTableLog)]
+	s.maxBits = 0
+	for i := range entries {
+		// Until now addBits holds the symbol itself.
+		sym := entries[i].addBits()
+		if int(sym) >= len(t) {
+			return fmt.Errorf("invalid decoding table entry %d, symbol %d >= max (%d)", i, sym, len(t))
+		}
+		ext := t[sym]
+		if ext.addBits > s.maxBits {
+			s.maxBits = ext.addBits
+		}
+		entries[i].setExt(ext.addBits, ext.baseLine)
+	}
+	return nil
+}
+
+// fseState tracks the current entry while decoding with a table.
+type fseState struct {
+	dt    []decSymbol
+	state decSymbol
+}
+
+// init stores dt and reads the first state, indexed by tableLog bits from br.
+func (s *fseState) init(br *bitReader, tableLog uint8, dt []decSymbol) {
+	br.fill()
+	s.dt, s.state = dt, dt[br.getBits(tableLog)]
+}
+
+// next moves to the following state, reading the low bits of its index from br.
+// At least tablelog bits must be available in the bit reader.
+func (s *fseState) next(br *bitReader) {
+	cur := s.state
+	s.state = s.dt[cur.newState()+uint16(br.getBits(cur.nbBits()))]
+}
+
+// finished reports whether the bitstream has been fully read while
+// the next state would still need bits from the input.
+func (s *fseState) finished(br *bitReader) bool {
+	return br.finished() && s.state.nbBits() > 0
+}
+
+// final returns the baseline and addBits of the current state without advancing.
+func (s *fseState) final() (int, uint8) {
+	return s.state.final()
+}
+
+// final returns the baseline, as an int, and the addBits of the symbol.
+func (s decSymbol) final() (int, uint8) {
+	return s.baselineInt(), s.addBits()
+}
+
+// nextFast moves to the following state and returns its baseline and addBits.
+// This can only be used if no symbols are 0 bits.
+// At least tablelog bits must be available in the bit reader.
+func (s *fseState) nextFast(br *bitReader) (uint32, uint8) {
+	cur := s.state
+	s.state = s.dt[cur.newState()+uint16(br.getBitsFast(cur.nbBits()))]
+	return s.state.baseline(), s.state.addBits()
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_encoder.go b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
new file mode 100644
index 00000000000..aa9eba88b80
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_encoder.go
@@ -0,0 +1,726 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"errors"
+	"fmt"
+	"math"
+)
+
+const (
+	// Encoding supports table logs up to maxEncTableLog. NOTE(review): the size and mask below use maxTableLog, not maxEncTableLog - confirm intended.
+	maxEncTableLog    = 8
+	maxEncTablesize   = 1 << maxTableLog
+	maxEncTableMask   = (1 << maxTableLog) - 1
+	minEncTablelog    = 5
+	maxEncSymbolValue = maxMatchLengthSymbol
+)
+
+// fseEncoder holds the histogram, normalized counts and compression tables used to FSE-encode one symbol stream.
+type fseEncoder struct {
+	symbolLen      uint16 // Length of active part of the symbol table.
+	actualTableLog uint8  // Selected tablelog.
+	ct             cTable // Compression tables.
+	maxCount       int    // count of the most probable symbol
+	zeroBits       bool   // Set by buildCTable when a symbol has prob > 50%, so it may output 0 bits.
+	clearCount     bool   // count should be cleared by prepare (done when maxCount is 0).
+	useRLE         bool   // This encoder is for RLE
+	preDefined     bool   // This encoder is predefined.
+	reUsed         bool   // Set to know when the encoder has been reused.
+	rleVal         uint8  // RLE Symbol
+	maxBits        uint8  // Maximum output bits after transform.
+
+	// TODO: Technically zstd should be fine with 64 bytes.
+	count [256]uint32
+	norm  [256]int16
+}
+
+// cTable contains tables used for compression.
+type cTable struct {
+	tableSymbol []byte
+	stateTable  []uint16
+	symbolTT    []symbolTransform
+}
+
+// symbolTransform contains the state transform for a symbol.
+type symbolTransform struct {
+	deltaNbBits    uint32
+	deltaFindState int16
+	outBits        uint8
+}
+
+// String prints values as a human readable string.
+func (s symbolTransform) String() string {
+	return fmt.Sprintf("{deltabits: %08x, findstate:%d outbits:%d}", s.deltaNbBits, s.deltaFindState, s.outBits)
+}
+
+// Histogram allows to populate the histogram and skip that step in the compression,
+// It otherwise allows to inspect the histogram when compression is done.
+// To indicate that you have populated the histogram call HistogramFinished
+// with the value of the highest populated symbol, as well as the number of entries
+// in the most populated entry. These are accepted at face value.
+// The returned slice will always be length 256.
+func (s *fseEncoder) Histogram() []uint32 {
+	return s.count[:]
+}
+
+// HistogramFinished can be called to indicate that the histogram has been populated.
+// maxSymbol is the index of the highest set symbol of the next data segment.
+// maxCount is the number of entries in the most populated entry.
+// These are accepted at face value.
+func (s *fseEncoder) HistogramFinished(maxSymbol uint8, maxCount int) {
+	s.maxCount = maxCount
+	s.symbolLen = uint16(maxSymbol) + 1
+	s.clearCount = maxCount != 0
+}
+
+// prepare returns a usable encoder (a new one if s is nil), resets useRLE and zeroes count when clearCount is set and maxCount is 0.
+func (s *fseEncoder) prepare() (*fseEncoder, error) {
+	if s == nil {
+		s = &fseEncoder{}
+	}
+	s.useRLE = false
+	if s.clearCount && s.maxCount == 0 {
+		for i := range s.count {
+			s.count[i] = 0
+		}
+		s.clearCount = false
+	}
+	return s, nil
+}
+
+// allocCtable will allocate tables needed for compression.
+// If existing tables are big enough, they are simply re-used.
+func (s *fseEncoder) allocCtable() {
+	tableSize := 1 << s.actualTableLog
+	// get tableSymbol that is big enough.
+	if cap(s.ct.tableSymbol) < int(tableSize) {
+		s.ct.tableSymbol = make([]byte, tableSize)
+	}
+	s.ct.tableSymbol = s.ct.tableSymbol[:tableSize]
+
+	ctSize := tableSize
+	if cap(s.ct.stateTable) < ctSize {
+		s.ct.stateTable = make([]uint16, ctSize)
+	}
+	s.ct.stateTable = s.ct.stateTable[:ctSize]
+
+	if cap(s.ct.symbolTT) < 256 {
+		s.ct.symbolTT = make([]symbolTransform, 256)
+	}
+	s.ct.symbolTT = s.ct.symbolTT[:256]
+}
+
+// buildCTable will populate the compression table so it is ready to be used.
+func (s *fseEncoder) buildCTable() error {
+	tableSize := uint32(1 << s.actualTableLog)
+	highThreshold := tableSize - 1
+	var cumul [256]int16
+
+	s.allocCtable()
+	tableSymbol := s.ct.tableSymbol[:tableSize]
+	// symbol start positions
+	{
+		cumul[0] = 0
+		for ui, v := range s.norm[:s.symbolLen-1] {
+			u := byte(ui) // one less than reference
+			if v == -1 {
+				// Low proba symbol
+				cumul[u+1] = cumul[u] + 1
+				tableSymbol[highThreshold] = u
+				highThreshold--
+			} else {
+				cumul[u+1] = cumul[u] + v
+			}
+		}
+		// Encode last symbol separately to avoid overflowing u
+		u := int(s.symbolLen - 1)
+		v := s.norm[s.symbolLen-1]
+		if v == -1 {
+			// Low proba symbol
+			cumul[u+1] = cumul[u] + 1
+			tableSymbol[highThreshold] = byte(u)
+			highThreshold--
+		} else {
+			cumul[u+1] = cumul[u] + v
+		}
+		if uint32(cumul[s.symbolLen]) != tableSize {
+			return fmt.Errorf("internal error: expected cumul[s.symbolLen] (%d) == tableSize (%d)", cumul[s.symbolLen], tableSize)
+		}
+		cumul[s.symbolLen] = int16(tableSize) + 1
+	}
+	// Spread symbols
+	s.zeroBits = false
+	{
+		step := tableStep(tableSize)
+		tableMask := tableSize - 1
+		var position uint32
+		// if any symbol > largeLimit, we may have 0 bits output.
+		largeLimit := int16(1 << (s.actualTableLog - 1))
+		for ui, v := range s.norm[:s.symbolLen] {
+			symbol := byte(ui)
+			if v > largeLimit {
+				s.zeroBits = true
+			}
+			for nbOccurrences := int16(0); nbOccurrences < v; nbOccurrences++ {
+				tableSymbol[position] = symbol
+				position = (position + step) & tableMask
+				for position > highThreshold {
+					position = (position + step) & tableMask
+				} /* Low proba area */
+			}
+		}
+
+		// Check if we have gone through all positions
+		if position != 0 {
+			return errors.New("position!=0")
+		}
+	}
+
+	// Build table
+	table := s.ct.stateTable
+	{
+		tsi := int(tableSize)
+		for u, v := range tableSymbol {
+			// TableU16 : sorted by symbol order; gives next state value
+			table[cumul[v]] = uint16(tsi + u)
+			cumul[v]++
+		}
+	}
+
+	// Build Symbol Transformation Table
+	{
+		total := int16(0)
+		symbolTT := s.ct.symbolTT[:s.symbolLen]
+		tableLog := s.actualTableLog
+		tl := (uint32(tableLog) << 16) - (1 << tableLog)
+		for i, v := range s.norm[:s.symbolLen] {
+			switch v {
+			case 0:
+			case -1, 1:
+				symbolTT[i].deltaNbBits = tl
+				symbolTT[i].deltaFindState = int16(total - 1)
+				total++
+			default:
+				maxBitsOut := uint32(tableLog) - highBit(uint32(v-1))
+				minStatePlus := uint32(v) << maxBitsOut
+				symbolTT[i].deltaNbBits = (maxBitsOut << 16) - minStatePlus
+				symbolTT[i].deltaFindState = int16(total - v)
+				total += v
+			}
+		}
+		if total != int16(tableSize) {
+			return fmt.Errorf("total mismatch %d (got) != %d (want)", total, tableSize)
+		}
+	}
+	return nil
+}
+
+var rtbTable = [...]uint32{0, 473195, 504333, 520860, 550000, 700000, 750000, 830000}
+
+func (s *fseEncoder) setRLE(val byte) {
+	s.allocCtable()
+	s.actualTableLog = 0
+	s.ct.stateTable = s.ct.stateTable[:1]
+	s.ct.symbolTT[val] = symbolTransform{
+		deltaFindState: 0,
+		deltaNbBits:    0,
+	}
+	if debug {
+		println("setRLE: val", val, "symbolTT", s.ct.symbolTT[val])
+	}
+	s.rleVal = val
+	s.useRLE = true
+}
+
+// setBits will set output bits for the transform.
+// if nil is provided, the number of bits is equal to the index.
+func (s *fseEncoder) setBits(transform []byte) {
+	if s.reUsed || s.preDefined {
+		return
+	}
+	if s.useRLE {
+		if transform == nil {
+			s.ct.symbolTT[s.rleVal].outBits = s.rleVal
+			s.maxBits = s.rleVal
+			return
+		}
+		s.maxBits = transform[s.rleVal]
+		s.ct.symbolTT[s.rleVal].outBits = s.maxBits
+		return
+	}
+	if transform == nil {
+		for i := range s.ct.symbolTT[:s.symbolLen] {
+			s.ct.symbolTT[i].outBits = uint8(i)
+		}
+		s.maxBits = uint8(s.symbolLen - 1)
+		return
+	}
+	s.maxBits = 0
+	for i, v := range transform[:s.symbolLen] {
+		s.ct.symbolTT[i].outBits = v
+		if v > s.maxBits {
+			// We could assume bits always going up, but we play safe.
+			s.maxBits = v
+		}
+	}
+}
+
+// normalizeCount will normalize the count of the symbols so
+// the total is equal to the table size.
+// If successful, compression tables will also be made ready.
+func (s *fseEncoder) normalizeCount(length int) error {
+	if s.reUsed {
+		return nil
+	}
+	s.optimalTableLog(length)
+	var (
+		tableLog          = s.actualTableLog
+		scale             = 62 - uint64(tableLog)
+		step              = (1 << 62) / uint64(length)
+		vStep             = uint64(1) << (scale - 20)
+		stillToDistribute = int16(1 << tableLog)
+		largest           int
+		largestP          int16
+		lowThreshold      = (uint32)(length >> tableLog)
+	)
+	if s.maxCount == length {
+		s.useRLE = true
+		return nil
+	}
+	s.useRLE = false
+	for i, cnt := range s.count[:s.symbolLen] {
+		// already handled
+		// if (count[s] == s.length) return 0;   /* rle special case */
+
+		if cnt == 0 {
+			s.norm[i] = 0
+			continue
+		}
+		if cnt <= lowThreshold {
+			s.norm[i] = -1
+			stillToDistribute--
+		} else {
+			proba := (int16)((uint64(cnt) * step) >> scale)
+			if proba < 8 {
+				restToBeat := vStep * uint64(rtbTable[proba])
+				v := uint64(cnt)*step - (uint64(proba) << scale)
+				if v > restToBeat {
+					proba++
+				}
+			}
+			if proba > largestP {
+				largestP = proba
+				largest = i
+			}
+			s.norm[i] = proba
+			stillToDistribute -= proba
+		}
+	}
+
+	if -stillToDistribute >= (s.norm[largest] >> 1) {
+		// corner case, need another normalization method
+		err := s.normalizeCount2(length)
+		if err != nil {
+			return err
+		}
+		if debugAsserts {
+			err = s.validateNorm()
+			if err != nil {
+				return err
+			}
+		}
+		return s.buildCTable()
+	}
+	s.norm[largest] += stillToDistribute
+	if debugAsserts {
+		err := s.validateNorm()
+		if err != nil {
+			return err
+		}
+	}
+	return s.buildCTable()
+}
+
+// normalizeCount2 is the secondary normalization method.
+// It is called by normalizeCount when the primary method fails.
+func (s *fseEncoder) normalizeCount2(length int) error {
+	const notYetAssigned = -2
+	var (
+		distributed  uint32
+		total        = uint32(length)
+		tableLog     = s.actualTableLog
+		lowThreshold = uint32(total >> tableLog)
+		lowOne       = uint32((total * 3) >> (tableLog + 1))
+	)
+	for i, cnt := range s.count[:s.symbolLen] {
+		if cnt == 0 {
+			s.norm[i] = 0
+			continue
+		}
+		if cnt <= lowThreshold {
+			s.norm[i] = -1
+			distributed++
+			total -= cnt
+			continue
+		}
+		if cnt <= lowOne {
+			s.norm[i] = 1
+			distributed++
+			total -= cnt
+			continue
+		}
+		s.norm[i] = notYetAssigned
+	}
+	toDistribute := (1 << tableLog) - distributed
+
+	if (total / toDistribute) > lowOne {
+		// risk of rounding to zero
+		lowOne = uint32((total * 3) / (toDistribute * 2))
+		for i, cnt := range s.count[:s.symbolLen] {
+			if (s.norm[i] == notYetAssigned) && (cnt <= lowOne) {
+				s.norm[i] = 1
+				distributed++
+				total -= cnt
+				continue
+			}
+		}
+		toDistribute = (1 << tableLog) - distributed
+	}
+	if distributed == uint32(s.symbolLen)+1 {
+		// all values are pretty poor;
+		//   probably incompressible data (should have already been detected);
+		//   find max, then give all remaining points to max
+		var maxV int
+		var maxC uint32
+		for i, cnt := range s.count[:s.symbolLen] {
+			if cnt > maxC {
+				maxV = i
+				maxC = cnt
+			}
+		}
+		s.norm[maxV] += int16(toDistribute)
+		return nil
+	}
+
+	if total == 0 {
+		// all of the symbols were low enough for the lowOne or lowThreshold
+		for i := uint32(0); toDistribute > 0; i = (i + 1) % (uint32(s.symbolLen)) {
+			if s.norm[i] > 0 {
+				toDistribute--
+				s.norm[i]++
+			}
+		}
+		return nil
+	}
+
+	var (
+		vStepLog = 62 - uint64(tableLog)
+		mid      = uint64((1 << (vStepLog - 1)) - 1)
+		rStep    = (((1 << vStepLog) * uint64(toDistribute)) + mid) / uint64(total) // scale on remaining
+		tmpTotal = mid
+	)
+	for i, cnt := range s.count[:s.symbolLen] {
+		if s.norm[i] == notYetAssigned {
+			var (
+				end    = tmpTotal + uint64(cnt)*rStep
+				sStart = uint32(tmpTotal >> vStepLog)
+				sEnd   = uint32(end >> vStepLog)
+				weight = sEnd - sStart
+			)
+			if weight < 1 {
+				return errors.New("weight < 1")
+			}
+			s.norm[i] = int16(weight)
+			tmpTotal = end
+		}
+	}
+	return nil
+}
+
+// optimalTableLog calculates and sets the optimal tableLog in s.actualTableLog
+func (s *fseEncoder) optimalTableLog(length int) {
+	tableLog := uint8(maxEncTableLog)
+	minBitsSrc := highBit(uint32(length)) + 1
+	minBitsSymbols := highBit(uint32(s.symbolLen-1)) + 2
+	minBits := uint8(minBitsSymbols)
+	if minBitsSrc < minBitsSymbols {
+		minBits = uint8(minBitsSrc)
+	}
+
+	maxBitsSrc := uint8(highBit(uint32(length-1))) - 2
+	if maxBitsSrc < tableLog {
+		// Accuracy can be reduced
+		tableLog = maxBitsSrc
+	}
+	if minBits > tableLog {
+		tableLog = minBits
+	}
+	// Need a minimum to safely represent all symbol values
+	if tableLog < minEncTablelog {
+		tableLog = minEncTablelog
+	}
+	if tableLog > maxEncTableLog {
+		tableLog = maxEncTableLog
+	}
+	s.actualTableLog = tableLog
+}
+
+// validateNorm validates the normalized histogram table.
+func (s *fseEncoder) validateNorm() (err error) {
+	var total int
+	for _, v := range s.norm[:s.symbolLen] {
+		if v >= 0 {
+			total += int(v)
+		} else {
+			total -= int(v)
+		}
+	}
+	defer func() {
+		if err == nil {
+			return
+		}
+		fmt.Printf("selected TableLog: %d, Symbol length: %d\n", s.actualTableLog, s.symbolLen)
+		for i, v := range s.norm[:s.symbolLen] {
+			fmt.Printf("%3d: %5d -> %4d \n", i, s.count[i], v)
+		}
+	}()
+	if total != (1 << s.actualTableLog) {
+		return fmt.Errorf("warning: Total == %d != %d", total, 1<<s.actualTableLog)
+	}
+	for i, v := range s.count[s.symbolLen:] {
+		if v != 0 {
+			return fmt.Errorf("warning: Found symbol out of range, %d after cut", i)
+		}
+	}
+	return nil
+}
+
+// writeCount will write the normalized histogram count to header.
+// This is read back by readNCount.
+func (s *fseEncoder) writeCount(out []byte) ([]byte, error) {
+	if s.useRLE {
+		return append(out, s.rleVal), nil
+	}
+	if s.preDefined || s.reUsed {
+		// Never write predefined.
+		return out, nil
+	}
+
+	var (
+		tableLog  = s.actualTableLog
+		tableSize = 1 << tableLog
+		previous0 bool
+		charnum   uint16
+
+		// maximum header size plus 2 extra bytes for final output if bitCount == 0.
+		maxHeaderSize = ((int(s.symbolLen) * int(tableLog)) >> 3) + 3 + 2
+
+		// Write Table Size
+		bitStream = uint32(tableLog - minEncTablelog)
+		bitCount  = uint(4)
+		remaining = int16(tableSize + 1) /* +1 for extra accuracy */
+		threshold = int16(tableSize)
+		nbBits    = uint(tableLog + 1)
+		outP      = len(out)
+	)
+	if cap(out) < outP+maxHeaderSize {
+		out = append(out, make([]byte, maxHeaderSize*3)...)
+		out = out[:len(out)-maxHeaderSize*3]
+	}
+	out = out[:outP+maxHeaderSize]
+
+	// stops at 1
+	for remaining > 1 {
+		if previous0 {
+			start := charnum
+			for s.norm[charnum] == 0 {
+				charnum++
+			}
+			for charnum >= start+24 {
+				start += 24
+				bitStream += uint32(0xFFFF) << bitCount
+				out[outP] = byte(bitStream)
+				out[outP+1] = byte(bitStream >> 8)
+				outP += 2
+				bitStream >>= 16
+			}
+			for charnum >= start+3 {
+				start += 3
+				bitStream += 3 << bitCount
+				bitCount += 2
+			}
+			bitStream += uint32(charnum-start) << bitCount
+			bitCount += 2
+			if bitCount > 16 {
+				out[outP] = byte(bitStream)
+				out[outP+1] = byte(bitStream >> 8)
+				outP += 2
+				bitStream >>= 16
+				bitCount -= 16
+			}
+		}
+
+		count := s.norm[charnum]
+		charnum++
+		max := (2*threshold - 1) - remaining
+		if count < 0 {
+			remaining += count
+		} else {
+			remaining -= count
+		}
+		count++ // +1 for extra accuracy
+		if count >= threshold {
+			count += max // [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[
+		}
+		bitStream += uint32(count) << bitCount
+		bitCount += nbBits
+		if count < max {
+			bitCount--
+		}
+
+		previous0 = count == 1
+		if remaining < 1 {
+			return nil, errors.New("internal error: remaining < 1")
+		}
+		for remaining < threshold {
+			nbBits--
+			threshold >>= 1
+		}
+
+		if bitCount > 16 {
+			out[outP] = byte(bitStream)
+			out[outP+1] = byte(bitStream >> 8)
+			outP += 2
+			bitStream >>= 16
+			bitCount -= 16
+		}
+	}
+
+	if outP+2 > len(out) {
+		return nil, fmt.Errorf("internal error: %d > %d, maxheader: %d, sl: %d, tl: %d, normcount: %v", outP+2, len(out), maxHeaderSize, s.symbolLen, int(tableLog), s.norm[:s.symbolLen])
+	}
+	out[outP] = byte(bitStream)
+	out[outP+1] = byte(bitStream >> 8)
+	outP += int((bitCount + 7) / 8)
+
+	if charnum > s.symbolLen {
+		return nil, errors.New("internal error: charnum > s.symbolLen")
+	}
+	return out[:outP], nil
+}
+
+// bitCost returns the approximate cost of symbolValue as a fixed-point value with accuracyLog fractional bits.
+// note 1 : assumes symbolValue is valid (<= maxSymbolValue)
+// note 2 : if freq[symbolValue]==0, a fake cost of tableLog+1 bits is returned.
+func (s *fseEncoder) bitCost(symbolValue uint8, accuracyLog uint32) uint32 {
+	minNbBits := s.ct.symbolTT[symbolValue].deltaNbBits >> 16
+	threshold := (minNbBits + 1) << 16
+	if debugAsserts {
+		if !(s.actualTableLog < 16) {
+			panic("!s.actualTableLog < 16")
+		}
+		// ensure enough room for renormalization double shift
+		if !(uint8(accuracyLog) < 31-s.actualTableLog) {
+			panic("!uint8(accuracyLog) < 31-s.actualTableLog")
+		}
+	}
+	tableSize := uint32(1) << s.actualTableLog
+	deltaFromThreshold := threshold - (s.ct.symbolTT[symbolValue].deltaNbBits + tableSize)
+	// linear interpolation (very approximate)
+	normalizedDeltaFromThreshold := (deltaFromThreshold << accuracyLog) >> s.actualTableLog
+	bitMultiplier := uint32(1) << accuracyLog
+	if debugAsserts {
+		if s.ct.symbolTT[symbolValue].deltaNbBits+tableSize > threshold {
+			panic("s.ct.symbolTT[symbolValue].deltaNbBits+tableSize > threshold")
+		}
+		if normalizedDeltaFromThreshold > bitMultiplier {
+			panic("normalizedDeltaFromThreshold > bitMultiplier")
+		}
+	}
+	return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold
+}
+
+// approxSize returns the approximate cost in bits of encoding the distribution in hist using the current tables.
+// Histogram should only be up to the last non-zero symbol.
+// Returns math.MaxUint32 if the tables cannot represent all the symbols in hist, or if this is an RLE encoder.
+func (s *fseEncoder) approxSize(hist []uint32) uint32 {
+	if int(s.symbolLen) < len(hist) {
+		// More symbols than we have.
+		return math.MaxUint32
+	}
+	if s.useRLE {
+		// We will never reuse RLE encoders.
+		return math.MaxUint32
+	}
+	const kAccuracyLog = 8
+	badCost := (uint32(s.actualTableLog) + 1) << kAccuracyLog
+	var cost uint32
+	for i, v := range hist {
+		if v == 0 {
+			continue
+		}
+		if s.norm[i] == 0 {
+			return math.MaxUint32
+		}
+		bitCost := s.bitCost(uint8(i), kAccuracyLog)
+		if bitCost > badCost {
+			return math.MaxUint32
+		}
+		cost += v * bitCost
+	}
+	return cost >> kAccuracyLog
+}
+
+// maxHeaderSize returns the maximum header size in bits.
+// This is not exact size, but we want a penalty for new tables anyway.
+func (s *fseEncoder) maxHeaderSize() uint32 {
+	if s.preDefined {
+		return 0
+	}
+	if s.useRLE {
+		return 8
+	}
+	return (((uint32(s.symbolLen) * uint32(s.actualTableLog)) >> 3) + 3) * 8
+}
+
+// cState contains the compression state of a stream.
+type cState struct {
+	bw         *bitWriter
+	stateTable []uint16
+	state      uint16
+}
+
+// init will initialize the compression state to the first symbol of the stream.
+func (c *cState) init(bw *bitWriter, ct *cTable, first symbolTransform) {
+	c.bw = bw
+	c.stateTable = ct.stateTable
+	if len(c.stateTable) == 1 {
+		// RLE
+		c.stateTable[0] = uint16(0)
+		c.state = 0
+		return
+	}
+	nbBitsOut := (first.deltaNbBits + (1 << 15)) >> 16
+	im := int32((nbBitsOut << 16) - first.deltaNbBits)
+	lu := (im >> nbBitsOut) + int32(first.deltaFindState)
+	c.state = c.stateTable[lu]
+	return
+}
+
+// encode the output symbol provided and write it to the bitstream.
+func (c *cState) encode(symbolTT symbolTransform) {
+	nbBitsOut := (uint32(c.state) + symbolTT.deltaNbBits) >> 16
+	dstState := int32(c.state>>(nbBitsOut&15)) + int32(symbolTT.deltaFindState)
+	c.bw.addBits16NC(c.state, uint8(nbBitsOut))
+	c.state = c.stateTable[dstState]
+}
+
+// flush calls flush32 on the bit writer and then writes the current state to the output using tableLog bits.
+func (c *cState) flush(tableLog uint8) {
+	c.bw.flush32()
+	c.bw.addBits16NC(c.state, tableLog)
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/fse_predefined.go b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
new file mode 100644
index 00000000000..6c17dc17f4f
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/fse_predefined.go
@@ -0,0 +1,158 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"fmt"
+	"math"
+	"sync"
+)
+
+var (
+	// fsePredef are the predefined fse tables as defined here:
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#default-distributions
+	// These values are already transformed.
+	fsePredef [3]fseDecoder
+
+	// fsePredefEnc are the predefined encoder based on fse tables as defined here:
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#default-distributions
+	// These values are already transformed.
+	fsePredefEnc [3]fseEncoder
+
+	// symbolTableX contain the transformations needed for each type as defined in
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#the-codes-for-literals-lengths-match-lengths-and-offsets
+	symbolTableX [3][]baseOffset
+
+	// maxTableSymbol is the biggest supported symbol for each table type
+	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#the-codes-for-literals-lengths-match-lengths-and-offsets
+	maxTableSymbol = [3]uint8{tableLiteralLengths: maxLiteralLengthSymbol, tableOffsets: maxOffsetLengthSymbol, tableMatchLengths: maxMatchLengthSymbol}
+
+	// bitTables is the bits table for each table.
+	bitTables = [3][]byte{tableLiteralLengths: llBitsTable[:], tableOffsets: nil, tableMatchLengths: mlBitsTable[:]}
+)
+
+type tableIndex uint8
+
+const (
+	// indexes for fsePredef and symbolTableX
+	tableLiteralLengths tableIndex = 0
+	tableOffsets        tableIndex = 1
+	tableMatchLengths   tableIndex = 2
+
+	maxLiteralLengthSymbol = 35
+	maxOffsetLengthSymbol  = 30
+	maxMatchLengthSymbol   = 52
+)
+
+// baseOffset is used for calculating transformations.
+type baseOffset struct {
+	baseLine uint32
+	addBits  uint8
+}
+
+// fillBase will precalculate base offsets with the given bit distributions.
+func fillBase(dst []baseOffset, base uint32, bits ...uint8) {
+	if len(bits) != len(dst) {
+		panic(fmt.Sprintf("len(dst) (%d) != len(bits) (%d)", len(dst), len(bits)))
+	}
+	for i, bit := range bits {
+		if base > math.MaxInt32 {
+			panic(fmt.Sprintf("invalid decoding table, base overflows int32"))
+		}
+
+		dst[i] = baseOffset{
+			baseLine: base,
+			addBits:  bit,
+		}
+		base += 1 << bit
+	}
+}
+
+var predef sync.Once
+
+func initPredefined() {
+	predef.Do(func() {
+		// Literals length codes
+		tmp := make([]baseOffset, 36)
+		for i := range tmp[:16] {
+			tmp[i] = baseOffset{
+				baseLine: uint32(i),
+				addBits:  0,
+			}
+		}
+		fillBase(tmp[16:], 16, 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
+		symbolTableX[tableLiteralLengths] = tmp
+
+		// Match length codes
+		tmp = make([]baseOffset, 53)
+		for i := range tmp[:32] {
+			tmp[i] = baseOffset{
+				// The transformation adds the 3 length.
+				baseLine: uint32(i) + 3,
+				addBits:  0,
+			}
+		}
+		fillBase(tmp[32:], 35, 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)
+		symbolTableX[tableMatchLengths] = tmp
+
+		// Offset codes
+		tmp = make([]baseOffset, maxOffsetBits+1)
+		tmp[1] = baseOffset{
+			baseLine: 1,
+			addBits:  1,
+		}
+		fillBase(tmp[2:], 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)
+		symbolTableX[tableOffsets] = tmp
+
+		// Fill predefined tables and transform them.
+		// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#default-distributions
+		for i := range fsePredef[:] {
+			f := &fsePredef[i]
+			switch tableIndex(i) {
+			case tableLiteralLengths:
+				// https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L243
+				f.actualTableLog = 6
+				copy(f.norm[:], []int16{4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
+					2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
+					-1, -1, -1, -1})
+				f.symbolLen = 36
+			case tableOffsets:
+				// https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L281
+				f.actualTableLog = 5
+				copy(f.norm[:], []int16{
+					1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+					1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1})
+				f.symbolLen = 29
+			case tableMatchLengths:
+				//https://github.com/facebook/zstd/blob/ededcfca57366461021c922720878c81a5854a0a/lib/decompress/zstd_decompress_block.c#L304
+				f.actualTableLog = 6
+				copy(f.norm[:], []int16{
+					1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+					1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+					1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1,
+					-1, -1, -1, -1, -1})
+				f.symbolLen = 53
+			}
+			if err := f.buildDtable(); err != nil {
+				panic(fmt.Errorf("building table %v: %v", tableIndex(i), err))
+			}
+			if err := f.transform(symbolTableX[i]); err != nil {
+				panic(fmt.Errorf("building table %v: %v", tableIndex(i), err))
+			}
+			f.preDefined = true
+
+			// Create encoder as well
+			enc := &fsePredefEnc[i]
+			copy(enc.norm[:], f.norm[:])
+			enc.symbolLen = f.symbolLen
+			enc.actualTableLog = f.actualTableLog
+			if err := enc.buildCTable(); err != nil {
+				panic(fmt.Errorf("building encoding table %v: %v", tableIndex(i), err))
+			}
+			enc.setBits(bitTables[i])
+			enc.preDefined = true
+		}
+	})
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/hash.go b/vendor/github.com/klauspost/compress/zstd/hash.go
new file mode 100644
index 00000000000..4a752067fc9
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/hash.go
@@ -0,0 +1,77 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+const (
+	prime3bytes = 506832829
+	prime4bytes = 2654435761
+	prime5bytes = 889523592379
+	prime6bytes = 227718039650203
+	prime7bytes = 58295818150454627
+	prime8bytes = 0xcf1bbcdcb7a56463
+)
+
+// hashLen returns a hash of the lowest mls bytes of u to fit in a hash table with hashLog bits.
+// mls must be >=4 and <=8. Any other value will return hash for 4 bytes.
+// hashLog should always be <32.
+// Preferably hashLog and mls should be constants.
+// FIXME: This does NOT get resolved, if 'mls' is constant,
+//  so this cannot be used.
+func hashLen(u uint64, hashLog, mls uint8) uint32 {
+	switch mls {
+	case 5:
+		return hash5(u, hashLog)
+	case 6:
+		return hash6(u, hashLog)
+	case 7:
+		return hash7(u, hashLog)
+	case 8:
+		return hash8(u, hashLog)
+	default:
+		return hash4x64(u, hashLog)
+	}
+}
+
+// hash3 returns the hash of the lower 3 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash3(u uint32, h uint8) uint32 {
+	return ((u << (32 - 24)) * prime3bytes) >> ((32 - h) & 31)
+}
+
+// hash4 returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4(u uint32, h uint8) uint32 {
+	return (u * prime4bytes) >> ((32 - h) & 31)
+}
+
+// hash4x64 returns the hash of the lowest 4 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <32.
+func hash4x64(u uint64, h uint8) uint32 {
+	return (uint32(u) * prime4bytes) >> ((32 - h) & 31)
+}
+
+// hash5 returns the hash of the lowest 5 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash5(u uint64, h uint8) uint32 {
+	return uint32(((u << (64 - 40)) * prime5bytes) >> ((64 - h) & 63))
+}
+
+// hash6 returns the hash of the lowest 6 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash6(u uint64, h uint8) uint32 {
+	return uint32(((u << (64 - 48)) * prime6bytes) >> ((64 - h) & 63))
+}
+
+// hash7 returns the hash of the lowest 7 bytes of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash7(u uint64, h uint8) uint32 {
+	return uint32(((u << (64 - 56)) * prime7bytes) >> ((64 - h) & 63))
+}
+
+// hash8 returns the hash of u to fit in a hash table with h bits.
+// Preferably h should be a constant and should always be <64.
+func hash8(u uint64, h uint8) uint32 {
+	return uint32((u * prime8bytes) >> ((64 - h) & 63))
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/history.go b/vendor/github.com/klauspost/compress/zstd/history.go
new file mode 100644
index 00000000000..e8c419bd533
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/history.go
@@ -0,0 +1,73 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"github.com/klauspost/compress/huff0"
+)
+
+// history contains the information transferred between blocks.
+type history struct {
+	b             []byte
+	huffTree      *huff0.Scratch
+	recentOffsets [3]int
+	decoders      sequenceDecs
+	windowSize    int
+	maxSize       int
+	error         bool
+}
+
+// reset will reset the history to initial state of a frame.
+// The history must already have been initialized to the desired size.
+func (h *history) reset() {
+	h.b = h.b[:0]
+	h.error = false
+	h.recentOffsets = [3]int{1, 4, 8}
+	if f := h.decoders.litLengths.fse; f != nil && !f.preDefined {
+		fseDecoderPool.Put(f)
+	}
+	if f := h.decoders.offsets.fse; f != nil && !f.preDefined {
+		fseDecoderPool.Put(f)
+	}
+	if f := h.decoders.matchLengths.fse; f != nil && !f.preDefined {
+		fseDecoderPool.Put(f)
+	}
+	h.decoders = sequenceDecs{}
+	if h.huffTree != nil {
+		huffDecoderPool.Put(h.huffTree)
+	}
+	h.huffTree = nil
+	//printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
+}
+
+// append bytes to history.
+// This function will make sure there is space for it,
+// if the buffer has been allocated with enough extra space.
+func (h *history) append(b []byte) {
+	if len(b) >= h.windowSize {
+		// Discard all history by simply overwriting
+		h.b = h.b[:h.windowSize]
+		copy(h.b, b[len(b)-h.windowSize:])
+		return
+	}
+
+	// If there is space, append it.
+	if len(b) < cap(h.b)-len(h.b) {
+		h.b = append(h.b, b...)
+		return
+	}
+
+	// Move data down so we only have window size left.
+	// We know we have less than window size in b at this point.
+	discard := len(b) + len(h.b) - h.windowSize
+	copy(h.b, h.b[discard:])
+	h.b = h.b[:h.windowSize]
+	copy(h.b[h.windowSize-len(b):], b)
+}
+
+// append bytes to history without ever discarding anything.
+func (h *history) appendKeep(b []byte) {
+	h.b = append(h.b, b...)
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/LICENSE.txt b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/LICENSE.txt
new file mode 100644
index 00000000000..24b53065f40
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/LICENSE.txt
@@ -0,0 +1,22 @@
+Copyright (c) 2016 Caleb Spare
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
new file mode 100644
index 00000000000..69aa3bb587c
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/README.md
@@ -0,0 +1,58 @@
+# xxhash
+
+VENDORED: Go to [github.com/cespare/xxhash](https://github.com/cespare/xxhash) for original package.
+
+
+[![GoDoc](https://godoc.org/github.com/cespare/xxhash?status.svg)](https://godoc.org/github.com/cespare/xxhash)
+[![Build Status](https://travis-ci.org/cespare/xxhash.svg?branch=master)](https://travis-ci.org/cespare/xxhash)
+
+xxhash is a Go implementation of the 64-bit
+[xxHash](http://cyan4973.github.io/xxHash/) algorithm, XXH64. This is a
+high-quality hashing algorithm that is much faster than anything in the Go
+standard library.
+
+This package provides a straightforward API:
+
+```
+func Sum64(b []byte) uint64
+func Sum64String(s string) uint64
+type Digest struct{ ... }
+    func New() *Digest
+```
+
+The `Digest` type implements hash.Hash64. Its key methods are:
+
+```
+func (*Digest) Write([]byte) (int, error)
+func (*Digest) WriteString(string) (int, error)
+func (*Digest) Sum64() uint64
+```
+
+This package provides a fast pure-Go implementation and an even faster
+assembly implementation for amd64.
+
+## Benchmarks
+
+Here are some quick benchmarks comparing the pure-Go and assembly
+implementations of Sum64.
+
+| input size | purego | asm |
+| --- | --- | --- |
+| 5 B   |  979.66 MB/s |  1291.17 MB/s  |
+| 100 B | 7475.26 MB/s | 7973.40 MB/s  |
+| 4 KB  | 17573.46 MB/s | 17602.65 MB/s |
+| 10 MB | 17131.46 MB/s | 17142.16 MB/s |
+
+These numbers were generated on Ubuntu 18.04 with an Intel i7-8700K CPU using
+the following commands under Go 1.11.2:
+
+```
+$ go test -tags purego -benchtime 10s -bench '/xxhash,direct,bytes'
+$ go test -benchtime 10s -bench '/xxhash,direct,bytes'
+```
+
+## Projects using this package
+
+- [InfluxDB](https://github.com/influxdata/influxdb)
+- [Prometheus](https://github.com/prometheus/prometheus)
+- [FreeCache](https://github.com/coocood/freecache)
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
new file mode 100644
index 00000000000..426b9cac786
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash.go
@@ -0,0 +1,238 @@
+// Package xxhash implements the 64-bit variant of xxHash (XXH64) as described
+// at http://cyan4973.github.io/xxHash/.
+// THIS IS VENDORED: Go to github.com/cespare/xxhash for original package.
+
+package xxhash
+
+import (
+	"encoding/binary"
+	"errors"
+	"math/bits"
+)
+
+const ( // the five prime constants of the XXH64 algorithm
+	prime1 uint64 = 11400714785074694791
+	prime2 uint64 = 14029467366897019727
+	prime3 uint64 = 1609587929392839161
+	prime4 uint64 = 9650029242287828579
+	prime5 uint64 = 2870177450012600261
+)
+
+// NOTE(caleb): I'm using both consts and vars of the primes. Using consts where
+// possible in the Go code is worth a small (but measurable) performance boost
+// by avoiding some MOVQs. Vars are needed for the asm and also are useful for
+// convenience in the Go code in a few places where we need to intentionally
+// avoid constant arithmetic (e.g., v1 := prime1 + prime2 fails because the
+// result overflows a uint64).
+var ( // variable copies of the constants above, same values
+	prime1v = prime1
+	prime2v = prime2
+	prime3v = prime3
+	prime4v = prime4
+	prime5v = prime5
+)
+
+// Digest implements hash.Hash64; it holds the streaming state of one hash.
+type Digest struct {
+	v1    uint64 // v1-v4 are the four accumulators that round updates per 32-byte block
+	v2    uint64
+	v3    uint64
+	v4    uint64
+	total uint64   // number of bytes passed to Write so far
+	mem   [32]byte // input that does not yet form a full block
+	n     int      // how much of mem is used
+}
+
+// New creates a new Digest that computes the 64-bit xxHash algorithm.
+func New() *Digest {
+	var d Digest
+	d.Reset()
+	return &d
+}
+
+// Reset restores d to the state of a fresh Digest so that it can be reused.
+func (d *Digest) Reset() {
+	d.v1 = prime1v + prime2 // var operand: as a constant expression this overflows uint64
+	d.v2 = prime2
+	d.v3 = 0
+	d.v4 = -prime1v // var operand for the same reason
+	d.total = 0
+	d.n = 0 // mem itself is not cleared; n == 0 marks it as empty
+}
+
+// Size returns the length in bytes of the hash that Sum appends, always 8.
+func (d *Digest) Size() int { return 8 }
+
+// BlockSize returns the size in bytes of the blocks Write hashes, always 32.
+func (d *Digest) BlockSize() int { return 32 }
+
+// Write feeds b into the hash. It always returns len(b), nil.
+func (d *Digest) Write(b []byte) (n int, err error) {
+	n = len(b)
+	d.total += uint64(n)
+
+	if d.n+n < 32 {
+		// This new data doesn't even fill the current block.
+		copy(d.mem[d.n:], b)
+		d.n += n
+		return
+	}
+
+	if d.n > 0 {
+		// Top up the buffered partial block with the start of b and hash it.
+		copy(d.mem[d.n:], b)
+		d.v1 = round(d.v1, u64(d.mem[0:8]))
+		d.v2 = round(d.v2, u64(d.mem[8:16]))
+		d.v3 = round(d.v3, u64(d.mem[16:24]))
+		d.v4 = round(d.v4, u64(d.mem[24:32]))
+		b = b[32-d.n:] // skip the bytes that completed the block
+		d.n = 0
+	}
+
+	if len(b) >= 32 {
+		// One or more full blocks left.
+		nw := writeBlocks(d, b)
+		b = b[nw:]
+	}
+
+	// Buffer whatever is left of b until more data arrives.
+	copy(d.mem[:], b)
+	d.n = len(b)
+
+	return
+}
+
+// Sum appends the current hash to b, most significant byte first, and returns the resulting slice.
+func (d *Digest) Sum(b []byte) []byte {
+	s := d.Sum64()
+	return append(
+		b,
+		byte(s>>56),
+		byte(s>>48),
+		byte(s>>40),
+		byte(s>>32),
+		byte(s>>24),
+		byte(s>>16),
+		byte(s>>8),
+		byte(s),
+	)
+}
+
+// Sum64 returns the hash of everything written so far; d itself is not modified.
+func (d *Digest) Sum64() uint64 {
+	var h uint64
+
+	if d.total >= 32 {
+		v1, v2, v3, v4 := d.v1, d.v2, d.v3, d.v4
+		h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
+		h = mergeRound(h, v1)
+		h = mergeRound(h, v2)
+		h = mergeRound(h, v3)
+		h = mergeRound(h, v4)
+	} else { // fewer than 32 bytes written in total: no block has been hashed
+		h = d.v3 + prime5
+	}
+
+	h += d.total
+
+	i, end := 0, d.n // fold in the buffered tail: 8-byte words, one 4-byte word, then single bytes
+	for ; i+8 <= end; i += 8 {
+		k1 := round(0, u64(d.mem[i:i+8]))
+		h ^= k1
+		h = rol27(h)*prime1 + prime4
+	}
+	if i+4 <= end {
+		h ^= uint64(u32(d.mem[i:i+4])) * prime1
+		h = rol23(h)*prime2 + prime3
+		i += 4
+	}
+	for i < end {
+		h ^= uint64(d.mem[i]) * prime5
+		h = rol11(h) * prime1
+		i++
+	}
+
+	h ^= h >> 33 // final mixing of the bits of h
+	h *= prime2
+	h ^= h >> 29
+	h *= prime3
+	h ^= h >> 32
+
+	return h
+}
+
+const (
+	magic         = "xxh\x06"             // prefix that identifies a marshaled Digest
+	marshaledSize = len(magic) + 8*5 + 32 // magic + five uint64 fields + the 32-byte mem
+)
+
+// MarshalBinary implements encoding.BinaryMarshaler; the returned error is always nil.
+func (d *Digest) MarshalBinary() ([]byte, error) {
+	b := make([]byte, 0, marshaledSize)
+	b = append(b, magic...)
+	b = appendUint64(b, d.v1)
+	b = appendUint64(b, d.v2)
+	b = appendUint64(b, d.v3)
+	b = appendUint64(b, d.v4)
+	b = appendUint64(b, d.total)
+	b = append(b, d.mem[:d.n]...) // only the used part of mem
+	b = b[:len(b)+len(d.mem)-d.n] // extend over the still-zero spare capacity, up to marshaledSize
+	return b, nil
+}
+
+// UnmarshalBinary implements encoding.BinaryUnmarshaler for the MarshalBinary format.
+func (d *Digest) UnmarshalBinary(b []byte) error {
+	if len(b) < len(magic) || string(b[:len(magic)]) != magic {
+		return errors.New("xxhash: invalid hash state identifier")
+	}
+	if len(b) != marshaledSize {
+		return errors.New("xxhash: invalid hash state size")
+	}
+	b = b[len(magic):]
+	b, d.v1 = consumeUint64(b)
+	b, d.v2 = consumeUint64(b)
+	b, d.v3 = consumeUint64(b)
+	b, d.v4 = consumeUint64(b)
+	b, d.total = consumeUint64(b)
+	copy(d.mem[:], b)
+	// n is not serialized: Write keeps it equal to total modulo the block size.
+	d.n = int(d.total % uint64(len(d.mem)))
+	return nil
+}
+
+func appendUint64(b []byte, x uint64) []byte {
+	var a [8]byte
+	binary.LittleEndian.PutUint64(a[:], x)
+	return append(b, a[:]...)
+}
+
+func consumeUint64(b []byte) ([]byte, uint64) { // returns b minus its first 8 bytes, and their little-endian value
+	x := u64(b)
+	return b[8:], x
+}
+
+func u64(b []byte) uint64 { return binary.LittleEndian.Uint64(b) } // first 8 bytes of b, little-endian
+func u32(b []byte) uint32 { return binary.LittleEndian.Uint32(b) } // first 4 bytes of b, little-endian
+
+func round(acc, input uint64) uint64 { // one accumulator step: multiply-add, rotate by 31, multiply
+	lane := input * prime2
+	lane += acc
+	lane = rol31(lane) * prime1
+	return lane
+}
+
+func mergeRound(acc, val uint64) uint64 { // folds one finished accumulator (val) into the hash (acc)
+	lane := round(0, val)
+	merged := acc ^ lane
+	merged = merged*prime1 + prime4
+	return merged
+}
+
+func rol1(x uint64) uint64  { return bits.RotateLeft64(x, 1) } // rolN rotates x left by N bits
+func rol7(x uint64) uint64  { return bits.RotateLeft64(x, 7) }
+func rol11(x uint64) uint64 { return bits.RotateLeft64(x, 11) }
+func rol12(x uint64) uint64 { return bits.RotateLeft64(x, 12) }
+func rol18(x uint64) uint64 { return bits.RotateLeft64(x, 18) }
+func rol23(x uint64) uint64 { return bits.RotateLeft64(x, 23) }
+func rol27(x uint64) uint64 { return bits.RotateLeft64(x, 27) }
+func rol31(x uint64) uint64 { return bits.RotateLeft64(x, 31) }
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go
new file mode 100644
index 00000000000..35318d7c46c
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.go
@@ -0,0 +1,13 @@
+// +build !appengine
+// +build gc
+// +build !purego
+
+package xxhash
+
+// Sum64 computes the 64-bit xxHash digest of b (body in xxhash_amd64.s).
+//
+//go:noescape
+func Sum64(b []byte) uint64
+
+//go:noescape
+func writeBlocks(*Digest, []byte) int // body in xxhash_amd64.s; returns the number of bytes hashed
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
new file mode 100644
index 00000000000..2c9c5357a14
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_amd64.s
@@ -0,0 +1,215 @@
+// +build !appengine
+// +build gc
+// +build !purego
+
+#include "textflag.h"
+
+// Register allocation:
+// AX	h
+// CX	pointer to advance through b
+// DX	n
+// BX	loop end
+// R8	v1, k1
+// R9	v2
+// R10	v3
+// R11	v4
+// R12	tmp
+// R13	prime1v
+// R14	prime2v
+// R15	prime4v
+
+// round mixes the 8 bytes at (CX) into accumulator r and advances CX by 8.
+// It assumes that R13 has prime1v and R14 has prime2v; R12 is overwritten.
+#define round(r) \
+	MOVQ  (CX), R12 \
+	ADDQ  $8, CX    \
+	IMULQ R14, R12  \
+	ADDQ  R12, r    \
+	ROLQ  $31, r    \
+	IMULQ R13, r
+
+// mergeRound folds register val into register acc; val is overwritten.
+// It assumes that R13 has prime1v, R14 has prime2v, and R15 has prime4v.
+#define mergeRound(acc, val) \
+	IMULQ R14, val \
+	ROLQ  $31, val \
+	IMULQ R13, val \
+	XORQ  val, acc \
+	IMULQ R13, acc \
+	ADDQ  R15, acc
+
+// func Sum64(b []byte) uint64
+TEXT ·Sum64(SB), NOSPLIT, $0-32
+	// Load fixed primes.
+	MOVQ ·prime1v(SB), R13
+	MOVQ ·prime2v(SB), R14
+	MOVQ ·prime4v(SB), R15
+
+	// Load slice.
+	MOVQ b_base+0(FP), CX
+	MOVQ b_len+8(FP), DX
+	LEAQ (CX)(DX*1), BX // BX = one past the last byte of b
+
+	// The first loop limit will be len(b)-32.
+	SUBQ $32, BX
+
+	// Check whether we have at least one block.
+	CMPQ DX, $32
+	JLT  noBlocks
+
+	// Set up initial state (v1, v2, v3, v4).
+	MOVQ R13, R8
+	ADDQ R14, R8 // v1 = prime1 + prime2
+	MOVQ R14, R9 // v2 = prime2
+	XORQ R10, R10 // v3 = 0
+	XORQ R11, R11
+	SUBQ R13, R11 // v4 = -prime1
+
+	// Loop until CX > BX.
+blockLoop:
+	round(R8)
+	round(R9)
+	round(R10)
+	round(R11)
+
+	CMPQ CX, BX
+	JLE  blockLoop
+
+	MOVQ R8, AX
+	ROLQ $1, AX
+	MOVQ R9, R12
+	ROLQ $7, R12
+	ADDQ R12, AX
+	MOVQ R10, R12
+	ROLQ $12, R12
+	ADDQ R12, AX
+	MOVQ R11, R12
+	ROLQ $18, R12
+	ADDQ R12, AX
+
+	mergeRound(AX, R8)
+	mergeRound(AX, R9)
+	mergeRound(AX, R10)
+	mergeRound(AX, R11)
+
+	JMP afterBlocks
+
+noBlocks:
+	MOVQ ·prime5v(SB), AX // fewer than 32 bytes: h starts as prime5
+
+afterBlocks:
+	ADDQ DX, AX // h += len(b)
+
+	// Right now BX has len(b)-32, and we want to loop until CX > len(b)-8.
+	ADDQ $24, BX
+
+	CMPQ CX, BX
+	JG   fourByte
+
+wordLoop:
+	// Calculate k1.
+	MOVQ  (CX), R8
+	ADDQ  $8, CX
+	IMULQ R14, R8
+	ROLQ  $31, R8
+	IMULQ R13, R8
+
+	XORQ  R8, AX
+	ROLQ  $27, AX
+	IMULQ R13, AX
+	ADDQ  R15, AX
+
+	CMPQ CX, BX
+	JLE  wordLoop
+
+fourByte:
+	ADDQ $4, BX // BX = end of b minus 4
+	CMPQ CX, BX
+	JG   singles
+
+	MOVL  (CX), R8
+	ADDQ  $4, CX
+	IMULQ R13, R8
+	XORQ  R8, AX
+
+	ROLQ  $23, AX
+	IMULQ R14, AX
+	ADDQ  ·prime3v(SB), AX
+
+singles:
+	ADDQ $4, BX // BX = end of b
+	CMPQ CX, BX
+	JGE  finalize
+
+singlesLoop:
+	MOVBQZX (CX), R12
+	ADDQ    $1, CX
+	IMULQ   ·prime5v(SB), R12
+	XORQ    R12, AX
+
+	ROLQ  $11, AX
+	IMULQ R13, AX
+
+	CMPQ CX, BX
+	JL   singlesLoop
+
+finalize:
+	MOVQ  AX, R12
+	SHRQ  $33, R12
+	XORQ  R12, AX // h ^= h >> 33
+	IMULQ R14, AX // h *= prime2
+	MOVQ  AX, R12
+	SHRQ  $29, R12
+	XORQ  R12, AX // h ^= h >> 29
+	IMULQ ·prime3v(SB), AX // h *= prime3
+	MOVQ  AX, R12
+	SHRQ  $32, R12
+	XORQ  R12, AX // h ^= h >> 32
+
+	MOVQ AX, ret+24(FP)
+	RET
+
+// writeBlocks uses the same registers as above except that it uses AX to store
+// the d pointer.
+
+// func writeBlocks(d *Digest, b []byte) int
+TEXT ·writeBlocks(SB), NOSPLIT, $0-40
+	// Load fixed primes needed for round.
+	MOVQ ·prime1v(SB), R13
+	MOVQ ·prime2v(SB), R14
+
+	// Load slice.
+	MOVQ arg1_base+8(FP), CX
+	MOVQ arg1_len+16(FP), DX
+	LEAQ (CX)(DX*1), BX
+	SUBQ $32, BX // BX = end of b minus 32: last address a full block may start at
+
+	// Load vN from d.
+	MOVQ arg+0(FP), AX
+	MOVQ 0(AX), R8   // v1
+	MOVQ 8(AX), R9   // v2
+	MOVQ 16(AX), R10 // v3
+	MOVQ 24(AX), R11 // v4
+
+	// We don't need to check the loop condition here; this function is
+	// always called with at least one block of data to process.
+blockLoop:
+	round(R8)
+	round(R9)
+	round(R10)
+	round(R11)
+
+	CMPQ CX, BX
+	JLE  blockLoop
+
+	// Copy vN back to d.
+	MOVQ R8, 0(AX)
+	MOVQ R9, 8(AX)
+	MOVQ R10, 16(AX)
+	MOVQ R11, 24(AX)
+
+	// The number of bytes consumed is CX minus the old base pointer.
+	SUBQ arg1_base+8(FP), CX
+	MOVQ CX, ret+32(FP)
+
+	RET
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
new file mode 100644
index 00000000000..4a5a821603e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_other.go
@@ -0,0 +1,76 @@
+// +build !amd64 appengine !gc purego
+
+package xxhash
+
+// Sum64 computes the 64-bit xxHash digest of b.
+func Sum64(b []byte) uint64 {
+	// A simpler version would be
+	//   d := New()
+	//   d.Write(b)
+	//   return d.Sum64()
+	// but this is faster, particularly for small inputs.
+
+	n := len(b)
+	var h uint64
+
+	if n >= 32 {
+		v1 := prime1v + prime2
+		v2 := prime2
+		v3 := uint64(0)
+		v4 := -prime1v
+		for len(b) >= 32 { // one full 32-byte block per iteration
+			v1 = round(v1, u64(b[0:8:len(b)]))
+			v2 = round(v2, u64(b[8:16:len(b)]))
+			v3 = round(v3, u64(b[16:24:len(b)]))
+			v4 = round(v4, u64(b[24:32:len(b)]))
+			b = b[32:len(b):len(b)]
+		}
+		h = rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
+		h = mergeRound(h, v1)
+		h = mergeRound(h, v2)
+		h = mergeRound(h, v3)
+		h = mergeRound(h, v4)
+	} else {
+		h = prime5
+	}
+
+	h += uint64(n) // n is the original length; b now holds only the unhashed tail
+
+	i, end := 0, len(b)
+	for ; i+8 <= end; i += 8 {
+		k1 := round(0, u64(b[i:i+8:len(b)]))
+		h ^= k1
+		h = rol27(h)*prime1 + prime4
+	}
+	if i+4 <= end {
+		h ^= uint64(u32(b[i:i+4:len(b)])) * prime1
+		h = rol23(h)*prime2 + prime3
+		i += 4
+	}
+	for ; i < end; i++ {
+		h ^= uint64(b[i]) * prime5
+		h = rol11(h) * prime1
+	}
+
+	h ^= h >> 33 // final mixing of the bits of h
+	h *= prime2
+	h ^= h >> 29
+	h *= prime3
+	h ^= h >> 32
+
+	return h
+}
+
+// writeBlocks hashes every full 32-byte block of b into d; returns bytes used.
+func writeBlocks(d *Digest, b []byte) int {
+	a1, a2, a3, a4 := d.v1, d.v2, d.v3, d.v4
+	pos := 0
+	for ; len(b)-pos >= 32; pos += 32 {
+		a1 = round(a1, u64(b[pos:pos+8]))
+		a2 = round(a2, u64(b[pos+8:pos+16]))
+		a3 = round(a3, u64(b[pos+16:pos+24]))
+		a4 = round(a4, u64(b[pos+24:pos+32]))
+	}
+	d.v1, d.v2, d.v3, d.v4 = a1, a2, a3, a4
+	return pos
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_safe.go b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_safe.go
new file mode 100644
index 00000000000..6f3b0cb1026
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/internal/xxhash/xxhash_safe.go
@@ -0,0 +1,11 @@
+package xxhash
+
+// Sum64String computes the 64-bit xxHash digest of s, converting s to a []byte first.
+func Sum64String(s string) uint64 {
+	return Sum64([]byte(s))
+}
+
+// WriteString adds s to d by way of Write. It always returns len(s), nil.
+func (d *Digest) WriteString(s string) (n int, err error) {
+	return d.Write([]byte(s))
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqdec.go b/vendor/github.com/klauspost/compress/zstd/seqdec.go
new file mode 100644
index 00000000000..15a45f7b501
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqdec.go
@@ -0,0 +1,402 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"errors"
+	"fmt"
+	"io"
+)
+
+type seq struct { // one sequence: literal length, match length and match offset
+	litLen   uint32
+	matchLen uint32 // String prints this plus zstdMinMatch
+	offset   uint32 // String prints 0 as invalid, 1-3 as "repeat", anything larger as offset-3
+
+	// Codes are stored here for the encoder
+	// so they only have to be looked up once.
+	llCode, mlCode, ofCode uint8
+}
+
+func (s seq) String() string { // human-readable form; the offset value decides the wording
+	switch {
+	case s.offset == 0:
+		return fmt.Sprint("litLen:", s.litLen, ", matchLen:", s.matchLen+zstdMinMatch, ", offset: INVALID (0)")
+	case s.offset <= 3:
+		return fmt.Sprint("litLen:", s.litLen, ", matchLen:", s.matchLen+zstdMinMatch, ", offset:", s.offset, " (repeat)")
+	}
+	return fmt.Sprint("litLen:", s.litLen, ", matchLen:", s.matchLen+zstdMinMatch, ", offset:", s.offset-3, " (new)")
+}
+
+type seqCompMode uint8 // NOTE(review): presumably how a sequence table is encoded in a block - confirm
+
+const (
+	compModePredefined seqCompMode = iota // values 0-3, in declaration order
+	compModeRLE
+	compModeFSE
+	compModeRepeat
+)
+
+type sequenceDec struct {
+	// decoder keeps track of the current state and updates it from the bitstream.
+	fse    *fseDecoder
+	state  fseState
+	repeat bool // when set, mergeHistory keeps the table that is already in the history
+}
+
+// init reads the decoder's starting state from br; it fails when no table is set.
+func (s *sequenceDec) init(br *bitReader) error {
+	if dec := s.fse; dec != nil {
+		s.state.init(br, dec.actualTableLog, dec.dt[:1<<dec.actualTableLog])
+		return nil
+	}
+	return errors.New("sequence decoder not defined")
+}
+
+// sequenceDecs contains all 3 sequence decoders and their state.
+type sequenceDecs struct {
+	litLengths   sequenceDec
+	offsets      sequenceDec
+	matchLengths sequenceDec
+	prevOffset   [3]int // recent offsets; initialize copies them from the history
+	hist         []byte // history bytes that matches may copy from
+	literals     []byte // literals not yet copied to out
+	out          []byte // decoded output; decode appends to it
+	maxBits      uint8  // sum of the maxBits of the three tables
+}
+
+// initialize reads the start state of all 3 decoders from br and binds the history, literals and output.
+func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []byte) error {
+	if err := s.litLengths.init(br); err != nil {
+		return errors.New("litLengths:" + err.Error())
+	}
+	if err := s.offsets.init(br); err != nil {
+		return errors.New("offsets:" + err.Error())
+	}
+	if err := s.matchLengths.init(br); err != nil {
+		return errors.New("matchLengths:" + err.Error())
+	}
+	s.literals = literals
+	s.hist = hist.b
+	s.prevOffset = hist.recentOffsets // array copy; hist is not updated from here
+	s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
+	s.out = out
+	return nil
+}
+
+// decode reads seqs sequences from br and appends their output to s.out.
+func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
+	startSize := len(s.out) // only what this call adds counts against maxBlockSize
+	// Grab full-size tables, to avoid bounds checks.
+	llTable, mlTable, ofTable := s.litLengths.fse.dt[:maxTablesize], s.matchLengths.fse.dt[:maxTablesize], s.offsets.fse.dt[:maxTablesize]
+	llState, mlState, ofState := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+
+	for i := seqs - 1; i >= 0; i-- {
+		if br.overread() {
+			printf("reading sequence %d, exceeded available data\n", seqs-i)
+			return io.ErrUnexpectedEOF
+		}
+		var litLen, matchOff, matchLen int
+		if br.off > 4+((maxOffsetBits+16+16)>>3) {
+			litLen, matchOff, matchLen = s.nextFast(br, llState, mlState, ofState)
+			br.fillFast()
+		} else {
+			litLen, matchOff, matchLen = s.next(br, llState, mlState, ofState)
+			br.fill()
+		}
+
+		if debugSequences {
+			println("Seq", seqs-i-1, "Litlen:", litLen, "matchOff:", matchOff, "(abs) matchLen:", matchLen)
+		}
+
+		if litLen > len(s.literals) {
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", litLen, len(s.literals))
+		}
+		size := litLen + matchLen + len(s.out)
+		if size-startSize > maxBlockSize {
+			return fmt.Errorf("output (%d) bigger than max block size", size)
+		}
+		if size > cap(s.out) {
+			// Not enough size, will be extremely rarely triggered,
+			// but could be if destination slice is too small for sync operations.
+			// We add maxBlockSize to the capacity.
+			s.out = append(s.out, make([]byte, maxBlockSize)...)
+			s.out = s.out[:len(s.out)-maxBlockSize]
+		}
+		if matchLen > maxMatchLen {
+			return fmt.Errorf("match len (%d) bigger than max allowed length", matchLen)
+		}
+		if matchOff > len(s.out)+len(hist)+litLen { // NOTE(review): bound uses the hist argument, the copy below uses s.hist - confirm they match
+			return fmt.Errorf("match offset (%d) bigger than current history (%d)", matchOff, len(s.out)+len(hist)+litLen)
+		}
+		if matchOff == 0 && matchLen > 0 {
+			return fmt.Errorf("zero matchoff and matchlen > 0")
+		}
+
+		s.out = append(s.out, s.literals[:litLen]...)
+		s.literals = s.literals[litLen:]
+		out := s.out
+
+		// Copy from history.
+		// TODO: Blocks without history could be made to ignore this completely.
+		if v := matchOff - len(s.out); v > 0 {
+			// v is the start position in history from end.
+			start := len(s.hist) - v
+			if matchLen > v {
+				// Some goes into current block.
+				// Copy remainder of history
+				out = append(out, s.hist[start:]...)
+				matchOff -= v
+				matchLen -= v
+			} else {
+				out = append(out, s.hist[start:start+matchLen]...)
+				matchLen = 0
+			}
+		}
+		// We must be in current buffer now
+		if matchLen > 0 {
+			start := len(s.out) - matchOff
+			if matchLen <= len(s.out)-start {
+				// No overlap
+				out = append(out, s.out[start:start+matchLen]...)
+			} else {
+				// Overlapping copy
+				// Extend destination slice and copy one byte at a time.
+				out = out[:len(out)+matchLen]
+				src := out[start : start+matchLen]
+				// Destination is the space we just added.
+				dst := out[len(out)-matchLen:]
+				dst = dst[:len(src)]
+				for i := range src {
+					dst[i] = src[i]
+				}
+			}
+		}
+		s.out = out
+		if i == 0 {
+			// This is the last sequence, so we shouldn't update state.
+			break
+		}
+
+		// Manually inlined, ~ 5-20% faster
+		// Update all 3 states at once. Approx 20% faster.
+		nBits := llState.nbBits() + mlState.nbBits() + ofState.nbBits()
+		if nBits == 0 {
+			llState = llTable[llState.newState()&maxTableMask]
+			mlState = mlTable[mlState.newState()&maxTableMask]
+			ofState = ofTable[ofState.newState()&maxTableMask]
+		} else {
+			bits := br.getBitsFast(nBits)
+			lowBits := uint16(bits >> ((ofState.nbBits() + mlState.nbBits()) & 31))
+			llState = llTable[(llState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits >> (ofState.nbBits() & 31))
+			lowBits &= bitMask[mlState.nbBits()&15]
+			mlState = mlTable[(mlState.newState()+lowBits)&maxTableMask]
+
+			lowBits = uint16(bits) & bitMask[ofState.nbBits()&15]
+			ofState = ofTable[(ofState.newState()+lowBits)&maxTableMask]
+		}
+	}
+
+	// Append the literals left over after the last sequence.
+	s.out = append(s.out, s.literals...)
+	return nil
+}
+
+// update advances all three decoder states from br; at least 27 bits must be available.
+func (s *sequenceDecs) update(br *bitReader) {
+	// Max 8 bits
+	s.litLengths.state.next(br)
+	// Max 9 bits
+	s.matchLengths.state.next(br)
+	// Max 8 bits
+	s.offsets.state.next(br)
+}
+
+var bitMask [16]uint16 // bitMask[n] has the n lowest bits set
+
+func init() {
+	for n := 0; n < len(bitMask); n++ {
+		bitMask[n] = uint16(1)<<uint(n) - 1
+	}
+}
+
+// updateAlt advances all three states with a single bit read; at least 27 bits must be available.
+func (s *sequenceDecs) updateAlt(br *bitReader) {
+	// Update all 3 states at once. Approx 20% faster.
+	a, b, c := s.litLengths.state.state, s.matchLengths.state.state, s.offsets.state.state
+
+	nBits := a.nbBits() + b.nbBits() + c.nbBits()
+	if nBits == 0 {
+		s.litLengths.state.state = s.litLengths.state.dt[a.newState()]
+		s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()]
+		s.offsets.state.state = s.offsets.state.dt[c.newState()]
+		return
+	}
+	bits := br.getBitsFast(nBits) // literal length bits on top, then match length, offset bits lowest
+	lowBits := uint16(bits >> ((c.nbBits() + b.nbBits()) & 31))
+	s.litLengths.state.state = s.litLengths.state.dt[a.newState()+lowBits]
+
+	lowBits = uint16(bits >> (c.nbBits() & 31))
+	lowBits &= bitMask[b.nbBits()&15]
+	s.matchLengths.state.state = s.matchLengths.state.dt[b.newState()+lowBits]
+
+	lowBits = uint16(bits) & bitMask[c.nbBits()&15]
+	s.offsets.state.state = s.offsets.state.dt[c.newState()+lowBits]
+}
+
+// nextFast returns the next literal length, match offset and match length. It requires at least 4 unused bytes left on the stream when done.
+func (s *sequenceDecs) nextFast(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) {
+	// Final will not read from stream.
+	ll, llB := llState.final()
+	ml, mlB := mlState.final()
+	mo, moB := ofState.final()
+
+	// extra bits are stored in reverse order.
+	br.fillFast()
+	mo += br.getBits(moB)
+	if s.maxBits > 32 {
+		br.fillFast()
+	}
+	ml += br.getBits(mlB)
+	ll += br.getBits(llB)
+
+	if moB > 1 { // not a repeat code: record mo as the most recent offset
+		s.prevOffset[2] = s.prevOffset[1]
+		s.prevOffset[1] = s.prevOffset[0]
+		s.prevOffset[0] = mo
+		return
+	}
+	// mo = s.adjustOffset(mo, ll, moB)
+	// Inlined for rather big speedup
+	if ll == 0 {
+		// There is an exception though, when current sequence's literals_length = 0.
+		// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+		// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+		mo++
+	}
+
+	if mo == 0 {
+		mo = s.prevOffset[0]
+		return
+	}
+	var temp int
+	if mo == 3 {
+		temp = s.prevOffset[0] - 1
+	} else {
+		temp = s.prevOffset[mo]
+	}
+
+	if temp == 0 {
+		// 0 is not valid; input is corrupted; force offset to 1
+		println("temp was 0")
+		temp = 1
+	}
+
+	if mo != 1 {
+		s.prevOffset[2] = s.prevOffset[1]
+	}
+	s.prevOffset[1] = s.prevOffset[0]
+	s.prevOffset[0] = temp
+	mo = temp
+	return
+}
+
+func (s *sequenceDecs) next(br *bitReader, llState, mlState, ofState decSymbol) (ll, mo, ml int) { // decode uses this when too little input remains for nextFast
+	// Final will not read from stream.
+	ll, llB := llState.final()
+	ml, mlB := mlState.final()
+	mo, moB := ofState.final()
+
+	// extra bits are stored in reverse order.
+	br.fill()
+	if s.maxBits <= 32 {
+		mo += br.getBits(moB)
+		ml += br.getBits(mlB)
+		ll += br.getBits(llB)
+	} else {
+		mo += br.getBits(moB)
+		br.fill()
+		// matchlength+literal length, max 32 bits
+		ml += br.getBits(mlB)
+		ll += br.getBits(llB)
+
+	}
+	mo = s.adjustOffset(mo, ll, moB)
+	return
+}
+
+func (s *sequenceDecs) adjustOffset(offset, litLen int, offsetB uint8) int { // resolves repeat-offset codes and updates prevOffset
+	if offsetB > 1 { // not a repeat code: record offset as the most recent one
+		s.prevOffset[2] = s.prevOffset[1]
+		s.prevOffset[1] = s.prevOffset[0]
+		s.prevOffset[0] = offset
+		return offset
+	}
+
+	if litLen == 0 {
+		// There is an exception though, when current sequence's literals_length = 0.
+		// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
+		// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
+		offset++
+	}
+
+	if offset == 0 {
+		return s.prevOffset[0]
+	}
+	var temp int
+	if offset == 3 {
+		temp = s.prevOffset[0] - 1
+	} else {
+		temp = s.prevOffset[offset]
+	}
+
+	if temp == 0 {
+		// 0 is not valid; input is corrupted; force offset to 1
+		println("temp was 0")
+		temp = 1
+	}
+
+	if offset != 1 {
+		s.prevOffset[2] = s.prevOffset[1]
+	}
+	s.prevOffset[1] = s.prevOffset[0]
+	s.prevOffset[0] = temp
+	return temp
+}
+
+// mergeHistory moves each non-repeat table of s into hist, pooling the table it replaces, and returns hist.
+func (s *sequenceDecs) mergeHistory(hist *sequenceDecs) (*sequenceDecs, error) {
+	for i := uint(0); i < 3; i++ {
+		var sNew, sHist *sequenceDec
+		switch i {
+		default:
+			// same as "case 0":
+			sNew = &s.litLengths
+			sHist = &hist.litLengths
+		case 1:
+			sNew = &s.offsets
+			sHist = &hist.offsets
+		case 2:
+			sNew = &s.matchLengths
+			sHist = &hist.matchLengths
+		}
+		if sNew.repeat { // keep the table that is already in hist
+			if sHist.fse == nil {
+				return nil, fmt.Errorf("sequence stream %d, repeat requested, but no history", i)
+			}
+			continue
+		}
+		if sNew.fse == nil {
+			return nil, fmt.Errorf("sequence stream %d, no fse found", i)
+		}
+		if sHist.fse != nil && !sHist.fse.preDefined {
+			fseDecoderPool.Put(sHist.fse)
+		}
+		sHist.fse = sNew.fse
+	}
+	return hist, nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/seqenc.go b/vendor/github.com/klauspost/compress/zstd/seqenc.go
new file mode 100644
index 00000000000..36bcc3cc02e
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/seqenc.go
@@ -0,0 +1,115 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import "math/bits"
+
+type seqCoders struct {
+	llEnc, ofEnc, mlEnc    *fseEncoder // current encoders (ll, of, ml)
+	llPrev, ofPrev, mlPrev *fseEncoder // previous encoders, maintained by setPrev
+}
+
+// swap exchanges all coders of s with those of other (another block).
+func (s *seqCoders) swap(other *seqCoders) {
+	*s, *other = *other, *s
+}
+
+// setPrev will update the previous encoders to the actually used ones
+// and make sure a fresh one is in the main slot.
+func (s *seqCoders) setPrev(ll, ml, of *fseEncoder) {
+	compareSwap := func(used *fseEncoder, current, prev **fseEncoder) {
+		// We used the new one, move current to history and reuse the previous history
+		if *current == used {
+			*prev, *current = *current, *prev
+			c := *current
+			p := *prev
+			c.reUsed = false
+			p.reUsed = true
+			return
+		}
+		if used == *prev { // the previous encoder was used again: nothing to change
+			return
+		}
+		// Ensure we cannot reuse by accident
+		prevEnc := *prev
+		prevEnc.symbolLen = 0
+		return
+	}
+	compareSwap(ll, &s.llEnc, &s.llPrev)
+	compareSwap(ml, &s.mlEnc, &s.mlPrev)
+	compareSwap(of, &s.ofEnc, &s.ofPrev)
+}
+
+func highBit(val uint32) (n uint32) { // index of the highest set bit of val
+	return uint32(31 - bits.LeadingZeros32(val))
+}
+
+var llCodeTable = [64]byte{0, 1, 2, 3, 4, 5, 6, 7,
+	8, 9, 10, 11, 12, 13, 14, 15,
+	16, 16, 17, 17, 18, 18, 19, 19,
+	20, 20, 20, 20, 21, 21, 21, 21,
+	22, 22, 22, 22, 22, 22, 22, 22,
+	23, 23, 23, 23, 23, 23, 23, 23,
+	24, 24, 24, 24, 24, 24, 24, 24,
+	24, 24, 24, 24, 24, 24, 24, 24} // llCode indexes this with literal lengths 0-63
+
+// maxLLCode is the largest literal length code; codes fit in 6 bits.
+const maxLLCode = 35
+
+// llBitsTable translates from ll code to number of bits.
+var llBitsTable = [maxLLCode + 1]byte{
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	1, 1, 1, 1, 2, 2, 3, 3,
+	4, 6, 7, 8, 9, 10, 11, 12,
+	13, 14, 15, 16}
+
+// llCode maps a literal length to the code that represents it.
+func llCode(litLength uint32) uint8 {
+	const llDeltaCode = 19
+	if litLength > 63 {
+		// Past the table the code is the index of the top set bit plus a delta.
+		return uint8(highBit(litLength)) + llDeltaCode
+	}
+	return llCodeTable[litLength&63] // mask kept: the compiler insists on a bounds check (Go 1.12)
+}
+
+var mlCodeTable = [128]byte{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+	16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+	32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37,
+	38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39,
+	40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40,
+	41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
+	42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+	42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42} // mlCode indexes this with mlBase values 0-127
+
+// maxMLCode is the largest match length code; codes fit in 6 bits.
+const maxMLCode = 52
+
+// mlBitsTable translates from ml code to number of bits.
+var mlBitsTable = [maxMLCode + 1]byte{
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	1, 1, 1, 1, 2, 2, 3, 3,
+	4, 4, 5, 7, 8, 9, 10, 11,
+	12, 13, 14, 15, 16}
+
+// mlCode maps mlBase to the code that represents it. mlBase is the match
+// length minus the minimum match, which is the form sequences store it in.
+func mlCode(mlBase uint32) uint8 {
+	const mlDeltaCode = 36
+	if mlBase > 127 {
+		// Past the table the code is the index of the top set bit plus a delta.
+		return uint8(highBit(mlBase)) + mlDeltaCode
+	}
+	return mlCodeTable[mlBase&127] // mask kept: the compiler insists on a bounds check (Go 1.12)
+}
+
+func ofCode(offset uint32) uint8 {
+	// The code is the index of the top set bit; valid offsets are always > 0.
+	return uint8(31 - bits.LeadingZeros32(offset))
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/snappy.go b/vendor/github.com/klauspost/compress/zstd/snappy.go
new file mode 100644
index 00000000000..356956ba256
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/snappy.go
@@ -0,0 +1,436 @@
+// Copyright 2019+ Klaus Post. All rights reserved.
+// License information can be found in the LICENSE file.
+// Based on work by Yann Collet, released under BSD License.
+
+package zstd
+
+import (
+	"encoding/binary"
+	"errors"
+	"hash/crc32"
+	"io"
+
+	"github.com/klauspost/compress/huff0"
+	"github.com/klauspost/compress/snappy"
+)
+
+const (
+	snappyTagLiteral = 0x00 // literal run; the length is in the tag byte and up to 4 following bytes
+	snappyTagCopy1   = 0x01 // copy: 3-bit length (+4) and 11-bit offset, 2 bytes in total
+	snappyTagCopy2   = 0x02 // copy: 6-bit length (+1) and 16-bit offset, 3 bytes in total
+	snappyTagCopy4   = 0x03 // copy: 6-bit length (+1) and 32-bit offset, 5 bytes in total
+)
+
+const (
+	snappyChecksumSize = 4        // bytes of checksum at the start of every data chunk
+	snappyMagicBody    = "sNaPpY" // expected body of the stream identifier chunk
+
+	// snappyMaxBlockSize is the maximum size of the input to encodeBlock. It is not
+	// part of the wire format per se, but some parts of the encoder assume
+	// that an offset fits into a uint16.
+	//
+	// Also, for the framing format (Writer type instead of Encode function),
+	// https://github.com/google/snappy/blob/master/framing_format.txt says
+	// that "the uncompressed data in a chunk must be no longer than 65536
+	// bytes".
+	snappyMaxBlockSize = 65536
+
+	// snappyMaxEncodedLenOfMaxBlockSize equals MaxEncodedLen(snappyMaxBlockSize), but is
+	// hard coded to be a const instead of a variable, so that obufLen can also
+	// be a const. Their equivalence is confirmed by
+	// TestMaxEncodedLenOfMaxBlockSize.
+	snappyMaxEncodedLenOfMaxBlockSize = 76490
+)
+
+const (
+	chunkTypeCompressedData   = 0x00 // framing format section 4.2
+	chunkTypeUncompressedData = 0x01 // framing format section 4.3
+	chunkTypePadding          = 0xfe // framing format section 4.4
+	chunkTypeStreamIdentifier = 0xff // framing format section 4.1
+)
+
+var (
+	// ErrSnappyCorrupt reports that the input is invalid.
+	ErrSnappyCorrupt = errors.New("snappy: corrupt input")
+	// ErrSnappyTooLarge reports that the uncompressed length is too large.
+	ErrSnappyTooLarge = errors.New("snappy: decoded block is too large")
+	// ErrSnappyUnsupported reports that the input isn't supported.
+	ErrSnappyUnsupported = errors.New("snappy: unsupported input")
+
+	errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length") // decodeSnappy: literal length was not positive
+)
+
+// SnappyConverter can read Snappy-compressed streams and convert them to zstd.
+// Conversion is done by converting the stream directly from Snappy without intermediate
+// full decoding.
+// Therefore the compression ratio is much less than what can be done by a full decompression
+// and compression, and a faulty Snappy stream may lead to a faulty Zstandard stream without
+// any errors being generated.
+// No CRC value is being generated and not all CRC values of the Snappy stream are checked.
+// However, it provides really fast recompression of Snappy streams.
+// The converter can be reused to avoid allocations, even after errors.
+type SnappyConverter struct {
+	r     io.Reader // input of the current Convert call
+	err   error     // last error recorded; cleared at the start of Convert
+	buf   []byte    // scratch space, sized for one maximum encoded chunk plus its checksum
+	block *blockEnc // block encoder, created on first use and reused afterwards
+}
+
+// Convert the Snappy stream supplied in 'in' and write the Zstandard stream to 'w'.
+// Any error from the Snappy stream, the block encoder or a write to 'w' is returned.
+// The number of bytes written is returned.
+func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
+	initPredefined()
+	r.err = nil
+	r.r = in
+	if r.block == nil {
+		r.block = &blockEnc{}
+		r.block.init()
+	}
+	r.block.initNewEncode()
+	if len(r.buf) != snappyMaxEncodedLenOfMaxBlockSize+snappyChecksumSize {
+		r.buf = make([]byte, snappyMaxEncodedLenOfMaxBlockSize+snappyChecksumSize)
+	}
+	r.block.litEnc.Reuse = huff0.ReusePolicyNone
+	var written int64
+	var readHeader bool
+	{
+		var n int
+		header, err := frameHeader{WindowSize: snappyMaxBlockSize}.appendTo(r.buf[:0])
+		if err == nil {
+			n, err = w.Write(header)
+		}
+		if r.err = err; r.err != nil { // Covers both building and writing the frame header.
+			return written, r.err
+		}
+		written += int64(n)
+	}
+
+	for {
+		if !r.readFull(r.buf[:4], true) {
+			// Add empty last block
+			r.block.reset(nil)
+			r.block.last = true
+			err := r.block.encodeLits(false)
+			if err != nil {
+				return written, err
+			}
+			n, err := w.Write(r.block.output)
+			if err != nil {
+				return written, err
+			}
+			written += int64(n)
+
+			return written, r.err
+		}
+		chunkType := r.buf[0]
+		if !readHeader {
+			if chunkType != chunkTypeStreamIdentifier {
+				println("chunkType != chunkTypeStreamIdentifier", chunkType)
+				r.err = ErrSnappyCorrupt
+				return written, r.err
+			}
+			readHeader = true
+		}
+		chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
+		if chunkLen > len(r.buf) {
+			println("chunkLen > len(r.buf)", chunkType)
+			r.err = ErrSnappyUnsupported
+			return written, r.err
+		}
+
+		// The chunk types are specified at
+		// https://github.com/google/snappy/blob/master/framing_format.txt
+		switch chunkType {
+		case chunkTypeCompressedData:
+			// Section 4.2. Compressed data (chunk type 0x00).
+			if chunkLen < snappyChecksumSize {
+				println("chunkLen < snappyChecksumSize", chunkLen, snappyChecksumSize)
+				r.err = ErrSnappyCorrupt
+				return written, r.err
+			}
+			buf := r.buf[:chunkLen]
+			if !r.readFull(buf, false) {
+				return written, r.err
+			}
+			//checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			buf = buf[snappyChecksumSize:]
+
+			n, hdr, err := snappyDecodedLen(buf)
+			if err != nil {
+				r.err = err
+				return written, r.err
+			}
+			buf = buf[hdr:]
+			if n > snappyMaxBlockSize {
+				println("n > snappyMaxBlockSize", n, snappyMaxBlockSize)
+				r.err = ErrSnappyCorrupt
+				return written, r.err
+			}
+			r.block.reset(nil)
+			r.block.pushOffsets()
+			if err := decodeSnappy(r.block, buf); err != nil {
+				r.err = err
+				return written, r.err
+			}
+			if r.block.size+r.block.extraLits != n {
+				printf("invalid size, want %d, got %d\n", n, r.block.size+r.block.extraLits)
+				r.err = ErrSnappyCorrupt
+				return written, r.err
+			}
+			err = r.block.encode(false)
+			switch err {
+			case errIncompressible:
+				r.block.popOffsets()
+				r.block.reset(nil)
+				r.block.literals, err = snappy.Decode(r.block.literals[:n], r.buf[snappyChecksumSize:chunkLen])
+				if err != nil {
+					println("snappy.Decode:", err)
+					return written, err
+				}
+				err = r.block.encodeLits(false)
+				if err != nil {
+					return written, err
+				}
+			case nil:
+			default:
+				return written, err
+			}
+
+			n, r.err = w.Write(r.block.output)
+			if r.err != nil {
+				return written, r.err // err is nil at this point; the write error lives in r.err.
+			}
+			written += int64(n)
+			continue
+		case chunkTypeUncompressedData:
+			if debug {
+				println("Uncompressed, chunklen", chunkLen)
+			}
+			// Section 4.3. Uncompressed data (chunk type 0x01).
+			if chunkLen < snappyChecksumSize {
+				println("chunkLen < snappyChecksumSize", chunkLen, snappyChecksumSize)
+				r.err = ErrSnappyCorrupt
+				return written, r.err
+			}
+			r.block.reset(nil)
+			buf := r.buf[:snappyChecksumSize]
+			if !r.readFull(buf, false) {
+				return written, r.err
+			}
+			checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
+			// Read directly into r.block.literals instead of via r.buf.
+			n := chunkLen - snappyChecksumSize
+			if n > snappyMaxBlockSize {
+				println("n > snappyMaxBlockSize", n, snappyMaxBlockSize)
+				r.err = ErrSnappyCorrupt
+				return written, r.err
+			}
+			r.block.literals = r.block.literals[:n]
+			if !r.readFull(r.block.literals, false) {
+				return written, r.err
+			}
+			if snappyCRC(r.block.literals) != checksum {
+				println("literals crc mismatch")
+				r.err = ErrSnappyCorrupt
+				return written, r.err
+			}
+			err := r.block.encodeLits(false)
+			if err != nil {
+				return written, err
+			}
+			n, r.err = w.Write(r.block.output)
+			if r.err != nil {
+				return written, r.err // err is nil at this point; the write error lives in r.err.
+			}
+			written += int64(n)
+			continue
+
+		case chunkTypeStreamIdentifier:
+			if debug {
+				println("stream id", chunkLen, len(snappyMagicBody))
+			}
+			// Section 4.1. Stream identifier (chunk type 0xff).
+			if chunkLen != len(snappyMagicBody) {
+				println("chunkLen != len(snappyMagicBody)", chunkLen, len(snappyMagicBody))
+				r.err = ErrSnappyCorrupt
+				return written, r.err
+			}
+			if !r.readFull(r.buf[:len(snappyMagicBody)], false) {
+				return written, r.err
+			}
+			for i := 0; i < len(snappyMagicBody); i++ {
+				if r.buf[i] != snappyMagicBody[i] {
+					println("r.buf[i] != snappyMagicBody[i]", r.buf[i], snappyMagicBody[i], i)
+					r.err = ErrSnappyCorrupt
+					return written, r.err
+				}
+			}
+			continue
+		}
+
+		if chunkType <= 0x7f {
+			// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
+			println("chunkType <= 0x7f")
+			r.err = ErrSnappyUnsupported
+			return written, r.err
+		}
+		// Section 4.4 Padding (chunk type 0xfe).
+		// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
+		if !r.readFull(r.buf[:chunkLen], false) {
+			return written, r.err
+		}
+	}
+}
+
+// decodeSnappy converts the Snappy block in src into literals and sequences appended to blk.
+// It assumes that the varint-encoded length of the decompressed bytes has already been read.
+func decodeSnappy(blk *blockEnc, src []byte) error {
+	//decodeRef(make([]byte, snappyMaxBlockSize), src)
+	var s, length int
+	lits := blk.extraLits
+	var offset uint32
+	for s < len(src) {
+		switch src[s] & 0x03 {
+		case snappyTagLiteral:
+			x := uint32(src[s] >> 2)
+			switch {
+			case x < 60:
+				s++
+			case x == 60:
+				s += 2
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					println("uint(s) > uint(len(src)", s, src)
+					return ErrSnappyCorrupt
+				}
+				x = uint32(src[s-1])
+			case x == 61:
+				s += 3
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					println("uint(s) > uint(len(src)", s, src)
+					return ErrSnappyCorrupt
+				}
+				x = uint32(src[s-2]) | uint32(src[s-1])<<8
+			case x == 62:
+				s += 4
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					println("uint(s) > uint(len(src)", s, src)
+					return ErrSnappyCorrupt
+				}
+				x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
+			case x == 63:
+				s += 5
+				if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+					println("uint(s) > uint(len(src)", s, src)
+					return ErrSnappyCorrupt
+				}
+				x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+			}
+			if x > snappyMaxBlockSize {
+				println("x > snappyMaxBlockSize", x, snappyMaxBlockSize)
+				return ErrSnappyCorrupt
+			}
+			length = int(x) + 1
+			if length <= 0 {
+				println("length <= 0 ", length)
+
+				return errUnsupportedLiteralLength
+			}
+			if length > len(src)-s { // The literal would run past the end of src.
+				return ErrSnappyCorrupt
+			}
+
+			blk.literals = append(blk.literals, src[s:s+length]...)
+			//println(length, "litLen")
+			lits += length
+			s += length
+			continue
+
+		case snappyTagCopy1:
+			s += 2
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				println("uint(s) > uint(len(src)", s, len(src))
+				return ErrSnappyCorrupt
+			}
+			length = 4 + int(src[s-2])>>2&0x7
+			offset = uint32(src[s-2])&0xe0<<3 | uint32(src[s-1])
+
+		case snappyTagCopy2:
+			s += 3
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				println("uint(s) > uint(len(src)", s, len(src))
+				return ErrSnappyCorrupt
+			}
+			length = 1 + int(src[s-3])>>2
+			offset = uint32(src[s-2]) | uint32(src[s-1])<<8
+
+		case snappyTagCopy4:
+			s += 5
+			if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
+				println("uint(s) > uint(len(src)", s, len(src))
+				return ErrSnappyCorrupt
+			}
+			length = 1 + int(src[s-5])>>2
+			offset = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
+		}
+
+		if offset <= 0 || blk.size+lits < int(offset) /*|| length > len(blk)-d */ {
+			println("offset <= 0 || blk.size+lits < int(offset)", offset, blk.size+lits, int(offset), blk.size, lits)
+
+			return ErrSnappyCorrupt
+		}
+
+		// Check if offset is one of the recent offsets.
+		// Adjusts the output offset accordingly.
+		// Gives a tiny bit of compression, typically around 1%.
+		if false {
+			offset = blk.matchOffset(offset, uint32(lits))
+		} else {
+			offset += 3
+		}
+
+		blk.sequences = append(blk.sequences, seq{
+			litLen:   uint32(lits),
+			offset:   offset,
+			matchLen: uint32(length) - zstdMinMatch,
+		})
+		blk.size += length + lits
+		lits = 0
+	}
+	blk.extraLits = lits
+	return nil
+}
+
+// readFull fills p from r.r; a short read, or EOF when !allowEOF, is recorded as ErrSnappyCorrupt in r.err.
+func (r *SnappyConverter) readFull(p []byte, allowEOF bool) (ok bool) {
+	_, r.err = io.ReadFull(r.r, p)
+	switch {
+	case r.err == io.ErrUnexpectedEOF, r.err == io.EOF && !allowEOF:
+		r.err = ErrSnappyCorrupt
+	}
+	return r.err == nil
+}
+
+var crcTable = crc32.MakeTable(crc32.Castagnoli)
+
+// snappyCRC implements the checksum specified in section 3 of
+// https://github.com/google/snappy/blob/master/framing_format.txt
+func snappyCRC(b []byte) uint32 {
+	sum := crc32.Checksum(b, crcTable)
+	return (sum>>15 | sum<<17) + 0xa282ead8
+}
+
+// snappyDecodedLen parses the uvarint length prefix of a Snappy block. It returns the
+// decoded block length and the number of bytes that the prefix itself occupied.
+func snappyDecodedLen(src []byte) (blockLen, headerLen int, err error) {
+	const wordSize = 32 << (^uint(0) >> 32 & 1)
+
+	decoded, used := binary.Uvarint(src)
+	switch {
+	case used <= 0 || decoded > 0xffffffff:
+		return 0, 0, ErrSnappyCorrupt
+	case wordSize == 32 && decoded > 0x7fffffff:
+		return 0, 0, ErrSnappyTooLarge
+	}
+	return int(decoded), used, nil
+}
diff --git a/vendor/github.com/klauspost/compress/zstd/zstd.go b/vendor/github.com/klauspost/compress/zstd/zstd.go
new file mode 100644
index 00000000000..0807719c8b9
--- /dev/null
+++ b/vendor/github.com/klauspost/compress/zstd/zstd.go
@@ -0,0 +1,144 @@
+// Package zstd provides compression and decompression of Zstandard content.
+//
+// For advanced usage and examples, go to the README: https://github.com/klauspost/compress/tree/master/zstd#zstd
+package zstd
+
+import (
+	"errors"
+	"log"
+	"math"
+	"math/bits"
+)
+
+// enable debug printing
+const debug = false
+
+// Enable extra assertions.
+const debugAsserts = debug || false
+
+// print sequence details
+const debugSequences = false
+
+// print detailed matching information
+const debugMatches = false
+
+// force encoder to use predefined tables.
+const forcePreDef = false
+
+// zstdMinMatch is the minimum zstd match length.
+const zstdMinMatch = 3
+
+// Reset the buffer offset when reaching this.
+const bufferReset = math.MaxInt32 - MaxWindowSize
+
+var (
+	// ErrReservedBlockType is returned when a reserved block type is found.
+	// Typically this indicates wrong or corrupted input.
+	ErrReservedBlockType = errors.New("invalid input: reserved block type encountered")
+
+	// ErrCompressedSizeTooBig is returned when a block is bigger than allowed.
+	// Typically this indicates wrong or corrupted input.
+	ErrCompressedSizeTooBig = errors.New("invalid input: compressed size too big")
+
+	// ErrBlockTooSmall is returned when a block is too small to be decoded.
+	// Typically returned on invalid input.
+	ErrBlockTooSmall = errors.New("block too small")
+
+	// ErrMagicMismatch is returned when a "magic" number isn't what is expected.
+	// Typically this indicates wrong or corrupted input.
+	ErrMagicMismatch = errors.New("invalid input: magic number mismatch")
+
+	// ErrWindowSizeExceeded is returned when a reference exceeds the valid window size.
+	// Typically this indicates wrong or corrupted input.
+	ErrWindowSizeExceeded = errors.New("window size exceeded")
+
+	// ErrWindowSizeTooSmall is returned when no window size is specified.
+	// Typically this indicates wrong or corrupted input.
+	ErrWindowSizeTooSmall = errors.New("invalid input: window size was too small")
+
+	// ErrDecoderSizeExceeded is returned if decompressed size exceeds the configured limit.
+	ErrDecoderSizeExceeded = errors.New("decompressed size exceeds configured limit")
+
+	// ErrUnknownDictionary is returned if the dictionary ID is unknown.
+	// For the time being dictionaries are not supported.
+	ErrUnknownDictionary = errors.New("unknown dictionary")
+
+	// ErrFrameSizeExceeded is returned if the stated frame size is exceeded.
+	// This is only returned if SingleSegment is specified on the frame.
+	ErrFrameSizeExceeded = errors.New("frame size exceeded")
+
+	// ErrCRCMismatch is returned if CRC mismatches.
+	ErrCRCMismatch = errors.New("CRC check failed")
+
+	// ErrDecoderClosed will be returned if the Decoder was used after
+	// Close has been called.
+	ErrDecoderClosed = errors.New("decoder used after Close")
+)
+
+func println(a ...interface{}) { // Shadows the builtin println: forwards to log.Println, and only when debug is true.
+	if debug {
+		log.Println(a...)
+	}
+}
+
+func printf(format string, a ...interface{}) { // Forwards to log.Printf, and only when debug is true.
+	if debug {
+		log.Printf(format, a...)
+	}
+}
+
+// matchLenFast returns how many leading bytes of a and b are equal, but will not match the last up to 7 bytes of a.
+func matchLenFast(a, b []byte) int {
+	endI := len(a) & (math.MaxInt32 - 7) // len(a) rounded down to a multiple of 8 (for lengths up to math.MaxInt32)
+	for i := 0; i < endI; i += 8 {
+		if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+			return i + bits.TrailingZeros64(diff)>>3 // load64 is little-endian, so this is the first differing byte
+		}
+	}
+	return endI
+}
+
+// matchLen returns the length of the common prefix of a and b.
+// a must be the shortest of the two.
+// If every byte of a matches, len(a) is returned.
+func matchLen(a, b []byte) int {
+	b = b[:len(a)]
+	for i := 0; i < len(a)-7; i += 8 {
+		if diff := load64(a, i) ^ load64(b, i); diff != 0 {
+			return i + (bits.TrailingZeros64(diff) >> 3)
+		}
+	}
+
+	checked := (len(a) >> 3) << 3 // bytes already compared 8 at a time by the loop above
+	a = a[checked:]
+	b = b[checked:]
+	for i := range a {
+		if a[i] != b[i] {
+			return i + checked
+		}
+	}
+	return len(a) + checked
+}
+
+func load3232(b []byte, i int32) uint32 {
+	// Little-endian load of b[i:i+4]. Reslicing first helps the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:4]
+	return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
+}
+
+func load6432(b []byte, i int32) uint64 {
+	// Little-endian load of b[i:i+8]. Reslicing first helps the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:8]
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
+
+func load64(b []byte, i int) uint64 {
+	// Little-endian load of b[i:i+8]. Reslicing first helps the compiler eliminate bounds checks on the read so it can be done in a single read.
+	b = b[i:]
+	b = b[:8]
+	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
+		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/.gitignore b/vendor/github.com/pierrec/lz4/v4/.gitignore
new file mode 100644
index 00000000000..5d7e88de0a3
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/.gitignore
@@ -0,0 +1,36 @@
+# Created by https://www.gitignore.io/api/macos
+
+### macOS ###
+*.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+# End of https://www.gitignore.io/api/macos
+
+cmd/*/*exe
+.idea
+
+fuzz/*.zip
diff --git a/vendor/github.com/pierrec/lz4/v4/LICENSE b/vendor/github.com/pierrec/lz4/v4/LICENSE
new file mode 100644
index 00000000000..bd899d8353d
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/LICENSE
@@ -0,0 +1,28 @@
+Copyright (c) 2015, Pierre Curto
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+* Neither the name of xxHash nor the names of its
+  contributors may be used to endorse or promote products derived from
+  this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/vendor/github.com/pierrec/lz4/v4/README.md b/vendor/github.com/pierrec/lz4/v4/README.md
new file mode 100644
index 00000000000..df027e2c301
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/README.md
@@ -0,0 +1,90 @@
+# lz4 : LZ4 compression in pure Go
+
+[![Go Reference](https://pkg.go.dev/badge/github.com/pierrec/lz4/v4.svg)](https://pkg.go.dev/github.com/pierrec/lz4/v4)
+[![CI](https://github.com/pierrec/lz4/workflows/ci/badge.svg)](https://github.com/pierrec/lz4/actions)
+[![Go Report Card](https://goreportcard.com/badge/github.com/pierrec/lz4)](https://goreportcard.com/report/github.com/pierrec/lz4)
+[![GitHub tag (latest SemVer)](https://img.shields.io/github/tag/pierrec/lz4.svg?style=social)](https://github.com/pierrec/lz4/tags)
+
+## Overview
+
+This package provides a streaming interface to [LZ4 data streams](http://fastcompression.blogspot.fr/2013/04/lz4-streaming-format-final.html) as well as low level compress and uncompress functions for LZ4 data blocks.
+The implementation is based on the [reference C implementation](https://github.com/lz4/lz4).
+
+## Install
+
+Assuming you have the go toolchain installed:
+
+```
+go get github.com/pierrec/lz4/v4
+```
+
+There is a command line interface tool to compress and decompress LZ4 files.
+
+```
+go install github.com/pierrec/lz4/v4/cmd/lz4c
+```
+
+Usage
+
+```
+Usage of lz4c:
+  -version
+        print the program version
+
+Subcommands:
+Compress the given files or from stdin to stdout.
+compress [arguments] [<file name> ...]
+  -bc
+        enable block checksum
+  -l int
+        compression level (0=fastest)
+  -sc
+        disable stream checksum
+  -size string
+        block max size [64K,256K,1M,4M] (default "4M")
+
+Uncompress the given files or from stdin to stdout.
+uncompress [arguments] [<file name> ...]
+
+```
+
+
+## Example
+
+```
+// Compress and uncompress an input string.
+s := "hello world"
+r := strings.NewReader(s)
+
+// The pipe will uncompress the data from the writer.
+pr, pw := io.Pipe()
+zw := lz4.NewWriter(pw)
+zr := lz4.NewReader(pr)
+
+go func() {
+	// Compress the input string.
+	_, _ = io.Copy(zw, r)
+	_ = zw.Close() // Make sure the writer is closed
+	_ = pw.Close() // Terminate the pipe
+}()
+
+_, _ = io.Copy(os.Stdout, zr)
+
+// Output:
+// hello world
+```
+
+## Contributing
+
+Contributions are very welcome: bug fixes, performance improvements, and more!
+
+- Open an issue with a proper description
+- Send a pull request with appropriate test case(s)
+
+## Contributors
+
+Thanks to all [contributors](https://github.com/pierrec/lz4/graphs/contributors) so far!
+
+Special thanks to [@Zariel](https://github.com/Zariel) for his asm implementation of the decoder.
+
+Special thanks to [@klauspost](https://github.com/klauspost) for his work on optimizing the code.
diff --git a/vendor/github.com/pierrec/lz4/v4/go.mod b/vendor/github.com/pierrec/lz4/v4/go.mod
new file mode 100644
index 00000000000..42229b29672
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/go.mod
@@ -0,0 +1,3 @@
+module github.com/pierrec/lz4/v4
+
+go 1.14
diff --git a/vendor/github.com/pierrec/lz4/v4/go.sum b/vendor/github.com/pierrec/lz4/v4/go.sum
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/block.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/block.go
new file mode 100644
index 00000000000..f3826494308
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/block.go
@@ -0,0 +1,469 @@
+package lz4block
+
+import (
+	"encoding/binary"
+	"math/bits"
+	"sync"
+
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+)
+
+const (
+	// The following constants are used to setup the compression algorithm.
+	minMatch   = 4  // the minimum size of the match sequence size (4 bytes)
+	winSizeLog = 16 // LZ4 64Kb window size limit
+	winSize    = 1 << winSizeLog
+	winMask    = winSize - 1 // 64Kb window of previous data for dependent blocks
+
+	// hashLog determines the size of the hash table used to quickly find a previous match position.
+	// Its value influences the compression speed and memory usage, the lower the faster,
+	// but at the expense of the compression ratio.
+	// 16 seems to be the best compromise for fast compression.
+	hashLog = 16
+	htSize  = 1 << hashLog
+
+	mfLimit = 10 + minMatch // The last match cannot start within the last 14 bytes.
+)
+
+func recoverBlock(e *error) { // Meant to be deferred: stops a panic and, if *e is still nil, reports it as ErrInvalidSourceShortBuffer.
+	if r := recover(); r != nil && *e == nil {
+		*e = lz4errors.ErrInvalidSourceShortBuffer
+	}
+}
+
+// blockHash hashes the lower 6 bytes of x into a value < htSize.
+func blockHash(x uint64) uint32 {
+	const prime6bytes = 227718039650203
+	return uint32(((x << (64 - 48)) * prime6bytes) >> (64 - hashLog)) // the shift left drops the top 2 bytes of x
+}
+
+func CompressBlockBound(n int) int {
+	return n + n/255 + 16 // CompressBlock may report "incompressible" (0, nil) only when dst is smaller than this for len(src) == n
+}
+
+func UncompressBlock(src, dst []byte) (int, error) { // Hands src to decodeBlock with dst as the output buffer.
+	if len(src) == 0 {
+		return 0, nil
+	}
+	if di := decodeBlock(dst, src); di >= 0 { // a negative result is reported as ErrInvalidSourceShortBuffer below
+		return di, nil
+	}
+	return 0, lz4errors.ErrInvalidSourceShortBuffer
+}
+
+type Compressor struct {
+	// Offsets are at most 64kiB, so we can store only the lower 16 bits of
+	// match positions: effectively, an offset from some 64kiB block boundary.
+	//
+	// When we retrieve such an offset, we interpret it as relative to the last
+	// block boundary si &^ 0xffff, or the one before, (si &^ 0xffff) - 0x10000,
+	// depending on which of these is inside the current window. If a table
+	// entry was generated more than 64kiB back in the input, we find out by
+	// inspecting the input stream.
+	table [htSize]uint16
+
+	needsReset bool // set by the first CompressBlock call; later calls zero table before compressing
+}
+
+// get returns the position of a presumptive match for the hash h.
+// The match may be a false positive due to a hash collision or an old entry.
+// If si < winSize, the return value may be negative.
+func (c *Compressor) get(h uint32, si int) int {
+	h &= htSize - 1
+	i := int(c.table[h])
+	i += si &^ winMask
+	if i >= si {
+		// Try previous 64kiB block (negative when in first block).
+		i -= winSize
+	}
+	return i
+}
+
+func (c *Compressor) put(h uint32, si int) {
+	h &= htSize - 1
+	c.table[h] = uint16(si) // only the low 16 bits of the position are stored; get rebuilds the rest from si
+}
+
+var compressorPool = sync.Pool{New: func() interface{} { return new(Compressor) }} // Compressors reused by CompressBlock
+
+func CompressBlock(src, dst []byte) (int, error) { // Runs Compressor.CompressBlock on a Compressor taken from compressorPool.
+	c := compressorPool.Get().(*Compressor)
+	n, err := c.CompressBlock(src, dst)
+	compressorPool.Put(c)
+	return n, err
+}
+
+func (c *Compressor) CompressBlock(src, dst []byte) (_ int, err error) {
+	if c.needsReset {
+		// Zero out reused table to avoid non-deterministic output (issue #65).
+		c.table = [htSize]uint16{}
+	}
+	c.needsReset = true // Only false on first call.
+	defer recoverBlock(&err) // Some dst[di] writes below are unchecked; a too-short dst becomes an error, not a panic.
+	// Return 0, nil only if the destination buffer size is < CompressBlockBound.
+	isNotCompressible := len(dst) < CompressBlockBound(len(src))
+
+	// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
+	// This significantly speeds up incompressible data and usually has very small impact on compression.
+	// bytes to skip =  1 + (bytes since last match >> adaptSkipLog)
+	const adaptSkipLog = 7
+
+	// si: Current position of the search.
+	// anchor: Position of the current literals.
+	var si, di, anchor int
+	sn := len(src) - mfLimit
+	if sn <= 0 {
+		goto lastLiterals
+	}
+
+	// Fast scan strategy: the hash table only stores the last 4 bytes sequences.
+	for si < sn {
+		// Hash the next 6 bytes (sequence)...
+		match := binary.LittleEndian.Uint64(src[si:])
+		h := blockHash(match)
+		h2 := blockHash(match >> 8)
+
+		// We check a match at s, s+1 and s+2 and pick the first one we get.
+		// Checking 3 only requires us to load the source once.
+		ref := c.get(h, si)
+		ref2 := c.get(h2, si)
+		c.put(h, si)
+		c.put(h2, si+1)
+
+		offset := si - ref
+
+		if offset <= 0 || offset >= winSize || uint32(match) != binary.LittleEndian.Uint32(src[ref:]) {
+			// No match. Start calculating another hash.
+			// The processor can usually do this out-of-order.
+			h = blockHash(match >> 16)
+			ref3 := c.get(h, si+2)
+
+			// Check the second match at si+1
+			si += 1
+			offset = si - ref2
+
+			if offset <= 0 || offset >= winSize || uint32(match>>8) != binary.LittleEndian.Uint32(src[ref2:]) {
+				// No match. Check the third match at si+2
+				si += 1
+				offset = si - ref3
+				c.put(h, si)
+
+				if offset <= 0 || offset >= winSize || uint32(match>>16) != binary.LittleEndian.Uint32(src[ref3:]) {
+					// Skip one extra byte (at si+3) before we check 3 matches again.
+					si += 2 + (si-anchor)>>adaptSkipLog
+					continue
+				}
+			}
+		}
+
+		// Match found.
+		lLen := si - anchor // Literal length.
+		// We already matched 4 bytes.
+		mLen := 4
+
+		// Extend backwards if we can, reducing literals.
+		tOff := si - offset - 1
+		for lLen > 0 && tOff >= 0 && src[si-1] == src[tOff] {
+			si--
+			tOff--
+			lLen--
+			mLen++
+		}
+
+		// Add the match length, so we continue search at the end.
+		// Use mLen to store the offset base.
+		si, mLen = si+mLen, si+minMatch
+
+		// Find the longest match by looking by batches of 8 bytes.
+		for si+8 < sn {
+			x := binary.LittleEndian.Uint64(src[si:]) ^ binary.LittleEndian.Uint64(src[si-offset:])
+			if x == 0 {
+				si += 8
+			} else {
+				// Stop at the first non-zero byte.
+				si += bits.TrailingZeros64(x) >> 3
+				break
+			}
+		}
+
+		mLen = si - mLen
+		if mLen < 0xF {
+			dst[di] = byte(mLen)
+		} else {
+			dst[di] = 0xF
+		}
+
+		// Encode literals length.
+		if lLen < 0xF {
+			dst[di] |= byte(lLen << 4)
+		} else {
+			dst[di] |= 0xF0
+			di++
+			l := lLen - 0xF
+			for ; l >= 0xFF; l -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			dst[di] = byte(l)
+		}
+		di++
+
+		// Literals.
+		if di+lLen > len(dst) {
+			return 0, lz4errors.ErrInvalidSourceShortBuffer
+		}
+		copy(dst[di:di+lLen], src[anchor:anchor+lLen])
+		di += lLen + 2
+		anchor = si
+
+		// Encode offset.
+		if di > len(dst) {
+			return 0, lz4errors.ErrInvalidSourceShortBuffer
+		}
+		dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
+
+		// Encode match length part 2.
+		if mLen >= 0xF {
+			for mLen -= 0xF; mLen >= 0xFF && di < len(dst); mLen -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			if di >= len(dst) {
+				return 0, lz4errors.ErrInvalidSourceShortBuffer
+			}
+			dst[di] = byte(mLen)
+			di++
+		}
+		// Check if we can load next values.
+		if si >= sn {
+			break
+		}
+		// Hash match end-2
+		h = blockHash(binary.LittleEndian.Uint64(src[si-2:]))
+		c.put(h, si-2)
+	}
+
+lastLiterals:
+	if isNotCompressible && anchor == 0 {
+		// Incompressible.
+		return 0, nil
+	}
+
+	// Last literals.
+	if di >= len(dst) {
+		return 0, lz4errors.ErrInvalidSourceShortBuffer
+	}
+	lLen := len(src) - anchor
+	if lLen < 0xF {
+		dst[di] = byte(lLen << 4)
+	} else {
+		dst[di] = 0xF0
+		di++
+		for lLen -= 0xF; lLen >= 0xFF && di < len(dst); lLen -= 0xFF {
+			dst[di] = 0xFF
+			di++
+		}
+		if di >= len(dst) {
+			return 0, lz4errors.ErrInvalidSourceShortBuffer
+		}
+		dst[di] = byte(lLen)
+	}
+	di++
+
+	// Write the last literals.
+	if isNotCompressible && di >= anchor {
+		// Incompressible.
+		return 0, nil
+	}
+	if di+len(src)-anchor > len(dst) {
+		return 0, lz4errors.ErrInvalidSourceShortBuffer
+	}
+	di += copy(dst[di:di+len(src)-anchor], src[anchor:])
+	return di, nil
+}
+
+// blockHashHC hashes the 4 bytes packed in x into a winSizeLog-bit value (the top bits of a multiplicative hash).
+func blockHashHC(x uint32) uint32 {
+	const hasher uint32 = 2654435761 // Knuth multiplicative hash.
+	return x * hasher >> (32 - winSizeLog) // keep only the top winSizeLog bits of the 32-bit product
+}
+
+type CompressorHC struct { // CompressorHC carries the match-finder tables that CompressBlock reuses between calls.
+	// hashTable: stores the last position found for a given hash
+	// chainTable: indexed by position&winMask, stores the previous position that had the same hash
+	hashTable, chainTable [htSize]int
+	needsReset            bool // set by the first CompressBlock call; later calls zero both tables before use
+}
+
+var compressorHCPool = sync.Pool{New: func() interface{} { return new(CompressorHC) }} // pool of reusable compressors used by CompressBlockHC
+
+func CompressBlockHC(src, dst []byte, depth CompressionLevel) (int, error) { // compresses src into dst with a pooled CompressorHC and returns that compressor's result unchanged
+	c := compressorHCPool.Get().(*CompressorHC)
+	n, err := c.CompressBlock(src, dst, depth)
+	compressorHCPool.Put(c) // hand the compressor back; its tables are zeroed at the start of its next CompressBlock call
+	return n, err
+}
+
+func (c *CompressorHC) CompressBlock(src, dst []byte, depth CompressionLevel) (_ int, err error) { // compresses src into dst following hash chains, trying at most depth candidates per position (0 means winSize); returns the bytes written, or 0 with a nil error when the data is judged incompressible
+	if c.needsReset {
+		// Zero out reused table to avoid non-deterministic output (issue #65).
+		c.hashTable = [htSize]int{}
+		c.chainTable = [htSize]int{}
+	}
+	c.needsReset = true // Only false on first call.
+
+	defer recoverBlock(&err) // recoverBlock is defined elsewhere; presumably it turns a panic from the unchecked dst writes below into err - TODO confirm
+
+	// Return 0, nil only if the destination buffer size is < CompressBlockBound.
+	isNotCompressible := len(dst) < CompressBlockBound(len(src))
+
+	// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
+	// This significantly speeds up incompressible data and usually has very small impact on compression.
+	// bytes to skip =  1 + (bytes since last match >> adaptSkipLog)
+	const adaptSkipLog = 7
+
+	var si, di, anchor int // si: read position in src, di: write position in dst, anchor: start of the pending literals
+	sn := len(src) - mfLimit
+	if sn <= 0 {
+		goto lastLiterals
+	}
+
+	if depth == 0 {
+		depth = winSize
+	}
+
+	for si < sn {
+		// Hash the next 4 bytes (sequence).
+		match := binary.LittleEndian.Uint32(src[si:])
+		h := blockHashHC(match)
+
+		// Follow the chain until out of window and give the longest match.
+		mLen := 0
+		offset := 0
+		for next, try := c.hashTable[h], depth; try > 0 && next > 0 && si-next < winSize; next, try = c.chainTable[next&winMask], try-1 {
+			// The first (mLen==0) or next byte (mLen>=minMatch) at current match length
+			// must match to improve on the match length.
+			if src[next+mLen] != src[si+mLen] {
+				continue
+			}
+			ml := 0
+			// Compare the current position with a previous with the same hash, 8 bytes at a time.
+			for ml < sn-si {
+				x := binary.LittleEndian.Uint64(src[next+ml:]) ^ binary.LittleEndian.Uint64(src[si+ml:])
+				if x == 0 {
+					ml += 8
+				} else {
+					// Stop at the first non-zero (differing) byte.
+					ml += bits.TrailingZeros64(x) >> 3
+					break
+				}
+			}
+			if ml < minMatch || ml <= mLen {
+				// Match too small (<minMatch) or not longer than the current match.
+				continue
+			}
+			// Found a longer match, keep its position and length.
+			mLen = ml
+			offset = si - next
+			// Try another previous position with the same hash.
+		}
+		c.chainTable[si&winMask] = c.hashTable[h]
+		c.hashTable[h] = si
+
+		// No match found.
+		if mLen == 0 {
+			si += 1 + (si-anchor)>>adaptSkipLog
+			continue
+		}
+
+		// Match found.
+		// Update hash/chain tables with overlapping bytes:
+		// si already hashed, add everything from si+1 up to the match length.
+		winStart := si + 1
+		if ws := si + mLen - winSize; ws > winStart {
+			winStart = ws
+		}
+		for si, ml := winStart, si+mLen; si < ml; { // si and ml are shadowed here: loop position and end of the match
+			match >>= 8
+			match |= uint32(src[si+3]) << 24
+			h := blockHashHC(match)
+			c.chainTable[si&winMask] = c.hashTable[h]
+			c.hashTable[h] = si
+			si++
+		}
+
+		lLen := si - anchor
+		si += mLen
+		mLen -= minMatch // Match length does not include minMatch.
+
+		if mLen < 0xF {
+			dst[di] = byte(mLen)
+		} else {
+			dst[di] = 0xF
+		}
+
+		// Encode literals length.
+		if lLen < 0xF {
+			dst[di] |= byte(lLen << 4)
+		} else {
+			dst[di] |= 0xF0
+			di++
+			l := lLen - 0xF
+			for ; l >= 0xFF; l -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			dst[di] = byte(l)
+		}
+		di++
+
+		// Literals.
+		copy(dst[di:di+lLen], src[anchor:anchor+lLen])
+		di += lLen
+		anchor = si
+
+		// Encode offset.
+		di += 2
+		dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
+
+		// Encode match length part 2.
+		if mLen >= 0xF {
+			for mLen -= 0xF; mLen >= 0xFF; mLen -= 0xFF {
+				dst[di] = 0xFF
+				di++
+			}
+			dst[di] = byte(mLen)
+			di++
+		}
+	}
+
+	if isNotCompressible && anchor == 0 {
+		// Incompressible.
+		return 0, nil
+	}
+
+	// Last literals.
+lastLiterals:
+	lLen := len(src) - anchor
+	if lLen < 0xF {
+		dst[di] = byte(lLen << 4)
+	} else {
+		dst[di] = 0xF0
+		di++
+		lLen -= 0xF
+		for ; lLen >= 0xFF; lLen -= 0xFF {
+			dst[di] = 0xFF
+			di++
+		}
+		dst[di] = byte(lLen)
+	}
+	di++
+
+	// Write the last literals.
+	if isNotCompressible && di >= anchor {
+		// Incompressible.
+		return 0, nil
+	}
+	di += copy(dst[di:di+len(src)-anchor], src[anchor:])
+	return di, nil
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/blocks.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/blocks.go
new file mode 100644
index 00000000000..e6cf88d71c0
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/blocks.go
@@ -0,0 +1,88 @@
+// Package lz4block provides LZ4 BlockSize types and pools of buffers.
+package lz4block
+
+import "sync"
+
+const ( // Block sizes in bytes.
+	Block64Kb uint32 = 1 << (16 + iota*2) // 1<<16; each of the next three constants is 4x the previous one
+	Block256Kb
+	Block1Mb
+	Block4Mb
+	Block8Mb        = 2 * Block4Mb
+	legacyBlockSize = Block8Mb + Block8Mb/255 + 16 // CompressBound(Block8Mb); size of the buffers in BlockPool8M
+)
+
+var ( // Buffer pools, one per block size; the 8M pool hands out legacyBlockSize bytes rather than Block8Mb.
+	BlockPool64K  = sync.Pool{New: func() interface{} { return make([]byte, Block64Kb) }}
+	BlockPool256K = sync.Pool{New: func() interface{} { return make([]byte, Block256Kb) }}
+	BlockPool1M   = sync.Pool{New: func() interface{} { return make([]byte, Block1Mb) }}
+	BlockPool4M   = sync.Pool{New: func() interface{} { return make([]byte, Block4Mb) }}
+	BlockPool8M   = sync.Pool{New: func() interface{} { return make([]byte, legacyBlockSize) }}
+)
+
+func Index(b uint32) BlockSizeIndex { // Index returns the BlockSizeIndex for a block size b in bytes, or 0 when b is not one of the Block* constants.
+	switch b {
+	case Block64Kb:
+		return 4
+	case Block256Kb:
+		return 5
+	case Block1Mb:
+		return 6
+	case Block4Mb:
+		return 7
+	case Block8Mb: // only valid in legacy mode
+		return 3
+	}
+	return 0
+}
+
+func IsValid(b uint32) bool { // IsValid reports whether b is a block size known to Index; unlike BlockSizeIndex.IsValid this accepts Block8Mb.
+	return Index(b) > 0
+}
+
+type BlockSizeIndex uint8 // BlockSizeIndex is the small integer code that Index assigns to a block size.
+
+func (b BlockSizeIndex) IsValid() bool { // IsValid reports whether b is one of 4..7; the legacy-only index 3 is reported as invalid.
+	switch b {
+	case 4, 5, 6, 7:
+		return true
+	}
+	return false
+}
+
+func (b BlockSizeIndex) Get() []byte { // Get takes a buffer from the pool that matches b.
+	var buf interface{}
+	switch b {
+	case 4:
+		buf = BlockPool64K.Get()
+	case 5:
+		buf = BlockPool256K.Get()
+	case 6:
+		buf = BlockPool1M.Get()
+	case 7:
+		buf = BlockPool4M.Get()
+	case 3:
+		buf = BlockPool8M.Get()
+	}
+	return buf.([]byte) // for any index other than 3..7 buf is still nil here and this assertion panics
+}
+
+func Put(buf []byte) { // Put returns buf, resliced to its full capacity, to the pool matching that capacity.
+	// Safeguard: do not allow invalid buffers; a buffer of any other capacity is silently dropped.
+	switch c := cap(buf); uint32(c) {
+	case Block64Kb:
+		BlockPool64K.Put(buf[:c])
+	case Block256Kb:
+		BlockPool256K.Put(buf[:c])
+	case Block1Mb:
+		BlockPool1M.Put(buf[:c])
+	case Block4Mb:
+		BlockPool4M.Put(buf[:c])
+	case legacyBlockSize:
+		BlockPool8M.Put(buf[:c])
+	}
+}
+
+type CompressionLevel uint32 // CompressionLevel selects the compressor; CompressBlockHC takes it as the match search depth.
+
+const Fast CompressionLevel = 0 // Fast is the zero level.
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_amd64.s b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_amd64.s
new file mode 100644
index 00000000000..be79faa3fe8
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_amd64.s
@@ -0,0 +1,369 @@
+// +build !appengine
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// AX scratch
+// BX scratch (match source address while copying a match)
+// CX scratch (literal length, then match length)
+// DX token (reused for the match offset)
+//
+// DI write cursor into dst (starts at &dst)
+// SI read cursor into src (starts at &src)
+// R8 &dst + len(dst)
+// R9 &src + len(src)
+// R11 &dst (start of dst, kept for bounds checks and for the result)
+// R12 short output end (R8 - 32)
+// R13 short input end (R9 - 16)
+// func decodeBlock(dst, src []byte) int; returns the bytes written, -1 (err_corrupt) or -2 (err_short_buf)
+// uses 49 bytes of the 64-byte frame: 0..47(SP) for memmove arguments and spills, 48(SP) for the token byte
+TEXT ·decodeBlock(SB), NOSPLIT, $64-56
+	MOVQ dst_base+0(FP), DI
+	MOVQ DI, R11
+	MOVQ dst_len+8(FP), R8
+	ADDQ DI, R8
+
+	MOVQ src_base+24(FP), SI
+	MOVQ src_len+32(FP), R9
+	CMPQ R9, $0
+	JE   err_corrupt
+	ADDQ SI, R9
+
+	// shortcut ends
+	// short output end
+	MOVQ R8, R12
+	SUBQ $32, R12
+	// short input end
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+loop:
+	// for si < len(src)
+	CMPQ SI, R9
+	JGE end
+
+	// token := uint32(src[si])
+	MOVBQZX (SI), DX
+	INCQ SI
+
+	// lit_len = token >> 4
+	// if lit_len > 0
+	// CX = lit_len
+	MOVQ DX, CX
+	SHRQ $4, CX
+
+	// take the shortcut only if lit_len != 0xF and both cursors are before their short ends
+	CMPQ CX, $0xF
+	JEQ lit_len_loop_pre
+	CMPQ DI, R12
+	JGE lit_len_loop_pre
+	CMPQ SI, R13
+	JGE lit_len_loop_pre
+
+	// copy shortcut
+
+	// A two-stage shortcut for the most common case:
+	// 1) If the literal length is 0..14, and there is enough space,
+	// enter the shortcut and copy 16 bytes on behalf of the literals
+	// (in the fast mode, only 8 bytes can be safely copied this way).
+	// 2) Further if the match length is 4..18, copy 18 bytes in a similar
+	// manner; but we ensure that there's enough space in the output for
+	// those 18 bytes earlier, upon entering the shortcut (in other words,
+	// there is a combined check for both stages).
+
+	// copy literal
+	MOVOU (SI), X0
+	MOVOU X0, (DI)
+	ADDQ CX, DI
+	ADDQ CX, SI
+
+	MOVQ DX, CX
+	ANDQ $0xF, CX
+
+	// The second stage: prepare for match copying, decode full info.
+	// If it doesn't work out, the info won't be wasted.
+	// offset := uint16(data[:2])
+	MOVWQZX (SI), DX
+	ADDQ $2, SI
+
+	MOVQ DI, AX
+	SUBQ DX, AX
+	CMPQ AX, DI
+	JGT err_short_buf
+
+	// if we can't do the second stage then jump straight to read the
+	// match length, we already have the offset.
+	CMPQ CX, $0xF
+	JEQ match_len_loop_pre
+	CMPQ DX, $8
+	JLT match_len_loop_pre
+	CMPQ AX, R11
+	JLT err_short_buf
+
+	// memcpy(op + 0, match + 0, 8);
+	MOVQ (AX), BX
+	MOVQ BX, (DI)
+	// memcpy(op + 8, match + 8, 8);
+	MOVQ 8(AX), BX
+	MOVQ BX, 8(DI)
+	// memcpy(op +16, match +16, 2);
+	MOVW 16(AX), BX
+	MOVW BX, 16(DI)
+
+	LEAQ 4(DI)(CX*1), DI // minmatch
+
+	// shortcut complete, load next token
+	JMP loop
+
+lit_len_loop_pre:
+	// if lit_len > 0
+	CMPQ CX, $0
+	JEQ offset
+	CMPQ CX, $0xF
+	JNE copy_literal
+
+lit_len_loop:
+	// for src[si] == 0xFF
+	CMPB (SI), $0xFF
+	JNE lit_len_finalise
+
+	// bounds check src[si+1]
+	LEAQ 1(SI), AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	// lit_len += 0xFF
+	ADDQ $0xFF, CX
+	INCQ SI
+	JMP lit_len_loop
+
+lit_len_finalise:
+	// lit_len += int(src[si])
+	// si++
+	MOVBQZX (SI), AX
+	ADDQ AX, CX
+	INCQ SI
+
+copy_literal:
+	// bounds check src and dst
+	LEAQ (SI)(CX*1), AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	LEAQ (DI)(CX*1), AX
+	CMPQ AX, R8
+	JGT err_short_buf
+
+	// whats a good cut off to call memmove?
+	CMPQ CX, $16
+	JGT memmove_lit
+
+	// if len(dst[di:]) < 16
+	MOVQ R8, AX
+	SUBQ DI, AX
+	CMPQ AX, $16
+	JLT memmove_lit
+
+	// if len(src[si:]) < 16
+	MOVQ R9, AX
+	SUBQ SI, AX
+	CMPQ AX, $16
+	JLT memmove_lit
+
+	MOVOU (SI), X0
+	MOVOU X0, (DI)
+
+	JMP finish_lit_copy
+
+memmove_lit:
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ SI, 8(SP)
+	MOVQ CX, 16(SP)
+	// spill
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP) // need len to inc SI, DI after
+	MOVB DX, 48(SP) // token byte, still needed for the match length
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+	MOVB 48(SP), DX
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+finish_lit_copy:
+	ADDQ CX, SI
+	ADDQ CX, DI
+
+	CMPQ SI, R9
+	JGE end
+
+offset:
+	// CX := mLen
+	// free up DX to use for offset
+	MOVQ DX, CX
+
+	LEAQ 2(SI), AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	// offset
+	// DX := int(src[si]) | int(src[si+1])<<8
+	MOVWQZX (SI), DX
+	ADDQ $2, SI
+
+	// 0 offset is invalid
+	CMPQ DX, $0
+	JEQ err_corrupt
+
+	ANDB $0xF, CX
+
+match_len_loop_pre:
+	// if mlen != 0xF
+	CMPB CX, $0xF
+	JNE copy_match
+
+match_len_loop:
+	// for src[si] == 0xFF
+	// match_len += 0xFF
+	CMPB (SI), $0xFF
+	JNE match_len_finalise
+
+	// bounds check src[si+1]
+	LEAQ 1(SI), AX
+	CMPQ AX, R9
+	JGT err_short_buf
+
+	ADDQ $0xFF, CX
+	INCQ SI
+	JMP match_len_loop
+
+match_len_finalise:
+	// match_len += int(src[si])
+	// si++
+	MOVBQZX (SI), AX
+	ADDQ AX, CX
+	INCQ SI
+
+copy_match:
+	// mLen += minMatch
+	ADDQ $4, CX
+
+	// check we have match_len bytes left in dst
+	// di+match_len <= len(dst)
+	LEAQ (DI)(CX*1), AX
+	CMPQ AX, R8
+	JGT err_short_buf
+
+	// DX = offset
+	// CX = match_len
+	// BX = &dst + (di - offset)
+	MOVQ DI, BX
+	SUBQ DX, BX
+
+	// check BX is within dst
+	// if BX < &dst
+	CMPQ BX, R11
+	JLT err_short_buf
+
+	// if (di - offset) + match_len < di, the match does not overlap its own output
+	LEAQ (BX)(CX*1), AX
+	CMPQ DI, AX
+	JGT copy_interior_match
+
+	// AX := len(dst[:di])
+	// MOVQ DI, AX
+	// SUBQ R11, AX
+
+	// copy 16 bytes at a time
+	// if di-offset < 16 copy 16-(di-offset) bytes to di
+	// then do the remaining
+
+copy_match_loop:
+	// do { ... } while --match_len > 0; byte by byte, so an overlapping match repeats its own output
+	// dst[di] = dst[i]
+	// di++
+	// i++
+	MOVB (BX), AX
+	MOVB AX, (DI)
+	INCQ DI
+	INCQ BX
+	DECQ CX
+
+	CMPQ CX, $0
+	JGT copy_match_loop
+
+	JMP loop
+
+copy_interior_match:
+	CMPQ CX, $16
+	JGT memmove_match
+
+	// if len(dst[di:]) < 16
+	MOVQ R8, AX
+	SUBQ DI, AX
+	CMPQ AX, $16
+	JLT memmove_match
+
+	MOVOU (BX), X0
+	MOVOU X0, (DI)
+
+	ADDQ CX, DI
+	JMP loop
+
+memmove_match:
+	// memmove(to, from, len)
+	MOVQ DI, 0(SP)
+	MOVQ BX, 8(SP)
+	MOVQ CX, 16(SP)
+	// spill
+	MOVQ DI, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ CX, 40(SP) // need len to inc SI, DI after
+	CALL runtime·memmove(SB)
+
+	// restore registers
+	MOVQ 24(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 40(SP), CX
+
+	// recalc initial values
+	MOVQ dst_base+0(FP), R8
+	MOVQ R8, R11 // TODO: make these sensible numbers
+	ADDQ dst_len+8(FP), R8
+	MOVQ src_base+24(FP), R9
+	ADDQ src_len+32(FP), R9
+	MOVQ R8, R12
+	SUBQ $32, R12
+	MOVQ R9, R13
+	SUBQ $16, R13
+
+	ADDQ CX, DI
+	JMP loop
+
+err_corrupt:
+	MOVQ $-1, ret+48(FP)
+	RET
+
+err_short_buf:
+	MOVQ $-2, ret+48(FP)
+	RET
+
+end:
+	SUBQ R11, DI // result: number of bytes written to dst
+	MOVQ DI, ret+48(FP)
+	RET
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm.s b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm.s
new file mode 100644
index 00000000000..64be9adcaa8
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_arm.s
@@ -0,0 +1,197 @@
+// +build gc
+// +build !noasm
+
+#include "textflag.h"
+
+// Register allocation.
+#define dst	R0	// Write cursor into dst.
+#define dstorig	R1	// Start of dst.
+#define src	R2	// Read cursor into src.
+#define dstend	R3	// One past the end of dst.
+#define srcend	R4	// One past the end of src.
+#define match	R5	// Match address.
+#define token	R6
+#define len	R7	// Literal and match lengths.
+#define offset	R6	// Match offset; overlaps with token.
+#define tmp1	R8
+#define tmp2	R9
+#define tmp3	R12
+
+#define minMatch	$4
+
+// func decodeBlock(dst, src []byte) int; returns the bytes written to dst, or -1 on any error
+TEXT ·decodeBlock(SB), NOFRAME|NOSPLIT, $-4-28
+	MOVW dst_base +0(FP), dst
+	MOVW dst_len  +4(FP), dstend
+	MOVW src_base+12(FP), src
+	MOVW src_len +16(FP), srcend
+
+	CMP $0, srcend
+	BEQ shortSrc
+
+	ADD dst, dstend
+	ADD src, srcend
+
+	MOVW dst, dstorig
+
+loop:
+	// Read token. Extract literal length.
+	MOVBU.P 1(src), token
+	MOVW    token >> 4, len
+	CMP     $15, len
+	BNE     readLitlenDone
+
+readLitlenLoop:
+	CMP     src, srcend
+	BEQ     shortSrc
+	MOVBU.P 1(src), tmp1
+	ADD     tmp1, len
+	CMP     $255, tmp1
+	BEQ     readLitlenLoop
+
+readLitlenDone:
+	CMP $0, len
+	BEQ copyLiteralDone
+
+	// Bounds check dst+len and src+len.
+	// The second compare is conditional (LS), so the single BHI fires if either bound is exceeded.
+	ADD    dst, len, tmp1
+	CMP    dstend, tmp1
+	//BHI  shortDst	// Uncomment for distinct error codes.
+	ADD    src, len, tmp2
+	CMP.LS srcend, tmp2
+	BHI    shortSrc
+
+	// Copy literal.
+	CMP $4, len
+	BLO copyLiteralFinish
+
+	// Copy 0-3 bytes until src is aligned.
+	TST        $1, src
+	MOVBU.NE.P 1(src), tmp1
+	MOVB.NE.P  tmp1, 1(dst)
+	SUB.NE     $1, len
+
+	TST        $2, src
+	MOVHU.NE.P 2(src), tmp2
+	MOVB.NE.P  tmp2, 1(dst)
+	MOVW.NE    tmp2 >> 8, tmp1
+	MOVB.NE.P  tmp1, 1(dst)
+	SUB.NE     $2, len
+
+	B copyLiteralLoopCond
+
+copyLiteralLoop:
+	// Aligned load, unaligned write.
+	MOVW.P 4(src), tmp1
+	MOVW   tmp1 >>  8, tmp2
+	MOVB   tmp2, 1(dst)
+	MOVW   tmp1 >> 16, tmp3
+	MOVB   tmp3, 2(dst)
+	MOVW   tmp1 >> 24, tmp2
+	MOVB   tmp2, 3(dst)
+	MOVB.P tmp1, 4(dst)
+copyLiteralLoopCond:
+	// Loop until len-4 < 0.
+	SUB.S  $4, len
+	BPL    copyLiteralLoop
+
+	// Restore len, which is now negative.
+	ADD $4, len
+
+copyLiteralFinish:
+	// Copy remaining 0-3 bytes.
+	TST        $2, len
+	MOVHU.NE.P 2(src), tmp2
+	MOVB.NE.P  tmp2, 1(dst)
+	MOVW.NE    tmp2 >> 8, tmp1
+	MOVB.NE.P  tmp1, 1(dst)
+	TST        $1, len
+	MOVBU.NE.P 1(src), tmp1
+	MOVB.NE.P  tmp1, 1(dst)
+
+copyLiteralDone:
+	// Input that ends right after the literals is the normal end of a block.
+	CMP src, srcend
+	BEQ end
+
+	// Initial part of match length.
+	// This frees up the token register for reuse as offset.
+	AND $15, token, len
+
+	// Read offset.
+	ADD   $2, src
+	CMP   srcend, src
+	BHI   shortSrc
+	MOVBU -2(src), offset
+	MOVBU -1(src), tmp1
+	ORR   tmp1 << 8, offset
+	CMP   $0, offset
+	BEQ   corrupt
+
+	// Read rest of match length.
+	CMP $15, len
+	BNE readMatchlenDone
+
+readMatchlenLoop:
+	CMP     src, srcend
+	BEQ     shortSrc
+	MOVBU.P 1(src), tmp1
+	ADD     tmp1, len
+	CMP     $255, tmp1
+	BEQ     readMatchlenLoop
+
+readMatchlenDone:
+	// Bounds check dst+len+minMatch and match = dst-offset.
+	// As above, the conditional second compare lets one BHI cover both checks.
+	ADD    dst, len, tmp1
+	ADD    minMatch, tmp1
+	CMP    dstend, tmp1
+	//BHI  shortDst	// Uncomment for distinct error codes.
+	SUB    offset, dst, match
+	CMP.LS match, dstorig
+	BHI    corrupt
+
+	// Since len+minMatch is at least four, we can do a 4× unrolled
+	// byte copy loop. Using MOVW instead of four byte loads is faster,
+	// but to remain portable we'd have to align match first, which is
+	// too expensive. By alternating loads and stores, we also handle
+	// the case offset < 4.
+copyMatch4:
+	SUB.S   $4, len
+	MOVBU.P 4(match), tmp1
+	MOVB.P  tmp1, 4(dst)
+	MOVBU   -3(match), tmp2
+	MOVB    tmp2, -3(dst)
+	MOVBU   -2(match), tmp3
+	MOVB    tmp3, -2(dst)
+	MOVBU   -1(match), tmp1
+	MOVB    tmp1, -1(dst)
+	BPL     copyMatch4
+
+	// Restore len, which is now negative.
+	ADD.S $4, len
+	BEQ   copyMatchDone
+
+copyMatch:
+	// Finish with a byte-at-a-time copy.
+	SUB.S   $1, len
+	MOVBU.P 1(match), tmp2
+	MOVB.P  tmp2, 1(dst)
+	BNE     copyMatch
+
+copyMatchDone:
+	CMP src, srcend
+	BNE loop
+
+end:
+	SUB  dstorig, dst, tmp1
+	MOVW tmp1, ret+24(FP)
+	RET
+
+	// The three error cases have distinct labels so we can put different
+	// return codes here when debugging, or if the error returns need to
+	// be changed. As written, all three return -1.
+shortDst:
+shortSrc:
+corrupt:
+	MOVW $-1, tmp1
+	MOVW tmp1, ret+24(FP)
+	RET
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_asm.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_asm.go
new file mode 100644
index 00000000000..e26f8cd613e
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_asm.go
@@ -0,0 +1,9 @@
+// +build amd64 arm
+// +build !appengine
+// +build gc
+// +build !noasm
+
+package lz4block
+
+//go:noescape
+func decodeBlock(dst, src []byte) int // body is in decode_amd64.s / decode_arm.s; returns the bytes written to dst, or a negative value on error
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_other.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_other.go
new file mode 100644
index 00000000000..52df2f2b8ec
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4block/decode_other.go
@@ -0,0 +1,108 @@
+// +build !amd64,!arm appengine !gc noasm
+
+package lz4block
+
+import "encoding/binary"
+
+func decodeBlock(dst, src []byte) (ret int) { // pure-Go decoder: returns the bytes written to dst, or hasError (-2) when src is malformed or an access falls outside src/dst
+	// Restrict capacities so we don't read or write out of bounds.
+	dst = dst[:len(dst):len(dst)]
+	src = src[:len(src):len(src)]
+
+	const hasError = -2
+	defer func() {
+		if recover() != nil { // an out-of-range index or slice below panics and is reported as hasError
+			ret = hasError
+		}
+	}()
+
+	var si, di uint // read position in src, write position in dst
+	for {
+		// Literals and match lengths (token).
+		b := uint(src[si])
+		si++
+
+		// Literals.
+		if lLen := b >> 4; lLen > 0 {
+			switch {
+			case lLen < 0xF && si+16 < uint(len(src)):
+				// Shortcut 1
+				// if more than 16 bytes remain in src and the literals length
+				// is small enough (0..14) then copy 16 bytes (copy clamps to the
+				// room left in dst), even if not all are part of the literals.
+				copy(dst[di:], src[si:si+16])
+				si += lLen
+				di += lLen
+				if mLen := b & 0xF; mLen < 0xF {
+					// Shortcut 2
+					// if the match (4..18 bytes) does not overlap its own output (mLen <= offset),
+					// then copy up to 18 bytes in one go, even if not all are part of the match.
+					mLen += 4
+					if offset := u16(src[si:]); mLen <= offset {
+						i := di - offset
+						end := i + 18
+						if end > uint(len(dst)) {
+							// The remaining buffer may not hold 18 bytes.
+							// See https://github.com/pierrec/lz4/issues/51.
+							end = uint(len(dst))
+						}
+						copy(dst[di:], dst[i:end])
+						si += 2
+						di += mLen
+						continue
+					}
+				}
+			case lLen == 0xF:
+				for src[si] == 0xFF {
+					lLen += 0xFF
+					si++
+				}
+				lLen += uint(src[si])
+				si++
+				fallthrough
+			default:
+				copy(dst[di:di+lLen], src[si:si+lLen])
+				si += lLen
+				di += lLen
+			}
+		}
+		if si == uint(len(src)) { // input ends right after the literals: normal end of block
+			return int(di)
+		} else if si > uint(len(src)) {
+			return hasError
+		}
+
+		offset := u16(src[si:])
+		if offset == 0 {
+			return hasError
+		}
+		si += 2
+
+		// Match.
+		mLen := b & 0xF
+		if mLen == 0xF {
+			for src[si] == 0xFF {
+				mLen += 0xFF
+				si++
+			}
+			mLen += uint(src[si])
+			si++
+		}
+		mLen += minMatch
+
+		// Copy the match.
+		expanded := dst[di-offset:]
+		if mLen > offset {
+			// Efficiently copy the match dst[di-offset:di] into the dst slice.
+			bytesToCopy := offset * (mLen / offset)
+			for n := offset; n <= bytesToCopy+offset; n *= 2 { // the copied prefix doubles on every pass
+				copy(expanded[n:], expanded[:n])
+			}
+			di += bytesToCopy
+			mLen -= bytesToCopy
+		}
+		di += uint(copy(dst[di:di+mLen], expanded[:mLen]))
+	}
+}
+
+func u16(p []byte) uint { return uint(binary.LittleEndian.Uint16(p)) } // u16 decodes the first two bytes of p as a little-endian value.
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4errors/errors.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4errors/errors.go
new file mode 100644
index 00000000000..710ea42812e
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4errors/errors.go
@@ -0,0 +1,19 @@
+package lz4errors
+
+type Error string // Error is a string-based error type, which lets the errors below be declared as constants.
+
+func (e Error) Error() string { return string(e) } // Error implements the error interface.
+
+const ( // Errors of the lz4 packages; each value is its message.
+	ErrInvalidSourceShortBuffer      Error = "lz4: invalid source or destination buffer too short"
+	ErrInvalidFrame                  Error = "lz4: bad magic number"
+	ErrInternalUnhandledState        Error = "lz4: unhandled state"
+	ErrInvalidHeaderChecksum         Error = "lz4: invalid header checksum"
+	ErrInvalidBlockChecksum          Error = "lz4: invalid block checksum"
+	ErrInvalidFrameChecksum          Error = "lz4: invalid frame checksum"
+	ErrOptionInvalidCompressionLevel Error = "lz4: invalid compression level"
+	ErrOptionClosedOrError           Error = "lz4: cannot apply options on closed or in error object"
+	ErrOptionInvalidBlockSize        Error = "lz4: invalid block size"
+	ErrOptionNotApplicable           Error = "lz4: option not applicable"
+	ErrWriterNotClosed               Error = "lz4: writer not closed"
+)
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/block.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/block.go
new file mode 100644
index 00000000000..c7b929fdfec
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/block.go
@@ -0,0 +1,335 @@
+package lz4stream
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"sync"
+
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+	"github.com/pierrec/lz4/v4/internal/xxh32"
+)
+
+type Blocks struct { // Blocks holds the block state of a Frame, for single-block or concurrent processing.
+	Block  *FrameDataBlock           // the one reusable block when num == 1; nil in concurrent mode
+	Blocks chan chan *FrameDataBlock // queue of per-block result channels consumed by the writer goroutine of initW
+	mu     sync.Mutex                // taken by ErrorR and closeR around err
+	err    error                     // first error recorded; cleared by close
+}
+
+func (b *Blocks) initW(f *Frame, dst io.Writer, num int) { // initW prepares b for writing: num == 1 allocates the single Block, otherwise a goroutine is started that writes queued blocks to dst in queue order
+	if num == 1 {
+		b.Blocks = nil
+		b.Block = NewFrameDataBlock(f)
+		return
+	}
+	b.Block = nil
+	if cap(b.Blocks) != num {
+		b.Blocks = make(chan chan *FrameDataBlock, num)
+	}
+	// goroutine managing concurrent block compression goroutines.
+	go func() {
+		// Process next block compression item.
+		for c := range b.Blocks {
+			// Read the next compressed block result.
+			// Waiting here ensures that the blocks are output in the order they were sent.
+			// The incoming channel is always closed as it indicates to the caller that
+			// the block has been processed.
+			block := <-c
+			if block == nil {
+				// Notify the block compression routine that we are done with its result.
+				// This is used when a sentinel block is sent to terminate the compression.
+				close(c)
+				return
+			}
+			// Do not attempt to write the block upon any previous failure.
+			if b.err == nil {
+				// Write the block.
+				if err := block.Write(f, dst); err != nil {
+					// Keep the first error.
+					b.err = err
+					// All pending compression goroutines need to shut down, so we need to keep going.
+				}
+			}
+			close(c)
+		}
+	}()
+}
+
+func (b *Blocks) close(f *Frame, num int) error { // close ends the write side and returns, then clears, the recorded error
+	if num == 1 {
+		if b.Block != nil {
+			b.Block.Close(f)
+		}
+		err := b.err
+		b.err = nil
+		return err
+	}
+	if b.Blocks == nil {
+		err := b.err
+		b.err = nil
+		return err
+	}
+	c := make(chan *FrameDataBlock)
+	b.Blocks <- c
+	c <- nil // nil is the sentinel that makes the writer goroutine of initW return
+	<-c      // that goroutine closes c once it has seen the sentinel
+	err := b.err
+	b.err = nil
+	return err
+}
+
+// ErrorR returns the error recorded by closeR while uncompressing a stream, if any; b.err is read under b.mu.
+func (b *Blocks) ErrorR() error {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	return b.err
+}
+
+// initR returns a channel that streams the uncompressed blocks, in read order, if in concurrent
+// mode and no error. When the channel is closed, check for any error with b.ErrorR.
+// A block that fails to uncompress ends the stream: the blocks read after it are dropped, never delivered.
+// If not in concurrent mode (num == 1), the uncompressed block is b.Block, the returned channel
+// is nil and the returned error (always nil in this function) needs to be checked.
+func (b *Blocks) initR(f *Frame, num int, src io.Reader) (chan []byte, error) {
+	size := f.Descriptor.Flags.BlockSizeIndex()
+	if num == 1 {
+		b.Blocks = nil
+		b.Block = NewFrameDataBlock(f)
+		return nil, nil
+	}
+	b.Block = nil
+	blocks := make(chan chan []byte, num) // one result channel per block, queued in read order
+	data := make(chan []byte)             // data receives the uncompressed blocks.
+	var cum uint32                        // In legacy mode, accrue the uncompress sizes in cum.
+	// Read blocks from the source sequentially and uncompress them concurrently.
+	go func() {
+		var cumx uint32
+		var err error
+		for b.ErrorR() == nil {
+			block := NewFrameDataBlock(f)
+			cumx, err = block.Read(f, src, 0)
+			if err != nil {
+				block.Close(f)
+				break
+			}
+			if b.ErrorR() != nil { // Recheck for an error as reading may be slow and uncompressing is expensive.
+				block.Close(f)
+				break
+			}
+			c := make(chan []byte)
+			blocks <- c
+			go func() {
+				defer block.Close(f)
+				data, err := block.Uncompress(f, size.Get(), false)
+				if err != nil {
+					b.closeR(err)
+					close(c) // no value will ever be sent: closing releases the collector instead of blocking it forever
+				} else {
+					c <- data
+				}
+			}()
+		}
+		c := make(chan []byte) // End the collection loop and the data channel.
+		blocks <- c
+		c <- nil // signal the collection loop that we are done
+		<-c      // wait for the collect loop to complete
+		if f.isLegacy() && cum == cumx {
+			err = io.EOF
+		}
+		b.closeR(err)
+		close(data)
+	}()
+	// Collect the uncompressed blocks and make them available on the returned channel.
+	go func(leg bool) {
+		defer close(blocks)
+		failed := false // set once a block failed to uncompress
+		for c := range blocks {
+			buf, ok := <-c
+			if !ok {
+				failed = true // c was closed without a value: that block failed
+				continue
+			}
+			if buf == nil {
+				close(c) // Signal to end the loop.
+				return
+			}
+			if failed {
+				continue // never deliver (or checksum) blocks that follow a failed one
+			}
+			if f.Descriptor.Flags.ContentChecksum() { // Perform checksum now as the blocks are received in order.
+				_, _ = f.checksum.Write(buf)
+			}
+			if leg {
+				cum += uint32(len(buf))
+			}
+			data <- buf
+			close(c)
+		}
+	}(f.isLegacy())
+	return data, nil
+}
+
+// closeR records err on b, under b.mu, unless an error is already set; a nil err leaves b.err unchanged.
+func (b *Blocks) closeR(err error) {
+	b.mu.Lock()
+	if b.err == nil {
+		b.err = err
+	}
+	b.mu.Unlock()
+}
+
+func NewFrameDataBlock(f *Frame) *FrameDataBlock { // returns a block backed by a pooled buffer chosen by f's block size index
+	buf := f.Descriptor.Flags.BlockSizeIndex().Get()
+	return &FrameDataBlock{Data: buf, data: buf} // Data and data initially share the same buffer
+}
+
+type FrameDataBlock struct { // FrameDataBlock is one data block of a frame together with the pooled buffer backing it.
+	Size     DataBlockSize
+	Data     []byte // compressed or uncompressed data (.data or .src)
+	Checksum uint32
+	data     []byte // buffer for compressed data
+	src      []byte // uncompressed data
+	err      error  // used in concurrent mode
+}
+
+func (b *FrameDataBlock) Close(f *Frame) { // Close resets b and hands its buffer back to lz4block.Put; f is not used.
+	b.Size = 0
+	b.Checksum = 0
+	b.err = nil
+	if b.data != nil {
+		// Block was not already closed.
+		lz4block.Put(b.data)
+		b.Data = nil
+		b.data = nil
+		b.src = nil
+	}
+}
+
+// Compress compresses src into b's buffer at the given level and returns b. Block compression errors are ignored since the buffer is sized appropriately.
+func (b *FrameDataBlock) Compress(f *Frame, src []byte, level lz4block.CompressionLevel) *FrameDataBlock {
+	data := b.data
+	if f.isLegacy() {
+		data = data[:cap(data)]
+	} else {
+		data = data[:len(src)] // trigger the incompressible flag in CompressBlock
+	}
+	var n int
+	switch level {
+	case lz4block.Fast:
+		n, _ = lz4block.CompressBlock(src, data)
+	default:
+		n, _ = lz4block.CompressBlockHC(src, data, level)
+	}
+	if n == 0 { // nothing was produced: store the block uncompressed, pointing Data at src itself
+		b.Size.UncompressedSet(true)
+		b.Data = src
+	} else {
+		b.Size.UncompressedSet(false)
+		b.Data = data[:n]
+	}
+	b.Size.sizeSet(len(b.Data))
+	b.src = src // keep track of the source for content checksum
+
+	if f.Descriptor.Flags.BlockChecksum() {
+		b.Checksum = xxh32.ChecksumZero(src)
+	}
+	return b
+}
+
+// Write emits b to dst as size word, payload and, when block checksums are enabled on f, the block checksum; it returns the first write error.
+func (b *FrameDataBlock) Write(f *Frame, dst io.Writer) error {
+	// Write is called in the same order as blocks are compressed,
+	// so content checksum must be done here.
+	if f.Descriptor.Flags.ContentChecksum() {
+		_, _ = f.checksum.Write(b.src)
+	}
+	buf := f.buf[:]
+	binary.LittleEndian.PutUint32(buf, uint32(b.Size))
+	if _, err := dst.Write(buf[:4]); err != nil {
+		return err
+	}
+	if _, err := dst.Write(b.Data); err != nil {
+		return err
+	}
+	// Decide on the frame flag, not on the value: Read expects 4 checksum bytes whenever the flag is set, even if they are all zero.
+	if !f.Descriptor.Flags.BlockChecksum() {
+		return nil
+	}
+	binary.LittleEndian.PutUint32(buf, b.Checksum)
+	_, err := dst.Write(buf[:4])
+	return err
+}
+
+// Read updates b with the next block data, size and checksum if available, and returns the raw size word read from src; at end of stream it returns 0 and io.EOF.
+func (b *FrameDataBlock) Read(f *Frame, src io.Reader, cum uint32) (uint32, error) {
+	x, err := f.readUint32(src)
+	if err != nil {
+		return 0, err
+	}
+	if f.isLegacy() {
+		switch x {
+		case frameMagicLegacy:
+			// Concatenated legacy frame.
+			return b.Read(f, src, cum)
+		case cum:
+			// Only works in non concurrent mode, for concurrent mode
+			// it is handled separately.
+			// Linux kernel format appends the total uncompressed size at the end.
+			return 0, io.EOF
+		}
+	} else if x == 0 {
+		// Marker for end of stream.
+		return 0, io.EOF
+	}
+	b.Size = DataBlockSize(x)
+
+	size := b.Size.size()
+	if size > cap(b.data) { // the announced size does not fit the block buffer
+		return x, lz4errors.ErrOptionInvalidBlockSize
+	}
+	b.data = b.data[:size]
+	if _, err := io.ReadFull(src, b.data); err != nil {
+		return x, err
+	}
+	if f.Descriptor.Flags.BlockChecksum() {
+		sum, err := f.readUint32(src)
+		if err != nil {
+			return 0, err
+		}
+		b.Checksum = sum
+	}
+	return x, nil
+}
+
+func (b *FrameDataBlock) Uncompress(f *Frame, dst []byte, sum bool) ([]byte, error) { // decodes b.data into dst and returns dst resliced to the decoded length
+	if b.Size.Uncompressed() {
+		n := copy(dst, b.data) // stored block: plain copy
+		dst = dst[:n]
+	} else {
+		n, err := lz4block.UncompressBlock(b.data, dst)
+		if err != nil {
+			return nil, err
+		}
+		dst = dst[:n]
+	}
+	if f.Descriptor.Flags.BlockChecksum() {
+		if c := xxh32.ChecksumZero(dst); c != b.Checksum {
+			err := fmt.Errorf("%w: got %x; expected %x", lz4errors.ErrInvalidBlockChecksum, c, b.Checksum)
+			return nil, err
+		}
+	}
+	if sum && f.Descriptor.Flags.ContentChecksum() { // sum asks for the data to be fed to the frame content checksum here
+		_, _ = f.checksum.Write(dst)
+	}
+	return dst, nil
+}
+
+func (f *Frame) readUint32(r io.Reader) (x uint32, err error) { // reads exactly 4 bytes from r into f.buf and decodes them as a little-endian uint32
+	if _, err = io.ReadFull(r, f.buf[:4]); err != nil {
+		return
+	}
+	x = binary.LittleEndian.Uint32(f.buf[:4])
+	return
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame.go
new file mode 100644
index 00000000000..cfbd5674d9d
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame.go
@@ -0,0 +1,200 @@
+// Package lz4stream provides the types that support reading and writing LZ4 data streams.
+package lz4stream
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"io/ioutil"
+
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+	"github.com/pierrec/lz4/v4/internal/xxh32"
+)
+
+//go:generate go run gen.go
+
+const (
+	frameMagic       uint32 = 0x184D2204 // magic number of a regular LZ4 frame
+	frameSkipMagic   uint32 = 0x184D2A50 // lowest magic number of a skippable frame
+	frameMagicLegacy uint32 = 0x184C2102 // magic number of a legacy LZ4 frame
+)
+
+// NewFrame allocates a Frame with every field at its zero value
+// (Magic is 0, so InitR will read the frame header).
+func NewFrame() *Frame { return new(Frame) }
+
+type Frame struct {
+	buf        [15]byte        // scratch space; the written header needs at most 4(magic)+2+8+1=15 bytes
+	Magic      uint32          // frame magic number; 0 means that the header has not been read yet
+	Descriptor FrameDescriptor // frame descriptor: flags, content size and header checksum
+	Blocks     Blocks          // blocks of the frame
+	Checksum   uint32          // content checksum read from the stream by CloseR
+	checksum   xxh32.XXHZero   // running content hash: fed while uncompressing, checked by CloseR, written by CloseW
+}
+
+// Reset clears the per-stream state of the Frame so that it can be reused.
+// The flags held by the Descriptor are left untouched.
+func (f *Frame) Reset(num int) {
+	// Forget the previous header: magic, header checksum and content size.
+	f.Magic, f.Descriptor.Checksum, f.Descriptor.ContentSize = 0, 0, 0
+	// The error from closing the blocks is deliberately dropped.
+	_ = f.Blocks.close(f, num)
+	f.Checksum = 0
+}
+
+// InitW sets the frame up for writing to dst: it picks the magic number (legacy or not), initializes the blocks with num and resets the content hash.
+func (f *Frame) InitW(dst io.Writer, num int, legacy bool) {
+	if !legacy {
+		f.Magic = frameMagic
+		f.Descriptor.initW()
+	} else {
+		f.Magic = frameMagicLegacy
+		f.Descriptor.Flags.BlockSizeIndexSet(lz4block.Index(lz4block.Block8Mb))
+	}
+	f.Blocks.initW(f, dst, num)
+	f.checksum.Reset()
+}
+
+// CloseW closes the blocks and, unless the frame is a legacy one, writes the
+// frame trailer (end mark and optional content checksum) to dst.
+func (f *Frame) CloseW(dst io.Writer, num int) error {
+	err := f.Blocks.close(f, num)
+	if err != nil || f.isLegacy() {
+		return err
+	}
+	// The trailer starts with the end mark: a data block size of uint32(0).
+	trailer := append(f.buf[:0], 0, 0, 0, 0)
+	if f.Descriptor.Flags.ContentChecksum() {
+		// The content checksum follows the end mark.
+		trailer = f.checksum.Sum(trailer)
+	}
+	_, err = dst.Write(trailer)
+	return err
+}
+
+// isLegacy reports whether the frame uses the legacy format,
+// i.e. whether its magic number is frameMagicLegacy.
+func (f *Frame) isLegacy() bool { return f.Magic == frameMagicLegacy }
+
+// InitR reads the frame header from src unless it was already read (Magic != 0).
+// Skippable frames are discarded; an unknown magic number yields ErrInvalidFrame.
+func (f *Frame) InitR(src io.Reader, num int) (chan []byte, error) {
+	if f.Magic > 0 {
+		return nil, nil // header already read
+	}
+newFrame:
+	var err error
+	if f.Magic, err = f.readUint32(src); err != nil {
+		return nil, err
+	}
+	switch m := f.Magic; {
+	case m == frameMagic || m == frameMagicLegacy:
+	// Only the 16 magic numbers 0x184D2A50..0x184D2A5F mark a skippable frame.
+	case m>>4 == frameSkipMagic>>4:
+		skip, err := f.readUint32(src)
+		if err != nil {
+			return nil, err
+		}
+		if _, err := io.CopyN(ioutil.Discard, src, int64(skip)); err != nil {
+			return nil, err
+		}
+		goto newFrame
+	default:
+		return nil, lz4errors.ErrInvalidFrame
+	}
+	if err := f.Descriptor.initR(f, src); err != nil {
+		return nil, err
+	}
+	f.checksum.Reset()
+	return f.Blocks.initR(f, num, src)
+}
+
+// CloseR ends the reading of the frame: when a content checksum is expected,
+// it is read from src and compared with the running content hash.
+func (f *Frame) CloseR(src io.Reader) (err error) {
+	if f.isLegacy() || !f.Descriptor.Flags.ContentChecksum() {
+		return nil
+	}
+	f.Checksum, err = f.readUint32(src)
+	if err != nil {
+		return err
+	}
+	if got := f.checksum.Sum32(); got != f.Checksum {
+		err = fmt.Errorf("%w: got %x; expected %x", lz4errors.ErrInvalidFrameChecksum, got, f.Checksum)
+	}
+	return err
+}
+
+type FrameDescriptor struct {
+	Flags       DescriptorFlags // descriptor bit flags
+	ContentSize uint64          // content size; part of the header only when Flags.Size() is set
+	Checksum    uint8           // header checksum byte; non zero once the header has been written
+}
+
+// initW sets the flags used for every written frame: version 1 and independent blocks.
+func (fd *FrameDescriptor) initW() {
+	fd.Flags.VersionSet(1).BlockIndependenceSet(true)
+}
+
+// Write emits the frame header (magic number and, unless legacy, the descriptor) to dst.
+// Nothing is written when fd.Checksum is already non zero, i.e. the header was written.
+func (fd *FrameDescriptor) Write(f *Frame, dst io.Writer) error {
+	if fd.Checksum > 0 {
+		return nil // header already written
+	}
+	// The magic number is written here even though it belongs to the Frame.
+	hdr := f.buf[:4]
+	binary.LittleEndian.PutUint32(hdr, f.Magic)
+	if f.isLegacy() {
+		_, err := dst.Write(hdr)
+		return err
+	}
+	hdr = hdr[:6]
+	binary.LittleEndian.PutUint16(hdr[4:], uint16(fd.Flags))
+	if fd.Flags.Size() {
+		hdr = hdr[:14]
+		binary.LittleEndian.PutUint64(hdr[6:], fd.ContentSize)
+	}
+	// The checksum covers the descriptor only, not the magic number.
+	fd.Checksum = descriptorChecksum(hdr[4:])
+	_, err := dst.Write(append(hdr, fd.Checksum))
+	return err
+}
+
+// initR reads the frame descriptor from src and checks its checksum and block size; legacy frames only get the 8Mb block size index.
+func (fd *FrameDescriptor) initR(f *Frame, src io.Reader) error {
+	if f.isLegacy() {
+		f.Descriptor.Flags.BlockSizeIndexSet(lz4block.Index(lz4block.Block8Mb))
+		return nil
+	}
+	// Start with the 2 bytes of flags plus what is the checksum if no content size follows.
+	raw := f.buf[:3]
+	if _, err := io.ReadFull(src, raw); err != nil {
+		return err
+	}
+	fd.Flags = DescriptorFlags(binary.LittleEndian.Uint16(raw))
+	if fd.Flags.Size() {
+		// The content size is present: fetch the 8 missing bytes.
+		raw = raw[:3+8]
+		if _, err := io.ReadFull(src, raw[3:]); err != nil {
+			return err
+		}
+		fd.ContentSize = binary.LittleEndian.Uint64(raw[2:])
+	}
+	// The checksum is the last byte and covers all the bytes before it.
+	last := len(raw) - 1
+	fd.Checksum = raw[last]
+	if sum := descriptorChecksum(raw[:last]); fd.Checksum != sum {
+		return fmt.Errorf("%w: got %x; expected %x", lz4errors.ErrInvalidHeaderChecksum, sum, fd.Checksum)
+	}
+	// Validate the elements that can be.
+	if idx := fd.Flags.BlockSizeIndex(); !idx.IsValid() {
+		return lz4errors.ErrOptionInvalidBlockSize
+	}
+	return nil
+}
+
+// descriptorChecksum returns the frame header checksum of buf:
+// the second byte (bits 8 to 15) of its xxh32 hash with seed 0.
+func descriptorChecksum(buf []byte) byte { return byte(xxh32.ChecksumZero(buf) >> 8) }
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame_gen.go b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame_gen.go
new file mode 100644
index 00000000000..d33a6be95c3
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/lz4stream/frame_gen.go
@@ -0,0 +1,103 @@
+// Code generated by `gen.exe`. DO NOT EDIT.
+
+package lz4stream
+
+import "github.com/pierrec/lz4/v4/internal/lz4block"
+
+// DescriptorFlags is defined as follow:
+//   field              bits
+//   -----              ----
+//   _                  2
+//   ContentChecksum    1
+//   Size               1
+//   BlockChecksum      1
+//   BlockIndependence  1
+//   Version            2
+//   _                  4
+//   BlockSizeIndex     3
+//   _                  1
+type DescriptorFlags uint16
+
+// Getters.
+func (x DescriptorFlags) ContentChecksum() bool   { return x>>2&1 != 0 }
+func (x DescriptorFlags) Size() bool              { return x>>3&1 != 0 }
+func (x DescriptorFlags) BlockChecksum() bool     { return x>>4&1 != 0 }
+func (x DescriptorFlags) BlockIndependence() bool { return x>>5&1 != 0 }
+func (x DescriptorFlags) Version() uint16         { return uint16(x >> 6 & 0x3) }
+func (x DescriptorFlags) BlockSizeIndex() lz4block.BlockSizeIndex {
+	return lz4block.BlockSizeIndex(x >> 12 & 0x7)
+}
+
+// Setters.
+func (x *DescriptorFlags) ContentChecksumSet(v bool) *DescriptorFlags {
+	const b = 1 << 2
+	if v {
+		*x = *x&^b | b
+	} else {
+		*x &^= b
+	}
+	return x
+}
+func (x *DescriptorFlags) SizeSet(v bool) *DescriptorFlags {
+	const b = 1 << 3
+	if v {
+		*x = *x&^b | b
+	} else {
+		*x &^= b
+	}
+	return x
+}
+func (x *DescriptorFlags) BlockChecksumSet(v bool) *DescriptorFlags {
+	const b = 1 << 4
+	if v {
+		*x = *x&^b | b
+	} else {
+		*x &^= b
+	}
+	return x
+}
+func (x *DescriptorFlags) BlockIndependenceSet(v bool) *DescriptorFlags {
+	const b = 1 << 5
+	if v {
+		*x = *x&^b | b
+	} else {
+		*x &^= b
+	}
+	return x
+}
+func (x *DescriptorFlags) VersionSet(v uint16) *DescriptorFlags {
+	*x = *x&^(0x3<<6) | (DescriptorFlags(v) & 0x3 << 6)
+	return x
+}
+func (x *DescriptorFlags) BlockSizeIndexSet(v lz4block.BlockSizeIndex) *DescriptorFlags {
+	*x = *x&^(0x7<<12) | (DescriptorFlags(v) & 0x7 << 12)
+	return x
+}
+
+// Code generated by `gen.exe`. DO NOT EDIT.
+
+// DataBlockSize is defined as follow:
+//   field         bits
+//   -----         ----
+//   size          31
+//   Uncompressed  1
+type DataBlockSize uint32
+
+// Getters.
+func (x DataBlockSize) size() int          { return int(x & 0x7FFFFFFF) }
+func (x DataBlockSize) Uncompressed() bool { return x>>31&1 != 0 }
+
+// Setters.
+func (x *DataBlockSize) sizeSet(v int) *DataBlockSize {
+	*x = *x&^0x7FFFFFFF | DataBlockSize(v)&0x7FFFFFFF
+	return x
+}
+func (x *DataBlockSize) UncompressedSet(v bool) *DataBlockSize {
+	const b = 1 << 31
+	if v {
+		*x = *x&^b | b
+	} else {
+		*x &^= b
+	}
+	return x
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero.go b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero.go
new file mode 100644
index 00000000000..8d3206a87c5
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero.go
@@ -0,0 +1,212 @@
+// Package xxh32 implements the very fast XXH hashing algorithm (32 bits version).
+// (https://github.com/Cyan4973/XXH/)
+package xxh32
+
+import (
+	"encoding/binary"
+)
+
+const (
+	prime1 uint32 = 2654435761
+	prime2 uint32 = 2246822519
+	prime3 uint32 = 3266489917
+	prime4 uint32 = 668265263
+	prime5 uint32 = 374761393
+
+	primeMask   = 0xFFFFFFFF
+	prime1plus2 = uint32((uint64(prime1) + uint64(prime2)) & primeMask) // 606290984
+	prime1minus = uint32((-int64(prime1)) & primeMask)                  // 1640531535
+)
+
+// XXHZero represents an xxhash32 object with seed 0.
+type XXHZero struct {
+	v        [4]uint32 // the four running accumulators
+	totalLen uint64    // total number of bytes written
+	buf      [16]byte  // input bytes not yet folded into v
+	bufused  int       // number of valid bytes in buf
+}
+
+// Sum appends the current hash to b and returns the resulting slice.
+// It does not change the underlying hash state.
+func (xxh XXHZero) Sum(b []byte) []byte {
+	h32 := xxh.Sum32()
+	return append(b, byte(h32), byte(h32>>8), byte(h32>>16), byte(h32>>24))
+}
+
+// Reset puts the hash back into its initial state: seeded accumulators,
+// no byte counted and nothing buffered. The content of buf is kept as is
+// since bufused marks it as empty.
+func (xxh *XXHZero) Reset() {
+	xxh.v = [4]uint32{
+		prime1plus2, prime2, 0, prime1minus,
+	}
+	xxh.totalLen, xxh.bufused = 0, 0
+}
+
+// Size returns the number of bytes appended by Sum().
+func (xxh *XXHZero) Size() int {
+	return 32 / 8 // the hash is 32 bits wide
+}
+
+// BlockSize gives the minimum number of bytes accepted by Write().
+func (xxh *XXHZero) BlockSize() int {
+	return 1
+}
+
+// Write adds input bytes to the Hash.
+// It never returns an error and always reports len(input) bytes as written.
+func (xxh *XXHZero) Write(input []byte) (int, error) {
+	if xxh.totalLen == 0 {
+		xxh.Reset()
+	}
+	written := len(input) // kept aside: input is resliced below
+	m := xxh.bufused
+
+	xxh.totalLen += uint64(written)
+
+	if written < len(xxh.buf)-m {
+		// Not enough bytes to complete a 16-byte block: buffer them all.
+		xxh.bufused += copy(xxh.buf[m:], input)
+		return written, nil
+	}
+
+	var buf *[16]byte
+	if m != 0 {
+		// some data left from previous update: complete the buffered block
+		buf = &xxh.buf
+		c := copy(buf[m:], input)
+		input = input[c:]
+	}
+	// Process the completed buffer (if any) and all full 16-byte blocks of input.
+	update(&xxh.v, buf, input)
+	// Keep the remaining tail for the next call.
+	xxh.bufused = copy(xxh.buf[:], input[len(input)-len(input)%16:])
+
+	return written, nil
+}
+
+// updateGo is the portable version of update. It updates v by processing all
+// of buf (if not nil), then all full 16-byte blocks of input; a shorter tail is ignored.
+func updateGo(v *[4]uint32, buf *[16]byte, input []byte) {
+	// Local copies let the compiler work directly from registers instead of stack:
+	v1, v2, v3, v4 := v[0], v[1], v[2], v[3]
+
+	if buf != nil {
+		v1 = rol13(v1+binary.LittleEndian.Uint32(buf[:])*prime2) * prime1
+		v2 = rol13(v2+binary.LittleEndian.Uint32(buf[4:])*prime2) * prime1
+		v3 = rol13(v3+binary.LittleEndian.Uint32(buf[8:])*prime2) * prime1
+		v4 = rol13(v4+binary.LittleEndian.Uint32(buf[12:])*prime2) * prime1
+	}
+
+	for ; len(input) >= 16; input = input[16:] {
+		sub := input[:16] // BCE (bounds check elimination) hint for the compiler
+		v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime2) * prime1
+		v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime2) * prime1
+		v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime2) * prime1
+		v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime2) * prime1
+	}
+	v[0], v[1], v[2], v[3] = v1, v2, v3, v4
+}
+
+// Sum32 returns the 32 bits Hash value.
+// The hash state is not modified.
+func (xxh *XXHZero) Sum32() uint32 {
+	h32 := uint32(xxh.totalLen)
+	// Test the full 64-bit length: its low 32 bits alone wrap around every 4GiB.
+	if xxh.totalLen >= 16 {
+		h32 += rol1(xxh.v[0]) + rol7(xxh.v[1]) + rol12(xxh.v[2]) + rol18(xxh.v[3])
+	} else {
+		h32 += prime5
+	}
+	// Fold in the buffered tail: whole 32-bit words first, then single bytes.
+	p, n := 0, xxh.bufused
+	buf := xxh.buf
+	for n := n - 4; p <= n; p += 4 {
+		h32 += binary.LittleEndian.Uint32(buf[p:p+4]) * prime3
+		h32 = rol17(h32) * prime4
+	}
+	for ; p < n; p++ {
+		h32 += uint32(buf[p]) * prime5
+		h32 = rol11(h32) * prime1
+	}
+	// Final mixing of the bits.
+	h32 ^= h32 >> 15
+	h32 *= prime2
+	h32 ^= h32 >> 13
+	h32 *= prime3
+	h32 ^= h32 >> 16
+	return h32
+}
+
+// checksumZeroGo is the portable version of ChecksumZero: it hashes input in one go.
+func checksumZeroGo(input []byte) uint32 {
+	n := len(input)
+	h32 := uint32(n) // the input length is part of the hash
+
+	if n < 16 {
+		h32 += prime5
+	} else {
+		v1 := prime1plus2
+		v2 := prime2
+		v3 := uint32(0)
+		v4 := prime1minus
+		p := 0
+		for n := n - 16; p <= n; p += 16 {
+			sub := input[p:][:16] // BCE (bounds check elimination) hint for the compiler
+			v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime2) * prime1
+			v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime2) * prime1
+			v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime2) * prime1
+			v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime2) * prime1
+		}
+		input = input[p:] // drop the 16-byte blocks consumed above
+		n -= p
+		h32 += rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
+	}
+
+	p := 0
+	for n := n - 4; p <= n; p += 4 { // remaining whole 32-bit words
+		h32 += binary.LittleEndian.Uint32(input[p:p+4]) * prime3
+		h32 = rol17(h32) * prime4
+	}
+	for p < n { // remaining single bytes
+		h32 += uint32(input[p]) * prime5
+		h32 = rol11(h32) * prime1
+		p++
+	}
+
+	h32 ^= h32 >> 15 // final mixing of the bits starts here
+	h32 *= prime2
+	h32 ^= h32 >> 13
+	h32 *= prime3
+	h32 ^= h32 >> 16
+
+	return h32
+}
+
+// rol1 returns u rotated left by 1 bit:
+// the bit shifted out at the top comes back in at the bottom.
+func rol1(u uint32) uint32 { return u>>31 | u<<1 }
+
+// rol7 returns u rotated left by 7 bits:
+// the bits shifted out at the top come back in at the bottom.
+func rol7(u uint32) uint32 { return u>>25 | u<<7 }
+
+// rol11 returns u rotated left by 11 bits:
+// the bits shifted out at the top come back in at the bottom.
+func rol11(u uint32) uint32 { return u>>21 | u<<11 }
+
+// rol12 returns u rotated left by 12 bits:
+// the bits shifted out at the top come back in at the bottom.
+func rol12(u uint32) uint32 { return u>>20 | u<<12 }
+
+// rol13 returns u rotated left by 13 bits:
+// the bits shifted out at the top come back in at the bottom.
+func rol13(u uint32) uint32 { return u>>19 | u<<13 }
+
+// rol17 returns u rotated left by 17 bits:
+// the bits shifted out at the top come back in at the bottom.
+func rol17(u uint32) uint32 { return u>>15 | u<<17 }
+
+// rol18 returns u rotated left by 18 bits:
+// the bits shifted out at the top come back in at the bottom.
+func rol18(u uint32) uint32 { return u>>14 | u<<18 }
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.go b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.go
new file mode 100644
index 00000000000..0978b2665bd
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.go
@@ -0,0 +1,11 @@
+// +build !noasm
+
+package xxh32
+
+// ChecksumZero returns the 32-bit hash of input.
+//
+//go:noescape
+func ChecksumZero(input []byte) uint32 // implemented in assembly (xxh32zero_arm.s)
+
+//go:noescape
+func update(v *[4]uint32, buf *[16]byte, input []byte) // implemented in assembly (xxh32zero_arm.s)
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.s b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.s
new file mode 100644
index 00000000000..0e9f146a36a
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_arm.s
@@ -0,0 +1,259 @@
+// +build !noasm
+
+#include "textflag.h"
+
+#define prime1		$2654435761
+#define prime2		$2246822519
+#define prime3		$3266489917
+#define prime4		$668265263
+#define prime5		$374761393
+
+#define prime1plus2	$606290984
+#define prime1minus	$1640531535
+
+// Register allocation.
+#define p	R0
+#define n	R1
+#define h	R2
+#define v1	R2	// Alias for h.
+#define v2	R3
+#define v3	R4
+#define v4	R5
+#define x1	R6
+#define x2	R7
+#define x3	R8
+#define x4	R9
+
+// We need the primes in registers. The 16-byte loop only uses prime{1,2}.
+#define prime1r	R11
+#define prime2r	R12
+#define prime3r	R3	// The rest can alias v{2-4}.
+#define prime4r	R4
+#define prime5r	R5
+
+// Update round macros. These read from and increment p.
+
+#define round16aligned			\
+	MOVM.IA.W (p), [x1, x2, x3, x4]	\
+					\
+	MULA x1, prime2r, v1, v1	\
+	MULA x2, prime2r, v2, v2	\
+	MULA x3, prime2r, v3, v3	\
+	MULA x4, prime2r, v4, v4	\
+					\
+	MOVW v1 @> 19, v1		\
+	MOVW v2 @> 19, v2		\
+	MOVW v3 @> 19, v3		\
+	MOVW v4 @> 19, v4		\
+					\
+	MUL prime1r, v1			\
+	MUL prime1r, v2			\
+	MUL prime1r, v3			\
+	MUL prime1r, v4			\
+
+#define round16unaligned 		\
+	MOVBU.P  16(p), x1		\
+	MOVBU   -15(p), x2		\
+	ORR     x2 <<  8, x1		\
+	MOVBU   -14(p), x3		\
+	MOVBU   -13(p), x4		\
+	ORR     x4 <<  8, x3		\
+	ORR     x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v1, v1	\
+	MOVW v1 @> 19, v1		\
+	MUL prime1r, v1			\
+					\
+	MOVBU -12(p), x1		\
+	MOVBU -11(p), x2		\
+	ORR   x2 <<  8, x1		\
+	MOVBU -10(p), x3		\
+	MOVBU  -9(p), x4		\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v2, v2	\
+	MOVW v2 @> 19, v2		\
+	MUL prime1r, v2			\
+					\
+	MOVBU -8(p), x1			\
+	MOVBU -7(p), x2			\
+	ORR   x2 <<  8, x1		\
+	MOVBU -6(p), x3			\
+	MOVBU -5(p), x4			\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v3, v3	\
+	MOVW v3 @> 19, v3		\
+	MUL prime1r, v3			\
+					\
+	MOVBU -4(p), x1			\
+	MOVBU -3(p), x2			\
+	ORR   x2 <<  8, x1		\
+	MOVBU -2(p), x3			\
+	MOVBU -1(p), x4			\
+	ORR   x4 <<  8, x3		\
+	ORR   x3 << 16, x1		\
+					\
+	MULA x1, prime2r, v4, v4	\
+	MOVW v4 @> 19, v4		\
+	MUL prime1r, v4			\
+
+
+// func ChecksumZero([]byte) uint32
+TEXT ·ChecksumZero(SB), NOFRAME|NOSPLIT, $-4-16
+	MOVW input_base+0(FP), p
+	MOVW input_len+4(FP),  n
+
+	MOVW prime1, prime1r
+	MOVW prime2, prime2r
+
+	// Set up h for n < 16. It's tempting to say {ADD prime5, n, h}
+	// here, but that's a pseudo-op that generates a load through R11.
+	MOVW prime5, prime5r
+	ADD  prime5r, n, h
+	CMP  $0, n
+	BEQ  end
+
+	// We let n go negative so we can do comparisons with SUB.S
+	// instead of separate CMP.
+	SUB.S $16, n
+	BMI   loop16done
+
+	MOVW prime1plus2, v1
+	MOVW prime2,      v2
+	MOVW $0,          v3
+	MOVW prime1minus, v4
+
+	TST $3, p
+	BNE loop16unaligned
+
+loop16aligned:
+	SUB.S $16, n
+	round16aligned
+	BPL loop16aligned
+	B   loop16finish
+
+loop16unaligned:
+	SUB.S $16, n
+	round16unaligned
+	BPL loop16unaligned
+
+loop16finish:
+	MOVW v1 @> 31, h
+	ADD  v2 @> 25, h
+	ADD  v3 @> 20, h
+	ADD  v4 @> 14, h
+
+	// h += len(input) with v2 as temporary.
+	MOVW input_len+4(FP), v2
+	ADD  v2, h
+
+loop16done:
+	ADD $16, n	// Restore number of bytes left.
+
+	SUB.S $4, n
+	MOVW  prime3, prime3r
+	BMI   loop4done
+	MOVW  prime4, prime4r
+
+	TST $3, p
+	BNE loop4unaligned
+
+loop4aligned:
+	SUB.S $4, n
+
+	MOVW.P 4(p), x1
+	MULA   prime3r, x1, h, h
+	MOVW   h @> 15, h
+	MUL    prime4r, h
+
+	BPL loop4aligned
+	B   loop4done
+
+loop4unaligned:
+	SUB.S $4, n
+
+	MOVBU.P  4(p), x1
+	MOVBU   -3(p), x2
+	ORR     x2 <<  8, x1
+	MOVBU   -2(p), x3
+	ORR     x3 << 16, x1
+	MOVBU   -1(p), x4
+	ORR     x4 << 24, x1
+
+	MULA prime3r, x1, h, h
+	MOVW h @> 15, h
+	MUL  prime4r, h
+
+	BPL loop4unaligned
+
+loop4done:
+	ADD.S $4, n	// Restore number of bytes left.
+	BEQ   end
+
+	MOVW prime5, prime5r
+
+loop1:
+	SUB.S $1, n
+
+	MOVBU.P 1(p), x1
+	MULA    prime5r, x1, h, h
+	MOVW    h @> 21, h
+	MUL     prime1r, h
+
+	BNE loop1
+
+end:
+	MOVW prime3, prime3r
+	EOR  h >> 15, h
+	MUL  prime2r, h
+	EOR  h >> 13, h
+	MUL  prime3r, h
+	EOR  h >> 16, h
+
+	MOVW h, ret+12(FP)
+	RET
+
+
+// func update(v *[4]uint32, buf *[16]byte, input []byte)
+TEXT ·update(SB), NOFRAME|NOSPLIT, $-4-20
+	MOVW    v+0(FP), p
+	MOVM.IA (p), [v1, v2, v3, v4]
+
+	MOVW prime1, prime1r
+	MOVW prime2, prime2r
+
+	// Process buf, if not nil.
+	MOVW buf+4(FP), p
+	CMP  $0, p
+	BEQ  noBuffered
+
+	round16aligned
+
+noBuffered:
+	MOVW input_base +8(FP), p
+	MOVW input_len +12(FP), n
+
+	SUB.S $16, n
+	BMI   end
+
+	TST $3, p
+	BNE loop16unaligned
+
+loop16aligned:
+	SUB.S $16, n
+	round16aligned
+	BPL loop16aligned
+	B   end
+
+loop16unaligned:
+	SUB.S $16, n
+	round16unaligned
+	BPL loop16unaligned
+
+end:
+	MOVW    v+0(FP), p
+	MOVM.IA [v1, v2, v3, v4], (p)
+	RET
diff --git a/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_other.go b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_other.go
new file mode 100644
index 00000000000..c96b59b8c3f
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/internal/xxh32/xxh32zero_other.go
@@ -0,0 +1,10 @@
+// +build !arm noasm
+
+package xxh32
+
+// ChecksumZero returns the 32-bit hash of input, computed by the portable Go implementation.
+func ChecksumZero(input []byte) uint32 { return checksumZeroGo(input) }
+
+// update folds buf and input into v using the portable Go implementation.
+// It is the version built when the arm assembly is not used (see the build tags).
+func update(v *[4]uint32, buf *[16]byte, input []byte) { updateGo(v, buf, input) }
diff --git a/vendor/github.com/pierrec/lz4/v4/lz4.go b/vendor/github.com/pierrec/lz4/v4/lz4.go
new file mode 100644
index 00000000000..c585d4064f5
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/lz4.go
@@ -0,0 +1,147 @@
+// Package lz4 implements reading and writing lz4 compressed data.
+//
+// The package supports both the LZ4 stream format,
+// as specified in http://fastcompression.blogspot.fr/2013/04/lz4-streaming-format-final.html,
+// and the LZ4 block format, defined at
+// http://fastcompression.blogspot.fr/2011/05/lz4-explained.html.
+//
+// See https://github.com/lz4/lz4 for the reference C implementation.
+package lz4
+
+import (
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+)
+
+func _() {
+	// Compile time checks: a constant that differs from its lz4block twin makes the index out of range.
+	var x [1]struct{}
+	_ = x[lz4block.CompressionLevel(Fast)-lz4block.Fast]
+	_ = x[Block64Kb-BlockSize(lz4block.Block64Kb)]
+	_ = x[Block256Kb-BlockSize(lz4block.Block256Kb)]
+	_ = x[Block1Mb-BlockSize(lz4block.Block1Mb)]
+	_ = x[Block4Mb-BlockSize(lz4block.Block4Mb)]
+}
+
+// CompressBlockBound returns the maximum size that a buffer of size n can reach once compressed, i.e. when it is not compressible.
+func CompressBlockBound(n int) int {
+	return lz4block.CompressBlockBound(n)
+}
+
+// UncompressBlock uncompresses the source buffer into the destination one,
+// and returns the uncompressed size.
+//
+// The destination buffer must be sized appropriately.
+//
+// An error is returned if the source data is invalid or the destination buffer is too small.
+func UncompressBlock(src, dst []byte) (int, error) {
+	return lz4block.UncompressBlock(src, dst)
+}
+
+// A Compressor compresses data into the LZ4 block format.
+// It uses a fast compression algorithm.
+//
+// A Compressor is not safe for concurrent use by multiple goroutines.
+//
+// Use a Writer to compress into the LZ4 stream format.
+type Compressor struct{ c lz4block.Compressor }
+
+// CompressBlock compresses the source buffer src into the destination dst.
+//
+// If compression is successful, the first return value is the size of the
+// compressed data, which is always >0.
+//
+// If dst has length at least CompressBlockBound(len(src)), compression always
+// succeeds. Otherwise, the first return value is zero. The error return is
+// non-nil if the compressed data does not fit in dst, but it might fit in a
+// larger buffer that is still smaller than CompressBlockBound(len(src)). The
+// return value (0, nil) means the data is likely incompressible and a buffer
+// of length CompressBlockBound(len(src)) should be passed in.
+func (c *Compressor) CompressBlock(src, dst []byte) (int, error) {
+	return c.c.CompressBlock(src, dst)
+}
+
+// CompressBlock compresses the source buffer into the destination one.
+// This is the fast version of LZ4 compression and also the default one.
+//
+// The argument hashTable is scratch space for a hash table used by the
+// compressor. If provided, it should have length at least 1<<16. If it is
+// shorter (or nil), CompressBlock allocates its own hash table.
+//
+// The size of the compressed data is returned.
+//
+// If the destination buffer size is lower than CompressBlockBound and
+// the compressed size is 0 and no error, then the data is incompressible.
+//
+// An error is returned if the destination buffer is too small.
+
+// CompressBlock is equivalent to Compressor.CompressBlock.
+// The final argument is ignored and should be set to nil.
+//
+// Deprecated: Use a Compressor instead.
+func CompressBlock(src, dst []byte, _ []int) (int, error) {
+	return lz4block.CompressBlock(src, dst)
+}
+
+// A CompressorHC compresses data into the LZ4 block format.
+// Its compression ratio is potentially better than that of a Compressor,
+// but it is also slower and requires more memory.
+//
+// A CompressorHC is not safe for concurrent use by multiple goroutines.
+//
+// Use a Writer to compress into the LZ4 stream format.
+type CompressorHC struct {
+	// Level is the maximum search depth for compression.
+	// Values <= 0 mean no maximum.
+	Level CompressionLevel
+	c     lz4block.CompressorHC
+}
+
+// CompressBlock compresses the source buffer src into the destination dst.
+//
+// If compression is successful, the first return value is the size of the
+// compressed data, which is always >0.
+//
+// If dst has length at least CompressBlockBound(len(src)), compression always
+// succeeds. Otherwise, the first return value is zero. The error return is
+// non-nil if the compressed data does not fit in dst, but it might fit in a
+// larger buffer that is still smaller than CompressBlockBound(len(src)). The
+// return value (0, nil) means the data is likely incompressible and a buffer
+// of length CompressBlockBound(len(src)) should be passed in.
+func (c *CompressorHC) CompressBlock(src, dst []byte) (int, error) {
+	return c.c.CompressBlock(src, dst, lz4block.CompressionLevel(c.Level))
+}
+
+// CompressBlockHC is equivalent to CompressorHC.CompressBlock.
+// The final two arguments are ignored and should be set to nil.
+//
+// Deprecated: Use a CompressorHC instead.
+func CompressBlockHC(src, dst []byte, depth CompressionLevel, _, _ []int) (int, error) {
+	return lz4block.CompressBlockHC(src, dst, lz4block.CompressionLevel(depth))
+}
+
+const (
+	// ErrInvalidSourceShortBuffer is returned by UncompressBlock or CompressBlock when a compressed
+	// block is corrupted or the destination buffer is not large enough for the uncompressed data.
+	ErrInvalidSourceShortBuffer = lz4errors.ErrInvalidSourceShortBuffer
+	// ErrInvalidFrame is returned when reading an invalid LZ4 archive.
+	ErrInvalidFrame = lz4errors.ErrInvalidFrame
+	// ErrInternalUnhandledState is an internal error.
+	ErrInternalUnhandledState = lz4errors.ErrInternalUnhandledState
+	// ErrInvalidHeaderChecksum is returned when reading a frame.
+	ErrInvalidHeaderChecksum = lz4errors.ErrInvalidHeaderChecksum
+	// ErrInvalidBlockChecksum is returned when reading a frame.
+	ErrInvalidBlockChecksum = lz4errors.ErrInvalidBlockChecksum
+	// ErrInvalidFrameChecksum is returned when reading a frame.
+	ErrInvalidFrameChecksum = lz4errors.ErrInvalidFrameChecksum
+	// ErrOptionInvalidCompressionLevel is returned when the supplied compression level is invalid.
+	ErrOptionInvalidCompressionLevel = lz4errors.ErrOptionInvalidCompressionLevel
+	// ErrOptionClosedOrError is returned when an option is applied to a closed or in error object.
+	ErrOptionClosedOrError = lz4errors.ErrOptionClosedOrError
+	// ErrOptionInvalidBlockSize is returned when a block size is invalid.
+	ErrOptionInvalidBlockSize = lz4errors.ErrOptionInvalidBlockSize
+	// ErrOptionNotApplicable is returned when trying to apply an option to an object not supporting it.
+	ErrOptionNotApplicable = lz4errors.ErrOptionNotApplicable
+	// ErrWriterNotClosed is returned when attempting to reset an unclosed writer.
+	ErrWriterNotClosed = lz4errors.ErrWriterNotClosed
+)
diff --git a/vendor/github.com/pierrec/lz4/v4/options.go b/vendor/github.com/pierrec/lz4/v4/options.go
new file mode 100644
index 00000000000..4e1b6703b57
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/options.go
@@ -0,0 +1,213 @@
+package lz4
+
+import (
+	"fmt"
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+	"reflect"
+	"runtime"
+)
+
+//go:generate go run golang.org/x/tools/cmd/stringer -type=BlockSize,CompressionLevel -output options_gen.go
+
+type (
+	applier interface {
+		Apply(...Option) error
+		private() // unexported method: types outside of this package cannot declare it
+	}
+	// Option defines the parameters to setup an LZ4 Writer or Reader.
+	Option func(applier) error
+)
+
+// String returns a string representation of the option with its parameter(s).
+// The options of this package return that text as an error when they are
+// applied to a nil applier, which is how it is obtained here.
+func (o Option) String() string { return o(nil).Error() }
+
+// Default options.
+var (
+	DefaultBlockSizeOption = BlockSizeOption(Block4Mb)
+	DefaultChecksumOption  = ChecksumOption(true)
+	DefaultConcurrency     = ConcurrencyOption(1)
+	defaultOnBlockDone     = OnBlockDoneOption(nil)
+)
+
+const (
+	Block64Kb BlockSize = 1 << (16 + iota*2)
+	Block256Kb
+	Block1Mb
+	Block4Mb
+)
+
+// BlockSize defines the size of the blocks to be compressed.
+type BlockSize uint32
+
+// BlockSizeOption sets the block size of a Writer (default=Block4Mb); a size that lz4block does not accept yields ErrOptionInvalidBlockSize.
+func BlockSizeOption(size BlockSize) Option {
+	return func(a applier) error {
+		switch w := a.(type) {
+		case *Writer:
+			n := uint32(size)
+			if !lz4block.IsValid(n) {
+				return fmt.Errorf("%w: %d", lz4errors.ErrOptionInvalidBlockSize, n)
+			}
+			w.frame.Descriptor.Flags.BlockSizeIndexSet(lz4block.Index(n))
+			return nil
+		case nil:
+			return lz4errors.Error(fmt.Sprintf("BlockSizeOption(%s)", size))
+		default:
+			return lz4errors.ErrOptionNotApplicable
+		}
+	}
+}
+
+// BlockChecksumOption turns the per block checksum of a Writer on or off (default=false).
+func BlockChecksumOption(flag bool) Option {
+	return func(a applier) error {
+		switch w := a.(type) {
+		case *Writer:
+			w.frame.Descriptor.Flags.BlockChecksumSet(flag)
+			return nil
+		case nil:
+			return lz4errors.Error(fmt.Sprintf("BlockChecksumOption(%v)", flag))
+		default:
+			return lz4errors.ErrOptionNotApplicable
+		}
+	}
+}
+
+// ChecksumOption turns the content checksum flag of the frame written by a Writer on or off (default=true).
+func ChecksumOption(flag bool) Option {
+	return func(a applier) error {
+		switch w := a.(type) {
+		case *Writer:
+			w.frame.Descriptor.Flags.ContentChecksumSet(flag)
+			return nil
+		case nil:
+			return lz4errors.Error(fmt.Sprintf("ChecksumOption(%v)", flag))
+		default:
+			return lz4errors.ErrOptionNotApplicable
+		}
+	}
+}
+
+// SizeOption records the size of the original uncompressed data in the frame of a Writer (default=0).
+// The size flag of the descriptor is only set for a size greater than 0.
+func SizeOption(size uint64) Option {
+	return func(a applier) error {
+		switch w := a.(type) {
+		case *Writer:
+			w.frame.Descriptor.Flags.SizeSet(size > 0)
+			w.frame.Descriptor.ContentSize = size
+			return nil
+		case nil:
+			return lz4errors.Error(fmt.Sprintf("SizeOption(%d)", size))
+		default:
+			return lz4errors.ErrOptionNotApplicable
+		}
+	}
+}
+
+// ConcurrencyOption sets the number of go routines used by a Writer or a Reader.
+// If n <= 0, then the output of runtime.GOMAXPROCS(0) is used.
+func ConcurrencyOption(n int) Option {
+	if n <= 0 {
+		n = runtime.GOMAXPROCS(0)
+	}
+	return func(a applier) error {
+		switch rw := a.(type) {
+		case *Writer:
+			rw.num = n
+			return nil
+		case *Reader:
+			rw.num = n
+			return nil
+		case nil:
+			return lz4errors.Error(fmt.Sprintf("ConcurrencyOption(%d)", n))
+		default:
+			return lz4errors.ErrOptionNotApplicable
+		}
+	}
+}
+
+// CompressionLevel defines the level of compression to use. The higher the better, but slower, compression.
+type CompressionLevel uint32
+
+const (
+	Fast   CompressionLevel = 0
+	Level1 CompressionLevel = 1 << (8 + iota)
+	Level2
+	Level3
+	Level4
+	Level5
+	Level6
+	Level7
+	Level8
+	Level9
+)
+
+// CompressionLevelOption sets the compression level of a Writer (default=Fast).
+func CompressionLevelOption(level CompressionLevel) Option {
+	return func(a applier) error {
+		switch w := a.(type) {
+		case *Writer:
+			switch level {
+			case Fast, Level1, Level2, Level3, Level4, Level5, Level6, Level7, Level8, Level9:
+				w.level = lz4block.CompressionLevel(level)
+				return nil
+			}
+			// Anything else is not a known level.
+			return fmt.Errorf("%w: %d", lz4errors.ErrOptionInvalidCompressionLevel, level)
+		case nil:
+			return lz4errors.Error(fmt.Sprintf("CompressionLevelOption(%s)", level))
+		default:
+			return lz4errors.ErrOptionNotApplicable
+		}
+	}
+}
+
+func onBlockDone(int) {} // no-op handler, installed by OnBlockDoneOption when it is given nil
+
+// OnBlockDoneOption sets the handler triggered when a block has been processed. For a Writer, it is when it has
+// been compressed, for a Reader, it is when it has been uncompressed. A nil handler selects a no-op one.
+func OnBlockDoneOption(handler func(size int)) Option {
+	if handler == nil {
+		handler = onBlockDone
+	}
+	return func(a applier) error {
+		switch rw := a.(type) {
+		case *Writer:
+			rw.handler = handler
+			return nil
+		case *Reader:
+			rw.handler = handler
+			return nil
+		case nil:
+			return lz4errors.Error(fmt.Sprintf("OnBlockDoneOption(%s)", reflect.TypeOf(handler).String()))
+		default:
+			return lz4errors.ErrOptionNotApplicable
+		}
+	}
+}
+
+// LegacyOption makes a Writer produce LZ4 frames in the legacy format.
+//
+// See https://github.com/lz4/lz4/blob/dev/doc/lz4_Frame_format.md#legacy-frame.
+//
+// NB. compressed Linux kernel images use a tweaked LZ4 legacy format where
+// the compressed stream is followed by the original (uncompressed) size of
+// the kernel (https://events.static.linuxfound.org/sites/events/files/lcjpcojp13_klee.pdf).
+// This is also supported as a special case.
+func LegacyOption(legacy bool) Option {
+	return func(a applier) error {
+		switch rw := a.(type) {
+		case *Writer:
+			rw.legacy = legacy
+			return nil
+		case nil:
+			return lz4errors.Error(fmt.Sprintf("LegacyOption(%v)", legacy))
+		default:
+			return lz4errors.ErrOptionNotApplicable
+		}
+	}
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/options_gen.go b/vendor/github.com/pierrec/lz4/v4/options_gen.go
new file mode 100644
index 00000000000..2de814909ef
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/options_gen.go
@@ -0,0 +1,92 @@
+// Code generated by "stringer -type=BlockSize,CompressionLevel -output options_gen.go"; DO NOT EDIT.
+
+package lz4
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[Block64Kb-65536]
+	_ = x[Block256Kb-262144]
+	_ = x[Block1Mb-1048576]
+	_ = x[Block4Mb-4194304]
+}
+
+const (
+	_BlockSize_name_0 = "Block64Kb"
+	_BlockSize_name_1 = "Block256Kb"
+	_BlockSize_name_2 = "Block1Mb"
+	_BlockSize_name_3 = "Block4Mb"
+)
+
+func (i BlockSize) String() string {
+	switch {
+	case i == 65536:
+		return _BlockSize_name_0
+	case i == 262144:
+		return _BlockSize_name_1
+	case i == 1048576:
+		return _BlockSize_name_2
+	case i == 4194304:
+		return _BlockSize_name_3
+	default:
+		return "BlockSize(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+}
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[Fast-0]
+	_ = x[Level1-512]
+	_ = x[Level2-1024]
+	_ = x[Level3-2048]
+	_ = x[Level4-4096]
+	_ = x[Level5-8192]
+	_ = x[Level6-16384]
+	_ = x[Level7-32768]
+	_ = x[Level8-65536]
+	_ = x[Level9-131072]
+}
+
+const (
+	_CompressionLevel_name_0 = "Fast"
+	_CompressionLevel_name_1 = "Level1"
+	_CompressionLevel_name_2 = "Level2"
+	_CompressionLevel_name_3 = "Level3"
+	_CompressionLevel_name_4 = "Level4"
+	_CompressionLevel_name_5 = "Level5"
+	_CompressionLevel_name_6 = "Level6"
+	_CompressionLevel_name_7 = "Level7"
+	_CompressionLevel_name_8 = "Level8"
+	_CompressionLevel_name_9 = "Level9"
+)
+
+func (i CompressionLevel) String() string {
+	switch {
+	case i == 0:
+		return _CompressionLevel_name_0
+	case i == 512:
+		return _CompressionLevel_name_1
+	case i == 1024:
+		return _CompressionLevel_name_2
+	case i == 2048:
+		return _CompressionLevel_name_3
+	case i == 4096:
+		return _CompressionLevel_name_4
+	case i == 8192:
+		return _CompressionLevel_name_5
+	case i == 16384:
+		return _CompressionLevel_name_6
+	case i == 32768:
+		return _CompressionLevel_name_7
+	case i == 65536:
+		return _CompressionLevel_name_8
+	case i == 131072:
+		return _CompressionLevel_name_9
+	default:
+		return "CompressionLevel(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/reader.go b/vendor/github.com/pierrec/lz4/v4/reader.go
new file mode 100644
index 00000000000..403aaf697a3
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/reader.go
@@ -0,0 +1,243 @@
+package lz4
+
+import (
+	"io"
+
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+	"github.com/pierrec/lz4/v4/internal/lz4stream"
+)
+
+var readerStates = []aState{
+	noState:     newState,
+	errorState:  newState,
+	newState:    readState,
+	readState:   closedState,
+	closedState: newState,
+}
+
+// NewReader returns a new LZ4 frame decoder.
+func NewReader(r io.Reader) *Reader {
+	return newReader(r, false)
+}
+
+func newReader(r io.Reader, legacy bool) *Reader {
+	zr := &Reader{frame: lz4stream.NewFrame()}
+	zr.state.init(readerStates)
+	_ = zr.Apply(DefaultConcurrency, defaultOnBlockDone)
+	zr.Reset(r)
+	return zr
+}
+
+// Reader allows reading an LZ4 stream.
+type Reader struct {
+	state   _State
+	src     io.Reader        // source reader
+	num     int              // concurrency level
+	frame   *lz4stream.Frame // frame being read
+	data    []byte           // block buffer allocated in non concurrent mode
+	reads   chan []byte      // pending data
+	idx     int              // size of pending data
+	handler func(int)
+	cum     uint32
+}
+
+func (*Reader) private() {}
+
+func (r *Reader) Apply(options ...Option) (err error) {
+	defer r.state.check(&err)
+	switch r.state.state {
+	case newState:
+	case errorState:
+		return r.state.err
+	default:
+		return lz4errors.ErrOptionClosedOrError
+	}
+	for _, o := range options {
+		if err = o(r); err != nil {
+			return
+		}
+	}
+	return
+}
+
+// Size returns the size of the underlying uncompressed data, if set in the stream.
+func (r *Reader) Size() int {
+	switch r.state.state {
+	case readState, closedState:
+		if r.frame.Descriptor.Flags.Size() {
+			return int(r.frame.Descriptor.ContentSize)
+		}
+	}
+	return 0
+}
+
+func (r *Reader) isNotConcurrent() bool {
+	return r.num == 1
+}
+
+func (r *Reader) init() error {
+	data, err := r.frame.InitR(r.src, r.num)
+	if err != nil {
+		return err
+	}
+	r.reads = data
+	r.idx = 0
+	size := r.frame.Descriptor.Flags.BlockSizeIndex()
+	r.data = size.Get()
+	r.cum = 0
+	return nil
+}
+
+func (r *Reader) Read(buf []byte) (n int, err error) {
+	defer r.state.check(&err)
+	switch r.state.state {
+	case readState:
+	case closedState, errorState:
+		return 0, r.state.err
+	case newState:
+		// First initialization.
+		if err = r.init(); r.state.next(err) {
+			return
+		}
+	default:
+		return 0, r.state.fail()
+	}
+	for len(buf) > 0 {
+		var bn int
+		if r.idx == 0 {
+			if r.isNotConcurrent() {
+				bn, err = r.read(buf)
+			} else {
+				lz4block.Put(r.data)
+				r.data = <-r.reads
+				if len(r.data) == 0 {
+					// No uncompressed data: something went wrong or we are done.
+					err = r.frame.Blocks.ErrorR()
+				}
+			}
+			switch err {
+			case nil:
+			case io.EOF:
+				if er := r.frame.CloseR(r.src); er != nil {
+					err = er
+				}
+				lz4block.Put(r.data)
+				r.data = nil
+				return
+			default:
+				return
+			}
+		}
+		if bn == 0 {
+			// Fill buf with buffered data.
+			bn = copy(buf, r.data[r.idx:])
+			r.idx += bn
+			if r.idx == len(r.data) {
+				// All data read, get ready for the next Read.
+				r.idx = 0
+			}
+		}
+		buf = buf[bn:]
+		n += bn
+		r.handler(bn)
+	}
+	return
+}
+
+// read uncompresses the next block as follows:
+// - if buf has enough room, the block is uncompressed into it directly
+//   and the length of used space is returned
+// - else, the uncompressed data is stored in r.data and 0 is returned
+func (r *Reader) read(buf []byte) (int, error) {
+	block := r.frame.Blocks.Block
+	_, err := block.Read(r.frame, r.src, r.cum)
+	if err != nil {
+		return 0, err
+	}
+	var direct bool
+	dst := r.data[:cap(r.data)]
+	if len(buf) >= len(dst) {
+		// Uncompress directly into buf.
+		direct = true
+		dst = buf
+	}
+	dst, err = block.Uncompress(r.frame, dst, true)
+	if err != nil {
+		return 0, err
+	}
+	r.cum += uint32(len(dst))
+	if direct {
+		return len(dst), nil
+	}
+	r.data = dst
+	return 0, nil
+}
+
+// Reset clears the state of the Reader r such that it is equivalent to its
+// initial state from NewReader, but instead reading from reader.
+// No access to reader is performed.
+//
+// w.Close must be called before Reset.
+func (r *Reader) Reset(reader io.Reader) {
+	if r.data != nil {
+		lz4block.Put(r.data)
+		r.data = nil
+	}
+	r.frame.Reset(r.num)
+	r.state.reset()
+	r.src = reader
+	r.reads = nil
+}
+
+// WriteTo efficiently uncompresses the data from the Reader underlying source to w.
+func (r *Reader) WriteTo(w io.Writer) (n int64, err error) {
+	switch r.state.state {
+	case closedState, errorState:
+		return 0, r.state.err
+	case newState:
+		if err = r.init(); r.state.next(err) {
+			return
+		}
+	default:
+		return 0, r.state.fail()
+	}
+	defer r.state.nextd(&err)
+
+	var data []byte
+	if r.isNotConcurrent() {
+		size := r.frame.Descriptor.Flags.BlockSizeIndex()
+		data = size.Get()
+		defer lz4block.Put(data)
+	}
+	for {
+		var bn int
+		var dst []byte
+		if r.isNotConcurrent() {
+			bn, err = r.read(data)
+			dst = data[:bn]
+		} else {
+			lz4block.Put(dst)
+			dst = <-r.reads
+			bn = len(dst)
+			if bn == 0 {
+				// No uncompressed data: something went wrong or we are done.
+				err = r.frame.Blocks.ErrorR()
+			}
+		}
+		switch err {
+		case nil:
+		case io.EOF:
+			err = r.frame.CloseR(r.src)
+			return
+		default:
+			return
+		}
+		r.handler(bn)
+		bn, err = w.Write(dst)
+		n += int64(bn)
+		if err != nil {
+			return
+		}
+	}
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/state.go b/vendor/github.com/pierrec/lz4/v4/state.go
new file mode 100644
index 00000000000..d94f04d05eb
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/state.go
@@ -0,0 +1,75 @@
+package lz4
+
+import (
+	"errors"
+	"fmt"
+	"io"
+
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+)
+
+//go:generate go run golang.org/x/tools/cmd/stringer -type=aState -output state_gen.go
+
+const (
+	noState     aState = iota // uninitialized reader
+	errorState                // unrecoverable error encountered
+	newState                  // instantiated object
+	readState                 // reading data
+	writeState                // writing data
+	closedState               // all done
+)
+
+type (
+	aState uint8
+	_State struct {
+		states []aState
+		state  aState
+		err    error
+	}
+)
+
+func (s *_State) init(states []aState) {
+	s.states = states
+	s.state = states[0]
+}
+
+func (s *_State) reset() {
+	s.state = s.states[0]
+	s.err = nil
+}
+
+// next sets the state to the next one unless it is passed a non nil error.
+// It returns whether or not it is in error.
+func (s *_State) next(err error) bool {
+	if err != nil {
+		s.err = fmt.Errorf("%s: %w", s.state, err)
+		s.state = errorState
+		return true
+	}
+	s.state = s.states[s.state]
+	return false
+}
+
+// nextd is like next but for defers.
+func (s *_State) nextd(errp *error) bool {
+	return errp != nil && s.next(*errp)
+}
+
+// check sets s in error if not already in error and if the error is neither nil nor io.EOF.
+func (s *_State) check(errp *error) {
+	if s.state == errorState || errp == nil {
+		return
+	}
+	if err := *errp; err != nil {
+		s.err = fmt.Errorf("%w[%s]", err, s.state)
+		if !errors.Is(err, io.EOF) {
+			s.state = errorState
+		}
+	}
+}
+
+func (s *_State) fail() error {
+	s.state = errorState
+	s.err = fmt.Errorf("%w[%s]", lz4errors.ErrInternalUnhandledState, s.state)
+	return s.err
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/state_gen.go b/vendor/github.com/pierrec/lz4/v4/state_gen.go
new file mode 100644
index 00000000000..75fb8289243
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/state_gen.go
@@ -0,0 +1,28 @@
+// Code generated by "stringer -type=aState -output state_gen.go"; DO NOT EDIT.
+
+package lz4
+
+import "strconv"
+
+func _() {
+	// An "invalid array index" compiler error signifies that the constant values have changed.
+	// Re-run the stringer command to generate them again.
+	var x [1]struct{}
+	_ = x[noState-0]
+	_ = x[errorState-1]
+	_ = x[newState-2]
+	_ = x[readState-3]
+	_ = x[writeState-4]
+	_ = x[closedState-5]
+}
+
+const _aState_name = "noStateerrorStatenewStatereadStatewriteStateclosedState"
+
+var _aState_index = [...]uint8{0, 7, 17, 25, 34, 44, 55}
+
+func (i aState) String() string {
+	if i >= aState(len(_aState_index)-1) {
+		return "aState(" + strconv.FormatInt(int64(i), 10) + ")"
+	}
+	return _aState_name[_aState_index[i]:_aState_index[i+1]]
+}
diff --git a/vendor/github.com/pierrec/lz4/v4/writer.go b/vendor/github.com/pierrec/lz4/v4/writer.go
new file mode 100644
index 00000000000..44a43d251b0
--- /dev/null
+++ b/vendor/github.com/pierrec/lz4/v4/writer.go
@@ -0,0 +1,233 @@
+package lz4
+
+import (
+	"io"
+
+	"github.com/pierrec/lz4/v4/internal/lz4block"
+	"github.com/pierrec/lz4/v4/internal/lz4errors"
+	"github.com/pierrec/lz4/v4/internal/lz4stream"
+)
+
+var writerStates = []aState{
+	noState:     newState,
+	newState:    writeState,
+	writeState:  closedState,
+	closedState: newState,
+	errorState:  newState,
+}
+
+// NewWriter returns a new LZ4 frame encoder.
+func NewWriter(w io.Writer) *Writer {
+	zw := &Writer{frame: lz4stream.NewFrame()}
+	zw.state.init(writerStates)
+	_ = zw.Apply(DefaultBlockSizeOption, DefaultChecksumOption, DefaultConcurrency, defaultOnBlockDone)
+	zw.Reset(w)
+	return zw
+}
+
+// Writer allows writing an LZ4 stream.
+type Writer struct {
+	state   _State
+	src     io.Writer                 // destination writer
+	level   lz4block.CompressionLevel // how hard to try
+	num     int                       // concurrency level
+	frame   *lz4stream.Frame          // frame being built
+	data    []byte                    // pending data
+	idx     int                       // size of pending data
+	handler func(int)
+	legacy  bool
+}
+
+func (*Writer) private() {}
+
+func (w *Writer) Apply(options ...Option) (err error) {
+	defer w.state.check(&err)
+	switch w.state.state {
+	case newState:
+	case errorState:
+		return w.state.err
+	default:
+		return lz4errors.ErrOptionClosedOrError
+	}
+	for _, o := range options {
+		if err = o(w); err != nil {
+			return
+		}
+	}
+	w.Reset(w.src)
+	return
+}
+
+func (w *Writer) isNotConcurrent() bool {
+	return w.num == 1
+}
+
+// init sets up the Writer when in newState. It does not change the Writer state.
+func (w *Writer) init() error {
+	w.frame.InitW(w.src, w.num, w.legacy)
+	if true || !w.isNotConcurrent() {
+		size := w.frame.Descriptor.Flags.BlockSizeIndex()
+		w.data = size.Get()
+	}
+	w.idx = 0
+	return w.frame.Descriptor.Write(w.frame, w.src)
+}
+
+func (w *Writer) Write(buf []byte) (n int, err error) {
+	defer w.state.check(&err)
+	switch w.state.state {
+	case writeState:
+	case closedState, errorState:
+		return 0, w.state.err
+	case newState:
+		if err = w.init(); w.state.next(err) {
+			return
+		}
+	default:
+		return 0, w.state.fail()
+	}
+
+	zn := len(w.data)
+	for len(buf) > 0 {
+		if w.idx == 0 && len(buf) >= zn {
+			// Avoid a copy as there is enough data for a block.
+			if err = w.write(buf[:zn], false); err != nil {
+				return
+			}
+			n += zn
+			buf = buf[zn:]
+			continue
+		}
+		// Accumulate the data to be compressed.
+		m := copy(w.data[w.idx:], buf)
+		n += m
+		w.idx += m
+		buf = buf[m:]
+
+		if w.idx < len(w.data) {
+			// Buffer not filled.
+			return
+		}
+
+		// Buffer full.
+		if err = w.write(w.data, true); err != nil {
+			return
+		}
+		if !w.isNotConcurrent() {
+			size := w.frame.Descriptor.Flags.BlockSizeIndex()
+			w.data = size.Get()
+		}
+		w.idx = 0
+	}
+	return
+}
+
+func (w *Writer) write(data []byte, safe bool) error {
+	if w.isNotConcurrent() {
+		block := w.frame.Blocks.Block
+		err := block.Compress(w.frame, data, w.level).Write(w.frame, w.src)
+		w.handler(len(block.Data))
+		return err
+	}
+	c := make(chan *lz4stream.FrameDataBlock)
+	w.frame.Blocks.Blocks <- c
+	go func(c chan *lz4stream.FrameDataBlock, data []byte, safe bool) {
+		b := lz4stream.NewFrameDataBlock(w.frame)
+		c <- b.Compress(w.frame, data, w.level)
+		<-c
+		w.handler(len(b.Data))
+		b.Close(w.frame)
+		if safe {
+			// safe to put it back as the last usage of it was FrameDataBlock.Write() called before c is closed
+			lz4block.Put(data)
+		}
+	}(c, data, safe)
+
+	return nil
+}
+
+// Close closes the Writer, flushing any unwritten data to the underlying io.Writer,
+// but does not close the underlying io.Writer.
+func (w *Writer) Close() (err error) {
+	switch w.state.state {
+	case writeState:
+	case errorState:
+		return w.state.err
+	default:
+		return nil
+	}
+	defer w.state.nextd(&err)
+	if w.idx > 0 {
+		// Flush pending data, disable w.data freeing as it is done later on.
+		if err = w.write(w.data[:w.idx], false); err != nil {
+			return err
+		}
+		w.idx = 0
+	}
+	err = w.frame.CloseW(w.src, w.num)
+	// It is now safe to free the buffer.
+	if w.data != nil {
+		lz4block.Put(w.data)
+		w.data = nil
+	}
+	return
+}
+
+// Reset clears the state of the Writer w such that it is equivalent to its
+// initial state from NewWriter, but instead writing to writer.
+// Reset keeps the previous options unless overwritten by the supplied ones.
+// No access to writer is performed.
+//
+// w.Close must be called before Reset or pending data may be dropped.
+func (w *Writer) Reset(writer io.Writer) {
+	w.frame.Reset(w.num)
+	w.state.reset()
+	w.src = writer
+}
+
+// ReadFrom efficiently reads from r and compresses the data into the Writer destination.
+func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) {
+	switch w.state.state {
+	case closedState, errorState:
+		return 0, w.state.err
+	case newState:
+		if err = w.init(); w.state.next(err) {
+			return
+		}
+	default:
+		return 0, w.state.fail()
+	}
+	defer w.state.check(&err)
+
+	size := w.frame.Descriptor.Flags.BlockSizeIndex()
+	var done bool
+	var rn int
+	data := size.Get()
+	if w.isNotConcurrent() {
+		// Keep the same buffer for the whole process.
+		defer lz4block.Put(data)
+	}
+	for !done {
+		rn, err = io.ReadFull(r, data)
+		switch err {
+		case nil:
+		case io.EOF, io.ErrUnexpectedEOF: // read may be partial
+			done = true
+		default:
+			return
+		}
+		n += int64(rn)
+		err = w.write(data[:rn], true)
+		if err != nil {
+			return
+		}
+		w.handler(rn)
+		if !done && !w.isNotConcurrent() {
+			// The buffer will be returned automatically by goroutines (safe=true)
+			// so get a new one for the next round.
+			data = size.Get()
+		}
+	}
+	err = w.Close()
+	return
+}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index b68212c008f..050a1634946 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -397,6 +397,7 @@ github.com/golang/protobuf/ptypes/struct
 github.com/golang/protobuf/ptypes/timestamp
 github.com/golang/protobuf/ptypes/wrappers
 # github.com/golang/snappy v0.0.2
+## explicit
 github.com/golang/snappy
 # github.com/golangci/check v0.0.0-20180506172741-cfe4005ccda2
 github.com/golangci/check/cmd/structcheck
@@ -645,6 +646,15 @@ github.com/julienschmidt/httprouter
 # github.com/kisielk/gotool v1.0.0
 github.com/kisielk/gotool
 github.com/kisielk/gotool/internal/load
+# github.com/klauspost/compress v1.10.5
+## explicit
+github.com/klauspost/compress/flate
+github.com/klauspost/compress/fse
+github.com/klauspost/compress/gzip
+github.com/klauspost/compress/huff0
+github.com/klauspost/compress/snappy
+github.com/klauspost/compress/zstd
+github.com/klauspost/compress/zstd/internal/xxhash
 # github.com/klauspost/cpuid v1.3.1
 github.com/klauspost/cpuid
 # github.com/konsorten/go-windows-terminal-sequences v1.0.3
@@ -770,6 +780,13 @@ github.com/pborman/uuid
 github.com/pelletier/go-toml
 # github.com/phayes/checkstyle v0.0.0-20170904204023-bfd46e6a821d
 github.com/phayes/checkstyle
+# github.com/pierrec/lz4/v4 v4.1.3
+## explicit
+github.com/pierrec/lz4/v4
+github.com/pierrec/lz4/v4/internal/lz4block
+github.com/pierrec/lz4/v4/internal/lz4errors
+github.com/pierrec/lz4/v4/internal/lz4stream
+github.com/pierrec/lz4/v4/internal/xxh32
 # github.com/pkg/errors v0.9.1
 ## explicit
 github.com/pkg/errors