Merge branch 'master' into leaner-interface-pt2

# Conflicts: # compressible_test.go
klauspost · Jun 3, 2019 · a76fe6f · a76fe6f
2 parents 385a4d9 + 8538a23
commit a76fe6f
Show file tree

Hide file tree

Showing 286 changed files with 8,164 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -14,6 +14,7 @@ It offers slightly better compression at lower compression settings, and up to 3
 
 # changelog
 
+* June 2, 2019: Added [zstandard](https://github.com/klauspost/compress/tree/master/zstd#zstd) compression!
 * May 25, 2019: deflate/gzip: 10% faster bit writer, mostly visible in lower levels.
 * Apr 22, 2019: [zstd](https://github.com/klauspost/compress/tree/master/zstd#zstd) decompression added.
 * Aug 1, 2018: Added [huff0 README](https://github.com/klauspost/compress/tree/master/huff0#huff0-entropy-compression).

diff --git a/compressible.go b/compressible.go
@@ -61,3 +61,22 @@ func Estimate(b []byte) float64 {
 	// 50/50 weight between prediction and histogram distribution
 	return math.Pow((prediction+entropy)/2, 0.9)
 }
+
+// SnannonEntropyBits returns the minimum number of bits required to represent
+// an entropy encoding of the input bytes.
+// https://en.wiktionary.org/wiki/Shannon_entropy
+func SnannonEntropyBits(b []byte) int {
+	var hist [256]int
+	for _, c := range b {
+		hist[c]++
+	}
+	shannon := float64(0)
+	total := float64(len(b))
+	for i := range hist[:] {
+		n := float64(hist[i])
+		if n > 0 {
+			shannon += math.Log2(total/n) * n
+		}
+	}
+	return int(math.Ceil(shannon))
+}
diff --git a/compressible_test.go b/compressible_test.go
@@ -126,6 +126,120 @@ Thoughts?`)
 	})
 }
 
+func BenchmarkSnannonEntropyBits(b *testing.B) {
+	b.ReportAllocs()
+	// (predictable, low entropy distribution)
+	b.Run("zeroes-5k", func(b *testing.B) {
+		var testData = make([]byte, 5000)
+		b.SetBytes(int64(len(testData)))
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			SnannonEntropyBits(testData)
+		}
+		b.Log(SnannonEntropyBits(testData))
+	})
+
+	// (predictable, high entropy distribution)
+	b.Run("predictable-5k", func(b *testing.B) {
+		var testData = make([]byte, 5000)
+		for i := range testData {
+			testData[i] = byte(float64(i) / float64(len(testData)) * 256)
+		}
+		b.SetBytes(int64(len(testData)))
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			SnannonEntropyBits(testData)
+		}
+		b.Log(SnannonEntropyBits(testData))
+	})
+
+	// (not predictable, high entropy distribution)
+	b.Run("random-500b", func(b *testing.B) {
+		var testData = make([]byte, 500)
+		rand.Read(testData)
+		b.SetBytes(int64(len(testData)))
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			SnannonEntropyBits(testData)
+		}
+		b.Log(SnannonEntropyBits(testData))
+	})
+
+	// (not predictable, high entropy distribution)
+	b.Run("random-5k", func(b *testing.B) {
+		var testData = make([]byte, 5000)
+		rand.Read(testData)
+		b.SetBytes(int64(len(testData)))
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			SnannonEntropyBits(testData)
+		}
+		b.Log(SnannonEntropyBits(testData))
+	})
+
+	// (not predictable, high entropy distribution)
+	b.Run("random-50k", func(b *testing.B) {
+		var testData = make([]byte, 50000)
+		rand.Read(testData)
+		b.SetBytes(int64(len(testData)))
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			SnannonEntropyBits(testData)
+		}
+		b.Log(SnannonEntropyBits(testData))
+	})
+
+	// (not predictable, high entropy distribution)
+	b.Run("random-500k", func(b *testing.B) {
+		var testData = make([]byte, 500000)
+		rand.Read(testData)
+		b.SetBytes(int64(len(testData)))
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			SnannonEntropyBits(testData)
+		}
+		b.Log(SnannonEntropyBits(testData))
+	})
+
+	// (not predictable, medium entropy distribution)
+	b.Run("base-32-5k", func(b *testing.B) {
+		var testData = make([]byte, 5000)
+		rand.Read(testData)
+		s := base32.StdEncoding.EncodeToString(testData)
+		testData = []byte(s)
+		testData = testData[:5000]
+		b.SetBytes(int64(len(testData)))
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			SnannonEntropyBits(testData)
+		}
+		b.Log(SnannonEntropyBits(testData))
+	})
+	// (medium predictable, medium entropy distribution)
+	b.Run("text", func(b *testing.B) {
+		var testData = []byte(`If compression is done per-chunk, care should be taken that it doesn't leave restic backups open to watermarking/fingerprinting attacks.
+This is essentially the same problem we discussed related to fingerprinting the CDC deduplication process:
+With "naive" CDC, a "known plaintext" file can be verified to exist within the backup if the size of individual blocks can be observed by an attacker, by using CDC on the file in parallel and comparing the resulting amount of chunks and individual chunk lengths.
+As discussed earlier, this can be somewhat mitigated by salting the CDC algorithm with a secret value, as done in attic.
+With salted CDC, I assume compression would happen on each individual chunk, after splitting the problematic file into chunks. Restic chunks are in the range of 512 KB to 8 MB (but not evenly distributed - right?).
+Attacker knows that the CDC algorithm uses a secret salt, so the attacker generates a range of chunks consisting of the first 512 KB to 8 MB of the file, one for each valid chunk length. The attacker is also able to determine the lengths of compressed chunks.
+The attacker then compresses that chunk using the compression algorithm.
+The attacker compares the lengths of the resulting chunks to the first chunk in the restic backup sets.
+IF a matching block length is found, the attacker repeats the exercise with the next chunk, and the next chunk, and the next chunk, ... and the next chunk.
+It is my belief that with sufficiently large files, and considering the fact that the CDC algorithm is "biased" (in lack of better of words) towards generating blocks of about 1 MB, this would be sufficient to ascertain whether or not a certain large file exists in the backup.
+AS always, a paranoid and highly unscientific stream of consciousness.
+Thoughts?`)
+		testData = append(testData, testData...)
+		testData = append(testData, testData...)
+		b.SetBytes(int64(len(testData)))
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			SnannonEntropyBits(testData)
+		}
+		b.Log(SnannonEntropyBits(testData))
+	})
+}
+
 func BenchmarkCompressAllocations(b *testing.B) {
 	payload := []byte(strings.Repeat("Tiny payload", 20))
 	for j := -2; j <= 9; j++ {

diff --git a/fse/decompress.go b/fse/decompress.go
@@ -134,8 +134,8 @@ func (s *Scratch) readNCount() error {
 			b.advance(bitCount >> 3)
 			bitCount &= 7
 		} else {
-			bitCount -= (uint)(8 * (iend - 4 - b.off))
-			b.off = iend - 4
+			bitCount -= (uint)(8 * (len(b.b) - 4 - b.off))
+			b.off = len(b.b) - 4
 		}
 		bitStream = b.Uint32() >> (bitCount & 31)
 	}

diff --git a/huff0/compress.go b/huff0/compress.go
@@ -61,6 +61,9 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
 		if maxCount > len(in) {
 			return nil, false, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
 		}
+		if len(in) == 1 {
+			return nil, false, ErrIncompressible
+		}
 		// One symbol, use RLE
 		return nil, false, ErrUseRLE
 	}
@@ -112,7 +115,11 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
 	}
 
 	// Use new table
-	s.cTable.write(s)
+	err = s.cTable.write(s)
+	if err != nil {
+		s.OutTable = nil
+		return nil, false, err
+	}
 	s.OutTable = s.Out
 
 	// Compress using new table
@@ -347,13 +354,16 @@ type cTableEntry struct {
 const huffNodesMask = huffNodesLen - 1
 
 func (s *Scratch) buildCTable() error {
+	s.huffSort()
 	if cap(s.cTable) < maxSymbolValue+1 {
-		s.cTable = make([]cTableEntry, 0, maxSymbolValue+1)
+		s.cTable = make([]cTableEntry, s.symbolLen, maxSymbolValue+1)
+	} else {
+		s.cTable = s.cTable[:s.symbolLen]
+		for i := range s.cTable {
+			s.cTable[i] = cTableEntry{}
+		}
 	}
 
-	s.huffSort()
-	s.cTable = s.cTable[:s.symbolLen]
-
 	var startNode = int16(s.symbolLen)
 	nonNullRank := s.symbolLen - 1
 

diff --git a/huff0/compress_test.go b/huff0/compress_test.go
@@ -1,7 +1,9 @@
 package huff0
 
 import (
+	"fmt"
 	"io/ioutil"
+	"math/rand"
 	"os"
 	"path/filepath"
 	"strings"
@@ -152,6 +154,70 @@ func TestCompress4X(t *testing.T) {
 	}
 }
 
+func TestCompress4XReuse(t *testing.T) {
+	rng := rand.NewSource(0x1337)
+	var s Scratch
+	s.Reuse = ReusePolicyAllow
+	for i := 0; i < 255; i++ {
+		t.Run(fmt.Sprint("test-", i), func(t *testing.T) {
+			buf0 := make([]byte, BlockSizeMax)
+			for j := range buf0 {
+				buf0[j] = byte(int64(i) + (rng.Int63() & 3))
+			}
+
+			b, re, err := Compress4X(buf0, &s)
+			if err != nil {
+				t.Fatal(err)
+			}
+			if b == nil {
+				t.Error("got no output")
+				return
+			}
+			if len(s.OutData) == 0 {
+				t.Error("got no data output")
+			}
+			if re {
+				t.Error("claimed to have re-used. Unlikely.")
+			}
+
+			t.Logf("%s: %d -> %d bytes (%.2f:1) %t (table: %d bytes)", t.Name(), len(buf0), len(b), float64(len(buf0))/float64(len(b)), re, len(s.OutTable))
+		})
+	}
+}
+
+func TestCompress4XReuseActually(t *testing.T) {
+	rng := rand.NewSource(0x1337)
+	var s Scratch
+	s.Reuse = ReusePolicyAllow
+	for i := 0; i < 255; i++ {
+		t.Run(fmt.Sprint("test-", i), func(t *testing.T) {
+			buf0 := make([]byte, BlockSizeMax)
+			for j := range buf0 {
+				buf0[j] = byte(rng.Int63() & 7)
+			}
+
+			b, re, err := Compress4X(buf0, &s)
+			if err != nil {
+				t.Fatal(err)
+			}
+			if b == nil {
+				t.Error("got no output")
+				return
+			}
+			if len(s.OutData) == 0 {
+				t.Error("got no data output")
+			}
+			if re && i == 0 {
+				t.Error("Claimed to have re-used on first loop.")
+			}
+			if !re && i > 0 {
+				t.Error("Expected table to be reused")
+			}
+
+			t.Logf("%s: %d -> %d bytes (%.2f:1) %t (table: %d bytes)", t.Name(), len(buf0), len(b), float64(len(buf0))/float64(len(b)), re, len(s.OutTable))
+		})
+	}
+}
 func TestCompress1XReuse(t *testing.T) {
 	for _, test := range testfiles {
 		t.Run(test.name, func(t *testing.T) {

diff --git a/huff0/huff0.go b/huff0/huff0.go
@@ -23,7 +23,7 @@ const (
 	huffNodesLen    = 512
 
 	// BlockSizeMax is maximum input size for a single block uncompressed.
-	BlockSizeMax = 128 << 10
+	BlockSizeMax = 1<<18 - 1
 )
 
 var (
@@ -194,6 +194,7 @@ func (c cTable) write(s *Scratch) error {
 			s.Out = append(s.Out, b...)
 			return nil
 		}
+		// Unable to compress (RLE/uncompressible)
 	}
 	// write raw values as 4-bits (max : 15)
 	if maxSymbolValue > (256 - 128) {