From 5e801bfbf1025b2fbb0b3c2af9ff3c12a3677610 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 17 Jun 2019 19:25:02 +0200 Subject: [PATCH] Fix up ShannonEntropyBits (#127) Breaking, but typo too embarrassing to keep ;) --- compressible.go | 17 ++++++++++------- compressible_test.go | 32 ++++++++++++++++---------------- 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/compressible.go b/compressible.go index d1def0d2e3..ea5a692d51 100644 --- a/compressible.go +++ b/compressible.go @@ -62,20 +62,23 @@ func Estimate(b []byte) float64 { return math.Pow((prediction+entropy)/2, 0.9) } -// SnannonEntropyBits returns the number of bits minimum required to represent +// ShannonEntropyBits returns the number of bits minimum required to represent // an entropy encoding of the input bytes. // https://en.wiktionary.org/wiki/Shannon_entropy -func SnannonEntropyBits(b []byte) int { +func ShannonEntropyBits(b []byte) int { + if len(b) == 0 { + return 0 + } var hist [256]int for _, c := range b { hist[c]++ } shannon := float64(0) - total := float64(len(b)) - for i := range hist[:] { - n := float64(hist[i]) - if n > 0 { - shannon += math.Log2(total/n) * n + invTotal := 1.0 / float64(len(b)) + for _, v := range hist[:] { + if v > 0 { + n := float64(v) + shannon += math.Ceil(-math.Log2(n*invTotal) * n) } } return int(math.Ceil(shannon)) diff --git a/compressible_test.go b/compressible_test.go index 7b8b32dfa6..09eb50b95b 100644 --- a/compressible_test.go +++ b/compressible_test.go @@ -134,9 +134,9 @@ func BenchmarkSnannonEntropyBits(b *testing.B) { b.SetBytes(int64(len(testData))) b.ResetTimer() for i := 0; i < b.N; i++ { - SnannonEntropyBits(testData) + ShannonEntropyBits(testData) } - b.Log(SnannonEntropyBits(testData)) + b.Log(ShannonEntropyBits(testData)) }) // (predictable, high entropy distibution) @@ -148,9 +148,9 @@ func BenchmarkSnannonEntropyBits(b *testing.B) { b.SetBytes(int64(len(testData))) b.ResetTimer() for i := 0; i < b.N; i++ { - 
SnannonEntropyBits(testData) + ShannonEntropyBits(testData) } - b.Log(SnannonEntropyBits(testData)) + b.Log(ShannonEntropyBits(testData)) }) // (not predictable, high entropy distibution) @@ -160,9 +160,9 @@ func BenchmarkSnannonEntropyBits(b *testing.B) { b.SetBytes(int64(len(testData))) b.ResetTimer() for i := 0; i < b.N; i++ { - SnannonEntropyBits(testData) + ShannonEntropyBits(testData) } - b.Log(SnannonEntropyBits(testData)) + b.Log(ShannonEntropyBits(testData)) }) // (not predictable, high entropy distibution) @@ -172,9 +172,9 @@ func BenchmarkSnannonEntropyBits(b *testing.B) { b.SetBytes(int64(len(testData))) b.ResetTimer() for i := 0; i < b.N; i++ { - SnannonEntropyBits(testData) + ShannonEntropyBits(testData) } - b.Log(SnannonEntropyBits(testData)) + b.Log(ShannonEntropyBits(testData)) }) // (not predictable, high entropy distibution) @@ -184,9 +184,9 @@ func BenchmarkSnannonEntropyBits(b *testing.B) { b.SetBytes(int64(len(testData))) b.ResetTimer() for i := 0; i < b.N; i++ { - SnannonEntropyBits(testData) + ShannonEntropyBits(testData) } - b.Log(SnannonEntropyBits(testData)) + b.Log(ShannonEntropyBits(testData)) }) // (not predictable, high entropy distibution) @@ -196,9 +196,9 @@ func BenchmarkSnannonEntropyBits(b *testing.B) { b.SetBytes(int64(len(testData))) b.ResetTimer() for i := 0; i < b.N; i++ { - SnannonEntropyBits(testData) + ShannonEntropyBits(testData) } - b.Log(SnannonEntropyBits(testData)) + b.Log(ShannonEntropyBits(testData)) }) // (not predictable, medium entropy distibution) @@ -211,9 +211,9 @@ func BenchmarkSnannonEntropyBits(b *testing.B) { b.SetBytes(int64(len(testData))) b.ResetTimer() for i := 0; i < b.N; i++ { - SnannonEntropyBits(testData) + ShannonEntropyBits(testData) } - b.Log(SnannonEntropyBits(testData)) + b.Log(ShannonEntropyBits(testData)) }) // (medium predictable, medium entropy distibution) b.Run("text", func(b *testing.B) { @@ -234,9 +234,9 @@ Thoughts?`) b.SetBytes(int64(len(testData))) b.ResetTimer() for i := 0; i < 
b.N; i++ { - SnannonEntropyBits(testData) + ShannonEntropyBits(testData) } - b.Log(SnannonEntropyBits(testData)) + b.Log(ShannonEntropyBits(testData)) }) }