diff --git a/README.md b/README.md
index 63937b2..1917fe3 100644
--- a/README.md
+++ b/README.md
@@ -4,15 +4,37 @@ murmur3
 Native Go implementation of Austin Appleby's third MurmurHash revision (aka
 MurmurHash3).
 
-Includes assembly for amd64 for go1.5+ for 128 bit hashes, seeding function,
+Includes assembly for amd64 for 64/128 bit hashes, seeding functions,
 and string functions to avoid string to slice conversions.
 
-Hand rolled 32 bit assembly was removed during 1.11 due to Go's compiler
-catching up and generating equal or better assembly.
+Hand rolled 32 bit assembly was removed during 1.11, but may be reintroduced
+if the compiler slows down any more. As is, the compiler generates marginally
+slower code (by one instruction in the hot loop).
 
 The reference algorithm has been slightly hacked as to support the streaming
 mode required by Go's standard [Hash interface](http://golang.org/pkg/hash/#Hash).
 
+Endianness
+==========
+
+Unlike the canonical source, this library **always** reads bytes as little
+endian numbers. This makes the hashes portable across architectures, although
+it does mean that hashing is a bit slower on big endian architectures.
+
+Safety
+======
+
+This library used to use `unsafe` to convert four bytes to a `uint32` and eight
+bytes to a `uint64`, but Go 1.14 introduced checks around those types of
+conversions that flagged that code as erroneous when hashing unaligned input.
+While the code was not problematic on amd64, it could be on some
+architectures.
+
+As of Go 1.14, those conversions were removed at the expense of a very minor
+performance hit. This hit affects all CPU architectures for `Sum32`, and
+non-amd64 architectures for `Sum64` and `Sum128`. For the 64 and 128 bit
+sums, custom amd64 assembly preserves the prior performance.
+
 Testing
 =======
 
@@ -22,6 +44,11 @@
 Testing includes comparing random inputs against the
 [canonical implementation](https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp),
 and testing length 0 through 17 inputs to force all branches.
 
+Because this code always reads input as little endian, testing against the
+canonical source is skipped on big endian architectures. The canonical source
+reinterprets bytes as native-endian numbers, meaning that on big endian
+architectures it hashes different numbers and thus returns different sums.
+
 Documentation
 =============
 
@@ -32,53 +59,71 @@
 Full documentation can be found on `godoc`.
 
 Benchmarks
 ==========
 
-The following benchmarks show deltas for the 128 bit algorithms only; the 32
-bit algorithms have the same implementation.
+Benchmarks below were run on an amd64 machine with _and_ without the custom
+assembly (the "non-amd64" numbers use the generic Go code). The following
+numbers are for Go 1.14.1 and compare against
+[spaolacci/murmur3](https://github.com/spaolacci/murmur3).
+
+You will notice that at small sizes, the other library is faster. This is due
+to this library switching to safe code for Go 1.14. At large sizes, this
+library is nearly identical to the other. On amd64, the 64 bit and 128 bit
+sums come out ~9% faster.
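+
+The numbers below are in `benchstat` form, and benchmark names like
+`32Sizes/8192-12` come from per-size sub-benchmarks. As an illustrative
+sketch (not necessarily this repo's exact benchmark code), such a benchmark
+has the following shape:
+
+```go
+func Benchmark32Sizes(b *testing.B) {
+	// One sub-benchmark per input size; b.SetBytes makes the tooling
+	// report throughput in GB/s, as in the tables below.
+	for _, size := range []int{32, 64, 128, 256, 512, 1024, 2048, 4096, 8192} {
+		buf := make([]byte, size)
+		b.Run(strconv.Itoa(size), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			for i := 0; i < b.N; i++ {
+				Sum32(buf)
+			}
+		})
+	}
+}
+```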
-In comparison to [spaolacci/murmur3](https://github.com/spaolacci/murmur3) on
-Go at commit [447965d4e0](https://github.com/golang/go/commit/447965d4e0)
-(i.e., post 1.11):
+32 bit sums:
 
 ```
-benchmark                   old ns/op  new ns/op  delta
-Benchmark128Branches/0-4    22.2       6.28       -71.71%
-Benchmark128Branches/1-4    23.6       8.46       -64.15%
-Benchmark128Branches/2-4    24.3       8.68       -64.28%
-Benchmark128Branches/3-4    24.7       9.07       -63.28%
-Benchmark128Branches/4-4    25.2       8.16       -67.62%
-Benchmark128Branches/5-4    25.9       8.89       -65.68%
-Benchmark128Branches/6-4    26.8       9.32       -65.22%
-Benchmark128Branches/7-4    27.4       9.82       -64.16%
-Benchmark128Branches/8-4    28.1       7.68       -72.67%
-Benchmark128Branches/9-4    29.6       9.04       -69.46%
-Benchmark128Branches/10-4   30.2       9.14       -69.74%
-Benchmark128Branches/11-4   30.8       9.53       -69.06%
-Benchmark128Branches/12-4   31.5       8.65       -72.54%
-Benchmark128Branches/13-4   31.5       9.26       -70.60%
-Benchmark128Branches/14-4   32.5       9.69       -70.18%
-Benchmark128Branches/15-4   33.4       10.1       -69.76%
-Benchmark128Branches/16-4   24.9       10.0       -59.84%
-Benchmark64Sizes/32-4       27.8       13.6       -51.08%
-Benchmark64Sizes/64-4       35.2       18.8       -46.59%
-Benchmark64Sizes/128-4      49.6       30.5       -38.51%
-Benchmark64Sizes/256-4      77.9       54.5       -30.04%
-Benchmark64Sizes/512-4      136        105        -22.79%
-Benchmark64Sizes/1024-4     251        209        -16.73%
-Benchmark64Sizes/2048-4     492        419        -14.84%
-Benchmark64Sizes/4096-4     952        832        -12.61%
-Benchmark64Sizes/8192-4     1879       1658       -11.76%
-Benchmark128Sizes/32-4      28.5       13.6       -52.28%
-Benchmark128Sizes/64-4      35.7       18.7       -47.62%
-Benchmark128Sizes/128-4     49.8       30.3       -39.16%
-Benchmark128Sizes/256-4     78.0       54.2       -30.51%
-Benchmark128Sizes/512-4     135        105        -22.22%
-Benchmark128Sizes/1024-4    250        209        -16.40%
-Benchmark128Sizes/2048-4    489        419        -14.31%
-Benchmark128Sizes/4096-4    959        831        -13.35%
-Benchmark128Sizes/8192-4    1885       1659       -11.99%
-BenchmarkNoescape128-4      3226       1824       -43.46%
+32Sizes/32-12    3.00GB/s ± 1%  2.12GB/s ±11%  -29.24%  (p=0.000 n=9+10)
+32Sizes/64-12    3.61GB/s ± 3%  2.79GB/s ± 8%  -22.62%  (p=0.000 n=10+10)
+32Sizes/128-12   3.47GB/s ± 8%  2.79GB/s ± 4%  -19.47%  (p=0.000 n=10+10)
+32Sizes/256-12   3.66GB/s ± 4%  3.25GB/s ± 6%  -11.09%  (p=0.000 n=10+10)
+32Sizes/512-12   3.78GB/s ± 3%  3.54GB/s ± 4%   -6.30%  (p=0.000 n=9+9)
+32Sizes/1024-12  3.86GB/s ± 3%  3.69GB/s ± 5%   -4.46%  (p=0.000 n=10+10)
+32Sizes/2048-12  3.85GB/s ± 3%  3.81GB/s ± 3%      ~    (p=0.079 n=10+9)
+32Sizes/4096-12  3.90GB/s ± 3%  3.82GB/s ± 2%   -2.14%  (p=0.029 n=10+10)
+32Sizes/8192-12  3.82GB/s ± 3%  3.78GB/s ± 7%      ~    (p=0.529 n=10+10)
 ```
-
-The speedup for large inputs levels out around ~1.12x. Additionally,
-this code avoids allocating stack slices unnecessarily for the 128
-algorithm, unlike `spaolacci/murmur3`.
+64/128 bit sums, non-amd64:
+
+```
+64Sizes/32-12     2.34GB/s ± 5%  2.64GB/s ± 9%  +12.87%  (p=0.000 n=10+10)
+64Sizes/64-12     3.62GB/s ± 5%  3.96GB/s ± 4%   +9.41%  (p=0.000 n=10+10)
+64Sizes/128-12    5.12GB/s ± 3%  5.44GB/s ± 4%   +6.09%  (p=0.000 n=10+9)
+64Sizes/256-12    6.35GB/s ± 2%  6.27GB/s ± 9%      ~    (p=0.796 n=10+10)
+64Sizes/512-12    6.58GB/s ± 7%  6.79GB/s ± 3%      ~    (p=0.075 n=10+10)
+64Sizes/1024-12   7.49GB/s ± 3%  7.55GB/s ± 9%      ~    (p=0.393 n=10+10)
+64Sizes/2048-12   8.06GB/s ± 2%  7.90GB/s ± 6%      ~    (p=0.156 n=9+10)
+64Sizes/4096-12   8.27GB/s ± 6%  8.22GB/s ± 5%      ~    (p=0.631 n=10+10)
+64Sizes/8192-12   8.35GB/s ± 4%  8.38GB/s ± 6%      ~    (p=0.631 n=10+10)
+128Sizes/32-12    2.27GB/s ± 2%  2.68GB/s ± 5%  +18.00%  (p=0.000 n=10+10)
+128Sizes/64-12    3.55GB/s ± 2%  4.00GB/s ± 3%  +12.47%  (p=0.000 n=8+9)
+128Sizes/128-12   5.09GB/s ± 1%  5.43GB/s ± 3%   +6.65%  (p=0.000 n=9+9)
+128Sizes/256-12   6.33GB/s ± 3%  5.65GB/s ± 4%  -10.79%  (p=0.000 n=9+10)
+128Sizes/512-12   6.78GB/s ± 3%  6.74GB/s ± 6%      ~    (p=0.968 n=9+10)
+128Sizes/1024-12  7.46GB/s ± 4%  7.56GB/s ± 4%      ~    (p=0.222 n=9+9)
+128Sizes/2048-12  7.99GB/s ± 4%  7.96GB/s ± 3%      ~    (p=0.666 n=9+9)
+128Sizes/4096-12  8.20GB/s ± 2%  8.25GB/s ± 4%      ~    (p=0.631 n=10+10)
+128Sizes/8192-12  8.24GB/s ± 2%  8.26GB/s ± 5%      ~    (p=0.673 n=8+9)
+```
+
+64/128 bit sums, amd64:
+
+```
+64Sizes/32-12     2.34GB/s ± 5%  4.36GB/s ± 3%  +85.86%  (p=0.000 n=10+10)
+64Sizes/64-12     3.62GB/s ± 5%  6.27GB/s ± 3%  +73.37%  (p=0.000 n=10+9)
+64Sizes/128-12    5.12GB/s ± 3%  7.70GB/s ± 6%  +50.27%  (p=0.000 n=10+10)
+64Sizes/256-12    6.35GB/s ± 2%  8.61GB/s ± 3%  +35.50%  (p=0.000 n=10+10)
+64Sizes/512-12    6.58GB/s ± 7%  8.59GB/s ± 4%  +30.48%  (p=0.000 n=10+9)
+64Sizes/1024-12   7.49GB/s ± 3%  8.81GB/s ± 2%  +17.66%  (p=0.000 n=10+10)
+64Sizes/2048-12   8.06GB/s ± 2%  8.90GB/s ± 4%  +10.49%  (p=0.000 n=9+10)
+64Sizes/4096-12   8.27GB/s ± 6%  8.90GB/s ± 4%   +7.54%  (p=0.000 n=10+10)
+64Sizes/8192-12   8.35GB/s ± 4%  9.00GB/s ± 3%   +7.80%  (p=0.000 n=10+9)
+128Sizes/32-12    2.27GB/s ± 2%  4.29GB/s ± 9%  +88.75%  (p=0.000 n=10+10)
+128Sizes/64-12    3.55GB/s ± 2%  6.10GB/s ± 8%  +71.78%  (p=0.000 n=8+10)
+128Sizes/128-12   5.09GB/s ± 1%  7.62GB/s ± 9%  +49.63%  (p=0.000 n=9+10)
+128Sizes/256-12   6.33GB/s ± 3%  8.65GB/s ± 3%  +36.71%  (p=0.000 n=9+10)
+128Sizes/512-12   6.78GB/s ± 3%  8.39GB/s ± 6%  +23.77%  (p=0.000 n=9+10)
+128Sizes/1024-12  7.46GB/s ± 4%  8.70GB/s ± 4%  +16.70%  (p=0.000 n=9+10)
+128Sizes/2048-12  7.99GB/s ± 4%  8.73GB/s ± 8%   +9.26%  (p=0.003 n=9+10)
+128Sizes/4096-12  8.20GB/s ± 2%  8.86GB/s ± 6%   +8.00%  (p=0.000 n=10+10)
+128Sizes/8192-12  8.24GB/s ± 2%  9.01GB/s ± 3%   +9.30%  (p=0.000 n=8+10)
+```
diff --git a/murmur.go b/murmur.go
index 84f9057..20f8ac6 100644
--- a/murmur.go
+++ b/murmur.go
@@ -9,6 +9,11 @@
 // architectures.
 package murmur3
 
+import (
+	"reflect"
+	"unsafe"
+)
+
 type bmixer interface {
 	bmix(p []byte) (tail []byte)
 	Size() (n int)
@@ -56,3 +61,12 @@ func (d *digest) Reset() {
 	d.tail = nil
 	d.bmixer.reset()
 }
+
+// strslice returns a string whose data pointer and length are taken from
+// the input slice, avoiding the copy of an ordinary string conversion.
+func strslice(slice []byte) string {
+	var str string
+	*(*reflect.StringHeader)(unsafe.Pointer(&str)) = reflect.StringHeader{
+		Data: ((*reflect.SliceHeader)(unsafe.Pointer(&slice))).Data,
+		Len:  len(slice),
+	}
+	return str
+}
diff --git a/murmur128_gen.go b/murmur128_gen.go
index c2b04df..58425bc 100644
--- a/murmur128_gen.go
+++ b/murmur128_gen.go
@@ -13,7 +13,7 @@ import "math/bits"
 // This reads and processes the data in chunks of little endian uint64s;
 // thus, the returned hashes are portable across architectures.
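+// (Illustrative aside; an editorial addition, not part of the upstream
+// change.) A little endian uint64 chunk read is the same operation as
+//
+//	k1 := binary.LittleEndian.Uint64(p[0:8])
+//
+// from encoding/binary, which the compiler lowers to a single 8-byte load
+// on little endian machines such as amd64, so the portability guarantee
+// costs nothing there.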
 func SeedSum128(seed1, seed2 uint64, data []byte) (h1 uint64, h2 uint64) {
-	return SeedStringSum128(seed1, seed2, *(*string)(unsafe.Pointer(&data)))
+	return SeedStringSum128(seed1, seed2, strslice(data))
 }
 
 // Sum128 returns the murmur3 sum of data. It is equivalent to the following
@@ -22,7 +22,7 @@ func SeedSum128(seed1, seed2 uint64, data []byte) (h1 uint64, h2 uint64) {
 //	hasher.Write(data)
 //	return hasher.Sum128()
 func Sum128(data []byte) (h1 uint64, h2 uint64) {
-	return SeedStringSum128(0, 0, *(*string)(unsafe.Pointer(&data)))
+	return SeedStringSum128(0, 0, strslice(data))
 }
 
 // StringSum128 is the string version of Sum128.
diff --git a/murmur32_gen.go b/murmur32_gen.go
index 2890511..49c7133 100644
--- a/murmur32_gen.go
+++ b/murmur32_gen.go
@@ -1,9 +1,6 @@
 package murmur3
 
-import (
-	"math/bits"
-	"unsafe"
-)
+import "math/bits"
 
 // SeedSum32 returns the murmur3 sum of data with the digest initialized to
 // seed.
@@ -11,7 +8,7 @@ import (
 // This reads and processes the data in chunks of little endian uint32s;
 // thus, the returned hash is portable across architectures.
 func SeedSum32(seed uint32, data []byte) (h1 uint32) {
-	return SeedStringSum32(seed, *(*string)(unsafe.Pointer(&data)))
+	return SeedStringSum32(seed, strslice(data))
 }
 
 // Sum32 returns the murmur3 sum of data. It is equivalent to the following
@@ -20,7 +17,7 @@ func SeedSum32(seed uint32, data []byte) (h1 uint32) {
 //	hasher.Write(data)
 //	return hasher.Sum32()
 func Sum32(data []byte) uint32 {
-	return SeedStringSum32(0, *(*string)(unsafe.Pointer(&data)))
+	return SeedStringSum32(0, strslice(data))
 }
 
 // StringSum32 is the string version of Sum32.
diff --git a/murmur_test.go b/murmur_test.go
index e0c84d1..0f529eb 100644
--- a/murmur_test.go
+++ b/murmur_test.go
@@ -9,10 +9,16 @@ import (
 	"strconv"
 	"testing"
 	"testing/quick"
+	"unsafe"
 
 	"github.com/twmb/murmur3/testdata"
 )
 
+// isLittleEndian reports whether the host is little endian. The canonical
+// C++ implementation reads native-endian numbers, so comparisons against it
+// are only meaningful on little endian hosts.
+var isLittleEndian = func() bool {
+	i := uint16(1)
+	return (*(*[2]byte)(unsafe.Pointer(&i)))[0] == 1
+}()
+
 var data = []struct {
 	h32   uint32
 	h64_1 uint64
@@ -86,7 +92,10 @@ func TestQuickSum32(t *testing.T) {
 	f := func(data []byte) bool {
 		goh1 := Sum32(data)
 		goh2 := StringSum32(string(data))
-		cpph1 := testdata.SeedSum32(0, data)
+		cpph1 := goh1
+		if isLittleEndian {
+			cpph1 = testdata.SeedSum32(0, data)
+		}
 		return goh1 == goh2 && goh1 == cpph1
 	}
 	if err := quick.Check(f, nil); err != nil {
@@ -99,7 +108,10 @@ func TestQuickSeedSum32(t *testing.T) {
 		goh1 := SeedSum32(seed, data)
 		goh2 := SeedStringSum32(seed, string(data))
 		goh3 := func() uint32 { h := SeedNew32(seed); h.Write(data); return binary.BigEndian.Uint32(h.Sum(nil)) }()
-		cpph1 := testdata.SeedSum32(seed, data)
+		cpph1 := goh1
+		if isLittleEndian {
+			cpph1 = testdata.SeedSum32(seed, data)
+		}
 		return goh1 == goh2 && goh1 == goh3 && goh1 == cpph1
 	}
 	if err := quick.Check(f, nil); err != nil {
@@ -111,7 +123,10 @@ func TestQuickSum64(t *testing.T) {
 	f := func(data []byte) bool {
 		goh1 := Sum64(data)
 		goh2 := StringSum64(string(data))
-		cpph1 := testdata.SeedSum64(0, data)
+		cpph1 := goh1
+		if isLittleEndian {
+			cpph1 = testdata.SeedSum64(0, data)
+		}
 		return goh1 == goh2 && goh1 == cpph1
 	}
 	if err := quick.Check(f, nil); err != nil {
@@ -124,7 +139,10 @@ func TestQuickSeedSum64(t *testing.T) {
 		goh1 := SeedSum64(uint64(seed), data)
 		goh2 := SeedStringSum64(uint64(seed), string(data))
 		goh3 := func() uint64 { h := SeedNew64(uint64(seed)); h.Write(data); return binary.BigEndian.Uint64(h.Sum(nil)) }()
-		cpph1 := testdata.SeedSum64(seed, data)
+		cpph1 := goh1
+		if isLittleEndian {
+			cpph1 = testdata.SeedSum64(seed, data)
+		}
 		return goh1 == goh2 && goh1 == goh3 && goh1 == cpph1
 	}
 	if err := quick.Check(f, nil); err != nil {
@@ -136,7 +154,10 @@ func TestQuickSum128(t *testing.T) {
 	f := func(data []byte) bool {
 		goh1, goh2 := Sum128(data)
 		goh3, goh4 := StringSum128(string(data))
-		cpph1, cpph2 := testdata.SeedSum128(0, data)
+		cpph1, cpph2 := goh1, goh2
+		if isLittleEndian {
+			cpph1, cpph2 = testdata.SeedSum128(0, data)
+		}
 		return goh1 == goh3 && goh2 == goh4 && goh1 == cpph1 && goh2 == cpph2
 	}
 	if err := quick.Check(f, nil); err != nil {
@@ -154,7 +175,10 @@ func TestQuickSeedSum128(t *testing.T) {
 			sum := h.Sum(nil)
 			return binary.BigEndian.Uint64(sum), binary.BigEndian.Uint64(sum[8:])
 		}()
-		cpph1, cpph2 := testdata.SeedSum128(seed, data)
+		cpph1, cpph2 := goh1, goh2
+		if isLittleEndian {
+			cpph1, cpph2 = testdata.SeedSum128(seed, data)
+		}
 		return goh1 == goh3 && goh2 == goh4 &&
 			goh1 == goh5 && goh2 == goh6 &&
 			goh1 == cpph1 && goh2 == cpph2
@@ -225,7 +249,10 @@ func TestBoundaries(t *testing.T) {
 			test := data[:size]
 			g32h1 := Sum32(test)
 			g32h1s := SeedSum32(0, test)
-			c32h1 := testdata.SeedSum32(0, test)
+			c32h1 := g32h1
+			if isLittleEndian {
+				c32h1 = testdata.SeedSum32(0, test)
+			}
 			if g32h1 != c32h1 {
 				t.Errorf("size #%d: in: %x, g32h1 (%d) != c32h1 (%d); attempt #%d", size, test, g32h1, c32h1, i)
 			}
@@ -234,7 +261,10 @@
 			}
 			g64h1 := Sum64(test)
 			g64h1s := SeedSum64(0, test)
-			c64h1 := testdata.SeedSum64(0, test)
+			c64h1 := g64h1
+			if isLittleEndian {
+				c64h1 = testdata.SeedSum64(0, test)
+			}
 			if g64h1 != c64h1 {
 				t.Errorf("size #%d: in: %x, g64h1 (%d) != c64h1 (%d); attempt #%d", size, test, g64h1, c64h1, i)
 			}
@@ -243,7 +273,10 @@
 			}
 			g128h1, g128h2 := Sum128(test)
 			g128h1s, g128h2s := SeedSum128(0, 0, test)
-			c128h1, c128h2 := testdata.SeedSum128(0, test)
+			c128h1, c128h2 := g128h1, g128h2
+			if isLittleEndian {
+				c128h1, c128h2 = testdata.SeedSum128(0, test)
+			}
 			if g128h1 != c128h1 {
 				t.Errorf("size #%d: in: %x, g128h1 (%d) != c128h1 (%d); attempt #%d", size, test, g128h1, c128h1, i)
 			}
@@ -388,3 +421,10 @@ func BenchmarkNoescape128(b *testing.B) {
 		Sum128(buf[:])
 	}
 }
+
+// BenchmarkStrslice measures the cost of the no-copy slice to string
+// conversion on a nil slice.
+func BenchmarkStrslice(b *testing.B) {
+	var s []byte
+	for i := 0; i < b.N; i++ {
+		strslice(s)
+	}
+}
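+
+// ExampleSum128 is an illustrative usage sketch (an editorial addition, not
+// part of the upstream change; it assumes an "fmt" import): the byte and
+// string APIs always agree, which is what the quick checks above assert.
+func ExampleSum128() {
+	data := []byte("hello, murmur3")
+	h1, h2 := Sum128(data)
+	s1, s2 := StringSum128(string(data))
+	fmt.Println(h1 == s1, h2 == s2)
+	// Output: true true
+}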