diff --git a/README.md b/README.md
index 63937b2..1917fe3 100644
--- a/README.md
+++ b/README.md
@@ -4,15 +4,37 @@ murmur3
 Native Go implementation of Austin Appleby's third MurmurHash revision (aka
 MurmurHash3).
 
-Includes assembly for amd64 for go1.5+ for 128 bit hashes, seeding function,
+Includes assembly for amd64 for 64/128 bit hashes, seeding functions,
 and string functions to avoid string to slice conversions.
 
-Hand rolled 32 bit assembly was removed during 1.11 due to Go's compiler
-catching up and generating equal or better assembly.
+Hand rolled 32 bit assembly was removed during 1.11, but may be reintroduced
+if the compiler slows down any more. As is, the compiler generates marginally
+slower code (by one instruction in the hot loop).
 
 The reference algorithm has been slightly hacked as to support the streaming
 mode required by Go's standard [Hash interface](http://golang.org/pkg/hash/#Hash).
 
+Endianness
+==========
+
+Unlike the canonical source, this library **always** reads bytes as little
+endian numbers. This makes the hashes portable across architectures, although
+it does mean that hashing is a bit slower on big endian architectures.
+
+Safety
+======
+
+This library used to use `unsafe` to convert four bytes to a `uint32` and eight
+bytes to a `uint64`, but Go 1.14 introduced checks around those types of
+conversions that flagged that code as erroneous when hashing unaligned input.
+While the code was not problematic on amd64, it could be on some
+architectures.
+
+As of Go 1.14, those conversions were removed at the expense of a very minor
+performance hit. This hit affects all CPU architectures for `Sum32`, and
+non-amd64 architectures for `Sum64` and `Sum128`. For the 64 and 128 bit
+sums, custom amd64 assembly preserves the prior performance.
+
 Testing
 =======
 
@@ -22,6 +44,11 @@
 Testing includes comparing random inputs against the
 [canonical implementation](https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp),
 and testing length 0 through 17 inputs to force all branches.
 
+Because this code always reads input as little endian, testing against the
+canonical source is skipped on big endian architectures. The canonical source
+reinterprets bytes as native-endian numbers, meaning that on big endian
+architectures it hashes different numbers and thus returns different sums.
+
 Documentation
 =============
 
@@ -32,53 +59,71 @@
 Full documentation can be found on `godoc`.
 
 Benchmarks
 ==========
 
-The following benchmarks show deltas for the 128 bit algorithms only; the 32
-bit algorithms have the same implementation.
+Benchmarks below were run on an amd64 machine with _and_ without the custom
+assembly (the "non-amd64" numbers use the generic Go code). The following
+numbers are for Go 1.14.1 and compare against
+[spaolacci/murmur3](https://github.com/spaolacci/murmur3).
+
+You will notice that at small sizes, the other library is faster. This is due
+to this library switching to safe code for Go 1.14. At large sizes, this
+library is nearly identical to the other. On amd64, the 64 bit and 128 bit
+sums come out ~9% faster.
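+
+The numbers below are in `benchstat` form, and benchmark names like
+`32Sizes/8192-12` come from per-size sub-benchmarks. As an illustrative
+sketch (not necessarily this repo's exact benchmark code), such a benchmark
+has the following shape:
+
+```go
+func Benchmark32Sizes(b *testing.B) {
+	// One sub-benchmark per input size; b.SetBytes makes the tooling
+	// report throughput in GB/s, as in the tables below.
+	for _, size := range []int{32, 64, 128, 256, 512, 1024, 2048, 4096, 8192} {
+		buf := make([]byte, size)
+		b.Run(strconv.Itoa(size), func(b *testing.B) {
+			b.SetBytes(int64(size))
+			for i := 0; i < b.N; i++ {
+				Sum32(buf)
+			}
+		})
+	}
+}
+```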
-In comparison to [spaolacci/murmur3](https://github.com/spaolacci/murmur3) on
-Go at commit [447965d4e0](https://github.com/golang/go/commit/447965d4e0)
-(i.e., post 1.11):
+32 bit sums:
 
 ```
-benchmark                   old ns/op  new ns/op  delta
-Benchmark128Branches/0-4    22.2       6.28       -71.71%
-Benchmark128Branches/1-4    23.6       8.46       -64.15%
-Benchmark128Branches/2-4    24.3       8.68       -64.28%
-Benchmark128Branches/3-4    24.7       9.07       -63.28%
-Benchmark128Branches/4-4    25.2       8.16       -67.62%
-Benchmark128Branches/5-4    25.9       8.89       -65.68%
-Benchmark128Branches/6-4    26.8       9.32       -65.22%
-Benchmark128Branches/7-4    27.4       9.82       -64.16%
-Benchmark128Branches/8-4    28.1       7.68       -72.67%
-Benchmark128Branches/9-4    29.6       9.04       -69.46%
-Benchmark128Branches/10-4   30.2       9.14       -69.74%
-Benchmark128Branches/11-4   30.8       9.53       -69.06%
-Benchmark128Branches/12-4   31.5       8.65       -72.54%
-Benchmark128Branches/13-4   31.5       9.26       -70.60%
-Benchmark128Branches/14-4   32.5       9.69       -70.18%
-Benchmark128Branches/15-4   33.4       10.1       -69.76%
-Benchmark128Branches/16-4   24.9       10.0       -59.84%
-Benchmark64Sizes/32-4       27.8       13.6       -51.08%
-Benchmark64Sizes/64-4       35.2       18.8       -46.59%
-Benchmark64Sizes/128-4      49.6       30.5       -38.51%
-Benchmark64Sizes/256-4      77.9       54.5       -30.04%
-Benchmark64Sizes/512-4      136        105        -22.79%
-Benchmark64Sizes/1024-4     251        209        -16.73%
-Benchmark64Sizes/2048-4     492        419        -14.84%
-Benchmark64Sizes/4096-4     952        832        -12.61%
-Benchmark64Sizes/8192-4     1879       1658       -11.76%
-Benchmark128Sizes/32-4      28.5       13.6       -52.28%
-Benchmark128Sizes/64-4      35.7       18.7       -47.62%
-Benchmark128Sizes/128-4     49.8       30.3       -39.16%
-Benchmark128Sizes/256-4     78.0       54.2       -30.51%
-Benchmark128Sizes/512-4     135        105        -22.22%
-Benchmark128Sizes/1024-4    250        209        -16.40%
-Benchmark128Sizes/2048-4    489        419        -14.31%
-Benchmark128Sizes/4096-4    959        831        -13.35%
-Benchmark128Sizes/8192-4    1885       1659       -11.99%
-BenchmarkNoescape128-4      3226       1824       -43.46%
+32Sizes/32-12    3.00GB/s ± 1%  2.12GB/s ±11%  -29.24%  (p=0.000 n=9+10)
+32Sizes/64-12    3.61GB/s ± 3%  2.79GB/s ± 8%  -22.62%  (p=0.000 n=10+10)
+32Sizes/128-12   3.47GB/s ± 8%  2.79GB/s ± 4%  -19.47%  (p=0.000 n=10+10)
+32Sizes/256-12   3.66GB/s ± 4%  3.25GB/s ± 6%  -11.09%  (p=0.000 n=10+10)
+32Sizes/512-12   3.78GB/s ± 3%  3.54GB/s ± 4%   -6.30%  (p=0.000 n=9+9)
+32Sizes/1024-12  3.86GB/s ± 3%  3.69GB/s ± 5%   -4.46%  (p=0.000 n=10+10)
+32Sizes/2048-12  3.85GB/s ± 3%  3.81GB/s ± 3%      ~    (p=0.079 n=10+9)
+32Sizes/4096-12  3.90GB/s ± 3%  3.82GB/s ± 2%   -2.14%  (p=0.029 n=10+10)
+32Sizes/8192-12  3.82GB/s ± 3%  3.78GB/s ± 7%      ~    (p=0.529 n=10+10)
 ```
-
-The speedup for large inputs levels out around ~1.12x. Additionally,
-this code avoids allocating stack slices unnecessarily for the 128
-algorithm, unlike `spaolacci/murmur3`.
+64/128 bit sums, non-amd64:
+
+```
+64Sizes/32-12     2.34GB/s ± 5%  2.64GB/s ± 9%  +12.87%  (p=0.000 n=10+10)
+64Sizes/64-12     3.62GB/s ± 5%  3.96GB/s ± 4%   +9.41%  (p=0.000 n=10+10)
+64Sizes/128-12    5.12GB/s ± 3%  5.44GB/s ± 4%   +6.09%  (p=0.000 n=10+9)
+64Sizes/256-12    6.35GB/s ± 2%  6.27GB/s ± 9%      ~    (p=0.796 n=10+10)
+64Sizes/512-12    6.58GB/s ± 7%  6.79GB/s ± 3%      ~    (p=0.075 n=10+10)
+64Sizes/1024-12   7.49GB/s ± 3%  7.55GB/s ± 9%      ~    (p=0.393 n=10+10)
+64Sizes/2048-12   8.06GB/s ± 2%  7.90GB/s ± 6%      ~    (p=0.156 n=9+10)
+64Sizes/4096-12   8.27GB/s ± 6%  8.22GB/s ± 5%      ~    (p=0.631 n=10+10)
+64Sizes/8192-12   8.35GB/s ± 4%  8.38GB/s ± 6%      ~    (p=0.631 n=10+10)
+128Sizes/32-12    2.27GB/s ± 2%  2.68GB/s ± 5%  +18.00%  (p=0.000 n=10+10)
+128Sizes/64-12    3.55GB/s ± 2%  4.00GB/s ± 3%  +12.47%  (p=0.000 n=8+9)
+128Sizes/128-12   5.09GB/s ± 1%  5.43GB/s ± 3%   +6.65%  (p=0.000 n=9+9)
+128Sizes/256-12   6.33GB/s ± 3%  5.65GB/s ± 4%  -10.79%  (p=0.000 n=9+10)
+128Sizes/512-12   6.78GB/s ± 3%  6.74GB/s ± 6%      ~    (p=0.968 n=9+10)
+128Sizes/1024-12  7.46GB/s ± 4%  7.56GB/s ± 4%      ~    (p=0.222 n=9+9)
+128Sizes/2048-12  7.99GB/s ± 4%  7.96GB/s ± 3%      ~    (p=0.666 n=9+9)
+128Sizes/4096-12  8.20GB/s ± 2%  8.25GB/s ± 4%      ~    (p=0.631 n=10+10)
+128Sizes/8192-12  8.24GB/s ± 2%  8.26GB/s ± 5%      ~    (p=0.673 n=8+9)
+```
+
+64/128 bit sums, amd64:
+
+```
+64Sizes/32-12     2.34GB/s ± 5%  4.36GB/s ± 3%  +85.86%  (p=0.000 n=10+10)
+64Sizes/64-12     3.62GB/s ± 5%  6.27GB/s ± 3%  +73.37%  (p=0.000 n=10+9)
+64Sizes/128-12    5.12GB/s ± 3%  7.70GB/s ± 6%  +50.27%  (p=0.000 n=10+10)
+64Sizes/256-12    6.35GB/s ± 2%  8.61GB/s ± 3%  +35.50%  (p=0.000 n=10+10)
+64Sizes/512-12    6.58GB/s ± 7%  8.59GB/s ± 4%  +30.48%  (p=0.000 n=10+9)
+64Sizes/1024-12   7.49GB/s ± 3%  8.81GB/s ± 2%  +17.66%  (p=0.000 n=10+10)
+64Sizes/2048-12   8.06GB/s ± 2%  8.90GB/s ± 4%  +10.49%  (p=0.000 n=9+10)
+64Sizes/4096-12   8.27GB/s ± 6%  8.90GB/s ± 4%   +7.54%  (p=0.000 n=10+10)
+64Sizes/8192-12   8.35GB/s ± 4%  9.00GB/s ± 3%   +7.80%  (p=0.000 n=10+9)
+128Sizes/32-12    2.27GB/s ± 2%  4.29GB/s ± 9%  +88.75%  (p=0.000 n=10+10)
+128Sizes/64-12    3.55GB/s ± 2%  6.10GB/s ± 8%  +71.78%  (p=0.000 n=8+10)
+128Sizes/128-12   5.09GB/s ± 1%  7.62GB/s ± 9%  +49.63%  (p=0.000 n=9+10)
+128Sizes/256-12   6.33GB/s ± 3%  8.65GB/s ± 3%  +36.71%  (p=0.000 n=9+10)
+128Sizes/512-12   6.78GB/s ± 3%  8.39GB/s ± 6%  +23.77%  (p=0.000 n=9+10)
+128Sizes/1024-12  7.46GB/s ± 4%  8.70GB/s ± 4%  +16.70%  (p=0.000 n=9+10)
+128Sizes/2048-12  7.99GB/s ± 4%  8.73GB/s ± 8%   +9.26%  (p=0.003 n=9+10)
+128Sizes/4096-12  8.20GB/s ± 2%  8.86GB/s ± 6%   +8.00%  (p=0.000 n=10+10)
+128Sizes/8192-12  8.24GB/s ± 2%  9.01GB/s ± 3%   +9.30%  (p=0.000 n=8+10)
+```
diff --git a/murmur.go b/murmur.go
index 84f9057..20f8ac6 100644
--- a/murmur.go
+++ b/murmur.go
@@ -9,6 +9,11 @@
 // architectures.
 package murmur3
 
+import (
+	"reflect"
+	"unsafe"
+)
+
 type bmixer interface {
 	bmix(p []byte) (tail []byte)
 	Size() (n int)
@@ -56,3 +61,12 @@ func (d *digest) Reset() {
 	d.tail = nil
 	d.bmixer.reset()
 }
+
+// strslice returns a string whose data pointer and length are taken from
+// the input slice, avoiding the copy of an ordinary string conversion.
+func strslice(slice []byte) string {
+	var str string
+	*(*reflect.StringHeader)(unsafe.Pointer(&str)) = reflect.StringHeader{
+		Data: ((*reflect.SliceHeader)(unsafe.Pointer(&slice))).Data,
+		Len:  len(slice),
+	}
+	return str
+}
diff --git a/murmur128_gen.go b/murmur128_gen.go
index c2b04df..58425bc 100644
--- a/murmur128_gen.go
+++ b/murmur128_gen.go
@@ -13,7 +13,7 @@ import "math/bits"
 // This reads and processes the data in chunks of little endian uint64s;
 // thus, the returned hashes are portable across architectures.
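+// (Illustrative aside; an editorial addition, not part of the upstream
+// change.) A little endian uint64 chunk read is the same operation as
+//
+//	k1 := binary.LittleEndian.Uint64(p[0:8])
+//
+// from encoding/binary, which the compiler lowers to a single 8-byte load
+// on little endian machines such as amd64, so the portability guarantee
+// costs nothing there.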
 func SeedSum128(seed1, seed2 uint64, data []byte) (h1 uint64, h2 uint64) {
-	return SeedStringSum128(seed1, seed2, *(*string)(unsafe.Pointer(&data)))
+	return SeedStringSum128(seed1, seed2, strslice(data))
 }
 
 // Sum128 returns the murmur3 sum of data. It is equivalent to the following
@@ -22,7 +22,7 @@ func SeedSum128(seed1, seed2 uint64, data []byte) (h1 uint64, h2 uint64) {
 //	hasher.Write(data)
 //	return hasher.Sum128()
 func Sum128(data []byte) (h1 uint64, h2 uint64) {
-	return SeedStringSum128(0, 0, *(*string)(unsafe.Pointer(&data)))
+	return SeedStringSum128(0, 0, strslice(data))
 }
 
 // StringSum128 is the string version of Sum128.
diff --git a/murmur32_gen.go b/murmur32_gen.go
index 2890511..49c7133 100644
--- a/murmur32_gen.go
+++ b/murmur32_gen.go
@@ -1,9 +1,6 @@
 package murmur3
 
-import (
-	"math/bits"
-	"unsafe"
-)
+import "math/bits"
 
 // SeedSum32 returns the murmur3 sum of data with the digest initialized to
 // seed.
@@ -11,7 +8,7 @@ import (
 // This reads and processes the data in chunks of little endian uint32s;
 // thus, the returned hash is portable across architectures.
 func SeedSum32(seed uint32, data []byte) (h1 uint32) {
-	return SeedStringSum32(seed, *(*string)(unsafe.Pointer(&data)))
+	return SeedStringSum32(seed, strslice(data))
 }
 
 // Sum32 returns the murmur3 sum of data. It is equivalent to the following
@@ -20,7 +17,7 @@ func SeedSum32(seed uint32, data []byte) (h1 uint32) {
 //	hasher.Write(data)
 //	return hasher.Sum32()
 func Sum32(data []byte) uint32 {
-	return SeedStringSum32(0, *(*string)(unsafe.Pointer(&data)))
+	return SeedStringSum32(0, strslice(data))
 }
 
 // StringSum32 is the string version of Sum32.
diff --git a/murmur_test.go b/murmur_test.go
index e0c84d1..0f529eb 100644
--- a/murmur_test.go
+++ b/murmur_test.go
@@ -9,10 +9,16 @@ import (
 	"strconv"
 	"testing"
 	"testing/quick"
+	"unsafe"
 
 	"github.com/twmb/murmur3/testdata"
 )
 
+// isLittleEndian reports whether the host is little endian. The canonical
+// C++ implementation reads native-endian numbers, so comparisons against it
+// are only meaningful on little endian hosts.
+var isLittleEndian = func() bool {
+	i := uint16(1)
+	return (*(*[2]byte)(unsafe.Pointer(&i)))[0] == 1
+}()
+
 var data = []struct {
 	h32   uint32
 	h64_1 uint64
@@ -86,7 +92,10 @@ func TestQuickSum32(t *testing.T) {
 	f := func(data []byte) bool {
 		goh1 := Sum32(data)
 		goh2 := StringSum32(string(data))
-		cpph1 := testdata.SeedSum32(0, data)
+		cpph1 := goh1
+		if isLittleEndian {
+			cpph1 = testdata.SeedSum32(0, data)
+		}
 		return goh1 == goh2 && goh1 == cpph1
 	}
 	if err := quick.Check(f, nil); err != nil {
@@ -99,7 +108,10 @@ func TestQuickSeedSum32(t *testing.T) {
 		goh1 := SeedSum32(seed, data)
 		goh2 := SeedStringSum32(seed, string(data))
 		goh3 := func() uint32 { h := SeedNew32(seed); h.Write(data); return binary.BigEndian.Uint32(h.Sum(nil)) }()
-		cpph1 := testdata.SeedSum32(seed, data)
+		cpph1 := goh1
+		if isLittleEndian {
+			cpph1 = testdata.SeedSum32(seed, data)
+		}
 		return goh1 == goh2 && goh1 == goh3 && goh1 == cpph1
 	}
 	if err := quick.Check(f, nil); err != nil {
@@ -111,7 +123,10 @@ func TestQuickSum64(t *testing.T) {
 	f := func(data []byte) bool {
 		goh1 := Sum64(data)
 		goh2 := StringSum64(string(data))
-		cpph1 := testdata.SeedSum64(0, data)
+		cpph1 := goh1
+		if isLittleEndian {
+			cpph1 = testdata.SeedSum64(0, data)
+		}
 		return goh1 == goh2 && goh1 == cpph1
 	}
 	if err := quick.Check(f, nil); err != nil {
@@ -124,7 +139,10 @@ func TestQuickSeedSum64(t *testing.T) {
 		goh1 := SeedSum64(uint64(seed), data)
 		goh2 := SeedStringSum64(uint64(seed), string(data))
 		goh3 := func() uint64 { h := SeedNew64(uint64(seed)); h.Write(data); return binary.BigEndian.Uint64(h.Sum(nil)) }()
-		cpph1 := testdata.SeedSum64(seed, data)
+		cpph1 := goh1
+		if isLittleEndian {
+			cpph1 = testdata.SeedSum64(seed, data)
+		}
 		return goh1 == goh2 && goh1 == goh3 && goh1 == cpph1
 	}
 	if err := quick.Check(f, nil); err != nil {
@@ -136,7 +154,10 @@ func TestQuickSum128(t *testing.T) {
 	f := func(data []byte) bool {
 		goh1, goh2 := Sum128(data)
 		goh3, goh4 := StringSum128(string(data))
-		cpph1, cpph2 := testdata.SeedSum128(0, data)
+		cpph1, cpph2 := goh1, goh2
+		if isLittleEndian {
+			cpph1, cpph2 = testdata.SeedSum128(0, data)
+		}
 		return goh1 == goh3 && goh2 == goh4 && goh1 == cpph1 && goh2 == cpph2
 	}
 	if err := quick.Check(f, nil); err != nil {
@@ -154,7 +175,10 @@ func TestQuickSeedSum128(t *testing.T) {
 			sum := h.Sum(nil)
 			return binary.BigEndian.Uint64(sum), binary.BigEndian.Uint64(sum[8:])
 		}()
-		cpph1, cpph2 := testdata.SeedSum128(seed, data)
+		cpph1, cpph2 := goh1, goh2
+		if isLittleEndian {
+			cpph1, cpph2 = testdata.SeedSum128(seed, data)
+		}
 		return goh1 == goh3 && goh2 == goh4 &&
 			goh1 == goh5 && goh2 == goh6 &&
 			goh1 == cpph1 && goh2 == cpph2
@@ -225,7 +249,10 @@ func TestBoundaries(t *testing.T) {
 			test := data[:size]
 			g32h1 := Sum32(test)
 			g32h1s := SeedSum32(0, test)
-			c32h1 := testdata.SeedSum32(0, test)
+			c32h1 := g32h1
+			if isLittleEndian {
+				c32h1 = testdata.SeedSum32(0, test)
+			}
 			if g32h1 != c32h1 {
 				t.Errorf("size #%d: in: %x, g32h1 (%d) != c32h1 (%d); attempt #%d", size, test, g32h1, c32h1, i)
 			}
@@ -234,7 +261,10 @@
 			}
 			g64h1 := Sum64(test)
 			g64h1s := SeedSum64(0, test)
-			c64h1 := testdata.SeedSum64(0, test)
+			c64h1 := g64h1
+			if isLittleEndian {
+				c64h1 = testdata.SeedSum64(0, test)
+			}
 			if g64h1 != c64h1 {
 				t.Errorf("size #%d: in: %x, g64h1 (%d) != c64h1 (%d); attempt #%d", size, test, g64h1, c64h1, i)
 			}
@@ -243,7 +273,10 @@
 			}
 			g128h1, g128h2 := Sum128(test)
 			g128h1s, g128h2s := SeedSum128(0, 0, test)
-			c128h1, c128h2 := testdata.SeedSum128(0, test)
+			c128h1, c128h2 := g128h1, g128h2
+			if isLittleEndian {
+				c128h1, c128h2 = testdata.SeedSum128(0, test)
+			}
 			if g128h1 != c128h1 {
 				t.Errorf("size #%d: in: %x, g128h1 (%d) != c128h1 (%d); attempt #%d", size, test, g128h1, c128h1, i)
 			}
@@ -388,3 +421,10 @@ func BenchmarkNoescape128(b *testing.B) {
 		Sum128(buf[:])
 	}
 }
+
+// BenchmarkStrslice measures the cost of the no-copy slice to string
+// conversion on a nil slice.
+func BenchmarkStrslice(b *testing.B) {
+	var s []byte
+	for i := 0; i < b.N; i++ {
+		strslice(s)
+	}
+}
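+
+// ExampleSum128 is an illustrative usage sketch (an editorial addition, not
+// part of the upstream change; it assumes an "fmt" import): the byte and
+// string APIs always agree, which is what the quick checks above assert.
+func ExampleSum128() {
+	data := []byte("hello, murmur3")
+	h1, h2 := Sum128(data)
+	s1, s2 := StringSum128(string(data))
+	fmt.Println(h1 == s1, h2 == s2)
+	// Output: true true
+}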