From f792e480ab3720d5b156910716c0aceb360c0b45 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Sat, 7 Apr 2018 17:08:47 +0200
Subject: [PATCH 1/6] Add simd_support feature

---
 Cargo.toml | 3 ++-
 src/lib.rs | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index 0985f27ae90..440ac546fd4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,10 +19,11 @@ appveyor = { repository = "alexcrichton/rand" }
 
 [features]
 default = ["std" ] # without "std" rand uses libcore
-nightly = ["i128_support"] # enables all features requiring nightly rust
+nightly = ["i128_support", "simd_support"] # enables all features requiring nightly rust
 std = ["rand_core/std", "alloc", "libc", "winapi", "cloudabi", "fuchsia-zircon"]
 alloc = ["rand_core/alloc"] # enables Vec and Box support (without std)
 i128_support = [] # enables i128 and u128 support
+simd_support = [] # enables SIMD support
 serde1 = ["serde", "serde_derive", "rand_core/serde1"] # enables serialization for PRNGs
 
 [workspace]
diff --git a/src/lib.rs b/src/lib.rs
index e75b72992bd..77ecf38f537 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -233,6 +233,7 @@
 #![cfg_attr(all(feature="alloc", not(feature="std")), feature(alloc))]
 #![cfg_attr(all(feature="i128_support", feature="nightly"), allow(stable_features))] // stable since 2018-03-27
 #![cfg_attr(all(feature="i128_support", feature="nightly"), feature(i128_type, i128))]
+#![cfg_attr(all(feature="simd_support", feature="nightly"), feature(stdsimd))]
 #![cfg_attr(feature = "stdweb", recursion_limit="128")]
 
 #[cfg(feature="std")] extern crate std as core;

From 38b9a29516dbba0e6891aa9b3c223c924dc7697f Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Fri, 6 Apr 2018 21:52:01 +0200
Subject: [PATCH 2/6] Implement Standard for SIMD integer types

---
 src/distributions/integer.rs | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/src/distributions/integer.rs b/src/distributions/integer.rs
index a23ddd51aba..82efd9ba795 100644
--- a/src/distributions/integer.rs
+++ b/src/distributions/integer.rs
@@ -12,6 +12,8 @@
 
 use {Rng};
 use distributions::{Distribution, Standard};
+#[cfg(feature="simd_support")]
+use core::simd::*;
 
 impl Distribution<u8> for Standard {
     #[inline]
@@ -84,6 +86,39 @@ impl_int_from_uint! { i64, u64 }
 #[cfg(feature = "i128_support")] impl_int_from_uint! { i128, u128 }
 impl_int_from_uint! { isize, usize }
 
+#[cfg(feature="simd_support")]
+macro_rules! simd_impl {
+    ($bits:expr,) => {};
+    ($bits:expr, $ty:ty, $($ty_more:ty,)*) => {
+        simd_impl!($bits, $($ty_more,)*);
+
+        impl Distribution<$ty> for Standard {
+            #[inline]
+            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                let mut vec = Default::default();
+                unsafe {
+                    let ptr = &mut vec;
+                    let b_ptr = &mut *(ptr as *mut $ty as *mut [u8; $bits/8]);
+                    rng.fill_bytes(b_ptr);
+                }
+                vec
+            }
+        }
+    }
+}
+
+#[cfg(feature="simd_support")]
+simd_impl!(16, u8x2, i8x2,);
+#[cfg(feature="simd_support")]
+simd_impl!(32, u8x4, i8x4, u16x2, i16x2,);
+#[cfg(feature="simd_support")]
+simd_impl!(64, u8x8, i8x8, u16x4, i16x4, u32x2, i32x2,);
+#[cfg(feature="simd_support")]
+simd_impl!(128, u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2,);
+#[cfg(feature="simd_support")]
+simd_impl!(256, u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4,);
+#[cfg(feature="simd_support")]
+simd_impl!(512, u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8,);
 
 #[cfg(test)]
 mod tests {

From d80efb2bf17a1e410b582c2e67f310d7968e6f7e Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Sat, 7 Apr 2018 17:02:36 +0200
Subject: [PATCH 3/6] Implement Standard for SIMD float types

---
 src/distributions/float.rs | 198 +++++++++++++++++++++++++++++--------
 1 file changed, 155 insertions(+), 43 deletions(-)

diff --git a/src/distributions/float.rs b/src/distributions/float.rs
index 0058122443c..9cdd8a9e145 100644
--- a/src/distributions/float.rs
+++ b/src/distributions/float.rs
@@ -13,6 +13,8 @@ use core::mem;
 use Rng;
 use distributions::{Distribution, Standard};
+#[cfg(feature="simd_support")]
+use core::simd::*;
 
 /// A distribution to sample floating point numbers uniformly in the half-open
 /// interval `(0, 1]`, i.e. including 1 but not 0.
@@ -144,63 +146,173 @@ float_impls! { f32, u32, 23, 127 }
 float_impls! { f64, u64, 52, 1023 }
 
+
+#[cfg(feature="simd_support")]
+macro_rules! simd_float_impls {
+    ($ty:ident, $uty:ident, $f_scalar:ty, $u_scalar:ty,
+     $fraction_bits:expr, $exponent_bias:expr) => {
+        impl IntoFloat for $uty {
+            type F = $ty;
+            #[inline(always)]
+            fn into_float_with_exponent(self, exponent: i32) -> $ty {
+                // The exponent is encoded using an offset-binary representation
+                let exponent_bits: $u_scalar =
+                    (($exponent_bias + exponent) as $u_scalar) << $fraction_bits;
+                unsafe { mem::transmute(self | $uty::splat(exponent_bits)) }
+            }
+        }
+
+        impl Distribution<$ty> for Standard {
+            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                // Multiply-based method; 24/53 random bits; [0, 1) interval.
+                // We use the most significant bits because for simple RNGs
+                // those are usually more random.
+                let float_size = mem::size_of::<$f_scalar>() * 8;
+                let precision = $fraction_bits + 1;
+                let scale = $ty::splat(1.0 / ((1 as $u_scalar << precision) as $f_scalar));
+
+                let value: $uty = rng.gen();
+                let value = $ty::from(value >> (float_size - precision));
+                scale * value
+            }
+        }
+
+        impl Distribution<$ty> for OpenClosed01 {
+            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                // Multiply-based method; 24/53 random bits; (0, 1] interval.
+                // We use the most significant bits because for simple RNGs
+                // those are usually more random.
+                let float_size = mem::size_of::<$f_scalar>() * 8;
+                let precision = $fraction_bits + 1;
+                let scale = $ty::splat(1.0 / ((1 as $u_scalar << precision) as $f_scalar));
+
+                let value: $uty = rng.gen();
+                // Add 1 to shift up; will not overflow because of right-shift:
+                let value = $ty::from((value >> (float_size - precision)) + 1);
+                scale * value
+            }
+        }
+
+        impl Distribution<$ty> for Open01 {
+            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
+                // Transmute-based method; 23/52 random bits; (0, 1) interval.
+                // We use the most significant bits because for simple RNGs
+                // those are usually more random.
+                const EPSILON: $f_scalar = 1.0 / (1u64 << $fraction_bits) as $f_scalar;
+                let float_size = mem::size_of::<$f_scalar>() * 8;
+
+                let value: $uty = rng.gen();
+                let fraction = value >> (float_size - $fraction_bits);
+                fraction.into_float_with_exponent(0) - $ty::splat(1.0 - EPSILON / 2.0)
+            }
+        }
+    }
+}
+
+#[cfg(feature="simd_support")]
+simd_float_impls! { f32x2, u32x2, f32, u32, 23, 127 }
+#[cfg(feature="simd_support")]
+simd_float_impls! { f32x4, u32x4, f32, u32, 23, 127 }
+#[cfg(feature="simd_support")]
+simd_float_impls! { f32x8, u32x8, f32, u32, 23, 127 }
+#[cfg(feature="simd_support")]
+simd_float_impls! { f32x16, u32x16, f32, u32, 23, 127 }
+
+#[cfg(feature="simd_support")]
+simd_float_impls! { f64x2, u64x2, f64, u64, 52, 1023 }
+#[cfg(feature="simd_support")]
+simd_float_impls! { f64x4, u64x4, f64, u64, 52, 1023 }
+#[cfg(feature="simd_support")]
+simd_float_impls! { f64x8, u64x8, f64, u64, 52, 1023 }
+
 
 #[cfg(test)]
 mod tests {
     use Rng;
     use distributions::{Open01, OpenClosed01};
     use rngs::mock::StepRng;
+    #[cfg(feature="simd_support")]
+    use core::simd::*;
 
     const EPSILON32: f32 = ::core::f32::EPSILON;
     const EPSILON64: f64 = ::core::f64::EPSILON;
 
-    #[test]
-    fn standard_fp_edge_cases() {
-        let mut zeros = StepRng::new(0, 0);
-        assert_eq!(zeros.gen::<f32>(), 0.0);
-        assert_eq!(zeros.gen::<f64>(), 0.0);
-
-        let mut one32 = StepRng::new(1 << 8, 0);
-        assert_eq!(one32.gen::<f32>(), EPSILON32 / 2.0);
-
-        let mut one64 = StepRng::new(1 << 11, 0);
-        assert_eq!(one64.gen::<f64>(), EPSILON64 / 2.0);
-
-        let mut max = StepRng::new(!0, 0);
-        assert_eq!(max.gen::<f32>(), 1.0 - EPSILON32 / 2.0);
-        assert_eq!(max.gen::<f64>(), 1.0 - EPSILON64 / 2.0);
-    }
-
-    #[test]
-    fn openclosed01_edge_cases() {
-        let mut zeros = StepRng::new(0, 0);
-        assert_eq!(zeros.sample::<f32, _>(OpenClosed01), 0.0 + EPSILON32 / 2.0);
-        assert_eq!(zeros.sample::<f64, _>(OpenClosed01), 0.0 + EPSILON64 / 2.0);
-
-        let mut one32 = StepRng::new(1 << 8, 0);
-        assert_eq!(one32.sample::<f32, _>(OpenClosed01), EPSILON32);
+    macro_rules! test_f32 {
+        ($fnn:ident, $ty:ident, $ZERO:expr, $EPSILON:expr) => {
+            #[test]
+            fn $fnn() {
+                // Standard
+                let mut zeros = StepRng::new(0, 0);
+                assert_eq!(zeros.gen::<$ty>(), $ZERO);
+                let mut one = StepRng::new(1 << 8 | 1 << (8 + 32), 0);
+                assert_eq!(one.gen::<$ty>(), $EPSILON / 2.0);
+                let mut max = StepRng::new(!0, 0);
+                assert_eq!(max.gen::<$ty>(), 1.0 - $EPSILON / 2.0);
-
-        let mut one64 = StepRng::new(1 << 11, 0);
-        assert_eq!(one64.sample::<f64, _>(OpenClosed01), EPSILON64);
+
+                // OpenClosed01
+                let mut zeros = StepRng::new(0, 0);
+                assert_eq!(zeros.sample::<$ty, _>(OpenClosed01),
+                           0.0 + $EPSILON / 2.0);
+                let mut one = StepRng::new(1 << 8 | 1 << (8 + 32), 0);
+                assert_eq!(one.sample::<$ty, _>(OpenClosed01), $EPSILON);
+                let mut max = StepRng::new(!0, 0);
+                assert_eq!(max.sample::<$ty, _>(OpenClosed01), $ZERO + 1.0);
-
-        let mut max = StepRng::new(!0, 0);
-        assert_eq!(max.sample::<f32, _>(OpenClosed01), 1.0);
-        assert_eq!(max.sample::<f64, _>(OpenClosed01), 1.0);
+
+                // Open01
+                let mut zeros = StepRng::new(0, 0);
+                assert_eq!(zeros.sample::<$ty, _>(Open01), 0.0 + $EPSILON / 2.0);
+                let mut one = StepRng::new(1 << 9 | 1 << (9 + 32), 0);
+                assert_eq!(one.sample::<$ty, _>(Open01), $EPSILON / 2.0 * 3.0);
+                let mut max = StepRng::new(!0, 0);
+                assert_eq!(max.sample::<$ty, _>(Open01), 1.0 - $EPSILON / 2.0);
+            }
+        }
+    }
+    test_f32! { f32_edge_cases, f32, 0.0, ::core::f32::EPSILON }
+    #[cfg(feature="simd_support")]
+    test_f32! { f32x2_edge_cases, f32x2, f32x2::splat(0.0), f32x2::splat(EPSILON32) }
+    #[cfg(feature="simd_support")]
+    test_f32! { f32x4_edge_cases, f32x4, f32x4::splat(0.0), f32x4::splat(EPSILON32) }
+    #[cfg(feature="simd_support")]
+    test_f32! { f32x8_edge_cases, f32x8, f32x8::splat(0.0), f32x8::splat(EPSILON32) }
+    #[cfg(feature="simd_support")]
+    test_f32! { f32x16_edge_cases, f32x16, f32x16::splat(0.0), f32x16::splat(EPSILON32) }
-    }
-
-    #[test]
-    fn open01_edge_cases() {
-        let mut zeros = StepRng::new(0, 0);
-        assert_eq!(zeros.sample::<f32, _>(Open01), 0.0 + EPSILON32 / 2.0);
-        assert_eq!(zeros.sample::<f64, _>(Open01), 0.0 + EPSILON64 / 2.0);
+
+    macro_rules! test_f64 {
+        ($fnn:ident, $ty:ident, $ZERO:expr, $EPSILON:expr) => {
+            #[test]
+            fn $fnn() {
+                // Standard
+                let mut zeros = StepRng::new(0, 0);
+                assert_eq!(zeros.gen::<$ty>(), $ZERO);
+                let mut one = StepRng::new(1 << 11, 0);
+                assert_eq!(one.gen::<$ty>(), $EPSILON / 2.0);
+                let mut max = StepRng::new(!0, 0);
+                assert_eq!(max.gen::<$ty>(), 1.0 - $EPSILON / 2.0);
-
-        let mut one32 = StepRng::new(1 << 9, 0);
-        assert_eq!(one32.sample::<f32, _>(Open01), EPSILON32 / 2.0 * 3.0);
+
+                // OpenClosed01
+                let mut zeros = StepRng::new(0, 0);
+                assert_eq!(zeros.sample::<$ty, _>(OpenClosed01),
+                           0.0 + $EPSILON / 2.0);
+                let mut one = StepRng::new(1 << 11, 0);
+                assert_eq!(one.sample::<$ty, _>(OpenClosed01), $EPSILON);
+                let mut max = StepRng::new(!0, 0);
+                assert_eq!(max.sample::<$ty, _>(OpenClosed01), $ZERO + 1.0);
-
-        let mut one64 = StepRng::new(1 << 12, 0);
-        assert_eq!(one64.sample::<f64, _>(Open01), EPSILON64 / 2.0 * 3.0);
-
-        let mut max = StepRng::new(!0, 0);
-        assert_eq!(max.sample::<f32, _>(Open01), 1.0 - EPSILON32 / 2.0);
-        assert_eq!(max.sample::<f64, _>(Open01), 1.0 - EPSILON64 / 2.0);
+
+                // Open01
+                let mut zeros = StepRng::new(0, 0);
+                assert_eq!(zeros.sample::<$ty, _>(Open01), 0.0 + $EPSILON / 2.0);
+                let mut one = StepRng::new(1 << 12, 0);
+                assert_eq!(one.sample::<$ty, _>(Open01), $EPSILON / 2.0 * 3.0);
+                let mut max = StepRng::new(!0, 0);
+                assert_eq!(max.sample::<$ty, _>(Open01), 1.0 - $EPSILON / 2.0);
+            }
+        }
+    }
+    test_f64! { f64_edge_cases, f64, 0.0, EPSILON64 }
+    #[cfg(feature="simd_support")]
+    test_f64! { f64x2_edge_cases, f64x2, f64x2::splat(0.0), f64x2::splat(EPSILON64) }
+    #[cfg(feature="simd_support")]
+    test_f64! { f64x4_edge_cases, f64x4, f64x4::splat(0.0), f64x4::splat(EPSILON64) }
+    #[cfg(feature="simd_support")]
+    test_f64! { f64x8_edge_cases, f64x8, f64x8::splat(0.0), f64x8::splat(EPSILON64) }
 }

From 209836f7c31aa04be3113c7fde5b82c9be8e6e84 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Sat, 7 Apr 2018 19:44:50 +0200
Subject: [PATCH 4/6] Implement Uniform for SIMD float types

---
 src/distributions/uniform.rs | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
index 98bca748d01..1a2e660a7d2 100644
--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
@@ -117,6 +117,9 @@
 use Rng;
 use distributions::Distribution;
 use distributions::float::IntoFloat;
+#[cfg(feature="simd_support")]
+use core::simd::*;
+
 /// Sample values uniformly between two bounds.
 ///
 /// [`Uniform::new`] and [`Uniform::new_inclusive`] construct a uniform
@@ -580,7 +583,7 @@ pub struct UniformFloat<X> {
 }
 
 macro_rules! uniform_float_impl {
-    ($ty:ty, $bits_to_discard:expr, $next_u:ident) => {
+    ($ty:ty, $uty:ident, $bits_to_discard:expr) => {
         impl SampleUniform for $ty {
             type Sampler = UniformFloat<$ty>;
         }
@@ -621,8 +624,8 @@
             fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Self::X {
                 // Generate a value in the range [1, 2)
-                let value1_2 = (rng.$next_u() >> $bits_to_discard)
-                    .into_float_with_exponent(0);
+                let value: $uty = rng.gen::<$uty>() >> $bits_to_discard;
+                let value1_2 = value.into_float_with_exponent(0);
                 // We don't use `f64::mul_add`, because it is not available with
                 // `no_std`. Furthermore, it is slower for some targets (but
                 // faster for others). However, the order of multiplication and
@@ -643,8 +646,8 @@
                 let scale = high - low;
                 let offset = low - scale;
                 // Generate a value in the range [1, 2)
-                let value1_2 = (rng.$next_u() >> $bits_to_discard)
-                    .into_float_with_exponent(0);
+                let value: $uty = rng.gen::<$uty>() >> $bits_to_discard;
+                let value1_2 = value.into_float_with_exponent(0);
                 // Doing multiply before addition allows some architectures to
                 // use a single instruction.
                 value1_2 * scale + offset
             }
         }
     }
 }
 
-uniform_float_impl! { f32, 32 - 23, next_u32 }
-uniform_float_impl! { f64, 64 - 52, next_u64 }
+uniform_float_impl! { f32, u32, 32 - 23 }
+uniform_float_impl! { f64, u64, 64 - 52 }
+
+#[cfg(feature="simd_support")]
+uniform_float_impl! { f32x2, u32x2, 32 - 23 }
+#[cfg(feature="simd_support")]
+uniform_float_impl! { f32x4, u32x4, 32 - 23 }
+#[cfg(feature="simd_support")]
+uniform_float_impl! { f32x8, u32x8, 32 - 23 }
+#[cfg(feature="simd_support")]
+uniform_float_impl! { f32x16, u32x16, 32 - 23 }
+
+#[cfg(feature="simd_support")]
+uniform_float_impl! { f64x2, u64x2, 64 - 52 }
+#[cfg(feature="simd_support")]
+uniform_float_impl! { f64x4, u64x4, 64 - 52 }
+#[cfg(feature="simd_support")]
+uniform_float_impl! { f64x8, u64x8, 64 - 52 }

From 4dab1e3da732d3f7837e9a6f8c06ece7f6715980 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Fri, 22 Jun 2018 07:28:01 +0200
Subject: [PATCH 5/6] Add math_helpers module and various fixes

---
 src/distributions/float.rs        | 103 ++++---------------
 src/distributions/math_helpers.rs | 161 ++++++++++++++++++++++++++++++
 src/distributions/mod.rs          |   1 +
 src/distributions/uniform.rs      |  89 +----------------
 4 files changed, 189 insertions(+), 165 deletions(-)
 create mode 100644 src/distributions/math_helpers.rs

diff --git a/src/distributions/float.rs b/src/distributions/float.rs
index 9cdd8a9e145..facd753eba2 100644
--- a/src/distributions/float.rs
+++ b/src/distributions/float.rs
@@ -13,6 +13,7 @@ use core::mem;
 use Rng;
 use distributions::{Distribution, Standard};
+use distributions::math_helpers::CastFromInt;
 #[cfg(feature="simd_support")]
 use core::simd::*;
 
@@ -85,70 +86,7 @@ pub(crate) trait IntoFloat {
 }
 
 macro_rules! float_impls {
-    ($ty:ty, $uty:ty, $fraction_bits:expr, $exponent_bias:expr) => {
-        impl IntoFloat for $uty {
-            type F = $ty;
-            #[inline(always)]
-            fn into_float_with_exponent(self, exponent: i32) -> $ty {
-                // The exponent is encoded using an offset-binary representation
-                let exponent_bits =
-                    (($exponent_bias + exponent) as $uty) << $fraction_bits;
-                unsafe { mem::transmute(self | exponent_bits) }
-            }
-        }
-
-        impl Distribution<$ty> for Standard {
-            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
-                // Multiply-based method; 24/53 random bits; [0, 1) interval.
-                // We use the most significant bits because for simple RNGs
-                // those are usually more random.
-                let float_size = mem::size_of::<$ty>() * 8;
-                let precision = $fraction_bits + 1;
-                let scale = 1.0 / ((1 as $uty << precision) as $ty);
-
-                let value: $uty = rng.gen();
-                scale * (value >> (float_size - precision)) as $ty
-            }
-        }
-
-        impl Distribution<$ty> for OpenClosed01 {
-            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
-                // Multiply-based method; 24/53 random bits; (0, 1] interval.
-                // We use the most significant bits because for simple RNGs
-                // those are usually more random.
-                let float_size = mem::size_of::<$ty>() * 8;
-                let precision = $fraction_bits + 1;
-                let scale = 1.0 / ((1 as $uty << precision) as $ty);
-
-                let value: $uty = rng.gen();
-                let value = value >> (float_size - precision);
-                // Add 1 to shift up; will not overflow because of right-shift:
-                scale * (value + 1) as $ty
-            }
-        }
-
-        impl Distribution<$ty> for Open01 {
-            fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> $ty {
-                // Transmute-based method; 23/52 random bits; (0, 1) interval.
-                // We use the most significant bits because for simple RNGs
-                // those are usually more random.
-                const EPSILON: $ty = 1.0 / (1u64 << $fraction_bits) as $ty;
-                let float_size = mem::size_of::<$ty>() * 8;
-
-                let value: $uty = rng.gen();
-                let fraction = value >> (float_size - $fraction_bits);
-                fraction.into_float_with_exponent(0) - (1.0 - EPSILON / 2.0)
-            }
-        }
-    }
-}
-float_impls! { f32, u32, 23, 127 }
-float_impls! { f64, u64, 52, 1023 }
-
-
-#[cfg(feature="simd_support")]
-macro_rules! simd_float_impls {
-    ($ty:ident, $uty:ident, $f_scalar:ty, $u_scalar:ty,
+    ($ty:ident, $uty:ident, $f_scalar:ident, $u_scalar:ty,
      $fraction_bits:expr, $exponent_bias:expr) => {
         impl IntoFloat for $uty {
             type F = $ty;
@@ -157,7 +95,7 @@ macro_rules! simd_float_impls {
                 // The exponent is encoded using an offset-binary representation
                 let exponent_bits: $u_scalar =
                     (($exponent_bias + exponent) as $u_scalar) << $fraction_bits;
-                unsafe { mem::transmute(self | $uty::splat(exponent_bits)) }
+                $ty::from_bits(self | exponent_bits)
             }
         }
 
@@ -168,11 +106,11 @@ macro_rules! simd_float_impls {
                 // those are usually more random.
                 let float_size = mem::size_of::<$f_scalar>() * 8;
                 let precision = $fraction_bits + 1;
-                let scale = $ty::splat(1.0 / ((1 as $u_scalar << precision) as $f_scalar));
+                let scale = 1.0 / ((1 as $u_scalar << precision) as $f_scalar);
 
                 let value: $uty = rng.gen();
-                let value = $ty::from(value >> (float_size - precision));
-                scale * value
+                let value = value >> (float_size - precision);
+                scale * $ty::cast_from_int(value)
             }
         }
 
@@ -183,12 +121,12 @@ macro_rules! simd_float_impls {
                 // those are usually more random.
                 let float_size = mem::size_of::<$f_scalar>() * 8;
                 let precision = $fraction_bits + 1;
-                let scale = $ty::splat(1.0 / ((1 as $u_scalar << precision) as $f_scalar));
+                let scale = 1.0 / ((1 as $u_scalar << precision) as $f_scalar);
 
                 let value: $uty = rng.gen();
+                let value = value >> (float_size - precision);
                 // Add 1 to shift up; will not overflow because of right-shift:
-                let value = $ty::from((value >> (float_size - precision)) + 1);
-                scale * value
+                scale * $ty::cast_from_int(value + 1)
             }
         }
 
@@ -197,32 +135,35 @@ macro_rules! simd_float_impls {
                 // Transmute-based method; 23/52 random bits; (0, 1) interval.
                 // We use the most significant bits because for simple RNGs
                 // those are usually more random.
-                const EPSILON: $f_scalar = 1.0 / (1u64 << $fraction_bits) as $f_scalar;
+                use core::$f_scalar::EPSILON;
                 let float_size = mem::size_of::<$f_scalar>() * 8;
 
                 let value: $uty = rng.gen();
                 let fraction = value >> (float_size - $fraction_bits);
-                fraction.into_float_with_exponent(0) - $ty::splat(1.0 - EPSILON / 2.0)
+                fraction.into_float_with_exponent(0) - (1.0 - EPSILON / 2.0)
             }
         }
     }
 }
+float_impls! { f32, u32, f32, u32, 23, 127 }
+float_impls! { f64, u64, f64, u64, 52, 1023 }
+
 #[cfg(feature="simd_support")]
-simd_float_impls! { f32x2, u32x2, f32, u32, 23, 127 }
+float_impls! { f32x2, u32x2, f32, u32, 23, 127 }
 #[cfg(feature="simd_support")]
-simd_float_impls! { f32x4, u32x4, f32, u32, 23, 127 }
+float_impls! { f32x4, u32x4, f32, u32, 23, 127 }
 #[cfg(feature="simd_support")]
-simd_float_impls! { f32x8, u32x8, f32, u32, 23, 127 }
+float_impls! { f32x8, u32x8, f32, u32, 23, 127 }
 #[cfg(feature="simd_support")]
-simd_float_impls! { f32x16, u32x16, f32, u32, 23, 127 }
 
 #[cfg(feature="simd_support")]
-simd_float_impls! { f64x2, u64x2, f64, u64, 52, 1023 }
+float_impls! { f32x16, u32x16, f32, u32, 23, 127 }
+
+#[cfg(feature="simd_support")]
+float_impls! { f64x2, u64x2, f64, u64, 52, 1023 }
 #[cfg(feature="simd_support")]
-simd_float_impls! { f64x4, u64x4, f64, u64, 52, 1023 }
+float_impls! { f64x4, u64x4, f64, u64, 52, 1023 }
 #[cfg(feature="simd_support")]
-simd_float_impls! { f64x8, u64x8, f64, u64, 52, 1023 }
+float_impls! { f64x8, u64x8, f64, u64, 52, 1023 }
 
 
 #[cfg(test)]
@@ -267,7 +208,7 @@ mod tests {
             }
         }
     }
-    test_f32! { f32_edge_cases, f32, 0.0, ::core::f32::EPSILON }
+    test_f32! { f32_edge_cases, f32, 0.0, EPSILON32 }
     #[cfg(feature="simd_support")]
     test_f32! { f32x2_edge_cases, f32x2, f32x2::splat(0.0), f32x2::splat(EPSILON32) }
     #[cfg(feature="simd_support")]
diff --git a/src/distributions/math_helpers.rs b/src/distributions/math_helpers.rs
new file mode 100644
index 00000000000..08f2e719227
--- /dev/null
+++ b/src/distributions/math_helpers.rs
@@ -0,0 +1,161 @@
+// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// https://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+//! Math helper functions
+
+#[cfg(feature="simd_support")]
+use core::simd::*;
+
+
+pub trait WideningMultiply<RHS = Self> {
+    type Output;
+
+    fn wmul(self, x: RHS) -> Self::Output;
+}
+
+macro_rules! wmul_impl {
+    ($ty:ty, $wide:ty, $shift:expr) => {
+        impl WideningMultiply for $ty {
+            type Output = ($ty, $ty);
+
+            #[inline(always)]
+            fn wmul(self, x: $ty) -> Self::Output {
+                let tmp = (self as $wide) * (x as $wide);
+                ((tmp >> $shift) as $ty, tmp as $ty)
+            }
+        }
+    }
+}
+wmul_impl! { u8, u16, 8 }
+wmul_impl! { u16, u32, 16 }
+wmul_impl! { u32, u64, 32 }
+#[cfg(feature = "i128_support")]
+wmul_impl! { u64, u128, 64 }
+
+// This code is a translation of the __mulddi3 function in LLVM's
+// compiler-rt. It is an optimised variant of the common method
+// `(a + b) * (c + d) = ac + ad + bc + bd`.
+//
+// For some reason LLVM can optimise the C version very well, but
+// keeps shuffling registers in this Rust translation.
+macro_rules! wmul_impl_large {
+    ($ty:ty, $half:expr) => {
+        impl WideningMultiply for $ty {
+            type Output = ($ty, $ty);
+
+            #[inline(always)]
+            fn wmul(self, b: $ty) -> Self::Output {
+                const LOWER_MASK: $ty = !0 >> $half;
+                let mut low = (self & LOWER_MASK).wrapping_mul(b & LOWER_MASK);
+                let mut t = low >> $half;
+                low &= LOWER_MASK;
+                t += (self >> $half).wrapping_mul(b & LOWER_MASK);
+                low += (t & LOWER_MASK) << $half;
+                let mut high = t >> $half;
+                t = low >> $half;
+                low &= LOWER_MASK;
+                t += (b >> $half).wrapping_mul(self & LOWER_MASK);
+                low += (t & LOWER_MASK) << $half;
+                high += t >> $half;
+                high += (self >> $half).wrapping_mul(b >> $half);
+
+                (high, low)
+            }
+        }
+    }
+}
+#[cfg(not(feature = "i128_support"))]
+wmul_impl_large! { u64, 32 }
+#[cfg(feature = "i128_support")]
+wmul_impl_large! { u128, 64 }
+
+macro_rules! wmul_impl_usize {
+    ($ty:ty) => {
+        impl WideningMultiply for usize {
+            type Output = (usize, usize);
+
+            #[inline(always)]
+            fn wmul(self, x: usize) -> Self::Output {
+                let (high, low) = (self as $ty).wmul(x as $ty);
+                (high as usize, low as usize)
+            }
+        }
+    }
+}
+#[cfg(target_pointer_width = "32")]
+wmul_impl_usize! { u32 }
+#[cfg(target_pointer_width = "64")]
+wmul_impl_usize! { u64 }
+
+
+pub trait CastFromInt<T> {
+    fn cast_from_int(i: T) -> Self;
+}
+
+impl CastFromInt<u32> for f32 {
+    fn cast_from_int(i: u32) -> Self { i as f32 }
+}
+
+impl CastFromInt<u64> for f64 {
+    fn cast_from_int(i: u64) -> Self { i as f64 }
+}
+
+#[cfg(feature="simd_support")]
+macro_rules! simd_float_from_int {
+    ($ty:ident, $uty:ident) => {
+        impl CastFromInt<$uty> for $ty {
+            fn cast_from_int(i: $uty) -> Self { $ty::from(i) }
+        }
+    }
+}
+
+#[cfg(feature="simd_support")] simd_float_from_int! { f32x2, u32x2 }
+#[cfg(feature="simd_support")] simd_float_from_int! { f32x4, u32x4 }
+#[cfg(feature="simd_support")] simd_float_from_int! { f32x8, u32x8 }
+#[cfg(feature="simd_support")] simd_float_from_int! { f32x16, u32x16 }
+#[cfg(feature="simd_support")] simd_float_from_int! { f64x2, u64x2 }
+#[cfg(feature="simd_support")] simd_float_from_int! { f64x4, u64x4 }
+#[cfg(feature="simd_support")] simd_float_from_int! { f64x8, u64x8 }
+
+
+/// `PartialOrd` for vectors compares lexicographically. We want natural order.
+/// Only the comparison functions we need are implemented.
+pub trait NaturalCompare {
+    fn cmp_lt(self, other: Self) -> bool;
+    fn cmp_le(self, other: Self) -> bool;
+}
+
+impl NaturalCompare for f32 {
+    fn cmp_lt(self, other: Self) -> bool { self < other }
+    fn cmp_le(self, other: Self) -> bool { self <= other }
+}
+
+impl NaturalCompare for f64 {
+    fn cmp_lt(self, other: Self) -> bool { self < other }
+    fn cmp_le(self, other: Self) -> bool { self <= other }
+}
+
+#[cfg(feature="simd_support")]
+macro_rules! simd_less_then {
+    ($ty:ident) => {
+        impl NaturalCompare for $ty {
+            fn cmp_lt(self, other: Self) -> bool { self.lt(other).all() }
+            fn cmp_le(self, other: Self) -> bool { self.le(other).all() }
+        }
+    }
+}
+
+#[cfg(feature="simd_support")] simd_less_then! { f32x2 }
+#[cfg(feature="simd_support")] simd_less_then! { f32x4 }
+#[cfg(feature="simd_support")] simd_less_then! { f32x8 }
+#[cfg(feature="simd_support")] simd_less_then! { f32x16 }
+#[cfg(feature="simd_support")] simd_less_then! { f64x2 }
+#[cfg(feature="simd_support")] simd_less_then! { f64x4 }
+#[cfg(feature="simd_support")] simd_less_then! { f64x8 }
diff --git a/src/distributions/mod.rs b/src/distributions/mod.rs
index 9d093f1d509..0f0420c9d89 100644
--- a/src/distributions/mod.rs
+++ b/src/distributions/mod.rs
@@ -214,6 +214,7 @@
 mod float;
 mod integer;
 #[cfg(feature="std")]
 mod log_gamma;
+mod math_helpers;
 mod other;
 #[cfg(feature="std")]
 mod ziggurat_tables;
 #[cfg(feature="std")]
diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
index 1a2e660a7d2..d908074a560 100644
--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
@@ -116,6 +116,7 @@
 use std::time::Duration;
 use Rng;
 use distributions::Distribution;
 use distributions::float::IntoFloat;
+use distributions::math_helpers::{WideningMultiply, NaturalCompare};
 #[cfg(feature="simd_support")]
 use core::simd::*;
 
@@ -469,87 +470,6 @@ uniform_int_impl! { usize, isize, usize, isize, usize }
 uniform_int_impl! { u128, u128, u128, i128, u128 }
 
 
-trait WideningMultiply<RHS = Self> {
-    type Output;
-
-    fn wmul(self, x: RHS) -> Self::Output;
-}
-
-macro_rules! wmul_impl {
-    ($ty:ty, $wide:ty, $shift:expr) => {
-        impl WideningMultiply for $ty {
-            type Output = ($ty, $ty);
-
-            #[inline(always)]
-            fn wmul(self, x: $ty) -> Self::Output {
-                let tmp = (self as $wide) * (x as $wide);
-                ((tmp >> $shift) as $ty, tmp as $ty)
-            }
-        }
-    }
-}
-wmul_impl! { u8, u16, 8 }
-wmul_impl! { u16, u32, 16 }
-wmul_impl! { u32, u64, 32 }
-#[cfg(feature = "i128_support")]
-wmul_impl! { u64, u128, 64 }
-
-// This code is a translation of the __mulddi3 function in LLVM's
-// compiler-rt. It is an optimised variant of the common method
-// `(a + b) * (c + d) = ac + ad + bc + bd`.
-//
-// For some reason LLVM can optimise the C version very well, but
-// keeps shuffeling registers in this Rust translation.
-macro_rules! wmul_impl_large {
-    ($ty:ty, $half:expr) => {
-        impl WideningMultiply for $ty {
-            type Output = ($ty, $ty);
-
-            #[inline(always)]
-            fn wmul(self, b: $ty) -> Self::Output {
-                const LOWER_MASK: $ty = !0 >> $half;
-                let mut low = (self & LOWER_MASK).wrapping_mul(b & LOWER_MASK);
-                let mut t = low >> $half;
-                low &= LOWER_MASK;
-                t += (self >> $half).wrapping_mul(b & LOWER_MASK);
-                low += (t & LOWER_MASK) << $half;
-                let mut high = t >> $half;
-                t = low >> $half;
-                low &= LOWER_MASK;
-                t += (b >> $half).wrapping_mul(self & LOWER_MASK);
-                low += (t & LOWER_MASK) << $half;
-                high += t >> $half;
-                high += (self >> $half).wrapping_mul(b >> $half);
-
-                (high, low)
-            }
-        }
-    }
-}
-#[cfg(not(feature = "i128_support"))]
-wmul_impl_large! { u64, 32 }
-#[cfg(feature = "i128_support")]
-wmul_impl_large! { u128, 64 }
-
-macro_rules! wmul_impl_usize {
-    ($ty:ty) => {
-        impl WideningMultiply for usize {
-            type Output = (usize, usize);
-
-            #[inline(always)]
-            fn wmul(self, x: usize) -> Self::Output {
-                let (high, low) = (self as $ty).wmul(x as $ty);
-                (high as usize, low as usize)
-            }
-        }
-    }
-}
-#[cfg(target_pointer_width = "32")]
-wmul_impl_usize! { u32 }
-#[cfg(target_pointer_width = "64")]
-wmul_impl_usize! { u64 }
-
-
 /// The back-end implementing [`UniformSampler`] for floating-point types.
 ///
@@ -597,7 +517,7 @@ macro_rules! uniform_float_impl {
             {
                 let low = *low_b.borrow();
                 let high = *high_b.borrow();
-                assert!(low < high, "Uniform::new called with `low >= high`");
+                assert!(low.cmp_lt(high),
+                        "Uniform::new called with `low >= high`");
                 let scale = high - low;
                 let offset = low - scale;
                 UniformFloat {
@@ -612,7 +533,7 @@ macro_rules! uniform_float_impl {
             {
                 let low = *low_b.borrow();
                 let high = *high_b.borrow();
-                assert!(low <= high,
+                assert!(low.cmp_le(high),
                         "Uniform::new_inclusive called with `low > high`");
                 let scale = high - low;
                 let offset = low - scale;
@@ -641,7 +562,7 @@ macro_rules! uniform_float_impl {
             {
                 let low = *low_b.borrow();
                 let high = *high_b.borrow();
-                assert!(low < high,
+                assert!(low.cmp_lt(high),
                         "Uniform::sample_single called with low >= high");
                 let scale = high - low;
                 let offset = low - scale;

From 5c948fe05a565d620099f87650b5965aa232c980 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Tue, 26 Jun 2018 14:45:12 +0200
Subject: [PATCH 6/6] Rename NaturalCompare and math_helpers

---
 src/distributions/float.rs        |  2 +-
 src/distributions/mod.rs          |  2 +-
 src/distributions/uniform.rs      |  8 ++---
 .../{math_helpers.rs => utils.rs} | 29 ++++++++++---------
 4 files changed, 22 insertions(+), 19 deletions(-)
 rename src/distributions/{math_helpers.rs => utils.rs} (85%)

diff --git a/src/distributions/float.rs b/src/distributions/float.rs
index facd753eba2..9bf4a69ab0b 100644
--- a/src/distributions/float.rs
+++ b/src/distributions/float.rs
@@ -13,7 +13,7 @@ use core::mem;
 use Rng;
 use distributions::{Distribution, Standard};
-use distributions::math_helpers::CastFromInt;
+use distributions::utils::CastFromInt;
 #[cfg(feature="simd_support")]
 use core::simd::*;
 
diff --git a/src/distributions/mod.rs b/src/distributions/mod.rs
index 0f0420c9d89..06704664b29 100644
--- a/src/distributions/mod.rs
+++ b/src/distributions/mod.rs
@@ -214,8 +214,8 @@
 mod float;
 mod integer;
 #[cfg(feature="std")]
 mod log_gamma;
-mod math_helpers;
 mod other;
+mod utils;
 #[cfg(feature="std")]
 mod ziggurat_tables;
 #[cfg(feature="std")]
diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
index d908074a560..9db0f59600e 100644
--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
@@ -116,7 +116,7 @@
 use std::time::Duration;
 use Rng;
 use distributions::Distribution;
 use distributions::float::IntoFloat;
-use distributions::math_helpers::{WideningMultiply, NaturalCompare};
+use distributions::utils::{WideningMultiply, CompareAll};
 #[cfg(feature="simd_support")]
 use core::simd::*;
 
@@ -517,7 +517,7 @@ macro_rules! uniform_float_impl {
             {
                 let low = *low_b.borrow();
                 let high = *high_b.borrow();
-                assert!(low.cmp_lt(high),
+                assert!(low.all_lt(high),
                         "Uniform::new called with `low >= high`");
                 let scale = high - low;
                 let offset = low - scale;
@@ -533,7 +533,7 @@ macro_rules! uniform_float_impl {
             {
                 let low = *low_b.borrow();
                 let high = *high_b.borrow();
-                assert!(low.cmp_le(high),
+                assert!(low.all_le(high),
                         "Uniform::new_inclusive called with `low > high`");
                 let scale = high - low;
                 let offset = low - scale;
@@ -562,7 +562,7 @@ macro_rules! uniform_float_impl {
             {
                 let low = *low_b.borrow();
                 let high = *high_b.borrow();
-                assert!(low.cmp_lt(high),
+                assert!(low.all_lt(high),
                         "Uniform::sample_single called with low >= high");
                 let scale = high - low;
                 let offset = low - scale;
diff --git a/src/distributions/math_helpers.rs b/src/distributions/utils.rs
similarity index 85%
rename from src/distributions/math_helpers.rs
rename to src/distributions/utils.rs
index 08f2e719227..f5fef26eb9a 100644
--- a/src/distributions/math_helpers.rs
+++ b/src/distributions/utils.rs
@@ -125,29 +125,32 @@ macro_rules! simd_float_from_int {
 #[cfg(feature="simd_support")] simd_float_from_int! { f64x8, u64x8 }
 
 
-/// `PartialOrd` for vectors compares lexicographically. We want natural order.
+/// `PartialOrd` for vectors compares lexicographically. We want to compare all
+/// the individual SIMD lanes instead, and get the combined result over all
+/// lanes. This is possible using something like `a.lt(b).all()`, but we
+/// implement it as a trait so we can write the same code for `f32` and `f64`.
 /// Only the comparison functions we need are implemented.
-pub trait NaturalCompare {
-    fn cmp_lt(self, other: Self) -> bool;
-    fn cmp_le(self, other: Self) -> bool;
+pub trait CompareAll {
+    fn all_lt(self, other: Self) -> bool;
+    fn all_le(self, other: Self) -> bool;
 }
 
-impl NaturalCompare for f32 {
-    fn cmp_lt(self, other: Self) -> bool { self < other }
-    fn cmp_le(self, other: Self) -> bool { self <= other }
+impl CompareAll for f32 {
+    fn all_lt(self, other: Self) -> bool { self < other }
+    fn all_le(self, other: Self) -> bool { self <= other }
 }
 
-impl NaturalCompare for f64 {
-    fn cmp_lt(self, other: Self) -> bool { self < other }
-    fn cmp_le(self, other: Self) -> bool { self <= other }
+impl CompareAll for f64 {
+    fn all_lt(self, other: Self) -> bool { self < other }
+    fn all_le(self, other: Self) -> bool { self <= other }
 }
 
 #[cfg(feature="simd_support")]
 macro_rules! simd_less_then {
     ($ty:ident) => {
-        impl NaturalCompare for $ty {
-            fn cmp_lt(self, other: Self) -> bool { self.lt(other).all() }
-            fn cmp_le(self, other: Self) -> bool { self.le(other).all() }
+        impl CompareAll for $ty {
+            fn all_lt(self, other: Self) -> bool { self.lt(other).all() }
+            fn all_le(self, other: Self) -> bool { self.le(other).all() }
         }
     }
 }
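
Usage sketch (not part of the patch series): the snippet below illustrates how the SIMD support introduced by these patches could be exercised from a crate depending on rand. It assumes a nightly toolchain, rand built with the `nightly` feature (which enables `simd_support`), and the `core::simd` vector types targeted above; the function and variable names are illustrative only.

    #![feature(stdsimd)]

    extern crate rand;

    use core::simd::{f32x4, u32x4};
    use rand::{thread_rng, Rng};
    use rand::distributions::{Standard, Uniform};

    fn main() {
        let mut rng = thread_rng();

        // Patch 2: Standard fills every integer lane with an independent sample.
        let ints: u32x4 = rng.gen();

        // Patch 3: Standard floats put each lane in the half-open interval [0, 1).
        let floats: f32x4 = rng.sample(Standard);

        // Patch 4: Uniform takes per-lane bounds, built here with splat.
        let range = Uniform::new(f32x4::splat(-1.0), f32x4::splat(1.0));
        let v: f32x4 = rng.sample(range);

        println!("{:?} {:?} {:?}", ints, floats, v);
    }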