From 43267f7d931fb438cfa627c923d3bb612981d5b4 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Thu, 7 Dec 2017 20:35:58 +0100
Subject: [PATCH 1/5] Use a sign test for bools

---
 src/distributions/uniform.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs
index 5dc9fc54cfb..fce3b6c9746 100644
--- a/src/distributions/uniform.rs
+++ b/src/distributions/uniform.rs
@@ -184,7 +184,11 @@ impl Distribution for Uniform {
 impl Distribution<bool> for Uniform {
     #[inline]
     fn sample<R: Rng>(&self, rng: &mut R) -> bool {
-        rng.next_u32() & 1 == 1
+        // We can compare against an arbitrary bit of a u32 to get a bool.
+        // Because the least significant bits of a lower-quality RNG can
+        // have simple patterns, we compare against the most significant
+        // bit. This is most easily done with a sign test.
+        (rng.next_u32() as i32) < 0
     }
 }
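
A quick standalone check of the sign test above (a minimal sketch; the
`bool_from_msb` helper, the test values and the `main` harness are mine,
not part of the patch):

    // `(x as i32) < 0` is true exactly when bit 31, the most
    // significant bit of the u32, is set.
    fn bool_from_msb(x: u32) -> bool {
        (x as i32) < 0
    }

    fn main() {
        for &x in &[0u32, 1, 0x7fff_ffff, 0x8000_0000, 0xffff_ffff] {
            assert_eq!(bool_from_msb(x), x >> 31 == 1);
        }
        println!("sign test agrees with bit 31 on all samples");
    }
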
From c8c720ebdc7f16f498738b02947bc0dea64e97aa Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Thu, 7 Dec 2017 20:40:00 +0100
Subject: [PATCH 2/5] Ignore the 3 least significant bits for the ziggurat
 layer

---
 src/distributions/mod.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/distributions/mod.rs b/src/distributions/mod.rs
index 2f5573a40d2..4c93fe5cc04 100644
--- a/src/distributions/mod.rs
+++ b/src/distributions/mod.rs
@@ -165,6 +165,8 @@ fn ziggurat(
     // Of the remaining 11 least significant bits we use 8 to construct `i`.
     // This saves us generating a whole extra random number, while the added
     // precision of using 64 bits for f64 does not buy us much.
+    // Because for some RNGs the least significant bits can be of lower
+    // statistical quality, we use bits 3..=10 for `i`.
     let bits: u64 = uniform(rng);

     // u is either U(-1, 1) or U(0, 1) depending on if this is a
@@ -176,7 +178,7 @@ fn ziggurat(
     } else {
         bits.closed_open01_fixed()
     };
-    let i = (bits & 0xff) as usize;
+    let i = ((bits >> 3) & 0xff) as usize;

     let x = u * x_tab[i];
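
To make the bit selection concrete, a small sketch of what
`(bits >> 3) & 0xff` extracts (the helper name and test pattern are mine):

    // Keeps bits 3..=10 of the raw value, so the three lowest bits
    // never influence the ziggurat table index.
    fn ziggurat_index(bits: u64) -> usize {
        ((bits >> 3) & 0xff) as usize
    }

    fn main() {
        let bits = 0b101_1001_0110u64;
        assert_eq!(ziggurat_index(bits), 0b1011_0010);
        // Flipping the three low bits leaves the index unchanged:
        assert_eq!(ziggurat_index(bits ^ 0b111), ziggurat_index(bits));
    }
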
From 3f22ddd7eabc6d5bc500a20d5bfa648d05007305 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Sun, 10 Dec 2017 09:02:59 +0100
Subject: [PATCH 3/5] Use widening multiply instead of modulus in RangeInt

---
 benches/distributions.rs   |   5 +-
 src/distributions/range.rs | 146 +++++++++++++++++++++++++++++++++----
 src/sequences/weighted.rs  |  19 +++--
 3 files changed, 147 insertions(+), 23 deletions(-)

diff --git a/benches/distributions.rs b/benches/distributions.rs
index 8424bd6eb3f..d7415e9fb74 100644
--- a/benches/distributions.rs
+++ b/benches/distributions.rs
@@ -1,4 +1,5 @@
 #![feature(test)]
+#![cfg_attr(feature = "i128_support", feature(i128_type, i128))]
 extern crate test;
 extern crate rand;

@@ -46,7 +47,9 @@ macro_rules! distr_range_int {
 distr_range_int!(distr_range_i8, i8, 20i8, 100);
 distr_range_int!(distr_range_i16, i16, -500i16, 2000);
 distr_range_int!(distr_range_i32, i32, -200_000_000i32, 800_000_000);
-distr_range_int!(distr_range_i64, i64, 3i64, 134217671);
+distr_range_int!(distr_range_i64, i64, 3i64, 12345678901234);
+#[cfg(feature = "i128_support")]
+distr_range_int!(distr_range_i128, i128, -12345678901234i128, 12345678901234567890);

 macro_rules! distr_float {

diff --git a/src/distributions/range.rs b/src/distributions/range.rs
index e30d69fbace..06d8ddaf464 100644
--- a/src/distributions/range.rs
+++ b/src/distributions/range.rs
@@ -106,7 +106,7 @@ pub struct RangeInt {
 macro_rules! range_int_impl {
     ($ty:ty, $signed:ident, $unsigned:ident,
-     $i_large:ident, $u_large:ident) => {
+     $i_large:ident, $u_large:ident, $use_mult:expr) => {
         impl SampleRange for $ty {
             type T = RangeInt<$ty>;
         }
@@ -159,6 +159,16 @@ macro_rules! range_int_impl {
             // small integer. For a u8 it only ever uses 7 bits. This means
             // that all but the last 7 bits of `zone` are always 1s (or 15
             // in the case of u16). So nothing is lost by truncating `zone`.
+            //
+            // An alternative to using a modulus is a widening multiply:
+            // after a widening multiply by `range`, the result is in the
+            // high word. Then comparing the low word against `zone` makes
+            // sure our distribution is uniform.
+            //
+            // FIXME: This method is not used for i128/u128. It would
+            // probably help performance a lot, because a 128-bit modulus
+            // is terribly slow, but it means hand-coding a big-integer
+            // widening multiply.

             let unsigned_max: $u_large = ::core::$u_large::MAX;
@@ -192,8 +202,15 @@ macro_rules! range_int_impl {
                 let zone = self.zone as $signed as $i_large as $u_large;
                 loop {
                     let v: $u_large = Rand::rand(rng, Uniform);
-                    if v <= zone {
-                        return self.low.wrapping_add((v % range) as $ty);
+                    if $use_mult {
+                        let (high, low) = v.wmul(range);
+                        if low <= zone {
+                            return self.low.wrapping_add(high as $ty);
+                        }
+                    } else {
+                        if v <= zone {
+                            return self.low.wrapping_add((v % range) as $ty);
+                        }
                     }
                 }
             } else {
@@ -205,20 +222,119 @@ macro_rules! range_int_impl {
         }
     }
 }

-range_int_impl! { i8, i8, u8, i32, u32 }
-range_int_impl! { i16, i16, u16, i32, u32 }
-range_int_impl! { i32, i32, u32, i32, u32 }
-range_int_impl! { i64, i64, u64, i64, u64 }
+range_int_impl! { i8, i8, u8, i32, u32, true }
+range_int_impl! { i16, i16, u16, i32, u32, true }
+range_int_impl! { i32, i32, u32, i32, u32, true }
+range_int_impl! { i64, i64, u64, i64, u64, true }
 #[cfg(feature = "i128_support")]
-range_int_impl! { i128, i128, u128, u128, u128 }
-range_int_impl! { isize, isize, usize, isize, usize }
-range_int_impl! { u8, i8, u8, i32, u32 }
-range_int_impl! { u16, i16, u16, i32, u32 }
-range_int_impl! { u32, i32, u32, i32, u32 }
-range_int_impl! { u64, i64, u64, i64, u64 }
+range_int_impl! { i128, i128, u128, u128, u128, false }
+range_int_impl! { isize, isize, usize, isize, usize, true }
+range_int_impl! { u8, i8, u8, i32, u32, true }
+range_int_impl! { u16, i16, u16, i32, u32, true }
+range_int_impl! { u32, i32, u32, i32, u32, true }
+range_int_impl! { u64, i64, u64, i64, u64, true }
+range_int_impl! { usize, isize, usize, isize, usize, true }
 #[cfg(feature = "i128_support")]
-range_int_impl! { u128, u128, u128, i128, u128 }
-range_int_impl! { usize, isize, usize, isize, usize }
+range_int_impl! { u128, u128, u128, i128, u128, false }
+
+
+trait WideningMultiply<RHS = Self> {
+    type Output;
+
+    fn wmul(self, x: RHS) -> Self::Output;
+}
+
+macro_rules! wmul_impl {
+    ($ty:ty, $wide:ident, $shift:expr) => {
+        impl WideningMultiply for $ty {
+            type Output = ($ty, $ty);
+
+            #[inline(always)]
+            fn wmul(self, x: $ty) -> Self::Output {
+                let tmp = (self as $wide) * (x as $wide);
+                ((tmp >> $shift) as $ty, tmp as $ty)
+            }
+        }
+    }
+}
+
+wmul_impl! { u8, u16, 8 }
+wmul_impl! { u16, u32, 16 }
+wmul_impl! { u32, u64, 32 }
+
+#[cfg(feature = "i128_support")]
+wmul_impl! { u64, u128, 64 }
+
+#[cfg(not(feature = "i128_support"))]
+impl WideningMultiply for u64 {
+    type Output = (u64, u64);
+
+    // This code is a translation of the __mulddi3 function in LLVM's
+    // compiler-rt. It is an optimised variant of the common method
+    // `(a + b) * (c + d) = ac + ad + bc + bd`.
+    //
+    // For some reason LLVM can optimise the C version very well, but
+    // keeps shuffling registers in this Rust translation.
+    #[inline(always)]
+    fn wmul(self, b: u64) -> Self::Output {
+        const LOWER_MASK: u64 = !0u64 >> 32;
+        let mut low = (self & LOWER_MASK).wrapping_mul(b & LOWER_MASK);
+        let mut t = low >> 32;
+        low &= LOWER_MASK;
+        t += (self >> 32).wrapping_mul(b & LOWER_MASK);
+        low += (t & LOWER_MASK) << 32;
+        let mut high = (t >> 32) as i64;
+        t = low >> 32;
+        low &= LOWER_MASK;
+        t += (b >> 32).wrapping_mul(self & LOWER_MASK);
+        low += (t & LOWER_MASK) << 32;
+        high += (t >> 32) as i64;
+        high += (self >> 32).wrapping_mul(b >> 32) as i64;
+
+        (high as u64, low)
+    }
+}
+
+
+macro_rules! wmul_impl_usize {
+    ($ty:ty) => {
+        impl WideningMultiply for usize {
+            type Output = (usize, usize);
+
+            #[inline(always)]
+            fn wmul(self, x: usize) -> Self::Output {
+                let (high, low) = (self as $ty).wmul(x as $ty);
+                (high as usize, low as usize)
+            }
+        }
+    }
+}
+
+#[cfg(target_pointer_width = "32")]
+wmul_impl_usize! { u32 }
+#[cfg(target_pointer_width = "64")]
+wmul_impl_usize! { u64 }
+
+#[cfg(feature = "i128_support")]
+macro_rules! wmul_impl_128 {
+    ($ty:ty) => {
+        impl WideningMultiply for $ty {
+            type Output = ($ty, $ty);
+
+            #[inline(always)]
+            fn wmul(self, _: $ty) -> Self::Output {
+                unimplemented!();
+            }
+        }
+    }
+}
+
+#[cfg(feature = "i128_support")]
+wmul_impl_128! { i128 }
+#[cfg(feature = "i128_support")]
+wmul_impl_128! { u128 }
+
+
 /// Implementation of `RangeImpl` for float types.
 #[derive(Clone, Copy, Debug)]

diff --git a/src/sequences/weighted.rs b/src/sequences/weighted.rs
index 6589fa0a24b..37d88c0e610 100644
--- a/src/sequences/weighted.rs
+++ b/src/sequences/weighted.rs
@@ -141,17 +141,22 @@ mod tests {
     #[test]
     fn test_weighted_choice() {
         // This makes assumptions about the internal implementation of
-        // WeightedChoice, specifically: it doesn't reorder the items,
-        // it doesn't do weird things to the RNG (so 0 maps to 0, 1 to
-        // 1, internally; modulo a modulo operation).
+        // WeightedChoice. It may fail when the implementation of
+        // `distributions::range::RangeInt` changes.
         macro_rules! t {
             ($items:expr, $expected:expr) => {{
                 let items = $items;
+                let mut total_weight = 0;
+                for item in &items { total_weight += item.weight; }
+
                 let wc = WeightedChoice::new(items);
                 let expected = $expected;

-                let mut rng = MockAddRng::new(0, 1);
+                // Use extremely large steps between the random numbers,
+                // because we test with small ranges and RangeInt is
+                // designed to prefer the most significant bits.
+                let mut rng = MockAddRng::new(0, !0 / (total_weight as u64));

                 for &val in expected.iter() {
                     assert_eq!(wc.sample(&mut rng), val)
@@ -166,12 +171,12 @@ fn test_weighted_choice() {
                 Weighted { weight: 2, item: 21},
                 Weighted { weight: 0, item: 22},
                 Weighted { weight: 1, item: 23}),
-           [21,21, 23]);
+           [21, 21, 23]);

         // different weights
         t!(vec!(Weighted { weight: 4, item: 30},
                 Weighted { weight: 3, item: 31}),
-           [30,30,30,30, 31,31,31]);
+           [30, 31, 30, 31, 30, 31, 30]);

         // check that we're binary searching
         // correctly with some vectors of odd
@@ -189,7 +194,7 @@ fn test_weighted_choice() {
                 Weighted { weight: 1, item: 54},
                 Weighted { weight: 1, item: 55},
                 Weighted { weight: 1, item: 56}),
-           [50, 51, 52, 53, 54, 55, 56]);
+           [50, 54, 51, 55, 52, 56, 53]);
     }

     #[test]
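
For reference, a self-contained sketch of the rejection scheme this patch
implements, specialised to u32 with a u64 widening multiply (the function
names and the toy xorshift generator are mine; the real code is generic
over the integer types and carries `zone` inside the range object):

    // Widening multiply: compute `v * range` in 64 bits and split it.
    // The high word is the candidate sample in 0..range; the low word
    // decides acceptance.
    fn wmul_u32(a: u32, b: u32) -> (u32, u32) {
        let t = (a as u64) * (b as u64);
        ((t >> 32) as u32, t as u32)
    }

    fn sample_range(range: u32, mut next_u32: impl FnMut() -> u32) -> u32 {
        // Reject the partial stripe at the top of the input space, so
        // every value in 0..range is produced by the same number of
        // inputs.
        let ints_to_reject = (u32::MAX - range + 1) % range;
        let zone = u32::MAX - ints_to_reject;
        loop {
            let v = next_u32();
            let (high, low) = wmul_u32(v, range);
            if low <= zone {
                return high;
            }
        }
    }

    fn main() {
        let mut state = 0x9E37_79B9u32;
        let mut rng = move || {
            // A toy xorshift32, good enough for a demo.
            state ^= state << 13;
            state ^= state >> 17;
            state ^= state << 5;
            state
        };
        let mut counts = [0u32; 7];
        for _ in 0..70_000 {
            counts[sample_range(7, &mut rng) as usize] += 1;
        }
        println!("{:?}", counts); // buckets come out roughly equal
    }
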
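And a property check of the portable `__mulddi3`-style 64-bit widening
multiply against native 128-bit arithmetic (the harness is mine, and the
arithmetic is simplified to unsigned u64 where the patch uses i64
intermediates; it assumes a toolchain with `u128`, i.e. any modern Rust):

    // Same halving scheme as the u64 fallback in the patch: split both
    // operands into 32-bit halves and combine the four partial products.
    fn wmul_u64(a: u64, b: u64) -> (u64, u64) {
        const LOWER_MASK: u64 = !0u64 >> 32;
        let mut low = (a & LOWER_MASK).wrapping_mul(b & LOWER_MASK);
        let mut t = low >> 32;
        low &= LOWER_MASK;
        t += (a >> 32).wrapping_mul(b & LOWER_MASK);
        low += (t & LOWER_MASK) << 32;
        let mut high = t >> 32;
        t = low >> 32;
        low &= LOWER_MASK;
        t += (b >> 32).wrapping_mul(a & LOWER_MASK);
        low += (t & LOWER_MASK) << 32;
        high += t >> 32;
        high += (a >> 32).wrapping_mul(b >> 32);
        (high, low)
    }

    fn main() {
        let pairs = [
            (0u64, 0u64),
            (1 << 32, 1 << 32),
            (u64::MAX, u64::MAX),
            (0xDEAD_BEEF_CAFE_F00D, 0x0123_4567_89AB_CDEF),
        ];
        for &(a, b) in &pairs {
            let (high, low) = wmul_u64(a, b);
            assert_eq!(((high as u128) << 64) | low as u128,
                       (a as u128) * (b as u128));
        }
        println!("wmul_u64 agrees with u128 multiplication");
    }
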
From 68edc9fc4e67b9ebe6b6d36035b0059bee27cd78 Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Mon, 11 Dec 2017 12:34:44 +0100
Subject: [PATCH 4/5] Add a few extra distr benchmarks

---
 benches/distributions.rs | 85 +++++++++++++++++-----------------------
 1 file changed, 35 insertions(+), 50 deletions(-)

diff --git a/benches/distributions.rs b/benches/distributions.rs
index d7415e9fb74..7fded29c9ea 100644
--- a/benches/distributions.rs
+++ b/benches/distributions.rs
@@ -9,29 +9,16 @@ const RAND_BENCH_N: u64 = 1000;
 use std::mem::size_of;
 use test::{black_box, Bencher};

-use rand::{Default, Rand, NewSeeded};
+use rand::NewSeeded;
 use rand::prng::XorShiftRng;
 use rand::distributions::*;

-#[bench]
-fn distr_baseline(b: &mut Bencher) {
-    let mut rng = XorShiftRng::new().unwrap();
-
-    b.iter(|| {
-        for _ in 0..::RAND_BENCH_N {
-            black_box(u64::rand(&mut rng, Default));
-        }
-    });
-    b.bytes = size_of::<u64>() as u64 * ::RAND_BENCH_N;
-}
-
-macro_rules! distr_range_int {
-    ($fnn:ident, $ty:ty, $low:expr, $high:expr) => {
+macro_rules! distr {
+    ($fnn:ident, $ty:ty, $distr:expr) => {
         #[bench]
         fn $fnn(b: &mut Bencher) {
             let mut rng = XorShiftRng::new().unwrap();
-            let distr = Range::new($low, $high);
+            let distr = $distr;

             b.iter(|| {
                 for _ in 0..::RAND_BENCH_N {
@@ -44,41 +31,39 @@
 }

-distr_range_int!(distr_range_i8, i8, 20i8, 100);
-distr_range_int!(distr_range_i16, i16, -500i16, 2000);
-distr_range_int!(distr_range_i32, i32, -200_000_000i32, 800_000_000);
-distr_range_int!(distr_range_i64, i64, 3i64, 12345678901234);
+// range
+distr!(distr_range_i8, i8, Range::new(20i8, 100));
+distr!(distr_range_i16, i16, Range::new(-500i16, 2000));
+distr!(distr_range_i32, i32, Range::new(-200_000_000i32, 800_000_000));
+distr!(distr_range_i64, i64, Range::new(3i64, 12345678901234));
 #[cfg(feature = "i128_support")]
-distr_range_int!(distr_range_i128, i128, -12345678901234i128, 12345678901234567890);
+distr!(distr_range_i128, i128, Range::new(-12345678901234i128, 12345678901234567890));
+
+distr!(distr_range_float32, f32, Range::new(2.26f32, 2.319));
+distr!(distr_range_float, f64, Range::new(2.26f64, 2.319));

-macro_rules! distr_float {
-    ($fnn:ident, $ty:ty, $distr:expr) => {
-        #[bench]
-        fn $fnn(b: &mut Bencher) {
-            let mut rng = XorShiftRng::new().unwrap();
-            let distr = $distr;
-
-            b.iter(|| {
-                for _ in 0..::RAND_BENCH_N {
-                    let x: $ty = distr.sample(&mut rng);
-                    black_box(x);
-                }
-            });
-            b.bytes = size_of::<$ty>() as u64 * ::RAND_BENCH_N;
-        }
-    }
-}
+// uniform
+distr!(distr_uniform_i8, i8, Uniform);
+distr!(distr_uniform_i16, i16, Uniform);
+distr!(distr_uniform_i32, i32, Uniform);
+distr!(distr_uniform_i64, i64, Uniform);
+#[cfg(feature = "i128_support")]
+distr!(distr_uniform_i128, i128, Uniform);
+
+distr!(distr_uniform_bool, bool, Uniform);
+distr!(distr_uniform_ascii_char, char, AsciiWordChar);
+
+distr!(distr_uniform01_float32, f32, Uniform01);
+distr!(distr_closed01_float32, f32, Closed01);
+distr!(distr_open01_float32, f32, Open01);

-distr_float!(distr_uniform01_float32, f32, Uniform01);
-distr_float!(distr_closed01_float32, f32, Closed01);
-distr_float!(distr_open01_float32, f32, Open01);
+distr!(distr_uniform01_float, f64, Uniform01);
+distr!(distr_closed01_float, f64, Closed01);
+distr!(distr_open01_float, f64, Open01);

-distr_float!(distr_uniform01_float, f64, Uniform01);
-distr_float!(distr_closed01_float, f64, Closed01);
-distr_float!(distr_open01_float, f64, Open01);
-distr_float!(distr_range_float, f64, Range::new(2.26f64, 2.319f64));
-distr_float!(distr_exp, f64, Exp::new(2.71828 * 3.14159));
-distr_float!(distr_normal, f64, Normal::new(-2.71828, 3.14159));
-distr_float!(distr_log_normal, f64, LogNormal::new(-2.71828, 3.14159));
-distr_float!(distr_gamma_large_shape, f64, Gamma::new(10., 1.0));
-distr_float!(distr_gamma_small_shape, f64, Gamma::new(0.1, 1.0));
+// distributions
+distr!(distr_exp, f64, Exp::new(2.71828 * 3.14159));
+distr!(distr_normal, f64, Normal::new(-2.71828, 3.14159));
+distr!(distr_log_normal, f64, LogNormal::new(-2.71828, 3.14159));
+distr!(distr_gamma_large_shape, f64, Gamma::new(10., 1.0));
+distr!(distr_gamma_small_shape, f64, Gamma::new(0.1, 1.0));
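
For readers skimming the macro, here is one instance of `distr!`
hand-expanded from the macro body above, in the context of this bench
file (my manual substitution, not compiler output):

    #[bench]
    fn distr_range_i8(b: &mut Bencher) {
        let mut rng = XorShiftRng::new().unwrap();
        let distr = Range::new(20i8, 100);

        b.iter(|| {
            for _ in 0..::RAND_BENCH_N {
                let x: i8 = distr.sample(&mut rng);
                black_box(x);
            }
        });
        b.bytes = size_of::<i8>() as u64 * ::RAND_BENCH_N;
    }
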
From 173c2b4c64ad08584043a179f1ac0697946c4cbe Mon Sep 17 00:00:00 2001
From: Paul Dicker
Date: Mon, 11 Dec 2017 13:40:56 +0100
Subject: [PATCH 5/5] Benchmark constructing and sampling from a range

---
 benches/distributions.rs | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/benches/distributions.rs b/benches/distributions.rs
index 7fded29c9ea..ec2906a7353 100644
--- a/benches/distributions.rs
+++ b/benches/distributions.rs
@@ -67,3 +67,29 @@ distr!(distr_normal, f64, Normal::new(-2.71828, 3.14159));
 distr!(distr_log_normal, f64, LogNormal::new(-2.71828, 3.14159));
 distr!(distr_gamma_large_shape, f64, Gamma::new(10., 1.0));
 distr!(distr_gamma_small_shape, f64, Gamma::new(0.1, 1.0));
+
+
+// construct and sample from a range
+macro_rules! gen_range_int {
+    ($fnn:ident, $ty:ty, $low:expr, $high:expr) => {
+        #[bench]
+        fn $fnn(b: &mut Bencher) {
+            let mut rng = XorShiftRng::new().unwrap();
+
+            b.iter(|| {
+                for _ in 0..::RAND_BENCH_N {
+                    let x: $ty = Range::new($low, $high).sample(&mut rng);
+                    black_box(x);
+                }
+            });
+            b.bytes = size_of::<$ty>() as u64 * ::RAND_BENCH_N;
+        }
+    }
+}
+
+gen_range_int!(gen_range_i8, i8, 20i8, 100);
+gen_range_int!(gen_range_i16, i16, -500i16, 2000);
+gen_range_int!(gen_range_i32, i32, -200_000_000i32, 800_000_000);
+gen_range_int!(gen_range_i64, i64, 3i64, 12345678901234);
+#[cfg(feature = "i128_support")]
+gen_range_int!(gen_range_i128, i128, -12345678901234i128, 12345678901234567890);
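
What these `gen_range_*` benchmarks isolate, compared with the
`distr_range_*` ones from patch 4: `Range::new` pays for the `zone`
modulus on every iteration, while the `distr` benchmarks pay it once and
then only multiply and compare. A runnable toy model of the two usage
patterns, reusing the widening-multiply acceptance from patch 3 (the
`ToyRange` type and the fake RNG are mine, standing in for `RangeInt`;
this is not the rand API):

    // `new` does the expensive division; `sample` only multiplies,
    // compares and adds -- mirroring RangeInt's split of work.
    struct ToyRange { low: u32, range: u32, zone: u32 }

    impl ToyRange {
        fn new(low: u32, high: u32) -> ToyRange {
            let range = high - low;
            let ints_to_reject = (u32::MAX - range + 1) % range;
            ToyRange { low, range, zone: u32::MAX - ints_to_reject }
        }

        fn sample(&self, mut next_u32: impl FnMut() -> u32) -> u32 {
            loop {
                let t = (next_u32() as u64) * (self.range as u64);
                if (t as u32) <= self.zone {
                    return self.low + (t >> 32) as u32;
                }
            }
        }
    }

    fn main() {
        let mut v = 0u32;
        let mut fake_rng = move || { v = v.wrapping_add(0x9E37_79B9); v };

        // distr_range_*: construct once, sample in a loop.
        let r = ToyRange::new(20, 100);
        let a: u32 = (0..1000).map(|_| r.sample(&mut fake_rng)).sum();

        // gen_range_*: construct inside the loop, paying the modulus
        // on every iteration.
        let b: u32 = (0..1000)
            .map(|_| ToyRange::new(20, 100).sample(&mut fake_rng))
            .sum();

        println!("sums: {} {}", a, b);
    }
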