From bf9d429e89ee1baa633e7eec5b75f89dcd78e7f6 Mon Sep 17 00:00:00 2001 From: Diggory Hardy Date: Tue, 26 Nov 2024 11:22:55 +0000 Subject: [PATCH] Add KS tests for weighted sampling; use A-ExpJ alg with log-keys (#1530) - Extra testing for weighted sampling - Fix IndexedRandom::choose_multiple_weighted with very small keys - Use A-ExpJ algorithm with BinaryHeap for better performance with large length / amount --- distr_test/Cargo.toml | 2 +- distr_test/tests/weighted.rs | 235 +++++++++++++++++++++++++++++++++++ src/seq/index.rs | 54 ++++---- src/seq/slice.rs | 16 +-- 4 files changed, 277 insertions(+), 30 deletions(-) create mode 100644 distr_test/tests/weighted.rs diff --git a/distr_test/Cargo.toml b/distr_test/Cargo.toml index 7f37853b3c..36314b37cf 100644 --- a/distr_test/Cargo.toml +++ b/distr_test/Cargo.toml @@ -5,7 +5,7 @@ edition = "2021" publish = false [dev-dependencies] -rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false } +rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false, features = ["alloc"] } rand = { path = "..", version = "=0.9.0-alpha.1", features = ["small_rng"] } num-traits = "0.2.19" # Special functions for testing distributions diff --git a/distr_test/tests/weighted.rs b/distr_test/tests/weighted.rs new file mode 100644 index 0000000000..cf87b3ee63 --- /dev/null +++ b/distr_test/tests/weighted.rs @@ -0,0 +1,235 @@ +// Copyright 2024 Developers of the Rand project. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +mod ks; +use ks::test_discrete; +use rand::distr::{Distribution, WeightedIndex}; +use rand::seq::{IndexedRandom, IteratorRandom}; +use rand_distr::{WeightedAliasIndex, WeightedTreeIndex}; + +/// Takes the unnormalized pdf and creates the cdf of a discrete distribution +fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 { + let mut cdf = Vec::with_capacity(num); + let mut ac = 0.0; + for i in 0..num { + ac += f(i as i64); + cdf.push(ac); + } + + let frac = 1.0 / ac; + for x in &mut cdf { + *x *= frac; + } + + move |i| { + if i < 0 { + 0.0 + } else { + cdf[i as usize] + } + } +} + +#[test] +fn weighted_index() { + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let distr = WeightedIndex::new((0..num).map(|i| weight(i as i64))).unwrap(); + test_discrete(0, distr, make_cdf(num, weight)); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} + +#[test] +fn weighted_alias_index() { + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let weights = (0..num).map(|i| weight(i as i64)).collect(); + let distr = WeightedAliasIndex::new(weights).unwrap(); + test_discrete(0, distr, make_cdf(num, weight)); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} + +#[test] +fn weighted_tree_index() { + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let distr = WeightedTreeIndex::new((0..num).map(|i| weight(i as i64))).unwrap(); + test_discrete(0, distr, make_cdf(num, weight)); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} + +#[test] +fn choose_weighted_indexed() { + struct Adapter f64>(Vec, F); + impl f64> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + *IndexedRandom::choose_weighted(&self.0[..], rng, |i| (self.1)(*i)).unwrap() + } + } + + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight); + test_discrete(0, distr, make_cdf(num, &weight)); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} + +#[test] +fn choose_one_weighted_indexed() { + struct Adapter f64>(Vec, F); + impl f64> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + *IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 1, |i| (self.1)(*i)) + .unwrap() + .next() + .unwrap() + } + } + + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight); + test_discrete(0, distr, make_cdf(num, &weight)); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); +} + +#[test] +fn choose_two_weighted_indexed() { + struct Adapter f64>(Vec, F); + impl f64> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + let mut iter = + IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 2, |i| (self.1)(*i)) + .unwrap(); + let mut a = *iter.next().unwrap(); + let mut b = *iter.next().unwrap(); + assert!(iter.next().is_none()); + if b < a { + std::mem::swap(&mut a, &mut b); + } + a * self.0.len() as i64 + b + } + } + + fn test_weights(num: usize, weight: impl Fn(i64) -> f64) { + let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight); + + let pmf1 = (0..num).map(|i| weight(i as i64)).collect::>(); + let sum: f64 = pmf1.iter().sum(); + let frac = 1.0 / sum; + + let mut ac = 0.0; + let mut cdf = Vec::with_capacity(num * num); + for a in 0..num { + for b in 0..num { + if a < b { + let pa = pmf1[a] * frac; + let pab = pa * pmf1[b] / (sum - pmf1[a]); + + let pb = pmf1[b] * frac; + let pba = pb * pmf1[a] / (sum - pmf1[b]); + + ac += pab + pba; + } + cdf.push(ac); + } + } + assert!((cdf.last().unwrap() - 1.0).abs() < 1e-9); + + let cdf = |i| { + if i < 0 { + 0.0 + } else { + cdf[i as usize] + } + }; + + test_discrete(0, distr, cdf); + } + + test_weights(100, |_| 1.0); + test_weights(100, |i| ((i + 1) as f64).ln()); + test_weights(100, |i| i as f64); + test_weights(100, |i| (i as f64).powi(3)); + test_weights(100, |i| 1.0 / ((i + 1) as f64)); + test_weights(10, |i| ((i + 1) as f64).powi(-8)); +} + +#[test] +fn choose_iterator() { + struct Adapter(I); + impl> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + IteratorRandom::choose(self.0.clone(), rng).unwrap() + } + } + + let distr = Adapter((0..100).map(|i| i as i64)); + test_discrete(0, distr, make_cdf(100, |_| 1.0)); +} + +#[test] +fn choose_stable_iterator() { + struct Adapter(I); + impl> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + IteratorRandom::choose_stable(self.0.clone(), rng).unwrap() + } + } + + let distr = Adapter((0..100).map(|i| i as i64)); + test_discrete(0, distr, make_cdf(100, |_| 1.0)); +} + +#[test] +fn choose_two_iterator() { + struct Adapter(I); + impl> Distribution for Adapter { + fn sample(&self, rng: &mut R) -> i64 { + let mut buf = [0; 2]; + IteratorRandom::choose_multiple_fill(self.0.clone(), rng, &mut buf); + buf.sort_unstable(); + assert!(buf[0] < 99 && buf[1] >= 1); + let a = buf[0]; + 4950 - (99 - a) * (100 - a) / 2 + buf[1] - a - 1 + } + } + + let distr = Adapter((0..100).map(|i| i as i64)); + + test_discrete( + 0, + distr, + |i| if i < 0 { 0.0 } else { (i + 1) as f64 / 4950.0 }, + ); +} diff --git a/src/seq/index.rs b/src/seq/index.rs index 70231dde2c..852bdac76c 100644 --- a/src/seq/index.rs +++ b/src/seq/index.rs @@ -333,8 +333,8 @@ where /// ordering). The weights are to be provided by the input function `weights`, /// which will be called once for each index. /// -/// This implementation uses the algorithm described by Efraimidis and Spirakis -/// in this paper: +/// This implementation is based on the algorithm A-ExpJ as found in +/// [Efraimidis and Spirakis, 2005](https://doi.org/10.1016/j.ipl.2005.11.003). /// It uses `O(length + amount)` space and `O(length)` time. /// /// Error cases: @@ -354,7 +354,7 @@ where N: UInt, IndexVec: From>, { - use std::cmp::Ordering; + use std::{cmp::Ordering, collections::BinaryHeap}; if amount == N::zero() { return Ok(IndexVec::U32(Vec::new())); @@ -373,9 +373,9 @@ where impl Ord for Element { fn cmp(&self, other: &Self) -> Ordering { - // partial_cmp will always produce a value, - // because we check that the weights are not nan - self.key.partial_cmp(&other.key).unwrap() + // unwrap() should not panic since weights should not be NaN + // We reverse so that BinaryHeap::peek shows the smallest item + self.key.partial_cmp(&other.key).unwrap().reverse() } } @@ -387,12 +387,14 @@ where impl Eq for Element {} - let mut candidates = Vec::with_capacity(length.as_usize()); + let mut candidates = BinaryHeap::with_capacity(amount.as_usize()); let mut index = N::zero(); - while index < length { + while index < length && candidates.len() < amount.as_usize() { let weight = weight(index.as_usize()).into(); if weight > 0.0 { - let key = rng.random::().powf(1.0 / weight); + // We use the log of the key used in A-ExpJ to improve precision + // for small weights: + let key = rng.random::().ln() / weight; candidates.push(Element { index, key }); } else if !(weight >= 0.0) { return Err(WeightError::InvalidWeight); @@ -401,23 +403,33 @@ where index += N::one(); } - let avail = candidates.len(); - if avail < amount.as_usize() { + if candidates.len() < amount.as_usize() { return Err(WeightError::InsufficientNonZero); } - // Partially sort the array to find the `amount` elements with the greatest - // keys. Do this by using `select_nth_unstable` to put the elements with - // the *smallest* keys at the beginning of the list in `O(n)` time, which - // provides equivalent information about the elements with the *greatest* keys. - let (_, mid, greater) = candidates.select_nth_unstable(avail - amount.as_usize()); + let mut x = rng.random::().ln() / candidates.peek().unwrap().key; + while index < length { + let weight = weight(index.as_usize()).into(); + if weight > 0.0 { + x -= weight; + if x <= 0.0 { + let min_candidate = candidates.pop().unwrap(); + let t = (min_candidate.key * weight).exp(); + let key = rng.random_range(t..1.0).ln() / weight; + candidates.push(Element { index, key }); + + x = rng.random::().ln() / candidates.peek().unwrap().key; + } + } else if !(weight >= 0.0) { + return Err(WeightError::InvalidWeight); + } - let mut result: Vec = Vec::with_capacity(amount.as_usize()); - result.push(mid.index); - for element in greater { - result.push(element.index); + index += N::one(); } - Ok(IndexVec::from(result)) + + Ok(IndexVec::from( + candidates.iter().map(|elt| elt.index).collect(), + )) } /// Randomly sample exactly `amount` indices from `0..length`, using Floyd's diff --git a/src/seq/slice.rs b/src/seq/slice.rs index 4144f91345..1fc10c0985 100644 --- a/src/seq/slice.rs +++ b/src/seq/slice.rs @@ -732,17 +732,17 @@ mod test { use super::*; // The theoretical probabilities of the different outcomes are: - // AB: 0.5 * 0.5 = 0.250 - // AC: 0.5 * 0.5 = 0.250 - // BA: 0.25 * 0.67 = 0.167 - // BC: 0.25 * 0.33 = 0.082 - // CA: 0.25 * 0.67 = 0.167 - // CB: 0.25 * 0.33 = 0.082 - let choices = [('a', 2), ('b', 1), ('c', 1)]; + // AB: 0.5 * 0.667 = 0.3333 + // AC: 0.5 * 0.333 = 0.1667 + // BA: 0.333 * 0.75 = 0.25 + // BC: 0.333 * 0.25 = 0.0833 + // CA: 0.167 * 0.6 = 0.1 + // CB: 0.167 * 0.4 = 0.0667 + let choices = [('a', 3), ('b', 2), ('c', 1)]; let mut rng = crate::test::rng(414); let mut results = [0i32; 3]; - let expected_results = [4167, 4167, 1666]; + let expected_results = [5833, 2667, 1500]; for _ in 0..10000 { let result = choices .choose_multiple_weighted(&mut rng, 2, |item| item.1)