Skip to content

Commit

Permalink
Add KS tests for weighted sampling; use A-ExpJ alg with log-keys (#1530)
Browse files Browse the repository at this point in the history
- Extra testing for weighted sampling
- Fix IndexedRandom::choose_multiple_weighted with very small keys
- Use A-ExpJ algorithm with BinaryHeap for better performance with large length / amount
  • Loading branch information
dhardy authored Nov 26, 2024
1 parent 0ff946c commit bf9d429
Show file tree
Hide file tree
Showing 4 changed files with 277 additions and 30 deletions.
2 changes: 1 addition & 1 deletion distr_test/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ edition = "2021"
publish = false

[dev-dependencies]
rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false }
rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false, features = ["alloc"] }
rand = { path = "..", version = "=0.9.0-alpha.1", features = ["small_rng"] }
num-traits = "0.2.19"
# Special functions for testing distributions
Expand Down
235 changes: 235 additions & 0 deletions distr_test/tests/weighted.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
// Copyright 2024 Developers of the Rand project.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

mod ks;
use ks::test_discrete;
use rand::distr::{Distribution, WeightedIndex};
use rand::seq::{IndexedRandom, IteratorRandom};
use rand_distr::{WeightedAliasIndex, WeightedTreeIndex};

/// Takes the unnormalized pdf and creates the cdf of a discrete distribution
fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 {
let mut cdf = Vec::with_capacity(num);
let mut ac = 0.0;
for i in 0..num {
ac += f(i as i64);
cdf.push(ac);
}

let frac = 1.0 / ac;
for x in &mut cdf {
*x *= frac;
}

move |i| {
if i < 0 {
0.0
} else {
cdf[i as usize]
}
}
}

#[test]
fn weighted_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = WeightedIndex::new((0..num).map(|i| weight(i as i64))).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn weighted_alias_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let weights = (0..num).map(|i| weight(i as i64)).collect();
let distr = WeightedAliasIndex::new(weights).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn weighted_tree_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = WeightedTreeIndex::new((0..num).map(|i| weight(i as i64))).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_weighted_indexed() {
struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
*IndexedRandom::choose_weighted(&self.0[..], rng, |i| (self.1)(*i)).unwrap()
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
test_discrete(0, distr, make_cdf(num, &weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_one_weighted_indexed() {
struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
*IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 1, |i| (self.1)(*i))
.unwrap()
.next()
.unwrap()
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
test_discrete(0, distr, make_cdf(num, &weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_two_weighted_indexed() {
struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
let mut iter =
IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 2, |i| (self.1)(*i))
.unwrap();
let mut a = *iter.next().unwrap();
let mut b = *iter.next().unwrap();
assert!(iter.next().is_none());
if b < a {
std::mem::swap(&mut a, &mut b);
}
a * self.0.len() as i64 + b
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);

let pmf1 = (0..num).map(|i| weight(i as i64)).collect::<Vec<f64>>();
let sum: f64 = pmf1.iter().sum();
let frac = 1.0 / sum;

let mut ac = 0.0;
let mut cdf = Vec::with_capacity(num * num);
for a in 0..num {
for b in 0..num {
if a < b {
let pa = pmf1[a] * frac;
let pab = pa * pmf1[b] / (sum - pmf1[a]);

let pb = pmf1[b] * frac;
let pba = pb * pmf1[a] / (sum - pmf1[b]);

ac += pab + pba;
}
cdf.push(ac);
}
}
assert!((cdf.last().unwrap() - 1.0).abs() < 1e-9);

let cdf = |i| {
if i < 0 {
0.0
} else {
cdf[i as usize]
}
};

test_discrete(0, distr, cdf);
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
test_weights(10, |i| ((i + 1) as f64).powi(-8));
}

#[test]
fn choose_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
IteratorRandom::choose(self.0.clone(), rng).unwrap()
}
}

let distr = Adapter((0..100).map(|i| i as i64));
test_discrete(0, distr, make_cdf(100, |_| 1.0));
}

#[test]
fn choose_stable_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
IteratorRandom::choose_stable(self.0.clone(), rng).unwrap()
}
}

let distr = Adapter((0..100).map(|i| i as i64));
test_discrete(0, distr, make_cdf(100, |_| 1.0));
}

#[test]
fn choose_two_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
let mut buf = [0; 2];
IteratorRandom::choose_multiple_fill(self.0.clone(), rng, &mut buf);
buf.sort_unstable();
assert!(buf[0] < 99 && buf[1] >= 1);
let a = buf[0];
4950 - (99 - a) * (100 - a) / 2 + buf[1] - a - 1
}
}

let distr = Adapter((0..100).map(|i| i as i64));

test_discrete(
0,
distr,
|i| if i < 0 { 0.0 } else { (i + 1) as f64 / 4950.0 },
);
}
54 changes: 33 additions & 21 deletions src/seq/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -333,8 +333,8 @@ where
/// ordering). The weights are to be provided by the input function `weights`,
/// which will be called once for each index.
///
/// This implementation uses the algorithm described by Efraimidis and Spirakis
/// in this paper: <https://doi.org/10.1016/j.ipl.2005.11.003>
/// This implementation is based on the algorithm A-ExpJ as found in
/// [Efraimidis and Spirakis, 2005](https://doi.org/10.1016/j.ipl.2005.11.003).
/// It uses `O(length + amount)` space and `O(length)` time.
///
/// Error cases:
Expand All @@ -354,7 +354,7 @@ where
N: UInt,
IndexVec: From<Vec<N>>,
{
use std::cmp::Ordering;
use std::{cmp::Ordering, collections::BinaryHeap};

if amount == N::zero() {
return Ok(IndexVec::U32(Vec::new()));
Expand All @@ -373,9 +373,9 @@ where

impl<N> Ord for Element<N> {
fn cmp(&self, other: &Self) -> Ordering {
// partial_cmp will always produce a value,
// because we check that the weights are not nan
self.key.partial_cmp(&other.key).unwrap()
// unwrap() should not panic since weights should not be NaN
// We reverse so that BinaryHeap::peek shows the smallest item
self.key.partial_cmp(&other.key).unwrap().reverse()
}
}

Expand All @@ -387,12 +387,14 @@ where

impl<N> Eq for Element<N> {}

let mut candidates = Vec::with_capacity(length.as_usize());
let mut candidates = BinaryHeap::with_capacity(amount.as_usize());
let mut index = N::zero();
while index < length {
while index < length && candidates.len() < amount.as_usize() {
let weight = weight(index.as_usize()).into();
if weight > 0.0 {
let key = rng.random::<f64>().powf(1.0 / weight);
// We use the log of the key used in A-ExpJ to improve precision
// for small weights:
let key = rng.random::<f64>().ln() / weight;
candidates.push(Element { index, key });
} else if !(weight >= 0.0) {
return Err(WeightError::InvalidWeight);
Expand All @@ -401,23 +403,33 @@ where
index += N::one();
}

let avail = candidates.len();
if avail < amount.as_usize() {
if candidates.len() < amount.as_usize() {
return Err(WeightError::InsufficientNonZero);
}

// Partially sort the array to find the `amount` elements with the greatest
// keys. Do this by using `select_nth_unstable` to put the elements with
// the *smallest* keys at the beginning of the list in `O(n)` time, which
// provides equivalent information about the elements with the *greatest* keys.
let (_, mid, greater) = candidates.select_nth_unstable(avail - amount.as_usize());
let mut x = rng.random::<f64>().ln() / candidates.peek().unwrap().key;
while index < length {
let weight = weight(index.as_usize()).into();
if weight > 0.0 {
x -= weight;
if x <= 0.0 {
let min_candidate = candidates.pop().unwrap();
let t = (min_candidate.key * weight).exp();
let key = rng.random_range(t..1.0).ln() / weight;
candidates.push(Element { index, key });

x = rng.random::<f64>().ln() / candidates.peek().unwrap().key;
}
} else if !(weight >= 0.0) {
return Err(WeightError::InvalidWeight);
}

let mut result: Vec<N> = Vec::with_capacity(amount.as_usize());
result.push(mid.index);
for element in greater {
result.push(element.index);
index += N::one();
}
Ok(IndexVec::from(result))

Ok(IndexVec::from(
candidates.iter().map(|elt| elt.index).collect(),
))
}

/// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
Expand Down
16 changes: 8 additions & 8 deletions src/seq/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -732,17 +732,17 @@ mod test {
use super::*;

// The theoretical probabilities of the different outcomes are:
// AB: 0.5 * 0.5 = 0.250
// AC: 0.5 * 0.5 = 0.250
// BA: 0.25 * 0.67 = 0.167
// BC: 0.25 * 0.33 = 0.082
// CA: 0.25 * 0.67 = 0.167
// CB: 0.25 * 0.33 = 0.082
let choices = [('a', 2), ('b', 1), ('c', 1)];
// AB: 0.5 * 0.667 = 0.3333
// AC: 0.5 * 0.333 = 0.1667
// BA: 0.333 * 0.75 = 0.25
// BC: 0.333 * 0.25 = 0.0833
// CA: 0.167 * 0.6 = 0.1
// CB: 0.167 * 0.4 = 0.0667
let choices = [('a', 3), ('b', 2), ('c', 1)];
let mut rng = crate::test::rng(414);

let mut results = [0i32; 3];
let expected_results = [4167, 4167, 1666];
let expected_results = [5833, 2667, 1500];
for _ in 0..10000 {
let result = choices
.choose_multiple_weighted(&mut rng, 2, |item| item.1)
Expand Down

0 comments on commit bf9d429

Please sign in to comment.