Add KS tests for weighted sampling #1530

Merged: 21 commits, Nov 26, 2024

Changes from all commits
2 changes: 1 addition & 1 deletion distr_test/Cargo.toml
@@ -5,7 +5,7 @@ edition = "2021"
publish = false

[dev-dependencies]
-rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false }
+rand_distr = { path = "../rand_distr", version = "=0.5.0-alpha.1", default-features = false, features = ["alloc"] }
rand = { path = "..", version = "=0.9.0-alpha.1", features = ["small_rng"] }
num-traits = "0.2.19"
# Special functions for testing distributions
235 changes: 235 additions & 0 deletions distr_test/tests/weighted.rs
@@ -0,0 +1,235 @@
// Copyright 2024 Developers of the Rand project.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

mod ks;
use ks::test_discrete;
use rand::distr::{Distribution, WeightedIndex};
use rand::seq::{IndexedRandom, IteratorRandom};
use rand_distr::{WeightedAliasIndex, WeightedTreeIndex};

/// Takes the unnormalized pmf and creates the cdf of a discrete distribution
fn make_cdf(num: usize, f: impl Fn(i64) -> f64) -> impl Fn(i64) -> f64 {
let mut cdf = Vec::with_capacity(num);
let mut ac = 0.0;
for i in 0..num {
ac += f(i as i64);
cdf.push(ac);
}

let frac = 1.0 / ac;
for x in &mut cdf {
*x *= frac;
}

move |i| {
if i < 0 {
0.0
} else {
cdf[i as usize]
}
}
}

#[test]
fn weighted_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = WeightedIndex::new((0..num).map(|i| weight(i as i64))).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn weighted_alias_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let weights = (0..num).map(|i| weight(i as i64)).collect();
let distr = WeightedAliasIndex::new(weights).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn weighted_tree_index() {
fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = WeightedTreeIndex::new((0..num).map(|i| weight(i as i64))).unwrap();
test_discrete(0, distr, make_cdf(num, weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_weighted_indexed() {
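// Adapter wraps a weighted slice choice as a `Distribution`, so the generic
// KS harness (`test_discrete`) can drive it without changing the KS test API.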
struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
*IndexedRandom::choose_weighted(&self.0[..], rng, |i| (self.1)(*i)).unwrap()
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
test_discrete(0, distr, make_cdf(num, &weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_one_weighted_indexed() {
struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
*IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 1, |i| (self.1)(*i))
.unwrap()
.next()
.unwrap()
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);
test_discrete(0, distr, make_cdf(num, &weight));
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
}

#[test]
fn choose_two_weighted_indexed() {
Collaborator:

This is probably more complex than needed, but looks correct.
It's probably worth implementing chi squared at some point, but this should also be quite sensitive.

Member (author):

> This is probably more complex than needed, but looks correct.

You mean the use of an Adapter? Yes, but I'd sooner do this than revise the KS test API (which is well adapted for other usages).

> It's probably worth implementing chi squared at some point, but this should also be quite sensitive.

A fair point.

Collaborator:

I mean using KS for these distributions (chi-squared would be more straightforward); the Adapter I think is fine.
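For reference, the chi-squared statistic mentioned above could be computed along these lines. This is only a sketch and not part of the PR; `chi_squared_statistic`, `observed` and `expected_p` are hypothetical names, and the result would still have to be compared against a critical value of the chi-squared distribution with `k - 1` degrees of freedom.

```rust
/// Chi-squared goodness-of-fit statistic for observed counts against
/// hypothesised category probabilities.
fn chi_squared_statistic(observed: &[u64], expected_p: &[f64]) -> f64 {
    let n: u64 = observed.iter().sum();
    observed
        .iter()
        .zip(expected_p)
        .map(|(&o, &p)| {
            let e = n as f64 * p; // expected count under the null hypothesis
            (o as f64 - e).powi(2) / e
        })
        .sum()
}

fn main() {
    // Hypothetical counts from 10000 samples of three equally weighted categories.
    let observed = [3341u64, 3287, 3372];
    let expected_p = [1.0 / 3.0; 3];
    println!("X^2 = {:.3}", chi_squared_statistic(&observed, &expected_p));
}
```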

struct Adapter<F: Fn(i64) -> f64>(Vec<i64>, F);
impl<F: Fn(i64) -> f64> Distribution<i64> for Adapter<F> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
let mut iter =
IndexedRandom::choose_multiple_weighted(&self.0[..], rng, 2, |i| (self.1)(*i))
.unwrap();
let mut a = *iter.next().unwrap();
let mut b = *iter.next().unwrap();
assert!(iter.next().is_none());
if b < a {
std::mem::swap(&mut a, &mut b);
}
a * self.0.len() as i64 + b
}
}

fn test_weights(num: usize, weight: impl Fn(i64) -> f64) {
let distr = Adapter((0..num).map(|i| i as i64).collect(), &weight);

let pmf1 = (0..num).map(|i| weight(i as i64)).collect::<Vec<f64>>();
let sum: f64 = pmf1.iter().sum();
let frac = 1.0 / sum;

let mut ac = 0.0;
let mut cdf = Vec::with_capacity(num * num);
for a in 0..num {
for b in 0..num {
if a < b {
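// P(a first, then b) = (w_a / W) * (w_b / (W - w_a)); adding the reversed
// order below gives the probability of the unordered pair {a, b}.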
let pa = pmf1[a] * frac;
let pab = pa * pmf1[b] / (sum - pmf1[a]);

let pb = pmf1[b] * frac;
let pba = pb * pmf1[a] / (sum - pmf1[b]);

ac += pab + pba;
}
cdf.push(ac);
}
}
assert!((cdf.last().unwrap() - 1.0).abs() < 1e-9);

let cdf = |i| {
if i < 0 {
0.0
} else {
cdf[i as usize]
}
};

test_discrete(0, distr, cdf);
}

test_weights(100, |_| 1.0);
test_weights(100, |i| ((i + 1) as f64).ln());
test_weights(100, |i| i as f64);
test_weights(100, |i| (i as f64).powi(3));
test_weights(100, |i| 1.0 / ((i + 1) as f64));
test_weights(10, |i| ((i + 1) as f64).powi(-8));
}

#[test]
fn choose_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
IteratorRandom::choose(self.0.clone(), rng).unwrap()
}
}

let distr = Adapter((0..100).map(|i| i as i64));
test_discrete(0, distr, make_cdf(100, |_| 1.0));
}

#[test]
fn choose_stable_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
IteratorRandom::choose_stable(self.0.clone(), rng).unwrap()
}
}

let distr = Adapter((0..100).map(|i| i as i64));
test_discrete(0, distr, make_cdf(100, |_| 1.0));
}

#[test]
fn choose_two_iterator() {
struct Adapter<I>(I);
impl<I: Clone + Iterator<Item = i64>> Distribution<i64> for Adapter<I> {
fn sample<R: rand::Rng + ?Sized>(&self, rng: &mut R) -> i64 {
let mut buf = [0; 2];
IteratorRandom::choose_multiple_fill(self.0.clone(), rng, &mut buf);
buf.sort_unstable();
assert!(buf[0] < 99 && buf[1] >= 1);
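// Map the sorted pair (a, b) with a < b to a unique index in 0..4950 = C(100, 2):
// pairs whose first element is below `a` fill the first 4950 - (99 - a)(100 - a)/2
// slots, and b - a - 1 is the offset within a's block.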
let a = buf[0];
4950 - (99 - a) * (100 - a) / 2 + buf[1] - a - 1
}
}

let distr = Adapter((0..100).map(|i| i as i64));

test_discrete(
0,
distr,
|i| if i < 0 { 0.0 } else { (i + 1) as f64 / 4950.0 },
);
}
54 changes: 33 additions & 21 deletions src/seq/index.rs
@@ -333,8 +333,8 @@
/// ordering). The weights are to be provided by the input function `weights`,
/// which will be called once for each index.
///
-/// This implementation uses the algorithm described by Efraimidis and Spirakis
-/// in this paper: <https://doi.org/10.1016/j.ipl.2005.11.003>
+/// This implementation is based on the algorithm A-ExpJ as found in
+/// [Efraimidis and Spirakis, 2005](https://doi.org/10.1016/j.ipl.2005.11.003).
/// It uses `O(length + amount)` space and `O(length)` time.
///
/// Error cases:
@@ -354,7 +354,7 @@
N: UInt,
IndexVec: From<Vec<N>>,
{
-use std::cmp::Ordering;
+use std::{cmp::Ordering, collections::BinaryHeap};

if amount == N::zero() {
return Ok(IndexVec::U32(Vec::new()));
@@ -373,9 +373,9 @@

impl<N> Ord for Element<N> {
fn cmp(&self, other: &Self) -> Ordering {
-// partial_cmp will always produce a value,
-// because we check that the weights are not nan
-self.key.partial_cmp(&other.key).unwrap()
+// unwrap() should not panic since weights should not be NaN
+// We reverse so that BinaryHeap::peek shows the smallest item
+self.key.partial_cmp(&other.key).unwrap().reverse()
}
}

@@ -387,12 +387,14 @@

impl<N> Eq for Element<N> {}

-let mut candidates = Vec::with_capacity(length.as_usize());
+let mut candidates = BinaryHeap::with_capacity(amount.as_usize());
let mut index = N::zero();
-while index < length {
+while index < length && candidates.len() < amount.as_usize() {
let weight = weight(index.as_usize()).into();
if weight > 0.0 {
-let key = rng.random::<f64>().powf(1.0 / weight);
+// We use the log of the key used in A-ExpJ to improve precision
+// for small weights:
+let key = rng.random::<f64>().ln() / weight;
candidates.push(Element { index, key });
} else if !(weight >= 0.0) {
return Err(WeightError::InvalidWeight);
@@ -401,23 +403,33 @@
index += N::one();
}

-let avail = candidates.len();
-if avail < amount.as_usize() {
+if candidates.len() < amount.as_usize() {
return Err(WeightError::InsufficientNonZero);
}

-// Partially sort the array to find the `amount` elements with the greatest
-// keys. Do this by using `select_nth_unstable` to put the elements with
-// the *smallest* keys at the beginning of the list in `O(n)` time, which
-// provides equivalent information about the elements with the *greatest* keys.
-let (_, mid, greater) = candidates.select_nth_unstable(avail - amount.as_usize());
+let mut x = rng.random::<f64>().ln() / candidates.peek().unwrap().key;
+while index < length {
+let weight = weight(index.as_usize()).into();
+if weight > 0.0 {
+x -= weight;
+if x <= 0.0 {
+let min_candidate = candidates.pop().unwrap();
+let t = (min_candidate.key * weight).exp();
+let key = rng.random_range(t..1.0).ln() / weight;
+candidates.push(Element { index, key });

+x = rng.random::<f64>().ln() / candidates.peek().unwrap().key;
+}
+} else if !(weight >= 0.0) {
+return Err(WeightError::InvalidWeight);
+}

-let mut result: Vec<N> = Vec::with_capacity(amount.as_usize());
-result.push(mid.index);
-for element in greater {
-result.push(element.index);
+index += N::one();
}
-Ok(IndexVec::from(result))

+Ok(IndexVec::from(
+candidates.iter().map(|elt| elt.index).collect(),
+))
}

/// Randomly sample exactly `amount` indices from `0..length`, using Floyd's
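As background for the precision comment in the hunk above: for u in (0, 1), ln is strictly increasing, so ranking candidates by ln(u)/w selects the same elements as ranking by the textbook A-ExpJ key u^(1/w), while avoiding the underflow that `powf` hits for very small weights. A minimal standalone illustration (not part of the diff):

```rust
fn main() {
    let u: f64 = 0.5; // a uniform draw from (0, 1)
    let w: f64 = 1e-4; // a very small weight
    let powf_key = u.powf(1.0 / w); // 0.5^10000 underflows to 0.0
    let log_key = u.ln() / w; // about -6931.5, still finite and comparable
    println!("powf key = {powf_key}, log key = {log_key}");
}
```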
16 changes: 8 additions & 8 deletions src/seq/slice.rs
@@ -732,17 +732,17 @@
use super::*;

// The theoretical probabilities of the different outcomes are:
-// AB: 0.5 * 0.5 = 0.250
-// AC: 0.5 * 0.5 = 0.250
-// BA: 0.25 * 0.67 = 0.167
-// BC: 0.25 * 0.33 = 0.082
-// CA: 0.25 * 0.67 = 0.167
-// CB: 0.25 * 0.33 = 0.082
-let choices = [('a', 2), ('b', 1), ('c', 1)];
+// AB: 0.5 * 0.667 = 0.3333
+// AC: 0.5 * 0.333 = 0.1667
+// BA: 0.333 * 0.75 = 0.25
+// BC: 0.333 * 0.25 = 0.0833
+// CA: 0.167 * 0.6 = 0.1
+// CB: 0.167 * 0.4 = 0.0667
+let choices = [('a', 3), ('b', 2), ('c', 1)];
let mut rng = crate::test::rng(414);

let mut results = [0i32; 3];
-let expected_results = [4167, 4167, 1666];
+let expected_results = [5833, 2667, 1500];
for _ in 0..10000 {
let result = choices
.choose_multiple_weighted(&mut rng, 2, |item| item.1)
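For reference, the updated `expected_results` follow from the per-ordering probabilities listed in the comment above by pooling the two orderings of each unordered pair over the 10000 trials; a quick check (not part of the PR):

```rust
fn main() {
    // Pair probabilities from the test comment, both orderings pooled,
    // scaled by the 10000 trials used in the test.
    let ab = (0.3333 + 0.25) * 10000.0; // {a, b} -> 5833
    let ac = (0.1667 + 0.1) * 10000.0; // {a, c} -> 2667
    let bc = (0.0833 + 0.0667) * 10000.0; // {b, c} -> 1500
    println!("{ab:.0} {ac:.0} {bc:.0}");
}
```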