Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

model correlation score for simple model #25

Open
wants to merge 2 commits into
base: api-variant
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 100 additions & 8 deletions R/print.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#' @param bins number of bins to use for summary
#' @importFrom stats chisq.test integrate
#' @export
rank_summary <- function(ranks, par, thin, bins = 20){
rank_summary <- function(results, par, bins = 20){
pval <- function(bin_count){
return(chisq.test(bin_count)$p.value)
}
Expand All @@ -25,14 +25,21 @@ rank_summary <- function(ranks, par, thin, bins = 20){
val <- integrate(tempf,1,bins, rel.tol=.Machine$double.eps^.05)$value
return(val)
}
S <- dim(ranks)[1]
bin_size <- S / bins
bin_count <- rep(0, bins)
for (s in 1:S) {
bin <- ceiling(ranks[s, par] / bins)
bin_count[bin] <- bin_count[bin] + 1
for (p in par){
ranks <- results$stats$rank[names(results$stats$rank) %in% p]
ranks <- array(ranks, dim=c(length(ranks), 1))
colnames(ranks) <- c(p)
S <- results$stats$max_rank[1]
bin_size <- S / bins
bin_count <- rep(0, bins)
for (n in 1:dim(ranks)[1]) {
bin <- ceiling(ranks[n, p] / bin_size)
bin_count[bin] <- bin_count[bin] + 1
}
rank_list <- list()
rank_list[[p]] <- list(pval = round(pval(bin_count),10), max_diff = round(max_diff(bin_count),3), wasserstein = round(wasserstein(bin_count),3))
}
print(paste0("pval: ", round(pval(bin_count),10), " max_diff: ", round(max_diff(bin_count),3), " wasserstein: ", round(wasserstein(bin_count),3)))
return(list(par = par, rank_list = rank_list))
}

#' Summarize relational property of overall prior and posterior samples
Expand All @@ -48,3 +55,88 @@ dist_summary <- function(prior, post, par, bins = 20){
h2 <- hist(post[[p]], breaks = breaks, plot = FALSE)
return(list("KLD"= HistogramTools::kl.divergence(h1, h2), "JeffDivg" = HistogramTools::jeffrey.divergence(h1, h2), "Minkowski_2" = HistogramTools::minkowski.dist(h1, h2, 2)))
}

##' Cumulative Jensen-Shannon divergence
##'
##' Computes the cumulative Jensen-Shannon distance between two
##' samples.
##'
##' The Cumulative Jensen-Shannon distance is a symmetric metric based
##' on the cumulative Jensen-Shannon divergence. The divergence CJS(P || Q) between
##' two cumulative distribution functions P and Q is defined as:
##'
##' \deqn{CJS(P || Q) = \sum P(x) \log \frac{P(x)}{0.5 (P(x) + Q(x))} + \frac{1}{2 \ln 2} \sum (Q(x) - P(x))}
##'
##' The symmetric metric is defined as:
##'
##' \deqn{CJS_{dist}(P || Q) = \sqrt{CJS(P || Q) + CJS(Q || P)}}
##'
##' This has an upper bound of \eqn{\sqrt \sum (P(x) + Q(x))}
##'
##' @param x numeric vector of samples from first distribution
##' @param y numeric vector of samples from second distribution
##' @param x_weights numeric vector of weights of first distribution
##' @param y_weights numeric vector of weights of second distribution
##' @param ... unused
##' @return distance value based on CJS computation.
##' @references Nguyen H-V., Vreeken J. (2015). Non-parametric
##' Jensen-Shannon Divergence. In: Appice A., Rodrigues P., Santos
##' Costa V., Gama J., Jorge A., Soares C. (eds) Machine Learning
##' and Knowledge Discovery in Databases. ECML PKDD 2015. Lecture
##' Notes in Computer Science, vol 9285. Springer, Cham.
##' \code{doi:10.1007/978-3-319-23525-7_11}
##' @export
cjs_dist <- function(x, y, x_weights, y_weights, ...) {

# sort draws and weights
x_idx <- order(x)
x <- x[x_idx]
wp <- x_weights[x_idx]

y_idx <- order(y)
y <- y[y_idx]
wq <- y_weights[y_idx]

# add end point of final step
x_v <- x[length(x)] + x[length(x)] - x[length(x) - 1]
y_v <- y[length(y)] + y[length(y)] - y[length(y) - 1]

# calculate widths of each step
x_diff <- diff(c(x, x_v))
y_diff <- diff(c(y, y_v))

# calculate required weighted ecdfs
Px <- spatstat.geom::ewcdf(x, weights = wp)(x)
Qx <- spatstat.geom::ewcdf(y, weights = wq)(x)

Py <- spatstat.geom::ewcdf(x, weights = wp)(y)
Qy <- spatstat.geom::ewcdf(y, weights = wq)(y)

# calculate integral of ecdfs
Px_int <- drop(Px %*% x_diff)
Qx_int <- drop(Qx %*% x_diff)

Py_int <- drop(Py %*% y_diff)
Qy_int <- drop(Qy %*% y_diff)

# calculate cjs
cjs_PQ <- x_diff %*% (
Px * (log(Px, base = 2) -
log(0.5 * Px + 0.5 * Qx, base = 2)
)
) + 0.5 / log(2) * (Qx_int - Px_int)

cjs_QP <- y_diff %*% (
Qy * (log(Qy, base = 2) -
log(0.5 * Qy + 0.5 * Py, base = 2)
)
) + 0.5 / log(2) * (Py_int - Qy_int)

# calculate upper bound
bound <- Px_int + Qy_int

# normalise with respect to upper bound
out <- (sqrt(cjs_PQ + cjs_QP)) / sqrt(bound)

return(c(cjs = out))
}
196 changes: 196 additions & 0 deletions vignettes/model_interaction.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
---
title: "Model interaction"
output: html_document
---

```{r setup}
devtools::install_github("hyunjimoon/SBC",ref="api-variant")
library(SBC)

# use_cmdstanr <- TRUE # Set to false to use rstan instead
#
# if(use_cmdstanr) {
# library(cmdstanr)
# } else {
# library(rstan)
# }
library(cmdstanr)
library(bayesplot)
library(posterior)

library(future)
plan(multisession)

options(SBC.min_chunk_size = 5)
```

Three models of simple.stan from "ryanbe.me", indexed in order of choice from the tree graph. To define model correlation score the following should be inspected: symmetry, triangular inequality,
```{r simple}
backend_1_1 <- cmdstan_sample_SBC_backend(cmdstan_model("model_interaction/simple1_1.stan"))
backend_1_2 <- cmdstan_sample_SBC_backend(cmdstan_model("model_interaction/simple1_2.stan"))
backend_2_1 <- cmdstan_sample_SBC_backend(cmdstan_model("model_interaction/simple2_1.stan"))
# backend_2_2 <- cmdstan_sample_SBC_backend(cmdstan_model("model_interaction/simple2_2.stan")) doesn't have any parameter
```

```{r }
generator_func_1_1 <- function(N) {
sigma <- rlnorm(1);
mu <- rnorm(1);
y <- rnorm(N, mu, sigma);
list(
parameters = list(
mu = mu,
sigma = sigma
),
generated = list(
N = N,
y = y
)
)
}
generator_func_1_1 <- function_SBC_generator(generator_func_1_1, N = 50)
dataset_1_1 <- generate_datasets(generator_func_1_1, n_datasets = 100)
```

```{r }
generator_func_1_2 <- function(N) {
sigma <- rexp(1)
mu <- rnorm(1)
y <- rnorm(N, mu, sigma)
list(
parameters = list(
mu = mu,
sigma = sigma
),
generated = list(
N = N,
y = y
)
)
}
generator_func_1_2 <- function_SBC_generator(generator_func_1_2, N = 50)
dataset_1_2 <- generate_datasets(generator_func_1_2, n_datasets = 100)
```

```{r }
generator_func_2_1 <- function(N) {
sigma <- rlnorm(1) # from parameter block
mu <- 0 # from transformed parameter block
y <- rnorm(N, mu, sigma) #not sure between rnorm(N, 0, sigma)
list(
parameters = list(
mu = mu,
sigma = sigma
),
generated = list(
N = N,
y = y
)
)
}
generator_func_2_1 <- function_SBC_generator(generator_func_2_1, N = 50)
dataset_2_1 <- generate_datasets(generator_func_2_1, 100)
```

```{r }
# How would the degree of neighbor affect the correlation score between two models? Following questions are in order
# 1. Would there exist any stats s.t. results_i_j_i_j+1$stat < results_i_j_!(i_j) for all i_j paris?
# 2. Would symmetric? triangular inequality?
results_1_1_1_1 <- compute_results(dataset_1_1, backend_1_1)
results_1_1_1_2 <- compute_results(dataset_1_1, backend_1_2)
results_1_1_2_1 <- compute_results(dataset_1_1, backend_2_1)

results_1_2_1_2 <- compute_results(dataset_1_2, backend_1_2)
results_1_2_1_1 <- compute_results(dataset_1_2, backend_1_1)
results_1_2_2_1 <- compute_results(dataset_1_2, backend_2_1)

results_2_1_2_1 <- compute_results(dataset_2_1, backend_2_1)
results_2_1_1_1 <- compute_results(dataset_2_1, backend_1_1)
results_2_1_1_2 <- compute_results(dataset_2_1, backend_1_2)
```

```{R}
rank_summary(results_1_1_1_1, par = c("mu", "sigma"))
rank_summary(results_1_1_1_2, par = c("mu", "sigma"))
rank_summary(results_1_1_2_1, par = c("mu", "sigma"))

rank_summary(results_1_2_1_2, par = c("mu", "sigma"))
rank_summary(results_1_2_1_1, par = c("mu", "sigma"))
rank_summary(results_1_2_2_1, par = c("mu", "sigma"))

rank_summary(results_2_1_2_1, par = c("mu", "sigma"))
rank_summary(results_2_1_1_1, par = c("mu", "sigma"))
rank_summary(results_2_1_1_2, par = c("mu", "sigma"))
```

> rank_summary(results_1_1_1_1, par = c("mu", "sigma"))
[1] " max_diff: 1.6 wasserstein: 0.315"
[1] " max_diff: 0.6 wasserstein: 0.241"

> rank_summary(results_1_1_1_2, par = c("mu", "sigma"))
[1] " max_diff: 1.4 wasserstein: 0.318"
[1] " max_diff: 1 wasserstein: 0.291"

> rank_summary(results_1_1_2_1, par = c("mu", "sigma"))
[1] " max_diff: 19 wasserstein: 0.95"
[1] " max_diff: 3.179 wasserstein: 0.411"

> rank_summary(results_1_2_1_2, par = c("mu", "sigma"))
[1] " max_diff: 1 wasserstein: 0.249"
[1] " max_diff: 0.8 wasserstein: 0.291"

> rank_summary(results_1_2_1_1, par = c("mu", "sigma"))
[1] " max_diff: 0.6 wasserstein: 0.255"
[1] " max_diff: 0.8 wasserstein: 0.265"

> rank_summary(results_1_2_2_1, par = c("mu", "sigma"))
[1] " max_diff: 19 wasserstein: 0.95"
[1] " max_diff: 4.581 wasserstein: 0.851"

>
> rank_summary(results_2_1_2_1, par = c("mu", "sigma"))
[1] " max_diff: 1 wasserstein: 0.298"
[1] " max_diff: 1 wasserstein: 0.402"

> rank_summary(results_2_1_1_1, par = c("mu", "sigma"))
[1] " max_diff: 0.8 wasserstein: 0.375"
[1] " max_diff: 0.8 wasserstein: 0.356"

> rank_summary(results_2_1_1_2, par = c("mu", "sigma"))
[1] " max_diff: 1 wasserstein: 0.406"
[1] " max_diff: 0.8 wasserstein: 0.374"

Graphical comparison.
```{R}

plot_ecdf_diff(results_1_1_1_1)
plot_ecdf_diff(results_1_1_1_2)
plot_ecdf_diff(results_1_1_2_1) # mu is fixed to 0 and therefore have a therefore highly under-dispersed.

plot_ecdf_diff(results_1_2_1_2)
plot_ecdf_diff(results_1_2_1_1)
plot_ecdf_diff(results_1_2_2_1)

plot_ecdf_diff(results_2_1_2_1)
plot_ecdf_diff(results_2_1_1_1)
plot_ecdf_diff(results_2_1_1_2)
```

As can be seen from by comparing (`results_1_1_2_1` , `results_2_1_1_1` ) (`results_1_2_2_1`, `results_2_1_1_2`) rank scores are not symmetric. Compared to the `generator` model, `backend` model seem to have a greater effect on the outcome.

The next question is, how would this metric affect the final goal, finding the model with high elpd within the model network? What common trait would model that are close would have? If we calculate rank summaries with `lp__`, what implication would this have? For this let's experiment with bram

```
template_data = data.frame(y = rep(0, 15), x = rnorm(15))
priors <- prior(normal(0,1), class = "b") +
prior(normal(0,1), class = "Intercept") +
prior(normal(0,1), class = "sigma")
generator <- brms_SBC_generator(y ~ x, data = template_data, prior = priors,
thin = 50, warmup = 10000, refresh = 2000,
# Will generate the log density - this is useful,
#but a bit computationally expensive
generate_lp = TRUE
)
```

then this would mean for This is a 1d summary of two models; At first, it is recommended to explore diverse models; heterogenity is
Binary file added vignettes/model_interaction/simple1_1
Binary file not shown.
13 changes: 13 additions & 0 deletions vignettes/model_interaction/simple1_1.stan
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
data {
int N;
vector[N] y;
}
parameters {
real mu;
real<lower=0> sigma;
}
model {
sigma ~ lognormal(0, 1);
mu ~ normal(0, 1);
y ~ normal(mu, sigma);
}
Binary file added vignettes/model_interaction/simple1_2
Binary file not shown.
13 changes: 13 additions & 0 deletions vignettes/model_interaction/simple1_2.stan
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
data {
int N;
vector[N] y;
}
parameters {
real mu;
real <lower =0> sigma;
}
model {
mu ~ normal(0, 1);
sigma ~ exponential(1);
y ~ normal(mu, sigma);
}
Binary file added vignettes/model_interaction/simple2_1
Binary file not shown.
15 changes: 15 additions & 0 deletions vignettes/model_interaction/simple2_1.stan
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
data {
int N;
vector[N] y;
}
parameters {
real<lower=0> sigma;
}
transformed parameters{
real<lower=0> mu;
mu = 0;
}
model {
sigma ~ lognormal(0, 1);
y ~ normal(0, sigma);
}
Binary file added vignettes/model_interaction/simple2_2
Binary file not shown.
9 changes: 9 additions & 0 deletions vignettes/model_interaction/simple2_2.stan
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
data {
int N;
vector[N] y;
}
parameters {
}
model {
y ~ normal(0, 1);
}