hyunjimoon · hyunjimoon · Aug 12, 2021 · Aug 12, 2021
diff --git a/R/print.R b/R/print.R
@@ -7,7 +7,7 @@
 #' @param bins number of bins to use for summary
 #' @importFrom stats chisq.test integrate
 #' @export
-rank_summary <- function(ranks, par, thin, bins = 20){
+rank_summary <- function(results, par, bins = 20){
   pval <- function(bin_count){
     return(chisq.test(bin_count)$p.value)
   }
@@ -25,14 +25,21 @@ rank_summary <- function(ranks, par, thin, bins = 20){
     val <- integrate(tempf,1,bins, rel.tol=.Machine$double.eps^.05)$value
     return(val)
   }
-  S <- dim(ranks)[1]
-  bin_size <- S / bins
-  bin_count <- rep(0, bins)
-  for (s in 1:S) {
-    bin <- ceiling(ranks[s, par] / bins)
-    bin_count[bin] <- bin_count[bin] + 1
+  for (p in par){
+    ranks <- results$stats$rank[names(results$stats$rank) %in% p]
+    ranks <- array(ranks, dim=c(length(ranks), 1))
+    colnames(ranks) <- c(p)
+    S <- results$stats$max_rank[1]
+    bin_size <- S / bins
+    bin_count <- rep(0, bins)
+    for (n in 1:dim(ranks)[1]) {
+      bin <- ceiling(ranks[n, p] / bin_size)
+      bin_count[bin] <- bin_count[bin] + 1
+    }
+    rank_list <- list()
+    rank_list[[p]] <- list(pval = round(pval(bin_count),10), max_diff =  round(max_diff(bin_count),3), wasserstein = round(wasserstein(bin_count),3))
   }
-  print(paste0("pval: ", round(pval(bin_count),10), " max_diff: ", round(max_diff(bin_count),3), " wasserstein: ", round(wasserstein(bin_count),3)))
+  return(list(par = par, rank_list = rank_list))
 }
 
 #' Summarize relational property of overall prior and posterior samples
@@ -48,3 +55,88 @@ dist_summary <- function(prior, post, par, bins = 20){
   h2 <- hist(post[[p]],  breaks = breaks, plot = FALSE)
   return(list("KLD"= HistogramTools::kl.divergence(h1, h2), "JeffDivg" = HistogramTools::jeffrey.divergence(h1, h2), "Minkowski_2" = HistogramTools::minkowski.dist(h1, h2, 2)))
 }
+
+##' Cumulative Jensen-Shannon divergence
+##'
+##' Computes the cumulative Jensen-Shannon distance between two
+##' samples.
+##'
+##' The Cumulative Jensen-Shannon distance is a symmetric metric based
+##' on the cumulative Jensen-Shannon divergence. The divergence CJS(P || Q) between
+##' two cumulative distribution functions P and Q is defined as:
+##'
+##' \deqn{CJS(P || Q) = \sum P(x) \log \frac{P(x)}{0.5 (P(x) + Q(x))} + \frac{1}{2 \ln 2} \sum (Q(x) - P(x))}
+##'
+##' The symmetric metric is defined as:
+##'
+##' \deqn{CJS_{dist}(P || Q) = \sqrt{CJS(P || Q) + CJS(Q || P)}}
+##'
+##' This has an upper bound of \eqn{\sqrt \sum (P(x) + Q(x))}
+##'
+##' @param x numeric vector of samples from first distribution
+##' @param y numeric vector of samples from second distribution
+##' @param x_weights numeric vector of weights of first distribution
+##' @param y_weights numeric vector of weights of second distribution
+##' @param ... unused
+##' @return distance value based on CJS computation.
+##' @references Nguyen H-V., Vreeken J. (2015).  Non-parametric
+##'   Jensen-Shannon Divergence.  In: Appice A., Rodrigues P., Santos
+##'   Costa V., Gama J., Jorge A., Soares C. (eds) Machine Learning
+##'   and Knowledge Discovery in Databases.  ECML PKDD 2015. Lecture
+##'   Notes in Computer Science, vol 9285.  Springer, Cham.
+##'   \code{doi:10.1007/978-3-319-23525-7_11}
+##' @export
+cjs_dist <- function(x, y, x_weights, y_weights, ...) {
+
+  # sort draws and weights
+  x_idx <- order(x)
+  x <- x[x_idx]
+  wp <- x_weights[x_idx]
+
+  y_idx <- order(y)
+  y <- y[y_idx]
+  wq <- y_weights[y_idx]
+
+  # add end point of final step
+  x_v <- x[length(x)] + x[length(x)] - x[length(x) - 1]
+  y_v <- y[length(y)] + y[length(y)] - y[length(y) - 1]
+
+  # calculate widths of each step
+  x_diff <- diff(c(x, x_v))
+  y_diff <- diff(c(y, y_v))
+
+  # calculate required weighted ecdfs
+  Px <- spatstat.geom::ewcdf(x, weights = wp)(x)
+  Qx <- spatstat.geom::ewcdf(y, weights = wq)(x)
+
+  Py <- spatstat.geom::ewcdf(x, weights = wp)(y)
+  Qy <- spatstat.geom::ewcdf(y, weights = wq)(y)
+
+  # calculate integral of ecdfs
+  Px_int <- drop(Px %*% x_diff)
+  Qx_int <- drop(Qx %*% x_diff)
+
+  Py_int <- drop(Py %*% y_diff)
+  Qy_int <- drop(Qy %*% y_diff)
+
+  # calculate cjs
+  cjs_PQ <-  x_diff %*% (
+    Px * (log(Px, base = 2) -
+            log(0.5 * Px + 0.5 * Qx, base = 2)
+    )
+  ) + 0.5 / log(2) * (Qx_int - Px_int)
+
+  cjs_QP <- y_diff %*% (
+    Qy * (log(Qy, base = 2) -
+            log(0.5 * Qy + 0.5 * Py, base = 2)
+    )
+  ) + 0.5 / log(2) * (Py_int - Qy_int)
+
+  # calculate upper bound
+  bound <- Px_int + Qy_int
+
+  # normalise with respect to upper bound
+  out <- (sqrt(cjs_PQ + cjs_QP)) / sqrt(bound)
+
+  return(c(cjs = out))
+}
diff --git a/vignettes/model_interaction.Rmd b/vignettes/model_interaction.Rmd
@@ -0,0 +1,196 @@
+---
+title: "Model interaction"
+output: html_document
+---
+
+```{r setup}
+devtools::install_github("hyunjimoon/SBC",ref="api-variant")
+library(SBC)
+
+# use_cmdstanr <- TRUE # Set to false to use rstan instead
+# 
+# if(use_cmdstanr) {
+#   library(cmdstanr)
+# } else {
+#   library(rstan)
+# }
+library(cmdstanr)
+library(bayesplot)
+library(posterior)
+
+library(future)
+plan(multisession) 
+
+options(SBC.min_chunk_size = 5)
+```
+
+Three models of simple.stan from "ryanbe.me", indexed in order of choice from the tree graph. To define model correlation score the following should be inspected: symmetry, triangular inequality, 
+```{r simple}
+backend_1_1 <- cmdstan_sample_SBC_backend(cmdstan_model("model_interaction/simple1_1.stan"))
+backend_1_2 <- cmdstan_sample_SBC_backend(cmdstan_model("model_interaction/simple1_2.stan"))
+backend_2_1 <- cmdstan_sample_SBC_backend(cmdstan_model("model_interaction/simple2_1.stan"))
+# backend_2_2 <- cmdstan_sample_SBC_backend(cmdstan_model("model_interaction/simple2_2.stan")) doesn't have any parameter
+```
+
+```{r }
+generator_func_1_1 <- function(N) {
+  sigma <-  rlnorm(1);
+  mu <- rnorm(1);
+  y <- rnorm(N, mu, sigma);
+  list(
+    parameters = list(
+      mu = mu,
+      sigma = sigma
+    ),
+    generated = list(
+      N = N,
+      y = y
+    )
+  )
+}
+generator_func_1_1 <- function_SBC_generator(generator_func_1_1, N = 50)
+dataset_1_1 <- generate_datasets(generator_func_1_1, n_datasets = 100)
+```
+
+```{r }
+generator_func_1_2 <- function(N) {
+  sigma <-  rexp(1)
+  mu <- rnorm(1)
+  y <- rnorm(N, mu, sigma)
+  list(
+    parameters = list(
+      mu = mu, 
+      sigma = sigma
+    ),
+    generated = list(
+      N = N,
+      y = y
+    )
+  )
+}
+generator_func_1_2 <- function_SBC_generator(generator_func_1_2, N = 50)
+dataset_1_2 <- generate_datasets(generator_func_1_2, n_datasets = 100)
+```
+
+```{r }
+generator_func_2_1 <- function(N) {
+  sigma <-  rlnorm(1) # from parameter block
+  mu <- 0 # from transformed parameter block
+  y <- rnorm(N, mu, sigma) #not sure between rnorm(N, 0, sigma)
+  list(
+    parameters = list(
+      mu = mu,
+      sigma = sigma
+    ),
+    generated = list(
+      N = N,
+      y = y
+    )
+  )
+}
+generator_func_2_1 <- function_SBC_generator(generator_func_2_1, N = 50)
+dataset_2_1 <- generate_datasets(generator_func_2_1, 100)
+```
+
+```{r }
+# How would the degree of neighbor affect the correlation score between two models? Following questions are in order
+# 1. Would there exist any stats s.t. results_i_j_i_j+1$stat < results_i_j_!(i_j) for all i_j paris?
+# 2. Would symmetric? triangular inequality?
+results_1_1_1_1 <- compute_results(dataset_1_1, backend_1_1)
+results_1_1_1_2 <- compute_results(dataset_1_1, backend_1_2)
+results_1_1_2_1 <- compute_results(dataset_1_1, backend_2_1)
+
+results_1_2_1_2 <- compute_results(dataset_1_2, backend_1_2)
+results_1_2_1_1 <- compute_results(dataset_1_2, backend_1_1)
+results_1_2_2_1 <- compute_results(dataset_1_2, backend_2_1)
+
+results_2_1_2_1 <- compute_results(dataset_2_1, backend_2_1)
+results_2_1_1_1 <- compute_results(dataset_2_1, backend_1_1)
+results_2_1_1_2 <- compute_results(dataset_2_1, backend_1_2)
+```
+
+```{R}
+rank_summary(results_1_1_1_1, par = c("mu", "sigma"))
+rank_summary(results_1_1_1_2, par = c("mu", "sigma"))
+rank_summary(results_1_1_2_1, par = c("mu", "sigma"))
+
+rank_summary(results_1_2_1_2, par = c("mu", "sigma"))
+rank_summary(results_1_2_1_1, par = c("mu", "sigma"))
+rank_summary(results_1_2_2_1, par = c("mu", "sigma"))
+
+rank_summary(results_2_1_2_1, par = c("mu", "sigma"))
+rank_summary(results_2_1_1_1, par = c("mu", "sigma"))
+rank_summary(results_2_1_1_2, par = c("mu", "sigma"))
+```
+
+> rank_summary(results_1_1_1_1, par = c("mu", "sigma"))
+[1] " max_diff: 1.6 wasserstein: 0.315"
+[1] " max_diff: 0.6 wasserstein: 0.241"
+
+> rank_summary(results_1_1_1_2, par = c("mu", "sigma"))
+[1] " max_diff: 1.4 wasserstein: 0.318"
+[1] " max_diff: 1 wasserstein: 0.291"
+
+> rank_summary(results_1_1_2_1, par = c("mu", "sigma"))
+[1] " max_diff: 19 wasserstein: 0.95"
+[1] " max_diff: 3.179 wasserstein: 0.411"
+
+> rank_summary(results_1_2_1_2, par = c("mu", "sigma"))
+[1] " max_diff: 1 wasserstein: 0.249"
+[1] " max_diff: 0.8 wasserstein: 0.291"
+
+> rank_summary(results_1_2_1_1, par = c("mu", "sigma"))
+[1] " max_diff: 0.6 wasserstein: 0.255"
+[1] " max_diff: 0.8 wasserstein: 0.265"
+
+> rank_summary(results_1_2_2_1, par = c("mu", "sigma"))
+[1] " max_diff: 19 wasserstein: 0.95"
+[1] " max_diff: 4.581 wasserstein: 0.851"
+
+> 
+> rank_summary(results_2_1_2_1, par = c("mu", "sigma"))
+[1] " max_diff: 1 wasserstein: 0.298"
+[1] " max_diff: 1 wasserstein: 0.402"
+
+> rank_summary(results_2_1_1_1, par = c("mu", "sigma"))
+[1] " max_diff: 0.8 wasserstein: 0.375"
+[1] " max_diff: 0.8 wasserstein: 0.356"
+
+> rank_summary(results_2_1_1_2, par = c("mu", "sigma"))
+[1] " max_diff: 1 wasserstein: 0.406"
+[1] " max_diff: 0.8 wasserstein: 0.374"
+
+Graphical comparison.
+```{R}
+
+plot_ecdf_diff(results_1_1_1_1)
+plot_ecdf_diff(results_1_1_1_2)
+plot_ecdf_diff(results_1_1_2_1) # mu is fixed to 0 and therefore have a therefore highly under-dispersed.
+
+plot_ecdf_diff(results_1_2_1_2)
+plot_ecdf_diff(results_1_2_1_1)
+plot_ecdf_diff(results_1_2_2_1)
+
+plot_ecdf_diff(results_2_1_2_1)
+plot_ecdf_diff(results_2_1_1_1)
+plot_ecdf_diff(results_2_1_1_2)
+```
+
+As can be seen from by comparing (`results_1_1_2_1` , `results_2_1_1_1` ) (`results_1_2_2_1`, `results_2_1_1_2`) rank scores are not symmetric. Compared to the `generator` model, `backend` model seem to have a greater effect on the outcome. 
+
+The next question is, how would this metric affect the final goal, finding the model with high elpd within the model network? What common trait would model that are close would have? If we calculate rank summaries with `lp__`, what implication would this have? For this let's experiment with bram
+
+```
+template_data = data.frame(y = rep(0, 15), x = rnorm(15))
+priors <- prior(normal(0,1), class = "b") +
+  prior(normal(0,1), class = "Intercept") +
+  prior(normal(0,1), class = "sigma")
+generator <- brms_SBC_generator(y ~ x, data = template_data, prior = priors, 
+                                thin = 50, warmup = 10000, refresh = 2000,
+                                # Will generate the log density - this is useful, 
+                                #but a bit computationally expensive
+                                generate_lp = TRUE 
+                                )
+```
+
+then this would mean for This is a 1d summary of two models;  At first, it is recommended to explore diverse models; heterogenity is 
diff --git a/vignettes/model_interaction/simple1_1 b/vignettes/model_interaction/simple1_1
diff --git a/vignettes/model_interaction/simple1_1.stan b/vignettes/model_interaction/simple1_1.stan
@@ -0,0 +1,13 @@
+data {
+  int N;
+  vector[N] y;
+}
+parameters {
+  real mu;
+  real<lower=0> sigma;
+}
+model {
+  sigma ~ lognormal(0, 1);
+  mu ~ normal(0, 1);
+  y ~ normal(mu, sigma);
+}
diff --git a/vignettes/model_interaction/simple1_2 b/vignettes/model_interaction/simple1_2
diff --git a/vignettes/model_interaction/simple1_2.stan b/vignettes/model_interaction/simple1_2.stan
@@ -0,0 +1,13 @@
+data {
+  int N;
+  vector[N] y;
+}
+parameters {
+  real mu;
+  real <lower =0> sigma;
+}
+model {
+  mu ~ normal(0, 1);
+  sigma ~ exponential(1);
+  y ~ normal(mu, sigma);
+}
diff --git a/vignettes/model_interaction/simple2_1 b/vignettes/model_interaction/simple2_1
diff --git a/vignettes/model_interaction/simple2_1.stan b/vignettes/model_interaction/simple2_1.stan
@@ -0,0 +1,15 @@
+data {
+  int N;
+  vector[N] y;
+}
+parameters {
+  real<lower=0> sigma;
+}
+transformed parameters{
+  real<lower=0> mu;
+  mu = 0;
+}
+model {
+  sigma ~ lognormal(0, 1);
+  y ~ normal(0, sigma);
+}
diff --git a/vignettes/model_interaction/simple2_2 b/vignettes/model_interaction/simple2_2
diff --git a/vignettes/model_interaction/simple2_2.stan b/vignettes/model_interaction/simple2_2.stan
@@ -0,0 +1,9 @@
+data {
+  int N;
+  vector[N] y;
+}
+parameters {
+}
+model {
+  y ~ normal(0, 1);
+}