From cce68c5eb1e6648d5b08caa9a1c563f01ddd3c40 Mon Sep 17 00:00:00 2001
From: FBartos <f.bartos96@gmail.com>
Date: Thu, 23 May 2024 14:36:02 +0200
Subject: [PATCH] fix ODR

---
 DESCRIPTION                  |  2 +-
 NEWS.md                      |  3 +++
 R/data-preparation.R         | 41 +++++++++++++++++++++++++++++-------
 R/main.R                     | 31 +++++++++++++++++++++------
 R/zcurve_clustered.R         | 12 ++++++++++-
 man/zcurve-package.Rd        |  1 +
 man/zcurve_data.Rd           |  2 +-
 tests/testthat/test-zcurve.R |  8 +++----
 8 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 0060e01..21800d2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: zcurve
 Title: An Implementation of Z-Curves
-Version: 2.4.0
+Version: 2.4.1
 Authors@R: c(
     person("František", "Bartoš", email = "f.bartos96@gmail.com", role = c("aut", "cre")),
     person("Ulrich", "Schimmack", email = "ulrich.schimmack@utoronto.ca", role = c("aut")))
diff --git a/NEWS.md b/NEWS.md
index becec42..d86cc07 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,6 @@
+## version 2.4.1
+- Fixes ODR computation with censored data
+
 ## version 2.4.0
 - Implementation of ggploting function.
 
diff --git a/R/data-preparation.R b/R/data-preparation.R
index 66336ff..c38a50e 100644
--- a/R/data-preparation.R
+++ b/R/data-preparation.R
@@ -21,7 +21,7 @@
 #' p-values treated as exact values.
 #'
 #' @details By default, the function extract the type of test statistic:
-#' \itemize{
+#' \describe{
 #'  \item{\code{"F(df1, df2)=x"}}{F-statistic with df1 and df2 degrees of freedom,}
 #'  \item{\code{"chi(df)=x"}}{Chi-square statistic with df degrees of freedom,}
 #'  \item{\code{"t(df)=x"}}{for t-statistic with df degrees of freedom,}
@@ -119,9 +119,10 @@ zcurve_data <- function(data, id = NULL, rounded = TRUE, stat_precise = 2, p_pre
   }
   
   # prepare empty containers
-  p_vals    <- rep(NA, length(data))
-  p_vals.lb <- rep(NA, length(data))
-  p_vals.ub <- rep(NA, length(data))
+  p_vals     <- rep(NA, length(data))
+  p_vals.rep <- rep(NA, length(data))
+  p_vals.lb  <- rep(NA, length(data))
+  p_vals.ub  <- rep(NA, length(data))
   
   # compute and allocate the p-values accordingly
   for(i in seq_along(data)){
@@ -151,10 +152,11 @@ zcurve_data <- function(data, id = NULL, rounded = TRUE, stat_precise = 2, p_pre
         ),
         warning = function(w) stop(paste0("The following input could not be decoded: '", data[i], "'."))
       )
-      p_vals.lb[i] <- 0
+      p_vals.lb[i]  <- 0
+      p_vals.rep[i] <- p_vals.ub[i]
     }else if(rounded[i] != -1 && !censored[i]){
       # rounded non-censored values
-      
+      temp_stat_val    <- abs(stat_val[i]) 
       temp_stat_val.lb <- abs(stat_val[i]) - 0.5 * 10^-digits[i]
       temp_stat_val.ub <- abs(stat_val[i]) + 0.5 * 10^-digits[i]
       
@@ -182,9 +184,20 @@ zcurve_data <- function(data, id = NULL, rounded = TRUE, stat_precise = 2, p_pre
         ),
         warning = function(w) stop(paste0("The following input could not be decoded: '", data[i], "'."))
       )
+      p_vals.rep[i] <- tryCatch(
+        switch(
+          stat_type[i],
+          "f" = stats::pf(temp_stat_val, df1 = stat_df1[i], df2 = stat_df2[i], lower.tail = FALSE),
+          "c" = stats::pchisq(temp_stat_val, df = stat_df1[i], lower.tail = FALSE),
+          "t" = stats::pt(abs(temp_stat_val), df = stat_df1[i], lower.tail = FALSE) * 2,
+          "z" = stats::pnorm(abs(temp_stat_val), lower.tail = FALSE) * 2,
+          "p" = temp_stat_val
+        ),
+        warning = function(w) stop(paste0("The following input could not be decoded: '", data[i], "'."))
+      )
     }else if(rounded[i] != -1 && censored[i]){
       # rounded censored values
-      
+      temp_stat_val    <- abs(stat_val[i]) 
       temp_stat_val.ub <- abs(stat_val[i]) + 0.5 * 10^-digits[i]
     
       p_vals.ub[i] <- tryCatch(
@@ -198,7 +211,18 @@ zcurve_data <- function(data, id = NULL, rounded = TRUE, stat_precise = 2, p_pre
         ),
         warning = function(w) stop(paste0("The following input could not be decoded: '", data[i], "'."))
       )
-      p_vals.lb[i] <- 0
+      p_vals.lb[i]  <- 0
+      p_vals.rep[i] <- tryCatch(
+        switch(
+          stat_type[i],
+          "f" = stats::pf(temp_stat_val.lb , df1 = stat_df1[i], df2 = stat_df2[i], lower.tail = FALSE),
+          "c" = stats::pchisq(temp_stat_val.lb, df = stat_df1[i], lower.tail = FALSE),
+          "t" = stats::pt(temp_stat_val.lb, df = stat_df1[i], lower.tail = FALSE) * 2,
+          "z" = stats::pnorm(temp_stat_val.lb, lower.tail = FALSE) * 2,
+          "p" = temp_stat_val
+        ),
+        warning = function(w) stop(paste0("The following input could not be decoded: '", data[i], "'."))
+      )
     }
   }
   
@@ -212,6 +236,7 @@ zcurve_data <- function(data, id = NULL, rounded = TRUE, stat_precise = 2, p_pre
       "input" = data[!is.na(p_vals.lb)],
       "p.lb"  = p_vals.lb[!is.na(p_vals.lb)],
       "p.ub"  = p_vals.ub[!is.na(p_vals.ub)],
+      "p.rep" = p_vals.rep[!is.na(p_vals.rep)],
       "id"    = id[!is.na(p_vals.lb)]
     )
   )
diff --git a/R/main.R b/R/main.R
index 2d499ef..8ec6603 100644
--- a/R/main.R
+++ b/R/main.R
@@ -128,6 +128,7 @@ zcurve       <- function(z, z.lb, z.ub, p, p.lb, p.ub, data, method = "EM", boot
   
   ### prepare data
   if(missing(data)){
+    
     # get point estimates on the same scale
     if(!missing(z)){
       z <- abs(z)
@@ -150,11 +151,18 @@ zcurve       <- function(z, z.lb, z.ub, p, p.lb, p.ub, data, method = "EM", boot
       ub          <- c(ub, .p_to_z(p.lb))
     }
     
+    # get the total number of observations before removal for fitting purposes
+    N_obs <- length(z) + length(lb)
+    N_sig <- length(z[z >= control$a]) + length(lb[lb >= control$a])
+    
     # restrict censoring to the fitting range & treat extremely censored values as extremely significant values
     if(!is.null(lb)){
       
-      if(any(lb < control$a))
-        stop("All censored observations must be higher than the fitting range.")
+      if(any(lb < control$a)){
+        warning(paste0(sum(lb < control$a), " censored p-values removed due to the upper bound being larger that the fitting range."), immediate. = TRUE, call. = FALSE)
+        ub <- ub[lb >= control$a]
+        lb <- lb[lb >= control$a]
+      }
       
       z  <- c(z, lb[lb >= control$b])
       
@@ -184,18 +192,26 @@ zcurve       <- function(z, z.lb, z.ub, p, p.lb, p.ub, data, method = "EM", boot
       object$data <- numeric()
     }
     
+    # get the total number of observations before removal for fitting purposes
+    N_obs <- length(data$precise$p)
+    N_sig <- length(data$precise$p[.p_to_z(data$precise$p) >= control$a])
+    
     if(nrow(data$censored) != 0){
       
       lb <- .p_to_z(data$censored$p.ub)
       ub <- .p_to_z(data$censored$p.lb)
       
+      # get the total number of observations before removal for fitting purposes (using the collected bounds before accounting for rounding)
+      N_obs <- N_obs + length(.p_to_z(data$censored$p.rep))
+      N_sig <- N_sig + length(.p_to_z(data$censored$p.rep)[.p_to_z(data$censored$p.rep) >= control$a])
+      
       # remove non-significant censored p-values
       if(any(lb < control$a)){
         warning(paste0(sum(lb < control$a), " censored p-values removed due to the upper bound being larger that the fitting range."), immediate. = TRUE, call. = FALSE)
         ub <- ub[lb >= control$a]
         lb <- lb[lb >= control$a]
       }
-
+      
       # move too significant censored p-values among precise p-values          
       if(length(lb) > 0 && any(lb >= control$b)){
         object$data <- c(object$data, lb[lb >= control$b])
@@ -222,7 +238,9 @@ zcurve       <- function(z, z.lb, z.ub, p, p.lb, p.ub, data, method = "EM", boot
     }
   }
 
-  object$control        <- control
+  object$control <- control
+  object$N_obs   <- N_obs
+  object$N_sig   <- N_sig
   
   # only run the algorithm with some significant results
   if(sum(object$data > control$a & object$data < control$b) + nrow(object$data_censoring) < 10)
@@ -353,9 +371,8 @@ summary.zcurve       <- function(object, type = "results", all = FALSE, ERR.adj
     iter_text <- object$fit$iter
   }
   
-  temp_N_sig     <- sum(object$data > stats::qnorm(object$control$sig_level/2, lower.tail = FALSE)) + 
-    nrow(object$data_censoring[object$data_censoring$lb > object$control$a,]) 
-  temp_N_obs     <- length(object$data) + nrow(object$data_censoring)
+  temp_N_sig     <- object$N_sig
+  temp_N_obs     <- object$N_obs
   temp_N_used    <- sum(object$data > object$control$a & object$data < object$control$b) + 
     nrow(object$data_censoring[object$data_censoring$lb > object$control$a & object$data_censoring$lb < object$control$b ,])
     
diff --git a/R/zcurve_clustered.R b/R/zcurve_clustered.R
index a4b0aeb..a3015c0 100644
--- a/R/zcurve_clustered.R
+++ b/R/zcurve_clustered.R
@@ -66,12 +66,20 @@ zcurve_clustered <- function(data, method = "b", bootstrap = 1000, parallel = FA
     z_id <- numeric()
   }
   
+  # get the total number of observations before removal for fitting purposes
+  N_obs <- length(data$precise$p)
+  N_sig <- length(data$precise$p[.p_to_z(data$precise$p) >= control$a])
+  
   if(nrow(data$censored) != 0){
     
     lb   <- .p_to_z(data$censored$p.ub)
     ub   <- .p_to_z(data$censored$p.lb)
     b_id <- data$censored$id
     
+    # get the total number of observations before removal for fitting purposes (using the collected bounds before accounting for rounding)
+    N_obs <- N_obs + length(.p_to_z(data$censored$p.rep))
+    N_sig <- N_sig + length(.p_to_z(data$censored$p.rep)[.p_to_z(data$censored$p.rep) >= control$a])
+    
     # remove non-significant censored p-values
     if(any(lb < control$a)){
       warning(paste0(sum(lb < control$a), " censored p-values removed due to the upper bound being larger that the fitting range."), immediate. = TRUE, call. = FALSE)
@@ -109,7 +117,9 @@ zcurve_clustered <- function(data, method = "b", bootstrap = 1000, parallel = FA
   object$data_id        <- z_id
   object$data_censoring <- data.frame(lb = lb, ub = ub, id = b_id)
   
-  object$control        <- control
+  object$control <- control
+  object$N_obs   <- N_obs
+  object$N_sig   <- N_sig
   
   # only run the algorithm with some significant results
   if(sum(z > control$a & z < control$b) + length(lb) < 10)
diff --git a/man/zcurve-package.Rd b/man/zcurve-package.Rd
index 83b6d58..d44445f 100644
--- a/man/zcurve-package.Rd
+++ b/man/zcurve-package.Rd
@@ -12,6 +12,7 @@ An implementation of z-curves - a method for estimating expected discovery and r
 Useful links:
 \itemize{
   \item \url{https://fbartos.github.io/zcurve/}
+  \item Report bugs at \url{https://github.com/FBartos/zcurve/issues}
 }
 
 }
diff --git a/man/zcurve_data.Rd b/man/zcurve_data.Rd
index 5f6ca91..1cc7e73 100644
--- a/man/zcurve_data.Rd
+++ b/man/zcurve_data.Rd
@@ -37,7 +37,7 @@ to be censored as well as test statistics reported with low accuracy
 }
 \details{
 By default, the function extract the type of test statistic:
-\itemize{
+\describe{
 \item{\code{"F(df1, df2)=x"}}{F-statistic with df1 and df2 degrees of freedom,}
 \item{\code{"chi(df)=x"}}{Chi-square statistic with df degrees of freedom,}
 \item{\code{"t(df)=x"}}{for t-statistic with df degrees of freedom,}
diff --git a/tests/testthat/test-zcurve.R b/tests/testthat/test-zcurve.R
index f92118d..2d94d89 100644
--- a/tests/testthat/test-zcurve.R
+++ b/tests/testthat/test-zcurve.R
@@ -184,7 +184,7 @@ test_that("z-curve clustered works", {
       "EDR    0.722 0.632 0.823"                                                                             ,
       ""                                                                                                     ,
       "Model converged in 57 + 450.85 iterations"                                                               ,
-      "Fitted using 299 zcurve-data-values. 480 supplied, 299 significant (ODR = 0.62, 95% CI [0.58, 0.67]).",
+      "Fitted using 299 zcurve-data-values. 500 supplied, 300 significant (ODR = 0.60, 95% CI [0.56, 0.64]).",
       "Q = -236.27, 95% CI[-251.36, -224.63]"  
     ))
   
@@ -200,7 +200,7 @@ test_that("z-curve clustered works", {
       "EDR    0.783 0.655 0.895"                                                                             ,
       ""                                                                                                     ,
       "Model converged in 73 + 178 iterations"                                                               ,
-      "Fitted using 299 zcurve-data-values. 480 supplied, 299 significant (ODR = 0.62, 95% CI [0.58, 0.67]).",
+      "Fitted using 299 zcurve-data-values. 500 supplied, 300 significant (ODR = 0.60, 95% CI [0.56, 0.64]).",
       "Q = -238.11, 95% CI[-269.95, -219.12]"  
     ))
   
@@ -278,7 +278,7 @@ test_that("z-curve clustered works", {
       "EDR    0.733 0.639 0.831"                                                                             ,
       ""                                                                                                     ,
       "Model converged in 72 + 430.1 iterations"                                                             ,
-      "Fitted using 299 zcurve-data-values. 489 supplied, 299 significant (ODR = 0.61, 95% CI [0.57, 0.65]).",
+      "Fitted using 299 zcurve-data-values. 500 supplied, 300 significant (ODR = 0.60, 95% CI [0.56, 0.64]).",
       "Q = -319.14, 95% CI[-340.91, -306.78]"  
     ))
   
@@ -294,7 +294,7 @@ test_that("z-curve clustered works", {
       "EDR    0.788 0.651 0.901"                                                                             ,
       ""                                                                                                     ,
       "Model converged in 59 + 188 iterations"                                                               ,
-      "Fitted using 299 zcurve-data-values. 489 supplied, 299 significant (ODR = 0.61, 95% CI [0.57, 0.65]).",
+      "Fitted using 299 zcurve-data-values. 500 supplied, 300 significant (ODR = 0.60, 95% CI [0.56, 0.64]).",
       "Q = -321.39, 95% CI[-361.78, -298.20]"  
     ))