From 248f4c882127449fb339e9f68ead62e3f561a4e2 Mon Sep 17 00:00:00 2001 From: Simon Kucharsky Date: Fri, 27 Oct 2023 13:22:45 +0200 Subject: [PATCH] make coxSnell and Nagelkerke computationally more robust (#265) * make coxSnell and Nagelkerke computationally more robust * fix typo * 1-exp -> -exmp1 --- R/commonglm.R | 10 ++++-- tests/testthat/test-regressionlogistic.R | 46 +++++++++++++++++++----- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/R/commonglm.R b/R/commonglm.R index 6119111d..67d5211e 100644 --- a/R/commonglm.R +++ b/R/commonglm.R @@ -173,8 +173,8 @@ l0 <- -0.5*nullModel[["deviance"]] lm <- as.numeric(logLik(glmModel)) n <- length(glmModel[["y"]]) - coxSnell <- 1 - exp(l0 - lm)^(2 / n) - denom <- 1 - exp(l0)^(2 / n) + coxSnell <- .coxSnellCompute(l0, lm, n) + denom <- -expm1(2*l0/n) return(max(c(0,coxSnell / denom))) } } @@ -195,11 +195,15 @@ l0 <- -0.5*nullModel[["deviance"]] lm <- as.numeric(logLik(glmModel)) n <- length(glmModel[["y"]]) - coxSnell <- 1 - exp(l0 - lm)^(2 / n) + coxSnell <- .coxSnellCompute(l0, lm, n) return(max(c(0,coxSnell))) } } +.coxSnellCompute <- function(l0, lm, n) { + return(-expm1(2*(l0 - lm)/n)) +} + .bic <- function(glmModel) { return(log(length(glmModel[["y"]]))*length(coef(glmModel))+glmModel[["deviance"]]) } diff --git a/tests/testthat/test-regressionlogistic.R b/tests/testthat/test-regressionlogistic.R index 9dbec1f2..12ec64e0 100644 --- a/tests/testthat/test-regressionlogistic.R +++ b/tests/testthat/test-regressionlogistic.R @@ -215,7 +215,7 @@ test_that("Confusion Matrix Table Matches", { list(components="contNormal", isNuisance=FALSE), list(components="contOutlier", isNuisance=FALSE) ) - + options$confusionMatrix <- TRUE results <- jaspTools::runAnalysis("RegressionLogistic", "debug.csv", options) table <- results[["results"]][["perfDiag"]][["collection"]][["perfDiag_confusionMatrix"]][["data"]] @@ -254,35 +254,35 @@ test_that("Confusion Matrix Table Matches", { options$covariates <- c("contNormal", "contOutlier") options$factors <- c("facFive") options$dependent <- "facGender" - + options$modelTerms <- list( list(components="contNormal", isNuisance=FALSE), list(components="contOutlier", isNuisance=FALSE), list(components="facFive", isNuisance=FALSE) ) - + options$multicollinearity <- TRUE results <- jaspTools::runAnalysis("RegressionLogistic", "debug.csv", options) table <- results[["results"]][["multicolliTable"]][["data"]] jaspTools::expect_equal_tables(table, list("contNormal", 0.9536754, 1.048575, - "contOutlier", 0.9742628, 1.026417, + "contOutlier", 0.9742628, 1.026417, "facFive", 0.9377347, 1.0664)) - + options <- jaspTools::analysisOptions("RegressionLogistic") options$covariates <- c("contNormal", "contOutlier") options$dependent <- "facGender" - + options$modelTerms <- list( list(components="contNormal", isNuisance=FALSE), list(components="contOutlier", isNuisance=FALSE) ) - + options$multicollinearity <- TRUE results <- jaspTools::runAnalysis("RegressionLogistic", "debug.csv", options) table <- results[["results"]][["multicolliTable"]][["data"]] jaspTools::expect_equal_tables(table, list("contNormal", 0.9958289, 1.004189, "contOutlier", 0.9958289, 1.004189)) - + }) test_that("Error Handling", { @@ -353,6 +353,36 @@ test_that("Pseudo R-squared are correct", { jaspTools::expect_equal_tables(r_squared, list(0.0856291418878957, 0.141844242772774, 0.0962310669224921, 0.100864461712579) ) + + + # Another test, inspired by the issue: https://github.com/jasp-stats/jasp-issues/issues/2368 + # That issue was caused by a numerical overflow for a somewhat large dataset + set.seed(1) + n <- 1e5 + x <- runif(n, min = 0, max = 1) + y_hat <- 5 * x + p <- plogis(y_hat) + y <- rbinom(n, 1, p) + + df <- data.frame(x, y) + + # library(performance) # version 0.10.5 + # fit <- glm(y~x, data = df, family = binomial) + # performance::r2_mcfadden(fit)$R2 # 0.2066583 + # performance::r2_nagelkerke(fit) # 0.2767422 + # performance::r2_tjur(fit) # 0.1713724 + # performance::r2_coxsnell(fit) # 0.1524201 + + options <- jaspTools::analysisOptions("RegressionLogistic") + options$dependent <- "y" + options$covariates <- "x" + options$modelTerms <- list(list(components = "x", isNuisance = FALSE)) + + results <- jaspTools::runAnalysis("RegressionLogistic", df, options) + r_squared <- results$results$modelSummary$data[[2]][c("fad", "nag", "tju", "cas")] + jaspTools::expect_equal_tables(r_squared, + list(0.206658284548968, 0.276742170706529, 0.171372447470285, 0.152420076012601) + ) }) test_that("Performance plots match", {