support log data

seroanalytics · Oct 17, 2024 · 75c60f5 · 75c60f5
1 parent bc4da25
commit 75c60f5
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 18 deletions.
diff --git a/R/biokinetics.R b/R/biokinetics.R
@@ -8,6 +8,7 @@ biokinetics <- R6::R6Class(
   "biokinetics",
   cloneable = FALSE,
   private = list(
+    scale = NULL,
     priors = NULL,
     preds_sd = NULL,
     data = NULL,
@@ -55,7 +56,7 @@ biokinetics <- R6::R6Class(
     },
     construct_design_matrix = function() {
       var <- pid <- NULL
-      dt_design_matrix <- private$data[, .SD, .SDcols = private$all_formula_vars, by = pid] |>
+      dt_design_matrix <- as.character(private$data[, .SD, .SDcols = private$all_formula_vars, by = pid]) |>
         unique()
 
       # Build the full design matrix using model.matrix
@@ -211,19 +212,22 @@ biokinetics <- R6::R6Class(
     #' @param priors Object of type \link[epikinetics]{biokinetics_priors}. Default biokinetics_priors().
     #' @param covariate_formula Formula specifying linear regression model. Note all variables in the formula
     #' will be treated as categorical variables. Default ~0.
-    #' @param preds_sd Standard deviation of predictor coefficients. Default 0.25.
+    #' @param scale One of "log" or "natural". Default "natural". Is provided data on a log or a natural scale? If on a natural scale it
+    #' will be converted to a log scale for model fitting.
     initialize = function(priors = biokinetics_priors(),
                           data = NULL,
                           file_path = NULL,
                           covariate_formula = ~0,
-                          preds_sd = 0.25) {
+                          preds_sd = 0.25,
+                          scale = "natural") {
       validate_priors(priors)
       private$priors <- priors
       validate_numeric(preds_sd)
       private$preds_sd <- preds_sd
       validate_formula(covariate_formula)
       private$covariate_formula <- covariate_formula
       private$all_formula_vars <- all.vars(covariate_formula)
+      private$scale <- scale
       if (is.null(data) && is.null(file_path)) {
         stop("One of 'data' or 'file_path' must be provided")
       }
@@ -241,7 +245,9 @@ biokinetics <- R6::R6Class(
       validate_required_cols(private$data)
       validate_formula_vars(private$all_formula_vars, private$data)
       logger::log_info("Preparing data for stan")
-      private$data <- convert_log_scale(private$data, "value")
+      if (scale == "natural") {
+        private$data <- convert_log_scale(private$data, "value")
+      }
       private$data[, `:=`(obs_id = seq_len(.N),
                           t_since_last_exp = as.integer(day - last_exp_day, units = "days"))]
       if (!("censored" %in% colnames(private$data))) {
@@ -263,6 +269,9 @@ biokinetics <- R6::R6Class(
     get_stan_data = function() {
       private$stan_input_data
     },
+    get_covariate_lookup_table = function() {
+      private$covariate_lookup_table
+    },
     #' @description Fit the model and return CmdStanMCMC fitted model object.
     #' @return A CmdStanMCMC fitted model object: <https://mc-stan.org/cmdstanr/reference/CmdStanMCMC.html>
     #' @param ... Named arguments to the `sample()` method of CmdStan model.
@@ -376,6 +385,10 @@ biokinetics <- R6::R6Class(
       dt_out <- dt_out[
         , lapply(.SD, function(x) if (is.factor(x)) forcats::fct_drop(x) else x)]
 
+      if (private$scale == "log") {
+        return(dt_out)
+      }
+
       if (summarise) {
         dt_out <- convert_log_scale_inverse(
           dt_out, vars_to_transform = c("me", "lo", "hi"))
@@ -416,8 +429,10 @@ biokinetics <- R6::R6Class(
       logger::log_info("Recovering covariate names")
       dt_peak_switch <- private$recover_covariate_names(dt_peak_switch)
 
-      dt_peak_switch <- convert_log_scale_inverse(
-        dt_peak_switch, vars_to_transform = c("mu_0", "mu_p", "mu_s"))
+      if (private$scale == "natural") {
+        dt_peak_switch <- convert_log_scale_inverse(
+          dt_peak_switch, vars_to_transform = c("mu_0", "mu_p", "mu_s"))
+      }
 
       logger::log_info("Calculating medians")
       dt_peak_switch[
@@ -476,10 +491,12 @@ biokinetics <- R6::R6Class(
       # Running the C++ code to simulate trajectories for each parameter sample
       # for each individual
       logger::log_info("Simulating individual trajectories")
-      dt_params_ind_traj <- biokinetics_simulate_trajectories(dt_params_ind)
+      dt_params_ind_traj <- data.table::setDT(biokinetics_simulate_trajectories(dt_params_ind))
 
-      dt_params_ind_traj <- data.table::setDT(convert_log_scale_inverse_cpp(
-        dt_params_ind_traj, vars_to_transform = "mu"))
+      if (private$scale == "natural") {
+        dt_params_ind_traj <- convert_log_scale_inverse_cpp(
+          dt_params_ind_traj, vars_to_transform = "mu")
+      }
 
       # convert numeric pid to original pid
       dt_params_ind_traj[, pid := names(private$pid_lookup)[pid]]

diff --git a/src/stan/antibody_kinetics_main.stan b/src/stan/antibody_kinetics_main.stan
@@ -32,7 +32,7 @@ data {
   array[N] int<lower=1, upper=K> titre_type; // Titre type for each observation
   vector[N] t; // Time for each observation
   vector[N] value; // Observed titre values
-  array[N] int<lower=-2, upper=1> censored; // Censoring indicator: -2 for lo, -1 for me 1 for upper, 0 for none
+  array[N] int<lower=-1, upper=1> censored; // Censoring indicator: -1 for lo, 1 for upper, 0 for none
 
   // Indices for different censoring scenarios
   int N_uncens; // number of uncensored observations

diff --git a/vignettes/data.Rmd b/vignettes/data.Rmd
@@ -19,19 +19,22 @@ knitr::opts_chunk$set(
 The model requires time series data about individual titre readings, along with last exposure times. Times can be relative (e.g. day of study) or absolute (i.e. precise calendar dates).
 The full list of required columns is as follows:
 
-| name         | type                 | description                                                                                                     | required |
-|--------------|----------------------|-----------------------------------------------------------------------------------------------------------------| -------- |
-| pid          | numeric or character | Unique identifier to identify a person across observations                                                      | T|
-| day          | integer or date      | The day of the observation. Can be a date or an integer representing a relative day of study                    | T|
-| last_exp_day | integer or date      | The most recent day on which the person was exposed. Must be of the same type as the 'day' column               | T|
-| titre_type   | character            | Name of the titre or biomarker                                                                                  | T|
-| value        | numeric              | Titre value                                                                                                     | T|
+| name         | type                 | description                                                                                | required |
+|--------------|----------------------|--------------------------------------------------------------------------------------------| -------- |
+| pid          | numeric or character | Unique identifier to identify a person across observations                                 | T|
+| day          | integer or date      | The day of the observation. Can be a date or an integer representing a relative day of study | T|
+| last_exp_day | integer or date      | The most recent day on which the person was exposed. Must be of the same type as the 'day' column | T|
+| titre_type   | character            | Name of the titre or biomarker                                                             | T|
+| value        | numeric              | Titre value                                                                                | T|
 | censored     | -1, 0 or 1           | Optional column. Whether this observation should be treated as censored: -1 for lower, 1 for upper, 0 for none. | F|
 
 It can also contain further columns for any covariates to be included in the model. The data files installed with this package have additional columns infection_history, last_vax_type, and exp_num.
 
 The model also accepts a covariate formula to define the regression model. The variables in the formula must correspond to column names in the dataset. Note that all variables will be treated as **categorical variables**; that is, converted to factors regardless of their input type.
 
+Note also that the `value` column is assumed to be on a natural scale by default, and will be converted to a log scale for model fitting. If your data
+is already on a log scale, you must pass the `log=TRUE` argument when initialising the biokinetics class. See [biokinetics](../reference/biokinetics.html).
+
 ## Example
 
 ```{r}
@@ -45,7 +48,7 @@ After fitting a model, a [CmdStanMCMC](https://mc-stan.org/cmdstanr/reference/Cm
 that users who are already familiar with `cmdstanr` are free to do what they want with the fitted model.
 
 **Important!**
-**The data provided to `biokinetics$new` is converted to a base2 log scale before inference is performed. This means that if working
+**If you provide data on a natural scale, it will be converted to a base2 log scale before inference is performed. This means that if working
 directly with the fitted `CmdStanMCMC` all values will be on this scale. The package provides a helper function for converting
 back to the original scale: [convert_log_scale_inverse](../reference/convert_log_scale_inverse.html).**