Skip to content

Commit

Permalink
support log data
Browse files Browse the repository at this point in the history
  • Loading branch information
hillalex committed Oct 17, 2024
1 parent bc4da25 commit 75c60f5
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 18 deletions.
35 changes: 26 additions & 9 deletions R/biokinetics.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ biokinetics <- R6::R6Class(
"biokinetics",
cloneable = FALSE,
private = list(
scale = NULL,
priors = NULL,
preds_sd = NULL,
data = NULL,
Expand Down Expand Up @@ -55,7 +56,7 @@ biokinetics <- R6::R6Class(
},
construct_design_matrix = function() {
var <- pid <- NULL
dt_design_matrix <- private$data[, .SD, .SDcols = private$all_formula_vars, by = pid] |>
dt_design_matrix <- as.character(private$data[, .SD, .SDcols = private$all_formula_vars, by = pid]) |>
unique()

# Build the full design matrix using model.matrix
Expand Down Expand Up @@ -211,19 +212,22 @@ biokinetics <- R6::R6Class(
#' @param priors Object of type \link[epikinetics]{biokinetics_priors}. Default biokinetics_priors().
#' @param covariate_formula Formula specifying linear regression model. Note all variables in the formula
#' will be treated as categorical variables. Default ~0.
#' @param preds_sd Standard deviation of predictor coefficients. Default 0.25.
#' @param scale One of "log" or "natural". Default "natural". Is provided data on a log or a natural scale? If on a natural scale it
#' will be converted to a log scale for model fitting.
initialize = function(priors = biokinetics_priors(),
data = NULL,
file_path = NULL,
covariate_formula = ~0,
preds_sd = 0.25) {
preds_sd = 0.25,
scale = "natural") {
validate_priors(priors)
private$priors <- priors
validate_numeric(preds_sd)
private$preds_sd <- preds_sd
validate_formula(covariate_formula)
private$covariate_formula <- covariate_formula
private$all_formula_vars <- all.vars(covariate_formula)
private$scale <- scale
if (is.null(data) && is.null(file_path)) {
stop("One of 'data' or 'file_path' must be provided")
}
Expand All @@ -241,7 +245,9 @@ biokinetics <- R6::R6Class(
validate_required_cols(private$data)
validate_formula_vars(private$all_formula_vars, private$data)
logger::log_info("Preparing data for stan")
private$data <- convert_log_scale(private$data, "value")
if (scale == "natural") {
private$data <- convert_log_scale(private$data, "value")
}
private$data[, `:=`(obs_id = seq_len(.N),
t_since_last_exp = as.integer(day - last_exp_day, units = "days"))]
if (!("censored" %in% colnames(private$data))) {
Expand All @@ -263,6 +269,9 @@ biokinetics <- R6::R6Class(
get_stan_data = function() {
private$stan_input_data
},
get_covariate_lookup_table = function() {
private$covariate_lookup_table
},
#' @description Fit the model and return CmdStanMCMC fitted model object.
#' @return A CmdStanMCMC fitted model object: <https://mc-stan.org/cmdstanr/reference/CmdStanMCMC.html>
#' @param ... Named arguments to the `sample()` method of CmdStan model.
Expand Down Expand Up @@ -376,6 +385,10 @@ biokinetics <- R6::R6Class(
dt_out <- dt_out[
, lapply(.SD, function(x) if (is.factor(x)) forcats::fct_drop(x) else x)]

if (private$scale == "log") {
return(dt_out)
}

if (summarise) {
dt_out <- convert_log_scale_inverse(
dt_out, vars_to_transform = c("me", "lo", "hi"))
Expand Down Expand Up @@ -416,8 +429,10 @@ biokinetics <- R6::R6Class(
logger::log_info("Recovering covariate names")
dt_peak_switch <- private$recover_covariate_names(dt_peak_switch)

dt_peak_switch <- convert_log_scale_inverse(
dt_peak_switch, vars_to_transform = c("mu_0", "mu_p", "mu_s"))
if (private$scale == "natural") {
dt_peak_switch <- convert_log_scale_inverse(
dt_peak_switch, vars_to_transform = c("mu_0", "mu_p", "mu_s"))
}

logger::log_info("Calculating medians")
dt_peak_switch[
Expand Down Expand Up @@ -476,10 +491,12 @@ biokinetics <- R6::R6Class(
# Running the C++ code to simulate trajectories for each parameter sample
# for each individual
logger::log_info("Simulating individual trajectories")
dt_params_ind_traj <- biokinetics_simulate_trajectories(dt_params_ind)
dt_params_ind_traj <- data.table::setDT(biokinetics_simulate_trajectories(dt_params_ind))

dt_params_ind_traj <- data.table::setDT(convert_log_scale_inverse_cpp(
dt_params_ind_traj, vars_to_transform = "mu"))
if (private$scale == "natural") {
dt_params_ind_traj <- convert_log_scale_inverse_cpp(
dt_params_ind_traj, vars_to_transform = "mu")
}

# convert numeric pid to original pid
dt_params_ind_traj[, pid := names(private$pid_lookup)[pid]]
Expand Down
2 changes: 1 addition & 1 deletion src/stan/antibody_kinetics_main.stan
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ data {
array[N] int<lower=1, upper=K> titre_type; // Titre type for each observation
vector[N] t; // Time for each observation
vector[N] value; // Observed titre values
array[N] int<lower=-2, upper=1> censored; // Censoring indicator: -2 for lo, -1 for me 1 for upper, 0 for none
array[N] int<lower=-1, upper=1> censored; // Censoring indicator: -1 for lo, 1 for upper, 0 for none

// Indices for different censoring scenarios
int N_uncens; // number of uncensored observations
Expand Down
19 changes: 11 additions & 8 deletions vignettes/data.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,22 @@ knitr::opts_chunk$set(
The model requires time series data about individual titre readings, along with last exposure times. Times can be relative (e.g. day of study) or absolute (i.e. precise calendar dates).
The full list of required columns is as follows:

| name | type | description | required |
|--------------|----------------------|-----------------------------------------------------------------------------------------------------------------| -------- |
| pid | numeric or character | Unique identifier to identify a person across observations | T|
| day | integer or date | The day of the observation. Can be a date or an integer representing a relative day of study | T|
| last_exp_day | integer or date | The most recent day on which the person was exposed. Must be of the same type as the 'day' column | T|
| titre_type | character | Name of the titre or biomarker | T|
| value | numeric | Titre value | T|
| name | type | description | required |
|--------------|----------------------|--------------------------------------------------------------------------------------------| -------- |
| pid | numeric or character | Unique identifier to identify a person across observations | T|
| day | integer or date | The day of the observation. Can be a date or an integer representing a relative day of study | T|
| last_exp_day | integer or date | The most recent day on which the person was exposed. Must be of the same type as the 'day' column | T|
| titre_type | character | Name of the titre or biomarker | T|
| value | numeric | Titre value | T|
| censored | -1, 0 or 1 | Optional column. Whether this observation should be treated as censored: -1 for lower, 1 for upper, 0 for none. | F|

It can also contain further columns for any covariates to be included in the model. The data files installed with this package have additional columns infection_history, last_vax_type, and exp_num.

The model also accepts a covariate formula to define the regression model. The variables in the formula must correspond to column names in the dataset. Note that all variables will be treated as **categorical variables**; that is, converted to factors regardless of their input type.

Note also that the `value` column is assumed to be on a natural scale by default, and will be converted to a log scale for model fitting. If your data
is already on a log scale, you must pass the `log=TRUE` argument when initialising the biokinetics class. See [biokinetics](../reference/biokinetics.html).

## Example

```{r}
Expand All @@ -45,7 +48,7 @@ After fitting a model, a [CmdStanMCMC](https://mc-stan.org/cmdstanr/reference/Cm
that users who are already familiar with `cmdstanr` are free to do what they want with the fitted model.

**Important!**
**The data provided to `biokinetics$new` is converted to a base2 log scale before inference is performed. This means that if working
**If you provide data on a natural scale, it will be converted to a base2 log scale before inference is performed. This means that if working
directly with the fitted `CmdStanMCMC` all values will be on this scale. The package provides a helper function for converting
back to the original scale: [convert_log_scale_inverse](../reference/convert_log_scale_inverse.html).**

Expand Down

0 comments on commit 75c60f5

Please sign in to comment.