From cda1f757b0582b0f14169be475ee2e541b9fb895 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Wed, 11 Sep 2024 10:35:03 +0100 Subject: [PATCH 01/25] Updates for CRAN v0.1.2 --- DESCRIPTION | 2 +- cran-comments.md | 6 ------ man/dot-estimate_severity.Rd | 5 ++--- revdep/README.md | 32 +++++++++++++++++--------------- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 30f6737f..6ffdb3cb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: cfr Title: Estimate Disease Severity and Case Ascertainment -Version: 0.1.1.9000 +Version: 0.1.2 Authors@R: c( person("Pratik R.", "Gupte", , "pratik.gupte@lshtm.ac.uk", role = c("aut", "cph"), comment = c(ORCID = "0000-0001-5294-7819")), diff --git a/cran-comments.md b/cran-comments.md index 7247ee7f..24d7a764 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -3,12 +3,6 @@ 0 errors | 0 warnings | 1 note * Maintainer: ‘Adam Kucharski ’ - - New maintainer: - Adam Kucharski - - Old maintainer(s): - Pratik R. Gupte ## revdepcheck results diff --git a/man/dot-estimate_severity.Rd b/man/dot-estimate_severity.Rd index 3eb75c48..caf42aa1 100644 --- a/man/dot-estimate_severity.Rd +++ b/man/dot-estimate_severity.Rd @@ -49,9 +49,8 @@ approximated by a Poisson likelihood for large samples. \item When any two of \code{total_cases}, \code{total_deaths}, or \code{total_outcomes} are 0, the estimate and confidence intervals cannot be calculated and the output \verb{} contains only \code{NA}s. -\item When \code{total_outcomes <= total_deaths}, the confidence intervals cannot be -reliably calculated and are returned as \code{NA}. The severity estimate is returned -as \code{0.999}. +\item When \code{total_outcomes <= total_deaths}, estimate and confidence intervals +cannot be reliably calculated and are returned as \code{NA}. } } } diff --git a/revdep/README.md b/revdep/README.md index 3e724b59..02c27f27 100644 --- a/revdep/README.md +++ b/revdep/README.md @@ -1,23 +1,25 @@ # Platform -|field |value | -|:--------|:----------------------------| -|version |R version 4.4.0 (2024-04-24) | -|os |macOS Sonoma 14.4.1 | -|system |aarch64, darwin20 | -|ui |X11 | -|language |(EN) | -|collate |en_US.UTF-8 | -|ctype |en_US.UTF-8 | -|tz |Europe/London | -|date |2024-06-12 | -|pandoc |2.19 @ /usr/local/bin/pandoc | +|field |value | +|:--------|:------------------------------------------------------------------------------------------| +|version |R version 4.3.2 (2023-10-31) | +|os |macOS Monterey 12.6.3 | +|system |x86_64, darwin20 | +|ui |RStudio | +|language |(EN) | +|collate |en_US.UTF-8 | +|ctype |en_US.UTF-8 | +|tz |Europe/London | +|date |2024-09-11 | +|rstudio |2023.12.1+402 Ocean Storm (desktop) | +|pandoc |3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown) | # Dependencies -|package |old |new |Δ | -|:-------|:-----|:-----|:--| -|cfr |0.1.0 |0.1.1 |* | +|package |old |new |Δ | +|:---------|:-----|:-----|:--| +|cfr |0.1.1 |0.1.2 |* | +|checkmate |NA |2.3.2 |* | # Revdeps From c11871824059a07b4786dfe97ddfabd40fe45099 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Wed, 25 Sep 2024 17:17:35 +0100 Subject: [PATCH 02/25] Update example Speed up time varying --- R/cfr_time_varying.R | 2 +- man/cfr_time_varying.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/cfr_time_varying.R b/R/cfr_time_varying.R index e8a2aa57..39e622aa 100644 --- a/R/cfr_time_varying.R +++ b/R/cfr_time_varying.R @@ -72,7 +72,7 @@ #' @examples #' # get data pre-loaded with the package #' data("covid_data") -#' df_covid_uk <- covid_data[covid_data$country == "United Kingdom", ] +#' df_covid_uk <- covid_data[covid_data$country == "United Kingdom" & covid_data$date <= as.Date("2020-09-31"), ] #' #' # estimate time varying severity without correcting for delays #' cfr_time_varying <- cfr_time_varying( diff --git a/man/cfr_time_varying.Rd b/man/cfr_time_varying.Rd index 40b00a1d..dfc15d73 100644 --- a/man/cfr_time_varying.Rd +++ b/man/cfr_time_varying.Rd @@ -97,7 +97,7 @@ is specified. \examples{ # get data pre-loaded with the package data("covid_data") -df_covid_uk <- covid_data[covid_data$country == "United Kingdom", ] +df_covid_uk <- covid_data[covid_data$country == "United Kingdom" & covid_data$date <= as.Date("2020-09-31"), ] # estimate time varying severity without correcting for delays cfr_time_varying <- cfr_time_varying( From 16e4b3efd3c6c3f8669568fe6604266c8e4db478 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Wed, 25 Sep 2024 22:35:17 +0100 Subject: [PATCH 03/25] Fix date in example --- CRAN-SUBMISSION | 3 +++ R/cfr_time_varying.R | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 CRAN-SUBMISSION diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION new file mode 100644 index 00000000..fca72da5 --- /dev/null +++ b/CRAN-SUBMISSION @@ -0,0 +1,3 @@ +Version: 0.1.2 +Date: 2024-09-25 16:22:53.836 UTC +SHA: c11871824059a07b4786dfe97ddfabd40fe45099 diff --git a/R/cfr_time_varying.R b/R/cfr_time_varying.R index 39e622aa..f8b459e8 100644 --- a/R/cfr_time_varying.R +++ b/R/cfr_time_varying.R @@ -72,7 +72,8 @@ #' @examples #' # get data pre-loaded with the package #' data("covid_data") -#' df_covid_uk <- covid_data[covid_data$country == "United Kingdom" & covid_data$date <= as.Date("2020-09-31"), ] +#' df_covid_uk <- covid_data[covid_data$country == "United Kingdom" & +#' covid_data$date <= as.Date("2020-09-01"), ] #' #' # estimate time varying severity without correcting for delays #' cfr_time_varying <- cfr_time_varying( From d017cfa623a5bcad503da1f47428b7c90356c131 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Thu, 26 Sep 2024 06:48:13 +0100 Subject: [PATCH 04/25] Update example for linting --- CRAN-SUBMISSION | 4 ++-- man/cfr_time_varying.Rd | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index fca72da5..da0107ea 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ Version: 0.1.2 -Date: 2024-09-25 16:22:53.836 UTC -SHA: c11871824059a07b4786dfe97ddfabd40fe45099 +Date: 2024-09-25 21:37:10 UTC +SHA: 16e4b3efd3c6c3f8669568fe6604266c8e4db478 diff --git a/man/cfr_time_varying.Rd b/man/cfr_time_varying.Rd index dfc15d73..8ea9a8b0 100644 --- a/man/cfr_time_varying.Rd +++ b/man/cfr_time_varying.Rd @@ -97,7 +97,8 @@ is specified. \examples{ # get data pre-loaded with the package data("covid_data") -df_covid_uk <- covid_data[covid_data$country == "United Kingdom" & covid_data$date <= as.Date("2020-09-31"), ] +df_covid_uk <- covid_data[covid_data$country == "United Kingdom" & +covid_data$date <= as.Date("2020-09-01"), ] # estimate time varying severity without correcting for delays cfr_time_varying <- cfr_time_varying( From 4b4ea5f896260ff6aa4a9734ffbab9c62aaeb44e Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Thu, 26 Sep 2024 13:50:21 +0100 Subject: [PATCH 05/25] Update CRAN-SUBMISSION --- CRAN-SUBMISSION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index da0107ea..c7e23dd3 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ Version: 0.1.2 -Date: 2024-09-25 21:37:10 UTC -SHA: 16e4b3efd3c6c3f8669568fe6604266c8e4db478 +Date: 2024-09-26 09:37:53 UTC +SHA: d017cfa623a5bcad503da1f47428b7c90356c131 From 0742d920a09ef5cb7dd89f11d448f55eb6a81f17 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Fri, 27 Sep 2024 11:32:45 +0100 Subject: [PATCH 06/25] Remove epiparameter dependency --- vignettes/estimate_from_individual_data.Rmd | 169 ++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 vignettes/estimate_from_individual_data.Rmd diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd new file mode 100644 index 00000000..384078fe --- /dev/null +++ b/vignettes/estimate_from_individual_data.Rmd @@ -0,0 +1,169 @@ +--- +title: "Estimating fatality risk from individual level data" +output: + bookdown::html_vignette2: + fig_caption: yes + code_folding: show +bibliography: resources/library.json +link-citations: true +vignette: > + %\VignetteIndexEntry{Estimating fatality risk from individual level data} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + dpi = 300, + fig.width = 5, fig.height = 3 +) +``` + +The main examples in _cfr_ use incidence data (i.e. counts of cases and deaths by day). However, in some instances individual-level data will be available, which opens up additional analysis options, but also some potential biases. + +::: {.alert .alert-primary} +## Use case {-} + +We want to estimate the case fatality risk (CFR) from individual-level linelist data. +::: + +::: {.alert .alert-secondary} +### What we have {-} + +* A linelist of case onset timings, as well as outcome (i.e. death, recovery) if it has occurred by this point. +* Data on the distribution of delay from onset-to-death, describing the probability an individual will die $t$ days after they were initially infected. +::: + +```{r, message = FALSE, warning=FALSE, eval = TRUE} +# load {cfr} and data packages +library(cfr) + +# packages to wrangle and plot data +library(dplyr) +library(magrittr) +library(ggplot2) +library(tidyr) +library(incidence2) +``` + +## Estimation based on cases with known outcomes + +If we have individual-level data on case onset timings and outcomes where available, it is common to filter the data to focus on cases only with known outcomes. This means the case fatality risk can then be calculated from this filtered known outcome dataset: +$$ +\frac{\text{Total deaths}}{\text{Total deaths}+ \text{Total recoveries}} +$$ + +One limitation of this method is that it implicity assumes the delay from onset-to-death and onset-to-recovery are the same (see below for the mathematical proof). If these delays are noticeably different, the above calculation will produce a biased estimate of the CFR. For example, if onset-to-recovery is much longer than onset to death, then the denominator above will be an underestimate of the true number of cases that would eventually recover who have similar onset timings to those that would eventually die. Hence this calculation would inflate the CFR estimate. After the mathematical explanation below, the next section will discuss estimation methods based on the expected number of known death outcomes using functionality in _cfr_. + +### Mathematical explanation for bias +If we have a linelist with known outcomes and onset-death delay follows the same distribution as onset-recovery, and CFR = $p$, then at the current time, + +$$ +E(\text{Total deaths to date}) = \sum_{t} p \sum_j f_{j} I_{t-j} +$$ + +and + +$$ +E(\text{Total recoveries to date}) = \sum_{t}(1-p) \sum_j f_{j} I_{t-j} +$$ + +where $I_t$ is number of new symptomatic infections on day $t$ and $f_i$ is the probability of outcome $i$ days after symptom onset. + +Hence +$$ +E(\text{Total deaths})/ [ E(\text{Total deaths})+E(\text{Total recoveries}) ] \\ +=[ \sum_{t} p \sum_j f_{j} I_{t-j} ]/[ \sum_{t} p \sum_j f_{j} I_{t-j} + \sum_{t}(1-p) \sum_j f_{j} I_{t-j}] \\ +=[p \sum_{t} \sum_j f_{j} I_{t-j} ]/[ \sum_{t} \sum_j f_{j} I_{t-j} ] = p +$$ + +However, if delay to death $f_j^D$ is different to delay to recovery $f_j^R$, we have: + +$$ +E(\text{Total deaths})/ [ E(\text{Total deaths})+E(\text{Total recoveries}) ] = \frac{\sum_{t} \sum_j p f_j^D I_{t-j}}{\sum_{t} \sum_j p f_j^D I_{t-j} + \sum_{t} \sum_j (1-p) f_j^R I_{t-j}} +$$ +And hence this will not simplify to provide an unbiased estimate of $p$. + +## Estimation based on expected number of known death outcomes + +If the delay from onset-to-death and onset-to-recovery are different, one option is to use [survival analysis methods](https://academic.oup.com/aje/article/162/5/479/82647) to estimate relative hazards (i.e. fatality risk) over time. + +If we are only interested in an overall estimate of CFR, a simpler alternative is to first calculate the proportion of cases in the linelist that we would expect to have a known outcome by this point if the outcome were fatal. We can then use this estimated proportion with known fatal outcomes to calculate the CFR. This is the calculation performed by `cfr_static()`, and hence this function can give us a better estimate of CFR when delays to death and recovery are not the same. + +## Simulated comparison of above methods + +To compare these methods, we first simulate some case onset timings and outcomes (i.e. deaths and recoveries) + +```{r} +# Simulate data +nn <- 1e3 # Number of cases to simulate +pp <- 0.1 # Assumed CFR +set.seed(10) # Set seed for reproducibility + +# Generate random case onset timings in Jan & Feb 2024 +case_onsets <- as.Date("2024-01-01") + sample(1:60,nn,replace=T) + +# Define current date of data availability (i.e. max follow up) +max_obs <- as.Date("2024-01-20") + +# Sample delays from onset to outcome + +# 1. Deaths: assume mean delay = 5 days, sd = 5 days +# log_param <- epiparameter::convert_summary_stats_to_params("lnorm",mean=5,sd=5) +log_param <- c(meanlog=1.262864,sdlog=0.8325546) +delay_death <- function(x){dlnorm(x,meanlog = log_param$meanlog,sdlog = log_param$sdlog)} + +outcome_death <- round(rlnorm(round(nn*pp),meanlog = log_param$meanlog,sdlog = log_param$sdlog)) + +# 2. Recoveries: assume mean delay = 15 days, sd = 5 days +# log_param <- epiparameter::convert_summary_stats_to_params("lnorm",mean=15,sd=5) +log_param <- c(meanlog=2.65537,sdlog=0.3245928) +outcome_recovery <- round(rlnorm(round(nn*(1-pp)),meanlog = log_param$meanlog,sdlog = log_param$sdlog)) + +# Create vector of outcome dates +all_outcomes <- case_onsets + c(outcome_death,outcome_recovery) + +# Create vector of outcome types +type_outcome <- c(rep("D",length(outcome_death)),rep("R",length(outcome_recovery))) + +# Create vector of known outcomes as of max_obs +known_outcomes <- type_outcome; known_outcomes[(all_outcomes>max_obs)] <- "" +``` + +First, we calculate CFR by filtering to focus only on cases with known outcomes: +```{r} +# Filter on known outcomes +total_deaths <- sum(known_outcomes=="D") +total_outcomes <- (total_deaths+sum(known_outcomes=="R")) + +# Calculate CFR with 95% CI +cfr_filter <- binom.test(total_deaths,total_outcomes) +cfr_estimate <- (signif(as.numeric(c(cfr_filter$estimate,cfr_filter$conf.int)),3)) +cfr_estimate +``` + +Next, we calculate CFR based on expected fatal outcomes known to date, using `cfr_static()` +```{r} +# Get times of death for fatal outcomes +death_times <- (case_onsets)[type_outcome=="D"]+outcome_death + +# Create a single data.frame with event types +events <- data.frame( + dates = c(case_onsets, death_times), + event = c(rep("cases", length(case_onsets)), rep("deaths", length(death_times))) +) + +# Use incidence2 to calculate counts of cases and deaths by day +counts <- incidence2::incidence(events, date_index = "dates", groups = "event", complete_dates = TRUE) + +# Pivot the incidence object to get a data frame with counts for both cases and deaths +df <- counts %>% + tidyr::pivot_wider(names_from = event, values_from = count, values_fill = 0) %>% + dplyr::rename(date=date_index) + +cfr_static(df,delay_death) +``` From a2c541accb3fdecafbd23a31c6217e545bf9b89a Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Fri, 27 Sep 2024 11:52:45 +0100 Subject: [PATCH 07/25] Add plot --- vignettes/estimate_from_individual_data.Rmd | 57 ++++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index 384078fe..3a550d12 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -96,7 +96,7 @@ If we are only interested in an overall estimate of CFR, a simpler alternative i ## Simulated comparison of above methods -To compare these methods, we first simulate some case onset timings and outcomes (i.e. deaths and recoveries) +To compare these methods, we first simulate 1000 case onset timings and outcomes (i.e. deaths and recoveries) ```{r} # Simulate data @@ -116,7 +116,6 @@ max_obs <- as.Date("2024-01-20") # log_param <- epiparameter::convert_summary_stats_to_params("lnorm",mean=5,sd=5) log_param <- c(meanlog=1.262864,sdlog=0.8325546) delay_death <- function(x){dlnorm(x,meanlog = log_param$meanlog,sdlog = log_param$sdlog)} - outcome_death <- round(rlnorm(round(nn*pp),meanlog = log_param$meanlog,sdlog = log_param$sdlog)) # 2. Recoveries: assume mean delay = 15 days, sd = 5 days @@ -134,6 +133,57 @@ type_outcome <- c(rep("D",length(outcome_death)),rep("R",length(outcome_recovery known_outcomes <- type_outcome; known_outcomes[(all_outcomes>max_obs)] <- "" ``` +```{r fig.cap = "Individual level timings of onsets and outcomes, for individuals with a known outcome as of 20th Jan 2024.", class.source = 'fold-hide'} + +# Create a data frame with onset and outcome dates, and outcome type +data <- data.frame( + id = 1:nn, + case_onsets = case_onsets, + outcome_dates = all_outcomes, + outcome_type = type_outcome, + known_outcome = known_outcomes +) + +# Filter out unknown outcomes (after the max_obs date) +data <- data %>% filter(known_outcome != "") + +# Arrange data by onset date +data <- data %>% + arrange(case_onsets) %>% + mutate(id_ordered = row_number()) # Assign a new 'id_ordered' based on sorted onset dates + +# Prepare the data for plotting (reshape long format to have onset and outcome events in the same column) +plot_data <- data %>% + tidyr::pivot_longer( + cols = c(case_onsets, outcome_dates), + names_to = "event_type", + values_to = "date" + ) %>% + mutate( + event_label = ifelse(event_type == "case_onsets", "Onset", "Outcome") + ) + +# Create the plot with lines linking onset and outcome, and add vertical line for max_obs +ggplot() + + # Add lines connecting onset to outcome using the original 'data' + geom_segment(data = data, aes(x = case_onsets, xend = outcome_dates, y = id_ordered, yend = id_ordered), + color = ifelse(data$outcome_type == "D", "red", "green"), size = 0.5) + + # Plot dots for onset and outcome using the reshaped 'plot_data' + geom_point(data = plot_data, aes(x = date, y = id_ordered, color = outcome_type, shape = event_label), size = 2) + + # Add vertical dashed line at max_obs date + geom_vline(xintercept = as.numeric(max_obs), linetype = "dashed", color = "black", size = 0.5) + + scale_color_manual(values = c("D" = "darkred", "R" = "darkgreen")) + # Color by death/recovery + scale_shape_manual(values = c("Onset" = 16, "Outcome" = 17)) + # Different shapes for onset and outcome + labs( + x = "Date", + y = "Individual (Ordered by onset date)", + color = "Outcome type", + shape = "Event type", + title = "Onset and outcome dates by individual" + ) + + theme_minimal() +``` + First, we calculate CFR by filtering to focus only on cases with known outcomes: ```{r} # Filter on known outcomes @@ -167,3 +217,6 @@ df <- counts %>% cfr_static(df,delay_death) ``` +Hence in this particular simulation, the `cfr_static()` method recovers the correct CFR of 10% (95% CI: 8.3-12.0%), whereas the filtering method produces a biased estimate of 26.4% (17.6-37.0%). + +In an extreme scenario where recoveries are not reported, then we effectively have `outcome_recovery()` with an infinite mean, and the above methods will still apply, with the same bias for the filtering approach. \ No newline at end of file From 0d9cd80f274bc4922fc88cdbdda004337ed711a4 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Fri, 27 Sep 2024 14:11:32 +0100 Subject: [PATCH 08/25] Fix syntax error --- vignettes/estimate_from_individual_data.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index 3a550d12..ee4f33c4 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -115,13 +115,13 @@ max_obs <- as.Date("2024-01-20") # 1. Deaths: assume mean delay = 5 days, sd = 5 days # log_param <- epiparameter::convert_summary_stats_to_params("lnorm",mean=5,sd=5) log_param <- c(meanlog=1.262864,sdlog=0.8325546) -delay_death <- function(x){dlnorm(x,meanlog = log_param$meanlog,sdlog = log_param$sdlog)} -outcome_death <- round(rlnorm(round(nn*pp),meanlog = log_param$meanlog,sdlog = log_param$sdlog)) +delay_death <- function(x){dlnorm(x,meanlog = log_param[["meanlog"]],sdlog = log_param[["sdlog"]])} +outcome_death <- round(rlnorm(round(nn*pp),meanlog = log_param[["meanlog"]],sdlog = log_param[["sdlog"]])) # 2. Recoveries: assume mean delay = 15 days, sd = 5 days # log_param <- epiparameter::convert_summary_stats_to_params("lnorm",mean=15,sd=5) log_param <- c(meanlog=2.65537,sdlog=0.3245928) -outcome_recovery <- round(rlnorm(round(nn*(1-pp)),meanlog = log_param$meanlog,sdlog = log_param$sdlog)) +outcome_recovery <- round(rlnorm(round(nn*(1-pp)),meanlog = log_param[["meanlog"]],sdlog = log_param[["sdlog"]])) # Create vector of outcome dates all_outcomes <- case_onsets + c(outcome_death,outcome_recovery) From 9ab89769b773c05fc8364361e28480777818c7d7 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Thu, 17 Oct 2024 17:45:00 +0100 Subject: [PATCH 09/25] Expand vignette Add only total deaths known --- vignettes/estimate_from_individual_data.Rmd | 65 +++++++++++++++++++-- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index ee4f33c4..a0343393 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -19,11 +19,11 @@ knitr::opts_chunk$set( collapse = TRUE, comment = "#>", dpi = 300, - fig.width = 5, fig.height = 3 + out.width = "100%" ) ``` -The main examples in _cfr_ use incidence data (i.e. counts of cases and deaths by day). However, in some instances individual-level data will be available, which opens up additional analysis options, but also some potential biases. +The main examples in the _cfr_ package use incidence data (i.e. counts of cases and deaths by day). However, in some instances individual-level data will be available, which opens up additional analysis options, but also some potential biases. ::: {.alert .alert-primary} ## Use case {-} @@ -92,7 +92,19 @@ And hence this will not simplify to provide an unbiased estimate of $p$. If the delay from onset-to-death and onset-to-recovery are different, one option is to use [survival analysis methods](https://academic.oup.com/aje/article/162/5/479/82647) to estimate relative hazards (i.e. fatality risk) over time. -If we are only interested in an overall estimate of CFR, a simpler alternative is to first calculate the proportion of cases in the linelist that we would expect to have a known outcome by this point if the outcome were fatal. We can then use this estimated proportion with known fatal outcomes to calculate the CFR. This is the calculation performed by `cfr_static()`, and hence this function can give us a better estimate of CFR when delays to death and recovery are not the same. +If we are only interested in an overall estimate of CFR, a simpler alternative is to first calculate the number of cases in the linelist that we would expect to have a known outcome by this point if the outcome were fatal: +$$ +E(\text{deaths by time }t) += p \sum_{t} \sum_j f_{j} I_{t-j} +$$ +where $p$ is the CFR. + +We can then rearrange the above to calculate the CFR: +$$ +\text{Total deaths} = p \sum_{t} \sum_j f_{j} I_{t-j} \\ +p = \frac{\text{Total deaths}}{\sum_{t} \sum_j f_{j} I_{t-j} } +$$ +This is the calculation performed by `cfr_static()`, and hence this function can give us a better estimate of CFR when delays to death and recovery are not the same. ## Simulated comparison of above methods @@ -178,8 +190,7 @@ ggplot() + x = "Date", y = "Individual (Ordered by onset date)", color = "Outcome type", - shape = "Event type", - title = "Onset and outcome dates by individual" + shape = "Event type" ) + theme_minimal() ``` @@ -219,4 +230,46 @@ cfr_static(df,delay_death) ``` Hence in this particular simulation, the `cfr_static()` method recovers the correct CFR of 10% (95% CI: 8.3-12.0%), whereas the filtering method produces a biased estimate of 26.4% (17.6-37.0%). -In an extreme scenario where recoveries are not reported, then we effectively have `outcome_recovery()` with an infinite mean, and the above methods will still apply, with the same bias for the filtering approach. \ No newline at end of file +## Deaths reported but not recoveries + +In an extreme scenario where recoveries are not reported, then we effectively have the values of `outcome_recovery` generated from a distribution with an infinite mean, and the above methods will still apply, with the same bias for the filtering approach. In particular, we would expect: +$$ +E(\text{Total deaths})/ [ E(\text{Total deaths})+E(\text{Total recoveries}) ] \rightarrow 1 \\ +\text{as } E(\text{Total recoveries}) \rightarrow 0 +$$ +## Only total deaths reported + +In some situations, we may have a timeseries of cases but not deaths. However, we can still use the earlier calculation to derive an unbiased CFR: +$$ +\text{Total deaths} = p \sum_{t} \sum_j f_{j} I_{t-j} \\ +p = \frac{\text{Total deaths}}{\sum_{t} \sum_j f_{j} I_{t-j} } +$$ +We can do this in _cfr_ using the `estimate_outcomes()` function to calculate the expected number of cases with known fatal outcomes in the above denominator: +```{r} +# Calculate total deaths and total cases +total_deaths <- sum(df$deaths) +total_cases <- sum(df$cases) + +# Create data.frame with cases over time only +df_case <- df +df_case$deaths <- 0 + +# Calculate the expected number of known fatal outcomes over time +e_outcomes <- estimate_outcomes(df_case,delay_death) + +# Calculate the CFR +total_deaths/(total_cases*tail(e_outcomes$u_t,1)) +``` +As before, this produces a point estimate of 10.4%, because it is the same underlying method: the internal function code in `cfr_static()`, which calls `estimate_outcomes()`, also tallies up total death. An arguably more elegant approach is to therefore add total deaths to the case data.frame, and use as an input for `cfr_static()`: + +```{r} +# Create data.frame with cases over time only +df_cases <- df +df_cases$deaths <- 0 + +# Add total deaths to the final row in the 'deaths' column +df_cases$deaths[nrow(df_case)] <- sum(df$deaths) + +# Calculate CFR +cfr_static(df_cases,delay_death) +``` From 079bdf7ac1eb0b6c081383d18b5ad16da34e3dc7 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Thu, 17 Oct 2024 23:33:32 +0100 Subject: [PATCH 10/25] Update estimate_from_individual_data.Rmd --- vignettes/estimate_from_individual_data.Rmd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index a0343393..878d0340 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -233,10 +233,13 @@ Hence in this particular simulation, the `cfr_static()` method recovers the corr ## Deaths reported but not recoveries In an extreme scenario where recoveries are not reported, then we effectively have the values of `outcome_recovery` generated from a distribution with an infinite mean, and the above methods will still apply, with the same bias for the filtering approach. In particular, we would expect: + $$ E(\text{Total deaths})/ [ E(\text{Total deaths})+E(\text{Total recoveries}) ] \rightarrow 1 \\ \text{as } E(\text{Total recoveries}) \rightarrow 0 $$ +And hence the calculated CFR to incorrectly converge to 1 as the proportion of recoveries reported declines to 0. + ## Only total deaths reported In some situations, we may have a timeseries of cases but not deaths. However, we can still use the earlier calculation to derive an unbiased CFR: From c9a9228e903e528c52bdacfc81815410075982fe Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Thu, 17 Oct 2024 23:34:34 +0100 Subject: [PATCH 11/25] Update estimate_from_individual_data.Rmd --- vignettes/estimate_from_individual_data.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index 878d0340..d9ab5cd3 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -244,8 +244,8 @@ And hence the calculated CFR to incorrectly converge to 1 as the proportion of r In some situations, we may have a timeseries of cases but not deaths. However, we can still use the earlier calculation to derive an unbiased CFR: $$ -\text{Total deaths} = p \sum_{t} \sum_j f_{j} I_{t-j} \\ -p = \frac{\text{Total deaths}}{\sum_{t} \sum_j f_{j} I_{t-j} } +E(\text{Total deaths}) = p \sum_{t} \sum_j f_{j} I_{t-j} \\ +E(p) = \frac{\text{Total deaths}}{\sum_{t} \sum_j f_{j} I_{t-j} } $$ We can do this in _cfr_ using the `estimate_outcomes()` function to calculate the expected number of cases with known fatal outcomes in the above denominator: ```{r} From 6b3b5da990c15b245e6153d2f0511b3c9e176742 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Thu, 24 Oct 2024 17:58:12 -1000 Subject: [PATCH 12/25] Example with only totals --- vignettes/estimate_from_individual_data.Rmd | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index d9ab5cd3..99a0d34f 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -276,3 +276,19 @@ df_cases$deaths[nrow(df_case)] <- sum(df$deaths) # Calculate CFR cfr_static(df_cases,delay_death) ``` + +## Only total cases and deaths reported + +If only total cases and total deaths are known, it is not possible to adjust for delays to outcome. However, we can bound our estimate of CFR by two extreme scenarios. First, if the outbreak is over, and sufficient time has passed so that all fatal outcomes are now known, i.e. +$$ +\sum_{t} \sum_j f_{j} I_{t-j} = \sum_{t} I_t +$$ +then the CFR would be equal to: + +$$ +p = \frac{\text{Total deaths}}{\text{Total cases}} +$$ +In contrast, if the epidemic is rapidly growing and still in its early stages, then even an epidemic with a CFR of 1 may have only generated a very small number of fatal outcomes to date. Hence we have the following bounds on the possible CFR: +$$ +\frac{\text{Total deaths}}{\text{Total cases}} \leq p \leq 1 +$$ From 793f6ae35911be892e57b218f2e4d477cbe46601 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Wed, 30 Oct 2024 20:08:24 -1000 Subject: [PATCH 13/25] Fix linting --- vignettes/estimate_from_individual_data.Rmd | 107 ++++++++++++-------- 1 file changed, 67 insertions(+), 40 deletions(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index 99a0d34f..200c4cf5 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -117,36 +117,46 @@ pp <- 0.1 # Assumed CFR set.seed(10) # Set seed for reproducibility # Generate random case onset timings in Jan & Feb 2024 -case_onsets <- as.Date("2024-01-01") + sample(1:60,nn,replace=T) +case_onsets <- as.Date("2024-01-01") + sample(1:60, nn, replace = TRUE) # Define current date of data availability (i.e. max follow up) -max_obs <- as.Date("2024-01-20") +max_obs <- as.Date("2024-01-20") # Sample delays from onset to outcome # 1. Deaths: assume mean delay = 5 days, sd = 5 days -# log_param <- epiparameter::convert_summary_stats_to_params("lnorm",mean=5,sd=5) -log_param <- c(meanlog=1.262864,sdlog=0.8325546) -delay_death <- function(x){dlnorm(x,meanlog = log_param[["meanlog"]],sdlog = log_param[["sdlog"]])} -outcome_death <- round(rlnorm(round(nn*pp),meanlog = log_param[["meanlog"]],sdlog = log_param[["sdlog"]])) +log_param <- c(meanlog = 1.262864, sdlog = 0.8325546) +delay_death <- function(x) { + dlnorm(x, + meanlog = log_param[["meanlog"]], + sdlog = log_param[["sdlog"]] + ) +} +outcome_death <- round(rlnorm(round(nn * pp), + meanlog = log_param[["meanlog"]], + sdlog = log_param[["sdlog"]] +)) # 2. Recoveries: assume mean delay = 15 days, sd = 5 days -# log_param <- epiparameter::convert_summary_stats_to_params("lnorm",mean=15,sd=5) -log_param <- c(meanlog=2.65537,sdlog=0.3245928) -outcome_recovery <- round(rlnorm(round(nn*(1-pp)),meanlog = log_param[["meanlog"]],sdlog = log_param[["sdlog"]])) +log_param <- c(meanlog = 2.65537, sdlog = 0.3245928) +outcome_recovery <- round(rlnorm(round(nn * (1 - pp)), + meanlog = log_param[["meanlog"]], + sdlog = log_param[["sdlog"]] +)) # Create vector of outcome dates -all_outcomes <- case_onsets + c(outcome_death,outcome_recovery) +all_outcomes <- case_onsets + c(outcome_death, outcome_recovery) # Create vector of outcome types -type_outcome <- c(rep("D",length(outcome_death)),rep("R",length(outcome_recovery))) +type_outcome <- c(rep("D", length(outcome_death)), + rep("R", length(outcome_recovery))) # Create vector of known outcomes as of max_obs -known_outcomes <- type_outcome; known_outcomes[(all_outcomes>max_obs)] <- "" +known_outcomes <- type_outcome +known_outcomes[(all_outcomes > max_obs)] <- "" ``` ```{r fig.cap = "Individual level timings of onsets and outcomes, for individuals with a known outcome as of 20th Jan 2024.", class.source = 'fold-hide'} - # Create a data frame with onset and outcome dates, and outcome type data <- data.frame( id = 1:nn, @@ -162,9 +172,9 @@ data <- data %>% filter(known_outcome != "") # Arrange data by onset date data <- data %>% arrange(case_onsets) %>% - mutate(id_ordered = row_number()) # Assign a new 'id_ordered' based on sorted onset dates + mutate(id_ordered = row_number()) # Assign a new 'id_ordered' -# Prepare the data for plotting (reshape long format to have onset and outcome events in the same column) +# Prepare data for plotting (onset and outcome events in same column) plot_data <- data %>% tidyr::pivot_longer( cols = c(case_onsets, outcome_dates), @@ -175,17 +185,27 @@ plot_data <- data %>% event_label = ifelse(event_type == "case_onsets", "Onset", "Outcome") ) -# Create the plot with lines linking onset and outcome, and add vertical line for max_obs +# Create plot with lines linking onset and outcome ggplot() + - # Add lines connecting onset to outcome using the original 'data' - geom_segment(data = data, aes(x = case_onsets, xend = outcome_dates, y = id_ordered, yend = id_ordered), - color = ifelse(data$outcome_type == "D", "red", "green"), size = 0.5) + - # Plot dots for onset and outcome using the reshaped 'plot_data' - geom_point(data = plot_data, aes(x = date, y = id_ordered, color = outcome_type, shape = event_label), size = 2) + - # Add vertical dashed line at max_obs date - geom_vline(xintercept = as.numeric(max_obs), linetype = "dashed", color = "black", size = 0.5) + - scale_color_manual(values = c("D" = "darkred", "R" = "darkgreen")) + # Color by death/recovery - scale_shape_manual(values = c("Onset" = 16, "Outcome" = 17)) + # Different shapes for onset and outcome + geom_segment( + data = data, aes(x = case_onsets, + xend = outcome_dates, + y = id_ordered, + yend = id_ordered), + color = ifelse(data$outcome_type == "D", "red", "green"), + size = 0.5 + ) + + geom_point(data = plot_data, aes(x = date, + y = id_ordered, + color = outcome_type, + shape = event_label), + size = 2) + + geom_vline(xintercept = as.numeric(max_obs), + linetype = "dashed", + color = "black", + size = 0.5) + + scale_color_manual(values = c(D = "darkred", R = "darkgreen")) + + scale_shape_manual(values = c(Onset = 16, Outcome = 17)) + labs( x = "Date", y = "Individual (Ordered by onset date)", @@ -198,35 +218,42 @@ ggplot() + First, we calculate CFR by filtering to focus only on cases with known outcomes: ```{r} # Filter on known outcomes -total_deaths <- sum(known_outcomes=="D") -total_outcomes <- (total_deaths+sum(known_outcomes=="R")) +total_deaths <- sum(known_outcomes == "D") +total_outcomes <- (total_deaths + sum(known_outcomes == "R")) # Calculate CFR with 95% CI -cfr_filter <- binom.test(total_deaths,total_outcomes) -cfr_estimate <- (signif(as.numeric(c(cfr_filter$estimate,cfr_filter$conf.int)),3)) +cfr_filter <- binom.test(total_deaths, total_outcomes) +cfr_estimate <- (signif(as.numeric(c(cfr_filter$estimate, + cfr_filter$conf.int)), 3)) cfr_estimate ``` Next, we calculate CFR based on expected fatal outcomes known to date, using `cfr_static()` ```{r} # Get times of death for fatal outcomes -death_times <- (case_onsets)[type_outcome=="D"]+outcome_death +death_times <- (case_onsets)[type_outcome == "D"] + outcome_death # Create a single data.frame with event types events <- data.frame( dates = c(case_onsets, death_times), - event = c(rep("cases", length(case_onsets)), rep("deaths", length(death_times))) + event = c(rep("cases", length(case_onsets)), + rep("deaths", length(death_times))) ) # Use incidence2 to calculate counts of cases and deaths by day -counts <- incidence2::incidence(events, date_index = "dates", groups = "event", complete_dates = TRUE) +counts <- incidence2::incidence(events, + date_index = "dates", + groups = "event", + complete_dates = TRUE) -# Pivot the incidence object to get a data frame with counts for both cases and deaths +# Pivot incidence object to get data.frame with counts for cases and deaths df <- counts %>% - tidyr::pivot_wider(names_from = event, values_from = count, values_fill = 0) %>% - dplyr::rename(date=date_index) - -cfr_static(df,delay_death) + tidyr::pivot_wider(names_from = event, + values_from = count, + values_fill = 0) %>% + dplyr::rename(date = date_index) + +cfr_static(df, delay_death) ``` Hence in this particular simulation, the `cfr_static()` method recovers the correct CFR of 10% (95% CI: 8.3-12.0%), whereas the filtering method produces a biased estimate of 26.4% (17.6-37.0%). @@ -258,10 +285,10 @@ df_case <- df df_case$deaths <- 0 # Calculate the expected number of known fatal outcomes over time -e_outcomes <- estimate_outcomes(df_case,delay_death) +e_outcomes <- estimate_outcomes(df_case, delay_death) # Calculate the CFR -total_deaths/(total_cases*tail(e_outcomes$u_t,1)) +total_deaths / (total_cases * tail(e_outcomes$u_t, 1)) ``` As before, this produces a point estimate of 10.4%, because it is the same underlying method: the internal function code in `cfr_static()`, which calls `estimate_outcomes()`, also tallies up total death. An arguably more elegant approach is to therefore add total deaths to the case data.frame, and use as an input for `cfr_static()`: @@ -274,7 +301,7 @@ df_cases$deaths <- 0 df_cases$deaths[nrow(df_case)] <- sum(df$deaths) # Calculate CFR -cfr_static(df_cases,delay_death) +cfr_static(df_cases, delay_death) ``` ## Only total cases and deaths reported From 48368eeefd3f05fb8cfffd148ff981069db0e7d8 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Wed, 30 Oct 2024 20:30:47 -1000 Subject: [PATCH 14/25] Update news and version --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6ffdb3cb..24418283 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: cfr Title: Estimate Disease Severity and Case Ascertainment -Version: 0.1.2 +Version: 0.1.3 Authors@R: c( person("Pratik R.", "Gupte", , "pratik.gupte@lshtm.ac.uk", role = c("aut", "cph"), comment = c(ORCID = "0000-0001-5294-7819")), diff --git a/NEWS.md b/NEWS.md index 68dbd734..063efe59 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # cfr (development version) +# cfr 0.1.3 + +Added vignette `estimate_from_individual_data.Rmd` describing relationship between individual-level data and aggregate estimation. + # cfr 0.1.2 Updated version to fix instability in normal approximation with displayed Ebola example. This release includes: From d0fd4d36807fdabd5abec0c6e9134dcc1939d493 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Wed, 30 Oct 2024 20:45:10 -1000 Subject: [PATCH 15/25] Fix spelling --- inst/WORDLIST | 3 +++ vignettes/estimate_from_individual_data.Rmd | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/inst/WORDLIST b/inst/WORDLIST index e31d8e7f..27558687 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -51,12 +51,14 @@ epiday epiparameter epiweek etc +frac geq ggplot gh github infty io +leq lifecycle linelist lockdowns @@ -67,6 +69,7 @@ packagename parameterise parameterised pkg +rightarrow sim st svg diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index 200c4cf5..774693bf 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -57,7 +57,7 @@ $$ \frac{\text{Total deaths}}{\text{Total deaths}+ \text{Total recoveries}} $$ -One limitation of this method is that it implicity assumes the delay from onset-to-death and onset-to-recovery are the same (see below for the mathematical proof). If these delays are noticeably different, the above calculation will produce a biased estimate of the CFR. For example, if onset-to-recovery is much longer than onset to death, then the denominator above will be an underestimate of the true number of cases that would eventually recover who have similar onset timings to those that would eventually die. Hence this calculation would inflate the CFR estimate. After the mathematical explanation below, the next section will discuss estimation methods based on the expected number of known death outcomes using functionality in _cfr_. +One limitation of this method is that it implicitly assumes the delay from onset-to-death and onset-to-recovery are the same (see below for the mathematical proof). If these delays are noticeably different, the above calculation will produce a biased estimate of the CFR. For example, if onset-to-recovery is much longer than onset to death, then the denominator above will be an underestimate of the true number of cases that would eventually recover who have similar onset timings to those that would eventually die. Hence this calculation would inflate the CFR estimate. After the mathematical explanation below, the next section will discuss estimation methods based on the expected number of known death outcomes using functionality in _cfr_. ### Mathematical explanation for bias If we have a linelist with known outcomes and onset-death delay follows the same distribution as onset-recovery, and CFR = $p$, then at the current time, @@ -269,7 +269,7 @@ And hence the calculated CFR to incorrectly converge to 1 as the proportion of r ## Only total deaths reported -In some situations, we may have a timeseries of cases but not deaths. However, we can still use the earlier calculation to derive an unbiased CFR: +In some situations, we may have a time series of cases but not deaths. However, we can still use the earlier calculation to derive an unbiased CFR: $$ E(\text{Total deaths}) = p \sum_{t} \sum_j f_{j} I_{t-j} \\ E(p) = \frac{\text{Total deaths}}{\sum_{t} \sum_j f_{j} I_{t-j} } From 1dbba40cbff25a4a8f00fde0ec6f9d7efc759413 Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Wed, 30 Oct 2024 20:56:58 -1000 Subject: [PATCH 16/25] Fix dependencies and pkgdown --- DESCRIPTION | 1 + _pkgdown.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/DESCRIPTION b/DESCRIPTION index 24418283..d155258d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,6 +43,7 @@ Suggests: ggplot2, incidence2, knitr, + magrittr, purrr, rmarkdown, scales, diff --git a/_pkgdown.yml b/_pkgdown.yml index 093d5c33..8aefb8a6 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -11,6 +11,7 @@ articles: - estimate_static_severity - estimate_time_varying_severity - estimate_ascertainment + - estimate_from_individual_data - title: Handling data from other packages navbar: Handling data from other packages contents: From b889b58bf108506908f6078c02931cab67c746e7 Mon Sep 17 00:00:00 2001 From: Adam Kucharski Date: Wed, 27 Nov 2024 09:33:06 +0000 Subject: [PATCH 17/25] Update vignettes/estimate_from_individual_data.Rmd Co-authored-by: CarmenTamayo --- vignettes/estimate_from_individual_data.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index 774693bf..b6751c9e 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -23,7 +23,7 @@ knitr::opts_chunk$set( ) ``` -The main examples in the _cfr_ package use incidence data (i.e. counts of cases and deaths by day). However, in some instances individual-level data will be available, which opens up additional analysis options, but also some potential biases. +The main examples showcased in the vignettes of the _cfr_ package use incidence data (i.e. counts of cases and deaths by day). However, in some instances individual-level data (i.e. where each row provides information about a single case) will be available, which opens up additional analysis options, but also some potential biases. ::: {.alert .alert-primary} ## Use case {-} From c512a9f2c59ce714670bc35fb134798929e86455 Mon Sep 17 00:00:00 2001 From: Adam Kucharski Date: Wed, 27 Nov 2024 09:33:31 +0000 Subject: [PATCH 18/25] Update vignettes/estimate_from_individual_data.Rmd Co-authored-by: CarmenTamayo --- vignettes/estimate_from_individual_data.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index b6751c9e..bdab444d 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -39,7 +39,7 @@ We want to estimate the case fatality risk (CFR) from individual-level linelist ::: ```{r, message = FALSE, warning=FALSE, eval = TRUE} -# load {cfr} and data packages +# load {cfr} library(cfr) # packages to wrangle and plot data From 1790b4d0e471630514360888778af6419e6c060e Mon Sep 17 00:00:00 2001 From: Adam Kucharski Date: Wed, 27 Nov 2024 09:34:08 +0000 Subject: [PATCH 19/25] Update vignettes/estimate_from_individual_data.Rmd Co-authored-by: CarmenTamayo --- vignettes/estimate_from_individual_data.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index bdab444d..dd3d0150 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -52,7 +52,7 @@ library(incidence2) ## Estimation based on cases with known outcomes -If we have individual-level data on case onset timings and outcomes where available, it is common to filter the data to focus on cases only with known outcomes. This means the case fatality risk can then be calculated from this filtered known outcome dataset: +If we have individual-level data on case onset timings and outcomes, where available, it is common to filter the data to focus on cases only with known outcomes. This means the case fatality risk can then be calculated from this filtered dataset containing only known outcomes: $$ \frac{\text{Total deaths}}{\text{Total deaths}+ \text{Total recoveries}} $$ From 087d9c73e32a04a0324008f3d32b2836d1a68b75 Mon Sep 17 00:00:00 2001 From: Adam Kucharski Date: Wed, 27 Nov 2024 09:34:20 +0000 Subject: [PATCH 20/25] Update vignettes/estimate_from_individual_data.Rmd Co-authored-by: CarmenTamayo --- vignettes/estimate_from_individual_data.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index dd3d0150..44d08c33 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -57,7 +57,7 @@ $$ \frac{\text{Total deaths}}{\text{Total deaths}+ \text{Total recoveries}} $$ -One limitation of this method is that it implicitly assumes the delay from onset-to-death and onset-to-recovery are the same (see below for the mathematical proof). If these delays are noticeably different, the above calculation will produce a biased estimate of the CFR. For example, if onset-to-recovery is much longer than onset to death, then the denominator above will be an underestimate of the true number of cases that would eventually recover who have similar onset timings to those that would eventually die. Hence this calculation would inflate the CFR estimate. After the mathematical explanation below, the next section will discuss estimation methods based on the expected number of known death outcomes using functionality in _cfr_. +One limitation of this method is that it implicitly assumes the delay from onset-to-death and onset-to-recovery are the same (see below for the mathematical proof). If these delays are noticeably different, the above calculation will produce a biased estimate of the CFR. For example, if onset-to-recovery is much longer than onset-to-death, at a given time point mid-outbreak, the denominator above will be an underestimate of the true number of cases that would eventually recover who have similar onset timings to those that would eventually die. Hence this calculation would inflate the CFR estimate. After the mathematical explanation below, the next section will discuss estimation methods based on the expected number of known death outcomes using functionality in _cfr_. ### Mathematical explanation for bias If we have a linelist with known outcomes and onset-death delay follows the same distribution as onset-recovery, and CFR = $p$, then at the current time, From 467bc6eecc94480f8f9b62f8d7b7484016e74f14 Mon Sep 17 00:00:00 2001 From: Adam Kucharski Date: Wed, 27 Nov 2024 09:34:58 +0000 Subject: [PATCH 21/25] Update vignettes/estimate_from_individual_data.Rmd Co-authored-by: CarmenTamayo --- vignettes/estimate_from_individual_data.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index 44d08c33..54213569 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -108,7 +108,7 @@ This is the calculation performed by `cfr_static()`, and hence this function can ## Simulated comparison of above methods -To compare these methods, we first simulate 1000 case onset timings and outcomes (i.e. deaths and recoveries) +To compare these methods, i.e. filtering out cases without known outcomes vs using the `cfr_static` function, we first simulate 1000 case onset timings and outcomes (i.e. deaths and recoveries) ```{r} # Simulate data From 5353244a9846cec905e31c3594fba04273a491e7 Mon Sep 17 00:00:00 2001 From: Adam Kucharski Date: Wed, 27 Nov 2024 09:35:21 +0000 Subject: [PATCH 22/25] Update vignettes/estimate_from_individual_data.Rmd Co-authored-by: CarmenTamayo --- vignettes/estimate_from_individual_data.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index 54213569..6a3297ec 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -298,7 +298,7 @@ df_cases <- df df_cases$deaths <- 0 # Add total deaths to the final row in the 'deaths' column -df_cases$deaths[nrow(df_case)] <- sum(df$deaths) +df_cases$deaths[nrow(df_cases)] <- sum(df$deaths) # Calculate CFR cfr_static(df_cases, delay_death) From fa20a1da82784b417f0a775e0dbc1a6afe2d6ec4 Mon Sep 17 00:00:00 2001 From: Adam Kucharski Date: Wed, 27 Nov 2024 09:35:35 +0000 Subject: [PATCH 23/25] Update vignettes/estimate_from_individual_data.Rmd Co-authored-by: CarmenTamayo --- vignettes/estimate_from_individual_data.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index 6a3297ec..d7a7cb1b 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -86,7 +86,7 @@ However, if delay to death $f_j^D$ is different to delay to recovery $f_j^R$, we $$ E(\text{Total deaths})/ [ E(\text{Total deaths})+E(\text{Total recoveries}) ] = \frac{\sum_{t} \sum_j p f_j^D I_{t-j}}{\sum_{t} \sum_j p f_j^D I_{t-j} + \sum_{t} \sum_j (1-p) f_j^R I_{t-j}} $$ -And hence this will not simplify to provide an unbiased estimate of $p$. +And hence this will not simplify to provide an unbiased estimate of $p$, as if would if $ f_j^D = f_j^R $ ## Estimation based on expected number of known death outcomes From 6990350fa17b5d24fd8a630d13440218706e7331 Mon Sep 17 00:00:00 2001 From: Adam Kucharski Date: Wed, 27 Nov 2024 09:52:18 +0000 Subject: [PATCH 24/25] Update vignettes/estimate_from_individual_data.Rmd Co-authored-by: CarmenTamayo --- vignettes/estimate_from_individual_data.Rmd | 1 + 1 file changed, 1 insertion(+) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index d7a7cb1b..a4b36f22 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -319,3 +319,4 @@ In contrast, if the epidemic is rapidly growing and still in its early stages, t $$ \frac{\text{Total deaths}}{\text{Total cases}} \leq p \leq 1 $$ +In other words, during a rapidly growing epidemic, we can only say that the CFR lies somewhere between the ratio of total deaths to total cases (as of the last known count) and 1 (the worst-case scenario). From 6bb2df8a155c6ac0f80b8e00c7afe5931a08828d Mon Sep 17 00:00:00 2001 From: adamkucharski Date: Wed, 27 Nov 2024 13:39:13 +0000 Subject: [PATCH 25/25] Final edits --- vignettes/estimate_from_individual_data.Rmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vignettes/estimate_from_individual_data.Rmd b/vignettes/estimate_from_individual_data.Rmd index a4b36f22..f87ea3b0 100644 --- a/vignettes/estimate_from_individual_data.Rmd +++ b/vignettes/estimate_from_individual_data.Rmd @@ -34,7 +34,7 @@ We want to estimate the case fatality risk (CFR) from individual-level linelist ::: {.alert .alert-secondary} ### What we have {-} -* A linelist of case onset timings, as well as outcome (i.e. death, recovery) if it has occurred by this point. +* A linelist of case onset timings, as well as either individual outcome (i.e. death, recovery) if it has occurred by this point and been recorded, or total deaths. * Data on the distribution of delay from onset-to-death, describing the probability an individual will die $t$ days after they were initially infected. ::: @@ -92,7 +92,7 @@ And hence this will not simplify to provide an unbiased estimate of $p$, as if w If the delay from onset-to-death and onset-to-recovery are different, one option is to use [survival analysis methods](https://academic.oup.com/aje/article/162/5/479/82647) to estimate relative hazards (i.e. fatality risk) over time. -If we are only interested in an overall estimate of CFR, a simpler alternative is to first calculate the number of cases in the linelist that we would expect to have a known outcome by this point if the outcome were fatal: +However, if we are only interested in an overall estimate of CFR, a simpler alternative is to first calculate the number of cases in the linelist that we would expect to have a known outcome by this point if the outcome were fatal: $$ E(\text{deaths by time }t) = p \sum_{t} \sum_j f_{j} I_{t-j} @@ -259,7 +259,7 @@ Hence in this particular simulation, the `cfr_static()` method recovers the corr ## Deaths reported but not recoveries -In an extreme scenario where recoveries are not reported, then we effectively have the values of `outcome_recovery` generated from a distribution with an infinite mean, and the above methods will still apply, with the same bias for the filtering approach. In particular, we would expect: +In an extreme scenario where recoveries are not reported, then we effectively have the values of `outcome_recovery` generated from a distribution with an infinite mean, and the above conclusions will still apply, with the same bias for the filtering approach. In particular, we would expect: $$ E(\text{Total deaths})/ [ E(\text{Total deaths})+E(\text{Total recoveries}) ] \rightarrow 1 \\