From c9bba3b797f5063887a070530d463771dce173a9 Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Tue, 17 Oct 2023 18:30:37 +0100 Subject: [PATCH 1/8] only output case names when add_names = TRUE --- R/sim_linelist.R | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/R/sim_linelist.R b/R/sim_linelist.R index a488c291..3637a12a 100644 --- a/R/sim_linelist.R +++ b/R/sim_linelist.R @@ -246,8 +246,14 @@ sim_linelist <- function(R, # nolint cyclocomp chain$hospitalisation_date <- chain$hosp_rounded + outbreak_start_date chain$death_date <- chain$death_rounded + outbreak_start_date + linelist_cols <- c( + "id", "case_type", "gender", "age", "onset_date", "hospitalisation_date", + "date_first_contact", "date_last_contact" + ) + if (add_names) { chain <- .add_names(.data = chain) + linelist_cols <- append(linelist_cols, "case_name", after = 1) } # add confirmed, probable, suspected case types @@ -258,10 +264,6 @@ sim_linelist <- function(R, # nolint cyclocomp prob = case_type_probs ) - linelist_cols <- c( - "id", "gender", "age", "onset_date", "hospitalisation_date", - "date_first_contact", "date_last_contact" - ) # add Ct if confirmed if (add_ct) { @@ -274,7 +276,6 @@ sim_linelist <- function(R, # nolint cyclocomp contacts <- create_contacts(.data = chain) # nolint var assigned not used } - chain <- chain[, linelist_cols] # return linelist From d324f747a34287dffe5378af77e41b073afac033 Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Tue, 17 Oct 2023 18:39:19 +0100 Subject: [PATCH 2/8] updated sim_linelist tests to include case_name and case_type --- tests/testthat/test-sim_linelist.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/testthat/test-sim_linelist.R b/tests/testthat/test-sim_linelist.R index 4a47638f..94efc962 100644 --- a/tests/testthat/test-sim_linelist.R +++ b/tests/testthat/test-sim_linelist.R @@ -31,11 +31,11 @@ test_that("sim_list works as expected", { ) expect_s3_class(linelist, class = "data.frame") - expect_identical(dim(linelist), c(42L, 7L)) + expect_identical(dim(linelist), c(42L, 9L)) expect_identical( colnames(linelist), - c("id", "gender", "age", "onset_date", "hospitalisation_date", - "date_first_contact", "date_last_contact") + c("id", "case_name", "case_type", "gender", "age", "onset_date", + "hospitalisation_date", "date_first_contact", "date_last_contact") ) }) @@ -67,11 +67,11 @@ test_that("sim_list works as expected with age-strat rates", { ) expect_s3_class(linelist, class = "data.frame") - expect_identical(dim(linelist), c(42L, 7L)) + expect_identical(dim(linelist), c(42L, 9L)) expect_identical( colnames(linelist), - c("id", "gender", "age", "onset_date", "hospitalisation_date", - "date_first_contact", "date_last_contact") + c("id", "case_name", "case_type", "gender", "age", "onset_date", + "hospitalisation_date", "date_first_contact", "date_last_contact") ) }) @@ -86,11 +86,12 @@ test_that("sim_list works as expected with Ct", { ) expect_s3_class(linelist, class = "data.frame") - expect_identical(dim(linelist), c(42L, 8L)) + expect_identical(dim(linelist), c(42L, 10L)) expect_identical( colnames(linelist), - c("id", "gender", "age", "onset_date", "hospitalisation_date", - "date_first_contact", "date_last_contact", "ct_value") + c("id", "case_name", "case_type", "gender", "age", "onset_date", + "hospitalisation_date", "date_first_contact", "date_last_contact", + "ct_value") ) }) From 4fec6328ff4c8dc7ce6ad9baf9b7b269887afb6b Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Tue, 17 Oct 2023 18:39:52 +0100 Subject: [PATCH 3/8] added sim_linelist test for add_names = FALSE --- tests/testthat/test-sim_linelist.R | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/testthat/test-sim_linelist.R b/tests/testthat/test-sim_linelist.R index 94efc962..743408c4 100644 --- a/tests/testthat/test-sim_linelist.R +++ b/tests/testthat/test-sim_linelist.R @@ -95,6 +95,25 @@ test_that("sim_list works as expected with Ct", { ) }) +test_that("sim_list works as expected with anonymous", { + set.seed(1) + linelist <- sim_linelist( + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death, + add_names = FALSE + ) + + expect_s3_class(linelist, class = "data.frame") + expect_identical(dim(linelist), c(42L, 8L)) + expect_identical( + colnames(linelist), + c("id", "case_type", "gender", "age", "onset_date", + "hospitalisation_date", "date_first_contact", "date_last_contact") + ) +}) + test_that("sim_list fails as expected for include_contacts = TRUE while WIP", { expect_error( sim_linelist( From 5b3d60fc802e4b5481688fefa4a6ad709c57df4e Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Tue, 17 Oct 2023 18:40:03 +0100 Subject: [PATCH 4/8] updated WORDLIST --- inst/WORDLIST | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/WORDLIST b/inst/WORDLIST index 46d1ef1f..2c60e05a 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -19,9 +19,12 @@ Linelist packagename parameterised Poisson +randomNames RECON +resimulate SARS sim st +stochasticity svg yaml From 59a8714f0d2611b5948ba8136909d860b7c74e82 Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Tue, 17 Oct 2023 18:40:29 +0100 Subject: [PATCH 5/8] added first draft of getting started vignette --- vignettes/.gitignore | 2 + vignettes/simulist.Rmd | 142 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) create mode 100644 vignettes/.gitignore create mode 100644 vignettes/simulist.Rmd diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 00000000..097b2416 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/simulist.Rmd b/vignettes/simulist.Rmd new file mode 100644 index 00000000..a9473c05 --- /dev/null +++ b/vignettes/simulist.Rmd @@ -0,0 +1,142 @@ +--- +title: "Getting Started with {simulist}" +output: + bookdown::html_vignette2: + code_folding: show +pkgdown: + as_is: true +vignette: > + %\VignetteIndexEntry{Getting Started with {simulist}} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(simulist) +library(epiparameter) +``` + +This is an introductory vignette to the {simulist} R package. {simulist} simulates epidemiological linelist data, to provide a row-by-row account of cases in an infectious disease outbreak. + +The main function in the {simulist} package is `sim_linelist()`. This functions takes in arguments that control the dynamics of the outbreak, such as the serial interval distribution, and outputs a table (``) with case information for each infected individual. + +For this example we will simulate a linelist for a COVID-19 (SARS-CoV-2) outbreak. + +First we load in some data that is required for the linelist simulation. Data on epidemiological parameters and distributions is read from the {epiparameter} R package. + +```{r read-epidist} +# get serial interval from {epiparameter} database + +serial_interval <- epiparameter::epidist( + disease = "COVID-19", + epi_dist = "serial interval", + prob_distribution = "gamma", + prob_distribution_params = c(shape = 1, scale = 1) +) + +# get onset to hospital admission from {epiparameter} database +onset_to_hosp <- epiparameter::epidist_db( + disease = "COVID-19", + epi_dist = "onset to hospitalisation", + single_epidist = TRUE +) + +# get onset to death from {epiparameter} database +onset_to_death <- epiparameter::epidist_db( + disease = "COVID-19", + epi_dist = "onset to death", + single_epidist = TRUE +) +``` + +The reproduction number is supplied, here we set it as 1.1: + +```{r, sim-linelist} +linelist <- sim_linelist( + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death +) +head(linelist) +``` + +## Case type + +In the midst of an infectious disease outbreak it may not be possible to confirm every case. A confirmed case is usually done via a diagnostic test which may or may not need to be carried out in the laboratory. There are several reasons why a case may not be confirmed, one example is limited testing capacity, especially is fast growing epidemics. The other categories for cases are: probable and suspected. Probable cases are those that show clinical evidence for the disease but have not, or cannot, be confirmed by a diagnostic test. Suspected cases are those that are possibly infected but do not show clear clinical or epidemiological evidence, nor has a diagnostic test been performed. + +The linelist output from the {simulist} simulation contains a column (`case_type`) with the type of case. + +{simulist} can simulate varying probabilities of each case being suspected, probable or confirmed. By default the `sim_linelist()` function uses probabilities of `suspected = 0.2`, `probable = 0.3` and `confirmed = 0.5`. + +```{r, sim-linelist-default-case-type} +linelist <- sim_linelist( + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death +) +head(linelist) +``` + +To alter these probabilities, supply a named vector to the `sim_linelist()` argument `case_type_probs`. The vector should contain three numbers, with the names `suspected`, `probable` and `confirmed`, with the numbers summing to one. Here we change the values to simulate an outbreak in which the proportion of cases confirmed by laboratory testing is high. + +```{r, sim-linelist-mod-case-type} +linelist <- sim_linelist( + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death, + case_type_probs = c(suspected = 0.05, probable = 0.05, confirmed = 0.9) +) +head(linelist) +``` + +It is also possible to set one of these categories to `1`, in which case every case will be of that type. + +The way {simulist} assigns case types is by pasting case types onto existing case data. Thus, it could be viewed that the true underlying data is that all cases in the simulation are confirmed, but that there is a lack of information in some cases. There are no cases in the output linelist that are incorrectly attributed as probable or suspected that have not been infected. That is to say, all individuals in the linelist, whatever their case type, are infected during the outbreak. + +## Conditioning simulation on outbreak size + +The reproduction number has a strong influence on the size of an outbreak. However, the {simulist} package generates linelist data using a stochastic algorithm, so even when `R < 1` it can produce a substantial outbreak by chance, or an `R >> 1` will sometimes not produce a vast epidemic in one simulation (i.e. one replicate) due to the stochasticity. + +When requiring a minimum outbreak size we can specify the `min_chain_size` argument in `sim_linelist()`. By default this is set to 10. This means that the simulation will not return a linelist until the conditioning has been met. In other words, the simulation will resimulate a branching process model until an outbreak infects at least 10 people. + +When requiring a linelist that represents a large outbreak, such as the COVID-19 outbreak, setting the `min_chain_size` to a larger number guarantees a linelist of that size. Here we simulate a linelist requiring at least 250 cases. + +```{r, sim-large-linelist} +linelist <- sim_linelist( + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death, + min_chain_size = 250 +) +head(linelist) +``` + +The amount of time the simulation takes can be determined by the reproduction number (`R`) and the minimum outbreak size (`min_chain_size`). If the `min_outbreak_size` is large, for example hundreds or thousands of cases, and the reproduction number is below one, it will take many branching process simulations until finding one that produces a sufficiently large epidemic. + +## Anonymous linelist + +By default `sim_linelist()` provides the name of each individual in the linelist. If an anonymised linelist is required the `add_names` argument of `sim_linelist()` can be set to `FALSE`. + +```{r sim-anon-linelist} +linelist <- sim_linelist( + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death, + add_names = FALSE +) +head(linelist) +``` + +The names used in the linelist are produced at random by the [{randomNames}](https://CRAN.R-project.org/package=randomNames) R package. Therefore, even when `add_names = TRUE` there is no personal data of real individuals being produced or shared. From 59c34543d1aa3651200e1efffd8a82e8bc69187f Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Thu, 19 Oct 2023 10:50:47 +0100 Subject: [PATCH 6/8] updates to simulist vignette --- vignettes/simulist.Rmd | 91 +++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 37 deletions(-) diff --git a/vignettes/simulist.Rmd b/vignettes/simulist.Rmd index a9473c05..82f620cb 100644 --- a/vignettes/simulist.Rmd +++ b/vignettes/simulist.Rmd @@ -18,51 +18,51 @@ knitr::opts_chunk$set( ) ``` +This is an introductory vignette to the {simulist} R package. {simulist} simulates epidemiological linelist data, to provide a row-by-row account of cases in an infectious disease outbreak. + +The main function in the {simulist} package is `sim_linelist()`. This functions takes in arguments that control the dynamics of the outbreak, such as the serial interval, and outputs a linelist table (``) with case information for each infected individual. + +For this introduction we will simulate a linelist for a COVID-19 (SARS-CoV-2) outbreak. This will require two R packages: {simulist}, to produce the linelist, and {epiparameter} to provide epidemiological parameters, such as onset-to-death delays. + ```{r setup} library(simulist) library(epiparameter) ``` -This is an introductory vignette to the {simulist} R package. {simulist} simulates epidemiological linelist data, to provide a row-by-row account of cases in an infectious disease outbreak. - -The main function in the {simulist} package is `sim_linelist()`. This functions takes in arguments that control the dynamics of the outbreak, such as the serial interval distribution, and outputs a table (``) with case information for each infected individual. - -For this example we will simulate a linelist for a COVID-19 (SARS-CoV-2) outbreak. - First we load in some data that is required for the linelist simulation. Data on epidemiological parameters and distributions is read from the {epiparameter} R package. ```{r read-epidist} # get serial interval from {epiparameter} database -serial_interval <- epiparameter::epidist( - disease = "COVID-19", - epi_dist = "serial interval", - prob_distribution = "gamma", +serial_interval <- epidist( + disease = "COVID-19", + epi_dist = "serial interval", + prob_distribution = "gamma", prob_distribution_params = c(shape = 1, scale = 1) ) # get onset to hospital admission from {epiparameter} database -onset_to_hosp <- epiparameter::epidist_db( - disease = "COVID-19", - epi_dist = "onset to hospitalisation", +onset_to_hosp <- epidist_db( + disease = "COVID-19", + epi_dist = "onset to hospitalisation", single_epidist = TRUE ) # get onset to death from {epiparameter} database -onset_to_death <- epiparameter::epidist_db( - disease = "COVID-19", - epi_dist = "onset to death", +onset_to_death <- epidist_db( + disease = "COVID-19", + epi_dist = "onset to death", single_epidist = TRUE ) ``` -The reproduction number is supplied, here we set it as 1.1: +The reproduction number (`R`) is the first argument in `sim_linelist()` and with the serial interval will control the growth rate of the simulated epidemic. Here we set it as 1.1. The minimum requirements to simulate a linelist are the reproduction number (`R`), the serial interval, onset-to-hospitalisation delay and onset-to-death delay. ```{r, sim-linelist} linelist <- sim_linelist( - R = 1.1, - serial_interval = serial_interval, - onset_to_hosp = onset_to_hosp, + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, onset_to_death = onset_to_death ) head(linelist) @@ -70,7 +70,7 @@ head(linelist) ## Case type -In the midst of an infectious disease outbreak it may not be possible to confirm every case. A confirmed case is usually done via a diagnostic test which may or may not need to be carried out in the laboratory. There are several reasons why a case may not be confirmed, one example is limited testing capacity, especially is fast growing epidemics. The other categories for cases are: probable and suspected. Probable cases are those that show clinical evidence for the disease but have not, or cannot, be confirmed by a diagnostic test. Suspected cases are those that are possibly infected but do not show clear clinical or epidemiological evidence, nor has a diagnostic test been performed. +In the midst of an infectious disease outbreak it may not be possible to confirm every case. A confirmed case is usually done via a diagnostic test which may or may not need to be carried out in the laboratory. There are several reasons why a case may not be confirmed, one example is limited testing capacity, especially in fast growing epidemics. The other categories for cases are: probable and suspected. Probable cases are those that show clinical evidence for the disease but have not, or cannot, be confirmed by a diagnostic test. Suspected cases are those that are possibly infected but do not show clear clinical or epidemiological evidence, nor has a diagnostic test been performed. The linelist output from the {simulist} simulation contains a column (`case_type`) with the type of case. @@ -78,9 +78,9 @@ The linelist output from the {simulist} simulation contains a column (`case_type ```{r, sim-linelist-default-case-type} linelist <- sim_linelist( - R = 1.1, - serial_interval = serial_interval, - onset_to_hosp = onset_to_hosp, + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, onset_to_death = onset_to_death ) head(linelist) @@ -90,10 +90,10 @@ To alter these probabilities, supply a named vector to the `sim_linelist()` argu ```{r, sim-linelist-mod-case-type} linelist <- sim_linelist( - R = 1.1, - serial_interval = serial_interval, - onset_to_hosp = onset_to_hosp, - onset_to_death = onset_to_death, + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death, case_type_probs = c(suspected = 0.05, probable = 0.05, confirmed = 0.9) ) head(linelist) @@ -105,17 +105,17 @@ The way {simulist} assigns case types is by pasting case types onto existing cas ## Conditioning simulation on outbreak size -The reproduction number has a strong influence on the size of an outbreak. However, the {simulist} package generates linelist data using a stochastic algorithm, so even when `R < 1` it can produce a substantial outbreak by chance, or an `R >> 1` will sometimes not produce a vast epidemic in one simulation (i.e. one replicate) due to the stochasticity. +The reproduction number has a strong influence on the size of an outbreak. However, the {simulist} package generates linelist data using a stochastic algorithm, so even when $R < 1$ it can produce a substantial outbreak by chance, or an $R >> 1$ will sometimes not produce a vast epidemic in one simulation (i.e. one replicate) due to the stochasticity. When requiring a minimum outbreak size we can specify the `min_chain_size` argument in `sim_linelist()`. By default this is set to 10. This means that the simulation will not return a linelist until the conditioning has been met. In other words, the simulation will resimulate a branching process model until an outbreak infects at least 10 people. -When requiring a linelist that represents a large outbreak, such as the COVID-19 outbreak, setting the `min_chain_size` to a larger number guarantees a linelist of that size. Here we simulate a linelist requiring at least 250 cases. +When requiring a linelist that represents a large outbreak, such as the COVID-19 outbreak, setting the `min_chain_size` to a larger number guarantees a linelist of at least that size. Here we simulate a linelist requiring at least 250 cases. ```{r, sim-large-linelist} linelist <- sim_linelist( - R = 1.1, - serial_interval = serial_interval, - onset_to_hosp = onset_to_hosp, + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, onset_to_death = onset_to_death, min_chain_size = 250 ) @@ -130,13 +130,30 @@ By default `sim_linelist()` provides the name of each individual in the linelist ```{r sim-anon-linelist} linelist <- sim_linelist( - R = 1.1, - serial_interval = serial_interval, - onset_to_hosp = onset_to_hosp, - onset_to_death = onset_to_death, + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death, add_names = FALSE ) head(linelist) ``` The names used in the linelist are produced at random by the [{randomNames}](https://CRAN.R-project.org/package=randomNames) R package. Therefore, even when `add_names = TRUE` there is no personal data of real individuals being produced or shared. + +## Population age + +By default `sim_linelist()` simulates individuals ages assuming a uniform distribution between 1 and 90. To change this age range, a vector of two numbers can be supplied to the `age_range` argument. Here we simulated an outbreak in a population with a population ranging from 5 to 75. + +```{r sim-linelist-age-range} +linelist <- sim_linelist( + R = 1.1, + serial_interval = serial_interval, + onset_to_hosp = onset_to_hosp, + onset_to_death = onset_to_death, + age_range = c(5, 75) +) +head(linelist) +``` + +It is currently not possible to simulate with a non-uniform population age structure in {simulist}. From 5e73b6e7b4c1bf9d0bf1c880b8fdcac112ed3054 Mon Sep 17 00:00:00 2001 From: Joshua Lambert Date: Thu, 19 Oct 2023 10:58:40 +0100 Subject: [PATCH 7/8] added bookdown to Suggests --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 1a0cc044..863e6715 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,6 +24,7 @@ Imports: Suggests: incidence2, knitr, + bookdown, rmarkdown, spelling, testthat (>= 3.0.0) From dac03a0c6810e6acd547172721e18136a70e83f5 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 19 Oct 2023 10:10:45 +0000 Subject: [PATCH 8/8] Update CITATION.cff --- CITATION.cff | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CITATION.cff b/CITATION.cff index b251c7eb..fb3c9b0b 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -144,6 +144,18 @@ references: email: xie@yihui.name orcid: https://orcid.org/0000-0003-0645-5666 year: '2023' +- type: software + title: bookdown + abstract: 'bookdown: Authoring Books and Technical Documents with R Markdown' + notes: Suggests + url: https://pkgs.rstudio.com/bookdown/ + repository: https://CRAN.R-project.org/package=bookdown + authors: + - family-names: Xie + given-names: Yihui + email: xie@yihui.name + orcid: https://orcid.org/0000-0003-0645-5666 + year: '2023' - type: software title: rmarkdown abstract: 'rmarkdown: Dynamic Documents for R'