diff --git a/analysis/analysis_workflow.R b/analysis/analysis_workflow.R index 70afb0b..2245b9f 100644 --- a/analysis/analysis_workflow.R +++ b/analysis/analysis_workflow.R @@ -414,7 +414,7 @@ for (ds in c("ProMED", "HealthMap", "WHO")) { ## 16-10-2019 -for (datasource in c("ProMED", "HealthMap", "WHO")) { +for (datasource in c("ProMED", "HealthMap")) { message("Working on ", datasource) @@ -432,21 +432,24 @@ for (datasource in c("ProMED", "HealthMap", "WHO")) { "analysis/new_weekly_observations.R", local = TRUE ) - source( - "analysis/plot_sensitivity_specificity_alerts.R", local = TRUE - ) - source( "analysis/roc_new_weekly_observations.R", local = TRUE ) + source( + "analysis/roc_summary.R", local = TRUE + ) + source( + "analysis/plot_sensitivity_specificity_alerts.R", local = TRUE + ) + source( "analysis/tpr_fpr_by_threshold.R", local = TRUE ) - source("analysis/sensitivity_over_time.R", local = TRUE) + ##source("analysis/sensitivity_over_time.R", local = TRUE) } diff --git a/analysis/parameters.R b/analysis/parameters.R index c5ed155..636fd8f 100644 --- a/analysis/parameters.R +++ b/analysis/parameters.R @@ -87,6 +87,7 @@ all_files <- list( HealthMap10 = list( incidfile = "data/processed/16042019_healthmap_loglinear_wide.csv", + incidtall = "data/processed/14102019_hm_loglinear_tall.csv", ## Fixed the issue with NAs in interpolated flag on 10-10-2019 weekly_incidfile = "data/processed/10102019_healthmap_loglinear_weekly.csv", stanfits_dir = "data/healthmap_stanfits_gamma_ul_10", diff --git a/analysis/plot_sensitivity_specificity_alerts.R b/analysis/plot_sensitivity_specificity_alerts.R index 8401ea8..a2c9f2a 100644 --- a/analysis/plot_sensitivity_specificity_alerts.R +++ b/analysis/plot_sensitivity_specificity_alerts.R @@ -1,3 +1,4 @@ + ## Reset the countries so that all counries with 0 alerts in week 1 ## are grouped together arrange_by_alerts <- function(df) { @@ -51,6 +52,16 @@ yaxis_levels <- function(countries, weeks, by = 0.05) { forced_y_level } + +build_caption <- function(countries) { + + fullnames <- countrycode::countrycode( + countries, "iso3c", "country.name" + ) + out <- glue::glue("{countries} - {fullnames}") + out <- glue::glue_collapse(out, sep = ", ") + out +} ## All alerts for all weeks for all countries all_alerts <- function(alerts, by = 0.05) { @@ -180,6 +191,14 @@ source(here::here("analysis/common_plot_properties.R")) message("Reading from and writing to ", all_files[[datasource]]$outdir) +threshold_to_use <- readr::read_csv(here::here( + all_files[[datasource]]$outdir, + glue::glue("{Sys.Date()}_{datasource}_threshold_maximising_avg.csv") + ) + ) + + + incid_pred <- vroom::vroom( file = here::here( all_files[[datasource]]$outdir, @@ -187,8 +206,28 @@ incid_pred <- vroom::vroom( ) ) incid_pred <- incid_pred[incid_pred$n.dates.sim != 14, ] + nonna_alerts <- dplyr::filter(incid_pred, alert_type != "No Alert") -nonna_alerts <- dplyr::filter(nonna_alerts, threshold == "50%") + +nonna_alerts <- split( + nonna_alerts, + list(nonna_alerts$time_window, nonna_alerts$n.dates.sim), + sep = "_" +) + +nonna_alerts <- purrr::imap_dfr( + nonna_alerts, + function(df, param) { + use <- threshold_to_use[threshold_to_use$params == param, "threshold"] + use <- dplyr::pull(use, "threshold") + message("Filtering ", param, " on ", use) + df <- df[df$threshold == use, ] + df + } + +) + +##nonna_alerts <- dplyr::filter(nonna_alerts, threshold == threshold_to_use) ## First Alert first_alerts <- dplyr::group_by( @@ -250,6 +289,7 @@ purrr::walk( params, function(param) { alerts <- alerts_byparams[[param]] + ## Plot 1. All countries, all weeks p <- all_alerts(alerts) ggplot2::ggsave( @@ -262,7 +302,14 @@ purrr::walk( width = mriids_plot_theme$single_col_width, height = mriids_plot_theme$single_col_height ) - + cat( + build_caption(alerts$country), + file = here::here( + all_files[[datasource]]$outdir, + glue::glue("{Sys.Date()}_alerts_per_week_{datasource}", + "_{param}_captions.txt") + ) + ) } ) @@ -293,7 +340,16 @@ purrr::walk( units = mriids_plot_theme$units, width = mriids_plot_theme$single_col_width, height = mriids_plot_theme$single_col_height - ) + ) + + cat( + build_caption(alerts$country), + file = here::here( + all_files[[datasource]]$outdir, + glue::glue("{Sys.Date()}_alerts_by_week_{datasource}", + "_{param}_captions.txt") + ) + ) } ) @@ -330,6 +386,14 @@ purrr::walk( width = mriids_plot_theme$single_col_width, height = mriids_plot_theme$single_col_height ) + cat( + build_caption(df$country), + file = here::here( + all_files[[datasource]]$outdir, + glue::glue("{Sys.Date()}_alerts_in_{week}_{datasource}", + "_{param}_captions.txt") + ) + ) } @@ -353,7 +417,9 @@ roc_byparams <- split( purrr::walk( names(roc_byparams), function(param) { - p <- roc(roc_byparams[[param]]) + use <- threshold_to_use[threshold_to_use$params == param, "threshold"] + use <- dplyr::pull(use, "threshold") + p <- roc(roc_byparams[[param]], threshold = use) ggplot2::ggsave( filename = here::here( @@ -399,8 +465,9 @@ purrr::walk( size = 4 ) ) - - p2 <- roc(rates) + use <- threshold_to_use[threshold_to_use$params == param, "threshold"] + use <- dplyr::pull(use, "threshold") + p2 <- roc(rates, threshold = use) plot <- cowplot::plot_grid( p2, p1, @@ -422,14 +489,6 @@ purrr::walk( width = mriids_plot_theme$double_col_width, height = mriids_plot_theme$double_col_width / 2, ) - - readr::write_rds( - x = plot, - path = here::here( - all_files[[datasource]]$outdir, - glue::glue("{Sys.Date()}_alerts_in_1_all_roc", - "_{datasource}_{param}.rds")) - ) roc_byweek <- split(rates, rates$week_of_projection) purrr::walk( names(byweek), @@ -439,7 +498,7 @@ purrr::walk( strip.background = element_blank(), strip.text.x = element_blank() ) - p2 <- roc(roc_byweek[[week]], plot_overall = FALSE) + p2 <- roc(roc_byweek[[week]], plot_overall = FALSE, threshold = use) plot <- cowplot::plot_grid( p2, p1, @@ -466,3 +525,54 @@ purrr::walk( ) } ) + + + +no_cases_threshold <- readr::read_csv(here::here( + all_files[[datasource]]$outdir, + glue::glue("{Sys.Date()}_{datasource}_threshold_maximising_avg_no_cases_last_week.csv") + ) + ) + + +infile <- here::here( + all_files[[datasource]]$outdir, + glue::glue( + "{Sys.Date()}_{datasource}_tpr_fpr_no_cases_last_week.csv" + ) +) +message("***********************************************************") +message("*", infile, "*") +message("***********************************************************") + +res <- readr::read_csv(infile) + +roc_byparams <- split( + res, list(res$time_window, res$n.dates.sim), sep = "_" +) + +purrr::walk( + names(roc_byparams), + function(param) { + use <- no_cases_threshold[no_cases_threshold$params == param, "threshold"] + use <- dplyr::pull(use, "threshold") + message("Filtering ", param, " on ", use) + + p <- roc( + roc_byparams[[param]], + threshold = use + ) + + ggplot2::ggsave( + filename = here::here( + all_files[[datasource]]$outdir, + glue::glue("{Sys.Date()}_roc_{datasource}", + "no_cases_last_week_{param}.pdf")), + plot = p, + units = mriids_plot_theme$units, + width = mriids_plot_theme$single_col_width, + height = mriids_plot_theme$single_col_height + ) + + } +) diff --git a/analysis/utils.R b/analysis/utils.R index d6584bd..07d85cb 100644 --- a/analysis/utils.R +++ b/analysis/utils.R @@ -121,9 +121,10 @@ tpr_by_threshold <- function(df) { df$week_of_projection <- factor(df$week_of_projection) df$specificity <- 1 - df$fpr p <- ggplot(df) + - geom_line(aes(threshold, tpr, col = week_of_projection), size = 0.3) + + geom_line(aes(threshold, tpr, col = week_of_projection), + size = 0.4) + ##default thickness is 0.5 too thick; 0.3 is too thin. geom_line(aes(threshold, specificity, col = week_of_projection), - linetype = "dashed", size = 0.3) + linetype = "dashed", size = 0.4) p <- p + scale_color_manual( values = mriids_plot_theme$week_color_scale ) diff --git a/tex/pnas/PNAS-template-main.tex b/tex/pnas/PNAS-template-main.tex index f6d0e36..c55df93 100644 --- a/tex/pnas/PNAS-template-main.tex +++ b/tex/pnas/PNAS-template-main.tex @@ -6,7 +6,7 @@ % {pnasresearcharticle} = Template for a two-column research article % {pnasmathematics} %= Template for a one-column mathematics article % {pnasinvited} %= Template for a PNAS invited submission -\newcommand{\sangeeta}[1]{\textcolor{blue}{#1}} + % Added to be able to reference figures across SI. \usepackage{xr} \makeatletter @@ -31,12 +31,12 @@ \author[b]{Britta Lassmann} \author[c]{Emily Cohn} \author[b, d]{Malwina Carrion} -\author[e]{Moritz Kraemer} -\author[f]{Mark Herringer} +\author[c, e, f]{Moritz U. G. Kraemer} +\author[g]{Mark Herringer} \author[c]{John Brownstein} \author[b]{Larry Madoff} \author[1, a]{Anne Cori} -\author[1, g]{Pierre Nouvellet} +\author[1, h]{Pierre Nouvellet} \affil[a]{MRC Centre for Global Infectious Disease Analysis, School of Public Health, Imperial College London, Faculty of Medicine, Norfolk Place, London, W2 1PG, UK} @@ -44,12 +44,16 @@ Brookline, MA 02446 USA} \affil[c]{Computational Epidemiology Group, Division of Emergency Medicine, Boston Children’s Hospital, Boston, MA USA} -\affil[e]{Spatial Ecology and Epidemiology Group, Tinbergen Building, Department of Zoology, Oxford University, Oxford, UK} -\affil[f]{healthsites.io} -\affil[g]{Evolution, Behaviour and Environment, University of Sussex, - Brighton, UK} \affil[d]{Department of Health Science, Sargent College, Boston University, Boston, MA USA} +\affil[e]{Department of Zoology, Tinbergen Building, Oxford + University, Oxford, UK} +\affil[f]{Department of Pediatrics, Harvard Medical School, Boston, + USA} +\affil[g]{healthsites.io} +\affil[h]{Evolution, Behaviour and Environment, University of Sussex, + Brighton, UK} + % Please give the surname of the lead author for the running footer \leadauthor{Bhatia} @@ -60,7 +64,7 @@ during an outbreak, it is important to predict which countries or regions will be affected next. This can be challenging because of delays in data collection and publication. Here we show -how publicly available data from ProMED and HealthMap, the two +how publicly available data from ProMED and HealthMap, two leading and pioneering internet-based disease surveillance platforms, could have been used in real-time to accurately predict the international spread of Ebola during @@ -86,7 +90,7 @@ important early warning systems as well as complement the field surveillance during an ongoing outbreak. Here we present a flexible statistical model that uses data produced from digital surveillance - tools (ProMED and HealthMap) to forecast short term incidence trend + tools (ProMED and HealthMap) to forecast short term incidence trends in a spatially explicit manner. The model was applied to data collected by ProMED and HealthMap during the 2013-2016 West African Ebola epidemic. The model was able to @@ -108,56 +112,62 @@ % If your first paragraph (i.e. with the \dropcap) contains a list environment (quote, quotation, theorem, definition, enumerate, itemize...), the line after the list may have some extra indentation. If this is the case, add \parshape=0 to the end of the list environment. Increasing globalization of commerce, finance, production, and services -has fostered rapid movement of people, animals, plants, food, and -feed \cite{morse2001factors}. With the transportation of people and +has fostered rapid movement of people, animals, plants, and food + \cite{morse2001factors}. With the transportation of people and goods comes the widespread dispersion of pathogens that cause infectious -diseases and the vectors that may spread them. Outbreaks that begin in +diseases and the vectors that may spread them +\cite{stoddard2009role}. Outbreaks that begin in the most remote parts of the world can now spread swiftly to urban centers and to countries far away with dangerous, global consequences \cite{ex1995communicable}. For instance, population mobility across -borders played a critical role in the spread of Ebola \sangeeta{virus} in West Africa +borders played a critical role in the spread of Ebola virus in West Africa during the 2013-2016 Ebola epidemic \cite{ebfactors}. A more recent -example is the 2016 \sangeeta{y}ellow Fever outbreak in Angola. Infected travellers +example is the 2016 yellow Fever outbreak in Angola +\cite{kraemer2017spread}. Infected travellers from Angola reached China, representing the first ever reported cases of -\sangeeta{y}ellow fever in Asia \cite{wasserman2016yellow}. +yellow fever in Asia \cite{wasserman2016yellow}. -Early detection and \sangeeta{monitoring} of infectious disease outbreaks through passive or +Early detection and monitoring of infectious disease outbreaks through passive or active collection of surveillance data can help public health officials initiate interventions such as removing contaminated food sources, isolating affected individuals or launching vaccination campaigns. However, any data collection method involves trade-offs between speed, -accuracy and costs. Data collected through \sangeeta{traditional} surveillance, -\sangeeta{for example via} public health infrastructure, are generally reliable but +accuracy and costs. Data collected through traditional surveillance, +for example via public health infrastructure, are generally reliable but are resource intensive and are therefore typically available for -upstream analysis with an (understandable) delay. +upstream analysis with an (understandable) delay +\cite{tariq2019assessing}. +In the past three decades, the internet has grown at a staggering +pace, with approximately half of the world's population accessing +internet in 2017 \cite{internetgrowth}. The rapid growth of the internet has fostered a corresponding increase -in tools for internet based disease detection \sangeeta{ and monitoring} +in tools for internet based disease detection and monitoring that lie at the other end of the spectrum. Digital disease surveillance consists of monitoring online information sources to collate relevant information about -diseases. The sources of information \sangeeta{can} be formal such as advisories -posted by a ministry of health, or informal such as \sangeeta{news media -items}, blogs or tweets. +diseases. The sources of information can be formal such as advisories +posted by a ministry of health, or informal such as news media +items, blogs or tweets. Digital surveillance makes data collection less expensive and time -consuming but the acquired data often \sangeeta{contain more noise} than those -collected through \sangeeta{traditional public health} surveillance. \sangeeta{While traditional +consuming but the acquired data often contain more noise than those +collected through traditional public health surveillance. While traditional surveillance systems report on select pathogens and depend on a well-functioning public health infrastructure, digital surveillance in contrast typically monitor a wide range of pathogens using little -to no additional infrastructure.} Thus, digital surveillance tools +to no additional infrastructure. Thus, digital surveillance tools can play a significant role in the rapid recognition of public health emergencies \cite{grein2000rumors, anema2014digital}. The Program for Monitoring Emerging Diseases (ProMED, www.promedmail.org) was one of the first entrants in the field of -digital disease \sangeeta{surveillance}. ProMED was created in 1994 as a surveillance -network to provide early warning of emerging \sangeeta{and re-emerging} +digital disease surveillance. ProMED was created in 1994 as a surveillance +network to provide early warning of emerging and re-emerging infections \cite{morse2012public}. ProMED collates information from various sources that include media reports, official reports, local observers, and a network of clinicians throughout the world. The reports generated through bottom-up surveillance are reviewed by a team of -subject matter experts before being posted to \sangeeta{the} ProMED network. ProMED now +subject matter experts before being posted to the ProMED network. ProMED now provides free email based reports on outbreaks to over 70,000 subscribers in at least 185 countries. @@ -168,8 +178,8 @@ on a map \cite{freifeld2008healthmap}. The surveillance data collected by HealthMap has been incorporated into the Epidemic Intelligence from Open Sources (EIOS) surveillance system, developed -by WHO. \sangeeta{Both ProMED and} HealthMap -is used by key public health bodies, including the Centers for Disease +by WHO. Both ProMED and HealthMap +are used by key public health bodies, including the Centers for Disease Control and Prevention (CDC) and the World Health Organisation (WHO). Some other examples of digital disease detection tools include MediSys @@ -180,14 +190,14 @@ real-time. Collating timely data, while critical, is only the first step in the -disease surveillance process. Compiling the data, conducting analysis, +disease surveillance process. Compiling the data, conducting analyses, and generating reports that are easily understood and actionable are -equally important. For instance, \sangeeta{one of the ProMED outbreak analysts +equally important. For instance, one of the ProMED outbreak analysts reported in March 2014 on the likely spread of Zika to the Americas \cite{promedzika}, well before the epidemic surfaced in South -America in February 2015.} However, lacking an easy to use and openly -accessible tool to quantify and visualize the \sangeeta{reported risk of - disease spread}, this report did not have any significant impact on public health resource +America in February 2015. However, lacking an easy to use and openly +accessible tool to quantify and visualize the reported risk of + disease spread, this report did not have any significant impact on public health resource allocation and decision making. While there has been a growing interest in using various internet data streams for epidemiological investigations \cite{generous2014global, milinovich2015role} and @@ -202,8 +212,8 @@ visualize risks posed by outbreak events using digital surveillance data. Our approach relies on a relatively simple statistical framework that integrates multiple data streams to predict the future incidence -trajectory, and \sangeeta{quantifies} spatial heterogeneity in the risk of disease -spread. In this paper, we present the model implemented in this \sangeeta{framework} +trajectory, and quantifies spatial heterogeneity in the risk of disease +spread. In this paper, we present the model implemented in this framework and as a case study, we report the analysis of ProMED and HealthMap data for the 2013-16 West African Ebola epidemic. To assess the robustness of the digital surveillance data, we also applied the framework to the data @@ -220,9 +230,9 @@ \section*{Results} Incidence time series were computed from ProMED, HealthMap and WHO data (see Methods) for the three mainly affected countries, Guinea, -Liberia and Sierra Leone, and are shown in Fig~\ref{fig:whopmhm}. \sangeeta{We +Liberia and Sierra Leone, and are shown in Fig~\ref{fig:whopmhm}. We pre-processed the data from ProMED and HealthMap to -remove inconsistencies and to fill gaps (see Methods for details).} +remove inconsistencies and to fill gaps (see Methods for details). The raw and processed data from ProMED and HealthMap for all countries included in the feed are available in the Github repository for this @@ -235,9 +245,9 @@ \section*{Results} used here are an extensively cleaned version of the data collected during the epidemic \cite{who2014ebola, team2015west}, published more than a year after the epidemic was declared to be -over. \sangeeta{Moreover, while ProMED and HealthMap did not record - probable cases for this epidemic, the WHO data contained confirmed, - probable and suspected cases}. However, +over. Moreover, while ProMED and HealthMap did not record +probable cases for this epidemic, the WHO data contained confirmed, +probable and suspected cases. However, despite these discrepancies, the weekly incidence derived from ProMED and HealthMap was moderately to highly correlated with that reported by WHO later (Pearson's correlation coefficients aggregated across the @@ -259,13 +269,13 @@ \section*{Results} median $R^t$ estimates on sliding 4-week windows from ProMED or HealthMap data with the estimates from WHO data varied from weak (0.30, between reproduction number from WHO and ProMED in Guinea) to very strong -(0.72, between reproduction number from WHO and \sangeeta{ProMED} in Sierra +(0.72, between reproduction number from WHO and ProMED in Sierra Leone, Fig~\ref{fig:rcorr}). Since HealthMap uses ProMED alerts in addition to other online data sources, ProMED represents the more conservative data source between the two. Therefore we present the results based on ProMED data in the main -text, unless otherwise \sangeeta{specified}. The analysis based on HealthMap and +text, unless otherwise specified. The analysis based on HealthMap and WHO data are presented in the Supplementary Information (SI Sections \ref{sec:hm} and \ref{sec:who}). @@ -309,15 +319,6 @@ \subsection*{Short term forecasts}\label{short-term-forecasts} systematic bias in any week of the forecast horizon (median bias 0.12, Fig \ref{fig:perfpm24}A). - - -% To further assess the -% potential tendency of our model to systematically under- or over-predict -% observations, we used a measure of bias ranging from -1 for a model -% which always under-predicts, to 1 for a model which always over-predicts -% (see Methods). The mean bias over the forecast horizon was 0.077 (IQR -% -0.93 - 0.98) indicating - Typically, individual forecasts were within 17.0\% (95\% CrI 0 - 80\%) of the median forecast (based on the median and 95\% CrI for relative sharpness, Fig \ref{fig:perfpm24}B, see Methods for details). @@ -329,48 +330,28 @@ \subsection*{Short term forecasts}\label{short-term-forecasts} forecasts interval, reducing to 49.4\%, 42.3\% and 45.0\% in the second, third and fourth weeks of the forecast horizon. - -% The mean value of the relative mean absolute error -% in the first week of the forecast horizon was 1.32 (Inter Quartile -% Range, IQR: 0.18 - 0.68), suggesting that on average our central -% one-week ahead predictions were about a third off either way of the -% observations. The mean bias for one-week ahead projections was 0.07 (IQR -% -0.81 - 0.94) indicating little systematic bias. The mean sharpness in -% the first week of the forecast horizon is 0.13 (IQR 0.05 - 0.15) meaning -% that on average our individual projected one-week ahead epidemic -% trajectories are within 13\% of the median projection. - -% Similarly, the mean relative error -% increases up to 10.02 (IQR 0.32 - 2.19) in week 4, the mean bias up to -% 0.08 (IQR -0.53 - 0.94) and the mean sharpness up to 0.17 (IQR 0.07 - -% 0.24). - -The model performance varied depending on the \sangeeta{country and - the } phase of the epidemic, defined as ``growing'', ``declining'', and +The model performance varied depending on the country and +the phase of the epidemic, defined as ``growing'', ``declining'', and ``stable'' depending on $R^t$ estimates (see Methods). In general, the model performance was best in the stable phase with 66.7\% of the observations contained in the 95\% forecasts interval (versus 40.2\% and 30.8\% in the declining and growing phases respectively, -SI Table \ref{tab:propinci}). However the forecast uncertainty was largest in the stable +SI Table~\ref{tab:propinci}). However the forecast uncertainty was largest in the stable phase and smallest in the growing phase (Fig \ref{fig:perfpm24}B). Importantly, in the growing phase the model tended to over-predict while under-predicting in the declining phase (Fig \ref{fig:perfpm24}A). -% Fitting the model with alternative calibration windows -% modifies the model complexity as the number of non-overlapping time -% windows, and thus parameters, over the course of the epidemic increases -% with shorter time windows. Overall, the model performed moderately better using WHO data compared -to ProMED and HealthMap data (Fig \ref{fig:perfds}) \sangeeta{and with shorter -calibration windows} (Fig \ref{fig:perftw}). +to ProMED and HealthMap data (Fig \ref{fig:perfds}) and with shorter +calibration windows (Fig \ref{fig:perftw}). \subsection*{Risk of spatial spread} Although our model was not always able to robustly predict the future incidence in the three mainly affected countries, we found that it allowed to robustly predict the -presence or absence of cases in all countries up to a week in -advance. For each week and each country in \sangeeta{Africa}, our model +presence or absence of cases in all countries in Africa up to a week in +advance. For each week and each country in Africa, our model generated an alert if the predicted incidence (using a predetermined percentile of the forecast interval) was greater than 0. We classified an alert for a given week as a true @@ -379,27 +360,29 @@ \subsection*{Risk of spatial spread} observed but were not predicted by the model. Using different percentiles of the forecast (e.g., the median or the 95\textsuperscript{th} percentile) -yielded different rates of true, false and missed alerts, \sangeeta{which} were +yielded different rates of true, false and missed alerts, which were summarised in a ROC curve (Fig \ref{fig:alerts}A). Overall, our model -achieved high sensitivity but variable specificity \sangeeta{with} higher -percentiles as cut-off \sangeeta{yielding} increased sensitivity but poorer -specificity. Maximising the average between -sensitivity and specificity led to \sangeeta{95.9\%} sensitivity and -\sangeeta{98.0\%} specificity at \sangeeta{50\% threshold} -(Fig~\ref{fig:alerts}A). \sangeeta{The +achieved high sensitivity (i.e., true alert rate) but variable specificity +(i.e., 1 - false alert rate). +Maximising the average between +sensitivity and specificity led to 93.7\% sensitivity and +96.0\% specificity at 42.5\% threshold (Fig~\ref{fig:alerts}A). The sensitivity of the model remained high over longer forecast horizon while the specificity deteriorated with more false alerts being raised -4 week ahead (Fig~\ref{fig:alerts4weekahead}). At the same -threshold, the model attained high -specificity (98\%) but poor sensitivity (42.9\%) -when all countries in Africa other than +4 week ahead (Fig~\ref{fig:alerts4weekahead}). Both the sensitivity +and the specificity of the model remained high when the +analysis was restricted to all countries in Africa other than the three majorly affected countries (Guinea, Liberia and Sierra -Leone) were considered (Fig~\ref{fig:alertsallbut3}). -Again, at 50\% threshold, the model exhibited -high specificity (98.0\%) but poor sensitivity (42.9\%) in predicting -presence of cases in weeks following a week with no observed cases in each -country (Fig~\ref{fig:rocnocases}). Out of the 8 missed alerts in this -case, 3 are in Liberia are towards the end of the epidemic, after +Leone). The average of sensitivity and specificity was maximum at +92.5\% threshold in this case with 85.7\% sensitivity and 81.7\% specificity +(Fig~\ref{fig:alertsallbut3}). +The model exhibited +high sensitivity (83.3\%) and specificity (82.0\%) in predicting +presence of cases in weeks following a week with no observed cases in +all countries in Africa (Fig~\ref{fig:rocnocases}) at 93\% threshold +(similarly chosen to maximise the average between sensitivity and +specificity). Out of the 9 one-week ahead missed alerts in this +case, 3 are in Liberia towards the end of the epidemic, after Liberia had been declared Ebola free on two separate occasions \cite{liberiaef1, liberiaef2}. The serial interval distribution that we have used does not account for very long intervals between @@ -407,35 +390,30 @@ \subsection*{Risk of spatial spread} latest available data on pairs of primary and secondary infections and models that allow for more heterogeneity in the distribution of cases e.g., -using Negative Bionamial distributions +using Negative Binomial distributions could potentially improve the assessment of risk of spread in such -cases \cite{thompson2019improved, eggo2015duration} .} - -\sangeeta{In both cases, higher sensitivity with a reasonably high -specificity was achieved by increasing the cut-off at which an alert -is raised (Fig~\ref{fig:tprbythreshold})}. The cut-off for raising an alert -can thus be chosen based -on the relative costs of missed and false alerts \sangeeta{potentially +cases \cite{thompson2019improved, eggo2015duration}. + +The choice of a threshold at which to raise an alert is subjective and +involves a trade-off between sensitivity and specificity. +In general, using a high threshold to raise an alert leads to +high sensitivity with a reasonably high specificity +(Fig~\ref{fig:tprbythreshold}). +The cut-off for raising an alert can be informed by +the relative costs of missed and false alerts potentially using a higher cut-off when the observed incidence is low. -It is also worth noting that in each instance where our model failed to raise an +It is also worth noting that where our model failed to raise an alert in a week, either a true or a false alert had -been raised in the recent weeks indicating a potential risk of spread -of the epidemic in that country.} - -% For instance, in May 2014, the model \sangeeta{correctly predicted} the -% presence of cases in Liberia after three consecutive weeks with no -% observed incidence (\ref{fig:alerts}B). No false alerts were raised -% for any of these countries (Liberia, Sierra Leone, Nigeria, Senegal -% and Mali) and the model sensitivity ranged from 11.4\% -% (using 2.5th percentile) to 93.2\% (97.5th percentile). +been raised in the recent weeks in all but one instance, +indicating a potential risk of spread of the epidemic to that country. -\sangeeta{These results} suggest that the model is able to +These results suggest that the model is able to capture and even anticipate the spatial spread of the epidemic. -\sangeeta{ Importantly, as the model is fitted to data from any of the - three data sources accumulating over +Importantly, as the model is fitted to data from any of the +three data sources accumulating over the course of the epidemic, it is able to predict the presence of cases relatively early in the epidemic (Fig~\ref{fig:tprovertime}) -when such inputs would be particularly useful.} +when such inputs would be particularly useful. Together with providing operational outputs such as the predicted short-term incidence in currently affected countries or the risk of @@ -446,45 +424,39 @@ \subsection*{Risk of spatial spread} two weeks (Fig~\ref{fig:pm24}, second row). We found that these near real-time estimates of \(R_t\) were in good agreement with retrospective estimates obtained using the entire incidence time-series (Fig~\ref{fig:pm24}, bottom row, -correlation coefficients varying between \sangeeta{0.58 and 0.90}, +correlation coefficients varying between 0.58 and 0.90, Fig~\ref{fig:rcorrrealretro}). The risk of spatial spread in our model relies on estimating movement -\sangeeta{patterns} of infectious cases. Our method also provides estimates of -%parameters of the underlying gravity model (see Methods), \(p_{stay}\), +patterns of infectious cases. Our method also provides estimates of the probability of a case staying within a country throughout their infectious period, and the extent to which -distance between two locations affects the flow \sangeeta{of - infectious cases} between them. The +distance between two locations affects the flow of +infectious cases between them. The real-time estimates of these parameters over the course of the epidemic (Fig \ref{fig:parsul2}), suggest that while the relative flow -\sangeeta{of cases} between locations did not vary substantially over time, the probability +of cases between locations did not vary substantially over time, the probability of travel across national borders may have decreased after the initial phase of the epidemic. -% Figure 6 also illustrates how accumulating data -% throughout the course of the epidemic allows estimating such parameters -% more precisely. Finally, we quantify the relative risk of importation of a case into a country from any other currently affected country. The risk of importation is proportional to the population flow into a country from all other countries estimated using a mobility model (here, gravity -model) weighted by the infectivity at each country \sangeeta{(which - depends on the number of cases and time at which they were infected, - see Methods)}. Our +model) weighted by the infectivity at each country (which +depends on the number of cases and time at which they were infected, +see Methods). Our estimates of the countries with higher risk of acting as source of importations are largely consistent with the reported source of cases (Fig \ref{fig:imprisk}). In 4 out of the 5 reported cases of international spread of the epidemic in West Africa, the model correctly attributed -\sangeeta{the highest} relative -risk of acting as a source of importation \sangeeta{(relative risk $ > +the highest relative +risk of acting as a source of importation (relative risk $ > 0.9$ for importation into -Liberia, Nigeria and Sierra Leone and 0.49 in case of Mali)} to the actual source +Liberia, Nigeria and Sierra Leone and 0.49 in case of Mali) to the actual source identified through retrospective epidemiological and genomic investigations \cite{gire2014genomic}. -% According to "retrspective" investigations -% \section*{Discussion}\label{discussion} @@ -495,22 +467,22 @@ \section*{Discussion}\label{discussion} surveillance data from ProMED or HealthMap to 1) predict the short-term epidemic trajectory in currently affected countries, 2) quantify the short-term risk of spread to other countries and 3) for countries at -risk \sangeeta{of importation}, quantifies where the risk comes from. We apply our model to data +risk of importation, quantify where the risk comes from. We apply our model to data collected during the West African Ebola epidemic of 2013-2016, curated -by the \sangeeta{outbreak analysts} at ProMED/HealthMap, and we compare the model's output +by the outbreak analysts at ProMED/HealthMap, and we compare the model's outputs to those obtained when using the data collated by the WHO and made available at the end of the epidemic. In spite of the manual curation of the data carried out by -\sangeeta{outbreak analysts} at ProMED and HealthMap, substantial issues remained in +outbreak analysts at ProMED and HealthMap, substantial issues remained in the quality and consistency of the data feeds. Dealing with issues such as missing data and inconsistent records will be a key challenge in using these data for prospective real-time analysis. Despite these challenges, we show the potential of digital surveillance tools to -1) \sangeeta{inform} early detection , 2) characterise the spread, and 3) forecast +1) inform early detection, 2) characterise the spread, and 3) forecast the future trajectory of outbreaks. Particularly in an evolving outbreak scenario, when information from traditional surveillance is limited and -only available with a significant delay, digital surveillance data \sangeeta{can} +only available with a significant delay, digital surveillance data can be used to complement the information gap. For instance, during the West African Ebola epidemic, the first situation report by the WHO was published only at the end of August 2014 \cite{whositrep}, @@ -522,8 +494,8 @@ \section*{Discussion}\label{discussion} In general, we found that, after systematic processing to remove inconsistencies, data from ProMED and HealthMap were in -\sangeeta{reasonably} good agreement -with the data collated by WHO \sangeeta{and collected} through traditional surveillance methods. +reasonably good agreement +with the data collected through traditional surveillance methods and collated by WHO. In particular, both the incidence time-series and retrospective national estimates of transmissibility over time were well correlated across the three data sources. This suggests that digital surveillance data are a @@ -575,11 +547,11 @@ \section*{Discussion}\label{discussion} international spread 1 to 4 weeks in advance. Choosing a cut-off to maximise sensitivity led to low model specificity. On occasions the model predicted cases in countries, such as Côte d'Ivoire, where -\sangeeta{neither WHO nor} ProMED reported any case. +neither WHO nor ProMED reported any case. However this may also be due to imperfect case reporting. Thus our method could be used with a high cut-off as a highly sensitive surveillance system with an alert triggering further epidemiological -investigations \sangeeta{and implementation of epidemic preparedness measures}. +investigations and implementation of epidemic preparedness measures. A key feature of our model is that it provides, for each country identified as at risk, a map of where the @@ -591,7 +563,7 @@ \section*{Discussion}\label{discussion} digital surveillance into concrete operational outputs in real-time that could assist in epidemic management and control. -\sangeeta{The systematic collection, storage, organization and communication +The systematic collection, storage, organization and communication of disease surveillance data were especially challenging during the West African Ebola epidemic as the deficiencies in transportation and communication resources, @@ -600,22 +572,22 @@ \section*{Discussion}\label{discussion} The collection of case incidence data and rapid dissemination through digital surveillance systems was further hampered by the limited information technology and internet -service. Thus, } +service in the countries most affected. Thus, for the West African Ebola epidemic, ProMED and HealthMap data were -available at a coarse spatial scale with the \sangeeta{sub-national} level information +available at a coarse spatial scale with the sub-national level information for cases missing in most of the records. This limited our analysis to the spread of the outbreak across national borders only, although -\sangeeta{in principle} our -model could deal with data at any spatial scale. \sangeeta{Both ProMED and HealthMap -collect more granular data for most outbreaks, utilizing these data +in principle our +model could deal with data at any spatial scale. Both ProMED and HealthMap +collect more granular data for most outbreaks. Utilizing these data to investigate outbreaks and regions would provide further evidence of the ability of digital surveillance data to usefully complement data collected from traditional -surveillance. Another} interesting research +surveillance. Another interesting research avenue would be to explore ways of integrating timely data from ProMED and HealthMap with delayed data from ground -surveillance \sangeeta{ to generate timely insights into the spatial spread -of an outbreak}. +surveillance to generate timely insights into the spatial spread +of an outbreak. The framework presented in this paper was developed as a proof-of-concept to use digital surveillance data for near real-time @@ -637,27 +609,27 @@ \section*{Discussion}\label{discussion} Importantly, many other open data sources could be included in our framework to improve model performance. For example, data on human mobility could be used to further inform the parametric form and -parameter values of our mobility model. \sangeeta{We have incorporated +parameter values of our mobility model. We have incorporated a simple and well-characterised model of population movement in the current work. In addition to utilising other possible data sources, future work could consider other models of human population movement -\cite{simini2012universal}.} +\cite{simini2012universal}. When relevant, spatially-explicit data on population-level immunity to the circulating pathogen (e.g.~following previous epidemics and/or due to vaccination) -could also be used to refine our transmission model. \sangeeta{ +could also be used to refine our transmission model. Finally, the impact of the health capacity of a region to respond to a public health emergency could also be accounted for in future iterations of -the model. Ongoing efforts to collate } +the model. Ongoing efforts to collate quantitative information on the performance of health systems and the ability of regions or countries to respond to an epidemic -\cite{healthsites}, \cite{maina2019spatial} \sangeeta{can potentially provide - valuable data sources for future work}. -\sangeeta{Here using a relatively simple +\cite{healthsites}, \cite{maina2019spatial} can potentially provide + valuable data for future work. +Here using a relatively simple modelling approach we provide one of the first pieces of evidence of the potential value of digital surveillance for real-time quantitative analysis of -epidemic data, with important operational and actionable outputs.} +epidemic data, with important operational and actionable outputs. \subsection*{Figures}\label{figures} @@ -723,13 +695,13 @@ \subsection*{Figures}\label{figures} \end{figure*} \begin{figure*} -\includegraphics{{../../ms-figures/2019-10-31_alerts_in_1_all_roc_ProMED_14_28}.pdf} +\includegraphics{{../../ms-figures/2019-11-09_alerts_in_1_all_roc_ProMED_14_28}.pdf} \caption{Predicted weekly presence of cases in each country. The left panel shows the True and False alert rates using different thresholds for classification for alerts raised 1 (violet), 2 (light violet), 3 (dark pink) and 4 (green) weeks ahead. The black curve depicts the overall True and False alert rates. On each -curve, the dot shows the True and False Alert rates at 50\% threshold. +curve, the dot shows the True and False Alert rates at 42.5\% threshold. For a given threshold (\(x^{th}\) percentile of the forecast interval), we defined a True alert for a week where the \(x^{th}\) percentile of the forecast interval and the @@ -743,22 +715,22 @@ \subsection*{Figures}\label{figures} similarly the ratio of false alerts to the total number of false alerts and weeks of no alert (where the observed and the threshold incidence are both 0). The right panel shows the True (green), False (orange) and -Missed (red) 1 week ahead alerts using the 50\textsuperscript{th} percentile of the +Missed (red) 1 week ahead alerts using the 42.5\textsuperscript{th} percentile of the forecast interval as threshold. The figure only shows countries on the -African continent for which either the 50\textsuperscript{th} +African continent for which either the 42.5\textsuperscript{th} percentile of the predicted incidence or the observed incidence was greater than 0 at least once. The first alert in each country is shown -using larger symbols (square or triangle). Alerts in a country in a +using larger squares. Alerts in a country in a week where there were no observed cases in the previous week are shown -using hollow triangles. In each case, weeks for which all observed points +using triangles. In each case, weeks for which all observed points were imputed are shown in lighter shades. Country codes, shown on the y-axis, are as -follows: BFA - Burkina Faso, CIV - Côte d'Ivoire, -GHA - Ghana, GIN - Guinea, LBR - Liberia, MLI - Mali, NGA - Nigeria, -SEN - Senegal, SLE - Sierra Leone. The alerts are based on forecasts -using the ProMED data, a 2-week calibration window and a 4 week +follows: CIV - Côte d’Ivoire, GHA - Ghana, GIN - Guinea, +LBR - Liberia, MLI - Mali, NGA - Nigeria, SEN - Senegal, +SLE - Sierra Leone, BFA - Burkina Faso. The alerts are based on forecasts +using ProMED data, a 2-week calibration window and a 4 week forecast horizon.} \label{fig:alerts} \end{figure*} @@ -802,7 +774,7 @@ \subsection*{Supporting Information (SI)} \label{processing-promedhealthmap-data-feed} We used a set of curated ProMED and HealthMap records on the human cases of Ebola in West Africa. The dates in the feeds ranged from March 2014 -to October 2016. \sangeeta{Each dataset} recorded the cumulative number of +to October 2016. Each dataset recorded the cumulative number of suspected/confirmed cases and suspected/confirmed deaths by country at various dates in this period. We derived the country specific daily and weekly incidence time series from these data after the following data @@ -812,16 +784,16 @@ \subsection*{Supporting Information (SI)} \item We first extracted the total case counts as a sum of suspected and confirmed cases (ProMED and HealthMap data did not record probable - cases \sangeeta{for the West African Ebola epidemic}). + cases for the West African Ebola epidemic). \item Each unique record in ProMED is associated with a - unique alert id. \sangeeta{A alert id can correspond to various reports from + unique alert id. An alert id can correspond to various reports from different sources (news, social media etc.) which might report different case numbers. In such cases, we assigned the median of the - case numbers to the record.} + case numbers to the record. \item In some instances, cumulative case count was inconsistent in that it - failed to be monotonically \sangeeta{non-decreasing}. + failed to be monotonically non-decreasing. We identified consecutive dates (\(t_k\) and \(t_{k + 1}\)) where the cumulative case count was not increasing. If removing either \(t_k\) or \(t_{k + 1}\) made the @@ -830,9 +802,9 @@ \subsection*{Supporting Information (SI)} we removed both \(t_k\) and \(t_{k + 1}\). These rules were applied iteratively until the cumulative case count was consistent. \item - \sangeeta{If an inconsistent point was at the end of the the cumulative case + If an inconsistent point was at the end of the the cumulative case series, applying the above rules led to the removal of a large - number of points. Hence,} to remove outliers at the end of , we used + number of points. Hence, to remove outliers at the end, we used Chebyshev inequality with sample mean \cite{saw1984chebyshev}. Given a set of observations \(X_1, X_2 \dots X_n\), the formulation of Chebyshev inequality by Saw et al.~gives the probability that the @@ -852,15 +824,16 @@ \subsection*{Supporting Information (SI)} \end{itemize} An example of the pre-processing of ProMED feed is presented in the -supplementary text (Figure 1). +supplementary text (SI Fig~\ref{fig:dataclean}). \subsection*{Data collated by WHO}\label{data-collated-by-who} -We used the \sangeeta{West African Ebola} incidence data +We used the West African Ebola incidence data collated by the WHO during the -epidemic \sangeeta{which was} made available by \cite{garske20160308} -\sangeeta{approximately an year after the end of the epidemic}, referred to as -the ``WHO data'' in the interest of brevity. The cleaned version of the +epidemic which was made available +approximately an year after the end of the epidemic +\cite{garske20160308}. +This data set is referred to as ``WHO data'' in the interest of brevity. The cleaned version of the WHO data consisted of cases reported between December 2013 and October 2015 in the three most affected countries - Guinea, Sierra Leone and Liberia. The location of residence of cases was geo-coded to the second @@ -868,13 +841,13 @@ \subsection*{Data collated by WHO}\label{data-collated-by-who} match the spatial resolution of ProMED and HealthMap that were only available at the country level. -\sangeeta{Since the data collected by ProMED and - HealthMap are manually curated by outbreak analysts, we have used - the word ``curate'' in refering to their data collection - process. Similarly, we refer to the data ``collated'' by WHO +Since the data collected by ProMED and +HealthMap are manually curated by outbreak analysts, we have used +the word ``curate'' in referring to their data collection +process. Similarly, we refer to the data ``collated'' by WHO as WHO coordinated the international response to the outbreak and in this role, they collated the information from Ministries of Health, -situation reports from NGOs, and local and foreign medical teams.} +situation reports from NGOs, and local and international medical teams. \subsection*{Demographic Data}\label{demographic-data} @@ -1005,8 +978,8 @@ \subsection*{Statistical Model}\label{statistical-model} \label{eq:lambdajt} \end{equation} -The likelihood of the parameters for data \sangeeta{up to time $T$, the duration -of the outbreak so far}, is +The likelihood of the parameters for data up to time $T$, the duration +of the outbreak so far, is \begin{equation*} \mathcal{L} = P @@ -1015,7 +988,7 @@ \subsection*{Statistical Model}\label{statistical-model} p_{stay}, \gamma, {\langle R_{j}^{t} \rangle}_{j = 1, 2 \dots n}^{t = 1, 2, \dots T}\right) = - \prod_{t = 1}^{t}{e^{-\Lambda_{i}^{t}} + \prod_{t = 1}^{T}{e^{-\Lambda_{i}^{t}} \frac{\left(I_{i}^{t}\right)^{\Lambda_{i}^{t}}}{\Lambda_{i}^{t} !}}. \end{equation*} @@ -1049,8 +1022,8 @@ \subsection*{Statistical Model}\label{statistical-model} windows increasing the number of parameters in the model. To reduce the number of parameters in the model, we divided the 55 countries on -African continent into 5 groups and \sangeeta{forced each country in a -group to have the same reproduction number} in each time window. +African continent into 5 groups and forced each country in a +group to have the same reproduction number in each time window. The first three groups correspond of the three mainly affected countries - Sierra Leone, Guinea and Liberia. The countries that shared a border with these three countries @@ -1072,9 +1045,7 @@ \subsection*{Statistical Model}\label{statistical-model} For the gravity model parameters \(p_{stay}\) and \(\gamma\), we chose uninformative uniform priors. Since \(p_{stay}\) is a probability, the -prior was a uniform distribution on the interval \([0, 1]\). For -simplicity, and in the absence of data to inform these, we assumed the -exponents \(\alpha\) and \(\beta\) on the populations to be 1. +prior was a uniform distribution on the interval \([0, 1]\). \(\gamma\) was allowed to vary between 1 and 2 in the results presented in the main text. We performed a sensitivity analysis where \(\gamma\) has a uniform prior between 1 and 10. The results of this analysis are @@ -1104,7 +1075,7 @@ \section*{Model Validation}\label{assess} Given the retrospective nature of our analysis, we validated the incidence projected using our model against observed incidence. In -addition to the \sangeeta{accuracy} of the projected incidence, the uncertainty +addition to the accuracy of the forecasted incidence, the uncertainty associated with the forecasts (e.g., measured by the width of the prediction interval) is an important indicator of model performance. A narrow prediction interval that contains the observed values is diff --git a/tex/pnas/SI.tex b/tex/pnas/SI.tex index c84c66f..fe2c93b 100644 --- a/tex/pnas/SI.tex +++ b/tex/pnas/SI.tex @@ -33,25 +33,26 @@ \section*{Overview}\label{overview} In this supplement, we present the details of the pre-processing of -ProMED and HealthMap feeds \ref{sec:data-cleaning}, +ProMED and HealthMap feeds (Section~\ref{sec:data-cleaning}), the model results using ProMED, HealthMap and WHO data and the impact of the datasources and model parameters on the performance of the model. We varied the length of the time window used for model fitting (see Methods for details). SI Sections 2, SI Section 3 and SI Section 4 present the forecasts using ProMED, HealthMap and WHO data respectively using different calibration windows (2, 4 -and 6 weeks) and forecast horizons (4, 6 and 8 weeks). The calibration window -did not have a strong influence on the model performance (SI Figure 28). -Similarly, the data used also did not influence the model performance -(SI Figure 29). We explored the impact of alternate priors for the +and 6 weeks) and forecast horizons (4, 6 and 8 weeks). The model +performance was moderately better with shorter calibration windows +(Fig~\ref{fig:perftw}) and with WHO data (Fig~\ref{fig:perfds}). +We explored the impact of alternate priors for the gravity model parameter on the results from the model. The results of -this sensitivity analysis are presented in SI Section 7. We present -additional analysis on the predicting the spatial spread of the +this sensitivity analysis are presented in +Section~\ref{sec:sensitivity-analysis}. +We present additional analysis on the predicting the spatial spread of the epidemic in Section~\ref{sec:spatial-spread}. -\section{Processing ProMED and HealthMap data} -\subsection{Data cleaning}\label{sec:data-cleaning} +\section{Processing ProMED and HealthMap data}\label{sec:data-cleaning} +\subsection{Data cleaning} The data from ProMED and HealthMap was pre-processed to remove inconsistencies and address missing data (see Methods). The data on @@ -96,11 +97,10 @@ \subsection{Data cleaning}\label{sec:data-cleaning} \end{subfigure} \hfill \begin{subfigure}[t]{0.3\textwidth} \end{subfigure} \hfill - \caption{Illustration of workflow for Processing ProMED feed. Raw ProMED feed consisted of suspected and confirmed cases and suspected and confirmed - deaths. The top left figure (a) shows the suspected (blue) and + deaths. The top left figure (a) shows the suspected (teal) and confirmed (red) cases in Sierra Leone in the raw feed. We used the suspected and confirmed cases to derive cumulative incidence data (b). If there were more than one alert on a day, @@ -138,9 +138,13 @@ \subsection{Correlation between R estimates}\label{correlation-between-r-estimat The correlation between estimates of time-varying reproduction number estimated using ProMED, HealthMap and WHO data depended on the time window used for estimation and the country -(Fig~\ref{fig:rcorr}). Similarly, for each data source, the -retrospective estimates of R obtained using the full incidence curve -were in reasonably good agreement with the real-time estimates of R. +(Fig~\ref{fig:rcorr}). Restricting the analysis was robust to using +reproduction number estimates with lower uncertainty (coefficient of +variation less than 0.25, Fig~\ref{fig:lowcv}) +Similarly, for each data source, the +retrospective estimates of reproduction number in the Bayesian +framework using the full incidence curve +were in reasonably good agreement with the real-time estimates. The strength of the correlation varied depending on the country and the length of the calibration window (Fig~\ref{fig:rcorrrealretro}). @@ -149,7 +153,7 @@ \subsection{Correlation between R estimates}\label{correlation-between-r-estimat \includegraphics[width=\textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/other/2019-10-25_median_R_corr.pdf} \caption{Correlation between time-varying reproduction number estimated from ProMED, HealthMap and WHO data. The reproduction numbers were - estimated using R package EpiEstim over a 2, 4 or 6 week sliding + estimated using R package EpiEstim over a 4 week sliding window. Median estimates from WHO data are on the x-axis and the median estimates using ProMED (blue) and HealthMap (green) data are on the y-axis. All correlation coefficients were statistically @@ -157,6 +161,20 @@ \subsection{Correlation between R estimates}\label{correlation-between-r-estimat \label{fig:rcorr} \end{figure}\FloatBarrier +\begin{figure} + \centering + \includegraphics[width=\textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/other/2019-11-08_who-hm-promed-comparison-8-1_low_cv} + \caption{Correlation between time-varying reproduction number estimated from + ProMED, HealthMap and WHO data. Only estimates with a coefficient + of variation less than 0.25 were included in this analysis. + The reproduction numbers were + estimated using R package EpiEstim over a 4 week sliding + window. Median estimates from WHO data are on the x-axis and the + median estimates using ProMED (blue) and HealthMap (green) data + are on the y-axis. All correlation coefficients were statistically + significant.} + \label{fig:lowcv} +\end{figure}\FloatBarrier \begin{figure} \centering @@ -204,7 +222,7 @@ \subsubsection{Forecast horizon of 4 weeks}\label{sec:pm24} All & 30.8\% & 40.2\% & 66.7\% & 48.7\% \\ \bottomrule \end{tabular} -\end{table}\FloatBarrier +\end{table} \subsubsection{Forecast horizon of 6 weeks}\label{sec:pm26} @@ -963,7 +981,8 @@ \section{Sensitivity Analysis}\label{sec:sensitivity-analysis} population flow. Here \(\gamma\) is allowed to vary between 1 and 10. \(p_{stay}\) represents the probability of an individual to stay in a given location during their infectious period. The solid lines - represents the median estimates obtained using ProMED data. The shaded + represents the median estimates obtained using ProMED (blue), + HealthMap (green) and WHO (yellow) data. The shaded regions represent the 95\% CrI.} \label{fig:parsul10} \end{figure}\FloatBarrier @@ -1115,8 +1134,8 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} Fig~\ref{fig:alerts4weekahead}. We also assessed the sensitivity of the model when the analysis was restricted to countries other than Guinea, Liberia and Sierra Leone (Fig~\ref{fig:alertsallbut3}). At -50\% threshold, the -model exhibited high specificity (98.0\%) but poor sensitivity (42.9\%) in predicting +93\% threshold, the +model exhibited high specificity (83.3\%) and sensitivity (82.0\%) in predicting presence of cases in weeks following a week with no observed cases in each country (Fig~\ref{fig:rocnocases}). In predicting presence of cases in countries with no or low incidence, or in a @@ -1131,7 +1150,7 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} \begin{figure} \centering -\includegraphics[width = \textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-10-30_alerts_by_week_ProMED_14_28} +\includegraphics[width = \textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-11-09_alerts_by_week_ProMED_14_28_allcountries} \caption{Predicted weekly presence of cases in each country up to 4 weeks ahead. For a given threshold (\(x^{th}\) percentile of the forecast interval), we defined a True alert for a week @@ -1142,9 +1161,9 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} the threshold for a country was 0 but the observed incidence for that country was greater than 0. The panels shows the True (green), False (orange) and Missed (red) 1, 2, 3 and 4 week ahead alerts using the -50\textsuperscript{th} percentile of the forecast interval as threshold. +42.5\textsuperscript{th} percentile of the forecast interval as threshold. The figure only shows countries on the -African continent for which either the 50\textsuperscript{th} +African continent for which either the 42.5\textsuperscript{th} percentile of the forecast interval or the observed incidence was greater than 0 at least once. The first alert in each country is shown @@ -1181,14 +1200,14 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} \begin{figure} \centering -\includegraphics[width = \textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-10-31_alerts_in_1_all_roc_ProMED_14_28_allbut3} +\includegraphics[width = \textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-11-09_alerts_in_1_all_roc_ProMED_14_28_allbut3} \caption{Predicted weekly presence of cases in countries other than the three majorly affected countries (Guinea, Liberia and Sierra Leone). The left panel shows the True and False alert rates using different thresholds for classification for alerts raised 1 (violet), 2 (light violet), 3 (dark pink) and 4 (light green) weeks ahead. The black curve depicts the overall True and False alert rates. On each -curve, the dot shows the True and False Alert rates at 50\% threshold. +curve, the dot shows the True and False Alert rates at 92.5\% threshold. For a given threshold (\(x^{th}\) percentile of the forecast interval), we defined a True alert for a week where the \(x^{th}\) percentile of the forecast interval and the @@ -1202,10 +1221,10 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} similarly the ratio of false alerts to the total number of false alerts and weeks of no alert (where the observed and the threshold incidence are both 0). The right panel shows the True (green), False (orange) and -Missed (red) 1 week ahead alerts using the 50\textsuperscript{th} percentile of the +Missed (red) 1 week ahead alerts using the 92.5\textsuperscript{th} percentile of the forecast interval as threshold. The figure only shows countries on the -African continent for which either the 50\textsuperscript{th} +African continent for which either the 92.5\textsuperscript{th} percentile of the predicted incidence or the observed incidence was greater than 0 at least once. The first alert in each country is shown @@ -1214,9 +1233,17 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} using hollow triangles. In each case, weeks for which all observed points were imputed are shown in lighter shades. Country codes, shown on the y-axis, are as -follows: BFA - Burkina Faso, CIV - Côte d'Ivoire, -GHA - Ghana, MLI - Mali, NGA - Nigeria, -SEN - Senegal. The alerts are based on forecasts +follows: AGO - Angola, BEN - Benin, BFA - Burkina Faso, CIV - Côte +d’Ivoire, CMR - Cameroon, COD - Congo - Kinshasa, DZA - Algeria, EGY - +Egypt, ETH - Ethiopia, GHA - Ghana, GMB - Gambia, GNB - Guinea-Bissau, +KEN - Kenya, MAR - Morocco, MLI - Mali, MRT - Mauritania, NER - Niger, +NGA - Nigeria, SDN - Sudan, SEN - Senegal, TGO - Togo, TZA - Tanzania, +UGA - Uganda, ZAF - South Africa, MOZ - Mozambique, MWI - Malawi, SSD +- South Sudan, TCD - Chad, TUN - Tunisia, BDI - Burundi, CAF - Central +African Republic, COG - Congo - Brazzaville, LBY - Libya, MDG - +Madagascar, RWA - Rwanda, ZMB - Zambia, ZWE - Zimbabwe, ERI - Eritrea, +GAB - Gabon, SOM - Somalia, CPV - Cape Verde, GNQ - Equatorial Guinea, +NAM - Namibia. The alerts are based on forecasts using the ProMED data, a 2-week calibration window and a 4 week forecast horizon.} \label{fig:alertsallbut3} @@ -1225,11 +1252,10 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} \begin{figure} \centering \begin{subfigure}[b]{0.45\textwidth} -\includegraphics[width = \textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-10-31_roc_ProMEDno_cases_last_week_14_28} +\includegraphics[width = \textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-11-10_roc_ProMEDno_cases_last_week_14_28} \end{subfigure} \begin{subfigure}[b]{0.45\textwidth} -\includegraphics[width = \textwidth]{/Volumes/Sangeeta - EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-10-31_alerts_in_1_ProMED_no_cases_last_week_14_28} +\includegraphics[width = \textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-11-09_alerts_in_1_ProMED_no_cases_last_week_14_28_allcountries.pdf} \end{subfigure} \caption{Predicted weekly presence of cases for each country in weeks following a week with no observed cases. @@ -1237,7 +1263,7 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} different thresholds for classification for alerts raised 1 (violet), 2 (light violet), 3 (dark pink) and 4 (light pink) weeks ahead. The black curve depicts the overall True and False alert rates. On each -curve, the dot shows the True and False Alert rates at 50\% threshold. +curve, the dot shows the True and False Alert rates at 93\% threshold. For a given threshold (\(x^{th}\) percentile of the forecast interval), we defined a True alert for a week where the \(x^{th}\) percentile of the forecast interval and the @@ -1251,10 +1277,10 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} similarly the ratio of false alerts to the total number of false alerts and weeks of no alert (where the observed and the threshold incidence are both 0). The right panel shows the True (green), False (orange) and -Missed (red) 1 week ahead alerts using the 50\textsuperscript{th} percentile of the +Missed (red) 1 week ahead alerts using the 93\textsuperscript{rd} percentile of the forecast interval as threshold. The figure only shows countries on the -African continent for which either the 50\textsuperscript{th} +African continent for which either the 93\textsuperscript{rd} percentile of the predicted incidence or the observed incidence was greater than 0 at least once. The first alert in each country is shown @@ -1271,23 +1297,16 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} \label{fig:rocnocases} \end{figure}\FloatBarrier - - \begin{figure} \centering -\begin{subfigure}[b]{0.45\textwidth} -\includegraphics[width = \textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-10-31_ProMED_tpr_by_threshold_14_28_allbut3} -\end{subfigure} -\begin{subfigure}[b]{0.45\textwidth} -\includegraphics[width = \textwidth]{/Volumes/Sangeeta - EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-10-31_ProMED_tpr_by_threshold_14_28_no_cases_last_week} -\end{subfigure} -\caption{True and False alert rates at various thresholds for (A) all - countries except Guinea, Liberia and Sierra Leone, and (B) all - countries in weeks following a week with no observed cases. The - solid and dashed lines depict the True and False alert rates - respectively at 1 (violet), 2 (light violet), 3 (pink) and 4 - (light green) week ahead.} +\includegraphics[width = \textwidth]{/Volumes/Sangeeta EHD/mriids_manuscript/ms-figures/si-figures/ProMED/spatial_spread/2019-11-09_ProMED_tpr_by_threshold_14_28} +\caption{Sensitivity and specificity at various thresholds for (A) all + countries in Africa (B) all + countries in Africa except Guinea, Liberia and Sierra Leone, and (C) all + in Africa in weeks following a week with no observed cases. The + solid and dashed lines depict the sensitivity and specificity + respectively of 1 (violet), 2 (light violet), 3 (pink) and 4 + (light green) week ahead alerts.} \label{fig:tprbythreshold} \end{figure}\FloatBarrier @@ -1303,6 +1322,6 @@ \section{Risk of spatial spread}\label{sec:spatial-spread} weekly incidence for Guinea (deep orange), Liberia (green) and Sierra Leone (violet) obtained from ProMED data.} \label{fig:tprovertime} -\end{figure}\FloatBarrier +\end{figure}\FloatBarrier \end{document} \ No newline at end of file diff --git a/tex/pnas/mriids.bib b/tex/pnas/mriids.bib index 9ac6227..4f18e22 100644 --- a/tex/pnas/mriids.bib +++ b/tex/pnas/mriids.bib @@ -579,4 +579,39 @@ @article{thompson2019improved pages={100356}, year={2019}, publisher={Elsevier} +} +@article{stoddard2009role, + title={The role of human movement in the transmission of vector-borne pathogens}, + author={Stoddard, Steven T and Morrison, Amy C and Vazquez-Prokopec, Gonzalo M and Soldan, Valerie Paz and Kochel, Tadeusz J and Kitron, Uriel and Elder, John P and Scott, Thomas W}, + journal={PLoS neglected tropical diseases}, + volume={3}, + number={7}, + pages={e481}, + year={2009}, + publisher={Public Library of Science} +} +@article{kraemer2017spread, + title={Spread of yellow fever virus outbreak in Angola and the Democratic Republic of the Congo 2015--16: a modelling study}, + author={Kraemer, Moritz UG and Faria, Nuno R and Reiner Jr, Robert C and Golding, Nick and Nikolay, Birgit and Stasse, Stephanie and Johansson, Michael A and Salje, Henrik and Faye, Ousmane and Wint, GR William and others}, + journal={The Lancet infectious diseases}, + volume={17}, + number={3}, + pages={330--338}, + year={2017}, + publisher={Elsevier} +} +@article{tariq2019assessing, + title={Assessing reporting delays and the effective reproduction number: The Ebola epidemic in DRC, May 2018--January 2019}, + author={Tariq, A and Roosa, K and Mizumoto, K and Chowell, G}, + journal={Epidemics}, + volume={26}, + pages={128--133}, + year={2019}, + publisher={Elsevier} +} +@misc{internetgrowth, + year = {2019}, + title = {Individuals using the Internet (\% of population)}, + howpublished = {\url{https://data.worldbank.org/indicator/it.net.user.zs}}, + note = {[Online, accessed 08-Nov-2019]} } \ No newline at end of file