-
Notifications
You must be signed in to change notification settings - Fork 0
/
US_influenza_data.R
120 lines (100 loc) · 5.2 KB
/
US_influenza_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#SV Scarpino
#National ILI, flu lab tests, and total influenza cases
#Sept 2019
###########
#libraries#
###########
library(ggplot2)
###########
#acc funcs#
###########
###############
#Global Params#
###############
save_results <- FALSE
make_plots <- FALSE
time_stamp <- as.numeric(Sys.time())
######
#data#
######
ili <- read.csv("Data/ILINet.csv", skip = 1, stringsAsFactors = FALSE)
lab_old <- read.csv("Data/WHO_NREVSS_Combined_prior_to_2015_16.csv", skip = 1, stringsAsFactors = FALSE)
lab_new <- read.csv("Data/WHO_NREVSS_Clinical_Labs.csv", skip = 1, stringsAsFactors = FALSE)
pop <- read.csv("Data/us_pops.csv", skip = 1, stringsAsFactors = FALSE)
pops <- pop$Value[match(ili$YEAR, pop$Date)]*1000000 #US pop is in millions
ili$X.UNWEIGHTED.ILI <- as.numeric(ili$X.UNWEIGHTED.ILI)
lab_old$PERCENT.POSITIVE <- as.numeric(lab_old$PERCENT.POSITIVE)
lab_new$PERCENT.POSITIVE <- as.numeric(lab_new$PERCENT.POSITIVE)
ili$WEEK <- as.numeric(ili$WEEK)
############
#Build data#
############
#1. Multiply ili by percent pos. from WHO labs
ili_raw_raw <- ili$X.UNWEIGHTED.ILI
ili_raw <- ili_raw_raw/100
lab_pos_raw <- c(lab_old$PERCENT.POSITIVE, lab_new$PERCENT.POSITIVE) #NOTE should add a code check here that the dates line up (checked manually on Oct. 7th 2019)
lab_pos <- lab_pos_raw/100
flu_ili_who_raw <- ili_raw * lab_pos
flu_ili_who <- flu_ili_who_raw * pops
flu_ili_who_corrected <- flu_ili_who * 0.7 #Rolfes et al. 2019, see ciz075.pdf in the lit folder, estimated that the 2017-2018 influenza season was ~70% lower than what one obtains from ili_pos * who_pos. NOTE - should return and use the actual values from Rolfes with confidence intervals
#2. build season indicator
season <- rep(NA, nrow(ili))
start <- 1997:2018
stop <- 1998:2019
for(i in 1:length(start)){
sea.1.i <- which(ili$WEEK >= 40 & ili$YEAR == start[i])
sea.2.i <- which(ili$WEEK <= 20 & ili$YEAR == stop[i])
season[c(sea.1.i, sea.2.i)] <- paste0(start[i], "-", stop[i])
}
pan1 <- which(ili$WEEK >= 16 & ili$WEEK <= 33 & ili$YEAR == 2009) #note, should come back and check these against the literature
season[pan1] <- "2009 Pandemic First Wave"
pan2 <- which(ili$WEEK > 33 & ili$YEAR == 2009 | ili$WEEK <= 20 & ili$YEAR == 2010) #note, should come back and check these against the literature
season[pan2] <- "2009 Pandemic Second Wave"
#3. building data file
out <- data.frame(season, ili$YEAR, ili$WEEK, flu_ili_who, flu_ili_who_corrected, ili$X.UNWEIGHTED.ILI, lab_pos, pops)
colnames(out) <- c("Season", "Year", "Week", "Estimated_total_flu_US_ILI_WHO", "Estimated_total_flu_US_ILI_WHO_RolfesCorrected", "ILI_per_pop_weighted", "Percent_Pos_Labs", "US_pop")
if(save_results == TRUE){
#this is currently influenza_usa_sept2019.csv in the Results directory
filename <- paste0(time_stamp, "influenza_usa.csv")
write.csv(out, file = filename)
}
#4. Aggregate metrics
max_sea <- by(out$Estimated_total_flu_US_ILI_WHO_RolfesCorrected, out$Season, max, na.rm = T)
tot_sea <- by(out$Estimated_total_flu_US_ILI_WHO_RolfesCorrected, out$Season, sum, na.rm = T)
pop_out <- c(pop$Value[23:12], pop$Value[11], pop$Value[11], pop$Value[10:2])*1000000 #this is obviously bad and needs to be fixed
out2 <- data.frame(names(max_sea), as.numeric(max_sea), as.numeric(tot_sea), pop_out, as.numeric(tot_sea)/pop_out)
colnames(out2) <- c("season", "peak_cases", "total_cases", "population", "total_prop")
if(save_results == TRUE){
#this is currently flu_season_summary_USA_sept2019.csv in the Results directory
filename_season <- paste0(time_stamp, "flu_season_summary_usa.csv")
write.csv(out2, file = filename_season, row.names = FALSE, quote = FALSE)
}
#######
#Plots#
#######
if(make_plots == TRUE){
#I. Time series
#quartz()
y <- out$Estimated_total_flu_US_ILI_WHO_RolfesCorrected
x <- 1:length(y) #should use package to transform CDC weeks in dates.
plot(x, y, type = "l", main = "Influenza in United States 1998 - 2019", xlab = " ", ylab = "Estimated influenza cases", xaxt = "n", col = "#000000", lty = 1, lwd = 3, bty = "n")
at.x <- (1:nrow(out))[which(out[,"Week"] == 1)]
lab.x <- out[,"Year"][which(out[,"Week"] == 1)]
axis(1, at = at.x, labels = lab.x)
#II. Peak cases by season
#quartz()
par(mar = c(15, 4, 4, 2))
bplt <- barplot(out2$peak_cases, xaxt = "n", ylab = "Peak number of influenza cases", main = "Peak influenza cases in the United States")
labs <- as.character(out2$season)
text(bplt, par("usr")[3]-0.25, labels = labs, srt = 60, adj = c(1.1,1.1), xpd = TRUE, cex = 1) #this isn't quite right in terms of the label position, note the pandemics are too low.
#III. Total prop infected by season
#quartz()
par(mar = c(15, 4, 4, 2))
bplt2 <- barplot(out2$total_prop, xaxt = "n", ylab = "Total proportion of US infected with influenza", main = "Total proportion infected with influenza in the United States")
labs <- as.character(out2$season)
text(bplt2, 0, labels = labs, srt = 60, adj = c(1.1,1.1), xpd = TRUE, cex = 1) #this isn't quite right in terms of the label position, note the pandemics are too low.
#IV. Total cases by season
#quartz()
hist(out2$total_cases, col = "gray", xlab = "Total US influenza cases", main = "Total number of influenza cases in the US by season")
abline(v = mean(out2$total_cases), col = "red", lwd = 3, lty = 3)
}