diff --git a/NAMESPACE b/NAMESPACE index 0e746b8d..d32f60fc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ S3method(head,openair) S3method(names,openair) S3method(plot,openair) +S3method(print,aqStat) S3method(print,openair) S3method(results,default) S3method(results,openair) @@ -10,9 +11,11 @@ S3method(summary,openair) S3method(tail,openair) export(TaylorDiagram) export(TheilSen) +export(aqStat) export(aqStats) export(binData) export(bootMeanDF) +export(calcAQStats) export(calcFno2) export(calcPercentile) export(calendarPlot) diff --git a/R/aqStats.R b/R/aqStats.R index 54088e0c..d0f4bb12 100644 --- a/R/aqStats.R +++ b/R/aqStats.R @@ -1,7 +1,5 @@ #' Calculate summary statistics for air pollution data by year #' -#' Calculate a range of air pollution-relevant statistics by year. -#' #' This function calculates a range of common and air pollution-specific #' statistics from a data frame. The statistics are calculated on an annual #' basis and the input is assumed to be hourly data. The function can cope with @@ -85,6 +83,9 @@ #' ## example is for illustrative purposes only #' aqStats(selectByDate(mydata, year = 2004), pollutant = "no2") #' +#' @seealso [calcAQStats()], for a more verbose yet flexible way of calculating +#' air quality statistics +#' #' aqStats <- function(mydata, pollutant = "no2", diff --git a/R/calcAQStats.R b/R/calcAQStats.R new file mode 100644 index 00000000..d9ff59ab --- /dev/null +++ b/R/calcAQStats.R @@ -0,0 +1,243 @@ +#' Calculate user-defined air quality statistics +#' +#' [calcAQStats()] calculates user-defined annual air-quality statistics +#' constructed using [aqStat()]. This pair of functions allows users to flexibly +#' define air quality statistics and limits relevant to their local air quality +#' legislation and quickly track compliance. +#' +#' @rdname calc-aq-stats +#' @order 1 +#' +#' @param mydata A data frame of time series. Must include a `date` field and +#' any variables required by the provided `aqstats`. +#' @param aqstats A list of [aqStat()] objects to calculate. +#' @param period A time period to calculate statistics for. Currently, only +#' `"year"` is supported, which will calculate annual statistics. +#' @param progress Show a progress bar when large numbers of statistics are +#' being calculated? Defaults to `TRUE`. +#' +#' @author Jack Davison +#' @export +#' +#' @seealso [aqStats()], for a simpler but more prescriptive way of calculating +#' air quality statistics +#' +#' @section Data Transformation Pipeline: +#' +#' [calcAQStats()] does *a lot* in one go, so it is worth outlining the order +#' of proceedings: +#' +#' - First, the data is time-averaged using `avg.time`. This is passed +#' straight to [timeAverage()]. For hourly data and the default `avg.time` of +#' `"hour"`, effectively nothing happens at this stage. +#' +#' - Second, if `roll.width` is specified, a rolling mean is calculated +#' using [rollingMean()]. Typically this should be combined with `avg.time = +#' "hour"` to ensure *hourly* data is rolled. Most likely, `roll.width` will +#' be `8L` (for ozone & carbon monoxide) or `24L` (for particulates). +#' +#' - If `roll.avg.time` is set, the average rolled values will then +#' themselves be averaged. `roll.avg.stat` defaults to `"max"`, which is +#' expected to be the most useful for almost all applications. These options +#' are likely only useful when `stat = "limit"` to compare complex statistics +#' like "daily max rolling 8-hour mean ozone" with a limit value. +#' +#' - Next, the `stat` is used to calculate a `period` (by default, annual) +#' statistic. If `stat != "limit"` this, again, is passed straight to +#' [timeAverage()]. If `stat == "limit"`, each value is checked against the +#' `limit` and the number of values exceeding the limit are returned. +#' +#' +#' +#' @examples +#' # calculate some UK AQ limits +#' calcAQStats( +#' mydata = openair::mydata, +#' aqstats = list( +#' # PM10 limits +#' aqStat("pm10", "limit", "day", limit = 50), +#' aqStat("pm10", "limit", "year", limit = 40), +#' # NO2 limits +#' aqStat("no2", "limit", "hour", limit = 200), +#' aqStat("no2", "limit", "year", limit = 40), +#' # O3 limit +#' aqStat("o3", "limit", "hour", roll = 8L, limit = 100), +#' # SO2 limits +#' aqStat("so2", "limit", "15 min", limit = 266), +#' aqStat("so2", "limit", "hour", limit = 350), +#' aqStat("so2", "limit", "day", limit = 125), +#' # CO limits +#' aqStat("co", "limit", "hour", roll = 8L, roll.avg.time = "day", limit = 10) +#' ), +#' period = "year" +#' ) +calcAQStats <- function(mydata, + aqstats, + period = c("year"), + progress = TRUE) { + period <- match.arg(period) + + # catch passing a single aqstat + if (inherits(aqstats, "aqStat")) { + aqstats <- list(aqstats) + } + + # catch passing things that aren't aqstats + if (!all(purrr::map_vec(test, function(x) + inherits(x, "aqStat")))) { + cli::cli_abort( + c("x" = "Unknown object passed to {.field aqstats}.", + "i" = "Please pass a {.fun list} of objects created by {.fun aqStat}.") + ) + } + + # calculate air quality stats + purrr::map( + aqstats, + ~ calculate_aqstat(mydata, .x, period = period), + .progress = ifelse(progress, "Calculating Statistics", FALSE) + ) %>% + purrr::reduce(function(x, y) + dplyr::left_join(x, y, by = "date")) +} + +#' @rdname calc-aq-stats +#' @order 2 +#' +#' @param pollutant The pollutant (or other named variable) of interest, e.g., +#' `"no2"`. This should align with a column name in the `data.frame` passed to +#' [calcAQStats()]. +#' @param stat The grand statistic to calculate. Simple statistics include +#' `"mean"` (the default), `"min"`, `"median"` or `"max"`. Can also be +#' `"percentile"`, the percentile being defined using the `percentile` option. +#' Finally, can be `"limit"`, which compares values against the `limit` value +#' and returns the number of instances where the value exceeds the limit. +#' @param avg.time An initial averaging time to calculate *before* the `stat` is +#' calculated; passed directly to [timeAverage()]. +#' @param roll.width If an integer value is provided, a rolling mean will be +#' calculated *after* time averaging and before *stat*. For example, to +#' calculate an 8-hour rolling mean set `avg.time = "1 hour", roll = 8L`. +#' @param roll.avg.time,roll.avg.stat If `roll.width` is specified, users can +#' additionally set `roll.avg.time` and `roll.avg.stat` to time average the +#' rolling mean values using [timeAverage()]. This could be useful to +#' calculate a *max* daily running 8 hour mean to compare with an ozone limit, +#' for example. +#' @param limit The limit value, for `stat = "limit"`. +#' @param percentile The percentile value, for `stat = "percentile"`. +#' @param name Optionally change the output column name for this air quality +#' statistic. +#' +#' @author Jack Davison +#' @export +#' +aqStat <- function(pollutant = "no2", + stat = c("mean", "min", "median", "max", "percentile", "limit"), + avg.time = "hour", + limit = NA, + percentile = NA, + roll.width = NA, + roll.avg.time = NA, + roll.avg.stat = "max", + name = NULL) { + stat <- match.arg(stat) + + out <- + list( + pollutant = pollutant, + stat = stat, + avg.time = avg.time, + limit = limit, + percentile = percentile, + roll = roll.width, + roll.avg.time = roll.avg.time, + roll.avg.stat = roll.avg.stat, + name = name + ) + + class(out) <- "aqStat" + + return(out) +} + +#' Helper for calculating air quality statistics +#' @noRd +calculate_aqstat <- function(data, aqstat, period) { + thedata <- dplyr::select(data, dplyr::all_of(c("date", aqstat$pollutant))) + + thedata <- openair::timeAverage(thedata, avg.time = aqstat$avg.time) + + if (!is.na(aqstat$roll)) { + thedata <- + openair::rollingMean( + thedata, + pollutant = aqstat$pollutant, + width = aqstat$roll, + new.name = aqstat$pollutant + ) + + if (!is.na(aqstat$roll.avg.time)) { + thedata <- + openair::timeAverage( + thedata, + pollutant = aqstat$pollutant, + avg.time = aqstat$roll.avg.time, + statistic = aqstat$roll.avg.stat + ) + } + } + + if (aqstat$stat != "limit") { + thedata <- + openair::timeAverage( + thedata, + avg.time = period, + statistic = aqstat$stat, + percentile = aqstat$percentile + ) + } else { + thedata <- + thedata %>% + openair::cutData(period) %>% + dplyr::mutate(date = min(date, na.rm = TRUE), + .by = dplyr::all_of(period)) %>% + dplyr::summarise(calcstat = sum(.data[[aqstat$pollutant]] > aqstat$limit, na.rm = TRUE), + .by = "date") + } + + if (is.null(aqstat$name)) { + newname <- paste(aqstat$pollutant, aqstat$stat, aqstat$avg.time, sep = ".") + if (!is.na(aqstat$roll)) { + if (!is.na(aqstat$roll.avg.time)) { + newname <- paste(newname, paste0(aqstat$roll.avg.stat, aqstat$roll.avg.time, "roll", aqstat$roll), sep = ".") + } else { + newname <- paste(newname, paste0("roll", aqstat$roll), sep = ".") + } + } + if (aqstat$stat == "limit") { + newname <- paste(newname, aqstat$limit, sep = ".") + } + if (aqstat$stat == "percentile") { + newname <- paste(newname, aqstat$percentile, sep = ".") + } + } else { + newname <- aqstat$name + } + + names(thedata)[2] <- newname + + return(thedata) +} + +#' @method print aqStat +#' @author Jack Davison +#' @export +print.aqStat <- function(x) { + cli::cli_h2("Air Quality Statistic") + cli::cli_inform(c("i" = "Please use in {.fun calcAQStats}")) + + cli::cli_h3("Parameters") + purrr::imap(x, ~ paste0("{.strong ", toupper(.y), "}: '", .x, "'")) %>% + purrr::list_c() %>% + cli::cli_inform() +} + diff --git a/_pkgdown.yml b/_pkgdown.yml index b442dcb5..db2c3c1a 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -47,6 +47,11 @@ reference: - linearRelation - calcFno2 - title: Utilities +- subtitle: Air Quality Statistics + desc: Functions for easily calculating air quality statistics and flexibly comparing them with AQ targets. + contents: + - calcAQStats + - aqStats - subtitle: Plot Utilities desc: Tools for refining openair plots. contents: @@ -54,9 +59,8 @@ reference: - quickText - drawOpenKey - subtitle: Data Utilities - desc: Tools for manipulating and summarising air quality data. + desc: Tools for manipulating air quality data. contents: - - aqStats - binData - bootMeanDF - calcPercentile diff --git a/man/aqStats.Rd b/man/aqStats.Rd index efb98ba1..4e31cfa2 100644 --- a/man/aqStats.Rd +++ b/man/aqStats.Rd @@ -42,15 +42,13 @@ have columns for each pollutant-site combination.} \item{...}{Other arguments, currently unused.} } \description{ -Calculate a range of air pollution-relevant statistics by year. -} -\details{ This function calculates a range of common and air pollution-specific statistics from a data frame. The statistics are calculated on an annual basis and the input is assumed to be hourly data. The function can cope with several sites and years e.g. using \code{type = "site"}. The user can control the output by setting \code{transpose} appropriately. - +} +\details{ Note that the input data is assumed to be in mass units e.g. ug/m3 for all species except CO (mg/m3). @@ -107,7 +105,10 @@ can be rounded at several stages during the calculations. ## example is for illustrative purposes only aqStats(selectByDate(mydata, year = 2004), pollutant = "no2") - +} +\seealso{ +\code{\link[=calcAQStats]{calcAQStats()}}, for a more verbose yet flexible way of calculating +air quality statistics } \author{ David Carslaw diff --git a/man/calc-aq-stats.Rd b/man/calc-aq-stats.Rd new file mode 100644 index 00000000..230d04f9 --- /dev/null +++ b/man/calc-aq-stats.Rd @@ -0,0 +1,123 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/calcAQStats.R +\name{calcAQStats} +\alias{calcAQStats} +\alias{aqStat} +\title{Calculate user-defined air quality statistics} +\usage{ +calcAQStats(mydata, aqstats, period = c("year"), progress = TRUE) + +aqStat( + pollutant = "no2", + stat = c("mean", "min", "median", "max", "percentile", "limit"), + avg.time = "hour", + limit = NA, + percentile = NA, + roll.width = NA, + roll.avg.time = NA, + roll.avg.stat = "max", + name = NULL +) +} +\arguments{ +\item{mydata}{A data frame of time series. Must include a \code{date} field and +any variables required by the provided \code{aqstats}.} + +\item{aqstats}{A list of \code{\link[=aqStat]{aqStat()}} objects to calculate.} + +\item{period}{A time period to calculate statistics for. Currently, only +\code{"year"} is supported, which will calculate annual statistics.} + +\item{progress}{Show a progress bar when large numbers of statistics are +being calculated? Defaults to \code{TRUE}.} + +\item{pollutant}{The pollutant (or other named variable) of interest, e.g., +\code{"no2"}. This should align with a column name in the \code{data.frame} passed to +\code{\link[=calcAQStats]{calcAQStats()}}.} + +\item{stat}{The grand statistic to calculate. Simple statistics include +\code{"mean"} (the default), \code{"min"}, \code{"median"} or \code{"max"}. Can also be +\code{"percentile"}, the percentile being defined using the \code{percentile} option. +Finally, can be \code{"limit"}, which compares values against the \code{limit} value +and returns the number of instances where the value exceeds the limit.} + +\item{avg.time}{An initial averaging time to calculate \emph{before} the \code{stat} is +calculated; passed directly to \code{\link[=timeAverage]{timeAverage()}}.} + +\item{limit}{The limit value, for \code{stat = "limit"}.} + +\item{percentile}{The percentile value, for \code{stat = "percentile"}.} + +\item{roll.width}{If an integer value is provided, a rolling mean will be +calculated \emph{after} time averaging and before \emph{stat}. For example, to +calculate an 8-hour rolling mean set \verb{avg.time = "1 hour", roll = 8L}.} + +\item{roll.avg.time, roll.avg.stat}{If \code{roll.width} is specified, users can +additionally set \code{roll.avg.time} and \code{roll.avg.stat} to time average the +rolling mean values using \code{\link[=timeAverage]{timeAverage()}}. This could be useful to +calculate a \emph{max} daily running 8 hour mean to compare with an ozone limit, +for example.} + +\item{name}{Optionally change the output column name for this air quality +statistic.} +} +\description{ +\code{\link[=calcAQStats]{calcAQStats()}} calculates user-defined annual air-quality statistics +constructed using \code{\link[=aqStat]{aqStat()}}. This pair of functions allows users to flexibly +define air quality statistics and limits relevant to their local air quality +legislation and quickly track compliance. +} +\section{Data Transformation Pipeline}{ + + +\code{\link[=calcAQStats]{calcAQStats()}} does \emph{a lot} in one go, so it is worth outlining the order +of proceedings: +\itemize{ +\item First, the data is time-averaged using \code{avg.time}. This is passed +straight to \code{\link[=timeAverage]{timeAverage()}}. For hourly data and the default \code{avg.time} of +\code{"hour"}, effectively nothing happens at this stage. +\item Second, if \code{roll.width} is specified, a rolling mean is calculated +using \code{\link[=rollingMean]{rollingMean()}}. Typically this should be combined with \code{avg.time = "hour"} to ensure \emph{hourly} data is rolled. Most likely, \code{roll.width} will +be \code{8L} (for ozone & carbon monoxide) or \code{24L} (for particulates). +\item If \code{roll.avg.time} is set, the average rolled values will then +themselves be averaged. \code{roll.avg.stat} defaults to \code{"max"}, which is +expected to be the most useful for almost all applications. These options +are likely only useful when \code{stat = "limit"} to compare complex statistics +like "daily max rolling 8-hour mean ozone" with a limit value. +\item Next, the \code{stat} is used to calculate a \code{period} (by default, annual) +statistic. If \code{stat != "limit"} this, again, is passed straight to +\code{\link[=timeAverage]{timeAverage()}}. If \code{stat == "limit"}, each value is checked against the +\code{limit} and the number of values exceeding the limit are returned. +} +} + +\examples{ +# calculate some UK AQ limits +calcAQStats( + mydata = openair::mydata, + aqstats = list( + # PM10 limits + aqStat("pm10", "limit", "day", limit = 50), + aqStat("pm10", "limit", "year", limit = 40), + # NO2 limits + aqStat("no2", "limit", "hour", limit = 200), + aqStat("no2", "limit", "year", limit = 40), + # O3 limit + aqStat("o3", "limit", "hour", roll = 8L, limit = 100), + # SO2 limits + aqStat("so2", "limit", "15 min", limit = 266), + aqStat("so2", "limit", "hour", limit = 350), + aqStat("so2", "limit", "day", limit = 125), + # CO limits + aqStat("co", "limit", "hour", roll = 8L, roll.avg.time = "day", limit = 10) + ), + period = "year" +) +} +\seealso{ +\code{\link[=aqStats]{aqStats()}}, for a simpler but more prescriptive way of calculating +air quality statistics +} +\author{ +Jack Davison +}