diff --git a/DESCRIPTION b/DESCRIPTION index f84e736a..1fdeafd9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: naomi Title: Naomi Model for Subnational HIV Estimates -Version: 2.9.27 +Version: 2.9.28 Authors@R: person(given = "Jeff", family = "Eaton", diff --git a/R/input-time-series.R b/R/input-time-series.R index 2ef2eba5..85df3254 100644 --- a/R/input-time-series.R +++ b/R/input-time-series.R @@ -11,13 +11,17 @@ ##' year, quarter,calendar_quarter and art_current ##' ##' @export -aggregate_art <- function(art, shape) { +aggregate_art <- function(art, shape, drop_geometry = TRUE) { ## Check if shape is object or file path - if (!inherits(shape, "sf")) { - areas <- sf::read_sf(shape) |> sf::st_drop_geometry() + if (drop_geometry) { + if(!inherits(shape, "sf")) { + areas <- sf::read_sf(shape) |> sf::st_drop_geometry() + } else { + areas <- shape |> sf::st_drop_geometry() + } } else { - areas <- shape |> sf::st_drop_geometry() + areas <- shape } ## Check if art is object or file path @@ -29,85 +33,71 @@ aggregate_art <- function(art, shape) { cols_list <- c("art_current", "art_new", "vl_tested_12mos", "vl_suppressed_12mos") cols_keep <- intersect(cols_list, colnames(art)) - art <- art |> + # make sure the ART data is the correct shape + clean_art <- art |> dplyr::select(area_id, sex, age_group, calendar_quarter, dplyr::any_of(cols_list)) - art_number <- art |> - dplyr::left_join(areas |> dplyr::select(area_id, area_level), by = "area_id") - - aggregate_art_by_level <- function(art_number, level) { - - ## Recursively aggregate ART data up from lowest level of programme data provided - # Levels to aggregate up from - max_dat <- dplyr::filter(art_number, area_level == level) - max_shape <- dplyr::filter(areas, area_level == level) - - # Ensure entries exist for all programme data age/sex/quarter combinations X - # shape file area_ids at finest stratification - - age_sex_df <- max_dat |> - dplyr::group_by(sex, age_group, calendar_quarter) |> - dplyr::summarise(.groups = "drop") - - art_full <- tidyr::crossing(area_id = unique(max_shape$area_id), - age_sex_df) |> - dplyr::left_join(max_dat, by = c("area_id", "sex", "age_group", "calendar_quarter")) |> - dplyr::mutate(area_level = level) - - - art_number_wide <- spread_areas(areas |> dplyr::filter(area_level <= level)) |> - dplyr::right_join(art_full, by = "area_id", multiple = "all") - - - # Function to aggregate based on area_id[0-9]$ columns in hierarchy - aggregate_data_art <- function(col_name) { - - max_col_name <- paste0("area_id", level) - - if (col_name == max_col_name) { - # Don't aggregate lowest level of data to retain missing values - df <- art_number_wide |> dplyr::select(area_id, sex, age_group, calendar_quarter, - all_of(cols_keep)) - } else { - - df <- art_number_wide |> - dplyr::group_by(!!col_name := .data[[col_name]], sex, age_group, calendar_quarter) |> - dplyr::summarise_at(dplyr::vars(dplyr::all_of(cols_keep)), ~sum(., na.rm = TRUE), - .groups = "drop") |> - dplyr::rename_with(~gsub("[0-9]$", "", .)) - - } - - } - - - # Aggregated data frame for area levels > data provided - aggregate_cols <- grep("^area_id*\\s*[0-9]$", colnames(art_number_wide), value = TRUE) - - aggregate_cols |> - lapply(function(x) aggregate_data_art(x)) |> - dplyr::bind_rows() |> - dplyr::ungroup() |> - dplyr::bind_rows() - } - - max_levels <- art_number |> dplyr::summarise(area_level = max(area_level), - .by = "calendar_quarter") + # get all combinations of all stratifications: + # > every calendar_quarter will have all age_groups present within that + # year in the data. If only one row is missing age_group "Y000_014" within + # a given calendar_quarter then we fill it in with NAs. If all rows within + # a calendar_quarter are missing "Y000_014" then we omit it + # > within a calendar_quarter and age_group we have consistent sex values which + # are either "both" or one of "male" and "female". Validation of sex values is + # done elsewhere + all_strat_art <- clean_art |> + dplyr::group_by(calendar_quarter, age_group) |> + dplyr::reframe(sex = unique(dplyr::cur_data()$sex)) |> + dplyr::ungroup() + + # this gets the max area_level per quarter so we can get a complete list of + # area_ids within that area level and left join later + quarter_by_area_level <- clean_art |> + dplyr::left_join(dplyr::select(areas, area_id, area_level), by = "area_id") |> + dplyr::group_by(calendar_quarter) |> + dplyr::summarise(area_level = max(area_level), .groups = "drop") + + # combine all_strat_art with quarter_by_area_level and clean_art to get a + # complete data with all stratification combinations and area_ids for level + # with NAs where data was omitted in the CSV + agg_art <- all_strat_art |> + dplyr::left_join(quarter_by_area_level, by = "calendar_quarter") |> + dplyr::left_join(dplyr::select(areas, area_id, area_level), by = "area_level") |> + dplyr::left_join(clean_art, by = c("area_id", "calendar_quarter", "age_group", "sex")) + + area_with_parent_ids <- areas |> + dplyr::select(area_id, parent_area_id) + + max_area_level <- max(agg_art$area_level) + + # every iteration of the loop aggregates the area up one area_level and + # adds all those rows to agg_art + for (level in max_area_level:1) { + agg_art <- agg_art |> + dplyr::filter(area_level == level) |> + dplyr::left_join(area_with_parent_ids, by = "area_id") |> + dplyr::group_by(parent_area_id, calendar_quarter, sex, age_group) |> + # > area_level is one less since we are aggregating to the parent + # > sum across all the other numeric values ignoring NAs + dplyr::summarise( + area_level = level - 1, + dplyr::across(dplyr::any_of(cols_list), ~sum(.x, na.rm = TRUE)), + .groups = "drop" + ) |> + dplyr::rename(area_id = parent_area_id) |> + dplyr::bind_rows(agg_art) + } - ## Note we can have the case here that different calendar quarters have - ## a different max level e.g. Mozambique - art_long <- lapply(unique(max_levels$area_level), - function(level) aggregate_art_by_level(art_number, level)) |> - dplyr::bind_rows() |> + # add in extra columns and sort + art_long <- agg_art |> + dplyr::left_join( + areas |> dplyr::select( + area_id, area_name, area_level_label, parent_area_id, area_sort_order + ), by = "area_id" + ) |> dplyr::mutate(year = calendar_quarter_to_year(calendar_quarter), quarter = calendar_quarter_to_quarter(calendar_quarter), time_period = paste0(year, " ", quarter)) |> - dplyr::left_join( - areas |> - dplyr::select(area_id, area_name, area_level, - area_level_label, parent_area_id, area_sort_order), - by = c("area_id") - ) |> dplyr::select(area_id, area_level, area_name, area_level_label,parent_area_id, area_sort_order, sex, age_group,time_period, year, quarter, calendar_quarter, dplyr::everything()) |> @@ -115,7 +105,6 @@ aggregate_art <- function(art, shape) { art_long$area_hierarchy <- build_hierarchy_label(art_long) art_long - } @@ -144,7 +133,7 @@ prepare_input_time_series_art <- function(art, shape) { ## Recursively aggregate ART data up from lowest level of programme data provided # Levels to aggregate up from - art_long <- aggregate_art(art, shape) + art_long <- aggregate_art(art, areas, drop_geometry = FALSE) sex_level <- unique(art_long$sex) age_level <- unique(art_long$age_group) admin_level <- max(art_long$area_level) @@ -264,36 +253,42 @@ prepare_input_time_series_art <- function(art, shape) { dplyr::arrange(area_sort_order, calendar_quarter) # Tag data with NAs at the lowest admin level - area_level <- max(art_plot_data_long$area_level) - - hierarchy_wide <- spread_areas(areas |> dplyr::filter(area_level <= admin_level)) |> - dplyr::select(dplyr::starts_with("area_id")) - + art_level <- max(art_plot_data_long$area_level) + # initialise missing_map with values that are missing, these will only show up + # at the max admin level per calendar_quarter missing_map <- art_plot_data_long |> - # For edge cases where data is provided at different admin levels for - # different years (get max area level by year) - dplyr::select(calendar_quarter, area_level) |> - dplyr::group_by(calendar_quarter) |> - dplyr::summarise(area_level = max(area_level)) |> - # Select lowest admin level for each year - dplyr::left_join(art_plot_data_long |> - dplyr::select(area_id, area_name, area_level, calendar_quarter, plot, value), - multiple = "all", by = dplyr::join_by(calendar_quarter, area_level)) |> - # Joint to wide hierarchy - dplyr::left_join(hierarchy_wide, by = dplyr::join_by(area_id)) |> - # Filter for districts with missing value - dplyr::filter(is.na(value)) |> - dplyr::select(missing = area_id, dplyr::everything()) |> - # Get into long format - tidyr::pivot_longer(cols = dplyr::starts_with("area_id"), values_to = "area_id") |> - # Aggregate missing observations by area_id at all levels - dplyr::group_by(area_id, plot, calendar_quarter) |> - dplyr::summarise(missing_ids = list(missing), .groups = "drop") - + dplyr::select(area_id, calendar_quarter, value, plot, area_level) |> + # find NAs, also check it isn't a NaN because these can appear in some + # derived columns where we divide by 0 + dplyr::filter(is.na(value) & !is.nan(value)) |> + dplyr::select(-value) |> + dplyr::mutate(missing_ids = as.list(area_id)) + + area_with_parent_ids <- areas |> + dplyr::select(area_id, parent_area_id) + + # same idea as in aggregate_art, every iteration of the loop aggregates + # up one admin level + for (level in art_level:1) { + missing_map <- missing_map |> + dplyr::filter(area_level == level) |> + dplyr::left_join(area_with_parent_ids, by = "area_id") |> + dplyr::group_by(parent_area_id, calendar_quarter, plot) |> + # > missing_ids merge the two lists together + # > area_level decrease area_level by one because we aggregate + # up to the parent + dplyr::summarise( + missing_ids = list(unlist(missing_ids, FALSE, FALSE)), + area_level = level - 1, + .groups = "drop" + ) |> + dplyr::rename(area_id = parent_area_id) |> + dplyr::bind_rows(missing_map) + } df_final <- art_plot_data_long |> - dplyr::left_join(missing_map, by = dplyr::join_by(area_id, calendar_quarter, plot)) |> + dplyr::left_join(missing_map, by = dplyr::join_by(area_id, calendar_quarter, plot, area_level)) |> dplyr::mutate(value = tidyr::replace_na(value, 0), missing = NULL) return(df_final) @@ -307,6 +302,8 @@ prepare_input_time_series_art <- function(art, shape) { ##' ##' @param anc Path to file containing ANC data or ANC data object ##' @param shape Path to file containing geojson areas data or areas data object +##' @param drop_geometry Setting this to FALSE will skip dropping geometry on +##' the shape file (default TRUE) ##' ##' @return Aggregated ANC data containing columns area_id, area_name, area_level, ##' area_level_label, sex,age_group, time_period, year, quarter, calendar_quarter, @@ -314,16 +311,20 @@ prepare_input_time_series_art <- function(art, shape) { ##' births_clients_ratio ##' @export -aggregate_anc <- function(anc, shape) { +aggregate_anc <- function(anc, shape, drop_geometry = TRUE) { ## Recursively aggregate ANC data up from lowest level of programm data provided # Level to aggregate from ## Check if shape is object or file path - if(!inherits(shape, "sf")) { - areas <- sf::read_sf(shape) |> sf::st_drop_geometry() + if (drop_geometry) { + if(!inherits(shape, "sf")) { + areas <- sf::read_sf(shape) |> sf::st_drop_geometry() + } else { + areas <- shape |> sf::st_drop_geometry() + } } else { - areas <- shape |> sf::st_drop_geometry() + areas <- shape } ## Check if anc is object or file path @@ -336,79 +337,67 @@ aggregate_anc <- function(anc, shape) { "anc_tested", "anc_tested_pos", "anc_known_neg", "births_facility") cols_keep <- intersect(cols_list, colnames(anc)) - anc <- anc |> + # make sure the ANC data is the correct shape, note that all the + # variables in cols_list are requried expect births_facility + clean_anc <- anc |> dplyr::select(area_id, age_group, year, dplyr::any_of(cols_list)) - anc_testing <- anc |> - dplyr::left_join( dplyr::select(areas, area_id, area_level), by = "area_id") - - # Split data by year and aggregate from lowest level available - anc_dat <- split(anc_testing , f = anc_testing$year) - - aggregate_anc_by_level <- function(anc_testing){ - - ## Recursively aggregate ANC data up from lowest level of programme data provided - # Level to aggregate from - anc_level <- max(anc_testing$area_level) - max_dat <- dplyr::filter(anc_testing, area_level == anc_level) - max_shape <- dplyr::filter(areas, area_level == anc_level) - - - # Ensure entries exist for all programme data age/sex/quarter combinations X - # shape file area_ids at finest stratification - anc_full <- tidyr::crossing(area_id = unique(max_shape$area_id), - age_group = unique(max_dat$age_group), - year = unique(max_dat$year)) |> - dplyr::left_join(max_dat, by = c("area_id", "age_group", "year")) - - # Join ANC data to hierarchy - anc_testing_wide <- areas |> - dplyr::filter(area_level <= anc_level) |> - spread_areas() |> - dplyr::right_join(anc_full, by = "area_id", multiple = "all") - - - # Function to aggregate based on area_id[0-9]$ columns in hierarchy - aggregate_data_anc <- function(col_name) { - - max_admin <- paste0("area_id", anc_level) - - if(col_name == max_admin) { - # Don't aggregate lowest level of data to retain missing values - df <- anc_testing_wide |> dplyr::select(area_id, age_group, year, - all_of(cols_keep)) - } else { - - df <- anc_testing_wide |> - dplyr::group_by(!!col_name := .data[[col_name]], age_group, year) |> - dplyr::summarise_at(dplyr::vars(dplyr::all_of(cols_keep)), ~sum(., na.rm = TRUE), - .groups = "drop") |> - dplyr::rename_with(~gsub("[0-9]$", "", .)) - } - } - - # Aggregated data frame for area levels - aggregate_cols <- grep("^area_id*\\s*[0-9]$", colnames(anc_testing_wide), value = TRUE) - - aggregated_anc <- aggregate_cols |> - lapply(aggregate_data_anc) |> - dplyr::bind_rows() |> - dplyr::ungroup() |> - dplyr::bind_rows() + # initialise aggregated anc with the max admin level (most fine grained) + # and fill in and missing rows - if a year has max admin level n (these can be + # different per year) then we fill in values for any missing areas at admin + # level n with NAs + agg_anc <- clean_anc |> + dplyr::left_join(dplyr::select(areas, area_id, area_level), by = "area_id") |> + dplyr::group_by(year) |> + # summarise to table with columns year, area_level (max area level for this year) + # and age_group + dplyr::summarize( + area_level = dplyr::first(area_level), + age_group = dplyr::first(age_group) + ) |> + # expand each year row to multiple rows with all area_ids for that admin level + # this is the complete list of area_ids that we need + dplyr::left_join( + areas |> dplyr::select(area_id, area_level), by = "area_level" + ) |> + # left join complete list of area_ids with our potentially missing area_ids in + # clean_anc to get rows of NAs if an area_id is missing + dplyr::left_join(clean_anc, by = c("year", "area_id", "age_group")) |> + dplyr::select(area_id, age_group, year, area_level, dplyr::any_of(cols_list)) + + area_with_parent_ids <- areas |> + dplyr::select(area_id, parent_area_id) + + max_area_level <- max(agg_anc$area_level) + + # every iteration of the loop aggregates the area up one area_level and + # adds all those rows to agg_anc + for (level in max_area_level:1) { + agg_anc <- agg_anc |> + dplyr::filter(area_level == level) |> + dplyr::left_join(area_with_parent_ids, by = "area_id") |> + dplyr::group_by(parent_area_id, year, age_group) |> + # > area_level is one less since we are aggregating to the parent + # > sum across all the other numeric values ignoring NAs + dplyr::summarise( + area_level = level - 1, + dplyr::across(dplyr::any_of(cols_list), ~sum(.x, na.rm = TRUE)), + .groups = "drop" + ) |> + dplyr::rename(area_id = parent_area_id) |> + dplyr::bind_rows(agg_anc) } - anc_long <- lapply(anc_dat, aggregate_anc_by_level) |> - dplyr::bind_rows() |> + # add in extra columns and sort + anc_long <- agg_anc |> + dplyr::left_join( + areas |> dplyr::select( + area_id, area_name, area_level_label, area_sort_order, parent_area_id + ), + by = "area_id" + ) |> dplyr::mutate(time_period = as.character(year), quarter = "Q4", sex = "female", calendar_quarter = paste0("CY", time_period, quarter)) |> - dplyr::left_join(areas |> - dplyr::select(area_id, area_name, area_level,area_level_label, - parent_area_id, area_sort_order), - by = "area_id") |> - dplyr::select(area_id, area_name, area_level, area_level_label,parent_area_id, - area_sort_order, sex, age_group, time_period, year, quarter, - calendar_quarter, dplyr::all_of(cols_keep)) |> - dplyr::ungroup() |> dplyr::arrange(year, area_sort_order) anc_long$area_hierarchy <- build_hierarchy_label(anc_long) @@ -439,7 +428,7 @@ prepare_input_time_series_anc <- function(anc, shape) { } ## Shape data for plot - anc_long <- aggregate_anc(anc, shape) + anc_long <- aggregate_anc(anc, areas, drop_geometry = FALSE) anc_plot_data_long <- anc_long |> dplyr::mutate( @@ -449,7 +438,7 @@ prepare_input_time_series_anc <- function(anc, shape) { anc_art_among_known = anc_already_art / anc_known_pos, anc_art_coverage = anc_already_art / anc_total_pos, births_clients_ratio = births_facility / anc_clients - ) |> + ) |> dplyr::select(area_id, area_name, area_level, area_level_label, parent_area_id, area_sort_order, age_group, time_period, year, quarter, calendar_quarter, anc_clients, anc_tested, anc_tested_pos, @@ -463,31 +452,41 @@ prepare_input_time_series_anc <- function(anc, shape) { # Tag data with NAs at the lowest admin level anc_level <- max(anc_plot_data_long$area_level) - hierarchy_wide <- spread_areas(areas |> dplyr::filter(area_level <= anc_level)) + # initialise missing_map with values that are missing, these will only show up + # at the max admin level per year missing_map <- anc_plot_data_long |> - # For edge cases where data is provided at different admin levels for - # different years (get max area level by year) - dplyr::select(calendar_quarter, area_level) |> - dplyr::group_by(calendar_quarter) |> - dplyr::summarise(area_level = max(area_level)) |> - # Select lowest admin level for each year - dplyr::left_join(anc_plot_data_long |> - dplyr::select(area_id, area_name, area_level, calendar_quarter, plot, value), - multiple = "all", by = dplyr::join_by(calendar_quarter, area_level)) |> - # Joint to wide hierarchy - dplyr::left_join(hierarchy_wide, by = dplyr::join_by(area_id)) |> - # Filter for districts with missing value - dplyr::filter(is.na(value)) |> - dplyr::select(missing = area_id, dplyr::everything()) |> - # Get into long format - tidyr::pivot_longer(cols = dplyr::starts_with("area_id"), values_to = "area_id") |> - # Aggregate missing observations by area_id at all levels - dplyr::group_by(area_id, plot, calendar_quarter) |> - dplyr::summarise(missing_ids = list(missing), .groups = "drop") + dplyr::select(area_id, year, value, plot, area_level) |> + # find NAs, also check it isn't a NaN because these can appear in some + # derived columns where we divide by 0 + dplyr::filter(is.na(value) & !is.nan(value)) |> + dplyr::select(-value) |> + dplyr::mutate(missing_ids = as.list(area_id)) + + area_with_parent_ids <- areas |> + dplyr::select(area_id, parent_area_id) + + # same idea as in aggregate_anc, every iteration of the loop aggregates + # up one admin level + for (level in anc_level:1) { + missing_map <- missing_map |> + dplyr::filter(area_level == level) |> + dplyr::left_join(area_with_parent_ids, by = "area_id") |> + dplyr::group_by(parent_area_id, year, plot) |> + # > missing_ids merge the two lists together + # > area_level decrease area_level by one because we aggregate + # up to the parent + dplyr::summarise( + missing_ids = list(unlist(missing_ids, FALSE, FALSE)), + area_level = level - 1, + .groups = "drop" + ) |> + dplyr::rename(area_id = parent_area_id) |> + dplyr::bind_rows(missing_map) + } df_final <- anc_plot_data_long |> - dplyr::left_join(missing_map, by = dplyr::join_by(area_id, calendar_quarter, plot)) |> + dplyr::left_join(missing_map, by = dplyr::join_by(area_id, year, plot, area_level)) |> dplyr::mutate(value = tidyr::replace_na(value, 0), missing = NULL) return(df_final) diff --git a/tests/testthat/test-input-time-series.R b/tests/testthat/test-input-time-series.R index 1d5542f8..196664e6 100644 --- a/tests/testthat/test-input-time-series.R +++ b/tests/testthat/test-input-time-series.R @@ -152,7 +152,7 @@ test_that("ART data can be aggregated when avalible at different admin levels", dplyr::group_by(area_level_label) %>% dplyr::summarise(n = dplyr::n(), .groups = "drop") - expect_equal(check1$n, c(10, 10, 6)) + expect_equal(check3$n, c(10, 10, 6)) # (4) Test that ART data can be aggregated with missing records