From 2cff80d9e8bf2602f8579ac31edd24d734a6cb28 Mon Sep 17 00:00:00 2001 From: Jennit07 <67372904+Jennit07@users.noreply.github.com> Date: Wed, 17 Apr 2024 12:55:41 +0100 Subject: [PATCH] Create tests for social care sandpit extracts (#943) * Update `write_tests_xlsx` * Update documentation * Add in sandpit tests where the extract is saved * Setup tests for sandpit Further checks needed for writing to disk * Update documentation * Amend case_when statement * rename function to include 'sc' * Update documentation * Use `is.null` instead of `missing` * Update documentation * Add `year` as a parameter * Update documentation * Setup for writing sandpit tests to disk * Update parameters for sandpit tests * Update documentation * Use `process_tests_sc_sandpit` * Apply styling * Style code * update documentation Co-authored-by: Zihao Li * Rename variable sc_id Co-authored-by: Zihao Li * Rename variable Co-authored-by: Zihao Li * Rename variable Co-authored-by: Zihao Li * Update documentation * [check-spelling] Update metadata Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/8689503990/attempts/1 Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/943#issuecomment-2056794120 Signed-off-by: check-spelling-bot on-behalf-of: @check-spelling * update spelling * update spelling expect variant --------- Signed-off-by: check-spelling-bot Co-authored-by: Jennit07 Co-authored-by: Zihao Li Co-authored-by: Zihao Li --- .github/actions/spelling/expect.txt | 8 +- NAMESPACE | 2 + R/process_tests_sc_sandpit.R | 144 ++++++++++++++++++++++++++++ R/read_lookup_sc_client.R | 3 + R/read_lookup_sc_demographics.R | 3 + R/read_sc_all_alarms_telecare.R | 3 + R/read_sc_all_care_home.R | 3 + R/read_sc_all_home_care.R | 3 + R/read_sc_all_sds.R | 3 + R/write_tests_xlsx.R | 25 +++-- man/process_tests_sc_sandpit.Rd | 20 ++++ man/produce_sc_sandpit_tests.Rd | 24 +++++ man/write_tests_xlsx.Rd | 2 +- 13 files changed, 228 insertions(+), 15 deletions(-) create mode 100644 R/process_tests_sc_sandpit.R create mode 100644 man/process_tests_sc_sandpit.Rd create mode 100644 man/produce_sc_sandpit_tests.Rd diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index d27b6e755..a4a34a58b 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -12,7 +12,7 @@ aut bedday birthtime bodyloc -BOXI +boxi callr canx carehome @@ -169,8 +169,6 @@ postcodes ppas prac praccode -prac -praccode ptypes purrr quickstart @@ -207,7 +205,7 @@ setkeyv setnafill setnames setorder -siar +Siar sigfac simd slf @@ -246,6 +244,6 @@ xlsx yearstay yml yyyyqx -zihao +Zihao zsav zstd diff --git a/NAMESPACE b/NAMESPACE index 91f6b66d9..4606cf3f2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -145,8 +145,10 @@ export(process_tests_sc_all_hc_episodes) export(process_tests_sc_all_sds_episodes) export(process_tests_sc_client_lookup) export(process_tests_sc_demographics) +export(process_tests_sc_sandpit) export(process_tests_sds) export(produce_episode_file_tests) +export(produce_sc_sandpit_tests) export(produce_source_extract_tests) export(produce_test_comparison) export(read_extract_acute) diff --git a/R/process_tests_sc_sandpit.R b/R/process_tests_sc_sandpit.R new file mode 100644 index 000000000..089f61aa1 --- /dev/null +++ b/R/process_tests_sc_sandpit.R @@ -0,0 +1,144 @@ +#' Process tests for the social care sandpit extracts +#' +#' @param type Name of sandpit extract. +#' +#' @return a [tibble][tibble::tibble-package] containing a test comparison. +#' @export +#' +process_tests_sc_sandpit <- function(type = c("at", "hc", "ch", "sds", "demographics", "client"), year = NULL) { + comparison <- produce_test_comparison( + old_data = produce_sc_sandpit_tests( + read_file(get_sandpit_extract_path(type = {{ type }}, year = year, update = previous_update())), + type = {{ type }} + ), + new_data = produce_sc_sandpit_tests( + read_file(get_sandpit_extract_path(type = {{ type }}, year = year, update = latest_update())), + type = {{ type }} + ) + ) + + comparison %>% + write_tests_xlsx(sheet_name = {{ type }}, year = year, workbook_name = "sandpit") + + return(comparison) +} + + +#' Produce tests for social care sandpit extracts. +#' +#' @param data new or old data for testing summary flags +#' (data is from [get_sandpit_extract_path()]) +#' @param type Name of sandpit extract. +#' +#' @return a dataframe with a count of each flag +#' from [calculate_measures()] +#' @export +#' +produce_sc_sandpit_tests <- function(data, type = c("demographics", "client", "at", "ch", "hc", "sds")) { + if (type == "demographics") { + missing_tests <- data %>% + dplyr::mutate( + n_missing_chi = is_missing(.data$chi_upi), + n_missing_sc_id = is_missing(.data$social_care_id), + n_missing_dob = is.na(.data$chi_date_of_birth), + n_missing_postcode = is_missing(.data$chi_postcode), + n_missing_gender = is_missing(.data$chi_gender_code) + ) %>% + dplyr::select(n_missing_chi:n_missing_gender) %>% + calculate_measures(measure = "sum") + + latest_flag_tests <- data %>% + dplyr::filter(!(is.na(.data$chi_upi))) %>% + dplyr::group_by(.data$chi_upi, .data$sending_location) %>% + dplyr::summarise(latest_count = sum(.data$latest_record_flag)) %>% + dplyr::ungroup() %>% + dplyr::mutate( + chi_latest_flag_0 = dplyr::if_else(.data$latest_count == 0, 1, 0), + chi_latest_flag_1 = dplyr::if_else(.data$latest_count == 1, 1, 0), + chi_latest_flag_2 = dplyr::if_else(.data$latest_count == 2, 1, 0), + chi_latest_flag_3 = dplyr::if_else(.data$latest_count == 3, 1, 0), + chi_latest_flag_4 = dplyr::if_else(.data$latest_count == 4, 1, 0), + chi_latest_flag_5 = dplyr::if_else(.data$latest_count == 5, 1, 0), + chi_latest_flag_6 = dplyr::if_else(.data$latest_count == 6, 1, 0), + chi_latest_flag_7 = dplyr::if_else(.data$latest_count == 7, 1, 0), + chi_latest_flag_8 = dplyr::if_else(.data$latest_count == 8, 1, 0), + chi_latest_flag_9 = dplyr::if_else(.data$latest_count == 9, 1, 0), + chi_latest_flag_10 = dplyr::if_else(.data$latest_count == 10, 1, 0) + ) %>% + dplyr::select(.data$chi_latest_flag_0:.data$chi_latest_flag_10) %>% + calculate_measures(measure = "sum") + + # add a flag for sc ids where there is multiple chi associated + sc_id_multi_chi <- data %>% + dplyr::distinct() %>% + dplyr::filter(!(is.na(.data$chi_upi))) %>% + dplyr::group_by(.data$social_care_id, .data$sending_location) %>% + dplyr::distinct(.data$chi_upi, .keep_all = TRUE) %>% + dplyr::mutate(distinct_chi_count = dplyr::n_distinct(.data$chi_upi)) %>% + dplyr::filter(distinct_chi_count > 1) %>% + dplyr::distinct(.data$social_care_id, .data$sending_location, .keep_all = TRUE) %>% + dplyr::mutate(sc_id_multi_chi = 1) %>% + create_sending_location_test_flags(.data$sending_location) %>% + dplyr::ungroup() %>% + dplyr::rename( + sc_id_multi_chi_Aberdeen_City = Aberdeen_City, + sc_id_multi_chi_Aberdeenshire = Aberdeenshire, + sc_id_multi_chi_Angus = Angus, + sc_id_multi_chi_Argyll_and_Bute = Argyll_and_Bute, + sc_id_multi_chi_City_of_Edinburgh = City_of_Edinburgh, + sc_id_multi_chi_Clackmannanshire = Clackmannanshire, + sc_id_multi_chi_Dumfries_and_Galloway = Dumfries_and_Galloway, + sc_id_multi_chi_Dundee_City = Dundee_City, + sc_id_multi_chi_East_Ayrshire = East_Ayrshire, + sc_id_multi_chi_East_Dunbartonshire = East_Dunbartonshire, + sc_id_multi_chi_East_Lothian = East_Lothian, + sc_id_multi_chi_East_Renfrewshire = East_Renfrewshire, + sc_id_multi_chi_Falkirk = Falkirk, + sc_id_multi_chi_Fife = Fife, + sc_id_multi_chi_Glasgow_City = Glasgow_City, + sc_id_multi_chi_Highland = Highland, + sc_id_multi_chi_Inverclyde = Inverclyde, + sc_id_multi_chi_Midlothian = Midlothian, + sc_id_multi_chi_Moray = Moray, + sc_id_multi_chi_Na_h_Eileanan_Siar = Na_h_Eileanan_Siar, + sc_id_multi_chi_North_Ayrshire = North_Ayrshire, + sc_id_multi_chi_North_Lanarkshire = North_Lanarkshire, + sc_id_multi_chi_Orkney_Islands = Orkney_Islands, + sc_id_multi_chi_Perth_and_Kinross = Perth_and_Kinross, + sc_id_multi_chi_Renfrewshire = Renfrewshire, + sc_id_multi_chi_Scottish_Borders = Scottish_Borders, + sc_id_multi_chi_Shetland_Islands = Shetland_Islands, + sc_id_multi_chi_South_Ayrshire = South_Ayrshire, + sc_id_multi_chi_South_Lanarkshire = South_Lanarkshire, + sc_id_multi_chi_Stirling = Stirling, + sc_id_multi_chi_West_Dunbartonshire = West_Dunbartonshire, + sc_id_multi_chi_West_Lothian = West_Lothian + ) %>% + dplyr::select(.data$sc_id_multi_chi, .data$sc_id_multi_chi_Aberdeen_City:.data$sc_id_multi_chi_West_Lothian) %>% + calculate_measures(measure = "sum") + + output <- list( + missing_tests, + latest_flag_tests, + sc_id_multi_chi + ) %>% + purrr::reduce(dplyr::full_join, by = c("measure", "value")) + + return(output) + } else if (type == "client" | type == "at" | type == "ch" | + type == "hc" | type == "sds") { + output <- data %>% + # create test flags + dplyr::mutate( + unique_sc_id = dplyr::lag(.data$social_care_id) != .data$social_care_id, + n_missing_sc_id = is_missing(.data$social_care_id) + ) %>% + create_sending_location_test_flags(.data$sending_location) %>% + # remove variables that won't be summed + dplyr::select(c("unique_sc_id":"West_Lothian")) %>% + # use function to sum new test flags + calculate_measures(measure = "sum") + + return(output) + } +} diff --git a/R/read_lookup_sc_client.R b/R/read_lookup_sc_client.R index 370a15722..d2b549671 100644 --- a/R/read_lookup_sc_client.R +++ b/R/read_lookup_sc_client.R @@ -80,6 +80,9 @@ read_lookup_sc_client <- function(fyyear, if (!fs::file_exists(get_sandpit_extract_path(type = "client", year = fyyear))) { client_data %>% write_file(get_sandpit_extract_path(type = "client", year = fyyear)) + + client_data %>% + process_tests_sc_sandpit(type = "client", year = fyyear) } else { client_data <- client_data } diff --git a/R/read_lookup_sc_demographics.R b/R/read_lookup_sc_demographics.R index cb3cea3c2..729f3a445 100644 --- a/R/read_lookup_sc_demographics.R +++ b/R/read_lookup_sc_demographics.R @@ -28,6 +28,9 @@ read_lookup_sc_demographics <- function(sc_dvprod_connection = phs_db_connection if (!fs::file_exists(get_sandpit_extract_path(type = "demographics"))) { sc_demog %>% write_file(get_sandpit_extract_path(type = "demographics")) + + sc_demog %>% + process_tests_sc_sandpit(type = "demographics") } else { sc_demog <- sc_demog } diff --git a/R/read_sc_all_alarms_telecare.R b/R/read_sc_all_alarms_telecare.R index 5abd9bc7b..4af57d857 100644 --- a/R/read_sc_all_alarms_telecare.R +++ b/R/read_sc_all_alarms_telecare.R @@ -31,6 +31,9 @@ read_sc_all_alarms_telecare <- function(sc_dvprod_connection = phs_db_connection if (!fs::file_exists(get_sandpit_extract_path(type = "at"))) { at_full_data %>% write_file(get_sandpit_extract_path(type = "at")) + + at_full_data %>% + process_tests_sandpit(type = "at") } else { at_full_data <- at_full_data } diff --git a/R/read_sc_all_care_home.R b/R/read_sc_all_care_home.R index 870a94ded..0e74d6623 100644 --- a/R/read_sc_all_care_home.R +++ b/R/read_sc_all_care_home.R @@ -33,6 +33,9 @@ read_sc_all_care_home <- function(sc_dvprod_connection = phs_db_connection(dsn = if (!fs::file_exists(get_sandpit_extract_path(type = "ch"))) { ch_data %>% write_file(get_sandpit_extract_path(type = "ch")) + + ch_data %>% + process_tests_sandpit(type = "ch") } else { ch_data <- ch_data } diff --git a/R/read_sc_all_home_care.R b/R/read_sc_all_home_care.R index cca2d0a9b..3741785a7 100644 --- a/R/read_sc_all_home_care.R +++ b/R/read_sc_all_home_care.R @@ -50,6 +50,9 @@ read_sc_all_home_care <- function(sc_dvprod_connection = phs_db_connection(dsn = if (!fs::file_exists(get_sandpit_extract_path(type = "hc"))) { home_care_data %>% write_file(get_sandpit_extract_path(type = "hc")) + + home_care_date %>% + process_tests_sandpit(type = "hc") } else { home_care_data <- home_care_data } diff --git a/R/read_sc_all_sds.R b/R/read_sc_all_sds.R index d9d5b8b1d..e184ffaeb 100644 --- a/R/read_sc_all_sds.R +++ b/R/read_sc_all_sds.R @@ -31,6 +31,9 @@ read_sc_all_sds <- function(sc_dvprod_connection = phs_db_connection(dsn = "DVPR if (!fs::file_exists(get_sandpit_extract_path(type = "sds"))) { sds_full_data %>% write_file(get_sandpit_extract_path(type = "sds")) + + sds_full_data %>% + process_tests_sandpit(type = "sds") } else { sds_full_data <- sds_full_data } diff --git a/R/write_tests_xlsx.R b/R/write_tests_xlsx.R index ffe86f48f..d2e1490f2 100644 --- a/R/write_tests_xlsx.R +++ b/R/write_tests_xlsx.R @@ -19,15 +19,22 @@ write_tests_xlsx <- function(comparison_data, sheet_name, year = NULL, - workbook_name = c("ep_file", "indiv_file", "lookup", "extract")) { + workbook_name = c("ep_file", "indiv_file", "lookup", "extract", "sandpit")) { # Set up the workbook ---- - if (workbook_name == "lookup" | missing(year) & workbook_name == "lookup") { - tests_workbook_name <- stringr::str_glue(latest_update(), "_lookups_tests") - } else { + if (is.null(year)) { tests_workbook_name <- dplyr::case_when( workbook_name == "ep_file" ~ stringr::str_glue(latest_update(), "_ep_file_tests"), workbook_name == "indiv_file" ~ stringr::str_glue(latest_update(), "_indiv_file_tests"), + workbook_name == "lookup" ~ stringr::str_glue(latest_update(), "_lookups_tests"), + workbook_name == "sandpit" ~ stringr::str_glue(latest_update(), "_sandpit_extract_tests") + ) + } else if (workbook_name == "sandpit" & !is.null(year)) { + tests_workbook_name <- dplyr::case_when( + workbook_name == "sandpit" ~ stringr::str_glue(latest_update(), "_sandpit_extract_tests") + ) + } else { + tests_workbook_name <- dplyr::case_when( workbook_name == "extract" ~ stringr::str_glue(latest_update(), "_{year}_extract_tests") ) } @@ -92,11 +99,11 @@ write_tests_xlsx <- function(comparison_data, date_today <- stringr::str_to_lower(date_today) - sheet_name_dated <- ifelse( - is.null(year), - stringr::str_glue("{sheet_name}_{date_today}"), - stringr::str_glue("{year}_{sheet_name}_{date_today}") - ) + if (is.null(year)) { + sheet_name_dated <- stringr::str_glue("{sheet_name}_{date_today}") + } else { + sheet_name_dated <- stringr::str_glue("{year}_{sheet_name}_{date_today}") + } # If there has already been a sheet created today, append the time if (sheet_name_dated %in% names(wb)) { diff --git a/man/process_tests_sc_sandpit.Rd b/man/process_tests_sc_sandpit.Rd new file mode 100644 index 000000000..d3c1f5984 --- /dev/null +++ b/man/process_tests_sc_sandpit.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process_tests_sc_sandpit.R +\name{process_tests_sc_sandpit} +\alias{process_tests_sc_sandpit} +\title{Process tests for the social care sandpit extracts} +\usage{ +process_tests_sc_sandpit( + type = c("at", "hc", "ch", "sds", "demographics", "client"), + year = NULL +) +} +\arguments{ +\item{type}{Name of sandpit extract.} +} +\value{ +a \link[tibble:tibble-package]{tibble} containing a test comparison. +} +\description{ +Process tests for the social care sandpit extracts +} diff --git a/man/produce_sc_sandpit_tests.Rd b/man/produce_sc_sandpit_tests.Rd new file mode 100644 index 000000000..4f34d506b --- /dev/null +++ b/man/produce_sc_sandpit_tests.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process_tests_sc_sandpit.R +\name{produce_sc_sandpit_tests} +\alias{produce_sc_sandpit_tests} +\title{Produce tests for social care sandpit extracts.} +\usage{ +produce_sc_sandpit_tests( + data, + type = c("demographics", "client", "at", "ch", "hc", "sds") +) +} +\arguments{ +\item{data}{new or old data for testing summary flags +(data is from \code{\link[=get_sandpit_extract_path]{get_sandpit_extract_path()}})} + +\item{type}{Name of sandpit extract.} +} +\value{ +a dataframe with a count of each flag +from \code{\link[=calculate_measures]{calculate_measures()}} +} +\description{ +Produce tests for social care sandpit extracts. +} diff --git a/man/write_tests_xlsx.Rd b/man/write_tests_xlsx.Rd index c510e2570..0788d0080 100644 --- a/man/write_tests_xlsx.Rd +++ b/man/write_tests_xlsx.Rd @@ -8,7 +8,7 @@ write_tests_xlsx( comparison_data, sheet_name, year = NULL, - workbook_name = c("ep_file", "indiv_file", "lookup", "extract") + workbook_name = c("ep_file", "indiv_file", "lookup", "extract", "sandpit") ) } \arguments{