Create tests for social care sandpit extracts (#943)

* Update `write_tests_xlsx` * Update documentation * Add in sandpit tests where the extract is saved * Setup tests for sandpit Further checks needed for writing to disk * Update documentation * Amend case_when statement * rename function to include 'sc' * Update documentation * Use `is.null` instead of `missing` * Update documentation * Add `year` as a parameter * Update documentation * Setup for writing sandpit tests to disk * Update parameters for sandpit tests * Update documentation * Use `process_tests_sc_sandpit` * Apply styling * Style code * update documentation Co-authored-by: Zihao Li <[email protected]> * Rename variable sc_id Co-authored-by: Zihao Li <[email protected]> * Rename variable Co-authored-by: Zihao Li <[email protected]> * Rename variable Co-authored-by: Zihao Li <[email protected]> * Update documentation * [check-spelling] Update metadata Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/8689503990/attempts/1 Accepted in #943 (comment) Signed-off-by: check-spelling-bot <[email protected]> on-behalf-of: @check-spelling <[email protected]> * update spelling * update spelling expect variant --------- Signed-off-by: check-spelling-bot <[email protected]> Co-authored-by: Jennit07 <[email protected]> Co-authored-by: Zihao Li <[email protected]> Co-authored-by: Zihao Li <[email protected]>
Public-Health-Scotland · Apr 17, 2024 · 2cff80d · 2cff80d
1 parent a0ac030
commit 2cff80d
Show file tree

Hide file tree

Showing 13 changed files with 228 additions and 15 deletions.
diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
@@ -12,7 +12,7 @@ aut
 bedday
 birthtime
 bodyloc
-BOXI
+boxi
 callr
 canx
 carehome
@@ -169,8 +169,6 @@ postcodes
 ppas
 prac
 praccode
-prac
-praccode
 ptypes
 purrr
 quickstart
@@ -207,7 +205,7 @@ setkeyv
 setnafill
 setnames
 setorder
-siar
+Siar
 sigfac
 simd
 slf
@@ -246,6 +244,6 @@ xlsx
 yearstay
 yml
 yyyyqx
-zihao
+Zihao
 zsav
 zstd
diff --git a/NAMESPACE b/NAMESPACE
@@ -145,8 +145,10 @@ export(process_tests_sc_all_hc_episodes)
 export(process_tests_sc_all_sds_episodes)
 export(process_tests_sc_client_lookup)
 export(process_tests_sc_demographics)
+export(process_tests_sc_sandpit)
 export(process_tests_sds)
 export(produce_episode_file_tests)
+export(produce_sc_sandpit_tests)
 export(produce_source_extract_tests)
 export(produce_test_comparison)
 export(read_extract_acute)

diff --git a/R/process_tests_sc_sandpit.R b/R/process_tests_sc_sandpit.R
@@ -0,0 +1,144 @@
+#' Process tests for the social care sandpit extracts
+#'
+#' @param type Name of sandpit extract.
+#'
+#' @return a [tibble][tibble::tibble-package] containing a test comparison.
+#' @export
+#'
+process_tests_sc_sandpit <- function(type = c("at", "hc", "ch", "sds", "demographics", "client"), year = NULL) {
+  comparison <- produce_test_comparison(
+    old_data = produce_sc_sandpit_tests(
+      read_file(get_sandpit_extract_path(type = {{ type }}, year = year, update = previous_update())),
+      type = {{ type }}
+    ),
+    new_data = produce_sc_sandpit_tests(
+      read_file(get_sandpit_extract_path(type = {{ type }}, year = year, update = latest_update())),
+      type = {{ type }}
+    )
+  )
+
+  comparison %>%
+    write_tests_xlsx(sheet_name = {{ type }}, year = year, workbook_name = "sandpit")
+
+  return(comparison)
+}
+
+
+#' Produce tests for social care sandpit extracts.
+#'
+#' @param data new or old data for testing summary flags
+#' (data is from [get_sandpit_extract_path()])
+#' @param type Name of sandpit extract.
+#'
+#' @return a dataframe with a count of each flag
+#' from [calculate_measures()]
+#' @export
+#'
+produce_sc_sandpit_tests <- function(data, type = c("demographics", "client", "at", "ch", "hc", "sds")) {
+  if (type == "demographics") {
+    missing_tests <- data %>%
+      dplyr::mutate(
+        n_missing_chi = is_missing(.data$chi_upi),
+        n_missing_sc_id = is_missing(.data$social_care_id),
+        n_missing_dob = is.na(.data$chi_date_of_birth),
+        n_missing_postcode = is_missing(.data$chi_postcode),
+        n_missing_gender = is_missing(.data$chi_gender_code)
+      ) %>%
+      dplyr::select(n_missing_chi:n_missing_gender) %>%
+      calculate_measures(measure = "sum")
+
+    latest_flag_tests <- data %>%
+      dplyr::filter(!(is.na(.data$chi_upi))) %>%
+      dplyr::group_by(.data$chi_upi, .data$sending_location) %>%
+      dplyr::summarise(latest_count = sum(.data$latest_record_flag)) %>%
+      dplyr::ungroup() %>%
+      dplyr::mutate(
+        chi_latest_flag_0 = dplyr::if_else(.data$latest_count == 0, 1, 0),
+        chi_latest_flag_1 = dplyr::if_else(.data$latest_count == 1, 1, 0),
+        chi_latest_flag_2 = dplyr::if_else(.data$latest_count == 2, 1, 0),
+        chi_latest_flag_3 = dplyr::if_else(.data$latest_count == 3, 1, 0),
+        chi_latest_flag_4 = dplyr::if_else(.data$latest_count == 4, 1, 0),
+        chi_latest_flag_5 = dplyr::if_else(.data$latest_count == 5, 1, 0),
+        chi_latest_flag_6 = dplyr::if_else(.data$latest_count == 6, 1, 0),
+        chi_latest_flag_7 = dplyr::if_else(.data$latest_count == 7, 1, 0),
+        chi_latest_flag_8 = dplyr::if_else(.data$latest_count == 8, 1, 0),
+        chi_latest_flag_9 = dplyr::if_else(.data$latest_count == 9, 1, 0),
+        chi_latest_flag_10 = dplyr::if_else(.data$latest_count == 10, 1, 0)
+      ) %>%
+      dplyr::select(.data$chi_latest_flag_0:.data$chi_latest_flag_10) %>%
+      calculate_measures(measure = "sum")
+
+    # add a flag for sc ids where there is multiple chi associated
+    sc_id_multi_chi <- data %>%
+      dplyr::distinct() %>%
+      dplyr::filter(!(is.na(.data$chi_upi))) %>%
+      dplyr::group_by(.data$social_care_id, .data$sending_location) %>%
+      dplyr::distinct(.data$chi_upi, .keep_all = TRUE) %>%
+      dplyr::mutate(distinct_chi_count = dplyr::n_distinct(.data$chi_upi)) %>%
+      dplyr::filter(distinct_chi_count > 1) %>%
+      dplyr::distinct(.data$social_care_id, .data$sending_location, .keep_all = TRUE) %>%
+      dplyr::mutate(sc_id_multi_chi = 1) %>%
+      create_sending_location_test_flags(.data$sending_location) %>%
+      dplyr::ungroup() %>%
+      dplyr::rename(
+        sc_id_multi_chi_Aberdeen_City = Aberdeen_City,
+        sc_id_multi_chi_Aberdeenshire = Aberdeenshire,
+        sc_id_multi_chi_Angus = Angus,
+        sc_id_multi_chi_Argyll_and_Bute = Argyll_and_Bute,
+        sc_id_multi_chi_City_of_Edinburgh = City_of_Edinburgh,
+        sc_id_multi_chi_Clackmannanshire = Clackmannanshire,
+        sc_id_multi_chi_Dumfries_and_Galloway = Dumfries_and_Galloway,
+        sc_id_multi_chi_Dundee_City = Dundee_City,
+        sc_id_multi_chi_East_Ayrshire = East_Ayrshire,
+        sc_id_multi_chi_East_Dunbartonshire = East_Dunbartonshire,
+        sc_id_multi_chi_East_Lothian = East_Lothian,
+        sc_id_multi_chi_East_Renfrewshire = East_Renfrewshire,
+        sc_id_multi_chi_Falkirk = Falkirk,
+        sc_id_multi_chi_Fife = Fife,
+        sc_id_multi_chi_Glasgow_City = Glasgow_City,
+        sc_id_multi_chi_Highland = Highland,
+        sc_id_multi_chi_Inverclyde = Inverclyde,
+        sc_id_multi_chi_Midlothian = Midlothian,
+        sc_id_multi_chi_Moray = Moray,
+        sc_id_multi_chi_Na_h_Eileanan_Siar = Na_h_Eileanan_Siar,
+        sc_id_multi_chi_North_Ayrshire = North_Ayrshire,
+        sc_id_multi_chi_North_Lanarkshire = North_Lanarkshire,
+        sc_id_multi_chi_Orkney_Islands = Orkney_Islands,
+        sc_id_multi_chi_Perth_and_Kinross = Perth_and_Kinross,
+        sc_id_multi_chi_Renfrewshire = Renfrewshire,
+        sc_id_multi_chi_Scottish_Borders = Scottish_Borders,
+        sc_id_multi_chi_Shetland_Islands = Shetland_Islands,
+        sc_id_multi_chi_South_Ayrshire = South_Ayrshire,
+        sc_id_multi_chi_South_Lanarkshire = South_Lanarkshire,
+        sc_id_multi_chi_Stirling = Stirling,
+        sc_id_multi_chi_West_Dunbartonshire = West_Dunbartonshire,
+        sc_id_multi_chi_West_Lothian = West_Lothian
+      ) %>%
+      dplyr::select(.data$sc_id_multi_chi, .data$sc_id_multi_chi_Aberdeen_City:.data$sc_id_multi_chi_West_Lothian) %>%
+      calculate_measures(measure = "sum")
+
+    output <- list(
+      missing_tests,
+      latest_flag_tests,
+      sc_id_multi_chi
+    ) %>%
+      purrr::reduce(dplyr::full_join, by = c("measure", "value"))
+
+    return(output)
+  } else if (type == "client" | type == "at" | type == "ch" |
+    type == "hc" | type == "sds") {
+    output <- data %>%
+      # create test flags
+      dplyr::mutate(
+        unique_sc_id = dplyr::lag(.data$social_care_id) != .data$social_care_id,
+        n_missing_sc_id = is_missing(.data$social_care_id)
+      ) %>%
+      create_sending_location_test_flags(.data$sending_location) %>%
+      # remove variables that won't be summed
+      dplyr::select(c("unique_sc_id":"West_Lothian")) %>%
+      # use function to sum new test flags
+      calculate_measures(measure = "sum")
+
+    return(output)
+  }
+}
diff --git a/R/read_lookup_sc_client.R b/R/read_lookup_sc_client.R
@@ -80,6 +80,9 @@ read_lookup_sc_client <- function(fyyear,
   if (!fs::file_exists(get_sandpit_extract_path(type = "client", year = fyyear))) {
     client_data %>%
       write_file(get_sandpit_extract_path(type = "client", year = fyyear))
+
+    client_data %>%
+      process_tests_sc_sandpit(type = "client", year = fyyear)
   } else {
     client_data <- client_data
   }

diff --git a/R/read_lookup_sc_demographics.R b/R/read_lookup_sc_demographics.R
@@ -28,6 +28,9 @@ read_lookup_sc_demographics <- function(sc_dvprod_connection = phs_db_connection
   if (!fs::file_exists(get_sandpit_extract_path(type = "demographics"))) {
     sc_demog %>%
       write_file(get_sandpit_extract_path(type = "demographics"))
+
+    sc_demog %>%
+      process_tests_sc_sandpit(type = "demographics")
   } else {
     sc_demog <- sc_demog
   }

diff --git a/R/read_sc_all_alarms_telecare.R b/R/read_sc_all_alarms_telecare.R
@@ -31,6 +31,9 @@ read_sc_all_alarms_telecare <- function(sc_dvprod_connection = phs_db_connection
   if (!fs::file_exists(get_sandpit_extract_path(type = "at"))) {
     at_full_data %>%
       write_file(get_sandpit_extract_path(type = "at"))
+
+    at_full_data %>%
+      process_tests_sandpit(type = "at")
   } else {
     at_full_data <- at_full_data
   }

diff --git a/R/read_sc_all_care_home.R b/R/read_sc_all_care_home.R
@@ -33,6 +33,9 @@ read_sc_all_care_home <- function(sc_dvprod_connection = phs_db_connection(dsn =
   if (!fs::file_exists(get_sandpit_extract_path(type = "ch"))) {
     ch_data %>%
       write_file(get_sandpit_extract_path(type = "ch"))
+
+    ch_data %>%
+      process_tests_sandpit(type = "ch")
   } else {
     ch_data <- ch_data
   }

diff --git a/R/read_sc_all_home_care.R b/R/read_sc_all_home_care.R
@@ -50,6 +50,9 @@ read_sc_all_home_care <- function(sc_dvprod_connection = phs_db_connection(dsn =
   if (!fs::file_exists(get_sandpit_extract_path(type = "hc"))) {
     home_care_data %>%
       write_file(get_sandpit_extract_path(type = "hc"))
+
+    home_care_date %>%
+      process_tests_sandpit(type = "hc")
   } else {
     home_care_data <- home_care_data
   }

diff --git a/R/read_sc_all_sds.R b/R/read_sc_all_sds.R
@@ -31,6 +31,9 @@ read_sc_all_sds <- function(sc_dvprod_connection = phs_db_connection(dsn = "DVPR
   if (!fs::file_exists(get_sandpit_extract_path(type = "sds"))) {
     sds_full_data %>%
       write_file(get_sandpit_extract_path(type = "sds"))
+
+    sds_full_data %>%
+      process_tests_sandpit(type = "sds")
   } else {
     sds_full_data <- sds_full_data
   }

diff --git a/R/write_tests_xlsx.R b/R/write_tests_xlsx.R
@@ -19,15 +19,22 @@
 write_tests_xlsx <- function(comparison_data,
                              sheet_name,
                              year = NULL,
-                             workbook_name = c("ep_file", "indiv_file", "lookup", "extract")) {
+                             workbook_name = c("ep_file", "indiv_file", "lookup", "extract", "sandpit")) {
   # Set up the workbook ----
 
-  if (workbook_name == "lookup" | missing(year) & workbook_name == "lookup") {
-    tests_workbook_name <- stringr::str_glue(latest_update(), "_lookups_tests")
-  } else {
+  if (is.null(year)) {
     tests_workbook_name <- dplyr::case_when(
       workbook_name == "ep_file" ~ stringr::str_glue(latest_update(), "_ep_file_tests"),
       workbook_name == "indiv_file" ~ stringr::str_glue(latest_update(), "_indiv_file_tests"),
+      workbook_name == "lookup" ~ stringr::str_glue(latest_update(), "_lookups_tests"),
+      workbook_name == "sandpit" ~ stringr::str_glue(latest_update(), "_sandpit_extract_tests")
+    )
+  } else if (workbook_name == "sandpit" & !is.null(year)) {
+    tests_workbook_name <- dplyr::case_when(
+      workbook_name == "sandpit" ~ stringr::str_glue(latest_update(), "_sandpit_extract_tests")
+    )
+  } else {
+    tests_workbook_name <- dplyr::case_when(
       workbook_name == "extract" ~ stringr::str_glue(latest_update(), "_{year}_extract_tests")
     )
   }
@@ -92,11 +99,11 @@ write_tests_xlsx <- function(comparison_data,
 
   date_today <- stringr::str_to_lower(date_today)
 
-  sheet_name_dated <- ifelse(
-    is.null(year),
-    stringr::str_glue("{sheet_name}_{date_today}"),
-    stringr::str_glue("{year}_{sheet_name}_{date_today}")
-  )
+  if (is.null(year)) {
+    sheet_name_dated <- stringr::str_glue("{sheet_name}_{date_today}")
+  } else {
+    sheet_name_dated <- stringr::str_glue("{year}_{sheet_name}_{date_today}")
+  }
 
   # If there has already been a sheet created today, append the time
   if (sheet_name_dated %in% names(wb)) {

diff --git a/man/process_tests_sc_sandpit.Rd b/man/process_tests_sc_sandpit.Rd
diff --git a/man/produce_sc_sandpit_tests.Rd b/man/produce_sc_sandpit_tests.Rd
diff --git a/man/write_tests_xlsx.Rd b/man/write_tests_xlsx.Rd