From 2cff80d9e8bf2602f8579ac31edd24d734a6cb28 Mon Sep 17 00:00:00 2001
From: Jennit07 <67372904+Jennit07@users.noreply.github.com>
Date: Wed, 17 Apr 2024 12:55:41 +0100
Subject: [PATCH] Create tests for social care sandpit extracts (#943)

* Update `write_tests_xlsx`

* Update documentation

* Add in sandpit tests where the extract is saved

* Setup tests for sandpit
Further checks needed for writing to disk

* Update documentation

* Amend case_when statement

* rename function to include 'sc'

* Update documentation

* Use `is.null` instead of `missing`

* Update documentation

* Add `year` as a parameter

* Update documentation

* Setup for writing sandpit tests to disk

* Update parameters for sandpit tests

* Update documentation

* Use `process_tests_sc_sandpit`

* Apply styling

* Style code

* update documentation

Co-authored-by: Zihao Li <zihao.li@phs.scot>

* Rename variable sc_id

Co-authored-by: Zihao Li <zihao.li@phs.scot>

* Rename variable

Co-authored-by: Zihao Li <zihao.li@phs.scot>

* Rename variable

Co-authored-by: Zihao Li <zihao.li@phs.scot>

* Update documentation

* [check-spelling] Update metadata

Update for https://github.com/Public-Health-Scotland/source-linkage-files/actions/runs/8689503990/attempts/1
Accepted in https://github.com/Public-Health-Scotland/source-linkage-files/pull/943#issuecomment-2056794120

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
on-behalf-of: @check-spelling <check-spelling-bot@check-spelling.dev>

* update spelling

* update spelling expect variant

---------

Signed-off-by: check-spelling-bot <check-spelling-bot@users.noreply.github.com>
Co-authored-by: Jennit07 <Jennit07@users.noreply.github.com>
Co-authored-by: Zihao Li <zihao.li@phs.scot>
Co-authored-by: Zihao Li <lizihao_anu@outlook.com>
---
 .github/actions/spelling/expect.txt |   8 +-
 NAMESPACE                           |   2 +
 R/process_tests_sc_sandpit.R        | 144 ++++++++++++++++++++++++++++
 R/read_lookup_sc_client.R           |   3 +
 R/read_lookup_sc_demographics.R     |   3 +
 R/read_sc_all_alarms_telecare.R     |   3 +
 R/read_sc_all_care_home.R           |   3 +
 R/read_sc_all_home_care.R           |   3 +
 R/read_sc_all_sds.R                 |   3 +
 R/write_tests_xlsx.R                |  25 +++--
 man/process_tests_sc_sandpit.Rd     |  20 ++++
 man/produce_sc_sandpit_tests.Rd     |  24 +++++
 man/write_tests_xlsx.Rd             |   2 +-
 13 files changed, 228 insertions(+), 15 deletions(-)
 create mode 100644 R/process_tests_sc_sandpit.R
 create mode 100644 man/process_tests_sc_sandpit.Rd
 create mode 100644 man/produce_sc_sandpit_tests.Rd

diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index d27b6e755..a4a34a58b 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -12,7 +12,7 @@ aut
 bedday
 birthtime
 bodyloc
-BOXI
+boxi
 callr
 canx
 carehome
@@ -169,8 +169,6 @@ postcodes
 ppas
 prac
 praccode
-prac
-praccode
 ptypes
 purrr
 quickstart
@@ -207,7 +205,7 @@ setkeyv
 setnafill
 setnames
 setorder
-siar
+Siar
 sigfac
 simd
 slf
@@ -246,6 +244,6 @@ xlsx
 yearstay
 yml
 yyyyqx
-zihao
+Zihao
 zsav
 zstd
diff --git a/NAMESPACE b/NAMESPACE
index 91f6b66d9..4606cf3f2 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -145,8 +145,10 @@ export(process_tests_sc_all_hc_episodes)
 export(process_tests_sc_all_sds_episodes)
 export(process_tests_sc_client_lookup)
 export(process_tests_sc_demographics)
+export(process_tests_sc_sandpit)
 export(process_tests_sds)
 export(produce_episode_file_tests)
+export(produce_sc_sandpit_tests)
 export(produce_source_extract_tests)
 export(produce_test_comparison)
 export(read_extract_acute)
diff --git a/R/process_tests_sc_sandpit.R b/R/process_tests_sc_sandpit.R
new file mode 100644
index 000000000..089f61aa1
--- /dev/null
+++ b/R/process_tests_sc_sandpit.R
@@ -0,0 +1,144 @@
+#' Process tests for the social care sandpit extracts
+#'
+#' @param type Name of sandpit extract.
+#'
+#' @return a [tibble][tibble::tibble-package] containing a test comparison.
+#' @export
+#'
+process_tests_sc_sandpit <- function(type = c("at", "hc", "ch", "sds", "demographics", "client"), year = NULL) {
+  comparison <- produce_test_comparison(
+    old_data = produce_sc_sandpit_tests(
+      read_file(get_sandpit_extract_path(type = {{ type }}, year = year, update = previous_update())),
+      type = {{ type }}
+    ),
+    new_data = produce_sc_sandpit_tests(
+      read_file(get_sandpit_extract_path(type = {{ type }}, year = year, update = latest_update())),
+      type = {{ type }}
+    )
+  )
+
+  comparison %>%
+    write_tests_xlsx(sheet_name = {{ type }}, year = year, workbook_name = "sandpit")
+
+  return(comparison)
+}
+
+
+#' Produce tests for social care sandpit extracts.
+#'
+#' @param data new or old data for testing summary flags
+#' (data is from [get_sandpit_extract_path()])
+#' @param type Name of sandpit extract.
+#'
+#' @return a dataframe with a count of each flag
+#' from [calculate_measures()]
+#' @export
+#'
+produce_sc_sandpit_tests <- function(data, type = c("demographics", "client", "at", "ch", "hc", "sds")) {
+  if (type == "demographics") {
+    missing_tests <- data %>%
+      dplyr::mutate(
+        n_missing_chi = is_missing(.data$chi_upi),
+        n_missing_sc_id = is_missing(.data$social_care_id),
+        n_missing_dob = is.na(.data$chi_date_of_birth),
+        n_missing_postcode = is_missing(.data$chi_postcode),
+        n_missing_gender = is_missing(.data$chi_gender_code)
+      ) %>%
+      dplyr::select(n_missing_chi:n_missing_gender) %>%
+      calculate_measures(measure = "sum")
+
+    latest_flag_tests <- data %>%
+      dplyr::filter(!(is.na(.data$chi_upi))) %>%
+      dplyr::group_by(.data$chi_upi, .data$sending_location) %>%
+      dplyr::summarise(latest_count = sum(.data$latest_record_flag)) %>%
+      dplyr::ungroup() %>%
+      dplyr::mutate(
+        chi_latest_flag_0 = dplyr::if_else(.data$latest_count == 0, 1, 0),
+        chi_latest_flag_1 = dplyr::if_else(.data$latest_count == 1, 1, 0),
+        chi_latest_flag_2 = dplyr::if_else(.data$latest_count == 2, 1, 0),
+        chi_latest_flag_3 = dplyr::if_else(.data$latest_count == 3, 1, 0),
+        chi_latest_flag_4 = dplyr::if_else(.data$latest_count == 4, 1, 0),
+        chi_latest_flag_5 = dplyr::if_else(.data$latest_count == 5, 1, 0),
+        chi_latest_flag_6 = dplyr::if_else(.data$latest_count == 6, 1, 0),
+        chi_latest_flag_7 = dplyr::if_else(.data$latest_count == 7, 1, 0),
+        chi_latest_flag_8 = dplyr::if_else(.data$latest_count == 8, 1, 0),
+        chi_latest_flag_9 = dplyr::if_else(.data$latest_count == 9, 1, 0),
+        chi_latest_flag_10 = dplyr::if_else(.data$latest_count == 10, 1, 0)
+      ) %>%
+      dplyr::select(.data$chi_latest_flag_0:.data$chi_latest_flag_10) %>%
+      calculate_measures(measure = "sum")
+
+    # add a flag for sc ids where there is multiple chi associated
+    sc_id_multi_chi <- data %>%
+      dplyr::distinct() %>%
+      dplyr::filter(!(is.na(.data$chi_upi))) %>%
+      dplyr::group_by(.data$social_care_id, .data$sending_location) %>%
+      dplyr::distinct(.data$chi_upi, .keep_all = TRUE) %>%
+      dplyr::mutate(distinct_chi_count = dplyr::n_distinct(.data$chi_upi)) %>%
+      dplyr::filter(distinct_chi_count > 1) %>%
+      dplyr::distinct(.data$social_care_id, .data$sending_location, .keep_all = TRUE) %>%
+      dplyr::mutate(sc_id_multi_chi = 1) %>%
+      create_sending_location_test_flags(.data$sending_location) %>%
+      dplyr::ungroup() %>%
+      dplyr::rename(
+        sc_id_multi_chi_Aberdeen_City = Aberdeen_City,
+        sc_id_multi_chi_Aberdeenshire = Aberdeenshire,
+        sc_id_multi_chi_Angus = Angus,
+        sc_id_multi_chi_Argyll_and_Bute = Argyll_and_Bute,
+        sc_id_multi_chi_City_of_Edinburgh = City_of_Edinburgh,
+        sc_id_multi_chi_Clackmannanshire = Clackmannanshire,
+        sc_id_multi_chi_Dumfries_and_Galloway = Dumfries_and_Galloway,
+        sc_id_multi_chi_Dundee_City = Dundee_City,
+        sc_id_multi_chi_East_Ayrshire = East_Ayrshire,
+        sc_id_multi_chi_East_Dunbartonshire = East_Dunbartonshire,
+        sc_id_multi_chi_East_Lothian = East_Lothian,
+        sc_id_multi_chi_East_Renfrewshire = East_Renfrewshire,
+        sc_id_multi_chi_Falkirk = Falkirk,
+        sc_id_multi_chi_Fife = Fife,
+        sc_id_multi_chi_Glasgow_City = Glasgow_City,
+        sc_id_multi_chi_Highland = Highland,
+        sc_id_multi_chi_Inverclyde = Inverclyde,
+        sc_id_multi_chi_Midlothian = Midlothian,
+        sc_id_multi_chi_Moray = Moray,
+        sc_id_multi_chi_Na_h_Eileanan_Siar = Na_h_Eileanan_Siar,
+        sc_id_multi_chi_North_Ayrshire = North_Ayrshire,
+        sc_id_multi_chi_North_Lanarkshire = North_Lanarkshire,
+        sc_id_multi_chi_Orkney_Islands = Orkney_Islands,
+        sc_id_multi_chi_Perth_and_Kinross = Perth_and_Kinross,
+        sc_id_multi_chi_Renfrewshire = Renfrewshire,
+        sc_id_multi_chi_Scottish_Borders = Scottish_Borders,
+        sc_id_multi_chi_Shetland_Islands = Shetland_Islands,
+        sc_id_multi_chi_South_Ayrshire = South_Ayrshire,
+        sc_id_multi_chi_South_Lanarkshire = South_Lanarkshire,
+        sc_id_multi_chi_Stirling = Stirling,
+        sc_id_multi_chi_West_Dunbartonshire = West_Dunbartonshire,
+        sc_id_multi_chi_West_Lothian = West_Lothian
+      ) %>%
+      dplyr::select(.data$sc_id_multi_chi, .data$sc_id_multi_chi_Aberdeen_City:.data$sc_id_multi_chi_West_Lothian) %>%
+      calculate_measures(measure = "sum")
+
+    output <- list(
+      missing_tests,
+      latest_flag_tests,
+      sc_id_multi_chi
+    ) %>%
+      purrr::reduce(dplyr::full_join, by = c("measure", "value"))
+
+    return(output)
+  } else if (type == "client" | type == "at" | type == "ch" |
+    type == "hc" | type == "sds") {
+    output <- data %>%
+      # create test flags
+      dplyr::mutate(
+        unique_sc_id = dplyr::lag(.data$social_care_id) != .data$social_care_id,
+        n_missing_sc_id = is_missing(.data$social_care_id)
+      ) %>%
+      create_sending_location_test_flags(.data$sending_location) %>%
+      # remove variables that won't be summed
+      dplyr::select(c("unique_sc_id":"West_Lothian")) %>%
+      # use function to sum new test flags
+      calculate_measures(measure = "sum")
+
+    return(output)
+  }
+}
diff --git a/R/read_lookup_sc_client.R b/R/read_lookup_sc_client.R
index 370a15722..d2b549671 100644
--- a/R/read_lookup_sc_client.R
+++ b/R/read_lookup_sc_client.R
@@ -80,6 +80,9 @@ read_lookup_sc_client <- function(fyyear,
   if (!fs::file_exists(get_sandpit_extract_path(type = "client", year = fyyear))) {
     client_data %>%
       write_file(get_sandpit_extract_path(type = "client", year = fyyear))
+
+    client_data %>%
+      process_tests_sc_sandpit(type = "client", year = fyyear)
   } else {
     client_data <- client_data
   }
diff --git a/R/read_lookup_sc_demographics.R b/R/read_lookup_sc_demographics.R
index cb3cea3c2..729f3a445 100644
--- a/R/read_lookup_sc_demographics.R
+++ b/R/read_lookup_sc_demographics.R
@@ -28,6 +28,9 @@ read_lookup_sc_demographics <- function(sc_dvprod_connection = phs_db_connection
   if (!fs::file_exists(get_sandpit_extract_path(type = "demographics"))) {
     sc_demog %>%
       write_file(get_sandpit_extract_path(type = "demographics"))
+
+    sc_demog %>%
+      process_tests_sc_sandpit(type = "demographics")
   } else {
     sc_demog <- sc_demog
   }
diff --git a/R/read_sc_all_alarms_telecare.R b/R/read_sc_all_alarms_telecare.R
index 5abd9bc7b..4af57d857 100644
--- a/R/read_sc_all_alarms_telecare.R
+++ b/R/read_sc_all_alarms_telecare.R
@@ -31,6 +31,9 @@ read_sc_all_alarms_telecare <- function(sc_dvprod_connection = phs_db_connection
   if (!fs::file_exists(get_sandpit_extract_path(type = "at"))) {
     at_full_data %>%
       write_file(get_sandpit_extract_path(type = "at"))
+
+    at_full_data %>%
+      process_tests_sandpit(type = "at")
   } else {
     at_full_data <- at_full_data
   }
diff --git a/R/read_sc_all_care_home.R b/R/read_sc_all_care_home.R
index 870a94ded..0e74d6623 100644
--- a/R/read_sc_all_care_home.R
+++ b/R/read_sc_all_care_home.R
@@ -33,6 +33,9 @@ read_sc_all_care_home <- function(sc_dvprod_connection = phs_db_connection(dsn =
   if (!fs::file_exists(get_sandpit_extract_path(type = "ch"))) {
     ch_data %>%
       write_file(get_sandpit_extract_path(type = "ch"))
+
+    ch_data %>%
+      process_tests_sandpit(type = "ch")
   } else {
     ch_data <- ch_data
   }
diff --git a/R/read_sc_all_home_care.R b/R/read_sc_all_home_care.R
index cca2d0a9b..3741785a7 100644
--- a/R/read_sc_all_home_care.R
+++ b/R/read_sc_all_home_care.R
@@ -50,6 +50,9 @@ read_sc_all_home_care <- function(sc_dvprod_connection = phs_db_connection(dsn =
   if (!fs::file_exists(get_sandpit_extract_path(type = "hc"))) {
     home_care_data %>%
       write_file(get_sandpit_extract_path(type = "hc"))
+
+    home_care_date %>%
+      process_tests_sandpit(type = "hc")
   } else {
     home_care_data <- home_care_data
   }
diff --git a/R/read_sc_all_sds.R b/R/read_sc_all_sds.R
index d9d5b8b1d..e184ffaeb 100644
--- a/R/read_sc_all_sds.R
+++ b/R/read_sc_all_sds.R
@@ -31,6 +31,9 @@ read_sc_all_sds <- function(sc_dvprod_connection = phs_db_connection(dsn = "DVPR
   if (!fs::file_exists(get_sandpit_extract_path(type = "sds"))) {
     sds_full_data %>%
       write_file(get_sandpit_extract_path(type = "sds"))
+
+    sds_full_data %>%
+      process_tests_sandpit(type = "sds")
   } else {
     sds_full_data <- sds_full_data
   }
diff --git a/R/write_tests_xlsx.R b/R/write_tests_xlsx.R
index ffe86f48f..d2e1490f2 100644
--- a/R/write_tests_xlsx.R
+++ b/R/write_tests_xlsx.R
@@ -19,15 +19,22 @@
 write_tests_xlsx <- function(comparison_data,
                              sheet_name,
                              year = NULL,
-                             workbook_name = c("ep_file", "indiv_file", "lookup", "extract")) {
+                             workbook_name = c("ep_file", "indiv_file", "lookup", "extract", "sandpit")) {
   # Set up the workbook ----
 
-  if (workbook_name == "lookup" | missing(year) & workbook_name == "lookup") {
-    tests_workbook_name <- stringr::str_glue(latest_update(), "_lookups_tests")
-  } else {
+  if (is.null(year)) {
     tests_workbook_name <- dplyr::case_when(
       workbook_name == "ep_file" ~ stringr::str_glue(latest_update(), "_ep_file_tests"),
       workbook_name == "indiv_file" ~ stringr::str_glue(latest_update(), "_indiv_file_tests"),
+      workbook_name == "lookup" ~ stringr::str_glue(latest_update(), "_lookups_tests"),
+      workbook_name == "sandpit" ~ stringr::str_glue(latest_update(), "_sandpit_extract_tests")
+    )
+  } else if (workbook_name == "sandpit" & !is.null(year)) {
+    tests_workbook_name <- dplyr::case_when(
+      workbook_name == "sandpit" ~ stringr::str_glue(latest_update(), "_sandpit_extract_tests")
+    )
+  } else {
+    tests_workbook_name <- dplyr::case_when(
       workbook_name == "extract" ~ stringr::str_glue(latest_update(), "_{year}_extract_tests")
     )
   }
@@ -92,11 +99,11 @@ write_tests_xlsx <- function(comparison_data,
 
   date_today <- stringr::str_to_lower(date_today)
 
-  sheet_name_dated <- ifelse(
-    is.null(year),
-    stringr::str_glue("{sheet_name}_{date_today}"),
-    stringr::str_glue("{year}_{sheet_name}_{date_today}")
-  )
+  if (is.null(year)) {
+    sheet_name_dated <- stringr::str_glue("{sheet_name}_{date_today}")
+  } else {
+    sheet_name_dated <- stringr::str_glue("{year}_{sheet_name}_{date_today}")
+  }
 
   # If there has already been a sheet created today, append the time
   if (sheet_name_dated %in% names(wb)) {
diff --git a/man/process_tests_sc_sandpit.Rd b/man/process_tests_sc_sandpit.Rd
new file mode 100644
index 000000000..d3c1f5984
--- /dev/null
+++ b/man/process_tests_sc_sandpit.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/process_tests_sc_sandpit.R
+\name{process_tests_sc_sandpit}
+\alias{process_tests_sc_sandpit}
+\title{Process tests for the social care sandpit extracts}
+\usage{
+process_tests_sc_sandpit(
+  type = c("at", "hc", "ch", "sds", "demographics", "client"),
+  year = NULL
+)
+}
+\arguments{
+\item{type}{Name of sandpit extract.}
+}
+\value{
+a \link[tibble:tibble-package]{tibble} containing a test comparison.
+}
+\description{
+Process tests for the social care sandpit extracts
+}
diff --git a/man/produce_sc_sandpit_tests.Rd b/man/produce_sc_sandpit_tests.Rd
new file mode 100644
index 000000000..4f34d506b
--- /dev/null
+++ b/man/produce_sc_sandpit_tests.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/process_tests_sc_sandpit.R
+\name{produce_sc_sandpit_tests}
+\alias{produce_sc_sandpit_tests}
+\title{Produce tests for social care sandpit extracts.}
+\usage{
+produce_sc_sandpit_tests(
+  data,
+  type = c("demographics", "client", "at", "ch", "hc", "sds")
+)
+}
+\arguments{
+\item{data}{new or old data for testing summary flags
+(data is from \code{\link[=get_sandpit_extract_path]{get_sandpit_extract_path()}})}
+
+\item{type}{Name of sandpit extract.}
+}
+\value{
+a dataframe with a count of each flag
+from \code{\link[=calculate_measures]{calculate_measures()}}
+}
+\description{
+Produce tests for social care sandpit extracts.
+}
diff --git a/man/write_tests_xlsx.Rd b/man/write_tests_xlsx.Rd
index c510e2570..0788d0080 100644
--- a/man/write_tests_xlsx.Rd
+++ b/man/write_tests_xlsx.Rd
@@ -8,7 +8,7 @@ write_tests_xlsx(
   comparison_data,
   sheet_name,
   year = NULL,
-  workbook_name = c("ep_file", "indiv_file", "lookup", "extract")
+  workbook_name = c("ep_file", "indiv_file", "lookup", "extract", "sandpit")
 )
 }
 \arguments{