Skip to content

Commit

Permalink
758 investigate extracts to identify areas of code which can be cut d…
Browse files Browse the repository at this point in the history
…own for processing times (#899)

* re-writing process_sc_all sds and alarm_telecare with data.table to improve the speed

* Update documentation

* Style code

* changes in line with new process_sc_all_sds dplyr version

* Style code

* remove duplicate columns

* remove duplicated columns

---------

Co-authored-by: lizihao-anu <[email protected]>
Co-authored-by: Megan McNicol <[email protected]>
  • Loading branch information
3 people authored Feb 28, 2024
1 parent b7e7138 commit 6dab611
Show file tree
Hide file tree
Showing 2 changed files with 149 additions and 118 deletions.
4 changes: 0 additions & 4 deletions R/process_sc_all_alarms_telecare.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,8 @@ process_sc_all_alarms_telecare <- function(

# Summarize to merge episodes
qtr_merge <- data[, .(
sending_location = data.table::last(sending_location),
social_care_id = data.table::last(social_care_id),
sc_latest_submission = data.table::last(period),
record_keydate2 = data.table::last(record_keydate2),
smrtype = data.table::last(smrtype),
pkg_count = data.table::last(pkg_count),
chi = data.table::last(chi),
gender = data.table::last(gender),
dob = data.table::last(dob),
Expand Down
263 changes: 149 additions & 114 deletions R/process_sc_all_sds.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ process_sc_all_sds <- function(
sc_demog_lookup,
write_to_disk = TRUE) {
# Match on demographics data (chi, gender, dob and postcode)
matched_sds_data <- data %>% #
matched_sds_data <- data %>%
dplyr::filter(.data$sds_start_date_after_period_end_date != 1) %>%
dplyr::right_join(
sc_demog_lookup,
Expand All @@ -24,123 +24,158 @@ process_sc_all_sds <- function(
# when multiple social_care_id from sending_location for single CHI
# replace social_care_id with latest
replace_sc_id_with_latest() %>%
dplyr::select(-latest_sc_id, -latest_flag, -sds_start_date_after_period_end_date) %>%
dplyr::distinct()

# Data Cleaning ---------------------------------------
sds_full_clean <- matched_sds_data %>%
# Deal with SDS option 4
# First turn the option flags into a logical T/F
dplyr::mutate(dplyr::across(
tidyselect::starts_with("sds_option_"),
~ dplyr::case_when(
.x == 1L ~ TRUE,
.x == 0L ~ FALSE,
is.na(.x) ~ FALSE
)
)) %>%
# SDS option 4 is derived when a person receives more than one option.
# e.g. if a person has options 1 and 2 then option 4 will be derived
dplyr::mutate(
sds_option_4 = rowSums(
dplyr::pick(tidyselect::starts_with("sds_option_"))
) > 1L,
.after = .data$sds_option_3
) %>%
# If SDS start date is missing, assign start of FY
dplyr::mutate(
sds_start_date = fix_sc_start_dates(
.data$sds_start_date,
.data$sds_period_start_date
),
# If SDS end date is missing, assign end of financial period
sds_end_date = fix_sc_missing_end_dates(
.data$sds_end_date,
.data$sds_period_end_date
),
# Fix sds_end_date is earlier than sds_start_date by setting end_date to be the end of fyear
sds_end_date = fix_sc_end_dates(
.data$sds_start_date,
.data$sds_end_date,
.data$sds_period_end_date
)
) %>%
dplyr::select(
-sds_period_start_date, -sds_period_end_date,
-sds_start_date_after_end_date
) %>%
# rename for matching source variables
dplyr::rename(
record_keydate1 = .data$sds_start_date,
record_keydate2 = .data$sds_end_date
) %>%
dplyr::select(-sds_start_date_after_period_end_date) %>%
dplyr::distinct() %>%
# Pivot longer on sds option variables
tidyr::pivot_longer(
cols = tidyselect::contains("sds_option_"),
names_to = "sds_option",
names_prefix = "sds_option_",
names_transform = list(sds_option = ~ paste0("SDS-", .x)),
values_to = "received"
) %>%
# Only keep rows where they received a package and remove duplicates
dplyr::filter(.data$received) %>%
dplyr::distinct() %>%
# Include source variables
# sds_options may contain only a few NA, replace NA by 0
dplyr::mutate(
smrtype = dplyr::case_when(
sds_option == "SDS-1" ~ "SDS-1",
sds_option == "SDS-2" ~ "SDS-2",
sds_option == "SDS-3" ~ "SDS-3",
sds_option == "SDS-4" ~ "SDS-4"
),
recid = "SDS",
# Create person id variable
person_id = stringr::str_glue("{sending_location}-{social_care_id}"),
# Use function for creating sc send lca variables
sc_send_lca = convert_sc_sending_location_to_lca(.data$sending_location)
sds_option_1 = tidyr::replace_na(sds_option_1, 0),
sds_option_2 = tidyr::replace_na(sds_option_2, 0),
sds_option_3 = tidyr::replace_na(sds_option_3, 0)
)

final_data <- sds_full_clean %>%
# use as.data.table to change the data format to data.table to accelerate
data.table::as.data.table() %>%
dplyr::group_by(
.data$sending_location,
.data$social_care_id,
.data$smrtype
) %>%
dplyr::arrange(.data$period,
.data$record_keydate1,
.data$record_keydate2,
.by_group = TRUE
) %>%
# Create a flag for episodes that are going to be merged
# Create an episode counter
dplyr::mutate(
distinct_episode = (.data$record_keydate1 > dplyr::lag(.data$record_keydate2)) %>%
tidyr::replace_na(TRUE),
episode_counter = cumsum(.data$distinct_episode)
) %>%
# Group by episode counter and merge episodes
dplyr::group_by(.data$episode_counter, .add = TRUE) %>%
dplyr::summarise(
sc_latest_submission = dplyr::last(.data$period),
record_keydate1 = min(.data$record_keydate1),
record_keydate2 = max(.data$record_keydate2),
sending_location = dplyr::last(.data$sending_location),
social_care_id = dplyr::last(.data$social_care_id),
chi = dplyr::last(.data$chi),
gender = dplyr::last(.data$gender),
dob = dplyr::last(.data$dob),
postcode = dplyr::last(.data$postcode),
recid = dplyr::last(.data$recid),
person_id = dplyr::last(.data$person_id),
sc_send_lca = dplyr::last(.data$sc_send_lca)
) %>%
dplyr::ungroup() %>%
dplyr::select(-.data$episode_counter) %>%
# change the data format from data.table to data.frame
tibble::as_tibble()
# Data Cleaning ---------------------------------------
# Convert matched_sds_data to data.table
sds_full_clean <- data.table::as.data.table(matched_sds_data)
rm(matched_sds_data)

# Deal with SDS option 4
# Convert option flags into logical T/F
cols_sds_option <- grep(
"^sds_option_",
names(sds_full_clean),
value = TRUE
)
sds_full_clean[, (cols_sds_option) := lapply(.SD, function(x) {
data.table::fifelse(x == 1L, TRUE, FALSE)
}),
.SDcols = cols_sds_option
]

# Derived SDS option 4 when a person receives more than one option
sds_full_clean[,
sds_option_4 := rowSums(.SD) > 1L,
.SDcols = cols_sds_option
]

# If SDS start date or end date is missing, assign start/end of FY
sds_full_clean[
,
sds_start_date := fix_sc_start_dates(sds_start_date, sds_period_start_date)
]
sds_full_clean[
,
sds_end_date := fix_sc_missing_end_dates(sds_end_date, sds_period_end_date)
]
sds_full_clean[
,
sds_end_date := fix_sc_end_dates(sds_start_date, sds_end_date, sds_period_end_date)
]

sds_full_clean[, c(
"sds_period_start_date",
"sds_period_end_date",
"sds_start_date_after_end_date"
) := NULL]

# Rename for matching source variables
data.table::setnames(
sds_full_clean,
c("sds_start_date", "sds_end_date"),
c("record_keydate1", "record_keydate2")
)

sds_full_clean <- unique(sds_full_clean)

cols_sds_option <- grep(
"^sds_option_",
names(sds_full_clean),
value = TRUE
)
# Pivot longer on sds option variables
sds_full_clean_long <- data.table::melt(
sds_full_clean,
id.vars = setdiff(names(sds_full_clean), cols_sds_option),
measure.vars = cols_sds_option,
variable.name = "sds_option",
value.name = "received"
)
rm(sds_full_clean)
sds_full_clean_long <- sds_full_clean_long[received == TRUE, ]
sds_full_clean_long[
,
sds_option := paste0("SDS-", sub("sds_option_", "", sds_option))
]

# Filter rows where they received a package and remove duplicates
sds_full_clean_long <- unique(sds_full_clean_long)

# Include source variables
sds_full_clean_long[, c(
"smrtype",
"recid",
"sc_send_lca"
) :=
list(
sds_option,
"SDS",
convert_sc_sending_location_to_lca(sending_location)
)]
sds_full_clean_long$person_id <- paste0(
sds_full_clean_long$sending_location,
"-",
sds_full_clean_long$social_care_id
)

# Group, arrange and create flags for episodes
sds_full_clean_long[,
c(
"period_rank",
"record_keydate1_rank",
"record_keydate2_rank"
) := list(
rank(period),
rank(record_keydate1),
rank(record_keydate2)
),
by = .(sending_location, social_care_id, smrtype)
]
data.table::setorder(
sds_full_clean_long,
period_rank,
record_keydate1_rank,
record_keydate2_rank
)

sds_full_clean_long[,
distinct_episode :=
(data.table::shift(record_keydate2, type = "lag") < record_keydate1) %>%
tidyr::replace_na(TRUE),
by = .(sending_location, social_care_id, smrtype)
]

sds_full_clean_long[,
episode_counter := cumsum(distinct_episode),
by = .(sending_location, social_care_id, smrtype)
]

# Merge episodes by episode counter
final_data <- sds_full_clean_long[, .(
sc_latest_submission = data.table::last(period),
record_keydate1 = min(record_keydate1),
record_keydate2 = max(record_keydate2),
chi = data.table::last(chi),
gender = data.table::last(gender),
dob = data.table::last(dob),
postcode = data.table::last(postcode),
recid = data.table::last(recid),
person_id = data.table::last(person_id),
sc_send_lca = data.table::last(sc_send_lca)
), by = .(sending_location, social_care_id, smrtype, episode_counter)]
rm(sds_full_clean_long)

# Drop episode_counter and convert back to data.frame if needed
final_data <- as.data.frame(final_data[, -"episode_counter"])
# final_data now holds the processed data in the format of a data.frame

if (write_to_disk) {
write_file(
Expand Down

0 comments on commit 6dab611

Please sign in to comment.