Skip to content

Commit

Permalink
syncrohnize with remote version
Browse files Browse the repository at this point in the history
Merge branch 'modularize_cleanepi' of https://github.com/epiverse-trace/cleanepi into modularize_cleanepi

# Conflicts:
#	R/clean_data.R
  • Loading branch information
= committed Feb 5, 2024
2 parents ff31f24 + 66c2fcf commit 09f1d58
Show file tree
Hide file tree
Showing 20 changed files with 465 additions and 300 deletions.
83 changes: 52 additions & 31 deletions R/calcualte_age.R
Original file line number Diff line number Diff line change
@@ -1,49 +1,69 @@
#' function to calculate age in days, weeks, months, or years from the date of birth
#' @param data a data frame with at least one date column.
#' @param date_column_name the name of the date column of interest.
#' Calculate age from a specified date column
#'
#' @param data the input data frame with the date column of interest
#' @param target_column the name of the date column of interest. default are
#' 'Date', or, 'DATE', or 'date'
#' @param end_date the end date. default: today's date
#' @param age_in a string indicating whether to calculate
#' the age in 'years', 'months', 'weeks', or 'days'.
#' The default is: 'years'.
#' @returns a data frame with 1 or 2 extra columns compared to the input data
#' frame.
#' @param age_in a string that specifies whether to return the age in 'years',
#' or 'months', or 'days', or 'weeks'. Default is 'years'.
#' @param ... Other extra arguments needed to perform this operation. They
#' include:
#' \enumerate{
#' \item "na_strings": a string that represents the missing values in the
#' date column of interest. This is only needed when the date column
#' contains missing values.
#' }
#'
#' @returns the input data frame with the following 1 or 2 extra columns:
#' @returns a data frame with the following 1 or 2 extra columns compared to the
#' input data frame:
#' \enumerate{
#' \item "age_in_years", or "age_in_months", or "age_in_weeks", or "age_in_days",
#' depending on the value of the 'age_in' parameter.
#' \item "remainder_days": a column with the number of remaining days
#' after the age is converted in weeks or months.
#' \item "age_in_years", or "age_in_months", or "age_in_weeks", or
#' "age_in_days", depending on the value of the 'age_in' parameter.
#' \item "remainder_days": a column with the number of remaining days after
#' the age is converted in weeks or months.
#' }
#' @export
#'
#' @examples
#' age <- calculate_age(
#' data = readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")),
#' date_column_name = "dateOfBirth",
#' end_date = Sys.Date(),
#' age_in = "months"
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi")),
#' target_column = "dateOfBirth",
#' end_date = Sys.Date(),
#' age_in = "months",
#' na_strings = "-99"
#' )
calculate_age <- function(data, date_column_name = NULL, end_date = Sys.Date(),
age_in = "years") {
calculate_age <- function(data,
target_column = NULL,
end_date = Sys.Date(),
age_in = "years",
...) {
checkmate::assert_data_frame(data, null.ok = FALSE)
checkmate::assert_character(date_column_name, null.ok = TRUE,
checkmate::assert_character(target_column, null.ok = TRUE,
any.missing = FALSE, len = 1L)
checkmate::assert_character(age_in, null.ok = FALSE, any.missing = FALSE,
len = 1L)
checkmate::assert_date(end_date, any.missing = FALSE, len = 1L,
null.ok = TRUE)

tmp_age <- remainder_days <- NULL
tmp_age <- remainder_days <- NULL
extra_args <- list(...)

# check if date column exists in the data
date_column_name <- check_column_existence(data, date_column_name)
target_column <- date_check_column_existence(data, target_column)

# replace missing data characters with NA
data <- replace_missing_char(data, date_column_name)
if ("na_strings" %in% names(extra_args)) {
na_strings <- extra_args[["na_strings"]]
}
data <- replace_missing_values(data, target_column,
na_strings = na_strings)

# standardize the input data if required
if (!lubridate::is.Date(data[[date_column_name]])) {
data <- standardize_date(data, date_column_name, format = NULL,
timeframe = NULL, check_timeframe = FALSE,
error_tolerance = 0.5)[[1L]]
if (!lubridate::is.Date(data[[target_column]])) {
data <- standardize_dates(data, target_column, format = NULL,
timeframe = NULL, error_tolerance = 0.5)
}

# calculate age
Expand All @@ -58,23 +78,23 @@ calculate_age <- function(data, date_column_name = NULL, end_date = Sys.Date(),
res <- switch(
age_in,
years = data %>%
dplyr::mutate(age_years = round((data[[date_column_name]] %--% end_date)
dplyr::mutate(age_years = round((data[[target_column]] %--% end_date)
%/% lubridate::years(1L))),
months = data %>%
dplyr::mutate(tmp_age = lubridate::as.period(end_date -
data[[date_column_name]])) %>% # nolint: line_length_linter.
data[[target_column]])) %>% # nolint: line_length_linter.
dplyr::mutate(age_months = tmp_age %/% months(1L), # nolint
remainder_days = (tmp_age %% months(1L)) %/% # nolint
lubridate::days(1L)) %>%
dplyr::select(-tmp_age),
days = data %>%
dplyr::mutate(tmp_age = lubridate::as.period(end_date -
data[[date_column_name]])) %>% # nolint: line_length_linter.
data[[target_column]])) %>% # nolint: line_length_linter.
dplyr::mutate(age_days = tmp_age %/% lubridate::days(1L)) %>%
dplyr::select(-tmp_age),
weeks = data %>%
dplyr::mutate(tmp_age = lubridate::as.period(end_date -
data[[date_column_name]])) %>% # nolint: line_length_linter.
data[[target_column]])) %>% # nolint: line_length_linter.
dplyr::mutate(age_weeks = tmp_age %/% lubridate::weeks(1L),
remainder_days = (tmp_age %% lubridate::weeks(1L))
%/% lubridate::days(1L)) %>%
Expand All @@ -83,5 +103,6 @@ calculate_age <- function(data, date_column_name = NULL, end_date = Sys.Date(),
if (age_in %in% c("months", "weeks") && all(res[["remainder_days"]] == 0L)) {
res <- res %>% dplyr::select(-remainder_days)
}
res

return(res)
}
81 changes: 34 additions & 47 deletions R/check_date_sequence.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,102 +2,89 @@
#'
#' @param x the string of interest
#' @keywords internal
#' @noRd
is_order <- function(x) {
x <- as_date(x)
!is.unsorted(x)
return(!is.unsorted(x))
}



#' Check order for sequence of date-events
#' Check whether the order of the sequence of date-events is valid
#'
#' @description This function checks whether a date sequence in
#' a vector of specified columns is correct or not.
#'
#' @param data the input data frame
#' @param event_cols a vector or a comma-separated list of the event columns
#' @param target_columns a vector or a comma-separated list of the event columns
#' names. Users should specify at least 2 column names in the expected order.
#' For example: event_cols = c("date_symptoms_onset", "date_hospitalization", "date_death").
#' For example: target_columns = c("date_symptoms_onset", "date_hospitalization", "date_death"). # nolint: line_length_linter
#' @param remove_bad_seq a Boolean to specify if rows with incorrect order
#' should be filtered out or not. The default is FALSE
#' @param report object will contain details about the results from the
#' date columns standardization
#'
#' @returns rows of the input data frame with incorrect date sequence
#' if `remove_bad_seq = FALSE`, the input data frame without those
#' rows if not
#' rows if not.
#' @export
#'
#' @examples
#' good_date_sequence <- check_date_sequence(
#' data = readRDS(system.file("extdata", "test_df.RDS", package = "cleanepi")),
#' event_cols = c("date_first_pcr_positive_test", "date.of.admission"),
#' remove_bad_seq = FALSE,
#' report = list()
#' data = readRDS(system.file("extdata", "test_df.RDS",
#' package = "cleanepi")),
#' target_columns = c("date_first_pcr_positive_test", "date.of.admission"),
#' remove_bad_seq = FALSE
#' )
check_date_sequence <- function(data, event_cols, remove_bad_seq = FALSE,
report = list()) {
checkmate::assert_vector(event_cols, any.missing = FALSE, min.len = 1L, max.len = dim(data)[2],
null.ok = FALSE, unique = TRUE)
check_date_sequence <- function(data, target_columns,
remove_bad_seq = FALSE) {

checkmate::assert_vector(target_columns, any.missing = FALSE, min.len = 1L,
max.len = dim(data)[2], null.ok = FALSE,
unique = TRUE)
checkmate::assert_data_frame(data, null.ok = FALSE)
checkmate::assert_logical(remove_bad_seq, any.missing = FALSE, len = 1L,
null.ok = FALSE)

# check if input is character string
if (all(grepl(",", event_cols, fixed = TRUE))) {
event_cols <- as.character(unlist(strsplit(event_cols, ",", fixed = TRUE)))
event_cols <- gsub(" ", "", event_cols, fixed = TRUE)
if (all(grepl(",", target_columns, fixed = TRUE))) {
target_columns <- as.character(unlist(strsplit(target_columns, ",", fixed = TRUE)))
target_columns <- trimws(target_columns)
}

# check if all columns are part of the data frame
if (!all(event_cols %in% names(data))) {
idx <- which(!(event_cols %in% names(data)))
event_cols <- event_cols[-idx]
warning("\nRemoving unrecognised column name: ", event_cols[idx],
if (!all(target_columns %in% names(data))) {
idx <- which(!(target_columns %in% names(data)))
target_columns <- target_columns[-idx]
warning("\nRemoving unrecognised column name: ", target_columns[idx],
call. = FALSE)
if (length(event_cols) < 2L) {
if (length(target_columns) < 2L) {
stop("\nAt least 2 event dates are required!")
}
}

# check and convert to Date if required
for (cols in event_cols) {
for (cols in target_columns) {
if (!lubridate::is.Date(data[[cols]])) {
data <- standardize_date(data, cols, timeframe = NULL,
check_timeframe = FALSE,
report = list(), error_tolerance = 0.5)[[1L]]
data <- standardize_dates(data, cols, timeframe = NULL,
error_tolerance = 0.5)
}
}

# checking the date sequence
tmp_data <- data %>% dplyr::select(dplyr::all_of(event_cols))
tmp_data <- data %>% dplyr::select(dplyr::all_of(target_columns))
order_date <- apply(tmp_data, 1L, is_order)
bad_order <- which(!order_date)
bad_order <- which(!order_date)
if (!all(order_date)) {
tmp_data <- data[bad_order, ]
# adding incorrect records to the report
data <- add_to_report(x = data,
key = "incorrect_date_sequence",
value = tmp_data)
if (remove_bad_seq) {
data <- data[-bad_order, ]
data <- data[-bad_order, ]
warning(length(bad_order),
" incorrect date sequences were detected and removed",
call. = FALSE)
}
}

# making the report
if (length(bad_order) > 0L) {
if (!("incorrect_date_sequence" %in% names(report))) {
report[["incorrect_date_sequence"]] <- list()
}
report[["incorrect_date_sequence"]][["date_sequence"]] <-
glue::glue_collapse(event_cols, sep = " < ")
report[["incorrect_date_sequence"]][["bad_sequence"]] <- tmp_data
} else {
report <- NULL
}

list(
data = data,
report = report
)
return(data)
}
Loading

0 comments on commit 09f1d58

Please sign in to comment.