From c55d2c1a1f09cde13bbb631373d0847ff2a1ee4b Mon Sep 17 00:00:00 2001 From: nrennie Date: Sun, 17 Nov 2024 15:28:28 +0000 Subject: [PATCH] fix special chars and missing bugs --- NAMESPACE | 3 +- R/add_special_chars.R | 86 ++++++++++++++++++++++++++++++++++++++++ R/change_case.R | 19 ++++++++- R/make_missing.R | 4 +- R/messy.R | 9 +++-- R/messy_colnames.R | 2 +- R/messy_strings.R | 51 ------------------------ R/utils.R | 7 ++++ README.md | 78 +++++++++++++++++++++++------------- man/add_special_chars.Rd | 26 ++++++++++++ man/change_case.Rd | 6 ++- man/make_string_messy.Rd | 22 ---------- man/messy.Rd | 5 ++- 13 files changed, 207 insertions(+), 111 deletions(-) create mode 100644 R/add_special_chars.R delete mode 100644 R/messy_strings.R create mode 100644 R/utils.R create mode 100644 man/add_special_chars.Rd delete mode 100644 man/make_string_messy.Rd diff --git a/NAMESPACE b/NAMESPACE index 2de6eec..2ab1d45 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,10 +1,9 @@ # Generated by roxygen2: do not edit by hand +export(add_special_chars) export(add_whitespace) export(change_case) export(make_missing) -export(make_string_messy) export(messy) export(messy_colnames) importFrom(rlang,.data) -importFrom(stats,runif) diff --git a/R/add_special_chars.R b/R/add_special_chars.R new file mode 100644 index 0000000..ebaf937 --- /dev/null +++ b/R/add_special_chars.R @@ -0,0 +1,86 @@ +#' Add special characters to strings + +#' @param data input dataframe +#' @param cols set of columns to apply transformation to. If `NULL` +#' will apply to all columns. Default `NULL`. +#' @param messiness Percentage of values to change. Must be +#' between 0 and 1. Default 0.1. +#' @importFrom rlang .data +#' @return a dataframe the same size as the input data. 
+#' @export +#' @examples +#' add_special_chars(mtcars) +add_special_chars <- function(data, + cols = NULL, + messiness = 0.1) { + if (messiness < 0 || messiness > 1) { + stop("'messiness' must be between 0 and 1") + } + if (is.null(cols)) { + output <- data |> + dplyr::mutate( + dplyr::across( + dplyr::where(~ is.character(.x) | is.factor(.x)), + ~ special_chars(.x, messiness = messiness) + ) + ) + } else { + # check if all cols present in colnames + if (!all((cols %in% colnames(data)))) { + stop("All elements of 'cols' must be a column name in 'data'") + } else { + output <- data |> + dplyr::mutate( + dplyr::across( + dplyr::all_of(cols) & + dplyr::where(~ is.character(.x) | is.factor(.x)), + ~ special_chars(.x, messiness = messiness) + ) + ) + } + } + return(output) +} + +#' Function to make a character string messy +#' +#' Inserts randomly chosen special characters +#' into strings. +#' @param x Character vector +#' @param messiness Probability of inserting a special character +#' before each character. Must be between 0 and 1. Default 0.1. +#' @return Messy character vector +#' @noRd +special_chars <- function(x, messiness = 0.1) { + # if factor, convert to character + if (is.factor(x)) { + x <- as.character(x) + } + + special_chars_string <- function(s, ...) { + # characters to insert + random_chars <- c( + "!", "@", "#", "$", "%", "^", "&", + "*", "(", ")", "_", "+", "-", "."
+ ) + + # Convert to vector of characters + chars <- strsplit(s, NULL)[[1]] + + # Randomly insert special characters using Reduce + chars <- Reduce(function(acc, char) { + if (stats::runif(1) < messiness) { + char_to_insert <- sample(random_chars, 1) + return(c(acc, char_to_insert, char)) + } else { + return(c(acc, char)) + } + }, chars, init = character(0)) + + # Reassemble the string + return(paste(chars, collapse = "")) + } + + x_messy <- sapply(x, special_chars_string, USE.NAMES = FALSE) + return(x_messy) +} diff --git a/R/change_case.R b/R/change_case.R index 0170186..9a4b346 100644 --- a/R/change_case.R +++ b/R/change_case.R @@ -1,21 +1,29 @@ #' Change case #' #' Randomly switch between title case and lowercase for +#' character strings #' @param data input dataframe #' @param cols set of columns to apply transformation to. If `NULL` #' will apply to all columns. Default `NULL`. #' @param messiness Percentage of values to change. Must be #' between 0 and 1. Default 0.1. +#' @param case_type Whether the case should change based on +#' the `"word"` or `"letter"`. #' @importFrom rlang .data #' @return a dataframe the same size as the input data.
#' @export change_case <- function(data, cols = NULL, - messiness = 0.1) { + messiness = 0.1, + case_type = "word") { if (messiness < 0 || messiness > 1) { stop("'messiness' must be between 0 and 1") } + if (!(case_type %in% c("word", "letter"))) { + stop("'case_type' must be either 'word' or 'letter'") + } + if (is.null(cols)) { output <- data |> dplyr::mutate( @@ -59,3 +67,12 @@ change_case <- function(data, } return(output) } + +# # Randomly change the case of each character using sapply +# chars <- sapply(chars, function(char) { +# if (stats::runif(1) < 0.5) { +# return(toupper(char)) +# } else { +# return(tolower(char)) +# } +# }) diff --git a/R/make_missing.R b/R/make_missing.R index d8d4775..2f1fbd9 100644 --- a/R/make_missing.R +++ b/R/make_missing.R @@ -27,7 +27,7 @@ make_missing <- function(data, dplyr::across( dplyr::everything(), ~ dplyr::case_when( - runif(nrow(data)) <= messiness ~ unlist(sample(missing, 1)), + runif(nrow(data)) <= messiness ~ unlist(resample(missing, 1)), TRUE ~ .x ) ) @@ -42,7 +42,7 @@ make_missing <- function(data, dplyr::across( dplyr::all_of(cols), ~ dplyr::case_when( - runif(nrow(data)) <= messiness ~ unlist(sample(missing, 1)), + runif(nrow(data)) <= messiness ~ unlist(resample(missing, 1)), TRUE ~ .x ) ) diff --git a/R/messy.R b/R/messy.R index 38b838a..e0b9f6e 100644 --- a/R/messy.R +++ b/R/messy.R @@ -8,16 +8,19 @@ #' missing values will be replaced with. If length is greater #' than 1, values will be replaced randomly. #' Default `NA`. +#' @param case_type Whether the case should change based on +#' the `"word"` or `"letter"`. #' @return a dataframe the same size as the input data. 
#' @export - messy <- function(data, messiness = 0.1, - missing = NA) { + missing = NA, + case_type = "word") { output <- data |> + add_special_chars(messiness = messiness) |> add_whitespace(messiness = messiness) |> make_missing(messiness = messiness, missing = missing) |> - change_case(messiness = messiness) + change_case(messiness = messiness, case_type = case_type) return(output) } diff --git a/R/messy_colnames.R b/R/messy_colnames.R index 5ecd021..a74aa7f 100644 --- a/R/messy_colnames.R +++ b/R/messy_colnames.R @@ -10,6 +10,6 @@ #' messy_colnames(mtcars) messy_colnames <- function(data) { # Assign the new column names to the dataframe - names(data) <- make_string_messy(names(data)) + names(data) <- special_chars(names(data)) return(data) } diff --git a/R/messy_strings.R b/R/messy_strings.R deleted file mode 100644 index 842ad9e..0000000 --- a/R/messy_strings.R +++ /dev/null @@ -1,51 +0,0 @@ -#' Make character vector messy -#' -#' Adds special characters and randomly -#' capitalises characters in the provided -#' character vector. -#' -#' @param x string vector to mess up -#' @return string vector that is messed up -#' @export -#' @examples -#' make_string_messy(c("Hello", "world")) -make_string_messy <- function(x) { - sapply(x, messy_string, USE.NAMES = FALSE) -} - -#' Function to make a string messy -#' -#' Adds special characters and randomly -#' capitalises strings. 
-#' @param s string to mess up -#' @return messy string -#' @importFrom stats runif -#' @noRd -messy_string <- function(s) { - random_chars <- c("!", "@", "#", "$", "%", "^", "&", "*", "(", ")", "_", "+", "-", ".") - - # Convert to vector of characters - chars <- strsplit(s, NULL)[[1]] - - # Randomly change the case of each character using sapply - chars <- sapply(chars, function(char) { - if (runif(1) < 0.5) { - return(toupper(char)) - } else { - return(tolower(char)) - } - }) - - # Randomly insert special characters using lapply - chars <- Reduce(function(acc, char) { - if (runif(1) < 0.2) { - char_to_insert <- sample(random_chars, 1) - return(c(acc, char_to_insert, char)) - } else { - return(c(acc, char)) - } - }, chars, init = character(0)) - - # Reassemble the string - return(paste(chars, collapse = "")) -} diff --git a/R/utils.R b/R/utils.R new file mode 100644 index 0000000..4b1e027 --- /dev/null +++ b/R/utils.R @@ -0,0 +1,7 @@ +#' Resample +#' +#' Resamples x to a specific size +#' @param x a vector of one or more elements from which to choose. +#' @return a vector of length `size` with elements drawn from x +#' @noRd +resample <- function(x, ...)
x[sample.int(length(x), ...)] diff --git a/README.md b/README.md index e2cb407..e824baa 100644 --- a/README.md +++ b/README.md @@ -24,17 +24,17 @@ messy(ToothGrowth[1:10,]) ``` ```r - len supp dose -1 4.2 vc 0.5 -2 11.5 VC 0.5 -3 7.3 VC 0.5 -4 5.8 VC 0.5 -5 6.4 VC 0.5 -6 10 VC 0.5 -7 11.2 VC 0.5 -8 11.2 VC 0.5 -9 5.2 VC 0.5 -10 7 + len supp dose +1 4.2 VC 0.5 +2 11.5 +3 7.3 VC 0.5 +4 5.8 (VC 0.5 +5 6.4 VC +6 10 VC 0.5 +7 11.2 0.5 +8 11.2 VC 0.5 +9 5.2 VC 0.5 +10 7 VC 0.5 ``` Increase how *messy* the data is: @@ -45,17 +45,17 @@ messy(ToothGrowth[1:10,], messiness = 0.7) ``` ```r - len supp dose -1 0.5 -2 -3 -4 -5 -6 10 0.5 -7 -8 0.5 -9 5.2 VC 0.5 -10 7 + len supp dose +1 +2 11.5 +3 +4 5.8 +5 .v*c +6 +7 +8 0.5 +9 v@c +10 ``` ### `add_whitespace()` @@ -125,6 +125,29 @@ change_case(ToothGrowth[1:10,], messiness = 0.5) 10 7.0 VC 0.5 ``` +### `add_special_chars()` + +Randomly add special characters to character strings: + +```r +set.seed(1234) +add_special_chars(ToothGrowth[1:10,]) +``` + +```r + len supp dose +1 4.2 VC 0.5 +2 11.5 VC 0.5 +3 7.3 VC 0.5 +4 5.8 (VC 0.5 +5 6.4 VC 0.5 +6 10.0 VC 0.5 +7 11.2 VC 0.5 +8 11.2 VC 0.5 +9 5.2 VC 0.5 +10 7.0 VC 0.5 +``` + ### `make_missing()` Randomly make some values missing using `NA`: @@ -178,7 +201,8 @@ set.seed(1234) ToothGrowth[1:10,] |> make_missing(cols = "supp", missing = " ") |> make_missing(cols = c("len", "dose"), missing = c(NA, 999)) |> - add_whitespace(cols = "supp", messiness = 0.5) + add_whitespace(cols = "supp", messiness = 0.5) |> + add_special_chars(cols = "supp") ``` ```r @@ -186,11 +210,11 @@ ToothGrowth[1:10,] |> 1 4.2 VC 0.5 2 11.5 VC NA 3 7.3 VC 0.5 -4 5.8 VC 0.5 +4 5.8 *VC 0.5 5 6.4 VC 0.5 6 10.0 VC 0.5 7 11.2 0.5 -8 11.2 VC NA -9 5.2 VC 0.5 -10 7.0 VC 0.5 +8 11.2 V#C NA +9 5.2 !VC 0.5 +10 7.0 VC* 0.5 ``` diff --git a/man/add_special_chars.Rd b/man/add_special_chars.Rd new file mode 100644 index 0000000..fa4b4a7 --- /dev/null +++ b/man/add_special_chars.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not 
edit by hand +% Please edit documentation in R/add_special_chars.R +\name{add_special_chars} +\alias{add_special_chars} +\title{Add special characters to strings} +\usage{ +add_special_chars(data, cols = NULL, messiness = 0.1) +} +\arguments{ +\item{data}{input dataframe} + +\item{cols}{set of columns to apply transformation to. If \code{NULL} +will apply to all columns. Default \code{NULL}.} + +\item{messiness}{Percentage of values to change. Must be +between 0 and 1. Default 0.1.} +} +\value{ +a dataframe the same size as the input data. +} +\description{ +Add special characters to strings +} +\examples{ +add_special_chars(mtcars) +} diff --git a/man/change_case.Rd b/man/change_case.Rd index e126b3f..3752fa9 100644 --- a/man/change_case.Rd +++ b/man/change_case.Rd @@ -4,7 +4,7 @@ \alias{change_case} \title{Change case} \usage{ -change_case(data, cols = NULL, messiness = 0.1) +change_case(data, cols = NULL, messiness = 0.1, case_type = "word") } \arguments{ \item{data}{input dataframe} @@ -14,10 +14,14 @@ will apply to all columns. Default \code{NULL}.} \item{messiness}{Percentage of values to change. Must be between 0 and 1. Default 0.1.} + +\item{case_type}{Whether the case should change based on +the \code{"word"} or \code{"letter"}.} } \value{ a dataframe the same size as the input data. 
} \description{ Randomly switch between title case and lowercase for +character strings } diff --git a/man/make_string_messy.Rd b/man/make_string_messy.Rd deleted file mode 100644 index a8ffc35..0000000 --- a/man/make_string_messy.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/messy_strings.R -\name{make_string_messy} -\alias{make_string_messy} -\title{Make character vector messy} -\usage{ -make_string_messy(x) -} -\arguments{ -\item{x}{string vector to mess up} -} -\value{ -string vector that is messed up -} -\description{ -Adds special characters and randomly -capitalises characters in the provided -character vector. -} -\examples{ -make_string_messy(c("Hello", "world")) -} diff --git a/man/messy.Rd b/man/messy.Rd index 873e9ad..ae5c341 100644 --- a/man/messy.Rd +++ b/man/messy.Rd @@ -4,7 +4,7 @@ \alias{messy} \title{Messy} \usage{ -messy(data, messiness = 0.1, missing = NA) +messy(data, messiness = 0.1, missing = NA, case_type = "word") } \arguments{ \item{data}{input dataframe} @@ -16,6 +16,9 @@ between 0 and 1. Default 0.1.} missing values will be replaced with. If length is greater than 1, values will be replaced randomly. Default \code{NA}.} + +\item{case_type}{Whether the case should change based on +the \code{"word"} or \code{"letter"}.} } \value{ a dataframe the same size as the input data.