fix special chars and missing bugs

nrennie · Nov 17, 2024 · c55d2c1 · c55d2c1
1 parent 45896c8
commit c55d2c1
Show file tree

Hide file tree

Showing 13 changed files with 207 additions and 111 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,10 +1,9 @@
 # Generated by roxygen2: do not edit by hand
 
+export(add_special_chars)
 export(add_whitespace)
 export(change_case)
 export(make_missing)
-export(make_string_messy)
 export(messy)
 export(messy_colnames)
 importFrom(rlang,.data)
-importFrom(stats,runif)
diff --git a/R/add_special_chars.R b/R/add_special_chars.R
@@ -0,0 +1,86 @@
+#' Add special characters to strings
+#'
+#' @param data input dataframe
+#' @param cols set of columns to apply transformation to. If `NULL`
+#' will apply to all columns. Default `NULL`.
+#' @param messiness Percentage of values to change. Must be
+#' between 0 and 1. Default 0.1.
+#' @importFrom rlang .data
+#' @return a dataframe the same size as the input data.
+#' @export
+#' @examples
+#' add_special_chars(mtcars)
+add_special_chars <- function(data,
+                              cols = NULL,
+                              messiness = 0.1) {
+  # Guard clauses: reject invalid arguments before touching the data
+  if (messiness < 0 || messiness > 1) {
+    stop("'messiness' must be between 0 and 1")
+  }
+  if (!is.null(cols) && !all(cols %in% colnames(data))) {
+    stop("All elements of 'cols' must be a column name in 'data'")
+  }
+
+  # Only character and factor columns are ever transformed
+  is_text <- function(column) is.character(column) | is.factor(column)
+
+  if (is.null(cols)) {
+    # No columns requested: apply to every character/factor column
+    messy_data <- dplyr::mutate(
+      data,
+      dplyr::across(
+        dplyr::where(is_text),
+        ~ special_chars(.x, messiness = messiness)
+      )
+    )
+  } else {
+    # Restrict to the requested columns that are also character/factor
+    messy_data <- dplyr::mutate(
+      data,
+      dplyr::across(
+        dplyr::all_of(cols) & dplyr::where(is_text),
+        ~ special_chars(.x, messiness = messiness)
+      )
+    )
+  }
+  return(messy_data)
+}
+
+#' Insert random special characters into character strings
+#'
+#' For every character of every string, a random special character is
+#' inserted in front of it with probability `messiness`. Factors are
+#' converted to character first. Missing values are returned as `NA`
+#' rather than being turned into the literal string `"NA"`.
+#' @param x Character vector (or factor)
+#' @param messiness Probability, per character, of inserting a special
+#' character before it. Must be between 0 and 1. Default 0.1.
+#' @return Messy character vector the same length as `x`
+#' @noRd
+special_chars <- function(x, messiness = 0.1) {
+  # if factor, convert to character
+  if (is.factor(x)) {
+    x <- as.character(x)
+  }
+
+  # characters to insert
+  random_chars <- c(
+    "!", "@", "#", "$", "%", "^", "&",
+    "*", "(", ")", "_", "+", "-", "."
+  )
+
+  special_chars_string <- function(s) {
+    # Keep missing values missing: paste() would otherwise turn NA
+    # into the string "NA"
+    if (is.na(s)) {
+      return(NA_character_)
+    }
+
+    # Convert to vector of single characters
+    chars <- strsplit(s, NULL)[[1]]
+
+    # For each character, in order, decide whether to put a random
+    # special character in front of it. Random draws happen one
+    # character at a time so seeded results stay reproducible.
+    pieces <- lapply(chars, function(char) {
+      if (stats::runif(1) < messiness) {
+        c(sample(random_chars, 1), char)
+      } else {
+        char
+      }
+    })
+
+    # Reassemble the string (an empty string stays empty)
+    paste(unlist(pieces), collapse = "")
+  }
+
+  # vapply guarantees a character vector, including for empty input
+  vapply(x, special_chars_string, character(1), USE.NAMES = FALSE)
+}
diff --git a/R/change_case.R b/R/change_case.R
@@ -1,21 +1,29 @@
 #' Change case
 #'
 #' Randomly switch between title case and lowercase for
+#' character strings
 #' @param data input dataframe
 #' @param cols set of columns to apply transformation to. If `NULL`
 #' will apply to all columns. Default `NULL`.
 #' @param messiness Percentage of values to change. Must be
 #' between 0 and 1. Default 0.1.
+#' @param case_type Whether to change case at the `"word"` or
+#' `"letter"` level. Default `"word"`.
 #' @importFrom rlang .data
 #' @return a dataframe the same size as the input data.
 #' @export
 
 change_case <- function(data,
                         cols = NULL,
-                        messiness = 0.1) {
+                        messiness = 0.1,
+                        case_type = "word") {
   if (messiness < 0 || messiness > 1) {
     stop("'messiness' must be between 0 and 1")
   }
+  if (!(case_type %in% c("word", "letter"))) {
+    stop("'case_type' must be either 'word' or 'letter'")
+  }
+
   if (is.null(cols)) {
     output <- data |>
       dplyr::mutate(
@@ -59,3 +67,12 @@ change_case <- function(data,
   }
   return(output)
 }
+
+# # Randomly change the case of each character using sapply
+# chars <- sapply(chars, function(char) {
+#   if (stats::runif(1) < 0.5) {
+#     return(toupper(char))
+#   } else {
+#     return(tolower(char))
+#   }
+# })
diff --git a/R/make_missing.R b/R/make_missing.R
@@ -27,7 +27,7 @@ make_missing <- function(data,
         dplyr::across(
           dplyr::everything(),
           ~ dplyr::case_when(
-            runif(nrow(data)) <= messiness ~ unlist(sample(missing, 1)),
+            runif(nrow(data)) <= messiness ~ unlist(resample(missing, 1)),
             TRUE ~ .x
           )
         )
@@ -42,7 +42,7 @@ make_missing <- function(data,
           dplyr::across(
             dplyr::all_of(cols),
             ~ dplyr::case_when(
-              runif(nrow(data)) <= messiness ~ unlist(sample(missing, 1)),
+              runif(nrow(data)) <= messiness ~ unlist(resample(missing, 1)),
               TRUE ~ .x
             )
           )

diff --git a/R/messy.R b/R/messy.R
@@ -8,16 +8,19 @@
 #' missing values will be replaced with. If length is greater
 #' than 1, values will be replaced randomly.
 #' Default `NA`.
+#' @param case_type Whether to change case at the `"word"` or
+#' `"letter"` level. Default `"word"`.
 #' @return a dataframe the same size as the input data.
 #' @export
 
-
 messy <- function(data,
                   messiness = 0.1,
-                  missing = NA) {
+                  missing = NA,
+                  case_type = "word") {
   output <- data |>
+    add_special_chars(messiness = messiness) |>
     add_whitespace(messiness = messiness) |>
     make_missing(messiness = messiness, missing = missing) |>
-    change_case(messiness = messiness)
+    change_case(messiness = messiness, case_type = case_type)
   return(output)
 }
diff --git a/R/messy_colnames.R b/R/messy_colnames.R
@@ -10,6 +10,6 @@
 #' messy_colnames(mtcars)
 messy_colnames <- function(data) {
   # Assign the new column names to the dataframe
-  names(data) <- make_string_messy(names(data))
+  names(data) <- special_chars(names(data))
   return(data)
 }
diff --git a/R/messy_strings.R b/R/messy_strings.R
diff --git a/R/utils.R b/R/utils.R
@@ -0,0 +1,7 @@
+#' Resample
+#'
+#' Draws elements of `x` by sampling its positions with `sample.int()`
+#' and subsetting `x` by the result.
+#' @param x a vector of one or more elements from which to choose.
+#' @param ... further arguments passed on to `sample.int()`.
+#' @return the elements of `x` at the sampled positions.
+#' @noRd
+resample <- function(x, ...) {
+  picked <- sample.int(length(x), ...)
+  x[picked]
+}
diff --git a/README.md b/README.md
@@ -24,17 +24,17 @@ messy(ToothGrowth[1:10,])
 ```
 
 ```r
-     len supp dose
-1    4.2   vc  0.5
-2   11.5   VC  0.5
-3    7.3   VC  0.5
-4    5.8   VC 0.5 
-5    6.4   VC  0.5
-6     10   VC  0.5
-7  11.2    VC  0.5
-8   11.2   VC  0.5
-9    5.2   VC  0.5
-10     7 <NA> <NA>
+    len supp dose
+1   4.2   VC  0.5
+2  11.5 <NA> <NA>
+3  7.3    VC  0.5
+4   5.8  (VC  0.5
+5   6.4   VC <NA>
+6    10   VC  0.5
+7  11.2 <NA>  0.5
+8  11.2   VC  0.5
+9  5.2    VC  0.5
+10    7   VC 0.5 
 ```
 
 Increase how *messy* the data is:
@@ -45,17 +45,17 @@ messy(ToothGrowth[1:10,], messiness = 0.7)
 ```
 
 ```r
-    len supp dose
-1  <NA> <NA> 0.5 
-2  <NA> <NA> <NA>
-3  <NA> <NA> <NA>
-4  <NA> <NA> <NA>
-5  <NA> <NA> <NA>
-6   10  <NA>  0.5
-7  <NA> <NA> <NA>
-8  <NA> <NA>  0.5
-9  5.2   VC   0.5
-10   7  <NA> <NA>
+     len  supp dose
+1   <NA>  <NA> <NA>
+2  11.5   <NA> <NA>
+3   <NA>  <NA> <NA>
+4   5.8   <NA> <NA>
+5   <NA> .v*c  <NA>
+6   <NA>  <NA> <NA>
+7   <NA>  <NA> <NA>
+8   <NA>  <NA> 0.5 
+9   <NA>  v@c  <NA>
+10  <NA>  <NA> <NA>
 ```
 
 ### `add_whitespace()`
@@ -125,6 +125,29 @@ change_case(ToothGrowth[1:10,], messiness = 0.5)
 10  7.0   VC  0.5
 ```
 
+### `add_special_chars()`
+
+Randomly add special characters to character strings:
+
+```r
+set.seed(1234)
+add_special_chars(ToothGrowth[1:10,])
+```
+
+```r
+    len supp dose
+1   4.2   VC  0.5
+2  11.5   VC  0.5
+3   7.3   VC  0.5
+4   5.8  (VC  0.5
+5   6.4   VC  0.5
+6  10.0   VC  0.5
+7  11.2   VC  0.5
+8  11.2   VC  0.5
+9   5.2   VC  0.5
+10  7.0   VC  0.5
+```
+
 ### `make_missing()`
 
 Randomly make some values missing using `NA`:
@@ -178,19 +201,20 @@ set.seed(1234)
 ToothGrowth[1:10,] |> 
   make_missing(cols = "supp", missing = " ") |> 
   make_missing(cols = c("len", "dose"), missing = c(NA, 999)) |> 
-  add_whitespace(cols = "supp", messiness = 0.5)
+  add_whitespace(cols = "supp", messiness = 0.5) |> 
+  add_special_chars(cols = "supp")
 ```
 
 ```r
     len supp dose
 1   4.2   VC  0.5
 2  11.5  VC    NA
 3   7.3   VC  0.5
-4   5.8  VC   0.5
+4   5.8 *VC   0.5
 5   6.4  VC   0.5
 6  10.0   VC  0.5
 7  11.2       0.5
-8  11.2   VC   NA
-9   5.2   VC  0.5
-10  7.0  VC   0.5
+8  11.2  V#C   NA
+9   5.2  !VC  0.5
+10  7.0 VC*   0.5
 ```
diff --git a/man/add_special_chars.Rd b/man/add_special_chars.Rd