nrennie · luisDVA · Dec 5, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -21,7 +21,7 @@ Suggests:
 Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.2.9000
 Config/Needs/website: nrennie/nrenniepkgdown
 URL: https://nrennie.rbind.io/messy/, https://github.com/nrennie/messy
 BugReports: https://github.com/nrennie/messy/issues
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(add_special_chars)
 export(add_whitespace)
+export(alter_separators)
 export(change_case)
 export(duplicate_rows)
 export(make_missing)

diff --git a/R/alter_separators.R b/R/alter_separators.R
@@ -0,0 +1,87 @@
+#' Alter separators
+#'
+#' Randomly alter the separators (spaces and underscores) in character strings
+#' through duplication or replacement.
+#'
+#' For each column, `round(n * messiness)` of its `n` values are selected at
+#' random. Each selected value has one of the following changes applied, chosen
+#' at random from those that would actually modify it: spaces doubled, spaces
+#' replaced with underscores, or underscores replaced with spaces. Selected
+#' values containing neither separator are left unchanged.
+#'
+#' @param data input dataframe
+#' @param cols set of columns to apply transformation to. If `NULL`
+#' will apply to all character columns. Default `NULL`.
+#' @param messiness Proportion of values to change. Must be a single number
+#' between 0 and 1. Default 0.1.
+#' @return a dataframe the same size as the input data.
+#' @export
+#' @examples
+#' players <- data.frame(
+#'   Player = c("Rey McSriff", "Kevin Nogilny", "Mike Sernandez", "Glenallen Mixon"),
+#'   Rating = c("B_3", "A_4", "C_2", "C_3"),
+#'   State = c("Arizona", "New Mexico", "North Carolina", "Texas"),
+#'   Season = c(2001L, 2002L, 2000L, 2002L)
+#' )
+#' alter_separators(players)
+alter_separators <- function(data, cols = NULL, messiness = 0.1) {
+  # Reject NA, non-numeric and non-scalar values up front so the range check
+  # always operates on a single comparable number.
+  valid_messiness <- is.numeric(messiness) && length(messiness) == 1 &&
+    !is.na(messiness) && messiness >= 0 && messiness <= 1
+  if (!valid_messiness) {
+    stop("'messiness' must be between 0 and 1")
+  }
+
+  # Candidate edits; each maps a string to its altered form.
+  alterations <- list(
+    function(s) gsub(" ", "  ", s), # duplicate spaces
+    function(s) gsub(" ", "_", s), # replace spaces with underscores
+    function(s) gsub("_", " ", s) # replace underscores with spaces
+  )
+
+  # Apply one randomly chosen alteration that actually changes the value.
+  alter_one <- function(s) {
+    has_effect <- vapply(
+      alterations,
+      function(f) !identical(f(s), s),
+      logical(1)
+    )
+    applicable <- alterations[has_effect]
+    if (length(applicable) == 0) {
+      # No separator to alter (or the value is NA): leave it untouched.
+      return(s)
+    }
+    chosen <- sample(applicable, 1)[[1]]
+    chosen(s)
+  }
+
+  # Alter a random subset of the values in one column.
+  alter_sep <- function(x) {
+    n <- length(x)
+    to_alter <- sample(n, size = round(n * messiness), replace = FALSE)
+    if (length(to_alter) == 0) {
+      # Nothing was selected. Return early: assigning a zero-length list
+      # (what sapply() yields for empty input) would turn `x` into a list.
+      return(x)
+    }
+    x[to_alter] <- vapply(
+      x[to_alter], alter_one, character(1),
+      USE.NAMES = FALSE
+    )
+    x
+  }
+
+  if (is.null(cols)) {
+    output <- data |>
+      dplyr::mutate(dplyr::across(dplyr::where(is.character), \(x) alter_sep(x)))
+  } else {
+    # Fail early on unknown column names.
+    if (!all(cols %in% colnames(data))) {
+      stop("All elements of 'cols' must be a column name in 'data'")
+    }
+    output <- data |>
+      dplyr::mutate(dplyr::across(dplyr::all_of(cols), \(x) alter_sep(x)))
+  }
+  output
+}
diff --git a/R/messy.R b/R/messy.R
@@ -23,6 +23,7 @@ messy <- function(data,
     add_special_chars(messiness = messiness) |>
     add_whitespace(messiness = messiness) |>
     make_missing(messiness = messiness, missing = missing) |>
-    change_case(messiness = messiness, case_type = case_type)
+    change_case(messiness = messiness, case_type = case_type) |>
+    alter_separators(messiness = messiness)
   return(output)
 }
diff --git a/man/alter_separators.Rd b/man/alter_separators.Rd
diff --git a/tests/testthat/test-alter_separators.R b/tests/testthat/test-alter_separators.R
@@ -0,0 +1,59 @@
+test_that("alter_separators works ", {
+  # Test data: every string in X1 and X2 contains exactly one separator
+  test_df <- data.frame(
+    X1 = c("a b", "c_d", "e f", "g_h", "i j"),
+    X2 = c("k l", "m n", "o_p", "q r", "s_t"),
+    X3 = 1:5,
+    stringsAsFactors = FALSE
+  )
+
+  # strings get altered
+  result_df <- alter_separators(test_df, messiness = 1)
+  expect_true(all(result_df$X1 != test_df$X1 | result_df$X2 != test_df$X2))
+  expect_identical(result_df$X3, test_df$X3)
+
+  # messiness determines % of rows altered
+  result_low <- alter_separators(test_df, messiness = 0.2)
+  altered_count_low <- sum(result_low$X1 != test_df$X1) +
+    sum(result_low$X2 != test_df$X2)
+  expect_true(altered_count_low <= 2) # 20% of 10 values across both cols
+
+  result_high <- alter_separators(test_df, messiness = 0.8)
+  altered_count_high <- sum(result_high$X1 != test_df$X1) +
+    sum(result_high$X2 != test_df$X2)
+  expect_true(altered_count_high >= 6) # 80% of 10 values
+
+  # when cols argument is used
+  result_X1 <- alter_separators(test_df, cols = "X1", messiness = 1)
+  expect_true(all(result_X1$X1 != test_df$X1))
+  expect_identical(result_X1$X2, test_df$X2)
+  expect_identical(result_X1$X3, test_df$X3)
+
+  # messiness must be a value between 0-1
+  expect_error(alter_separators(test_df, messiness = -0.01))
+  expect_error(alter_separators(test_df, messiness = 1.5))
+
+  # invalid column names for cols
+  expect_error(alter_separators(test_df, cols = "test_col3"))
+
+  # alterations applied?
+  result <- alter_separators(test_df, messiness = 1)
+  for (i in seq_len(nrow(test_df))) {
+    # were spaces duplicated or replaced, or were underscores replaced with " "
+    expect_true(
+      grepl("  ", result$X1[i]) ||
+        (grepl("_", result$X1[i]) && !grepl("_", test_df$X1[i])) ||
+        (!grepl("_", result$X1[i]) && grepl("_", test_df$X1[i]))
+    )
+    expect_true(
+      grepl("  ", result$X2[i]) ||
+        (grepl("_", result$X2[i]) && !grepl("_", test_df$X2[i])) ||
+        (!grepl("_", result$X2[i]) && grepl("_", test_df$X2[i]))
+    )
+  }
+
+  # when strings remain unchanged
+  test_df_noseparators <- data.frame(X1 = c("ab", "cd", "ef"))
+  result_noseparators <- alter_separators(test_df_noseparators, messiness = 1)
+  expect_identical(result_noseparators, test_df_noseparators)
+})