#' Alter separators
#'
#' Randomly alter the separators (spaces and underscores) in character strings
#' through duplication or replacement.
#'
#' A random subset of the values in each targeted column is selected; the
#' subset size is `round(n * messiness)`, where `n` is the column length. Each
#' selected value receives one alteration, drawn at random from those that
#' would actually change it:
#'
#' * every space is doubled,
#' * every space is replaced with an underscore,
#' * every underscore is replaced with a space.
#'
#' Selected values that contain neither separator (including `NA`) are left
#' as they are, so fewer values than requested may end up changed.
#'
#' @param data input dataframe
#' @param cols set of columns to apply transformation to. If `NULL`
#' will apply to all character columns. Columns named here that are not
#' character vectors are returned unchanged. Default `NULL`.
#' @param messiness Proportion of values to change. Must be
#' between 0 and 1. Default 0.1.
#' @return a dataframe the same size as the input data.
#' @export
#' @examples
#' players <- data.frame(
#'   Player = c(
#'     "Rey McSriff", "Kevin Nogilny", "Mike Sernandez", "Glenallen Mixon"
#'   ),
#'   Rating = c("B_3", "A_4", "C_2", "C_3"),
#'   State = c("Arizona", "New Mexico", "North Carolina", "Texas"),
#'   Season = c(2001L, 2002L, 2000L, 2002L)
#' )
#' alter_separators(players)
alter_separators <- function(data, cols = NULL, messiness = 0.1) {
  valid_messiness <- is.numeric(messiness) && length(messiness) == 1L &&
    !is.na(messiness) && messiness >= 0 && messiness <= 1
  if (!valid_messiness) {
    stop("'messiness' must be between 0 and 1")
  }
  if (!is.null(cols) && !all(cols %in% colnames(data))) {
    stop("All elements of 'cols' must be a column name in 'data'")
  }

  # Candidate alterations. `fixed = TRUE` because the patterns are literal
  # characters, not regular expressions.
  alterations <- list(
    function(s) gsub(" ", "  ", s, fixed = TRUE), # duplicate spaces
    function(s) gsub(" ", "_", s, fixed = TRUE), # spaces -> underscores
    function(s) gsub("_", " ", s, fixed = TRUE) # underscores -> spaces
  )

  # Alter a single string: apply one randomly chosen alteration among those
  # that change the value; return the value untouched when none does.
  alter_value <- function(s) {
    candidates <- vapply(alterations, function(f) f(s), character(1))
    candidates <- candidates[!is.na(candidates) & candidates != s]
    if (length(candidates) == 0L) {
      return(s)
    }
    candidates[[sample.int(length(candidates), 1L)]]
  }

  # Alter a random subset of one column.
  alter_column <- function(x) {
    # Separators only exist in strings; leaving other types alone avoids
    # silently coercing numeric columns to character or corrupting factors.
    if (!is.character(x)) {
      return(x)
    }
    n <- length(x)
    picked <- sample.int(n, size = round(n * messiness))
    # vapply() (not sapply()) so that an empty selection yields character(0);
    # assigning sapply()'s empty list() would turn the column into a list.
    x[picked] <- vapply(
      x[picked], alter_value, character(1),
      USE.NAMES = FALSE
    )
    x
  }

  if (is.null(cols)) {
    dplyr::mutate(
      data,
      dplyr::across(dplyr::where(is.character), alter_column)
    )
  } else {
    dplyr::mutate(
      data,
      dplyr::across(dplyr::all_of(cols), alter_column)
    )
  }
}
# Tests for alter_separators().
#
# Every string in X1 and X2 contains exactly one separator (a space or an
# underscore), so any value selected for alteration is guaranteed to change.
# X3 is an integer column that must never be touched.
make_separator_df <- function() {
  data.frame(
    X1 = c("a b", "c_d", "e f", "g_h", "i j"),
    X2 = c("k l", "m n", "o_p", "q r", "s_t"),
    X3 = 1:5,
    stringsAsFactors = FALSE
  )
}

# Number of values that differ between the character columns of two frames.
count_altered <- function(before, after) {
  sum(after$X1 != before$X1) + sum(after$X2 != before$X2)
}

# Elementwise check that each value shows one of the supported alterations:
# a doubled space, an underscore introduced, or an underscore removed.
shows_alteration <- function(before, after) {
  had_underscore <- grepl("_", before, fixed = TRUE)
  has_underscore <- grepl("_", after, fixed = TRUE)
  grepl("  ", after, fixed = TRUE) |
    (has_underscore & !had_underscore) |
    (!has_underscore & had_underscore)
}

test_that("alter_separators alters every string when messiness is 1", {
  test_df <- make_separator_df()
  result_df <- alter_separators(test_df, messiness = 1)

  expect_true(all(result_df$X1 != test_df$X1))
  expect_true(all(result_df$X2 != test_df$X2))
  expect_identical(result_df$X3, test_df$X3)
})

test_that("messiness controls the share of values altered", {
  test_df <- make_separator_df()

  # 20% of 10 values across both character columns
  result_low <- alter_separators(test_df, messiness = 0.2)
  expect_lte(count_altered(test_df, result_low), 2)

  # 80% of 10 values, with some slack
  result_high <- alter_separators(test_df, messiness = 0.8)
  expect_gte(count_altered(test_df, result_high), 6)
})

test_that("cols restricts the transformation to the named columns", {
  test_df <- make_separator_df()
  result_x1 <- alter_separators(test_df, cols = "X1", messiness = 1)

  expect_true(all(result_x1$X1 != test_df$X1))
  expect_identical(result_x1$X2, test_df$X2)
  expect_identical(result_x1$X3, test_df$X3)
})

test_that("alter_separators rejects invalid arguments", {
  test_df <- make_separator_df()

  # messiness must be a value between 0 and 1
  expect_error(alter_separators(test_df, messiness = -0.01))
  expect_error(alter_separators(test_df, messiness = 1.5))

  # cols must name existing columns
  expect_error(alter_separators(test_df, cols = "test_col3"))
})

test_that("altered strings show a supported separator change", {
  test_df <- make_separator_df()
  result <- alter_separators(test_df, messiness = 1)

  expect_true(all(shows_alteration(test_df$X1, result$X1)))
  expect_true(all(shows_alteration(test_df$X2, result$X2)))
})

test_that("strings without separators remain unchanged", {
  test_df_noseparators <- data.frame(X1 = c("ab", "cd", "ef"))
  result_noseparators <- alter_separators(test_df_noseparators, messiness = 1)

  expect_identical(result_noseparators, test_df_noseparators)
})