Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add function for messing up the separators in strings #14

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Suggests:
Config/testthat/edition: 3
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
RoxygenNote: 7.3.2.9000
Config/Needs/website: nrennie/nrenniepkgdown
URL: https://nrennie.rbind.io/messy/, https://github.com/nrennie/messy
BugReports: https://github.com/nrennie/messy/issues
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

export(add_special_chars)
export(add_whitespace)
export(alter_separators)
export(change_case)
export(duplicate_rows)
export(make_missing)
Expand Down
65 changes: 65 additions & 0 deletions R/alter_separators.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#' Alter separators
#'
#' Randomly alter the separators (spaces and underscores) in character strings
#' through duplication or replacement. For each selected value, one of the
#' following alterations is chosen at random from those that would actually
#' change the value: doubling every space, replacing spaces with underscores,
#' or replacing underscores with spaces. Values without any separator (and
#' missing values) are left untouched.
#'
#' @param data input dataframe
#' @param cols set of columns to apply transformation to. If `NULL`
#' will apply to all character columns. Non-character columns named in
#' `cols` are returned unchanged. Default `NULL`.
#' @param messiness Percentage of values to change. Must be
#' between 0 and 1. Default 0.1.
#' @return a dataframe the same size as the input data.
#' @export
#' @examples
#' players <- data.frame(
#'   Player = c(
#'     "Rey McSriff", "Kevin Nogilny", "Mike Sernandez", "Glenallen Mixon"
#'   ),
#'   Rating = c("B_3", "A_4", "C_2", "C_3"),
#'   State = c("Arizona", "New Mexico", "North Carolina", "Texas"),
#'   Season = c(2001L, 2002L, 2000L, 2002L)
#' )
#' alter_separators(players)
alter_separators <- function(data, cols = NULL, messiness = 0.1) {
  # Reject NA / non-numeric / non-scalar input with the same message as an
  # out-of-range value, instead of a cryptic error from `if`.
  if (!is.numeric(messiness) || length(messiness) != 1L ||
    is.na(messiness) || messiness < 0 || messiness > 1) {
    stop("'messiness' must be between 0 and 1")
  }

  alterations <- list(
    function(s) gsub(" ", "  ", s, fixed = TRUE), # duplicate spaces
    function(s) gsub(" ", "_", s, fixed = TRUE), # spaces -> underscores
    function(s) gsub("_", " ", s, fixed = TRUE) # underscores -> spaces
  )

  # Alter a single string: pick uniformly among the alterations that change it.
  alter_one <- function(s) {
    candidates <- vapply(alterations, function(f) f(s), character(1))
    # Enforce a change: drop results equal to the input (and NA for NA input).
    candidates <- candidates[!is.na(candidates) & candidates != s]
    if (length(candidates) == 0L) {
      return(s)
    }
    candidates[[sample.int(length(candidates), 1L)]]
  }

  # Alter a random `messiness` share of the values in one column.
  alter_sep <- function(x) {
    # gsub() would coerce numbers/factors to character; leave them alone.
    if (!is.character(x)) {
      return(x)
    }
    n <- length(x)
    n_alter <- round(n * messiness)
    if (n_alter == 0) {
      return(x)
    }
    to_alter <- sample.int(n, size = n_alter, replace = FALSE)
    # vapply() keeps the column a character vector (sapply() may return a
    # list, which would turn the column into a list column on assignment).
    x[to_alter] <- vapply(
      x[to_alter], alter_one, character(1),
      USE.NAMES = FALSE
    )
    x
  }

  if (is.null(cols)) {
    output <- data |>
      dplyr::mutate(dplyr::across(dplyr::where(is.character), alter_sep))
  } else {
    # are cols present
    if (!all(cols %in% colnames(data))) {
      stop("All elements of 'cols' must be a column name in 'data'")
    }
    output <- data |>
      dplyr::mutate(dplyr::across(dplyr::all_of(cols), alter_sep))
  }
  output
}
3 changes: 2 additions & 1 deletion R/messy.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ messy <- function(data,
add_special_chars(messiness = messiness) |>
add_whitespace(messiness = messiness) |>
make_missing(messiness = messiness, missing = missing) |>
change_case(messiness = messiness, case_type = case_type)
change_case(messiness = messiness, case_type = case_type) |>
alter_separators(messiness = messiness)
return(output)
}
33 changes: 33 additions & 0 deletions man/alter_separators.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 57 additions & 0 deletions tests/testthat/test-alter_separators.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
test_that("alter_separators works", {
  # Test data: every value in X1 and X2 contains exactly one separator, so
  # each selected value is guaranteed to have an applicable alteration.
  test_df <- data.frame(
    X1 = c("a b", "c_d", "e f", "g_h", "i j"),
    X2 = c("k l", "m n", "o_p", "q r", "s_t"),
    X3 = 1:5,
    stringsAsFactors = FALSE
  )

  # strings get altered: with messiness = 1 every value in both character
  # columns must change, and the integer column must be untouched
  result_df <- alter_separators(test_df, messiness = 1)
  expect_true(all(result_df$X1 != test_df$X1))
  expect_true(all(result_df$X2 != test_df$X2))
  expect_identical(result_df$X3, test_df$X3)

  # messiness determines % of rows altered
  result_low <- alter_separators(test_df, messiness = 0.2)
  altered_count_low <- sum(result_low$X1 != test_df$X1) +
    sum(result_low$X2 != test_df$X2)
  expect_true(altered_count_low <= 2) # 2 or 20% of 10 values for both cols

  result_high <- alter_separators(test_df, messiness = 0.8)
  altered_count_high <- sum(result_high$X1 != test_df$X1) +
    sum(result_high$X2 != test_df$X2)
  expect_true(altered_count_high >= 6) # 80% of 10 values

  # when cols argument is used
  result_X1 <- alter_separators(test_df, cols = "X1", messiness = 1)
  expect_true(all(result_X1$X1 != test_df$X1))
  expect_identical(result_X1$X2, test_df$X2)
  expect_identical(result_X1$X3, test_df$X3)

  # messiness must be a value between 0-1
  expect_error(alter_separators(test_df, messiness = -0.01))
  expect_error(alter_separators(test_df, messiness = 1.5))

  # invalid column names for cols
  expect_error(alter_separators(test_df, cols = "test_col3"))

  # alterations applied?
  # A value counts as altered when its spaces were duplicated, or when an
  # underscore appeared or disappeared relative to the input.
  separator_altered <- function(new, old) {
    grepl("  ", new, fixed = TRUE) ||
      (grepl("_", new, fixed = TRUE) && !grepl("_", old, fixed = TRUE)) ||
      (!grepl("_", new, fixed = TRUE) && grepl("_", old, fixed = TRUE))
  }

  result <- alter_separators(test_df, messiness = 1)
  for (i in seq_len(nrow(test_df))) {
    expect_true(separator_altered(result$X1[i], test_df$X1[i]))
    expect_true(separator_altered(result$X2[i], test_df$X2[i]))
  }

  # when strings remain unchanged
  test_df_noseparators <- data.frame(X1 = c("ab", "cd", "ef"))
  result_noseparators <- alter_separators(test_df_noseparators, messiness = 1)
  expect_identical(result_noseparators, test_df_noseparators)
})