Skip to content

Commit

Permalink
fix special chars and missing bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
nrennie committed Nov 17, 2024
1 parent 45896c8 commit c55d2c1
Show file tree
Hide file tree
Showing 13 changed files with 207 additions and 111 deletions.
3 changes: 1 addition & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
# Generated by roxygen2: do not edit by hand

export(add_special_chars)
export(add_whitespace)
export(change_case)
export(make_missing)
export(make_string_messy)
export(messy)
export(messy_colnames)
importFrom(rlang,.data)
importFrom(stats,runif)
86 changes: 86 additions & 0 deletions R/add_special_chars.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#' Add special characters to strings
#'
#' Randomly inserts special characters into the values of character and
#' factor columns. Columns of any other type are returned unchanged.
#'
#' @param data input dataframe
#' @param cols set of columns to apply transformation to. If `NULL`
#' will apply to all character and factor columns. Columns named here
#' that are neither character nor factor are left untouched.
#' Default `NULL`.
#' @param messiness Probability of inserting a special character before
#' each character of a string. Must be a single number
#' between 0 and 1. Default 0.1.
#' @importFrom rlang .data
#' @return a dataframe the same size as the input data. Factor columns
#' that are transformed are returned as character columns.
#' @export
#' @examples
#' add_special_chars(mtcars)
add_special_chars <- function(data,
                              cols = NULL,
                              messiness = 0.1) {
  # Check type, length and missingness before the range so that the
  # comparison always sees a single non-missing number; otherwise NA or
  # a vector would raise an unrelated error from `if`.
  valid_messiness <- is.numeric(messiness) &&
    length(messiness) == 1 &&
    !is.na(messiness) &&
    messiness >= 0 &&
    messiness <= 1
  if (!valid_messiness) {
    stop("'messiness' must be between 0 and 1")
  }
  # check if all cols present in colnames
  if (!is.null(cols) && !all(cols %in% colnames(data))) {
    stop("All elements of 'cols' must be a column name in 'data'")
  }

  # Shared by both branches: which columns qualify, and what to do to them
  is_text <- function(col) is.character(col) || is.factor(col)
  add_chars <- function(col) special_chars(col, messiness = messiness)

  if (is.null(cols)) {
    output <- data |>
      dplyr::mutate(dplyr::across(dplyr::where(is_text), add_chars))
  } else {
    # Intersect the requested columns with the text columns so that
    # non-text columns named in `cols` pass through unchanged
    output <- data |>
      dplyr::mutate(
        dplyr::across(dplyr::all_of(cols) & dplyr::where(is_text), add_chars)
      )
  }
  output
}

#' Function to add special characters to a character vector
#'
#' Randomly inserts special characters into each string. Before every
#' character of a string, a randomly chosen special character is
#' inserted with probability `messiness`. Missing values are returned
#' as `NA` and empty strings are returned unchanged.
#' @param x Character vector, or a factor (converted to character)
#' @param messiness Probability of inserting a special character before
#' each character. Must be between 0 and 1. Default 0.1.
#' @return Messy character vector the same length as `x`
#' @noRd
special_chars <- function(x, messiness = 0.1) {
  # if factor, convert to character
  if (is.factor(x)) {
    x <- as.character(x)
  }

  # characters to insert
  random_chars <- c(
    "!", "@", "#", "$", "%", "^", "&",
    "*", "(", ")", "_", "+", "-", "."
  )

  special_chars_string <- function(s) {
    # Keep missing values missing: splitting and re-pasting NA would
    # otherwise turn it into the literal string "NA"
    if (is.na(s)) {
      return(NA_character_)
    }

    # Convert to vector of characters
    chars <- strsplit(s, NULL)[[1]]

    # For each character in turn, randomly decide whether to prefix it
    # with a special character
    pieces <- vapply(chars, function(char) {
      if (stats::runif(1) < messiness) {
        paste0(sample(random_chars, 1), char)
      } else {
        char
      }
    }, character(1), USE.NAMES = FALSE)

    # Reassemble the string
    paste(pieces, collapse = "")
  }

  # vapply guarantees a character vector, even for zero-length input
  vapply(x, special_chars_string, character(1), USE.NAMES = FALSE)
}
19 changes: 18 additions & 1 deletion R/change_case.R
Original file line number Diff line number Diff line change
@@ -1,21 +1,29 @@
#' Change case
#'
#' Randomly switch between title case and lowercase for
#' character strings
#' @param data input dataframe
#' @param cols set of columns to apply transformation to. If `NULL`
#' will apply to all columns. Default `NULL`.
#' @param messiness Percentage of values to change. Must be
#' between 0 and 1. Default 0.1.
#' @param case_type Whether the case should change by `"word"` or by
#' `"letter"`. Must be one of these two values. Default `"word"`.
#' @importFrom rlang .data
#' @return a dataframe the same size as the input data.
#' @export

change_case <- function(data,
cols = NULL,
messiness = 0.1) {
messiness = 0.1,
case_type = "word") {
if (messiness < 0 || messiness > 1) {
stop("'messiness' must be between 0 and 1")
}
if (!(case_type %in% c("word", "letter"))) {
stop("'case_type' must be either 'word' or 'letter'")
}

if (is.null(cols)) {
output <- data |>
dplyr::mutate(
Expand Down Expand Up @@ -59,3 +67,12 @@ change_case <- function(data,
}
return(output)
}

# # Randomly change the case of each character using sapply
# chars <- sapply(chars, function(char) {
# if (stats::runif(1) < 0.5) {
# return(toupper(char))

Check warning on line 74 in R/change_case.R

View workflow job for this annotation

GitHub Actions / lint

file=R/change_case.R,line=74,col=7,[commented_code_linter] Commented code should be removed.
# } else {
# return(tolower(char))

Check warning on line 76 in R/change_case.R

View workflow job for this annotation

GitHub Actions / lint

file=R/change_case.R,line=76,col=7,[commented_code_linter] Commented code should be removed.
# }
# })
4 changes: 2 additions & 2 deletions R/make_missing.R
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ make_missing <- function(data,
dplyr::across(
dplyr::everything(),
~ dplyr::case_when(
runif(nrow(data)) <= messiness ~ unlist(sample(missing, 1)),
runif(nrow(data)) <= messiness ~ unlist(resample(missing, 1)),
TRUE ~ .x
)
)
Expand All @@ -42,7 +42,7 @@ make_missing <- function(data,
dplyr::across(
dplyr::all_of(cols),
~ dplyr::case_when(
runif(nrow(data)) <= messiness ~ unlist(sample(missing, 1)),
runif(nrow(data)) <= messiness ~ unlist(resample(missing, 1)),
TRUE ~ .x
)
)
Expand Down
9 changes: 6 additions & 3 deletions R/messy.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,19 @@
#' missing values will be replaced with. If length is greater
#' than 1, values will be replaced randomly.
#' Default `NA`.
#' @param case_type Whether the case should change by `"word"` or by
#' `"letter"`. Default `"word"`.
#' @return a dataframe the same size as the input data.
#' @export


messy <- function(data,
messiness = 0.1,
missing = NA) {
missing = NA,
case_type = "word") {
output <- data |>
add_special_chars(messiness = messiness) |>
add_whitespace(messiness = messiness) |>
make_missing(messiness = messiness, missing = missing) |>
change_case(messiness = messiness)
change_case(messiness = messiness, case_type = case_type)
return(output)
}
2 changes: 1 addition & 1 deletion R/messy_colnames.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
#' messy_colnames(mtcars)
messy_colnames <- function(data) {
# Assign the new column names to the dataframe
names(data) <- make_string_messy(names(data))
names(data) <- special_chars(names(data))
return(data)
}
51 changes: 0 additions & 51 deletions R/messy_strings.R

This file was deleted.

7 changes: 7 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#' Resample
#'
#' Draws elements from `x` by sampling its positions, so a vector of
#' length one is treated as a single element to choose from.
#' @param x a vector of one or more elements from which to choose.
#' @param ... further arguments passed on to `sample.int()`.
#' @return a vector of elements drawn from `x`.
#' @noRd
resample <- function(x, ...) {
  positions <- sample.int(length(x), ...)
  x[positions]
}
78 changes: 51 additions & 27 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,17 @@ messy(ToothGrowth[1:10,])
```

```r
len supp dose
1 4.2 vc 0.5
2 11.5 VC 0.5
3 7.3 VC 0.5
4 5.8 VC 0.5
5 6.4 VC 0.5
6 10 VC 0.5
7 11.2 VC 0.5
8 11.2 VC 0.5
9 5.2 VC 0.5
10 7 <NA> <NA>
len supp dose
1 4.2 VC 0.5
2 11.5 <NA> <NA>
3 7.3 VC 0.5
4 5.8 (VC 0.5
5 6.4 VC <NA>
6 10 VC 0.5
7 11.2 <NA> 0.5
8 11.2 VC 0.5
9 5.2 VC 0.5
10 7 VC 0.5
```

Increase how *messy* the data is:
Expand All @@ -45,17 +45,17 @@ messy(ToothGrowth[1:10,], messiness = 0.7)
```

```r
len supp dose
1 <NA> <NA> 0.5
2 <NA> <NA> <NA>
3 <NA> <NA> <NA>
4 <NA> <NA> <NA>
5 <NA> <NA> <NA>
6 10 <NA> 0.5
7 <NA> <NA> <NA>
8 <NA> <NA> 0.5
9 5.2 VC 0.5
10 7 <NA> <NA>
len supp dose
1 <NA> <NA> <NA>
2 11.5 <NA> <NA>
3 <NA> <NA> <NA>
4 5.8 <NA> <NA>
5 <NA> .v*c <NA>
6 <NA> <NA> <NA>
7 <NA> <NA> <NA>
8 <NA> <NA> 0.5
9 <NA> v@c <NA>
10 <NA> <NA> <NA>
```

### `add_whitespace()`
Expand Down Expand Up @@ -125,6 +125,29 @@ change_case(ToothGrowth[1:10,], messiness = 0.5)
10 7.0 VC 0.5
```

### `add_special_chars()`

Randomly add special characters to character strings:

```r
set.seed(1234)
add_special_chars(ToothGrowth[1:10,])
```

```r
len supp dose
1 4.2 VC 0.5
2 11.5 VC 0.5
3 7.3 VC 0.5
4 5.8 (VC 0.5
5 6.4 VC 0.5
6 10.0 VC 0.5
7 11.2 VC 0.5
8 11.2 VC 0.5
9 5.2 VC 0.5
10 7.0 VC 0.5
```

### `make_missing()`

Randomly make some values missing using `NA`:
Expand Down Expand Up @@ -178,19 +201,20 @@ set.seed(1234)
ToothGrowth[1:10,] |>
make_missing(cols = "supp", missing = " ") |>
make_missing(cols = c("len", "dose"), missing = c(NA, 999)) |>
add_whitespace(cols = "supp", messiness = 0.5)
add_whitespace(cols = "supp", messiness = 0.5) |>
add_special_chars(cols = "supp")
```

```r
len supp dose
1 4.2 VC 0.5
2 11.5 VC NA
3 7.3 VC 0.5
4 5.8 VC 0.5
4 5.8 *VC 0.5
5 6.4 VC 0.5
6 10.0 VC 0.5
7 11.2 0.5
8 11.2 VC NA
9 5.2 VC 0.5
10 7.0 VC 0.5
8 11.2 V#C NA
9 5.2 !VC 0.5
10 7.0 VC* 0.5
```
26 changes: 26 additions & 0 deletions man/add_special_chars.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit c55d2c1

Please sign in to comment.