#' Duplicate rows and insert them into the dataframe in order or at random
#'
#' A random sample of rows (drawn with replacement) is copied and inserted
#' back into `data`. The original rows always keep their relative order.
#'
#' @param data input dataframe
#' @param messiness Proportion of rows to duplicate. Must be
#' between 0 and 1. Default 0.1.
#' @param shuffle If `FALSE` (the default), each duplicated row is inserted
#' directly underneath the row it was copied from. If `TRUE`, duplicated rows
#' are inserted at random positions among the original rows.
#' @return A dataframe with duplicated rows inserted
#' @export
#' @examples
#' duplicate_rows(mtcars, messiness = 0.1)
#' duplicate_rows(mtcars, messiness = 0.1, shuffle = TRUE)
duplicate_rows <- function(data, messiness = 0.1, shuffle = FALSE) {
  if (!is.data.frame(data)) {
    stop("'data' must be a data frame", call. = FALSE)
  }
  if (!is.numeric(messiness) || length(messiness) != 1L ||
    is.na(messiness) || messiness < 0 || messiness > 1) {
    stop("'messiness' must be between 0 and 1", call. = FALSE)
  }
  shuffle <- as.logical(shuffle)
  if (length(shuffle) != 1L || is.na(shuffle)) {
    stop("'shuffle' must be TRUE or FALSE", call. = FALSE)
  }

  # Calculate the number of rows to duplicate
  n_rows <- nrow(data)
  num_rows_to_duplicate <- ceiling(n_rows * messiness)
  if (num_rows_to_duplicate == 0) {
    # Nothing to insert: hand the input back untouched
    return(data)
  }

  # Pick a helper column name that cannot clash with (and therefore cannot
  # silently overwrite and drop) one of the caller's own columns
  key <- "original_index"
  while (key %in% names(data)) {
    key <- paste0(".", key)
  }

  # Original rows are keyed 1, 2, ..., n. Stored as double so that the
  # half-step keys given to the duplicates below share the same type.
  data[[key]] <- as.numeric(seq_len(n_rows))

  # Duplicate rows according to messiness; each copy carries the key of the
  # row it was sampled from
  duplicated_rows <- dplyr::slice_sample(
    data,
    n = num_rows_to_duplicate,
    replace = TRUE
  )

  if (shuffle) {
    # One of the n + 1 gaps (before the first row, between two rows, after
    # the last row) is drawn for every duplicate. The - 0.5 places the key
    # strictly between two original keys, so the ordering never depends on
    # how ties are broken.
    duplicated_rows[[key]] <- sample.int(
      n_rows + 1L,
      nrow(duplicated_rows),
      replace = TRUE
    ) - 0.5
  } else {
    # Key k + 0.5 sorts directly underneath original row k
    duplicated_rows[[key]] <- duplicated_rows[[key]] + 0.5
  }

  # Combine, order by the helper key, then drop the helper column.
  # all_of() is used because `.data` is deprecated inside select().
  dplyr::bind_rows(data, duplicated_rows) |>
    dplyr::arrange(.data[[key]]) |>
    dplyr::select(-dplyr::all_of(key))
}
mode 100644 index 0000000..72851ee --- /dev/null +++ b/man/duplicate_rows.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/duplicate_rows.R +\name{duplicate_rows} +\alias{duplicate_rows} +\title{Duplicate rows and insert them into the dataframe in order or at random} +\usage{ +duplicate_rows(data, messiness = 0.1, shuffle = FALSE) +} +\arguments{ +\item{data}{input dataframe} + +\item{messiness}{Percentage of rows to duplicate. Must be +between 0 and 1. Default 0.1.} + +\item{shuffle}{Insert duplicated data underneath original data or insert randomly} +} +\value{ +A dataframe with duplicated rows inserted +} +\description{ +Duplicate rows and insert them into the dataframe in order or at random +} +\examples{ +duplicate_rows(mtcars, messiness = 0.1) +}