From c9fd06410be12279a2ea2b8653f6c55a2b2300f3 Mon Sep 17 00:00:00 2001
From: Philip-Leftwich
Date: Tue, 19 Nov 2024 11:40:13 +0000
Subject: [PATCH 1/2] Proposed new function for messy - duplicate rows of data

---
 R/duplicate_rows.R | 63 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 R/duplicate_rows.R

diff --git a/R/duplicate_rows.R b/R/duplicate_rows.R
new file mode 100644
index 0000000..0ab3532
--- /dev/null
+++ b/R/duplicate_rows.R
@@ -0,0 +1,63 @@
+#' Duplicate rows and insert them into the dataframe in order or at random
+#'
+#' @param data input dataframe
+#' @param messiness Percentage of rows to duplicate. Must be
+#' between 0 and 1. Default 0.1.
+#' @param shuffle Insert duplicated data underneath original data or insert randomly
+#' @importFrom dplyr slice_sample mutate arrange
+#' @return A dataframe with duplicated rows inserted
+#' @export
+#' @examples
+#' duplicate_rows_random_insert(mtcars, messiness = 0.1)
+duplicate_rows <- function(data, messiness = 0.1, shuffle = FALSE) {
+  if (messiness < 0 || messiness > 1) {
+    stop("'messiness' must be between 0 and 1")
+  }
+
+  # Calculate the number of rows to duplicate
+  num_rows_to_duplicate <- ceiling(nrow(data) * messiness)
+
+
+  # Add an index column to preserve original order
+  # Mark rows as originals
+  data <- data |>
+    dplyr::mutate(original_index = row_number()) |>
+    dplyr::mutate(is_duplicate = FALSE)
+
+  # Duplicate rows according to messiness
+  duplicated_rows <- data |>
+    dplyr::slice_sample(n = num_rows_to_duplicate, replace = TRUE)
+
+  # Add an identifier to distinguish duplicated rows
+  duplicated_rows <- duplicated_rows |>
+    dplyr::mutate(is_duplicate = TRUE)
+
+
+  # Combine original and duplicated rows
+  combined_data <- dplyr::bind_rows(data, duplicated_rows)
+
+  # By default duplicated rows are added in the same order as original data
+  if(shuffle == FALSE){
+    # Insert duplicated rows into the original dataframe
+    final_data <- combined_data |>
+      dplyr::arrange(original_index)
+
+    # Drop helper columns
+    final_data <- final_data |>
+      dplyr::select(-c(original_index, is_duplicate))
+  } else{
+    # if shuffle is TRUE then duplicated data is added randomly while the original data order is maintained
+    # Assign a random index to the duplicated rows
+    final_data <- combined_data |>
+      dplyr::mutate(random_index = ifelse(is_duplicate, sample(length(combined_data)), original_index)) |>
+      dplyr::arrange(random_index)
+
+    # Drop helper columns
+    final_data <- final_data |>
+      dplyr::select(-c(original_index, is_duplicate, random_index))
+}
+  return(final_data)
+
+}
+
+
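A reviewer aside, not part of the patch itself: a minimal usage sketch of the function proposed above, assuming the patch is applied and this development version of messy is loaded. With messiness = 0.1 on the 32-row mtcars dataset, ceiling(32 * 0.1) = 4 rows are duplicated:

    # Sketch only, not part of the patch: default (in-order) insertion
    library(messy)                                 # assumes the patched dev version is installed
    messy_mtcars <- duplicate_rows(mtcars, messiness = 0.1)
    nrow(messy_mtcars)             # 36: the 32 original rows plus 4 duplicates
    sum(duplicated(messy_mtcars))  # 4, assuming all mtcars rows were distinct to begin with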
From 6e3d369760fc864a43bdc873ced5215958f0fcf0 Mon Sep 17 00:00:00 2001
From: nrennie
Date: Sun, 1 Dec 2024 14:23:45 +0000
Subject: [PATCH 2/2] fix namespacing and docs for duplicate funcs

---
 DESCRIPTION           |  2 +-
 NAMESPACE             |  1 +
 NEWS.md               |  2 +-
 R/duplicate_rows.R    | 45 +++++++++++++++++++++++--------------------
 man/duplicate_rows.Rd | 25 ++++++++++++++++++++++++
 5 files changed, 52 insertions(+), 23 deletions(-)
 create mode 100644 man/duplicate_rows.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 6967e8a..87680fe 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: messy
 Title: Create messy data from clean dataframes
-Version: 0.0.2
+Version: 0.0.2.0002
 Authors@R: c(
     person(given = "Nicola", family = "Rennie", role = c("aut", "cre", "cph"),
            email = "nrennie35@gmail.com", comment = c(ORCID = "0000-0003-4797-557X")))
diff --git a/NAMESPACE b/NAMESPACE
index d05725f..0420fb0 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,6 +3,7 @@ export(add_special_chars)
 export(add_whitespace)
 export(change_case)
+export(duplicate_rows)
 export(make_missing)
 export(messy)
 export(messy_colnames)
diff --git a/NEWS.md b/NEWS.md
index b31c0dc..6cb8f1b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,8 +1,8 @@
 # messy (development version)
 
+* Add `duplicate_rows()` function
 * Add date(time) messy-ing functions:
-  - `messy_datetime_tzones()` will randomly set different timezones to datetime columns
   - `messy_datetime_formats()` and `messy_date_formats()` will format date(times) as characters, and scramble their strptime formats.
   - `split_datetimes()` and `split_dates()` will split datetime columns into "date" and "time" columns, and Date columns into "year", "month", and "day" columns.
diff --git a/R/duplicate_rows.R b/R/duplicate_rows.R
index 0ab3532..5f62677 100644
--- a/R/duplicate_rows.R
+++ b/R/duplicate_rows.R
@@ -4,11 +4,10 @@
 #' @param messiness Percentage of rows to duplicate. Must be
 #' between 0 and 1. Default 0.1.
 #' @param shuffle Insert duplicated data underneath original data or insert randomly
-#' @importFrom dplyr slice_sample mutate arrange
 #' @return A dataframe with duplicated rows inserted
 #' @export
 #' @examples
-#' duplicate_rows_random_insert(mtcars, messiness = 0.1)
+#' duplicate_rows(mtcars, messiness = 0.1)
 duplicate_rows <- function(data, messiness = 0.1, shuffle = FALSE) {
   if (messiness < 0 || messiness > 1) {
     stop("'messiness' must be between 0 and 1")
@@ -17,11 +16,10 @@ duplicate_rows <- function(data, messiness = 0.1, shuffle = FALSE) {
   # Calculate the number of rows to duplicate
   num_rows_to_duplicate <- ceiling(nrow(data) * messiness)
 
-
   # Add an index column to preserve original order
   # Mark rows as originals
   data <- data |>
-    dplyr::mutate(original_index = row_number()) |>
+    dplyr::mutate(original_index = dplyr::row_number()) |>
     dplyr::mutate(is_duplicate = FALSE)
 
   # Duplicate rows according to messiness
@@ -37,27 +35,32 @@ duplicate_rows <- function(data, messiness = 0.1, shuffle = FALSE) {
   combined_data <- dplyr::bind_rows(data, duplicated_rows)
 
   # By default duplicated rows are added in the same order as original data
-  if(shuffle == FALSE){
-    # Insert duplicated rows into the original dataframe
-    final_data <- combined_data |>
-      dplyr::arrange(original_index)
-
-    # Drop helper columns
-    final_data <- final_data |>
-      dplyr::select(-c(original_index, is_duplicate))
-  } else{
-    # if shuffle is TRUE then duplicated data is added randomly while the original data order is maintained
+  if (shuffle == FALSE) {
+    # Insert duplicated rows into the original dataframe
+    final_data <- combined_data |>
+      dplyr::arrange(.data$original_index)
+
+    # Drop helper columns
+    final_data <- final_data |>
+      dplyr::select(-c(.data$original_index, .data$is_duplicate))
+  } else {
+    # if shuffle is TRUE then duplicated data is added randomly while the original data order is maintained
     # Assign a random index to the duplicated rows
     final_data <- combined_data |>
-      dplyr::mutate(random_index = ifelse(is_duplicate, sample(length(combined_data)), original_index)) |>
-      dplyr::arrange(random_index)
+      dplyr::mutate(random_index = ifelse(
+        .data$is_duplicate,
+        sample(length(combined_data)),
+        .data$original_index
+      )) |>
+      dplyr::arrange(.data$random_index)
 
     # Drop helper columns
     final_data <- final_data |>
-      dplyr::select(-c(original_index, is_duplicate, random_index))
-}
+      dplyr::select(-c(
+        .data$original_index,
+        .data$is_duplicate,
+        .data$random_index
+      ))
+  }
   return(final_data)
-
 }
-
-
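A reviewer aside, not part of either patch: in the shuffle = TRUE branch above, sample(length(combined_data)) permutes 1:ncol(combined_data), because length() of a dataframe is its number of columns; ifelse() then recycles that short vector, so for typical dataframes with more rows than columns the duplicates can only land near the top of the result. Sampling nrow(combined_data) instead would give every row of the combined dataframe a candidate position. A small base-R illustration of the difference, using a throwaway data frame df:

    # Illustrates length() vs nrow() on a dataframe; df is a throwaway example object
    df <- data.frame(a = 1:10, b = letters[1:10])
    length(df)          # 2  -- number of columns
    nrow(df)            # 10 -- number of rows
    sample(length(df))  # permutation of 1:2, recycled by ifelse() across all rows
    sample(nrow(df))    # permutation of 1:10, one candidate position per row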
diff --git a/man/duplicate_rows.Rd b/man/duplicate_rows.Rd
new file mode 100644
index 0000000..72851ee
--- /dev/null
+++ b/man/duplicate_rows.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/duplicate_rows.R
+\name{duplicate_rows}
+\alias{duplicate_rows}
+\title{Duplicate rows and insert them into the dataframe in order or at random}
+\usage{
+duplicate_rows(data, messiness = 0.1, shuffle = FALSE)
+}
+\arguments{
+\item{data}{input dataframe}
+
+\item{messiness}{Percentage of rows to duplicate. Must be
+between 0 and 1. Default 0.1.}
+
+\item{shuffle}{Insert duplicated data underneath original data or insert randomly}
+}
+\value{
+A dataframe with duplicated rows inserted
+}
+\description{
+Duplicate rows and insert them into the dataframe in order or at random
+}
+\examples{
+duplicate_rows(mtcars, messiness = 0.1)
+}
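A final aside, not part of the patch series: a short comparison of the two insertion modes documented above, again assuming the patched development version of messy is loaded. set.seed() is only there to make the sampling reproducible; the function itself does not require it.

    # Sketch only, not part of the patch: compare the two insertion modes
    set.seed(123)
    in_order <- duplicate_rows(mtcars, messiness = 0.25)                  # duplicates sit directly under the rows they copy
    shuffled <- duplicate_rows(mtcars, messiness = 0.25, shuffle = TRUE)  # duplicates no longer guaranteed to sit under their originals
    nrow(in_order)  # 40: 32 original rows plus ceiling(32 * 0.25) = 8 duplicates
    nrow(shuffled)  # also 40; only the placement of the duplicates differs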