From c9fd06410be12279a2ea2b8653f6c55a2b2300f3 Mon Sep 17 00:00:00 2001
From: Philip-Leftwich
Date: Tue, 19 Nov 2024 11:40:13 +0000
Subject: [PATCH 1/2] Proposed new function for messy - duplicate rows of data

---
 R/duplicate_rows.R | 63 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 R/duplicate_rows.R

diff --git a/R/duplicate_rows.R b/R/duplicate_rows.R
new file mode 100644
index 0000000..0ab3532
--- /dev/null
+++ b/R/duplicate_rows.R
@@ -0,0 +1,63 @@
+#' Duplicate rows and insert them into the dataframe in order or at random
+#'
+#' @param data input dataframe
+#' @param messiness Percentage of rows to duplicate. Must be
+#' between 0 and 1. Default 0.1.
+#' @param shuffle Insert duplicated data underneath original data or insert randomly
+#' @importFrom dplyr slice_sample mutate arrange
+#' @return A dataframe with duplicated rows inserted
+#' @export
+#' @examples
+#' duplicate_rows_random_insert(mtcars, messiness = 0.1)
+duplicate_rows <- function(data, messiness = 0.1, shuffle = FALSE) {
+  if (messiness < 0 || messiness > 1) {
+    stop("'messiness' must be between 0 and 1")
+  }
+
+  # Calculate the number of rows to duplicate
+  num_rows_to_duplicate <- ceiling(nrow(data) * messiness)
+
+
+  # Add an index column to preserve original order
+  # Mark rows as originals
+  data <- data |>
+    dplyr::mutate(original_index = row_number()) |>
+    dplyr::mutate(is_duplicate = FALSE)
+
+  # Duplicate rows according to messiness
+  duplicated_rows <- data |>
+    dplyr::slice_sample(n = num_rows_to_duplicate, replace = TRUE)
+
+  # Add an identifier to distinguish duplicated rows
+  duplicated_rows <- duplicated_rows |>
+    dplyr::mutate(is_duplicate = TRUE)
+
+
+  # Combine original and duplicated rows
+  combined_data <- dplyr::bind_rows(data, duplicated_rows)
+
+  # By default duplicated rows are added in the same order as original data
+  if(shuffle == FALSE){
+    # Insert duplicated rows into the original dataframe
+    final_data <- combined_data |>
+      dplyr::arrange(original_index)
+
+    # Drop helper columns
+    final_data <- final_data |>
+      dplyr::select(-c(original_index, is_duplicate))
+  } else{
+    # if shuffle is TRUE then duplicated data is added randomly while the original data order is maintained
+    # Assign a random index to the duplicated rows
+    final_data <- combined_data |>
+      dplyr::mutate(random_index = ifelse(is_duplicate, sample(length(combined_data)), original_index)) |>
+      dplyr::arrange(random_index)
+
+    # Drop helper columns
+    final_data <- final_data |>
+      dplyr::select(-c(original_index, is_duplicate, random_index))
+}
+  return(final_data)
+
+}
+
+
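A reviewer aside, not part of the patch itself: a minimal usage sketch of the function proposed above, assuming the patch is applied and this development version of messy is loaded. With messiness = 0.1 on the 32-row mtcars dataset, ceiling(32 * 0.1) = 4 rows are duplicated:

    # Sketch only, not part of the patch: default (in-order) insertion
    library(messy)                                 # assumes the patched dev version is installed
    messy_mtcars <- duplicate_rows(mtcars, messiness = 0.1)
    nrow(messy_mtcars)             # 36: the 32 original rows plus 4 duplicates
    sum(duplicated(messy_mtcars))  # 4, assuming all mtcars rows were distinct to begin with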
From 6e3d369760fc864a43bdc873ced5215958f0fcf0 Mon Sep 17 00:00:00 2001
From: nrennie
Date: Sun, 1 Dec 2024 14:23:45 +0000
Subject: [PATCH 2/2] fix namespacing and docs for duplicate funcs

---
 DESCRIPTION           |  2 +-
 NAMESPACE             |  1 +
 NEWS.md               |  2 +-
 R/duplicate_rows.R    | 45 +++++++++++++++++++++++--------------------
 man/duplicate_rows.Rd | 25 ++++++++++++++++++++++++
 5 files changed, 52 insertions(+), 23 deletions(-)
 create mode 100644 man/duplicate_rows.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 6967e8a..87680fe 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: messy
 Title: Create messy data from clean dataframes
-Version: 0.0.2
+Version: 0.0.2.0002
 Authors@R: c(
     person(given = "Nicola", family = "Rennie", role = c("aut", "cre", "cph"),
            email = "nrennie35@gmail.com", comment = c(ORCID = "0000-0003-4797-557X")))
diff --git a/NAMESPACE b/NAMESPACE
index d05725f..0420fb0 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -3,6 +3,7 @@ export(add_special_chars)
 export(add_whitespace)
 export(change_case)
+export(duplicate_rows)
 export(make_missing)
 export(messy)
 export(messy_colnames)
diff --git a/NEWS.md b/NEWS.md
index b31c0dc..6cb8f1b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,8 +1,8 @@
 # messy (development version)
 
+* Add `duplicate_rows()` function
 * Add date(time) messy-ing functions:
-  - `messy_datetime_tzones()` will randomly set different timezones to datetime columns
   - `messy_datetime_formats()` and `messy_date_formats()` will format date(times) as characters, and scramble their strptime formats.
   - `split_datetimes()` and `split_dates()` will split datetime columns into "date" and "time" columns, and Date columns into "year", "month", and "day" columns.
diff --git a/R/duplicate_rows.R b/R/duplicate_rows.R
index 0ab3532..5f62677 100644
--- a/R/duplicate_rows.R
+++ b/R/duplicate_rows.R
@@ -4,11 +4,10 @@
 #' @param messiness Percentage of rows to duplicate. Must be
 #' between 0 and 1. Default 0.1.
 #' @param shuffle Insert duplicated data underneath original data or insert randomly
-#' @importFrom dplyr slice_sample mutate arrange
 #' @return A dataframe with duplicated rows inserted
 #' @export
 #' @examples
-#' duplicate_rows_random_insert(mtcars, messiness = 0.1)
+#' duplicate_rows(mtcars, messiness = 0.1)
 duplicate_rows <- function(data, messiness = 0.1, shuffle = FALSE) {
   if (messiness < 0 || messiness > 1) {
     stop("'messiness' must be between 0 and 1")
@@ -17,11 +16,10 @@ duplicate_rows <- function(data, messiness = 0.1, shuffle = FALSE) {
   # Calculate the number of rows to duplicate
   num_rows_to_duplicate <- ceiling(nrow(data) * messiness)
 
-
   # Add an index column to preserve original order
   # Mark rows as originals
   data <- data |>
-    dplyr::mutate(original_index = row_number()) |>
+    dplyr::mutate(original_index = dplyr::row_number()) |>
     dplyr::mutate(is_duplicate = FALSE)
 
   # Duplicate rows according to messiness
@@ -37,27 +35,32 @@ duplicate_rows <- function(data, messiness = 0.1, shuffle = FALSE) {
   combined_data <- dplyr::bind_rows(data, duplicated_rows)
 
   # By default duplicated rows are added in the same order as original data
-  if(shuffle == FALSE){
-    # Insert duplicated rows into the original dataframe
-    final_data <- combined_data |>
-      dplyr::arrange(original_index)
-
-    # Drop helper columns
-    final_data <- final_data |>
-      dplyr::select(-c(original_index, is_duplicate))
-  } else{
-    # if shuffle is TRUE then duplicated data is added randomly while the original data order is maintained
+  if (shuffle == FALSE) {
+    # Insert duplicated rows into the original dataframe
+    final_data <- combined_data |>
+      dplyr::arrange(.data$original_index)
+
+    # Drop helper columns
+    final_data <- final_data |>
+      dplyr::select(-c(.data$original_index, .data$is_duplicate))
+  } else {
+    # if shuffle is TRUE then duplicated data is added randomly while the original data order is maintained
     # Assign a random index to the duplicated rows
     final_data <- combined_data |>
-      dplyr::mutate(random_index = ifelse(is_duplicate, sample(length(combined_data)), original_index)) |>
-      dplyr::arrange(random_index)
+      dplyr::mutate(random_index = ifelse(
+        .data$is_duplicate,
+        sample(length(combined_data)),
+        .data$original_index
+      )) |>
+      dplyr::arrange(.data$random_index)
 
     # Drop helper columns
     final_data <- final_data |>
-      dplyr::select(-c(original_index, is_duplicate, random_index))
-}
+      dplyr::select(-c(
+        .data$original_index,
+        .data$is_duplicate,
+        .data$random_index
+      ))
+  }
   return(final_data)
-
 }
-
-
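A reviewer aside, not part of either patch: in the shuffle = TRUE branch above, sample(length(combined_data)) permutes 1:ncol(combined_data), because length() of a dataframe is its number of columns; ifelse() then recycles that short vector, so for typical dataframes with more rows than columns the duplicates can only land near the top of the result. Sampling nrow(combined_data) instead would give every row of the combined dataframe a candidate position. A small base-R illustration of the difference, using a throwaway data frame df:

    # Illustrates length() vs nrow() on a dataframe; df is a throwaway example object
    df <- data.frame(a = 1:10, b = letters[1:10])
    length(df)          # 2  -- number of columns
    nrow(df)            # 10 -- number of rows
    sample(length(df))  # permutation of 1:2, recycled by ifelse() across all rows
    sample(nrow(df))    # permutation of 1:10, one candidate position per row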
diff --git a/man/duplicate_rows.Rd b/man/duplicate_rows.Rd
new file mode 100644
index 0000000..72851ee
--- /dev/null
+++ b/man/duplicate_rows.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/duplicate_rows.R
+\name{duplicate_rows}
+\alias{duplicate_rows}
+\title{Duplicate rows and insert them into the dataframe in order or at random}
+\usage{
+duplicate_rows(data, messiness = 0.1, shuffle = FALSE)
+}
+\arguments{
+\item{data}{input dataframe}
+
+\item{messiness}{Percentage of rows to duplicate. Must be
+between 0 and 1. Default 0.1.}
+
+\item{shuffle}{Insert duplicated data underneath original data or insert randomly}
+}
+\value{
+A dataframe with duplicated rows inserted
+}
+\description{
+Duplicate rows and insert them into the dataframe in order or at random
+}
+\examples{
+duplicate_rows(mtcars, messiness = 0.1)
+}
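A final aside, not part of the patch series: a short comparison of the two insertion modes documented above, again assuming the patched development version of messy is loaded. set.seed() is only there to make the sampling reproducible; the function itself does not require it.

    # Sketch only, not part of the patch: compare the two insertion modes
    set.seed(123)
    in_order <- duplicate_rows(mtcars, messiness = 0.25)                  # duplicates sit directly under the rows they copy
    shuffled <- duplicate_rows(mtcars, messiness = 0.25, shuffle = TRUE)  # duplicates no longer guaranteed to sit under their originals
    nrow(in_order)  # 40: 32 original rows plus ceiling(32 * 0.25) = 8 duplicates
    nrow(shuffled)  # also 40; only the placement of the duplicates differs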