Commit
Merge pull request #9 from nrennie/dev
Merge dev into main
nrennie authored Dec 1, 2024
2 parents 3f2696b + 6e3d369 commit 4ac9f7f
Showing 5 changed files with 94 additions and 2 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: messy
Title: Create messy data from clean dataframes
Version: 0.0.2
Version: 0.0.2.0002
Authors@R: c(
person(given = "Nicola", family = "Rennie", role = c("aut", "cre", "cph"),
email = "[email protected]", comment = c(ORCID = "0000-0003-4797-557X")))
1 change: 1 addition & 0 deletions NAMESPACE
@@ -3,6 +3,7 @@
export(add_special_chars)
export(add_whitespace)
export(change_case)
export(duplicate_rows)
export(make_missing)
export(messy)
export(messy_colnames)
2 changes: 1 addition & 1 deletion NEWS.md
@@ -1,8 +1,8 @@

# messy (development version)

* Add `duplicate_rows()` function
* Add date(time) messy-ing functions:

  - `messy_datetime_tzones()` will randomly assign different timezones to datetime columns.
- `messy_datetime_formats()` and `messy_date_formats()` will format date(times) as characters, and scramble their strptime formats.
- `split_datetimes()` and `split_dates()` will split datetime columns into "date" and "time" columns, and Date columns into "year", "month", and "day" columns.
66 changes: 66 additions & 0 deletions R/duplicate_rows.R
@@ -0,0 +1,66 @@
#' Duplicate rows and insert them into the dataframe in order or at random
#'
#' @param data input dataframe
#' @param messiness Proportion of rows to duplicate. Must be
#'   between 0 and 1. Default 0.1.
#' @param shuffle If `FALSE` (default), insert duplicated rows directly
#'   beneath the original rows; if `TRUE`, insert them at random positions.
#' @return A dataframe with duplicated rows inserted
#' @export
#' @examples
#' duplicate_rows(mtcars, messiness = 0.1)
duplicate_rows <- function(data, messiness = 0.1, shuffle = FALSE) {
  if (messiness < 0 || messiness > 1) {
    stop("'messiness' must be between 0 and 1")
  }

  # Calculate the number of rows to duplicate
  num_rows_to_duplicate <- ceiling(nrow(data) * messiness)

  # Add an index column to preserve the original order
  # and mark all rows as originals
  data <- data |>
    dplyr::mutate(original_index = dplyr::row_number()) |>
    dplyr::mutate(is_duplicate = FALSE)

  # Duplicate rows according to messiness
  duplicated_rows <- data |>
    dplyr::slice_sample(n = num_rows_to_duplicate, replace = TRUE)

  # Add an identifier to distinguish duplicated rows
  duplicated_rows <- duplicated_rows |>
    dplyr::mutate(is_duplicate = TRUE)

  # Combine original and duplicated rows
  combined_data <- dplyr::bind_rows(data, duplicated_rows)

  # By default, each duplicated row is inserted directly beneath its original
  if (!shuffle) {
    # Restore the original row order
    final_data <- combined_data |>
      dplyr::arrange(.data$original_index)

    # Drop helper columns
    final_data <- final_data |>
      dplyr::select(-c(.data$original_index, .data$is_duplicate))
  } else {
    # If shuffle is TRUE, duplicated rows are inserted at random positions
    # while the original rows keep their relative order:
    # assign a random index to each duplicated row
    final_data <- combined_data |>
      dplyr::mutate(random_index = ifelse(
        .data$is_duplicate,
        sample(nrow(combined_data)),
        .data$original_index
      )) |>
      dplyr::arrange(.data$random_index)

    # Drop helper columns
    final_data <- final_data |>
      dplyr::select(-c(
        .data$original_index,
        .data$is_duplicate,
        .data$random_index
      ))
  }
  return(final_data)
}
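The mark-index, sample, bind, and stable-sort steps above can be sketched in base R without the dplyr helpers; the toy data frame, seed, and column name `x` below are illustrative only:

```r
set.seed(42) # for a reproducible sample
df <- data.frame(x = 1:5)
messiness <- 0.2

# Record each row's original position
df$original_index <- seq_len(nrow(df))

# Sample ceiling(n * messiness) rows to duplicate (with replacement)
n_dup <- ceiling(nrow(df) * messiness)
dupes <- df[sample(nrow(df), n_dup, replace = TRUE), , drop = FALSE]

# Append the duplicates, then restore the original order;
# the stable sort keeps each duplicate directly beneath its source row
combined <- rbind(df, dupes)
combined <- combined[order(combined$original_index), , drop = FALSE]
combined$original_index <- NULL

nrow(combined) # 6: the 5 originals plus 1 duplicate
```

Because `order()` is stable, each duplicate lands immediately after the row it copies, which mirrors the `shuffle = FALSE` branch of `duplicate_rows()`.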
25 changes: 25 additions & 0 deletions man/duplicate_rows.Rd
