R4EPI · zkamvar · Feb 11, 2020 · Feb 10, 2020 · Feb 10, 2020 · Feb 10, 2020
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -8,4 +8,4 @@
 ^\.travis\.yml$
 ^appveyor\.yml$
 ^codecov\.yml$
-
+^docs$
diff --git a/.gitignore b/.gitignore
@@ -40,3 +40,4 @@ vignettes/*.pdf
 
 # README html preview
 README.html
+inst/doc
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: epidict
 Title: Standardized Data Dictionaries for the 'R4EPIs' Project
-Version: 0.0.0.9000
+Version: 0.1.0
 Authors@R: c(
     person(given = "Zhian N.",
            family = "Kamvar",
@@ -15,15 +15,18 @@ Authors@R: c(
            role = c("aut")),
     person(given = "Patrick",
            family = "Keating",
-           role = c("aut")))
+           role = c("aut")),
+    person(given = "Annick",
+           family = "Lenglet",
+           role = c("ctb")))
 Description: The 'R4EPIs' project <https://R4epis.netlify.com> seeks to provide 
     a set of standardized tools for analysis of outbreak and survey data in
-    humanitarian aid settings. This package provides standardized data
-    dictionaries for four outbreak scenarios (Acute Jaundice Syndrome, Cholera,
-    Measles, Meningitis) and three surveys (Retrospective mortality and access
-    to care, Malnutrition, and Vaccination coverage). In addition, a data
-    generator from these dictionaries is provided. 
-URL: https://r4epis.netlify.com, https://github.com/R4EPI/epidict
+    humanitarian aid settings. This package currently provides standardized data
+    dictionaries from MSF OCA for four outbreak scenarios (Acute Jaundice
+    Syndrome, Cholera, Measles, Meningitis) and three surveys (Retrospective
+    mortality and access to care, Malnutrition, and Vaccination coverage). In
+    addition, a data generator from these dictionaries is provided.  
+URL: https://r4epis.netlify.com, https://github.com/R4EPI/epidict, https://r4epi.github.io/epidict
 License: GPL-3
 Imports: 
     tibble,
@@ -36,8 +39,12 @@ Imports:
 Suggests: 
     testthat (>= 2.1.0),
     matchmaker,
-    covr
+    covr,
+    knitr,
+    rmarkdown,
+    DT
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.0.2
+VignetteBuilder: knitr
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# epidict 0.1.0
+
+* Fix broken URLs in README
+* Add vignettes and README describing the dictionaries
+* Release to CRAN
+
 # epidict 0.0.0.9000
 
 * Added a `NEWS.md` file to track changes to the package.

diff --git a/R/gen_data.R b/R/gen_data.R
@@ -1,11 +1,14 @@
 #' Generate random linelist or survey data
+#' 
+#' Based on a dictionary generator like [msf_dict()] or [msf_dict_survey()],
+#' this function will generate a randomized data set based on values defined in
+#' the dictionaries. The randomized data set produced should mimic an Excel
+#' export from DHIS2 for outbreaks and a Kobo export for surveys.
 #'
 #' @param dictionary Specify which dictionary you would like to use.
 #'
-#' @param varnames Specify name of column that contains varnames. Currently
-#' default set to "Item".  (this can probably be deleted once dictionaries
-#' standardise) If `dictionary` is "Mortality", `varnames` needs to be
-#' "column_name"`.
+#' @param varnames Specify name of column that contains variable names.
+#'   If `dictionary` is a survey, `varnames` needs to be `"column_name"`.
 #'
 #' @param numcases Specify the number of cases you want (default is 300)
 #'
@@ -21,34 +24,30 @@
 #'
 #' if (require("dplyr") & require("matchmaker")) {
 #'   withAutoprint({
+#'
 #'     # You will often want to use MSF dictionaries to translate codes to human-
 #'     # readable variables. Here, we generate a data set of 20 cases:
 #'     dat <- gen_data(
-#'       dictionary = "Cholera", varnames = "data_element_shortname",
-#'       numcases = 20, org = "MSF"
+#'       dictionary = "Cholera", 
+#'       varnames = "data_element_shortname",
+#'       numcases = 20, 
+#'       org = "MSF"
 #'     )
 #'     print(dat)
 #'
 #'     # We want the expanded dictionary, so we will select `compact = FALSE`
 #'     dict <- msf_dict(disease = "Cholera", long = TRUE, compact = FALSE, tibble = TRUE)
 #'     print(dict)
 #'
-#'     # We can use linelist's clean_variable_spelling to translate the codes. First,
-#'     # we want to reorder the columns of the dictionary like so:
-#'     #
-#'     #  - 1st column: option codes
-#'     #  - 2nd column: translations
-#'     #  - 3rd column: data column name
-#'     #  - 4th column: order of options
-#'
-#'     # Now we can use linelist to filter the data:
+#'     # Now we can use matchmaker to translate the codes in the data:
 #'     dat_clean <- matchmaker::match_df(dat, dict,
 #'       from = "option_code",
 #'       to = "option_name",
 #'       by = "data_element_shortname",
 #'       order = "option_order_in_set"
 #'     )
 #'     print(dat_clean)
+#'
 #'   })
 #' }
 gen_data <- function(dictionary, varnames = "data_element_shortname", numcases = 300, org = "MSF") {

diff --git a/R/gen_eligible_interviewed.R b/R/gen_eligible_interviewed.R
diff --git a/R/gen_msf_data.R b/R/gen_msf_data.R
@@ -248,12 +248,6 @@ gen_msf_data <- function(dictionary, dat_dict, is_survey, varnames = "data_eleme
       household = "q14_hh_no"
     )
 
-
-    dis_output <- gen_eligible_interviewed(dis_output,
-      household = "q14_hh_no",
-      cluster = "q77_what_is_the_cluster_number"
-    )
-
     # use household num as a standin for fact_0_id for now
     dis_output$fact_0_id <- dis_output$q14_hh_no
 

diff --git a/R/msf_dict.R b/R/msf_dict.R
@@ -1,11 +1,12 @@
 #' MSF data dictionaries and dummy datasets
 #'
-#' These function reads in MSF data dictionaries and produces randomised
-#' datasets based on values defined in the dictionaries.  The randomised
-#' dataset produced should mimic an excel export from DHIS2.
+#' These functions produce MSF OCA dictionaries based on DHIS2 data sets
+#' defining the data element name, code, short names, types, and key/value pairs
+#' for translating the codes into human-readable format. 
 #'
 #' @param disease Specify which disease you would like to use.
-#'   Currently supports "Cholera", "Measles" and "Meningitis".
+#'   - `msf_dict()` supports "AJS", "Cholera", "Measles", "Meningitis"
+#'   - `msf_dict_survey()` supports "Mortality", "Nutrition", and "Vaccination"
 #'
 #' @param name the name of the dictionary stored in the package.
 #'
@@ -20,7 +21,7 @@
 #'   format with each option getting one row. If `FALSE`, then two data frames
 #'   are returned, one with variables and the other with content options.
 #'
-#' @seealso [matchmaker::match_df()] [gen_data()]
+#' @seealso [matchmaker::match_df()] [gen_data()] [msf_dict_survey()]
 #' @export
 #' @examples
 #'
@@ -29,24 +30,18 @@
 #'     # You will often want to use MSF dictionaries to translate codes to human-
 #'     # readable variables. Here, we generate a data set of 20 cases:
 #'     dat <- gen_data(
-#'       dictionary = "Cholera", varnames = "data_element_shortname",
-#'       numcases = 20, org = "MSF"
+#'       dictionary = "Cholera", 
+#'       varnames = "data_element_shortname",
+#'       numcases = 20, 
+#'       org = "MSF"
 #'     )
 #'     print(dat)
 #'
 #'     # We want the expanded dictionary, so we will select `compact = FALSE`
 #'     dict <- msf_dict(disease = "Cholera", long = TRUE, compact = FALSE, tibble = TRUE)
 #'     print(dict)
 #'
-#'     # We can use linelist's clean_variable_spelling to translate the codes. First,
-#'     # we want to reorder the columns of the dictionary like so:
-#'     #
-#'     #  - 1st column: option codes
-#'     #  - 2nd column: translations
-#'     #  - 3rd column: data column name
-#'     #  - 4th column: order of options
-#'
-#'     # Now we can use linelist to filter the data:
+#'     # Now we can use matchmaker to translate the codes in the data:
 #'     dat_clean <- matchmaker::match_df(dat, dict,
 #'       from = "option_code",
 #'       to = "option_name",

diff --git a/R/msf_dict_survey.R b/R/msf_dict_survey.R
@@ -9,6 +9,7 @@
 msf_dict_survey <- function(disease, name = "MSF-survey-dict.xlsx",
                             tibble = TRUE,
                             compact = FALSE) {
+
   disease <- get_dictionary(disease)$survey
 
   if (length(disease) == 0) {

diff --git a/R/utils-generators.R b/R/utils-generators.R
@@ -126,5 +126,41 @@ gen_hh_clusters <- function(dis_output, n, cluster = "cluster_number", household
     dis_output[[household]][dis_output[[cluster]] == i] <- hhid
   }
 
+  dis_output <- gen_eligible_interviewed(
+    dis_output, 
+    household = household, 
+    cluster = cluster
+  )
+
   return(dis_output)
 }
+
+#' Generate `eligible` and `interviewed` columns in a data frame
+#'
+#' @param dis_output a data frame containing the household and cluster columns
+#' @param household [character] name of the column specifying household
+#' @param cluster [character] name of the column specifying cluster
+#'
+#' @return `dis_output` with two columns (replacing any existing ones):
+#'   - eligible: the number of rows for each household and cluster combination
+#'   - interviewed: 75% of eligible, rounded to a whole number
+#'
+#' @noRd
+gen_eligible_interviewed <- function(dis_output, household = "q14_hh_no", cluster = "q77_what_is_the_cluster_number") {
+  # drop any existing copies of these columns; they are recomputed and joined back on below
+  dis_output[["eligible"]] <- NULL
+  dis_output[["interviewed"]] <- NULL
+  # turn the column-name strings into symbols so they can be unquoted with `!!`
+  hh <- rlang::sym(household)
+  cl <- rlang::sym(cluster)
+
+  # get counts of rows (people) by household and cluster, stored as "eligible"
+  hh_count <- dplyr::count(dis_output, !!hh, !!cl, .drop = FALSE, name = "eligible")
+
+  # make interviewed 3/4 of those eligible, rounded to zero decimal places
+  hh_count[["interviewed"]] <- round(hh_count[["eligible"]] * 0.75, digits = 0L)
+
+  # merge the counts back onto dis_output by household and cluster; the join result is the return value
+  dplyr::left_join(dis_output, hh_count, by = c(household, cluster))
+
+}
diff --git a/R/utils.R b/R/utils.R
@@ -2,17 +2,14 @@
 #'
 #' @param x a character vector
 #' @param sep a separator to use for non-alphabetical characters
-#' @param transformation passed to [stri::stri_trans_general()]
 #' @param protect any special characters that need to be protected
 #'
 #' @return a transformed character vector
 #' @keywords internal
 #' @note This was taken from the dev version of epitrix to reduce the number
 #'   of packages imported (and because it's not going to be on CRAN anytime soon)
-# Not needed because we don't deal with accents here: importFrom stringi stri_trans_general
 #' @noRd
-tidy_labels <- function(x, sep = "_", # transformation = "Any-Latin; Latin-ASCII",
-                        protect = "") {
+tidy_labels <- function(x, sep = "_", protect = "") {
   x <- as.character(x)
 
   ## On the processing of the input: