From bd8e2464b840a6501dff47857ea0612957e65150 Mon Sep 17 00:00:00 2001
From: ummel <ummel@berkeley.edu>
Date: Thu, 5 Dec 2024 13:13:06 -0700
Subject: [PATCH] Minor fixes and documentation updates.

---
 .github/workflows/pkgdown.yaml |  2 +-
 R/analyze_fusionACS.R          | 40 +++++++++++++++++++++++-----------
 R/assemble.R                   |  2 +-
 R/monotonic.R                  |  2 +-
 man/analyze_fusionACS.Rd       | 14 ++++++------
 5 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
index 0010be4..24fea81 100644
--- a/.github/workflows/pkgdown.yaml
+++ b/.github/workflows/pkgdown.yaml
@@ -1,4 +1,4 @@
-# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
+# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples#build-pkgdown-site
 # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
 # 10/29/24: Set on:workflow_dispatch to allow workflow to be triggered manually: https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-workflow-runs/manually-running-a-workflow
 on:
diff --git a/R/analyze_fusionACS.R b/R/analyze_fusionACS.R
index 26e0dc9..08c9995 100644
--- a/R/analyze_fusionACS.R
+++ b/R/analyze_fusionACS.R
@@ -1,7 +1,7 @@
 #' Analyze fusionACS microdata
 #'
 #' @description
-#' For fusionACS usage only. Calculation of point estimates and associated uncertainty (margin of error) for analyses using ACS and/or fused donor survey variables.
+#' For fusionACS internal use only. Calculation of point estimates and associated uncertainty (margin of error) for analyses using ACS and/or fused donor survey variables.
 #' Efficiently computes means, medians, sums, proportions, and counts, optionally across population subgroups.
 #' The use of native ACS weights or ORNL UrbanPop synthetic population weights is automatically determined given the requested geographic resolution.
 #' Requires a local \code{/fusionData} directory in the working directory path with assumed file structure and conventions.
@@ -15,11 +15,10 @@
 #' @param M Integer. The first \code{M} implicates are used. Set \code{M = Inf} to use all available implicates.
 #' @param R Integer. The first \code{R} replicate weights are used. Set \code{R = Inf} to use all available replicate weights.
 #' @param cores Integer. Number of cores used for multithreading in \code{\link[collapse]{collapse-package}} functions.
-#' @param version_up Integer. TEMPORARY. Use \code{1} to access national, single-implicate weights. Use \code{2} to access 10-replicate initial weights for 17 metro areas.
+#' @param version_up Integer. Use \code{version_up = 1} to access national, single-implicate weights. Use \code{version_up = 2} to access 10-replicate weights for 17 metro areas.
 #' @param force_up Logical. If \code{TRUE}, force use of UrbanPop weights even if the requested analysis can be done using native ACS weights.
-
 #'
-#' @details Allowable geographic units of analysis specified in \code{by} are currently limited to: region, division, state, cbsa10, puma10, county10, cousubfp10 (county subdivision), tract10, zcta10 (zip code), and bg10 (block group).
+#' @details Allowable geographic units of analysis specified in \code{by} are currently limited to: region, division, state, cbsa10, puma10, county10, cousubfp10 (county subdivision), zcta10 (zip code), tract10 (census tract), and bg10 (block group).
 #'
 #' @details The final point estimates are the mean estimates across implicates. The final margin of error is derived from the pooled standard error across implicates, calculated using Rubin's pooling rules (1987). The within-implicate standard error's are calculated using the replicate weights.
 #'
@@ -41,14 +40,14 @@
 #'  \item{lhs}{Optional analysis name; the "left hand side" of the analysis formula.}
 #'  \item{rhs}{The "right hand side" of the analysis formula.}
 #'  \item{type}{Type of analysis: sum, mean, median, prop(ortion) or count.}
-#'  \item{N}{Number of microdata observations used to construct the estimate.}
 #'  \item{level}{Factor levels for categorical analyses; NA otherwise.}
-#'  \item{est}{Point estimate; mean estimate across implicates.}
+#'  \item{N}{Mean number of valid microdata observations across all implicates and replicates; i.e. the sample size used to construct the estimate.}
+#'  \item{est}{Point estimate; mean estimate across all implicates and replicates.}
 #'  \item{moe}{Margin of error associated with the 90% confidence interval.}
 #'  \item{se}{Standard error of the estimate.}
 #'  \item{df}{Degrees of freedom used to calculate the margin of error.}
-#'  \item{cv}{Coefficient of variation; a measure of estimate reliability.}
-#'  \item{rshare}{Share of uncertainty attributable to replicate weights (as opposed to across-implicates uncertainty).}
+#'  \item{cv}{Coefficient of variation; a conventional, scale-independent measure of estimate reliability. Calculated as: \code{100 * moe / 1.645 / est}.}
+#'  \item{rshare}{Share of \code{moe} attributable to replicate weight uncertainty (as opposed to uncertainty across implicates).}
 #'  }
 #'
 #' @references Rubin, D.B. (1987). \emph{Multiple imputation for nonresponse in surveys}. Hoboken, NJ: Wiley.
@@ -329,12 +328,12 @@ analyze_fusionACS <- function(analyses,
   # Attempt to convert any non-formula entries in 'analyses' into a plausible formula
   # This applies to legacy analysis formulation of the kind:
   #  analyses <- list(mean = c("natural_gas", "aircon"), median = "electricity")
-  # The code below converts these to an equivalent formula provided that the function referenced is in .FAST_STAT_FUN
+  # The code below converts these to an equivalent formula and assigns an LHS name of the form "<variable>_<function>" (e.g. natural_gas_mean)
   analyses <- lapply(seq_along(analyses), function(i) {
     x <- analyses[[i]]
     if (!rlang::is_formula(x)) {
       f <- names(analyses)[i]  # The requested outer function
-      fobj <- paste("~", f, "(", x, ")")  # No LHS name in this case
+      fobj <- paste0(gsub(" ", "_", str_squish(x)), "_", f, "~", f, "(`", x, "`)")
       lapply(fobj, as.formula)
     } else {
       x
@@ -386,8 +385,17 @@ analyze_fusionACS <- function(analyses,
   anames <- paste0("A..", match(aexp, unique(aexp)))
   names(alist) <- anames
 
-  # Outer function of each analysis; check that the requested function is allowed
+  # Outer function of each analysis
   afun <- purrr::map_chr(alist, 1)
+
+  # Abbreviation of the inner expression with function appended
+  # Used below to assign LHS when none is provided
+  afun <- gsub("proportion", "prop", afun)
+  lhs.abb <- paste(abbreviate(gsub('`', '', purrr::map_chr(alist, 2))), afun, sep = "_")
+
+  # Convert outer functions, if necessary, and check that the requested function is allowed
+  afun <- gsub("count", "sum", afun)  # Alternative way of requesting a sum
+  afun <- gsub("prop", "mean", afun)
   invalid <- !afun %in% c('sum', 'mean', 'median')  # Valid outer functions
   if (any(invalid)) stop("Outer functions must be sum(), mean(), or median()")
 
@@ -404,10 +412,13 @@ analyze_fusionACS <- function(analyses,
   # The "ANALYSIS" label used to identify each analysis (combination of function and analysis variable, separated by single dot)
   alabel <- paste(afun, names(afun), sep = ".")
 
-  # LHS and RHS of each analysis; assigned to the final results
+  # LHS and RHS of each analysis; assigned to the final output
   alhs <- sapply(alist, function(x) ifelse(length(x[[3]]), x[[3]], NA))
   arhs <- purrr::map_chr(alist, 4)
 
+  # If no LHS provided, assign an abbreviation based on the inner expression
+  alhs[is.na(alhs)] <- lhs.abb[is.na(alhs)]
+
   #-----
 
   # Extract input variables required by 'fun' user function
@@ -780,12 +791,15 @@ analyze_fusionACS <- function(analyses,
 
     # Safety check on dimensions
     stopifnot(nrow(sim) / nrow(static) == Mimp)
-    stopifnot(all(avars %in% names(sim)))
 
     cat("Successfully applied user fun() to microdata\n")
 
   }
 
+  # Check if all required analysis variables are present in 'sim' prior to evaluating inner expressions
+  miss <- setdiff(avars, names(sim))
+  if (length(miss)) stop("The following analysis variables are not present in 'sim': ", paste(miss, collapse = ", "))
+
   #-------
 
   # 'solo' analyses are those with no inner expression modification (can simply rename the target variable)
diff --git a/R/assemble.R b/R/assemble.R
index 4256ef7..cd26532 100644
--- a/R/assemble.R
+++ b/R/assemble.R
@@ -133,7 +133,7 @@ assemble <- function(year,
         dt <- fusionModel::read_fsd(path = x,
                                     columns = intersect(xn, c('M', 'year', 'hid', 'pid', keep)),
                                     M = M,
-                                    df = if (is.null(df)) NULL else select(df, any_of(xn)),
+                                    df = if (any(xn %in% names(df))) select(df, any_of(xn)) else NULL,
                                     cores = cores)
       } else {
         dt <- data.table()
diff --git a/R/monotonic.R b/R/monotonic.R
index 785179b..aac3b24 100644
--- a/R/monotonic.R
+++ b/R/monotonic.R
@@ -6,7 +6,7 @@
 #' @param y Numeric.
 #' @param w Numeric. Optional observation weights.
 #' @param preserve Logical. Preserve the original mean of the \code{y} values in the returned values?
-#' @param expend Logical. Assume \code{y} is an expenditure variable? If \code{TRUE}, a safety check is implemented to ensure \code{y > 0} when \code{x = 0}.
+#' @param expend Logical. Assume \code{y} is an expenditure variable? If \code{TRUE}, a safety check is implemented to ensure \code{y > 0} when \code{x > 0}.
 #' @param fast Logical. If \code{TRUE}, only \code{\link[scam]{supsmu}} is used with coercion of result to monotone.
 #' @param nmax Integer. Maximum number of observations to use for smoothing. Set lower for faster computation. \code{nmax = Inf} eliminates sampling.
 #' @param plot Logical. Plot the (sampled) data points and derived monotonic relationship?
diff --git a/man/analyze_fusionACS.Rd b/man/analyze_fusionACS.Rd
index 6bfd38d..33b01b4 100644
--- a/man/analyze_fusionACS.Rd
+++ b/man/analyze_fusionACS.Rd
@@ -37,7 +37,7 @@ analyze_fusionACS(
 
 \item{cores}{Integer. Number of cores used for multithreading in \code{\link[collapse]{collapse-package}} functions.}
 
-\item{version_up}{Integer. TEMPORARY. Use \code{1} to access national, single-implicate weights. Use \code{2} to access 10-replicate initial weights for 17 metro areas.}
+\item{version_up}{Integer. Use \code{version_up = 1} to access national, single-implicate weights. Use \code{version_up = 2} to access 10-replicate weights for 17 metro areas.}
 
 \item{force_up}{Logical. If \code{TRUE}, force use of UrbanPop weights even if the requested analysis can be done using native ACS weights.}
 }
@@ -48,24 +48,24 @@ A tibble reporting analysis results, possibly across subgroups defined in \code{
 \item{lhs}{Optional analysis name; the "left hand side" of the analysis formula.}
 \item{rhs}{The "right hand side" of the analysis formula.}
 \item{type}{Type of analysis: sum, mean, median, prop(ortion) or count.}
-\item{N}{Number of microdata observations used to construct the estimate.}
 \item{level}{Factor levels for categorical analyses; NA otherwise.}
-\item{est}{Point estimate; mean estimate across implicates.}
+\item{N}{Mean number of valid microdata observations across all implicates and replicates; i.e. the sample size used to construct the estimate.}
+\item{est}{Point estimate; mean estimate across all implicates and replicates.}
 \item{moe}{Margin of error associated with the 90\% confidence interval.}
 \item{se}{Standard error of the estimate.}
 \item{df}{Degrees of freedom used to calculate the margin of error.}
-\item{cv}{Coefficient of variation; a measure of estimate reliability.}
-\item{rshare}{Share of uncertainty attributable to replicate weights (as opposed to across-implicates uncertainty).}
+\item{cv}{Coefficient of variation; a conventional, scale-independent measure of estimate reliability. Calculated as: \code{100 * moe / 1.645 / est}.}
+\item{rshare}{Share of \code{moe} attributable to replicate weight uncertainty (as opposed to uncertainty across implicates).}
 }
 }
 \description{
-For fusionACS usage only. Calculation of point estimates and associated uncertainty (margin of error) for analyses using ACS and/or fused donor survey variables.
+For fusionACS internal use only. Calculation of point estimates and associated uncertainty (margin of error) for analyses using ACS and/or fused donor survey variables.
 Efficiently computes means, medians, sums, proportions, and counts, optionally across population subgroups.
 The use of native ACS weights or ORNL UrbanPop synthetic population weights is automatically determined given the requested geographic resolution.
 Requires a local \code{/fusionData} directory in the working directory path with assumed file structure and conventions.
 }
 \details{
-Allowable geographic units of analysis specified in \code{by} are currently limited to: region, division, state, cbsa10, puma10, county10, cousubfp10 (county subdivision), tract10, zcta10 (zip code), and bg10 (block group).
+Allowable geographic units of analysis specified in \code{by} are currently limited to: region, division, state, cbsa10, puma10, county10, cousubfp10 (county subdivision), zcta10 (zip code), tract10 (census tract), and bg10 (block group).
 
 The final point estimates are the mean estimates across implicates. The final margin of error is derived from the pooled standard error across implicates, calculated using Rubin's pooling rules (1987). The within-implicate standard error's are calculated using the replicate weights.