From 1f35680f948afcfdc16500ca2604bf63578e9ce0 Mon Sep 17 00:00:00 2001 From: alexkowa Date: Tue, 5 Feb 2019 12:50:00 +0100 Subject: [PATCH 01/11] get(numericalWeightingVar) instead of tmpVarForMultiplication --- R/ipf.r | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/R/ipf.r b/R/ipf.r index b6498b7..3f063ba 100644 --- a/R/ipf.r +++ b/R/ipf.r @@ -128,7 +128,7 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, numericalWeighting, numericalWeightingVar) { epsPcur <- maxFac <- OriginalSortingVariable <- V1 <- baseWeight <- calibWeight <- epsvalue <- f <- NULL - temporary_hid <- temporary_hvar <- tmpVarForMultiplication <- value <- + temporary_hid <- temporary_hvar <- value <- wValue <- wvst <- NULL combined_factors <- dat[[paste0("combined_factors_", i)]] @@ -143,19 +143,17 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, if (!is.null(numericalWeightingVar)) { ## numerical variable to be calibrated ## use name of conP list element to define numerical variable - setnames(dat, numericalWeightingVar, "tmpVarForMultiplication") - dat[, f := ipf_step_f(calibWeight * tmpVarForMultiplication, + dat[, f := ipf_step_f(calibWeight * get(numericalWeightingVar), combined_factors, con_current)] dat[, wValue := value / f] # try to divide the weight between units with larger/smaller value in the # numerical variable linear dat[, f := numericalWeighting(head(wValue, 1), head(value, 1), - tmpVarForMultiplication, calibWeight), + get(numericalWeightingVar), calibWeight), by = eval(paste0("combined_factors_", i))] - setnames(dat, "tmpVarForMultiplication", numericalWeightingVar) } else { # categorical variable to be calibrated @@ -194,7 +192,7 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, looseH, numericalWeighting, numericalWeightingVar) { epsHcur <- OriginalSortingVariable <- V1 <- baseWeight <- calibWeight <- epsvalue <- f <- NULL - maxFac <- temporary_hid <- temporary_hvar <- tmpVarForMultiplication <- + maxFac <- temporary_hid <- temporary_hvar <- value <- wValue <- wvst <- NULL setnames(dat, valueH[i], "value") @@ -209,19 +207,17 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, if (!is.null(numericalWeightingVar)) { ## numerical variable to be calibrated ## use name of conH list element to define numerical variable - setnames(dat, numericalWeightingVar, "tmpVarForMultiplication") - dat[, f := ipf_step_f(calibWeight * wvst * tmpVarForMultiplication, + dat[, f := ipf_step_f(calibWeight * wvst * get(numericalWeightingVar), combined_factors, con_current)] dat[, wValue := value / f] # try to divide the weight between units with larger/smaller value in the # numerical variable linear dat[, f := numericalWeighting(head(wValue, 1), head(value, 1), - tmpVarForMultiplication, calibWeight), + get(numericalWeightingVar), calibWeight), by = eval(paste0("combined_factors_h_", i))] - setnames(dat, "tmpVarForMultiplication", numericalWeightingVar) } else { # categorical variable to be calibrated dat[, f := ipf_step_f(calibWeight * wvst, combined_factors, con_current)] @@ -499,7 +495,7 @@ ipf <- function( stop("The provided dataset must not have a column called 'w'") OriginalSortingVariable <- V1 <- baseWeight <- calibWeight <- epsvalue <- - f <- temporary_hid <- temporary_hvar <- tmpVarForMultiplication <- + f <- temporary_hid <- temporary_hvar <- value <- wValue <- wvst <- NULL dat_original <- dat dat <- copy(dat) From 
65189b8b86b7a106fcd7d17346ad1df63152b0a2 Mon Sep 17 00:00:00 2001 From: alexkowa Date: Tue, 5 Feb 2019 12:50:15 +0100 Subject: [PATCH 02/11] extend test for numericalWeighting --- tests/testthat/test_ipf.R | 62 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test_ipf.R b/tests/testthat/test_ipf.R index 2652ac4..49b6ece 100644 --- a/tests/testthat/test_ipf.R +++ b/tests/testthat/test_ipf.R @@ -18,6 +18,7 @@ eusilcS[, agegroup := cut(age, c(-Inf, 10 * 1:9, Inf), right = FALSE)] # some recoding of netIncome for reasons of simplicity eusilcS[is.na(netIncome), netIncome := 0] eusilcS[netIncome < 0, netIncome := 0] +eusilcS[, HHnetIncome := sum(netIncome),by=household] # set hsize to 1,...,5+ eusilcS[, hsize := cut(hsize, c(0:4, Inf), labels = c(1:4, "5+"))] # treat households as a factor variable @@ -39,7 +40,9 @@ conP3 <- xtabs(weight * netIncome ~ gender, data = eusilcS) ## constraints on household level conH1 <- xtabs(weight ~ hsize + state, data = eusilcS, subset = !duplicated(household)) - +## constraints on household level netIncome +conH2 <- xtabs(weight * HHnetIncome ~ state, data = eusilcS, + subset = !duplicated(household)) # array of convergence limits for conH1 epsH1 <- conH1 epsH1[1:4, ] <- 0.005 @@ -49,7 +52,7 @@ epsH1["5+", ] <- 0.2 -test_that("ipf with a numerical variable works as expected", { +test_that("ipf with a numerical variable works as expected - computeLinear", { # without array epsP1 calibweights1 <- ipf( eusilcS, hid = "household", @@ -57,7 +60,60 @@ conH = list(conH1), epsP = list(1e-06, 1e-06, 1e-03), epsH = 0.01, - bound = NULL, verbose = FALSE, maxIter = 200) + bound = NULL, verbose = FALSE, maxIter = 200,numericalWeighting = computeLinear) expect_true(abs(calibweights1[, sum(calibWeight * netIncome)] - sum(conP3)) / sum(conP3) < .01) expect_true(all( abs(calibweights1[, sum(calibWeight * netIncome), by = gender]$V1 - conP3) / conP3 < .01 )) + + err <- max(c( + max(abs(xtabs(calibWeight ~ agegroup, data = calibweights1) - + conP1) / conP1), + max(abs(xtabs(calibWeight ~ gender + state, data = calibweights1) - + conP2) / conP2), + max(abs(xtabs(calibWeight ~ hsize + state, data = calibweights1, + subset = !duplicated(household)) - conH1) / conH1))) + expect_true(err < .01) +}) + +test_that("ipf with a numerical variable works as expected - computeLinearG1", { + # without array epsP1 + calibweights1 <- ipf( + eusilcS, hid = "household", + conP = list(conP1, conP2, netIncome = conP3), + conH = list(conH1), + epsP = list(1e-06, 1e-06, 1e-03), + epsH = 0.01, + bound = NULL, verbose = FALSE, maxIter = 200,numericalWeighting = computeLinearG1) + expect_true(abs(calibweights1[, sum(calibWeight * netIncome)] - sum(conP3)) / + sum(conP3) < .01) + expect_true(all( + abs(calibweights1[, sum(calibWeight * netIncome), by = gender]$V1 - conP3) / + conP3 < .01 + )) + + err <- max(c( + max(abs(xtabs(calibWeight ~ agegroup, data = calibweights1) - + conP1) / conP1), + max(abs(xtabs(calibWeight ~ gender + state, data = calibweights1) - + conP2) / conP2), + max(abs(xtabs(calibWeight ~ hsize + state, data = calibweights1, + subset = !duplicated(household)) - conH1) / conH1))) + expect_true(err < .01) +}) + + +test_that("ipf with a numerical variable in households as expected", { + # without array epsP1 + calibweights1 <- ipf( + eusilcS, hid = "household", + conP = list(conP1, conP2), + conH = list(conH1, HHnetIncome = conH2), + epsP =
list(1e-06, 1e-06), epsH = list(0.01, 0.01), bound = NULL, verbose = FALSE, maxIter = 50, numericalWeighting = computeFrac) expect_true(abs(calibweights1[, sum(calibWeight * netIncome)] - sum(conP3)) / sum(conP3) < .01) expect_true(all( From 17130fb156445a7c23acd01234304b3684c61f71 Mon Sep 17 00:00:00 2001 From: alexkowa Date: Tue, 5 Feb 2019 12:56:47 +0100 Subject: [PATCH 03/11] remove temporary_hid and use hid directly --- R/ipf.r | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/R/ipf.r b/R/ipf.r index 3f063ba..3efefe1 100644 --- a/R/ipf.r +++ b/R/ipf.r @@ -128,7 +128,7 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, numericalWeighting, numericalWeightingVar) { epsPcur <- maxFac <- OriginalSortingVariable <- V1 <- baseWeight <- calibWeight <- epsvalue <- f <- NULL - temporary_hid <- temporary_hvar <- value <- + temporary_hvar <- value <- wValue <- wvst <- NULL combined_factors <- dat[[paste0("combined_factors_", i)]] @@ -192,7 +192,7 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, looseH, numericalWeighting, numericalWeightingVar) { epsHcur <- OriginalSortingVariable <- V1 <- baseWeight <- calibWeight <- epsvalue <- f <- NULL - maxFac <- temporary_hid <- temporary_hvar <- + maxFac <- temporary_hvar <- value <- wValue <- wvst <- NULL setnames(dat, valueH[i], "value") @@ -495,7 +495,7 @@ ipf <- function( stop("The provided dataset must not have a column called 'w'") OriginalSortingVariable <- V1 <- baseWeight <- calibWeight <- epsvalue <- - f <- temporary_hid <- temporary_hvar <- + f <- temporary_hvar <- value <- wValue <- wvst <- NULL dat_original <- dat dat <- copy(dat) @@ -525,18 +525,14 @@ ipf <- function( dat[, hid := 1:nrow(dat)] dat[, wvst := 1] } else { - setnames(dat, hid, "temporary_hid") - dat[, wvst := as.numeric(!duplicated(temporary_hid))] - setnames(dat, "temporary_hid", hid) + dat[, wvst := as.numeric(!duplicated(get(hid)))] } - setnames(dat, hid, "temporary_hid") - if (!is.factor(dat$temporary_hid)) { + if (!is.factor(dat[[hid]])) { if (conversion_messages) message("convert household variable ", hid, " to factor") - dat[, temporary_hid := as.factor(temporary_hid)] + dat[, c(hid) := as.factor(get(hid))] } - setnames(dat, "temporary_hid", hid) ## Names of the calibration variables for Person and household dimension pColNames <- lapply(conP, function(x) names(dimnames(x))) hColNames <- lapply(conH, function(x) names(dimnames(x))) @@ -617,16 +613,14 @@ ipf <- function( ## Check for non-unqiue values inside of a household for variabels used ## in Household constraints for (hh in hColNames){ - setnames(dat, hid, "temporary_hid") for (h in hh) { setnames(dat, h, "temporary_hvar") if (dat[, length(unique(temporary_hvar)), - by = temporary_hid][, any(V1 != 1)]) { + by = c(hid)][, any(V1 != 1)]) { stop(paste(h, "has different values inside a household")) } setnames(dat, "temporary_hvar", h) } - setnames(dat, "temporary_hid", hid) } } From 6e4a0a98e785178e014af9b66b7d05e4d677ddc9 Mon Sep 17 00:00:00 2001 From: alexkowa Date: Tue, 5 Feb 2019 12:59:04 +0100 Subject: [PATCH 04/11] remove conversion of hid to factor --- R/ipf.r | 6 ------ 1 file changed, 6 deletions(-) diff --git a/R/ipf.r b/R/ipf.r index 3efefe1..b3e8f10 100644 --- a/R/ipf.r +++ b/R/ipf.r @@ -528,12 +528,6 @@ ipf <- function( dat[, wvst := as.numeric(!duplicated(get(hid)))] } - if (!is.factor(dat[[hid]])) { - if (conversion_messages) - message("convert household variable ", hid, " to factor") - dat[, c(hid) := as.factor(get(hid))] - } - ## Names of the calibration
variables for Person and household dimension pColNames <- lapply(conP, function(x) names(dimnames(x))) hColNames <- lapply(conH, function(x) names(dimnames(x))) From 8a34401b3a4f8b77bd43ff2e05667367e3b1aeb1 Mon Sep 17 00:00:00 2001 From: alexkowa Date: Tue, 5 Feb 2019 13:11:17 +0100 Subject: [PATCH 05/11] use w as name for baseweight directly --- R/ipf.r | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/R/ipf.r b/R/ipf.r index b3e8f10..173b453 100644 --- a/R/ipf.r +++ b/R/ipf.r @@ -125,12 +125,12 @@ check_population_totals <- function(con, dat, type = "personal") { } calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, - numericalWeighting, numericalWeightingVar) { - epsPcur <- maxFac <- OriginalSortingVariable <- V1 <- baseWeight <- + numericalWeighting, numericalWeightingVar, w) { + epsPcur <- maxFac <- OriginalSortingVariable <- V1 <- calibWeight <- epsvalue <- f <- NULL temporary_hvar <- value <- wValue <- wvst <- NULL - + variableKeepingTheBaseWeight <- w combined_factors <- dat[[paste0("combined_factors_", i)]] setnames(dat, valueP[i], "value") setnames(dat, paste0("epsP_", i), "epsPcur") @@ -174,7 +174,7 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, } } if (!is.null(bound)) { - dat[!is.na(f), calibWeight := boundsFak(calibWeight, baseWeight, f, + dat[!is.na(f), calibWeight := boundsFak(calibWeight, get(variableKeepingTheBaseWeight), f, bound = bound)] #,by=eval(pColNames[[i]])] } else { @@ -189,8 +189,9 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, } calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, - looseH, numericalWeighting, numericalWeightingVar) { - epsHcur <- OriginalSortingVariable <- V1 <- baseWeight <- calibWeight <- + looseH, numericalWeighting, numericalWeightingVar, w) { + variableKeepingTheBaseWeight <- w + epsHcur <- OriginalSortingVariable <- V1 <- calibWeight <- epsvalue <- f <- NULL maxFac <- temporary_hvar <- value <- wValue <- wvst <- NULL @@ -240,10 +241,10 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, } if (!is.null(bound)) { if (!looseH) { - dat[, calibWeight := boundsFak(g1 = calibWeight, g0 = baseWeight, f = f, + dat[, calibWeight := boundsFak(g1 = calibWeight, g0 = get(variableKeepingTheBaseWeight), f = f, bound = bound)]#,by=eval(hColNames[[i]])] }else{ - dat[, calibWeight := boundsFakHH(g1 = calibWeight, g0 = baseWeight, + dat[, calibWeight := boundsFakHH(g1 = calibWeight, g0 = get(variableKeepingTheBaseWeight), eps = epsHcur, orig = value, p = wValue, bound = bound)] } @@ -490,11 +491,11 @@ ipf <- function( check_population_totals(conP, dat, "personal") check_population_totals(conH, dat, "household") + variableKeepingTheBaseWeight <- w + if ("variableKeepingTheBaseWeight" %in% names(dat)) + stop("The provided dataset must not have a column called 'variableKeepingTheBaseWeight'") - if ("w" %in% names(dat)) - stop("The provided dataset must not have a column called 'w'") - - OriginalSortingVariable <- V1 <- baseWeight <- calibWeight <- epsvalue <- + OriginalSortingVariable <- V1 <- calibWeight <- epsvalue <- f <- temporary_hvar <- value <- wValue <- wvst <- NULL dat_original <- dat @@ -509,13 +510,11 @@ ipf <- function( ###fixed target value, should not be changed in iterations valueH <- paste0("valueH", seq_along(conH)) ###Housekeeping of the varNames used - usedVarNames <- c(valueP, valueH, "value", "baseWeight", "wvst", "wValue") + 
usedVarNames <- c(valueP, valueH, "value", "wvst", "wValue") if (any(names(dat) %in% usedVarNames)) { renameVars <- names(dat)[names(dat) %in% usedVarNames] setnames(dat, renameVars, paste0(renameVars, "_safekeeping")) - if (isTRUE(w == "baseWeight")) - w <- "baseWeight_safekeeping" } ### Treatment of HID, creating 0,1 var for being the first hh member #delVars <- c() @@ -593,14 +592,12 @@ ipf <- function( dat[, paste0("valueH", i) := tmp] } - if (is.null(w)) { + if (is.null(variableKeepingTheBaseWeight)) { if (!is.null(bound) && is.null(w)) stop("Bounds are only reasonable if base weights are provided") dat[, calibWeight := 1] - #delVars <- c(delVars,"baseWeight") } else { - dat[, calibWeight := dat[, w, with = FALSE]] - setnames(dat, w, "baseWeight") + dat[, calibWeight := dat[, variableKeepingTheBaseWeight, with = FALSE]] } if (check_hh_vars) { @@ -665,7 +662,8 @@ ipf <- function( i = i, dat = dat, error = error, valueP = valueP, pColNames = pColNames, bound = bound, verbose = verbose, calIter = calIter, numericalWeighting = numericalWeighting, - numericalWeightingVar = numericalWeightingTmp) + numericalWeightingVar = numericalWeightingTmp, + w = variableKeepingTheBaseWeight) } ## replace person weight with household average @@ -683,7 +681,8 @@ ipf <- function( hColNames = hColNames, bound = bound, verbose = verbose, calIter = calIter, looseH = looseH, numericalWeighting = numericalWeighting, - numericalWeightingVar = numericalWeightingTmp) + numericalWeightingVar = numericalWeightingTmp, + w = variableKeepingTheBaseWeight) } } else { ### Person calib @@ -696,7 +695,8 @@ ipf <- function( i = i, dat = dat, error = error, valueP = valueP, pColNames = pColNames, bound = bound, verbose = verbose, calIter = calIter, numericalWeighting = numericalWeighting, - numericalWeightingVar = numericalWeightingTmp) + numericalWeightingVar = numericalWeightingTmp, + w = variableKeepingTheBaseWeight) ## replace person weight with household average dh <- dat[[hid]] @@ -712,7 +712,8 @@ ipf <- function( i = i, dat = dat, error = error, valueH = valueH, hColNames = hColNames, bound = bound, verbose = verbose, calIter = calIter, numericalWeighting = numericalWeighting, - numericalWeightingVar = numericalWeightingTmp, looseH = looseH) + numericalWeightingVar = numericalWeightingTmp, looseH = looseH, + w = variableKeepingTheBaseWeight) } } } From fc79ae389c54688780853b4e437387fbc3e73878 Mon Sep 17 00:00:00 2001 From: alexkowa Date: Tue, 5 Feb 2019 13:26:47 +0100 Subject: [PATCH 06/11] make the naming of calibWeight a parameter --- R/ipf.r | 80 ++++++++++++++++++++++++++++++------------------------ man/ipf.Rd | 5 +++- 2 files changed, 49 insertions(+), 36 deletions(-) diff --git a/R/ipf.r b/R/ipf.r index 173b453..ab60215 100644 --- a/R/ipf.r +++ b/R/ipf.r @@ -125,12 +125,13 @@ check_population_totals <- function(con, dat, type = "personal") { } calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, - numericalWeighting, numericalWeightingVar, w) { + numericalWeighting, numericalWeightingVar, w, cw) { epsPcur <- maxFac <- OriginalSortingVariable <- V1 <- - calibWeight <- epsvalue <- f <- NULL + epsvalue <- f <- NULL temporary_hvar <- value <- wValue <- wvst <- NULL variableKeepingTheBaseWeight <- w + variableKeepingTheCalibWeight <- cw combined_factors <- dat[[paste0("combined_factors_", i)]] setnames(dat, valueP[i], "value") setnames(dat, paste0("epsP_", i), "epsPcur") @@ -144,20 +145,20 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, ## numerical variable 
to be calibrated ## use name of conP list element to define numerical variable - dat[, f := ipf_step_f(calibWeight * get(numericalWeightingVar), + dat[, f := ipf_step_f(get(variableKeepingTheCalibWeight) * get(numericalWeightingVar), combined_factors, con_current)] dat[, wValue := value / f] # try to divide the weight between units with larger/smaller value in the # numerical variable linear dat[, f := numericalWeighting(head(wValue, 1), head(value, 1), - get(numericalWeightingVar), calibWeight), + get(numericalWeightingVar), get(variableKeepingTheCalibWeight)), by = eval(paste0("combined_factors_", i))] } else { # categorical variable to be calibrated - dat[, f := ipf_step_f(dat$calibWeight, combined_factors, con_current)] + dat[, f := ipf_step_f(dat[[variableKeepingTheCalibWeight]], combined_factors, con_current)] } if (dat[!is.na(f), any(abs(1 / f - 1) > epsPcur)]) { ## sicherheitshalber abs(epsPcur)? Aber es wird schon niemand negative eps @@ -167,18 +168,18 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, if (calIter %% 100 == 0) { tmp <- dat[!is.na(f) & (abs(1 / f - 1) > epsPcur), list(maxFac = max(abs(1 / f - 1)), .N, head(epsPcur, 1), - sumCalib = sum(calibWeight), head(value, 1)), + sumCalib = sum(get(variableKeepingTheCalibWeight)), head(value, 1)), by = eval(pColNames[[i]])] print(tmp[order(maxFac, decreasing = TRUE), ]) message("-----------------------------------------\n") } } if (!is.null(bound)) { - dat[!is.na(f), calibWeight := boundsFak(calibWeight, get(variableKeepingTheBaseWeight), f, + dat[!is.na(f), c(variableKeepingTheCalibWeight) := boundsFak(get(variableKeepingTheCalibWeight), get(variableKeepingTheBaseWeight), f, bound = bound)] #,by=eval(pColNames[[i]])] } else { - dat[!is.na(f), calibWeight := f * calibWeight, + dat[!is.na(f), c(variableKeepingTheCalibWeight) := f * get(variableKeepingTheCalibWeight), by = eval(paste0("combined_factors_", i))] } error <- TRUE @@ -189,9 +190,10 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, } calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, - looseH, numericalWeighting, numericalWeightingVar, w) { + looseH, numericalWeighting, numericalWeightingVar, w, cw) { variableKeepingTheBaseWeight <- w - epsHcur <- OriginalSortingVariable <- V1 <- calibWeight <- + variableKeepingTheCalibWeight <- cw + epsHcur <- OriginalSortingVariable <- V1 <- epsvalue <- f <- NULL maxFac <- temporary_hvar <- value <- wValue <- wvst <- NULL @@ -209,19 +211,19 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, ## numerical variable to be calibrated ## use name of conH list element to define numerical variable - dat[, f := ipf_step_f(calibWeight * wvst * get(numericalWeightingVar), + dat[, f := ipf_step_f(get(variableKeepingTheCalibWeight) * wvst * get(numericalWeightingVar), combined_factors, con_current)] dat[, wValue := value / f] # try to divide the weight between units with larger/smaller value in the # numerical variable linear dat[, f := numericalWeighting(head(wValue, 1), head(value, 1), - get(numericalWeightingVar), calibWeight), + get(numericalWeightingVar), get(variableKeepingTheCalibWeight)), by = eval(paste0("combined_factors_h_", i))] } else { # categorical variable to be calibrated - dat[, f := ipf_step_f(calibWeight * wvst, combined_factors, con_current)] + dat[, f := ipf_step_f(get(variableKeepingTheCalibWeight) * wvst, combined_factors, con_current)] } dat[, wValue := value / f] @@ -232,7 +234,7 @@ calibH <- 
function(i, dat, error, valueH, hColNames, bound, verbose, calIter, if (calIter %% 100 == 0) { tmp <- dat[!is.na(f) & (abs(1 / f - 1) > epsHcur), list(maxFac = max(abs(1 / f - 1)), .N, head(epsHcur, 1), - sumCalibWeight = sum(calibWeight * wvst), + sumCalibWeight = sum(get(variableKeepingTheCalibWeight) * wvst), head(value, 1)), by = eval(hColNames[[i]])] print(tmp[order(maxFac, decreasing = TRUE), ]) @@ -241,15 +243,15 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, } if (!is.null(bound)) { if (!looseH) { - dat[, calibWeight := boundsFak(g1 = calibWeight, g0 = get(variableKeepingTheBaseWeight), f = f, + dat[, c(variableKeepingTheCalibWeight) := boundsFak(g1 = get(variableKeepingTheCalibWeight), g0 = get(variableKeepingTheBaseWeight), f = f, bound = bound)]#,by=eval(hColNames[[i]])] }else{ - dat[, calibWeight := boundsFakHH(g1 = calibWeight, g0 = get(variableKeepingTheBaseWeight), + dat[, c(variableKeepingTheCalibWeight) := boundsFakHH(g1 = get(variableKeepingTheCalibWeight), g0 = get(variableKeepingTheBaseWeight), eps = epsHcur, orig = value, p = wValue, bound = bound)] } } else { - dat[, calibWeight := f * calibWeight, by = eval( + dat[, c(variableKeepingTheCalibWeight) := f * get(variableKeepingTheCalibWeight), by = eval( paste0("combined_factors_h_", i))] } error <- TRUE @@ -261,7 +263,7 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, } ## recreate the formula argument to xtabs based on conP, conH -getFormulas <- function(con, w = "calibWeight") { +getFormulas <- function(con, w) { formOut <- NULL for (i in seq_along(con)) { lhs <- names(con)[i] @@ -279,20 +281,21 @@ getFormulas <- function(con, w = "calibWeight") { ## enrich dat_original with the calibrated weights and assign attributes addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original, - maxIter, calIter, returnNA) { - wvst <- OriginalSortingVariable <- calibWeight <- NULL + maxIter, calIter, returnNA, cw) { + variableKeepingTheCalibWeight <- cw + wvst <- OriginalSortingVariable <- outTable <- copy(dat_original) # add calibrated weights. Use setkey to make sure the indexes match setkey(dat, OriginalSortingVariable) if ((maxIter < calIter) & returnNA) - outTable[, calibWeight := NA] + outTable[, c(variableKeepingTheCalibWeight) := NA] else - outTable[, calibWeight := dat$calibWeight] + outTable[, c(variableKeepingTheCalibWeight) := dat[[variableKeepingTheCalibWeight]]] - formP <- getFormulas(conP) - formH <- getFormulas(conH) + formP <- getFormulas(conP, w = variableKeepingTheCalibWeight) + formH <- getFormulas(conH, w = variableKeepingTheCalibWeight) # general information setattr(outTable, "converged", (maxIter >= calIter)) @@ -402,6 +405,7 @@ addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original, #' household constraints #' @param conversion_messages show a message, if inputs need to be reformatted. This can be useful for speed #' optimizations if ipf is called several times with similar inputs (for example bootstrapping) +#' @param nameCalibWeight character defining the name of the variable for the newly generated calibrated weight. #' @return The function will return the input data `dat` with the #' calibrated weights `calibWeight` as an additional column as well as attributes. If no convergence has been reached in `maxIter` #' steps, and `returnNA` is `TRUE` (the default), the column `calibWeights` will only consist of `NA`s. 
The attributes of the table are @@ -487,15 +491,17 @@ ipf <- function( dat, hid = NULL, conP = NULL, conH = NULL, epsP = 1e-6, epsH = 1e-2, verbose = FALSE, w = NULL, bound = 4, maxIter = 200, meanHH = TRUE, allPthenH = TRUE, returnNA = TRUE, looseH = FALSE, numericalWeighting = - computeLinear, check_hh_vars = TRUE, conversion_messages = FALSE) { + computeLinear, check_hh_vars = TRUE, conversion_messages = FALSE, + nameCalibWeight = "calibWeight") { check_population_totals(conP, dat, "personal") check_population_totals(conH, dat, "household") variableKeepingTheBaseWeight <- w + variableKeepingTheCalibWeight <- nameCalibWeight if ("variableKeepingTheBaseWeight" %in% names(dat)) stop("The provided dataset must not have a column called 'variableKeepingTheBaseWeight'") - OriginalSortingVariable <- V1 <- calibWeight <- epsvalue <- + OriginalSortingVariable <- V1 <- epsvalue <- f <- temporary_hvar <- value <- wValue <- wvst <- NULL dat_original <- dat @@ -595,9 +601,9 @@ ipf <- function( if (is.null(variableKeepingTheBaseWeight)) { if (!is.null(bound) && is.null(w)) stop("Bounds are only reasonable if base weights are provided") - dat[, calibWeight := 1] + dat[, c(variableKeepingTheCalibWeight) := 1] } else { - dat[, calibWeight := dat[, variableKeepingTheBaseWeight, with = FALSE]] + dat[, c(variableKeepingTheCalibWeight) := dat[, variableKeepingTheBaseWeight, with = FALSE]] } if (check_hh_vars) { @@ -663,12 +669,13 @@ ipf <- function( pColNames = pColNames, bound = bound, verbose = verbose, calIter = calIter, numericalWeighting = numericalWeighting, numericalWeightingVar = numericalWeightingTmp, - w = variableKeepingTheBaseWeight) + w = variableKeepingTheBaseWeight, + cw = variableKeepingTheCalibWeight) } ## replace person weight with household average dh <- dat[[hid]] - dat[, calibWeight := meanfun(calibWeight, dh)] + dat[, c(variableKeepingTheCalibWeight) := meanfun(get(variableKeepingTheCalibWeight), dh)] ### Household calib for (i in seq_along(conH)) { @@ -682,7 +689,8 @@ ipf <- function( calIter = calIter, looseH = looseH, numericalWeighting = numericalWeighting, numericalWeightingVar = numericalWeightingTmp, - w = variableKeepingTheBaseWeight) + w = variableKeepingTheBaseWeight, + cw = variableKeepingTheCalibWeight) } } else { ### Person calib @@ -696,11 +704,12 @@ ipf <- function( pColNames = pColNames, bound = bound, verbose = verbose, calIter = calIter, numericalWeighting = numericalWeighting, numericalWeightingVar = numericalWeightingTmp, - w = variableKeepingTheBaseWeight) + w = variableKeepingTheBaseWeight, + cw = variableKeepingTheCalibWeight) ## replace person weight with household average dh <- dat[[hid]] - dat[, calibWeight := meanfun(calibWeight, dh)] + dat[, c(variableKeepingTheCalibWeight) := meanfun(get(variableKeepingTheCalibWeight), dh)] ### Household calib for (i in seq_along(conH)) { @@ -713,7 +722,8 @@ ipf <- function( hColNames = hColNames, bound = bound, verbose = verbose, calIter = calIter, numericalWeighting = numericalWeighting, numericalWeightingVar = numericalWeightingTmp, looseH = looseH, - w = variableKeepingTheBaseWeight) + w = variableKeepingTheBaseWeight, + cw = variableKeepingTheCalibWeight) } } } @@ -727,5 +737,5 @@ ipf <- function( } addWeightsAndAttributes(dat, conP, conH, epsP, epsH, dat_original, maxIter, - calIter, returnNA) + calIter, returnNA, variableKeepingTheCalibWeight) } diff --git a/man/ipf.Rd b/man/ipf.Rd index b894e87..e734b83 100644 --- a/man/ipf.Rd +++ b/man/ipf.Rd @@ -8,7 +8,8 @@ ipf(dat, hid = NULL, conP = NULL, conH = NULL, epsP = 
1e-06, epsH = 0.01, verbose = FALSE, w = NULL, bound = 4, maxIter = 200, meanHH = TRUE, allPthenH = TRUE, returnNA = TRUE, looseH = FALSE, numericalWeighting = computeLinear, - check_hh_vars = TRUE, conversion_messages = FALSE) + check_hh_vars = TRUE, conversion_messages = FALSE, + nameCalibWeight = "calibWeight") } \arguments{ \item{dat}{a \code{data.table} containing household ids (optionally), base @@ -81,6 +82,8 @@ household constraints} \item{conversion_messages}{show a message, if inputs need to be reformatted. This can be useful for speed optimizations if ipf is called several times with similar inputs (for example bootstrapping)} + +\item{nameCalibWeight}{character defining the name of the variable for the newly generated calibrated weight.} } \value{ The function will return the input data \code{dat} with the From 58bbc0dbd6d9a69686588301126a34a5ed0115a6 Mon Sep 17 00:00:00 2001 From: alexkowa Date: Tue, 5 Feb 2019 13:33:07 +0100 Subject: [PATCH 07/11] add test for renaming calibWeight --- tests/testthat/test_ipf.R | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/testthat/test_ipf.R b/tests/testthat/test_ipf.R index 49b6ece..c90eadc 100644 --- a/tests/testthat/test_ipf.R +++ b/tests/testthat/test_ipf.R @@ -146,3 +146,19 @@ test_that("ipf works as expected", { subset = !duplicated(household)) - conH1) / conH1))) expect_true(err < .01) }) + +test_that("ipf works as expected calibWeight renamed", { + # with array epsP1, base weights and bound + calibweights2 <- ipf( + eusilcS, hid = "household", conP = list(conP1, conP2), conH = list(conH1), + epsP = 1e-06, epsH = list(epsH1), w = "baseWeight", bound = 4, + verbose = FALSE, maxIter = 200, nameCalibWeight = "calibWeightNew") + err <- max(c( + max(abs(xtabs(calibWeightNew ~ agegroup, data = calibweights2) - conP1) / + conP1), + max(abs(xtabs(calibWeightNew ~ gender + state, data = calibweights2) - conP2) / + conP2), + max(abs(xtabs(calibWeightNew ~ hsize + state, data = calibweights2, + subset = !duplicated(household)) - conH1) / conH1))) + expect_true(err < .01) +}) From 02089cc2c5702b472f6d79fda7b0d5abfb49a6d5 Mon Sep 17 00:00:00 2001 From: alexkowa Date: Wed, 6 Feb 2019 11:53:30 +0100 Subject: [PATCH 08/11] set instead of := --- R/ipf.r | 98 +++++++++++++++++++++++++++------------------------------ 1 file changed, 47 insertions(+), 51 deletions(-) diff --git a/R/ipf.r b/R/ipf.r index ab60215..8b2d726 100644 --- a/R/ipf.r +++ b/R/ipf.r @@ -19,11 +19,11 @@ getMeanFun <- function(meanHH) { if (identical(meanHH, FALSE)) meanHH <- "none" meanfun <- switch(meanHH, - arithmetic = arithmetic_mean, - geometric = geometric_mean, - none = function(x, w){ - x - } + arithmetic = arithmetic_mean, + geometric = geometric_mean, + none = function(x, w){ + x + } ) if (is.null(meanfun)) stop("invalid value for meanHH") @@ -144,21 +144,20 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, if (!is.null(numericalWeightingVar)) { ## numerical variable to be calibrated ## use name of conP list element to define numerical variable - - dat[, f := ipf_step_f(get(variableKeepingTheCalibWeight) * get(numericalWeightingVar), - combined_factors, con_current)] - dat[, wValue := value / f] + set(dat, j = "f", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * dat[[numericalWeightingVar]], + combined_factors, con_current)) + set(dat, j = "wValue", value = dat[["value"]]/dat[["f"]]) # try to divide the weight between units with larger/smaller value in the # numerical variable linear dat[, f := 
numericalWeighting(head(wValue, 1), head(value, 1), - get(numericalWeightingVar), get(variableKeepingTheCalibWeight)), + get(numericalWeightingVar), get(variableKeepingTheCalibWeight)), by = eval(paste0("combined_factors_", i))] } else { # categorical variable to be calibrated - dat[, f := ipf_step_f(dat[[variableKeepingTheCalibWeight]], combined_factors, con_current)] + set(dat, j = "f", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]], combined_factors, con_current)) } if (dat[!is.na(f), any(abs(1 / f - 1) > epsPcur)]) { ## sicherheitshalber abs(epsPcur)? Aber es wird schon niemand negative eps @@ -176,7 +175,7 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, } if (!is.null(bound)) { dat[!is.na(f), c(variableKeepingTheCalibWeight) := boundsFak(get(variableKeepingTheCalibWeight), get(variableKeepingTheBaseWeight), f, - bound = bound)] + bound = bound)] #,by=eval(pColNames[[i]])] } else { dat[!is.na(f), c(variableKeepingTheCalibWeight) := f * get(variableKeepingTheCalibWeight), @@ -204,16 +203,15 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, combined_factors <- dat[[paste0("combined_factors_h_", i)]] tmp <- data.table(x = factor(levels(combined_factors))) setnames(tmp, "x", paste0("combined_factors_h_", i)) - paste0("combined_factors_h_", i) + paste0("combined_factors_h_", i) con_current <- dat[tmp, on = paste0("combined_factors_h_", i), mult = "first", value] if (!is.null(numericalWeightingVar)) { ## numerical variable to be calibrated ## use name of conH list element to define numerical variable - - dat[, f := ipf_step_f(get(variableKeepingTheCalibWeight) * wvst * get(numericalWeightingVar), - combined_factors, con_current)] - dat[, wValue := value / f] + set(dat, j = "f", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * dat[["wvst"]] * dat[[numericalWeightingVar]], + combined_factors, con_current)) + set(dat, j = "wValue", value = dat[["value"]] / dat[["f"]]) # try to divide the weight between units with larger/smaller value in the # numerical variable linear @@ -223,10 +221,10 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, } else { # categorical variable to be calibrated - dat[, f := ipf_step_f(get(variableKeepingTheCalibWeight) * wvst, combined_factors, con_current)] + set(dat, j = "f", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * dat[["wvst"]], combined_factors, con_current)) } - dat[, wValue := value / f] + set(dat, j = "wValue", value = dat[["value"]]/dat[["f"]]) if (dat[!is.na(f), any(abs(1 / f - 1) > epsHcur)]) { if (verbose && calIter %% 10 == 0) { @@ -243,12 +241,15 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, } if (!is.null(bound)) { if (!looseH) { - dat[, c(variableKeepingTheCalibWeight) := boundsFak(g1 = get(variableKeepingTheCalibWeight), g0 = get(variableKeepingTheBaseWeight), f = f, - bound = bound)]#,by=eval(hColNames[[i]])] + set(dat, j = variableKeepingTheCalibWeight, value = + boundsFak(g1 = dat[[variableKeepingTheCalibWeight]], g0 = dat[[variableKeepingTheBaseWeight]], f = dat[["f"]], + bound = bound)) }else{ - dat[, c(variableKeepingTheCalibWeight) := boundsFakHH(g1 = get(variableKeepingTheCalibWeight), g0 = get(variableKeepingTheBaseWeight), - eps = epsHcur, orig = value, - p = wValue, bound = bound)] + set(dat, j = variableKeepingTheCalibWeight, value = + boundsFakHH(g1 = get(variableKeepingTheCalibWeight), g0 = dat[[variableKeepingTheBaseWeight]], + eps = dat[["epsHcur"]], orig = dat[["value"]], + p 
= dat[["wValue"]], bound = bound) + ) } } else { dat[, c(variableKeepingTheCalibWeight) := f * get(variableKeepingTheCalibWeight), by = eval( @@ -284,15 +285,17 @@ addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original, maxIter, calIter, returnNA, cw) { variableKeepingTheCalibWeight <- cw wvst <- OriginalSortingVariable <- - outTable <- copy(dat_original) + outTable <- copy(dat_original) # add calibrated weights. Use setkey to make sure the indexes match setkey(dat, OriginalSortingVariable) - if ((maxIter < calIter) & returnNA) + if ((maxIter < calIter) & returnNA){ outTable[, c(variableKeepingTheCalibWeight) := NA] - else + } else { outTable[, c(variableKeepingTheCalibWeight) := dat[[variableKeepingTheCalibWeight]]] + } + formP <- getFormulas(conP, w = variableKeepingTheCalibWeight) formH <- getFormulas(conH, w = variableKeepingTheCalibWeight) @@ -551,7 +554,7 @@ ipf <- function( ) } else if (!identical(levels(dat[[colname]]), - dimnames(conP[[i]])[[colname]])) { + dimnames(conP[[i]])[[colname]])) { if (conversion_messages) message("correct levels of column ", colname) set( @@ -561,10 +564,8 @@ ipf <- function( } } combined_factors <- combine_factors(dat, conP[[i]]) - - dat[, paste0("combined_factors_", i) := combined_factors] - tmp <- as.vector(conP[[i]][combined_factors]) - dat[, paste0("valueP", i) := tmp] + set(dat, j = paste0("combined_factors_", i), value = combined_factors) + set(dat, j = paste0("valueP", i), value = as.vector(conP[[i]][combined_factors])) } for (i in seq_along(conH)) { colnames <- hColNames[[i]] @@ -581,7 +582,7 @@ ipf <- function( ) } else if (!identical(levels(dat[[colname]]), - dimnames(conH[[i]])[[colname]])) { + dimnames(conH[[i]])[[colname]])) { if (conversion_messages) message("correct levels of column ", colname) set( @@ -593,17 +594,16 @@ ipf <- function( combined_factors <- combine_factors(dat, conH[[i]]) - dat[, paste0("combined_factors_h_", i) := combined_factors] - tmp <- as.vector(conH[[i]][combined_factors]) - dat[, paste0("valueH", i) := tmp] + set(dat, j = paste0("combined_factors_h_", i), value = combined_factors) + set(dat, j = paste0("valueH", i), value = as.vector(conH[[i]][combined_factors])) } if (is.null(variableKeepingTheBaseWeight)) { if (!is.null(bound) && is.null(w)) stop("Bounds are only reasonable if base weights are provided") - dat[, c(variableKeepingTheCalibWeight) := 1] + set(dat, j = variableKeepingTheCalibWeight, value = 1) } else { - dat[, c(variableKeepingTheCalibWeight) := dat[, variableKeepingTheBaseWeight, with = FALSE]] + set(dat, j = variableKeepingTheCalibWeight, value = dat[[variableKeepingTheBaseWeight]]) } if (check_hh_vars) { @@ -613,7 +613,7 @@ ipf <- function( for (h in hh) { setnames(dat, h, "temporary_hvar") if (dat[, length(unique(temporary_hvar)), - by = c(hid)][, any(V1 != 1)]) { + by = c(hid)][, any(V1 != 1)]) { stop(paste(h, "has different values inside a household")) } setnames(dat, "temporary_hvar", h) @@ -625,30 +625,28 @@ ipf <- function( for (i in seq_along(epsP)) { if (is.array(epsP[[i]])) { combined_factors <- dat[[paste0("combined_factors_", i)]] - tmp <- as.vector(epsP[[i]][combined_factors]) - dat[, paste0("epsP_", i) := tmp] + set(dat, j = paste0("epsP_", i), value = as.vector(epsP[[i]][combined_factors])) } else { - dat[, paste0("epsP_", i) := epsP[[i]]] + set(dat, j = paste0("epsP_", i), value = epsP[[i]]) } } } else { for (i in seq_along(conP)) { - dat[, paste0("epsP_", i) := epsP] + set(dat, j = paste0("epsP_", i), value = epsP) } } if (is.list(epsH)) { for (i in 
seq_along(epsH)) { if (is.array(epsH[[i]])) { combined_factors <- dat[[paste0("combined_factors_h_", i)]] - tmp <- as.vector(epsH[[i]][combined_factors]) - dat[, paste0("epsH_", i) := tmp] + set(dat, j = paste0("epsH_", i), value = as.vector(epsH[[i]][combined_factors])) } else { - dat[, paste0("epsH_", i) := epsH[[i]]] + set(dat, j = paste0("epsH_", i), value = epsH[[i]]) } } } else { for (i in seq_along(conH)) { - dat[, paste0("epsH_", i) := epsH] + set(dat, j = paste0("epsH_", i), value = epsH) } } ###Calib @@ -674,8 +672,7 @@ ipf <- function( } ## replace person weight with household average - dh <- dat[[hid]] - dat[, c(variableKeepingTheCalibWeight) := meanfun(get(variableKeepingTheCalibWeight), dh)] + set(dat, j = variableKeepingTheCalibWeight, value =meanfun(dat[[variableKeepingTheCalibWeight]], dat[[hid]])) ### Household calib for (i in seq_along(conH)) { @@ -708,8 +705,7 @@ ipf <- function( cw = variableKeepingTheCalibWeight) ## replace person weight with household average - dh <- dat[[hid]] - dat[, c(variableKeepingTheCalibWeight) := meanfun(get(variableKeepingTheCalibWeight), dh)] + set(dat, j = variableKeepingTheCalibWeight, value = meanfun(dat[[variableKeepingTheCalibWeight]], dat[[hid]])) ### Household calib for (i in seq_along(conH)) { From 6b157e35ce3a1938c0cffaf2f4b28b24f328eba3 Mon Sep 17 00:00:00 2001 From: alexkowa Date: Wed, 6 Feb 2019 13:37:31 +0100 Subject: [PATCH 09/11] fVariableForCalibrationIPF instead of f --- R/ipf.r | 74 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/R/ipf.r b/R/ipf.r index 8b2d726..565f345 100644 --- a/R/ipf.r +++ b/R/ipf.r @@ -127,9 +127,9 @@ check_population_totals <- function(con, dat, type = "personal") { calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, numericalWeighting, numericalWeightingVar, w, cw) { epsPcur <- maxFac <- OriginalSortingVariable <- V1 <- - epsvalue <- f <- NULL + epsvalue <- fVariableForCalibrationIPF <- NULL temporary_hvar <- value <- - wValue <- wvst <- NULL + wValue <- representativeHouseholdForCalibration <- NULL variableKeepingTheBaseWeight <- w variableKeepingTheCalibWeight <- cw combined_factors <- dat[[paste0("combined_factors_", i)]] @@ -144,29 +144,29 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, if (!is.null(numericalWeightingVar)) { ## numerical variable to be calibrated ## use name of conP list element to define numerical variable - set(dat, j = "f", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * dat[[numericalWeightingVar]], - combined_factors, con_current)) - set(dat, j = "wValue", value = dat[["value"]]/dat[["f"]]) + set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * + dat[[numericalWeightingVar]], combined_factors, con_current)) + set(dat, j = "wValue", value = dat[["value"]]/dat[["fVariableForCalibrationIPF"]]) # try to divide the weight between units with larger/smaller value in the # numerical variable linear - dat[, f := numericalWeighting(head(wValue, 1), head(value, 1), + dat[, fVariableForCalibrationIPF := numericalWeighting(head(wValue, 1), head(value, 1), get(numericalWeightingVar), get(variableKeepingTheCalibWeight)), by = eval(paste0("combined_factors_", i))] } else { # categorical variable to be calibrated - set(dat, j = "f", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]], combined_factors, con_current)) + set(dat, j = "fVariableForCalibrationIPF", value = 
ipf_step_f(dat[[variableKeepingTheCalibWeight]], combined_factors, con_current)) } - if (dat[!is.na(f), any(abs(1 / f - 1) > epsPcur)]) { + if (dat[!is.na(fVariableForCalibrationIPF), any(abs(1 / fVariableForCalibrationIPF - 1) > epsPcur)]) { ## sicherheitshalber abs(epsPcur)? Aber es wird schon niemand negative eps ## Werte uebergeben?? if (verbose && calIter %% 10 == 0) { message(calIter, ":Not yet converged for P-Constraint", i, "\n") if (calIter %% 100 == 0) { - tmp <- dat[!is.na(f) & (abs(1 / f - 1) > epsPcur), - list(maxFac = max(abs(1 / f - 1)), .N, head(epsPcur, 1), + tmp <- dat[!is.na(fVariableForCalibrationIPF) & (abs(1 / fVariableForCalibrationIPF - 1) > epsPcur), + list(maxFac = max(abs(1 / fVariableForCalibrationIPF - 1)), .N, head(epsPcur, 1), sumCalib = sum(get(variableKeepingTheCalibWeight)), head(value, 1)), by = eval(pColNames[[i]])] print(tmp[order(maxFac, decreasing = TRUE), ]) @@ -174,11 +174,12 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, } } if (!is.null(bound)) { - dat[!is.na(f), c(variableKeepingTheCalibWeight) := boundsFak(get(variableKeepingTheCalibWeight), get(variableKeepingTheBaseWeight), f, - bound = bound)] + dat[!is.na(fVariableForCalibrationIPF), c(variableKeepingTheCalibWeight) := boundsFak(get(variableKeepingTheCalibWeight), + get(variableKeepingTheBaseWeight), fVariableForCalibrationIPF, bound = bound)] #,by=eval(pColNames[[i]])] } else { - dat[!is.na(f), c(variableKeepingTheCalibWeight) := f * get(variableKeepingTheCalibWeight), + dat[!is.na(fVariableForCalibrationIPF), c(variableKeepingTheCalibWeight) := fVariableForCalibrationIPF * + get(variableKeepingTheCalibWeight), by = eval(paste0("combined_factors_", i))] } error <- TRUE @@ -193,9 +194,9 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, variableKeepingTheBaseWeight <- w variableKeepingTheCalibWeight <- cw epsHcur <- OriginalSortingVariable <- V1 <- - epsvalue <- f <- NULL + epsvalue <- fVariableForCalibrationIPF <- NULL maxFac <- temporary_hvar <- - value <- wValue <- wvst <- NULL + value <- wValue <- representativeHouseholdForCalibration <- NULL setnames(dat, valueH[i], "value") setnames(dat, paste0("epsH_", i), "epsHcur") @@ -209,30 +210,31 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, if (!is.null(numericalWeightingVar)) { ## numerical variable to be calibrated ## use name of conH list element to define numerical variable - set(dat, j = "f", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * dat[["wvst"]] * dat[[numericalWeightingVar]], - combined_factors, con_current)) - set(dat, j = "wValue", value = dat[["value"]] / dat[["f"]]) + set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * dat[["representativeHouseholdForCalibration"]] * + dat[[numericalWeightingVar]], combined_factors, con_current)) + set(dat, j = "wValue", value = dat[["value"]] / dat[["fVariableForCalibrationIPF"]]) # try to divide the weight between units with larger/smaller value in the # numerical variable linear - dat[, f := numericalWeighting(head(wValue, 1), head(value, 1), + dat[, fVariableForCalibrationIPF := numericalWeighting(head(wValue, 1), head(value, 1), get(numericalWeightingVar), get(variableKeepingTheCalibWeight)), by = eval(paste0("combined_factors_h_", i))] } else { # categorical variable to be calibrated - set(dat, j = "f", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * dat[["wvst"]], combined_factors, con_current)) + set(dat, j = 
"fVariableForCalibrationIPF", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * dat[["representativeHouseholdForCalibration"]], + combined_factors, con_current)) } - set(dat, j = "wValue", value = dat[["value"]]/dat[["f"]]) + set(dat, j = "wValue", value = dat[["value"]]/dat[["fVariableForCalibrationIPF"]]) - if (dat[!is.na(f), any(abs(1 / f - 1) > epsHcur)]) { + if (dat[!is.na(fVariableForCalibrationIPF), any(abs(1 / fVariableForCalibrationIPF - 1) > epsHcur)]) { if (verbose && calIter %% 10 == 0) { message(calIter, ":Not yet converged for H-Constraint", i, "\n") if (calIter %% 100 == 0) { - tmp <- dat[!is.na(f) & (abs(1 / f - 1) > epsHcur), - list(maxFac = max(abs(1 / f - 1)), .N, head(epsHcur, 1), - sumCalibWeight = sum(get(variableKeepingTheCalibWeight) * wvst), + tmp <- dat[!is.na(fVariableForCalibrationIPF) & (abs(1 / fVariableForCalibrationIPF - 1) > epsHcur), + list(maxFac = max(abs(1 / fVariableForCalibrationIPF - 1)), .N, head(epsHcur, 1), + sumCalibWeight = sum(get(variableKeepingTheCalibWeight) * representativeHouseholdForCalibration), head(value, 1)), by = eval(hColNames[[i]])] print(tmp[order(maxFac, decreasing = TRUE), ]) @@ -242,7 +244,8 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, if (!is.null(bound)) { if (!looseH) { set(dat, j = variableKeepingTheCalibWeight, value = - boundsFak(g1 = dat[[variableKeepingTheCalibWeight]], g0 = dat[[variableKeepingTheBaseWeight]], f = dat[["f"]], + boundsFak(g1 = dat[[variableKeepingTheCalibWeight]], g0 = dat[[variableKeepingTheBaseWeight]], + f = dat[["fVariableForCalibrationIPF"]], bound = bound)) }else{ set(dat, j = variableKeepingTheCalibWeight, value = @@ -252,8 +255,8 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, ) } } else { - dat[, c(variableKeepingTheCalibWeight) := f * get(variableKeepingTheCalibWeight), by = eval( - paste0("combined_factors_h_", i))] + dat[, c(variableKeepingTheCalibWeight) := fVariableForCalibrationIPF * get(variableKeepingTheCalibWeight), + by = eval(paste0("combined_factors_h_", i))] } error <- TRUE } @@ -284,7 +287,7 @@ getFormulas <- function(con, w) { addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original, maxIter, calIter, returnNA, cw) { variableKeepingTheCalibWeight <- cw - wvst <- OriginalSortingVariable <- + representativeHouseholdForCalibration <- OriginalSortingVariable <- outTable <- copy(dat_original) # add calibrated weights. 
Use setkey to make sure the indexes match @@ -311,7 +314,7 @@ addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original, # adjusted constraints (conP, conH according to the calibrated weights) setattr(outTable, "conP_adj", lapply(formP, xtabs, dat)) - setattr(outTable, "conH_adj", lapply(formH, xtabs, dat[wvst == 1])) + setattr(outTable, "conH_adj", lapply(formH, xtabs, dat[representativeHouseholdForCalibration == 1])) # tolerances setattr(outTable, "epsP", epsP) @@ -506,7 +509,7 @@ ipf <- function( OriginalSortingVariable <- V1 <- epsvalue <- f <- temporary_hvar <- - value <- wValue <- wvst <- NULL + value <- wValue <- representativeHouseholdForCalibration <- NULL dat_original <- dat dat <- copy(dat) ## originalsorting is fucked up without this @@ -519,7 +522,7 @@ ipf <- function( ###fixed target value, should not be changed in iterations valueH <- paste0("valueH", seq_along(conH)) ###Housekeeping of the varNames used - usedVarNames <- c(valueP, valueH, "value", "wvst", "wValue") + usedVarNames <- c(valueP, valueH, "value", "representativeHouseholdForCalibration", "wValue") if (any(names(dat) %in% usedVarNames)) { renameVars <- names(dat)[names(dat) %in% usedVarNames] @@ -531,9 +534,9 @@ ipf <- function( #delVars <- c("hid") hid <- "hid" dat[, hid := 1:nrow(dat)] - dat[, wvst := 1] + dat[, representativeHouseholdForCalibration := 1] } else { - dat[, wvst := as.numeric(!duplicated(get(hid)))] + dat[, representativeHouseholdForCalibration := as.numeric(!duplicated(get(hid)))] } ## Names of the calibration variables for Person and household dimension @@ -731,7 +734,8 @@ ipf <- function( } calIter <- calIter + 1 } - + # Remove Help Variables + dat[,fVariableForCalibrationIPF:=NULL] addWeightsAndAttributes(dat, conP, conH, epsP, epsH, dat_original, maxIter, calIter, returnNA, variableKeepingTheCalibWeight) } From d5b121dc7e555caae3880ef11ef10ef28fea813f Mon Sep 17 00:00:00 2001 From: Gregor de Cillia Date: Tue, 12 Feb 2019 10:43:39 +0100 Subject: [PATCH 10/11] resolve lints --- R/ipf.r | 145 ++++++++++++++++++++++++-------------- tests/testthat/test_ipf.R | 14 ++-- 2 files changed, 103 insertions(+), 56 deletions(-) diff --git a/R/ipf.r b/R/ipf.r index 565f345..02c22e2 100644 --- a/R/ipf.r +++ b/R/ipf.r @@ -144,41 +144,55 @@ calibP <- function(i, dat, error, valueP, pColNames, bound, verbose, calIter, if (!is.null(numericalWeightingVar)) { ## numerical variable to be calibrated ## use name of conP list element to define numerical variable - set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * - dat[[numericalWeightingVar]], combined_factors, con_current)) - set(dat, j = "wValue", value = dat[["value"]]/dat[["fVariableForCalibrationIPF"]]) + set(dat, j = "fVariableForCalibrationIPF", + value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * + dat[[numericalWeightingVar]], + combined_factors, con_current)) + set(dat, j = "wValue", value = dat[["value"]] / + dat[["fVariableForCalibrationIPF"]]) # try to divide the weight between units with larger/smaller value in the # numerical variable linear - dat[, fVariableForCalibrationIPF := numericalWeighting(head(wValue, 1), head(value, 1), - get(numericalWeightingVar), get(variableKeepingTheCalibWeight)), - by = eval(paste0("combined_factors_", i))] - + dat[, fVariableForCalibrationIPF := numericalWeighting( + head(wValue, 1), head(value, 1), get(numericalWeightingVar), + get(variableKeepingTheCalibWeight)), + by = eval(paste0("combined_factors_", i))] } else { # categorical variable 
to be calibrated - set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]], combined_factors, con_current)) + set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f( + dat[[variableKeepingTheCalibWeight]], combined_factors, con_current)) } - if (dat[!is.na(fVariableForCalibrationIPF), any(abs(1 / fVariableForCalibrationIPF - 1) > epsPcur)]) { + if (dat[!is.na(fVariableForCalibrationIPF), + any(abs(1 / fVariableForCalibrationIPF - 1) > epsPcur)]) { ## sicherheitshalber abs(epsPcur)? Aber es wird schon niemand negative eps ## Werte uebergeben?? if (verbose && calIter %% 10 == 0) { message(calIter, ":Not yet converged for P-Constraint", i, "\n") if (calIter %% 100 == 0) { - tmp <- dat[!is.na(fVariableForCalibrationIPF) & (abs(1 / fVariableForCalibrationIPF - 1) > epsPcur), - list(maxFac = max(abs(1 / fVariableForCalibrationIPF - 1)), .N, head(epsPcur, 1), - sumCalib = sum(get(variableKeepingTheCalibWeight)), head(value, 1)), - by = eval(pColNames[[i]])] + tmp <- dat[ + !is.na(fVariableForCalibrationIPF) & + (abs(1 / fVariableForCalibrationIPF - 1) > epsPcur), + list( + maxFac = max(abs(1 / fVariableForCalibrationIPF - 1)), .N, + head(epsPcur, 1), + sumCalib = sum(get(variableKeepingTheCalibWeight)), head(value, 1)), + by = eval(pColNames[[i]])] print(tmp[order(maxFac, decreasing = TRUE), ]) message("-----------------------------------------\n") } } if (!is.null(bound)) { - dat[!is.na(fVariableForCalibrationIPF), c(variableKeepingTheCalibWeight) := boundsFak(get(variableKeepingTheCalibWeight), - get(variableKeepingTheBaseWeight), fVariableForCalibrationIPF, bound = bound)] + dat[!is.na(fVariableForCalibrationIPF), + c(variableKeepingTheCalibWeight) := + boundsFak( + get(variableKeepingTheCalibWeight), + get(variableKeepingTheBaseWeight), fVariableForCalibrationIPF, + bound = bound)] #,by=eval(pColNames[[i]])] } else { - dat[!is.na(fVariableForCalibrationIPF), c(variableKeepingTheCalibWeight) := fVariableForCalibrationIPF * + dat[!is.na(fVariableForCalibrationIPF), + c(variableKeepingTheCalibWeight) := fVariableForCalibrationIPF * get(variableKeepingTheCalibWeight), by = eval(paste0("combined_factors_", i))] } @@ -210,32 +224,45 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, if (!is.null(numericalWeightingVar)) { ## numerical variable to be calibrated ## use name of conH list element to define numerical variable - set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * dat[["representativeHouseholdForCalibration"]] * + set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f( + dat[[variableKeepingTheCalibWeight]] * + dat[["representativeHouseholdForCalibration"]] * dat[[numericalWeightingVar]], combined_factors, con_current)) - set(dat, j = "wValue", value = dat[["value"]] / dat[["fVariableForCalibrationIPF"]]) + set(dat, j = "wValue", value = dat[["value"]] / + dat[["fVariableForCalibrationIPF"]]) # try to divide the weight between units with larger/smaller value in the # numerical variable linear - dat[, fVariableForCalibrationIPF := numericalWeighting(head(wValue, 1), head(value, 1), - get(numericalWeightingVar), get(variableKeepingTheCalibWeight)), - by = eval(paste0("combined_factors_h_", i))] + dat[, fVariableForCalibrationIPF := numericalWeighting( + head(wValue, 1), head(value, 1), get(numericalWeightingVar), + get(variableKeepingTheCalibWeight)), + by = eval(paste0("combined_factors_h_", i))] } else { # categorical variable to be calibrated - set(dat, j 
= "fVariableForCalibrationIPF", value = ipf_step_f(dat[[variableKeepingTheCalibWeight]] * dat[["representativeHouseholdForCalibration"]], - combined_factors, con_current)) + set(dat, j = "fVariableForCalibrationIPF", value = ipf_step_f( + dat[[variableKeepingTheCalibWeight]] * + dat[["representativeHouseholdForCalibration"]], + combined_factors, con_current)) } - set(dat, j = "wValue", value = dat[["value"]]/dat[["fVariableForCalibrationIPF"]]) + set(dat, j = "wValue", value = dat[["value"]] / + dat[["fVariableForCalibrationIPF"]]) - if (dat[!is.na(fVariableForCalibrationIPF), any(abs(1 / fVariableForCalibrationIPF - 1) > epsHcur)]) { + if (dat[!is.na(fVariableForCalibrationIPF), + any(abs(1 / fVariableForCalibrationIPF - 1) > epsHcur)]) { if (verbose && calIter %% 10 == 0) { message(calIter, ":Not yet converged for H-Constraint", i, "\n") if (calIter %% 100 == 0) { - tmp <- dat[!is.na(fVariableForCalibrationIPF) & (abs(1 / fVariableForCalibrationIPF - 1) > epsHcur), - list(maxFac = max(abs(1 / fVariableForCalibrationIPF - 1)), .N, head(epsHcur, 1), - sumCalibWeight = sum(get(variableKeepingTheCalibWeight) * representativeHouseholdForCalibration), - head(value, 1)), by = eval(hColNames[[i]])] + tmp <- dat[ + !is.na(fVariableForCalibrationIPF) & + (abs(1 / fVariableForCalibrationIPF - 1) > epsHcur), + list(maxFac = max(abs(1 / fVariableForCalibrationIPF - 1)), .N, + head(epsHcur, 1), + sumCalibWeight = sum(get(variableKeepingTheCalibWeight) * + representativeHouseholdForCalibration), + head(value, 1)), + by = eval(hColNames[[i]])] print(tmp[order(maxFac, decreasing = TRUE), ]) message("-----------------------------------------\n") @@ -243,19 +270,22 @@ calibH <- function(i, dat, error, valueH, hColNames, bound, verbose, calIter, } if (!is.null(bound)) { if (!looseH) { - set(dat, j = variableKeepingTheCalibWeight, value = - boundsFak(g1 = dat[[variableKeepingTheCalibWeight]], g0 = dat[[variableKeepingTheBaseWeight]], - f = dat[["fVariableForCalibrationIPF"]], - bound = bound)) + set(dat, j = variableKeepingTheCalibWeight, value = boundsFak( + g1 = dat[[variableKeepingTheCalibWeight]], + g0 = dat[[variableKeepingTheBaseWeight]], + f = dat[["fVariableForCalibrationIPF"]], + bound = bound)) }else{ - set(dat, j = variableKeepingTheCalibWeight, value = - boundsFakHH(g1 = get(variableKeepingTheCalibWeight), g0 = dat[[variableKeepingTheBaseWeight]], - eps = dat[["epsHcur"]], orig = dat[["value"]], - p = dat[["wValue"]], bound = bound) + set(dat, j = variableKeepingTheCalibWeight, value = boundsFakHH( + g1 = get(variableKeepingTheCalibWeight), + g0 = dat[[variableKeepingTheBaseWeight]], + eps = dat[["epsHcur"]], orig = dat[["value"]], + p = dat[["wValue"]], bound = bound) ) } } else { - dat[, c(variableKeepingTheCalibWeight) := fVariableForCalibrationIPF * get(variableKeepingTheCalibWeight), + dat[, c(variableKeepingTheCalibWeight) := fVariableForCalibrationIPF * + get(variableKeepingTheCalibWeight), by = eval(paste0("combined_factors_h_", i))] } error <- TRUE @@ -296,7 +326,8 @@ addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original, if ((maxIter < calIter) & returnNA){ outTable[, c(variableKeepingTheCalibWeight) := NA] } else { - outTable[, c(variableKeepingTheCalibWeight) := dat[[variableKeepingTheCalibWeight]]] + outTable[, c(variableKeepingTheCalibWeight) := + dat[[variableKeepingTheCalibWeight]]] } @@ -314,7 +345,8 @@ addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original, # adjusted constraints (conP, conH according to the calibrated weights) 
setattr(outTable, "conP_adj", lapply(formP, xtabs, dat)) - setattr(outTable, "conH_adj", lapply(formH, xtabs, dat[representativeHouseholdForCalibration == 1])) + setattr(outTable, "conH_adj", lapply( + formH, xtabs, dat[representativeHouseholdForCalibration == 1])) # tolerances setattr(outTable, "epsP", epsP) @@ -505,7 +537,8 @@ ipf <- function( variableKeepingTheBaseWeight <- w variableKeepingTheCalibWeight <- nameCalibWeight if ("variableKeepingTheBaseWeight" %in% names(dat)) - stop("The provided dataset must not have a column called 'variableKeepingTheBaseWeight'") + stop("The provided dataset must not have a column called", + " 'variableKeepingTheBaseWeight'") OriginalSortingVariable <- V1 <- epsvalue <- f <- temporary_hvar <- @@ -522,7 +555,8 @@ ipf <- function( ###fixed target value, should not be changed in iterations valueH <- paste0("valueH", seq_along(conH)) ###Housekeeping of the varNames used - usedVarNames <- c(valueP, valueH, "value", "representativeHouseholdForCalibration", "wValue") + usedVarNames <- c(valueP, valueH, "value", + "representativeHouseholdForCalibration", "wValue") if (any(names(dat) %in% usedVarNames)) { renameVars <- names(dat)[names(dat) %in% usedVarNames] @@ -536,7 +570,8 @@ ipf <- function( dat[, hid := 1:nrow(dat)] dat[, representativeHouseholdForCalibration := 1] } else { - dat[, representativeHouseholdForCalibration := as.numeric(!duplicated(get(hid)))] + dat[, representativeHouseholdForCalibration := + as.numeric(!duplicated(get(hid)))] } ## Names of the calibration variables for Person and household dimension @@ -568,7 +603,8 @@ ipf <- function( } combined_factors <- combine_factors(dat, conP[[i]]) set(dat, j = paste0("combined_factors_", i), value = combined_factors) - set(dat, j = paste0("valueP", i), value = as.vector(conP[[i]][combined_factors])) + set(dat, j = paste0("valueP", i), + value = as.vector(conP[[i]][combined_factors])) } for (i in seq_along(conH)) { colnames <- hColNames[[i]] @@ -598,7 +634,8 @@ ipf <- function( combined_factors <- combine_factors(dat, conH[[i]]) set(dat, j = paste0("combined_factors_h_", i), value = combined_factors) - set(dat, j = paste0("valueH", i), value = as.vector(conH[[i]][combined_factors])) + set(dat, j = paste0("valueH", i), + value = as.vector(conH[[i]][combined_factors])) } if (is.null(variableKeepingTheBaseWeight)) { @@ -606,7 +643,8 @@ ipf <- function( stop("Bounds are only reasonable if base weights are provided") set(dat, j = variableKeepingTheCalibWeight, value = 1) } else { - set(dat, j = variableKeepingTheCalibWeight, value = dat[[variableKeepingTheBaseWeight]]) + set(dat, j = variableKeepingTheCalibWeight, + value = dat[[variableKeepingTheBaseWeight]]) } if (check_hh_vars) { @@ -628,7 +666,8 @@ ipf <- function( for (i in seq_along(epsP)) { if (is.array(epsP[[i]])) { combined_factors <- dat[[paste0("combined_factors_", i)]] - set(dat, j = paste0("epsP_", i), value = as.vector(epsP[[i]][combined_factors])) + set(dat, j = paste0("epsP_", i), + value = as.vector(epsP[[i]][combined_factors])) } else { set(dat, j = paste0("epsP_", i), value = epsP[[i]]) } @@ -642,7 +681,8 @@ ipf <- function( for (i in seq_along(epsH)) { if (is.array(epsH[[i]])) { combined_factors <- dat[[paste0("combined_factors_h_", i)]] - set(dat, j = paste0("epsH_", i), value = as.vector(epsH[[i]][combined_factors])) + set(dat, j = paste0("epsH_", i), + value = as.vector(epsH[[i]][combined_factors])) } else { set(dat, j = paste0("epsH_", i), value = epsH[[i]]) } @@ -675,7 +715,8 @@ ipf <- function( } ## replace person weight with 
household average - set(dat, j = variableKeepingTheCalibWeight, value =meanfun(dat[[variableKeepingTheCalibWeight]], dat[[hid]])) + set(dat, j = variableKeepingTheCalibWeight, + value = meanfun(dat[[variableKeepingTheCalibWeight]], dat[[hid]])) ### Household calib for (i in seq_along(conH)) { @@ -708,7 +749,8 @@ ipf <- function( cw = variableKeepingTheCalibWeight) ## replace person weight with household average - set(dat, j = variableKeepingTheCalibWeight, value = meanfun(dat[[variableKeepingTheCalibWeight]], dat[[hid]])) + set(dat, j = variableKeepingTheCalibWeight, + value = meanfun(dat[[variableKeepingTheCalibWeight]], dat[[hid]])) ### Household calib for (i in seq_along(conH)) { @@ -735,7 +777,8 @@ ipf <- function( calIter <- calIter + 1 } # Remove Help Variables - dat[,fVariableForCalibrationIPF:=NULL] + fVariableForCalibrationIPF <- NULL + dat[, fVariableForCalibrationIPF := NULL] addWeightsAndAttributes(dat, conP, conH, epsP, epsH, dat_original, maxIter, calIter, returnNA, variableKeepingTheCalibWeight) } diff --git a/tests/testthat/test_ipf.R b/tests/testthat/test_ipf.R index c90eadc..e178315 100644 --- a/tests/testthat/test_ipf.R +++ b/tests/testthat/test_ipf.R @@ -18,7 +18,7 @@ eusilcS[, agegroup := cut(age, c(-Inf, 10 * 1:9, Inf), right = FALSE)] # some recoding of netIncome for reasons of simplicity eusilcS[is.na(netIncome), netIncome := 0] eusilcS[netIncome < 0, netIncome := 0] -eusilcS[, HHnetIncome := sum(netIncome),by=household] +eusilcS[, HHnetIncome := sum(netIncome), by = household] # set hsize to 1,...,5+ eusilcS[, hsize := cut(hsize, c(0:4, Inf), labels = c(1:4, "5+"))] # treat households as a factor variable @@ -60,7 +60,8 @@ test_that("ipf with a numerical variable works as expected - computeLinear", { conH = list(conH1), epsP = list(1e-06, 1e-06, 1e-03), epsH = 0.01, - bound = NULL, verbose = FALSE, maxIter = 200,numericalWeighting = computeLinear) + bound = NULL, verbose = FALSE, maxIter = 200, + numericalWeighting = computeLinear) expect_true(abs(calibweights1[, sum(calibWeight * netIncome)] - sum(conP3)) / sum(conP3) < .01) expect_true(all( @@ -86,7 +87,8 @@ test_that("ipf with a numerical variable works as expected - computeLinearG1", { conH = list(conH1), epsP = list(1e-06, 1e-06, 1e-03), epsH = 0.01, - bound = NULL, verbose = FALSE, maxIter = 200,numericalWeighting = computeLinearG1) + bound = NULL, verbose = FALSE, maxIter = 200, + numericalWeighting = computeLinearG1) expect_true(abs(calibweights1[, sum(calibWeight * netIncome)] - sum(conP3)) / sum(conP3) < .01) expect_true(all( @@ -113,7 +115,8 @@ test_that("ipf with a numerical variable in households as expected", { conH = list(conH1, HHnetIncome = conH2), epsP = list(1e-06, 1e-06), epsH = list(0.01, 0.01), - bound = NULL, verbose = FALSE, maxIter = 50, numericalWeighting = computeFrac) + bound = NULL, verbose = FALSE, maxIter = 50, + numericalWeighting = computeFrac) expect_true(abs(calibweights1[, sum(calibWeight * netIncome)] - sum(conP3)) / sum(conP3) < .01) expect_true(all( @@ -156,7 +159,8 @@ test_that("ipf works as expected calibWeight renamed", { err <- max(c( max(abs(xtabs(calibWeightNew ~ agegroup, data = calibweights2) - conP1) / conP1), - max(abs(xtabs(calibWeightNew ~ gender + state, data = calibweights2) - conP2) / + max(abs( + xtabs(calibWeightNew ~ gender + state, data = calibweights2) - conP2) / conP2), max(abs(xtabs(calibWeightNew ~ hsize + state, data = calibweights2, subset = !duplicated(household)) - conH1) / conH1))) From 7cbf07c140fcb83cc8d97cc8f05dc8571e8137d7 Mon Sep 17 00:00:00 
2001 From: Gregor de Cillia Date: Mon, 1 Apr 2019 18:28:58 +0200 Subject: [PATCH 11/11] resolve most linters --- R/calc.stError.R | 292 ++++++++++++++++++++++++------ R/computeFrac.R | 16 +- R/demo.eusilc.R | 23 +-- R/draw.bootstrap.R | 193 ++++++++++++++++-------- R/generateHHID.R | 43 ++++-- R/ipf.r | 136 +++++++++-------- R/plot.R | 36 +++-- R/print.R | 3 +- R/recalib.R | 67 ++++++--- R/rescaled.bootstrap.R | 72 ++++++--- man/calc.stError.Rd | 303 ++++++++++++++++++++++++------------ man/computeFrac.Rd | 16 +- man/demo.eusilc.Rd | 23 +-- man/draw.bootstrap.Rd | 226 ++++++++++++++++++---------- man/generate.HHID.Rd | 43 ++++-- man/ipf.Rd | 96 ++++++------ man/plot.surveysd.Rd | 42 ++++-- man/print.surveysd.Rd | 3 +- man/recalib.Rd | 65 +++++--- man/rescaled.bootstrap.Rd | 71 ++++++--- tests/testthat/test_ipf.R | 3 +- 21 files changed, 1165 insertions(+), 607 deletions(-) diff --git a/R/calc.stError.R b/R/calc.stError.R index ee30c1f..6d09923 100644 --- a/R/calc.stError.R +++ b/R/calc.stError.R @@ -1,35 +1,71 @@ -#' @title Calcualte point estimates and their standard errors using bootstrap weights. +#' @title Calculate point estimates and their standard errors using bootstrap +#' weights. #' #' @description -#' Calculate point estimates as well as standard errors of variables in surveys. Standard errors are estimated using bootstrap weights (see [draw.bootstrap] and [recalib]). -#' In addition the standard error of an estimate can be calcualted using the survey data for 3 or more consecutive periods, which results in a reduction of the standard error. -#' -#' -#' @param dat either data.frame or data.table containing the survey data. Surveys can be a panel survey or rotating panel survey, but does not need to be. For rotating panel survey bootstrap weights can be created using [draw.bootstrap] and [recalib]. -#' @param weights character specifying the name of the column in `dat` containing the original sample weights. Used to calculate point estimates. -#' @param b.weights character vector specifying the names of the columns in `dat` containing bootstrap weights. Used to calculate standard errors. -#' @param period character specifying the name of the column in `dat` containing the sample periods. -#' @param var character vector containing variable names in `dat` on which `fun` shall be applied for each sample period. +#' Calculate point estimates as well as standard errors of variables in surveys. +#' Standard errors are estimated using bootstrap weights (see [draw.bootstrap] +#' and [recalib]). In addition, the standard error of an estimate can be +#' calculated using the survey data for 3 or more consecutive periods, which +#' results in a reduction of the standard error. +#' +#' @param dat either data.frame or data.table containing the survey data. +#' Surveys can be a panel survey or rotating panel survey, but does not need +#' to be. For rotating panel surveys bootstrap weights can be created using +#' [draw.bootstrap] and [recalib]. +#' @param weights character specifying the name of the column in `dat` +#' containing the original sample weights. Used to calculate point estimates. +#' @param b.weights character vector specifying the names of the columns in +#' `dat` containing bootstrap weights. Used to calculate standard errors. +#' @param period character specifying the name of the column in `dat` +#' containing the sample periods. +#' @param var character vector containing variable names in `dat` on which `fun` +#' shall be applied for each sample period. 
#' @param fun function which will be applied on `var` for each sample period. -#' Predefined functions are [weightedRatio], [weightedSum], but can also take any other function which returns a double or integer and uses weights as its second argument. -#' @param national boolean, if TRUE point estimates resulting from fun will be divided by the point estimate at the national level. -#' @param group character vectors or list of character vectors containig variables in `dat`. For each list entry `dat` will be split in subgroups according to the containing variables as well as `period`. -#' The pointestimates are then estimated for each subgroup seperately. If `group=NULL` the data will split into sample periods by default. -#' @param fun.adjust.var can be either `NULL` or a function. This argument can be used to apply a function for each `period` and bootstrap weight to the data. The resulting estimates will be passed down to `fun`. See details for more explanations. -#' @param adjust.var can be either `NULL` or a character specifying the first argument in `fun.adjust.var`. -#' @param period.diff character vectors, defining periods for which the differences in the point estimate as well it's standard error is calculated. Each entry must have the form of `"period1 - period2"`. Can be NULL -#' @param period.mean odd integer, defining the range of periods over which the sample mean of point estimates is additionally calcualted. -#' @param bias boolean, if `TRUE` the sample mean over the point estimates of the bootstrap weights is returned. -#' @param size.limit integer defining a lower bound on the number of observations on `dat` in each group defined by `period` and the entries in `group`. -#' Warnings are returned if the number of observations in a subgroup falls below `size.limit`. In addition the concerned groups are available in the function output. -#' @param cv.limit non-negativ value defining a upper bound for the standard error in relation to the point estimate. If this relation exceed `cv.limit`, for a point estimate, they are flagged and available in the function output. -#' @param p numeric vector containing values between 0 and 1. Defines which quantiles for the distribution of `var` are additionally estimated. -#' @param add.arg additional arguments which will be passed to fun. Can be either a named list or vector. The names of the object correspond to the function arguments and the values to column names in dat, see also examples. -#' -#' @details `calc.stError` takes survey data (`dat`) and returns point estimates as well as their standard Errors -#' defined by `fun` and `var` for each sample period in `dat`. -#' `dat` must be household data where household members correspond to multiple rows with the same household identifier. -#' The data should at least containt the following columns: +#' Predefined functions are [weightedRatio], [weightedSum], but can also take +#' any other function which returns a double or integer and uses weights as +#' its second argument. +#' @param national boolean, if TRUE point estimates resulting from fun will be +#' divided by the point estimate at the national level. +#' @param group character vectors or list of character vectors containing +#' variables in `dat`. For each list entry `dat` will be split in subgroups +#' according to the containing variables as well as `period`. The +#' point estimates are then estimated for each subgroup separately. If +#' `group=NULL` the data will be split into sample periods by default. 
+#' @param fun.adjust.var can be either `NULL` or a function. This argument can +#' be used to apply a function for each `period` and bootstrap weight to the +#' data. The resulting estimates will be passed down to `fun`. See details for +#' more explanations. +#' @param adjust.var can be either `NULL` or a character specifying the first +#' argument in `fun.adjust.var`. +#' @param period.diff character vectors, defining periods for which the +#' differences in the point estimate as well as its standard error is +#' calculated. Each entry must have the form of `"period1 - period2"`. Can be +#' NULL. +#' @param period.mean odd integer, defining the range of periods over which the +#' sample mean of point estimates is additionally calculated. +#' @param bias boolean, if `TRUE` the sample mean over the point estimates of +#' the bootstrap weights is returned. +#' @param size.limit integer defining a lower bound on the number of +#' observations on `dat` in each group defined by `period` and the entries in +#' `group`. Warnings are returned if the number of observations in a subgroup +#' falls below `size.limit`. In addition the concerned groups are available in +#' the function output. +#' @param cv.limit non-negative value defining an upper bound for the standard +#' error in relation to the point estimate. If this relation exceeds +#' `cv.limit` for a point estimate, it is flagged and available in the +#' function output. +#' @param p numeric vector containing values between 0 and 1. Defines which +#' quantiles for the distribution of `var` are additionally estimated. +#' @param add.arg additional arguments which will be passed to fun. Can be +#' either a named list or vector. The names of the object correspond to the +#' function arguments and the values to column names in dat, see also +#' examples. +#' +#' @details `calc.stError` takes survey data (`dat`) and returns point estimates +#' as well as their standard errors defined by `fun` and `var` for each sample +#' period in `dat`. `dat` must be household data where household members +#' correspond to multiple rows with the same household identifier. The data +#' should at least contain the following columns: #' #' * Column indicating the sample period; #' * Column indicating the household ID; @@ -37,65 +73,122 @@ #' * Columns which contain the bootstrap weights (see output of [recalib]); #' * Columns listed in `var` as well as in `group` #' -#' For each variable in `var` as well as sample period the function `fun` is applied using the original as well as the bootstrap sample weights.\cr -#' The point estimate is then selected as the result of `fun` when using the original sample weights and it's standard error is estimated with the result of `fun` using the bootstrap sample weights. \cr +#' For each variable in `var` as well as sample period the function `fun` is +#' applied using the original as well as the bootstrap sample weights.\cr +#' The point estimate is then selected as the result of `fun` when using the +#' original sample weights and its standard error is estimated with the result +#' of `fun` using the bootstrap sample weights. \cr #' \cr -#' `fun` can be any function which returns a double or integer and uses sample weights as it's second argument. The predifined options are `weightedRatio` and `weightedSum`.\cr +#' `fun` can be any function which returns a double or integer and uses sample +#' weights as its second argument. 
The predefined options are `weightedRatio` +#' and `weightedSum`.\cr #' \cr -#' For the option `weightedRatio` a weighted ratio (in \%) of `var` is calculated for `var` equal to 1, e.g `sum(weight[var==1])/sum(weight[!is.na(var)])*100`.\cr -#' Additionally using the option `national=TRUE` the weighted ratio (in \%) is divided by the weighted ratio at the national level for each `period`. +#' For the option `weightedRatio` a weighted ratio (in \%) of `var` is +#' calculated for `var` equal to 1, e.g. +#' `sum(weight[var==1])/sum(weight[!is.na(var)])*100`.\cr +#' Additionally using the option `national=TRUE` the weighted ratio (in \%) is +#' divided by the weighted ratio at the national level for each `period`. #' \cr -#' If `group` is not `NULL` but a vector of variables from `dat` then `fun` is applied on each subset of `dat` defined by all combinations of values in `group`.\cr -#' For instance if `group = "sex"` with "sex" having the values "Male" and "Female" in `dat` the point estimate and standard error is calculated on the subsets of `dat` with only "Male" or "Female" value for "sex". This is done for each value of `period`. -#' For variables in `group` which have `NA`s in `dat` the rows containing the missings will be discarded. \cr -#' When `group` is a list of character vectors, subsets of `dat` and the following estimation of the point estimate, including the estimate for the standard error, are calculated for each list entry.\cr +#' If `group` is not `NULL` but a vector of variables from `dat` then `fun` is +#' applied on each subset of `dat` defined by all combinations of values in +#' `group`.\cr +#' For instance if `group = "sex"` with "sex" having the values "Male" and +#' "Female" in `dat` the point estimate and standard error is calculated on the +#' subsets of `dat` with only "Male" or "Female" value for "sex". This is done +#' for each value of `period`. For variables in `group` which have `NA`s in +#' `dat` the rows containing the missings will be discarded. \cr +#' When `group` is a list of character vectors, subsets of `dat` and the +#' following estimation of the point estimate, including the estimate for the +#' standard error, are calculated for each list entry.\cr #' \cr -#' The optional parameters `fun.adjust.var` and `adjust.var` can be used if the values in `var` are dependent on the `weights`. As is for instance the case for the poverty thershhold calculated from EU-SILC. -#' In such a case an additional function can be supplied using `fun.adjust.var` as well as its first argument `adjust.var`, which needs to be part of the data set `dat`. Then, before applying `fun` on variable `var` -#' for all `period` and groups, the function `fun.adjust.var` is applied to `adjust.var` using each of the bootstrap weights seperately (NOTE: weight is used as the second argument of `fun.adjust.var`). +#' The optional parameters `fun.adjust.var` and `adjust.var` can be used if the +#' values in `var` are dependent on the `weights`. As is for instance the case +#' for the poverty threshold calculated from EU-SILC. +#' In such a case an additional function can be supplied using `fun.adjust.var` +#' as well as its first argument `adjust.var`, which needs to be part of the +#' data set `dat`. Then, before applying `fun` on variable `var` +#' for all `period` and groups, the function `fun.adjust.var` is applied to +#' `adjust.var` using each of the bootstrap weights separately (NOTE: weight is +#' used as the second argument of `fun.adjust.var`). 
#' Thus creating i=1,...,`length(b.weights)` additional variables. -#' For applying `fun` on `var` the estimates for the bootstrap replicate will now use each of the corresponding new additional variables. So instead of +#' For applying `fun` on `var` the estimates for the bootstrap replicate will +#' now use each of the corresponding new additional variables. So instead of #' \deqn{fun(var,weights,...),fun(var,b.weights[1],...),fun(var,b.weights[2],...),...} #' the function `fun` will be applied in the way #' \deqn{fun(var,weights,...),fun(var.1,b.weights[1],...),fun(var.2,b.weights[2],...),...} #' -#' where `var.1`, `var.2`, `...` correspond to the estimates resulting from `fun.adjust.var` and `adjust.var`. -#' NOTE: This procedure is especially usefull if the `var` is dependent on `weights` and `fun` is applied on subgroups of the data set. Then it is not possible to capture this -#' procedure with `fun` and `var`, see examples for a more hands on explanation. +#' where `var.1`, `var.2`, `...` correspond to the estimates resulting from +#' `fun.adjust.var` and `adjust.var`. +#' NOTE: This procedure is especially useful if `var` is dependent on +#' `weights` and `fun` is applied on subgroups of the data set. Then it is not +#' possible to capture this procedure with `fun` and `var`, see examples for a +#' more hands-on explanation. #' \cr -#' When defining `period.diff` the difference of point estimates between periods as well their standard errors are calculated.\cr -#' The entries in `period.diff` must have the form of `"period1 - period2"` which means that the results of the point estimates for `period2` will be substracted from the results of the point estimates for `period1`.\cr +#' When defining `period.diff` the difference of point estimates between periods +#' as well as their standard errors are calculated.\cr +#' The entries in `period.diff` must have the form of `"period1 - period2"` +#' which means that the results of the point estimates for `period2` will be +#' subtracted from the results of the point estimates for `period1`.\cr #' \cr -#' Specifying `period.mean` leads to an improvement in standard error by averaging the results for the point estimates, using the bootstrap weights, over `period.mean` periods. -#' Setting, for instance, `period.mean = 3` the results in averaging these results over each consecutive set of 3 periods.\cr -#' Estimating the standard error over these averages gives an improved estimate of the standard error for the central period, which was used for averaging.\cr -#' The averaging of the results is also applied in differences of point estimates. For instance defining `period.diff = "2015-2009"` and `period.mean = 3` -#' the differences in point estimates of 2015 and 2009, 2016 and 2010 as well as 2017 and 2011 are calcualated and finally the average over these 3 differences is calculated. -#' The periods set in `period.diff` are always used as starting periods from which `period.mean-1` consecutive periods are used to build the average. +#' Specifying `period.mean` leads to an improvement in standard error by +#' averaging the results for the point estimates, using the bootstrap weights, +#' over `period.mean` periods. 
+#' Setting, for instance, `period.mean = 3` results in averaging these +#' results over each consecutive set of 3 periods.\cr +#' Estimating the standard error over these averages gives an improved estimate +#' of the standard error for the central period, which was used for +#' averaging.\cr +#' The averaging of the results is also applied in differences of point +#' estimates. For instance defining `period.diff = "2015-2009"` and +#' `period.mean = 3` +#' the differences in point estimates of 2015 and 2009, 2016 and 2010 as well as +#' 2017 and 2011 are calculated and finally the average over these 3 +#' differences is calculated. +#' The periods set in `period.diff` are always used as starting periods from +#' which `period.mean-1` consecutive periods are used to build the average. #' \cr -#' Setting `bias` to `TRUE` returns the calculation of a mean over the results from the bootstrap replicates. In the output the corresponding columns is labeled *_mean* at the end.\cr +#' Setting `bias` to `TRUE` returns the calculation of a mean over the results +#' from the bootstrap replicates. In the output the corresponding columns are +#' labeled *_mean* at the end.\cr #' \cr -#' If `fun` needs more arguments they can be supplied in `add.arg`. This can either be a named list or vector.\cr +#' If `fun` needs more arguments they can be supplied in `add.arg`. This can +#' either be a named list or vector.\cr #' \cr -#' The parameter `size.limit` indicates a lower bound of the sample size for subsets in `dat` created by `group`. If the sample size of a subset falls below `size.limit` a warning will be displayed.\cr -#' In addition all subsets for which this is the case can be selected from the output of `calc.stError` with `$smallGroups`.\cr -#' With the parameter `cv.limit` one can set an upper bound on the coefficient of variantion. Estimates which exceed this bound are flagged with `TRUE` and are available in the function output with `$cvHigh`. -#' `cv.limit` must be a positive integer and is treated internally as \%, e.g. for `cv.limit=1` the estimate will be flagged if the coefficient of variantion exceeds 1\%.\cr +#' The parameter `size.limit` indicates a lower bound of the sample size for +#' subsets in `dat` created by `group`. If the sample size of a subset falls +#' below `size.limit` a warning will be displayed.\cr +#' In addition all subsets for which this is the case can be selected from the +#' output of `calc.stError` with `$smallGroups`.\cr +#' With the parameter `cv.limit` one can set an upper bound on the coefficient +#' of variation. Estimates which exceed this bound are flagged with `TRUE` and +#' are available in the function output with `$cvHigh`. +#' `cv.limit` must be a positive integer and is treated internally as \%, e.g. +#' for `cv.limit=1` the estimate will be flagged if the coefficient of +#' variation exceeds 1\%.\cr #' \cr -#' When specifying `period.mean`, the decrease in standard error for choosing this method is internally calcualted and a rough estimate for an implied increase in sample size is available in the output with `$stEDecrease`. -#' The rough estimate for the increase in sample size uses the fact that for a sample of size \eqn{n} the sample estimate for the standard error of most point estimates converges with a factor \eqn{1/\sqrt{n}} against the true standard error \eqn{\sigma}. 
+#' When specifying `period.mean`, the decrease in standard error for choosing +#' this method is internally calculated and a rough estimate for an implied +#' increase in sample size is available in the output with `$stEDecrease`. +#' The rough estimate for the increase in sample size uses the fact that for a +#' sample of size \eqn{n} the sample estimate for the standard error of most +#' point estimates converges with a factor \eqn{1/\sqrt{n}} against the true +#' standard error \eqn{\sigma}. #' #' #' @return Returns a list containing: #' -#' * `Estimates`: data.table containing period differences and/or k period averages for estimates of -#' `fun` applied to `var` as well as the corresponding standard errors, which are calculated using the bootstrap weights. -#' In addition the sample size, `n`, and poplutaion size for each group is added to the output. -#' * `smallGroups`: data.table containing groups for which the number of observation falls below `size.limit`. -#' * `cvHigh`: data.table containing a boolean variable which indicates for each estimate if the -#' estimated standard error exceeds `cv.limit`. -#' * `stEDecrease`: data.table indicating for each estimate the theoretical increase in sample size which is -#' gained when averaging over k periods. Only returned if `period.mean` is not `NULL`. +#' * `Estimates`: data.table containing period differences and/or k period +#' averages for estimates of +#' `fun` applied to `var` as well as the corresponding standard errors, which +#' are calculated using the bootstrap weights. In addition the sample size, +#' `n`, and population size for each group is added to the output. +#' * `smallGroups`: data.table containing groups for which the number of +#' observations falls below `size.limit`. +#' * `cvHigh`: data.table containing a boolean variable which indicates for each +#' estimate if the estimated standard error exceeds `cv.limit`. +#' * `stEDecrease`: data.table indicating for each estimate the theoretical +#' increase in sample size which is gained when averaging over k periods. Only +#' returned if `period.mean` is not `NULL`. 
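To illustrate the `fun` contract described above (a function returning a single double or integer, with the weight vector as its second argument), a minimal sketch of a user-supplied estimator follows; the helper `weightedMean` is illustrative and not part of the package:

weightedMean <- function(x, w) {
  # weighted mean of x, ignoring missing values in x
  sum(x * w, na.rm = TRUE) / sum(w[!is.na(x)])
}
# used like the predefined estimators, e.g. on the example data below:
# err.est <- calc.stError(dat_boot_calib, var = "eqIncome", fun = weightedMean)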
#' #' @seealso [draw.bootstrap] \cr #' [recalib] @@ -113,37 +206,45 @@ #' #' # estimate weightedRatio for povertyRisk per period #' -#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio) +#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", +#' fun = weightedRatio) #' err.est$Estimates #' -#' # calculate weightedRatio for povertyRisk and fraction of one-person households per period +#' # calculate weightedRatio for povertyRisk and fraction of one-person +#' # households per period #' #' dat_boot_calib[, onePerson := .N == 1, by = .(year, hid)] -#' err.est <- calc.stError(dat_boot_calib, var = c("povertyRisk", "onePerson"), fun = weightedRatio) +#' err.est <- calc.stError(dat_boot_calib, var = c("povertyRisk", "onePerson"), +#' fun = weightedRatio) #' err.est$Estimates #' #' # estimate weightedRatio for povertyRisk per period and gender #' #' group <- "gender" -#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio, group = group) +#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", +#' fun = weightedRatio, group = group) #' err.est$Estimates #' -#' # estimate weightedRatio for povertyRisk per period and gender, region and combination of both +#' # estimate weightedRatio for povertyRisk per period and gender, region and +#' # combination of both #' #' group <- list("gender", "region", c("gender", "region")) -#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio, group = group) +#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", +#' fun = weightedRatio, group = group) #' err.est$Estimates #' #' # use average over 3 periods for standard error estimation #' -#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio, period.mean = 3) +#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", +#' fun = weightedRatio, period.mean = 3) #' err.est$Estimates #' #' # get estimate for difference of period 2016 and 2013 #' #' period.diff <- c("2015-2011") -#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio, -#' period.diff = period.diff, period.mean = 3) +#' err.est <- calc.stError( +#' dat_boot_calib, var = "povertyRisk", fun = weightedRatio, +#' period.diff = period.diff, period.mean = 3) #' err.est$Estimates #' #' # use add.arg-argument @@ -156,10 +257,12 @@ #' period.mean = 0, add.arg=add.arg) #' err.est$Estimates #' # compare with direkt computation -#' compare.value <- dat_boot_calib[,fun(povertyRisk,pWeight,b=onePerson),by=c("year")] +#' compare.value <- dat_boot_calib[,fun(povertyRisk,pWeight,b=onePerson), +#' by=c("year")] #' all((compare.value$V1-err.est$Estimates$val_povertyRisk)==0) #' -#' # use a function from an other package that has sampling weights as its second argument +#' # use a function from another package that has sampling weights as its +#' # second argument #' # for example gini() from laeken #' #' library(laeken) #' #' ## set up help function that returns only the gini index #' help_gini <- function(x, w) { #' return(gini(x, w)$value) #' } #' -#' ## make sure povertyRisk get coerced to a numeric in order to work with the external functions +#' ## make sure povertyRisk gets coerced to a numeric in order to work with the +#' ## external functions #' invisible(dat_boot_calib[, povertyRisk := as.numeric(povertyRisk)]) #' -#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = help_gini, group = group, -#' period.diff = period.diff, period.mean = 3) +#' err.est <- calc.stError( +#' dat_boot_calib, var = "povertyRisk", fun = help_gini, group = group, 
+#' period.diff = period.diff, period.mean = 3) #' err.est$Estimates #' #' # using fun.adjust.var and adjust.var to estimate povmd60 indicator -#' # for each period and bootstrap weight before applying the weightedRatio point estimate +#' # for each period and bootstrap weight before applying the weightedRatio +#' # point estimate #' #' # this function estimates the povmd60 indicator with x as income vector #' # and w as weight vector @@ -191,9 +297,9 @@ #' # the povmd60 indicator for each bootstrap weight #' # and the resultung indicators are passed to function weightedRatio #' -#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio, -#' group = group, fun.adjust.var = povmd, adjust.var = "eqIncome", -#' period.mean = 3) +#' err.est <- calc.stError( +#' dat_boot_calib, var = "povertyRisk", fun = weightedRatio, group = group, +#' fun.adjust.var = povmd, adjust.var = "eqIncome", period.mean = 3) #' err.est$Estimates #' #' # why fun.adjust.var and adjust.var are needed (!!!): @@ -211,9 +317,9 @@ #' # but this results in different results in subgroups #' # compared to using fun.adjust.var and adjust.var #' -#' err.est.different <- calc.stError(dat_boot_calib, var = "eqIncome", fun = povmd2, -#' group = group, fun.adjust.var = NULL, adjust.var = NULL, -#' period.mean = 3) +#' err.est.different <- calc.stError( +#' dat_boot_calib, var = "eqIncome", fun = povmd2, group = group, +#' fun.adjust.var = NULL, adjust.var = NULL, period.mean = 3) #' err.est.different$Estimates #' #' ## results are equal for yearly estimates @@ -231,7 +337,8 @@ # wrapper-function to apply fun to var using weights (~weights, b.weights) -# and calculating standard devation (using the bootstrap replicates) per period and for k-period rolling means +# and calculating standard deviation (using the bootstrap replicates) per period +# and for k-period rolling means calc.stError <- function( dat, weights = attr(dat, "weights"), b.weights = attr(dat, "b.rep"), period = attr(dat, "period"), var, fun = weightedRatio, national = FALSE, @@ -564,7 +671,8 @@ calc.stError <- function( # function to apply fun to var using weights (~weights, b.weights) -# and calculating standard devation (using the bootstrap replicates) per period and for k-period rolling means +# and calculating standard deviation (using the bootstrap replicates) per period +# and for k-period rolling means help.stError <- function( dat, period, var, weights, b.weights = paste0("w", 1:1000), fun, national, group, fun.adjust.var, adjust.var, period.diff = NULL, period.mean = diff --git a/R/computeFrac.R b/R/computeFrac.R index 09f3186..9183986 100644 --- a/R/computeFrac.R +++ b/R/computeFrac.R @@ -1,15 +1,15 @@ #' Numerical weighting functions #' -#' Customize weight-updating within factor levels in case of numerical calibration. The -#' functions described here serve as inputs for [ipf]. +#' Customize weight-updating within factor levels in case of numerical +#' calibration. The functions described here serve as inputs for [ipf]. #' #' `computeFrac` provides the "standard" IPU updating scheme given as #' #' \deqn{f = target/curValue} #' -#' which means that each weight inside the level will be multtiplied by the same factor when -#' doing the actual update step (`w := f*w`). `computeLinear` on the other hand -#' calculates `f` as +#' which means that each weight inside the level will be multiplied by the same +#' factor when doing the actual update step (`w := f*w`). 
`computeLinear` on the +#' other hand calculates `f` as #' #' \ifelse{html}{ #' \out{
fi = a · xi + b
} @@ -25,14 +25,16 @@ #' }{\deqn{\sum f_i * w_i = \sum w_i}} #' # \eqn{\sum}\out{fi wi xi} = `target` -#' `computeLinearG1` calculates `f` in the same way as `computeLinear`, but if `f_i*w_i<1` `f_i` will be set to `1/w_i`. +#' `computeLinearG1` calculates `f` in the same way as `computeLinear`, but if +#' `f_i*w_i<1` `f_i` will be set to `1/w_i`. #' #' @md #' @param curValue Current summed up value. Same as `sum(x*w)` #' @param target Target value. An element of `conP` in [ipf] #' @param x Vector of numeric values to be calibrated against #' @param w Vector of weights -#' @param boundLinear The output `f` will satisfy `1/boundLinear <= f <= boundLinear`. See `bound` in [ipf] +#' @param boundLinear The output `f` will satisfy +#' `1/boundLinear <= f <= boundLinear`. See `bound` in [ipf] #' #' @return A weight multiplier `f` #' diff --git a/R/demo.eusilc.R b/R/demo.eusilc.R index 065264b..6154f7d 100644 --- a/R/demo.eusilc.R +++ b/R/demo.eusilc.R @@ -1,27 +1,32 @@ #' Generate multiple years of EU-SILC data #' -#' Create a dummy dataset to be used for demonstrating the functionalities of the `surveysd` package -#' based on [laeken::eusilc]. Please refer to the documentation page of the original data for -#' details about the variables. +#' Create a dummy dataset to be used for demonstrating the functionalities of +#' the `surveysd` package based on [laeken::eusilc]. Please refer to the +#' documentation page of the original data for details about the variables. #' #' @param n Number of years to generate. Should be at least 1 -#' @param prettyNames Create easy-to-read names for certain variables. Recommended for demonstration -#' purposes. Otherwise, use the original codes documented in [laeken::eusilc]. +#' @param prettyNames Create easy-to-read names for certain variables. +#' Recommended for demonstration purposes. Otherwise, use the original codes +#' documented in [laeken::eusilc]. #' #' @details -#' If `prettyNames` is `TRUE`, the following variables will be available in an easy-to-read manner. +#' If `prettyNames` is `TRUE`, the following variables will be available in an +#' easy-to-read manner. #' -#' * `hid` Household id. Consistent with respect to the reference period (`year`) +#' * `hid` Household id. Consistent with respect to the reference period +#' (`year`) #' * `hsize` Size of the household. derived from `hid` and `period` #' * `region` Federal state of austria where the household is located #' * `pid` Personal id. Consistent with respect to the reference period (`year`) #' * `age` Age-class of the respondent #' * `gender` A persons gender (`"male"`, `"Female"`) -#' * `ecoStat` Ecnomic status (`"part time"`, `"full time"`, `"unemployed"`, ...) +#' * `ecoStat` Economic status +#' (`"part time"`, `"full time"`, `"unemployed"`, ...) #' * `citizenship` Citizenship (`"AT"`, `"EU"`, `"other"`) #' * `pWeight` Personal sample weight inside the reference period #' * `year`. Simulated reference period -#' * `povertyRisk`. Logical variable determining whether a respondent is at risk of poverty +#' * `povertyRisk`. Logical variable determining whether a respondent is at risk +#' of poverty #' #' @importFrom dplyr recode #' @examples diff --git a/R/draw.bootstrap.R b/R/draw.bootstrap.R index 4d26275..e0decae 100644 --- a/R/draw.bootstrap.R +++ b/R/draw.bootstrap.R @@ -1,73 +1,130 @@ #' @title Draw bootstrap replicates #' -#' @description Draw bootstrap replicates from survey data with rotating panel design. 
-#' Survey information, like ID, sample weights, strata and population totals per strata, should be specified to ensure meaningfull survey bootstraping. +#' design. Survey information, like ID, sample weights, strata and population +#' totals per strata, should be specified to ensure meaningful survey +#' bootstrapping. #' -#' -#' @param dat either data.frame or data.table containing the survey data with rotating panel design. +#' @param dat either data.frame or data.table containing the survey data with +#' rotating panel design. #' @param REP integer indicating the number of bootstrap replicates. -#' @param hid character specifying the name of the column in `dat` containing the household id. If -#' `NULL` (the default), the household structure is not regarded. -#' @param weights character specifying the name of the column in `dat` containing the sample weights. -#' @param period character specifying the name of the column in `dat` containing the sample periods. -#' If `NULL` (the default), it is assumed that all observations belong to the same -#' period. -#' @param strata character vector specifying the name(s) of the column in `dat` by which the population -#' was stratified. If `strata` is a vector stratification will be assumed as the combination -#' of column names contained in `strata`. Setting in addition `cluster` not NULL stratification -#' will be assumed on multiple stages, where each additional entry in `strata` specifies the -#' stratification variable for the next lower stage. see Details for more information. -#' @param cluster character vector specifying cluster in the data. If not already specified in `cluster` household ID is taken es the lowest level cluster. -#' @param totals character specifying the name of the column in `dat` containing the the totals per strata and/or cluster. Is ONLY optional if `cluster` is `NULL` or equal `hid` and `strata` contains one columnname! -#' Then the households per strata will be calcualted using the `weights` argument. If clusters and strata for multiple stages are specified `totals` needs to be a vector of `length(strata)` specifying the column on `dat` -#' that contain the total number of PSUs at each stage. `totals` is interpreted from left the right, meaning that the first argument corresponds to the number of PSUs at the first and the last argument to the number of PSUs at the last stage. -#' @param single.PSU either "merge" or "mean" defining how single PSUs need to be dealt with. -#' For `single.PSU="merge"` single PSUs at each stage are merged with the strata or cluster with the next least number of PSUs. If multiple of those exist one will be select via random draw -#' For `single.PSU="mean"` single PSUs will get the mean over all bootstrap replicates at the stage which did not contain single PSUs. -#' @param boot.names character indicating the leading string of the column names for each bootstrap replica. If NULL defaults to "w". -#' @param split logical, if TRUE split households are considered using `pid`, for more information see Details. -#' @param pid column in `dat` specifying the personal identifier. This identifier needs to be unique for each person throught the whole data set. -#' @param new.method logical, if TRUE bootstrap replicates will never be negative even if in some strata the whole population is in the sample. WARNING: This is still experimental and resulting standard errors might be underestimated! Use this if for some strata the whole population is in the sample! +#' @param hid character specifying the name of the column in `dat` containing +#' the household id. If `NULL` (the default), the household structure is not +#' regarded. +#' @param weights character specifying the name of the column in `dat` +#' containing the sample weights. +#' @param period character specifying the name of the column in `dat` containing +#' the sample periods. If `NULL` (the default), it is assumed that all +#' observations belong to the same period. +#' @param strata character vector specifying the name(s) of the column in `dat` +#' by which the population was stratified. If `strata` is a vector, +#' stratification will be assumed as the combination of column names contained +#' in `strata`. If, in addition, `cluster` is not NULL, stratification will be +#' assumed on multiple stages, where each additional entry in `strata` +#' specifies the stratification variable for the next lower stage. See Details +#' for more information. +#' @param cluster character vector specifying cluster in the data. If not +#' already specified in `cluster` household ID is taken as the lowest level +#' cluster. +#' @param totals character specifying the name of the column in `dat` containing +#' the totals per strata and/or cluster. Is ONLY optional if `cluster` is +#' `NULL` or equal `hid` and `strata` contains one column name! Then the +#' households per strata will be calculated using the `weights` argument. If +#' clusters and strata for multiple stages are specified `totals` needs to be +#' a vector of `length(strata)` specifying the columns in `dat` that contain +#' the total number of PSUs at each stage. `totals` is interpreted from left +#' to right, meaning that the first argument corresponds to the number of +#' PSUs at the first and the last argument to the number of PSUs at the last +#' stage. +#' @param single.PSU either "merge" or "mean" defining how single PSUs need to +#' be dealt with. For `single.PSU="merge"` single PSUs at each stage are +#' merged with the strata or cluster with the next least number of PSUs. If +#' multiple of those exist one will be selected via random draw. For +#' `single.PSU="mean"` single PSUs will get the mean over all bootstrap +#' replicates at the stage which did not contain single PSUs. +#' @param boot.names character indicating the leading string of the column names +#' for each bootstrap replicate. If NULL defaults to "w". +#' @param split logical, if TRUE split households are considered using `pid`, +#' for more information see Details. +#' @param pid column in `dat` specifying the personal identifier. This +#' identifier needs to be unique for each person throughout the whole data set. +#' @param new.method logical, if TRUE bootstrap replicates will never be +#' negative even if in some strata the whole population is in the sample. +#' WARNING: This is still experimental and resulting standard errors might be +#' underestimated! 
Use this if for some strata the whole population is in the +#' sample! #' -#' @return the survey data with the number of REP bootstrap replicates added as columns. +#' @return the survey data with the number of REP bootstrap replicates added as +#' columns. #' -#' @details `draw.bootstrap` takes `dat` and draws `REP` bootstrap replicates from it. -#' `dat` must be household data where household members correspond to multiple rows with the same household -#' identifier. For most practical applications, the following columns should be available in the dataset +#' @details `draw.bootstrap` takes `dat` and draws `REP` bootstrap replicates +#' from it. +#' `dat` must be household data where household members correspond to multiple +#' rows with the same household +#' identifier. For most practical applications, the following columns should be +#' available in the dataset #' and passed via the corresponding parameters: #' #' * Column indicating the sample period (parameter `period`). #' * Column indicating the household ID (parameter `hid`) #' * Column containing the household sample weights (parameter `weights`); -#' * Columns by which population was stratified during the sampling process (parameter: `strata`). +#' * Columns by which population was stratified during the sampling process +#' (parameter: `strata`). #' -#' For single stage sampling design a column the argument `totals` is optional, meaning that a column of the number of PSUs at the first stage does not need to be supplied. -#' For this case the number of PSUs is calculated and added to `dat` using `strata` and `weights`. By setting `cluster` to NULL single stage sampling design is always assumed and -#' if `strata` contains of multiple column names the combination of all those column names will be used for stratification. +#' For a single stage sampling design the argument `totals` is optional, +#' meaning that a column of the number of PSUs at the first stage does not need +#' to be supplied. +#' For this case the number of PSUs is calculated and added to `dat` using +#' `strata` and `weights`. By setting `cluster` to NULL single stage sampling +#' design is always assumed and +#' if `strata` contains multiple column names the combination of all those +#' column names will be used for stratification. #' -#' In the case of multi stage sampling design the argument `totals` needs to be specified and needs to have the same number of arguments as `strata`. +#' In the case of a multi stage sampling design the argument `totals` needs to be +#' specified and needs to have the same number of arguments as `strata`. #' -#' If `cluster` is `NULL` or does not contain `hid` at the last stage, `hid` will automatically be used as the final cluster. If, besides `hid`, clustering in additional stages is specified the number of column names in -#' `strata` and `cluster` (including `hid`) must be the same. If for any stage there was no clustering or stratification one can set "1" or "I" for this stage. +#' If `cluster` is `NULL` or does not contain `hid` at the last stage, `hid` +#' will automatically be used as the final cluster. If, besides `hid`, +#' clustering in additional stages is specified, the number of column names in +#' `strata` and `cluster` (including `hid`) must be the same. If for any stage +#' there was no clustering or stratification one can set "1" or "I" for this +#' stage. 
For most practical applications, the following columns should be available in the dataset +#' @details `draw.bootstrap` takes `dat` and draws `REP` bootstrap replicates +#' from it. +#' `dat` must be household data where household members correspond to multiple +#' rows with the same household +#' identifier. For most practical applications, the following columns should be +#' available in the dataset #' and passed via the corresponding parameters: #' #' * Column indicating the sample period (parameter `period`). #' * Column indicating the household ID (parameter `hid`) #' * Column containing the household sample weights (parameter `weights`); -#' * Columns by which population was stratified during the sampling process (parameter: `strata`). +#' * Columns by which population was stratified during the sampling process +#' (parameter: `strata`). #' -#' For single stage sampling design a column the argument `totals` is optional, meaning that a column of the number of PSUs at the first stage does not need to be supplied. -#' For this case the number of PSUs is calculated and added to `dat` using `strata` and `weights`. By setting `cluster` to NULL single stage sampling design is always assumed and -#' if `strata` contains of multiple column names the combination of all those column names will be used for stratification. +#' For single stage sampling design a column the argument `totals` is optional, +#' meaning that a column of the number of PSUs at the first stage does not need +#' to be supplied. +#' For this case the number of PSUs is calculated and added to `dat` using +#' `strata` and `weights`. By setting `cluster` to NULL single stage sampling +#' design is always assumed and +#' if `strata` contains of multiple column names the combination of all those +#' column names will be used for stratification. #' -#' In the case of multi stage sampling design the argument `totals` needs to be specified and needs to have the same number of arguments as `strata`. +#' In the case of multi stage sampling design the argument `totals` needs to be +#' specified and needs to have the same number of arguments as `strata`. #' -#' If `cluster` is `NULL` or does not contain `hid` at the last stage, `hid` will automatically be used as the final cluster. If, besides `hid`, clustering in additional stages is specified the number of column names in -#' `strata` and `cluster` (including `hid`) must be the same. If for any stage there was no clustering or stratification one can set "1" or "I" for this stage. +#' If `cluster` is `NULL` or does not contain `hid` at the last stage, `hid` +#' will automatically be used as the final cluster. If, besides `hid`, +#' clustering in additional stages is specified the number of column names in +#' `strata` and `cluster` (including `hid`) must be the same. If for any stage +#' there was no clustering or stratification one can set "1" or "I" for this +#' stage. #' -#' For example `strata=c("REGION","I"),cluster=c("MUNICIPALITY","HID")` would speficy a 2 stage sampling design where at the first stage the municipalities where drawn stratified by regions -#' and at the 2nd stage housholds are drawn in each municipality without stratification. +#' For example `strata=c("REGION","I"),cluster=c("MUNICIPALITY","HID")` would +#' speficy a 2 stage sampling design where at the first stage the municipalities +#' where drawn stratified by regions +#' and at the 2nd stage housholds are drawn in each municipality without +#' stratification. 
#' -#' Bootstrap replicates are drawn for each survey period (`period`) using the function [rescaled.bootstrap]. -#' Afterwards the bootstrap replicates for each household are carried forward from the first period the household enters the survey to all the censecutive periods it stays in the survey. +#' Bootstrap replicates are drawn for each survey period (`period`) using the +#' function [rescaled.bootstrap]. +#' Afterwards the bootstrap replicates for each household are carried forward +#' from the first period the household enters the survey to all the consecutive +#' periods it stays in the survey. #' -#' This ensures that the bootstrap replicates follow the same logic as the sampled households, making the bootstrap replicates more comparable to the actual sample units. +#' This ensures that the bootstrap replicates follow the same logic as the +#' sampled households, making the bootstrap replicates more comparable to the +#' actual sample units. #' -#' If `split` ist set to `TRUE` and `pid` is specified, the bootstrap replicates are carried forward using the personal identifiers instead of the houshold identifier. +#' If `split` is set to `TRUE` and `pid` is specified, the bootstrap replicates +#' are carried forward using the personal identifiers instead of the household +#' identifier. #' This takes into account the issue of a houshold splitting up. -#' Any person in this new split household will get the same bootstrap replicate as the person that has come from an other household in the survey. -#' People who enter already existing households will also get the same bootstrap replicate as the other households members had in the previous periods. +#' Any person in this new split household will get the same bootstrap replicate +#' as the person that has come from another household in the survey. +#' People who enter already existing households will also get the same bootstrap +#' replicate as the other household members had in the previous periods. #' -#' @return Returns a data.table containing the original data as well as the number of `REP` columns containing the bootstrap replicates for each repetition.\cr -#' The columns of the bootstrap replicates are by default labeled "w*Number*" where *Number* goes from 1 to `REP`. -#' If the column names of the bootstrap replicates should start with a different character or string the parameter `boot.names` can be used. +#' @return Returns a data.table containing the original data as well as the +#' number of `REP` columns containing the bootstrap replicates for each +#' repetition.\cr +#' The columns of the bootstrap replicates are by default labeled "w*Number*" +#' where *Number* goes from 1 to `REP`. If the column names of the bootstrap +#' replicates should start with a different character or string the parameter +#' `boot.names` can be used. #' -#' @seealso [`data.table`][data.table::data.table] for more information on data.table objects. +#' @seealso [`data.table`][data.table::data.table] for more information on +#' data.table objects. #' #' @author Johannes Gussenbauer, Alexander Kowarik, Statistics Austria #' @@ -76,15 +133,18 @@ #' eusilc <- demo.eusilc(prettyNames = TRUE) #' #' ## draw sample without stratification or clustering -#' dat_boot <- draw.bootstrap(eusilc, REP = 10, weights = "pWeight", period = "year") +#' dat_boot <- draw.bootstrap(eusilc, REP = 10, weights = "pWeight", +#' period = "year") #' #' ## use stratification w.r.t. region and clustering w.r.t. 
households -#' dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid", weights = "pWeight", -#' strata = "region", period = "year") +#' dat_boot <- draw.bootstrap( +#' eusilc, REP = 10, hid = "hid", weights = "pWeight", +#' strata = "region", period = "year") #' #' ## use multi-level clustering -#' dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid", weights = "pWeight", -#' strata = c("region", "age"), period = "year") +#' dat_boot <- draw.bootstrap( +#' eusilc, REP = 10, hid = "hid", weights = "pWeight", +#' strata = c("region", "age"), period = "year") #' #' #' # create spit households #' eusilc[, pidsplit := pid] #' year <- eusilc[, unique(year)] #' year <- year[-1] #' leaf_out <- c() #' for(y in year){ -#' split.person <- eusilc[year == (y-1) & !duplicated(hid) & !(hid %in% leaf_out), -#' sample(pid, 20)] -#' overwrite.person <- eusilc[(year == (y)) & !duplicated(hid) & !(hid %in% leaf_out), -#' .(pid = sample(pid, 20))] +#' split.person <- eusilc[ +#' year == (y-1) & !duplicated(hid) & !(hid %in% leaf_out), +#' sample(pid, 20) +#' ] +#' overwrite.person <- eusilc[ +#' (year == (y)) & !duplicated(hid) & !(hid %in% leaf_out), +#' .(pid = sample(pid, 20)) +#' ] #' overwrite.person[, c("pidsplit", "year_curr") := .(split.person, y)] #' -#' eusilc[overwrite.person, pidsplit := i.pidsplit, on = .(pid, year >= year_curr)] +#' eusilc[overwrite.person, pidsplit := i.pidsplit, +#' on = .(pid, year >= year_curr)] #' leaf_out <- c(leaf_out, -#' eusilc[pid %in% c(overwrite.person$pid, overwrite.person$pidsplit), +#' eusilc[pid %in% c(overwrite.person$pid, +#' overwrite.person$pidsplit), #' unique(hid)]) #' } #' -#' dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid", weights = "pWeight", -#' strata = c("region", "age"), period = "year", split = TRUE, -#' pid = "pidsplit") +#' dat_boot <- draw.bootstrap( +#' eusilc, REP = 10, hid = "hid", weights = "pWeight", +#' strata = c("region", "age"), period = "year", split = TRUE, +#' pid = "pidsplit") #' # split households were considered e.g. household and #' # split household were both selected or not selected #' dat_boot[, data.table::uniqueN(w1), by = pidsplit][V1 > 1] diff --git a/R/generateHHID.R b/R/generateHHID.R index 9950b95..af46a5f 100644 --- a/R/generateHHID.R +++ b/R/generateHHID.R @@ -1,17 +1,27 @@ -#' Generate new houshold ID for survey data with rotating panel design taking into account split households +#' Generate new household ID for survey data with rotating panel design taking +#' into account split households #' -#' Generating a new houshold ID for survey data using a houshold ID and a personal ID. -#' For surveys with rotating panel design containing housholds, houshold members can move from an existing household to a new one, that was not originally in the sample. -#' This leads to the creation of so called split households. Using a peronal ID (that stays fixed over the whole survey), an indicator for different time steps and a houshold ID, -#' a new houshold ID is assigned to the original and the split household. +#' Generating a new household ID for survey data using a household ID and a +#' personal ID. +#' For surveys with rotating panel design containing households, household members +#' can move from an existing household to a new one that was not originally in +#' the sample. This leads to the creation of so-called split households. Using a +#' personal ID (that stays fixed over the whole survey), an indicator for +#' different time steps and a household ID, a new household ID is assigned to the +#' original and the split household. 
#'
#' @param dat data table or data frame containing the survey data
-#' @param period column name of \code{dat} containing an indicator for the rotations, e.g years, quarters, months, ect...
-#' @param pid column name of \code{dat} containing the personal identifier. This needs to be fixed for an indiviual throught the whole survey
-#' @param hid column name of \code{dat} containing the household id. This needs to for a household throught the whole survey
+#' @param period column name of \code{dat} containing an indicator for the
+#'   rotations, e.g. years, quarters, months, etc.
+#' @param pid column name of \code{dat} containing the personal identifier. This
+#'   needs to be fixed for an individual throughout the whole survey
+#' @param hid column name of \code{dat} containing the household id. This needs
+#'   to be fixed for a household throughout the whole survey
#'
-#' @return the survey data \code{dat} as data.table object containing a new and an old household ID. The new household ID which considers the split households is now named \code{hid} and the original household ID
-#' has a trailing "_orig".
+#' @return the survey data \code{dat} as a data.table object containing a new and
+#'   an old household ID. The new household ID which considers the split
+#'   households is now named \code{hid} and the original household ID has a
+#'   trailing "_orig".
#' @export generate.HHID
#'
#' @examples
@@ -34,16 +44,19 @@
#' .(rb030=sample(rb030,20))]
#' overwrite.person[,c("rb030split","year_curr"):=.(split.person,y)]
#'
-#' eusilc[overwrite.person,rb030split:=i.rb030split,on=.(rb030,year>=year_curr)]
-#' leaf_out <- c(leaf_out,
-#'   eusilc[rb030%in%c(overwrite.person$rb030,overwrite.person$rb030split),
-#'   unique(db030)])
+#' eusilc[overwrite.person,
+#'   rb030split:=i.rb030split,on=.(rb030,year>=year_curr)]
+#' leaf_out <- c(
+#'   leaf_out,
+#'   eusilc[rb030%in%c(overwrite.person$rb030,overwrite.person$rb030split),
+#'   unique(db030)])
#' }
#'
#' # pid which are in split households
#' eusilc[,.(uniqueN(db030)),by=list(rb030split)][V1>1]
#'
-#' eusilc.new <- generate.HHID(eusilc,period="year",pid="rb030split",hid="db030")
+#' eusilc.new <- generate.HHID(eusilc, period = "year", pid = "rb030split",
+#'   hid = "db030")
#'
#' # no longer any split households in the data
#' eusilc.new[,.(uniqueN(db030)),by=list(rb030split)][V1>1]
diff --git a/R/ipf.r b/R/ipf.r
index 02c22e2..5cac685 100644
--- a/R/ipf.r
+++ b/R/ipf.r
@@ -371,89 +371,101 @@ addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original,
#' This function implements the weighting procedure described
#' [here](http://www.ajs.or.at/index.php/ajs/article/viewFile/doi10.17713ajs.v45i3.120/512).
#'
-#' `conP` and `conH` are contingency tables, which can be created with `xtabs`. The `dimnames` of those
-#' tables should match the names and levels of the corresponding columns in `dat`.
+#' `conP` and `conH` are contingency tables, which can be created with `xtabs`.
+#' The `dimnames` of those tables should match the names and levels of the
+#' corresponding columns in `dat`.
#'
-#' `maxIter`, `epsP` and `epsH` are the stopping criteria. `epsP` and `epsH` describe relative tolerances
-#' in the sense that
+#' `maxIter`, `epsP` and `epsH` are the stopping criteria. `epsP` and `epsH`
+#' describe relative tolerances in the sense that
#' \out{\deqn{1-epsP < \frac{w_{i+1}}{w_i} < 1+epsP}{1-epsP < w(i+1)/w(i) < 1+epsP} }
-#' will be used as convergence criterium. Here i is the iteration step and wi is the weight of a
-#' specific person at step i.
+#' will be used as convergence criterion. Here i is the iteration step and wi is
+#' the weight of a specific person at step i.
#'
#' The algorithm
-#' performs best if all varables occuring in the constraints (`conP` and `conH`) as well as the
-#' household variable are coded as `factor`-columns in `dat`. Otherwise, conversions will be necessary
-#' which can be monitored with the `conversion_messages` argument.
-#' Setting `check_hh_vars` to `FALSE` can also incease the performance of the scheme.
+#' performs best if all variables occurring in the constraints (`conP` and `conH`)
+#' as well as the household variable are coded as `factor`-columns in `dat`.
+#' Otherwise, conversions will be necessary which can be monitored with the
+#' `conversion_messages` argument. Setting `check_hh_vars` to `FALSE` can also
+#' increase the performance of the scheme.
#'
#' @name ipf
#' @md
#' @aliases ipf
#' @param dat a `data.table` containing household ids (optionally), base
-#' weights (optionally), household and/or personal level variables (numerical
-#' or categorical) that should be fitted.
-#' @param hid name of the column containing the household-ids
-#' within `dat` or NULL if such a variable does not exist.
-#' @param w name if the column containing the base
-#' weights within `dat` or NULL if such a variable does not exist. In the
-#' latter case, every observation in `dat` is assigned a starting weight
-#' of 1.
+#'   weights (optionally), household and/or personal level variables (numerical
+#'   or categorical) that should be fitted.
+#' @param hid name of the column containing the household-ids within `dat` or
+#'   NULL if such a variable does not exist.
+#' @param w name of the column containing the base weights within `dat` or NULL
+#'   if such a variable does not exist. In the latter case, every observation
+#'   in `dat` is assigned a starting weight of 1.
#' @param conP list or (partly) named list defining the constraints on person
-#' level. The list elements are contingency tables in array representation
-#' with dimnames corresponding to the names of the relevant calibration
-#' variables in `dat`. If a numerical variable is to be calibrated, the
-#' respective list element has to be named with the name of that numerical
-#' variable. Otherwise the list element shoud NOT be named.
+#'   level. The list elements are contingency tables in array representation
+#'   with dimnames corresponding to the names of the relevant calibration
+#'   variables in `dat`. If a numerical variable is to be calibrated, the
+#'   respective list element has to be named with the name of that numerical
+#'   variable. Otherwise the list element should NOT be named.
#' @param conH list or (partly) named list defining the constraints on
-#' household level. The list elements are contingency tables in array
-#' representation with dimnames corresponding to the names of the relevant
-#' calibration variables in `dat`. If a numerical variable is to be
-#' calibrated, the respective list element has to be named with the name of
-#' that numerical variable. Otherwise the list element shoud NOT be named.
+#'   household level. The list elements are contingency tables in array
+#'   representation with dimnames corresponding to the names of the relevant
+#'   calibration variables in `dat`. If a numerical variable is to be
+#'   calibrated, the respective list element has to be named with the name of
+#'   that numerical variable. Otherwise the list element should NOT be named.
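As a concrete sketch of the constraint format just described (assuming a data set `eusilcS` with columns `weight`, `agegroup`, `gender`, `netIncome`, `household` and `baseWeight`, as in the package examples further below):

# unnamed list element: a categorical margin over agegroup;
# element named "netIncome": the weighted total of that numerical
# variable per gender is calibrated instead of a cell count
conP <- list(
  xtabs(weight ~ agegroup, data = eusilcS),
  netIncome = xtabs(weight * netIncome ~ gender, data = eusilcS)
)

# dimnames of each table must match names and levels of the columns in dat
res <- ipf(eusilcS, hid = "household", conP = conP, w = "baseWeight")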
#' @param epsP numeric value or list (of numeric values and/or arrays)
-#' specifying the convergence limit(s) for `conP`. The list can contain
-#' numeric values and/or arrays which must appear in the same order as the
-#' corresponding constraints in `conP`. Also, an array must have the same
-#' dimensions and dimnames as the corresponding constraint in `conP`.
+#'   specifying the convergence limit(s) for `conP`. The list can contain
+#'   numeric values and/or arrays which must appear in the same order as the
+#'   corresponding constraints in `conP`. Also, an array must have the same
+#'   dimensions and dimnames as the corresponding constraint in `conP`.
#' @param epsH numeric value or list (of numeric values and/or arrays)
-#' specifying the convergence limit(s) for `conH`. The list can contain
-#' numeric values and/or arrays which must appear in the same order as the
-#' corresponding constraints in `conH`. Also, an array must have the same
-#' dimensions and dimnames as the corresponding constraint in `conH`.
+#'   specifying the convergence limit(s) for `conH`. The list can contain
+#'   numeric values and/or arrays which must appear in the same order as the
+#'   corresponding constraints in `conH`. Also, an array must have the same
+#'   dimensions and dimnames as the corresponding constraint in `conH`.
#' @param verbose if TRUE, some progress information will be printed.
#' @param bound numeric value specifying the multiplier for determining the
-#' weight trimming boundary if the change of the base weights should be
-#' restricted, i.e. if the weights should stay between 1/`bound`*`w`
-#' and `bound`*\code{w}.
+#'   weight trimming boundary if the change of the base weights should be
+#'   restricted, i.e. if the weights should stay between 1/`bound`*`w`
+#'   and `bound`*\code{w}.
#' @param maxIter numeric value specifying the maximum number of iterations
#' that should be performed.
#' @param meanHH if TRUE, every person in a household is assigned the mean of
-#' the person weights corresponding to the household. If `"geometric"`, the geometric mean
-#' is used rather than the arithmetic mean.
-#' @param allPthenH if TRUE, all the person level calibration steps are performed before the houshold level calibration steps (and `meanHH`, if specified).
-#' If FALSE, the houshold level calibration steps (and `meanHH`, if specified) are performed after everey person level calibration step.
-#' This can lead to better convergence properties in certain cases but also means that the total number of calibration steps is increased.
-#' @param returnNA if TRUE, the calibrated weight will be set to NA in case of no convergence.
-#' @param looseH if FALSE, the actual constraints `conH` are used for calibrating all the hh weights.
-#' If TRUE, only the weights for which the lower and upper thresholds defined by `conH` and `epsH` are exceeded
-#' are calibrated. They are however not calibrated against the actual constraints `conH` but against
-#' these lower and upper thresholds, i.e. `conH`-`conH`*`epsH` and `conH`+`conH`*\code{epsH}.
+#'   the person weights corresponding to the household. If `"geometric"`, the
+#'   geometric mean is used rather than the arithmetic mean.
+#' @param allPthenH if TRUE, all the person level calibration steps are
+#'   performed before the household level calibration steps (and `meanHH`, if
+#'   specified). If FALSE, the household level calibration steps (and `meanHH`,
+#'   if specified) are performed after every person level calibration step.
+#'   This can lead to better convergence properties in certain cases but also
+#'   means that the total number of calibration steps is increased.
+#' @param returnNA if TRUE, the calibrated weight will be set to NA in case of
+#'   no convergence.
+#' @param looseH if FALSE, the actual constraints `conH` are used for
+#'   calibrating all the hh weights. If TRUE, only the weights for which the
+#'   lower and upper thresholds defined by `conH` and `epsH` are exceeded are
+#'   calibrated. They are however not calibrated against the actual constraints
+#'   `conH` but against these lower and upper thresholds, i.e.
+#'   `conH`-`conH`*`epsH` and `conH`+`conH`*\code{epsH}.
#' @param numericalWeighting See [numericalWeighting]
-#' @param check_hh_vars If `TRUE` check for non-unique values inside of a household for variables in
-#' household constraints
-#' @param conversion_messages show a message, if inputs need to be reformatted. This can be useful for speed
-#' optimizations if ipf is called several times with similar inputs (for example bootstrapping)
-#' @param nameCalibWeight character defining the name of the variable for the newly generated calibrated weight.
-#' @return The function will return the input data `dat` with the
-#' calibrated weights `calibWeight` as an additional column as well as attributes. If no convergence has been reached in `maxIter`
-#' steps, and `returnNA` is `TRUE` (the default), the column `calibWeights` will only consist of `NA`s. The attributes of the table are
-#' attributes derived from the `data.table` class as well as the following.
+#' @param check_hh_vars If `TRUE` check for non-unique values inside a
+#'   household for variables in household constraints
+#' @param conversion_messages show a message if inputs need to be reformatted.
+#'   This can be useful for speed optimizations if ipf is called several times
+#'   with similar inputs (for example bootstrapping)
+#' @param nameCalibWeight character defining the name of the variable for the
+#'   newly generated calibrated weight.
+#' @return The function will return the input data `dat` with the calibrated
+#'   weights `calibWeight` as an additional column as well as attributes. If no
+#'   convergence has been reached in `maxIter` steps, and `returnNA` is `TRUE`
+#'   (the default), the column `calibWeight` will only consist of `NA`s. The
+#'   attributes of the table are attributes derived from the `data.table` class
+#'   as well as the following.
#' \tabular{ll}{
#'   `converged` \tab Did the algorithm converge in `maxIter` steps? \cr
#'   `iterations` \tab The number of iterations performed. \cr
#'   `conP`, `conH`, `epsP`, `epsH` \tab See Arguments. \cr
#'   `conP_adj`, `conH_adj` \tab Adjusted versions of `conP` and `conH` \cr
-#'   `formP`, `formH` \tab Formulas that were used to calculate `conP_adj` and `conH_adj` based on the output table.
+#'   `formP`, `formH` \tab Formulas that were used to calculate `conP_adj` and
+#'   `conH_adj` based on the output table.
#' }
#' @seealso `\link[simPop]{ipu}`
#' @export ipf
@@ -482,7 +494,8 @@ addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original,
#' # treat households as a factor variable
#' eusilcS[, household := as.factor(household)]
#'
-#' ## example for base weights assuming a simple random sample of households stratified per region
+#' ## example for base weights assuming a simple random sample of households
+#' ## stratified per region
#' eusilcS[, regSamp := .N, by = state]
#' eusilcS[, regPop := sum(weight), by = state]
#' eusilcS[, baseWeight := regPop/regSamp]
@@ -496,7 +509,8 @@ addWeightsAndAttributes <- function(dat, conP, conH, epsP, epsH, dat_original,
#' conP3 <- xtabs(weight*netIncome ~ gender, data = eusilcS)
#'
#' ## constraints on household level
-#' conH1 <- xtabs(weight ~ hsize + state, data = eusilcS, subset = !duplicated(household))
+#' conH1 <- xtabs(weight ~ hsize + state, data = eusilcS,
+#'   subset = !duplicated(household))
#'
#' # array of convergence limits for conH1
#' epsH1 <- conH1
diff --git a/R/plot.R b/R/plot.R
index 99a9b63..eada563 100644
--- a/R/plot.R
+++ b/R/plot.R
@@ -3,17 +3,28 @@
#' Plot results of `calc.stError()`
#'
#' @param x object of class 'surveysd' output of function [calc.stError]
-#' @param variable Name of the variable for which standard errors have been calcualated in `dat`
-#' @param type can bei either 'summary' or 'grouping', default value is 'summary'.
-#' For 'summary' a barplot is created giving an overview of the number of estimates having the flag `smallGroup`, `cvHigh`, both or none of them.
-#' For 'grouping' results for point estimate and standard error are plotted for pre defined groups.
-#' @param groups If `type='grouping'` variables must be defined by which the data is grouped. Only 2 levels are supported as of right now.
-#' If only one group is defined the higher group will be the estimate over the whole period.
-#' Results are plotted for the first argument in `groups` as well as for the combination of `groups[1]` and `groups[2]`.
-#' @param sd.type can bei either `'ribbon'` or `'dot'` and is only used if `type='grouping'`. Default is `"dot"`
-#' For `sd.type='dot'` point estimates are plotted and flagged if the corresponding standard error and/or the standard error using the mean over k-periods exceeded the value `cv.limit` (see [calc.stError]).
-#' For `sd.type='ribbon'` the point estimates including ribbons, defined by point estimate +- estimated standard error are plotted.
-#' The calculated standard errors using the mean over k periods are plotted using less transparency. Results for the higher level (~`groups[1]`) are coloured grey.
+#' @param variable Name of the variable for which standard errors have been
+#'   calculated in `dat`
+#' @param type can be either `"summary"` or `"grouping"`, default value is
+#'   `"summary"`. For `"summary"` a barplot is created giving an overview of the
+#'   number of estimates having the flag `smallGroup`, `cvHigh`, both or none
+#'   of them. For `"grouping"` results for point estimate and standard error are
+#'   plotted for predefined groups.
+#' @param groups If `type='grouping'` variables must be defined by which the
+#'   data is grouped. Only 2 levels are supported as of right now. If only one
+#'   group is defined the higher group will be the estimate over the whole
+#'   period. Results are plotted for the first argument in `groups` as well as
+#'   for the combination of `groups[1]` and `groups[2]`.
+#' @param sd.type can be either `'ribbon'` or `'dot'` and is only used if
+#'   `type='grouping'`. Default is `"dot"`.
+#'   For `sd.type='dot'` point estimates are plotted and flagged if the
+#'   corresponding standard error and/or the standard error using the mean over
+#'   k-periods exceeds the value `cv.limit` (see [calc.stError]).
+#'   For `sd.type='ribbon'` the point estimates, including ribbons defined by
+#'   point estimate +- estimated standard error, are plotted.
+#'   The calculated standard errors using the mean over k periods are plotted
+#'   using less transparency. Results for the higher level (~`groups[1]`) are
+#'   coloured grey.
#' @param ... additional arguments supplied to plot.
#'
#' @examples
@@ -32,7 +43,8 @@
#'
#' # estimate weightedRatio for povmd60 per period
#' group <- list("gender", "region", c("gender", "region"))
-#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio,
+#' err.est <- calc.stError(dat_boot_calib, var = "povertyRisk",
+#'   fun = weightedRatio,
#'   group = group , period.mean = NULL)
#'
#'
diff --git a/R/print.R b/R/print.R
index 61c8534..c765ffe 100644
--- a/R/print.R
+++ b/R/print.R
@@ -1,7 +1,8 @@
#' @title Print function for surveysd objects
#'
#' @description
-#' Prints the results of a call to [calc.stError]. Shows used variables and function, number of point estiamtes
+#' Prints the results of a call to [calc.stError]. Shows used variables and
+#' function, number of point estimates
#' as well as properties of the results.
#'
#' @param x an object of class `'surveysd'`
diff --git a/R/recalib.R b/R/recalib.R
index 8832fc4..1448248 100644
--- a/R/recalib.R
+++ b/R/recalib.R
@@ -6,36 +6,58 @@
#'
#'
#' @details
-#' `recalib` takes survey data (`dat`) containing the bootstrap replicates generated by [draw.bootstrap] and calibrates weights for each bootstrap
-#' replication according to population totals for person- or household-specific variables. \cr
-#' `dat` must be household data where household members correspond to multiple rows with the same household identifier. The data should at least containt the following columns:
+#' `recalib` takes survey data (`dat`) containing the bootstrap replicates
+#' generated by [draw.bootstrap] and calibrates weights for each bootstrap
+#' replication according to population totals for person- or household-specific
+#' variables. \cr
+#' `dat` must be household data where household members correspond to multiple
+#' rows with the same household identifier. The data should at least contain
+#' the following columns:
#'
#' * Column indicating the sample period;
#' * Column indicating the household ID;
#' * Column containing the household sample weights;
-#' * Columns which contain the bootstrap replicates (see output of [draw.bootstrap]);
-#' * Columns indicating person- or household-specific variables for which sample weight should be adjusted.
+#' * Columns which contain the bootstrap replicates (see output of
+#'   [draw.bootstrap]);
+#' * Columns indicating person- or household-specific variables for which sample
+#'   weight should be adjusted.
#'
-#' For each period and each variable in `conP.var` and/or `conH.var` contingency tables are estimated to get margin totals on personal- and/or household-specific variables in the population.\cr
-#' Afterwards the bootstrap replicates are multiplied with the original sample weight and the resulting product ist then adjusted using `\link[surveysd]{ipf}` to match the previously calcualted contingency tables.
-#' In this process the columns of the bootstrap replicates are overwritten by the calibrated weights.\cr
+#' For each period and each variable in `conP.var` and/or `conH.var` contingency
+#' tables are estimated to get margin totals on personal- and/or
+#' household-specific variables in the population.\cr
+#' Afterwards the bootstrap replicates are multiplied by the original sample
+#' weight and the resulting product is then adjusted using [ipf()] to match the
+#' previously calculated contingency tables. In this process the columns of the
+#' bootstrap replicates are overwritten by the calibrated weights.\cr
#'
+#' @param dat either data.frame or data.table containing the sample survey for
+#'   various periods.
+#' @param hid character specifying the name of the column in `dat` containing
+#'   the household ID.
+#' @param weights character specifying the name of the column in `dat`
+#'   containing the sample weights.
+#' @param b.rep character specifying the names of the columns in `dat`
+#'   containing bootstrap weights which should be recalibrated
+#' @param period character specifying the name of the column in `dat` containing
+#'   the sample period.
+#' @param conP.var character vector containing person-specific variables to which
+#'   weights should be calibrated. Contingency tables for the population are
+#'   calculated per `period` using `weights`.
+#' @param conH.var character vector containing household-specific variables to
+#'   which weights should be calibrated. Contingency tables for the population
+#'   are calculated per `period` using `weights`.
+#' @param ... additional arguments passed on to function [ipf()] from this
+#'   package.
#'
-#' @param dat either data.frame or data.table containing the sample survey for various periods.
-#' @param hid character specifying the name of the column in `dat` containing the household ID.
-#' @param weights character specifying the name of the column in `dat` containing the sample weights.
-#' @param b.rep character specifying the names of the columns in `dat` containing bootstrap weights which should be recalibratet
-#' @param period character specifying the name of the column in `dat` containing the sample period.
-#' @param conP.var character vector containig person-specific variables to which weights should be calibrated. Contingency tables for the population are calculatet per `period` using `weights`.
-#' @param conH.var character vector containig household-specific variables to which weights should be calibrated. Contingency tables for the population are calculatet per `period` using `weights`.
-#' @param ... additional arguments passed on to function `\link[surveysd]{ipf}` from this package.
+#' @return Returns a data.table containing the survey data as well as the
+#'   calibrated weights for the bootstrap replicates. The original bootstrap
+#'   replicates are overwritten by the calibrated weights. If calibration of a
+#'   bootstrap replicate does not converge the bootstrap weight is not returned
+#'   and the numbering of the returned bootstrap weights is reduced by one.
#'
#'
-#' @return Returns a data.table containing the survey data as well as the calibrated weights for the bootstrap replicates. The original bootstrap replicates are overwritten by the calibrated weights.
-#' If calibration of a bootstrap replicate does not converge the bootsrap weight is not returned and numeration of the returned bootstrap weights is reduced by one.
-#'
-#'
-#' @seealso `\link[surveysd]{ipf}` for more information on iterative proportional fitting.
+#' @seealso [ipf()] for more information on iterative
+#'   proportional fitting.
#'
#' @author Johannes Gussenbauer, Alexander Kowarik, Statistics Austria
#'
@@ -44,7 +66,8 @@
#'
#' eusilc <- demo.eusilc(prettyNames = TRUE)
#'
-#' dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid", weights = "pWeight",
+#' dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid",
+#'   weights = "pWeight",
#'   strata = "region", period = "year")
#'
#' # calibrate weight for bootstrap replicates
diff --git a/R/rescaled.bootstrap.R b/R/rescaled.bootstrap.R
index 60feb26..88a35b5 100644
--- a/R/rescaled.bootstrap.R
+++ b/R/rescaled.bootstrap.R
@@ -1,33 +1,65 @@
#' @title Draw bootstrap replicates
#'
-#' @description Draw bootstrap replicates from survey data using the rescaled bootstrap for stratified multistage sampling, presented by Preston, J. (2009).
-#'
+#' @description Draw bootstrap replicates from survey data using the rescaled
+#'   bootstrap for stratified multistage sampling, presented by Preston, J.
+#'   (2009).
#'
#' @param dat either data frame or data table containing the survey sample
#' @param REP integer indicating the number of bootstraps to be drawn
-#' @param strata string specifying the column name in `dat` that is used for stratification. For multistage sampling multiple column names can be specified by `strata=c("strata1>strata2>strata3")`.
-#' See Details for more information.
-#' @param cluster string specifying the column name in `dat` that is used for clustering. For multistage sampling multiple column names can be specified by `cluster=c("cluster1>cluster2>cluster3")`.
+#' @param strata string specifying the column name in `dat` that is used for
+#'   stratification. For multistage sampling multiple column names can be
+#'   specified by `strata=c("strata1>strata2>strata3")`. See Details for more
+#'   information.
+#' @param cluster string specifying the column name in `dat` that is used for
+#'   clustering. For multistage sampling multiple column names can be specified
+#'   by `cluster=c("cluster1>cluster2>cluster3")`.
#' See Details for more information.
-#' @param fpc string specifying the column name in `dat` that contains the number of PSUs at the first stage. For multistage sampling the number of PSUs at each stage must be specified by `strata=c("fpc1>fpc2>fpc3")`.
-#' @param single.PSU either "merge" or "mean" defining how single PSUs need to be dealt with.
-#' For `single.PSU="merge"` single PSUs at each stage are merged with the strata or cluster with the next least number of PSUs. If multiple of those exist one will be select via random draw
-#' For `single.PSU="mean"` single PSUs will get the mean over all bootstrap replicates at the stage which did not contain single PSUs.
-#' @param return.value either "data" or "replicates" specifying the return value of the function. For "data" the survey data is returned as class `data.table`, for "replicates" only the bootstrap replicates are returned as `data.table`.
-#' @param check.input logical, if TRUE the input will be checked before applying the bootstrap procedure
-#' @param new.method logical, if TRUE bootstrap replicates will never be negative even if in some strata the whole population is in the sample. WARNING: This is still experimental and resulting standard errors might be underestimated! Use this if for some strata the whole population is in the sample!
+#' @param fpc string specifying the column name in `dat` that contains the
+#'   number of PSUs at the first stage. For multistage sampling the number of
+#'   PSUs at each stage must be specified by `fpc=c("fpc1>fpc2>fpc3")`.
+#' @param single.PSU either "merge" or "mean" defining how single PSUs need to
+#'   be dealt with. For `single.PSU="merge"` single PSUs at each stage are
+#'   merged with the strata or cluster with the next least number of PSUs. If
+#'   multiple of those exist one will be selected via random draw. For
+#'   `single.PSU="mean"` single PSUs will get the mean over all bootstrap
+#'   replicates at the stage which did not contain single PSUs.
+#' @param return.value either "data" or "replicates" specifying the return value
+#'   of the function. For "data" the survey data is returned as class
+#'   `data.table`, for "replicates" only the bootstrap replicates are returned
+#'   as `data.table`.
+#' @param check.input logical, if TRUE the input will be checked before applying
+#'   the bootstrap procedure
+#' @param new.method logical, if TRUE bootstrap replicates will never be
+#'   negative even if in some strata the whole population is in the sample.
+#'   WARNING: This is still experimental and resulting standard errors might be
+#'   underestimated! Use this if for some strata the whole population is in the
+#'   sample!
#'
-#' @details For specifying multistage sampling designs the column names in `strata`,`cluster` and `fpc` need to seperated by ">".\cr
-#' For multistage sampling the strings are read from left to right meaning that the column name before the first ">" is taken as the column for stratification/clustering/number of PSUs at the first and the column after the last ">" is taken as the column for stratification/clustering/number of PSUs at the last stage.
-#' If for some stages the sample was not stratified or clustered one must specify this by "1" or "I", e.g. `strata=c("strata1>I>strata3")` if there was no stratification at the second stage or `cluster=c("cluster1>cluster2>I")` if there were no clusters at the last stage.\cr
-#' The number of PSUs at each stage is not calculated internally and must be specified for any sampling design.
-#' For single stage sampling using stratification this can usually be done by adding over all sample weights of each PSU by each strata-code.\cr
-#' Spaces in each of the strings will be removed, so if column names contain spaces they should be renamed before calling this procedure!
+#' @details For specifying multistage sampling designs the column names in
+#' `strata`, `cluster` and `fpc` need to be separated by ">".\cr
+#' For multistage sampling the strings are read from left to right meaning that
+#' the column name before the first ">" is taken as the column for
+#' stratification/clustering/number of PSUs at the first stage and the column
+#' after the last ">" is taken as the column for stratification/clustering/
+#' number of PSUs at the last stage.
+#' If for some stages the sample was not stratified or clustered one must
+#' specify this by "1" or "I", e.g. `strata=c("strata1>I>strata3")` if there was
+#' no stratification at the second stage or `cluster=c("cluster1>cluster2>I")`
+#' if there were no clusters at the last stage.\cr
+#' The number of PSUs at each stage is not calculated internally and must be
+#' specified for any sampling design.
+#' For single stage sampling using stratification this can usually be done by
+#' adding up all sample weights of each PSU within each strata-code.\cr
+#' Spaces in each of the strings will be removed, so if column names contain
+#' spaces they should be renamed before calling this procedure!
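A sketch of the multistage syntax just described; the column names `strata1`, `cluster1`, `cluster2`, `fpc1` and `fpc2` are placeholders for illustration:

# two-stage design: stratification only at the first stage ("I" marks
# the unstratified second stage), clusters at both stages, and one fpc
# column per stage giving the number of PSUs
dat_boot <- rescaled.bootstrap(
  dat, REP = 250,
  strata  = "strata1>I",
  cluster = "cluster1>cluster2",
  fpc     = "fpc1>fpc2"
)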
#'
-#' @return returns the complete data set including the bootstrap replicates or just the bootstrap replicates, depending on `return.value="data"` or `return.value="replicates"` respectively.
+#' @return returns the complete data set including the bootstrap replicates or
+#'   just the bootstrap replicates, depending on `return.value="data"` or
+#'   `return.value="replicates"` respectively.
#' @export rescaled.bootstrap
#'
-#' @references Preston, J. (2009). Rescaled bootstrap for stratified multistage sampling. Survey Methodology. 35. 227-234.
+#' @references Preston, J. (2009). Rescaled bootstrap for stratified multistage
+#'   sampling. Survey Methodology. 35. 227-234.
#'
#' @author Johannes Gussenbauer, Statistics Austria
#'
diff --git a/man/calc.stError.Rd b/man/calc.stError.Rd
index 79db658..6707b28 100644
--- a/man/calc.stError.Rd
+++ b/man/calc.stError.Rd
@@ -2,7 +2,8 @@
% Please edit documentation in R/calc.stError.R
\name{calc.stError}
\alias{calc.stError}
-\title{Calcualte point estimates and their standard errors using bootstrap weights.}
+\title{Calculate point estimates and their standard errors using bootstrap
+weights.}
\usage{
calc.stError(dat, weights = attr(dat, "weights"), b.weights = attr(dat,
  "b.rep"), period = attr(dat, "period"), var, fun = weightedRatio,
@@ -12,65 +13,105 @@ calc.stError(dat, weights = attr(dat, "weights"), b.weights = attr(dat,
  add.arg = NULL)
}
\arguments{
-\item{dat}{either data.frame or data.table containing the survey data. Surveys can be a panel survey or rotating panel survey, but does not need to be. For rotating panel survey bootstrap weights can be created using \link{draw.bootstrap} and \link{recalib}.}
+\item{dat}{either data.frame or data.table containing the survey data.
+Surveys can be a panel survey or rotating panel survey, but does not need
+to be. For rotating panel survey bootstrap weights can be created using
+\link{draw.bootstrap} and \link{recalib}.}

-\item{weights}{character specifying the name of the column in \code{dat} containing the original sample weights. Used to calculate point estimates.}
+\item{weights}{character specifying the name of the column in \code{dat}
+containing the original sample weights. Used to calculate point estimates.}

-\item{b.weights}{character vector specifying the names of the columns in \code{dat} containing bootstrap weights. Used to calculate standard errors.}
+\item{b.weights}{character vector specifying the names of the columns in
+\code{dat} containing bootstrap weights. Used to calculate standard errors.}

-\item{period}{character specifying the name of the column in \code{dat} containing the sample periods.}
+\item{period}{character specifying the name of the column in \code{dat}
+containing the sample periods.}

-\item{var}{character vector containing variable names in \code{dat} on which \code{fun} shall be applied for each sample period.}
+\item{var}{character vector containing variable names in \code{dat} on which \code{fun}
+shall be applied for each sample period.}

\item{fun}{function which will be applied on \code{var} for each sample period.
-Predefined functions are \link{weightedRatio}, \link{weightedSum}, but can also take any other function which returns a double or integer and uses weights as its second argument.}
-
-\item{national}{boolean, if TRUE point estimates resulting from fun will be divided by the point estimate at the national level.}
-
-\item{group}{character vectors or list of character vectors containig variables in \code{dat}. For each list entry \code{dat} will be split in subgroups according to the containing variables as well as \code{period}.
-The pointestimates are then estimated for each subgroup seperately. If \code{group=NULL} the data will split into sample periods by default.}
-
-\item{fun.adjust.var}{can be either \code{NULL} or a function. This argument can be used to apply a function for each \code{period} and bootstrap weight to the data. The resulting estimates will be passed down to \code{fun}. See details for more explanations.}
-
-\item{adjust.var}{can be either \code{NULL} or a character specifying the first argument in \code{fun.adjust.var}.}
-
-\item{period.diff}{character vectors, defining periods for which the differences in the point estimate as well it's standard error is calculated. Each entry must have the form of \code{"period1 - period2"}. Can be NULL}
-
-\item{period.mean}{odd integer, defining the range of periods over which the sample mean of point estimates is additionally calcualted.}
-
-\item{bias}{boolean, if \code{TRUE} the sample mean over the point estimates of the bootstrap weights is returned.}
-
-\item{size.limit}{integer defining a lower bound on the number of observations on \code{dat} in each group defined by \code{period} and the entries in \code{group}.
-Warnings are returned if the number of observations in a subgroup falls below \code{size.limit}. In addition the concerned groups are available in the function output.}
-
-\item{cv.limit}{non-negativ value defining a upper bound for the standard error in relation to the point estimate. If this relation exceed \code{cv.limit}, for a point estimate, they are flagged and available in the function output.}
-
-\item{p}{numeric vector containing values between 0 and 1. Defines which quantiles for the distribution of \code{var} are additionally estimated.}
-
-\item{add.arg}{additional arguments which will be passed to fun. Can be either a named list or vector. The names of the object correspond to the function arguments and the values to column names in dat, see also examples.}
+Predefined functions are \link{weightedRatio}, \link{weightedSum}, but can also take
+any other function which returns a double or integer and uses weights as
+its second argument.}
+
+\item{national}{boolean, if TRUE point estimates resulting from fun will be
+divided by the point estimate at the national level.}
+
+\item{group}{character vectors or list of character vectors containing
+variables in \code{dat}. For each list entry \code{dat} will be split in subgroups
+according to the containing variables as well as \code{period}. The
+point estimates are then estimated for each subgroup separately. If
+\code{group=NULL} the data will split into sample periods by default.}
+
+\item{fun.adjust.var}{can be either \code{NULL} or a function. This argument can
+be used to apply a function for each \code{period} and bootstrap weight to the
+data. The resulting estimates will be passed down to \code{fun}. See details for
+more explanations.}
+
+\item{adjust.var}{can be either \code{NULL} or a character specifying the first
+argument in \code{fun.adjust.var}.}
+
+\item{period.diff}{character vectors, defining periods for which the
+differences in the point estimate as well as its standard error are
+calculated. Each entry must have the form of \code{"period1 - period2"}. Can be
Can be +NULL} + +\item{period.mean}{odd integer, defining the range of periods over which the +sample mean of point estimates is additionally calcualted.} + +\item{bias}{boolean, if \code{TRUE} the sample mean over the point estimates of +the bootstrap weights is returned.} + +\item{size.limit}{integer defining a lower bound on the number of +observations on \code{dat} in each group defined by \code{period} and the entries in +\code{group}. Warnings are returned if the number of observations in a subgroup +falls below \code{size.limit}. In addition the concerned groups are available in +the function output.} + +\item{cv.limit}{non-negativ value defining a upper bound for the standard +error in relation to the point estimate. If this relation exceed +\code{cv.limit}, for a point estimate, they are flagged and available in the +function output.} + +\item{p}{numeric vector containing values between 0 and 1. Defines which +quantiles for the distribution of \code{var} are additionally estimated.} + +\item{add.arg}{additional arguments which will be passed to fun. Can be +either a named list or vector. The names of the object correspond to the +function arguments and the values to column names in dat, see also +examples.} } \value{ Returns a list containing: \itemize{ -\item \code{Estimates}: data.table containing period differences and/or k period averages for estimates of -\code{fun} applied to \code{var} as well as the corresponding standard errors, which are calculated using the bootstrap weights. -In addition the sample size, \code{n}, and poplutaion size for each group is added to the output. -\item \code{smallGroups}: data.table containing groups for which the number of observation falls below \code{size.limit}. -\item \code{cvHigh}: data.table containing a boolean variable which indicates for each estimate if the -estimated standard error exceeds \code{cv.limit}. -\item \code{stEDecrease}: data.table indicating for each estimate the theoretical increase in sample size which is -gained when averaging over k periods. Only returned if \code{period.mean} is not \code{NULL}. +\item \code{Estimates}: data.table containing period differences and/or k period +averages for estimates of +\code{fun} applied to \code{var} as well as the corresponding standard errors, which +are calculated using the bootstrap weights. In addition the sample size, +\code{n}, and poplutaion size for each group is added to the output. +\item \code{smallGroups}: data.table containing groups for which the number of +observation falls below \code{size.limit}. +\item \code{cvHigh}: data.table containing a boolean variable which indicates for each +estimate if the estimated standard error exceeds \code{cv.limit}. +\item \code{stEDecrease}: data.table indicating for each estimate the theoretical +increase in sample size which is gained when averaging over k periods. Only +returned if \code{period.mean} is not \code{NULL}. } } \description{ -Calculate point estimates as well as standard errors of variables in surveys. Standard errors are estimated using bootstrap weights (see \link{draw.bootstrap} and \link{recalib}). -In addition the standard error of an estimate can be calcualted using the survey data for 3 or more consecutive periods, which results in a reduction of the standard error. +Calculate point estimates as well as standard errors of variables in surveys. +Standard errors are estimated using bootstrap weights (see \link{draw.bootstrap} +and \link{recalib}). 
+calculated using the survey data for 3 or more consecutive periods, which
+results in a reduction of the standard error.
}
\details{
-\code{calc.stError} takes survey data (\code{dat}) and returns point estimates as well as their standard Errors
-defined by \code{fun} and \code{var} for each sample period in \code{dat}.
-\code{dat} must be household data where household members correspond to multiple rows with the same household identifier.
-The data should at least containt the following columns:
+\code{calc.stError} takes survey data (\code{dat}) and returns point estimates
+as well as their standard errors defined by \code{fun} and \code{var} for each sample
+period in \code{dat}. \code{dat} must be household data where household members
+correspond to multiple rows with the same household identifier. The data
+should at least contain the following columns:
\itemize{
\item Column indicating the sample period;
\item Column indicating the household ID;
@@ -79,53 +120,106 @@ The data should at least containt the following columns:
\item Columns listed in \code{var} as well as in \code{group}
}

-For each variable in \code{var} as well as sample period the function \code{fun} is applied using the original as well as the bootstrap sample weights.\cr
-The point estimate is then selected as the result of \code{fun} when using the original sample weights and it's standard error is estimated with the result of \code{fun} using the bootstrap sample weights. \cr
+For each variable in \code{var} as well as sample period the function \code{fun} is
+applied using the original as well as the bootstrap sample weights.\cr
+The point estimate is then selected as the result of \code{fun} when using the
+original sample weights and its standard error is estimated with the result
+of \code{fun} using the bootstrap sample weights. \cr
\cr
-\code{fun} can be any function which returns a double or integer and uses sample weights as it's second argument. The predifined options are \code{weightedRatio} and \code{weightedSum}.\cr
+\code{fun} can be any function which returns a double or integer and uses sample
+weights as its second argument. The predefined options are \code{weightedRatio}
+and \code{weightedSum}.\cr
\cr
-For the option \code{weightedRatio} a weighted ratio (in \%) of \code{var} is calculated for \code{var} equal to 1, e.g \code{sum(weight[var==1])/sum(weight[!is.na(var)])*100}.\cr
-Additionally using the option \code{national=TRUE} the weighted ratio (in \%) is divided by the weighted ratio at the national level for each \code{period}.
+For the option \code{weightedRatio} a weighted ratio (in \%) of \code{var} is
+calculated for \code{var} equal to 1, e.g.
+\code{sum(weight[var==1])/sum(weight[!is.na(var)])*100}.\cr
+Additionally using the option \code{national=TRUE} the weighted ratio (in \%) is
+divided by the weighted ratio at the national level for each \code{period}.
\cr
-If \code{group} is not \code{NULL} but a vector of variables from \code{dat} then \code{fun} is applied on each subset of \code{dat} defined by all combinations of values in \code{group}.\cr
-For instance if \code{group = "sex"} with "sex" having the values "Male" and "Female" in \code{dat} the point estimate and standard error is calculated on the subsets of \code{dat} with only "Male" or "Female" value for "sex". This is done for each value of \code{period}.
-For variables in \code{group} which have \code{NA}s in \code{dat} the rows containing the missings will be discarded. \cr
-When \code{group} is a list of character vectors, subsets of \code{dat} and the following estimation of the point estimate, including the estimate for the standard error, are calculated for each list entry.\cr
+If \code{group} is not \code{NULL} but a vector of variables from \code{dat} then \code{fun} is
+applied on each subset of \code{dat} defined by all combinations of values in
+\code{group}.\cr
+For instance if \code{group = "sex"} with "sex" having the values "Male" and
+"Female" in \code{dat} the point estimate and standard error is calculated on the
+subsets of \code{dat} with only "Male" or "Female" value for "sex". This is done
+for each value of \code{period}. For variables in \code{group} which have \code{NA}s in
+\code{dat} the rows containing the missings will be discarded. \cr
+When \code{group} is a list of character vectors, subsets of \code{dat} and the
+following estimation of the point estimate, including the estimate for the
+standard error, are calculated for each list entry.\cr
\cr
-The optional parameters \code{fun.adjust.var} and \code{adjust.var} can be used if the values in \code{var} are dependent on the \code{weights}. As is for instance the case for the poverty thershhold calculated from EU-SILC.
-In such a case an additional function can be supplied using \code{fun.adjust.var} as well as its first argument \code{adjust.var}, which needs to be part of the data set \code{dat}. Then, before applying \code{fun} on variable \code{var}
-for all \code{period} and groups, the function \code{fun.adjust.var} is applied to \code{adjust.var} using each of the bootstrap weights seperately (NOTE: weight is used as the second argument of \code{fun.adjust.var}).
+The optional parameters \code{fun.adjust.var} and \code{adjust.var} can be used if the
+values in \code{var} are dependent on the \code{weights}. As is for instance the case
+for the poverty threshold calculated from EU-SILC.
+In such a case an additional function can be supplied using \code{fun.adjust.var}
+as well as its first argument \code{adjust.var}, which needs to be part of the
+data set \code{dat}. Then, before applying \code{fun} on variable \code{var}
+for all \code{period} and groups, the function \code{fun.adjust.var} is applied to
+\code{adjust.var} using each of the bootstrap weights separately (NOTE: weight is
+used as the second argument of \code{fun.adjust.var}).
Thus creating i=1,...,\code{length(b.weights)} additional variables.
-For applying \code{fun} on \code{var} the estimates for the bootstrap replicate will now use each of the corresponding new additional variables. So instead of
+For applying \code{fun} on \code{var} the estimates for the bootstrap replicate will
+now use each of the corresponding new additional variables. So instead of
\deqn{fun(var,weights,...),fun(var,b.weights[1],...),fun(var,b.weights[2],...),...}
the function \code{fun} will be applied in the way
\deqn{fun(var,weights,...),fun(var.1,b.weights[1],...),fun(var.2,b.weights[2],...),...}
-where \code{var.1}, \code{var.2}, \code{...} correspond to the estimates resulting from \code{fun.adjust.var} and \code{adjust.var}.
-NOTE: This procedure is especially usefull if the \code{var} is dependent on \code{weights} and \code{fun} is applied on subgroups of the data set. Then it is not possible to capture this
-procedure with \code{fun} and \code{var}, see examples for a more hands on explanation.
+where \code{var.1}, \code{var.2}, \code{...} correspond to the estimates resulting from
+\code{fun.adjust.var} and \code{adjust.var}.
+NOTE: This procedure is especially useful if the \code{var} is dependent on
+\code{weights} and \code{fun} is applied on subgroups of the data set. Then it is not
+possible to capture this procedure with \code{fun} and \code{var}, see examples for a
+more hands-on explanation.
\cr
-When defining \code{period.diff} the difference of point estimates between periods as well their standard errors are calculated.\cr
-The entries in \code{period.diff} must have the form of \code{"period1 - period2"} which means that the results of the point estimates for \code{period2} will be substracted from the results of the point estimates for \code{period1}.\cr
+When defining \code{period.diff} the difference of point estimates between periods
+as well as their standard errors are calculated.\cr
+The entries in \code{period.diff} must have the form of \code{"period1 - period2"}
+which means that the results of the point estimates for \code{period2} will be
+subtracted from the results of the point estimates for \code{period1}.\cr
\cr
-Specifying \code{period.mean} leads to an improvement in standard error by averaging the results for the point estimates, using the bootstrap weights, over \code{period.mean} periods.
-Setting, for instance, \code{period.mean = 3} the results in averaging these results over each consecutive set of 3 periods.\cr
-Estimating the standard error over these averages gives an improved estimate of the standard error for the central period, which was used for averaging.\cr
-The averaging of the results is also applied in differences of point estimates. For instance defining \code{period.diff = "2015-2009"} and \code{period.mean = 3}
-the differences in point estimates of 2015 and 2009, 2016 and 2010 as well as 2017 and 2011 are calcualated and finally the average over these 3 differences is calculated.
-The periods set in \code{period.diff} are always used as starting periods from which \code{period.mean-1} consecutive periods are used to build the average.
+Specifying \code{period.mean} leads to an improvement in standard error by
+averaging the results for the point estimates, using the bootstrap weights,
+over \code{period.mean} periods.
+Setting, for instance, \code{period.mean = 3} results in averaging these
+results over each consecutive set of 3 periods.\cr
+Estimating the standard error over these averages gives an improved estimate
+of the standard error for the central period, which was used for
+averaging.\cr
+The averaging of the results is also applied in differences of point
+estimates. For instance defining \code{period.diff = "2015-2009"} and
+\code{period.mean = 3}
+the differences in point estimates of 2015 and 2009, 2016 and 2010 as well as
+2017 and 2011 are calculated and finally the average over these 3
+differences is calculated.
+The periods set in \code{period.diff} are always used as starting periods from
+which \code{period.mean-1} consecutive periods are used to build the average.
\cr
-Setting \code{bias} to \code{TRUE} returns the calculation of a mean over the results from the bootstrap replicates. In the output the corresponding columns is labeled \emph{_mean} at the end.\cr
+Setting \code{bias} to \code{TRUE} returns the calculation of a mean over the results
+from the bootstrap replicates. In the output the corresponding column is
+labeled \emph{_mean} at the end.\cr
\cr
-If \code{fun} needs more arguments they can be supplied in \code{add.arg}. This can either be a named list or vector.\cr
+If \code{fun} needs more arguments they can be supplied in \code{add.arg}. This can
+either be a named list or vector.\cr
\cr
-The parameter \code{size.limit} indicates a lower bound of the sample size for subsets in \code{dat} created by \code{group}. If the sample size of a subset falls below \code{size.limit} a warning will be displayed.\cr
-In addition all subsets for which this is the case can be selected from the output of \code{calc.stError} with \code{$smallGroups}.\cr
-With the parameter \code{cv.limit} one can set an upper bound on the coefficient of variantion. Estimates which exceed this bound are flagged with \code{TRUE} and are available in the function output with \code{$cvHigh}.
-\code{cv.limit} must be a positive integer and is treated internally as \%, e.g. for \code{cv.limit=1} the estimate will be flagged if the coefficient of variantion exceeds 1\%.\cr
+The parameter \code{size.limit} indicates a lower bound of the sample size for
+subsets in \code{dat} created by \code{group}. If the sample size of a subset falls
+below \code{size.limit} a warning will be displayed.\cr
+In addition all subsets for which this is the case can be selected from the
+output of \code{calc.stError} with \code{$smallGroups}.\cr
+With the parameter \code{cv.limit} one can set an upper bound on the coefficient
+of variation. Estimates which exceed this bound are flagged with \code{TRUE} and
+are available in the function output with \code{$cvHigh}.
+\code{cv.limit} must be a positive integer and is treated internally as \%, e.g.
+for \code{cv.limit=1} the estimate will be flagged if the coefficient of
+variation exceeds 1\%.\cr
\cr
-When specifying \code{period.mean}, the decrease in standard error for choosing this method is internally calcualted and a rough estimate for an implied increase in sample size is available in the output with \code{$stEDecrease}.
-The rough estimate for the increase in sample size uses the fact that for a sample of size \eqn{n} the sample estimate for the standard error of most point estimates converges with a factor \eqn{1/\sqrt{n}} against the true standard error \eqn{\sigma}.
+When specifying \code{period.mean}, the decrease in standard error for choosing
+this method is internally calculated and a rough estimate for an implied
+increase in sample size is available in the output with \code{$stEDecrease}.
+The rough estimate for the increase in sample size uses the fact that for a
+sample of size \eqn{n} the sample estimate for the standard error of most
+point estimates converges with a factor \eqn{1/\sqrt{n}} against the true
+standard error \eqn{\sigma}.
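The 1/sqrt(n) relationship can be made concrete with a small numerical sketch; all values below are hypothetical:

# se ~ sigma / sqrt(n), so shrinking the estimated standard error from
# se_single to se_avg by averaging over periods corresponds roughly to
# an implied sample size of n * (se_single / se_avg)^2
n         <- 5000   # hypothetical group sample size
se_single <- 0.80   # standard error from a single period
se_avg    <- 0.55   # standard error after averaging over k periods
n_implied <- n * (se_single / se_avg)^2  # roughly 10578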
}
\examples{
# Import data and calibrate
@@ -138,37 +232,45 @@
dat_boot_calib <- recalib(dat_boot, conP.var = "gender", conH.var = "region")

# estimate weightedRatio for povertyRisk per period
-err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio)
+err.est <- calc.stError(dat_boot_calib, var = "povertyRisk",
+                        fun = weightedRatio)
err.est$Estimates

-# calculate weightedRatio for povertyRisk and fraction of one-person households per period
+# calculate weightedRatio for povertyRisk and fraction of one-person
+# households per period
dat_boot_calib[, onePerson := .N == 1, by = .(year, hid)]
-err.est <- calc.stError(dat_boot_calib, var = c("povertyRisk", "onePerson"), fun = weightedRatio)
+err.est <- calc.stError(dat_boot_calib, var = c("povertyRisk", "onePerson"),
+                        fun = weightedRatio)
err.est$Estimates

# estimate weightedRatio for povertyRisk per period and gender
group <- "gender"
-err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio, group = group)
+err.est <- calc.stError(dat_boot_calib, var = "povertyRisk",
+                        fun = weightedRatio, group = group)
err.est$Estimates

-# estimate weightedRatio for povertyRisk per period and gender, region and combination of both
+# estimate weightedRatio for povertyRisk per period and gender, region and
+# combination of both
group <- list("gender", "region", c("gender", "region"))
-err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio, group = group)
+err.est <- calc.stError(dat_boot_calib, var = "povertyRisk",
+                        fun = weightedRatio, group = group)
err.est$Estimates

# use average over 3 periods for standard error estimation
-err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio, period.mean = 3)
+err.est <- calc.stError(dat_boot_calib, var = "povertyRisk",
+                        fun = weightedRatio, period.mean = 3)
err.est$Estimates

# get estimate for difference of period 2015 and 2011
period.diff <- c("2015-2011")
-err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio,
-                        period.diff = period.diff, period.mean = 3)
+err.est <- calc.stError(
+  dat_boot_calib, var = "povertyRisk", fun = weightedRatio,
+  period.diff = period.diff, period.mean = 3)
err.est$Estimates

# use add.arg-argument
fun <- function(x, w, b) {
  sum(x*w*b)
}
add.arg = list(b="onePerson")
@@ -181,10 +283,12 @@
err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = fun,
                        period.mean = 0, add.arg=add.arg)
err.est$Estimates
# compare with direct computation
-compare.value <- dat_boot_calib[,fun(povertyRisk,pWeight,b=onePerson),by=c("year")]
+compare.value <- dat_boot_calib[,fun(povertyRisk,pWeight,b=onePerson),
+                                by=c("year")]
all((compare.value$V1-err.est$Estimates$val_povertyRisk)==0)

-# use a function from an other package that has sampling weights as its second argument
+# use a function from another package that has sampling weights as its
+# second argument
# for example gini() from laeken
library(laeken)

@@ -194,15 +298,18 @@ help_gini <- function(x, w){
  return(gini(x, w)$value)
}

-## make sure povertyRisk get coerced to a numeric in order to work with the external functions
+## make sure povertyRisk gets coerced to a numeric in order to work with the
+## external functions
invisible(dat_boot_calib[, povertyRisk := as.numeric(povertyRisk)])
-err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = help_gini, group = group,
-                        period.diff = period.diff, period.mean = 3)
+err.est <- calc.stError(
+  dat_boot_calib, var = "povertyRisk", fun = help_gini, group = group,
+  period.diff = period.diff, period.mean = 3)
err.est$Estimates

# using fun.adjust.var and adjust.var to estimate povmd60 indicator
-# for each period and bootstrap weight before applying the weightedRatio point estimate
+# for each period and bootstrap weight before applying the weightedRatio
+# point estimate

# this function estimates the povmd60 indicator with x as income vector
# and w as weight vector
@@ -216,9 +323,9 @@ povmd <- function(x, w){
# the povmd60 indicator for each bootstrap weight
# and the resultung indicators are passed to function weightedRatio

-err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio,
-                        group = group, fun.adjust.var = povmd, adjust.var = "eqIncome",
-                        period.mean = 3)
+err.est <- calc.stError(
+  dat_boot_calib, var = "povertyRisk", fun = weightedRatio, group = group,
+  fun.adjust.var = povmd, adjust.var = "eqIncome", period.mean = 3)
err.est$Estimates

# why fun.adjust.var and adjust.var are needed (!!!):
@@ -236,9 +343,9 @@ povmd2 <- function(x, w){
# but this results in different results in subgroups
# compared to using fun.adjust.var and adjust.var

-err.est.different <- calc.stError(dat_boot_calib, var = "eqIncome", fun = povmd2,
-                                  group = group, fun.adjust.var = NULL, adjust.var = NULL,
-                                  period.mean = 3)
+err.est.different <- calc.stError(
+  dat_boot_calib, var = "eqIncome", fun = povmd2, group = group,
+  fun.adjust.var = NULL, adjust.var = NULL, period.mean = 3)
err.est.different$Estimates

## results are equal for yearly estimates
diff --git a/man/computeFrac.Rd b/man/computeFrac.Rd
index 277b9cc..a357c98 100644
--- a/man/computeFrac.Rd
+++ b/man/computeFrac.Rd
@@ -22,23 +22,24 @@ computeFrac(curValue, target, x, w)

\item{w}{Vector of weights}

-\item{boundLinear}{The output \code{f} will satisfy \code{1/boundLinear <= f <= boundLinear}. See \code{bound} in \link{ipf}}
+\item{boundLinear}{The output \code{f} will satisfy
+\code{1/boundLinear <= f <= boundLinear}. See \code{bound} in \link{ipf}}
}
\value{
A weight multiplier \code{f}
}
\description{
-Customize weight-updating within factor levels in case of numerical calibration. The
-functions described here serve as inputs for \link{ipf}.
+Customize weight-updating within factor levels in case of numerical
+calibration. The functions described here serve as inputs for \link{ipf}.
}
\details{
\code{computeFrac} provides the "standard" IPU updating scheme given as
\deqn{f = target/curValue}
-which means that each weight inside the level will be multtiplied by the same factor when
-doing the actual update step (\code{w := f*w}). \code{computeLinear} on the other hand
-calculates \code{f} as
+which means that each weight inside the level will be multiplied by the same
+factor when doing the actual update step (\code{w := f*w}). \code{computeLinear} on the
+other hand calculates \code{f} as
\ifelse{html}{
\out{
fi = a · xi + b
} @@ -53,5 +54,6 @@ where \code{a} and \code{b} are chosen, so f satisfies the following two equatio \out{
∑ fi wi = ∑ wi
}
}{\deqn{\sum f_i * w_i = \sum w_i}}
-\code{computeLinearG1} calculates \code{f} in the same way as \code{computeLinear}, but if \code{f_i*w_i<1} \code{f_i} will be set to \code{1/w_i}.
+\code{computeLinearG1} calculates \code{f} in the same way as \code{computeLinear}, but if
+\code{f_i*w_i<1} \code{f_i} will be set to \code{1/w_i}.
}
diff --git a/man/demo.eusilc.Rd b/man/demo.eusilc.Rd
index c534403..bed7acc 100644
--- a/man/demo.eusilc.Rd
+++ b/man/demo.eusilc.Rd
@@ -9,28 +9,33 @@ demo.eusilc(n = 8, prettyNames = FALSE)
\arguments{
\item{n}{Number of years to generate. Should be at least 1}

-\item{prettyNames}{Create easy-to-read names for certain variables. Recommended for demonstration
-purposes. Otherwise, use the original codes documented in \link[laeken:eusilc]{laeken::eusilc}.}
+\item{prettyNames}{Create easy-to-read names for certain variables.
+Recommended for demonstration purposes. Otherwise, use the original codes
+documented in \link[laeken:eusilc]{laeken::eusilc}.}
}
\description{
-Create a dummy dataset to be used for demonstrating the functionalities of the \code{surveysd} package
-based on \link[laeken:eusilc]{laeken::eusilc}. Please refer to the documentation page of the original data for
-details about the variables.
+Create a dummy dataset to be used for demonstrating the functionalities of
+the \code{surveysd} package based on \link[laeken:eusilc]{laeken::eusilc}. Please refer to the
+documentation page of the original data for details about the variables.
}
\details{
-If \code{prettyNames} is \code{TRUE}, the following variables will be available in an easy-to-read manner.
+If \code{prettyNames} is \code{TRUE}, the following variables will be available in an
+easy-to-read manner.
\itemize{
-\item \code{hid} Household id. Consistent with respect to the reference period (\code{year})
+\item \code{hid} Household id. Consistent with respect to the reference period
+(\code{year})
\item \code{hsize} Size of the household. derived from \code{hid} and \code{period}
\item \code{region} Federal state of austria where the household is located
\item \code{pid} Personal id. Consistent with respect to the reference period (\code{year})
\item \code{age} Age-class of the respondent
\item \code{gender} A persons gender (\code{"male"}, \code{"Female"})
-\item \code{ecoStat} Ecnomic status (\code{"part time"}, \code{"full time"}, \code{"unemployed"}, ...)
+\item \code{ecoStat} Economic status
+(\code{"part time"}, \code{"full time"}, \code{"unemployed"}, ...)
\item \code{citizenship} Citizenship (\code{"AT"}, \code{"EU"}, \code{"other"})
\item \code{pWeight} Personal sample weight inside the reference period
\item \code{year}. Simulated reference period
-\item \code{povertyRisk}. Logical variable determining whether a respondent is at risk of poverty
+\item \code{povertyRisk}.
Logical variable determining whether a respondent is at risk +of poverty } } \examples{ diff --git a/man/draw.bootstrap.Rd b/man/draw.bootstrap.Rd index 9930bc9..bbe6577 100644 --- a/man/draw.bootstrap.Rd +++ b/man/draw.bootstrap.Rd @@ -10,102 +10,162 @@ draw.bootstrap(dat, REP = 1000, hid = NULL, weights, period = NULL, pid = NULL, new.method = FALSE) } \arguments{ -\item{dat}{either data.frame or data.table containing the survey data with rotating panel design.} +\item{dat}{either data.frame or data.table containing the survey data with +rotating panel design.} \item{REP}{integer indicating the number of bootstrap replicates.} -\item{hid}{character specifying the name of the column in \code{dat} containing the household id. If -\code{NULL} (the default), the household structure is not regarded.} - -\item{weights}{character specifying the name of the column in \code{dat} containing the sample weights.} - -\item{period}{character specifying the name of the column in \code{dat} containing the sample periods. -If \code{NULL} (the default), it is assumed that all observations belong to the same -period.} - -\item{strata}{character vector specifying the name(s) of the column in \code{dat} by which the population -was stratified. If \code{strata} is a vector stratification will be assumed as the combination -of column names contained in \code{strata}. Setting in addition \code{cluster} not NULL stratification -will be assumed on multiple stages, where each additional entry in \code{strata} specifies the -stratification variable for the next lower stage. see Details for more information.} - -\item{cluster}{character vector specifying cluster in the data. If not already specified in \code{cluster} household ID is taken es the lowest level cluster.} - -\item{totals}{character specifying the name of the column in \code{dat} containing the the totals per strata and/or cluster. Is ONLY optional if \code{cluster} is \code{NULL} or equal \code{hid} and \code{strata} contains one columnname! -Then the households per strata will be calcualted using the \code{weights} argument. If clusters and strata for multiple stages are specified \code{totals} needs to be a vector of \code{length(strata)} specifying the column on \code{dat} -that contain the total number of PSUs at each stage. \code{totals} is interpreted from left the right, meaning that the first argument corresponds to the number of PSUs at the first and the last argument to the number of PSUs at the last stage.} - -\item{single.PSU}{either "merge" or "mean" defining how single PSUs need to be dealt with. -For \code{single.PSU="merge"} single PSUs at each stage are merged with the strata or cluster with the next least number of PSUs. If multiple of those exist one will be select via random draw -For \code{single.PSU="mean"} single PSUs will get the mean over all bootstrap replicates at the stage which did not contain single PSUs.} - -\item{boot.names}{character indicating the leading string of the column names for each bootstrap replica. If NULL defaults to "w".} - -\item{split}{logical, if TRUE split households are considered using \code{pid}, for more information see Details.} - -\item{pid}{column in \code{dat} specifying the personal identifier. This identifier needs to be unique for each person throught the whole data set.} - -\item{new.method}{logical, if TRUE bootstrap replicates will never be negative even if in some strata the whole population is in the sample. 
WARNING: This is still experimental and resulting standard errors might be underestimated! Use this if for some strata the whole population is in the sample!}
+\item{hid}{character specifying the name of the column in \code{dat} containing
+the household id. If \code{NULL} (the default), the household structure is not
+regarded.}
+
+\item{weights}{character specifying the name of the column in \code{dat}
+containing the sample weights.}
+
+\item{period}{character specifying the name of the column in \code{dat} containing
+the sample periods. If \code{NULL} (the default), it is assumed that all
+observations belong to the same period.}
+
+\item{strata}{character vector specifying the name(s) of the column in \code{dat}
+by which the population was stratified. If \code{strata} is a vector
+stratification will be assumed as the combination of column names contained
+in \code{strata}. Setting in addition \code{cluster} not NULL stratification will be
+assumed on multiple stages, where each additional entry in \code{strata}
+specifies the stratification variable for the next lower stage. See Details
+for more information.}
+
+\item{cluster}{character vector specifying cluster in the data. If not
+already specified in \code{cluster} household ID is taken as the lowest level
+cluster.}
+
+\item{totals}{character specifying the name of the column in \code{dat} containing
+the totals per strata and/or cluster. Is ONLY optional if \code{cluster} is
+\code{NULL} or equal \code{hid} and \code{strata} contains one column name! Then the
+households per strata will be calculated using the \code{weights} argument. If
+clusters and strata for multiple stages are specified \code{totals} needs to be
+a vector of \code{length(strata)} specifying the column in \code{dat} that contains
+the total number of PSUs at each stage. \code{totals} is interpreted from left
+to right, meaning that the first argument corresponds to the number of
+PSUs at the first and the last argument to the number of PSUs at the last
+stage.}
+
+\item{single.PSU}{either "merge" or "mean" defining how single PSUs need to
+be dealt with. For \code{single.PSU="merge"} single PSUs at each stage are
+merged with the strata or cluster with the next least number of PSUs. If
+multiple of those exist one will be selected via random draw. For
+\code{single.PSU="mean"} single PSUs will get the mean over all bootstrap
+replicates at the stage which did not contain single PSUs.}
+
+\item{boot.names}{character indicating the leading string of the column names
+for each bootstrap replica. If NULL defaults to "w".}
+
+\item{split}{logical, if TRUE split households are considered using \code{pid},
+for more information see Details.}
+
+\item{pid}{column in \code{dat} specifying the personal identifier. This
+identifier needs to be unique for each person throughout the whole data set.}
+
+\item{new.method}{logical, if TRUE bootstrap replicates will never be
+negative even if in some strata the whole population is in the sample.
+WARNING: This is still experimental and resulting standard errors might be
+underestimated! Use this if for some strata the whole population is in the
+sample!}
}
\value{
-the survey data with the number of REP bootstrap replicates added as columns.
-
-Returns a data.table containing the original data as well as the number of \code{REP} columns containing the bootstrap replicates for each repetition.\cr
-The columns of the bootstrap replicates are by default labeled "w\emph{Number}" where \emph{Number} goes from 1 to \code{REP}.
-If the column names of the bootstrap replicates should start with a different character or string the parameter \code{boot.names} can be used.
+the survey data with the number of REP bootstrap replicates added as
+columns.
+
+Returns a data.table containing the original data as well as the
+number of \code{REP} columns containing the bootstrap replicates for each
+repetition.\cr
+The columns of the bootstrap replicates are by default labeled "w\emph{Number}"
+where \emph{Number} goes from 1 to \code{REP}. If the column names of the bootstrap
+replicates should start with a different character or string the parameter
+\code{boot.names} can be used.
}
\description{
-Draw bootstrap replicates from survey data with rotating panel design.
-Survey information, like ID, sample weights, strata and population totals per strata, should be specified to ensure meaningfull survey bootstraping.
+Draw bootstrap replicates from survey data with rotating panel
+design. Survey information, like ID, sample weights, strata and population
+totals per strata, should be specified to ensure meaningful survey
+bootstrapping.
}
\details{
-\code{draw.bootstrap} takes \code{dat} and draws \code{REP} bootstrap replicates from it.
-\code{dat} must be household data where household members correspond to multiple rows with the same household
-identifier. For most practical applications, the following columns should be available in the dataset
+\code{draw.bootstrap} takes \code{dat} and draws \code{REP} bootstrap replicates
+from it.
+\code{dat} must be household data where household members correspond to multiple
+rows with the same household
+identifier. For most practical applications, the following columns should be
+available in the dataset
and passed via the corresponding parameters:
\itemize{
\item Column indicating the sample period (parameter \code{period}).
\item Column indicating the household ID (parameter \code{hid})
\item Column containing the household sample weights (parameter \code{weights});
-\item Columns by which population was stratified during the sampling process (parameter: \code{strata}).
+\item Columns by which population was stratified during the sampling process
+(parameter: \code{strata}).
}
-For single stage sampling design a column the argument \code{totals} is optional, meaning that a column of the number of PSUs at the first stage does not need to be supplied.
-For this case the number of PSUs is calculated and added to \code{dat} using \code{strata} and \code{weights}. By setting \code{cluster} to NULL single stage sampling design is always assumed and
-if \code{strata} contains of multiple column names the combination of all those column names will be used for stratification.
-
-In the case of multi stage sampling design the argument \code{totals} needs to be specified and needs to have the same number of arguments as \code{strata}.
-
-If \code{cluster} is \code{NULL} or does not contain \code{hid} at the last stage, \code{hid} will automatically be used as the final cluster. If, besides \code{hid}, clustering in additional stages is specified the number of column names in
-\code{strata} and \code{cluster} (including \code{hid}) must be the same. If for any stage there was no clustering or stratification one can set "1" or "I" for this stage.
-
-For example \code{strata=c("REGION","I"),cluster=c("MUNICIPALITY","HID")} would speficy a 2 stage sampling design where at the first stage the municipalities where drawn stratified by regions
-and at the 2nd stage housholds are drawn in each municipality without stratification.
-
-Bootstrap replicates are drawn for each survey period (\code{period}) using the function \link{rescaled.bootstrap}.
-Afterwards the bootstrap replicates for each household are carried forward from the first period the household enters the survey to all the censecutive periods it stays in the survey.
-
-This ensures that the bootstrap replicates follow the same logic as the sampled households, making the bootstrap replicates more comparable to the actual sample units.
-
-If \code{split} ist set to \code{TRUE} and \code{pid} is specified, the bootstrap replicates are carried forward using the personal identifiers instead of the houshold identifier.
+For single stage sampling design the argument \code{totals} is optional,
+meaning that a column of the number of PSUs at the first stage does not need
+to be supplied.
+For this case the number of PSUs is calculated and added to \code{dat} using
+\code{strata} and \code{weights}. By setting \code{cluster} to NULL single stage sampling
+design is always assumed and
+if \code{strata} contains multiple column names the combination of all those
+column names will be used for stratification.
+
+In the case of multi stage sampling design the argument \code{totals} needs to be
+specified and needs to have the same number of arguments as \code{strata}.
+
+If \code{cluster} is \code{NULL} or does not contain \code{hid} at the last stage, \code{hid}
+will automatically be used as the final cluster. If, besides \code{hid},
+clustering in additional stages is specified the number of column names in
+\code{strata} and \code{cluster} (including \code{hid}) must be the same. If for any stage
+there was no clustering or stratification one can set "1" or "I" for this
+stage.
+
+For example \code{strata=c("REGION","I"),cluster=c("MUNICIPALITY","HID")} would
+specify a 2-stage sampling design where at the first stage the municipalities
+were drawn stratified by regions
+and at the 2nd stage households are drawn in each municipality without
+stratification.
+
+Bootstrap replicates are drawn for each survey period (\code{period}) using the
+function \link{rescaled.bootstrap}.
+Afterwards the bootstrap replicates for each household are carried forward
+from the first period the household enters the survey to all the consecutive
+periods it stays in the survey.
+
+This ensures that the bootstrap replicates follow the same logic as the
+sampled households, making the bootstrap replicates more comparable to the
+actual sample units.
+
+If \code{split} is set to \code{TRUE} and \code{pid} is specified, the bootstrap replicates
+are carried forward using the personal identifiers instead of the household
+identifier.
This takes into account the issue of a houshold splitting up.
-Any person in this new split household will get the same bootstrap replicate as the person that has come from an other household in the survey.
+Any person in this new split household will get the same bootstrap replicate
+as the person that has come from another household in the survey.
+People who enter already existing households will also get the same bootstrap
+replicate as the other household members had in the previous periods.
}
\examples{
\dontrun{
eusilc <- demo.eusilc(prettyNames = TRUE)

## draw sample without stratification or clustering
-dat_boot <- draw.bootstrap(eusilc, REP = 10, weights = "pWeight", period = "year")
+dat_boot <- draw.bootstrap(eusilc, REP = 10, weights = "pWeight",
+                           period = "year")

## use stratification w.r.t. region and clustering w.r.t. households
-dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid", weights = "pWeight",
-                           strata = "region", period = "year")
+dat_boot <- draw.bootstrap(
+  eusilc, REP = 10, hid = "hid", weights = "pWeight",
+  strata = "region", period = "year")

## use multi-level clustering
-dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid", weights = "pWeight",
-                           strata = c("region", "age"), period = "year")
+dat_boot <- draw.bootstrap(
+  eusilc, REP = 10, hid = "hid", weights = "pWeight",
+  strata = c("region", "age"), period = "year")


# create spit households
@@ -114,21 +174,28 @@ year <- eusilc[, unique(year)]
year <- year[-1]
leaf_out <- c()
for(y in year){
-  split.person <- eusilc[year == (y-1) & !duplicated(hid) & !(hid \%in\% leaf_out),
-                         sample(pid, 20)]
-  overwrite.person <- eusilc[(year == (y)) & !duplicated(hid) & !(hid \%in\% leaf_out),
-                             .(pid = sample(pid, 20))]
+  split.person <- eusilc[
+    year == (y-1) & !duplicated(hid) & !(hid \%in\% leaf_out),
+    sample(pid, 20)
+  ]
+  overwrite.person <- eusilc[
+    (year == (y)) & !duplicated(hid) & !(hid \%in\% leaf_out),
+    .(pid = sample(pid, 20))
+  ]
  overwrite.person[, c("pidsplit", "year_curr") := .(split.person, y)]
-  eusilc[overwrite.person, pidsplit := i.pidsplit, on = .(pid, year >= year_curr)]
+  eusilc[overwrite.person, pidsplit := i.pidsplit,
+         on = .(pid, year >= year_curr)]
  leaf_out <- c(leaf_out,
-               eusilc[pid \%in\% c(overwrite.person$pid, overwrite.person$pidsplit),
+               eusilc[pid \%in\% c(overwrite.person$pid,
+                                 overwrite.person$pidsplit),
                      unique(hid)])
}

-dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid", weights = "pWeight",
-                           strata = c("region", "age"), period = "year", split = TRUE,
-                           pid = "pidsplit")
+dat_boot <- draw.bootstrap(
+  eusilc, REP = 10, hid = "hid", weights = "pWeight",
+  strata = c("region", "age"), period = "year", split = TRUE,
+  pid = "pidsplit")
# split households were considered e.g. household and
# split household were both selected or not selected
dat_boot[, data.table::uniqueN(w1), by = pidsplit][V1 > 1]
@@ -136,7 +203,8 @@

}
\seealso{
-\code{\link[data.table:data.table]{data.table}} for more information on data.table objects.
+\code{\link[data.table:data.table]{data.table}} for more information on
+data.table objects.
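A quick consistency check that follows from the carry-forward logic described above (an editorial sketch building on the example; \code{w1} is the first replicate column created by draw.bootstrap):

# all members of a household should share one replicate weight per period,
# so this should return zero rows
dat_boot[, data.table::uniqueN(w1), by = .(year, hid)][V1 > 1]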
}
\author{
Johannes Gussenbauer, Alexander Kowarik, Statistics Austria
diff --git a/man/generate.HHID.Rd b/man/generate.HHID.Rd
index 7d19c31..b78f45c 100644
--- a/man/generate.HHID.Rd
+++ b/man/generate.HHID.Rd
@@ -2,28 +2,38 @@
% Please edit documentation in R/generateHHID.R
\name{generate.HHID}
\alias{generate.HHID}
-\title{Generate new houshold ID for survey data with rotating panel design taking into account split households}
+\title{Generate new household ID for survey data with rotating panel design taking
+into account split households}
\usage{
generate.HHID(dat, period = "RB010", pid = "RB030", hid = "DB030")
}
\arguments{
\item{dat}{data table of data frame containing the survey data}

-\item{period}{column name of \code{dat} containing an indicator for the rotations, e.g years, quarters, months, ect...}
+\item{period}{column name of \code{dat} containing an indicator for the
+rotations, e.g. years, quarters, months, etc.}

-\item{pid}{column name of \code{dat} containing the personal identifier. This needs to be fixed for an indiviual throught the whole survey}
+\item{pid}{column name of \code{dat} containing the personal identifier. This
+needs to be fixed for an individual throughout the whole survey}

-\item{hid}{column name of \code{dat} containing the household id. This needs to for a household throught the whole survey}
+\item{hid}{column name of \code{dat} containing the household id. This needs
+to be fixed for a household throughout the whole survey}
}
\value{
-the survey data \code{dat} as data.table object containing a new and an old household ID. The new household ID which considers the split households is now named \code{hid} and the original household ID
-has a trailing "_orig".
+the survey data \code{dat} as a data.table object containing a new and
+an old household ID. The new household ID which considers the split
+households is now named \code{hid} and the original household ID has a
+trailing "_orig".
}
\description{
-Generating a new houshold ID for survey data using a houshold ID and a personal ID.
-For surveys with rotating panel design containing housholds, houshold members can move from an existing household to a new one, that was not originally in the sample.
-This leads to the creation of so called split households. Using a peronal ID (that stays fixed over the whole survey), an indicator for different time steps and a houshold ID,
-a new houshold ID is assigned to the original and the split household.
+Generating a new household ID for survey data using a household ID and a
+personal ID.
+For surveys with rotating panel design containing households, household
+members can move from an existing household to a new one, that was not
+originally in the sample. This leads to the creation of so-called split
+households. Using a personal ID (that stays fixed over the whole survey),
+an indicator for different time steps and a household ID, a new household
+ID is assigned to the original and the split household.
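As a quick illustration of this relabelling (an editorial sketch with hypothetical toy data; the column names are made up, behaviour as described above):

library(surveysd)
library(data.table)
# person 3 starts in household 2 and moves into the new household 9 in
# period 2; generate.HHID() should give households 2 and 9 one common new
# id and keep the original id in a column with trailing "_orig"
toy <- data.table(
  year = c(1, 1, 1, 2, 2, 2),
  pid  = c(1, 2, 3, 1, 2, 3),
  hid  = c(1, 2, 2, 1, 2, 9)
)
toy.new <- generate.HHID(toy, period = "year", pid = "pid", hid = "hid")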
}
\examples{
\dontrun{
@@ -45,16 +55,19 @@ for(y in year){
 .(rb030=sample(rb030,20))]
 overwrite.person[,c("rb030split","year_curr"):=.(split.person,y)]
-  eusilc[overwrite.person,rb030split:=i.rb030split,on=.(rb030,year>=year_curr)]
-  leaf_out <- c(leaf_out,
-                eusilc[rb030\%in\%c(overwrite.person$rb030,overwrite.person$rb030split),
-                       unique(db030)])
+  eusilc[overwrite.person,
+         rb030split:=i.rb030split,on=.(rb030,year>=year_curr)]
+  leaf_out <- c(
+    leaf_out,
+    eusilc[rb030\%in\%c(overwrite.person$rb030,overwrite.person$rb030split),
+           unique(db030)])
}

# pid which are in split households
eusilc[,.(uniqueN(db030)),by=list(rb030split)][V1>1]

-eusilc.new <- generate.HHID(eusilc,period="year",pid="rb030split",hid="db030")
+eusilc.new <- generate.HHID(eusilc, period = "year", pid = "rb030split",
+                            hid = "db030")

# no longer any split households in the data
eusilc.new[,.(uniqueN(db030)),by=list(rb030split)][V1>1]
diff --git a/man/ipf.Rd b/man/ipf.Rd
index e734b83..906d3a7 100644
--- a/man/ipf.Rd
+++ b/man/ipf.Rd
@@ -16,8 +16,8 @@ ipf(dat, hid = NULL, conP = NULL, conH = NULL, epsP = 1e-06,
weights (optionally), household and/or personal level variables (numerical
or categorical) that should be fitted.}

-\item{hid}{name of the column containing the household-ids
-within \code{dat} or NULL if such a variable does not exist.}
+\item{hid}{name of the column containing the household-ids within \code{dat} or
+NULL if such a variable does not exist.}

\item{conP}{list or (partly) named list defining the constraints on person
level. The list elements are contingency tables in array representation
@@ -47,10 +47,9 @@ dimensions and dimnames as the corresponding constraint in \code{conH}.}

\item{verbose}{if TRUE, some progress information will be printed.}

-\item{w}{name if the column containing the base
-weights within \code{dat} or NULL if such a variable does not exist. In the
-latter case, every observation in \code{dat} is assigned a starting weight
-of 1.}
+\item{w}{name of the column containing the base weights within \code{dat} or NULL
+if such a variable does not exist. In the latter case, every observation
+in \code{dat} is assigned a starting weight of 1.}

\item{bound}{numeric value specifying the multiplier for determining the
weight trimming boundary if the change of the base weights should be
@@ -61,41 +60,52 @@ and \code{bound}*\code{w}.}
that should be performed.}

\item{meanHH}{if TRUE, every person in a household is assigned the mean of
-the person weights corresponding to the household. If \code{"geometric"}, the geometric mean
-is used rather than the arithmetic mean.}
-
-\item{allPthenH}{if TRUE, all the person level calibration steps are performed before the houshold level calibration steps (and \code{meanHH}, if specified).
-If FALSE, the houshold level calibration steps (and \code{meanHH}, if specified) are performed after everey person level calibration step.
-This can lead to better convergence properties in certain cases but also means that the total number of calibration steps is increased.}
-
-\item{returnNA}{if TRUE, the calibrated weight will be set to NA in case of no convergence.}
-
-\item{looseH}{if FALSE, the actual constraints \code{conH} are used for calibrating all the hh weights.
-If TRUE, only the weights for which the lower and upper thresholds defined by \code{conH} and \code{epsH} are exceeded
-are calibrated. They are however not calibrated against the actual constraints \code{conH} but against
-these lower and upper thresholds, i.e.
\code{conH}-\code{conH}*\code{epsH} and \code{conH}+\code{conH}*\code{epsH}.}
+the person weights corresponding to the household. If \code{"geometric"}, the
+geometric mean is used rather than the arithmetic mean.}
+
+\item{allPthenH}{if TRUE, all the person level calibration steps are
+performed before the household level calibration steps (and \code{meanHH}, if
+specified). If FALSE, the household level calibration steps (and \code{meanHH},
+if specified) are performed after every person level calibration step.
+This can lead to better convergence properties in certain cases but also
+means that the total number of calibration steps is increased.}
+
+\item{returnNA}{if TRUE, the calibrated weight will be set to NA in case of
+no convergence.}
+
+\item{looseH}{if FALSE, the actual constraints \code{conH} are used for
+calibrating all the hh weights. If TRUE, only the weights for which the
+lower and upper thresholds defined by \code{conH} and \code{epsH} are exceeded are
+calibrated. They are however not calibrated against the actual constraints
+\code{conH} but against these lower and upper thresholds, i.e.
+\code{conH}-\code{conH}*\code{epsH} and \code{conH}+\code{conH}*\code{epsH}.}

\item{numericalWeighting}{See \link{numericalWeighting}}

-\item{check_hh_vars}{If \code{TRUE} check for non-unique values inside of a household for variables in
-household constraints}
+\item{check_hh_vars}{If \code{TRUE} check for non-unique values inside of a
+household for variables in household constraints}

-\item{conversion_messages}{show a message, if inputs need to be reformatted. This can be useful for speed
-optimizations if ipf is called several times with similar inputs (for example bootstrapping)}
+\item{conversion_messages}{show a message, if inputs need to be reformatted.
+This can be useful for speed optimizations if ipf is called several times
+with similar inputs (for example bootstrapping)}

-\item{nameCalibWeight}{character defining the name of the variable for the newly generated calibrated weight.}
+\item{nameCalibWeight}{character defining the name of the variable for the
+newly generated calibrated weight.}
}
\value{
-The function will return the input data \code{dat} with the
-calibrated weights \code{calibWeight} as an additional column as well as attributes. If no convergence has been reached in \code{maxIter}
-steps, and \code{returnNA} is \code{TRUE} (the default), the column \code{calibWeights} will only consist of \code{NA}s. The attributes of the table are
-attributes derived from the \code{data.table} class as well as the following.
+The function will return the input data \code{dat} with the calibrated
+weights \code{calibWeight} as an additional column as well as attributes. If no
+convergence has been reached in \code{maxIter} steps, and \code{returnNA} is \code{TRUE}
+(the default), the column \code{calibWeight} will only consist of \code{NA}s. The
+attributes of the table are attributes derived from the \code{data.table} class
+as well as the following.
\tabular{ll}{
\code{converged} \tab Did the algorithm converge in \code{maxIter} steps? \cr
\code{iterations} \tab The number of iterations performed. \cr
\code{conP}, \code{conH}, \code{epsP}, \code{epsH} \tab See Arguments. \cr
\code{conP_adj}, \code{conH_adj} \tab Adjusted versions of \code{conP} and \code{conH} \cr
-\code{formP}, \code{formH} \tab Formulas that were used to calculate \code{conP_adj} and \code{conH_adj} based on the output table.
+
+\code{formP}, \code{formH} \tab Formulas that were used to calculate \code{conP_adj} and
+\code{conH_adj} based on the output table.
}
}
\description{
@@ -106,20 +116,22 @@ individual level constraints.
This function implements the weighting procedure described
\href{http://www.ajs.or.at/index.php/ajs/article/viewFile/doi10.17713ajs.v45i3.120/512}{here}.
-\code{conP} and \code{conH} are contingency tables, which can be created with \code{xtabs}. The \code{dimnames} of those
-tables should match the names and levels of the corresponding columns in \code{dat}.
+\code{conP} and \code{conH} are contingency tables, which can be created with \code{xtabs}.
+The \code{dimnames} of those tables should match the names and levels of the
+corresponding columns in \code{dat}.

-\code{maxIter}, \code{epsP} and \code{epsH} are the stopping criteria. \code{epsP} and \code{epsH} describe relative tolerances
-in the sense that
+\code{maxIter}, \code{epsP} and \code{epsH} are the stopping criteria. \code{epsP} and \code{epsH}
+describe relative tolerances in the sense that
\out{\deqn{1-epsP < \frac{w_{i+1}}{w_i} < 1+epsP}{1-epsP < w(i+1)/w(i) < 1+epsP}
}
-will be used as convergence criterium. Here i is the iteration step and wi is the weight of a
-specific person at step i.
+will be used as convergence criterion. Here i is the iteration step and wi is
+the weight of a specific person at step i.
The algorithm
-performs best if all varables occuring in the constraints (\code{conP} and \code{conH}) as well as the
-household variable are coded as \code{factor}-columns in \code{dat}. Otherwise, conversions will be necessary
-which can be monitored with the \code{conversion_messages} argument.
-Setting \code{check_hh_vars} to \code{FALSE} can also incease the performance of the scheme.
+performs best if all variables occurring in the constraints (\code{conP} and \code{conH})
+as well as the household variable are coded as \code{factor}-columns in \code{dat}.
+Otherwise, conversions will be necessary which can be monitored with the
+\code{conversion_messages} argument. Setting \code{check_hh_vars} to \code{FALSE} can also
+increase the performance of the scheme.
}
\examples{
\dontrun{
@@ -145,7 +157,8 @@ eusilcS[, hsize := cut(hsize, c(0:4, Inf), labels = c(1:4, "5+"))]

# treat households as a factor variable
eusilcS[, household := as.factor(household)]

-## example for base weights assuming a simple random sample of households stratified per region
+## example for base weights assuming a simple random sample of households
+## stratified per region
eusilcS[, regSamp := .N, by = state]
eusilcS[, regPop := sum(weight), by = state]
eusilcS[, baseWeight := regPop/regSamp]
@@ -159,7 +172,8 @@ conP2 <- xtabs(weight ~ gender + state, data = eusilcS)
conP3 <- xtabs(weight*netIncome ~ gender, data = eusilcS)

## constraints on household level
-conH1 <- xtabs(weight ~ hsize + state, data = eusilcS, subset = !duplicated(household))
+conH1 <- xtabs(weight ~ hsize + state, data = eusilcS,
+               subset = !duplicated(household))

# array of convergence limits for conH1
epsH1 <- conH1
diff --git a/man/plot.surveysd.Rd b/man/plot.surveysd.Rd
index 7e4bc56..2447665 100644
--- a/man/plot.surveysd.Rd
+++ b/man/plot.surveysd.Rd
@@ -11,20 +11,31 @@
\arguments{
\item{x}{object of class 'surveysd' output of function \link{calc.stError}}

-\item{variable}{Name of the variable for which standard errors have been calcualated in \code{dat}}
-
-\item{type}{can bei either 'summary' or 'grouping', default value is 'summary'.
-For 'summary' a barplot is created giving an overview of the number of estimates having the flag \code{smallGroup}, \code{cvHigh}, both or none of them.
-For 'grouping' results for point estimate and standard error are plotted for pre defined groups.}
-
-\item{groups}{If \code{type='grouping'} variables must be defined by which the data is grouped. Only 2 levels are supported as of right now.
-If only one group is defined the higher group will be the estimate over the whole period.
-Results are plotted for the first argument in \code{groups} as well as for the combination of \code{groups[1]} and \code{groups[2]}.}
-
-\item{sd.type}{can bei either \code{'ribbon'} or \code{'dot'} and is only used if \code{type='grouping'}. Default is \code{"dot"}
-For \code{sd.type='dot'} point estimates are plotted and flagged if the corresponding standard error and/or the standard error using the mean over k-periods exceeded the value \code{cv.limit} (see \link{calc.stError}).
-For \code{sd.type='ribbon'} the point estimates including ribbons, defined by point estimate +- estimated standard error are plotted.
-The calculated standard errors using the mean over k periods are plotted using less transparency. Results for the higher level (~\code{groups[1]}) are coloured grey.}
+\item{variable}{Name of the variable for which standard errors have been
+calculated in \code{dat}}
+
+\item{type}{can be either \code{"summary"} or \code{"grouping"}, default value is
+\code{"summary"}. For \code{"summary"} a barplot is created giving an overview of the
+number of estimates having the flag \code{smallGroup}, \code{cvHigh}, both or none
+of them. For \code{"grouping"} results for point estimate and standard error are
+plotted for predefined groups.}
+
+\item{groups}{If \code{type='grouping'} variables must be defined by which the
+data is grouped. Only 2 levels are supported as of right now. If only one
+group is defined the higher group will be the estimate over the whole
+period. Results are plotted for the first argument in \code{groups} as well as
+for the combination of \code{groups[1]} and \code{groups[2]}.}
+
+\item{sd.type}{can be either \code{'ribbon'} or \code{'dot'} and is only used if
+\code{type='grouping'}. Default is \code{"dot"}.
+For \code{sd.type='dot'} point estimates are plotted and flagged if the
+corresponding standard error and/or the standard error using the mean over
+k-periods exceeds the value \code{cv.limit} (see \link{calc.stError}).
+For \code{sd.type='ribbon'} the point estimates including ribbons, defined by
+point estimate +/- estimated standard error are plotted.
+The calculated standard errors using the mean over k periods are plotted
+using less transparency. Results for the higher level (~\code{groups[1]}) are
+coloured grey.}

\item{...}{additional arguments supplied to plot.}
}
@@ -47,7 +58,8 @@ dat_boot_calib <- recalib(dat_boot, conP.var = "gender", conH.var = "region")

# estimate weightedRatio for povmd60 per period
group <- list("gender", "region", c("gender", "region"))
-err.est <- calc.stError(dat_boot_calib, var = "povertyRisk", fun = weightedRatio,
+err.est <- calc.stError(dat_boot_calib, var = "povertyRisk",
+                        fun = weightedRatio,
                        group = group , period.mean = NULL)


diff --git a/man/print.surveysd.Rd b/man/print.surveysd.Rd
index 284d5ab..2a8a844 100644
--- a/man/print.surveysd.Rd
+++ b/man/print.surveysd.Rd
@@ -12,6 +12,7 @@
\item{...}{additonal parameters}
}
\description{
-Prints the results of a call to \link{calc.stError}.
Shows used variables and function, number of point estiamtes +Prints the results of a call to \link{calc.stError}. Shows used variables and +function, number of point estiamtes as well as properties of the results. } diff --git a/man/recalib.Rd b/man/recalib.Rd index f3aca39..ee32145 100644 --- a/man/recalib.Rd +++ b/man/recalib.Rd @@ -9,52 +9,76 @@ recalib(dat, hid = attr(dat, "hid"), weights = attr(dat, "weights"), conP.var = c("RB090"), conH.var = c("DB040", "DB100"), ...) } \arguments{ -\item{dat}{either data.frame or data.table containing the sample survey for various periods.} +\item{dat}{either data.frame or data.table containing the sample survey for +various periods.} -\item{hid}{character specifying the name of the column in \code{dat} containing the household ID.} +\item{hid}{character specifying the name of the column in \code{dat} containing +the household ID.} -\item{weights}{character specifying the name of the column in \code{dat} containing the sample weights.} +\item{weights}{character specifying the name of the column in \code{dat} +containing the sample weights.} -\item{b.rep}{character specifying the names of the columns in \code{dat} containing bootstrap weights which should be recalibratet} +\item{b.rep}{character specifying the names of the columns in \code{dat} +containing bootstrap weights which should be recalibratet} -\item{period}{character specifying the name of the column in \code{dat} containing the sample period.} +\item{period}{character specifying the name of the column in \code{dat} containing +the sample period.} -\item{conP.var}{character vector containig person-specific variables to which weights should be calibrated. Contingency tables for the population are calculatet per \code{period} using \code{weights}.} +\item{conP.var}{character vector containig person-specific variables to which +weights should be calibrated. Contingency tables for the population are +calculated per \code{period} using \code{weights}.} -\item{conH.var}{character vector containig household-specific variables to which weights should be calibrated. Contingency tables for the population are calculatet per \code{period} using \code{weights}.} +\item{conH.var}{character vector containig household-specific variables to +which weights should be calibrated. Contingency tables for the population +are calculated per \code{period} using \code{weights}.} -\item{...}{additional arguments passed on to function \code{\link[surveysd]{ipf}} from this package.} +\item{...}{additional arguments passed on to function \code{\link[=ipf]{ipf()}} from this +package.} } \value{ -Returns a data.table containing the survey data as well as the calibrated weights for the bootstrap replicates. The original bootstrap replicates are overwritten by the calibrated weights. -If calibration of a bootstrap replicate does not converge the bootsrap weight is not returned and numeration of the returned bootstrap weights is reduced by one. +Returns a data.table containing the survey data as well as the +calibrated weights for the bootstrap replicates. The original bootstrap +replicates are overwritten by the calibrated weights. If calibration of a +bootstrap replicate does not converge the bootsrap weight is not returned +and numeration of the returned bootstrap weights is reduced by one. } \description{ Calibrate weights for bootstrap replicates by using iterative proportional updating to match population totals on various household and personal levels. 
}
\details{
-\code{recalib} takes survey data (\code{dat}) containing the bootstrap replicates generated by \link{draw.bootstrap} and calibrates weights for each bootstrap
-replication according to population totals for person- or household-specific variables. \cr
-\code{dat} must be household data where household members correspond to multiple rows with the same household identifier. The data should at least containt the following columns:
+\code{recalib} takes survey data (\code{dat}) containing the bootstrap replicates
+generated by \link{draw.bootstrap} and calibrates weights for each bootstrap
+replication according to population totals for person- or household-specific
+variables. \cr
+\code{dat} must be household data where household members correspond to multiple
+rows with the same household identifier. The data should at least contain
+the following columns:
\itemize{
\item Column indicating the sample period;
\item Column indicating the household ID;
\item Column containing the household sample weights;
-\item Columns which contain the bootstrap replicates (see output of \link{draw.bootstrap});
-\item Columns indicating person- or household-specific variables for which sample weight should be adjusted.
+\item Columns which contain the bootstrap replicates (see output of
+\link{draw.bootstrap});
+\item Columns indicating person- or household-specific variables for which sample
+weights should be adjusted.
}
-For each period and each variable in \code{conP.var} and/or \code{conH.var} contingency tables are estimated to get margin totals on personal- and/or household-specific variables in the population.\cr
-Afterwards the bootstrap replicates are multiplied with the original sample weight and the resulting product ist then adjusted using \code{\link[surveysd]{ipf}} to match the previously calcualted contingency tables.
-In this process the columns of the bootstrap replicates are overwritten by the calibrated weights.\cr
+For each period and each variable in \code{conP.var} and/or \code{conH.var} contingency
+tables are estimated to get margin totals on personal- and/or
+household-specific variables in the population.\cr
+Afterwards the bootstrap replicates are multiplied with the original sample
+weight and the resulting product is then adjusted using \code{\link[=ipf]{ipf()}} to match the
+previously calculated contingency tables. In this process the columns of the
+bootstrap replicates are overwritten by the calibrated weights.\cr
}
\examples{
\dontrun{
eusilc <- demo.eusilc(prettyNames = TRUE)

-dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid", weights = "pWeight",
+dat_boot <- draw.bootstrap(eusilc, REP = 10, hid = "hid",
+                           weights = "pWeight",
                           strata = "region", period = "year")

# calibrate weight for bootstrap replicates
@@ -68,7 +92,8 @@ dat_boot_calib <- recalib(dat_boot, conP.var = c("gender", "age"),
}
\seealso{
-\code{\link[surveysd]{ipf}} for more information on iterative proportional fitting.
+\code{\link[=ipf]{ipf()}} for more information on iterative
+proportional fitting.
}
\author{
Johannes Gussenbauer, Alexander Kowarik, Statistics Austria
diff --git a/man/rescaled.bootstrap.Rd b/man/rescaled.bootstrap.Rd
index bb36505..4c862fc 100644
--- a/man/rescaled.bootstrap.Rd
+++ b/man/rescaled.bootstrap.Rd
@@ -14,37 +14,69 @@ rescaled.bootstrap(dat, REP = 1000, strata = "DB050>1",

\item{REP}{integer indicating the number of bootstraps to be drawn}

-\item{strata}{string specifying the column name in \code{dat} that is used for stratification.
For multistage sampling multiple column names can be specified by \code{strata=c("strata1>strata2>strata3")}.
-See Details for more information.}
+\item{strata}{string specifying the column name in \code{dat} that is used for
+stratification. For multistage sampling multiple column names can be
+specified by \code{strata=c("strata1>strata2>strata3")}. See Details for more
+information.}

-\item{cluster}{string specifying the column name in \code{dat} that is used for clustering. For multistage sampling multiple column names can be specified by \code{cluster=c("cluster1>cluster2>cluster3")}.
+\item{cluster}{string specifying the column name in \code{dat} that is used for
+clustering. For multistage sampling multiple column names can be specified
+by \code{cluster=c("cluster1>cluster2>cluster3")}.
See Details for more information.}

-\item{fpc}{string specifying the column name in \code{dat} that contains the number of PSUs at the first stage. For multistage sampling the number of PSUs at each stage must be specified by \code{strata=c("fpc1>fpc2>fpc3")}.}
+\item{fpc}{string specifying the column name in \code{dat} that contains the
+number of PSUs at the first stage. For multistage sampling the number of
+PSUs at each stage must be specified by \code{fpc=c("fpc1>fpc2>fpc3")}.}

-\item{single.PSU}{either "merge" or "mean" defining how single PSUs need to be dealt with.
-For \code{single.PSU="merge"} single PSUs at each stage are merged with the strata or cluster with the next least number of PSUs. If multiple of those exist one will be select via random draw
-For \code{single.PSU="mean"} single PSUs will get the mean over all bootstrap replicates at the stage which did not contain single PSUs.}
+\item{single.PSU}{either "merge" or "mean" defining how single PSUs need to
+be dealt with. For \code{single.PSU="merge"} single PSUs at each stage are
+merged with the strata or cluster with the next least number of PSUs. If
+multiple of those exist one will be selected via random draw. For
+\code{single.PSU="mean"} single PSUs will get the mean over all bootstrap
+replicates at the stage which did not contain single PSUs.}

-\item{return.value}{either "data" or "replicates" specifying the return value of the function. For "data" the survey data is returned as class \code{data.table}, for "replicates" only the bootstrap replicates are returned as \code{data.table}.}
+\item{return.value}{either "data" or "replicates" specifying the return value
+of the function. For "data" the survey data is returned as class
+\code{data.table}, for "replicates" only the bootstrap replicates are returned
+as \code{data.table}.}

-\item{check.input}{logical, if TRUE the input will be checked before applying the bootstrap procedure}
+\item{check.input}{logical, if TRUE the input will be checked before applying
+the bootstrap procedure}

-\item{new.method}{logical, if TRUE bootstrap replicates will never be negative even if in some strata the whole population is in the sample. WARNING: This is still experimental and resulting standard errors might be underestimated! Use this if for some strata the whole population is in the sample!}
+\item{new.method}{logical, if TRUE bootstrap replicates will never be
+negative even if in some strata the whole population is in the sample.
+WARNING: This is still experimental and resulting standard errors might be
+underestimated! Use this if for some strata the whole population is in the
+sample!}
}
\value{
-returns the complete data set including the bootstrap replicates or just the bootstrap replicates, depending on \code{return.value="data"} or \code{return.value="replicates"} respectively.
+returns the complete data set including the bootstrap replicates or
+just the bootstrap replicates, depending on \code{return.value="data"} or
+\code{return.value="replicates"} respectively.
}
\description{
-Draw bootstrap replicates from survey data using the rescaled bootstrap for stratified multistage sampling, presented by Preston, J. (2009).
+Draw bootstrap replicates from survey data using the rescaled
+bootstrap for stratified multistage sampling, presented by Preston, J.
+(2009).
}
\details{
-For specifying multistage sampling designs the column names in \code{strata},\code{cluster} and \code{fpc} need to seperated by ">".\cr
-For multistage sampling the strings are read from left to right meaning that the column name before the first ">" is taken as the column for stratification/clustering/number of PSUs at the first and the column after the last ">" is taken as the column for stratification/clustering/number of PSUs at the last stage.
-If for some stages the sample was not stratified or clustered one must specify this by "1" or "I", e.g. \code{strata=c("strata1>I>strata3")} if there was no stratification at the second stage or \code{cluster=c("cluster1>cluster2>I")} if there were no clusters at the last stage.\cr
-The number of PSUs at each stage is not calculated internally and must be specified for any sampling design.
-For single stage sampling using stratification this can usually be done by adding over all sample weights of each PSU by each strata-code.\cr
-Spaces in each of the strings will be removed, so if column names contain spaces they should be renamed before calling this procedure!
+For specifying multistage sampling designs the column names in
+\code{strata}, \code{cluster} and \code{fpc} need to be separated by ">".\cr
+For multistage sampling the strings are read from left to right meaning that
+the column name before the first ">" is taken as the column for
+stratification/clustering/number of PSUs at the first stage and the column
+after the last ">" is taken as the column for
+stratification/clustering/number of PSUs at the last stage.
+If for some stages the sample was not stratified or clustered one must
+specify this by "1" or "I", e.g. \code{strata=c("strata1>I>strata3")} if there was
+no stratification at the second stage or \code{cluster=c("cluster1>cluster2>I")}
+if there were no clusters at the last stage.\cr
+The number of PSUs at each stage is not calculated internally and must be
+specified for any sampling design.
+For single stage sampling using stratification this can usually be done by
+adding up all sample weights of each PSU by each strata-code.\cr
+Spaces in each of the strings will be removed, so if column names contain
+spaces they should be renamed before calling this procedure!
}
\examples{
data(eusilc, package = "laeken")
@@ -65,7 +97,8 @@ eusilc.bootstrap <- rescaled.bootstrap(eusilc,REP=100,strata=c("new_strata"),
}

\references{
-Preston, J. (2009). Rescaled bootstrap for stratified multistage sampling. Survey Methodology. 35. 227-234.
+Preston, J. (2009). Rescaled bootstrap for stratified multistage
+sampling. Survey Methodology, 35, 227-234.
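Returning to the ">" notation explained in the details above, a hypothetical two-stage call could look as follows (the columns region, municipality, hid, fpc1 and fpc2 are illustrative, not part of any shipped dataset):

# stage 1: municipalities drawn within regions (stratified by region),
# stage 2: households drawn within municipalities without stratification
dat.boot <- rescaled.bootstrap(
  dat, REP = 100,
  strata  = "region>I",
  cluster = "municipality>hid",
  fpc     = "fpc1>fpc2"
)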
} \author{ Johannes Gussenbauer, Statistics Austria diff --git a/tests/testthat/test_ipf.R b/tests/testthat/test_ipf.R index e178315..9d19bef 100644 --- a/tests/testthat/test_ipf.R +++ b/tests/testthat/test_ipf.R @@ -24,7 +24,8 @@ eusilcS[, hsize := cut(hsize, c(0:4, Inf), labels = c(1:4, "5+"))] # treat households as a factor variable eusilcS[, household := as.factor(household)] -## example for base weights assuming a simple random sample of households stratified per region +## example for base weights assuming a simple random sample of households +## stratified per region eusilcS[, regSamp := .N, by = state] eusilcS[, regPop := sum(weight), by = state] eusilcS[, baseWeight := regPop / regSamp]