-
Notifications
You must be signed in to change notification settings - Fork 0
/
CVranger.R
97 lines (89 loc) · 4.71 KB
/
CVranger.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#' Title K-fold cross validation of a ranger random forest model.
#'
#' @param x data.frame of all training instances for model
#' @param nfolds integer Number of folds to use for validation
#' @param fm Formula to use for random forest model building
#' @param quants numeric vector with lower and upper quantiles for uncertainty prediction intervals
#' @param os string. Operating system for implementation. Options are "windows" or "linux".
#' @param train.params List of training parameters for building random forest models.
#' @param nthreads Integer number of logical cores to use for parallelization of function.
#' @param casewts Character name of field with case weights values
#'
#' @return data.frame with original fields along with CV predictions
#' @export
#'
CVranger <- function(x, nfolds = 10, fm, os = "windows", quants, train.params, nthreads = detectCores() - 1, casewts = "tot_wts"){
if(!validObject(train.params)){
message('No train.params given, using default RF params: ntree=100, min.node.size=1')
train.params <- list(ntrees = 100, min.node.size = 1)
}
if(! "tot_wts" %in% colnames(x)){
message('No case weights given, all case weights are being set to 1')
x$casewts <- 1
} else { x$casewts <- x[,casewts] }
if(nthreads == 0){nthreads = 1}
## Linux CV function implementation in forked parallel list apply
if(os == "linux"){
pts.extcvm <- x
set.seed(420)
pts.extcvm$folds <- sample.int(nfolds,size =length(pts.extcvm[,1]),replace=T)
fnthreads <- ifelse(nthreads > nfolds, floor((nthreads - nfolds)/nfolds), 1)
lappthreads <- ifelse(nthreads >= nfolds, nfolds, nthreads)
CV_factorRF <- function(g){#,pts.extcvm, formulaStringCVm){
traindf <- subset(pts.extcvm, pts.extcvm$folds != g)
testdf <- subset(pts.extcvm, pts.extcvm$folds == g)
set.seed(420)
rf.pcv <- ranger(fm, data=traindf, num.trees = train.params$ntrees, quantreg = T, num.threads = fnthreads,
min.node.size = train.params$min.node.size,case.weights = traindf$casewts)
traindf$pcvpredpre <- predict(rf.pcv,data=traindf, num.threads = fnthreads)$predictions
testdf$pcvpredpre <- predict(rf.pcv, data=testdf, num.threads = fnthreads)$predictions
testdf$pcvpredpre.025 <- predict(rf.pcv, data=testdf, type = "quantiles", quantiles = c(quants[1]), num.threads = fnthreads)$predictions
testdf$pcvpredpre.975 <- predict(rf.pcv, data=testdf, type = "quantiles", quantiles = c(quants[2]), num.threads = fnthreads)$predictions
attach(traindf)
lm.pcv <- lm(prop_t~pcvpredpre)
detach(traindf)
testdf$pcvpred <- predict(lm.pcv, newdata=testdf)
testdf$foldRsq <- 1-var(testdf$prop_t - testdf$pcvpred, na.rm=TRUE)/var(testdf$prop_t, na.rm=TRUE)
return(testdf)
}
## Linux parallel list apply
cpus <- lappthreads
cl <- makeCluster(cpus, type="FORK")
registerDoParallel(cl)
pts.extpcv.lst <- parLapply(cl,1:nfolds,try(CV_factorRF))
stopCluster(cl)
pts.extpcv <- plyr::rbind.fill(pts.extpcv.lst)
pts.extpcv$pcvpred <- as.numeric(pts.extpcv$pcvpred)
pts.extpcv$valtype <- paste("cv",nfolds,"f",sep="")
return(pts.extpcv)
}
## Windows (or linux) list apply implementation with ranger steps parallelized
if(os == "windows"){
pts.extcvm <- x
set.seed(420)
pts.extcvm$folds <- sample.int(nfolds,size =length(pts.extcvm[,1]),replace=T)
CV_factorRF <- function(g){
traindf <- subset(pts.extcvm, pts.extcvm$folds != g)
testdf <- subset(pts.extcvm, pts.extcvm$folds == g)
set.seed(420)
rf.pcv <- ranger(fm, data=traindf, num.trees = train.params$ntrees, quantreg = T, num.threads = nthreads,
min.node.size = train.params$min.node.size,case.weights = traindf$casewts)
traindf$pcvpredpre <- predict(rf.pcv,data=traindf, num.threads = nthreads)$predictions
testdf$pcvpredpre <- predict(rf.pcv, data=testdf, num.threads = nthreads)$predictions
testdf$pcvpredpre.025 <- predict(rf.pcv, data=testdf, type = "quantiles", quantiles = c(quants[1]), num.threads = nthreads)$predictions
testdf$pcvpredpre.975 <- predict(rf.pcv, data=testdf, type = "quantiles", quantiles = c(quants[2]), num.threads = nthreads)$predictions
attach(traindf)
lm.pcv <- lm(prop_t~pcvpredpre)
detach(traindf)
testdf$pcvpred <- predict(lm.pcv, newdata=testdf)
testdf$foldRsq <- 1-var(testdf$prop_t - testdf$pcvpred, na.rm=TRUE)/var(testdf$prop_t, na.rm=TRUE)
return(testdf)
}
## list apply of function
pts.extpcv.lst <- lapply(1:nfolds,try(CV_factorRF))
pts.extpcv <- plyr::rbind.fill(pts.extpcv.lst)
pts.extpcv$pcvpred <- as.numeric(pts.extpcv$pcvpred)
pts.extpcv$valtype <- paste("cv",nfolds,"f",sep="")
return(pts.extpcv)
}
}