-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPG.R
262 lines (229 loc) · 8.32 KB
/
PG.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# Author: Yen-Chi Chen
# Maintainer: Yen-Chi Chen <[email protected]>
# Reference: Pattern Graphs: a Graphical Approach to Nonmonotone Missing Data
# arXiv: https://arxiv.org/abs/2004.00744
# Date: 03/15/2021
### The R script for applying pattern graph based on IPW approach.
#' @import mgcv
library(mgcv)
#' @import igraph
library(igraph)
#' Creating a pattern graph object.
#' @param dat_miss The primary data matrix that may contain missingness (does not include the covariates).
#' @param dat_cov The data matrix of the covariates that we wish to adjust,
#' cannot contain missingness.
#' @export
PG = function(dat_miss, dat_cov=NULL) UseMethod("PG")
#' Creating a pattern graph object.
#' @param dat_miss The primary data matrix that may contain missingness (does not include the covariates).
#' @param dat_cov The data matrix of the covariates that we wish to adjust,
#' cannot contain missingness.
#' @return A S4 object of class "PG". A list consisting
#' \item{dat_miss}
#' The input missing data.
#' \item{dat_cov}
#' The input covariates.
#' \item{info}
#' A list of information on the patterns of each observation.
#' \item{parent}
#' The parent pattern of each response pattern.
#' \item{summary}
#' The summary of frequency of each pattern and the corresponding pattern index.
#' \item{labels.pattern}
#' The response pattern label of each observation.
#' @export
PG.default = function(dat_miss, dat_cov=NULL){
pg_list = list()
pg_list[["dat_miss"]] = data.frame(dat_miss)
pg_list[["dat_cov"]] = data.frame(dat_cov)
miss_pat = uniquecombs(!is.na(dat_miss)+0)
# finding all response patterns
n_pat = nrow(miss_pat)
# number of patterns
row.names(miss_pat) = c(1:n_pat)
pat_comp = which(apply(miss_pat, 1, prod)==1)
names(pat_comp) = NULL
# the index of complete case
idx_comp = attr(miss_pat,"index")==pat_comp
# index of complete cases
n_comp = sum(idx_comp)
# size of complete case
pg_list[["info"]] = miss_pat+0
for(j in 1:n_pat){
pat = miss_pat[j,]
if(sum(pat==c(1,1))<2){
pg_list[["parent"]][[j]] = pat_comp
# this is CCMV case
}
}
## summary
info_tmp = pg_list$info
pat_obs = attr(pg_list$info, "index")
attr(info_tmp, "index") = NULL
info_all = cbind(info_tmp, table(pat_obs))
colnames(info_all) = c(colnames(info_tmp), "Cases")
pg_list[["summary"]] = info_all
pg_list[["labels.pattern"]] = pat_obs
class(pg_list) = "PG"
return(pg_list)
}
print.PG = function(x){
n_pat = nrow(x$info)
pat_comp = which(apply(x$info, 1, prod)==1)
pat_idx = (1:n_pat)[-pat_comp]
pat_names = rep(NA, n_pat)
for(j in 1:n_pat){
pat_names[j] = paste(x$info[j,]+0, collapse="")
}
rel_matrix_named = NULL
for(j in pat_idx){
rel_matrix_named = rbind(rel_matrix_named,
cbind( pat_names[x$parent[[j]]], pat_names[j]))
}
pg_igraph_named = graph_from_edgelist(rel_matrix_named)
cat("Response Patterns:\n")
print(x$summary)
cat("\n")
plot(pg_igraph_named)
print(pg_igraph_named)
}
#' Fitting the pattern graph based on IPW approach.
#' @param PG The input PG object, created from the PG function.
#' @export
PGfit = function(PG) UseMethod("PGfit")
#' Fitting the pattern graph based on IPW approach. Currently, the propensity score
#' is based on logistic regression.
#' @param PG The input PG object, from the 'PG' function.
#' @return A S4 object of class "PGfit". A list consisting
#' \item{igraph}
#' The igraph object of the pattern graph.
#' \item{igraph_named}
#' The igraph object of the pattern graph with vertex named after patterns.
#' \item{paths}
#' All paths in the pattern graph.
#' \item{model.par}
#' The fitted logistic parameter at each pattern.
#' \item{comp.idx}
#' The index of complete observations.
#' \item{comp.odds}
#' A matrix of the selection odds of complete cases evaluated at each pattern.
#' Each row is a complete case and each column represents a response pattern.
#' \item{comp.data}
#' The complete case data along with the corresponding weights.
#' \item{weights}
#' The weights of each complete case.
#' @export
PGfit.default = function(PG){
miss_pat = PG[["info"]]+0
dat_miss = PG[["dat_miss"]]
dat_cov = PG[["dat_cov"]]
### create base-variables
pg_fit = list()
model_par = list()
# parameter list
n_pat = nrow(miss_pat)
# total sample size
pat_comp = which(apply(miss_pat, 1, prod)==1)
names(pat_comp) = NULL
# the index of complete case
idx_comp = attr(miss_pat,"index")==pat_comp
# index of complete cases
n_comp = sum(idx_comp)
# number of complete case
pat_idx = (1:n_pat)[-pat_comp]
# all pattern index except complete case
odds_complete = matrix(1, nrow= n_comp, ncol=n_pat)
# odds evaluated on complete case
pat_names = rep(NA, n_pat)
for(j in 1:n_pat){
pat_names[j] = paste(miss_pat[j,]+0, collapse="")
}
# names of patterns
### creating igraph object
rel_matrix = NULL
rel_matrix_named = NULL
for(j in pat_idx){
rel_matrix = rbind(rel_matrix, cbind( PG$parent[[j]], j))
rel_matrix_named = rbind(rel_matrix_named, cbind( pat_names[PG$parent[[j]]], pat_names[j]))
}
pg_igraph = graph_from_edgelist(rel_matrix)
pg_igraph_named = graph_from_edgelist(rel_matrix_named)
plot(pg_igraph_named)
pg_path = all_simple_paths(pg_igraph, from=pat_comp, mode="out")
pg_fit[["igraph"]] = pg_igraph
pg_fit[["igraph_named"]] = pg_igraph_named
pg_fit[["paths"]] = all_simple_paths(pg_igraph_named, from=pat_names[pat_comp], mode="out")
### fitting the model
for(j in pat_idx){
var_idx = as.logical(PG$info[j,])
if(sum(var_idx)==0&is.null(dat_cov)){
# totally missing case
fit = sum(attr(miss_pat,"index")==j)/sum(is.element(attr(miss_pat,"index"),
PG[["parent"]][[j]]))
model_par[[j]] = log(fit)
odds_complete[,j]= fit
}
#### with covariates: always have non-empty variables
if(sum(var_idx)==0){
w_cur = attr(miss_pat,"index")==j
new.dat1 = cbind(1, dat_cov[w_cur,])
# current
w_pa = is.element(attr(miss_pat,"index"), PG[["parent"]][[j]])
new.dat0 = cbind(0, dat_cov[w_pa,])
# parent
new.dat = rbind(as.matrix(new.dat1), as.matrix(new.dat0))
colnames(new.dat) = c("lab", names(dat_cov))
fit = glm(lab~., family= "binomial", data = data.frame(new.dat))
model_par[[j]] = fit$coefficients
dat_comp = as.matrix(cbind(1, dat_cov[idx_comp,]))
odds_complete[,j] = exp(dat_comp %*% fit$coefficients)
}
if(sum(var_idx)>0){
w_cur = attr(miss_pat,"index")==j
new.dat1 = cbind(1,dat_miss[w_cur,var_idx], dat_cov[w_cur,])
# current
w_pa = is.element(attr(miss_pat,"index"), PG[["parent"]][[j]])
new.dat0 = cbind(0,dat_miss[w_pa,var_idx], dat_cov[w_pa,])
# parent
new.dat = rbind(as.matrix(new.dat1), as.matrix(new.dat0))
colnames(new.dat) = c("lab", names(dat_pg$dat_miss)[var_idx],
names(dat_cov))
fit = glm(lab~., family= "binomial", data = data.frame(new.dat))
model_par[[j]] = fit$coefficients
dat_comp = as.matrix(cbind(1,dat_miss[idx_comp,var_idx], dat_cov[idx_comp,]))
odds_complete[,j] = exp(dat_comp %*% fit$coefficients)
}
}
colnames(odds_complete) = pat_names
rownames(odds_complete) = rownames(dat_miss[idx_comp,])
### computing path odds
path_odds = matrix(NA, nrow= n_comp,ncol= length(pg_path))
for(k in 1:length(pg_path)){
path_val = rep(1, n_comp)
for(in_path in 2:length(pg_path[[k]])){
path_val = path_val*odds_complete[,pg_path[[k]][in_path]]
}
path_odds[,k] = path_val
}
weight_final = rowSums(path_odds)+1
# complete data creation
dat_comp = cbind(weight_final, dat_miss[idx_comp,], dat_cov[idx_comp,])
names(dat_comp) = c("weight",names(dat_miss), names(dat_cov))
if(is.null(names(dat_cov))){
names(dat_comp) = c("weight",names(dat_miss),"covariate")
}
pg_fit[["model.par"]] = model_par
pg_fit[["comp.idx"]] = which(idx_comp)
pg_fit[["comp.odds"]] = odds_complete
pg_fit[["comp.data"]] = dat_comp
pg_fit[["weight"]] = weight_final
class(pg_fit) = "PGfit"
return(pg_fit)
}
print.PGfit = function(x){
print(x$igraph_named)
plot(x$igraph_named)
cat(paste("\n Size of complete cases: ",length(x$comp.idx)))
cat("\n Complete cases info:\n")
print(head(x$comp.data))
}