##############################
##############################
####Interaction Screening, CA Trend.
###Data creation for CA Trend. Y is binary, the X_j are ternary (0/1/2).
library(boot)###Provides cum3() for the three-way cumulants used below.
makeYBinary <- function(n, percent0 = 0.5)
{
  sample(c(0,1), n, replace = TRUE, prob = c(percent0, 1-percent0))###These are the Y's.
}
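####Optional sanity check (a sketch; left commented out): the proportion of zeros should be close to percent0.
#mean(makeYBinary(n = 10000, percent0 = 0.3) == 0)###Should be roughly 0.3.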
############################################################################################
############################################################################################
# Here we generate X_ij data very similar to that found in simulation 3 of Huang.
# In their simulation Y has 4 levels; here Y has only two levels, and we use the theta_kj
# from the final two lines of Table 3 of Huang (so for k = 3, 4).
makeX_ijForInteraction <- function(n, p, numInteract = 4, Y)
{
  probability = c(0.7, 0.9, 0.1, 0.1,
                  0.2, 0.1, 0.9, 0.7)####Table 3
  table3 = matrix(data = probability, 2, 4, byrow = TRUE)
  X_ij = matrix(nrow = n, ncol = p)
  ##The X_js are stored here for all n-many replicates.
  ####Initialized to NA so any missed assignment is easy to spot.
  for(j in 1:numInteract)
  {
    table3Vals = table3[Y+1, j]###If Y = m, the (m+1)th row of the jth column is the probability theta_mj given Y = m.
    ###This collects the requisite table values and puts them in a vector of length n.
    X_ij[, 2*j-1] = sapply(table3Vals, rbinom, n = 1, size = 2)
    ###This defines a whole column: each entry is Binomial(2, theta), so it takes the value 0, 1, or 2.
    #print(table3Vals)
    #####This next part is a bit tricky. We need a vector of probabilities that are conditional
    ##### on the value of X_{2j-1} AND on the probability theta_mj (which corresponds to the value of Y_i).
    ##### These become the probabilities for generating X_{2j}.
    conditionalProbs = (0.05*(table3Vals >= 0.5) + 0.8*(table3Vals < 0.5))*(X_ij[, 2*j-1] == 0) +
                       (0.95*(table3Vals >= 0.5) + 0.8*(table3Vals < 0.5))*(X_ij[, 2*j-1] == 1) +
                       (0.2*(table3Vals >= 0.5) + 0.65*(table3Vals < 0.5))*(X_ij[, 2*j-1] == 2)
    #print(j)
    #print(conditionalProbs)
    ####conditionalProbs is a length-n vector; entry i holds the probability of 'success' in the
    #### binomial trial used to determine the value of X_i(2j).
    X_ij[, 2*j] = sapply(conditionalProbs, rbinom, n = 1, size = 2)
    ###Again a whole column: each entry takes the value 0, 1, or 2.
  }
  for(j in (2*numInteract+1):p)
  {
    X_ij[, j] = sample(c(0,1,2), n, replace = TRUE, prob = rep(1/3, 3))
  }
  return(X_ij)
}
set.seed(79117)
Ysamp = makeYBinary(n = 10)
testX = makeX_ijForInteraction(n = 10, p = 10, numInteract = 4, Y = Ysamp)
Ysamp
testX
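#####Optional sanity check (a sketch; left commented out so it does not disturb the random number
##### stream): with a larger sample, the conditional mean of X_1 given Y should be close to 2*theta
##### from Table 3, i.e. about 2*0.7 = 1.4 when Y = 0 and 2*0.2 = 0.4 when Y = 1.
#checkY = makeYBinary(n = 5000)
#checkX = makeX_ijForInteraction(n = 5000, p = 10, numInteract = 4, Y = checkY)
#tapply(checkX[, 1], checkY, mean)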
#####################################################################################
#####################################################################################
### Find Numerators and denominators.
#####################################################################################
#####################################################################################
makeNumerators <- function(X_ij, Y, p)
{
  ######Insofar as I can see right now, this cannot be vectorized.
  ######Each numerator is the third joint cumulant of X_j1, X_j2, and Y; only the upper
  ###### triangle (j_1 < j_2) is filled, the rest stays NA.
  numerators = matrix(nrow = p, ncol = p)
  for(j_1 in 1:p)
    for(j_2 in 1:p)
      if(j_1 < j_2)
      {
        numerators[j_1, j_2] = cum3(a = X_ij[, j_1], b = X_ij[, j_2], c = Y, unbiased = FALSE)
      }
  return(numerators)
}
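####Optional check of what cum3() computes here (a sketch, left commented out; this assumes
#### boot::cum3 with unbiased = FALSE is the 1/n-scaled sum of centered triple products):
#aa = rnorm(20); bb = rnorm(20); cc = rnorm(20)
#cum3(a = aa, b = bb, c = cc, unbiased = FALSE)
#mean((aa - mean(aa)) * (bb - mean(bb)) * (cc - mean(cc)))###The two numbers should agree.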
Ysamp = makeYBinary(n = 200, percent0 = 0.5)
#Ysamp
Xcovars = makeX_ijForInteraction(n = 200, p = 2000, numInteract = 4, Y = Ysamp)
#Xcovars
timestamp()##------ Wed Nov 16 15:54:04 2016 ------##
numerators = makeNumerators(X_ij = Xcovars, Y = Ysamp, p = 2000)
timestamp()##------ Wed Nov 16 15:55:15 2016 ------##
###Making the numerators takes about 1 minute. (p = 2000)
###
numerators[1:13, 1:13]
#####A very simple method for getting the covariance approximation used by Agresti (in an equivalent form).
#####In the proof of consistency of the estimator given in Agresti for the covariance, I show that (n-1)/n*cov(X, Y)
##### is equivalent to the estimator given in Agresti.
covarApprox <- function(X1, X2 = X1, n)
{
  biasedCovar = (n-1)/n * cov(x = X1, y = X2, use = "all.obs", method = "pearson")
  return(biasedCovar)
}
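####Optional numerical check of this biased covariance (a sketch, left commented out; it verifies the
#### plug-in identity (n-1)/n * cov(X, Y) = mean(XY) - mean(X)*mean(Y), not the Agresti form itself):
#x1 = rnorm(15); x2 = rnorm(15)
#covarApprox(X1 = x1, X2 = x2, n = 15)
#mean(x1*x2) - mean(x1)*mean(x2)###The two numbers should agree up to rounding.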
sdApprox <- function(x, n)
{
  ###The biased (1/n-scaled) standard deviation is the square root of the biased variance,
  ### so the scaling on sd() is sqrt((n-1)/n) rather than (n-1)/n.
  biasedSD = sqrt((n-1)/n) * sd(x, na.rm = TRUE)
  return(biasedSD)
}
# make_COVCOVDemominators <- function(X_ij, Y, p)
# {#####MUCH TOO SLOW.
# denoms = matrix(nrow= p, ncol = p)
# for(j_1 in 1:p)
# for(j_2 in 1:p)
# if(j_1 < j_2)
# {
# denoms[j_1, j_2] = covarApprox(X1 = X_ij[,j_1], X2 = Y)*covarApprox( X1 = X_ij[,j_2], X2 = Y)
# }
#
# return(denoms)####This is a matrix of dimension p by p. We have each cov(X_j, Y)*cov(X_k, Y) in an entry
#
# }
make_COVCOVDemominators <- function(X_ij, Y, p, n)
{###This vectorized method is MUCH faster than the double loop above.
  covars = apply(X_ij, MARGIN = 2, covarApprox, X2 = Y, n = n)
  covarMat = matrix(covars, nrow = p, ncol = p)###Filled by column, so covarMat[j_1, j_2] = cov(X_j_1, Y).
  denoms = covarMat * t(covarMat)
  return(denoms)####A p by p matrix with cov(X_j, Y)*cov(X_k, Y) in entry (j, k).
}
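####Optional small-scale check (a sketch, left commented out so it does not disturb the random number
#### stream): the vectorized denominators should match the direct products cov(X_j1, Y)*cov(X_j2, Y).
#smallY = makeYBinary(n = 30)
#smallX = makeX_ijForInteraction(n = 30, p = 12, numInteract = 4, Y = smallY)
#smallDenoms = make_COVCOVDemominators(X_ij = smallX, Y = smallY, p = 12, n = 30)
#smallDenoms[2, 5]
#covarApprox(X1 = smallX[, 2], X2 = smallY, n = 30) * covarApprox(X1 = smallX[, 5], X2 = smallY, n = 30)###Should agree.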
make_SDSDSDDemominators <- function(X_ij, Y, p, n)
{
  stdevs = apply(X_ij, MARGIN = 2, sdApprox, n = n)
  sdMat = matrix(stdevs, nrow = p, ncol = p)
  denoms = sdMat * t(sdMat) * sdApprox(x = Y, n = n)
  return(denoms)####A p by p matrix with sd(X_j)*sd(X_k)*sd(Y) in entry (j, k).
}
Ysamp = makeYBinary(n = 200, percent0 = 0.5)
#Ysamp
Xcovars = makeX_ijForInteraction(n = 200, p = 2000, numInteract = 4, Y = Ysamp)
timestamp()##------ Wed Nov 16 15:54:04 2016 ------##
numerators = makeNumerators(X_ij = Xcovars, Y = Ysamp, p = 2000)
timestamp()##------ Wed Nov 16 15:55:15 2016 ------##
####A little over a minute (p = 2000)
#covars = apply(Xcovars, MARGIN = 2, covarApprox, X2 = Ysamp)
#covarMat = matrix(covars,nrow = 2000, ncol = 2000)
#covarMat
#denoms = covarMat * t(covarMat)
timestamp()
denoms = make_COVCOVDemominators(X_ij = Xcovars, Y = Ysamp, p = 2000, n = 200)
timestamp()
denoms[1:13, 1:13]
calculateR_j_1j_2 <- function(denom, cummulant_3_way, d)
{###d is the fixed number of X_j that we retained from the main effect test.
  ###This presupposes that we have already found the covariances.
  ###The computational requirements are such that we can perhaps only look at the top 40 or so most
  ### significant main effects when screening for a significant interaction.
  ###The denominators and numerators need to come from the top d significant main effects, already selected.
  denom[denom < 0.00001] = 0.01###Floor tiny denominators to avoid blow-ups from near-zero covariances.
  R_j_1j_2 = abs(cummulant_3_way/denom)
  return(R_j_1j_2)####A d by d matrix. All values are non-negative; we are only looking at magnitude.
}
bigR = calculateR_j_1j_2(denom = denoms, cummulant_3_way = numerators, d = 2000)
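####For a single pair (j_1, j_2), the statistic is |cum3(X_j1, X_j2, Y)| / (cov(X_j1, Y)*cov(X_j2, Y)),
#### e.g. (a quick spot check; the two lines agree unless the denominator fell below the 0.00001 floor):
bigR[1, 2]
abs(numerators[1, 2] / denoms[1, 2])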
calculateNEWR_j_1j_2 <- function(oldNumerators, cummulant_3_way, d)
{###d is the fixed number of X_j that we retained from the main effect test.
  ###This presupposes that we have already found the covariances.
  ###The computational requirements are such that we can only look at the top 40 or so most
  ### significant main effects when screening for a significant interaction.
  ###The old numerators need to come from the top d significant main effects, already selected.
  ######THIS IS WHERE WE HAVE TO CHANGE: MAKE THE DENOMINATORS DIFFERENT FROM THE ORIGINAL IDEA,
  ###using the three-way cumulant or a variance. The covariance denominator is not working well.
  covarProdMat = matrix(oldNumerators, nrow = d, ncol = d)###Filled by column.
  newDenom = covarProdMat*t(covarProdMat)
  R_j_1j_2 = abs(cummulant_3_way/newDenom)
  R_j_1j_2[is.na(R_j_1j_2)] = 0####Set the NAs all to zero. This smooths out sorting problems.
  #####The sort() function cannot return indices AND remove NAs for some reason.
  return(R_j_1j_2)####A d by d matrix. All values are non-negative; we are only looking at magnitude.
}
bigR = calculateR_j_1j_2(denom = denoms, cummulant_3_way = numerators, d = 2000)
sortedR = sort(bigR, index.return = TRUE, decreasing = TRUE)####R stores matrices column-major, so sortedR$ix holds column-major linear indices (NAs are dropped).
columnNums = (head(sortedR$ix, n = 10)-1) %/% 2000 + 1
rowNums = (head(sortedR$ix, n = 10)-1) %% 2000 + 1
columnNums
rowNums
rbind(rowNums, columnNums)
#####This extracts the values in the R_j_1j_2 matrix at the specified indices.
#####The statistic is symmetric in (j_1, j_2) but only the upper triangle of bigR is filled,
##### so we flip the indices whenever the row number exceeds the column number.
for(index in 1:10)
{
  if(rowNums[index] <= columnNums[index])
  {
    print(bigR[rowNums[index], columnNums[index]])
  } else
  {
    print(bigR[columnNums[index], rowNums[index]])
    print("Flipped index")
  }
}
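#####An equivalent cross-check of the index arithmetic above (a sketch): base R's arrayInd()
##### maps column-major linear indices straight to (row, column) pairs.
arrayInd(head(sortedR$ix, n = 10), .dim = dim(bigR))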
prepForp_j <- function(sortedIndicies, d)
{
  firstDfeatures = sortedIndicies[1:d]
  containsEach = 1:8 %in% firstDfeatures###Which of the 8 informative columns appear in the top d.
  return(containsEach)
}
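####Hypothetical usage of prepForp_j (a sketch with made-up ranks, just to show the output shape):
#### it flags which of the 8 informative columns appear among the top d ranked features.
prepForp_j(c(3, 41, 7, 1, 8, 250, 5, 2, 6, 4), d = 6)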
#### We report the minimum model size required to contain all true predictors.
###################################################################################################
###################################################################################################
# This is where we screen for interaction. The run time on this could be long.
findTrueModelIndices <- function(row_col_mat = matrix(1:8, nrow = 2, ncol = 4), p)
{###Each column of row_col_mat is a true (row, column) interaction pair; the default gives (1,2), (3,4), (5,6), (7,8).
  indices = apply(row_col_mat, 2, function(rowCol){return(c(rowCol[1] + p*(rowCol[2]-1)))})#,rowCol[2] + p*(rowCol[1]-1)))})
  ###The commented-out piece would also add the transposed (lower-triangle) index, for use if sort() could not ignore NAs.
  return(indices)#####A vector of column-major linear indices of the true interactions in the p by p matrix.
}
rawIndex = matrix(1:8, nrow = 2, ncol = 4)
rawIndex
trueindex = findTrueModelIndices(p = 500)
c(503, 501, 1005) %in% trueindex###Only 501 (entry (1,2) for p = 500) should come back TRUE.
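####Quick check of the column-major index convention (a sketch): in a p by p matrix filled with 1:p^2,
#### the entry at (row = 1, column = 2) has linear index 1 + p*(2-1) = 501 for p = 500.
linIdx = matrix(seq_len(500^2), nrow = 500, ncol = 500)
linIdx[1, 2] == trueindex[1]###Should be TRUE.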
###########################
runSimCATrend_Interaction <- function(n, p, preseed = 7919, trueMod_Indices)
{
  set.seed(preseed)
  #print(trueMod_Indices)
  Y = makeYBinary(n, percent0 = 0.5)
  covarsX_ij = makeX_ijForInteraction(n = n, p = p, numInteract = 4, Y = Y)
  numerators = makeNumerators(X_ij = covarsX_ij, Y = Y, p = p)
  denominators = make_COVCOVDemominators(p = p, Y = Y, X_ij = covarsX_ij, n = n)
  bigRmatrix = calculateR_j_1j_2(denom = denominators, cummulant_3_way = numerators, d = p)###Throw no possibilities out.
  #####I might/should also run some simulations that drop some covariates (e.g. only keep the top 100 or so).
  sortedR = sort(bigRmatrix, index.return = TRUE, decreasing = TRUE)####Column-major linear indices; NAs are dropped.
  #sortedR = sort(numerators, index.return = TRUE, decreasing = TRUE)
  #print(c("Largest value", max(abs(bigRmatrix), na.rm = TRUE)))
  in.TrueModel = (sortedR$ix %in% trueMod_Indices)#####trueMod_Indices is passed in as a parameter.
  minModelSize = max(which(in.TrueModel))#####The smallest model size required to include all true interactions.
  return(minModelSize)
}
runSimCATrend_Interaction(n = 50, p = 300, preseed = round(100*runif(n = 1, min = 0, max = 20)), trueMod_Indices = findTrueModelIndices(p = 300))
reps = 100
minModelSizes = vector(length = reps)
trueIndex = findTrueModelIndices(p = 3000)
timestamp()##------ Wed Nov 16 17:24:38 2016 ------##
for(w in 1:reps)
{
  if(w %% 25 == 0)
  {print("Benchmark")}
  newSeed = 7919 + 20*w + w^2 - w^3
  simList = runSimCATrend_Interaction(n = 200, p = 3000, preseed = newSeed, trueMod_Indices = trueIndex)
  minModelSizes[w] = simList
  #p_j_forD8[w,] = simList[[2]]
  #p_j_forD10[w,] = simList[[3]]
  #p_j_forD12[w,] = simList[[4]]
}
timestamp()##------ Thu Nov 17 11:19:01 2016 ------##
mean(minModelSizes)#3729128
#Only the p(p-1)/2 = 4498500 upper-triangle pairs get ranked (the NA entries are dropped by sort),
# so on average the last true interaction sits roughly 83% of the way down that ranking.
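####A fuller summary than the mean alone may be informative here (a sketch):
summary(minModelSizes)
quantile(minModelSizes, probs = c(0.25, 0.5, 0.75, 0.9))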