-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathcatgNoise.r
115 lines (111 loc) · 4.63 KB
/
catgNoise.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# title: Interactive RStudio Simulation of Noise and Categorization
# major: statistical pitfalls
# minor: categorization
# Interactive demonstration of power loss by categorizing
# a single x in a linear model, and by introducing
# measurement error in x. The categorical analysis amounts to a
# two-sample t-test, and the continuous test a test of whether the
# slope is zero after fitting a straight line. If the user specifies
# a change in slope after x=0, the slope test will be suboptimal.
# But it will often still be better than the test using categorization.
# If the true relationship is nonlinear, a better approach is to fit a
# nonlinear relationship. Here the power of a simple quadratic
# regression is shown.
#
# Frank Harrell Vanderbilt Biostatistics 8May11
sim <- function(n=200, nsim=1000, slope=1, slope2=1, sigma=4, sigmaNoise=0,
cutpt=0, eerror=FALSE)
{
x <- rnorm(n)
zcont <- zcat <- r2cont <- r2cat <- pcont <- pquad <- pcat <-
numeric(nsim)
par(mfrow=c(2,2))
cl <- gray(.75)
xnoise <- if(sigmaNoise == 0) x else x + rnorm(n, 0, sd=sigmaNoise)
stats <- function(x, y)
{
k <- lm.fit.qr.bare(x, y)
r2 <- k$rsquared
a <- NCOL(x)
b <- NROW(x) - a - 1
fstat <- r2/(1 - r2) * b / a
p <- 1 - pf(fstat, a, b)
list(r2 = r2, p=p, z=-qnorm(pmax(p, 1e-15)/2))
}
for(i in 1:nsim)
{
if(i == 1)
{
rr <- range(x, xnoise)
plot(x, xnoise, xlab='Original x', ylab='x with Noise',
sub='First iteration', cex.sub=.6,
xlim=rr, ylim=rr,
cex.lab=.8, cex.main=.9, main='Effect of Noise')
abline(a=0, b=1, col=cl)
abline(h=cutpt, col=cl)
propwrong <- mean((x > cutpt) != (xnoise > cutpt))
ee <- paste('Prop. misclassified >', cutpt,':', round(propwrong, 2), sep='')
if(eerror)
{
diff <- mean(x[x > cutpt]) - mean(x[x <= cutpt])
pdiff <- propwrong * diff
ee <- paste(ee, '\nEffective |error|:', round(pdiff, 2), sep='')
}
text(rr[2], rr[1]+.06*diff(rr), ee, adj=1, cex=.7, col='blue')
text(rr[1], rr[2]-.02*diff(rr), paste('Mean |error|:',
round(mean(abs(x - xnoise)), 2), sep=''), adj=0, cex=.7, col='blue')
lines(c(min(x), 0), slope*c(min(x), 0), col='red')
lines(c(0, max(x)), slope*c(0, max(x)) +
(slope2 - slope)*c(0, max(x)), col='red')
title(sub='Line:\nTrue relationship with Y', col.sub='red', cex.sub=.7, adj=1, line=2.75)
}
y <- slope*x + (slope2 - slope)*pmax(x, 0) + rnorm(n, 0, sd=sigma)
k <- stats(xnoise, y)
zcont[i] <- k$z
r2cont[i] <- k$r2
pcont[i] <- k$p
pquad[i] <- stats(cbind(xnoise, xnoise^2), y)$p
k <- stats(1*(xnoise < cutpt), y)
zcat[i] <- k$z
r2cat[i] <- k$r2
pcat[i] <- k$p
}
rr <- range(c(r2cont, r2cat))
plot(r2cont, r2cat, xlim=rr, ylim=rr,
xlab=expression(paste(R^2, ' continuous')),
ylab=expression(paste(R^2, ' dichotomous')),
cex.lab=.8, main='Explained Variation in Y', cex.main=.9)
abline(a=0, b=1, col=cl)
rr <- range(c(zcont, zcat))
plot(zcont, zcat, xlim=rr, ylim=rr,
xlab='Z-value r-test',
ylab=expression(paste('Z-value', phantom(.), t,
phantom(.), 'test')), main='Z Statistics',
cex.lab=.8, cex.main=.9)
abline(a=0, b=1, col=cl)
powercont <- round(mean(pcont < 0.05), 3)
powerquad <- round(mean(pquad < 0.05), 3)
powercat <- round(mean(pcat < 0.05), 3)
pow <- format(c(powercont, powerquad, powercat))
plot.new()
text(c(.1,.7), c(.92,.92), c('x Modeled As','Power'),
adj=0, cex=.9)
lines(c(.1, 1), c(.8, .8), col=cl)
lines(c(.67,.67), c(.98,.55), col=cl)
text(c(.1,.1), c(.7, .6, .5),
c('Linear', 'Quadratic', paste('Cut at', cutpt)), adj=0, cex=.8)
text(c(.92, .92), c(.7, .6, .5), pow, adj=1, cex=.8)
text(.1, .25, paste(nsim, 'simulations'), adj=0, cex=.8)
}
require(manipulate)
require(Hmisc)
manipulate(sim(n=n, nsim=nsim, sigma=sigma, sigmaNoise=sigmaNoise,
slope2=slope2, cutpt=Cutpoint),
n=slider(200, 1000, step=100),
sigma=slider(2, 10, label='Residual S.D.'),
slope2=slider(-1, 3, step=.25, initial=1, label='Slope after x=0'),
sigmaNoise=slider(0, 5, step=.5, label='Noise S.D.'),
Cutpoint=slider(0, 3, step=.5,
label='Cut point (optimal=mean x=0)'),
nsim=slider(300,2000, step=100, initial=500,
label='No. of simulations'))