-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path.Rhistory
203 lines (203 loc) · 9.06 KB
/
.Rhistory
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
list.files()
fst_file <- 'fst.CEU_CHB.region3.weir.fst'
fst <- read.table(fst_file, header = TRUE)
head(fst_)
head(fst)
top_snp <- subset(fst, WEIR_AND_CHOCKERHAM_FST == max(fst$WEIR_AND_COCKERHAM_FST))
top_snp <- subset(fst, WEIR_AND_COCKERHAM_FST == max(fst$WEIR_AND_COCKERHAM_FST))
top_snp
max(fst$WEIR_AND_COCKERHAM_FST)
top_snp <- subset(fst, WEIR_AND_COCKERHAM_FST != 'NaN'))
subset(fst, WEIR_AND_COCKERHAM_FST != 'NaN'))
fst <- subset(fst, WEIR_AND_COCKERHAM_FST != 'NaN')
head(fst)
top_snp <- subset(fst, WEIR_AND_COCKERHAM_FST == max(fst$WEIR_AND_COCKERHAM_FST))
top_snp
list.files()
CEU_frq <- read.table('auton_p1_CEU.frq', header = TRUE)
?read.table
CEU_frq <- read.table('auton_p1_CEU.frq', header = FALSE, skip = 1)
head(CEU_frq)
names(CEU_frq) <- c('CHROM','POS','CHROM', 'N_ALLELES', 'N_CHR','FREQ')
CHB_frq <- read.table('auton_p1_CHB.frq', header = FALSE, skip = 1)
names(CHB_frq) <- c('CHROM','POS','CHROM', 'N_ALLELES', 'N_CHR','FREQ')
head(CEU_frq)
head(CHB_frq)
top_snp <- subset(CEU_frq, POS = top_snp$POS)
top_snp <- subset(fst, WEIR_AND_COCKERHAM_FST == max(fst$WEIR_AND_COCKERHAM_FST))
top_snp_frq <- subset(CEU_frq, POS = top_snp$POS)
top_snp_frq
top_snp$POS
top_snp_frq <- subset(CEU_frq, POS == top_snp$POS)
top_snp_frq
cbind(CEU_frq, 'CEU')
CEU_frq <- read.table('auton_p1_CEU.frq', header = FALSE, skip = 1)
CEU_frq <- cbind(CEU_frq, 'CEU')
names(CEU_frq) <- c('CHROM','POS','CHROM', 'N_ALLELES', 'N_CHR','FREQ', 'POP')
CHB_frq <- read.table('auton_p1_CHB.frq', header = FALSE, skip = 1)
CHB_frq <- cbind(CHB_frq, 'CHB')
names(CHB_frq) <- c('CHROM','POS','CHROM', 'N_ALLELES', 'N_CHR','FREQ', 'POP')
top_snp_frq <- subset(CEU_frq, POS == top_snp$POS)
top_snp_frq
top_snp_frq <- rbind(top_snp_frq,
subset(CHB_frq, POS == top_snp$POS))
top_snp_frq
print (top_snp_freq)
print (top_snp_frq)
?print
print (top_snp_frq[1:7])
cat (top_snp_frq)
print (top_snp_frq)
print (top_snp_frq[,c('CHROM', 'POS', 'FREQ', 'POP')])
Auton Part 2
========================================================
## Calculation of Fst scores ##
The Fst scores calculated using the Weir and Cockerham method comparing the populations are shown in Table 1. The pairwise Fst values show lower differentiation between African-European and African-Asian pairs and higher level of differentiation between European-Asian populations. The higher level of differentiation between European and Asian populations because patterns of migration through human history show that humans moved out of Africa to eventually form the different populations. Therefore, the African population serves as the evolutionary common ancestor to the European and Asian populations. In otherwords, after migrating out of Africa, the European and Asian populations took different evolutionary paths and the lower level of differentiation between each population and the African population points to the common ancestors that each population had.
**Table 1: Mean and Weighted Fst scores**
Population | Mean Fst | Weighted Fst
-----------|--------- | ---
CEU v YRI | 0.22196 | 0.072607
CEU v CHB | 0.19103 | 0.066182
YRI v CHB | 0.22196 | 0.072607
```{r, echo=FALSE}
## read fst files
fst_file <- 'fst.CEU_CHB.region3.weir.fst'
fst <- read.table(fst_file, header = TRUE)
fst <- subset(fst, WEIR_AND_COCKERHAM_FST != 'NaN')
top_snp <- subset(fst, WEIR_AND_COCKERHAM_FST == max(fst$WEIR_AND_COCKERHAM_FST))
```
## Top SNP in region 3 ##
The top snp is located on Chromosome `r top_snp$CHROM` at position `r top_snp$POS`. A search of the dbSNP database shows this position maps to rs17261772. This SNP is located within exon 19 of the RAB3GAP1 gene. RAB3GAP1 (RAB3 GTPase activation protein subunit 1) encodes for the catalytic subunit of Rab GTPase activating protein complex. Mutations in this gene have been found in Warburg micro syndrome. Warburg micro syndrome is an autosomal recessive syndrome that is characterized by abnormalities in the eye, central nervous system and genitalia {Aligianis 2005}. There is a possible founder effect in the Danish population indicated by a common haplotype shared by unrelated individuals in that population {Morris-Rosendahl 2010}.
### Description of allele frequency ###
rs17261772 is a tri-allelic SNP with C/G/T as possible alternative alleles. The C allele leads to a synonymous variant while the G allele leads to a missense variant. In the CHB population, the SNP is monomorphic for the C allele while in the CEU population, the T and C alleles are present. The following table shows the allele frequencies in the dataset.
```{r, echo=FALSE}
## Read allele frequency data for CEU and CHB populations to
## determine the relative frequency of rs17261772
CEU_frq <- read.table('auton_p1_CEU.frq', header = FALSE, skip = 1)
CEU_frq <- cbind(CEU_frq, 'CEU')
names(CEU_frq) <- c('CHROM','POS','CHROM', 'N_ALLELES', 'N_CHR','FREQ', 'POP')
CHB_frq <- read.table('auton_p1_CHB.frq', header = FALSE, skip = 1)
CHB_frq <- cbind(CHB_frq, 'CHB')
names(CHB_frq) <- c('CHROM','POS','CHROM', 'N_ALLELES', 'N_CHR','FREQ', 'POP')
top_snp_frq <- subset(CEU_frq, POS == top_snp$POS)
top_snp_frq <- rbind(top_snp_frq,
subset(CHB_frq, POS == top_snp$POS))
```
**Table 2: Allele frequency in dataset**
```{r, echo=FALSE}
print (top_snp_frq[,c('CHROM', 'POS', 'FREQ', 'POP')])
```
** Table 3: rs17261772 allele frequency in various populations**
Population | T frequency | C frequency
---|---|---
All | 27% | 73%
EUR | 59% | 41%
AFR | 41% | 96%
ASN | 0% | 100%
In most other populations, the T allele is the minor allele. Table 3 shows the allele frequency for different populations taken from 1000 genomes data. The high frequency of T allele in the CEU population is most likely due to a founder effect where a subset of the population that migrated into Europe contained the T allele and because both the T and C alleles code for the same amino acid, Phe, there was no selective pressure to keep one allele over the other.
print (top_snp_frq[,c('CHROM', 'POS', 'FREQ', 'POP')])
print (top_snp_frq[,c('CHROM', 'POS', 'FREQ', 'POP')], row.names = FALSE)
97768/16849745
setwd("~/GitHub/cge_final/cge_final")
# Read ped file with genotype data
ped <- read.csv("./questiondat_tao.csv", header=TRUE)
## check missing along columns. Use inline function to calculate the missingness
## for each SNP
missing <- apply(ped,2,function(x) sum(is.na(x))/length(x))
## remove SNPs with missing rate > 10%
missing <- subset(missing, (missing > 0.1))
ped <- ped[,(!names(ped) %in% names(missing))]
### Create a function to performe Hardy-Weinberg test
HWtest <- function(x){
# Remove NA values from input
xnna=x[!is.na(x)]
# Calculate some ratio... what is it?
pbar<-sum(xnna)/(length(xnna)*2)
E=c((1-pbar)^2,2*pbar*(1-pbar),pbar^2)*length(xnna)
# get a count of the different genotypes
O=table(xnna)
t=sum((O-E)^2/E)
p=1-pchisq(t,df=1)
return(c(pbar,p))
}
hwe_test <- apply(ped[2:dim(ped)[2]],2,function(x)HWtest(x)) ### for cases and controls
hwe_test <- apply(ped[,2:dim(ped)[2]],2,function(x)HWtest(x)) ### for cases and controls
?apply
hwe_test <- apply(ped,2,function(x)HWtest(x)) ### for cases and controls
HWtest(ped[2,])
HWtest(ped[3,])
HWtest(ped[,2])
HWtest(ped[,3])
HWtest(ped[,1])
alleles <- ped[,(2:dim(ped)[2])]
hwe_test <- apply(alleles,2,function(x)HWtest(x)) ### for cases and controls
HWtest(alleles[,1])
HWtest(alleles[,2])
HWtest(alleles[,3])
HWtest(alleles[,4])
HWtest(alleles[,5])
HWtest(alleles[,6])
HWtest(alleles[,7])
HWtest(alleles[,8])
HWtest(alleles[,9])
table(alleles[,9])
table(alleles[,10])
testdat <- as.data.frame(ped)
names(ped)[1:5]
summary(glm(DPHENOTY~rs10508547,family="binomial",data=testdat))
summary(glm(PHENOTY~rs10508547,family="binomial",data=testdat))
testdat$PHENOTY <- factor(testdat$PHENOTY)
summary(glm(PHENOTY~rs10508547,family="binomial",data=testdat))
names(testdat)[2:length(names(testdat))
]
glm_tests <- list()
?append
for (snp in names(testdat)[2:length(names(testdat))]){
snp_glm <- glm(PHENOTY~snp, family = 'binomial', data = testdat)
glm_tests <- c(glm_testssnp_glm)
}
eval(snp)
snp_glm <- glm(PHENOTY~eval(snp), family = 'binomial', data = testdat)
snp_glm <- glm(PHENOTY~rs10508547, family = 'binomial', data = testdat)
?glm
as.formula(paste('PHENOTY ~ ', snp))
snp_glm <- glm(as.formula(paste('PHENOTY~'snp), family = 'binomial', data = testdat)
as.formula(paste('PHENOTY~',snp)
)
snp_glm <- glm(as.formula(paste('PHENOTY~',snp)), family = 'binomial', data = testdat)
snp_glm
summary(snp_glm)
?list
glm_tests <- c(glm_tests,snp = snp_glm)
glm_tests
gc()
rm(glm_tests)
gc()
head(hwe_test)
alleles <- ped[,(2:dim(ped)[2])]
hwe_test <- apply(alleles,2,function(x)HWtest(x)) ### for cases and controls
total <- dim(testdat)[1]
total
?textProgressBar
gc()
ls()
gc()
names(gene_beta) <- c('probe', 'gene', 'norm_beta', 'braca_beta')
ls()
rm(list = ls())
gc()
annot <- read.table('illumina550_chr10.txt', header = TRUE)
wilcox.test(1,20,alternative='two.sided')
wilcox.test(1,20000,alternative='two.sided')
wilcox.test(1,20000,nu=0,alternative='two.sided')
wilcox.test(1,20000,mu=0,alternative='two.sided')
wilcox.test(1,1,mu=0,alternative='two.sided')
wilcox.test(1,2,mu=0,alternative='two.sided')
wilcox.test(c(1,2,1,1,1,1),c(2,2,2,2,2,2),mu=0,alternative='two.sided')
?abs
?pnorm
?prop.test
t.test
?t.test
?anova
?pnorm