-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLongFormChrom1.R
101 lines (67 loc) · 3.14 KB
/
LongFormChrom1.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#source("https://bioconductor.org/biocLite.R")
####install.packages(c("binomTools","feather", "readr"),lib=c("/uufs/chpc.utah.edu/common/home/u6011224/software/pkg/RLibs/3.3.2i"),
#### repos=c("http://cran.us.r-project.org"),verbose=TRUE)
source("Interact Functions.R")
library(snpStats)
library(R.utils)
library(base)
library(boot)
library(binomTools)
library(data.table)
library(dplyr)
library(magrittr)
library(feather) # v0.0.0.9000
library(readr) # v0.2.2
###Although this BIM table is only assciated with the Y = 1 group, the BIM table for the control (Y = 0) group is identical.
###Because of this, I just use the BIM table from the Y = 1 group as the main BIM table.
BIMTable = fread("NICHD_PolycysticOvary_c1_PCOS.bim",
col.names = c("Chromosome", "ID","GeneticDist", "Position", "Allele1", "Allele2"))
BIMTable = BIMTable[,-3] ###Remove the GeneticDistance column. Has no information.
#dim(filter(BIMTable, Chromosome %in% 1:23, Position>0))###The "cleansed" table we want. 729286 (maybe messed up by find replace) by 5
cleanBIMTable = filter(BIMTable, Chromosome %in% 1:23, Position>0)
filter(cleanBIMTable, Chromosome == 1) %>%
select(ID) ->
name1
# genotypes
########Clean bed 1
bed1 = read.plink("NICHD_PolycysticOvary_c1_PCOS.bed", select.snps = name1[,1])
bed1 = bed1$genotypes
bed0 = read.plink("NICHD_PolycysticOvary_c2_NUGENE.bed", select.snps = name1[,1])
bed0 = bed0$genotypes
genotype = rbind(bed1, bed0)
snpsum.col <- col.summary(genotype)
call <- 0.95 ###Why remove low call rate?
use <- with(snpsum.col, (!is.na(Call.rate) & Call.rate >= call))
use[is.na(use)] <- FALSE
cat(ncol(genotype)-sum(use),"SNPs will be removed due to low call rate.\n")
genotype <- genotype[,use]
snpsum.col <- snpsum.col[use,]
minor <- 0.1
use1 <- with(snpsum.col, (!is.na(MAF) & MAF > minor) )
use1[is.na(use1)] <- FALSE
cat(ncol(genotype)-sum(use1),"SNPs will be removed due to low MAF .\n" )
genotype <- genotype[,use1]
snpsum.col <- snpsum.col[use1,]
name1 = rownames(snpsum.col)###Update the chromsomes we are looking at.
class(genotype) = "matrix"
###################################################################
###################################################################
###################################################################
###################################################################
###################################################################
###################################################################
###################################################################
###################################################################
###print("Line 39 executed")
X = apply(genotype, 2, as.numeric)
Y = c(rep(1, 1043), rep(0, 3056))
data = cbind(Y, X)
##rm(X)####delete X, it does not need to be reference again.
#smallData = data[,c("Y", name7[1:4000,1])]###Dimensions are off here.
#dim(smallData)
# Iterative CA
timestamp()
R3LongPrint(data, names = name1, resultFileName = "chrom1LEAN.csv")
timestamp()
###which(result != 0, arr.ind = TRUE)
##fwrite(as.data.frame(result), file = "chrom7Short.csv", row.names = TRUE, col.names = TRUE