-
Notifications
You must be signed in to change notification settings - Fork 0
/
4_FilterRNA.R
120 lines (92 loc) · 3.58 KB
/
4_FilterRNA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#### Initialize ----
# Set the working directory to the folder containing this script so that the
# relative "data/" and "results/" paths used below resolve.
# The script location is obtained through the RStudio API, which only works
# inside a running RStudio session with a saved document. In any other
# situation (e.g. Rscript) the current working directory is kept and a message
# is emitted instead of aborting.
if (requireNamespace("rstudioapi", quietly = TRUE) && rstudioapi::isAvailable()) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
  if (nzchar(script_path)) {
    setwd(dirname(script_path))
  } else {
    message("Active document has no path; keeping working directory: ", getwd())
  }
} else {
  message("RStudio API not available; keeping working directory: ", getwd())
}
# clear workspace (also removes the temporary script_path created above)
rm(list = ls())
library(tidyverse)
library(openxlsx)
library(maplet)
#### Load mapping and annotation files ----
# load preprocessed metabolomics data
# load() restores every object stored in the workspace file into the current
# environment. The rest of this script uses `mm` and `mapping_file`, neither
# of which is created in this file, so they are presumably provided by this
# workspace -- verify against the script that writes it.
load("results/Workspace_3_FilterMetabo.Rdata")
#### Load Transcriptomics Data ----
# Read one comma-separated expression table per cohort listed in `mm`.
# The first column of each file becomes the row names and the first line the
# column names. The result is a list of data frames named by cohort.
# seq_along() keeps the loop empty (instead of iterating over c(1, 0)) when
# `mm` has no cohorts.
rna <- lapply(seq_along(mm$Cohort), function(i) {
  read.table(
    file = paste0("data/transcriptomics_processed/", mm$RNAFile[i]),
    sep = ",",
    row.names = 1,
    header = TRUE
  )
})
names(rna) <- mm$Cohort
#### Subset data to only samples with both metabolomics and rna data ----
# For every cohort keep only the expression columns whose name appears among
# the mapping file's RNA IDs for that cohort. The IDs are passed through
# make.names() before comparison, presumably because the column names were
# sanitised the same way when the tables were read -- confirm.
# Iterating over a self-named vector of cohort names keeps the result a list
# named by cohort.
rna <- lapply(setNames(names(rna), names(rna)), function(cohort) {
  # %in% (rather than ==) so that mapping rows with a missing Cohort never
  # contribute NA entries to the ID list
  cohort_ids <- make.names(mapping_file$RNAID[mapping_file$Cohort %in% cohort])
  expr <- rna[[cohort]]
  # drop = FALSE: a cohort with exactly one matching sample must stay a
  # one-column data frame instead of collapsing to a bare vector
  expr[, colnames(expr) %in% cohort_ids, drop = FALSE]
})
#### Reorder samples so that they are matched correctly between metabolomics and rna data ----
# reorder samples to match with mapping file
# For every cohort the expression columns are rearranged into the order in
# which the cohort's RNA IDs (after make.names()) appear in the mapping file.
rna <- lapply(setNames(names(rna), names(rna)), function(cohort) {
  expected_ids <- make.names(mapping_file$RNAID[mapping_file$Cohort %in% cohort])
  col_idx <- match(expected_ids, colnames(rna[[cohort]]))
  # An ID without a matching column yields NA here; indexing with it would
  # fail with an opaque "undefined columns selected", so report it explicitly.
  if (anyNA(col_idx)) {
    stop(
      "Cohort '", cohort, "': RNA IDs from the mapping file not found in the ",
      "expression table: ",
      paste(expected_ids[is.na(col_idx)], collapse = ", "),
      call. = FALSE
    )
  }
  # drop = FALSE keeps a single-sample cohort as a one-column data frame
  rna[[cohort]][, col_idx, drop = FALSE]
})
# check order
# Evaluates to TRUE when, in every cohort, the expression columns appear in
# exactly the order of that cohort's (make.names-sanitised) RNA IDs in the
# mapping file. The value is only displayed, never enforced.
all(vapply(names(rna), function(cohort) {
  expected_ids <- make.names(mapping_file$RNAID[mapping_file$Cohort %in% cohort])
  all(colnames(rna[[cohort]]) == expected_ids)
}, logical(1)))
#### Check initial dimensions ----
# rows x columns of each cohort's expression table
sapply(X = rna, FUN = dim)
#### Separate Tumor and Normal Samples ----
# RNA IDs (sanitised with make.names() to match the expression column names)
# of all samples labelled tumor / normal in the mapping file. Both spellings
# that occur in the TN column are accepted. The ID sets are computed once
# because they do not depend on the cohort.
tumor_ids <- make.names(
  mapping_file$RNAID[mapping_file$TN %in% c("Tumor", "TUMOR")]
)
normal_ids <- make.names(
  mapping_file$RNAID[mapping_file$TN %in% c("Normal", "NORMAL")]
)
# drop = FALSE: a cohort with exactly one tumor (or normal) sample must remain
# a one-column data frame; otherwise it collapses to a vector and the
# column-count filter below no longer works.
rna_T <- lapply(rna, function(x) {
  x[, colnames(x) %in% tumor_ids, drop = FALSE]
})
rna_N <- lapply(rna, function(x) {
  x[, colnames(x) %in% normal_ids, drop = FALSE]
})
# remove empty elements from list of normal samples
rna_N <- rna_N[vapply(rna_N, ncol, integer(1)) > 0]
# check dimensions
sapply(rna_T, dim)
sapply(rna_N, dim)
#### Filter ----
# first remove variables with more than imp.max missing values
# imp.max is the largest per-row fraction (as computed by countmiss() below)
# that filter_func() still accepts; rows with a larger fraction are dropped.
imp.max <- 0.8
# Fraction of a row's entries that repeat the row minimum.
#
# Counts how many entries of `y` are equal to min(y) (NAs are ignored),
# subtracts one, and divides by the number of columns of `x`.
# NOTE(review): this presumably treats repeated minima as imputed values and
# the "- 1" as the single genuinely observed minimum -- confirm against the
# upstream preprocessing.
#
# Arguments:
#   x        table (matrix or data frame) the row was taken from; only
#            ncol(x) is used.
#   y        numeric vector, one row of `x`.
#   imp.max  not used by the computation; kept so existing calls still work.
# Returns:
#   a single number: (number of entries equal to the minimum - 1) / ncol(x).
countmiss <- function(x, y, imp.max) {
  # smallest observed value in the row
  row_min <- min(y, na.rm = TRUE)
  # number of entries equal to that minimum; NA comparisons are not counted
  n_at_min <- sum(y == row_min, na.rm = TRUE)
  # share of the row made up of repeats of the minimum
  (n_at_min - 1) / ncol(x)
}
# Remove uninformative rows from an expression table.
#
# Two filters are applied in sequence:
#   1. rows whose variance across columns is zero or cannot be computed (NA)
#      are dropped;
#   2. of the remaining rows, only those whose countmiss() value is at most
#      `imp.max` are kept.
#
# Arguments:
#   x        data frame (or matrix) with one row per variable and one column
#            per sample.
#   imp.max  largest countmiss() value a row may have to be kept.
# Returns:
#   `x` restricted to the rows passing both filters; always two-dimensional,
#   even when only one column or no rows remain.
filter_func <- function(x, imp.max) {
  # per-row variance across samples
  row_var <- apply(x, 1, var)
  # keep rows with a defined, non-zero variance
  has_variance <- !is.na(row_var) & row_var != 0
  # drop = FALSE: a one-column table must not collapse to a vector, otherwise
  # the apply() below fails
  x <- x[has_variance, , drop = FALSE]
  # per-row fraction of repeated minimum values
  repeat_frac <- apply(x, 1, function(y) {
    countmiss(x, y, imp.max)
  })
  # only keep rows whose fraction does not exceed imp.max
  x[!is.na(repeat_frac) & repeat_frac <= imp.max, , drop = FALSE]
}
# filter
# run filter_func() with the global imp.max threshold on every tumor and
# every normal table
rna_T <- lapply(rna_T, filter_func, imp.max = imp.max)
rna_N <- lapply(rna_N, filter_func, imp.max = imp.max)
# check dimensions
# rows x columns of each table after filtering
sapply(X = rna_T, FUN = dim)
sapply(X = rna_N, FUN = dim)
#### Save filtered Data to File ----
# Combine tumor and normal tables into one list. Each element is named
# "<cohort>_Tumor" or "<cohort>_Normal", tumor tables first.
# Each part is renamed before concatenation, so an empty rna_N (or rna_T)
# simply contributes nothing, rather than producing a descending index range
# such as 4:3.
rna_all <- c(
  setNames(rna_T, sprintf("%s_Tumor", names(rna_T))),
  setNames(rna_N, sprintf("%s_Normal", names(rna_N)))
)
# add one sheet to preprocessed data files
# For every "<cohort>_<Tumor|Normal>" entry of rna_all, the cohort's existing
# workbook is opened, the filtered table is written to a sheet named
# "rna_imputed_filtered_<Tumor|Normal>", and the workbook is saved back in
# place.
invisible(lapply(names(rna_all), function(dataset) {
  print(dataset)
  # Strip only the trailing tissue suffix, so that a cohort name that itself
  # contains "_Tumor" or "_Normal" still maps to the right file.
  cohort <- sub("_(Tumor|Normal)$", "", dataset)
  # text after the last underscore, i.e. "Tumor" or "Normal"
  tissue <- sub("^.*_", "", dataset)
  workbook_path <- sprintf(
    "results/preprocessed_data/PreprocessedData_%s.xlsx", cohort
  )
  sheet_name <- sprintf("rna_imputed_filtered_%s", tissue)
  # load Excel file
  wb <- loadWorkbook(file = workbook_path)
  # addWorksheet() fails if the sheet already exists, so replace a sheet left
  # over from a previous run to keep the script re-runnable
  if (sheet_name %in% names(wb)) {
    removeWorksheet(wb, sheet_name)
  }
  # add sheet
  addWorksheet(wb, sheet_name)
  writeData(
    wb,
    sheet = sheet_name,
    rna_all[[dataset]],
    rowNames = TRUE,
    colNames = TRUE
  )
  # save workbook
  saveWorkbook(wb, workbook_path, overwrite = TRUE)
}))
#### Save Results ----
# Store only the combined list of filtered tumor/normal tables.
save(list = "rna_all", file = "results/Workspace_4_FilterRNA.Rdata")