-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcp_subset_controls.R
180 lines (125 loc) · 7.54 KB
/
cp_subset_controls.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#List of controls to be used for subsetting signatures of controls
#Returns: list of cell lines with controls for compounds available (list_ctl).
#Subset of sig_ids that match the cell lines with available controls (sigs)
source('load_data.R')
library(stringr)
library(skimr)
library(plyr)
library(dplyr)
# 1. Selection of controls for analysis of compounds
# select consensus controls: "ctl_vehicle.cns" and "ctl_trt.cns"
sig_ctl <- siginfo %>% filter(stringr::str_detect(pert_type,"ctl_") & str_detect(pert_type,".cns"))
sig_ctl <- sig_ctl %>% filter(pert_type != "ctl_vector.cns" )
#select tissues that have samples from both normal and tumor cells
#adding the information about cells
sig_ctl <- merge(sig_ctl, cellinfo, by = "cell_id")
#list of tissues with controls of tumor and normal cells
aux_table <- sig_ctl %>% group_by(primary_site, sample_type) %>%
filter(sample_type == "tumor" | sample_type == "normal") %>% group_by(primary_site)
#getting the list of sig_ids for controls
list_ctl <- subset(sig_ctl, primary_site %in% aux_table$primary_site, select = c(sig_id, pert_iname, cell_id, primary_site, sample_type, pert_time))
#-------------------------------------------------------------------------------------------------------------------------------#
#2. Selection of signatures that match the controls
# - select pertubations with compounds - "trt_cp"
# - select signatures that match set of controls by: cell id and pert_itime
# If two different pertubagens in the control are available, keep both
#Only include signatures with more than 3 replicates
#adding important columns in the sigmetrics dataset
index <- match(sigmetrics$sig_id, siginfo$sig_id)
siginfo$distil_nsample <- sigmetrics$distil_nsample[index]
sigs <- subset(siginfo, pert_type == "trt_cp" & cell_id %in% list_ctl$cell_id & pert_time %in% list_ctl$pert_time & distil_nsample>=3)
#------------------------------------------------------------------------------------------------------------------------------#
#3. Threshold of "modulating z-score" for each cell line and controls
#given a cell line, extract the associated consensus controls, and determine the maximum z-score for a given quantile
#' Computes the 1% and 99% quantile of the distribution of expression data for a gene for each experiment involving a control.
#'
#' @param cell cell_id to compute the threshold
#' @param list_ctls List of signatures id associated with controls.
#' @param pert The type of control to be used. Options are "DMSO", "UnTrt", "H2O". The default uses "DMSO" and "UnTrt". This ignores controls with H2O, because they consistently have different results, and are available only for a few cell lines)
#' @return Dataframe with the 1% and 99% quantile of each selected control
#'
get_thresholds <- function(cell = "MCF7", list_ctls = list_ctl, pert = FALSE){
#in case the threshold should be computed using only one kind of control, inform name (options: "DMSO", "UnTrt", "H2O")
#The default ignores controls with H2O (consistently different results, and available only for a few cell lines)
if (pert == FALSE){
ctl_cell <- subset(list_ctls, cell_id == cell & pert_iname != "H2O")
} else{
ctl_cell <- subset(list_ctls, cell_id == cell & pert_iname == pert)
}
#getting the gene expression data for all controls associated with this cell_id
ds <- cmapR::parse.gctx(ds_path,
cid=ctl_cell$sig_id)
expr_data <- as.data.frame(ds@mat)
quantile <- data.frame(sig_id= character(ncol(expr_data)), pert_iname = character(ncol(expr_data)), first = numeric(ncol(expr_data)), last = numeric(ncol(expr_data)),
stringsAsFactors = FALSE)
for (i in 1:ncol(expr_data)){
sig <- as.character(ctl_cell[i,1])
quantile[i,1] <- sig
quantile[i,2] <- as.character(ctl_cell[i,1])
quantile[i,3:4] <- quantile(expr_data[,i] , c(.01, .99))
i = i+1
}
return(quantile)
}
get_z_threshold <- function(cell_id, list_ctls = list_ctl, pert = FALSE){
ctls_data <- get_thresholds(cell = cell_id, list_ctls, pert)
#computing the mean of the 1% z-scores threshold across all controls
zs <- as.data.frame(abs(c(ctls_data$first, ctls_data$last)))
return(skim_to_wide(zs))
}
#' Computes the summary statistics of the 1% and 99% quantile of the distribution of expression data for all controls for a cell line.
#'
#' @param list_cells cell_id or vector of cell_id to compute the threshold
#' @param tissue Set to false when searching based on cell_id. Otherwise, select here the name of the tissue of interest
#' @param list_ctls List of signatures id associated with controls.
#' @param pert The type of control to be used. Options are "DMSO", "UnTrt", "H2O". The default uses "DMSO" and "UnTrt". This ignores controls with H2O, because they consistently have different results, and are available only for a few cell lines)
#' @return Dataframe with the 1% and 99% quantile of each selected control
#' @examples
#' get_z_list(c("MCF7", "A375"), tissue = FALSE)
#' get_z_list(list_cells = FALSE, tissue = "breast")
get_z_list <- function(list_cells = "MCF7", tissue = FALSE, list_ctls = list_ctl, pert = FALSE){
#Geting the list of cell_id requested
if(tissue != FALSE){
cells <- subset(list_ctls, primary_site %in% tissue, cell_id)
cells <- unique(cells$cell_id)
}else if(any(list_cells == "all")){
cells <- subset(cellinfo, cell_id %in% list_ctl$cell_id)
cells <- unique(cells$cell_id)
}else{
cells <- unique(list_cells)
}
z_control <- plyr::adply(matrix(cells), 1, get_z_threshold)
z_control$variable <- cells
z_control$X1 <- NULL
return(z_control)
}
#------------------------------------------------------------------------------------------------------------------------------#
#4. Threshold of Signature Strength for each cell line
#' Computes the Signature Strength of each control signature
#' The Connectivity Map dataset contains an annotation of how many landmark genes were up and down modulated by each signature.
#' This can be considered a proxy of specificity of the action of a perturbation - if the signature strength is high, one could argue that this affects a lot of genes.
#' On the other hand, if it is low, there could imply a more specific relationship between the perturbation and the affected genes.
#'
#' @param cells cell_id to select the controls to compute the signature stregth
#' @param list_ctls List of signatures id associated with controls.
#' @param pert The type of control to be used. Options are "DMSO", "UnTrt", "H2O". The default uses "DMSO" and "UnTrt". This ignores controls with H2O, because they consistently have different results, and are available only for a few cell lines)
#' @return Dataframe with number of landmark genes up and down modulated by each gene signature
#' @example
#' get_ss_threshold("MCF7")
get_ss_threshold <- function(cell, list_ctls = list_ctl, pert = FALSE){
if (pert == FALSE){
ctl_cell <- subset(list_ctls, cell_id == cell & pert_iname != "H2O")
} else{
ctl_cell <- subset(list_ctls, cell_id == cell & pert_iname == pert)
}
modulating <- data.frame(sig_id= character(nrow(ctl_cell)),
ngenes_modulated_dn_lm = numeric(nrow(ctl_cell)), ngenes_modulated_up_lm = numeric(nrow(ctl_cell)), stringsAsFactors = FALSE)
for (i in 1:nrow(ctl_cell)){
sig <- as.character(ctl_cell[i,1])
modulating[i,1] <- sig
lm_mod <- subset(sigmetrics, sig_id == ctl_cell[i,1], c(ngenes_modulated_dn_lm, ngenes_modulated_up_lm))
modulating[i,2:3] <- lm_mod
i = i+1
}
return(modulating)
}