-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathplot.differential.isoforms.R
230 lines (159 loc) · 9.3 KB
/
plot.differential.isoforms.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
## Q: How many isoforms are expressed per gene per cell line?
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("plyr"))
options(stringsAsFactors=F)
pseudocount = 1e-04
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
##################
# OPTION PARSING
##################
option_list <- list(
make_option(c("-i", "--input_matrix"), help="the matrix with the trancsript rpkms"),
make_option(c("-a", "--annotation"), help="tsv with gene_id, tx_id, gene_type, gene_name, transcript_name IN THIS ORDER"),
make_option(c("-o", "--output"), help="choose the name for the output file"),
make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
make_option(c("-d", "--de"), help='output of edgeR for differentially expressed genes')
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
arguments <- parse_args(parser, positional_arguments = TRUE)
opt <- arguments$options
print(opt)
#--------------------
# FUNCTIONS
#--------------------
# FUNCTION 1
# This function calculates the relative expression of the most expressed isoform of a gene in a given condition.
# It returns NaN when the gene has no isoforms expressed
maj_iso_rel_expr = function(x) {max(x,na.rm=T)/sum(x,na.rm=T)}
# FUNCTION 2
# This function calculates the Shannon entropy for expressed isoforms of a gene in a given condition
# It returns NA when the gene has no isoforms expressed.
# The function is taken from the script:
source('~/R/functions.R')
# convert NAs to nul
na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)}
###########
## HUMAN ##
###########
# (only whole cell PolyA+ is considered, ENCODE Jan11 two bioreplicates)
# gene, tx annotation file (only protein coding gene, but all txs)
human_gn_tx = read.table(opt$annotation, h=F, col.names=c('gene','tx','gene_type','gene_name','transcript_type'))
human_gn_tx = (merge(human_gn_tx, as.data.frame(table(human_gn_tx$gene), responseName='ann_iso'), by.x='gene', by.y='Var1'))
# transcript matrix
human_expr = read.table(opt$input_matrix, h=T)
# read the metadata
mdata = read.table(opt$metadata, h=T)
# replace NAs (bad IDR) with zeros
human_expr[is.na(human_expr)] <- 0
# add the gene locus to each tx
human_tx_gn_expr = merge(human_gn_tx, human_expr, by.x='tx', by.y='row.names')
# DE table
DE = read.table(opt$de, h=T)
DE = subset(merge(human_gn_tx, DE, by.y='row.names', by.x='tx'), FDR<0.01)
DE = merge(DE, as.data.frame(table(DE$gene), responseName='de_tx'), by.y='Var1', by.x='gene')
ggplot(subset(DE, ann_iso<=25)) + geom_boxplot(aes(x=factor(ann_iso), y=de_tx) )
# shown in the slides
case = subset(human_tx_gn_expr, gene=='ENSG00000035681')
df = merge(melt(case), unique(mdata[c('labExpId','cell')]), by.x = 'variable', by.y='labExpId')
df = ddply(df, .(tx,cell), transform, avg=mean(value), sdev = sd(value))
gp = ggplot(subset(df, tx %in% c('ENST00000038176', 'ENST00000523177', 'ENST00000519174', 'ENST00000523106', 'ENST00000521972')),
aes(x=cell, y=avg))
gp = gp + geom_line(aes(group=tx, color=tx, linetype=transcript_type))
gp + geom_errorbar(aes(ymin=avg-sdev, ymax=avg+sdev, color=tx), width=.02)
# for testing
case = subset(human_tx_gn_expr, gene=='ENSG00000010017')
df = merge(melt(case), unique(mdata[c('labExpId','cell')]), by.x = 'variable', by.y='labExpId')
df = ddply(df, .(tx,cell), transform, avg=mean(value), sdev = sd(value))
gp = ggplot(df, aes(x=cell, y=avg))
gp = gp + geom_line(aes(group=tx, color=tx, linetype=transcript_type))
gp + geom_errorbar(aes(ymin=avg-sdev, ymax=avg+sdev, color=tx), width=.02)
case = subset(human_tx_gn_expr, gene=='ENSG00000162517')
df = merge(melt(case), unique(mdata[c('labExpId','cell')]), by.x = 'variable', by.y='labExpId')
df = ddply(df, .(tx,cell), transform, avg=mean(value), sdev = sd(value))
gp = ggplot(df,aes(x=cell, y=avg))
gp = gp + geom_line(aes(group=tx, color=tx, linetype=transcript_type))
gp + geom_errorbar(aes(ymin=avg-sdev, ymax=avg+sdev, color=tx), width=.02)
DE_no_genes = subset(DE, !(gene %in% genes$merged))
genes = head(subset(DE, FDR<0.01 & ann_iso==4 & abs(logFC)>3)$gene,3)
df = merge(melt(subset(human_tx_gn_expr, gene %in% genes)), unique(mdata[c('labExpId','cell')]), by.x = 'variable', by.y='labExpId')
df = ddply(df, .(tx,cell), transform, avg=mean(value), sdev=sd(value))
gp = ggplot(df, aes(x=cell, y=avg))
gp = gp + geom_line(aes(group=tx, color=tx, linetype=transcript_type))
gp = gp + facet_wrap(~gene_name, scale='free_y', nrow=1)
gp + geom_errorbar(aes(ymin=avg-sdev, ymax=avg+sdev, color=tx), width=0.07)
q(save='no')
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#### EXPRESSED ISOFORMS
# count the number of expressed isoforms in each cell line
human_expr_iso = aggregate(human_tx_gn_expr[,-(1:2)], list(gene=human_tx_gn_expr$gene), function(x) sum(!is.na(x)&x!=0))
human_expr_iso_melt = melt(human_expr_iso, variable.name='sample_name', value.name='expr_iso')
#### MAJOR ISOFORM EXPRESSION
# calculate the relative epxression of the most expressed isoform for each gene in each sample
human_rel_expr_maj = aggregate(human_tx_gn_expr[,-(1:2)], list(gene=human_tx_gn_expr$gene), maj_iso_rel_expr)
human_rel_expr_maj_melt = melt(human_rel_expr_maj, variable.name='sample_name', value.name='rel_maj')
#### ISOFORM ENTROPY
# calculate the entropy of expressed isoforms for each gene in each sample
human_entr_iso = aggregate(human_tx_gn_expr[,-(1:2)], list(gene=human_tx_gn_expr$gene), entropy)
human_entr_iso_melt = melt(human_entr_iso, variable.name='sample_name', value.name='entr_iso')
#### ANNOTATED ISOFORMS
# count the number of annotated isoforms for each gene
human_ann_iso = as.data.frame(table(human_tx_gn_expr$gene))
names(human_ann_iso) = c('gene','ann_isoforms')
#### MERGING
# add the annotation information to the expressed isoforms
human_iso_all = merge(human_expr_iso_melt, human_rel_expr_maj_melt, by=c('gene', 'sample_name'))
human_iso_all = merge(human_iso_all, human_entr_iso_melt, by=c('gene', 'sample_name'))
human_iso_all = merge(human_iso_all, human_ann_iso, by = 'gene')
human_iso_all$organism = 'Human'
hs_mm_data_expr_ann_iso = human_iso_all
hs_mm_data_expr_ann_iso = merge(mdata[c("labExpId", opt$fill_by[!is.na(opt$fill_by)], opt$color_by[!is.na(opt$color_by)])],
hs_mm_data_expr_ann_iso, by.x='labExpId', by.y='sample_name')
write.table(hs_mm_data_expr_ann_iso, file=sprintf('%s.summary_isoform_expression.tsv',opt$output), sep='\t', quote=F, row.names=F)
}
if (opt$INPUT!="") {
hs_mm_data_expr_ann_iso = read.table(opt$INPUT, h=T)}
#q(save='no')
# label vectors with the number of genes having a given number of isoforms
#mm_lab = table(subset(hs_mm_data_expr_ann_iso, !duplicated(gene) & rnaExtract=='total')$ann_isoforms)
#head(subset(hs_mm_data_expr_ann_iso, !duplicated(gene) & tolower(rnaExtract)=='longpolya'))
hs_lab = table(subset(hs_mm_data_expr_ann_iso, !duplicated(gene) & organism == 'Human')$ann_isoforms)
#merged_labels = mapply(function(x,y) sprintf('%s\n%s', x,y), as.numeric(hs_lab)[1:30], as.numeric(mm_lab)[1:30])
merged_labels = as.numeric(hs_lab)[1:30]
##########
## plot ##
##########
# COMMENT: I have to add the number of genes in each boxplot
library(ggplot2)
pdf(opt$output, height=7, width=10)
gp = ggplot(subset(hs_mm_data_expr_ann_iso,ann_isoforms<=20 & expr_iso>0),aes(as.factor(ann_isoforms), expr_iso))
#gp = gp + geom_boxplot(fill='green') + ylim(c(0,20))
gp = gp + geom_boxplot(aes_string(fill=na2null(opt$fill_by), color=na2null(opt$color_by))) + ylim(c(0,20))
gp = gp + labs(y='Number of isoforms expressed per gene', x='Number of annotated isoforms per gene')
gp = gp + geom_abline(linetype='dotted', size=2, color='grey')
gp = gp + annotate('text', x = 1:20, y = 0, label = names(hs_lab)[1:20])
gp = gp + scale_fill_manual(values=cbbPalette)
gp = gp + scale_color_manual(values=rev(cbbPalette))
gp = gp + scale_x_discrete(labels=merged_labels) + theme(axis.text.x = element_text(angle=90, vjust=0.5))
#gp = gp + facet_wrap(~labExpId)
gp
gp = ggplot(subset(hs_mm_data_expr_ann_iso,ann_isoforms<=20),aes(as.factor(ann_isoforms), rel_maj))
gp = gp + geom_boxplot(aes_string(fill=na2null(opt$fill_by), color=na2null(opt$color_by))) +ylim(c(0,1))
#gp = gp + geom_boxplot(fill='green') +ylim(c(0,1))
gp = gp + labs(y='Major isoform relative expression', x='Number of annotated isoforms per gene')
gp = gp + annotate('text', x = 1:20, y = 0, label = names(hs_lab)[1:20])
gp = gp + scale_fill_manual(values=cbbPalette)
gp = gp + scale_color_manual(values=rev(cbbPalette))
gp = gp + scale_x_discrete(labels=merged_labels) + theme(axis.text.x = element_text(angle=90, vjust=0.5))
gp
gp = ggplot(subset(hs_mm_data_expr_ann_iso,ann_isoforms<=20 & ann_isoforms>=2),aes(as.factor(ann_isoforms), entr_iso))
gp = gp + geom_boxplot(aes_string(fill=na2null(opt$fill_by), color=na2null(opt$color_by)))
#gp = gp + geom_boxplot(fill='green')
gp = gp + labs(y = 'Shannon entropy', x = 'Number of annotated isoforms per gene')
gp = gp + geom_point(aes(x=as.factor(ann_isoforms), y = log(ann_isoforms+1)), color='red')
gp = gp + annotate('text', x = (2:20)-1, y = -1/4, label = names(hs_lab)[2:20])
gp = gp + scale_fill_manual(values=cbbPalette)
gp = gp + scale_color_manual(values=rev(cbbPalette))
gp
dev.off()