-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathmajor.isoform.across.samples.R
110 lines (79 loc) · 5.44 KB
/
major.isoform.across.samples.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#############################
# LIBRARIES
#####################
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library("optparse"))
options(stringsAsFactors=F)
pseudocount = 1e-04
cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
##################
# OPTION PARSING
##################
option_list <- list(
make_option(c("-i", "--input_matrix"), help="the matrix with transcript expression you want to analyze"),
make_option(c("-a", "--annotation"), help="two-column file with gene and tx ids"),
make_option(c("-o", "--output"), help="choose the name for the output file WITHOUT extension"),
make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
make_option(c("-c", "--compare_by"), help="field of the metadata by which you want to compare")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
arguments <- parse_args(parser, positional_arguments = TRUE)
opt <- arguments$options
#q(save='no')
entropy = function(x) {x = x[!is.na(x)]; x =x[x!=0]; p=x/sum(x); if (length(x) != 0) {return(-sum(p*log(p)))} else {NA}}
which_is_maj_iso = function(x) if (sum(x==0)==length(x)) {return(0)} else {return(which.max(x))}
gn_fun1 = function(x) max(table(as.numeric(x))) # to be revised!!!
gn_fun2 = function(x) length(unique(x[x!=0]))
###########
## HUMAN ##
###########
# using only protein coding genes AND transcripts
human_gn_tx = read.table(opt$annotation, h=F, col.names=c('gene','tx'))
# transcript matrix
human_expr = read.table(opt$input_matrix, h=T)
human_expr[is.na(human_expr)] <- 0
# transcript matrix only for protein coding transcripts with gene id appended
human_tx_gn_expr = merge(human_gn_tx,human_expr,by.x='tx',by.y='row.names')
human_expr_iso = aggregate(human_tx_gn_expr[,-(1:2)], list(human_tx_gn_expr$gene), which_is_maj_iso)
rownames(human_expr_iso) = human_expr_iso[,1]
human_expr_iso[,1] = NULL
human_expr_iso_fun = data.frame(n_samples=apply(human_expr_iso,1,gn_fun1),n_maj_iso=apply(human_expr_iso,1,gn_fun2), organism='Human')
# add the number of annotated for human
hs_ann_iso = data.frame(table(human_gn_tx$gene))
names(hs_ann_iso) = c('gene','ann_iso')
human_expr_iso_fun_ann = merge(hs_ann_iso, human_expr_iso_fun, by.x='gene', by.y='row.names')
hs_mm_expr_iso_fun_ann = human_expr_iso_fun_ann
# read the metadata
mdata = read.table(opt$metadata, h=T, sep='\t')
variables = unique(mdata[,opt$compare_by])
shift = sapply(variables, function(x) {ids = unique(mdata[which(mdata[opt$compare_by]==x), 'labExpId']);
apply(human_expr_iso[ids], 1, function(x) if (length(unique(x))==1) {return(unique(x))}else{return(NA)} )
})
for (el in variables){
ids = unique(mdata[which(mdata[opt$compare_by]==el), 'labExpId'])
human_expr_iso[el] = apply(human_expr_iso[ids], 1, function(x) if (length(unique(x))==1) {return(unique(x))}else{return(NA)} )
}
shift2ggplot = data.frame(
# Total genes in the annotation
Total_genes = nrow(shift),
# Genes with the same major isoform within the same group
same_maj_iso_within = nrow(shift[which(rowSums(shift==0) ==0),]),
# Genes with the same major isoform consistent within the group, but different between
diff_maj_iso_between = sum(apply(shift[which(rowSums(shift==0) ==0),], 1, function(x) length(unique(x)))>1)
)
write.table(shift2ggplot, sprintf("%s.tsv", opt$output), sep='\t', quote=F, row.names=F)
## PLOT ##
library(ggplot2)
pdf(sprintf("%s.pdf", opt$output),width=12)
#ggplot(subset(hs_mm_expr_iso_fun_ann,ann_iso <= 25),aes(x=as.factor(ann_iso), y= n_samples)) + geom_boxplot() + facet_wrap(~organism,scale='free') + xlab('Number of annotated isoforms') + ylab('Max number of samples where the same major isoform is expressed') + opts(title='Most diffused major isoform')
ggplot(subset(hs_mm_expr_iso_fun_ann, ann_iso >= 2 & n_maj_iso>0),aes(x=as.factor(n_maj_iso),fill=cut(ann_iso, c(1,2,3,4,5,10,15,25,Inf)))) + geom_histogram(position='stack') + guides(fill = guide_legend(reverse = TRUE)) + scale_fill_hue(name='Total isoforms per gene',labels=c(2,3,4,5,'6-10','11-15','16-25','>25')) + labs(x='Major isoforms per gene', y='Number of protein coding genes')
ggplot(subset(hs_mm_expr_iso_fun_ann,ann_iso<=35 & ann_iso>=2 & n_maj_iso>=1 ),aes(x=as.numeric(ann_iso))) + geom_histogram(aes(fill=cut(n_maj_iso, c(0,1,2,3,4,5,6,7,Inf))),position='stack',binwidth=1) + guides(fill=guide_legend(reverse=TRUE)) + stat_bin(aes(y=..count../x),breaks=seq(2,34,by=1),geom='line') + labs(x='Annotated isoforms per gene', y='Number of protein coding genes') + scale_fill_hue(name='Major isoforms per gene',labels=c(1,2,3,4,5,6,7,'>8'))
gp = ggplot(subset(hs_mm_expr_iso_fun_ann, ann_iso >= 2 & n_maj_iso>0), aes(x=ann_iso, y=n_maj_iso))
gp = gp + stat_bin2d(binwidth=c(1,1),aes(fill=cut(..count.., c(0,1,5,10,50,100,500,1000,2000,3000,Inf))),colour="black",size=.2)
gp = gp + scale_fill_brewer(name='Number\nof genes', palette='Greens') + xlim(c(2,20))
gp = gp + ylim(c(0,20))
gp = gp + labs(x='Annotated isoforms per gene', y='Major isoforms per gene')
gp
#ggplot(subset(hs_mm_expr_iso_median_ann,hs_ann_iso!=1 & mm_ann_iso!=1 ),aes(x=floor(mm_entropy*10),y=floor(hs_entropy*10))) + stat_bin2d(binwidth=c(1,1),aes(fill=cut(..count.., c(0,1,5,10,100,500,1000,2000,3000,Inf))),colour ="black",size=.2) + scale_fill_brewer("count") + geom_abline(linetype=4) + opts(title="Median of entropy of isoform expression") + xlab('Mouse (entropy*10)') + ylab('Human (entropy*10)')
dev.off()