-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathKEGG_enrichment.R
executable file
·94 lines (67 loc) · 3.68 KB
/
KEGG_enrichment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("optparse"))
options(stringsAsFactors=F)
##################
# OPTION PARSING
##################
option_list <- list(
make_option(c("-u", "--universe"), help="a list of human gene identifiers (ensEMBL ids), NO header"),
make_option(c("-G", "--genes"), help="a list of human gene identifiers for the foreground (ensEMBL ids), WITH header"),
#make_option(c("-l", "--log"), action="store_true", default=FALSE, help="apply the log [default=FALSE]"),
#make_option(c("-p", "--pseudocount"), type="double", help=sprintf("specify a pseudocount for the log [default=%s]",pseudocount), default=pseudocount),
#make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
make_option(c("-o", "--output"), help="additional tags for otuput [default=out]")
#make_option(c("-f", "--fill_by"), help="choose the color you want to fill by [default=NA]", type='character', default=NA)
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
arguments <- parse_args(parser, positional_arguments = TRUE)
opt <- arguments$options
print(opt)
##------------
## LIBRARIES
##------------
suppressPackageStartupMessages(library("KEGG.db"))
suppressPackageStartupMessages(library("org.Hs.eg.db"))
suppressPackageStartupMessages(library("GOstats"))
suppressPackageStartupMessages(library("plyr"))
na2null = function(x) if(is.na(x)) {return(NULL)}else{return(x)}
############################
# BEGIN
############################
U = read.table(opt$universe, h=F, col.names='hs')
G = read.table(opt$genes, h=T, col.names='hs')
#orth = read.table('~/Documents/db/human-mouse/orthologs/merged_orthologs_HumanMouse_1_1_genes.tsv', h=F, col.names=c('hs','mm'))
#genes = read.table('~/Documents/human-mouse/tissues/maxdiff_tissuespecific.tsv', h=F, col.names=c('hs','mm','sample'))
# I want to create a list of parameters to perform GO enrichment on different gene sets
# take the entrez gene ids for all the orthologous genes which will be my universe (the same for all the sets)
universe = unlist(mget(U$hs, org.Hs.egENSEMBL2EG, ifnotfound=NA))
sprintf("%s background genes; %s with a corresponding entrez id", nrow(U), length(unique(universe)))
# how many genes am I able to map?
# First thing notice that also ensembl gene ids longer than 15 characters are included
# if I remove these genes I end up with:
# length(unique(as.character(universe[which(nchar(names(universe)) == 15)]))) ----> 15593
createParams = function(x) {
geneset = unlist(mget(x, org.Hs.egENSEMBL2EG, ifnotfound=NA))
sprintf("%s foreground genes; %s with a corresponding entrez id", length(x), length(unique(geneset)))
pv = 1-(1-0.05)**(1/length(x))
params = new("KEGGHyperGParams",
geneIds = geneset,
universeGeneIds = universe,
annotation = 'org.Hs.eg.db',
pvalueCutoff = pv,
testDirection='over')
return(params)}
res = hyperGTest(createParams(G$hs))
write.table(summary(res), file=sprintf("%s.tsv", opt$output), quote=F, sep="\t", row.names=F)
htmlReport(res, file=sprintf("%s.html", opt$output))
q(save='no')
#listOfParamObjs = aggregate(G$hs, list(G$sample), createParams)$x
# run it (and hope it is ok! )
#resultList = lapply(listOfParamObjs, hyperGTest)
# give the name to each GO enrichment
#names(resultList) <- aggregate(genes$hs, list(genes$sample), length)[,1]
# put the results in a data.frame
#resultDf = rbind.fill(lapply(as.list(names(resultList)[-which(lapply(resultList, function(x) nrow(summary(x))) == 0)]),
# function(n)data.frame(summary(resultList[[n]]), n_genes=length(geneIds(resultList[[n]])), sample=n)))
# save the data.frame in a file
#write.table(resultDf, file='maxdiff_tissuespecific_GO.tsv', quote=F, row.names=F, col.names=T, sep='\t')