-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathadd_quantile.R
executable file
·165 lines (115 loc) · 5.29 KB
/
add_quantile.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env Rscript
options(stringsAsFactors=FALSE)
##################
# OPTION PARSING
##################
suppressPackageStartupMessages(library("optparse"))
option_list <- list(
make_option(c("-i", "--input"), default="stdin",
help="File or stdin [default=%default]"),
make_option(c("-o", "--output"), default="quantile.out.tsv",
help="Output file name. <stdout> for printing on stdout [default=%default]"),
make_option(c("--header"), action="store_true", default=FALSE,
help="Use this if the input has a header [default=%default]"),
make_option(c("-m", "--method"), default="quantiles",
help="Choose the way to bin the data: < quantiles | breaks > [default=%default]"),
make_option(c("-c", "--column"), default=1,
help="The column with the values to divide in quantiles [default=%default]"),
make_option(c("-q", "--quantiles"), default=4, type="integer",
help="Number of quantiles [default=%default]"),
make_option(c("-s", "--resolve_breaks"), default=FALSE, action="store_true",
help="When breaks are not unique, don't crash but create fewer quantiles [default=%default]"),
make_option(c("-b", "--breaks"), default="c(-Inf,1,2,3,Inf)",
help="Breaks for the intervals, comma-separated, with c notation [default=%default]"),
make_option(c("--paste"), action="store_true", default=FALSE,
help="Paste index and interval in a unique column [default=%default]"),
make_option(c("--rowNames"), action="store_true", default=FALSE,
help="Print row names in the output [default=%default]"),
make_option(c("-v", "--verbose"), action="store_true", default=FALSE,
help="if you want more output [default=%default]")
)
parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
arguments <- parse_args(parser, positional_arguments = TRUE)
opt <- arguments$options
if (opt$verbose) {print(opt)}
#------------
# LIBRARIES
#------------
if (opt$verbose) {cat("Loading libraries... ")}
#suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(ggplot2))
#suppressPackageStartupMessages(library(plyr))
if (opt$verbose) {cat("DONE\n\n")}
# ============== #
# Functions #
# ============== #
formatInterval = function(interval) {
interval = as.character(interval)
if (length(grep("-Inf", interval) != 0)) {
string = strsplit(interval, split=",")[[1]][2]
outInterval = sprintf("<=%s", gsub("[\\)\\]]", "", string, perl=TRUE))
#outInterval = sprintf("<=%s", gsub("\\)", "", strsplit(interval, split=",")[[1]][2]))
return(outInterval)
}
if (length(grep("Inf", interval) != 0)) {
string = strsplit(interval, split=",")[[1]][1]
outInterval = sprintf(">%s", gsub("[\\(\\[]", "", string, perl=TRUE))
return(outInterval)
}
# left = gsub("[\\(\\]]", "", strsplit(interval, split=",")[[1]][1], perl=TRUE)
# right = gsub("\\)", "", strsplit(interval, split=",")[[1]][2])
# outInterval = sprintf("%s <= x < %s", left, right)
outInterval = interval
return(outInterval)
}
# ======== #
# BEGIN #
# ======== #
# Read data
if (opt$input == "stdin") {inF = file("stdin")} else {inF = opt$input}
m = read.table(inF, h=opt$header, sep="\t")
pasteSEP= ":"
# Quantiles
if (opt$method == "quantiles") {
quantile_header = sprintf("quantile_%s", colnames(m)[opt$column])
quant_index_header = sprintf("quant_index_%s", colnames(m)[opt$column])
quantile_paste_header = sprintf("quant_paste_%s", colnames(m)[opt$column])
if (opt$resolve_breaks) {
breaks <- c(unique(quantile(m[,opt$column], probs=seq(0,1,1/opt$quantiles), na.rm=T)))
m[,quantile_header] = cut(m[,opt$column], breaks, include.lowest=TRUE)
m[,quant_index_header] = match(m[,quantile_header], levels(m[,quantile_header]))
} else {
m[,quantile_header] = cut_number(m[,opt$column], opt$quantiles)
m[,quant_index_header] = as.numeric(cut_number(m[,opt$column], opt$quantile))
}
if (opt$paste) {
m[, quantile_paste_header] <- with(m, paste(quant_index_header, quantile_header, sep=pasteSEP))
# if (max(m[,quant_index_header]) > 9 & max(m[,quant_index_header]) < 100) {
# m[m$quant_index_header<9, quantile_paste_header] <- with(m{}, paste0("0", quantile_paste_header))
# }
m[, quantile_header] <- NULL; m[, quant_index_header] <- NULL
}
}
# Intervals
if (opt$method == "breaks") {
interval_header = sprintf("interval_%s", colnames(m)[opt$column])
interval_index_header = sprintf("interval_index_%s", colnames(m)[opt$column])
interval_paste_header = sprintf("interval_paste_%s", colnames(m)[opt$column])
breaks = eval(parse(text=opt$breaks))
m[,interval_header] = cut(m[,opt$column], breaks=breaks, include.lowest=TRUE, right=TRUE, dig.lab=4)
m[,interval_index_header] = cut(m[,opt$column], breaks=breaks, include.lowest=TRUE, right=TRUE, labels=FALSE)
# Format the interval
m[,interval_header] = sapply(m[,interval_header], formatInterval)
if (opt$paste) {
m[, interval_paste_header] <- paste( as.character( m[, interval_index_header] ), m[, interval_header], sep=pasteSEP)
if (max(m[,interval_index_header]) > 9 & max(m[,interval_index_header]) < 100) {
m[m[,interval_index_header]<10, interval_paste_header] <- paste0("0", m[m[,interval_index_header]<10, interval_paste_header])
}
m[, interval_header] <- NULL; m[, interval_index_header] <- NULL
}
}
# Print output
output = ifelse(opt$output == "stdout", "", opt$output)
write.table(m, file=output, row.names=opt$rowNames, quote=FALSE, col.names=opt$header, sep="\t")
# EXIT
quit(save='no')