-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathk-means.R
executable file
·84 lines (54 loc) · 2.05 KB
/
k-means.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env Rscript
# Never convert character columns to factors when data frames are built
# (this is already the default from R 4.0 onwards).
options(stringsAsFactors = FALSE)

# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
suppressPackageStartupMessages(library("optparse"))

# One entry per supported flag. optparse derives each option's type from the
# type of its default value.
cli_options <- list(
  make_option(
    c("-i", "--input"),
    default = "stdin",
    help = "File or stdin [default=%default]"
  ),
  make_option(
    c("-o", "--output"),
    default = "Kmeans.tsv",
    help = "Output file name. Can be stdout [default=%default]"
  ),
  make_option(
    "--header",
    action = "store_true",
    default = FALSE,
    help = "The input matrix has a header [default=%default]"
  ),
  make_option(
    c("-l", "--log10"),
    action = "store_true",
    default = FALSE,
    help = "Apply the log10 to the whole matrix as pre-processing step [default=%default]"
  ),
  make_option(
    c("-p", "--pseudocount"),
    default = 0.001,
    help = "Pseudocount to add when applying the log [default=%default]"
  ),
  make_option(
    c("-k", "--nb_clusters"),
    default = 3,
    help = "Number of desired clusters [default=%default]"
  ),
  make_option(
    c("-B", "--iterations"),
    default = 50,
    help = "Number of initializations to determine the best clustering [default=%default]"
  ),
  make_option(
    c("-v", "--verbose"),
    action = "store_true",
    default = FALSE,
    help = "if you want more output [default=%default]"
  )
)

parser <- OptionParser(
  usage = "%prog [options] file",
  option_list = cli_options
)

# Positional arguments are accepted by the parser, but only the named options
# are kept; the rest of the script reads everything from `opt`.
opt <- parse_args(parser, positional_arguments = TRUE)$options

# With --verbose, dump the parsed option values before doing any work.
if (opt$verbose) {
  print(opt)
}
suppressPackageStartupMessages(library("ggplot2"))
##############
# BEGIN
##############
# Cluster the rows of the input table with k-means and write the table back
# out with an extra `Kmeans` column holding each row's cluster label.
# Relies on `opt`, the list of parsed command-line options.

# Fail early with a readable message: with fewer than one initialization there
# is no clustering to choose from.
if (!isTRUE(opt$iterations >= 1)) {
  stop("--iterations must be at least 1", call. = FALSE)
}

# Read the table either from standard input or from the given path.
# NOTE(review): the first line is always treated as a header; the --header
# flag is declared but not consulted here. Left unchanged so current
# invocations keep producing the same result -- confirm before wiring it in.
input_source <- if (opt$input == "stdin") file("stdin") else opt$input
m <- read.table(input_source, header = TRUE)

# Optional pre-processing: log10 of every value, shifted by the pseudocount.
if (opt$log10) {
  m <- log10(m + opt$pseudocount)
}

# Fixed seed so that repeated runs on the same input give the same clusters.
set.seed(123)

# Run k-means once per requested random initialization.
fits <- replicate(
  opt$iterations,
  kmeans(m, opt$nb_clusters),
  simplify = FALSE
)

# Score each run by the share of total variance explained between clusters
# and keep the first run with the highest score.
explained <- vapply(
  fits,
  function(fit) fit$betweenss / fit$totss,
  numeric(1)
)
best <- which.max(explained)
if (length(best) == 0L) {
  # which.max() returns integer(0) when every score is NaN (e.g. the total
  # sum of squares is zero); no run can be preferred then, so take the first.
  best <- 1L
}
m$Kmeans <- fits[[best]]$cluster

# OUTPUT
# write.table() writes to the console when the file name is "", which is how
# "stdout" is supported.
output <- if (opt$output == "stdout") "" else opt$output
write.table(
  m,
  output,
  quote = FALSE,
  col.names = TRUE,
  row.names = TRUE,
  sep = "\t"
)
q(save = "no")