-
Notifications
You must be signed in to change notification settings - Fork 1
/
kmercontigalignmerge.Rscript
executable file
·334 lines (275 loc) · 16.1 KB
/
kmercontigalignmerge.Rscript
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
#!/usr/bin/env Rscript
options(error = quote({
dump.frames(to.file=TRUE, dumpto="Rcoredump")
load("Rcoredump.rda")
print(Rcoredump)
q()
}))
# Start time
start.time = proc.time()[3]
args = commandArgs(trailingOnly=TRUE)
# Process arguments - SGE added out_prefix to usage
help = c("kmercontigalignmerge.R merge kmer/gene alignment combinations",
"Adapted from Daniel Wilson (2018) kmermerge.Rscript",
"Usage: kmercontigalignmerge.Rscript task_id n p output_prefix analysis_dir input_files kmer_type kmer_length ref_fa nucmerident software_file")
if(length(args!=0)){
if(args[1]=="-help" | args[1]=="-h"){
cat(help,sep="\n")
q("no")
}
}
if(length(args)!=11) {
cat(help,sep="\n")
cat("Received arguments: ",args,"\n")
stop("\nIncorrect usage\n")
}
###################################################################################################
## Functions and software paths
###################################################################################################
get_count_batch_parameters = function(p = NULL, b = NULL, n = NULL){
beg = ((1:p)-1)*b+1
end = sapply(1:p, function(t,b,n) min(t*b,n), b=b,n=n,USE.NAMES =F)
# end = min(t*b,n)
return(cbind(beg,end))
}
###################################################################################################
# Initialize variables
process = as.integer(args[1])
n = as.integer(args[2])
p = as.integer(args[3])
out_prefix = as.character(args[4]) # SGE added
output_dir = as.character(args[5]) # SGE added
input_files = as.character(args[6]) # SGE added
kmer_type = tolower(as.character(args[7])) # SGE added
kmer_length = as.integer(args[8]) # SGE added
ref.fa = as.character(args[9]) # SGE added
ident_threshold = as.numeric(args[10]) # SGE added
software_file = as.character(args[11]) # SGE added
if(is.na(n)) stop("Error: n must be an integer","\n")
if(is.na(p)) stop("Error: p must be an integer","\n")
if(!file.exists(output_dir)) stop("Error: output directory doesn't exist","\n") # SGE added
if(unlist(strsplit(output_dir,""))[length(unlist(strsplit(output_dir,"")))]!="/") output_dir = paste0(output_dir, "/") # SGE added
if(!file.exists(input_files)) stop("Error: input file paths file doesn't exist","\n") # SGE added
if(kmer_type!="protein" & kmer_type!="nucleotide") stop("Error: variant type must be either protein or nucleotide","\n")
if(is.na(kmer_length)) stop("Error: kmer length must be an integer","\n")
if(!file.exists(ref.fa)) stop("Error: reference fasta file doesn't exist","\n") # SGE added
if(ident_threshold>100 | ident_threshold<0) stop("Error: nucmer identity threshold must be between 0-100","\n")
if(!file.exists(software_file)) stop("Error: software file doesn't exist","\n")
# Input files
input_files_path = input_files
# Contig align directory
contigalign_dir = file.path(output_dir,paste0(kmer_type,"kmer", kmer_length, "_kmergenealign/"))
if(!dir.exists(contigalign_dir)) stop("Error: contig align directory", contigalign_dir, "doesn't exist","\n")
# Read in software file
software_paths = read.table(software_file, h = T, sep = "\t", quote = "")
# Required software and script paths
# Begin with software
required_software = c("scriptpath")
if(any(is.na(match(required_software, as.character(software_paths$name))))) stop(paste0("Error: missing required software path in the software file - requires ",paste(required_software, collapse = ", ")),"\n")
script_location = as.character(software_paths$path)[which(tolower(as.character(software_paths$name))=="scriptpath")]
if(!dir.exists(script_location)) stop("Error: script location directory specified in the software paths file doesn't exist","\n")
sortstringspath = file.path(script_location, "sort_strings")
if(!file.exists(sortstringspath)) stop("Error: sort_strings path doesn't exist - check pipeline script location in the software file","\n")
stringlist2countpath = file.path(script_location, "stringlist2count")
if(!file.exists(stringlist2countpath)) stop("Error: stringlist2count path doesn't exist - check pipeline script location in the software file","\n")
# Report variables
cat("#############################################", "\n")
cat("Running on host: ",system("hostname", intern=TRUE),"\n")
cat("Command line arguments","\n")
cat(args, "\n\n")
cat("Parameters:","\n")
cat("task_id:", process, "\n")
cat("n:", n,"\n")
cat("p:", p,"\n")
cat("Output prefix:", out_prefix,"\n")
cat("Analysis directory:", output_dir,"\n")
cat("Input files path:", input_files,"\n")
cat("Kmer type:", kmer_type,"\n")
cat("Kmer length:", kmer_length,"\n")
cat("Reference fasta file:", ref.fa,"\n")
cat("Nucmer alignment minimum % identity:", ident_threshold,"\n")
cat("Software file:", software_file, "\n")
cat("Script location:", script_location, "\n")
cat("#############################################", "\n\n")
# Read in sample IDs
input_files = scan(input_files, what = character(0), sep = "\n", quiet = TRUE) # SGE added
if(n!=length(input_files)) stop("n does not equal the number of samples in the input path file","\n")
if(!all(file.exists(input_files))) stop("Error: not all input files exist","\n")
# Get the reference name
ref.name = scan(ref.fa, what = character(0), sep = "\n", nlines = 1, quiet = TRUE)
ref.name = unlist(strsplit(ref.name, " "))[1]
if(substr(ref.name,1,1)!=">") stop("Error: reference fasta file does not begin with a name starting with '>'","\n")
ref.name = substr(ref.name,2,1e6)
cat("Reference name:",ref.name,"\n")
b = as.integer(ceiling(n/p))
if(b==1) stop("Cannot have batchsize = 1. Try p < n/2")
p = as.integer(ceiling(n/b))
t = process
imax = as.integer(ceiling(log(n)/log(b)))
cat("Parameters [n batchsize processes taskid imax]: ",n,b,p,t,imax,"\n")
if(t>p) {
cat("Task",t,"not required\n")
quit("no")
}
# Merge
i = 0
while(TRUE) {
i=i+1
if(!((t %% b^(i-1))==0 | (t==p & i<=imax))) break()
cat("t =",t,"i =",i,"\n")
if(i==1) {
# First round: merge source files
outfile = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length, ".j.",i,".",t,".txt") # SGE added contigalign_dir out_prefix and . before j
outfile_completed = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,".j.",i,".",t,".completed.txt") # SGE added contigalign_dir out_prefix and . before j
beg = b*(t-1)+1
end = min(b*t, n)
cat("Beg:",beg," End:",end,"\n")
if(end<beg) stop("Problem with input arguments, please check")
infiles = input_files[beg:end]
infiles_size = sapply(infiles, function(x) as.numeric(system(paste0("ls -l ",x," | cut -d ' ' -f5"), intern = T)), USE.NAMES = F) # SGE added - could make check greater than zero
if(!all(file.exists(infiles)) | !all(infiles_size>0)) stop("Could not find files or files empty ",paste(infiles, collapse = " ")) # SGE added replacing above as input files should not be being created as the script is running - could mean partially written files are used
if(length(infiles)==1) {
cmd = paste0("zcat ",infiles[1]," | cut -f1 > ",outfile) # SGE changed from zcat to cat
} else if(length(infiles)==2) {
cmd = paste0("sort -u <(zcat ",infiles[1]," | cut -f1) <(zcat ",infiles[2]," | cut -f1) > ",outfile) # SGE changed from zcat to cat
} else {
cmd = paste0("sort -u <(zcat ",infiles[1]," | cut -f1) <(zcat ",infiles[2]," | cut -f1)",
paste0(" | sort -u - <(zcat ",infiles[3:length(infiles)]," | cut -f1)",collapse=""),
" > ",outfile) # SGE changed from zcat to cat
}
stopifnot(system2("/bin/bash",paste0("-c '",cmd,"'"), wait = TRUE)==0)
stopifnot(system2("/bin/bash",paste0("-c 'touch ",outfile_completed,"'"), wait = TRUE)==0)
} else {
# Subsequent rounds: merge merged files
te = as.integer(ceiling(t/(b^(i-1)))*b^(i-1))
outfile = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,".j.",i,".",te,".txt") # SGE added contigalign_dir out_prefix and . before j
outfile_completed = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,".j.",i,".",te,".completed.txt") # SGE added contigalign_dir out_prefix and . before j
beg = te-b^(i-1)+b^(i-2)
end = min(te,as.integer(ceiling(t/b^(i-2))*b^(i-2)))
cat(paste0("Beg", i,":"),beg,paste0("End",i,":"),end,"\n") # SGE changed from 2 to i
if(end<beg) stop("Problem with input arguments, please check")
inc = b^(i-2)
infiles = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,".j.",i-1,".",seq(from=beg,to=end,by=inc),".txt") # SGE added contigalign_dir out_prefix and . before j
infiles_completed = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,".j.",i-1,".",seq(from=beg,to=end,by=inc),".completed.txt") # SGE added
nattempts = 0
while(!all(file.exists(infiles_completed)) | !all(file.exists(infiles))) {
nattempts=nattempts+1
if(nattempts>100) stop("Could not find files ",paste(infiles_completed,collapse=" "))
Sys.sleep(60) # SGE changed from 1 to 60
}
infiles_size = sapply(infiles, function(x) as.numeric(system(paste0("ls -l ",x," | cut -d ' ' -f5"), intern = T)), USE.NAMES = F) # SGE added this and in below while statement - could set higher than zero, an appropriate number for completed
if(any(infiles_size==0)) stop("One or more file size is zero ",infiles,"\n") # SGE added
if(length(infiles)==1) {
cmd = paste0("mv ",infiles[1]," ",outfile)
} else if(length(infiles)==2) {
cmd = paste0("sort -u ",infiles[1]," ",infiles[2]," > ",outfile)
} else {
cmd = paste0("sort -u ",infiles[1]," ",infiles[2],
paste0(" | sort -u - ",infiles[3:length(infiles)],collapse=""),
" > ",outfile)
}
stopifnot(system2("/bin/bash",paste0("-c '",cmd,"'"), wait = TRUE)==0)
# If length of infiles is one, it has been moved and doesn't exist. If more than one, delete the temp files. # SGE added
if(length(infiles)>1){
cmd = paste("rm",paste(infiles,collapse=" "))
stopifnot(system2("/bin/bash",paste0("-c '",cmd,"'"), wait = TRUE)==0)
}
stopifnot(system2("/bin/bash",paste0("-c 'touch ",outfile_completed,"'"), wait = TRUE)==0)
}
}
if(t==p) {
# Check outfile isn't empty # SGE added
outfile_size = system(paste0("ls -l ",outfile," | cut -d ' ' -f5"), intern = T) # SGE adde
if(outfile_size==0) stop(outfile, " file is empty","\n") # SGE adde
final.kmer.txt.gz = paste0(output_dir, out_prefix,"_", kmer_type, kmer_length,".", ref.name, "_t",ident_threshold, ".kmeralignmerge.txt.gz")
# outfile = paste0(output_dir, out_prefix,"_kmeralignmerge.txt")
nKmersFinal = system(paste0("cat ", outfile, " | wc -l"), intern = T)
dummycount = rep(1, nKmersFinal)
dummycountfile = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length, "_dummycount.txt")
cat("Written dummy count","\n")
cat(dummycount, file = dummycountfile, sep = "\n")
outfile_count = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,"_kmeralignmerge_dummycount.txt")
cat("outfile:", outfile, "\n")
cat("dummycountfile:", dummycountfile, "\n")
cat("outfile_count:", outfile_count, "\n")
stopifnot(system(paste0("paste ", outfile, " ", dummycountfile, " > ", outfile_count))==0)
sortCommand = paste(c(sortstringspath, outfile_count, "| cut -f1 | gzip -c >", final.kmer.txt.gz), collapse=" ")
cat("Sort command:","\n")
stopifnot(system(sortCommand)==0)
cat(sortCommand, "\n")
# Remove all completed files
# completed_files = dir(pattern=glob2rx(paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,"*.j*.completed.txt"))) # SGE added .j
# completed_files = c(completed_files, dir(pattern=glob2rx(paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,"*.kmercontigalign.completed.txt")))) # SGE added
completed_files = system(paste0("ls ", paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,"*.j*.completed.txt")), intern = T) # SGE changed to
completed_files = c(completed_files, system(paste0("ls ", paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,"*.kmercontigalign.completed.txt")), intern = T))
cmd = paste("rm",paste(completed_files,collapse=" "))
stopifnot(system2("/bin/bash",paste0("-c '",cmd,"'"), wait = TRUE)==0)
stopifnot(system(paste0("rm ", outfile))==0)
stopifnot(system(paste0("rm ", contigalign_dir, out_prefix,"_", kmer_type, kmer_length, "_dummycount.txt"))==0)
stopifnot(system(paste0("rm ", contigalign_dir, out_prefix,"_", kmer_type, kmer_length,"_kmeralignmerge_dummycount.txt"))==0)
# Write file for final file being completed
outfile_completed = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,".",ref.name,"_t",ident_threshold, ".final.kmeralignmerge.completed.txt") # SGE added out_prefix and . before j
stopifnot(system2("/bin/bash",paste0("-c 'touch ",outfile_completed,"'"), wait = TRUE)==0)
}
cat("Process finished for kmeralignmerge","\n")
# Get counts for kmer/gene combinations
cat("Getting counts for kmeralignmerge","\n")
infile_completed = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,".", ref.name,"_t",ident_threshold, ".final.kmeralignmerge.completed.txt")
infile = paste0(output_dir, out_prefix,"_", kmer_type, kmer_length,".", ref.name,"_t",ident_threshold, ".kmeralignmerge.txt.gz")
nattempts = 0
while(!all(file.exists(infile_completed)) | !all(file.exists(infile))) {
nattempts=nattempts+1
if(nattempts>100) stop("Could not find file ", infile_completed)
Sys.sleep(60) # SGE changed from 1 to 60
}
# Read kmers
# Total number of kmers
n = as.integer(scan(pipe(paste0("zcat ", infile," | wc -l")), quiet = TRUE))
if(n<1) stop("No kmer/gene combinations found in", infile)
# Number of kmers (batch size) per process
b = as.integer(ceiling(n/p))
# Define kmers to process
# beg = (t-1)*b+1
# end = min(t*b,n)
# SGE changed to function
beg = get_count_batch_parameters(p=p,n=n,b=b)[t,1]
end = get_count_batch_parameters(p=p,n=n,b=b)[t,2]
out_prefix_counts = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,".",beg,"-",end) # SGE changed ":" to "-" in out_prefix
kmersublistfile = paste0(out_prefix_counts,".",t,".temp_kmerlist.txt.gz")
cmd = paste0("zcat ", infile," | head -n ",end," | tail -n ",end-beg+1," | gzip -c > ",kmersublistfile)
stopifnot(system(cmd)==0)
cmd = paste(stringlist2countpath,kmersublistfile, input_files_path, out_prefix_counts,0,1)
stopifnot(system(cmd)==0)
cmd = paste("rm",kmersublistfile)
stopifnot(system(cmd)==0)
# Check that files have been created and are not empty
outfiles = paste0(out_prefix_counts,c(".count.txt.gz"))
outfiles_size = sapply(outfiles, function(x) as.numeric(system(paste0("ls -l ",x," | cut -d ' ' -f5"), intern = T)), USE.NAMES = F) # SGE added this - could set higher than zero
if(any(outfiles_size==0)) stop("One or more JOB_INDEX ",t, " ", out_prefix_counts, " kmer align count files are empty")
# Write file for final file being completed
outfile_completed = paste0(out_prefix_counts,".count.completed.txt") # SGE added out_prefix and . before j
stopifnot(system2("/bin/bash",paste0("-c 'touch ",outfile_completed,"'"), wait = TRUE)==0)
# Merge files and remove intermediate files
if(t==p){
count_batch_parameters = get_count_batch_parameters(p=p,n=n,b=b)
alignCountFiles = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,".", count_batch_parameters[,1],"-", count_batch_parameters[,2],".count.txt.gz")
alignCountCompletedFiles = paste0(contigalign_dir, out_prefix,"_", kmer_type, kmer_length,".", count_batch_parameters[,1],"-", count_batch_parameters[,2],".count.completed.txt")
nattempts = 0
while(!all(file.exists(alignCountCompletedFiles)) | !all(file.exists(alignCountFiles))) {
nattempts=nattempts+1
if(nattempts>100) stop("Could not find file ", alignCountCompletedFiles)
Sys.sleep(60) # SGE changed from 1 to 60
}
for(i in 1:length(alignCountFiles)){
if(i==1) count = scan(gzfile(alignCountFiles[i]), sep = "\n", quiet = TRUE) else count = c(count, scan(gzfile(alignCountFiles[i]), sep = "\n", quiet = TRUE))
}
count = as.numeric(count)
final_count_file = paste0(output_dir, out_prefix,"_", kmer_type, kmer_length,".", ref.name,"_t",ident_threshold, ".kmeralignmerge.count.txt")
cat(count, file = final_count_file, sep = "\n")
stopifnot(system(paste0("gzip ", final_count_file))==0)
# Delete intermediate count files
cat("Deleting intermediate files","\n")
cmd = paste("rm",paste(c(alignCountFiles, alignCountCompletedFiles, infile_completed),collapse=" "))
stopifnot(system2("/bin/bash",paste0("-c '",cmd,"'"), wait = TRUE)==0)
}
cat("Finished in",(proc.time()[3]-start.time)/60,"minutes\n")