#############################################################################
# Snakemake pipeline for whole-genome alignment with Cactus (CPU or GPU)
#############################################################################
import sys
import os
import re
import logging
import subprocess
import lib.cactuslib as cactuslib
import lib.treelib as treelib
#############################################################################
# System setup
DRY_RUN = False;
if any([arg in sys.argv for arg in ["--dry-run", "--dryrun", "-n"]]):
DRY_RUN = True;
# Whether the pipeline is running in dry-run mode
log_level = "info";
if any([arg in sys.argv for arg in ["--rulegraph", "--dag"]]):
log_level = "notset";
# Set the log level based on the arguments
#log_level = "debug";
# Uncomment to set the log level to debug
log_verbosity = "screen"; # "screen", "file", "both"
log_filename = f"cactus.{log_level}.log"; # Log file name if log_verbosity is "file" or "both"
cactuslib.configureLogging(log_filename, log_level.upper(), log_verbosity.upper())
cactuslib_logger = logging.getLogger('cactuslib')
# Setup logging if debugging
# wd = config["working_dir"];
# # if not os.path.exists(wd):
# # cactuslib_logger.info(f"Creating working directory at {wd}");
# # os.makedirs(wd);
# cactuslib_logger.info(f"Working directory: {os.getcwd()}");
# cactuslib_logger.info(f"Changing working directory to: {wd}");
# os.chdir(wd);
# Switching to the working directory of the project so paths can be relative
USE_GPU = config["use_gpu"]
# Whether to use GPU or CPU cactus
TMPDIR = config["tmp_dir"];
if not os.path.exists(TMPDIR):
cactuslib_logger.info(f"Creating temporary directory at {TMPDIR}");
os.makedirs(TMPDIR);
# A directory with lots of space to use for temporary files generated by the cactus-align command
if config["cactus_path"].lower() in ["download", ""]:
cactus_image_path = cactuslib.downloadCactusImage(USE_GPU);
else:
cactus_image_path = config["cactus_path"];
# The path to the cactus image, either downloaded or specified in the config file
if not os.path.exists(cactus_image_path):
cactuslib_logger.error(f"Could not find cactus image at {cactus_image_path}");
sys.exit(1);
# Check if the cactus image exists
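# For reference, a sketch of how an image could be fetched manually instead of relying on the
# automatic download (the tag is a placeholder; Cactus images are published under
# quay.io/comparative-genomics-toolkit/cactus, with GPU builds tagged separately):
#   singularity pull cactus.sif docker://quay.io/comparative-genomics-toolkit/cactus:<version>
# then point cactus_path in the config at the resulting .sif file.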
CACTUS_PATH = "singularity exec --nv --cleanenv " + cactus_image_path;
CACTUS_PATH_TMP = "singularity exec --nv --cleanenv --bind " + TMPDIR + ":/tmp " + cactus_image_path;
# The path to the cactus image with and without a tmpdir binding
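# For illustration, with a (hypothetical) tmp_dir of /scratch/tmp and a local image cactus.sif,
# CACTUS_PATH_TMP expands to:
#   singularity exec --nv --cleanenv --bind /scratch/tmp:/tmp cactus.sif
# and each rule below simply prepends this prefix to the cactus subcommand it runs.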
#############################################################################
# Input files and output paths
INPUT_FILE = os.path.abspath(config["input_file"]);
if not os.path.isfile(INPUT_FILE):
cactuslib_logger.error(f"Could not find input file at {INPUT_FILE}");
sys.exit(1);
else:
cactuslib_logger.info(f"Input file found at {INPUT_FILE}");
# The cactus input file used to generate the config file with cactus-prepare
OUTPUT_DIR = os.path.abspath(config["output_dir"]);
OVERWRITE_OUTPUT_DIR = config["overwrite_output_dir"];
# The output directory specified when cactus-prepare was run
OUTPUT_HAL = os.path.join(OUTPUT_DIR, config["final_hal"]);
#OUTPUT_MAF = os.path.join(OUTPUT_DIR, config["final_hal"].replace(".hal", ".maf"));
#job_path = os.path.join(OUTPUT_DIR, "jobstore");
# The temporary/job directory specified in cactus-prepare
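# A minimal sketch of the config keys this Snakefile reads (values are illustrative placeholders,
# not recommendations), to be passed with --configfile:
#
#   input_file: /path/to/cactus-input.txt      # Newick tree + genome paths, as for cactus-prepare
#   output_dir: /path/to/cactus-output
#   overwrite_output_dir: False
#   final_hal: all-genomes.hal
#   use_gpu: True
#   tmp_dir: /scratch/tmp
#   cactus_path: download                       # "download"/"" to pull an image, or a path to a local .sif
#   mask_partition: gpu
#   mask_cpu: 32
#   mask_mem: 100000
#   mask_time: 120
#   mask_gpu: 2
#   # ...plus the analogous blast_* and align_* keys (partition/cpu/mem/time/gpu), and the
#   # convert_*, copy_*, and append_* keys (partition/cpu/mem/time).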
#############################################################################
# cactus-prepare
cactuslib.runCactusPrepare(INPUT_FILE, CACTUS_PATH, OUTPUT_DIR, OVERWRITE_OUTPUT_DIR, OUTPUT_HAL, USE_GPU);
CACTUS_FILE = os.path.join(OUTPUT_DIR, os.path.basename(INPUT_FILE));
# Run cactus-prepare to generate the cactus input file with ancestral nodes and labeled tree
#############################################################################
# Reading files
tips = cactuslib.readTips(INPUT_FILE);
# The main dictionary for storing information and file paths for tips in the tree:
# [output fasta file from mask step] : { 'input' : "original genome fasta file", 'name' : "genome name in tree", 'output' : "expected output from mask step (same as key)" }
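# An example entry, using hypothetical file and genome names:
#   'genomeA.fa' : { 'input' : '/path/to/raw/genomeA.fa', 'name' : 'genomeA', 'output' : 'genomeA.fa' }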
####################
internals, anc_tree = cactuslib.initializeInternals(CACTUS_FILE, tips);
# The main dictionary for storing information and file paths for internal nodes in the tree:
# [node name] : { 'name' : "node name in tree", 'blast-inputs' : [the expected inputs for the blast step], 'align-inputs' : [the expected inputs for the align step],
# 'hal-inputs' : [the expected inputs for the hal2fasta step], 'blast-output' : "the .cigar file output from the blast step",
# 'align-output' : "the .hal file output from the align step", 'hal-output' : "the fasta file output from the hal2fasta step" }
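# An example entry, again with hypothetical node and file names:
#   'Anc01' : { 'name' : 'Anc01', 'blast-inputs' : ['genomeA.fa', 'genomeB.fa'],
#               'align-inputs' : ['Anc01.cigar'], 'hal-inputs' : ['Anc01.hal'],
#               'blast-output' : 'Anc01.cigar', 'align-output' : 'Anc01.hal', 'hal-output' : 'Anc01.fa' }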
####################
tinfo, anc_tree, root = treelib.treeParse(anc_tree);
root_name = tinfo[root][3];
internals = cactuslib.parseInternals(internals, tips, tinfo, anc_tree);
# The tree is parsed to get the root node and the internal nodes are updated with the correct names
if log_level == "debug":
cactuslib_logger.debug("EXITING BEFORE RULES. DEBUG MODE.");
sys.exit(0);
# Exit before running rules if in debug mode
#############################################################################
# Final rule: depends on the final expected output file and initiates all
# the other rules
localrules: all
rule all:
input:
os.path.join(OUTPUT_DIR, "hal-append-subtree.log"),
# The log file from the append rule (halAppendSubtree)
#expand(os.path.join(OUTPUT_DIR, "{final_tip}"), final_tip=[tips[name]['output'] for name in tips]),
# The masked input files from rule mask
#expand(os.path.join(OUTPUT_DIR, "{internal_node}.fa"), internal_node=[node for node in internals]),
# The final FASTA sequences from each internal node after rules blast, align, and convert
#OUTPUT_MAF
#os.path.join(OUTPUT_DIR, root_name + ".maf")
        # The .maf file from rule maf
## Rule all specifies the final output files expected
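## One possible invocation (hypothetical file names; assumes Snakemake >= 8 with the SLURM
## executor plugin installed, since the rules set slurm_partition/slurm_extra resources):
##   snakemake -s cactus_gpu.smk --configfile cactus-config.yaml --executor slurm -j 10
## Adding -n performs a dry run, which the setup code above also detects (DRY_RUN).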
#############################################################################
# Pipeline rules
# --configFile {params.config_file}
rule mask:
input:
lambda wildcards: [ tips[name]['input'] for name in tips if tips[name]['output'] == wildcards.final_tip ][0]
output:
os.path.join(OUTPUT_DIR, "{final_tip}")
params:
path = CACTUS_PATH,
input_file = INPUT_FILE,
cactus_file = os.path.join(OUTPUT_DIR, CACTUS_FILE),
genome_name = lambda wildcards: [ name for name in tips if tips[name]['output'] == wildcards.final_tip ][0],
host_tmp_dir = lambda wildcards: os.path.join(TMPDIR, [ name for name in tips if tips[name]['output'] == wildcards.final_tip ][0] + "-mask"), # This is the tmp dir for the host system, which is bound to /tmp in the singularity container
job_tmp_dir = lambda wildcards: os.path.join("/tmp", [ name for name in tips if tips[name]['output'] == wildcards.final_tip ][0] + "-mask"), # This is the tmp dir in the container, which is bound to the host tmp dir
gpu_opt = f"--gpu {config["mask_gpu"]}" if USE_GPU else ""
resources:
slurm_partition = config["mask_partition"],
cpus_per_task = config["mask_cpu"],
mem_mb = config["mask_mem"],
runtime = config["mask_time"],
slurm_extra = f"'--gres=gpu:{config["mask_gpu"]}'" if USE_GPU else ""
# shell:
# """
# {params.path} cactus-preprocess {params.job_dir} {params.input_file} {params.cactus_file} --inputNames {params.genome_name} --realTimeLogging true --logInfo --retryCount 0 --maxCores {resources.cpus_per_task} {params.gpu_opt}
# """
run:
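        # If a Toil jobstore from a previous (failed) attempt already exists under TMPDIR, resume
        # it by adding --restart; otherwise start a fresh run. The blast and align rules below use
        # the same pattern.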
if os.path.isdir(params.host_tmp_dir):
shell("{params.path} cactus-preprocess {params.job_tmp_dir} {params.input_file} {params.cactus_file} --inputNames {params.genome_name} --logInfo --retryCount 0 --maxCores {resources.cpus_per_task} {params.gpu_opt} --restart")
else:
shell("{params.path} cactus-preprocess {params.job_tmp_dir} {params.input_file} {params.cactus_file} --inputNames {params.genome_name} --logInfo --retryCount 0 --maxCores {resources.cpus_per_task} {params.gpu_opt} ")
# Note: when not requesting all CPUs on a node, cactus-preprocess can fail with a Toil error like:
# toil.batchSystems.abstractBatchSystem.InsufficientSystemResources: The job LastzRepeatMaskJob is requesting 64.0 cores, more than the maximum of 32 cores that SingleMachineBatchSystem was configured with, or enforced by --maxCores. Scale is set to 1.0.
## This rule runs cactus-preprocess for every genome (tip in the tree), which soft-masks repeats in each input genome
## Runtimes for turtles range from 8 to 15 minutes with the above resources
####################
rule blast:
input:
lambda wildcards: [ os.path.join(OUTPUT_DIR, input_file) for input_file in internals[wildcards.internal_node]['input-seqs'] ]
output:
os.path.join(OUTPUT_DIR, "{internal_node}.cigar")
params:
path = CACTUS_PATH_TMP,
cactus_file = os.path.join(OUTPUT_DIR, CACTUS_FILE),
node = lambda wildcards: wildcards.internal_node,
host_tmp_dir = lambda wildcards: os.path.join(TMPDIR, wildcards.internal_node + "-blast"), # This is the tmp dir for the host system, which is bound to /tmp in the singularity container
job_tmp_dir = lambda wildcards: os.path.join("/tmp", wildcards.internal_node + "-blast"), # This is the tmp dir in the container, which is bound to the host tmp dir
gpu_opt = f"--gpu {config["blast_gpu"]}" if USE_GPU else ""
resources:
slurm_partition = config["blast_partition"],
cpus_per_task = config["blast_cpu"],
mem_mb = config["blast_mem"],
runtime = config["blast_time"],
slurm_extra = f"'--gres=gpu:{config["blast_gpu"]}'" if USE_GPU else ""
run:
if os.path.isdir(params.host_tmp_dir):
shell("{params.path} cactus-blast {params.job_tmp_dir} {params.cactus_file} {output} --root {params.node} --logInfo --retryCount 0 --lastzCores {resources.cpus_per_task} {params.gpu_opt} --restart")
else:
shell("{params.path} cactus-blast {params.job_tmp_dir} {params.cactus_file} {output} --root {params.node} --logInfo --retryCount 0 --lastzCores {resources.cpus_per_task} {params.gpu_opt}")
## This rule runs cactus-blast for every internal node
## Runtimes for turtles range from 1 to 10 hours with the above resources
####################
rule align:
input:
cigar_file = os.path.join(OUTPUT_DIR, "{internal_node}.cigar"),
#seq_files = lambda wildcards: [ os.path.join(OUTPUT_DIR, input_file) for input_file in internals[wildcards.internal_node]['desc-seqs'] ]
output:
os.path.join(OUTPUT_DIR, "{internal_node}.hal")
params:
path = CACTUS_PATH_TMP,
#config_file = os.path.join(OUTPUT_DIR, CONFIG_FILE),
cactus_file = os.path.join(OUTPUT_DIR, CACTUS_FILE),
node = lambda wildcards: wildcards.internal_node,
#job_dir = lambda wildcards: os.path.join(TMPDIR, wildcards.internal_node + "-align"),
host_tmp_dir = lambda wildcards: os.path.join(TMPDIR, wildcards.internal_node + "-align"), # This is the tmp dir for the host system, which is bound to /tmp in the singularity container
job_tmp_dir = lambda wildcards: os.path.join("/tmp", wildcards.internal_node + "-align"), # This is the tmp dir in the container, which is bound to the host tmp dir
work_dir = TMPDIR,
gpu_opt = "--gpu" if USE_GPU else ""
resources:
slurm_partition = config["align_partition"],
cpus_per_task = config["align_cpu"],
mem_mb = config["align_mem"],
runtime = config["align_time"],
slurm_extra = f"'--gres=gpu:{config["align_gpu"]}'" if USE_GPU else ""
run:
if os.path.isdir(params.host_tmp_dir):
shell("{params.path} cactus-align {params.job_tmp_dir} {params.cactus_file} {input.cigar_file} {output} --root {params.node} --logInfo --retryCount 0 --workDir {params.work_dir} --maxCores {resources.cpus_per_task} --defaultDisk 450G {params.gpu_opt} --restart")
else:
shell("{params.path} cactus-align {params.job_tmp_dir} {params.cactus_file} {input.cigar_file} {output} --root {params.node} --logInfo --retryCount 0 --workDir {params.work_dir} --maxCores {resources.cpus_per_task} --defaultDisk 450G {params.gpu_opt}")
## This rule runs cactus-align for every internal node
## Runtimes for turtles range from 4 to 16 hours with the above resources
####################
rule convert:
input:
os.path.join(OUTPUT_DIR, "{internal_node}.hal")
#lambda wildcards: [ os.path.join(output_dir, input_file) for input_file in internals[wildcards.internal_node]['hal-inputs'] ][0]
output:
os.path.join(OUTPUT_DIR, "{internal_node}.fa")
params:
path = CACTUS_PATH,
node = lambda wildcards: wildcards.internal_node,
resources:
slurm_partition = config["convert_partition"],
cpus_per_task = config["convert_cpu"],
mem_mb = config["convert_mem"],
        runtime = config["convert_time"]
shell:
"""
{params.path} hal2fasta {input} {params.node} --hdf5InMemory > {output}
"""
## This rule runs hal2fasta to convert .hal files for each internal node to .fasta files
## Runtime for turtles is only about 30 seconds per node
####################
rule copy_hal:
input:
all_hals = expand(os.path.join(OUTPUT_DIR, "{internal_node}.fa"), internal_node=internals),
anc_hal = os.path.join(OUTPUT_DIR, root_name + ".hal")
output:
OUTPUT_HAL
resources:
slurm_partition = config["copy_partition"],
cpus_per_task = config["copy_cpu"],
mem_mb = config["copy_mem"],
runtime = config["copy_time"]
shell:
"""
cp {input.anc_hal} {output}
"""
## The root .hal file is copied here so that failures in the subsequent rules do not force the
## blast/align steps to be re-run for that node, at the cost of a little extra storage
####################
rule append:
input:
#expand(os.path.join(OUTPUT_DIR, "{internal_node}.fa"), internal_node=internals)
OUTPUT_HAL
output:
touch(os.path.join(OUTPUT_DIR, "hal-append-subtree.log"))
resources:
slurm_partition = config["append_partition"],
cpus_per_task = config["append_cpu"],
mem_mb = config["append_mem"],
runtime = config["append_time"]
run:
with open(os.path.join(OUTPUT_DIR, "hal-append-subtree.log"), "w") as appendfile:
for node in internals:
appendfile.write(node + "\n");
if node == root_name:
appendfile.write("Node is root node. Nothing to be done.\n");
appendfile.write("----------" + "\n\n");
appendfile.flush();
continue;
# If the node is the root we don't want to append since that is the hal file we
# are appending to
#cmd = ["singularity", "exec", "--nv", "--cleanenv", "--bind", TMPDIR + ":/tmp", config["cactus_path"], "halAppendSubtree", os.path.join(OUTPUT_DIR, root_name + ".hal"), os.path.join(OUTPUT_DIR, node + ".hal"), node, node, "--merge", "--hdf5InMemory"];
cmd = ["singularity", "exec", "--nv", "--cleanenv", "--bind", TMPDIR + ":/tmp", config["cactus_path"], "halAppendSubtree", OUTPUT_HAL, os.path.join(OUTPUT_DIR, node + ".hal"), node, node, "--merge", "--hdf5InMemory"];
appendfile.write("RUNNING COMMAND:\n");
appendfile.write(" ".join(cmd) + "\n");
appendfile.flush();
# Generate the command for the current node
result = subprocess.run(cmd, capture_output=True, text=True);
# Run the command for the current node and capture the output
appendfile.write("COMMAND STDOUT:\n")
appendfile.write(result.stdout + "\n");
appendfile.write("COMMAND STDERR:\n")
appendfile.write(result.stderr + "\n");
appendfile.write("\nDONE!\n");
appendfile.write("----------" + "\n\n");
appendfile.flush();
# Print the output of the command to the log file
# TODO: Maybe check for errors in stderr and exit with non-zero if found? Not sure if that would work...
# Note that calling singularity with --nv will print text to stderr even though there is no error
## End node loop
## This rule runs halAppendSubtree on every internal node in the tree to combine alignments into a single file.
## Because this command writes to the same file for every node, jobs must be run serially, so this command
## is run in a run block with Python's subprocess.run() function.
## Output is captured in the 'hal-append-subtree.log'
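## For a hypothetical internal node 'Anc01', the command written to the log and executed looks like
## (paths depend on the config):
##   singularity exec --nv --cleanenv --bind <tmp_dir>:/tmp <cactus_image> halAppendSubtree \
##     <output_dir>/<final_hal> <output_dir>/Anc01.hal Anc01 Anc01 --merge --hdf5InMemory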
####################
# rule maf:
# input:
# final_hal = OUTPUT_HAL,
# append_log = os.path.join(OUTPUT_DIR, "hal-append-subtree.log")
# output:
# OUTPUT_MAF
# params:
# path = CACTUS_PATH_TMP
# resources:
# slurm_partition = config["maf_partition"],
# cpus_per_task = config["maf_cpu"],
# mem_mb = config["maf_mem"],
# runtime = config["maf_time"]
# # {params.path} hal2mafMP.py --numProc {resources.cpus} {input.final_hal} {output}
# shell:
# """
# {params.path} hal2maf {input.final_hal} {output}
# """
#############################################################################