#############################################################################
# Snakemake pipeline for whole-genome alignment with Cactus (CPU or GPU)
#############################################################################
import sys
import os
import re
import logging
import subprocess
import lib.cactuslib as cactuslib
import lib.treelib as treelib
#############################################################################
# System setup
DRY_RUN = False;
if any([arg in sys.argv for arg in ["--dry-run", "--dryrun", "-n"]]):
DRY_RUN = True;
# Whether the pipeline is running in dry-run mode
log_level = "info";
if any([arg in sys.argv for arg in ["--rulegraph", "--dag"]]):
log_level = "notset";
# Set the log level based on the arguments
#log_level = "debug";
# Uncomment to set the log level to debug
log_verbosity = "screen"; # "screen", "file", "both"
log_filename = f"cactus.{log_level}.log"; # Log file name if log_verbosity is "file" or "both"
cactuslib.configureLogging(log_filename, log_level.upper(), log_verbosity.upper())
cactuslib_logger = logging.getLogger('cactuslib')
# Setup logging if debugging
# wd = config["working_dir"];
# # if not os.path.exists(wd):
# # cactuslib_logger.info(f"Creating working directory at {wd}");
# # os.makedirs(wd);
# cactuslib_logger.info(f"Working directory: {os.getcwd()}");
# cactuslib_logger.info(f"Changing working directory to: {wd}");
# os.chdir(wd);
# Switching to the working directory of the project so paths can be relative
USE_GPU = config["use_gpu"]
# Whether to use GPU or CPU cactus
TMPDIR = config["tmp_dir"];
if not os.path.exists(TMPDIR):
cactuslib_logger.info(f"Creating temporary directory at {TMPDIR}");
os.makedirs(TMPDIR);
# A directory with lots of space to use for temporary files generated by the cactus-align command
if config["cactus_path"].lower() in ["download", ""]:
cactus_image_path = cactuslib.downloadCactusImage(USE_GPU);
else:
cactus_image_path = config["cactus_path"];
# The path to the cactus image, either downloaded or specified in the config file
if not os.path.exists(cactus_image_path):
cactuslib_logger.error(f"Could not find cactus image at {cactus_image_path}");
sys.exit(1);
# Check if the cactus image exists
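# For reference, a sketch of how an image could be fetched manually instead of relying on the
# automatic download (the tag is a placeholder; Cactus images are published under
# quay.io/comparative-genomics-toolkit/cactus, with GPU builds tagged separately):
#   singularity pull cactus.sif docker://quay.io/comparative-genomics-toolkit/cactus:<version>
# then point cactus_path in the config at the resulting .sif file.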
CACTUS_PATH = "singularity exec --nv --cleanenv " + cactus_image_path;
CACTUS_PATH_TMP = "singularity exec --nv --cleanenv --bind " + TMPDIR + ":/tmp " + cactus_image_path;
# The path to the cactus image with and without a tmpdir binding
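# For illustration, with a (hypothetical) tmp_dir of /scratch/tmp and a local image cactus.sif,
# CACTUS_PATH_TMP expands to:
#   singularity exec --nv --cleanenv --bind /scratch/tmp:/tmp cactus.sif
# and each rule below simply prepends this prefix to the cactus subcommand it runs.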
#############################################################################
# Input files and output paths
INPUT_FILE = os.path.abspath(config["input_file"]);
if not os.path.isfile(INPUT_FILE):
cactuslib_logger.error(f"Could not find input file at {INPUT_FILE}");
sys.exit(1);
else:
cactuslib_logger.info(f"Input file found at {INPUT_FILE}");
# The cactus input file used to generate the config file with cactus-prepare
OUTPUT_DIR = os.path.abspath(config["output_dir"]);
OVERWRITE_OUTPUT_DIR = config["overwrite_output_dir"];
# The output directory specified when cactus-prepare was run
OUTPUT_HAL = os.path.join(OUTPUT_DIR, config["final_hal"]);
#OUTPUT_MAF = os.path.join(OUTPUT_DIR, config["final_hal"].replace(".hal", ".maf"));
#job_path = os.path.join(OUTPUT_DIR, "jobstore");
# The temporary/job directory specified in cactus-prepare
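# A minimal sketch of the config keys this Snakefile reads (values are illustrative placeholders,
# not recommendations), to be passed with --configfile:
#
#   input_file: /path/to/cactus-input.txt      # Newick tree + genome paths, as for cactus-prepare
#   output_dir: /path/to/cactus-output
#   overwrite_output_dir: False
#   final_hal: all-genomes.hal
#   use_gpu: True
#   tmp_dir: /scratch/tmp
#   cactus_path: download                       # "download"/"" to pull an image, or a path to a local .sif
#   mask_partition: gpu
#   mask_cpu: 32
#   mask_mem: 100000
#   mask_time: 120
#   mask_gpu: 2
#   # ...plus the analogous blast_* and align_* keys (partition/cpu/mem/time/gpu), and the
#   # convert_*, copy_*, and append_* keys (partition/cpu/mem/time).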
#############################################################################
# cactus-prepare
cactuslib.runCactusPrepare(INPUT_FILE, CACTUS_PATH, OUTPUT_DIR, OVERWRITE_OUTPUT_DIR, OUTPUT_HAL, USE_GPU);
CACTUS_FILE = os.path.join(OUTPUT_DIR, os.path.basename(INPUT_FILE));
# Run cactus-prepare to generate the cactus input file with ancestral nodes and labeled tree
#############################################################################
# Reading files
tips = cactuslib.readTips(INPUT_FILE);
# The main dictionary for storing information and file paths for tips in the tree:
# [output fasta file from mask step] : { 'input' : "original genome fasta file", 'name' : "genome name in tree", 'output' : "expected output from mask step (same as key)" }
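# An example entry, using hypothetical file and genome names:
#   'genomeA.fa' : { 'input' : '/path/to/raw/genomeA.fa', 'name' : 'genomeA', 'output' : 'genomeA.fa' }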
####################
internals, anc_tree = cactuslib.initializeInternals(CACTUS_FILE, tips);
# The main dictionary for storing information and file paths for internal nodes in the tree:
# [node name] : { 'name' : "node name in tree", 'blast-inputs' : [the expected inputs for the blast step], 'align-inputs' : [the expected inputs for the align step],
# 'hal-inputs' : [the expected inputs for the hal2fasta step], 'blast-output' : "the .cigar file output from the blast step",
# 'align-output' : "the .hal file output from the align step", 'hal-output' : "the fasta file output from the hal2fasta step" }
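# An example entry, again with hypothetical node and file names:
#   'Anc01' : { 'name' : 'Anc01', 'blast-inputs' : ['genomeA.fa', 'genomeB.fa'],
#               'align-inputs' : ['Anc01.cigar'], 'hal-inputs' : ['Anc01.hal'],
#               'blast-output' : 'Anc01.cigar', 'align-output' : 'Anc01.hal', 'hal-output' : 'Anc01.fa' }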
####################
tinfo, anc_tree, root = treelib.treeParse(anc_tree);
root_name = tinfo[root][3];
internals = cactuslib.parseInternals(internals, tips, tinfo, anc_tree);
# The tree is parsed to get the root node and the internal nodes are updated with the correct names
if log_level == "debug":
cactuslib_logger.debug("EXITING BEFORE RULES. DEBUG MODE.");
sys.exit(0);
# Exit before running rules if in debug mode
#############################################################################
# Final rule: depends on the final expected output file and initiates all
# the other rules
localrules: all
rule all:
input:
os.path.join(OUTPUT_DIR, "hal-append-subtree.log"),
# The log file from the append rule (halAppendSubtree)
#expand(os.path.join(OUTPUT_DIR, "{final_tip}"), final_tip=[tips[name]['output'] for name in tips]),
# The masked input files from rule mask
#expand(os.path.join(OUTPUT_DIR, "{internal_node}.fa"), internal_node=[node for node in internals]),
# The final FASTA sequences from each internal node after rules blast, align, and convert
#OUTPUT_MAF
#os.path.join(OUTPUT_DIR, root_name + ".maf")
        # The .maf file from rule maf
## Rule all specifies the final output files expected
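## One possible invocation (hypothetical file names; assumes Snakemake >= 8 with the SLURM
## executor plugin installed, since the rules set slurm_partition/slurm_extra resources):
##   snakemake -s cactus_gpu.smk --configfile cactus-config.yaml --executor slurm -j 10
## Adding -n performs a dry run, which the setup code above also detects (DRY_RUN).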
#############################################################################
# Pipeline rules
# --configFile {params.config_file}
rule mask:
input:
lambda wildcards: [ tips[name]['input'] for name in tips if tips[name]['output'] == wildcards.final_tip ][0]
output:
os.path.join(OUTPUT_DIR, "{final_tip}")
params:
path = CACTUS_PATH,
input_file = INPUT_FILE,
cactus_file = os.path.join(OUTPUT_DIR, CACTUS_FILE),
genome_name = lambda wildcards: [ name for name in tips if tips[name]['output'] == wildcards.final_tip ][0],
host_tmp_dir = lambda wildcards: os.path.join(TMPDIR, [ name for name in tips if tips[name]['output'] == wildcards.final_tip ][0] + "-mask"), # This is the tmp dir for the host system, which is bound to /tmp in the singularity container
job_tmp_dir = lambda wildcards: os.path.join("/tmp", [ name for name in tips if tips[name]['output'] == wildcards.final_tip ][0] + "-mask"), # This is the tmp dir in the container, which is bound to the host tmp dir
gpu_opt = f"--gpu {config["mask_gpu"]}" if USE_GPU else ""
resources:
slurm_partition = config["mask_partition"],
cpus_per_task = config["mask_cpu"],
mem_mb = config["mask_mem"],
runtime = config["mask_time"],
slurm_extra = f"'--gres=gpu:{config["mask_gpu"]}'" if USE_GPU else ""
# shell:
# """
# {params.path} cactus-preprocess {params.job_dir} {params.input_file} {params.cactus_file} --inputNames {params.genome_name} --realTimeLogging true --logInfo --retryCount 0 --maxCores {resources.cpus_per_task} {params.gpu_opt}
# """
run:
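        # If a Toil jobstore from a previous (failed) attempt already exists under TMPDIR, resume
        # it by adding --restart; otherwise start a fresh run. The blast and align rules below use
        # the same pattern.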
if os.path.isdir(params.host_tmp_dir):
shell("{params.path} cactus-preprocess {params.job_tmp_dir} {params.input_file} {params.cactus_file} --inputNames {params.genome_name} --logInfo --retryCount 0 --maxCores {resources.cpus_per_task} {params.gpu_opt} --restart")
else:
shell("{params.path} cactus-preprocess {params.job_tmp_dir} {params.input_file} {params.cactus_file} --inputNames {params.genome_name} --logInfo --retryCount 0 --maxCores {resources.cpus_per_task} {params.gpu_opt} ")
# Note: when not requesting all CPUs on a node, cactus-preprocess can fail with a Toil error like:
# toil.batchSystems.abstractBatchSystem.InsufficientSystemResources: The job LastzRepeatMaskJob is requesting 64.0 cores, more than the maximum of 32 cores that SingleMachineBatchSystem was configured with, or enforced by --maxCores. Scale is set to 1.0.
## This rule runs cactus-preprocess for every genome (tip in the tree), which soft-masks repeats in each input genome
## Runtimes for turtles range from 8 to 15 minutes with the above resources
####################
rule blast:
input:
lambda wildcards: [ os.path.join(OUTPUT_DIR, input_file) for input_file in internals[wildcards.internal_node]['input-seqs'] ]
output:
os.path.join(OUTPUT_DIR, "{internal_node}.cigar")
params:
path = CACTUS_PATH_TMP,
cactus_file = os.path.join(OUTPUT_DIR, CACTUS_FILE),
node = lambda wildcards: wildcards.internal_node,
host_tmp_dir = lambda wildcards: os.path.join(TMPDIR, wildcards.internal_node + "-blast"), # This is the tmp dir for the host system, which is bound to /tmp in the singularity container
job_tmp_dir = lambda wildcards: os.path.join("/tmp", wildcards.internal_node + "-blast"), # This is the tmp dir in the container, which is bound to the host tmp dir
gpu_opt = f"--gpu {config["blast_gpu"]}" if USE_GPU else ""
resources:
slurm_partition = config["blast_partition"],
cpus_per_task = config["blast_cpu"],
mem_mb = config["blast_mem"],
runtime = config["blast_time"],
slurm_extra = f"'--gres=gpu:{config["blast_gpu"]}'" if USE_GPU else ""
run:
if os.path.isdir(params.host_tmp_dir):
shell("{params.path} cactus-blast {params.job_tmp_dir} {params.cactus_file} {output} --root {params.node} --logInfo --retryCount 0 --lastzCores {resources.cpus_per_task} {params.gpu_opt} --restart")
else:
shell("{params.path} cactus-blast {params.job_tmp_dir} {params.cactus_file} {output} --root {params.node} --logInfo --retryCount 0 --lastzCores {resources.cpus_per_task} {params.gpu_opt}")
## This rule runs cactus-blast for every internal node
## Runtimes for turtles range from 1 to 10 hours with the above resources
####################
rule align:
input:
cigar_file = os.path.join(OUTPUT_DIR, "{internal_node}.cigar"),
#seq_files = lambda wildcards: [ os.path.join(OUTPUT_DIR, input_file) for input_file in internals[wildcards.internal_node]['desc-seqs'] ]
output:
os.path.join(OUTPUT_DIR, "{internal_node}.hal")
params:
path = CACTUS_PATH_TMP,
#config_file = os.path.join(OUTPUT_DIR, CONFIG_FILE),
cactus_file = os.path.join(OUTPUT_DIR, CACTUS_FILE),
node = lambda wildcards: wildcards.internal_node,
#job_dir = lambda wildcards: os.path.join(TMPDIR, wildcards.internal_node + "-align"),
host_tmp_dir = lambda wildcards: os.path.join(TMPDIR, wildcards.internal_node + "-align"), # This is the tmp dir for the host system, which is bound to /tmp in the singularity container
job_tmp_dir = lambda wildcards: os.path.join("/tmp", wildcards.internal_node + "-align"), # This is the tmp dir in the container, which is bound to the host tmp dir
work_dir = TMPDIR,
gpu_opt = "--gpu" if USE_GPU else ""
resources:
slurm_partition = config["align_partition"],
cpus_per_task = config["align_cpu"],
mem_mb = config["align_mem"],
runtime = config["align_time"],
slurm_extra = f"'--gres=gpu:{config["align_gpu"]}'" if USE_GPU else ""
run:
if os.path.isdir(params.host_tmp_dir):
shell("{params.path} cactus-align {params.job_tmp_dir} {params.cactus_file} {input.cigar_file} {output} --root {params.node} --logInfo --retryCount 0 --workDir {params.work_dir} --maxCores {resources.cpus_per_task} --defaultDisk 450G {params.gpu_opt} --restart")
else:
shell("{params.path} cactus-align {params.job_tmp_dir} {params.cactus_file} {input.cigar_file} {output} --root {params.node} --logInfo --retryCount 0 --workDir {params.work_dir} --maxCores {resources.cpus_per_task} --defaultDisk 450G {params.gpu_opt}")
## This rule runs cactus-align for every internal node
## Runtimes for turtles range from 4 to 16 hours with the above resources
####################
rule convert:
input:
os.path.join(OUTPUT_DIR, "{internal_node}.hal")
#lambda wildcards: [ os.path.join(output_dir, input_file) for input_file in internals[wildcards.internal_node]['hal-inputs'] ][0]
output:
os.path.join(OUTPUT_DIR, "{internal_node}.fa")
params:
path = CACTUS_PATH,
node = lambda wildcards: wildcards.internal_node,
resources:
slurm_partition = config["convert_partition"],
cpus_per_task = config["convert_cpu"],
mem_mb = config["convert_mem"],
        runtime = config["convert_time"]
shell:
"""
{params.path} hal2fasta {input} {params.node} --hdf5InMemory > {output}
"""
## This rule runs hal2fasta to convert .hal files for each internal node to .fasta files
## Runtime for turtles is only about 30 seconds per node
####################
rule copy_hal:
input:
all_hals = expand(os.path.join(OUTPUT_DIR, "{internal_node}.fa"), internal_node=internals),
anc_hal = os.path.join(OUTPUT_DIR, root_name + ".hal")
output:
OUTPUT_HAL
resources:
slurm_partition = config["copy_partition"],
cpus_per_task = config["copy_cpu"],
mem_mb = config["copy_mem"],
runtime = config["copy_time"]
shell:
"""
cp {input.anc_hal} {output}
"""
## The root .hal file is copied here so that failures in the subsequent rules do not force the
## blast/align steps to be re-run for that node, at the cost of a little extra storage
####################
rule append:
input:
#expand(os.path.join(OUTPUT_DIR, "{internal_node}.fa"), internal_node=internals)
OUTPUT_HAL
output:
touch(os.path.join(OUTPUT_DIR, "hal-append-subtree.log"))
resources:
slurm_partition = config["append_partition"],
cpus_per_task = config["append_cpu"],
mem_mb = config["append_mem"],
runtime = config["append_time"]
run:
with open(os.path.join(OUTPUT_DIR, "hal-append-subtree.log"), "w") as appendfile:
for node in internals:
appendfile.write(node + "\n");
if node == root_name:
appendfile.write("Node is root node. Nothing to be done.\n");
appendfile.write("----------" + "\n\n");
appendfile.flush();
continue;
# If the node is the root we don't want to append since that is the hal file we
# are appending to
#cmd = ["singularity", "exec", "--nv", "--cleanenv", "--bind", TMPDIR + ":/tmp", config["cactus_path"], "halAppendSubtree", os.path.join(OUTPUT_DIR, root_name + ".hal"), os.path.join(OUTPUT_DIR, node + ".hal"), node, node, "--merge", "--hdf5InMemory"];
cmd = ["singularity", "exec", "--nv", "--cleanenv", "--bind", TMPDIR + ":/tmp", config["cactus_path"], "halAppendSubtree", OUTPUT_HAL, os.path.join(OUTPUT_DIR, node + ".hal"), node, node, "--merge", "--hdf5InMemory"];
appendfile.write("RUNNING COMMAND:\n");
appendfile.write(" ".join(cmd) + "\n");
appendfile.flush();
# Generate the command for the current node
result = subprocess.run(cmd, capture_output=True, text=True);
# Run the command for the current node and capture the output
appendfile.write("COMMAND STDOUT:\n")
appendfile.write(result.stdout + "\n");
appendfile.write("COMMAND STDERR:\n")
appendfile.write(result.stderr + "\n");
appendfile.write("\nDONE!\n");
appendfile.write("----------" + "\n\n");
appendfile.flush();
# Print the output of the command to the log file
# TODO: Maybe check for errors in stderr and exit with non-zero if found? Not sure if that would work...
# Note that calling singularity with --nv will print text to stderr even though there is no error
## End node loop
## This rule runs halAppendSubtree on every internal node in the tree to combine alignments into a single file.
## Because this command writes to the same file for every node, jobs must be run serially, so this command
## is run in a run block with Python's subprocess.run() function.
## Output is captured in the 'hal-append-subtree.log'
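## For a hypothetical internal node 'Anc01', the command written to the log and executed looks like
## (paths depend on the config):
##   singularity exec --nv --cleanenv --bind <tmp_dir>:/tmp <cactus_image> halAppendSubtree \
##     <output_dir>/<final_hal> <output_dir>/Anc01.hal Anc01 Anc01 --merge --hdf5InMemory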
####################
# rule maf:
# input:
# final_hal = OUTPUT_HAL,
# append_log = os.path.join(OUTPUT_DIR, "hal-append-subtree.log")
# output:
# OUTPUT_MAF
# params:
# path = CACTUS_PATH_TMP
# resources:
# slurm_partition = config["maf_partition"],
# cpus_per_task = config["maf_cpu"],
# mem_mb = config["maf_mem"],
# runtime = config["maf_time"]
# # {params.path} hal2mafMP.py --numProc {resources.cpus} {input.final_hal} {output}
# shell:
# """
# {params.path} hal2maf {input.final_hal} {output}
# """
#############################################################################