metro

#!/usr/bin/env python

"""METRO: Mouse nEoanTigen pRedictOr
About:
    This is the main entry for the METRO pipeline.
USAGE:
	$ metro <build|prepare|find|predict> [OPTIONS]
Example:
    $ metro build -h
    $ metro prepare -h
    $ metro find -h
    $ metro predict -h
"""

from __future__ import print_function
from src import version 
from src.utils import (Colors,
    initialize, 
    fatal,
    err,
    require,
    permissions) 
from src.mutator import (mutate,
    NonCodingVariantError,
    UnsupportedVariantTypeError,
    VariantParsingError,
    NonMatchingReferenceBases)
from src.aminoacid import (translate,
    convert_aa_cooridate,
    truncate,
    InvalidCodonError)
from src.reader import (fasta, 
    excel,
    tsv,
    csv,
    maf)
import sys, os, subprocess
import argparse, textwrap
import numpy as np
import pandas as pd

# Module-level metadata for the METRO command-line entry point.
__author__ = 'Skyler Kuhn, Samantha Sevilla'
# Version string is re-exported from the src package (see `from src import version`)
__version__ = version
__email__ = 'kuhnsa@nih.gov, samantha.sevilla@nih.gov'
# Name the program was invoked with (basename of argv[0])
_name = os.path.basename(sys.argv[0])
# Short description of the tool; shown in the styled description built by parsed_arguments()
_description = 'Mouse nEoanTigen pRedictOr'


def bash(cmd, interpreter='/bin/bash', strict=True, **kwargs):
    """
    Interface to run a process or bash command. Uses subprocess.call() so that
    the exit code of a failing command is handed back to this function (rather
    than an exception being raised) and can be reported to the user.
    @param cmd <str>:
        Shell command to run
    @param interpreter <str>:
        Interpreter for command to run [default: bash]
    @param strict <bool>:
        Prefixes any command with 'set -euo pipefail' to ensure process fail with
        the expected exit-code
    @params kwargs <subprocess.call()>:
        Keyword arguments to modify subprocess.call() behavior
    @return exitcode <int>:
        Returns the exit code of the run command. A non-zero exit code is
        reported through fatal() before it is returned.
    """
    # Changes behavior of default shell
    # set -e: exit immediately upon error
    # set -u: treats unset variables as an error
    # set -o pipefail: exits if a error occurs in any point of a pipeline
    prefix = 'set -euo pipefail; ' if strict else ''

    # subprocess.call() returns the exit code instead of raising
    # CalledProcessError, which keeps the failure check below reachable
    exitcode = subprocess.call(prefix + cmd, shell=True, executable=interpreter, **kwargs)

    if exitcode != 0:
        # Report the command that was actually run (cmd)
        fatal("""\n\tFatal: Failed to run '{}' command!
        └── Command returned a non-zero exitcode of '{}'.
        """.format(cmd, exitcode)
        )

    return exitcode


def build(sub_args):
    """Generates the METRO reference files from a genomic FASTA file and a
    GTF annotation file. Disclaimer: hybrid genomes not supported.
    @param sub_args <parser.parse_args() object>:
        Parsed arguments for build sub-command; reads the outputDir, ref_fa
        and ref_gtf attributes
    """
    out_dir = sub_args.outputDir
    genome_fasta = sub_args.ref_fa
    annotation_gtf = sub_args.ref_gtf

    # Set up the output directory, handing over the two reference files as links
    initialize(out_dir, links=[genome_fasta, annotation_gtf])

    # samtools and gffread (from the cufflinks package) must be available
    require(cmds=["samtools", "gffread"], suggestions=["samtools", "cufflinks"])

    # Locations of the reference files inside the output directory
    fasta_in_outdir = os.path.join(out_dir, os.path.basename(genome_fasta))
    gtf_in_outdir = os.path.join(out_dir, os.path.basename(annotation_gtf))

    # Step 1: index the genomic FASTA file
    # samtools faidx ref.fa --fai-idx /path/to/ref.fa.fai
    index_cmd = "samtools faidx {} --fai-idx {}.fai".format(genome_fasta, fasta_in_outdir)
    print("Running: " + index_cmd)
    bash(index_cmd)

    # Step 2: extract the transcript CDS sequences using the FASTA and GTF
    # gffread -F -x /path/to/transcripts.fa -g genome.fa transcripts.gtf
    extract_cmd = "gffread -F -x {} -g {} {}".format(
        os.path.join(out_dir, "transcripts.fa"),
        fasta_in_outdir,
        gtf_in_outdir
    )
    print("Running: " + extract_cmd)
    bash(extract_cmd)


def prepare(sub_args):
    """Creates input files for the metro from MAF files.
    Uses user-inputs to determine filtering thresholds. Every input MAF is
    read, tagged with a variant id (VIDA) and a VAF, merged, filtered and
    written to '<outputDir>/<outprefix>_VAF<percent>_Variant.csv'.
    @param sub_args <parser.parse_args() object>:
        Parsed arguments for prepare sub-command; reads the mafFiles,
        vafFilter, impactFilter, passFilter, outputDir and outprefix attributes
    """
    # Every column that is read below must be present in each input file.
    # Checking them all up front reports a missing column via fatal()
    # rather than failing later with a bare KeyError.
    required_cols = ["Hugo_Symbol", "Start_Position", "End_Position",
                     "Reference_Allele", "Tumor_Seq_Allele1",
                     "Tumor_Seq_Allele2", "t_alt_count", "t_depth",
                     "IMPACT", "FILTER"]

    # Check that col names exist, if they don't exit and error
    def check(col, df_in):
        if col not in df_in:
            fatal("""\n\tThe following column is required in prepare '{}'.""".format(col))

    # Read in each input file, check for the required column
    # headers, create the variant id (VIDA), calculate the
    # VAF and append the df to a list
    print("--Processing input files")
    df_list = []
    for file_input in sub_args.mafFiles:
        err('----Opening {}'.format(file_input))

        # Create fileID to track input. The directory part is removed first
        # so that dots in the path (e.g. './sample.maf') cannot truncate
        # or empty the identifier.
        file_name = os.path.basename(file_input).split(".")[0]

        # Read input MAF file; the first line of the file is skipped
        df_sub = pd.read_csv(file_input, delimiter="\t", skiprows=1)

        # Check required cols are in input file; if they are missing
        # print to user and exit
        for in_col in required_cols:
            check(in_col, df_sub)

        # Add fileID
        df_sub = df_sub.assign(file_id=file_name)

        # Add Variant ID (VIDA):
        # Example: [Hugo_Symbol]_[Start_Position]_[End_Position]_[Reference_Allele][Tumor_Seq_Allele1][Tumor_Seq_Allele2]
        df_sub["VIDA"] = (
            df_sub["Hugo_Symbol"] + "_"
            + df_sub["Start_Position"].astype(str) + "_"
            + df_sub["End_Position"].astype(str) + "_"
            + df_sub["Reference_Allele"]
            + df_sub["Tumor_Seq_Allele1"]
            + df_sub["Tumor_Seq_Allele2"]
        )

        # Calculate VAF of each record: t_alt_count/t_depth
        df_sub["av_VAF"] = df_sub["t_alt_count"] / df_sub["t_depth"]

        # Append to df list
        df_list.append(df_sub)

    # Merge all file dfs
    df_merged = pd.concat(df_list)

    # Filtering is done per VIDA. A single groupby pass replaces
    # re-scanning the merged df once for every VIDA.
    # First level: keep VIDAs that occur at least as many times as
    #   there are input files
    # Second level: keep VIDAs that also have
    #   IMPACT (HIGH or MODERATE) count >= sub_args.impactFilter
    #   FILTER (PASS) count >= sub_args.passFilter
    #   mean av_VAF >= sub_args.vafFilter
    print("--Applying filters")
    n_files = len(sub_args.mafFiles)
    vida_list = []
    for vida_id, df_vida in df_merged.groupby("VIDA"):
        if len(df_vida) < n_files:
            continue

        # Average VAF across all records of this VIDA
        av_VAF = df_vida["av_VAF"].mean()

        # Number of records with a HIGH or MODERATE impact
        impact_count = df_vida["IMPACT"].isin(["HIGH", "MODERATE"]).sum()

        # Number of records that PASS the variant caller filters
        filter_count = (df_vida["FILTER"] == "PASS").sum()

        # Create VIDA list of passing filters
        if (av_VAF >= sub_args.vafFilter
                and impact_count >= sub_args.impactFilter
                and filter_count >= sub_args.passFilter):
            vida_list.append(vida_id)

    # Subset df to include only VIDAs that meet filtering requirements
    df_final = df_merged[df_merged['VIDA'].isin(vida_list)]
    df_out = df_final.drop_duplicates()

    # Name the file after the VAF filter and create the METRO run input file.
    # round() guards against float error, e.g. 0.29*100 == 28.999999999999996
    VAF_val = str(int(round(sub_args.vafFilter * 100)))
    assap_input_file = os.path.join(sub_args.outputDir, sub_args.outprefix + "_VAF" + VAF_val + "_Variant.csv")
    df_out.to_csv(assap_input_file, index=False)


def find(sub_args):
    """Works out the consequence of each recorded mutation on its protein
    product and reports the wild-type and mutated amino acid sequences.
    One '<input name>.metro.tsv' file is written to the output directory
    for every input file.
    @param sub_args <parser.parse_args() object>:
        Parsed arguments for find sub-command; reads the outputDir, subset,
        transcripts and input attributes
    """
    # Initialize the output directory
    initialize(sub_args.outputDir)

    # Number of positions kept around the mutation start site
    # when the amino acid sequences are truncated
    flank = int(sub_args.subset)

    # Lookup of transcript ID (first word of the FASTA header with
    # the version suffix removed) to its coding DNA sequence (CDS).
    # The build command can be used to generate this reference file.
    cds_lookup = {
        seq_id.split(' ')[0].split('.')[0]: cds
        for seq_id, cds in fasta(sub_args.transcripts)
    }

    # Fields parsed from each input file
    wanted_fields = ['Transcript_ID','Variant_Classification','HGVSc','Hugo_Symbol','Gene']

    # Shared layout of the header line and of every result line
    record = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n"
    column_names = (
        "Variant_Classification",
        "Hugo_Symbol",
        "Transcript_ID",
        "HGVSc",
        "Variant_Start_Position",
        "WT_Transcript_Sequence",
        "Mutated_Transcript_Sequence",
        "WT_AA_Sequence",
        "Mutated_AA_Sequence",
        "WT_Subset_AA_Sequence",
        "Mutated_Subset_AA_Sequence",
    )

    # Run METRO against each user supplied input file
    for maf_path in sub_args.input:
        err('Opening {}'.format(maf_path))
        # The maf reader handles MAF-like files in different
        # file formats and/or with different delimiters
        variants = maf(maf_path, subset=wanted_fields)

        # Output name: input file name with its extension
        # swapped for '.metro.tsv'
        stem = os.path.splitext(os.path.basename(maf_path))[0]
        output_file = os.path.join(sub_args.outputDir, "{}.metro.tsv".format(stem))
        err('Writing output file {}'.format(output_file))

        with open(output_file, 'w') as out_handle:
            out_handle.write(record.format(*column_names))

            for _, variant in variants.iterrows():
                variant_class = str(variant['Variant_Classification'])
                transcript = str(variant['Transcript_ID'])
                hgvs = str(variant['HGVSc'])
                hugo = str(variant['Hugo_Symbol'])

                # The HGVS term, transcript and variant class are
                # mandatory; records missing any of them are ignored
                if any(not field or field == 'nan' for field in (hgvs, transcript, variant_class)):
                    continue

                # Skip over un-annotated transcripts. A transcript that
                # is absent from the reference may mean that different
                # reference files were used to call the variants.
                if transcript not in cds_lookup:
                    err("{} {}".format("WARNING: Transcript {} not found in provided transcripts FASTA file!".format(transcript),
                    "Please verify the correct reference file is provided!"))
                    continue
                sequence = cds_lookup[transcript]

                try:
                    # Apply the HGVS described change to the coding DNA
                    # sequence and get the mutation start site
                    mutated_dna, variant_position = mutate(sequence, hgvs)
                    # Translate wild-type and mutated coding DNA
                    wt_amino_acid = translate(sequence)
                    mutated_amino_acid = translate(mutated_dna)
                    # Express the variant start site in the amino
                    # acid coordinate system
                    aa_variant_position = convert_aa_cooridate(variant_position)
                    # Frame shift variants pass a single subset value to
                    # truncate(); every other class passes the subset
                    # value twice (+/- N around the start site, see --subset)
                    if variant_class.lower().startswith('frame_shift'):
                        window = (flank,)
                    else:
                        window = (flank, flank)
                    truncated_wt_aa = truncate(wt_amino_acid, aa_variant_position, *window)
                    truncated_mutated_aa = truncate(mutated_amino_acid, aa_variant_position, *window)
                    # Write results to output file
                    out_handle.write(record.format(variant_class, hugo, transcript, hgvs, variant_position,
                        sequence, mutated_dna, wt_amino_acid, mutated_amino_acid, truncated_wt_aa, truncated_mutated_aa))
                except NonCodingVariantError:
                    # HGVS term describes a change in a non-exonic region
                    err("WARNING: Skipping over non-coding DNA HGVS variant '{}' reported in {}!".format(hgvs, transcript))
                except UnsupportedVariantTypeError:
                    # No parser exists for this kind of HGVS term
                    err("WARNING: Skipping over unsupported HGVS variant class '{}' reported in {}!".format(hgvs, transcript))
                except VariantParsingError:
                    # HGVS term could not be tokenized
                    err("WARNING: Skipping over HGVS variant '{}' reported in {} because it could not be parsed!".format(hgvs, transcript))
                except NonMatchingReferenceBases:
                    # Reference bases in the HGVS term disagree with the transcript
                    err("WARNING: Skipping over HGVS variant '{}' reported in {} due to non-matching reference sequence!".format(hgvs, transcript),
                    "Please verify the correct reference file is provided!")
                except InvalidCodonError:
                    # A codon with a non-standard nucleotide could not be translated
                    err("WARNING: Skipping over HGVS variant '{}' reported in {} due to invalid codon in mutated sequence!".format(hgvs, transcript),
                    "Please review the following mutated coding DNA sequence for any errors:\n\t> {}".format(mutated_dna))


def _first_difference(wt_str, mt_str):
    """Returns the index of the first residue of mt_str that differs from
    wt_str, or None when mt_str has no such residue (the sequences are
    identical or mt_str is a prefix of wt_str)."""
    for pos, (wt_aa, mt_aa) in enumerate(zip(wt_str, mt_str)):
        if wt_aa != mt_aa:
            return pos
    # Common prefix is identical: residues past the end of the
    # wild-type sequence are the first ones that differ
    if len(mt_str) > len(wt_str):
        return len(wt_str)
    return None


def predict(sub_args):
    """
    Runs the prediction tool netMHC with output of the find sub command.
    This step requires that netMHCpan is installed and in a user's $PATH.
    The final table is written to
    '<outputDir>/<outprefix>_output_netmhc_final.tsv'.
    @param sub_args <parser.parse_args() object>:
        Parsed arguments for predict sub-command; reads the kmerLength,
        alleleList, mutationFile, outputDir, outprefix, peptideLength,
        threads, highbind and lowbind attributes
    """
    # Check whether NETMHC is executable.
    # distutils was removed from the standard library in python 3.12,
    # so shutil.which is preferred; distutils is only a fallback for
    # interpreters that do not provide shutil.which.
    try:
        from shutil import which
    except ImportError:
        from distutils.spawn import find_executable as which
    if which("netMHCpan") is None:
        fatal("netMHCpan must be executable on users $PATH. Review documentation for information on installation.")

    # Check kmer length is an odd number
    if (sub_args.kmerLength % 2) == 0:
        # TODO: add a lambda function to enforce
        # this type/assumption with argparse
        fatal("WARNING: --kmerLength must be an odd number. Please revise input and try again")

    # Check allele list is <=20 in length
    split_alleleList = sub_args.alleleList.split(",")
    if len(split_alleleList) > 20:
        # TODO: add a lambda function to enforce
        # this type/assumption with argparse
        fatal("WARNING: --alleleList must contain 20 or fewer alleles. Please revise input and try again")

    # Parse fields of interest from the input file.
    # This file can be the output of METRO find or maybe
    # an input created by a user. Input file is required
    # have the following fields:
    # - Hugo_Symbol
    # - Transcript_ID
    # - Variant_Classification
    # - WT_Subset_AA_Sequence
    # - Mutated_Subset_AA_Sequence
    print("--Preparing Data")
    err('----Opening {}'.format(sub_args.mutationFile))
    df = maf(sub_args.mutationFile[0],
    subset=['Hugo_Symbol','Transcript_ID','Variant_Classification',
            'WT_Subset_AA_Sequence','Mutated_Subset_AA_Sequence'])

    # Remove duplicate values
    df = df.drop_duplicates()

    # Create fasta header line
    df["header"] = ">" + df["Transcript_ID"] + "_" + df["Hugo_Symbol"]

    # Number of residues on either side of a centered mutation
    half_kmer = int((sub_args.kmerLength - 1) // 2)

    # for each potential mutation
    kmer_list = []
    up_list = []
    down_list = []
    for i, row in df.iterrows():
        mut_type = str(row['Variant_Classification'])
        wt_str = str(row['WT_Subset_AA_Sequence'])
        mt_str = str(row['Mutated_Subset_AA_Sequence'])

        # Position of the first residue that differs from wild-type
        first_diff = _first_difference(wt_str, mt_str)
        if first_diff is None:
            # Nothing to predict: the sequences are the same or the
            # mutated sequence only stops early. Record empty entries
            # so the lists stay aligned with the df rows.
            if wt_str != mt_str:
                err("WARNING: Skipping over {} because its mutated sequence contains no residues that differ from the wild-type sequence!".format(str(row['Transcript_ID'])))
            kmer_list.append("")
            up_list.append("")
            down_list.append("")
            continue

        # Handle frameshifts differently than other mutation types
        if "Frame" in mut_type:
            # Set prediction position up and downstream of mutation
            # where downstream is all AA following mutation,
            # where upstream is 10 AA upstream of mutation or
            # until the start of the seq
            up_loc = max(first_diff - 10, 0)
            down_loc = len(mt_str)
        else:
            # Set prediction position up and downstream of mutation
            # to the length of the kmer with the mutation centered
            # add one to downstream for inclusive range locations
            up_loc = first_diff - half_kmer
            down_loc = first_diff + half_kmer + 1

            # Check peptide length to ensure kmer length is possible within
            # the defined mutation location.
            # if the peptide is not longer than the kmer then use the whole peptide
            # if the window starts before the sequence then start at 0
            # if the window ends after the sequence then stop at its end
            if len(mt_str) <= sub_args.kmerLength:
                up_loc = 0
                down_loc = len(mt_str)
            elif up_loc <= 0:
                up_loc = 0
            elif len(mt_str) <= down_loc:
                down_loc = len(mt_str)

        # Set verification position up and downstream of mutation
        # where upstream is 3 AA preceding mutation or until the start of the seq
        # where downstream is 3 AA following mutation or until the end of the seq
        # The upstream start must be clamped at 0: a negative start would
        # slice from the end of the sequence and could give an empty
        # window, and an empty window matches every peptide.
        up_verify = max(first_diff - 3, 0)
        down_verify = min(first_diff + 4, len(mt_str))

        # Add kmer AA sequences to predict
        kmer_list.append(mt_str[up_loc:down_loc])

        # Save up/downstream windows
        # example: SLEYVSTRQPLRRSLRSCSTP with mutation at pos 10 L
        # up_window is RQPL | down_window is LRRS
        up_list.append(mt_str[up_verify:first_diff + 1])
        down_list.append(mt_str[first_diff:down_verify])

    # Add lists to df
    df["kmer_Seqs"] = kmer_list
    df["up_verify"] = up_list
    df["down_verify"] = down_list

    # Subset df for only samples with a kmer, drop duplicates.
    # Skipped mutations carry an empty string (not NaN), so they
    # have to be removed explicitly.
    df = df.dropna(thresh=1)
    df = df[df["kmer_Seqs"] != ""]
    df = df.drop_duplicates()

    # Create a dicts for use downstream
    symbol_dict = dict(zip(df.kmer_Seqs, df.Hugo_Symbol))
    up_dict = dict(zip(df.kmer_Seqs, df.up_verify))
    down_dict = dict(zip(df.kmer_Seqs, df.down_verify))

    # Create file in fasta format: header and kmer
    # of each record are separated by a newline
    netMHC_input = os.path.join(sub_args.outputDir, sub_args.outprefix + "_input_netmhc.tsv")
    df.to_csv(netMHC_input, columns=["header", "kmer_Seqs",], header=False, index=False, sep="\n")

    # Run netMHC, with parallelization via RAY
    # NOTE(review): the predictor script is referenced relative to the
    # current working directory -- confirm how metro is expected to be run
    netmhc_intermed = os.path.join(sub_args.outputDir, sub_args.outprefix + "_output_netmhc_")
    process = "python src/predictor.py {} {} {} {} {}".format(
        sub_args.alleleList,
        netMHC_input,
        sub_args.peptideLength,
        netmhc_intermed,
        str(sub_args.threads)
    )
    print("Running: " + process)
    exitcode = bash(process)

    # Read in output of netMHC
    print("--Post-Processing")
    df_list = []
    for allele in split_alleleList:
        # Read in intermed output of netmhc
        netmhc_raw_output = os.path.join(netmhc_intermed + "raw_" + allele + ".tsv")
        df_tmp = csv(netmhc_raw_output, index_col=None, header=None, sep="\t")

        # Remove the row with index label 0
        # NOTE(review): an earlier comment described removing two rows
        # (allele name and header); only label 0 is dropped here --
        # confirm against the output written by src/predictor.py
        df_tmp.drop([0], inplace=True)

        # Drop cols Ave, NB
        df_tmp.drop([9,10], axis=1, inplace=True)

        # Add column ID of the allele
        df_tmp = df_tmp.assign(allele=allele)
        df_tmp.columns = ['Pos', 'Peptide', 'ID', 'core',
                        'icore', 'EL-score', 'EL_Rank', 'BA-score', 'BA_Rank','allele']
        df_list.append(df_tmp)
    # A fresh index keeps row labels unique across alleles, so that the
    # label based drop below only removes the rows that failed validation
    df = pd.concat(df_list, ignore_index=True)

    # Search for up verification AA sequence in output peptide,
    # if is does not exist search for downstream verification
    # sequence. If neither exist the mutation is not in sequence
    # and rows should be removed from further analysis.
    # For example, a 9-mer could be generated in the flanking
    # 9 AA sequence of a 21-mer region. Must be done after df
    # transformation as each core peptide analyzed in a row
    # across alleles may be different from one another.
    keeprows_list = []
    peptidelen_list = []
    for i, row in df.iterrows():
        search_key = str(row['Peptide'])
        pep_len = len(search_key)

        # Verification windows of every kmer that contains the peptide
        res_up = [val for key, val in up_dict.items() if search_key in key]
        res_down = [val for key, val in down_dict.items() if search_key in key]

        # Some peptides have more than one mutation and therefore
        # the sub AA sequence will match to more than one key.
        # Check if there is a match in the upstream validator,
        # if there isn't then check downstream.
        res = "N"
        for up_loc in res_up:
            if up_loc in search_key: res = "Y"
        if res == "N":
            for down_loc in res_down:
                if down_loc in search_key: res = "Y"

        # Append values to list
        keeprows_list.append(res)
        peptidelen_list.append(pep_len)

    # Add peptide length as col
    df["peptide_length"] = peptidelen_list

    # Drop rows that were not validated
    df["keeprows"] = keeprows_list
    df.drop(df.index[df['keeprows'] == "N"], inplace=True)

    # Peptides length varies to match all possibilities of the peptideLength list
    # dictionary created above includes the fullPeptide:Hugo_Symbol
    # In order to match the partial terms, must iterate through the df and assign the
    # correct Hugo_Symbol
    # Attempted to map using df['Hugo_Symbol'] = df['ID'].map(symbol_dict), however,
    # output will shorten the ID column making this unreliable to use as key
    gene_list = []
    for i, row in df.iterrows():
        search_key = str(row['Peptide'])
        res = [val for key, val in symbol_dict.items() if search_key in key]
        res = res[0].replace("[", "").replace("]", "")
        gene_list.append(res)
    df["Hugo_Symbol"] = gene_list

    # Add categorical labels to the strength of prediction
    df["EL_Rank"] = pd.to_numeric(df["EL_Rank"])
    df = df.sort_values(by=['EL_Rank'])
    df.loc[df['EL_Rank']<=sub_args.highbind,'prediction_strength'] = 'Strong'
    df.loc[df['EL_Rank']>sub_args.highbind,'prediction_strength'] = 'Weak'
    df.loc[df['EL_Rank']>sub_args.lowbind,'prediction_strength'] = 'Unlikely'

    # create final output df; columns are selected by position:
    # allele(9), Hugo_Symbol(12), ID(2), Peptide(1), peptide_length(10),
    # prediction_strength(13), then core through BA_Rank (3-8)
    df_sub = df.iloc[:, np.r_[9,12,2,1,10,13,3:9]]
    df_sub.columns = ['Allele', 'Hugo_Symbol', 'ID', 'Peptide',
                        'Peptide_Length','Prediction_Strength',
                        'core', 'icore', 'EL-score', 'EL_Rank', 'BA-score', 'BA_Rank']
    netmhc_final_output = os.path.join(sub_args.outputDir, sub_args.outprefix + "_output_netmhc_final.tsv")
    df_sub.to_csv(netmhc_final_output, header=True, index=False, sep="\t")


def parsed_arguments():
    """Builds the METRO command-line interface and parses the user-provided
    command-line arguments. Requires the argparse and textwrap packages, both
    of which are part of the python standard library.

    A top-level parser is created along with one sub-parser for each sub command:
    build, prepare, find, and predict. argparse does not support named subparser
    groups (https://bugs.python.org/issue9341), which is normally what would be
    used to list required options separately from optional ones. As a work around,
    the auto-generated usage and the help message of every option are suppressed,
    and a hand-written description is displayed for each sub command instead.
    If a new required arg is added to a subparser, it must be added to the
    matching description and the usage statement also must be updated.

    If no sub command is provided (i.e. sys.argv has fewer than two items), the
    top-level parser reports a fatal error via parser.error().
    @return args <argparse.Namespace>:
        Parsed command-line arguments. The 'func' attribute is set to the handler
        of the selected sub command (build, prepare, find, or predict).
    """

    # Add stylized terminal name and description
    c = Colors
    styled_name = "{0}{1}METRO{2}".format(c.bold, c.cyan, c.end)
    description = "{0}{1}{2}".format(c.bold, _description, c.end)
    named_description = '{0}{1}: {2}{3}'.format(
        c.bg_black, 
        styled_name, 
        description, 
        c.end
    )    

    # Create a top-level parser
    parser = argparse.ArgumentParser(
        description = named_description
    )

    # Adding Version information
    parser.add_argument('--version', action = 'version', version='%(prog)s {}'.format(__version__))

    # Create sub-command parser
    subparsers = parser.add_subparsers()
    
    # Options for the "build" sub-command
    # Grouped sub-parser arguments are currently not supported.
    # https://bugs.python.org/issue9341
    # Here is a work around to create more useful help message for named
    # options that are required! Please note: if a required arg is added the
    # description below should be updated (i.e. update usage and add new option)
    # Format fields: {0} styled title, {1} program name, {2} bold, {3} url,
    # {4} italic, {5} end (reset) -- see the .format() call closing the string
    required_build_options = textwrap.dedent("""\
        {0}

        {2}{3}Usage:{5}
          $ {1} build [--help] \\
                    --ref-fa REF_FA \\
                    --ref-gtf REF_GTF \\
                    --outputDir OUTPUTDIR

        {2}{3}Description:{5}
          Builds reference files for the mouse neoantigen prediction pipeline 
        from a genomic FASTA file and an annotation in GTF format. The build 
        sub command creates a FASTA file containing the sequence of each 
        transcript. 

        {2}{3}Required arguments:{5}
          --ref-fa REF_FA        Genomic FASTA file of the reference genome.
                                 This file can be downloaded from GENCODE. It
                                 should match the genomic FASTA file used for 
                                 variant calling.

          --ref-gtf REF_GTF      GTF file for the reference genome. This file
                                 can be downloaded from GENCODE. Ideally, this 
                                 file should match the GTF file version your 
                                 variant annotation uses.   

          --outputDir OUTPUTDIR  Path to an output directory. This path is where 
                                 the pipeline will create all of its output files. 
                                 If the provided output directory does not exist, 
                                 it will be created. After the build sub command 
                                 completes, the transcripts.fa or transcriptomic 
                                 FASTA file can be used or supplied to the find 
                                 sub command.
        
        {2}{3}Optional arguments:{5}
          -h, --help             Show usage information, help message, and exit.
        """.format(named_description, _name, c.bold, c.url, c.italic, c.end))

    # Display example usage in epilog
    # Format fields: {0} program name, {1} version, {2} bold, {3} url, {4} end
    build_epilog = textwrap.dedent("""\
        {2}{3}Example:{4}
          # Step 1.) Grab an interactive node 
          # do not run on head node!
          srun -N 1 -n 1 --time=1:00:00 --mem=8gb  --cpus-per-task=2 --pty bash
          module purge
          module load cuffdiff samtools

          # Step 2.) Build {0} reference files
          {0} build --ref-fa GRCm39.primary_assembly.genome.fa \\
                      --ref-gtf gencode.vM26.annotation.gtf \\
                      --outputDir /scratch/$USER/METRO/refs/

        {2}{3}Version:{4}
          {1}
        """.format(_name, __version__, c.bold, c.url, c.end))

    # Suppressing help message of required args to overcome no sub-parser named groups
    subparser_build = subparsers.add_parser(
        'build',
        help = 'Builds the reference files for METRO.',
        usage = argparse.SUPPRESS,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description = required_build_options,
        epilog = build_epilog,
        add_help=False
    )
    # Input Genomic FASTA file
    subparser_build.add_argument(
        '--ref-fa',
        # Check if the file exists and if it is readable
        type = lambda file: permissions(parser, file, os.R_OK),
        required = True,
        help = argparse.SUPPRESS
    )
    # Input Reference GTF file
    subparser_build.add_argument(
        '--ref-gtf',
        # Check if the file exists and if it is readable
        type = lambda file: permissions(parser, file, os.R_OK),
        required = True,
        help = argparse.SUPPRESS
    )
    # Output Directory (build working directory)
    subparser_build.add_argument(
        '--outputDir',
        # Expand a leading '~' and resolve to an absolute path
        type = lambda option: os.path.abspath(os.path.expanduser(option)),
        required = True,
        help = argparse.SUPPRESS
    )
    
    # Optional Arguments
    # Add custom help message
    subparser_build.add_argument(
        '-h', '--help', 
        action='help', 
        help=argparse.SUPPRESS
    )

    # Options for the "prepare" sub-command
    # Grouped sub-parser arguments are currently not supported by argparse.
    # https://bugs.python.org/issue9341
    # Here is a work around to create more useful help message for named
    # options that are required! Please note: if a required arg is added the
    # description below should be updated (i.e. update usage and add new option)
    required_prepare_options = textwrap.dedent("""\
        {0}

        {2}{3}Usage:{5}
          $ {1} prepare [--help] \\
                    [--impactFilter IMPACTFILTER] \\
                    [--passFilter PASSFILTER] \\
                    [--vafFilter VAFFILTER] \\
                    --mafFiles MAFFILES \\
                    --outputDir OUTPUTDIR \\
                    --outprefix OUTprefix

        {2}{3}Description:{5}
          Prepares input files for the mouse neoantigen prediction pipeline. 
        The input sub command takes a VCF-like or MAF-like file and filters 
        it using the parameters outlined below. This is to prepare a final,
        filtered input file to the {1} find sub command. 

        {2}{3}Required arguments:{5}
            --mafFiles MAFFILES   
                                 Input VCF file(s) in MAF format. Provide a minimum 
                                 of two files, separated by a space. Each input file 
                                 must have the following required columns in the header:
                                    • Hugo_Symbol 
                                    • Start_Position 
                                    • End_Position 
                                    • Reference_Allele
                                    • Tumor_Seq_Allele1 
                                    • Tumor_Seq_Allele2 
            
            --outputDir OUTPUTDIR 
                                 Path to an output directory. This path is where the 
                                 pipeline will create all of its output files. If the 
                                 provided output directory does not exist, then it 
                                 will be created automatically. 

            --outprefix OUTprefix
                                 Prefix for output file names.

        {2}{3}Optional arguments:{5}
            -h, --help           Show usage information, help message, and exit.
            --impactFilter IMPACTFILTER
                                 Minimum number of input files with an IMPACT rating 
                                 of "MODERATE" or "HIGH" to be included. 
                                 Default: 2

            --passFilter PASSFILTER
                                 Minimum number of input files with a filter rating 
                                 of "PASS" to be included. 
                                 Default: 2

            --vafFilter VAFFILTER
                                 Minimum value for average VAF calculated as 
                                 (t_alt_count/t_depth) to be included. 
                                 Default: 0.2
        """.format(named_description, _name, c.bold, c.url, c.italic, c.end))

    # Display example usage in epilog
    # Note: nothing may follow a trailing backslash in the example below,
    # otherwise the shell line continuation breaks when it is copy-pasted
    prepare_epilog = textwrap.dedent("""\
        {2}{3}Example:{4}
          # Step 1.) Grab an interactive node
          # do not run on head node!
          srun -N 1 -n 1 --time=1:00:00 --mem=8gb  --cpus-per-task=2 --pty bash
          module purge
          module load python/3.5

          # Step 2.) Run METRO subcommand {0} to prepare files for processing
          ./{0} prepare \\
                --mafFiles /data/*.maf \\
                --outputDir /scratch/$USER/METRO/ \\
                --outprefix test \\
                --vafFilter 0.2 \\
                --passFilter 2 \\
                --impactFilter 2

        {2}{3}Version:{4}
          {1}
        """.format(_name, __version__, c.bold, c.url, c.end))

    # Suppressing help message of required args to overcome no sub-parser named groups
    subparser_prepare = subparsers.add_parser(
        'prepare',
        help = 'Filter input MAF files for METRO variant binding.',
        usage = argparse.SUPPRESS,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description = required_prepare_options,
        epilog  = prepare_epilog,
        add_help=False
    )

    # Required arguments
    # Input mutation file(s), one or more space separated paths
    subparser_prepare.add_argument(
        '--mafFiles',
        # Check if the file exists and if it is readable
        type = lambda file: permissions(parser, file, os.R_OK),
        required = True,
        nargs = '+',
        help = argparse.SUPPRESS
    )
    # Output Directory (analysis working directory)
    subparser_prepare.add_argument(
        '--outputDir',
        type = lambda option: os.path.abspath(os.path.expanduser(option)),
        required = True,
        help = argparse.SUPPRESS
    )
    # Output prefix
    subparser_prepare.add_argument(
        '--outprefix',
        required = True,
        help = argparse.SUPPRESS
    )
    
    # Optional arguments
    # Add custom help message
    subparser_prepare.add_argument(
        '-h', '--help', 
        action='help', 
        help=argparse.SUPPRESS
    )
    # average VAF filter
    subparser_prepare.add_argument(
        '--vafFilter',
        required = False,
        default = 0.2,
        type = float,
        help = argparse.SUPPRESS
    )
    # PASS filter
    subparser_prepare.add_argument(
        '--passFilter',
        required = False,
        default = 2,
        type = int,
        help = argparse.SUPPRESS
    )
    # IMPACT (MODERATE, HIGH) filter
    subparser_prepare.add_argument(
        '--impactFilter',
        required = False,
        default = 2,
        type = int,
        help = argparse.SUPPRESS
    )

    # Options for the "find" sub-command
    # Grouped sub-parser arguments are currently not supported by argparse.
    # https://bugs.python.org/issue9341
    # Here is a work around to create more useful help message for named
    # options that are required! Please note: if a required arg is added the
    # description below should be updated (i.e. update usage and add new option)
    required_find_options = textwrap.dedent("""\
        {0}

        {2}{3}Usage:{5}
          $ {1} find [--help] \\
                   [--subset SUBSET] \\
                   --input INPUT [INPUT ...] \\
                   --transcripts TRANSCRIPTS \\
                   --outputDir OUTPUT

        {2}{3}Description:{5}
          Runs the mouse neoantigen prediction pipeline with reference files built 
        with the build sub command to determine the consequence of a mutation on a 
        protein product. The find sub command takes a VCF-like or MAF-like file and 
        a FASTA file containing transcript sequences. 

        {2}{3}Required arguments:{5}
          --input INPUT [INPUT ...]
                           Input VCF-like or MAF-like files to process. One or more 
                           files can be provided. A mutated amino acid sequence will 
                           be generated for each variant in the supplied input file.

          --transcripts TRANSCRIPTS
                           Transcriptomic FASTA file. This reference file contains 
                           the sequence of each transcript in the reference genome. 
                           The file can be generated by running the build sub command. 
                           When creating this reference file, it is very important to 
                           use the same genomic FASTA and annotation file to call and 
                           annotate variants.

          --outputDir OUTPUT
                           Path to an output directory. This path is where the pipeline
                           will create all of its output files. If the provided output 
                           directory does not exist, it will be created.
        
        {2}{3}Optional arguments:{5}
          -h, --help       Show usage information, help message, and exit.
          --subset SUBSET  Subset resulting mutated amino acid sequence. If defined,
                           this option will obtain the mutated amino acid sequence
                           (AAS) +/- N amino acids of the mutation start site. By
                           default, the first 30 upstream and downstream amino acids
                           from the mutation site are recorded for non-frame shift
                           mutations. Amino acids downstream of a frame shift mutation
                           will be reported until the end of the amino acids sequence
                           for the variants transcript or until the first reported
                           terminating stop codon is found. 
                           Default: 30
        """.format(named_description, _name, c.bold, c.url, c.italic, c.end))

    # Display example usage in epilog
    find_epilog = textwrap.dedent("""\
        {2}{3}Example:{4}
          # Step 1.) Grab an interactive node
          # do not run on head node!
          srun -N 1 -n 1 --time=1:00:00 --mem=8gb  --cpus-per-task=2 --pty bash
          module purge
          module load python/3.5

          # Step 2.) Run METRO find to find mutated protein products
          ./metro find \\
                    --input /data/*.xlsx \\
                    --outputDir /scratch/$USER/METRO \\
                    --subset 30 \\
                    --transcripts transcripts.fa 

        {2}{3}Version:{4}
          {1}
        """.format(_name, __version__, c.bold, c.url, c.end))

    # Suppressing help message of required args to overcome no sub-parser named groups
    subparser_find = subparsers.add_parser(
        'find',
        help = 'Run METRO find against called variants.',
        usage = argparse.SUPPRESS,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description = required_find_options,
        epilog  = find_epilog,
        add_help = False
    )

    # Required arguments
    # Input VCF-like or MAF-like file(s), one or more space separated paths
    subparser_find.add_argument(
        '--input',
        # Check if the file exists and if it is readable
        type = lambda file: permissions(parser, file, os.R_OK),
        required = True,
        nargs = '+',
        help = argparse.SUPPRESS
    )
    # Output Directory (analysis working directory)
    subparser_find.add_argument(
        '--outputDir',
        type = lambda option: os.path.abspath(os.path.expanduser(option)),
        required = True,
        help = argparse.SUPPRESS
    )
    # Reference Transcriptome
    subparser_find.add_argument(
        '--transcripts',
        # Check if the file exists and if it is readable
        type = lambda file: permissions(parser, file, os.R_OK),
        required = True,
        help = argparse.SUPPRESS
    )

    # Optional arguments
    # Add custom help message
    subparser_find.add_argument(
        '-h', '--help', 
        action='help', 
        help=argparse.SUPPRESS
    )
    # Obtain the mutated amino acid sequence(AAS) 
    # +- N amino acids of the mutated amino acid.
    subparser_find.add_argument(
        '--subset',
        type = int,
        required = False,
        default = 30,
        help = argparse.SUPPRESS
    )
    
    # Options for the "predict" sub-command
    # Grouped sub-parser arguments are currently not supported by argparse.
    # https://bugs.python.org/issue9341
    # Here is a work around to create more useful help message for named
    # options that are required! Please note: if a required arg is added the
    # description below should be updated (i.e. update usage and add new option)
    required_predict_options = textwrap.dedent("""\
        {0}

        {2}{3}Usage:{5}
          $ {1} predict [--help] \\
                      [--kmerLength KMERLENGTH] [--peptideLength PEPTIDELENGTH] \\
                      [--highbind HIGHBIND] [--lowbind LOWBIND] \\
                      [--threads THREADS] \\
                      --mutationFile MUTATIONFILE \\
                      --alleleList ALLELELIST \\
                      --outputDir OUTPUTDIR \\
                      --outprefix OUTPREFIX

        {2}{3}Description:{5}
          Run the mouse neoantigen prediction pipeline with output files from 
        the find sub command to predict the binding affinity of peptides to MHC
        class molecules. The predict sub command takes a metro TSV from the find
        sub command and a list alleles to evaluate as input. Binding prediction 
        is completed with NetMHCpan. Please do not forget to cite those authors.

        {2}{3}Citation:{5}
         {4}@NetMHCpan{5}
          • Gapped sequence alignment using artificial neural networks: application 
            to the MHC class I system. Andreatta M, Nielsen M. Bioinformatics (2016) 
            Feb 15; 32(4):511-7

        {2}{3}Required arguments:{5}
            --mutationFile MUTATIONFILE
                            Input file in tsv format. This can be the output of the 
                            {1} find command or a user generated file. Each input 
                            file must have the following required columns in the header:
                                • Variant_Classification
                                • Hugo_Symbol	
                                • Transcript_ID
                                • WT_Subset_AA_Sequence	
                                • Mutated_Subset_AA_Sequence
            --alleleList ALLELELIST
                            Allele name(s). More than one allele can be provided at a
                            time. Multiple alleles should be separated by 
                            commas (without spaces, max 20 per submission). For a full 
                            list of alleles possible, please visit this NetMHCpan page:
                            https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/MHC_allele_names.txt
            
            --outputDir OUTPUTDIR 
                            Path to an output directory. This path is where the pipeline
                            will create all of its output files. If the provided output 
                            directory does not exist, it will be created. 
            
            --outprefix OUTPREFIX
                            Prefix for outputs file names. 

        {2}{3}Optional arguments:{5}
            -h, --help      Show usage information, help message, and exit.

            --threads THREADS
                            Number of threads to use for multiprocessing. Providing more 
                            threads will significantly reduce the overall run time. Each 
                            allele will be evaluated in parallel. The optimal number of
                            cores can be found by min(len(allelelist)-1, CPUsAvailable-1).
                            Default: 4

            --kmerLength KMERLENGTH
                            Length of Mutated_Subset_AA_Sequence to submit in prediction 
                            analysis. Will set mutation to be center of length for non-
                            frameshift mutations, so value must be an odd number.
                            Default: 21

            --peptideLength PEPTIDELENGTH
                            Single value, or list, of length to shorten peptide sequence
                            for prediction analysis. If list, each length is separated by 
                            a comma and without spaces.
                            Default: 8,9,10,11

            --highbind HIGHBIND
                            Threshold to define binding affinity as "STRONG" for netHMC 
                            output. Must be a number that is lower than --lowbind.
                            Default: 0.5

            --lowbind LOWBIND
                            Threshold to define binding affinity as "WEAK" for netHMC 
                            output. Must be an integer that is higher than --highbind.
                            Default: 2
        """.format(named_description, _name, c.bold, c.url, c.italic, c.end))

    # Display example usage in epilog
    predict_epilog = textwrap.dedent("""\
        {2}{3}Example:{4}
          # Step 1.) Grab an interactive node 
          # do not run on head node!
          srun -N 1 -n 1 --time=1:00:00 --mem=8gb  --cpus-per-task=18 --pty bash
          module purge
          module load python/3.5

          # Step 2.) Run {0} predict to predict peptide binding
          # Using number of available threads - 2
          ./metro predict \\
                --threads 16 \\
                --mutationFile /scratch/$USER/METRO/test_Variant.metro.tsv \\
                --alleleList H-2-Ld,H-2-Dd,H-2-Kb \\
                --peptideLength 8,9,10,11 \\
                --kmerLength 21 \\
                --highbind 0.5 \\
                --lowbind 2 \\
                --outputDir /scratch/$USER/METRO/ \\
                --outprefix test

        {2}{3}Version:{4}
          {1}
        """.format(_name, __version__, c.bold, c.url, c.end))

    
    # Suppressing help message of required args to overcome no sub-parser named groups
    subparser_predict = subparsers.add_parser(
        'predict',
        help = 'Predict METRO variant binding using NETMHC.',
        usage = argparse.SUPPRESS,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description = required_predict_options,
        epilog  = predict_epilog,
        add_help = False
    )

    # Required arguments
    # Input mutation file
    subparser_predict.add_argument(
        '--mutationFile',
        # Check if the file exists and if it is readable
        type = lambda file: permissions(parser, file, os.R_OK),
        required = True,
        nargs = '+',
        help = argparse.SUPPRESS
    )
    # HLA allele name
    subparser_predict.add_argument(
        '--alleleList',
        required = True,
        help = argparse.SUPPRESS
    )                           
    # Output Directory (analysis working directory)
    subparser_predict.add_argument(
        '--outputDir',
        type = lambda option: os.path.abspath(os.path.expanduser(option)),
        required = True,
        help = argparse.SUPPRESS
    )
    # Output prefix
    subparser_predict.add_argument(
        '--outprefix',
        required = True,
        help = argparse.SUPPRESS
    )

    # Optional arguments
    # Custom help message
    subparser_predict.add_argument(
        '-h', '--help', 
        action='help', 
        help=argparse.SUPPRESS
    )
    # Threads for multiprocessing
    subparser_predict.add_argument(
        '--threads',
        required = False,
        default = 4,
        type = int,
        help = argparse.SUPPRESS
    )
    # kmerLength
    subparser_predict.add_argument(
        '--kmerLength',
        required = False,
        default = 21,
        type = int,
        help = argparse.SUPPRESS
    )
    # Peptide length, kept as a comma separated string (no type conversion here)
    subparser_predict.add_argument(
        '--peptideLength',
        required = False,
        default='8,9,10,11',
        help = argparse.SUPPRESS
    )
    # High Binding Threshold
    subparser_predict.add_argument(
        '--highbind',
        required = False,
        default = 0.5,
        type = float,
        help = argparse.SUPPRESS
    )
    # Low Binding Threshold
    subparser_predict.add_argument(
        '--lowbind',
        required = False,
        default = 2,
        type = int,
        help = argparse.SUPPRESS
    )
    
    # Sanity check for user command line arguments 
    # Without any argument no sub command handler would be set, so fail early
    if len(sys.argv) < 2:
        parser.error("""\n\t └── Fatal: failed to provide a valid sub command to {0}!
             Please run '{0} -h' to view more information about {0}'s usage.""".format(
                _name
            )
        )

    # Define handlers for each sub-parser
    subparser_build.set_defaults(func = build)
    subparser_prepare.set_defaults(func = prepare)
    subparser_find.set_defaults(func = find)
    subparser_predict.set_defaults(func = predict)

    # Parse command-line args
    args = parser.parse_args()
    return args


def main():
    """Entry point of the program: parses the command-line arguments and
    hands them over to the handler of the selected sub command.
    """
    # Namespace of parsed options for the chosen sub command
    cli_args = parsed_arguments()

    # Each sub-parser registers its handler function under the 'func' default
    handler = cli_args.func
    handler(cli_args)


# Only run the command-line interface when this file is executed as a script,
# not when it is imported as a module
if __name__ == '__main__':
    main()