#!/bin/bash

#   Config -- sequence_handling configuration file
#       (the title is kept as a comment so that it is not executed as a command)

#   More complete information on how to fill out
#       this Config file can be found at:
#       https://github.com/MorrellLAB/sequence_handling/wiki/Configuration

###################################################
##########     Shared across handlers    ##########
###################################################

#   Root directory for output files.
#       Each handler writes to ${OUT_DIR}/Name_of_Handler
OUT_DIR="/home/morrellp/large/soybean_toy-all-libs"

#   Project name
PROJECT="soybean_toy_dataset_all_libs"

#   Email address that receives job notifications
EMAIL="large@umn.edu"

#   Encoding of the quality values.
#       Check the FastQC files to determine the sequence encoding.
#       Valid options: 'sanger', 'illumina', 'solexa', or 'phred'
#           ('sanger' is the encoding used by Illumina 1.8+ and Oxford Nanopore)
QUAL_ENCODING="sanger"

#   Sequencing platform that produced the reads.
#       Valid options:
#           CAPILLARY, LS454, ILLUMINA,
#           SOLID, HELICOS, IONTORRENT,
#           ONT, and PACBIO
SEQ_PLATFORM="ILLUMINA"

#   Reference genome (full physical file path)
REF_GEN="/home/morrellp/large/soybean_toy/genome/Gmax_275_v2.0.fa"

#   Is the organism barley? ("true" or "false")
BARLEY="false"

#   Is the analysis running on the Minnesota Supercomputing Institute (MSI)?
#       ("true" or "false")
MSI="true"

#   Are jobs submitted with qsub from PBS (Portable Batch System)?
#       ("true" or "false")
USE_PBS="false"

#   Are jobs submitted with sbatch from Slurm? ("true" or "false")
USE_SLURM="true"

#   Should the quality scores be adjusted for GATK? Default: false
#       Set to true if GATK reports errors such as:
#       "<Sample> appears to be using the wrong encoding for quality scores: we encountered an extremely high quality score"
FIX_QUALITY_SCORES="false"

#   Temporary directory for Picard/GATK.
#       Optional: set this if picard has run out of temp space;
#       otherwise, leave blank
TMP="/home/morrellp/large/soybean_toy-all-libs/tmp"

############################################
##########     GBS_Demultiplex    ##########
############################################

#   IMPORTANT: GBS_Demultiplex is still under development and is not yet functional!

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
GD_QUEUE='small'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
GD_SBATCH='--nodes=1 --ntasks-per-node=4 --mem=1gb --tmp=1gb -t 06:00:00'
GD_SBATCH="${GD_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${GD_QUEUE}"

#   USE_PBS: QSub settings for GBS_Demultiplex
GD_QSUB='mem=1gb,nodes=1:ppn=4,walltime=6:00:00'

#   NOTE(review): GBS_SAMPLES is not documented in this file; presumably the
#       list of GBS samples to demultiplex -- confirm against the handler.
GBS_SAMPLES=""

#   Key file with the following columns:
#       Flowcell Lane Barcode Sample PlateName Row Column
KEY_FILE=""

#   Position of the barcodes within the sequence.
#       Valid options: 'beginning' or 'end'
LINE_ENDING=""

#   Number of mismatches allowed
MISMATCH_TOL="1"

#   Is partial overlap allowed?
PARTIAL="0"

#   Extension or naming convention of the samples
FILE_TYPE=""

############################################
##########   Quality_Assessment   ##########
############################################

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
QA_QUEUE='small'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
QA_SBATCH='--nodes=1 --ntasks-per-node=4 --mem=1gb --tmp=1gb -t 06:00:00'
QA_SBATCH="${QA_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${QA_QUEUE}"

#   USE_PBS: QSub settings for Quality_Assessment (recommended values shown)
QA_QSUB='mem=1gb,nodes=1:ppn=4,walltime=6:00:00'

#   List of FastQ, SAM, or BAM files to be used (full file path)
QA_SAMPLES="/home/morrellp/large/soybean_toy-all-libs/All-libraries.txt"

#   Size in basepairs of the genome (for whole-genome sequencing)
#   or of the capture region (for exome capture).
#   If unavailable, put "NA"
TARGET="NA"

############################################
##########    Adapter_Trimming    ##########
############################################

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
AT_QUEUE='small'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
AT_SBATCH='--nodes=1 --ntasks-per-node=4 --mem=1gb --tmp=1gb -t 10:00:00'
AT_SBATCH="${AT_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${AT_QUEUE}"

#   USE_PBS: QSub settings for Adapter_Trimming (recommended values shown)
AT_QSUB='mem=1gb,nodes=1:ppn=4,walltime=10:00:00'

#   List of the original FastQ files (full file path)
RAW_SAMPLES="/home/morrellp/large/soybean_toy-all-libs/All-libraries.txt"

#   Suffix shared by the forward samples
#       Example: _1_sequence.txt.gz
FORWARD_NAMING="R1_001.fastq.gz"

#   Suffix shared by the reverse samples
#       Example: _2_sequence.txt.gz
REVERSE_NAMING="R2_001.fastq.gz"

#   For single-end samples, keep
#       FORWARD_NAMING and REVERSE_NAMING
#       set to values that do NOT
#       match your samples

#   Adapter file (full physical file path)
ADAPTERS="/home/morrellp/large/Softwares/adapters.txt"

#   Scythe's prior contamination rate.
#       Scythe's documentation suggests starting at
#       0.05 and then experimenting as needed
PRIOR="0.05"

############################################
##########    Quality_Trimming    ##########
############################################

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
QT_QUEUE='small'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
QT_SBATCH='--nodes=1 --ntasks-per-node=4 --mem=1gb --tmp=1gb -t 10:00:00'
QT_SBATCH="${QT_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${QT_QUEUE}"

#   USE_PBS: QSub settings for Quality_Trimming (recommended values shown)
QT_QSUB='mem=1gb,nodes=1:ppn=4,walltime=10:00:00'

#   List of adapter-trimmed samples (full file path).
#       Disabled alternative that points at the Adapter_Trimming output:
#ADAPTED_LIST=${OUT_DIR}/Adapter_Trimming/${PROJECT}_trimmed_adapters.txt
ADAPTED_LIST="/home/morrellp/large/soybean_toy-all-libs/All-libraries-trimmed.txt"

#   Suffix shared by the trimmed forward samples.
#       Leave this definition as is when using
#       the Adapter_Trimming handler
FORWARD_ADAPTED="_Forward_ScytheTrimmed.fastq.gz"

#   Suffix shared by the trimmed reverse samples.
#       Leave this definition as is when using
#       the Adapter_Trimming handler
REVERSE_ADAPTED="_Reverse_ScytheTrimmed.fastq.gz"

#   Suffix shared by the trimmed single samples.
#       Leave this definition as is when using
#       the Adapter_Trimming handler
SINGLES_ADAPTED="_Single_ScytheTrimmed.fastq.gz"

#   For single-end samples, keep
#       FORWARD_ADAPTED and REVERSE_ADAPTED
#       set to values that do NOT
#       match your samples

#   For paired-end samples, keep
#       SINGLES_ADAPTED set to a value that
#       does NOT match your samples

#   Threshold for trimming.
#       For standard trimming, use 20
QT_THRESHOLD="20"

############################################
##########      Read_Mapping      ##########
############################################

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
RM_QUEUE='small'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
RM_SBATCH='--nodes=1 --ntasks-per-node=16 --mem=22gb --tmp=2gb -t 24:00:00'
RM_SBATCH="${RM_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${RM_QUEUE}"

#   USE_PBS: QSub settings for Read_Mapping (recommended values shown)
#       For large files, it may be necessary to use mesabi
RM_QSUB='mem=22gb,nodes=1:ppn=16,walltime=24:00:00'

#   List of trimmed samples (full file path)
TRIMMED_LIST="/home/morrellp/large/soybean_toy-all-libs/All-libraries-trimmed-2.txt"

#   Naming of the trimmed forward samples.
#       After the Adapter_Trimming handler:
#       FORWARD_TRIMMED=_Forward_ScytheTrimmed.fastq.gz
#       After the Quality_Trimming handler:
#       FORWARD_TRIMMED=_R1_trimmed.fastq.gz
FORWARD_TRIMMED="_Forward_ScytheTrimmed.fastq.gz"

#   Naming of the trimmed reverse samples.
#       After the Adapter_Trimming handler:
#       REVERSE_TRIMMED=_Reverse_ScytheTrimmed.fastq.gz
#       After the Quality_Trimming handler:
#       REVERSE_TRIMMED=_R2_trimmed.fastq.gz
REVERSE_TRIMMED="_Reverse_ScytheTrimmed.fastq.gz"

#   Naming of the trimmed single-end samples.
#       After the Adapter_Trimming handler:
#       SINGLES_TRIMMED=_Single_ScytheTrimmed.fastq.gz
#       After the Quality_Trimming handler:
#       SINGLES_TRIMMED=_single_trimmed.fastq.gz
SINGLES_TRIMMED="_Single_ScytheTrimmed.fastq.gz"

#   BWA mem parameters
#       Note that you may need to adjust parameters based on species
#       NOTE(review): the template describes the values below as the BWA mem
#           defaults, but several (e.g. SEED, RE_SEED, GAP, RM_THRESHOLD) appear
#           to differ from the defaults in the bwa manual -- confirm they are intended.
#       Number of threads
THREADS="16"

#       Minimum seed length
SEED="8"

#       Band width
WIDTH="100"

#       Off-diagonal x-dropoff (Z-dropoff)
DROPOFF="100"

#       Re-seed value
RE_SEED="1.0"

#       Cutoff value
CUTOFF="10000"

#       Matching score
MATCH="1"

#       Mismatch penalty
MISMATCH="4"

#       Gap penalty
GAP="8"

#       Gap extension penalty
EXTENSION="1"

#       Clipping penalty
CLIP="5"

#       Unpaired read penalty
UNPAIRED="9"

#       Rescue missing hits? Note, this means that reads may not be matched. Requires paired-end mode
RESCUE="false"

#       Assume the first input query is interleaved?
INTERLEAVED="false"

#       Minimum threshold
RM_THRESHOLD="85"

#       Output all alignments, and mark as secondary?
SECONDARY="false"

#       Append FASTA/Q comments to SAM files?
APPEND="false"

#       Use hard clipping?
HARD="false"

#       Mark split hits as secondary?
SPLIT="true"

#       Verbosity level.
#           Valid options:
#               'disabled', 'errors'
#               'warnings', 'all', or 'debug'
VERBOSITY="all"

############################################
##########     SAM_Processing     ##########
############################################

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
SP_QUEUE='small'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
SP_SBATCH='--nodes=1 --ntasks-per-node=16 --mem=22gb --tmp=8gb -t 24:00:00'
SP_SBATCH="${SP_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${SP_QUEUE}"

#   USE_PBS: QSub settings for SAM_Processing (recommended values shown)
SP_QSUB='mem=22gb,nodes=1:ppn=16,walltime=24:00:00'

#   How the SAM files should be processed.
#       Valid options:
#           'picard' (recommended) or 'samtools'
METHOD="picard"

#   Number of threads used for SAM_Processing
#   Using more threads requires more memory
#       This is ignored when USE_PBS=true
#   NOTE(review): 120 is well above the --ntasks-per-node=16 requested in
#       SP_SBATCH -- confirm this is intended.
SAM_PROCESSING_THREADS="120"

#   List of read-mapped samples.
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
MAPPED_LIST="/home/morrellp/large/soybean_toy-all-libs/All-libraries-SAM-2.txt"

#   The remaining variables in this section are only used if processing with Picard
#       If using SAMtools, leave these variables blank

#   Maximum number of file handles that we can use.
#       For UNIX systems, the per-process maximum number of files
#           that can be open may be found with 'ulimit -n'
#       We recommend setting MAX_FILES to be slightly under this value
MAX_FILES="1000"

#   Maximum number (integer) of records stored in RAM before spilling to disk.
#       This is used in the Picard SortSam and MarkDuplicates and is ignored if SAM_Processing with Samtools.
#       Try reducing this number if you are having core dump/out of memory errors.
#       Increasing this number reduces the number of file handles needed to sort the file,
#       and increases the amount of RAM needed.
#   Default value: 500000
PICARD_MAX_REC_IN_RAM="500000"

#   Sorting collection size ratio.
#       This is used in Picard MarkDuplicates and is ignored if SAM_Processing with Samtools.
#       Try reducing this number if you are having core dump/out of memory errors.
#   Default value: 0.25
SORTING_COLL_SIZE_RATIO="0.25"

############################################
##########    Coverage_Mapping    ##########
############################################

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
CM_QUEUE='small'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
CM_SBATCH='--nodes=1 --ntasks-per-node=16 --mem=22gb --tmp=2gb -t 24:00:00'
CM_SBATCH="${CM_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${CM_QUEUE}"

#   USE_PBS: Qsub settings for Coverage_Mapping (recommended values shown)
CM_QSUB='mem=22gb,nodes=1:ppn=16,walltime=24:00:00'

#   List of finished BAM files.
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
BAM_LIST="/home/morrellp/large/soybean_toy-all-libs/All-libraries-BAM.txt"

#   Exome capture: the capture regions file (BED format)
#   Whole-genome sequencing: leave this blank
#   Coverage_Mapping should not be used for GBS data
REGIONS_FILE=""

############################################
##########    Haplotype_Caller    ##########
############################################

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several partitions, comma separated without spaces (e.g., "small,ram256g,ram1t")
HC_QUEUE='ram1t'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
HC_SBATCH='--nodes=1 --ntasks-per-node=4 --mem=250gb --tmp=22gb -t 24:00:00'
HC_SBATCH="${HC_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${HC_QUEUE}"

#   USE_PBS: Qsub settings for Haplotype_Caller (recommended values shown)
#     NOTE: "mem=" is used even if qsub is not used
HC_QSUB='mem=250gb,nodes=1:ppn=24,walltime=24:00:00'

#   Number of threads used for Haplotype_Caller
#   Using more threads requires more memory
#   This is ignored when USE_PBS=true
#   Default: "NA"
#   NOTE(review): 120 is well above the --ntasks-per-node=4 requested in
#       HC_SBATCH -- confirm this is intended.
HAPLOTYPE_CALLER_THREADS="120"

#   List of finished BAM files.
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
FINISHED_BAM_LIST="/home/morrellp/large/soybean_toy-all-libs/All-libraries-BAM-Chr.txt"

#   Nucleotide diversity per base pair (Watterson's theta).
#       For barley: 0.008
#       For soybean: 0.001
THETA="0.001"

#   If true, GATK will not trim down the active region from the full region
#       (active + extension) to just the active interval for genotyping
#       Recommended value: false
DO_NOT_TRIM_ACTIVE_REGIONS="false"

#   If true, all bases will be considered active regions
#       Recommended value: false
FORCE_ACTIVE="false"

#   Optional: a list of custom intervals, for when you are interested in specific regions AND/OR want to parallelize processing across regions/chromosomes.
#   Default: false. Otherwise, fill in full path to intervals file.
#   IMPORTANT: GATK accepts the following file extensions:
#       .intervals, .list, .interval_list, .bed, and .vcf
#       sequence_handling Haplotype_Caller currently handles: .intervals, .list
#   For .intervals/.list files, format one per line in one of the two formats: 1) chr1 or 2) chr1:100-200
#   For example files, see: https://github.com/MorrellLAB/sequence_handling/tree/master/Example_Files
#       For additional info, please see Wiki page.
HC_CUSTOM_INTERVALS="false"

#   Optional: if your genome has scaffolds or parts of the reference not covered by
#       the chromosomes above, include the full filepath to a SORTED list of
#       names. One scaffold name per line.
#   IMPORTANT: If you provide a scaffolds list, you must provide a custom intervals list too.
#   Default is set to false. Otherwise, fill in full path to intervals file.
HC_SCAFFOLDS="false"

#   Parallelize across regions?
#   Set to true if yes.
#   Default is set to false.
#       Parallelizing across regions for Haplotype_Caller means each
#       region/chromosome provided in the HC_CUSTOM_INTERVALS list gets run
#       as its own job for every sample.
HC_PARALLELIZE="false"

################################################
##########     Genomics_DB_Import     ##########
################################################

#   USAGE NOTE: Genomics DB Import is used in GATK4.

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
GDBI_QUEUE='amdsmall,amd512,amd2tb'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
GDBI_SBATCH='--nodes=1 --ntasks-per-node=4 --mem=248gb --tmp=100gb -t 96:00:00'
GDBI_SBATCH="${GDBI_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${GDBI_QUEUE}"

#   USE_PBS: Qsub settings for Genomics_DB_Import.
#       Below are the default settings but this depends on the number of samples
#     NOTE: mem has to be in the unit of gb, and it has to be greater than 4gb
#     NOTE2: "mem=" is used even if qsub is not used
#     NOTE(review): mem=400gb here is larger than the --mem=248gb requested in
#         GDBI_SBATCH -- confirm the two are meant to differ.
GDBI_QSUB='mem=400gb,nodes=1:ppn=4,walltime=96:00:00'

#   Number of threads used for Genomics_DB_Import
#   Using more threads requires more memory
#       This is ignored when USE_PBS=true
#   NOTE(review): 120 is well above the --ntasks-per-node=4 requested in
#       GDBI_SBATCH -- confirm this is intended.
GDBI_THREADS="120"

#   REQUIRED: Please fill out the variables listed under "Genotype_GVCFs" below.
#       Genomics_DB_Import shares variables with Genotype_GVCFs.

############################################
##########     Genotype_GVCFs     ##########
############################################

#   USAGE NOTE: For accurate results, run Genotype_GVCFs on all of your samples at once
#       DO NOT break this job into batches of samples

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
GG_QUEUE='amdsmall,amd512,amd2tb'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
GG_SBATCH='--nodes=1 --ntasks-per-node=4 --mem=248gb --tmp=100gb -t 96:00:00'
GG_SBATCH="${GG_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${GG_QUEUE}"

#   USE_PBS: Qsub settings for Genotype_GVCFs (recommended values shown)
#     NOTE: "mem=" is used even if qsub is not used
GG_QSUB='mem=248gb,nodes=1:ppn=4,walltime=96:00:00'

#   Number of threads used for Genotype_GVCFs
#   Using more threads requires more memory
#       This is ignored when USE_PBS=true
#   NOTE(review): 120 is well above the --ntasks-per-node=4 requested in
#       GG_SBATCH -- confirm this is intended.
GG_THREADS="120"

#   List of GVCF files.
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
GVCF_LIST="/home/morrellp/large/soybean_toy-all-libs/All-libraries-VCF.txt"

#   Nucleotide diversity per base pair (Watterson's theta):
#   Genotype_GVCFs uses the THETA under Haplotype_Caller for this

#   Reference dictionary file (.dict), full file path
REF_DICT="/home/morrellp/large/soybean_toy/genome/Gmax_275_v2.0.dict"

#   Number of chromosomes or chromosome parts in the reference, excluding scaffolds (integer value)
#       For barley: 15 (7*2 chromosome parts + chrUn)
#       For soybean: 20
NUM_CHR="20"

#   Optional: a list of custom intervals, for when you are interested in specific regions AND/OR want to parallelize processing across regions/chromosomes.
#   IMPORTANT: GATK accepts the following file extensions:
#       .intervals, .list, .interval_list, .bed, and .vcf
#       sequence_handling currently handles: .intervals, .list, and .bed
#   For .intervals/.list files, format one per line as follows: chr1:100-200
#   For example files, see: https://github.com/MorrellLAB/sequence_handling/tree/master/Example_Files
#       For additional info, please see Wiki page.
#   Default is set to false. Otherwise, fill in full path to intervals file.
#   NOTE: If HC_PARALLELIZE=true and HC_CUSTOM_INTERVALS is set, the value below is ignored (uses HC_CUSTOM_INTERVALS).
CUSTOM_INTERVALS="/home/morrellp/large/soybean_toy/genome/Gmax_275_v2.0-Chr.list"

#   Optional: if your genome has scaffolds or parts of the reference not covered by
#       the chromosomes above, include the full filepath to a SORTED list of
#       names. One scaffold name per line.
#   Default is set to false. Otherwise, fill in full path to intervals file.
#   NOTE: If HC_PARALLELIZE=true and HC_CUSTOM_INTERVALS is set, the value below is ignored (uses HC_SCAFFOLDS).
SCAFFOLDS="/home/morrellp/large/soybean_toy/genome/scaffolds.intervals"
#SCAFFOLDS=

#   Parallelize across regions or chromosomes?
#   Set to true if yes.
#   Default is set to false.
#       Parallelizing across regions means each region/chromosome provided in
#       the CUSTOM_INTERVALS list gets run as its own job.
#       When parallelizing across regions AND scaffolds are present,
#       regions in CUSTOM_INTERVALS will be parallelized but the scaffolds will not be parallelized.
#       Scaffolds will instead be imported into their own gendb workspace.
#   NOTE: If HC_PARALLELIZE=true and HC_CUSTOM_INTERVALS is set, the value below is ignored.
PARALLELIZE="true"

#  When parallelized, separate vcf output files are created for separate intervals in
#    ${OUT_DIR}/Genotype_GVCFs/vcf_split_regions
#  Setting the following option to true combines these separate files into a single vcf:
#     ${OUT_DIR}/Genotype_GVCFs/raw_variants.vcf
#  This option is ignored if PARALLELIZE=false
#     NOTE: this could require lots of memory, so if it fails, you need to increase the
#           mem= option in GG_QSUB (this option is used even if you are not using qsub)
#  IMPORTANT: Currently, this option only works for non-PBS. For PBS users, split VCF files will
#     get combined and sorted in the Create_HC_Subset handler. If you don't plan to use the
#     Create_HC_Subset handler, the HelperScripts/combine_and_sort_split_vcf.sh
#     script will combine and sort the split VCF files for you.
#  Default is set to true.
GG_COMBINED_VCF="true"

#   Sample ploidy (integer value)
#   For highly inbred samples (most barleys): 1
PLOIDY="2"

############################################
##########    Create_HC_Subset    ##########
############################################

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
CHS_QUEUE='amdsmall,amd512,amd2tb'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
CHS_SBATCH='--nodes=1 --ntasks-per-node=16 --mem=22gb --tmp=8gb -t 24:00:00'
CHS_SBATCH="${CHS_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${CHS_QUEUE}"

#   USE_PBS: Qsub settings for Create_HC_Subset (recommended values shown)
CHS_QSUB='mem=22gb,nodes=1:ppn=16,walltime=24:00:00'

#   Raw variants VCF file.
#       If we parallelized across regions in Genotype_GVCFs and don't have a
#       concatenated VCF file (GG_COMBINED_VCF was set to false), fill out the next variable
#       called "CHS_VCF_LIST=" with the full filepath to a list of split vcf files.
#   IMPORTANT: DO NOT LEAVE BLANK. If you don't have a single raw variants VCF file and are going to concatenate split VCF files,
#       put "NA" here. Odd errors may appear if this is not filled out.
CHS_RAW_VCF='NA'

#   List of chromosome part VCF files.
#       If this is filled out, this handler will concatenate VCF files split into parts.
#       NOTE: This concatenation and sorting feature requires lots of memory, so keep adding more memory if you get out of memory errors.
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
#   Default is "NA", replace with full filepath to list of split VCF files if you do not have a single raw variants VCF
CHS_VCF_LIST="/home/morrellp/large/soybean_toy-all-libs/All-libraries-VCF-genotyped.txt"

#   Exome capture: the full file path to the capture regions file (BED format)
#   This should be the same file as the REGIONS_FILE in Coverage_Mapping. In this case, put "${REGIONS_FILE}" below.
#   If not exome capture, put "NA"
CAPTURE_REGIONS='NA'

#   Subset the unfiltered vcf to one chromosome for the percentile tables used to get a sense of the distribution of per sample DP and GQ?
#   Recommended for very large genomes where too much memory is required and the job fails due to running out of memory
#   Default is "NA", replace with exact matching chromosome name. For more than one chromosome (or chromosome part), use comma WITHOUT spaces to delimit. Example: "chr1H_part1,chr1H_part2"
SUBSET_CHR='NA'
#   File prefix to use for the subsetted output file
#   Default is "NA". Example: "chr1H"
SUBSET_CHR_OUT_PREFIX='NA'

#   Depth per sample (DP) cutoff (integer)
#       If a sample's DP is below this threshold, it will count as a "bad" sample for that site.
#       Recommended value: 5
CHS_DP_PER_SAMPLE_CUTOFF="5"

#   Genotyping quality (GQ) cutoff (integer)
#       If a sample's GQ is below this threshold, it will count as a "bad" sample for that site.
#       Recommended value: 10th percentile of the raw GQ percentile table
CHS_GQ_CUTOFF="10"

#   Maximum proportion of "lowGQ" (below the GQ cutoff) samples allowed per site
#       Sites with more "lowGQ" samples than this threshold will be filtered out.
#       Recommended value: "0.2"
CHS_MAX_LOWGQ="0.2"

#   Maximum proportion of heterozygous genotypes per site (proportion)
#       Sites with a proportion above this threshold will be filtered out
#       Recommended value: "0.1" based on highly inbred species like barley
CHS_MAX_HET="0.1"

#   Maximum proportion of missing calls per site (proportion)
#       Sites with a proportion above this threshold will be filtered out
#       Recommended value: "0.2"
CHS_MAX_MISS="0.2"

#   QUAL score cutoff (integer)
#       Sites with a QUAL below this cutoff will be excluded.
#       Recommended value: 40
CHS_QUAL_CUTOFF="40"

#   How much of the variants do you want to sample randomly for QC graphs (as genome intervals)?
#       The number of genomic regions to sample:
CHS_SUBSET_N="1000000"
#       The length (in base pairs) of genomic regions to sample:
CHS_SUBSET_LEN="100"

############################################
##########  Variant_Recalibrator  ##########
############################################

#   DO NOT RUN VARIANT_RECALIBRATOR IF YOU HAVE LESS THAN 30 EXOME SAMPLES
#   GATK has empirically found in human data that at least 1 whole genome or 30 exome samples are needed to have enough variant sites for decent modeling.
#   What matters is having a large number of variant sites, so this minimum number of samples is only an estimate.
#   See documentation here for details: https://gatk.broadinstitute.org/hc/en-us/articles/360035531612-Variant-Quality-Score-Recalibration-VQSR-

#   Queue the job is submitted to.
#   Note: USE_PBS accepts a single queue only.
#         USE_SLURM accepts several queues, comma separated without spaces (e.g., "small,ram256g,ram1t")
VR_QUEUE='amdlarge,amdsmall,amd512,amd2tb'

#   USE_SLURM: sbatch settings (recommended values shown).
#       Built in two steps: resource request first, then notification and partition options.
VR_SBATCH='--nodes=1 --ntasks-per-node=4 --mem=240gb --tmp=22gb -t 24:00:00'
VR_SBATCH="${VR_SBATCH} --mail-type=ALL --mail-user=${EMAIL} -p ${VR_QUEUE}"

#   USE_PBS: Qsub settings for Variant_Recalibrator (recommended values shown)
VR_QSUB='mem=240gb,nodes=1:ppn=4,walltime=24:00:00'

#   Reference genome (full file path).
#       If barley AND running GATK3, make sure to use the pseudomolecular reference
#           NOT the parts reference used above
#       If running GATK4, make sure this is the same reference used during Haplotype_Caller and Genotype_GVCFs
VR_REF="/home/morrellp/large/soybean_toy/genome/Gmax_275_v2.0.fa"

#   Single concatenated raw VCF file output from Create_HC_Subset.
#   If there is none, put "NA" below
#   GATK4 only accepts a single input variant file, unlike earlier versions of GATK, which accepted multiple input variant files.
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
VR_RAW_VCF='NA'

#   List of chromosome part VCF files from Genotype_GVCFs.
#   Leave as the default "NA" if we already have a concatenated raw VCF file. Change otherwise.
#   GATK3 accepts multiple input variant files but GATK4 does not. If we are running GATK4, the handler will
#   take the list of split VCF files and concatenate them for you.
#       To generate this list, use sample_list_generator.sh
#       located at /sequence_handling/HelperScripts/sample_list_generator.sh
VR_VCF_LIST="/home/morrellp/large/soybean_toy-all-libs/All-libraries-VCF-genotyped.txt"

#   Recalibrate indels, SNPs, or both?
#   Default is "BOTH"
#   Valid values (case sensitive) include: "BOTH", "INDELS_ONLY", "SNPS_ONLY"
#       "BOTH" means indels are recalibrated first then SNPs as recommended by GATK best practices
#       "INDELS_ONLY" pulls out indels and only recalibrates indels
#       "SNPS_ONLY" pulls out SNPs and only recalibrates SNPs
RECAL_MODE='BOTH'

#   Annotations to include.
#   INDEL and SNPs are recalibrated separately; for flexibility you can specify different sets of annotations to use for each below.
#   Specifically, read up on GATK Variant Recalibrator's caveats for specific types of data (example: DP annotation should not be used for exome datasets)
#   See doc here for what the annotations mean: https://gatk.broadinstitute.org/hc/en-us/articles/360035890471
#   NOTE: MQ is intentionally left out when recalibrating indels, see this tutorial for more info: https://gatk.broadinstitute.org/hc/en-us/articles/360035531112--How-to-Filter-variants-either-with-VQSR-or-by-hard-filtering
#   IMPORTANT: DP should NOT be used for exome datasets (see here https://gatk.broadinstitute.org/hc/en-us/articles/360035531612-Variant-Quality-Score-Recalibration-VQSR-)
#   DEFAULT annotations are included below, please add/exclude annotations as you see fit.
#   Please follow this example format to add/exclude annotations:
#       VR_ANN_INDEL="-an FS -an ReadPosRankSum -an MQRankSum -an QD -an SOR -an DP"
VR_ANN_INDEL='-an QD -an FS -an ReadPosRankSum -an MQRankSum -an SOR -an DP'
VR_ANN_SNP='-an QD -an FS -an ReadPosRankSum -an MQ -an MQRankSum -an SOR -an DP'

#   Truth sensitivity tranche settings.
#   These are the levels of truth sensitivity at which to slice the data (in percent)
#   and will be used with the VariantRecalibrator function call.
#       If modifying, please follow the format: "-tranche 100.0 -tranche 99.0"
TRANCHE_INDEL='-tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 97.0 -tranche 96.0 -tranche 95.0 -tranche 94.0 -tranche 93.0 -tranche 92.0 -tranche 91.0 -tranche 90.0'
TRANCHE_SNP='-tranche 100.0 -tranche 99.95 -tranche 99.9 -tranche 99.8 -tranche 99.6 -tranche 99.5 -tranche 99.4 -tranche 99.3 -tranche 99.0 -tranche 98.0 -tranche 97.0 -tranche 90.0'

#   EXTRA FLAGS. Are there additional flags (options) you would like to include?
#   Default is "NA", modify otherwise. DO NOT leave blank!
#   Use the same options format as listed in the gatk docs except separated by space, all on one line, and surrounded by double quotes.
#       Example format: RECAL_EXTRA_OPTIONS_INDEL="-L genomic_intervals -titv 2.15"
#   Recalibration step:
#   INDEL mode: Currently, indel recalibration uses the following flags:
#       -R, -V, -an (see VR_ANN_INDEL above), -tranche (see TRANCHE_INDEL above), -mode INDEL, -O, --resource, --tranches-file, --rscript-file
RECAL_EXTRA_OPTIONS_INDEL="NA"
#   SNP mode: Currently, SNP recalibration uses the following flags:
#       -R, -V, -an (see VR_ANN_SNP above), -tranche (see TRANCHE_SNP above), -mode SNP, -O, --resource, --tranches-file, --rscript-file
RECAL_EXTRA_OPTIONS_SNP="NA"

#   ApplyVQSR step (filtering step):
#   INDEL mode: Currently, indel ApplyVQSR uses the following options:
#       -R, -V, -mode INDEL, --truth-sensitivity-filter-level, --recal-file, --tranches-file, --create-output-variant-index, -O
FILTER_EXTRA_OPTIONS_INDEL="NA"
#   SNP mode: Currently, snp ApplyVQSR uses the following options:
#       -R, -V, -mode SNP, --truth-sensitivity-filter-level, --recal-file, --tranches-file, --create-output-variant-index, -O
FILTER_EXTRA_OPTIONS_SNP="NA"

#   Resource VCF files, and their priors (integers), used for training our model.
#       At least one resource and prior pair is required (the high confidence subset counts as one).
#   IMPORTANT: GATK 4 requires both training AND truth sets to run, otherwise it will return an error.
#       They can be supplied in a single resource or as separate resources (fill out the next section below).
#   Location of the high confidence subset of variants.
#   Recommended settings for the high confidence set: known=false,training=true,truth=false
#       If you used Create_HC_Subset, leave as the default
HC_SUBSET="${OUT_DIR}/Create_HC_Subset/${PROJECT}_high_confidence_subset.vcf"
HC_PRIOR="5.0"
HC_KNOWN="false"
HC_TRAIN="true"
HC_TRUTH="false"

#   Additional resource VCF files, if we have any.
#   "NA" is the default for missing resource files and priors, modify otherwise
#   For each resource below, specify "true" or "false" for KNOWN, TRAINING, and TRUTH (currently set to defaults)
#   GATK's documentation defines known, training, and truth sets here:
#       https://gatk.broadinstitute.org/hc/en-us/articles/360035890831-Known-variants-Training-resources-Truth-sets
RESOURCE_1="/home/morrellp/large/soybean_toy/genome/training_VCF_files/M92-220_final_filtered-edited.vcf"
PRIOR_1="5.0"
KNOWN_1="false"
TRAINING_1="false"
TRUTH_1="true"

RESOURCE_2="/home/morrellp/large/soybean_toy/genome/training_VCF_files/BL107_final_filtered.vcf"
PRIOR_2="5.0"
KNOWN_2="false"
TRAINING_2="false"
TRUTH_2="true"

RESOURCE_3="/home/morrellp/large/soybean_toy/genome/training_VCF_files/50k-SNPs-sorted-SNP_utils.vcf"
PRIOR_3="5.0"
KNOWN_3="true"
TRAINING_3="true"
TRUTH_3="false"

RESOURCE_4="NA"
PRIOR_4="NA"
KNOWN_4="true"
TRAINING_4="true"
TRUTH_4="false"

#   Truth sensitivity filter level.
#       Default is set to 99.9
TS_FILTER_LEVEL="99.9"

################################################
##########    Pre_Variant_Filtering   ##########
################################################

#   Queue(s) to submit the job to.
#   Note: With USE_PBS, only one queue can be specified.
#         With USE_SLURM, multiple queues can be given, comma separated with no spaces (e.g., "small,ram256g,ram1t")
PVF_QUEUE="amdsmall,amd512,amd2tb"

#   USE_SLURM: sbatch settings.
#       The recommended settings are below
PVF_SBATCH="--nodes=1 --ntasks-per-node=16 --mem=110gb --tmp=22gb -t 72:00:00 --mail-type=ALL --mail-user=${EMAIL} -p ${PVF_QUEUE}"

#   USE_PBS: Qsub settings.
#       The recommended settings are below
PVF_QSUB="mem=110gb,nodes=1:ppn=16,walltime=72:00:00"

#   Path to the VCF file used to graph annotations and generate percentile tables.
#       If you ran Variant_Recalibrator, this will be the .recalibrated.vcf.gz file
PVF_VCF="/home/morrellp/large/soybean_toy-all-libs/Variant_Recalibrator/soybean_toy_dataset_all_libs_indels_and_snps.recalibrated.vcf.gz"

#   Number of genomic regions to subset for graphing annotation purposes.
#       Default: 1000000
PVF_GEN_NUM="1000000"

#   Length of the genomic regions to subset for graphing annotation purposes.
#       Default: 100
PVF_GEN_LEN="100"

#   NOTE:
#   This handler works with the output of the GATK VariantsToTable function.
#   More details about what each annotation means can be found here:
#       https://gatk.broadinstitute.org/hc/en-us/articles/360036732151-VariantsToTable
#       https://gatk.broadinstitute.org/hc/en-us/articles/360037261311-ExcessHet
#       https://gatk.broadinstitute.org/hc/en-us/articles/360036366852-InbreedingCoeff

############################################
##########    Variant_Filtering   ##########
############################################

#   Queue(s) to submit the job to.
#   Note: With USE_PBS, only one queue can be specified.
#         With USE_SLURM, multiple queues can be given, comma separated with no spaces (e.g., "small,ram256g,ram1t")
VF_QUEUE="amdsmall,amd512,amd2tb"

#   USE_SLURM: sbatch settings.
#       The recommended settings are below
VF_SBATCH="--nodes=1 --ntasks-per-node=16 --mem=110gb --tmp=22gb -t 72:00:00 --mail-type=ALL --mail-user=${EMAIL} -p ${VF_QUEUE}"

#   USE_PBS: Qsub settings.
#       The recommended settings are below
VF_QSUB="mem=110gb,nodes=1:ppn=16,walltime=72:00:00"

#   Path to the VCF file to be filtered.
#       If you ran Variant_Recalibrator and Pre_Variant_Filtering, this will be
#       the .recalibrated.pass_sites.vcf.gz file
VF_VCF="/home/morrellp/large/soybean_toy-all-libs/Pre_Variant_Filtering/soybean_toy_dataset_all_libs_indels_and_snps.recalibrated.pass_sites.vcf.gz"

#   QUAL score cutoff (integer).
#       Sites with a QUAL below this cutoff will be excluded.
#       Recommended value: 40
VF_QUAL_CUTOFF="40"

#   Minimum number of reads needed to support a genotype (integer).
#       Recommended value: 5
VF_MIN_DP="5"

#   Maximum number of reads allowed to support a genotype (too many = gene duplication problems) (integer).
#       The Pre_Variant_Filtering annotation plots and the DP per sample percentile table can serve as a guide
#       when deciding on a final cutoff.
#       Default value: 100 (but please adjust accordingly for your study!)
VF_MAX_DP="100"

#   Genotyping quality (GQ) cutoff (integer).
#       If a sample's GQ is below this threshold, it will get filtered out
#       Recommended value: 10th percentile of the recal_pass_sites GQ percentile table
#       Example format: 3
VF_GQ_CUTOFF="3"

#   QD cutoff.
#   Sites below this cutoff will be filtered out.
#   Documentation with details: https://gatk.broadinstitute.org/hc/en-us/articles/360036732151-VariantsToTable
VF_QD_CUTOFF="2.0"

#   SOR cutoff.
#   Sites above this cutoff will get filtered out.
#   Documentation with details: https://gatk.broadinstitute.org/hc/en-us/articles/360036732151-VariantsToTable
VF_SOR_CUTOFF="3.0"

#   FS cutoff.
#   Sites above this cutoff will get filtered out.
#   Documentation with details: https://gatk.broadinstitute.org/hc/en-us/articles/360036732151-VariantsToTable
VF_FS_CUTOFF="60.0"

#   MQ cutoff.
#   Sites below this cutoff will get filtered out.
#   Documentation with details: https://gatk.broadinstitute.org/hc/en-us/articles/360036732151-VariantsToTable
VF_MQ_CUTOFF="40.0"

#   Proportion of missingness we are willing to tolerate (proportion).
#       Sites with a missingness greater than this threshold will be filtered out
#       Default value: "0.25"
VF_MISS="0.25"

#   Maximum proportion of heterozygous genotypes allowed per site (proportion).
#       Sites with a proportion above this threshold will be filtered out
#       Recommended value: "0.1" based on highly inbred species like barley
VF_MAX_HET="0.1"

#   Number of genomic regions to subset for graphing annotation purposes.
#       Default: 1000000
VF_GEN_NUM="1000000"

#   Length of the genomic regions to subset for graphing annotation purposes.
#       Default: 100
VF_GEN_LEN="100"

#----------------
# The filters below have caveats and are only used when filtering if the conditions are met.
# However, they must be filled out!
#   Are the samples diploid?
#   Valid options are: 1) "true" or 2) "false"
#   Default: false
VF_DIPLOID="false"

#   Excess heterozygosity cutoff to use if we have diploid samples (integer).
#   Documentation with details: https://gatk.broadinstitute.org/hc/en-us/articles/360037261311-ExcessHet
#   Use the Pre_Variant_Filtering plots to determine an appropriate cutoff.
#   Caveat: Can only be calculated for diploid samples.
#   Default: "NA"
VF_EXCESS_HET="NA"

#-----------------
### Variables specific to GATK3 for the Variant_Filtering handler
#   If GATK4 was used, delete the following GATK3 specific variables in the Variant_Filtering handler

#   Maximum percent deviation from 50/50 reference/alternative reads allowed in heterozygotes (decimal).
#       For example, MAX_DEV=0.1 allows 60/40 ref/alt and also 40/60 ref/alt but not anything more skewed
#       Recommended value: 0.1
VF_MAX_DEV="0.1"

#   Maximum number of "bad" (low GQ, low DP, or missing genotype data) samples allowed at a site (integer).
#       Sites with more "bad" samples than this threshold will be filtered out.
#       Recommended value: "0.2"
#       NOTE(review): described as an integer count, yet the recommended value is fractional - confirm the expected unit.
VF_MAX_BAD="0.2"

#   For exome capture, give the full file path to the capture regions file (BED format)
#   For barley use the pseudomolecular BED file, not the parts file
#   If not exome capture, put "NA"
#   Note: This is specific to GATK 3. If you are running GATK 4, you can leave this blank.
VF_CAPTURE_REGIONS="NA"
#-----------------

############################################
##########    Variant_Analysis    ##########
############################################

#   Queue(s) to submit the job to.
#   Note: With USE_PBS, only one queue can be specified.
#         With USE_SLURM, multiple queues can be given, comma separated with no spaces (e.g., "small,ram256g,ram1t")
VA_QUEUE="amdsmall,amd512,amd2tb"

#   USE_SLURM: sbatch settings.
#       The recommended settings are below
VA_SBATCH="--nodes=1 --ntasks-per-node=16 --mem=22gb --tmp=4gb -t 24:00:00 --mail-type=ALL --mail-user=${EMAIL} -p ${VA_QUEUE}"

#   USE_PBS: Qsub settings for Variant_Analysis.
#       The recommended settings are below
VA_QSUB="mem=22gb,nodes=1:ppn=16,walltime=24:00:00"

#   Full file path to the VCF file.
VA_VCF="/home/morrellp/large/soybean_toy-all-libs/Variant_Filtering/soybean_toy_dataset_all_libs_final.vcf.gz"

#   Sanger dataset to make comparisons to, if we have one.
#   If so, give the full filepath to the Sanger regions in BED file format.
#   IMPORTANT: This variable must be filled out; put "NA" if you are not using it
#   Default: "NA"
SANGER_BED="NA"

############################################
##########      Dependencies      ##########
############################################

#   This section defines where the dependencies
#       of sequence_handling are installed.
#   If you are using the Minnesota Supercomputing Institute cluster
#       then uncomment all 'module load' lines.
#       Make sure you have access to all '_ML' modules.
#   If you need to install a dependency from source,
#       then uncomment the lines for the dependency and the
#       'export PATH=', and write the full path to the executable
#       for the program. For example:
#       PARALLEL=${HOME}/software/parallel-20151122/bin/parallel
#   If you have a system-wide installation for a program, you can
#       leave all lines commented out. sequence_handling will find
#       system-wide installed programs automatically.

#   Please visit https://github.com/MorrellLab/sequence_handling/wiki/Dependencies
#       for information on version requirements and compatibility

#   Everything inside this block runs only when MSI (set near the top of this Config) is "true".
#   Each dependency has a 'module load' line plus a commented-out template
#       (NAME= / export PATH=) for pointing at an installation from source instead.
if [[ "$MSI" == "true" ]] ; then
    #   Do we have GNU parallel installed
    module load parallel/20190122
    #PARALLEL=
    #export PATH=${PARALLEL}:${PATH}

    #   Do we have the Fastx Toolkit installed?
    module load fastx_toolkit/0.0.14
    #FASTX_TOOLKIT=
    #export PATH=${FASTX_TOOLKIT}:${PATH}

    #   Do we have FastQC installed?
    module load fastqc/0.11.8
    #FASTQC=
    #export PATH=${FASTQC}:${PATH}

    #   Do we have Riss-util installed?
    module load riss_util/1.0
    #RISS_UTIL=
    #export PATH=${RISS_UTIL}:${PATH}

    #   Do we have Seqqs installed?
    module load seqqs_ML/3d05375
    #SEQQS=
    #export PATH=${SEQQS}:${PATH}

    #   Do we have Sickle installed?
    module load sickle_ML/1.33
    #SICKLE=
    #export PATH=${SICKLE}:${PATH}

    #   Do we have Scythe installed?
    module load scythe_ML/0.994
    #SCYTHE=
    #export PATH=${SCYTHE}:${PATH}

    #   Do we have R installed?
    module load R/3.6.3
    #R=
    #export PATH=${R}:${PATH}
    #   REQUIRED if running on MSI: Where are the local R libraries stored?
    #   To figure out the path, run the following interactively on MSI:
    #       module load R/3.6.3
    #       R # Start up R session on MSI
    #       Sys.getenv()
    #   Now, look for R_LIBS_USER and paste the path below
    #   Use ${HOME} instead of "~": the shell does not expand a tilde inside quotes,
    #       so a quoted "~/..." would be stored as a literal "~" rather than the home directory.
    R_LIBS_USER="${HOME}/R/x86_64-pc-linux-gnu-library/3.6"
    #R_LIBS_USER=/home/morrellp/large/R/x86_64-pc-linux-gnu-library/3.6/:$R_LIBS_USER

    #   Do we have BWA installed?
    module load bwa/0.7.17
    #BWA=
    #export PATH=${BWA}:${PATH}

    #   Do we have SAMTools installed?
    module load samtools/1.9
    #SAMTOOLS=
    #export PATH=${SAMTOOLS}:${PATH}

    #   Do we have BEDTools 2.17.0 installed?
    module load bedtools/2.17.0
    #BEDTOOLS=
    #export PATH=${BEDTOOLS}:${PATH}

    #   Do we have htslib 1.9 installed?
    #HTSLIB=
    #export PATH=${HTSLIB}:${PATH}
    module load htslib/1.9

    #   Do we have Freebayes 1.3.1 from commit on 20190710 installed?
    #FREEBAYES=
    #export PATH=${FREEBAYES}:${PATH}
    module load freebayes_ML/1.3.1_20190710

    #   Do we have Java installed?
    #module load java/jdk1.8.0_45
    #JAVA=
    #export PATH=${JAVA}:${PATH}
    module load java/openjdk-8_202

    #   What is the full file path for the GATK jar file?
    #   You need BOTH the jar file and the module
    GATK_JAR=/panfs/jay/groups/9/morrellp/public/Software/GATK_ML_4.1.2/gatk
    module load gatk/4.1.2

    #   What is the full file path for the Picard jar file?
    PICARD_JAR=/panfs/jay/groups/9/morrellp/public/Software/picard_ML_2.23.1/picard.jar

    #   Do we have vcftools installed?
    module load vcftools_ML/0.1.14
    #VCFTOOLS=
    #export PATH=${VCFTOOLS}:${PATH}

    #   Do we have vcflib installed?
    module load vcflib_ML/1.0.0_rc2
    #VCFLIB=
    #export PATH=${VCFLIB}:${PATH}

    #   Do we have python 3 installed?
    module load python3/3.7.1_anaconda
    #PYTHON3=
    #export PATH=${PYTHON3}:${PATH}

    #   Do we have analysis installed?
    module load analysis/0.8.2
    #ANALYSIS=
    #export PATH=${ANALYSIS}:${PATH}

    #   Do we have bcftools installed?
    module load bcftools/1.10.2
    #BCFTOOLS=
    #export PATH=${BCFTOOLS}:${PATH}
    #   Issue specific to MSI modules as of Mar 1, 2021
    #   Comment out line if not needed.
    #       When loading the bcftools/1.10.2 module, the perl/modules.centos7.5.26.1 module also
    #       gets loaded. When the perl/modules.centos7.5.26.1 module gets loaded, the module
    #       R/3.4.4-tiff also gets loaded. This introduces some R version conflicts since we are
    #       loading R/3.6.3 above. A temporary workaround is to unload R/3.4.4-tiff
    module unload R/3.4.4-tiff

    #   Do we have python-epd installed?
    #   (The template variable uses an underscore: "-" is not allowed in shell variable names.)
    #module load python-epd/1.5.2
    #PYTHON_EPD=
    #export PATH=${PYTHON_EPD}:${PATH}
    #module load python2_ML/2.7.13

    #   Do we have texlive installed?
    module load texlive/20131202
    #TEXLIVE=
    #export PATH=${TEXLIVE}:${PATH}

    #   Singularity is loaded without a pinned version
    module load singularity
fi