From e4b6b1cf2fbbd1cefd37f63d578e9485d005d6e7 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 28 Sep 2016 16:00:01 -0400 Subject: [PATCH 01/48] first version of portable blueprint --- bin/blueprint.py | 400 +++++++++++++++++++++ bin/imp_prep.pl | 42 +-- cluster_templates/broad_uger.array.sub.sh | 29 ++ cluster_templates/broad_uger.conf | 11 + cluster_templates/broad_uger.single.sub.sh | 27 ++ 5 files changed, 488 insertions(+), 21 deletions(-) create mode 100755 bin/blueprint.py create mode 100755 cluster_templates/broad_uger.array.sub.sh create mode 100644 cluster_templates/broad_uger.conf create mode 100755 cluster_templates/broad_uger.single.sub.sh diff --git a/bin/blueprint.py b/bin/blueprint.py new file mode 100755 index 0000000..70acd87 --- /dev/null +++ b/bin/blueprint.py @@ -0,0 +1,400 @@ +#! /usr/bin/env python + +#################################### +# blueprint.py +# written by Raymond Walters, September 2016 +""" +manages job submission on different cluster architectures +""" +# +#################################### + +import os +import subprocess +from textwrap import dedent +from py_helpers import read_conf, file_len + +def send_job(jobname, + arrayfile=None, + cmd=None, + logname=None, + logloc=None, + mem=None, + walltime=None, +# week=None, + njobs=None, + maxpar=10000, +# multi=None, + wait_file=None, + wait_name=None, + cluster=None, + sleep=30, + testonly=False): + + # validate args + if arrayfile is None and cmd is None: + raise ValueError("Require either array file or command.") + + elif arrayfile is not None and cmd is not None: + raise ValueError("Require either array file or command, not both.") + + + if logloc is None: + logloc = os.getcwd() + + if maxpar < 1: + maxpar = 10000 + + # get cluster queue name + if cluster is None: + conf_file = os.environ['HOME']+"/picopili.conf" + configs = read_conf(conf_file) + cluster = configs['queue'] + + # get queue template + pico_bin = os.path.dirname(os.path.realpath(__file__)) + clust_dir = os.path.dirname(pico_bin) + '/cluster_templates' + + assert os.path.isdir(clust_dir), "Unable to find cluster job submission template directory %s" % str(clust_dir) + + # load queue configuration info + # - submission syntax, queue names, job holds + clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') + + # setup memory args + if mem is None: + mem = 2000 + mem_mb = str(int(mem)) + mem_gb = str(int(mem)/1000) + + # queue picking from job length + if walltime is None: + walltime = 1 + queue_name = clust_conf['hour_q'] + elif walltime <= 1.0: + queue_name = clust_conf['hour_q'] + elif walltime <= 2.0: + queue_name = clust_conf['hour2_q'] + elif walltime <= 4.0: + queue_name = clust_conf['hour4_q'] + elif walltime <= 24.0: + queue_name = clust_conf['day_q'] + else: + queue_name = clust_conf['long_q'] + + # job dependencies + if wait_name is not None: + hold_str = clust_conf['hold_flag'] + ' ' + str(wait_name) + + elif wait_file is not None: + with open(wait_file, 'r') as wait_fi: + wait_name = wait_fi.readline() + hold_str = clust_conf['hold_flag'] + ' ' + str(wait_name) + + else: + hold_str = "" + + + + # template for single jobs + if cmd is not None and (njobs is None or njobs <= 1): + + with open(str(clust_dir)+'/'+str(cluster)+'.single.sub.sh','r') as single_templ: + templ = single_templ.read() + + njobs = 1 + + # log name + if logname is None: + logname = str(jobname)+'.sub.log' + + # command line + cmd_str = cmd + + # dummy task array args for dict + array_jobs = njobs + j_per_core = 1 + + + # template for array jobs + 
else:
+        with open(str(clust_dir)+'/'+str(cluster)+'.array.sub.sh','r') as array_templ:
+            templ = array_templ.read()
+
+        # setup indexing tasks
+        j_per_core = int(clust_conf['array_core'])
+        if j_per_core == 1:
+            task_index = str(clust_conf['task_id'])
+        else:
+            task_index = "${tid}"
+
+        # cmd or array file spec
+        if cmd is not None:
+            cmd_line = cmd.format(task=task_index)
+
+        else:
+            assert os.path.isfile(arrayfile), "Job array file %s not found." % str(arrayfile)
+
+            njobs = file_len(arrayfile)
+
+            cmd_tmp = dedent("""\
+                cline=`head -n {task} {fi} | tail -n 1`
+                echo $cline
+                $cline
+            """)
+            cmd_line = cmd_tmp.format(task=task_index, fi=arrayfile)
+
+        # parallelization of array jobs on a node
+        if j_per_core > 1:
+
+            from math import floor, ceil
+
+            # max simul tasks with memory limit on the node
+            node_mem = float(clust_conf['array_mem_mb'])
+            task_mem_lim = int(floor((node_mem-1.0)/float(mem)))
+
+            if task_mem_lim < 1:
+                task_mem_lim=1
+
+            if task_mem_lim > j_per_core:
+                task_mem_lim = j_per_core
+
+            # number of jobs to cover all tasks
+            array_jobs = int(ceil(float(njobs)/float(task_mem_lim)))
+
+            # setup to do task_mem_lim jobs on each node
+            # note: specified above that cmd_line uses ${tid} as task index
+            par_tmp = dedent("""\
+                # array index for this job
+                jj={job_index}
+
+                # number of jobs to run on node
+                nodej={nodej}
+
+                # total number of jobs to run in task array
+                maxj={njobs}
+
+                # task index of first task on this node
+                tid=$(($nodej * ($jj - 1) + 1))
+
+                # find index of last task for this node
+                # - from either node task limit (nodej)
+                #   or total number of tasks (maxj)
+                if [ $tid -le $(($maxj - $nodej + 1)) ]; then
+                    last_task=$(($tid + $nodej - 1))
+                else
+                    last_task=$(($maxj))
+                fi
+
+                # start the tasks
+                while [ $tid -le $last_task ]; do
+                    {cmd_line} &
+                    tid=$(($tid+1))
+                done
+
+                # let all tasks finish
+                wait
+            """)
+
+            cmd_str = par_tmp.format(njobs=str(njobs),
+                                     nodej=str(task_mem_lim),
+                                     job_index=str(clust_conf['task_id']),
+                                     cmd_line=cmd_line)
+
+
+        else:
+            array_jobs = njobs
+            cmd_str = cmd_line
+
+
+        # log name
+        if logname is None:
+            logname = str(jobname)+'.sub.'+str(clust_conf['log_task_id'])+'.log'
+
+
+
+    # fill in template
+    jobdict = {"job_name": str(jobname),
+               "cmd_string": cmd_str, # formatted elsewhere
+               "log_name": str(logname),
+               "mem_in_mb": str(mem_mb),
+               "mem_in_gb": str(mem_gb),
+               "wall_hours": str(walltime),
+               "njobs": str(njobs),
+               "array_jobs": str(array_jobs),
+               "array_max": str(maxpar),
+               "core_par": str(j_per_core),
+               "task_id": str(clust_conf['task_id']),
+               "log_task_id": str(clust_conf['log_task_id']),
+               "queue_name": str(queue_name),
+               "sleep_time": str(sleep)
+               }
+
+
+    # write job script
+    sub_file = open(str(jobname)+'.sub.sh','w')
+    sub_file.write(templ.format(**jobdict))
+    sub_file.close()
+
+    # command to run
+    if hold_str != "":
+        launch_str = clust_conf['sub_cmd']+' '+hold_str+' '+str(sub_file.name)
+    else:
+        launch_str = clust_conf['sub_cmd']+' '+str(sub_file.name)
+
+    # record
+    print launch_str
+
+    # run
+    if not testonly:
+        p = subprocess.Popen(launch_str.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
+        out, err = p.communicate()
+        print out
+        return(p.returncode)
+#
+#
+#        # manual error handling here because of Broad LD_LIBRARY_PATH warning
+#        if p.returncode != 0:
+#            if "LD_LIBRARY_PATH" in out:
+#                print out
+#            else:
+#                raise IOError("Job submission failed\nCode: %d\nError: %s\nOutput: %s\n" % (p.returncode, err, out))
+
+    return 0
+
+
+####################################
+#
+# Parse arguments from ricopili interface if invoked directly
+#
+####################################
+if __name__ == "__main__":
+
+    # conditional imports
+    import argparse
+
+    # setup arguments matching usage in imp_prep.pl
+    parser = argparse.ArgumentParser(prog='blueprint.py',
+                                     formatter_class=lambda prog:
+                                     argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40))
+
+    arg_core = parser.add_argument_group('Job Description')
+
+    arg_core.add_argument('--name','--na',
+#                         aliases=['--na'],
+                          type=str,
+                          help='job name',
+                          required=True)
+    arg_core.add_argument('--array',
+                          type=str,
+                          help='file containing command lines to be run',
+                          default=None,
+                          required=False)
+    arg_core.add_argument('--b','-b','--blueprint',
+#                         aliases=['--blueprint','--cmd'],
+                          type=str,
+                          help='command line to be run',
+                          default=None,
+                          required=False)
+
+
+    arg_old = parser.add_argument_group('Ricopili Backwards Compatibility')
+
+    arg_old.add_argument('--job','-j','--j',
+#                        aliases=['--j'],
+                         action='store_true',
+                         help='indicates ricopili call (repurposed)')
+    arg_old.add_argument('--noerr',
+                         action='store_true',
+                         help='no output to ./errandout (for ricopili compatibility)')
+    arg_old.add_argument('--direct','--di',
+#                        aliases=['--di'],
+                         action='store_true',
+                         help='start job without reading prefixes')
+
+    arg_req = parser.add_argument_group('Resource Requirements')
+
+    arg_req.add_argument('--mem',
+                         type=int,
+                         help='memory requirement for each job, in Mb',
+                         default=2000,
+                         required=False)
+    arg_req.add_argument('--walltime','--wa',
+#                        aliases=['--wa'],
+                         type=int,
+                         help='walltime for each job, in hours',
+                         default=1,
+                         required=False)
+#    arg_req.add_argument('--week',
+#                         type=int,
+#                         help='use week/long queues',
+#                         default=None,
+#                         required=False)
+    arg_req.add_argument('--njob',
+                         type=int,
+                         help='max number of jobs to be submitted',
+                         default=1000,
+                         required=False)
+    arg_req.add_argument('--maxpar',
+                         type=int,
+                         help='maximum number of jobs to run in parallel',
+                         default=10000,
+                         required=False)
+#    arg_req.add_argument('--multi',
+#                         type=str,
+#                         help='number of jobs to parallelize, and the number of threads to use for each parallel job (comma separated)',
+#                         default=None,
+#                         required=False)
+    arg_req.add_argument('--fwt',
+                         type=str,
+                         help='file listing job dependencies to wait for before launching job',
+                         default=None,
+                         required=False)
+    arg_req.add_argument('--wait-name',
+                         type=str,
+                         help='name of job dependency',
+                         default=None,
+                         required=False)
+
+    arg_test = parser.add_argument_group('Dev Testing')
+
+    arg_test.add_argument('--testonly',
+                          action='store_true',
+                          help='Skip job submission',
+                          default=False)
+
+
+    args = parser.parse_args()
+
+    # get queue
+    conf_file = os.environ['HOME']+"/picopili.conf"
+    configs = read_conf(conf_file)
+    queue = configs['queue']
+
+    # set logfile name
+    if args.noerr:
+        logloc = os.getcwd()+'/errandout/'
+    else:
+        logloc = os.getcwd()
+
+    # ignore arguments for direct
+    if args.direct:
+        args.njob=None
+        args.walltime=None
+        args.mem=None
+
+
+    send_job(jobname=args.name,
+             arrayfile=args.array,
+             cmd=args.b,
+             logloc=logloc,
+             mem=args.mem,
+             walltime=args.walltime,
+#             week=None,
+             njobs=args.njob,
+             maxpar=args.maxpar,
+#             multi=None,
+             wait_file=args.fwt,
+             wait_name=args.wait_name,
+             cluster=queue,
+             testonly=args.testonly)
+
diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl
index bc98b50..f576f60 100755
--- a/bin/imp_prep.pl
+++ b/bin/imp_prep.pl
@@ -186,7 +186,7 @@ sub trans {
 my $checkpos_script = "checkpos_pico.pl"; ### my.pipeline_tar
 my $checkflip_script = "checkflip_pico.pl"; ### my.pipeline_tar
 my $mutt_script = "mutt"; ### my.pipeline_tar
-my $blue_script = "blueprint_pico.pl"; ### my.pipeline_tar
+my $blue_script = "blueprint.py"; ### my.pipeline_tar
 
 push @test_scripts, $readref_script;
 push @test_scripts, $readrefsum_script;
@@ -572,27 +572,27 @@ sub send_jobarray {
     $command_line =~ s/--force1//;
 
-    my $wt_file = "$sjadir/blueprint_joblist_file-$sjaname.$outname";
+#    my $wt_file = "$sjadir/blueprint_joblist_file-$sjaname.$outname";
 
     chdir "$rootdir" or die "something strange";
 
-    if ($qloc eq "bsub") {
-	$wt_file =~ s/.*blueprint_joblist_file-//;
-    }
-
-    if ($qloc eq "slurm") {
-	$wt_file = "$sjadir/$jobfile.script.id";
-    }
-
-    if ($qloc eq "qsub") {
-	$wt_file = "$sjadir/j.$sjaname.$outname.id";
-    }
-    if ($qloc eq "qsub_c") {
-	$wt_file = "$sjadir/j.$sjaname.$outname.id";
-    }
-    if ($qloc eq "qsub_b") {
-	$wt_file = "$sjadir/j.$sjaname.$outname.id";
-    }
+#    if ($qloc eq "bsub") {
+#	$wt_file =~ s/.*blueprint_joblist_file-//;
+#    }
+#
+#    if ($qloc eq "slurm") {
+#	$wt_file = "$sjadir/$jobfile.script.id";
+#    }
+#
+#    if ($qloc eq "qsub") {
+#	$wt_file = "$sjadir/j.$sjaname.$outname.id";
+#    }
+#    if ($qloc eq "qsub_c") {
+#	$wt_file = "$sjadir/j.$sjaname.$outname.id";
+#    }
+#    if ($qloc eq "qsub_b") {
+#	$wt_file = "$sjadir/j.$sjaname.$outname.id";
+#    }
 
-
+    my $wt_name = "$sjaname.$outname";
 
     if ($serial) {
 	my $sys_re = "$command_line";
@@ -600,7 +600,7 @@ sub send_jobarray {
 	exit;
     }
     else {
-	my $sys_re = "$blue_script --njob $job_bn_th -b \"$command_line\" --wa 2 --di -j --fwt $wt_file --na _if_$outname";
+	my $sys_re = "$blue_script --njob $job_bn_th -b \"$command_line\" --wa 2 --di -j --wait-name $wt_name --na _if_$outname";
 	&mysystem ($sys_re);
     }
 
diff --git a/cluster_templates/broad_uger.array.sub.sh b/cluster_templates/broad_uger.array.sub.sh
new file mode 100755
index 0000000..4416e9c
--- /dev/null
+++ b/cluster_templates/broad_uger.array.sub.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+# wrapper script for job submission on Broad UGER cluster
+#
+# The -V below will provoke a warning that
+# LD_LIBRARY_PATH won't be used for security reasons;
+# this warning can be safely ignored
+
+#$ -j y
+#$ -cwd
+#$ -V
+#$ -N {job_name}
+#$ -o {log_name}
+#$ -q {queue_name}
+#$ -l m_mem_free={mem_in_gb}g,h_vmem={mem_in_gb}g
+#$ -t 1-{array_jobs}
+#$ -tc {array_max}
+
+# sleep option (for preventing race conditions on network file systems)
+sleep {sleep_time}
+
+# setup resources
+source /broad/software/scripts/useuse
+reuse -q Anaconda
+
+# main command line
+{cmd_string}
+
+# eof
diff --git a/cluster_templates/broad_uger.conf b/cluster_templates/broad_uger.conf
new file mode 100644
index 0000000..658eae0
--- /dev/null
+++ b/cluster_templates/broad_uger.conf
@@ -0,0 +1,11 @@
+hour_q short
+hour2_q short
+hour4_q long
+day_q long
+long_q long
+sub_cmd qsub
+log_task_id $TASK_ID
+task_id ${SGE_TASK_ID}
+hold_flag -hold_jid
+array_core 1
+array_mem_mb 128000
diff --git a/cluster_templates/broad_uger.single.sub.sh b/cluster_templates/broad_uger.single.sub.sh
new file mode 100755
index 0000000..42a1335
--- /dev/null
+++ b/cluster_templates/broad_uger.single.sub.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# wrapper script for job submission on Broad UGER cluster
+#
+# The -V below will provoke a warning that
+# LD_LIBRARY_PATH won't be used for security reasons;
+# this warning can be safely ignored
+
+#$ -j y
+#$ -cwd
+#$ -V
+#$ -N {job_name}
+#$ -o {log_name}
+#$ -q {queue_name}
+#$ -l m_mem_free={mem_in_gb}g,h_vmem={mem_in_gb}g
+
+# sleep option (for preventing race conditions on
network file systems) +sleep {sleep_time} + +# setup resources +source /broad/software/scripts/useuse +reuse -q Anaconda + +# main command line +{cmd_string} + +# eof From ab30a1231c61fd8d8470894349b147a159efa0d0 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 28 Sep 2016 16:55:15 -0400 Subject: [PATCH 02/48] remove old blueprint, move perl to config, port Utils.pm --- bin/blue_start_job.pl | 106 -- bin/blueprint_pico.pl | 1581 ----------------------------- bin/buigue_pico.pl | 33 +- bin/imp_prep.pl | 40 +- bin/lift_to_hg19.pl | 27 +- bin/{plague.pl => plague_pico.pl} | 40 +- bin/qc_rel.py | 4 +- bin/rp_perl/Utils.pm | 49 + docs/RICOPILI.md | 5 +- 9 files changed, 105 insertions(+), 1780 deletions(-) delete mode 100755 bin/blue_start_job.pl delete mode 100755 bin/blueprint_pico.pl rename bin/{plague.pl => plague_pico.pl} (85%) create mode 100755 bin/rp_perl/Utils.pm diff --git a/bin/blue_start_job.pl b/bin/blue_start_job.pl deleted file mode 100755 index 5706252..0000000 --- a/bin/blue_start_job.pl +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -my $version = "1.0.0"; -my $progname = $0; -$progname =~ s!^.*/!!; - -my $num = ""; -my $jobfile = ""; -my $parn = ""; - -use Getopt::Long; -GetOptions( - "help"=> \my $help, - "parn=s"=> \$parn, - "n=s"=> \$num, - "jobfile=s"=> \$jobfile, - ); - -use File::Basename; - - - -if ($help || $num eq "" || $jobfile eq ""){ - print "usage: $progname FILES - -version: $version - - options: - - --help print this message then quit - --n INT line to take as command (like this n100) - --jobfile STRING jobfile from which to read command - --parn INT number of parallel jobs (then multiplicated with n) - - - --n n100 - - - created by Stephan Ripke 2012 at MGH, Boston, MA - in the frame of the PGC -\n"; - exit 2; -} - -$num =~ s/n//; -$num = $num * 1; -if ($num == 0) { - print "--n doesn't make sense: $num after transformation\n"; -} - - -if ($parn ne "") { - - $parn = $parn * 1; - if ($parn == 0) { - print "--parn doesn't make sense: $num after transformation\n"; - } - - my $first_n = ($num-1) * $parn; - $first_n++; - - my $last_n = $num * $parn; - - my $lc = 1; - -# my @job_array; - die "$jobfile: ".$! unless open FILE, "< $jobfile"; - die "$jobfile.sub$num: ".$! unless open OUT, "> $jobfile.sub$num"; - while (my $cmd = ){ - chomp ($cmd); - - if ($lc >= $first_n && $lc <= $last_n) { -# push @job_array, $cmd; - print OUT $cmd." 
&\n"; - } - - - $lc++; - } - close FILE; - print OUT "wait\n"; - close OUT; - - system ("chmod u+x $jobfile.sub$num"); - -# exit; - system ("./$jobfile.sub$num"); - - exit; - -} - - - - -my $sys = `head -n $num $jobfile | tail -1`; -chomp ($sys); -system ($sys); -#print $sys."\n"; - - - - diff --git a/bin/blueprint_pico.pl b/bin/blueprint_pico.pl deleted file mode 100755 index 9de6391..0000000 --- a/bin/blueprint_pico.pl +++ /dev/null @@ -1,1581 +0,0 @@ -#!/usr/bin/env perl -use strict; - -my $version = "3.1.0"; -my $progname = $0; -$progname =~ s!^.*/!!; - -my $command_line = "$progname @ARGV"; - - - -#print "blueprint @ARGV\n"; - -############################# -# read config file -############################# - -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my %conf = (); - -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; - -sub trans { - my ($expr)=@_; - unless (exists $conf{$expr}) { - die "config file without entry: $expr\n"; - } - $conf{$expr}; -} - -my $qloc = &trans("queue"); - -my $qsub =0; -my $qsub_computerome =0; -my $qsub_broad =0; -if ($qloc eq "qsub"){ - $qsub = 1; -} -if ($qloc eq "qsub_c"){ - $qsub_computerome =1; -} -if ($qloc eq "qsub_b"){ - $qsub_broad =1; -} - -my $bsub =1 if ($qloc eq "bsub"); -my $msub =1 if ($qloc eq "msub"); -my $slurm =1 if ($qloc eq "slurm"); - - -my $startjob_script ="blue_start_job.pl"; - - -########################################## - -my $bp = ""; -my $sample="plink --bfile prefix --out QC2/prefix-qc2 --geno 0.1 --mind 0.1 --make-bed"; - -my $prefix="prefix"; - -my $jl_file="blueprint_joblist_file"; - -my $core=12; -my $inst=11; -my $walltime="1"; - -my $name_of_job=""; - -my $week = -1; -my $multi = -1; -my $threads = 4; -my $multi_str = ""; -my $errname="errandout"; -my $jmem = 2000; - -my $hhmm = ""; - -my $cores = 16; - -my $job_bn_th = 1000; -my $maxpar = 0; - -use Cwd; -my $rootdir = &Cwd::cwd(); - -my $usage = " -Usage : $progname [options] -b \"blueprint\" - -version: $version - -## --inst INT,INT 2 or 8 core, 16,24 gb number of instancies in one job, default=$core,$inst - --cores INT on LISA, how many cores per node (default $16) - --walltime walltime per job, default=$walltime - --week INT the first N jobs go to the week queue - --multi INT,INT the first N jobs getting multithreading frame, second number the number of threads, default 4 - --job launches job on cluster or serail comands on home - --mach loads module mach in jobs -# --R loads module R in jobs - --plink loads module plink in jobs - --fortran loads module fortran/intel in jobs - --blueprint STRING uses STRING containing prefix and launches commands - --prefix STRING uses STRING instead of prefix - --px STRING,INT,INT substitute STRING by a seq from INT to INT - --linech FILE,INT,STRING make own seq with fam-file and chunksize into STRING - --direct start job without reading prefixes - --name STRING name of job - --wt STRING wait for OK from joblist from this file - --fwt STRING wait for finished from joblist from this file - --serial not all the commands parallel - --errname STRING name of errandout-subdir, default: $errname - - --time HHMM time, when the job is eligible - - --mem INT memory requested for each job (bsub), default = $jmem - - --start directly start the commands, no jobs. 
- - --njob INT start INT jobs at a time - - --noerr noerrandout - --array FILE jobarray, much quicker submission of larger numbers - --maxpar INT max number of parallel jobs (this is for high IO jobs) - -sample of blueprint: $sample - - -outdated { - cores 2: 2 cores - cores 8: any eight (any RAM) - cores 12: strictly 8gb RAM - cores 16: strictly 16gb RAM - cores 24: strictly 24gb RAM -} - -new: changes between 8 and 12 cores, instancies always one less - - - writes output to a specific file $jl_file - - created by Stephan Ripke 2008 at MGH, Boston, MA - -"; - -#print "cd $ENV{PWD}\n"; - -my $NOJ=""; -my $jobarray=""; - - - -use Getopt::Long; -GetOptions( "blueprint=s"=> \$bp, - "job" => \my $job, - "serial" => \my $serial, - "week=i" => \$week, - "multi=s" => \$multi_str, - "inst=s" => \my $inst_str, - "cores=i" => \$cores, - "prefix=s"=> \$prefix, - "wt=s"=> \my $wait_file, - "fwt=s"=> \my $wait_fi_file, - "time=s"=> \$hhmm, - "errname=s"=> \$errname, - "walltime=i" => \$walltime, - "mem=i" => \$jmem, - "njob=i" => \$job_bn_th, - "maxpar=i" => \$maxpar, - "mach" => \my $mach, - "start" => \my $start, - "direct" => \my $direct, - "noerr" => \my $noerr, -# "R" => \my $R, - "plink" => \my $plink, - "fortran" => \my $fortran, - "px=s" => \my $pxarg, - "linech=s" => \my $linechunk_str, - "name=s" => \$NOJ, - "array=s" => \$jobarray, - "help!"=> \my $help ); - - -if ($help){ - print $usage; - exit 2; -} - -if ($bp eq "" && $jobarray eq ""){ - print $usage; - exit 2; -} - -#print "all is right\n"; - -if ($hhmm ne ""){ - $hhmm = "-a $hhmm"; -} - - -$bp =~ s/dollarsign/\$/; -#print "$bp\n"; -#exit; - -if ($NOJ eq ""){ - $NOJ = $bp; - $NOJ =~ s/^[\s]+//g; - my @cols= split /\s+/, $NOJ; - $cols[0] =~ s/[^a-zA-Z]//g; - $NOJ = $cols[0]; -} - -$name_of_job = "-N $NOJ"; - - - - -##################################### -# subroutine to count lines of a file -##################################### - -sub count_lines { - my ($file)=@_; - my $lc=0; - die "$file: ".$! unless open FILE, "< $file"; - while (){ - $lc++; - } - close FILE; - $lc; -} - - - - -#print "$pxstart\t$pxend\n"; -#print "core: $core\n"; - -my @blueprint_out; - - -############################# -# test, if running on server -############################# -#use Sys::Hostname; -#my $host = hostname(); -#my $lisa=0; -#$lisa=1 if ($host =~ m/sara/) ; - -#my $broad = 1 if ($ENV{"DOMAINNAME"} =~ /broadinstitute/); - - - -##################################### -# print array to file -#################################### - -sub a2file { - my ($file, @lines)=@_; - die $! unless open FILE, "> $file"; - foreach (@lines){ - print FILE $_; - } - close FILE; -} - - -################################################### -### system call with test if successfull -################################################### - -sub mysystem(){ - my ($systemstr)="@_"; - my $test_str = `$systemstr`; - push @blueprint_out, $test_str; -# print "$test_str"; -#system($systemstr); - my $status = ($? >> 8); - print "$systemstr\n->system call failed: $status\n" if ($status != 0); -} - - - -my $wait_str = ""; -my $addon_str = ""; - -my $count_jobs = 1 ; -if ($wait_file){ - - my $wc = &count_lines($wait_file); - my $skip_lines = $wc-50 ; - die $! 
unless open WF, "< $wait_file"; - $wait_str = "#PBS -W depend=afterok"; - - while (my $line = ) { - $count_jobs ++; - next if ($count_jobs < $skip_lines); - chomp($line); - $line =~ s/[^0-9]//g; - $wait_str .= ":$line"; - $addon_str .= ":$line"; - - } - close WF; - -# print "$wait_str\n"; - -} - -if ($wait_fi_file){ - if ($qsub == 1) { - - $count_jobs = 1 ; - die "$! <$wait_fi_file>;<$ENV{PWD}>" unless open WF, "< $wait_fi_file"; - my $wc = &count_lines($wait_fi_file); - my $skip_lines = $wc-50; - $wait_str = "#PBS -W depend=afterany"; - - while (my $line = ) { - $count_jobs ++; - next if ($count_jobs < $skip_lines); - chomp($line); - $line =~ s/[^0-9]//g; - $wait_str .= ":$line"; - -# $wait_str .= ":$line.batch1.irc.sara.nl"; - } - close WF; - } -} - - -my ($pxstring,$pxstart,$pxend) = split ',', $pxarg if ($pxarg); -my ($lc_file,$lc_size, $lc_str) = split ',', $linechunk_str if ($linechunk_str); -($core,$inst) = split ',', $inst_str if ($inst_str); - -$inst = 10000 if ($serial); - - -if ($lc_file) { - print "file:$lc_file\n"; - print "csize:$lc_size\n"; - my $temp_size = &count_lines($lc_file); - print "fsize:$temp_size\n"; - $pxstring = $lc_str; - $pxstart=0; - $pxend= sprintf "%d", $temp_size/$lc_size-1; - $pxend++ if ($temp_size % $lc_size != 0); - $pxarg = 1; -} - - - - -my $row_count=0; - -my @job_arr=(); - - - -if ($direct){ - push @job_arr, $bp; -} -else { - if ($jobarray eq "") { -# print "is it here?\n"; - while (<>){ - chomp; - my $line=$bp; - $line =~ s/$prefix/$_/g; - $line =~ s/"//g; -# print $line."\n"; - if ($pxarg){ - foreach my $subs ($pxstart..$pxend){ -# print "this is a a command: $subs\n"; - my $line2=$line; - $line2 =~ s/$pxstring/$subs/g; - push @job_arr, $line2; - } - } - else { - - push @job_arr, $line; -# print "2: this is a a command: $line\n"; - } - } - } -} - -#print "sleep\n"; -#sleep(3); - -my $module =""; -$module = "module load plink\n" if $plink; -$module .= "module load mach\n" if $mach; -$module .= "module load fortran/intel\n" if $fortran; -#$module .= "module load R\n" if $R; - - - - - - -if ($job){ - -# print "entering job\n"; - - - use File::Path; - my @created = mkpath( ## $created ? 
- $errname, - {verbose => 0, mode => 0750}, - ); - - - - - - if ($NOJ eq "prefix") { - $NOJ = $job_arr[0]; - $NOJ =~ s/^[\s]+//g; - my @cols= split /\s+/, $NOJ; - $cols[0] =~ s/[^a-zA-Z]//g; - $NOJ = $cols[0]; - } - -######################################## -### BROAD (bsub) -########################################## - - - if ($bsub) { -# print "entering bsub\n"; - ($multi,$threads) = split ',', $multi_str if ($multi_str ne ""); -# print "multi: $multi\n"; -# print "threads: $threads\n"; -# sleep(10); - $threads = 4 if ($threads eq ""); - - - my $wallmin = $walltime * 60; - $wallmin = 240 if ($wallmin > 240); - my $wallstr = "-W $wallmin"; - - - if ($wallmin < 10){ - $wallstr = "-app shortjobs"; -# $wallmin = 10 ; - } - - - - my $sla_deadline = ""; -# $sla_deadline = "-sla DEADLINEsla"; - - my $time = "hour"; - - if ($jobarray ne "") { - - print "starting job_array\n"; - - my $mem_str = ($jmem / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - my $Rusage = "-R \"rusage[mem=$mem_str]\""; - - if ($multi >= 0){ - my $threads_loc = $threads; - my $threads_loc_half = sprintf "%d",$threads / 2; - my $mem_loc = $mem_str * 1100; -# $Rusage = "-n $threads_loc,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\""; - $Rusage = "-n $threads_loc_half,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\" -M $mem_loc"; -# print "$cmd\n"; - - } - - if ($week > 0) { - $time = "week"; - $wallstr = "-W 10000"; - } - - -# my $pretext = ""; - my $pretext = "$sla_deadline $wallstr -q $time $Rusage"; - - my $sys = 'bsub -P unspecified--broadfolk '.$pretext.' -J "'.$NOJ.'[1-'.$job_bn_th.']" -o "errandout/'.$NOJ.'..%J.%I" '.$startjob_script.' --n n\$LSB_JOBINDEX --jobfile '.$jobarray; - - if ($maxpar > 0) { - -## change maxpar of running job to 100 -## bmod -J "%100" 123 - ## see here - ## http://www.ccs.miami.edu/hpc/lsf/7.0.6/admin/jobarrays.html - - - - $sys = 'bsub -P unspecified--broadfolk '.$pretext.' -J "'.$NOJ.'[1-'.$job_bn_th.']%'.$maxpar.'" -o "errandout/'.$NOJ.'..%J.%I" '.$startjob_script.' --n n\$LSB_JOBINDEX --jobfile '.$jobarray; - print "$sys\n"; - } -# exit; - -# print "sys: $sys\n"; - - my $acmd_file = "$jobarray.array_cmd"; - while (-e $acmd_file) { - $acmd_file .= ".a"; - } - die $! unless open AC, "> $acmd_file"; - print AC "$sys\n"; - close AC; -# exit; - &mysystem ($sys); - exit; - } - - my $job_bn=0; - my $job_bn_name=0; - my $job_bn_str = sprintf "%09d", $job_bn_name; - -# $NOJ = substr($NOJ,0,8); - my $dirname = ""; - $dirname = "$errname/" if ($errname ne "errandout"); - my $blue_n = 0 ; - my $blue_name = "$dirname"."blueprint_jobs_$NOJ.$blue_n"; - - while (-e "$blue_name") { - $blue_n++; - $blue_name = "$dirname"."blueprint_jobs_$NOJ.$blue_n"; - } - - die "$! $blue_name" unless open JOB, "> $blue_name"; - - my $week_count = 0; - my $multi_count = 0; - - my $zaehler = 2*$job_bn_th; - my $rand_th = $zaehler / @job_arr; - - foreach my $cmd (@job_arr){ - - - if ($noerr) { - $job_bn_str = "dump"; - } - else { - $job_bn_str = sprintf "%09d", $job_bn_name; - while (-e "$errname/$NOJ.$job_bn_str") { - $job_bn_name++; - $job_bn_str = sprintf "%09d", $job_bn_name; - } - } - - - - if ($start) { - &mysystem($cmd); - next; - } - - my $wt_str = ""; - if ($wait_fi_file) { -# my $substr_wf = substr($wait_fi_file,0,8); - my $substr_wf = $wait_fi_file; - $wt_str = '-w \'ended ("'.$substr_wf.'")\''; - } - - $time = "hour"; - - my $jmem_loc = $jmem; - - - - if ($week >= 0){ -# here change from priority to week!!! 
-# $time = "priority"; - $time = "week"; -# $time = "week -G deadline"; - $week = $week -1; -# $jmem_loc = $jmem * 2; - $jmem_loc = $jmem + 3000 ; -# $wallstr = '-W 2400'; - $wallstr = ''; - } - else { -# $time = "hour -G psychfolk"; - $time = "hour"; - } - - -#bsub -sla DEADLINEsla -q hou - - - my $mem_str = ($jmem_loc / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - - - my $Rusage = "-R \"rusage[mem=$mem_str]\""; - - - if ($multi >= 0){ - $multi = $multi -1; -# $jmem_loc = $jmem * 2; -# $Rusage = "-n 2,4 -R \"rusage[mem=$mem_str]span[hosts=1]\""; - -# my $threads_loc = $threads + 2; - my $threads_loc = $threads; - my $threads_loc_half = sprintf "%d",$threads / 2; - my $mem_loc = $mem_str * 1100; -# $Rusage = "-n $threads_loc,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\""; - $Rusage = "-n $threads_loc_half,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\" -M $mem_loc"; -# print "$cmd\n"; - - } - - next if (rand() > $rand_th); - - if ($week >= 0){ - $week_count++; -# next if ($week_count > 25); - } - - my $locname = $NOJ; - if ($multi >= 0){ - $multi_count++; -# $locname .= ".mu"; - } - - - - my $errstr = "$errname/$NOJ.$job_bn_str"; - if ($noerr) { - $errstr = "/dev/null"; - } - - my $jcmd = "bsub $sla_deadline $wt_str $wallstr $Rusage -J $locname -q $time -o $errstr \"$cmd\""; - - print JOB "$jcmd\n"; - - - - if (1) { -# &mysystem ("$jcmd "); - &mysystem ("$jcmd 2> /dev/null > /dev/null"); # 0113 - } - else { - print "$jcmd\n"; - } -# sleep(1); -# exit; - $job_bn++; - $job_bn_name++; - - last if ($job_bn > $job_bn_th); - - } - close JOB; - exit; - } - -######################################## -### MSSM (msub) -########################################## - - if ($msub) { -# print "entering bsub\n"; - ($multi,$threads) = split ',', $multi_str if ($multi_str ne ""); -# print "multi: $multi\n"; -# print "threads: $threads\n"; -# sleep(10); - $threads = 4 if ($threads eq ""); - - - my $wallmin = $walltime * 60; - $wallmin = 240 if ($walltime > 240); - my $wallstr = "-W $wallmin"; - - - if ($wallmin < 10){ - $wallstr = "-app shortjobs"; -# $wallmin = 10 ; - } - - - - my $sla_deadline = ""; -# $sla_deadline = "-sla DEADLINEsla"; - -# my $time = "hour"; - my $time = "scavenger"; ## (this is msub) - - if ($jobarray ne "") { - - print "starting job_array\n"; - - my $mem_str = ($jmem / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - my $Rusage = "-R \"rusage[mem=$mem_str]\""; - - if ($multi >= 0){ - my $threads_loc = $threads; - my $threads_loc_half = sprintf "%d",$threads / 2; - my $mem_loc = $mem_str * 1100; -# $Rusage = "-n $threads_loc,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\""; - $Rusage = "-n $threads_loc_half,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\" -M $mem_loc"; -# print "$cmd\n"; - - } - - - -# my $pretext = ""; - my $pretext = "$sla_deadline $wallstr -q $time $Rusage"; - my $sys = 'bsub '.$pretext.' -J "'.$NOJ.'[1-'.$job_bn_th.']" -o "errandout/'.$NOJ.'..%J.%I" '.$startjob_script.' --n n\$LSB_JOBINDEX --jobfile '.$jobarray; - - -# print "$sys\n"; - - my $acmd_file = "$jobarray.array_cmd"; - while (-e $acmd_file) { - $acmd_file .= ".a"; - } - die $! 
unless open AC, "> $acmd_file"; - print AC "$sys\n"; - close AC; -# exit; -# print "$sys\n"; - &mysystem ($sys); - exit; - } - - my $job_bn=0; - my $job_bn_name=0; - my $job_bn_str = sprintf "%09d", $job_bn_name; - -# $NOJ = substr($NOJ,0,8); - my $dirname = ""; - $dirname = "$errname/" if ($errname ne "errandout"); - my $blue_n = 0 ; - my $blue_name = "$dirname"."blueprint_jobs_$NOJ.$blue_n"; - - while (-e "$blue_name") { - $blue_n++; - $blue_name = "$dirname"."blueprint_jobs_$NOJ.$blue_n"; - } - - die "$! $blue_name" unless open JOB, "> $blue_name"; - - my $week_count = 0; - my $multi_count = 0; - - my $zaehler = 2*$job_bn_th; - my $rand_th = $zaehler / @job_arr; - - foreach my $cmd (@job_arr){ - - - if ($noerr) { - $job_bn_str = "dump"; - } - else { - $job_bn_str = sprintf "%09d", $job_bn_name; - while (-e "$errname/$NOJ.$job_bn_str") { - $job_bn_name++; - $job_bn_str = sprintf "%09d", $job_bn_name; - } - } - - - - if ($start) { -# print "cmd\n"; - &mysystem($cmd); - next; - } - - my $wt_str = ""; - if ($wait_fi_file) { -# my $substr_wf = substr($wait_fi_file,0,8); - my $substr_wf = $wait_fi_file; - $wt_str = '-w \'ended ("'.$substr_wf.'")\''; - } - - $time = "scavenger"; - - my $jmem_loc = $jmem; - - - - if ($week >= 0){ -# here change from priority to week!!! -# $time = "priority"; - $time = "week"; -# $time = "week -G deadline"; - $week = $week -1; -# $jmem_loc = $jmem * 2; - $jmem_loc = $jmem + 3000 ; -# $wallstr = '-W 2400'; - $wallstr = ''; - } - else { -# $time = "hour -G psychfolk"; -# $time = "hour"; - $time = "scavenger"; - } - - -#bsub -sla DEADLINEsla -q hou - - - my $mem_str = ($jmem_loc / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - - - my $Rusage = "-R \"rusage[mem=$mem_str]\""; - - - if ($multi >= 0){ - $multi = $multi -1; -# $jmem_loc = $jmem * 2; -# $Rusage = "-n 2,4 -R \"rusage[mem=$mem_str]span[hosts=1]\""; - -# my $threads_loc = $threads + 2; - my $threads_loc = $threads; - my $threads_loc_half = sprintf "%d",$threads / 2; - my $mem_loc = $mem_str * 1100; -# $Rusage = "-n $threads_loc,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\""; - $Rusage = "-n $threads_loc_half,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\" -M $mem_loc"; -# print "$cmd\n"; - - } - - next if (rand() > $rand_th); - - if ($week >= 0){ - $week_count++; -# next if ($week_count > 25); - } - - my $locname = $NOJ; - if ($multi >= 0){ - $multi_count++; -# $locname .= ".mu"; - } - - - - my $errstr = "$errname/$NOJ.$job_bn_str"; - if ($noerr) { - $errstr = "/dev/null"; - } - - my $jcmd = "bsub $sla_deadline $wt_str $wallstr $Rusage -J $locname -q $time -o $errstr \"$cmd\""; - - print JOB "$jcmd\n"; - - - - if (1) { -# &mysystem ("$jcmd "); -# print "$jcmd\n"; - &mysystem ("$jcmd 2> /dev/null > /dev/null"); # 0113 - } - else { - print "$jcmd\n"; - } -# sleep(1); -# exit; - $job_bn++; - $job_bn_name++; - - last if ($job_bn > $job_bn_th); - - } - close JOB; - exit; - } - - - - -######################################## -### SLURM -########################################## - - if ($slurm) { - -# my $job_n=0; - my $sum=0; - - my $jobname; - -# print "cmds: @job_arr\n"; -# print "cmds: $jobarray\n"; - - - my $wallstr = "$walltime:00:00"; - - if ($walltime == 0) { - $wallstr = "00:10:00"; - } - -# if ($start) { -# &mysystem($cmd); -# next; -# } - - - if ($week > 0){ -# my $walltime_loc = $walltime * 6; - $wallstr = "48:00:00"; - $week = $week - 1; - } - - $jobname="j.$job_bn_th.$NOJ"; - - my $jobfile = $jobarray.".script"; - - - my $mem_str = ($jmem / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); 
- -# print "here\n"; - - my ($multi,$threads) = split ',', $multi_str if ($multi_str ne ""); - - - if ($wait_fi_file){ - - $jobfile = $wait_fi_file.".script"; - - - - my $idn; - die $!."($wait_fi_file)" unless open FILE, "< $wait_fi_file"; - my $line = ; - my @cells = split /\s+/, $line; - $idn = $cells[3]; - close FILE; - - die "$! ($jobfile)" unless open JOB, "> $jobfile"; - print JOB "#!/bin/sh\n"; - print JOB "#SBATCH --job-name $NOJ\n"; - print JOB "#SBATCH --output errandout/$NOJ.-%j.out\n"; - print JOB "#SBATCH --ntasks 1\n"; - print JOB "#SBATCH --cpus-per-task 1\n"; - print JOB "#SBATCH --mem-per-cpu $mem_str"."g\n"; - print JOB "#SBATCH --time $wallstr\n"; - print JOB "#SBATCH --dependency afterany:$idn\n"; - - print JOB "$bp\n"; - close (JOB); - - - } - else { - - if ($jobarray eq "") { - $job_bn_th = 0; - $jobfile = "$NOJ.start"; - $jobarray = "$NOJ.scripts"; - die $! unless open JF, "> $jobarray"; - foreach (@job_arr) { - print JF "$_\n"; - $job_bn_th++; - } - close JF; - print $jobfile."\n"; - } -# exit; - - - - - - die "$! ($jobfile)" unless open JOB, "> $jobfile"; - print JOB "#!/bin/sh\n"; - print JOB "#SBATCH --job-name $NOJ\n"; - print JOB "#SBATCH --output errandout/$NOJ.-%j.out\n"; - - -# print JOB "#SBATCH --ntasks $job_bn_th\n"; - my $aend = $job_bn_th; - if ($aend > 1000) { - $aend = 1000; - } - - - my $ast = "#SBATCH --array=1-$aend\n"; - if ($maxpar > 0) { - $ast = "#SBATCH --array=1-$aend%maxpar\n"; - } - print JOB $ast; - - - - - if ($multi > 0) { - print JOB "#SBATCH --cpus-per-task $threads\n"; - } - else { - print JOB "#SBATCH --cpus-per-task 1\n"; - } - print JOB "#SBATCH --mem-per-cpu $mem_str"."g\n"; - print JOB "#SBATCH --time $wallstr\n"; - -# print JOB "dispatch -r $jobarray\n"; - print JOB "$startjob_script --n \$SLURM_ARRAY_TASK_ID --jobfile $jobarray\n"; - close (JOB); - } -# print "debug: $jobfile\n"; -# exit; - - &mysystem ("sbatch $jobfile > $jobfile.id"); -# print "send $jobfile to queue\n"; -# exit; - - -# &a2file ($jl_file."-".$NOJ, @blueprint_out); - - - exit; - - } - - -######################################## -### BROAD UGER -########################################## - - - if ($qsub_broad) { - - my $inst_n=0; - my $job_n=0; - my $sum=0; - - # my $cores=$inst; - # $cores=2 if ($cores < 2); - - my $jobname; - - my $mem_str = ($jmem / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - - - $jobname="j.$NOJ"; - - - unless ($wait_fi_file){ - if ($jobarray eq "") { - - if (@job_arr > 0) { - - die "$! ($jobname.scripts)" unless open SCR, "> $jobname.scripts"; - foreach (@job_arr) { - - print SCR "$_\n"; - - } - close SCR; - $jobarray = "$jobname.scripts"; - } - else { - print "Exit: no jobs to process\n"; - exit; - } - - print "wrote $jobname.scripts\n"; - $job_bn_th = @job_arr; -# print "sleep\n"; -# sleep (3); - - } - } - - - - - if ($jobarray ne "") { - - - my $wallstr = "$walltime:00:00"; - if ($walltime ==0){ - $wallstr = "0:10:00"; - } - -# $cores = 28; - - - print "starting job_array, $jobname\n"; - - - my $qlong_str = ""; - if ($walltime > 2) { - $qlong_str = "-q long"; -# $qlong_str = "-P sanctioned -q sanctioned"; - } - die "$! ($jobname)" unless open JOB, "> $jobname"; - - - print JOB "#PBS -lnodes=1:ppn=1\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $rootdir\n"; - print JOB "$startjob_script -n ".'$SGE_TASK_ID'." 
--jobfile $jobarray\n"; - close JOB; - - my $qsub_cmd = "qsub -l m_mem_free=".$mem_str."g,h_vmem=".$mem_str."g $qlong_str -v PATH,rp_perlpackages -t 1-$job_bn_th -e $rootdir/$errname/ -o $rootdir/$errname/ $name_of_job $hhmm $jobname"; - my $qsub_txt = "$qsub_cmd > $jobname.id"; - my $qsub_log = "echo $qsub_cmd > $jobname.log"; - - &mysystem ($qsub_log); - &mysystem ($qsub_txt); - exit; - } - - - - - if ($wait_fi_file){ - die "$! <$wait_fi_file>" unless open WF, "< $wait_fi_file"; - my $id_str = ; - chomp($id_str); - $id_str =~ s/Your job-array //; - $id_str =~ s/\..*//; - - - close WF; - - my $wallstr = "$walltime:00:00"; - - print "starting motherscript, depending on $id_str\n"; - - $jobname="j.$NOJ"; - die "$! ($jobname)" unless open JOB, "> $jobname"; - print JOB "#PBS -lnodes=1\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $ENV{PWD}\n"; - print JOB "$bp\n"; - close JOB; - - my $qsub_cmd = "qsub -v PATH,rp_perlpackages -l m_mem_free=".$mem_str."g,h_vmem=".$mem_str."g -hold_jid $id_str -e $ENV{PWD}/$errname/ -o $ENV{PWD}/$errname/ $name_of_job $hhmm $jobname"; - my $qsub_txt = "$qsub_cmd > $jobname.id"; - my $qsub_log = "echo $qsub_cmd > $jobname.log"; - - &mysystem ($qsub_log); - &mysystem ($qsub_txt); - exit; - } - - - - - - - - - - } -######################################## -### COMPUTEROME -########################################## - - - if ($qsub_computerome) { - - my $inst_n=0; - my $job_n=0; - my $sum=0; - - # my $cores=$inst; - # $cores=2 if ($cores < 2); - - my $jobname; - - my $mem_str = ($jmem / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - - - if ($jobarray ne "") { - - - my $wallstr = "$walltime:00:00"; - if ($walltime ==0){ - $wallstr = "0:10:00"; - } - -# $cores = 28; - - - print "starting job_array, $jobname\n"; - $jobname="j.$NOJ"; - - - die "$! ($jobname)" unless open JOB, "> $jobname"; - - - print JOB "#PBS -lnodes=1:ppn=1\n"; - print JOB "#PBS -lmem=".$mem_str."gb\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $rootdir\n"; - print JOB "startjob_script -n ".'$PBS_ARRAYID'." --jobfile $jobarray\n"; - close JOB; - - - my $qsub_txt = "qsub -V -t 1-$job_bn_th -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.id"; - my $qsub_log = "echo qsub -V -t 1-$job_bn_th -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.log"; - - &mysystem ($qsub_log); - &mysystem ($qsub_txt); - exit; - } - - - - - if ($wait_fi_file){ - die "$! <$wait_fi_file>" unless open WF, "< $wait_fi_file"; - my $id_str = ; - chomp($id_str); - close WF; - - my $wallstr = "$walltime:00:00"; - - print "starting motherscript, depending on $id_str\n"; - - $jobname="j.$NOJ"; - die "$! 
($jobname)" unless open JOB, "> $jobname"; - print JOB "#PBS -lnodes=1:ppn=1\n"; - print JOB "#PBS -lmem=".$mem_str."gb\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $ENV{PWD}\n"; - print JOB "$bp\n"; - close JOB; - - - my $qsub_txt = "qsub -V -W depend=afteranyarray:$id_str -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.id"; - my $qsub_log = "echo qsub -V -W depend=afteranyarray:$id_str -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.log"; - - &mysystem ($qsub_log); - &mysystem ($qsub_txt); - exit; - } - } - - -######################################## -### LISA -########################################## - - my $inst_n=0; - my $job_n=0; - my $sum=0; - -# my $cores=$inst; -# $cores=2 if ($cores < 2); - - my $jobname; - - - my $mem_str = ($jmem / 1000) * 1; - if ($mem_str < 8){ - $mem_str = 32; - } - else { - $mem_str = 64; - $cores =6; - - } - - - if ($jobarray ne "") { - - - my $wallstr = "$walltime:00:00"; - if ($walltime ==0){ - $wallstr = "0:10:00"; - } - - ## very complicated to get the ceiling of the job-N = number of nodes - my $numnode = ($job_bn_th-0.000001)/$cores; - $numnode=sprintf "%d",$numnode; - $numnode++; - - if ($numnode > 1000){ - $numnode = 1000; - } - - - print "starting job_array, $jobname\n"; - $jobname="j.$NOJ"; -# print "$ENV{PWD}"."\n"; -# print "$rootdir"."\n"; -# print "jpbname: $jobname"."\n"; -# exit; - - - die "$! ($jobname)" unless open JOB, "> $jobname"; - - - - if ($mem_str == 64) { -# print JOB "#PBS -lnodes=1:cores$cores:ppn=$cores:mem64gb\n"; - print JOB "#PBS -lnodes=1:mem64gb\n"; - } - else { -# print JOB "#PBS -lnodes=1:cores$cores:ppn=$cores\n"; - print JOB "#PBS -lnodes=1\n"; - } -# print JOB "#PBS -lmem=".$mem_str."gb\n"; -# } - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $rootdir\n"; - print JOB "$startjob_script --parn $cores -n ".'$PBS_ARRAYID'." --jobfile $jobarray\n"; - close JOB; - - #PBS -lnodes=1:cores16:ppn=16 -lwalltime=1:00:00 - #cd /home/gwas/pgc-samples/scz_sing/data-upload-sgChinese/rerun_0115 - #my.start_job -n $PBS_ARRAYID --jobfile array_test_2 - - -# my $qsub_txt = "qsub -t 1-$job_bn_th -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.id"; - my $qsub_cmd = "qsub -t 1-$numnode -e $errname/ -o $errname/ $name_of_job $hhmm $jobname"; - my $qsub_txt = "$qsub_cmd > $jobname.id"; - my $qsub_log = "echo $qsub_cmd > $jobname.log"; - -# print "$qsub_txt\n"; - - # exit; - &mysystem ($qsub_log); - &mysystem ($qsub_txt); -# sleep (3); - exit; - } - - - - - if ($wait_fi_file){ - die "$! <$wait_fi_file>" unless open WF, "< $wait_fi_file"; - my $id_str = ; - chomp($id_str); - close WF; - - my $wallstr = "$walltime:00:00"; - - print "starting motherscript, depending on $id_str\n"; - - $jobname="j.$NOJ"; - die "$! 
($jobname)" unless open JOB, "> $jobname"; - print JOB "#PBS -lnodes=1\n"; -# print JOB "#PBS -lmem=".$mem_str."gb\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $ENV{PWD}\n"; - print JOB "$bp\n"; - close JOB; - - #PBS -lnodes=1:cores16:ppn=16 -lwalltime=1:00:00 - #cd /home/gwas/pgc-samples/scz_sing/data-upload-sgChinese/rerun_0115 - #my.start_job -n $PBS_ARRAYID --jobfile array_test_2 - - - my $qsub_cmd = "qsub -W depend=afteranyarray:$id_str -e $errname/ -o $errname/ $name_of_job $hhmm $jobname"; - my $qsub_txt = "$qsub_cmd > $jobname.id"; - my $qsub_log = "echo $qsub_cmd > $jobname.log"; - - - - print "$qsub_txt\n"; - - # exit; - &mysystem ($qsub_txt); - &mysystem ($qsub_log); - exit; - } - - - - - - ##### - ## here without jobarray - ##### - - if (1) { - - $jobname="j.$NOJ"; - - die "$! ($jobname.jobarray)" unless open JOBA, "> $jobname.jobarray"; - foreach my $cmd (@job_arr){ - print JOBA "$cmd\n"; - } - close JOBA; - - - my $wallstr = "$walltime:00:00"; - if ($walltime ==0){ - $wallstr = "0:10:00"; - } - - $job_bn_th = @job_arr; - -# print "N: $job_bn_th\n"; -# exit; - # $cores = 16; - - ## very complicated to get the ceiling of the job-N = number of nodes - my $numnode = ($job_bn_th-0.000001)/$cores; - $numnode=sprintf "%d",$numnode; - $numnode++; - - - print "starting job_array, $jobname\n"; - $jobname="j.$NOJ"; -# print "$ENV{PWD}"."\n"; -# print "$rootdir"."\n"; -# print "jpbname: $jobname"."\n"; -# exit; - - - - - - die "$! ($jobname)" unless open JOB, "> $jobname"; - print JOB "#PBS -lnodes=1:cores$cores:ppn=$cores\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $rootdir\n"; - print JOB "$startjob_script --parn $cores -n ".'$PBS_ARRAYID'." --jobfile $jobname.jobarray\n"; - close JOB; - - #PBS -lnodes=1:cores16:ppn=16 -lwalltime=1:00:00 - #cd /home/gwas/pgc-samples/scz_sing/data-upload-sgChinese/rerun_0115 - #my.start_job -n $PBS_ARRAYID --jobfile array_test_2 - - -# my $qsub_txt = "qsub -t 1-$job_bn_th -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.id"; - my $qsub_txt = "qsub -t 1-$numnode -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.id"; - my $qsub_log = "echo qsub -t 1-$numnode -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.log"; - -# print "$qsub_txt\n"; - -# exit; - &mysystem ($qsub_txt); - &mysystem ($qsub_log); -# sleep (3); - exit; - } - - - - - - - - - - if (0) { - foreach my $cmd (@job_arr){ - - my $wallstr = "$walltime:00:00"; - - if ($walltime == 0) { - $wallstr = "00:10:00"; - } - - if ($start) { - &mysystem($cmd); - next; - } - - - if ($week > 0){ - my $walltime_loc = $walltime * 6; - $wallstr = "$walltime_loc:00:00"; - $week = $week - 1; - } - - - - if ($inst_n == 0){ - - my $rn = int(rand(2)); -# if ($rn == 0){ - -# $core = 8; -# $inst = 7 unless ($inst == 1); - -# } -# else { -# $core = 12; -# $inst = 11; -# } - - $jobname="j.$job_n.$NOJ"; -# print " $jobname\n"; - if ($qsub){ - die "$! 
($jobname)" unless open JOB, "> $jobname"; -# print JOB "#PBS -lnodes=1:cores$core\n"; - print JOB "#PBS -lnodes=1\n"; - - if (0) { - print JOB "#PBS -lnodes=1:cores$core\n" if ($core == 2); - print JOB "#PBS -lnodes=1:cores8\n" if ($core == 8); - print JOB "#PBS -lnodes=1:cores8:mem8gb\n" if ($core == 12); - print JOB "#PBS -lnodes=1:cores8:mem16gb\n" if ($core == 16); - print JOB "#PBS -lnodes=1:cores8:mem24gb\n" if ($core == 24); - } - - print JOB "#PBS -lwalltime=$wallstr\n"; - - - ##### as long as dependencies don't work - if (0) { - print JOB "$wait_str\n"; - } - - print JOB "$module\n"; - print JOB "cd $ENV{PWD}\n"; - } - } - if ($qsub){ - - if ($wait_fi_file){ - die $! unless open CMD, "> $NOJ.cmd"; - print CMD "$bp\n"; - close CMD; -# print "blueprint_addon --cmd $NOJ.cmd --fwt $wait_fi_file --out $NOJ\n"; - print JOB "blueprint_addon --cmd $NOJ.cmd --fwt $wait_fi_file --out $NOJ\n"; - - } - else { - print JOB $cmd; -# print JOB " &" unless ($serial); - print JOB " &" ; -# print JOB " &" unless ($cmd =~ /;$/); - print JOB "\n"; - } - - - - - $inst_n++; - $sum++; - if ($inst_n == $inst || $sum == @job_arr || $week > 0){ - print JOB "wait\n"; - close JOB; - $inst_n=0; - $job_n++; - - my $qsub_txt = "qsub -e $errname/ -o $errname/ $name_of_job $hhmm $jobname"; -# print "$qsub_txt\n"; -# &mysystem ("qsub $wait_str -e $errname/ -o $errname/ $name_of_job $jobname") if $lisa; - &mysystem ($qsub_txt) if $qsub; - last if ($job_n > 200); - - } - } - else { - &mysystem ($cmd); - } - } - - &a2file ($jl_file."-".$NOJ, @blueprint_out); - } -} - -else { - foreach (@job_arr){ - print $_."\n"; - - } - if ($qsub){ - my $wallstr = "$walltime:00:00"; - print "walltime: $wallstr\n"; - print "cores: $core\n"; - print "instancies: $inst\n"; - print "module: $module\n"; - print "dir: $ENV{PWD}\n"; - } -} diff --git a/bin/buigue_pico.pl b/bin/buigue_pico.pl index 92034ba..3812cfc 100755 --- a/bin/buigue_pico.pl +++ b/bin/buigue_pico.pl @@ -1,6 +1,15 @@ #!/usr/bin/env perl use strict; +############################# +# load utility functions +############################# + +use FindBin; +use lib "$FindBin::Bin"; +use rp_perl::Utils qw(trans); + + my $version = "1.0.0"; my $progname = $0; $progname =~ s!^.*/!!; @@ -10,31 +19,13 @@ # read config file ############################# -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my %conf = (); - -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; - -sub trans { - my ($expr)=@_; - unless (exists $conf{$expr}) { - die "config file without entry: $expr\n"; - } - $conf{$expr}; -} - my $liloc = &trans("liloc"); - - +my $perlpack = &trans("perlpack"); +use lib $perlpack; ##################################################### -use lib $ENV{rp_perlpackages}; +# use lib $ENV{rp_perlpackages}; diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index f576f60..d14a5ef 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -23,6 +23,14 @@ #### +############################# +# load utility functions +############################# + +use FindBin; +use lib "$FindBin::Bin"; +use Ricopili::Utils qw(trans); + my $version = "1.0.24"; my $progname = $0; $progname =~ s!^.*/!!; @@ -45,24 +53,6 @@ # read config file ############################# -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my %conf = (); - -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; - -sub trans { - 
my ($expr)=@_; - unless (exists $conf{$expr}) { - die "config file without entry: $expr\n"; - } - $conf{$expr}; -} - my $ploc = &trans("p2loc"); my $homedir = &trans("home"); my $qloc = &trans("queue"); @@ -289,12 +279,12 @@ sub trans { # "testing environment variable rp_perlpackages #################################### -print "testing environment variable rp_perlpackages....\n"; -unless (exists $ENV{rp_perlpackages}) { - print "Error: no environment variable for perl-packages, please re-install ricopili and make sure to follow all instructions\n"; - print "------------------------------------\n"; - exit; -} +# print "testing environment variable rp_perlpackages....\n"; +# unless (exists $ENV{rp_perlpackages}) { +# print "Error: no environment variable for perl-packages, please re-install ricopili and make sure to follow all instructions\n"; +# print "------------------------------------\n"; +# exit; +# } print "....all set....\n"; print "------------------------------------\n"; @@ -408,7 +398,7 @@ sub a2filenew_app { my $sjainfofile = "$loloc/impute_dir_info"; unless (-e $sjainfofile) { print "log-file ($sjainfofile) is not existing\n"; - print "please check loloc in ~/ricopili.conf\n"; + print "please check loloc in ~/picopili.conf\n"; exit; } #my $sjainfofile = "$homedir/impute_dir_info_35_test"; diff --git a/bin/lift_to_hg19.pl b/bin/lift_to_hg19.pl index 75ec534..782404b 100755 --- a/bin/lift_to_hg19.pl +++ b/bin/lift_to_hg19.pl @@ -28,31 +28,18 @@ #awk '{print $4,$2}' liftes > liftes.new #/fg/debakkerscratch/ripke/plink/1.08/src/plink --bfile ../cmc2_051310.8_toimpute --update-map liftes.new --make-bed - - ############################# -# read config file +# load utility functions ############################# -#print "host: ".$ENV{HOST}."\n"; -#exit; -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my %conf = (); +use FindBin; +use lib "$FindBin::Bin"; +use rp_perl::Utils qw(trans); -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; -sub trans { - my ($expr)=@_; - unless (exists $conf{$expr}) { - die "config file without entry: $expr\n"; - } - $conf{$expr}; -} +############################# +# read config file +############################# my $ploc = &trans("p2loc"); my $liloc = &trans("liloc"); diff --git a/bin/plague.pl b/bin/plague_pico.pl similarity index 85% rename from bin/plague.pl rename to bin/plague_pico.pl index fb0ee88..9825be5 100755 --- a/bin/plague.pl +++ b/bin/plague_pico.pl @@ -1,6 +1,13 @@ #!/usr/bin/env perl use strict; -BEGIN { push @INC, $ENV{rp_perlpackages}.'/Compress-Raw-Zlib-2.065/blib/lib' } + +############################# +# load utility functions +############################# + +use FindBin; +use lib "$FindBin::Bin"; +use rp_perl::Utils qw(trans); my $version = "1.0.0"; my $progname = $0; @@ -11,30 +18,14 @@ # read config file ############################# -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my %conf = (); - -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; - -sub trans { - my ($expr)=@_; - unless (exists $conf{$expr}) { - die "config file without entry: $expr\n"; - } - $conf{$expr}; -} - my $hmloc = &trans("hmloc"); - +my $perlpack = &trans("perlpack"); +use lib $perlpack; ##################################################### my $sc_file = "$hmloc/snp_platform_collection.txt.new.0815.gz"; +my $sc_file_0416 = 
"$hmloc/snp_platform_collection.txt.new.0416a.gz"; my $scol = 2; @@ -44,7 +35,9 @@ sub trans { version: $version - --scf STRING SNP collection file, default: $sc_file + --scf STRING SNP collection file + default: $sc_file + first checking this: $sc_file_0416 --scol INT column of SNPs, default = $scol --create STRING create new entry with name STRING -help print this message and exit @@ -94,7 +87,6 @@ sub split_line { my %bsnps=(); -# use lib $ENV{rp_perlpackages}.'/Compress-Raw-Zlib-2.065/blib/lib'; use Compress::Zlib ; ## read bim-file @@ -113,6 +105,10 @@ sub split_line { ## compare with snp-collection +if (-e $sc_file_0416) { + $sc_file = $sc_file_0416; +} + unless (-e $sc_file) { $sc_file = "$hmloc/snp_platform_collection.txt.new.0114.gz"; if (-e $sc_file) { diff --git a/bin/qc_rel.py b/bin/qc_rel.py index 9d994f1..e25c372 100755 --- a/bin/qc_rel.py +++ b/bin/qc_rel.py @@ -130,7 +130,7 @@ # get directory containing current script # (hack to get plague script location) rp_bin = os.path.dirname(os.path.realpath(__file__)) - plague_ex = rp_bin + '/plague.pl' + plague_ex = rp_bin + '/plague_pico.pl' ############# @@ -1063,4 +1063,4 @@ print '\n############' print '\n' print 'SUCCESS!\n' -exit(0) \ No newline at end of file +exit(0) diff --git a/bin/rp_perl/Utils.pm b/bin/rp_perl/Utils.pm new file mode 100755 index 0000000..7bf48d2 --- /dev/null +++ b/bin/rp_perl/Utils.pm @@ -0,0 +1,49 @@ +package rp_perl::Utils; + +###################### +# +# Adapted from ricopili (https://github.com/Nealelab/ricopili) +# Original code by Robert Karlsson (@robkar on github) +# +###################### + +use strict; +use warnings; + +BEGIN { + require Exporter; + our @ISA = qw(Exporter); + our @EXPORT_OK = qw(trans $conf_file); +} + +############################# +# read config file +############################# + +our $conf_file = $ENV{HOME}."/picopili.conf"; +my %conf = (); + +die $!."($conf_file)" unless open FILE, "< $conf_file"; +while (my $line = ){ + next if ($line =~ /^#/); + my @cells = split /\s+/, $line; + next unless ($#cells >= 1); + + # expand environment variables and '~' for home directory in conf entries + $cells[1] =~ s/^~/$ENV{HOME}/; + $cells[1] =~ s/\$\{(\w+)\}/$ENV{$1}/g; + $cells[1] =~ s/\$(\w+)/$ENV{$1}/g; + + $conf{$cells[0]} = $cells[1]; +} +close FILE; + +sub trans { + my ($expr) = @_; + unless (exists $conf{$expr}) { + die "config file without entry: $expr\n"; + } + $conf{$expr}; +} + +1; diff --git a/docs/RICOPILI.md b/docs/RICOPILI.md index df850e5..ef8e45e 100644 --- a/docs/RICOPILI.md +++ b/docs/RICOPILI.md @@ -1,13 +1,12 @@ The following scripts are adapted from ricopili (https://github.com/Nealelab/ricopili) with very minor changes: -* `blue_start_job.pl`, from `my.start_job` -* `blueprint_pico.pl`, from `blueprint` * `buigue_pico.pl`, from `buigue` * `checkflip_pico.pl`, from `checkflip4` * `checkpos_pico.pl`, from `checkpos6` * `config`, from `rp_config` * `lift_to_hg19.pl`, from `lift18219` -* `plague.pl`, from `plague_2` +* `plague_pico.pl`, from `plague_2` +* `./rp_perl/Utils.pm`, from `./Ricopili/Utils.pm` In addition, the following scripts are adapted from ricopili with more substantial changes as indicated: From 863840490466c06b0d05db1b57330d6a1bfe95cc Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 28 Sep 2016 20:17:13 -0400 Subject: [PATCH 03/48] separate and simplify config, move imp_prep logs --- CONFIG | 1 + bin/bin_check_pico | 7 + bin/checkflip_pico.pl | 1 - bin/checkpos_pico.pl | 1 - bin/config | 1127 ----------------------------------------- 
bin/config_pico.pl | 714 ++++++++++++++++++++++++++ bin/imp_prep.pl | 24 +- config | 1 - docs/RICOPILI.md | 3 +- 9 files changed, 731 insertions(+), 1148 deletions(-) create mode 120000 CONFIG create mode 100755 bin/bin_check_pico delete mode 100755 bin/config create mode 100755 bin/config_pico.pl delete mode 120000 config diff --git a/CONFIG b/CONFIG new file mode 120000 index 0000000..200ec28 --- /dev/null +++ b/CONFIG @@ -0,0 +1 @@ +./bin/config_pico.pl \ No newline at end of file diff --git a/bin/bin_check_pico b/bin/bin_check_pico new file mode 100755 index 0000000..819b0c4 --- /dev/null +++ b/bin/bin_check_pico @@ -0,0 +1,7 @@ +#!/usr/bin/perl +use strict; + + +### dud script to check whether search path is correct +### Jackie addition 01/22/14 + diff --git a/bin/checkflip_pico.pl b/bin/checkflip_pico.pl index aba1290..69e2809 100755 --- a/bin/checkflip_pico.pl +++ b/bin/checkflip_pico.pl @@ -50,7 +50,6 @@ sub trans { my $sloc = &trans("sloc"); my $hmloc = &trans("hmloc"); -#my $ploc = &trans("ploc"); my $p2loc = &trans("p2loc"); diff --git a/bin/checkpos_pico.pl b/bin/checkpos_pico.pl index 6d5fc5c..8f417d6 100755 --- a/bin/checkpos_pico.pl +++ b/bin/checkpos_pico.pl @@ -50,7 +50,6 @@ sub trans { my $sloc = &trans("sloc"); my $hmloc = &trans("hmloc"); -#my $ploc = &trans("ploc"); my $p2loc = &trans("p2loc"); diff --git a/bin/config b/bin/config deleted file mode 100755 index 709da3c..0000000 --- a/bin/config +++ /dev/null @@ -1,1127 +0,0 @@ -#!/usr/bin/env perl -use strict; -use File::Basename; -use Cwd; -use Data::Dumper; - -### Script to configure settings for ricopili pipeline -### Jackie Goldstein, Jan 2014 - - - - - -my $version = "2.0.0"; -my $progname = $0; - -$progname =~ s!^.*/!!; - -my $cdir = cwd(); -my $home = $ENV{HOME}; -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my $command_line = "$progname @ARGV"; - -############################# -# Ask user what cluster they're using -############################# -#my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"other",0); -#my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"computerome",0,"other",0); -my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"computerome",0,"co_ipsych",0,"other",0); -my @cluster_names = ("broad","mssm","genomedk","lisa","computerome","co_ipsych","other"); -print "Please enter your cluster name from the following options:\n"; -my $i = 1; -foreach (@cluster_names){ - print "\t($i) $_\n"; - $i += 1; -} -print "\n"; -my $cluster = "other"; -while (1) { - $cluster = lc <>; - chomp $cluster; - if (exists $clusters{$cluster}){$clusters{$cluster} = 1;last;} - else { - $cluster =~ s/(\)|\()//g; - if ($cluster >= 1 && $cluster <= $i){$cluster -= 1; $cluster = $cluster_names[$cluster];$clusters{$cluster} = 1;last;} - else { - print "Did not recognize option. Please enter a cluster name from the options below:\n"; - my $i = 1; - foreach (@cluster_names){ - print "\t($i) $_\n"; - $i += 1; - } - print "\n"; - my $cluster = "other"; - } - } -} -print "\nUsing the following cluster: $cluster\n\n"; - - - - - -############################# -# Determine the shell -############################# -my $shell = ''; -if (exists $ENV{SHELL}){$shell = basename($ENV{SHELL});} -if ($shell eq "bash-login-check"){$shell = "bash";} -if ($shell ne "bash" && $shell ne "tcsh") { - print "Warning! 
Shell not recognized: $shell\n"; - print "Please send email to rp_dev\@broadinstitute.org\n"; -} -print "Detected you are using the following shell: $shell\n\n"; - - -################################################### -### system call with test if successful -################################################### -sub mysystem(){ - my ($systemstr)="@_"; - system($systemstr); - my $status = ($? >> 8); - die "$systemstr\n->system call failed: $status" if ($status != 0); -} - - -################################################### -### Check if rp_bin already installed -################################################### -system("bin_check"); # dummy script that doesn't do anything -my $status_bin = ($? >> 8); -system("bin_check_pdfjam"); # dummy script that doesn't do anything -my $status_pdfjam = ($? >> 8); - - -if ($clusters{lisa} == 1) { - unless (-e "$home/.bash_profile") { - die $! unless open FILE, "> $home/.bash_profile"; - print FILE 'if [ -f ~/.bashrc ]; then '."\n"; - print FILE ' . ~/.bashrc'."\n"; - print FILE 'fi'."\n"; - close FILE; - } - unless (-e "$home/.bashrc") { - system "touch ~/.bashrc\n"; - } -} - -unless ($clusters{broad} == 1) { - -# print "$cdir/pdfjam\n"; - die $!."($cdir/pdfjam/pdfjam)" unless open FILE, "< $cdir/pdfjam/pdfjam"; - die $!."($cdir/pdfjam/pdfjam.ow)" unless open OUT, "> $cdir/pdfjam/pdfjam.ow"; - while (my $line = ){ - $line =~ s!/psych/genetics_data/ricopili_tmp!/scratch!; - print OUT "$line"; - } - close FILE; - close OUT; - system ("mv $cdir/pdfjam/pdfjam.ow $cdir/pdfjam/pdfjam"); - print "rewrote $cdir/pdfjam.ow\n"; - -} - - - - -if ($status_bin == 0 && $status_pdfjam == 0 && !(-e "install_true")) { - print "\n----------------------------------------------------\n"; - print "\n\nWarning: Ricopili is already installed.\n"; - print "Do you wish to uninstall Ricopili first (recommended)? \n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") { - print "\n----------------------------------------------------\n"; - print "----------------------------------------------------\n"; - print "----------------------------------------------------\n"; - - print "\nTo uninstall Ricopili, you need to remove the following paths from your default search path:\n"; - my @PATH = split(':',$ENV{PATH}); - foreach (@PATH) { - if ($_ =~ "rp_bin" || $_ =~ "rp_perlpackages") { - print "\t$_\n";} - } - print "If this seems incorrect, DO NOT continue with the uninstall instructions below!!!\n"; - print "If this is correct, please invoke the following 2 commands not preceeded by ##\n"; - print "after this, please restart ./rp_config\n"; - - my @PATH = split ":", $ENV{PATH}; - my @NEW_PATH = (); - foreach (@PATH) { - unless ($_ =~ "rp_bin") { - push @NEW_PATH, $_; - } - } - my $new_path = join ":", @NEW_PATH; - - my $i = 1; - # 1. 
Remove paths for this session - if ($shell eq "bash") { - print "\n----------------------------------------------------\n"; - print "## Please enter the following command to remove rp_bin from the search path for this session:\n\n"; - $i += 1; - print "\texport PATH=$new_path\n"; - } - elsif ($shell eq "tcsh") { - print "\n----------------------------------------------------\n"; - print "## Please enter the following command to remove rp_bin from the search path for this session:\n\n"; - $i += 1; - print "\tsetenv PATH $new_path\n"; - } - else { - print "\n----------------------------------------------------\n"; - print "## You will need to figure out how to change the current search path to the following for your shell:\n\n"; - $i += 1; - print "\t$new_path\n"; - } - # 2. Remove the path permanently from the search path - if ($clusters{broad} == 1) { - if (-e "$home/.my.bashrc") { - &mysystem("grep -v \"rp_bin\\|rp_perlpackages\" $home/.my.bashrc > my.bashrc_minus_rpbin.txt"); - &mysystem("cp $home/.my.bashrc my.bashrc.copy"); - print "\n----------------------------------------------------\n"; - print "\n$## To remove rp_bin permanently from the default search path in bash, run the following command:\n\n"; - $i += 1; - print "\tmv my.bashrc_minus_rpbin.txt $home/.my.bashrc\n\n"; - print "## which will delete the following lines from your $home/.my.bashrc file:\n"; - - - my @tmp_lines = `grep rp_bin $home/.my.bashrc`; - foreach (@tmp_lines) { - print "##### $_"; - } - @tmp_lines = `grep rp_perlpackages $home/.my.bashrc`; - foreach (@tmp_lines) { - print "##### $_"; - } - - print "\n## A copy of $home/.my.bashrc is available at my.bashrc.copy\n"; - } - if (-e "$home/.my.cshrc") { - &mysystem("grep -v \"rp_bin\\|rp_perlpackages\" $home/.my.cshrc > my.cshrc_minus_rpbin.txt"); - &mysystem("cp $home/.my.cshrc my.cshrc.copy"); - print "\n----------------------------------------------------\n"; - print "\n$i. 
To remove rp_bin permanently from the default csh or tcsh search path, run the following command:\n\n"; - $i += 1; - print "\tmv my.cshrc_minus_rpbin.txt $home/.my.cshrc\n\n"; - print "## which will delete the following lines from your $home/.my.cshrc file:\n"; - - - - my @tmp_lines = `grep rp_bin $home/.my.cshrc`; - foreach (@tmp_lines) { - print "##### $_"; - } - @tmp_lines = `grep rp_perlpackages $home/.my.cshrc`; - foreach (@tmp_lines) { - print "##### $_"; - } -## &mysystem("grep \"rp_bin\\|rp_perlpackages\" $home/.my.cshrc"); - - - - print "\n## A copy of $home/.my.cshrc is available at my.cshrc.copy\n"; - } - } - elsif ($clusters{genomedk} == 1 || $clusters{mssm} == 1 || $clusters{lisa} == 1 || $clusters{computerome} == 1 || $clusters{co_ipsych} == 1) { - if (-e "$home/.bashrc") { - &mysystem("grep -v \"rp_bin\\|rp_perlpackages\" $home/.bashrc > my.bashrc_minus_rpbin.txt"); - &mysystem("cp $home/.bashrc my.bashrc.copy"); - print "\n----------------------------------------------------\n"; - print "\n$## To remove rp_bin permanently from the default search path in bash, run the following command:\n\n"; - $i += 1; - print "\tmv my.bashrc_minus_rpbin.txt $home/.bashrc\n\n"; - print "## which will delete the following lines from your $home/.bashrc file:\n"; - - - - my @tmp_lines = `grep rp_bin $home/.bashrc`; - foreach (@tmp_lines) { - print "##### $_"; - } - @tmp_lines = `grep rp_perlpackages $home/.bashrc`; - foreach (@tmp_lines) { - print "##### $_"; - } -## &mysystem("grep \"rp_bin\\|rp_perlpackages\" $home/.bashrc"); - - - print "\n## A copy of $home/.bashrc is available at my.bashrc.copy\n"; - } - } - - - - - else { - print "\n----------------------------------------------------\n"; - print "\n## Remove the directories listed above from the same place where you permanently added the directories to the search path.\n"; - $i += 1; - } - print "\n\n"; - exit; - } - elsif ($answer eq "n") {&mysystem("touch install_true");last;} - else {print "Please answer with y or n.\n";} - } -}; - -################################################### -### Add rp_bin to default search path -################################################### -system("bin_check"); # dummy script that doesn't do anything -my $status_bin = ($? >> 8); -system("bin_check_pdfjam"); # dummy script that doesn't do anything -my $status_pdfjam = ($? >> 8); - - - - -# exit; - - -unless ($status_bin == 0 && $status_pdfjam == 0) { - my $bash = "$cdir/my.bashrc_rp_path"; - my $csh = "$cdir/my.cshrc_rp_path"; - - die $! unless open FILE, "> $bash"; - print FILE "\n\nPATH=$cdir:\$PATH\n"; - print FILE "PATH=$cdir/pdfjam:\$PATH\n"; - - if ($clusters{lisa}){ - print FILE "export rp_perlpackages=/home/gwas/perl_modules\n"; - } - if ($clusters{computerome}){ - print FILE "export rp_perlpackages=/home/people/sripke/rp_external_bins/perl_packages\n"; - } - if ($clusters{co_ipsych}){ - print FILE "export rp_perlpackages=/data/user_tools/rp_external_bins/perl_packages\n"; - } - if ($clusters{broad}){ - print FILE "export rp_perlpackages=/home/unix/sripke/perl_modules\n"; - } - close FILE; - - die $! 
unless open FILE, "> $csh"; - print FILE "\n\nset path=($cdir \$path)\n"; - print FILE "set path=($cdir/pdfjam \$path)\n"; - if ($clusters{broad}){ - print FILE "setenv rp_perlpackages /home/unix/sripke/perl_modules\n"; - } - close FILE; - - - - print "\n----------------------------------------------------\n"; - print "## Please run the following commands to permanently add rp_bin to the default search path and restart the configuration: \n\n"; - - - if ($clusters{broad}){ - my $i = 1; - - if (-e "$home/.my.bashrc") { - print "cat $bash >> ~/.my.bashrc\n"; - $i += 1; - } - if (-e "$home/.my.cshrc") { - print "cat $csh >> ~/.my.cshrc\n"; - $i += 1; - } - - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - print "export rp_perlpackages=/home/unix/sripke/perl_modules\n"; - $i += 1; - } - elsif ($shell eq "tcsh") { - print "set path=($cdir \$path)\n"; - $i += 1; - print "set path=($cdir/pdfjam \$path)\n"; - $i += 1; - print "setenv rp_perlpackages /home/unix/sripke/perl_modules\n"; - $i += 1; - } - - - - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - elsif ($clusters{genomedk}){ - my $i = 1; - - - if (-e "$home/.bashrc") { - print "cat $bash >> ~/.bashrc\n"; - $i += 1; - } - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - } - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - elsif ($clusters{lisa}){ - my $i = 1; - - if (-e "$home/.bashrc") { - print "cat $bash >> ~/.bashrc\n"; - $i += 1; - } - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - } - print "export rp_perlpackages=/home/gwas/perl_modules\n"; - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - elsif ($clusters{computerome}){ - my $i = 1; - - if (-e "$home/.bashrc") { - print "cat $bash >> ~/.bashrc\n"; - $i += 1; - } - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - } - print "export rp_perlpackages=/home/people/sripke/rp_external_bins/perl_packages\n"; - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - elsif ($clusters{co_ipsych}){ - my $i = 1; - - if (-e "$home/.bashrc") { - print "cat $bash >> ~/.bashrc\n"; - $i += 1; - } - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - } - print "export rp_perlpackages=/data/user_tools/rp_external_bins/perl_packages\n"; - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - - - - - elsif ($clusters{mssm}){ - my $i = 1; - - if (-e "$home/.bashrc") { - print "cat $bash >> ~/.bashrc\n"; - $i += 1; - } - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - } - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - else { - print "You'll need to add the following paths to your default search path:\n"; - print "\t$cdir\n"; - print "\t$cdir/pdfjam\n\n"; - print "If you are using a bash shell, sample commands are located in this file: $bash\n"; - print "If you are using a tcsh shell, sample commands are located in this file: $csh\n"; - print "For example instructions, see http://www.cyberciti.biz/faq/unix-linux-adding-path/\n"; - print "If possible, add these paths permanently. 
Otherwise, you will need to do this everytime you start a new session.\n"; - print "After these directories have been added to the search path, rerun this script: ./rp_config\n"; - &mysystem("touch install_true"); - exit; - } -} - -print "Required directories found in search path:\n"; -print "\trp_bin/ -- success\n"; -print "\trp_bin/pdfjam/ -- success\n\n"; - -system("latex small2e > /dev/null"); # dummy script that doesn't do anything -my $status_latex = ($? >> 8); -unless ($status_latex) { - print "Detected pdflatex is installed.\n\n"; - &mysystem("rm small2e.*"); -} -else { - print "---------------------------------------\n\n"; - print "Error -- pdflatex is not installed!\n\n"; - - if ($clusters{genomedk} == 1){ - print "Run the following commands to add pdflatex to the default search path:\n"; - print "\techo \"source /com/extra/texlive/2014/load.sh\" >> ~/.bashrc\n"; - print "\tsource /com/extra/texlive/2014/load.sh\n\n"; -} - else { - print "Please install pdflatex by downloading the texlive package and following the installation instructions (https://www.tug.org/texlive/)\n\n"; - } - print "Rerun this script once pdflatex has been added to the default search path (./rp_config)\n\n"; - print "---------------------------------------\n"; - exit; -} - -### Make sure all perl packages are installed -### JG addition -- wrote this block before I saw you added something similar above -#if ($clusters{broad} == 1){ -# unless (exists $ENV{rp_perlpackages}) { -# print "Run the following commands to add rp_perlpackages as an environmental variable:\n"; -# print "echo \"export rp_perlpackages=/home/unix/sripke/perl_modules/\" >> ~/.my.bashrc\n"; -# print "echo \"setenv rp_perlpackages /home/unix/sripke/perl_modules/\" >> ~/.my.cshrc\n"; -# if ($shell eq "bash") { -# print "export rp_perlpackages=/home/unix/sripke/perl_modules/\n"; -# } -# if ($shell eq "tcsh") { -# print "setenv rp_perlpackages /home/unix/sripke/perl_modules/\n"; -# } - -# print "./rp_config\n\n"; -# exit; -# } -# else { print "Detected rp_perlpackages as an environmental variable.\n\n";} -#} -if ($clusters{genomedk} == 1){ - unless (exists $ENV{rp_perlpackages}) { - print "Run the following commands to add rp_perlpackages as an environmental variable:\n"; - print "echo \"export rp_perlpackages=/project/ricopili/perl_packages/\" >> ~/.bashrc\n"; - print "export rp_perlpackages=/project/ricopili/perl_packages/\n"; - print "./rp_config\n\n"; - exit; - } - else { print "Detected rp_perlpackages as an environmental variable.\n\n";} -} -if ($clusters{mssm} == 1){ - unless (exists $ENV{rp_perlpackages}) { - print "Run the following commands to add rp_perlpackages as an environmental variable:\n"; - print "echo \"export rp_perlpackages=/hpc/users/xripkes01/perl_modules/\" >> ~/.bashrc\n"; - print "export rp_perlpackages=/hpc/users/xripkes01/perl_modules/\n"; - print "./rp_config\n\n"; - exit; - } - else { print "Detected rp_perlpackages as an environmental variable.\n\n";} -} - -### Make sure lapack is installed -if ($clusters{genomedk} == 1){ - unless ($ENV{EXTRAS} =~ /lapack/) { - print "Run the following commands to add lapack to the default search path:\n"; - print "echo \"source /com/extra/lapack/3.5.0/load.sh\" >> ~/.bashrc\n"; - print "source /com/extra/lapack/3.5.0/load.sh\n"; - print "./rp_config\n\n"; - exit; - } - else { print "Detected lapack is installed.\n\n";} -} - -my $ans_ow = "y"; -if (-e $conf_file) { - print "Configuration file already exists at $conf_file\n"; - print "Do you wish to overwrite this file? 
(y/n)\n"; - while (1) { - $ans_ow = lc <>; - chomp $ans_ow; - if ($ans_ow eq "y") { - print "Rewriting configuration file. Making a backup to $conf_file.copy\n\n"; - &mysystem("cp $conf_file $conf_file.copy"); - last; - } - elsif ($ans_ow eq "n") {print "Not overwriting $conf_file.\n";last;} - else {print "Please answer with y or n.\n";} - } -}; - -my $cd = cwd(); -my $sloc = ""; -my $loloc = ""; -my $initials = ""; -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my $hdir = $ENV{HOME}; -my $email = ""; -my @text = (); - -if ($ans_ow eq "y"){ -############################# -# make scratch directory -############################# -if ($clusters{broad} == 1) { - my $user_name = basename($ENV{HOME}); - $sloc = "/broad/hptmp/$user_name/"; - print "Do you want to use the following default scratch directory? (y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} -elsif ($clusters{lisa} == 1) { - $sloc = "/scratch/"; - print "Do you want to use the following default scratch directory? (y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} - -elsif ($clusters{computerome} == 1) { - $sloc = "/scratch/"; - print "Do you want to use the following default scratch directory? (y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} - -elsif ($clusters{co_ipsych} == 1) { - $sloc = "/data/scratch/"; - print "Do you want to use the following default scratch directory? (y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} - -elsif ($clusters{genomedk} == 1) { - $sloc = "/project/ricopili/scratch_dir/"; - print "Do you want to use the following default scratch directory? (y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} - - -elsif ($clusters{mssm} == 1) { - my $user_name = $ENV{USER}; - $sloc = "/sc/orga/scratch/$user_name/"; - print "Do you want to use the following default scratch directory? 
(y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} -else { - print "Please enter a scratch directory to use:\n"; - $sloc = "$cd/tmp/"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } - -} - -unless (-d $sloc) { - print "Making scratch directory: $sloc\n\n"; - &mysystem("mkdir $sloc"); -} -else { - print "Scratch directory already exists at $sloc\n"; -} -print "\n"; - -############################# -# write config file to home directory -############################# -print "Please enter your initials (2 letters):\n"; -while (1) { - $initials = lc <>; - chomp $initials; - if (length($initials) == 2) {last;} - else {print "Make sure initials are 2 letters!\n";} -} -print "\n"; - -print "Please enter your email address:\n"; -my $email = <>; -chomp $email; -print "\n"; - - - -my $defall = 0; - -if ($clusters{lisa} == 1) { - print "Do you want to use default values for the rest of the installation process? (y or n)\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using default values for the rest of the installation process\n\n"; $defall = 1;last;} - elsif ($answer eq "n") {print "Not using default values for the rest of the installation process\n\n"; $defall = 0;last;} - else {print "Please answer with y or n.\n";} - } -} - - - - - - - -my $home_dir = $ENV{HOME}; -$loloc = "$home_dir/"; -print "Do you want to use the following default directory to store your log files? 
(y or n)\n"; -print "\t$loloc\n"; -if ($defall == 0) { - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $loloc as the log directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a log directory to use:\n"; - $loloc = <>; - chomp $loloc; - $loloc =~ s/^~/$ENV{HOME}/g; - $loloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} - -#print "Please enter the directory you wish to store your log files in:\n"; -#while (1) { -# $loloc = <>; -# chomp $loloc; -# $loloc =~ s/^~/$ENV{HOME}/g; -# $loloc =~ s/^\./$cd/g; -# print "Using $loloc as the directory for log files.\n"; -# unless (-d $loloc) {&mysystem("mkdir $loloc");} -# last; -#} -print "\n"; - -my %longvar = ("ploc","PLINK", - "p2loc","PLINK2", - "shloc","SHAPEIT", - "i2loc","IMPUTE2", - "liloc","Liftover", - "eloc","Eigenstrat", - "rloc","R", - "rpac","Rpackages", - "hmloc","HapMap reference", - "meloc","METAL", - "ldloc","LDscore", -# "hvloc","HaploView" - ); - - -my %variables = ("ploc", "", - "p2loc","", - "shloc","", - "i2loc","", - "liloc","", - "eloc","", - "rloc","", - "rpac","", - "hmloc","", - "meloc","", -# "hvloc","", - ); - - - - - -if ($clusters{broad}){ - %variables = ("ploc", "/home/unix/sripke/plink_src/src/", - "p2loc","/home/unix/sripke/plink_src/plink_1.9_newest/", - "shloc","/home/unix/sripke/shapeit/", - "i2loc","/psych/genetics_data/ripke/references_from_debakkerscratch/impute_v2/impute_v2/impute_2.2.7_beta/", - "liloc","/home/unix/sripke/liftover/", - "eloc","/home/unix/sripke/eigensoft/bin", - "ldloc","/psych/genetics_data/ripke/ldsc/", - "rloc","broadinstitute", - "rpac","NA", - "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref/", - "meloc","/psych/genetics_data/ripke/references_from_debakkerscratch/metal/", -# "hvloc","/home/radon01/sripke/bakker_ripke/haploview/", - ); -} - -elsif ($clusters{lisa}){ - %variables = ("ploc", "/home/gwas/plink/1.08/src", - "p2loc","/home/gwas/plink2/plink_1.9_newest", - "shloc","/home/gwas/shapeit", - "i2loc","/home/gwas/bin_impute_v2/impute_v2.2.2_x86_64_static", - "liloc","/home/gwas/liftover", - "ldloc","/home/gwas/ldsc/", - "eloc","/home/gwas/eigensoft", - "rloc","/sara/sw/R-3.1.2/bin/", - "rpac","NA", - "hmloc","/home/gwas/pgc-samples/hapmap_ref/", - "meloc","/home/gwas/metal", -# "hvloc","./", - ); -} - - - -elsif ($clusters{computerome}){ - %variables = ("ploc", "/home/people/sripke/rp_external_bins/plink/", - "p2loc","/home/people/sripke/rp_external_bins/plink_1.9_newest/", - "shloc","/home/people/sripke/rp_external_bins/shapeit/", - "i2loc","/home/people/sripke/rp_external_bins/impute2/", - "liloc","/home/people/sripke/rp_external_bins/liftover/", - "ldloc","/home/people/sripke/rp_external_bins/ldsc/", - "eloc","/home/people/sripke/rp_external_bins/EIG6.0beta/", - "rloc","/services/tools/R-3.1.2/bin/", - "rpac","/home/people/sripke/rp_external_bins/Rpackages/", - "hmloc","/home/people/sripke/imputation_references/", - "meloc","/home/people/sripke/rp_external_bins/metal/", -# "hvloc","./", - ); -} - - -elsif ($clusters{co_ipsych}){ - %variables = ("ploc", "/data/tools/plink-1.07/", - "p2loc","/data/tools/plink2_sept2015/", - "shloc","/data/tools/shapeit_sept_2015/", - "i2loc","/data/tools/impute-2.3.2/", - "liloc","/data/user_tools/rp_external_bins/liftover/", - "ldloc","/data/user_tools/rp_external_bins/ldsc/", - "eloc","/data/tools/eigensoft-6.0.1/bin/", - "rloc","/data/tools/R-3.2.1/bin/", - "rpac","/data/user_tools/rp_external_bins/Rpackages/", - 
"hmloc","/data/user_tools/imputation_references/", - "meloc","/data/tools/metal-20110325/", -# "hvloc","./", - ); -} - -elsif ($clusters{genomedk}){ - %variables = ("ploc", "/project/ricopili/plink_src/", - "p2loc","/project/ricopili/plink_1.9_jul4/", - "shloc","/project/ricopili/3rd_bins/shapeit/", - "i2loc","/project/ricopili/3rd_bins/impute2/", - "liloc","/project/ricopili/3rd_bins/liftover/", - "eloc","/project/ricopili/3rd_bins/eigenstrat/bin/", - "rloc","/com/extra/R/3.1.0/bin", - "rpac","NA", - "hmloc","/project/ricopili/reference_dir/", - "meloc","/project/ricopili/3rd_bins/metal/", -# "hvloc","./", - ); -} - -elsif ($clusters{mssm}){ - %variables = ("ploc", "/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.07-src-sripke/", - "p2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.09-src-aug4/", - "shloc","/hpc/users/xripkes01/ricopili/3rd_binaries/shapeit/", - "i2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/impute2/", - "liloc","/hpc/users/xripkes01/ricopili/3rd_binaries/liftover/", - "eloc","/hpc/packages/minerva-common/eigensoft/5.0.1/bin/", - "rloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin/", - "rpac","NA", - "hmloc","/hpc/users/xripkes01/ricopili/reference_dir/", - "meloc","/hpc/users/xripkes01/ricopili/3rd_binaries/metal/", -# "hvloc","./", - ); -} - - - -foreach (keys %variables){ - - if ($variables{$_} eq "broadinstitute" && $longvar{$_} eq "R") { - print "You are running R on broad, took the default value\n\n"; - } - elsif ($variables{$_} eq "NA" && $longvar{$_} eq "Rpackages") { - print "assuming library rmeta is installed on standard R\n\n"; - } - else { - if ($variables{$_} ne '' && (-d $variables{$_})){ - print "For $longvar{$_}, do you want to use the default location (y or n)?\n\t$variables{$_}\n"; - if ($defall == 0) { - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") { - print "Using $variables{$_} for $longvar{$_}.\n\n"; - last; - } - elsif ($answer eq "n") {print "Please enter a new location to use for $longvar{$_}:\n"; - my $input = <>; - chomp $input; - $input =~ s/^~/$ENV{HOME}/g; - $input =~ s/^\./$cd/g; - unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} - print "\n"; - last;} - else {print "Please answer with y or n.\n";} - } - } - } - else { - while (1){ - print "not default value for:\n"; - print "Please enter a location for $longvar{$_}:\n"; - my $input = ""; - $input = <>; - chomp $input; - $input =~ s/^~/$ENV{HOME}/g; - $input =~ s/^\./$cd/g; - unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} - $variables{$_} = $input; - print "\n"; - last; - } - } - } -} - -foreach (keys %variables){ - push (@text, "$_ $variables{$_}"); -} - -push (@text, "home $home"); -push (@text, "sloc $sloc"); -push (@text, "init $initials"); -push (@text, "email $email"); -push (@text, "loloc $loloc"); - -### define queue depending on cluster -#if ($clusters{broad}){push (@text, "queue bsub")} - -if ($clusters{broad}){push (@text, "queue qsub_b")} -if ($clusters{lisa}){push (@text, "queue qsub")} -if ($clusters{computerome} || $clusters{co_ipsych}){push (@text, "queue qsub_c")} -if ($clusters{genomedk}){push (@text, "queue slurm")} -if ($clusters{mssm}){push (@text, "queue msub")} -} - -unless ( -e $conf_file && $ans_ow eq "n") { - die $! 
unless open FILE, "> $conf_file"; - foreach (@text) {print FILE "$_\n"}; - close FILE; -} - -############################# -# read ricopili.config file with default parameters -############################# -my %conf = (); ## hash with config parameters - -### Read config file -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; - -print "\n"; - -############################# -# write pipeline status file to home directory -############################# - -my @log_file = ("$conf{loloc}/preimp_dir_info","$conf{loloc}/impute_dir_info","$conf{loloc}/pcaer_info","$conf{loloc}/idtager_info","$conf{loloc}/repqc2_info","$conf{loloc}/areator_info","$conf{loloc}/merge_caller_info","$conf{loloc}/postimp_navi_info"); - -foreach (@log_file) { - unless ( -e $_) { - print "Creating pipeline status file to $_\n"; - &mysystem("touch $_"); - } -} - -############################ -# check whether all binary directories exist -############################ -my @fail_path = (); -my %locs = ("ploc","","p2loc","","shloc","","i2loc","","liloc","","eloc","","rloc","","hmloc","","meloc","","ldloc","","rpac",""); - -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - my $path = $cells[1]; - my $variable = $cells[0]; - unless (-d $path) { - if (exists $locs{$variable}) {push(@fail_path,$variable)}; - } -} -close FILE; - -############################# -# print finish statement -############################# - -my $fail = 0; -if ($#fail_path != -1) { - - -# foreach (@fail_path) { -# unless ($_ eq "rloc" && $clusters{broad} == 1) { - - foreach my $confvar (@fail_path) { - if ($confvar eq "rloc" && $clusters{broad} == 1) { - next; - } - elsif ($confvar eq "rpac" && $clusters{lisa} != 1 && $clusters{other} != 1) { - next; - } - else{ - $fail += 1; - } - } - if ($fail != 0) { - print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) 
to edit the file paths listed in $hdir/ricopili.conf for the following variables:\n"; - foreach (@fail_path) { - unless ($_ eq "rloc" && $clusters{broad} == 1) { - print "\t$_\n"; - } - } - } - else { - print "Setup has been completed successfully!\n"; - print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; - &mysystem("rm install_true"); - &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); - } -} -else { - print "Setup has been completed successfully!\n"; - print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; - &mysystem("rm install_true"); - &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); -} - - - -my $hostname = $ENV{HOSTNAME}.'.'.$ENV{DOMAINNAME}; -if ($clusters{lisa} == 1) { - $hostname = "lisa.surfsara.nl"; -} -if ($clusters{computerome} == 1) { - $hostname = "computerome.cbs.dtu.dk"; -} - -if ($clusters{co_ipsych} == 1) { - $hostname = "ipsych.computerome.cbs.dtu.dk"; -} - - -print "-------------------------------------------------------------------\n"; -print "adding these commands to your ~/.bashrc can be very helpful\n(just copy/paste the following lines into ~/.bashrc)\n(you have to logout and login again for these to take effect)\n\n"; -print "## for colored output of ls:\n"; -print 'alias ls=\'ls --color=auto\''."\n\n"; -print "## for easy copy over to your local machine:\n"; -print 'alias c=\'sed "s#.*#scp '.$ENV{LOGNAME}.'@'.$hostname.':$(pwd)/& .#"\''."\n\n"; - - -print "## for list of cluster jobs:\n"; -if ($clusters{lisa} == 1 || $clusters{computerome} == 1 || $clusters{co_ipsych} == 1 || $clusters{broad} == 1) { - print 'alias q=\'qstat -u '.$ENV{LOGNAME}."\'\n\n"; -} -else { - print "alias q=\'bjobs -w\'\n\n"; - -} - -if ($clusters{computerome} == 1) { - print "## load queuing system by default:\n"; - print "module load torque\n\n"; - print "## different prompt:\n"; - print 'PS1="$USER@computerome.cbs.dtu.dk:"\'\w\'" "'."\n\n"; -} -elsif ($clusters{co_ipsych} == 1) { - print "## different prompt:\n"; - print 'PS1="$USER@ipsych.computerome.cbs.dtu.dk:"\'\w\'" "'."\n\n"; -} -else { - print "## different prompt:\n"; - print 'PS1="'.$ENV{USER}.'@'.$hostname.':"\'\w\'" "'."\n\n"; -} - - - -print "-------------------------------------------------------------------\n"; -exit; - - - -########## Done ########## diff --git a/bin/config_pico.pl b/bin/config_pico.pl new file mode 100755 index 0000000..7fc9563 --- /dev/null +++ b/bin/config_pico.pl @@ -0,0 +1,714 @@ +#!/usr/bin/env perl +use strict; +use File::Basename; +use Cwd; +use Cwd 'abs_path'; +use Data::Dumper; + +### Script to configure settings for picopili pipeline +### Jackie Goldstein, Jan 2014 + +### Adapted for picopili by Raymond Walters, Sept 2016 + +my $version = "2.0.0"; +my $progname = $0; + +$progname =~ s!^.*/!!; + +my $cdir = abs_path($0); +my $home = $ENV{HOME}; +my $conf_file = $ENV{HOME}."/picopili.conf"; +my $command_line = "$progname @ARGV"; + +print "\n"; +print "##############################\n"; +print "#\n"; +print "# Creating config file for picopili\n"; +print "# $conf_file\n"; +print "#\n"; +print "# Will index location of executables for other\n"; +print "# programs (e.g. plink), reference files, and\n"; +print "# job settings (e.g. 
email address for job logs).\n"; +print "#\n"; +print "# Default settings are available for clusters\n"; +print "# with existing ricopili configurations.\n"; +print "#\n"; +print "##############################\n"; + + + +############################# +# Ask user what cluster they're using +############################# +#my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"other",0); +#my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"computerome",0,"other",0); +my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"computerome",0,"co_ipsych",0,"other",0); +my @cluster_names = ("broad","mssm","genomedk","lisa","computerome","co_ipsych","other"); +print "Please enter your cluster name from the following options:\n"; +my $i = 1; +foreach (@cluster_names){ + print "\t($i) $_\n"; + $i += 1; +} +print "\n"; +my $cluster = "other"; +while (1) { + $cluster = lc <>; + chomp $cluster; + if (exists $clusters{$cluster}){$clusters{$cluster} = 1;last;} + else { + $cluster =~ s/(\)|\()//g; + if ($cluster >= 1 && $cluster <= $i){$cluster -= 1; $cluster = $cluster_names[$cluster];$clusters{$cluster} = 1;last;} + else { + print "Did not recognize option. Please enter a cluster name from the options below:\n"; + my $i = 1; + foreach (@cluster_names){ + print "\t($i) $_\n"; + $i += 1; + } + print "\n"; + my $cluster = "other"; + } + } +} +print "\nUsing the following cluster: $cluster\n\n"; + + +################################################### +### system call with test if successful +################################################### +sub mysystem(){ + my ($systemstr)="@_"; + system($systemstr); + my $status = ($? >> 8); + die "$systemstr\n->system call failed: $status" if ($status != 0); +} + +################################################### +### Make sure lapack is installed +### specific to genomedk, and unclear if needed? +################################################### +# if ($clusters{genomedk} == 1){ +# unless ($ENV{EXTRAS} =~ /lapack/) { +# print "Run the following commands to add lapack to the default search path:\n"; +# print "echo \"source /com/extra/lapack/3.5.0/load.sh\" >> ~/.bashrc\n"; +# print "source /com/extra/lapack/3.5.0/load.sh\n"; +# print "./rp_config\n\n"; +# exit; +# } +# else { print "Detected lapack is installed.\n\n";} +#} + + +################################################### +### Check whether to overwrite existing config (if exists) +################################################### + +my $ans_ow = "y"; +if (-e $conf_file) { + print "Configuration file already exists at $conf_file\n"; + print "Do you wish to overwrite this file? (y/n)\n"; + while (1) { + $ans_ow = lc <>; + chomp $ans_ow; + if ($ans_ow eq "y") { + print "Rewriting configuration file. Making a backup to $conf_file.copy\n\n"; + &mysystem("cp $conf_file $conf_file.copy"); + last; + } + elsif ($ans_ow eq "n") {print "Not overwriting $conf_file.\n";last;} + else {print "Please answer with y or n.\n";} + } +}; + +my $cd = cwd(); +my $sloc = ""; +my $initials = ""; +my $email = ""; +my @text = (); + +if ($ans_ow eq "y"){ +############################# +# make scratch directory +############################# +if ($clusters{broad} == 1) { + my $user_name = basename($ENV{HOME}); + $sloc = "/broad/hptmp/$user_name/"; + print "Do you want to use the following default scratch directory? 
(y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} +elsif ($clusters{lisa} == 1) { + $sloc = "/scratch/"; + print "Do you want to use the following default scratch directory? (y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} + +elsif ($clusters{computerome} == 1) { + $sloc = "/scratch/"; + print "Do you want to use the following default scratch directory? (y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} + +elsif ($clusters{co_ipsych} == 1) { + $sloc = "/data/scratch/"; + print "Do you want to use the following default scratch directory? (y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} + +elsif ($clusters{genomedk} == 1) { + $sloc = "/project/ricopili/scratch_dir/"; + print "Do you want to use the following default scratch directory? (y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} + + +elsif ($clusters{mssm} == 1) { + my $user_name = $ENV{USER}; + $sloc = "/sc/orga/scratch/$user_name/"; + print "Do you want to use the following default scratch directory? 
(y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} +else { + print "Please enter a scratch directory to use:\n"; + $sloc = "$cd/tmp/"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } + +} + +unless (-d $sloc) { + print "Making scratch directory: $sloc\n\n"; + &mysystem("mkdir $sloc"); +} +else { + print "Scratch directory already exists at $sloc\n"; +} +print "\n"; + +############################# +# analyst info +############################# +print "Please enter your initials (2 letters):\n"; +while (1) { + $initials = lc <>; + chomp $initials; + if (length($initials) == 2) {last;} + else {print "Make sure initials are 2 letters!\n";} +} +print "\n"; + +print "Please enter your email address:\n"; +my $email = <>; +chomp $email; +print "\n"; + + + + +############################# +# allow default all remaining values on select platforms +############################# +my $defall = 0; + +if ($clusters{lisa} == 1 || $clusters{broad} == 1) { + print "Do you want to use default values for the rest of the installation process? (y or n)\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using default values for the rest of the installation process\n\n"; $defall = 1;last;} + elsif ($answer eq "n") {print "Not using default values for the rest of the installation process\n\n"; $defall = 0;last;} + else {print "Please answer with y or n.\n";} + } +} + + + +print "\n"; + +my %longvar = ("p2loc","PLINK2", + "shloc","SHAPEIT", + "i2loc","IMPUTE2", + "liloc","Liftover", + "eloc","Eigenstrat", +# "rloc","R", + "hmloc","HapMap reference", + "perlpack","Perl packages (for Compress::Zlib)", + ); + + +my %variables = ("p2loc", "", + "shloc","", + "i2loc","", + "liloc","", + "eloc","", +# "rloc","", + "hmloc","", + "perlpack","", + ); + + +if ($clusters{broad}){ + %variables = ( + # "ploc", "/home/unix/sripke/plink_src/src/", + "p2loc","/home/unix/sripke/plink_src/plink_1.9_newest/", + "shloc","/home/unix/sripke/shapeit/", + "i2loc", "/psych/genetics_data/ripke/references_from_debakkerscratch/impute_v2/impute_v2/impute_2.2.7_beta/", + "liloc","/home/unix/sripke/liftover/", + "eloc","/home/unix/sripke/eigensoft/bin", +# "ldloc","/psych/genetics_data/ripke/ldsc/", +# "rloc","broadinstitute", +# "rpac","NA", + "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref/", +# "meloc","/psych/genetics_data/ripke/references_from_debakkerscratch/metal/", +# "hvloc","/home/radon01/sripke/bakker_ripke/haploview/", + "perlpack","/home/unix/sripke/perl_modules", + ); +} + +elsif ($clusters{lisa}){ + %variables = ( +# "ploc", "/home/gwas/plink/1.08/src", + "p2loc","/home/gwas/plink2/plink_1.9_newest", + "shloc","/home/gwas/shapeit", + "i2loc","/home/gwas/bin_impute_v2/impute_v2.2.2_x86_64_static", + "liloc","/home/gwas/liftover", +# "ldloc","/home/gwas/ldsc/", + "eloc","/home/gwas/eigensoft", +# "rloc","/sara/sw/R-3.1.2/bin/", +# "rpac","NA", 
+ "hmloc","/home/gwas/pgc-samples/hapmap_ref/", +# "meloc","/home/gwas/metal", +# "hvloc","./", + "perlpack","/home/gwas/perl_modules", + ); +} + + + +elsif ($clusters{computerome}){ + %variables = ( +# "ploc", "/home/people/sripke/rp_external_bins/plink/", + "p2loc","/home/people/sripke/rp_external_bins/plink_1.9_newest/", + "shloc","/home/people/sripke/rp_external_bins/shapeit/", + "i2loc","/home/people/sripke/rp_external_bins/impute2/", + "liloc","/home/people/sripke/rp_external_bins/liftover/", +# "ldloc","/home/people/sripke/rp_external_bins/ldsc/", + "eloc","/home/people/sripke/rp_external_bins/EIG6.0beta/", +# "rloc","/services/tools/R-3.1.2/bin/", +# "rpac","/home/people/sripke/rp_external_bins/Rpackages/", + "hmloc","/home/people/sripke/imputation_references/", +# "meloc","/home/people/sripke/rp_external_bins/metal/", +# "hvloc","./", + "perlpack","/home/people/sripke/rp_external_bins/perl_packages", + ); +} + + +elsif ($clusters{co_ipsych}){ + %variables = ( +# "ploc", "/data/tools/plink-1.07/", + "p2loc","/data/tools/plink2_sept2015/", + "shloc","/data/tools/shapeit_sept_2015/", + "i2loc","/data/tools/impute-2.3.2/", + "liloc","/data/user_tools/rp_external_bins/liftover/", +# "ldloc","/data/user_tools/rp_external_bins/ldsc/", + "eloc","/data/tools/eigensoft-6.0.1/bin/", +# "rloc","/data/tools/R-3.2.1/bin/", +# "rpac","/data/user_tools/rp_external_bins/Rpackages/", + "hmloc","/data/user_tools/imputation_references/", +# "meloc","/data/tools/metal-20110325/", +# "hvloc","./", + "perlpack","/data/user_tools/rp_external_bins/perl_packages", + ); +} + +elsif ($clusters{genomedk}){ + %variables = ( +# "ploc", "/project/ricopili/plink_src/", + "p2loc","/project/ricopili/plink_1.9_jul4/", + "shloc","/project/ricopili/3rd_bins/shapeit/", + "i2loc","/project/ricopili/3rd_bins/impute2/", + "liloc","/project/ricopili/3rd_bins/liftover/", + "eloc","/project/ricopili/3rd_bins/eigenstrat/bin/", +# "rloc","/com/extra/R/3.1.0/bin", +# "rpac","NA", + "hmloc","/project/ricopili/reference_dir/", +# "meloc","/project/ricopili/3rd_bins/metal/", +# "hvloc","./", + "perlpack","/project/ricopili/perl_packages/", + ); +} + +elsif ($clusters{mssm}){ + %variables = ( +# "ploc", "/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.07-src-sripke/", + "p2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.09-src-aug4/", + "shloc","/hpc/users/xripkes01/ricopili/3rd_binaries/shapeit/", + "i2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/impute2/", + "liloc","/hpc/users/xripkes01/ricopili/3rd_binaries/liftover/", + "eloc","/hpc/packages/minerva-common/eigensoft/5.0.1/bin/", +# "rloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin/", +# "rpac","NA", + "hmloc","/hpc/users/xripkes01/ricopili/reference_dir/", +# "meloc","/hpc/users/xripkes01/ricopili/3rd_binaries/metal/", +# "hvloc","./", + "perlpack","/hpc/users/xripkes01/perl_modules/", + ); +} + + + +foreach (keys %variables){ + + if ($variables{$_} eq "broadinstitute" && $longvar{$_} eq "R") { + print "You are running R on broad, took the default value\n\n"; + } + elsif ($variables{$_} eq "NA" && $longvar{$_} eq "Rpackages") { + print "assuming library rmeta is installed on standard R\n\n"; + } + else { + if ($variables{$_} ne '' && (-d $variables{$_})){ + print "For $longvar{$_}, do you want to use the default location (y or n)?\n\t$variables{$_}\n"; + if ($defall == 0) { + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") { + print "Using $variables{$_} for $longvar{$_}.\n\n"; + last; + } + elsif ($answer eq "n") {print "Please 
enter a new location to use for $longvar{$_}:\n"; + my $input = <>; + chomp $input; + $input =~ s/^~/$ENV{HOME}/g; + $input =~ s/^\./$cd/g; + unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} + print "\n"; + last;} + else {print "Please answer with y or n.\n";} + } + } + } + else { + while (1){ + unless($clusters{other} == 1){ + print "No default value available for:\n"; + } + print "Please enter a location for $longvar{$_}:\n"; + my $input = ""; + $input = <>; + chomp $input; + $input =~ s/^~/$ENV{HOME}/g; + $input =~ s/^\./$cd/g; + unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} + $variables{$_} = $input; + print "\n"; + last; + } + } + } +} + +foreach (keys %variables){ + push (@text, "$_ $variables{$_}"); +} + +push (@text, "sloc $sloc"); +push (@text, "init $initials"); +push (@text, "email $email"); + +### define queue depending on cluster +#if ($clusters{broad}){push (@text, "queue bsub")} + +if ($clusters{broad}){push (@text, "queue broad_uger")} +if ($clusters{lisa}){push (@text, "queue qsub")} +if ($clusters{computerome} || $clusters{co_ipsych}){push (@text, "queue qsub_c")} +if ($clusters{genomedk}){push (@text, "queue slurm")} +if ($clusters{mssm}){push (@text, "queue msub")} +} + +unless ( -e $conf_file && $ans_ow eq "n") { + die $! unless open FILE, "> $conf_file"; + foreach (@text) {print FILE "$_\n"}; + close FILE; +} + + + +############################# +# read ricopili.config file with default parameters +############################# +my %conf = (); ## hash with config parameters + +### Read config file +die $!."($conf_file)" unless open FILE, "< $conf_file"; +while (my $line = ){ + my @cells = split /\s+/, $line; + $conf{$cells[0]} = $cells[1]; +} +close FILE; + +print "\n"; + +############################ +# check whether all binary directories exist +############################ +my @fail_path = (); +my %locs = ( +# "ploc","", + "p2loc","", + "shloc","", + "i2loc","", + "liloc","", + "eloc","", +# "rloc","", + "hmloc","", +# "meloc","", +# "ldloc","", +# "rpac","", + "perlpack","" +); + +die $!."($conf_file)" unless open FILE, "< $conf_file"; +while (my $line = ){ + my @cells = split /\s+/, $line; + my $path = $cells[1]; + my $variable = $cells[0]; + unless (-d $path) { + if (exists $locs{$variable}) {push(@fail_path,$variable)}; + } +} +close FILE; + +############################# +# print finish statement +############################# + +my $fail = 0; +if ($#fail_path != -1) { + + +# foreach (@fail_path) { +# unless ($_ eq "rloc" && $clusters{broad} == 1) { + + foreach my $confvar (@fail_path) { + if ($confvar eq "rloc" && $clusters{broad} == 1) { + next; + } + elsif ($confvar eq "rpac" && $clusters{lisa} != 1 && $clusters{other} != 1) { + next; + } + else{ + $fail += 1; + } + } + if ($fail != 0) { + print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) 
to edit the file paths listed in $home/ricopili.conf for the following variables:\n"; + foreach (@fail_path) { + unless ($_ eq "rloc" && $clusters{broad} == 1) { + print "\t$_\n"; + } + } + } + else { + print "Setup has been completed successfully!\n"; + print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; + &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); + } +} +else { + print "Setup has been completed successfully!\n"; + print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; + &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); +} + + + + + + +################################################### +### Optional: Add bin to default search path +################################################### + +system("bin_check_pico"); # dummy script that doesn't do anything +my $status_bin = ($? >> 8); + + +if ($clusters{lisa} == 1) { + unless (-e "$home/.bash_profile") { + die $! unless open FILE, "> $home/.bash_profile"; + print FILE 'if [ -f ~/.bashrc ]; then '."\n"; + print FILE ' . ~/.bashrc'."\n"; + print FILE 'fi'."\n"; + close FILE; + } + unless (-e "$home/.bashrc") { + system "touch ~/.bashrc\n"; + } +} + + +if ($status_bin != 0) { + my $bash = "PATH=\$PATH:$cdir"; + my $csh = "set path=(\$path $cdir)"; + + print "\n----------------------------------------------------\n"; + print "## You will probably want to add picopili to the default search path.\n"; + + + # Determine the shell + my $shell = ''; + if (exists $ENV{SHELL}){$shell = basename($ENV{SHELL});} + if ($shell eq "bash-login-check"){$shell = "bash";} + if ($shell ne "bash" && $shell ne "tcsh") { + print "Warning! Shell not recognized: $shell\n"; + print "Please send email to rwalters\@broadinstitute.org\n"; + } + print "Detected you are using the following shell: $shell\n\n"; + + # provide commands, where possible + # perm tracks if a command is generated for .bashrc or equivalent + my $perm = 0; + if ($shell eq "bash"){ + print "To do this in bash, run the following command:\n"; + print "$bash\n"; + if ($clusters{broad}){ + if (-e "$home/.my.bashrc") { + print "echo \"$bash\" >> ~/.my.bashrc\n"; + $perm = 1; + } + } + else{ + if ($clusters{lisa} == 1){ + unless (-e "$home/.bashrc") { + print "touch ~/.bashrc\n"; + } + unless (-e "$home/.bash_profile") { + print "echo \"if [ -f ~/.bashrc ]; then \" > $home/.bash_profile"; + print "echo \" ~/.bashrc\" >> $home/.bash_profile"; + print "echo \"fi\" >> $home/.bash_profile"; + } + } + if (-e "$home/.bashrc") { + print "echo \"$bash\" >> ~/.bashrc\n"; + $perm = 1; + } + } + } + elsif ($shell eq "tcsh"){ + print "To do this in tcsh, run the following command:\n"; + print "$csh\n"; + if ($clusters{broad}){ + if (-e "$home/.my.cshrc") { + print "echo \"$csh\" >> ~/.my.cshrc\n"; + $perm = 1; + } + } + else{ + if (-e "$home/.cshrc") { + print "echo \"$csh\" >> ~/.cshrc\n"; + $perm = 1; + } + } + } + # else if shell not determined + else { + print "You'll want to add the following path:\n"; + print "\t$cdir\n"; + } + # additional instructions of not .bashrc equivalent provided + if ($perm == 0){ + print "If possible, add these paths permanently. 
Otherwise, you will need to do this everytime you start a new session.\n"; + print "For example instructions, see http://www.cyberciti.biz/faq/unix-linux-adding-path/\n"; + } +} +else{ + print "Successfully found picopili directory in search path!\n"; +} + + +exit; +########## Done ########## diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index d14a5ef..9134f09 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -29,14 +29,17 @@ use FindBin; use lib "$FindBin::Bin"; -use Ricopili::Utils qw(trans); +use rp_perl::Utils qw(trans); my $version = "1.0.24"; my $progname = $0; $progname =~ s!^.*/!!; my $command_line = "$progname @ARGV"; - +use Cwd; +use File::Path; +my $rootdir = &Cwd::cwd(); +my $sjainfotxt = "$rootdir\t$command_line"; my $jnum = 7; ### number of imputation job per node @@ -54,16 +57,13 @@ ############################# my $ploc = &trans("p2loc"); -my $homedir = &trans("home"); my $qloc = &trans("queue"); my $liloc = &trans("liloc"); my $email = &trans("email"); -my $loloc = &trans("loloc"); ############################################### -my $rootdir = ""; my $iname = "" ; my $suminfo = "infosum_pos"; @@ -395,14 +395,10 @@ sub a2filenew_app { } -my $sjainfofile = "$loloc/impute_dir_info"; +my $sjainfofile = "$rootdir/impute_dir_info.log"; unless (-e $sjainfofile) { - print "log-file ($sjainfofile) is not existing\n"; - print "please check loloc in ~/picopili.conf\n"; - exit; + &mysystem ("touch $sjainfofile"); } -#my $sjainfofile = "$homedir/impute_dir_info_35_test"; -my $sjainfotxt = ""; my $sjamulti = 0; @@ -619,12 +615,6 @@ sub send_jobarray { ############################################## -use Cwd; -use File::Path; -$rootdir = &Cwd::cwd(); -$sjainfotxt = "$rootdir\t$command_line"; - - unless (-e $impute_dir){ print "impute_dir is not existing, create one for you\n"; my @created = mkpath( ## $created ? 
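As a minimal usage sketch of the config plumbing introduced above (illustrative only, not a hunk in this patch series): a bin/ script pulls its settings from ~/picopili.conf through the new rp_perl::Utils module, where the conf file holds whitespace-separated "key value" pairs written by config_pico.pl and &trans() dies on any missing key. The keys p2loc and email are taken from the diffs above; the printed labels are arbitrary.

use strict;
use warnings;
use FindBin;
use lib "$FindBin::Bin";                  # assumes rp_perl/Utils.pm sits next to this script, as in bin/
use rp_perl::Utils qw(trans $conf_file);  # Utils.pm parses $HOME/picopili.conf when loaded

print "config read from: $conf_file\n";

my $p2loc = &trans("p2loc");              # e.g. the plink2 directory chosen during config_pico.pl
my $email = &trans("email");              # dies with "config file without entry: email" if the key is absent

print "plink2 location: $p2loc\n";
print "job notification email: $email\n";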
diff --git a/config b/config deleted file mode 120000 index bc60002..0000000 --- a/config +++ /dev/null @@ -1 +0,0 @@ -./bin/config \ No newline at end of file diff --git a/docs/RICOPILI.md b/docs/RICOPILI.md index ef8e45e..e71f047 100644 --- a/docs/RICOPILI.md +++ b/docs/RICOPILI.md @@ -3,9 +3,10 @@ The following scripts are adapted from ricopili (https://github.com/Nealelab/ric * `buigue_pico.pl`, from `buigue` * `checkflip_pico.pl`, from `checkflip4` * `checkpos_pico.pl`, from `checkpos6` -* `config`, from `rp_config` +* `config_pico.pl`, from `rp_config` * `lift_to_hg19.pl`, from `lift18219` * `plague_pico.pl`, from `plague_2` +* `bin_check_pico`, from `bin_check` * `./rp_perl/Utils.pm`, from `./Ricopili/Utils.pm` In addition, the following scripts are adapted from ricopili with more substantial changes as indicated: From 576e70f4d30f0c0a8289f93e51e0a7d4e3059f19 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 29 Sep 2016 13:13:40 -0400 Subject: [PATCH 04/48] migrate to using picopili.conf file --- bin/admix_rel.py | 35 ++++++++++++++--------------------- bin/agg_imp.py | 24 ++++++------------------ bin/args_pca.py | 4 ++-- bin/args_ped.py | 2 +- bin/bg_imp.py | 16 +++------------- bin/checkflip_pico.pl | 2 +- bin/checkpos_pico.pl | 2 +- bin/config_pico.pl | 4 ++-- bin/gwas_dfam.py | 15 +++++---------- bin/gwas_gee.py | 17 +++++++---------- bin/gwas_rel.py | 19 ++++--------------- bin/imp2_rel.py | 33 +++++++++------------------------ bin/imus_pca.py | 33 +++++++++++++++------------------ bin/py_helpers.py | 43 +++++++++++++++++++++++++++++++++++++++---- bin/qc_rel.py | 26 +++++++++----------------- bin/shape_rel.py | 17 +++-------------- bin/strict_qc.py | 23 +++++++---------------- 17 files changed, 128 insertions(+), 187 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index debf85a..5f29981 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -42,7 +42,7 @@ from string import ascii_uppercase from glob import glob from numpy import digitize -from py_helpers import unbuffer_stdout, file_len, test_exec, read_conf, find_from_path, link, gz_confirm +from py_helpers import unbuffer_stdout, file_len, test_exec, find_exec, link, gz_confirm unbuffer_stdout() @@ -196,35 +196,28 @@ print '--plot-admix-pca '+str(args.plot_admix_pca) - ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' +# find, check exists, executable ############# -### read plink loc from config -# not getting R here since ricopili.conf currently relies on platform info -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) +plinkx = find_exec('plink',key='p2loc') -plinkx = configs['p2loc']+"plink" +if args.rscript_ex == None or args.rscript_ex == "None": + args.rscript_ex = find_exec('Rscript', key='rscloc') +if args.admixture_ex == None or args.admixture_ex == "None": + args.admixture_ex = find_exec('admixture', key='admloc') -############# -print '\n...Checking dependencies...' 
-# check exists, executable -############# - -# get variables from path as needed -# - Rscript (if unspecified) -# - IBD plotting script -# - PCA plotting script (optional) -if args.rscript_ex == None or args.rscript_ex == "None": - args.rscript_ex = find_from_path('Rscript', 'Rscript') +if args.reap_ex == None or args.reap_ex == "None": + args.reap_ex = find_exec('REAP', key='reaploc') -Rplotibdx = find_from_path('plot_reap_ibd.Rscript', 'IBD plotting script') +rp_bin = os.path.dirname(os.path.realpath(__file__)) +Rplotibdx = rp_bin+'/plot_reap_ibd.Rscript' if plot_pca: - Rplotpcax = find_from_path('plot_pca.Rscript', 'PCA plotting script') + Rplotpcax = rp_bin+'/plot_pca.Rscript' + # verify executables test_exec(plinkx, 'Plink') diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 4f919c6..9e273c9 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -30,7 +30,7 @@ import os import subprocess from args_impute import * -from py_helpers import unbuffer_stdout, read_conf, file_len #, file_tail, link, warn_format +from py_helpers import unbuffer_stdout, find_exec, test_exec, file_len #, file_tail, link, warn_format unbuffer_stdout() # warnings.formatwarning = warn_format @@ -73,30 +73,18 @@ - - - ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' ############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -plink_ex = configs['p2loc']+"plink" +plink_ex = find_exec('plink', key='p2loc') # get directory containing current script # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) +uger_ex = str(rp_bin)+'/uger.sub.sh' - -############# -print '\n...Checking dependencies...' -############# - - - +test_exec(uger_ex) # TODO: here @@ -217,7 +205,7 @@ '-l', 'm_mem_free=8g,h_vmem=8g', '-N', 'agg.imp.'+str(outdot), '-o', agg_log, - str(rp_bin)+'/uger.sub.sh', + str(uger_ex), str(args.sleep), ' '.join(sys.argv[:])]) diff --git a/bin/args_pca.py b/bin/args_pca.py index 404d837..61d2353 100644 --- a/bin/args_pca.py +++ b/bin/args_pca.py @@ -138,7 +138,7 @@ # - relatedness threshhold for defining IMUS set # - Number of PCs to compute # - PCA output controls (directory, number of PCs to plot) -# - File paths for external software not provided by ~/ricopili.conf +# - File paths for external software not previously provided by ricopili.conf # ############ @@ -171,7 +171,7 @@ # arg_exloc.add_argument('--plink-ex', # type=str, # metavar='PATH', -# help='path to plink executable, read from ~/ricopili.conf if unspecified', +# help='path to plink executable, read from ~/picopili.conf if unspecified', # required=False) arg_exloc.add_argument('--rscript-ex', type=str, diff --git a/bin/args_ped.py b/bin/args_ped.py index ab166c1..6d4642e 100644 --- a/bin/args_ped.py +++ b/bin/args_ped.py @@ -187,7 +187,7 @@ ############ # # Software Executables -# Locations for software dependencies not in ricopili config file +# Locations for software dependencies not previously in ricopili config file # ############ parserexloc = argparse.ArgumentParser(add_help=False) diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 7f01610..77b77e3 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -35,7 +35,7 @@ import subprocess import warnings from args_impute import * -from py_helpers import unbuffer_stdout, read_conf, file_tail, link, warn_format +from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format unbuffer_stdout() warnings.formatwarning = warn_format @@ -181,26
+181,16 @@ ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' ############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -plink_ex = configs['p2loc']+"plink" +plink_ex = find_exec('plink',key='p2loc') # get directory containing current script # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) rs_ex = str(rp_bin)+'/rs_trans.py' -############# -print '\n...Checking dependencies...' -############# - - - # TODO: here diff --git a/bin/checkflip_pico.pl b/bin/checkflip_pico.pl index 69e2809..3bb3d08 100755 --- a/bin/checkflip_pico.pl +++ b/bin/checkflip_pico.pl @@ -30,7 +30,7 @@ ############################# -my $conf_file = $ENV{HOME}."/ricopili.conf"; +my $conf_file = $ENV{HOME}."/picopili.conf"; my %conf = (); die $!."($conf_file)" unless open FILE, "< $conf_file"; diff --git a/bin/checkpos_pico.pl b/bin/checkpos_pico.pl index 8f417d6..76a2661 100755 --- a/bin/checkpos_pico.pl +++ b/bin/checkpos_pico.pl @@ -30,7 +30,7 @@ # read config file ############################# -my $conf_file = $ENV{HOME}."/ricopili.conf"; +my $conf_file = $ENV{HOME}."/picopili.conf"; my %conf = (); die $!."($conf_file)" unless open FILE, "< $conf_file"; diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 7fc9563..7c9cd2b 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -518,7 +518,7 @@ () ############################# -# read ricopili.config file with default parameters +# read picopili.config file with default parameters ############################# my %conf = (); ## hash with config parameters @@ -585,7 +585,7 @@ () } } if ($fail != 0) { - print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) to edit the file paths listed in $home/ricopili.conf for the following variables:\n"; + print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) to edit the file paths listed in $home/picopili.conf for the following variables:\n"; foreach (@fail_path) { unless ($_ eq "rloc" && $clusters{broad} == 1) { print "\t$_\n"; diff --git a/bin/gwas_dfam.py b/bin/gwas_dfam.py index b121a92..7183dbd 100755 --- a/bin/gwas_dfam.py +++ b/bin/gwas_dfam.py @@ -41,7 +41,7 @@ import argparse # from glob import glob from args_gwas import * -from py_helpers import unbuffer_stdout, test_exec +from py_helpers import unbuffer_stdout, test_exec, find_exec # , read_conf, link unbuffer_stdout() @@ -93,21 +93,16 @@ print '--rplink-ex '+str(args.rplink_ex) -############## -#print '\n...Reading ricopili config file...' -############## -# -#### read plink loc from config -#conf_file = os.environ['HOME']+"/ricopili.conf" -#configs = read_conf(conf_file) - ############# print '\n...Checking dependencies...' 
# check exists, executable ############# -# verify executables +# R-compatible plink +if args.rplink_ex is None or args.rplink_ex == "None": + args.rplink_ex = find_exec('plink',key='rplloc') + test_exec(args.rplink_ex, 'Plink') # verify bfiles are files, not paths diff --git a/bin/gwas_gee.py b/bin/gwas_gee.py index ff06c23..3cf9209 100755 --- a/bin/gwas_gee.py +++ b/bin/gwas_gee.py @@ -44,7 +44,7 @@ from warnings import warn # from glob import glob from args_gwas import * -from py_helpers import unbuffer_stdout, test_exec, find_from_path, file_len +from py_helpers import unbuffer_stdout, test_exec, find_from_path, file_len, find_exec # , read_conf, link unbuffer_stdout() @@ -127,20 +127,17 @@ print '--port '+str(args.port) -############## -#print '\n...Reading ricopili config file...' -############## -# -#### read plink loc from config -#conf_file = os.environ['HOME']+"/ricopili.conf" -#configs = read_conf(conf_file) - - ############# print '\n...Checking dependencies...' # check exists, executable ############# +if args.rplink_ex is None or args.rplink_ex == "None": + args.rplink_ex = find_exec('plink',key='rplloc') + +if args.r_ex is None or args.r_ex == "None": + args.r_ex = find_exec('R',key='rloc') + # verify executables test_exec(args.rplink_ex, 'Plink') #if not args.rserve_active: diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 8479e49..01ae60a 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -27,7 +27,7 @@ import os from warnings import warn from args_gwas import * -from py_helpers import link, unbuffer_stdout, read_conf, find_from_path +from py_helpers import link, unbuffer_stdout, find_exec unbuffer_stdout() @@ -130,25 +130,14 @@ ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' ############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -plinkx = configs['p2loc']+"plink" +plinkx = find_exec('plink',key='p2loc') if args.model == 'gmmat' or args.model == 'gmmat-fam': if args.rscript_ex == None or args.rscript_ex == "None": - args.rscript_ex = find_from_path('Rscript', 'Rscript') - - -############# -print '\n...Checking dependencies...' -############# - - + args.rscript_ex = find_exec('Rscript', key='rscloc') # TODO: here diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index ba2b007..2951b60 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -27,7 +27,7 @@ import os import subprocess from args_impute import * -from py_helpers import unbuffer_stdout, file_len, link, read_conf +from py_helpers import unbuffer_stdout, file_len, link, find_exec, test_exec unbuffer_stdout() @@ -87,35 +87,18 @@ ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' ############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -impute_ex = configs['i2loc']+"impute2" -shapeit_ex = configs['shloc'] + '/bin/shapeit' +# from config +impute_ex = find_exec('impute2',key='i2loc') +shapeit_ex = find_exec('shapeit',key='shloc') # get directory containing current script # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) chunker_ex = rp_bin+'/chunk_snps.py' - - - -# directories -wd = os.getcwd() -shape_dir = wd + '/phase_chr' - - - - - -############# -print '\n...Checking dependencies...'
-############# - +test_exec(chunker_ex) # TODO: here @@ -124,7 +107,9 @@ # executables - +# directories +wd = os.getcwd() +shape_dir = wd + '/phase_chr' diff --git a/bin/imus_pca.py b/bin/imus_pca.py index 08f1ce1..6fee4bb 100755 --- a/bin/imus_pca.py +++ b/bin/imus_pca.py @@ -35,7 +35,7 @@ import subprocess import argparse from glob import glob -from py_helpers import read_conf, unbuffer_stdout, test_exec, find_from_path +from py_helpers import find_exec, unbuffer_stdout, test_exec from args_pca import * unbuffer_stdout() @@ -83,31 +83,28 @@ print '--npcs '+str(args.npcs) - -############# -print '\n...Reading ricopili config file...' -############# - -### read plink loc from config -# not getting R here since ricopili.conf currently relies on platform info -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -plinkx = configs['p2loc']+"plink" -smartpcax = configs['eloc']+"/smartpca" - ############# print '\n...Checking dependencies...' # check exists, executable ############# -# find required files -if args.rscript_ex == None or args.rscript_ex == "None": - args.rscript_ex = find_from_path("Rscript", 'Rscript') +# from config +plinkx = find_exec('plink',key='p2loc') +smartpcax = find_exec('smartpca',key='eloc') + -Rplotpcax = find_from_path("plot_pca.Rscript", 'PCA plotting script') +# if unspecified +if args.rscript_ex == None or args.rscript_ex == "None": + args.rscript_ex = find_exec("Rscript", key='rscloc') + +if args.primus_ex == None or args.primus_ex == "None": + args.primus_ex = find_exec("run_PRIMUS.pl", key='priloc') +# get directory containing current script +# (to get absolute path for scripts) +rp_bin = os.path.dirname(os.path.realpath(__file__)) +Rplotpcax = str(rp_bin)+'/plot_pca.Rscript' # test executables test_exec(args.primus_ex, 'PRIMUS') diff --git a/bin/py_helpers.py b/bin/py_helpers.py index 7cfe5c1..3173e66 100644 --- a/bin/py_helpers.py +++ b/bin/py_helpers.py @@ -40,7 +40,7 @@ def file_tail(fname, n=1): return str(result) -# read ricopili config file as dict +# read picopili config file as dict def read_conf(fname): configs = {} @@ -76,6 +76,34 @@ def find_from_path(fname, name): return file_ex + +# find executables from either config or path +def find_exec(prog, key=None): + + if key is not None: + import os + conffile = os.environ['HOME']+'/picopili.conf' + + if os.path.isfile(conffile): + configs = read_conf(conffile) + + if str(key) in configs: + exloc = configs[str(key)]+'/'+str(prog) + test_exec(exloc,str(prog)) + return(exloc) + + else: + print "Config file %s is missing extry %s for %s. Will search on path." % (str(conffile),str(key),str(prog)) + + else: + print "Failed to find config file %s. Will search for %s on path." % (str(conffile), str(prog)) + + exloc = find_from_path(str(prog),str(prog)) + test_exec(exloc) + return exloc + + + # symlink fromfile to tofile and verify def link(fromfile, tofile, name): @@ -125,13 +153,20 @@ def pp_send_mail(subj, fname): import os import subprocess + # get mail address from config file + configs = read_conf(os.environ['HOME']+"/picopili.conf") + addr = configs['email'] + + # don't send email + if addr is None or '@' not in str(addr): + print "Email turned off based on config file entry (%s)." 
% str(addr) + return 0 + # get email script email_script = pp_find_mail() if email_script == None: raise IOError("Unable to find 'mutt' or 'mail' in path to send email") - - # get mail address from config file - configs = read_conf(os.environ['HOME']+"/ricopili.conf") + # verify file before send if not os.path.isfile(fname): diff --git a/bin/qc_rel.py b/bin/qc_rel.py index e25c372..8c0c556 100755 --- a/bin/qc_rel.py +++ b/bin/qc_rel.py @@ -56,7 +56,7 @@ start_time = strftime("%H:%M:%S %d-%B-%Y") # from glob import glob from args_qc import * -from py_helpers import unbuffer_stdout, read_conf, test_exec, link, file_len, warn_format +from py_helpers import unbuffer_stdout, read_conf, test_exec, link, file_len, warn_format, find_exec unbuffer_stdout() warnings.formatwarning = warn_format @@ -114,33 +114,25 @@ print ' ' + ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' +# check exists, executable ############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" +### read config +conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) - -plinkx = configs['p2loc']+"plink" - analyst = configs['init'] +# find plink +plinkx = find_exec('plink',key='p2loc') + if not args.skip_platform: # get directory containing current script # (hack to get plague script location) rp_bin = os.path.dirname(os.path.realpath(__file__)) plague_ex = rp_bin + '/plague_pico.pl' - - -############# -print '\n...Checking dependencies...' -# check exists, executable -############# - -# verify executables -test_exec(plinkx, 'Plink') -if not args.skip_platform: test_exec(plague_ex, 'Platform guessing script') # TODO: verify plague works properly across platforms (primary concern is Compress::Zlib loading) diff --git a/bin/shape_rel.py b/bin/shape_rel.py index e8bca37..8f53448 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -44,7 +44,7 @@ # import random # import warnings from args_impute import * -from py_helpers import unbuffer_stdout, link, read_conf #, test_exec +from py_helpers import unbuffer_stdout, link, find_exec #, test_exec # file_len, read_conf, find_from_path, link, gz_confirm unbuffer_stdout() @@ -106,24 +106,13 @@ outdot = str(args.out) -############# -print '\n...Reading ricopili config file...' -############# - -### read plink, shapeit loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -plinkx = configs['p2loc']+"plink" -shapeit_ex = configs['shloc'] + '/bin/shapeit' - - ############# print '\n...Checking dependencies...' ############# - +plinkx = find_exec('plink',key='p2loc') +shapeit_ex = find_exec('shapeit',key='shloc') # TODO: here diff --git a/bin/strict_qc.py b/bin/strict_qc.py index 64e61fc..1e9a476 100755 --- a/bin/strict_qc.py +++ b/bin/strict_qc.py @@ -42,7 +42,7 @@ import subprocess import argparse from glob import glob -from py_helpers import file_len, read_conf, unbuffer_stdout, test_exec +from py_helpers import file_len, find_exec, unbuffer_stdout, test_exec from args_pca import * unbuffer_stdout() @@ -79,30 +79,21 @@ print '--ld_wind '+str(args.ld_wind) print '--all_chr '+str(args.all_chr) - -############# -print '\n...Reading ricopili config file...' -############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) +############# +print '\n...Checking dependencies...' 
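# Illustrative use of the find_exec() helper introduced in this patch (a minimal
# sketch; the variable names below are arbitrary, 'p2loc' is an existing picopili.conf key):
#   plinkx = find_exec('plink', key='p2loc')   # <p2loc dir>/plink if configured, else search $PATH
#   rscript_ex = find_exec('Rscript')          # no config key given, so $PATH search only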
+# check exists, executable +############# -plinkx = configs['p2loc']+"plink" +# plink +plinkx = find_exec('plink',key='p2loc') # get directory containing current script # (hack to help find ld region text file) rp_bin = os.path.dirname(os.path.realpath(__file__)) rp_dir = os.path.dirname(rp_bin) -############# -print '\n...Checking dependencies...' -# check exists, executable -############# - -# plink -test_exec(plinkx, 'Plink') # ld region file, if needed # try in rp_dir/lib/ in addition to cwd From de4f9d6a4bf297039c6ba2d33361f169821668a6 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 29 Sep 2016 14:56:11 -0400 Subject: [PATCH 05/48] added picopili configs w defaults, make email optional --- bin/config_pico.pl | 330 +++++++++++++++++++++++++-------------------- bin/imp_prep.pl | 74 +++++----- 2 files changed, 223 insertions(+), 181 deletions(-) diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 7c9cd2b..62013bc 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -279,14 +279,12 @@ () } print "\n"; -print "Please enter your email address:\n"; +print "Please enter your email address (optional, can enter \"None\"):\n"; my $email = <>; chomp $email; print "\n"; - - ############################# # allow default all remaining values on select platforms ############################# @@ -312,7 +310,12 @@ () "i2loc","IMPUTE2", "liloc","Liftover", "eloc","Eigenstrat", -# "rloc","R", + "admloc","ADMIXTURE", + "reaploc","REAP", + "priloc","PRIMUS", + "rloc","R", + "rscloc","Rscript", + "rplloc","R-enabled Plink (e.g. v1.07, or a dev build of 1.90)", "hmloc","HapMap reference", "perlpack","Perl packages (for Compress::Zlib)", ); @@ -323,7 +326,12 @@ () "i2loc","", "liloc","", "eloc","", -# "rloc","", + "admloc","", + "reaploc","", + "priloc","", + "rloc","", + "rscloc","", + "rplloc","", "hmloc","", "perlpack","", ); @@ -331,164 +339,204 @@ () if ($clusters{broad}){ %variables = ( + "p2loc","/home/unix/sripke/plink_src/plink_1.9_newest", + "shloc","/home/unix/sripke/shapeit/bin", + "i2loc", "/psych/genetics_data/ripke/references_from_debakkerscratch/impute_v2/impute_v2/impute_2.2.7_beta", + "liloc","/home/unix/sripke/liftover", + "eloc","/humgen/atgu1/fs03/shared_resources/shared_software/EIG6.0beta_noreq/bin", + "admloc","/humgen/atgu1/fs03/shared_resources/shared_software/admixture_linux-1.23", + "reaploc","/humgen/atgu1/fs03/shared_resources/shared_software/REAP", + "priloc","/humgen/atgu1/fs03/shared_resources/shared_software/PRIMUS_v1.8.0/bin", + "rloc","/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.0.1/bin", + "rscloc","/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.0.1/bin", + "rplloc","/home/unix/sripke/plink_src/src/", + "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref", + "perlpack","/home/unix/sripke/perl_modules", + ); +} + # "ploc", "/home/unix/sripke/plink_src/src/", - "p2loc","/home/unix/sripke/plink_src/plink_1.9_newest/", - "shloc","/home/unix/sripke/shapeit/", - "i2loc", "/psych/genetics_data/ripke/references_from_debakkerscratch/impute_v2/impute_v2/impute_2.2.7_beta/", - "liloc","/home/unix/sripke/liftover/", - "eloc","/home/unix/sripke/eigensoft/bin", # "ldloc","/psych/genetics_data/ripke/ldsc/", # "rloc","broadinstitute", # "rpac","NA", - "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref/", # "meloc","/psych/genetics_data/ripke/references_from_debakkerscratch/metal/", # "hvloc","/home/radon01/sripke/bakker_ripke/haploview/", - "perlpack","/home/unix/sripke/perl_modules", - ); -} + elsif ($clusters{lisa}){ 
%variables = ( -# "ploc", "/home/gwas/plink/1.08/src", "p2loc","/home/gwas/plink2/plink_1.9_newest", - "shloc","/home/gwas/shapeit", + "shloc","/home/gwas/shapeit/bin", "i2loc","/home/gwas/bin_impute_v2/impute_v2.2.2_x86_64_static", "liloc","/home/gwas/liftover", -# "ldloc","/home/gwas/ldsc/", "eloc","/home/gwas/eigensoft", -# "rloc","/sara/sw/R-3.1.2/bin/", -# "rpac","NA", - "hmloc","/home/gwas/pgc-samples/hapmap_ref/", -# "meloc","/home/gwas/metal", -# "hvloc","./", + "admloc","", + "reaploc","", + "priloc","", + "rloc","/sara/sw/R-3.1.2/bin", + "rscloc","/sara/sw/R-3.1.2/bin", + "rplloc","/home/gwas/plink/1.08/src", + "hmloc","/home/gwas/pgc-samples/hapmap_ref", "perlpack","/home/gwas/perl_modules", ); } +# "ploc", "/home/gwas/plink/1.08/src", +# "ldloc","/home/gwas/ldsc/", +# "rloc","/sara/sw/R-3.1.2/bin/", +# "rpac","NA", +# "meloc","/home/gwas/metal", +# "hvloc","./", elsif ($clusters{computerome}){ %variables = ( + "p2loc","/home/people/sripke/rp_external_bins/plink_1.9_newest", + "shloc","/home/people/sripke/rp_external_bins/shapeit/bin", + "i2loc","/home/people/sripke/rp_external_bins/impute2", + "liloc","/home/people/sripke/rp_external_bins/liftover", + "eloc","/home/people/sripke/rp_external_bins/EIG6.0beta", + "admloc","", + "reaploc","", + "priloc","", + "rloc","/services/tools/R-3.1.2/bin", + "rscloc","/services/tools/R-3.1.2/bin", + "rplloc","/services/tools/R-3.1.2/bin", + "hmloc","/home/people/sripke/imputation_references", + "perlpack","/home/people/sripke/rp_external_bins/perl_packages", + ); +} + # "ploc", "/home/people/sripke/rp_external_bins/plink/", - "p2loc","/home/people/sripke/rp_external_bins/plink_1.9_newest/", - "shloc","/home/people/sripke/rp_external_bins/shapeit/", - "i2loc","/home/people/sripke/rp_external_bins/impute2/", - "liloc","/home/people/sripke/rp_external_bins/liftover/", # "ldloc","/home/people/sripke/rp_external_bins/ldsc/", - "eloc","/home/people/sripke/rp_external_bins/EIG6.0beta/", # "rloc","/services/tools/R-3.1.2/bin/", # "rpac","/home/people/sripke/rp_external_bins/Rpackages/", - "hmloc","/home/people/sripke/imputation_references/", # "meloc","/home/people/sripke/rp_external_bins/metal/", # "hvloc","./", - "perlpack","/home/people/sripke/rp_external_bins/perl_packages", - ); -} elsif ($clusters{co_ipsych}){ %variables = ( -# "ploc", "/data/tools/plink-1.07/", - "p2loc","/data/tools/plink2_sept2015/", - "shloc","/data/tools/shapeit_sept_2015/", + "p2loc","/data/tools/plink2_sept2015", + "shloc","/data/tools/shapeit_sept_2015/bin", "i2loc","/data/tools/impute-2.3.2/", - "liloc","/data/user_tools/rp_external_bins/liftover/", + "liloc","/data/user_tools/rp_external_bins/liftover", + "eloc","/data/tools/eigensoft-6.0.1/bin", + "admloc","", + "reaploc","", + "priloc","", + "rloc","/data/tools/R-3.2.1/bin", + "rscloc","/data/tools/R-3.2.1/bin", + "rplloc","/data/tools/plink-1.07", + "hmloc","/data/user_tools/imputation_references", + "perlpack","/data/user_tools/rp_external_bins/perl_packages", + ); +} + +# "ploc", "/data/tools/plink-1.07/", # "ldloc","/data/user_tools/rp_external_bins/ldsc/", - "eloc","/data/tools/eigensoft-6.0.1/bin/", # "rloc","/data/tools/R-3.2.1/bin/", # "rpac","/data/user_tools/rp_external_bins/Rpackages/", - "hmloc","/data/user_tools/imputation_references/", # "meloc","/data/tools/metal-20110325/", # "hvloc","./", - "perlpack","/data/user_tools/rp_external_bins/perl_packages", - ); -} + elsif ($clusters{genomedk}){ %variables = ( + "p2loc","/project/ricopili/plink_1.9_jul4", + 
"shloc","/project/ricopili/3rd_bins/shapeit/bin", + "i2loc","/project/ricopili/3rd_bins/impute2", + "liloc","/project/ricopili/3rd_bins/liftover", + "eloc","/project/ricopili/3rd_bins/eigenstrat/bin", + "admloc","", + "reaploc","", + "priloc","", + "rloc","/com/extra/R/3.1.0/bin", + "rscloc","/com/extra/R/3.1.0/bin", + "rplloc","/project/ricopili/plink_src", + "hmloc","/project/ricopili/reference_dir", + "perlpack","/project/ricopili/perl_packages/", + ); +} + # "ploc", "/project/ricopili/plink_src/", - "p2loc","/project/ricopili/plink_1.9_jul4/", - "shloc","/project/ricopili/3rd_bins/shapeit/", - "i2loc","/project/ricopili/3rd_bins/impute2/", - "liloc","/project/ricopili/3rd_bins/liftover/", - "eloc","/project/ricopili/3rd_bins/eigenstrat/bin/", # "rloc","/com/extra/R/3.1.0/bin", # "rpac","NA", - "hmloc","/project/ricopili/reference_dir/", # "meloc","/project/ricopili/3rd_bins/metal/", # "hvloc","./", - "perlpack","/project/ricopili/perl_packages/", - ); -} + elsif ($clusters{mssm}){ %variables = ( + "p2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.09-src-aug4", + "shloc","/hpc/users/xripkes01/ricopili/3rd_binaries/shapeit/bin", + "i2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/impute2", + "liloc","/hpc/users/xripkes01/ricopili/3rd_binaries/liftover", + "eloc","/hpc/packages/minerva-common/eigensoft/5.0.1/bin", + "admloc","", + "reaploc","", + "priloc","", + "rloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin", + "rscloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin", + "rplloc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.07-src-sripke", + "hmloc","/hpc/users/xripkes01/ricopili/reference_dir", + "perlpack","/hpc/users/xripkes01/perl_modules", + ); +} + # "ploc", "/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.07-src-sripke/", - "p2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.09-src-aug4/", - "shloc","/hpc/users/xripkes01/ricopili/3rd_binaries/shapeit/", - "i2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/impute2/", - "liloc","/hpc/users/xripkes01/ricopili/3rd_binaries/liftover/", - "eloc","/hpc/packages/minerva-common/eigensoft/5.0.1/bin/", # "rloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin/", # "rpac","NA", - "hmloc","/hpc/users/xripkes01/ricopili/reference_dir/", # "meloc","/hpc/users/xripkes01/ricopili/3rd_binaries/metal/", # "hvloc","./", - "perlpack","/hpc/users/xripkes01/perl_modules/", - ); -} - foreach (keys %variables){ - if ($variables{$_} eq "broadinstitute" && $longvar{$_} eq "R") { - print "You are running R on broad, took the default value\n\n"; - } - elsif ($variables{$_} eq "NA" && $longvar{$_} eq "Rpackages") { - print "assuming library rmeta is installed on standard R\n\n"; - } - else { if ($variables{$_} ne '' && (-d $variables{$_})){ - print "For $longvar{$_}, do you want to use the default location (y or n)?\n\t$variables{$_}\n"; + print "Default location for $longvar{$_} is: \n\t$variables{$_}\n\n"; if ($defall == 0) { - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") { - print "Using $variables{$_} for $longvar{$_}.\n\n"; - last; - } - elsif ($answer eq "n") {print "Please enter a new location to use for $longvar{$_}:\n"; - my $input = <>; - chomp $input; - $input =~ s/^~/$ENV{HOME}/g; - $input =~ s/^\./$cd/g; - unless ( -d $input ){print "Not a valid directory. 
Please try again.\n";next;} - print "\n"; - last;} - else {print "Please answer with y or n.\n";} + print "Do you want to use this location (y or n)?\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") { + print "Using $variables{$_} for $longvar{$_}.\n\n"; + last; + } + elsif ($answer eq "n") {print "Please enter a new location to use for $longvar{$_}:\n"; + my $input = <>; + chomp $input; + $input =~ s/^~/$ENV{HOME}/g; + $input =~ s/^\./$cd/g; + unless ( -d $input ){print "Not a valid directory. Please try again.\n";next; + } + print "\n"; + last; + } + else { + print "Please answer with y or n.\n"; + } } } } else { while (1){ unless($clusters{other} == 1){ - print "No default value available for:\n"; + print "No default value available for $longvar{$_}\n"; } - print "Please enter a location for $longvar{$_}:\n"; - my $input = ""; - $input = <>; - chomp $input; - $input =~ s/^~/$ENV{HOME}/g; - $input =~ s/^\./$cd/g; - unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} - $variables{$_} = $input; - print "\n"; - last; + print "Please enter a location:\n"; + my $input = ""; + $input = <>; + chomp $input; + $input =~ s/^~/$ENV{HOME}/g; + $input =~ s/^\./$cd/g; + unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} + $variables{$_} = $input; + print "\n"; + last; } } - } } foreach (keys %variables){ @@ -500,14 +548,17 @@ () push (@text, "email $email"); ### define queue depending on cluster -#if ($clusters{broad}){push (@text, "queue bsub")} if ($clusters{broad}){push (@text, "queue broad_uger")} -if ($clusters{lisa}){push (@text, "queue qsub")} -if ($clusters{computerome} || $clusters{co_ipsych}){push (@text, "queue qsub_c")} -if ($clusters{genomedk}){push (@text, "queue slurm")} -if ($clusters{mssm}){push (@text, "queue msub")} -} +if ($clusters{lisa}){push (@text, "queue lisa")} +if ($clusters{computerome}){push (@text, "queue computerome")} +if ($clusters{co_ipsych}){push (@text, "queue computerome_ipsych")} +if ($clusters{genomedk}){push (@text, "queue genomedk")} +if ($clusters{mssm}){push (@text, "queue mssm")} + + +} # end if block for getting conf file info + unless ( -e $conf_file && $ans_ow eq "n") { die $! unless open FILE, "> $conf_file"; @@ -518,7 +569,7 @@ () ############################# -# read picopili.config file with default parameters +# read picopili.conf file with default parameters ############################# my %conf = (); ## hash with config parameters @@ -537,18 +588,19 @@ () ############################ my @fail_path = (); my %locs = ( -# "ploc","", - "p2loc","", - "shloc","", - "i2loc","", - "liloc","", - "eloc","", -# "rloc","", - "hmloc","", -# "meloc","", -# "ldloc","", -# "rpac","", - "perlpack","" + "p2loc", "", + "shloc","", + "i2loc","", + "liloc","", + "eloc","", + "admloc","", + "reaploc","", + "priloc","", + "rloc","", + "rscloc","", + "rplloc","", + "hmloc","", + "perlpack","", ); die $!."($conf_file)" unless open FILE, "< $conf_file"; @@ -566,49 +618,30 @@ () # print finish statement ############################# -my $fail = 0; +my $email_on = 0; +if ($conf{'email'} =~ m/\@/) { + $email_on = 1; +} + if ($#fail_path != -1) { - -# foreach (@fail_path) { -# unless ($_ eq "rloc" && $clusters{broad} == 1) { + print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) 
to edit the file paths listed in $home/picopili.conf for the following variables:\n"; - foreach my $confvar (@fail_path) { - if ($confvar eq "rloc" && $clusters{broad} == 1) { - next; + foreach (@fail_path) { + print "\t$_\n"; } - elsif ($confvar eq "rpac" && $clusters{lisa} != 1 && $clusters{other} != 1) { - next; - } - else{ - $fail += 1; - } - } - if ($fail != 0) { - print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) to edit the file paths listed in $home/picopili.conf for the following variables:\n"; - foreach (@fail_path) { - unless ($_ eq "rloc" && $clusters{broad} == 1) { - print "\t$_\n"; - } - } - } - else { - print "Setup has been completed successfully!\n"; - print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; - &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); - } } else { - print "Setup has been completed successfully!\n"; - print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; - &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); + print "Configuration completed successfully! Settings are stored in $conf_file\n"; + + if ($email_on){ + print "If you do not receive an email with the subject picopili_config, please check your address is entered correctly at $conf_file\n"; + &mysystem("echo \"Configuration for picopili was successful.\" | mail -s picopili_config $conf{'email'}"); + } } - - - ################################################### ### Optional: Add bin to default search path ################################################### @@ -706,9 +739,10 @@ () } } else{ - print "Successfully found picopili directory in search path!\n"; + print "Successfully found picopili directory in search path!\n\n"; } +print "### Finished ###\n\n"; exit; ########## Done ########## diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index 9134f09..2c4f0bb 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -61,6 +61,10 @@ my $liloc = &trans("liloc"); my $email = &trans("email"); +my $email_on = 0; +if ($email =~ m/\@/){ + $email_on = 1; +} ############################################### @@ -183,7 +187,7 @@ push @test_scripts, $buigue_script; push @test_scripts, $checkpos_script; push @test_scripts, $checkflip_script; -push @test_scripts, $blue_script; +push @test_scripts, $blue_script; #push @test_scripts, $mutt_script ; @@ -232,42 +236,43 @@ +if($email_on){ + print ".......testing email program....\n"; -print ".......testing email program....\n"; - -my $err_scr = 0; -{ - my $scr_path = ''; + my $err_scr = 0; + { + my $scr_path = ''; - for my $path ( split /:/, $ENV{PATH} ) { - if ( -f "$path/$mutt_script" && -x _ ) { - print "$mutt_script\tfound in $path\n"; - $scr_path = "$path/$mutt_script"; - last; - } - } - unless ( $scr_path ) { + for my $path ( split /:/, $ENV{PATH} ) { + if ( -f "$path/$mutt_script" && -x _ ) { + print "$mutt_script\tfound in $path\n"; + $scr_path = "$path/$mutt_script"; + last; + } + } + unless ( $scr_path ) { - print "!!Warning!! : No $mutt_script command available, trying mail\n" ; + print "!!Warning!! 
: No $mutt_script command available, trying mail\n" ; - $mutt_script = "mail"; - for my $path ( split /:/, $ENV{PATH} ) { - if ( -f "$path/$mutt_script" && -x _ ) { - print "$mutt_script\tfound in $path\n"; - $scr_path = "$path/$mutt_script"; - last; + $mutt_script = "mail"; + for my $path ( split /:/, $ENV{PATH} ) { + if ( -f "$path/$mutt_script" && -x _ ) { + print "$mutt_script\tfound in $path\n"; + $scr_path = "$path/$mutt_script"; + last; + } + } + unless ( $scr_path ) { + $err_scr = 1; + print "!!Error!! : No $mutt_script command available\n" ; + } } - } - unless ( $scr_path ) { - $err_scr = 1; - print "!!Error!! : No $mutt_script command available\n" ; - } - } -} -die if $err_scr == 1; + } + die if $err_scr == 1; +} print "....all necessary binaries found....\n"; print "------------------------------------\n"; @@ -395,7 +400,7 @@ sub a2filenew_app { } -my $sjainfofile = "$rootdir/impute_dir_info.log"; +my $sjainfofile = "$rootdir/imp_prep_job_info.log"; unless (-e $sjainfofile) { &mysystem ("touch $sjainfofile"); } @@ -435,7 +440,9 @@ sub send_jobarray { print SUC $fini_message."\n"; close SUC; - &mysystem ('cat success_file | '.$mutt_script.' -s RP_pipeline_finished '.$email) ; + if($email_on){ + &mysystem ('cat success_file | '.$mutt_script.' -s RP_pipeline_finished '.$email) ; + } my $sjarow = $sjainfotxt."\t$sjaname\t$now"; &a2filenew_app("$sjainfofile",$sjarow); @@ -543,8 +550,9 @@ sub send_jobarray { print ERR $err_message."\n"; close ERR; - - &mysystem ('cat error_file | '.$mutt_script.' -s RP_pipeline_error '.$email) ; + if($email_on){ + &mysystem ('cat error_file | '.$mutt_script.' -s RP_pipeline_error '.$email) ; + } unless ($serial) { exit; From 36d046e7988c60db9c8117fdbc6c476fd4a4667f Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 30 Sep 2016 13:22:05 -0400 Subject: [PATCH 06/48] offer separate plague and buigue reference files --- .gitignore | 3 +- GET_REFS | 1 + bin/{bin_check_pico => bin_check_pico.pl} | 0 bin/buigue_pico.pl | 19 ++- bin/config_pico.pl | 77 ++++++++--- bin/get_refs.sh | 153 ++++++++++++++++++++++ bin/imp_prep.pl | 3 +- bin/lift_to_hg19.pl | 10 +- bin/plague_pico.pl | 5 +- 9 files changed, 242 insertions(+), 29 deletions(-) create mode 120000 GET_REFS rename bin/{bin_check_pico => bin_check_pico.pl} (100%) create mode 100755 bin/get_refs.sh diff --git a/.gitignore b/.gitignore index 094a0f3..53fddd1 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ bin/args_qc.pyc bin/args_gwas.pyc bin/args_chunks.pyc bin/args_impute.pyc - +lib/plague* +lib/buigue* diff --git a/GET_REFS b/GET_REFS new file mode 120000 index 0000000..0622e86 --- /dev/null +++ b/GET_REFS @@ -0,0 +1 @@ +./bin/get_refs.sh \ No newline at end of file diff --git a/bin/bin_check_pico b/bin/bin_check_pico.pl similarity index 100% rename from bin/bin_check_pico rename to bin/bin_check_pico.pl diff --git a/bin/buigue_pico.pl b/bin/buigue_pico.pl index 3812cfc..14d6c95 100755 --- a/bin/buigue_pico.pl +++ b/bin/buigue_pico.pl @@ -5,8 +5,10 @@ # load utility functions ############################# +use File::Basename; use FindBin; use lib "$FindBin::Bin"; +use Cwd 'abs_path'; use rp_perl::Utils qw(trans); @@ -14,14 +16,19 @@ my $progname = $0; $progname =~ s!^.*/!!; +my $picodir = dirname(dirname(abs_path($0))); ############################# # read config file ############################# my $liloc = &trans("liloc"); +my $liref = "$picodir/lib/buigue"; -my $perlpack = &trans("perlpack"); +my $perlpack; +BEGIN { + $perlpack = &trans("perlpack"); +} use lib $perlpack; 
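# Note on the BEGIN block above: `use lib` takes effect at compile time, so
# trans("perlpack") has to be assigned to $perlpack inside BEGIN for the path
# to already be set when `use lib $perlpack` is processed.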
##################################################### @@ -30,10 +37,10 @@ my @bu_files; -push @bu_files, "$liloc/snp.txt.pos.scz49.gz"; -push @bu_files, "$liloc/snp125.txt.pos.scz49.gz"; -push @bu_files, "$liloc/snp130.txt.pos.scz49.gz"; -push @bu_files, "$liloc/snp138.txt.pos.scz49.gz"; +push @bu_files, "$liref/snp.txt.pos.scz49.gz"; +push @bu_files, "$liref/snp125.txt.pos.scz49.gz"; +push @bu_files, "$liref/snp130.txt.pos.scz49.gz"; +push @bu_files, "$liref/snp138.txt.pos.scz49.gz"; my @li_files; push @li_files, "$liloc/hg16ToHg19.over.chain.gz"; @@ -59,7 +66,7 @@ guesses the build of a bim file out of ucsc snp file find here the helping files: - $liloc + $liref created by Stephan Ripke 2014 at MGH, Boston, MA in the frame of the PGC diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 62013bc..52d125a 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -15,7 +15,7 @@ $progname =~ s!^.*/!!; -my $cdir = abs_path($0); +my $cdir = dirname(abs_path($0)); my $home = $ENV{HOME}; my $conf_file = $ENV{HOME}."/picopili.conf"; my $command_line = "$progname @ARGV"; @@ -316,7 +316,6 @@ () "rloc","R", "rscloc","Rscript", "rplloc","R-enabled Plink (e.g. v1.07, or a dev build of 1.90)", - "hmloc","HapMap reference", "perlpack","Perl packages (for Compress::Zlib)", ); @@ -332,7 +331,6 @@ () "rloc","", "rscloc","", "rplloc","", - "hmloc","", "perlpack","", ); @@ -350,7 +348,6 @@ () "rloc","/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.0.1/bin", "rscloc","/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.0.1/bin", "rplloc","/home/unix/sripke/plink_src/src/", - "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref", "perlpack","/home/unix/sripke/perl_modules", ); } @@ -361,6 +358,7 @@ () # "rpac","NA", # "meloc","/psych/genetics_data/ripke/references_from_debakkerscratch/metal/", # "hvloc","/home/radon01/sripke/bakker_ripke/haploview/", +# "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref", elsif ($clusters{lisa}){ @@ -376,7 +374,6 @@ () "rloc","/sara/sw/R-3.1.2/bin", "rscloc","/sara/sw/R-3.1.2/bin", "rplloc","/home/gwas/plink/1.08/src", - "hmloc","/home/gwas/pgc-samples/hapmap_ref", "perlpack","/home/gwas/perl_modules", ); } @@ -387,6 +384,7 @@ () # "rpac","NA", # "meloc","/home/gwas/metal", # "hvloc","./", +# "hmloc","/home/gwas/pgc-samples/hapmap_ref", elsif ($clusters{computerome}){ @@ -402,7 +400,6 @@ () "rloc","/services/tools/R-3.1.2/bin", "rscloc","/services/tools/R-3.1.2/bin", "rplloc","/services/tools/R-3.1.2/bin", - "hmloc","/home/people/sripke/imputation_references", "perlpack","/home/people/sripke/rp_external_bins/perl_packages", ); } @@ -413,7 +410,7 @@ () # "rpac","/home/people/sripke/rp_external_bins/Rpackages/", # "meloc","/home/people/sripke/rp_external_bins/metal/", # "hvloc","./", - +# "hmloc","/home/people/sripke/imputation_references", elsif ($clusters{co_ipsych}){ %variables = ( @@ -428,7 +425,6 @@ () "rloc","/data/tools/R-3.2.1/bin", "rscloc","/data/tools/R-3.2.1/bin", "rplloc","/data/tools/plink-1.07", - "hmloc","/data/user_tools/imputation_references", "perlpack","/data/user_tools/rp_external_bins/perl_packages", ); } @@ -439,6 +435,7 @@ () # "rpac","/data/user_tools/rp_external_bins/Rpackages/", # "meloc","/data/tools/metal-20110325/", # "hvloc","./", +# "hmloc","/data/user_tools/imputation_references", elsif ($clusters{genomedk}){ @@ -454,7 +451,6 @@ () "rloc","/com/extra/R/3.1.0/bin", "rscloc","/com/extra/R/3.1.0/bin", "rplloc","/project/ricopili/plink_src", - "hmloc","/project/ricopili/reference_dir", 
"perlpack","/project/ricopili/perl_packages/", ); } @@ -464,6 +460,7 @@ () # "rpac","NA", # "meloc","/project/ricopili/3rd_bins/metal/", # "hvloc","./", +# "hmloc","/project/ricopili/reference_dir", elsif ($clusters{mssm}){ @@ -479,7 +476,6 @@ () "rloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin", "rscloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin", "rplloc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.07-src-sripke", - "hmloc","/hpc/users/xripkes01/ricopili/reference_dir", "perlpack","/hpc/users/xripkes01/perl_modules", ); } @@ -489,7 +485,7 @@ () # "rpac","NA", # "meloc","/hpc/users/xripkes01/ricopili/3rd_binaries/metal/", # "hvloc","./", - +# "hmloc","/hpc/users/xripkes01/ricopili/reference_dir", foreach (keys %variables){ @@ -599,7 +595,6 @@ () "rloc","", "rscloc","", "rplloc","", - "hmloc","", "perlpack","", ); @@ -646,7 +641,7 @@ () ### Optional: Add bin to default search path ################################################### -system("bin_check_pico"); # dummy script that doesn't do anything +system("bin_check_pico.pl"); # dummy script that doesn't do anything my $status_bin = ($? >> 8); @@ -742,7 +737,59 @@ () print "Successfully found picopili directory in search path!\n\n"; } -print "### Finished ###\n\n"; - + +################## +# +# Check whether reference files are present yet +# +################## + +my $picobin = dirname($cdir); +my $plaguedir = "$picobin/lib/plague"; +my $buiguedir = "$picobin/lib/buigue"; + +my $haveref = 0; +if (-e $plaguedir && -e $buiguedir){ + + my @ref_files; + my $refcc = 0; + push @ref_files, "$buiguedir/snp.txt.pos.scz49.gz"; + push @ref_files, "$buiguedir/snp125.txt.pos.scz49.gz"; + push @ref_files, "$buiguedir/snp130.txt.pos.scz49.gz"; + push @ref_files, "$buiguedir/snp138.txt.pos.scz49.gz"; + push @ref_files, "$plaguedir/snp_platform_collection.txt.new.0815.gz"; + push @ref_files, "$plaguedir/snp_platform_collection.txt.new.0416a.gz"; + push @ref_files, "$plaguedir/snp_platform_collection.txt.new.0114.gz"; + + foreach my $fi (@ref_files){ + if (-e $fi){ + $refcc++; + next; + }else{ + last; + } + } + + if ($refcc == scalar(@ref_files)){ + $haveref = 1; + } + +} + +if ($haveref == 0){ + print "\n----------------------------------------------------\n"; + print "References files from ricopili for guessing genome build and \n"; + print "genotyping platform have not been installed yet.\n\n"; + print "Please run:\n"; + print "\t$picobin/GET_REFS\n"; + +}else{ + print "Successfully found ricopili plague and buigue reference files!\n" +} + +if ($haveref == 1 && $statusbin == 0){ + print "\n### Finished ###\n\n"; +} + exit; ########## Done ########## diff --git a/bin/get_refs.sh b/bin/get_refs.sh new file mode 100755 index 0000000..3ce2d5a --- /dev/null +++ b/bin/get_refs.sh @@ -0,0 +1,153 @@ +#! /bin/sh + +########### +# +# get_refs.sh +# Retrives reference files +# +# - ricopili platform guessing (plague) files +# - ricopili build guessing (buigue) files +# +########### + +echo " " +echo "### External reference file downloader for picopili ###" +echo " " +echo "Picopili depends on a few curated reference files" +echo "from ricopili. If ricopili is installed on your" +echo "platform, will set up symbolic links to the required" +echo "files. Otherwise, will download the files." 
+echo " " + +echo "### BEGIN ###" +echo " " + +# setup +rp_conf="$HOME/ricopili.conf" +SERVER="https://personal.broadinstitute.org/rwalters/picopili_files/" +SCRIPT=$(readlink -f "$0") +BINLOC=$(dirname "$SCRIPT") +LIBLOC=`echo $(dirname "$BINLOC")"/lib"` +rp=false +li_done=false +hm_done=false + +if [ -d "$LIBLOC/buigue" ]; then + echo "WARNING: Found existing folder $LIBLOC/buigue. Contents may be overwritten." + echo "(pausing to allow cancel...)" + sleep 3 + echo "(continuing)" +else + mkdir "$LIBLOC/buigue" +fi + +if [ -d "$LIBLOC/plague" ]; then + echo "WARNING: Found existing folder $LIBLOC/plague. Contents may be overwritten." + echo "(pausing to allow cancel...)" + sleep 3 + echo "(continuing)" +else + mkdir "$LIBLOC/plague" +fi + + +# check/read config file +if [ -e "$rp_conf" ]; then + rp=true + echo "Found existing ricopili configuration. Reading..." + liloc=`awk '$1=="liloc"{print $2}' $rp_conf` + hmloc=`awk '$1=="hmloc"{print $2}' $rp_conf` +else + echo "No ricopili configuration found." +fi + +lifiles=("snp.txt.pos.scz49.gz" "snp125.txt.pos.scz49.gz" "snp130.txt.pos.scz49.gz" "snp138.txt.pos.scz49.gz" "last") +hmfiles=("snp_platform_collection.txt.new.0815.gz" "snp_platform_collection.txt.new.0416a.gz" "snp_platform_collection.txt.new.0114.gz" "last") + +# link creation from ricopili references +if [ "$rp" = 'true' ]; then + + if [ -d "$liloc" ]; then + + for finame in ${lifiles[@]}; do + + if [ "$finame" = "last" ]; then + li_done=true + else + echo "$liloc/$finame" + ln -sfn "$liloc/$finame" "$LIBLOC/buigue" || break + fi + done + fi + + if [ "$li_done" = 'false' ]; then + echo "Failed to link all files from liftover directory $liloc" + fi + + + if [ -d "$hmloc" ]; then + for finame in ${hmfiles[@]}; do + + if [ $finame == "last" ]; then + hm_done=true + else + echo "$hmloc/$finame" + ln -sfn "$hmloc/$finame" "$LIBLOC/plague" || break + fi + done + fi + + if [ "$hm_done" = 'false' ]; then + echo "Failed to link all platform references from directory $hmloc" + fi +fi + +if [ "$li_done" = 'false' ]; then + to_dl=true +elif [ "$hm_done" = 'false' ]; then + to_dl=true +else + to_dl=false +fi + +# wget external +if [ "$to_dl" = 'true' ]; then + + # warn of internet access + echo " " + echo "WARNING: Preparing to download reference files from:" + echo "$SERVER" + echo " " + echo "Expected total file size is ~275 MB, minus existing" + echo "files already linked/downloaded." + echo " " + echo "If you do not have web access, or if you do not want" + echo "to download these files now, please cancel now." + echo " " + echo "Will begin in 10 sec..." 
+ echo " " + sleep 10 + + for finame in ${lifiles[@]}; do + + if [ "$finame" = "last" ]; then + continue + else + wget "$SERVER/$finame" "$LIBLOC/buigue/$finame" + fi + done + for finame in ${hmfiles[@]}; do + + if [ "$finame" = "last" ]; then + continue + else + wget "$SERVER/$finame" "$LIBLOC/plague/$finame" + fi + done +fi + +echo " " +echo "### Finished ###" +echo " " + +# eof diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index 2c4f0bb..3c75dfb 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -58,7 +58,6 @@ my $ploc = &trans("p2loc"); my $qloc = &trans("queue"); -my $liloc = &trans("liloc"); my $email = &trans("email"); my $email_on = 0; @@ -177,6 +176,7 @@ my $readref_script = "readref_pico.pl"; ### my.pipeline_tar my $readrefsum_script = "readrefsum_pico.pl"; ### my.pipeline_tar my $buigue_script = "buigue_pico.pl"; ### my.pipeline_tar +my $lift_script = "lift_to_hg19.pl"; my $checkpos_script = "checkpos_pico.pl"; ### my.pipeline_tar my $checkflip_script = "checkflip_pico.pl"; ### my.pipeline_tar my $mutt_script = "mutt"; ### my.pipeline_tar @@ -185,6 +185,7 @@ push @test_scripts, $readref_script; push @test_scripts, $readrefsum_script; push @test_scripts, $buigue_script; +push @test_scripts, $lift_script; push @test_scripts, $checkpos_script; push @test_scripts, $checkflip_script; push @test_scripts, $blue_script; diff --git a/bin/lift_to_hg19.pl b/bin/lift_to_hg19.pl index 782404b..7ac1726 100755 --- a/bin/lift_to_hg19.pl +++ b/bin/lift_to_hg19.pl @@ -44,11 +44,11 @@ my $ploc = &trans("p2loc"); my $liloc = &trans("liloc"); -if ($ENV{SYS_TYPE} =~ /redhat_6/) { - print "running on gold\n"; - $liloc .= "64bit/"; - print "using $liloc\n"; -} +# if ($ENV{SYS_TYPE} =~ /redhat_6/) { +# print "running on gold\n"; +# $liloc .= "64bit/"; +# print "using $liloc\n"; +# } #exit; ####################################### diff --git a/bin/plague_pico.pl b/bin/plague_pico.pl index 9825be5..9cfea53 100755 --- a/bin/plague_pico.pl +++ b/bin/plague_pico.pl @@ -5,7 +5,9 @@ # load utility functions ############################# +use File::Basename; use FindBin; +use Cwd 'abs_path'; use lib "$FindBin::Bin"; use rp_perl::Utils qw(trans); @@ -13,12 +15,13 @@ my $progname = $0; $progname =~ s!^.*/!!; +my $picodir = dirname(dirname(abs_path($0))); ############################# # read config file ############################# -my $hmloc = &trans("hmloc"); +my $hmloc = "$picodir/lib/plague"; my $perlpack = &trans("perlpack"); use lib $perlpack; From 6f64c1f4faf61ba57b4acfa2d1858038483b9752 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 30 Sep 2016 20:28:11 -0400 Subject: [PATCH 07/48] start integrating blueprint.py, add threading and flexibility --- .gitignore | 1 + bin/blueprint.py | 91 ++++++++++----- bin/config_pico.pl | 12 +- bin/pca_rel.py | 109 +++++------------- bin/py_helpers.py | 4 +- bin/shape_rel.py | 55 ++++----- cluster_templates/broad_uger.conf | 3 +- cluster_templates/broad_uger.single.sub.sh | 27 ----- ...ad_uger.array.sub.sh => broad_uger.sub.sh} | 5 +- 9 files changed, 127 insertions(+), 180 deletions(-) delete mode 100755 cluster_templates/broad_uger.single.sub.sh rename cluster_templates/{broad_uger.array.sub.sh => broad_uger.sub.sh} (82%) diff --git a/.gitignore b/.gitignore index 53fddd1..f5ac793 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ bin/py_helpers.pyc +bin/blueprint.pyc bin/args_pca.pyc bin/args_ped.pyc bin/args_qc.pyc diff --git a/bin/blueprint.py b/bin/blueprint.py index 70acd87..5ccd47d 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -24,7 +24,7 @@ 
def send_job(jobname, # week=None, njobs=None, maxpar=10000, -# multi=None, + threads=None, wait_file=None, wait_name=None, cluster=None, @@ -41,6 +41,9 @@ def send_job(jobname, if logloc is None: logloc = os.getcwd() + + if not os.path.isdir(logloc): + os.mkdir(logloc) if maxpar < 1: maxpar = 10000 @@ -49,7 +52,7 @@ def send_job(jobname, if cluster is None: conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) - cluster = configs['queue'] + cluster = configs['cluster'] # get queue template pico_bin = os.path.dirname(os.path.realpath(__file__)) @@ -61,11 +64,22 @@ def send_job(jobname, # - submission syntax, queue names, job holds clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') + # basic template + with open(str(clust_dir)+'/'+str(cluster)+'.sub.sh','r') as single_templ: + templ = single_templ.read() + # setup memory args if mem is None: mem = 2000 mem_mb = str(int(mem)) - mem_gb = str(int(mem)/1000) + if int(mem) > 1000: + mem_gb = str(int(mem)/1000) + else: + mem_gb = str(1) + + # multithreading arguments + if threads is None: + threads = 1 # queue picking from job length if walltime is None: @@ -95,14 +109,15 @@ def send_job(jobname, hold_str = "" + # load base template + + - # template for single jobs + # for single jobs if cmd is not None and (njobs is None or njobs <= 1): - - with open(str(clust_dir)+'/'+str(cluster)+'.single.sub.sh','r') as single_templ: - templ = single_templ.read() - + njobs = 1 + tot_threads = int(threads) # log name if logname is None: @@ -116,13 +131,11 @@ def send_job(jobname, j_per_core = 1 - # template for array jobs + # for array jobs else: - with open(str(clust_dir)+'/'+str(cluster)+'.array.sub.sh','r') as array_templ: - templ = array_templ.read() # setup indexing tasks - j_per_core = int(clust_conf['array_core']) + j_per_core = int(clust_conf['j_per_node']) if j_per_core == 1: task_index = str(clust_conf['task_id']) else: @@ -131,11 +144,13 @@ def send_job(jobname, # cmd or array file spec if cmd is not None: cmd_line = cmd.format(task=task_index) + tot_threads = int(njobs)*int(threads) else: assert os.path.isfile(arrayfile), "Job array file %s not found." 
% str(arrayfile) njobs = file_len(arrayfile) + tot_threads = int(njobs)*int(threads) cmd_tmp = dedent("""\ cline=`head -n {task} {fi} | tail -n 1` @@ -150,14 +165,15 @@ def send_job(jobname, from math import floor, ceil # max simul tasks with memory limit - node_mem = float(clust_conf['array_core']) + node_mem = float(clust_conf['array_mem_mb']) task_mem_lim = floor((node_mem-1.0)/float(mem)) - if task_mem_lim < 1: - task_mem_lim=1 + # max simul tasks with threading + if task_mem_lim > floor(int(j_per_core)/int(threads)): + task_mem_lim = floor(int(j_per_core)/int(threads)) - if task_mem_lim > j_per_core: - task_mem_lim = j_per_core + if task_mem_lim < 1: + task_mem_lim=1 # number of jobs to cover all tasks array_jobs = ceil(float(njobs)/float(task_mem_lim)) @@ -215,9 +231,11 @@ def send_job(jobname, # fill in template jobdict = {"job_name": str(jobname), "cmd_string": cmd_str, # formatted elsewhere - "log_name": str(logname), + "log_name": str(logloc)+'/'+str(logname), "mem_in_mb": str(mem_mb), "mem_in_gb": str(mem_gb), + "threads": str(threads), + "total_threads": str(tot_threads), "wall_hours": str(walltime), "njobs": str(njobs), "array_jobs": str(array_jobs), @@ -226,7 +244,8 @@ def send_job(jobname, "task_id": str(clust_conf['task_id']), "log_task_id": str(clust_conf['log_task_id']), "queue_name": str(queue_name), - "sleep_time": str(sleep) + "sleep_time": str(sleep), + "project": str(clust_conf['project']) } @@ -235,6 +254,23 @@ def send_job(jobname, sub_file.write(templ.format(**jobdict)) sub_file.close() + # finalize or remove optional lines + if njobs <= 1: + subprocess.check_call(['sed','-i','/^::PICO_ARRAY_ONLY::/d',str(sub_file.name)]) + else: + subprocess.check_call(['sed','-i','s/^::PICO_ARRAY_ONLY:://',str(sub_file.name)]) + + if threads <= 1: + subprocess.check_call(['sed','-i','/^::PICO_THREAD_ONLY::/d',str(sub_file.name)]) + else: + subprocess.check_call(['sed','-i','s/^::PICO_THREAD_ONLY:://',str(sub_file.name)]) + + if njobs <= 1 and threads <= 1: + subprocess.check_call(['sed','-i','/^::PICO_THREADARRAY_ONLY::/d',str(sub_file.name)]) + else: + subprocess.check_call(['sed','-i','s/^::PICO_THREADARRAY_ONLY:://',str(sub_file.name)]) + + # command to run if hold_str != "": launch_str = clust_conf['sub_cmd']+' '+hold_str+' '+str(sub_file.name) @@ -250,16 +286,9 @@ def send_job(jobname, out, err = p.communicate() print out return(p.returncode) -# -# -# # manual error nhandling here because of Broad LD_LIBRARY_PATH warning -# if p.returncode != 0: -# if "LD_LIBRARY_PATH" in out: -# print out -# else: -# raise IOError("Job submission failed\nCode: %d\nError: %s\nOutput: %s\n" % p.returncode, err, out) - - return 0 + + else: + return 0 #################################### @@ -372,9 +401,9 @@ def send_job(jobname, # set logfile name if args.noerr: - logloc = os.getcwd()+'/errandout/' - else: logloc = os.getcwd() + else: + logloc = os.getcwd()+'/errandout/' # ignore arguments for direct if args.direct: diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 52d125a..9fb19df 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -545,12 +545,12 @@ () ### define queue depending on cluster -if ($clusters{broad}){push (@text, "queue broad_uger")} -if ($clusters{lisa}){push (@text, "queue lisa")} -if ($clusters{computerome}){push (@text, "queue computerome")} -if ($clusters{co_ipsych}){push (@text, "queue computerome_ipsych")} -if ($clusters{genomedk}){push (@text, "queue genomedk")} -if ($clusters{mssm}){push (@text, "queue mssm")} +if ($clusters{broad}){push (@text, "cluster 
broad_uger")} +if ($clusters{lisa}){push (@text, "cluster lisa")} +if ($clusters{computerome}){push (@text, "cluster computerome")} +if ($clusters{co_ipsych}){push (@text, "cluster computerome_ipsych")} +if ($clusters{genomedk}){push (@text, "cluster genomedk")} +if ($clusters{mssm}){push (@text, "cluster mssm")} } # end if block for getting conf file info diff --git a/bin/pca_rel.py b/bin/pca_rel.py index 942371d..7303b55 100755 --- a/bin/pca_rel.py +++ b/bin/pca_rel.py @@ -29,11 +29,11 @@ ### load requirements import argparse -import subprocess import os from math import ceil from args_pca import * from py_helpers import file_len, unbuffer_stdout +from blueprint import send_job unbuffer_stdout() @@ -166,32 +166,15 @@ strandambi_txt, allchr_txt]) -#strictqc_lsf = ' '.join(["bsub", -# "-q", 'hour', -# "-R", str('\"rusage[mem=2]\"'), -# "-J", str('strictqc_'+args.out), -# "-P", str('pico_'+args.out), -# "-o", str('strictqc_'+args.out+'.bsub.log'), -# "-r", -# str('\"'+strictqc_call+'\"')]) -# -#print strictqc_lsf -#if not args.test_sub: -# subprocess.check_call(strictqc_lsf, shell=True) - +send_job(jobname=str('strictqc_'+args.out), + arrayfile=None, + cmd=str(strictqc_call), + logname=str('strictqc_'+args.out+'.sub.log'), + mem=2000, + walltime=2, + sleep=0, + testonly=args.test_sub) -strictqc_uger = ' '.join(['qsub', - '-q', 'short', - '-l', 'm_mem_free=2g,h_vmem=2g', - '-N', str('strictqc_'+args.out), - '-o', str('strictqc_'+args.out+'.bsub.log'), - str(rp_bin)+'/uger.sub.sh', - str(0), - str(strictqc_call)]) - -print strictqc_uger -if not args.test_sub: - subprocess.check_call(strictqc_uger, shell=True) ##### # submit imus pca @@ -210,34 +193,15 @@ '--primus-ex', str(args.primus_ex) ]) -#imuspca_lsf = ' '.join(["bsub", -# "-w", str('\'ended(\"'+str('strictqc_'+args.out)+'\")\''), -# "-E", str('\"sleep '+str(args.sleep)+'\"'), -# "-q", 'week', -# "-R", str('\"rusage[mem='+str(imus_mem)+']\"'), -# "-J", str('imuspca_'+args.out), -# "-P", str('pico_'+args.out), -# "-o", str('imuspca_'+args.out+'.bsub.log'), -# "-r", -# str('\"'+imuspca_call+'\"')]) -# -#print imuspca_lsf -#if not args.test_sub: -# subprocess.check_call(imuspca_lsf, shell=True) - -imuspca_uger = ' '.join(['qsub', - '-hold_jid', str('strictqc_'+args.out), - '-q', 'long', - '-l', 'm_mem_free='+str(imus_mem)+'g,h_vmem='+str(imus_mem)+'g', - '-N', str('imuspca_'+args.out), - '-o', str('imuspca_'+args.out+'.bsub.log'), - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - str(imuspca_call)]) - -print imuspca_uger -if not args.test_sub: - subprocess.check_call(imuspca_uger, shell=True) +send_job(jobname=str('imuspca_'+args.out), + cmd=str(imuspca_call), + logname=str('imuspca_'+args.out+'.sub.log'), + mem=int(imus_mem)*1000, + walltime=168, # one week + wait_name=str('strictqc_'+args.out), + sleep=args.sleep, + testonly=args.test_sub) + ##### # submitting final file check @@ -250,37 +214,20 @@ else: pcaout = str(args.pcadir) + final_call = ' '.join(['final_file_check.py', '--filename', str(wd+'/'+pcaout+'/plots/'+args.out+'.pca.pairs.png'), '--taskname', str('pca_rel_'+args.out)]) -#final_lsf = ' '.join(["bsub", -# "-w", str('\'ended(\"'+str('imuspca_'+args.out)+'\")\''), -# "-E", str('\"sleep '+str(args.sleep)+'\"'), -# "-q", 'hour', -# "-J", str('checkfinal_'+args.out), -# "-P", str('pico_'+args.out), -# "-o", str('checkfinal_'+args.out+'.bsub.log'), -# "-r", -# str('\"'+final_call+'\"')]) -# -#print final_lsf -#if not args.test_sub: -# subprocess.check_call(final_lsf, shell=True) - - -final_uger = ' '.join(['qsub', - '-hold_jid', 
str('imuspca_'+args.out), - '-q', 'short', - '-N', str('checkfinal_'+args.out), - '-o', str('checkfinal_'+args.out+'.bsub.log'), - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - str(final_call)]) - -print final_uger -if not args.test_sub: - subprocess.check_call(final_uger, shell=True) +send_job(jobname=str('checkfinal_'+args.out), + arrayfile=None, + cmd=str(final_call), + logname=str('checkfinal_'+args.out+'.sub.log'), + mem=100, + walltime=1, + wait_name=str('imuspca_'+args.out), + sleep=str(args.sleep), + testonly=args.test_sub) ####### # Print completion message diff --git a/bin/py_helpers.py b/bin/py_helpers.py index 3173e66..f50c4fd 100644 --- a/bin/py_helpers.py +++ b/bin/py_helpers.py @@ -47,7 +47,9 @@ def read_conf(fname): with open(fname, 'r') as f: for line in f: - (key, val) = line.split() + # strips '#' comments at end of line + # otherwise allows aribtrary content (spaces, etc) + (key,val) = line.split('#',1)[0].rstrip().split(None,1) configs[str(key)] = val return configs diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 8f53448..82d6935 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -297,23 +297,21 @@ print ' '.join(shape_call)+'\n' - -uger_call = ' '.join(['qsub', - '-q','long', - '-N', 'shape.'+str(outdot), - '-l', 'm_mem_free='+str(args.mem_req)+'g,h_vmem='+str(args.mem_req)+'g', - '-pe','smp',str(args.threads), - '-t', '1-22', - '-o', '\'shape.'+str(outdot)+'.chr$TASK_ID.qsub.log\'', - str(rp_bin)+'/uger_array.sub.sh', - str(args.sleep), - ' '.join(shape_call)]) - -print uger_call -subprocess.check_call(uger_call, shell=True) - - - +# setup naming from task index +configs = read_conf(os.environ['HOME']+'/picopili.conf') +clust_confdir = os.path.dirname(str(rp_bin))+'/cluster_templates/' +clust_conf = read_conf(clust_confdir+str(configs['cluster']+'.conf')) +task_id = str(clust_conf['log_task_id']) + +# submit +send_job(jobname='shape.'+str(outdot), + cmd=' '.join(shape_call), + logname='shape.'+str(outdot)+'.chr'+task_id+'.sub.log', + mem=int(args.mem_req)*1000, + walltime=168, # week + njobs=22, + threads=int(args.threads), + sleep=str(args.sleep)) ### @@ -327,21 +325,16 @@ os.chdir(wd) next_call = str(rp_bin) + '/imp2_rel.py '+' '.join(sys.argv[1:]) - # TODO: consider queue/mem for agg - imp_log = 'imp_chunks.'+str(outdot)+'.qsub.log' - uger_imp = ' '.join(['qsub', - '-hold_jid','shape.'+str(outdot), - '-q', 'short', - '-l', 'm_mem_free=8g,h_vmem=8g', - '-N', 'imp.chunks.'+str(outdot), - '-o', imp_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - next_call]) - - print uger_imp + '\n' - subprocess.check_call(uger_imp, shell=True) + imp_log = 'imp_chunks.'+str(outdot)+'.sub.log' + # TODO: consider queue/mem + send_job(jobname='imp.chunks.'+str(outdot), + cmd=next_call, + logname=imp_log, + mem=8000, + walltime=2, + wait_name='shape.'+str(outdot), + sleep=str(args.sleep)) diff --git a/cluster_templates/broad_uger.conf b/cluster_templates/broad_uger.conf index 658eae0..3ed149c 100644 --- a/cluster_templates/broad_uger.conf +++ b/cluster_templates/broad_uger.conf @@ -7,5 +7,6 @@ sub_cmd qsub log_task_id $TASK_ID task_id ${SGE_TASK_ID} hold_flag -hold_jid -array_core 1 +j_per_node 1 array_mem_mb 128000 +project unspecified diff --git a/cluster_templates/broad_uger.single.sub.sh b/cluster_templates/broad_uger.single.sub.sh deleted file mode 100755 index 42a1335..0000000 --- a/cluster_templates/broad_uger.single.sub.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash - -# wrapper script for job submission on Broad UGER cluster -# -# The -V below 
above will provoke a warning that -# LD_LIBRARY_PATH won't be used for security reasons; -# this warning can be safely ignored - -#$ -j y -#$ -cwd -#$ -V -#$ -N {job_name} -#$ -o {log_name} -#$ -q {queue_name} -#$ -l m_mem_free={mem_in_gb}g,h_vmem={mem_in_gb}g - -# sleep option (for preventing race conditions on network file systems) -sleep {sleep_time} - -# setup resources -source /broad/software/scripts/useuse -reuse -q Anaconda - -# main command line -{cmd_string} - -# eof diff --git a/cluster_templates/broad_uger.array.sub.sh b/cluster_templates/broad_uger.sub.sh similarity index 82% rename from cluster_templates/broad_uger.array.sub.sh rename to cluster_templates/broad_uger.sub.sh index 4416e9c..597ad98 100755 --- a/cluster_templates/broad_uger.array.sub.sh +++ b/cluster_templates/broad_uger.sub.sh @@ -13,8 +13,9 @@ #$ -o {log_name} #$ -q {queue_name} #$ -l m_mem_free={mem_in_gb}g,h_vmem={mem_in_gb}g -#$ -t 1-{array_jobs} -#$ -tc {array_max} +::PICO_ARRAY_ONLY::#$ -t 1-{array_jobs} +::PICO_ARRAY_ONLY::#$ -tc {array_max} +::PICO_THREAD_ONLY::#$ -pe smp {threads} # sleep option (for preventing race conditions on network file systems) sleep {sleep_time} From e3051bb6cd3973a0b6af8cca89490d51110384c5 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 12:29:43 -0400 Subject: [PATCH 08/48] convert additional tasks to blueprint; minor logging changes --- bin/agg_gwas.py | 25 ++++--- bin/agg_imp.py | 25 +++---- bin/bg_imp.py | 49 ++++++------- bin/imp2_rel.py | 178 +++++++++++++++++++--------------------------- bin/imp_prep.pl | 10 +-- bin/impute_rel.py | 31 +++----- bin/shape_rel.py | 1 + 7 files changed, 131 insertions(+), 188 deletions(-) diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index 13fada3..f32326f 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -41,6 +41,7 @@ from math import log10, sqrt from args_gwas import * from py_helpers import unbuffer_stdout, file_len, file_tail +from blueprint import send_job # , read_conf, link unbuffer_stdout() @@ -216,19 +217,17 @@ print '\n...Replacing this agg job in the queue...' - agg_log = 'agg.'+str(outdot)+'.resub_'+str(nummiss)+'.qsub.log' - uger_agg = ' '.join(['qsub', - '-hold_jid','gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss), - '-q', 'long', - '-l', 'm_mem_free=24g,h_vmem=24g', - '-N', 'agg_'+str(outdot), - '-o', agg_log, - str(rp_bin)+'/uger.sub.sh', - str(10), # hardcoded since chunks shouldn't normally need a sleep argument - ' '.join(sys.argv[:])]) - - print uger_agg + '\n' - subprocess.check_call(uger_agg, shell=True) + # TODO: adjust memory setting here + + agg_log = 'agg.'+str(outdot)+'.resub_'+str(nummiss)+'.sub.log' + + send_job(jobname='agg_'+str(outdot), + cmd=' '.join(sys.argv[:]), + logname=agg_log, + mem=24000, + walltime=168, # week + wait_name='gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss), + sleep=10) print '\n############' print '\n' diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 9e273c9..10a0ad9 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -31,6 +31,7 @@ import subprocess from args_impute import * from py_helpers import unbuffer_stdout, find_exec, test_exec, file_len #, file_tail, link, warn_format +from blueprint import send_job unbuffer_stdout() # warnings.formatwarning = warn_format @@ -196,21 +197,17 @@ print '\n...Replacing this aggregation job in the queue...' 
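# The wait_name= arguments in these send_job() conversions are presumably what
# replace the hand-written '-hold_jid <jobname>' flags in the qsub calls being
# deleted below: blueprint.py builds the launch command from the cluster config
# (sub_cmd and hold_flag in cluster_templates/broad_uger.conf). A minimal
# illustration; all values here are made up:
sub_cmd, hold_flag = 'qsub', '-hold_jid'                        # as in broad_uger.conf
wait_name, sub_script = 'gwas.chunks.mystudy', 'mystudy.sub.sh' # hypothetical
launch_str = ' '.join([sub_cmd, hold_flag, wait_name, sub_script])
# launch_str == 'qsub -hold_jid gwas.chunks.mystudy mystudy.sub.sh'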
- # TODO: consider queue/mem for agg os.chdir(wd) - agg_log = 'agg_imp.'+str(outdot)+'.resub_'+str(nummiss)+'.qsub.log' - uger_agg = ' '.join(['qsub', - '-hold_jid','bg.chunks.'+str(outdot)+'.resub_'+str(nummiss), - '-q', 'long', - '-l', 'm_mem_free=8g,h_vmem=8g', - '-N', 'agg.imp.'+str(outdot), - '-o', agg_log, - str(uger_ex), - str(args.sleep), - ' '.join(sys.argv[:])]) - - print uger_agg + '\n' - subprocess.check_call(uger_agg, shell=True) + agg_log = 'agg_imp.'+str(outdot)+'.resub_'+str(nummiss)+'.sub.log' + + # TODO: consider queue/mem for agg + send_job(jobname='agg.imp.'+str(outdot), + cmd=' '.join(sys.argv[:]), + logname=agg_log, + mem=8000, + walltime=168, # week + wait_name='bg.chunks.'+str(outdot)+'.resub_'+str(nummiss), + sleep=args.sleep) print '\n############' print '\n' diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 77b77e3..7d99557 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -36,6 +36,7 @@ import warnings from args_impute import * from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format +from blueprint import send_job unbuffer_stdout() warnings.formatwarning = warn_format @@ -298,21 +299,17 @@ print '\n...Replacing this best-guess job in the queue...' - # TODO: consider queue/mem for agg os.chdir(wd) - bg_log = 'bg.'+str(outdot)+'.resub_'+str(nummiss)+'.qsub.log' - uger_bg = ' '.join(['qsub', - '-hold_jid','imp.chunks.'+str(outdot)+'.resub_'+str(nummiss), - '-q', 'short', - '-l', 'm_mem_free=4g,h_vmem=8g', - '-N', 'bg.chunks.'+str(outdot), - '-o', bg_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - ' '.join(sys.argv[:])]) - - print uger_bg + '\n' - subprocess.check_call(uger_bg, shell=True) + bg_log = 'bg.'+str(outdot)+'.resub_'+str(nummiss)+'.sub.log' + + # TODO: consider queue/mem for agg + send_job(jobname='bg.chunks.'+str(outdot), + cmd=' '.join(sys.argv[:]), + logname=bg_log, + mem=8000, + walltime=2, # week + wait_name='imp.chunks.'+str(outdot)+'.resub_'+str(nummiss), + sleep=args.sleep) print '\n############' print '\n' @@ -426,22 +423,16 @@ os.chdir(wd) next_call = str(rp_bin) + '/agg_imp.py '+' '.join(sys.argv[1:]) - # TODO: consider queue/mem for agg - agg_log = 'agg_imp.'+str(outdot)+'.qsub.log' - uger_agg = ' '.join(['qsub', - '-hold_jid','bg.chunks.'+str(outdot), - '-q', 'long', - '-l', 'm_mem_free=8g,h_vmem=8g', - '-N', 'agg.imp.'+str(outdot), - '-o', agg_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - next_call]) - - print uger_agg + '\n' - subprocess.check_call(uger_agg, shell=True) + agg_log = 'agg_imp.'+str(outdot)+'.sub.log' - + # TODO: consider queue/mem for agg + send_job(jobname='agg.imp.'+str(outdot), + cmd=next_call, + logname=agg_log, + mem=8000, + walltime=168, # week + wait_name='bg.chunks.'+str(outdot), + sleep=args.sleep) # finish print '\n############' diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 2951b60..381a58b 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -26,8 +26,10 @@ ### load requirements import os import subprocess +from textwrap import dedent from args_impute import * from py_helpers import unbuffer_stdout, file_len, link, find_exec +from blueprint import send_job unbuffer_stdout() @@ -90,6 +92,13 @@ print '\n...Checking dependencies...' 
############# +# get cluster configuration +# needed for specifying logfile names with clust_conf['log_task_id'] +conf_file = os.environ['HOME']+"/picopili.conf" +configs = read_conf(conf_file) +cluster = configs['cluster'] +clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') + # from config impute_ex = find_exec('impute2',key='i2loc') shapeit_ex = find_exec('shapeit',key='shloc') @@ -101,6 +110,8 @@ test_exec(chunker_ex) + + # TODO: here # .hg19.ch.fl.bim for chunking # imp. references @@ -138,8 +149,8 @@ bad_chr.append(chrom) -# TODO: resub shapeit if failed -# TODO: re-queue this job +# if any shapeit jobs failed, +# resubmit them and re-queue this job if bad_chr: num_chr = len(bad_chr) print 'Missing pre-phasing results for %d chromosomes.' % num_chr @@ -168,30 +179,14 @@ print 'Exiting...\n' exit(1) - # make submit script - # using this structure to get adaptive chromosome list - uger_phase_template = """#!/usr/bin/env sh - #$ -j y - #$ -cwd - #$ -V - #$ -N {jname} - #$ -q long - #$ -l m_mem_free={mem}g,h_vmem={mem}g - #$ -pe smp {threads} - #$ -t 1-{nchr} - #$ -o {outlog} - - source /broad/software/scripts/useuse - reuse -q Anaconda - sleep {sleep} - + # setup submit script + # with "chr_list" to get have adaptive chromosome list + cmd_templ = dedent("""\ chrs=({chr_list}) - chrom=${{chrs[${{SGE_TASK_ID}}-1]}} + chrom=${{chrs[{task}-1]}} - {shape_ex} {bed} {map} {ref} {window} {duo_txt} {thread_str} {seed_str} {outmax} {shapelog} - - # eof - """ + {shape_ex} {bed} {map} {ref} {window} {duo_txt} {thread_str} {seed_str} {outmax} {shapelog} + """) # shape_call = [shapeit_ex, # '--input-bed', chrstem+'.bed', chrstem+'.bim', chrstem+'.fam', @@ -203,21 +198,15 @@ # '--seed', str(args.shape_seed), # '--output-max', outstem+'.phased.haps', outstem+'.phased.sample', # '--output-log', outstem+'.shape.log'] - - # fill in template - chrstem = str(args.bfile)+'.hg19.ch.fl.chr${chrom}' - outstem = str(outdot)+'.chr${chrom}' + # manage duohmm arg if extra_args.no_duohmm: duo_txt = '' else: duo_txt = '--duohmm' - jobdict = {"jname": 'shape.'+str(outdot)+'.resub_'+str(num_chr), - "mem": str(extra_args.mem_req), - "threads": str(extra_args.threads), - "nchr": str(num_chr), - "outlog": 'shape.'+str(outdot)+'.resub_'+str(num_chr)+'.qsub.$TASK_ID.log', - "sleep": str(args.sleep), + + # fill in shapeit template + jobdict = {"task": "{task}", "chr_list": ' '.join(bad_chr), "shape_ex": str(shapeit_ex), "bed": '--input-bed '+str(chrstem)+'.bed '+str(chrstem)+'.bim '+str(chrstem)+'.fam', @@ -229,15 +218,19 @@ "seed_str": '--seed '+str(extra_args.shape_seed), "outmax": '--output-max '+str(outstem)+'.phased.haps '+str(outstem)+'.phased.sample', "shapelog": str(outstem)+'.shape.resub_'+str(num_chr)+'.log', - } - - uger_phase = open(uger_phase_name, 'w') - uger_phase.write(uger_phase_template.format(**jobdict)) - uger_phase.close() - + } + shape_cmd = cmd_templ.format(**jobdict) + # submit - print ' '.join(['qsub',uger_phase_name]) + '\n' - subprocess.check_call(' '.join(['qsub',uger_phase_name]), shell=True) + send_job(jobname='shape.'+str(outdot)+'.resub_'+str(num_chr), + cmd=shape_cmd, + logname='shape.'+str(outdot)+'.resub_'+str(num_chr)+'.sub.'+str(clust_conf['log_task_id'])+'.log', + mem=int(extra_args.mem_req)*1000, + walltime=168, # week + njobs=int(num_chr), + threads=extra_args.threads, + sleep=args.sleep) + print 'Pre-phasing jobs re-submitted for %d chromosomes.\n' % num_chr @@ -246,19 +239,15 @@ print '\n...Replacing this imputation job in the queue...' 
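# Note that jobdict maps "task" to the literal string "{task}", so the
# .format() call above leaves that placeholder intact; blueprint.py can then
# presumably substitute the scheduler's own task index (the task_id entry in
# the cluster .conf, e.g. ${SGE_TASK_ID} on Broad UGER). A small sketch of the
# assumed two-stage substitution, using a made-up template:
demo_templ = 'cstart=`awk -v a={task} {cfile}`'
step1 = demo_templ.format(task='{task}', cfile='demo.chunks.txt')
# step1 == 'cstart=`awk -v a={task} demo.chunks.txt`'   (placeholder preserved)
step2 = step1.format(task='${SGE_TASK_ID}')
# step2 == 'cstart=`awk -v a=${SGE_TASK_ID} demo.chunks.txt`'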
os.chdir(wd) - imp_log = 'imp_chunks.'+str(outdot)+'.qsub.log' - uger_imp = ' '.join(['qsub', - '-hold_jid','imp.chunks.'+str(outdot), - '-q', 'short', - '-l', 'm_mem_free=8g,h_vmem=8g', - '-N', 'imp.chunks.'+str(outdot), - '-o', imp_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - ' '.join(sys.argv[:])]) - - print uger_imp + '\n' - subprocess.check_call(uger_imp, shell=True) + imp_log = 'imp_chunks.'+str(outdot)+'.sub.log' + + send_job(jobname='imp.chunks.'+str(outdot), + cmd=' '.join(sys.argv[:]), + logname=imp_log, + mem=8000, + walltime=2, # week + wait_name='shape.'+str(outdot)+'.resub_'+str(num_chr), + sleep=args.sleep) print '\n############' print '\n' @@ -330,38 +319,18 @@ os.chdir(imp_dir) link(str(chunk_dir)+'/'+str(outdot)+'.chunks.txt', str(outdot)+'.chunks.txt', 'genomic chunk results') -uger_imp_template = """#!/usr/bin/env sh -#$ -j y -#$ -cwd -#$ -V -#$ -N {jname} -#$ -q short -#$ -l m_mem_free=8g,h_vmem=8g -#$ -t 1-{nchunk} -#$ -o {outlog} - -source /broad/software/scripts/useuse -reuse -q Anaconda -sleep {sleep} +# job script +imp_templ = dedent("""\ + cchr=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` + cstart=`awk -v a={task} 'NR==a+1{{print $2}}' {cfile}` + cend=`awk -v a={task} 'NR==a+1{{print $3}}' {cfile}` + cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` -cchr=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $1}}' {cfile}` -cstart=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $2}}' {cfile}` -cend=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $3}}' {cfile}` -cname=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $4}}' {cfile}` - -{impute_ex} -use_prephased_g -known_haps_g {in_haps} -h {ref_haps} -l {ref_leg} -m {map} -int ${{cstart}} ${{cend}} -buffer {buffer} -Ne {Ne} -allow_large_regions -o_gz -o {out} {seedtxt} - -# eof -""" - -# get number of chunks (-1 is for header) -nchunks = file_len(outdot+'.chunks.txt')-1 + {impute_ex} -use_prephased_g -known_haps_g {in_haps} -h {ref_haps} -l {ref_leg} -m {map} -int ${{cstart}} ${{cend}} -buffer {buffer} -Ne {Ne} -allow_large_regions -o_gz -o {out} {seedtxt} +""") # fill in template -jobdict = {"jname": 'imp.chunks.'+str(outdot), - "nchunk": str(nchunks), - "outlog": str('imp.chunks.'+str(outdot)+'.$TASK_ID.qsub.log'), - "sleep": str(args.sleep), +jobdict = {"task": "{task}", "cfile": str(outdot)+'.chunks.txt', "impute_ex": str(impute_ex), "in_haps": str(shape_dir)+'/'+str(outdot)+'.chr${cchr}.phased.haps', @@ -373,19 +342,23 @@ "out": str(outdot)+'.imp.${cname}', "seedtxt": str(seedtxt) } +cmd_imp = imp_templ.format(**jobdict) -uger_imp = open(str(outdot)+'.imp_chunks.sub.sh', 'w') -uger_imp.write(uger_imp_template.format(**jobdict)) -uger_imp.close() +# get number of chunks (-1 is for header) +nchunks = file_len(outdot+'.chunks.txt')-1 # submit -print ' '.join(['qsub',uger_imp.name]) + '\n' -subprocess.check_call(' '.join(['qsub',uger_imp.name]), shell=True) +send_job(jobname='imp.chunks.'+str(outdot), + cmd=cmd_imp, + logname=str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=8000, + walltime=2, + njobs=int(nchunks), + sleep=args.sleep) print 'Imputation jobs submitted for %d chunks.\n' % nchunks - ### # submit next imputation task ### @@ -397,23 +370,16 @@ os.chdir(wd) next_call = str(rp_bin) + '/bg_imp.py '+' '.join(sys.argv[1:]) - # TODO: consider queue/mem for agg - bg_log = 'bg_imp.'+str(outdot)+'.qsub.log' - uger_bg = ' '.join(['qsub', - '-hold_jid','imp.chunks.'+str(outdot), - '-q', 'short', - '-l', 'm_mem_free=4g,h_vmem=8g', - '-N', 'bg.chunks.'+str(outdot), - '-o', bg_log, - 
str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - next_call]) - - print uger_bg + '\n' - subprocess.check_call(uger_bg, shell=True) - - + bg_log = 'bg_imp.'+str(outdot)+'.sub.log' + # TODO: consider queue/mem for agg + send_job(jobname='bg.chunks.'+str(outdot), + cmd=next_call, + logname=bg_log, + mem=8000, + walltime=2, # week + wait_name='imp.chunks.'+str(outdot), + sleep=args.sleep) diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index 3c75dfb..1afc2ee 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -540,10 +540,10 @@ sub send_jobarray { $err_message .= "##### step $sjaname has been done repeatedly without any progress\n"; $err_message .= "##### imputation pipeline stopped: $command_line\n"; $err_message .= "##### $sjainfotxt\n"; - $err_message .= "##### if reason does not appear obvious\n"; - $err_message .= "##### have a look at the wiki page\n"; - $err_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n"; - $err_message .= "##### or contact the developers\n"; +# $err_message .= "##### if reason does not appear obvious\n"; +# $err_message .= "##### have a look at the wiki page\n"; +# $err_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n"; +# $err_message .= "##### or contact the developers\n"; $err_message .= "##################################################################\n"; print "$err_message\n"; @@ -552,7 +552,7 @@ sub send_jobarray { close ERR; if($email_on){ - &mysystem ('cat error_file | '.$mutt_script.' -s RP_pipeline_error '.$email) ; + &mysystem ('cat error_file | '.$mutt_script.' -s Picopili_pipeline_error '.$email) ; } unless ($serial) { diff --git a/bin/impute_rel.py b/bin/impute_rel.py index 13e79e3..332518c 100755 --- a/bin/impute_rel.py +++ b/bin/impute_rel.py @@ -25,9 +25,9 @@ ### load requirements import os -import subprocess from args_impute import * from py_helpers import unbuffer_stdout #, read_conf, file_tail, link, warn_format +from blueprint import send_job unbuffer_stdout() ############# @@ -121,12 +121,10 @@ ############# -print '\n...Checking dependencies...' +# print '\n...Checking dependencies...' 
############# - - # TODO: here @@ -138,24 +136,15 @@ rp_bin = os.path.dirname(os.path.realpath(__file__)) next_call = str(rp_bin) + '/shape_rel.py '+' '.join(sys.argv[1:])+' --full-pipe' -# TODO: consider queue/mem for agg -shape_log = 'shape.'+str(outdot)+'.qsub.log' -uger_shape = ' '.join(['qsub', - '-q', 'long', - '-l', 'm_mem_free='+str(args.mem_req)+'g,h_vmem='+str(args.mem_req)+'g', - '-N', 'shape.'+str(outdot), - '-o', shape_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - next_call]) - -print uger_shape + '\n' -subprocess.check_call(uger_shape, shell=True) - - -# TODO: here - +shape_log = 'shape.'+str(outdot)+'.sub.log' +# TODO: consider queue/mem +send_job(jobname='shape.'+str(outdot), + cmd=next_call, + logname=shape_log, + mem=int(args.mem_req * 1000), + walltime=168, # week + sleep=args.sleep) # finish diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 82d6935..92bd5e0 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -45,6 +45,7 @@ # import warnings from args_impute import * from py_helpers import unbuffer_stdout, link, find_exec #, test_exec +from blueprint import send_job # file_len, read_conf, find_from_path, link, gz_confirm unbuffer_stdout() From 5df6c382e3343fd389db3b31330c20fe89b848f4 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 13:33:57 -0400 Subject: [PATCH 09/48] blueprint remaining direct jobs (not resubs) --- bin/agg_imp.py | 3 - bin/bg_imp.py | 88 ++++++++++++++-------------- bin/gwas_rel.py | 148 +++++++++++++++++++----------------------------- 3 files changed, 100 insertions(+), 139 deletions(-) diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 10a0ad9..a1d411c 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -16,9 +16,6 @@ # #################################### -# TODO: enable failed chunk check - - import sys ############# diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 7d99557..34233bd 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -34,6 +34,7 @@ import os import subprocess import warnings +from textwrap import dedent from args_impute import * from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format from blueprint import send_job @@ -192,6 +193,12 @@ rp_bin = os.path.dirname(os.path.realpath(__file__)) rs_ex = str(rp_bin)+'/rs_trans.py' +# get cluster configuration +# needed for specifying logfile names with clust_conf['log_task_id'] +conf_file = os.environ['HOME']+"/picopili.conf" +configs = read_conf(conf_file) +cluster = configs['cluster'] +clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') # TODO: here @@ -334,51 +341,34 @@ print '\n...Generating best-guess genotypes...' 
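# The cluster lookup added above assumes ~/picopili.conf and
# cluster_templates/<cluster>.conf are plain whitespace-delimited key/value
# files with optional trailing '#' comments, matching the updated read_conf()
# in py_helpers.py. A minimal sketch of that parse on a made-up line:
line = 'hold_flag -hold_jid   # scheduler dependency flag\n'
key, val = line.split('#', 1)[0].rstrip().split(None, 1)
# key == 'hold_flag', val == '-hold_jid'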
###################### -# TODO: flex queue/mem reqs -uger_bg_template = """#!/usr/bin/env sh -#$ -j y -#$ -cwd -#$ -V -#$ -N {jname} -#$ -q short -#$ -l m_mem_free=4g,h_vmem=8g -#$ -t 1-{nchunk} -#$ -o {outlog} - -source /broad/software/scripts/useuse -reuse -q Anaconda -sleep {sleep} - -cname=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $4}}' {cfile}` -cchr=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $1}}' {cfile}` - -{plink_ex} --gen {gen_in} --sample {samp_in} --oxford-single-chr ${{cchr}} --oxford-pheno-name plink_pheno --hard-call-threshold {hard_call_th} --missing-code -9,NA,na --allow-no-sex --silent --memory 4000 --out {out_str} - -sleep {sleep} -# note: Mendel errors checked after --update-parents, see https://www.cog-genomics.org/plink2/order -{plink_ex} --bfile {out_str} {mendel_txt} --pheno {idnum} --mpheno 4 --update-parents {idnum} --allow-no-sex --make-bed --silent --memory 2000 --out {out_str2} -rm {out_str}.bed -rm {out_str}.bim -rm {out_str}.fam - -sleep {sleep} -{plink_ex} --bfile {out_str2} {maf_txt} {mac_txt} {geno_txt} {info_txt} --allow-no-sex --make-bed --silent --memory 2000 --out {out_str_filt} -rm {out_str2}.bed -rm {out_str2}.bim -rm {out_str2}.fam - -{rs_ex} --chunk ${{cname}} --name {outdot} --imp-dir {imp_dir} --fam-trans {trans} - -# eof -""" +# best-guess job script for each chunk +bg_templ = dedent("""\ + cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` + cchr=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` + + {plink_ex} --gen {gen_in} --sample {samp_in} --oxford-single-chr ${{cchr}} --oxford-pheno-name plink_pheno --hard-call-threshold {hard_call_th} --missing-code -9,NA,na --allow-no-sex --silent --memory 4000 --out {out_str} + + sleep {sleep} + # note: Mendel errors checked after --update-parents, see https://www.cog-genomics.org/plink2/order + {plink_ex} --bfile {out_str} {mendel_txt} --pheno {idnum} --mpheno 4 --update-parents {idnum} --allow-no-sex --make-bed --silent --memory 2000 --out {out_str2} + rm {out_str}.bed + rm {out_str}.bim + rm {out_str}.fam + + sleep {sleep} + {plink_ex} --bfile {out_str2} {maf_txt} {mac_txt} {geno_txt} {info_txt} --allow-no-sex --make-bed --silent --memory 2000 --out {out_str_filt} + rm {out_str2}.bed + rm {out_str2}.bim + rm {out_str2}.fam + {rs_ex} --chunk ${{cname}} --name {outdot} --imp-dir {imp_dir} --fam-trans {trans} +""") + # get number of chunks nchunks = len(chunks) # fill in template -jobdict = {"jname": 'bg.chunks.'+str(outdot), - "nchunk": str(nchunks), - "outlog": str('bg.chunks.'+str(outdot)+'.$TASK_ID.qsub.log'), +jobdict = {"task": "{task}", "sleep": str(args.sleep), "cfile": str(outdot)+'.chunks.txt', "plink_ex": str(plink_ex), @@ -400,16 +390,20 @@ "trans": str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam.transl' } -uger_bg = open(str(outdot)+'.bg_chunks.sub.sh', 'w') -uger_bg.write(uger_bg_template.format(**jobdict)) -uger_bg.close() +bg_cmd = bg_templ.format(**jobdict) -# submit -print ' '.join(['qsub',uger_bg.name]) + '\n' -subprocess.check_call(' '.join(['qsub',uger_bg.name]), shell=True) -print 'Best-guess jobs submitted for %d chunks.\n' % nchunks +# submit +# TODO: flex queue/mem reqs +send_job(jobname='bg.chunks.'+str(outdot), + cmd=bg_cmd, + logname=str('bg.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=8000, + walltime=2, + njobs=int(nchunks), + sleep=args.sleep) +print 'Best-guess jobs submitted for %d chunks.\n' % nchunks ### diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 01ae60a..8847d32 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -26,8 
+26,10 @@ import subprocess import os from warnings import warn +from textwrap import dedent from args_gwas import * from py_helpers import link, unbuffer_stdout, find_exec +from blueprint import send_job unbuffer_stdout() @@ -139,6 +141,13 @@ if args.rscript_ex == None or args.rscript_ex == "None": args.rscript_ex = find_exec('Rscript', key='rscloc') +# get cluster configuration +# needed for specifying logfile names with clust_conf['log_task_id'] +conf_file = os.environ['HOME']+"/picopili.conf" +configs = read_conf(conf_file) +cluster = configs['cluster'] +clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') + # TODO: here @@ -387,36 +396,26 @@ def find_chunk(snpchrom, snpbp, last_chunk): ###################### print '\n...Submitting GWAS for all chunks...' -###################### - -# gwas each chunk -# need to write submit script to include chunk name parsing # TODO: consider making queue/resources flexible +###################### +# basic template, depending on model if args.model == 'gee' or args.model == 'dfam': - uger_gwas_template = """#!/usr/bin/env sh -#$ -j y -#$ -cwd -#$ -V -#$ -N {jname} -#$ -q short -#$ -l m_mem_free=4g,h_vmem=4g -#$ -t 1-{nchunk} -#$ -tc 200 -#$ -o {outlog} + gwas_templ = dedent("""\ + cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` + {misc} + {gwas_ex} --bfile {bfile} --out {argout} --extract {outdot}.snps.${{cname}}.txt {optargs} + """) -source /broad/software/scripts/useuse -reuse -q Anaconda -sleep {sleep} - -cname=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $4}}' {cfile}` - -{misc} +elif args.model == 'gmmat' or args.model == 'gmmat-fam': + gwas_templ = dedent("""\ + cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` + chrnum=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` -{gwas_ex} --bfile {bfile} --out {argout} --extract {outdot}.snps.${{cname}}.txt {optargs} + {plinkx} --bfile {bfile} --extract {outdot}.snps.${{cname}}.txt {optargs} --make-bed --out {outdot}.${{cname}} -# eof -""" + {rsc} --no-save --no-restore {gwas_ex} {outdot}.${{cname}} grm.{outdot}.loco_chr${{chrnum}}.rel.gz {covarsub} {outdot}.${{cname}} > {outdot}.${{cname}}.gmmat.R.log + """) # alternative template for GMMAT # Rscript --no-save --no-restore @@ -426,34 +425,10 @@ def find_chunk(snpchrom, snpbp, last_chunk): # ../fgwa_eur_1KGp3_postimp.pca.txt (covariate file) # test1 (output name) # > test_gmm.log -elif args.model == 'gmmat' or args.model == 'gmmat-fam': - uger_gwas_template = """#!/usr/bin/env sh -#$ -j y -#$ -cwd -#$ -V -#$ -N {jname} -#$ -q short -#$ -l m_mem_free=4gi,h_vmem=4g -#$ -t 1-{nchunk} -#$ -tc 200 -#$ -o {outlog} - -source /broad/software/scripts/useuse -sleep {sleep} - -cname=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $4}}' {cfile}` -chrnum=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $1}}' {cfile}` - -{plinkx} --bfile {bfile} --extract {outdot}.snps.${{cname}}.txt {optargs} --make-bed --out {outdot}.${{cname}} - -{rsc} --no-save --no-restore {gwas_ex} {outdot}.${{cname}} grm.{outdot}.loco_chr${{chrnum}}.rel.gz {covarsub} {outdot}.${{cname}} > {outdot}.${{cname}}.gmmat.R.log - -# eof -""" +# optional arguments gwasargs = '' - if args.pheno is not None: gwasargs = gwasargs + ' --pheno '+str(args.pheno) if args.keep is not None: @@ -461,7 +436,7 @@ def find_chunk(snpchrom, snpbp, last_chunk): if args.remove is not None: gwasargs = gwasargs + ' --remove '+str(args.remove) -# these args not passed for gmmat +# model-specific arguments not passed for gmmat if args.model == 'gee' or args.model == 'dfam': if args.addout is not None: gwasargs = 
gwasargs + ' --addout '+str(args.addout)+'.${cname}' @@ -474,16 +449,22 @@ def find_chunk(snpchrom, snpbp, last_chunk): gwasargs = gwasargs + ' --r-ex '+str(args.r_ex)+' --rplink-ex '+str(args.rplink_ex) +# model specific arguments for gee to specify Rserve port for each job +# targeting IANA range 49152-65535 +# (assuming here will be < 16k jobs; gwas_gee.py handles overflow check) +if args.model == 'gee': + misc_txt = 'rport=$((49151+{task}))' + gwasargs = str(gwasargs) +' --port $rport' +else: + misc_txt = '' + # TODO: pass through cleanup - -nchunk = len(chunks.keys()) -jobdict = {"jname": 'gwas.chunks.'+str(outdot), - "nchunk": str(nchunk), - "outlog": str('gwas.chunks.'+str(outdot)+'.$TASK_ID.qsub.log'), - "sleep": str(args.sleep), + +# fill in template +jobdict = {"task": "{task}", "cfile": chunk_file.name, - "misc": '', + "misc": str(misc_txt), "gwas_ex": str(gwas_ex), "bfile": str(args.bfile), "argout": str(args.out), @@ -494,20 +475,21 @@ def find_chunk(snpchrom, snpbp, last_chunk): "rsc": str(args.rscript_ex) } -# for gee, need to specify Rserve port for each job -# targeting IANA range 49152-65535 -# (assuming here will be < 16k jobs; gwas_gee.py handles overflow check) -if args.model == 'gee': - jobdict['misc'] = 'rport=$((49151+SGE_TASK_ID))' - jobdict['optargs'] = str(gwasargs) +' --port $rport' +gwas_cmd = gwas_templ.format(**jobdict) -uger_gwas = open(str(outdot)+'.gwas_chunks.sub.sh', 'w') -uger_gwas.write(uger_gwas_template.format(**jobdict)) -uger_gwas.close() +# submit job +nchunk = len(chunks.keys()) + +send_job(jobname='gwas.chunks.'+str(outdot), + cmd=gwas_cmd, + logname=str('gwas.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=4000, + walltime=2, + njobs=int(nchunk), + maxpar=200, + sleep=args.sleep) -print ' '.join(['qsub',uger_gwas.name]) + '\n' -subprocess.check_call(' '.join(['qsub',uger_gwas.name]), shell=True) print 'GWAS jobs submitted for %d chunks.\n' % nchunk @@ -548,7 +530,7 @@ def find_chunk(snpchrom, snpbp, last_chunk): else: info_file_txt = ['','','',''] -agg_log = 'agg.'+str(outdot)+'.qsub.log' +agg_log = 'agg.'+str(outdot)+'.sub.log' agg_call = [str(rp_bin)+'/agg_gwas.py', '--bfile',str(args.bfile), '--out',str(args.out), @@ -563,19 +545,14 @@ def find_chunk(snpchrom, snpbp, last_chunk): '--model',str(args.model)] agg_call = filter(None,agg_call) -uger_agg = ' '.join(['qsub', - '-hold_jid','gwas.chunks.'+str(outdot), - '-q', 'long', - '-l', 'm_mem_free=4g,h_vmem=4g', - '-N', 'agg_'+str(outdot), - '-o', agg_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - ' '.join(agg_call)]) - -print uger_agg + '\n' -subprocess.check_call(uger_agg, shell=True) +send_job(jobname='agg_'+str(outdot), + cmd=' '.join(agg_call), + logname=agg_log, + mem=4000, + walltime=168, # week + wait_name='gwas.chunks.'+str(outdot), + sleep=args.sleep) # TODO: # queue summarization script (plots, etc) @@ -590,11 +567,4 @@ def find_chunk(snpchrom, snpbp, last_chunk): print 'All jobs submitted.\n' exit(0) -#uger_chunk = ' '.join(['qsub', -# '-hold_jid',str(name), -# '-q', 'short', -# '-N', str('chunk_'+out), -# '-o', chunk_log, -# str(rp_bin)+'/uger.sub.sh', -# str(sleep), -# str(chunk_call)]) +# eof From ea5e5358d68381958754c68afd861cc3017350a9 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 17:10:32 -0400 Subject: [PATCH 10/48] blueprint resubs, using pickled job info --- .gitignore | 1 + bin/agg_gwas.py | 81 ++++++++++++++++++++++++++--------------- bin/agg_imp.py | 76 +++++++++++++++++++++++++------------- bin/args_gwas.py | 2 +- 
bin/args_impute.py | 2 +- bin/bg_imp.py | 91 ++++++++++++++++++++++++++++++++++------------ bin/blueprint.py | 56 ++++++++++++++++++++++++++++ bin/gwas_rel.py | 19 +++++++++- bin/imp2_rel.py | 23 ++++++++++-- bin/pca_rel.py | 2 +- bin/shape_rel.py | 2 +- 11 files changed, 267 insertions(+), 88 deletions(-) diff --git a/.gitignore b/.gitignore index f5ac793..adcf4fe 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,6 @@ bin/args_qc.pyc bin/args_gwas.pyc bin/args_chunks.pyc bin/args_impute.pyc +bin/test_debug lib/plague* lib/buigue* diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index f32326f..750f50a 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -179,39 +179,60 @@ tmp_chunk_file.close() print 'List of missing chunks: %s' % tmp_chunk_file.name - + + + ### # copy original submit script - # replace chunk list, name, number of tasks - orig_uger_file = open(str(outdot)+'.gwas_chunks.sub.sh', 'r') - new_uger_file = open(str(outdot)+'.gwas_chunks.resub_'+ str(nummiss)+'_chunks.sub.sh', 'w') + # replace chunk list, name, number of tasks, memory spec + # resubmit + ### + + # load pickle of job info + orig_job_conf = 'gwas.chunks.'+str(outdot)+'.pkl' - for line in orig_uger_file: - if '#$ -t ' in line: - new_uger_file.write('#$ -t 1-'+str(nummiss)+'\n') - continue -# elif '#$ -tc ' in line: -# if nummiss < 20: -# new_uger_file.write('#$ -tc 5 \n') -# elif nummiss < 50: -# new_uger_file.write('#$ -tc 10 \n') -# elif nummiss < 100: -# new_uger_file.write('#$ -tc 25 \n') -# else: -# new_uger_file.write('#$ -tc 40 \n') -# new_uger_file.write('#$ -tc 5 \n') - elif '#$ -l m_mem_free' in line: - new_uger_file.write('#$ -l m_mem_free=24g,h_vmem=24g \n') - else: - line=line.replace(args.chunk_file, tmp_chunk_file.name) - line=line.replace('.$TASK_ID.','.tmp'+str(nummiss)+'.$TASK_ID.') - line=line.replace('#$ -N gwas.chunks.'+str(outdot), '#$ -N gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss)) - new_uger_file.write(line) - - orig_uger_file.close() - new_uger_file.close() + if not os.path.isfile(orig_job_conf): + orig_job_file = str(outdot)+'.gwas_chunks.sub.sh' + raise IOError("Unable to find previous job configuration pickle %s.\ + \nRefer to previous submit script %s to modify/resubmit.\n" % (str(orig_job_conf),str(orig_job_file))) - print ' '.join(['qsub',new_uger_file.name]) + '\n' - subprocess.check_call(' '.join(['qsub',new_uger_file.name]), shell=True) + + cmd_templ, job_dict, sendjob_dict = load_job(orig_job_conf) + + # rename resub + sendjob_dict['jobname'] = 'gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss) + + sendjob_dict['logname'] = str('gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss)+'.'+str(clust_conf['log_task_id'])+'.sub.log') + + # increase memory and walltime + # TODO: consider how to scale mem/time here + oldmem = sendjob_dict['mem'] + sendjob_dict['mem'] = int(oldmem)*2 + + oldtime = sendjob_dict['walltime'] + sendjob_dict['walltime'] = int(oldtime)*4 + + # replace chunk file and set number of new jobs + sendjob_dict['njobs'] = int(nummiss) + + job_dict['cfile'] = tmp_chunk_file_name + + + # re-save new settings (primarily to track updating mem and walltime) + save_job(jfile=orig_job_conf, cmd_templ=cmd_templ, job_dict=job_dict, sendjob_dict=sendjob_dict) + + + # submit + gwas_cmd = cmd_templ.format(**job_dict) + + send_job(jobname=sendjob_dict['jobname'], + cmd=gwas_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + maxpar=sendjob_dict['maxpar'], + sleep=sendjob_dict['sleep']) + print 'GWAS 
jobs resubmitted for %d chunks.\n' % nummiss diff --git a/bin/agg_imp.py b/bin/agg_imp.py index a1d411c..0d670fb 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -28,7 +28,7 @@ import subprocess from args_impute import * from py_helpers import unbuffer_stdout, find_exec, test_exec, file_len #, file_tail, link, warn_format -from blueprint import send_job +from blueprint import send_job, load_job unbuffer_stdout() # warnings.formatwarning = warn_format @@ -80,9 +80,6 @@ # get directory containing current script # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) -uger_ex = +str(rp_bin)+'/uger.sub.sh' - -test_exec(uger_ex) # TODO: here @@ -166,32 +163,61 @@ print 'List of missing chunks: %s' % tmp_chunk_file.name + ### # copy original submit script - # replace chunk list, name, number of tasks - orig_uger_file = open(str(outdot)+'.bg_chunks.sub.sh', 'r') - new_uger_file = open(str(outdot)+'.bg_chunks.resub_'+ str(nummiss)+'_chunks.sub.sh', 'w') + # replace chunk list, name, number of tasks, memory spec + # resubmit + ### + + # load pickle of job info + orig_job_conf = 'bg.chunks.'+str(outdot)+'.pkl' + + if not os.path.isfile(orig_job_conf): + orig_job_file = str(outdot)+'.bg_chunks.sub.sh' + raise IOError("Unable to find previous job configuration pickle %s.\ + \nRefer to previous submit script %s to modify/resubmit.\n" % (str(orig_job_conf),str(orig_job_file))) + - for line in orig_uger_file: - if '#$ -t ' in line: - new_uger_file.write('#$ -t 1-'+str(nummiss)+'\n') - elif '#$ -l m_mem_free' in line: - new_uger_file.write('#$ -l m_mem_free=8g,h_vmem=8g \n') - elif '#$ -q short' in line: - new_uger_file.write('#$ -q long \n') - else: - line=line.replace(chunk_file_name, tmp_chunk_file_name) - line=line.replace('.$TASK_ID.','.tmp'+str(nummiss)+'.$TASK_ID.') - line=line.replace('#$ -N bg.chunks.'+str(outdot), '#$ -N bg.chunks.'+str(outdot)+'.resub_'+str(nummiss)) - new_uger_file.write(line) - - orig_uger_file.close() - new_uger_file.close() - - print ' '.join(['qsub',new_uger_file.name]) + '\n' - subprocess.check_call(' '.join(['qsub',new_uger_file.name]), shell=True) + cmd_templ, job_dict, sendjob_dict = load_job(orig_job_conf) + + # rename resub + sendjob_dict['jobname'] = 'bg.chunks.'+str(outdot)+'.resub_'+str(nummiss) + + sendjob_dict['logname'] = str('bg.chunks.'+str(outdot)+'.resub_'+str(nummiss)+'.'+str(clust_conf['log_task_id'])+'.sub.log') + + # increase memory and walltime + # TODO: consider how to scale mem/time here + oldmem = sendjob_dict['mem'] + sendjob_dict['mem'] = int(oldmem) + 4000 + + oldtime = sendjob_dict['walltime'] + sendjob_dict['walltime'] = int(oldtime)*4 + + # replace chunk file and set number of new jobs + sendjob_dict['njobs'] = int(nummiss) + + job_dict['cfile'] = tmp_chunk_file_name + + + # re-save new settings (primarily to track updating mem and walltime) + save_job(jfile=orig_job_conf, cmd_templ=cmd_templ, job_dict=job_dict, sendjob_dict=sendjob_dict) + + + # submit + bg_cmd = cmd_templ.format(**job_dict) + + send_job(jobname=sendjob_dict['jobname'], + cmd=bg_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + sleep=sendjob_dict['sleep']) + print 'Best-guess jobs resubmitted for %d chunks.\n' % nummiss + print '\n...Replacing this aggregation job in the queue...' 
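# The resubmission pattern above leans on blueprint.py's save_job()/load_job()
# helpers, which appear to round-trip three objects (command template, template
# dict, submission settings) through a single pickle file. A standalone sketch
# of that contract, with made-up contents:
import cPickle as pickle
demo = ('echo {task}', {'task': '{task}'}, {'mem': 4000, 'walltime': 2})
with open('demo_job.pkl', 'wb') as f:
    for obj in demo:
        pickle.dump(obj, f, -1)
with open('demo_job.pkl', 'rb') as f:
    cmd_templ_demo, job_dict_demo, sendjob_dict_demo = [pickle.load(f) for _ in range(3)]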
os.chdir(wd) diff --git a/bin/args_gwas.py b/bin/args_gwas.py index 49189b7..50627cc 100644 --- a/bin/args_gwas.py +++ b/bin/args_gwas.py @@ -190,7 +190,7 @@ arg_clust.add_argument('--sleep', type=int, metavar='SEC', - help='Number of seconds to delay on start of UGER jobs', + help='Number of seconds to delay on start of cluster jobs', required=False, default=30) arg_exloc.add_argument('--r-ex', diff --git a/bin/args_impute.py b/bin/args_impute.py index bda6621..de41a7b 100644 --- a/bin/args_impute.py +++ b/bin/args_impute.py @@ -294,7 +294,7 @@ arg_clust.add_argument('--sleep', type=int, metavar='SEC', - help='Number of seconds to delay on start of UGER jobs', + help='Number of seconds to delay on start of cluster jobs', required=False, default=30) arg_clust.add_argument('--full-pipe', diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 34233bd..5f84d35 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -37,7 +37,7 @@ from textwrap import dedent from args_impute import * from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format -from blueprint import send_job +from blueprint import send_job, init_sendjob_dict, save_job unbuffer_stdout() warnings.formatwarning = warn_format @@ -278,30 +278,59 @@ print 'List of missing chunks: %s' % tmp_chunk_file.name + ### # copy original submit script - # replace chunk list, name, number of tasks - orig_uger_file = open(str(outdot)+'.imp_chunks.sub.sh', 'r') - new_uger_file = open(str(outdot)+'.imp_chunks.resub_'+ str(nummiss)+'_chunks.sub.sh', 'w') + # replace chunk list, name, number of tasks, memory spec + # resubmit + ### + + # load pickle of job info + orig_job_conf = 'imp.chunks.'+str(outdot)+'.pkl' - for line in orig_uger_file: - if '#$ -t ' in line: - new_uger_file.write('#$ -t 1-'+str(nummiss)+'\n') - elif '#$ -l m_mem_free' in line: - new_uger_file.write('#$ -l m_mem_free=24g,h_vmem=24g \n') - elif '#$ -q short' in line: - new_uger_file.write('#$ -q long \n') - else: - line=line.replace(chunk_file_name, tmp_chunk_file_name) - line=line.replace('.$TASK_ID.','.tmp'+str(nummiss)+'.$TASK_ID.') - line=line.replace('#$ -N imp.chunks.'+str(outdot), '#$ -N imp.chunks.'+str(outdot)+'.resub_'+str(nummiss)) - new_uger_file.write(line) - - orig_uger_file.close() - new_uger_file.close() - - print ' '.join(['qsub',new_uger_file.name]) + '\n' - subprocess.check_call(' '.join(['qsub',new_uger_file.name]), shell=True) + if not os.path.isfile(orig_job_conf): + orig_job_file = str(outdot)+'.imp_chunks.sub.sh' + raise IOError("Unable to find previous job configuration pickle %s.\ + \nRefer to previous submit script %s to modify/resubmit.\n" % (str(orig_job_conf),str(orig_job_file))) + + + cmd_templ, job_dict, sendjob_dict = load_job(orig_job_conf) + + # rename resub + sendjob_dict['jobname'] = 'imp.chunks.'+str(outdot)+'.resub_'+str(nummiss) + + sendjob_dict['logname'] = str('imp.chunks.'+str(outdot)+'.resub_'+str(nummiss)+'.'+str(clust_conf['log_task_id'])+'.sub.log') + + # increase memory and walltime + # TODO: consider how to scale mem/time here + oldmem = sendjob_dict['mem'] + sendjob_dict['mem'] = int(oldmem)*2 + + oldtime = sendjob_dict['walltime'] + sendjob_dict['walltime'] = int(oldtime)*4 + + # replace chunk file and set number of new jobs + sendjob_dict['njobs'] = int(nummiss) + + job_dict['cfile'] = tmp_chunk_file_name + + + # re-save new settings (primarily to track updating mem and walltime) + save_job(jfile=orig_job_conf, cmd_templ=cmd_templ, job_dict=job_dict, sendjob_dict=sendjob_dict) + + + # submit + imp_cmd = 
cmd_templ.format(**job_dict) + + send_job(jobname=sendjob_dict['jobname'], + cmd=imp_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + sleep=sendjob_dict['sleep']) + print 'GWAS jobs resubmitted for %d chunks.\n' % nummiss + print '\n...Replacing this best-guess job in the queue...' @@ -367,7 +396,7 @@ # get number of chunks nchunks = len(chunks) -# fill in template +# info to fill in job template jobdict = {"task": "{task}", "sleep": str(args.sleep), "cfile": str(outdot)+'.chunks.txt', @@ -390,11 +419,25 @@ "trans": str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam.transl' } -bg_cmd = bg_templ.format(**jobdict) + +# store job information for possible resubs +job_store_file = 'bg.chunks.'+str(outdot)+'.pkl' + +clust_dict = init_sendjob_dict() +clust_dict['jobname'] = 'bg.chunks.'+str(outdot) +clust_dict['logname'] = str('bg.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log') +clust_dict['mem'] = 8000 +clust_dict['walltime'] = 2 +clust_dict['njobs'] = int(nchunks) +clust_dict['sleep'] = args.sleep + +save_job(jfile=job_store_file, cmd_templ=bg_templ, job_dict=jobdict, sendjob_dict=clust_dict) # submit # TODO: flex queue/mem reqs +bg_cmd = bg_templ.format(**jobdict) + send_job(jobname='bg.chunks.'+str(outdot), cmd=bg_cmd, logname=str('bg.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), diff --git a/bin/blueprint.py b/bin/blueprint.py index 5ccd47d..09074a7 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -291,6 +291,62 @@ def send_job(jobname, return 0 +#################################### +# +# Save / load job configurations +# +#################################### + +def init_sendjob_dict(): + + sendjob_dict = { + "jobname": None, +# "arrayfile": None, +# "cmd": None, + "logname": None, + "logloc": None, + "mem": None, + "walltime": None, + "njobs": None, + "maxpar": None, + "threads": None, + "wait_file": None, + "wait_name": None, +# "cluster": None, + "sleep": None, +# "testonly": None + } + + return sendjob_dict + + + +def save_job(jfile, cmd_templ, job_dict, sendjob_dict): + + import cPickle as pickle + + with open(jfile, 'wb') as pickle_out: + pickle.dump(cmd_templ, pickle_out, -1) + pickle.dump(job_dict, pickle_out, -1) + pickle.dump(sendjob_dict, pickle_out, -1) + + return 0 + + + +def load_job(jfile): + + import cPickle as pickle + + with open(jfile, 'rb') as pickle_in: + cmd_templ = pickle.load(pickle_in) + job_dict = pickle.load(pickle_in) + sendjob_dict = pickle.load(pickle_in) + + return cmd_templ, job_dict, sendjob_dict + + + #################################### # # Parse arguments from ricopili interface if invoked directly diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 8847d32..2a85f16 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -475,11 +475,26 @@ def find_chunk(snpchrom, snpbp, last_chunk): "rsc": str(args.rscript_ex) } -gwas_cmd = gwas_templ.format(**jobdict) +nchunk = len(chunks.keys()) + + +# store job information for possible resubs +job_store_file = 'gwas.chunks.'+str(outdot)+'.pkl' + +clust_dict = init_sendjob_dict() +clust_dict['jobname'] = 'gwas.chunks.'+str(outdot) +clust_dict['logname'] = str('gwas.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log') +clust_dict['mem'] = 4000 +clust_dict['walltime'] = 2 +clust_dict['njobs'] = int(nchunk) +clust_dict['maxpar'] = 200 +clust_dict['sleep'] = args.sleep + +save_job(jfile=job_store_file, cmd_templ=gwas_templ, job_dict=jobdict, sendjob_dict=clust_dict) # 
submit job -nchunk = len(chunks.keys()) +gwas_cmd = gwas_templ.format(**jobdict) send_job(jobname='gwas.chunks.'+str(outdot), cmd=gwas_cmd, diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 381a58b..025f475 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -169,8 +169,8 @@ os.chdir(shape_dir) # verify haven't already tried this resub - uger_phase_name = str(outdot)+'.shape.resub_'+str(num_chr)+'_chr.sub.sh' - if os.path.isfile(uger_phase_name): + phase_sub_name = 'shape.'+str(outdot)+'.resub_'+str(num_chr)+'.sub.sh' + if os.path.isfile(phase_sub_name): print '\n####################' print 'ERROR:' print 'Found previous attempt to resubmit %d failed chromosomes.' % int(num_chr) @@ -342,12 +342,29 @@ "out": str(outdot)+'.imp.${cname}', "seedtxt": str(seedtxt) } -cmd_imp = imp_templ.format(**jobdict) + # get number of chunks (-1 is for header) nchunks = file_len(outdot+'.chunks.txt')-1 + +# store job information for possible resubs +job_store_file = 'imp.chunks.'+str(outdot)+'.pkl' + +clust_dict = init_sendjob_dict() +clust_dict['jobname'] = 'imp.chunks.'+str(outdot) +clust_dict['logname'] = str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log') +clust_dict['mem'] = 8000 +clust_dict['walltime'] = 2 +clust_dict['njobs'] = int(nchunks) +clust_dict['sleep'] = args.sleep + +save_job(jfile=job_store_file, cmd_templ=imp_templ, job_dict=jobdict, sendjob_dict=clust_dict) + + # submit +cmd_imp = imp_templ.format(**jobdict) + send_job(jobname='imp.chunks.'+str(outdot), cmd=cmd_imp, logname=str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), diff --git a/bin/pca_rel.py b/bin/pca_rel.py index 7303b55..9969e57 100755 --- a/bin/pca_rel.py +++ b/bin/pca_rel.py @@ -38,7 +38,7 @@ # get directory containing current script -# (to get absolute path for uger wrapper) +# (to get absolute path for script directory) rp_bin = os.path.dirname(os.path.realpath(__file__)) ############# diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 92bd5e0..cc3ae4f 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -16,7 +16,7 @@ # 4) Split plink files by chr # - use shortened IDs # 5) Run SHAPEIT -# - using UGER to parallelize +# - parallelize on cluster # #################################### From f313ceefa666b8d49f3a6e9b84df1061add41f99 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 18:19:55 -0400 Subject: [PATCH 11/48] clean up py imports, add doc with dependencies --- bin/agg_gwas.py | 11 +++++------ bin/agg_imp.py | 9 +++++---- bin/args_chunks.py | 3 +-- bin/args_impute.py | 1 - bin/args_qc.py | 3 +-- bin/bg_imp.py | 12 ++++++------ bin/blueprint.py | 26 ++++++++++++++++++++++++++ bin/chunk_snps.py | 4 +--- bin/filter_ped.py | 7 +------ bin/final_file_check.py | 2 +- bin/gwas_dfam.py | 6 ++---- bin/gwas_gee.py | 4 +--- bin/gwas_rel.py | 8 ++++---- bin/imp2_rel.py | 13 ++++++++----- bin/impute_rel.py | 8 +++++--- bin/imus_pca.py | 2 +- bin/pca_rel.py | 2 +- bin/ped_confirm.py | 7 +------ bin/qc_rel.py | 3 +-- bin/shape_rel.py | 10 ++-------- bin/strict_qc.py | 4 ++-- docs/PYTHON.md | 34 ++++++++++++++++++++++++++++++++++ 22 files changed, 109 insertions(+), 70 deletions(-) create mode 100644 docs/PYTHON.md diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index 750f50a..c216701 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -36,13 +36,10 @@ import subprocess import argparse import gzip -# from warnings import warn -# from glob import glob from math import log10, sqrt -from args_gwas import * +from args_gwas import parserbase, parseragg from 
py_helpers import unbuffer_stdout, file_len, file_tail -from blueprint import send_job -# , read_conf, link +from blueprint import send_job, save_job, load_job, read_clust_conf unbuffer_stdout() @@ -103,7 +100,9 @@ # TODO: check dependencies - +# get cluster configuration +# needed for specifying logfile names with clust_conf['log_task_id'] +clust_conf = read_clust_conf() diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 0d670fb..5a285ca 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -26,11 +26,11 @@ ### load requirements import os import subprocess -from args_impute import * -from py_helpers import unbuffer_stdout, find_exec, test_exec, file_len #, file_tail, link, warn_format -from blueprint import send_job, load_job +import argparse +from args_impute import parserbase, parsercluster +from py_helpers import unbuffer_stdout, find_exec, file_len +from blueprint import send_job, load_job, save_job, read_clust_conf unbuffer_stdout() -# warnings.formatwarning = warn_format ############# if not (('-h' in sys.argv) or ('--help' in sys.argv)): @@ -80,6 +80,7 @@ # get directory containing current script # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) +clust_conf = read_clust_conf() # TODO: here diff --git a/bin/args_chunks.py b/bin/args_chunks.py index 2af8ac6..caf6e61 100644 --- a/bin/args_chunks.py +++ b/bin/args_chunks.py @@ -17,7 +17,6 @@ # imports import argparse -# import os @@ -96,4 +95,4 @@ 'Such chunks may occur due to sparse data (e.g. few SNPs ' + \ 'on the short arm of chr21) or could indicate bad chromosome build information.') -# eof \ No newline at end of file +# eof diff --git a/bin/args_impute.py b/bin/args_impute.py index de41a7b..e603462 100644 --- a/bin/args_impute.py +++ b/bin/args_impute.py @@ -18,7 +18,6 @@ # imports import argparse -# import os diff --git a/bin/args_qc.py b/bin/args_qc.py index c2fff9e..011acb7 100644 --- a/bin/args_qc.py +++ b/bin/args_qc.py @@ -17,7 +17,6 @@ # imports import argparse -# import os ############ @@ -215,4 +214,4 @@ help='Prevents setting mendelian errors to missing') -# eof \ No newline at end of file +# eof diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 5f84d35..ab08f04 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -32,12 +32,13 @@ ### load requirements import os -import subprocess import warnings +import argparse +from warnings import warn from textwrap import dedent -from args_impute import * -from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format -from blueprint import send_job, init_sendjob_dict, save_job +from args_impute import parserbase, parserbg, parsercluster +from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format, read_conf +from blueprint import send_job, init_sendjob_dict, save_job, load_job, read_clust_conf unbuffer_stdout() warnings.formatwarning = warn_format @@ -197,8 +198,7 @@ # needed for specifying logfile names with clust_conf['log_task_id'] conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) -cluster = configs['cluster'] -clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') +clust_conf = read_clust_conf() # TODO: here diff --git a/bin/blueprint.py b/bin/blueprint.py index 09074a7..1684d95 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -291,6 +291,32 @@ def send_job(jobname, return 0 +#################################### +# +# Get cluster configuration file +# +#################################### + +def read_clust_conf(): + + import os + + conf_file = 
os.environ['HOME']+"/picopili.conf" + configs = read_conf(conf_file) + cluster = configs['cluster'] + + pico_bin = os.path.dirname(os.path.realpath(__file__)) + clust_dir = os.path.dirname(pico_bin) + '/cluster_templates' + + assert os.path.isdir(clust_dir), "Unable to find cluster job submission template directory %s" % str(clust_dir) + + # load queue configuration info + # - submission syntax, queue names, job holds + clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') + + return clust_conf + + #################################### # # Save / load job configurations diff --git a/bin/chunk_snps.py b/bin/chunk_snps.py index 630f8d9..7a7601a 100755 --- a/bin/chunk_snps.py +++ b/bin/chunk_snps.py @@ -35,11 +35,9 @@ ############# import os -# import subprocess import argparse import copy -# from glob import glob -from args_chunks import * +from args_chunks import parserbase, parsersnpchunk from py_helpers import unbuffer_stdout, file_len, warn_format unbuffer_stdout() import warnings diff --git a/bin/filter_ped.py b/bin/filter_ped.py index 26b3e91..6b103e6 100755 --- a/bin/filter_ped.py +++ b/bin/filter_ped.py @@ -33,16 +33,11 @@ ### load requirements import os -# import subprocess import argparse -# from string import ascii_uppercase -# from glob import glob -# from numpy import digitize import random import warnings -from args_ped import * +from args_ped import parserbase, parsergeno, parseribd, parserweights from py_helpers import unbuffer_stdout -# file_len, test_exec, read_conf, find_from_path, link, gz_confirm unbuffer_stdout() diff --git a/bin/final_file_check.py b/bin/final_file_check.py index 57085e8..ed4ec9b 100755 --- a/bin/final_file_check.py +++ b/bin/final_file_check.py @@ -11,7 +11,7 @@ #################################### import argparse -from py_helpers import * +from py_helpers import file_check_email, unbuffer_stdout unbuffer_stdout() ### parse arguments diff --git a/bin/gwas_dfam.py b/bin/gwas_dfam.py index 7183dbd..6806060 100755 --- a/bin/gwas_dfam.py +++ b/bin/gwas_dfam.py @@ -39,10 +39,8 @@ import os import subprocess import argparse -# from glob import glob -from args_gwas import * +from args_gwas import parserbase,parsergwas,parsersoft from py_helpers import unbuffer_stdout, test_exec, find_exec -# , read_conf, link unbuffer_stdout() ############# @@ -179,4 +177,4 @@ print '\n############' print '\n' print 'SUCCESS!\n' -exit(0) \ No newline at end of file +exit(0) diff --git a/bin/gwas_gee.py b/bin/gwas_gee.py index 3cf9209..cc5b458 100755 --- a/bin/gwas_gee.py +++ b/bin/gwas_gee.py @@ -42,10 +42,8 @@ import subprocess import argparse from warnings import warn -# from glob import glob -from args_gwas import * +from args_gwas import parserbase, parsergwas, parsersoft from py_helpers import unbuffer_stdout, test_exec, find_from_path, file_len, find_exec -# , read_conf, link unbuffer_stdout() ############# diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 2a85f16..6fe0985 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -27,9 +27,9 @@ import os from warnings import warn from textwrap import dedent -from args_gwas import * -from py_helpers import link, unbuffer_stdout, find_exec -from blueprint import send_job +from args_gwas import parserbase, parsergwas, parserchunk, parseragg, parsersoft +from py_helpers import link, unbuffer_stdout, find_exec, read_conf +from blueprint import send_job, read_clust_conf, init_sendjob_dict, save_job unbuffer_stdout() @@ -146,7 +146,7 @@ conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) 
cluster = configs['cluster'] -clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') +clust_conf = read_clust_conf() # TODO: here diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 025f475..bd32661 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -26,10 +26,11 @@ ### load requirements import os import subprocess +import argparse from textwrap import dedent -from args_impute import * -from py_helpers import unbuffer_stdout, file_len, link, find_exec -from blueprint import send_job +from args_impute import parserbase, parserimpute, parserref, parserchunk, parsercluster +from py_helpers import unbuffer_stdout, file_len, link, find_exec, test_exec, read_conf +from blueprint import send_job, read_clust_conf, init_sendjob_dict, save_job unbuffer_stdout() @@ -97,7 +98,7 @@ conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) cluster = configs['cluster'] -clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') +clust_conf = read_clust_conf() # from config impute_ex = find_exec('impute2',key='i2loc') @@ -199,7 +200,9 @@ # '--output-max', outstem+'.phased.haps', outstem+'.phased.sample', # '--output-log', outstem+'.shape.log'] - # manage duohmm arg + # manage additional arg pieces + chrstem = str(args.bfile)+'.hg19.ch.fl.chr${chrom}' + outstem = str(outdot)+'.chr${chrom}' if extra_args.no_duohmm: duo_txt = '' else: diff --git a/bin/impute_rel.py b/bin/impute_rel.py index 332518c..651c2ea 100755 --- a/bin/impute_rel.py +++ b/bin/impute_rel.py @@ -25,8 +25,10 @@ ### load requirements import os -from args_impute import * -from py_helpers import unbuffer_stdout #, read_conf, file_tail, link, warn_format +import argparse + +from args_impute import parserbase, parserphase, parserimpute, parserchunk, parserref, parserbg, parsercluster +from py_helpers import unbuffer_stdout from blueprint import send_job unbuffer_stdout() @@ -96,7 +98,7 @@ if args.hard_call_th is None: print '--bg-th '+str(args.bg_th) else: - print '--hard-call-th '+str(hard_call_th) + print '--hard-call-th '+str(args.hard_call_th) print '--info-th '+str(args.info_th) print '--max-info-th '+str(args.max_info_th) if args.keep_mendel: diff --git a/bin/imus_pca.py b/bin/imus_pca.py index 6fee4bb..765aff2 100755 --- a/bin/imus_pca.py +++ b/bin/imus_pca.py @@ -36,7 +36,7 @@ import argparse from glob import glob from py_helpers import find_exec, unbuffer_stdout, test_exec -from args_pca import * +from args_pca import parserbase, parserpca unbuffer_stdout() ############# diff --git a/bin/pca_rel.py b/bin/pca_rel.py index 9969e57..0edafd6 100755 --- a/bin/pca_rel.py +++ b/bin/pca_rel.py @@ -31,7 +31,7 @@ import argparse import os from math import ceil -from args_pca import * +from args_pca import parserbase, parsergrid, parserqc, parserpca from py_helpers import file_len, unbuffer_stdout from blueprint import send_job unbuffer_stdout() diff --git a/bin/ped_confirm.py b/bin/ped_confirm.py index 681c55e..b201a64 100755 --- a/bin/ped_confirm.py +++ b/bin/ped_confirm.py @@ -35,14 +35,9 @@ import subprocess import argparse import re -# from string import ascii_uppercase -# from glob import glob -# from numpy import digitize -# import random import warnings -from args_ped import * +from args_ped import parserbase, parseribd, parserexloc from py_helpers import unbuffer_stdout, test_exec -# file_len, read_conf, find_from_path, link, gz_confirm unbuffer_stdout() diff --git a/bin/qc_rel.py b/bin/qc_rel.py index 8c0c556..ed759c9 100755 --- a/bin/qc_rel.py +++ b/bin/qc_rel.py @@ -54,8 +54,7 @@ import warnings 
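# The explicit imports above replace the old star-imports from the args_* modules,
# so each script now declares which shared argument groups it actually uses. A
# sketch of how such fragments are typically combined, assuming (as argparse
# requires for parents) that parserbase, parserpca, etc. are ArgumentParser
# objects created with add_help=False:
import argparse
from args_pca import parserbase, parserpca
parser = argparse.ArgumentParser(prog='imus_pca.py',
                                 parents=[parserbase, parserpca])
args = parser.parse_args()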
from time import strftime start_time = strftime("%H:%M:%S %d-%B-%Y") -# from glob import glob -from args_qc import * +from args_qc import parserbase, parserqc, parsermendel, parsertag from py_helpers import unbuffer_stdout, read_conf, test_exec, link, file_len, warn_format, find_exec unbuffer_stdout() warnings.formatwarning = warn_format diff --git a/bin/shape_rel.py b/bin/shape_rel.py index cc3ae4f..43a9419 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -38,15 +38,9 @@ import os import subprocess import argparse -# from string import ascii_uppercase -# from glob import glob -# from numpy import digitize -# import random -# import warnings -from args_impute import * -from py_helpers import unbuffer_stdout, link, find_exec #, test_exec +from args_impute import parserbase, parserphase, parserref, parsercluster +from py_helpers import unbuffer_stdout, link, find_exec, read_conf from blueprint import send_job -# file_len, read_conf, find_from_path, link, gz_confirm unbuffer_stdout() diff --git a/bin/strict_qc.py b/bin/strict_qc.py index 1e9a476..1ead067 100755 --- a/bin/strict_qc.py +++ b/bin/strict_qc.py @@ -42,8 +42,8 @@ import subprocess import argparse from glob import glob -from py_helpers import file_len, find_exec, unbuffer_stdout, test_exec -from args_pca import * +from py_helpers import file_len, find_exec, unbuffer_stdout +from args_pca import parserbase, parserqc unbuffer_stdout() ############# diff --git a/docs/PYTHON.md b/docs/PYTHON.md new file mode 100644 index 0000000..a960b96 --- /dev/null +++ b/docs/PYTHON.md @@ -0,0 +1,34 @@ +### Python Dependencies + +Picopili is built from a combination of Python, Perl, R, and *nix shell scripts. + +Most scripts depend only on packages from the Python Standard Library +([[https://docs.python.org/2/library/]]). In addition, `admix_rel.py` depends +on numpy. We strongly support using Anaconda ([[https://www.continuum.io/downloads]]) +to manage Python package dependencies, but a barebones installation of Python 2.X + numpy +should be sufficient for picopili in most cases. + +Scripts are primarily tested under Python 2.7 and Anaconda 2.1.0, but should be broadly +compatible with most Python 2.X versions. If you encounter compaitibility issues, +please contact rwalters(at)broadinstitute.org and we would be happy to assist. 
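A quick way to confirm that the interpreter and numpy meet these requirements
(an illustrative check only, not a picopili script):

    # run with the same python interpreter that will be used for picopili
    import sys
    import numpy
    print sys.version_info     # expect a 2.x release, ideally 2.7
    print numpy.__version__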
+ +##### Full list of package dependencies + +* argparse +* cPickle +* copy +* distutils +* glob +* gzip +* math +* numpy +* os +* random +* re +* string +* subprocess +* sys +* textwrap +* time +* warnings + From c996ca8d61b8e5fa692dea3288772fee468c7060 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 20:05:39 -0400 Subject: [PATCH 12/48] compress multiline commands when parallelizing --- bin/blueprint.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/bin/blueprint.py b/bin/blueprint.py index 1684d95..e8e39ac 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -10,6 +10,7 @@ #################################### import os +import stat import subprocess from textwrap import dedent from py_helpers import read_conf, file_len @@ -178,6 +179,14 @@ def send_job(jobname, # number of jobs to cover all tasks array_jobs = ceil(float(njobs)/float(task_mem_lim)) + # convert multi-line command to script + if len(cmd_line.splitlines()) > 1: + tmp_script = open('temp_cmd.'+str(jobname)+'.sh','w') + tmp_script.write(cmd_line) + tmp_script.close() + os.chmod(tmp_script.name, stat.S_IEXEC) + cmd_line = './'+tmp_script.name + # setup to do task_mem_lim jobs on each node # note: specified above that cmd_line uses ${tid} as task index par_tmp = dedent("""\ From 3d46a80fbc6faf1316088eb45c969113d98ee228 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Sat, 8 Oct 2016 02:12:14 +0200 Subject: [PATCH 13/48] list dependencies; first draft lisa config --- bin/config_pico.pl | 2 +- cluster_templates/lisa.conf | 12 ++++++++++++ cluster_templates/lisa.sub.sh | 18 ++++++++++++++++++ docs/DEPENDS.md | 18 ++++++++++++++++++ docs/PYTHON.md | 1 + 5 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 cluster_templates/lisa.conf create mode 100755 cluster_templates/lisa.sub.sh create mode 100644 docs/DEPENDS.md diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 9fb19df..442c2bb 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -787,7 +787,7 @@ () print "Successfully found ricopili plague and buigue reference files!\n" } -if ($haveref == 1 && $statusbin == 0){ +if ($haveref == 1 && $status_bin == 0){ print "\n### Finished ###\n\n"; } diff --git a/cluster_templates/lisa.conf b/cluster_templates/lisa.conf new file mode 100644 index 0000000..e880e01 --- /dev/null +++ b/cluster_templates/lisa.conf @@ -0,0 +1,12 @@ +hour_q None +hour2_q None +hour4_q None +day_q None +long_q None +sub_cmd qsub -d $PWD +log_task_id ${PBS_ARRAYID} +task_id ${PBS_ARRAYID} +hold_flag -W depend=afterany: +j_per_node 16 +array_mem_mb 32000 +project unspecified diff --git a/cluster_templates/lisa.sub.sh b/cluster_templates/lisa.sub.sh new file mode 100755 index 0000000..47b08d4 --- /dev/null +++ b/cluster_templates/lisa.sub.sh @@ -0,0 +1,18 @@ +#PBS -lwalltime={wall_hours}:00:00 +#PBS -lnodes=1 +#PBS -S /bin/bash +#PBS -N {job_name} +#PBS -j oe +#PBS -o {log_name} +::PICO_ARRAY_ONLY::#PBS -t 1-{array_jobs} + +# sleep option (for preventing race conditions on network file systems) +sleep {sleep_time} + +# setup resources +use R + +# main command line +{cmd_string} + +# eof diff --git a/docs/DEPENDS.md b/docs/DEPENDS.md new file mode 100644 index 0000000..73edeef --- /dev/null +++ b/docs/DEPENDS.md @@ -0,0 +1,18 @@ +### Software Dependencies + +Picopili largely serves as a wrapper for existing major software +for analyzing genome-wide genotype data. A full list of dependencies +is given below, along with links to their respective sources. 
+ +* ADMIXTURE (https://www.genetics.ucla.edu/software/admixture/) +* EIGENSOFT (https://www.hsph.harvard.edu/alkes-price/software/) +* IMPUTE2 (https://mathgen.stats.ox.ac.uk/impute/impute_v2.html) +* liftOver (http://genome.sph.umich.edu/wiki/LiftOver) +* PLINK2 (https://www.cog-genomics.org/plink2) +* R-enabled plink (1.07 [http://pngu.mgh.harvard.edu/~purcell/plink/], or dev branch of plink2 [above]) +* PRIMUS (https://primus.gs.washington.edu/primusweb/) +* R (https://www.r-project.org/) +* REAP (faculty.washington.edu/tathornt/software/REAP/) +* Rserve (https://rforge.net/Rserve/) +* SHAPEIT (www.shapeit.fr/) + diff --git a/docs/PYTHON.md b/docs/PYTHON.md index a960b96..551becc 100644 --- a/docs/PYTHON.md +++ b/docs/PYTHON.md @@ -25,6 +25,7 @@ please contact rwalters(at)broadinstitute.org and we would be happy to assist. * os * random * re +* stat * string * subprocess * sys From 04bc27cb942d7a288cb5f9dad49a72442e2c2b7b Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Sat, 8 Oct 2016 02:44:29 +0200 Subject: [PATCH 14/48] cluster hold, working directory configs --- bin/blueprint.py | 7 ++++--- cluster_templates/broad_uger.conf | 2 +- cluster_templates/lisa.conf | 4 ++-- cluster_templates/lisa.sub.sh | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bin/blueprint.py b/bin/blueprint.py index e8e39ac..708a900 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -99,12 +99,12 @@ def send_job(jobname, # job dependencies if wait_name is not None: - hold_str = clust_conf['hold_flag'] + ' ' + str(wait_name) + hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name)) elif wait_file is not None: with open(wait_file, 'r') as wait_fi: wait_name = wait_fi.readline() - hold_str = clust_conf['hold_flag'] + ' ' + str(wait_name) + hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name)) else: hold_str = "" @@ -254,7 +254,8 @@ def send_job(jobname, "log_task_id": str(clust_conf['log_task_id']), "queue_name": str(queue_name), "sleep_time": str(sleep), - "project": str(clust_conf['project']) + "project": str(clust_conf['project']), + "workdir": os.getcwd() } diff --git a/cluster_templates/broad_uger.conf b/cluster_templates/broad_uger.conf index 3ed149c..5901a07 100644 --- a/cluster_templates/broad_uger.conf +++ b/cluster_templates/broad_uger.conf @@ -6,7 +6,7 @@ long_q long sub_cmd qsub log_task_id $TASK_ID task_id ${SGE_TASK_ID} -hold_flag -hold_jid +hold_flag -hold_jid {hold_name} j_per_node 1 array_mem_mb 128000 project unspecified diff --git a/cluster_templates/lisa.conf b/cluster_templates/lisa.conf index e880e01..cfabff3 100644 --- a/cluster_templates/lisa.conf +++ b/cluster_templates/lisa.conf @@ -3,10 +3,10 @@ hour2_q None hour4_q None day_q None long_q None -sub_cmd qsub -d $PWD +sub_cmd qsub log_task_id ${PBS_ARRAYID} task_id ${PBS_ARRAYID} -hold_flag -W depend=afterany: +hold_flag -W depend=afterany:{hold_name} j_per_node 16 array_mem_mb 32000 project unspecified diff --git a/cluster_templates/lisa.sub.sh b/cluster_templates/lisa.sub.sh index 47b08d4..9575147 100755 --- a/cluster_templates/lisa.sub.sh +++ b/cluster_templates/lisa.sub.sh @@ -10,7 +10,7 @@ sleep {sleep_time} # setup resources -use R +cd {workdir} # main command line {cmd_string} From fe1088c07b2738692a31a194689349db703613ed Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 20:55:02 -0400 Subject: [PATCH 15/48] test of job number hold conditions --- bin/blueprint.py | 13 ++++++++----- bin/pca_rel.py | 34 ++++++++++++++++++---------------- bin/shape_rel.py | 17 
+++++++++-------- cluster_templates/lisa.conf | 2 +- 4 files changed, 36 insertions(+), 30 deletions(-) diff --git a/bin/blueprint.py b/bin/blueprint.py index 708a900..eeb2161 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -28,6 +28,7 @@ def send_job(jobname, threads=None, wait_file=None, wait_name=None, + wait_num=None, cluster=None, sleep=30, testonly=False): @@ -99,12 +100,12 @@ def send_job(jobname, # job dependencies if wait_name is not None: - hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name)) + hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name),hold_num=str(wait_num)) elif wait_file is not None: with open(wait_file, 'r') as wait_fi: wait_name = wait_fi.readline() - hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name)) + hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name),hold_num=str(wait_num)) else: hold_str = "" @@ -294,9 +295,11 @@ def send_job(jobname, if not testonly: p = subprocess.Popen(launch_str.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) out, err = p.communicate() - print out - return(p.returncode) - + if p.returncode is None or p.returncode == 0: + return out + else + raise EnvironmentError((p.returncode,err)) + else: return 0 diff --git a/bin/pca_rel.py b/bin/pca_rel.py index 0edafd6..7f6238f 100755 --- a/bin/pca_rel.py +++ b/bin/pca_rel.py @@ -166,14 +166,14 @@ strandambi_txt, allchr_txt]) -send_job(jobname=str('strictqc_'+args.out), - arrayfile=None, - cmd=str(strictqc_call), - logname=str('strictqc_'+args.out+'.sub.log'), - mem=2000, - walltime=2, - sleep=0, - testonly=args.test_sub) +jobres = send_job(jobname=str('strictqc_'+args.out), + arrayfile=None, + cmd=str(strictqc_call), + logname=str('strictqc_'+args.out+'.sub.log'), + mem=2000, + walltime=2, + sleep=0, + testonly=args.test_sub) ##### @@ -193,14 +193,15 @@ '--primus-ex', str(args.primus_ex) ]) -send_job(jobname=str('imuspca_'+args.out), - cmd=str(imuspca_call), - logname=str('imuspca_'+args.out+'.sub.log'), - mem=int(imus_mem)*1000, - walltime=168, # one week - wait_name=str('strictqc_'+args.out), - sleep=args.sleep, - testonly=args.test_sub) +jobres2 = send_job(jobname=str('imuspca_'+args.out), + cmd=str(imuspca_call), + logname=str('imuspca_'+args.out+'.sub.log'), + mem=int(imus_mem)*1000, + walltime=168, # one week + wait_name=str('strictqc_'+args.out), + wait_num=str(jobres), + sleep=args.sleep, + testonly=args.test_sub) ##### @@ -226,6 +227,7 @@ mem=100, walltime=1, wait_name=str('imuspca_'+args.out), + wait_num=str(jobres2), sleep=str(args.sleep), testonly=args.test_sub) diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 43a9419..09e9d3e 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -299,14 +299,14 @@ task_id = str(clust_conf['log_task_id']) # submit -send_job(jobname='shape.'+str(outdot), - cmd=' '.join(shape_call), - logname='shape.'+str(outdot)+'.chr'+task_id+'.sub.log', - mem=int(args.mem_req)*1000, - walltime=168, # week - njobs=22, - threads=int(args.threads), - sleep=str(args.sleep)) +jobres = send_job(jobname='shape.'+str(outdot), + cmd=' '.join(shape_call), + logname='shape.'+str(outdot)+'.chr'+task_id+'.sub.log', + mem=int(args.mem_req)*1000, + walltime=168, # week + njobs=22, + threads=int(args.threads), + sleep=str(args.sleep)) ### @@ -329,6 +329,7 @@ mem=8000, walltime=2, wait_name='shape.'+str(outdot), + wait_num=jobres, sleep=str(args.sleep)) diff --git a/cluster_templates/lisa.conf b/cluster_templates/lisa.conf index cfabff3..1db3e36 100644 --- a/cluster_templates/lisa.conf +++ 
b/cluster_templates/lisa.conf @@ -6,7 +6,7 @@ long_q None sub_cmd qsub log_task_id ${PBS_ARRAYID} task_id ${PBS_ARRAYID} -hold_flag -W depend=afterany:{hold_name} +hold_flag -W depend=afterany:{hold_num} j_per_node 16 array_mem_mb 32000 project unspecified From 4f038852b905b91f8d7b58897c171a5340733da6 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Sat, 8 Oct 2016 03:36:04 +0200 Subject: [PATCH 16/48] allow job number holds; adjust long job walltimes --- bin/agg_gwas.py | 19 ++++++++++--------- bin/agg_imp.py | 17 +++++++++-------- bin/bg_imp.py | 32 +++++++++++++++++--------------- bin/blueprint.py | 4 ++-- bin/gwas_rel.py | 19 ++++++++++--------- bin/imp2_rel.py | 32 +++++++++++++++++--------------- bin/impute_rel.py | 2 +- bin/pca_rel.py | 6 +++--- bin/shape_rel.py | 4 ++-- 9 files changed, 71 insertions(+), 64 deletions(-) diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index c216701..d456758 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -223,14 +223,14 @@ # submit gwas_cmd = cmd_templ.format(**job_dict) - send_job(jobname=sendjob_dict['jobname'], - cmd=gwas_cmd, - logname=sendjob_dict['logname'], - mem=sendjob_dict['mem'], - walltime=sendjob_dict['walltime'], - njobs=sendjob_dict['njobs'], - maxpar=sendjob_dict['maxpar'], - sleep=sendjob_dict['sleep']) + jobres = send_job(jobname=sendjob_dict['jobname'], + cmd=gwas_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + maxpar=sendjob_dict['maxpar'], + sleep=sendjob_dict['sleep']) print 'GWAS jobs resubmitted for %d chunks.\n' % nummiss @@ -245,8 +245,9 @@ cmd=' '.join(sys.argv[:]), logname=agg_log, mem=24000, - walltime=168, # week + walltime=30, wait_name='gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss), + wait_num=str(jobres).strip(), sleep=10) print '\n############' diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 5a285ca..77c4e8c 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -207,13 +207,13 @@ # submit bg_cmd = cmd_templ.format(**job_dict) - send_job(jobname=sendjob_dict['jobname'], - cmd=bg_cmd, - logname=sendjob_dict['logname'], - mem=sendjob_dict['mem'], - walltime=sendjob_dict['walltime'], - njobs=sendjob_dict['njobs'], - sleep=sendjob_dict['sleep']) + jobres = send_job(jobname=sendjob_dict['jobname'], + cmd=bg_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + sleep=sendjob_dict['sleep']) print 'Best-guess jobs resubmitted for %d chunks.\n' % nummiss @@ -229,8 +229,9 @@ cmd=' '.join(sys.argv[:]), logname=agg_log, mem=8000, - walltime=168, # week + walltime=30, wait_name='bg.chunks.'+str(outdot)+'.resub_'+str(nummiss), + wait_num=str(jobres).strip(), sleep=args.sleep) print '\n############' diff --git a/bin/bg_imp.py b/bin/bg_imp.py index ab08f04..0990123 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -321,13 +321,13 @@ # submit imp_cmd = cmd_templ.format(**job_dict) - send_job(jobname=sendjob_dict['jobname'], - cmd=imp_cmd, - logname=sendjob_dict['logname'], - mem=sendjob_dict['mem'], - walltime=sendjob_dict['walltime'], - njobs=sendjob_dict['njobs'], - sleep=sendjob_dict['sleep']) + jobres = send_job(jobname=sendjob_dict['jobname'], + cmd=imp_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + sleep=sendjob_dict['sleep']) print 'GWAS jobs resubmitted for %d chunks.\n' % nummiss @@ -345,6 +345,7 @@ mem=8000, walltime=2, # week 
wait_name='imp.chunks.'+str(outdot)+'.resub_'+str(nummiss), + wait_num=str(jobres).strip(), sleep=args.sleep) print '\n############' @@ -438,13 +439,13 @@ # TODO: flex queue/mem reqs bg_cmd = bg_templ.format(**jobdict) -send_job(jobname='bg.chunks.'+str(outdot), - cmd=bg_cmd, - logname=str('bg.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), - mem=8000, - walltime=2, - njobs=int(nchunks), - sleep=args.sleep) +jobres2 = send_job(jobname='bg.chunks.'+str(outdot), + cmd=bg_cmd, + logname=str('bg.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=8000, + walltime=2, + njobs=int(nchunks), + sleep=args.sleep) print 'Best-guess jobs submitted for %d chunks.\n' % nchunks @@ -467,8 +468,9 @@ cmd=next_call, logname=agg_log, mem=8000, - walltime=168, # week + walltime=30, wait_name='bg.chunks.'+str(outdot), + wait_num=str(jobres2).strip(), sleep=args.sleep) # finish diff --git a/bin/blueprint.py b/bin/blueprint.py index eeb2161..d2aba3e 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -297,8 +297,8 @@ def send_job(jobname, out, err = p.communicate() if p.returncode is None or p.returncode == 0: return out - else - raise EnvironmentError((p.returncode,err)) + else: + raise EnvironmentError((p.returncode,err,out)) else: return 0 diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 6fe0985..9bb199e 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -496,14 +496,14 @@ def find_chunk(snpchrom, snpbp, last_chunk): # submit job gwas_cmd = gwas_templ.format(**jobdict) -send_job(jobname='gwas.chunks.'+str(outdot), - cmd=gwas_cmd, - logname=str('gwas.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), - mem=4000, - walltime=2, - njobs=int(nchunk), - maxpar=200, - sleep=args.sleep) +jobres = send_job(jobname='gwas.chunks.'+str(outdot), + cmd=gwas_cmd, + logname=str('gwas.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=4000, + walltime=2, + njobs=int(nchunk), + maxpar=200, + sleep=args.sleep) print 'GWAS jobs submitted for %d chunks.\n' % nchunk @@ -565,8 +565,9 @@ def find_chunk(snpchrom, snpbp, last_chunk): cmd=' '.join(agg_call), logname=agg_log, mem=4000, - walltime=168, # week + walltime=30, wait_name='gwas.chunks.'+str(outdot), + wait_num=str(jobres).strip(), sleep=args.sleep) # TODO: diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index bd32661..151d871 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -225,14 +225,14 @@ shape_cmd = cmd_templ.format(**jobdict) # submit - send_job(jobname='shape.'+str(outdot)+'.resub_'+str(num_chr), - cmd=shape_cmd, - logname='shape.'+str(outdot)+'.resub_'+str(num_chr)+'.sub.'+str(clust_conf['log_task_id'])+'.log', - mem=int(extra_args.mem_req)*1000, - walltime=168, # week - njobs=int(num_chr), - threads=extra_args.threads, - sleep=args.sleep) + jobres = send_job(jobname='shape.'+str(outdot)+'.resub_'+str(num_chr), + cmd=shape_cmd, + logname='shape.'+str(outdot)+'.resub_'+str(num_chr)+'.sub.'+str(clust_conf['log_task_id'])+'.log', + mem=int(extra_args.mem_req)*1000, + walltime=30, + njobs=int(num_chr), + threads=extra_args.threads, + sleep=args.sleep) print 'Pre-phasing jobs re-submitted for %d chromosomes.\n' % num_chr @@ -250,6 +250,7 @@ mem=8000, walltime=2, # week wait_name='shape.'+str(outdot)+'.resub_'+str(num_chr), + wait_num=str(jobres).strip(), sleep=args.sleep) print '\n############' @@ -368,13 +369,13 @@ # submit cmd_imp = imp_templ.format(**jobdict) -send_job(jobname='imp.chunks.'+str(outdot), - cmd=cmd_imp, - 
logname=str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), - mem=8000, - walltime=2, - njobs=int(nchunks), - sleep=args.sleep) +jobres2 = send_job(jobname='imp.chunks.'+str(outdot), + cmd=cmd_imp, + logname=str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=8000, + walltime=2, + njobs=int(nchunks), + sleep=args.sleep) print 'Imputation jobs submitted for %d chunks.\n' % nchunks @@ -399,6 +400,7 @@ mem=8000, walltime=2, # week wait_name='imp.chunks.'+str(outdot), + wait_num=str(jobres2).strip(), sleep=args.sleep) diff --git a/bin/impute_rel.py b/bin/impute_rel.py index 651c2ea..254cfbe 100755 --- a/bin/impute_rel.py +++ b/bin/impute_rel.py @@ -145,7 +145,7 @@ cmd=next_call, logname=shape_log, mem=int(args.mem_req * 1000), - walltime=168, # week + walltime=30, sleep=args.sleep) diff --git a/bin/pca_rel.py b/bin/pca_rel.py index 7f6238f..ac401ff 100755 --- a/bin/pca_rel.py +++ b/bin/pca_rel.py @@ -197,9 +197,9 @@ cmd=str(imuspca_call), logname=str('imuspca_'+args.out+'.sub.log'), mem=int(imus_mem)*1000, - walltime=168, # one week + walltime=30, wait_name=str('strictqc_'+args.out), - wait_num=str(jobres), + wait_num=str(jobres).strip(), sleep=args.sleep, testonly=args.test_sub) @@ -227,7 +227,7 @@ mem=100, walltime=1, wait_name=str('imuspca_'+args.out), - wait_num=str(jobres2), + wait_num=str(jobres2).strip(), sleep=str(args.sleep), testonly=args.test_sub) diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 09e9d3e..5943789 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -303,7 +303,7 @@ cmd=' '.join(shape_call), logname='shape.'+str(outdot)+'.chr'+task_id+'.sub.log', mem=int(args.mem_req)*1000, - walltime=168, # week + walltime=30, njobs=22, threads=int(args.threads), sleep=str(args.sleep)) @@ -329,7 +329,7 @@ mem=8000, walltime=2, wait_name='shape.'+str(outdot), - wait_num=jobres, + wait_num=str(jobres).strip(), sleep=str(args.sleep)) From 991b0968d273733641679b5546a8283d54187c58 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Sat, 8 Oct 2016 03:52:27 +0200 Subject: [PATCH 17/48] add R to lisa environ for primus --- cluster_templates/lisa.sub.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/cluster_templates/lisa.sub.sh b/cluster_templates/lisa.sub.sh index 9575147..8451e4d 100755 --- a/cluster_templates/lisa.sub.sh +++ b/cluster_templates/lisa.sub.sh @@ -11,6 +11,7 @@ sleep {sleep_time} # setup resources cd {workdir} +module load R # main command line {cmd_string} From 1e2ce65c9b9d633c211486ce9134ecd50f948205 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Thu, 13 Oct 2016 00:06:30 +0200 Subject: [PATCH 18/48] remove refs to unused files; improve ref downloads; avoid implicating Stephan for my .pl changes; misc bugfixes --- bin/admix_rel.py | 4 +-- bin/args_pca.py | 2 +- bin/blueprint.py | 2 +- bin/checkflip_pico.pl | 40 ++++------------------- bin/checkpos_pico.pl | 26 +++++---------- bin/get_refs.sh | 14 +++++--- bin/imp_prep.pl | 76 ++++++++++++++++++++++--------------------- bin/lift_to_hg19.pl | 4 +-- bin/uger.sub.sh | 22 ------------- bin/uger_array.sub.sh | 29 ----------------- 10 files changed, 70 insertions(+), 149 deletions(-) delete mode 100755 bin/uger.sub.sh delete mode 100755 bin/uger_array.sub.sh diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 5f29981..faa689e 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -144,13 +144,13 @@ metavar='PATH', help='path to ADMIXTURE executable', required=False, - default="/humgen/atgu1/fs03/shared_resources/shared_software/bin/admixture") + 
default=None) arg_exloc.add_argument('--reap-ex', type=str, metavar='PATH', help='path to REAP executable', required=False, - default="/humgen/atgu1/fs03/shared_resources/shared_software/bin/REAP") + default=None) args = parser.parse_args() diff --git a/bin/args_pca.py b/bin/args_pca.py index 61d2353..59442a0 100644 --- a/bin/args_pca.py +++ b/bin/args_pca.py @@ -190,7 +190,7 @@ # metavar='PATH', # help='path to smartpca executable', # required=False, -# default="/humgen/atgu1/fs03/shared_resources/shared_software/EIG6.0beta_noreq/bin/smartpca") +# default=None) diff --git a/bin/blueprint.py b/bin/blueprint.py index d2aba3e..e5535f6 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -492,7 +492,7 @@ def load_job(jfile): # get queue conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) - queue = configs['queue'] + queue = configs['cluster'] # set logfile name if args.noerr: diff --git a/bin/checkflip_pico.pl b/bin/checkflip_pico.pl index 3bb3d08..3eb333b 100755 --- a/bin/checkflip_pico.pl +++ b/bin/checkflip_pico.pl @@ -12,6 +12,8 @@ # # 01/14/10 # +# Adapted for Picopili by Raymond Walters, rwalters@broadinstitute.org +# # # # checks alelles of a bim-file (plink-binary-dataset) with reference-info (created with refinfo) @@ -65,10 +67,9 @@ sub trans { my $dfcol = 3; ## chr-col in reference -my $info_file = "HM3.info"; +my $info_file = ""; my $refdir = ""; -my $lisadir = "/home/gwas/pgc-samples/hapmap_ref/"; my $frq_th = .15; my $subdir = "flip_subdir"; @@ -82,7 +83,7 @@ sub trans { version: $version --refdir STRING location of reference-directory, default $refdir - --ploc STRING location of plink-binary (default is found at Broad) + --ploc STRING location of plink-binary (default is found from picopili.conf) default: $p2loc --info STRING other info-file (absolute path) -> overwrites --refdir --subdir STRING subdir, to put end-dataset into, default: $subdir @@ -103,21 +104,6 @@ sub trans { # --replace replace old dataset with new one - - for Broadies: the files are currently stored here (this script should be able to find them): - $refdir - - /fg/debakkerscratch/ripke/hapmap_ref/subchr/infosum.sorted - -here 1KG - /home/radon01/sripke/bakker_ripke/hapmap_ref/impute2_ref/1KG_Mar12/ALL_1000G_phase1integrated_feb2012_impute/subchr/sumfrq.eur - - - - on Lisa, th files are found here: $lisadir - - created by Stephan Ripke 2010: sripke\@broadinstitute.org - "; use Cwd; @@ -138,23 +124,11 @@ sub trans { die "$usage\n" if ($help); -if ($info_file eq "HM3.info") { - - unless (-e "$refdir/$info_file") { - if (-e "$lisadir/$info_file"){ - $refdir = $lisadir; - } - else { - print "check reference dir and permissions for <$refdir/$info_file>\n"; - exit; - } - } - - $info_file = "$refdir/$info_file"; +if ($info_file eq "") { + die "$usage\n"; } - else { - die "couldn't find $info_file" unless (-e $info_file); + die "couldn't find info-file $info_file" unless (-e $info_file); } if ($dcolstr) { diff --git a/bin/checkpos_pico.pl b/bin/checkpos_pico.pl index 76a2661..5bd8116 100755 --- a/bin/checkpos_pico.pl +++ b/bin/checkpos_pico.pl @@ -8,10 +8,13 @@ # # checkpos6 # -# created by Stephan Ripke, Broadinstitute, sripke@broadinstitute.org +# Created by Stephan Ripke, Broadinstitute, sripke@broadinstitute.org # # 01/14/10 # +# Adapted for Picopili by Raymond Walters, rwalters(at)broadinstitute.org +# +# # # # checks position bim-file (plink-binary-dataset) with dbsnp reference @@ -74,9 +77,7 @@ sub trans { my $subdir = "dbsnp_subdir"; my $home_dir = "$ENV{HOME}"; -my $dbsnp_file = 
"/psych/genetics_data/ripke/references_from_debakkerscratch/ref_db/sorted_dbsnp_positions_129_b36";## created from this one /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -my $dbsnp_file_lisa = "/home/gwas/1KG_reference/sorted_dbsnp_positions_129_b36"; ## including sed 's/\tY\t/\t24\t/' sorted_dbsnp_positions_129_b36 | sed 's/\tX\t/\t23\t/' | sed 's/MT/26/' - +my $dbsnp_file = ""; my $usage = " @@ -84,11 +85,8 @@ sub trans { version: $version - --dbsnp STRING dbSNP reference file (default is found at Broad) - default: $dbsnp_file - or: $dbsnp_file_lisa - HM3: /home/gwas/pgc-samples/hapmap_ref/infosum.annot.markers.sorted - --ploc STRING location of plink-binary (default is found at Broad) + --dbsnp STRING dbSNP reference file (created by readref) + --ploc STRING location of plink-binary (default read from picopili.conf) default: $p2loc --col INT,INT,INT snp-col,chr-col,kb-col in bim-file: default: $scol,$ccol,$kcol --dbcol INT,INT,INT snp-col,chr-col,kb-col in dbsnp-file: default: $dscol,$dccol,$dkcol @@ -101,8 +99,6 @@ sub trans { --exmulti, --nokeep and --subdir are in effect, as long as --ncreate is not switched - created by Stephan Ripke 2010: sripke\@broadinstitute.org - "; use Getopt::Long; @@ -130,13 +126,7 @@ sub trans { } unless (-e $dbsnp_file) { - if (-e $dbsnp_file_lisa) { - $dbsnp_file = $dbsnp_file_lisa; - } - else { - print "*** Error, dbSNP file not found\n"; - exit; - } + die "*** Error, dbSNP file not found\n"; } diff --git a/bin/get_refs.sh b/bin/get_refs.sh index 3ce2d5a..4231361 100755 --- a/bin/get_refs.sh +++ b/bin/get_refs.sh @@ -24,7 +24,7 @@ echo " " # setup rp_conf="$HOME/ricopili.conf" -SERVER="https://personal.broadinstitute.org/rwalters/picopili_files/" +SERVER="https://personal.broadinstitute.org/rwalters/picopili_files" SCRIPT=$(readlink -f "$0") BINLOC=$(dirname "$SCRIPT") LIBLOC=`echo $(dirname "$BINLOC")"/lib"` @@ -118,7 +118,7 @@ if [ "$to_dl" = 'true' ]; then echo "WARNING: Preparing to download reference files from:" echo "$SERVER" echo " " - echo "Expected total file size is ~275 MB, minus existing" + echo "Expected total file size is ~300 MB, minus existing" echo "files already linked/downloaded." echo " " echo "If you do not have web access, or if you do not want" @@ -133,7 +133,10 @@ if [ "$to_dl" = 'true' ]; then if [ "$finame" = "last" ]; then continue else - wget "$SERVER/$finame" "$LIBLOC/buigue/$finame" + echo " " + echo "Next file: $SERVER/$finame" + # wget --no-check-certificate "$SERVER/$finame" "$LIBLOC/buigue/$finame" + curl -o "$LIBLOC/buigue/$finame" "$SERVER/$finame" fi done for finame in ${hmfiles[@]}; do @@ -141,7 +144,10 @@ if [ "$to_dl" = 'true' ]; then if [ "$finame" = "last" ]; then continue else - wget "$SERVER/$finame" "$LIBLOC/plague/$finame" + echo " " + echo "Next file: $SERVER/$finame" + # wget --no-check-certificate "$SERVER/$finame" "$LIBLOC/plague/$finame" + curl -o "$LIBLOC/plague/$finame" "$SERVER/$finame" fi done fi diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index 1afc2ee..a707037 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -210,7 +210,7 @@ } } if ( $scr_path eq '') { - push @miss_scripts, "cp /home/unix/sripke/bin/$scr_name ./\n"; + push @miss_scripts, "$scr_name\n"; print "!!Error!! : No $scr_name command available\n" ; } @@ -219,18 +219,20 @@ if (@miss_scripts > 0) { - if (-e "get_scripts_on_broad.txt") { - print "please remove this file and restart: get_scripts_on_broad.txt\n"; - } - die $! 
unless open FILE1, "> get_scripts_on_broad.txt"; + +# if (-e "get_scripts_on_broad.txt") { +# print "please remove this file and restart: get_scripts_on_broad.txt\n"; +# } + die $! unless open FILE1, "> missing_picopili_scripts.txt"; foreach (@miss_scripts) { print FILE1 "$_"; } close FILE1; + die "Missing required scripts. See missing_picopili_scripts.txt\n"; - print "exiting now -> have a look at get_scripts_on_broad.txt\n"; - exit; +# print "exiting now -> have a look at get_scripts_on_broad.txt\n"; +# exit; } @@ -422,36 +424,36 @@ sub send_jobarray { $now =~ s/ /_/g; - if ($sjaname eq "finished") { - - my $fini_message ; - $fini_message .= "\n\n##################################################################\n"; - $fini_message .= "##### CONGRATULATIONS: \n"; - $fini_message .= "##### rp_pipeline finished successfully:\n"; - $fini_message .= "##### $sjainfotxt\n"; - $fini_message .= "##### now start with PCA (see README in subdir pcaer_sub/)\n"; - $fini_message .= "##### or directly with postimputation analysis\n"; - $fini_message .= "##### have a look at the wiki page\n"; - $fini_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n"; - $fini_message .= "##################################################################\n"; - print "$fini_message\n"; - - - die $! unless open SUC, "> success_file"; - print SUC $fini_message."\n"; - close SUC; - - if($email_on){ - &mysystem ('cat success_file | '.$mutt_script.' -s RP_pipeline_finished '.$email) ; - } - - my $sjarow = $sjainfotxt."\t$sjaname\t$now"; - &a2filenew_app("$sjainfofile",$sjarow); - - - exit; - - } +# if ($sjaname eq "finished") { +# +# my $fini_message ; +# $fini_message .= "\n\n##################################################################\n"; +# $fini_message .= "##### CONGRATULATIONS: \n"; +# $fini_message .= "##### rp_pipeline finished successfully:\n"; +# $fini_message .= "##### $sjainfotxt\n"; +# $fini_message .= "##### now start with PCA (see README in subdir pcaer_sub/)\n"; +# $fini_message .= "##### or directly with postimputation analysis\n"; +# $fini_message .= "##### have a look at the wiki page\n"; +# $fini_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n"; +# $fini_message .= "##################################################################\n"; +# print "$fini_message\n"; +# +# +# die $! unless open SUC, "> success_file"; +# print SUC $fini_message."\n"; +# close SUC; +# +# if($email_on){ +# &mysystem ('cat success_file | '.$mutt_script.' 
-s RP_pipeline_finished '.$email) ; +# } +# +# my $sjarow = $sjainfotxt."\t$sjaname\t$now"; +# &a2filenew_app("$sjainfofile",$sjarow); +# +# +# exit; +# +# } chdir ($sjadir); diff --git a/bin/lift_to_hg19.pl b/bin/lift_to_hg19.pl index 7ac1726..e18d626 100755 --- a/bin/lift_to_hg19.pl +++ b/bin/lift_to_hg19.pl @@ -12,6 +12,8 @@ # # 01/14/10 # +# Adapted for Picopili by Raymond Walters, rwalters(at)broadinstitute.org +# # # # lifts a plink binary from hg18 to hg19 @@ -84,8 +86,6 @@ here a seletion of lilofiles: $liloc - created by Stephan Ripke 2010: sripke\@broadinstitute.org - "; diff --git a/bin/uger.sub.sh b/bin/uger.sub.sh deleted file mode 100755 index 494d9df..0000000 --- a/bin/uger.sub.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash -#$ -j y -#$ -cwd -#$ -V - -# wrapper script for job submission on Broad UGER cluster -# -# first parameter should be duration for 'sleep' before -# execution -# remainder of command line should be the job to be -# submitted (including all agruments) -# -# The -V flag above will provoke a warning that -# LD_LIBRARY_PATH won't be used for security reasons; -# this warning can be safely ignored - -source /broad/software/scripts/useuse -reuse -q Anaconda -sleep $1 -shift -"$@" -# eof diff --git a/bin/uger_array.sub.sh b/bin/uger_array.sub.sh deleted file mode 100755 index 8109e27..0000000 --- a/bin/uger_array.sub.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -#$ -j y -#$ -cwd -#$ -V - -# wrapper script for job submission on Broad UGER cluster -# -# first parameter should be duration for 'sleep' before -# execution -# remainder of command line should be the job to be -# submitted (including all agruments) -# -# The -V flag above will provoke a warning that -# LD_LIBRARY_PATH won't be used for security reasons; -# this warning can be safely ignored - -# use for task arrays -# tasknum=$SGE_TASK_ID - -source /broad/software/scripts/useuse -reuse -q Anaconda -sleep $1 -shift - -inp="$@" -call=${inp//\$tasknum/$SGE_TASK_ID} -$call - -# eof From 0d25cb5528f4ef42e4b4b4cbe49277a60f475d45 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Thu, 13 Oct 2016 00:09:46 +0200 Subject: [PATCH 19/48] obfuscate email --- bin/checkflip_pico.pl | 2 +- bin/config_pico.pl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/checkflip_pico.pl b/bin/checkflip_pico.pl index 3eb333b..679a4b0 100755 --- a/bin/checkflip_pico.pl +++ b/bin/checkflip_pico.pl @@ -12,7 +12,7 @@ # # 01/14/10 # -# Adapted for Picopili by Raymond Walters, rwalters@broadinstitute.org +# Adapted for Picopili by Raymond Walters, rwalters(at)broadinstitute.org # # # diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 442c2bb..b9c7767 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -673,7 +673,7 @@ () if ($shell eq "bash-login-check"){$shell = "bash";} if ($shell ne "bash" && $shell ne "tcsh") { print "Warning! 
Shell not recognized: $shell\n"; - print "Please send email to rwalters\@broadinstitute.org\n"; + print "Please send email to rwalters(at)broadinstitute.org\n"; } print "Detected you are using the following shell: $shell\n\n"; From 44df0df53e2caa833218ed61c060e652b3fad3d1 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 13 Oct 2016 13:24:33 -0400 Subject: [PATCH 20/48] add admixture projection option --- bin/admix_rel.py | 395 +++++++++++++++++++++++++++-------------------- 1 file changed, 230 insertions(+), 165 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index faa689e..4ef504d 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -203,26 +203,38 @@ plinkx = find_exec('plink',key='p2loc') -if args.rscript_ex == None or args.rscript_ex == "None": +if args.rscript_ex is None or args.rscript_ex == "None": args.rscript_ex = find_exec('Rscript', key='rscloc') -if args.admixture_ex == None or args.admixture_ex == "None": - args.admixture_ex = find_exec('admixture', key='admloc') - -if args.reap_ex == None or args.reap_ex == "None": +if args.reap_ex is None or args.reap_ex == "None": args.reap_ex = find_exec('REAP', key='reaploc') rp_bin = os.path.dirname(os.path.realpath(__file__)) Rplotibdx = rp_bin+'/plot_reap_ibd.Rscript' if plot_pca: - Rplotibdx = rp_bin+'/plot_pca.Rscript' + Rplotpcax = rp_bin+'/plot_pca.Rscript' + + +# either have admixture file, or need to run admixture +run_admix = True +if args.admix_p is not None and args.admix_q is not None and args.admix_p != "" and args.admix_q != "": + if args.admixture_ex is None or args.admixture_ex == "None": + args.admixture_ex = find_exec('admixture', key='admloc') + + test_exec(args.admixture_ex, 'ADMIXTURE') + + assert '/' not in args.unrel_bfile, "--unrel-bfile must specify only a file stem, not a path" + run_admix = False + +else: + assert os.path.isfile(args.admix_p), "Admixture .P file %s does not exist." % str(args.admix_p) + assert os.path.isfile(args.admix_q), "Admixture .P file %s does not exist." % str(args.admix_q) # verify executables test_exec(plinkx, 'Plink') test_exec(args.rscript_ex, 'Rscript') -test_exec(args.admixture_ex, 'ADMIXTURE') test_exec(args.reap_ex, 'REAP') # pca file @@ -231,7 +243,6 @@ assert '/' not in args.target_bfile, "--plot-admix-pca must specify only a file, not a path" # verify bfiles are files, not paths -assert '/' not in args.unrel_bfile, "--unrel-bfile must specify only a file stem, not a path" assert '/' not in args.target_bfile, "--target-bfile must specify only a file stem, not a path" @@ -272,136 +283,172 @@ print '\n...Running Admixture on unrelated dataset...' ############# -admix_call = [args.admixture_ex, - str(args.unrel_bfile+'.bed'), - str(args.npops), - '-j'+str(args.multithread_cores)] -admix_unrel_log = open(str('admix_'+args.out+'_unrel.log'), 'w') - -print str(' '.join(admix_call)) -print 'Logging to ' + admix_unrel_log.name + '\n' -subprocess.check_call(admix_call, stdout=admix_unrel_log) - -admix_unrel_log.close() +if run_admix: + admix_call = [args.admixture_ex, + str(args.unrel_bfile+'.bed'), + str(args.npops), + '-j'+str(args.multithread_cores)] + admix_unrel_log = open(str('admix_'+args.out+'_unrel.log'), 'w') + + print str(' '.join(admix_call)) + print 'Logging to ' + admix_unrel_log.name + '\n' + subprocess.check_call(admix_call, stdout=admix_unrel_log) + + admix_unrel_log.close() +if args.use_exemplars: -############# -print '\n...Selecting exemplars for each ancestral population...' 
-############# -# - identify population assignment (including "-") for each input individual -# - confirm whether there are enough IDs assigned to each populations -# - match population assignments to FID/IIDs -# - write .pops file for target bfile, .pops.info file - -# label for populations are popA, popB, popC, ... -popnames = [str('pop'+ascii_uppercase[i]) for i in range(args.npops)] - -# define function returning popname or '-' based on largest proportion -# Note: ties broken in favor of first pop listed in names (possible if th <= 0.5) -def maxpop(props, names, th): - whichmax = props.index(max(props)) - if props[whichmax] > th: - outpop = names[whichmax] - else: - outpop = '-' - return outpop - -# get list of selected pop for each individual in admixture results -ind_pops = [] -admix_pops_file = str(args.unrel_bfile+'.'+str(args.npops)+'.Q') -with open(admix_pops_file, 'r') as f: - # map() required to read probs as float instead of string - ind_pops = [maxpop(props=map(float,line.split()), names=popnames, th=args.prop_th) for line in f] - -# sanity check parsing -nfam = file_len(str(args.unrel_bfile+'.fam')) -if len(ind_pops) != nfam: - raise ValueError('Number of individuals parsed from admixture results (%d in %s) ' + \ - 'and fam file of unrelateds (%d in %s) do not match.' % (len(ind_pops), admix_pops_file, int(nfam), str(args.unrel_bfile+'.fam'))) - -# check have sufficient exemplars -popcounts = [ind_pops.count(popnames[i]) for i in range(args.npops)] -lackingpops = [popcounts[i] < args.min_exemplar for i in range(args.npops)] - -print 'Exemplars per population:' -for i in range(args.npops): - print str(popnames[i] + ': ' + str(popcounts[i])) -print 'Unassigned: '+str(ind_pops.count('-')) - -if any(lackingpops): - print '\n###########\n' - print 'ERROR: One or more populations with insufficient number of exemplars (<'+str(args.min_exemplar)+').' - print '\nConsider rerunning with fewer ancestral populations (here: '+str(args.npops)+'), \n' + \ - 'a looser threshold for selecting population exemplars (here: '+str(args.prop_th)+'), \n' + \ - 'or fewer required exemplars per ancestral population in the unrelated set ' + \ - '(here :'+str(args.min_exemplar)+').\n' - exit(1) - - -### match exemplar pop status with FID/IIDs, record in dict -pop_dict = {} - -# process fam file by line -ref_fam = open(str(args.unrel_bfile+'.fam'), 'r') -idnum=0 -for line in ref_fam: - # iterate line counter, used to get elements from ind_pops[] - idnum += 1 - - # read - (fid, iid, pat, mat, sex, phen) = line.split() - - # use FID:IID identifier as key to record pop status - bfile_id = fid +':'+ iid - pop_dict[bfile_id] = ind_pops[idnum-1] - -ref_fam.close() - - -### create pop file to match target fam file, pop info file -target_fam = open(str(args.target_bfile+'.fam'), 'r') -target_pop = open(str(args.target_bfile+'.pop'), 'w') -target_popinfo = open(str(args.target_bfile+'.pop.info'), 'w') - -for line in target_fam: - - # read - (targetfid, targetiid, pat, mat, sex, phen) = line.split() - target_id = targetfid +':'+ targetiid - - # check dict - if target_id in pop_dict: - target_pop.write(pop_dict[target_id] + '\n') - target_popinfo.write(targetfid + ' ' + targetiid + ' ' + target_id + ' unrel ' + pop_dict[target_id] + '\n') + ############# + print '\n...Selecting exemplars for each ancestral population...' 
+ ############# + # - identify population assignment (including "-") for each input individual + # - confirm whether there are enough IDs assigned to each populations + # - match population assignments to FID/IIDs + # - write .pops file for target bfile, .pops.info file + + # label for populations are popA, popB, popC, ... + popnames = [str('pop'+ascii_uppercase[i]) for i in range(args.npops)] + + # define function returning popname or '-' based on largest proportion + # Note: ties broken in favor of first pop listed in names (possible if th <= 0.5) + def maxpop(props, names, th): + whichmax = props.index(max(props)) + if props[whichmax] > th: + outpop = names[whichmax] + else: + outpop = '-' + return outpop + + # get list of selected pop for each individual in admixture results + ind_pops = [] + + if run_admix: + admix_pops_file = str(args.unrel_bfile+'.'+str(args.npops)+'.Q') else: - target_pop.write('-' + '\n') - target_popinfo.write(targetfid + ' ' + targetiid + ' ' + target_id + ' target ' + '-' + '\n') - - -target_fam.close() -target_pop.close() -target_popinfo.close() - + admix_pops_file = args.admix_q + + + with open(admix_pops_file, 'r') as f: + # map() required to read probs as float instead of string + ind_pops = [maxpop(props=map(float,line.split()), names=popnames, th=args.prop_th) for line in f] + + # sanity check parsing + nfam = file_len(str(args.unrel_bfile+'.fam')) + if len(ind_pops) != nfam: + raise ValueError('Number of individuals parsed from admixture results (%d in %s) ' + \ + 'and fam file of unrelateds (%d in %s) do not match.' % (len(ind_pops), admix_pops_file, int(nfam), str(args.unrel_bfile+'.fam'))) + + # check have sufficient exemplars + popcounts = [ind_pops.count(popnames[i]) for i in range(args.npops)] + lackingpops = [popcounts[i] < args.min_exemplar for i in range(args.npops)] + + print 'Exemplars per population:' + for i in range(args.npops): + print str(popnames[i] + ': ' + str(popcounts[i])) + print 'Unassigned: '+str(ind_pops.count('-')) + + if any(lackingpops): + print '\n###########\n' + print 'ERROR: One or more populations with insufficient number of exemplars (<'+str(args.min_exemplar)+').' 
+ print '\nConsider rerunning with fewer ancestral populations (here: '+str(args.npops)+'), \n' + \ + 'a looser threshold for selecting population exemplars (here: '+str(args.prop_th)+'), \n' + \ + 'or fewer required exemplars per ancestral population in the unrelated set ' + \ + '(here :'+str(args.min_exemplar)+').\n' + exit(1) + + + ### match exemplar pop status with FID/IIDs, record in dict + pop_dict = {} + + # process fam file by line + ref_fam = open(str(args.unrel_bfile+'.fam'), 'r') + idnum=0 + for line in ref_fam: + # iterate line counter, used to get elements from ind_pops[] + idnum += 1 + + # read + (fid, iid, pat, mat, sex, phen) = line.split() + + # use FID:IID identifier as key to record pop status + bfile_id = fid +':'+ iid + pop_dict[bfile_id] = ind_pops[idnum-1] + + ref_fam.close() + + + ### create pop file to match target fam file, pop info file + target_fam = open(str(args.target_bfile+'.fam'), 'r') + target_pop = open(str(args.target_bfile+'.pop'), 'w') + target_popinfo = open(str(args.target_bfile+'.pop.info'), 'w') + + for line in target_fam: + + # read + (targetfid, targetiid, pat, mat, sex, phen) = line.split() + target_id = targetfid +':'+ targetiid + + # check dict + if target_id in pop_dict: + target_pop.write(pop_dict[target_id] + '\n') + target_popinfo.write(targetfid + ' ' + targetiid + ' ' + target_id + ' unrel ' + pop_dict[target_id] + '\n') + else: + target_pop.write('-' + '\n') + target_popinfo.write(targetfid + ' ' + targetiid + ' ' + target_id + ' target ' + '-' + '\n') + + + target_fam.close() + target_pop.close() + target_popinfo.close() -############# -print '\n...Running supervised admixture analysis in target data...' -############# -admix_super_call = [args.admixture_ex, - str(args.target_bfile+'.bed'), - str(args.npops), - '-j'+str(args.multithread_cores), - '--supervised'] -admix_target_log = open(str('admix_'+args.out+'_target.log'), 'w') + ############# + print '\n...Running supervised admixture analysis in target data...' + ############# + + admix_super_call = [args.admixture_ex, + str(args.target_bfile+'.bed'), + str(args.npops), + '-j'+str(args.multithread_cores), + '--supervised'] + admix_target_log = open(str('admix_'+args.out+'_target.log'), 'w') + + print str(' '.join(admix_super_call)) + print 'Logging to ' + admix_target_log.name + '\n' + subprocess.check_call(admix_super_call, stdout=admix_target_log) + + admix_target_log.close() + -print str(' '.join(admix_super_call)) -print 'Logging to ' + admix_target_log.name + '\n' -subprocess.check_call(admix_super_call, stdout=admix_target_log) -admix_target_log.close() +# no exemplars, using projection instead +else: + + ############# + print '\n...Projecting admixture analysis to target data...' 
+ ############# + + ref_p_name = str(args.target_bfile)+'.'+str(args.npops)+'.P.in' + if run_admix: + link(str(args.unrel_bfile)+'.'+str(args.npops)+'.P', ref_p_name, 'admixture allele freqs') + else: + ref_p_in = str(args.admix_p) + link(wd+'/'+ref_p_in, ref_p_name,'input admixture allele freqs') + + admix_project_call = [args.admixture_ex, + '-P', str(args.target_bfile+'.bed'), + str(args.npops), + '-j'+str(args.multithread_cores)] + admix_target_log = open(str('admix_'+args.out+'_target.log'), 'w') + + print str(' '.join(admix_project_call)) + print 'Logging to ' + admix_target_log.name + '\n' + subprocess.check_call(admix_super_call, stdout=admix_target_log) + + admix_target_log.close() ############# @@ -426,7 +473,7 @@ def maxpop(props, names, th): target_fam_nam = str(args.target_bfile + '.fam') if not (file_len(target_Qfile_nam) == file_len(target_fam_nam)): - raise ValueError('Length of admixture proportions ouput (%s) does not match fam file (%s). ' + \ + raise ValueError('Length of admixture proportions output (%s) does not match fam file (%s). ' + \ 'Error during output?' % (target_Qfile_nam, target_fam_nam)) # paste together columns, should be in same order (based on ADMIXTURE's ouptut format) @@ -514,12 +561,15 @@ def maxpop(props, names, th): # setup file streams for plotinfo files pop_info_files = [] - exemp_info_files = [] + if args.use_exemplars: + exemp_info_files = [] for i in xrange(args.npops): pop_info_files.append( open(str(args.target_bfile) + '.' + popnames[i] + '.admixture.plotinfo.txt', 'w') ) pop_info_files[i].write('FID IID col pch layer\n') - exemp_info_files.append( open(str(args.target_bfile) + '.' + popnames[i] + '.exemplar.plotinfo.txt', 'w') ) - exemp_info_files[i].write('FID IID col pch layer\n') + + if args.use_exemplars: + exemp_info_files.append( open(str(args.target_bfile) + '.' 
+ popnames[i] + '.exemplar.plotinfo.txt', 'w') ) + exemp_info_files[i].write('FID IID col pch layer\n') # parse admixture proportions reap_mix_props = open(str(args.target_bfile + '.props.tmp.txt'), 'r') @@ -543,26 +593,29 @@ def maxpop(props, names, th): pop_info_files[i].write(' '.join([fid, iid, bin_col, str(1), str(in_bin)])+'\n') # exemplar info file: FID, IID, col, pch, layer - if joinid in pop_dict: - if pop_dict[joinid] == popnames[i]: - exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(exemplar_color)+'\"', str(exemplar_pch), str(3)]) + '\n') + if args.use_exemplars: + if joinid in pop_dict: + if pop_dict[joinid] == popnames[i]: + exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(exemplar_color)+'\"', str(exemplar_pch), str(3)]) + '\n') + else: + exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(ref_color)+'\"', str(ref_pch), str(2)]) + '\n') else: - exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(ref_color)+'\"', str(ref_pch), str(2)]) + '\n') - else: - exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(other_color)+'\"', str(other_pch), str(1)]) + '\n') + exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(other_color)+'\"', str(other_pch), str(1)]) + '\n') # close plotinfo files for i in xrange(args.npops): pop_info_files[i].close() - exemp_info_files[i].close() + if args.use_exemplars: + exemp_info_files[i].close() # create legend files: col, pch, fill, text (either col/pch or fill should be NA) - exem_legend = open(str(args.target_bfile) + '.exemplar.legend.txt', 'w') - exem_legend.write('col pch fill text\n') - exem_legend.write(str(exemplar_color) + ' ' + str(exemplar_pch) + ' NA ' + '\"Population exemplar\"\n') - exem_legend.write(str(ref_color) + ' ' + str(ref_pch) + ' NA ' + '\"Reference set\"\n') - exem_legend.write(str(other_color) + ' ' + str(other_pch) + ' NA ' + '\"Non-reference set\"\n') - exem_legend.close() + if args.use_exemplars: + exem_legend = open(str(args.target_bfile) + '.exemplar.legend.txt', 'w') + exem_legend.write('col pch fill text\n') + exem_legend.write(str(exemplar_color) + ' ' + str(exemplar_pch) + ' NA ' + '\"Population exemplar\"\n') + exem_legend.write(str(ref_color) + ' ' + str(ref_pch) + ' NA ' + '\"Reference set\"\n') + exem_legend.write(str(other_color) + ' ' + str(other_pch) + ' NA ' + '\"Non-reference set\"\n') + exem_legend.close() prop_legend = open(str(args.target_bfile) + '.admixture.legend.txt', 'w') prop_legend.write('col pch fill text\n') @@ -572,16 +625,17 @@ def maxpop(props, names, th): ### generate plots for i in xrange(args.npops): - r_pca_ex_log = open(str(args.out) + '.' + popnames[i] + '.plot_exemplars.log', 'w') - subprocess.check_call([Rplotpcax, - str(args.plot_admix_pca), - str(args.target_bfile) + '.' + popnames[i] + '.exemplar.plotinfo.txt', - str(args.target_bfile) + '.exemplar.legend.txt', - str(3), - str(args.out) + '.' + popnames[i] + '.exemplars'], - stderr=subprocess.STDOUT, - stdout=r_pca_ex_log) - r_pca_ex_log.close() + if args.use_exemplars: + r_pca_ex_log = open(str(args.out) + '.' + popnames[i] + '.plot_exemplars.log', 'w') + subprocess.check_call([Rplotpcax, + str(args.plot_admix_pca), + str(args.target_bfile) + '.' + popnames[i] + '.exemplar.plotinfo.txt', + str(args.target_bfile) + '.exemplar.legend.txt', + str(3), + str(args.out) + '.' + popnames[i] + '.exemplars'], + stderr=subprocess.STDOUT, + stdout=r_pca_ex_log) + r_pca_ex_log.close() r_pca_admix_log = open(str(args.out) + '.' 
+ popnames[i] + '.plot_admixture.log', 'w') subprocess.check_call([Rplotpcax, @@ -612,7 +666,10 @@ def maxpop(props, names, th): str(args.out+'.plot_pca_files.tar.gz')] + \ glob(args.target_bfile+".*.admixture.plotinfo.txt") + \ [str(args.target_bfile)+".admixture.legend.txt"] + \ - glob(args.out+".*.plot_admixture.log") + \ + glob(args.out+".*.plot_admixture.log")) + + subprocess.check_call(["tar", "-zcvf", + str(args.out+'.plot_exemplar_files.tar.gz')] + \ glob(args.target_bfile+".*.exemplar.plotinfo.txt") + \ [str(args.target_bfile)+".exemplar.legend.txt"] + \ glob(args.out+".*.plot_exemplars.log") ) @@ -621,22 +678,27 @@ def maxpop(props, names, th): subprocess.check_call(['rm'] + glob(args.target_bfile+".*.admixture.plotinfo.txt")) subprocess.check_call(['rm'] + glob(args.target_bfile+".admixture.legend.txt")) subprocess.check_call(['rm'] + glob(args.out+".*.plot_admixture.log")) - subprocess.check_call(['rm'] + glob(args.target_bfile+".*.exemplar.plotinfo.txt")) - subprocess.check_call(['rm'] + glob(args.target_bfile+".exemplar.legend.txt")) - subprocess.check_call(['rm'] + glob(args.out+".*.plot_exemplars.log")) + + if args.use_exemplars: + subprocess.check_call(['rm'] + glob(args.target_bfile+".*.exemplar.plotinfo.txt")) + subprocess.check_call(['rm'] + glob(args.target_bfile+".exemplar.legend.txt")) + subprocess.check_call(['rm'] + glob(args.out+".*.plot_exemplars.log")) ### print '\nZipping Admixture output files:' ### + gz_confirm(str(args.target_bfile)+'.'+str(args.npops)+'.P', str(args.target_bfile)+'.'+str(args.npops)+'.P.gz', force=False) gz_confirm(str(args.target_bfile)+'.'+str(args.npops)+'.Q', str(args.target_bfile)+'.'+str(args.npops)+'.Q.gz', force=False) - gz_confirm(str(args.unrel_bfile)+'.'+str(args.npops)+'.P', - str(args.unrel_bfile)+'.'+str(args.npops)+'.P.gz', force=False) - gz_confirm(str(args.unrel_bfile)+'.'+str(args.npops)+'.Q', - str(args.unrel_bfile)+'.'+str(args.npops)+'.Q.gz', force=False) + + if run_admix: + gz_confirm(str(args.unrel_bfile)+'.'+str(args.npops)+'.P', + str(args.unrel_bfile)+'.'+str(args.npops)+'.P.gz', force=False) + gz_confirm(str(args.unrel_bfile)+'.'+str(args.npops)+'.Q', + str(args.unrel_bfile)+'.'+str(args.npops)+'.Q.gz', force=False) ### @@ -655,8 +717,11 @@ def maxpop(props, names, th): ### subprocess.check_call(['rm', '-v', str(args.target_bfile)+'.tmp_recode.tped', - str(args.target_bfile)+'.tmp_recode.tfam', - str(args.target_bfile)+'.props.tmp.txt']) + str(args.target_bfile)+'.tmp_recode.tfam']) + + if plot_pca: + subprocess.check_call(['rm', '-v', + str(args.target_bfile)+'.props.tmp.txt']) ### print '\nRemove if exist:' From e0512bf9e385b0b71ccf4a74bbaecebe94443c9a Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 13 Oct 2016 14:02:20 -0400 Subject: [PATCH 21/48] arguments for admix projection vs exemplars --- bin/admix_rel.py | 70 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 4ef504d..46e9882 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -65,8 +65,23 @@ type=str, metavar='FILESTEM', help='File stem for plink bed/bim/fam files ' + \ - 'with unrelated individuals to estimate admixture.', - required=True) + 'with unrelated individuals to estimate admixture.' + \ + 'Must specify either this or --admix-p.', + required=False) +arg_base.add_argument('--admix-p', + type=str, + metavar='FILE', + help='Admixture results .P file from sample of ' + \ + 'unrelated individuals. 
Can alternatively specify ' + \ + '--unrel-bfile to run this initial admixture.', + required=False) +arg_base.add_argument('--admix-q', + type=str, + metavar='FILE', + help='Admixture results .Q file from sample of ' + \ + 'unrelated individuals. Required only if using ' + \ + '--admix-p and --use-exemplars.', + required=False) arg_base.add_argument('--target-bfile', type=str, metavar='FILESTEM', @@ -89,13 +104,19 @@ arg_base.add_argument('--no-cleanup', action='store_true', help='skip cleanup of interim files') - arg_admix.add_argument('--npops', type=int, metavar='INT', help='Number of ancestral populations for admixture', required=False, default=4) +arg_admix.add_argument('--use-exemplars', + action='store_true', + help='Determine admixture in target sample based on ' + \ + 'supervised fit with a selection of population exemplars ' + \ + 'rather than a project of admixture solution in unrelateds. ' + \ + '(Required for ADMIXTURE version < 1.3). Requires --unrel-bfile, ' + \ + 'and if using --admix-p also requires specifying --admix-q.') arg_admix.add_argument('--prop-th', type=float, metavar='FLOAT', @@ -203,6 +224,11 @@ plinkx = find_exec('plink',key='p2loc') +if args.admixture_ex is None or args.admixture_ex == "None": + args.admixture_ex = find_exec('admixture', key='admloc') + +test_exec(args.admixture_ex, 'ADMIXTURE') + if args.rscript_ex is None or args.rscript_ex == "None": args.rscript_ex = find_exec('Rscript', key='rscloc') @@ -215,22 +241,33 @@ if plot_pca: Rplotpcax = rp_bin+'/plot_pca.Rscript' - -# either have admixture file, or need to run admixture +# check if running admixture for unrelateds run_admix = True +if args.admix_p is not None and args.admix_p != "": + run_admix = False + +else: + assert os.path.isfile(args.admix_p), "Admixture .P file %s does not exist." % str(args.admix_p) + + if args.use_exemplars: + assert os.path.isfile(args.admix_q), "Admixture .Q file %s does not exist." % str(args.admix_q) -if args.admix_p is not None and args.admix_q is not None and args.admix_p != "" and args.admix_q != "": - if args.admixture_ex is None or args.admixture_ex == "None": - args.admixture_ex = find_exec('admixture', key='admloc') - test_exec(args.admixture_ex, 'ADMIXTURE') +# check if have unrel-bfile if needed: +if args.unrel_bfile is None or args.unrel_bfile == "": - assert '/' not in args.unrel_bfile, "--unrel-bfile must specify only a file stem, not a path" - run_admix = False + if run_admix: + raise parser.error('Must specify either --unrel-bfile or --admix-p.') + + if args.use_exemplars: + raise parser.error('Must specify --unrel-bfile to define exemplars for --use-exemplars.') else: - assert os.path.isfile(args.admix_p), "Admixture .P file %s does not exist." % str(args.admix_p) - assert os.path.isfile(args.admix_q), "Admixture .P file %s does not exist." % str(args.admix_q) + assert '/' not in args.unrel_bfile, "--unrel-bfile must specify only a file stem, not a path" + assert os.path.isfile(str(args.unrel_bfile)+'.bed'), "bed file for unrelated individuals %s does not exist." % str(args.unrel_bfile)+'.bed' + assert os.path.isfile(str(args.unrel_bfile)+'.bim'), "bim file for unrelated individuals %s does not exist." % str(args.unrel_bfile)+'.bim' + assert os.path.isfile(str(args.unrel_bfile)+'.fam'), "fam file for unrelated individuals %s does not exist." 
% str(args.unrel_bfile)+'.fam' + # verify executables test_exec(plinkx, 'Plink') @@ -264,9 +301,10 @@ os.chdir(args.outdir) # link plink files (with verification) -link(str(wd+'/'+args.unrel_bfile+'.bed'), str(args.unrel_bfile+'.bed'), 'bed file for unrelated individuals') -link(str(wd+'/'+args.unrel_bfile+'.bim'), str(args.unrel_bfile+'.bim'), 'bim file for unrelated individuals') -link(str(wd+'/'+args.unrel_bfile+'.fam'), str(args.unrel_bfile+'.fam'), 'fam file for unrelated individuals') +if run_admix or args.use_exemplars: + link(str(wd+'/'+args.unrel_bfile+'.bed'), str(args.unrel_bfile+'.bed'), 'bed file for unrelated individuals') + link(str(wd+'/'+args.unrel_bfile+'.bim'), str(args.unrel_bfile+'.bim'), 'bim file for unrelated individuals') + link(str(wd+'/'+args.unrel_bfile+'.fam'), str(args.unrel_bfile+'.fam'), 'fam file for unrelated individuals') link(str(wd+'/'+args.target_bfile+'.bed'), str(args.target_bfile+'.bed'), 'bed file for target individuals') link(str(wd+'/'+args.target_bfile+'.bim'), str(args.target_bfile+'.bim'), 'bim file for target individuals') From 2e9b3c10ba6644d721f6830e4fb783b077fe9b2f Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Thu, 13 Oct 2016 20:34:15 +0200 Subject: [PATCH 22/48] admix logging and bugfix; blueprint big_mem option --- bin/admix_rel.py | 18 +++++++++++++----- bin/blueprint.py | 6 ++++++ cluster_templates/broad_uger.conf | 1 + cluster_templates/lisa.conf | 1 + cluster_templates/lisa.sub.sh | 2 +- 5 files changed, 22 insertions(+), 6 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 46e9882..4c8ada3 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -206,15 +206,23 @@ # print settings print 'Using settings:' -print '--unrel-bfile '+args.unrel_bfile +if args.unrel_bfile is not None and args.unrel_bfile != "": + print '--unrel-bfile '+args.unrel_bfile +if args.admix_p is not None and args.admix_p != "": + print '--admix-p '+args.admix_p print '--target-bfile '+args.target_bfile print '--out '+args.out print '--outdir '+args.outdir print '--npops '+str(args.npops) -print '--prop-th '+str(args.prop_th) -print '--min-exemplar '+str(args.min_exemplar) +if args.use_exemplars: + print '--min-exemplar '+str(args.min_exemplar) + if args.admix_q is not None and args.admix_q != "": + print '--admix-q '+args.admix_q + print '--prop-th '+str(args.prop_th) + print '--min-exemplar '+str(args.min_exemplar) print '--min-rel '+str(args.min_rel) -print '--plot-admix-pca '+str(args.plot_admix_pca) +if args.plot_admix_pca is not None and args.plot_admix_pca != "": + print '--plot-admix-pca '+str(args.plot_admix_pca) ############# @@ -484,7 +492,7 @@ def maxpop(props, names, th): print str(' '.join(admix_project_call)) print 'Logging to ' + admix_target_log.name + '\n' - subprocess.check_call(admix_super_call, stdout=admix_target_log) + subprocess.check_call(admix_project_call, stdout=admix_target_log) admix_target_log.close() diff --git a/bin/blueprint.py b/bin/blueprint.py index e5535f6..352a555 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -79,6 +79,11 @@ def send_job(jobname, else: mem_gb = str(1) + if mem > 30000: + mem_txt = str(clust_conf['big_mem_txt']) + else: + mem_txt = "" + # multithreading arguments if threads is None: threads = 1 @@ -244,6 +249,7 @@ def send_job(jobname, "log_name": str(logloc)+'/'+str(logname), "mem_in_mb": str(mem_mb), "mem_in_gb": str(mem_gb), + "big_mem_txt": str(mem_txt), "threads": str(threads), "total_threads": str(tot_threads), "wall_hours": str(walltime), diff --git 
a/cluster_templates/broad_uger.conf b/cluster_templates/broad_uger.conf index 5901a07..3c80210 100644 --- a/cluster_templates/broad_uger.conf +++ b/cluster_templates/broad_uger.conf @@ -9,4 +9,5 @@ task_id ${SGE_TASK_ID} hold_flag -hold_jid {hold_name} j_per_node 1 array_mem_mb 128000 +big_mem_txt None project unspecified diff --git a/cluster_templates/lisa.conf b/cluster_templates/lisa.conf index 1db3e36..8145e16 100644 --- a/cluster_templates/lisa.conf +++ b/cluster_templates/lisa.conf @@ -9,4 +9,5 @@ task_id ${PBS_ARRAYID} hold_flag -W depend=afterany:{hold_num} j_per_node 16 array_mem_mb 32000 +big_mem_txt :mem64gb project unspecified diff --git a/cluster_templates/lisa.sub.sh b/cluster_templates/lisa.sub.sh index 8451e4d..a2aab7c 100755 --- a/cluster_templates/lisa.sub.sh +++ b/cluster_templates/lisa.sub.sh @@ -1,5 +1,5 @@ #PBS -lwalltime={wall_hours}:00:00 -#PBS -lnodes=1 +#PBS -lnodes=1{big_mem_txt} #PBS -S /bin/bash #PBS -N {job_name} #PBS -j oe From c433f15b933007e335b5a850461c141ff1f070c0 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 13 Oct 2016 18:54:55 -0400 Subject: [PATCH 23/48] more platform agnostic impute reference args; adjust impute logging --- bin/agg_imp.py | 4 +-- bin/args_impute.py | 18 ++++++----- bin/bg_imp.py | 7 +++-- bin/imp2_rel.py | 7 +++-- bin/impute_rel.py | 74 +++++++++++++++++++++++++++++++++++++++++++++- bin/shape_rel.py | 7 +++-- 6 files changed, 98 insertions(+), 19 deletions(-) diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 77c4e8c..80a01a3 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -27,7 +27,7 @@ import os import subprocess import argparse -from args_impute import parserbase, parsercluster +from args_impute import parserbase, parsercluster, parserjob from py_helpers import unbuffer_stdout, find_exec, file_len from blueprint import send_job, load_job, save_job, read_clust_conf unbuffer_stdout() @@ -39,7 +39,7 @@ parser = argparse.ArgumentParser(prog='agg_imp.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), - parents=[parserbase, parsercluster]) + parents=[parserbase, parsercluster, parserjob]) args, extra_args = parser.parse_known_args() diff --git a/bin/args_impute.py b/bin/args_impute.py index e603462..057d27b 100644 --- a/bin/args_impute.py +++ b/bin/args_impute.py @@ -89,9 +89,10 @@ help='gzipped file of reference information, with columns ' + \ '"id","position","a0","a1", and $popname, where $popname' + \ 'contains the allele frequency for the "a1" allele. Can ' + \ - 'include "###" in place of chromosome number (as in default).', + 'include "###" in place of chromosome number. Expected format ' + \ + 'is from 1000GP_Phase3_chr###.legend.gz files from IMPUTE.', required=False, - default='/humgen/atgu1/fs03/shared_resources/1kG/shapeit/1000GP_Phase3_chr###.legend.gz') + default='1000GP_Phase3_chr###.legend.gz') arg_shape.add_argument('--window', type=float, metavar='FLOAT', @@ -197,25 +198,25 @@ metavar='FILENAME', help='Genomic maps. To specify files split by chromosome, use "###" to indicate chromosome number (see default).', required=False, - default='/humgen/atgu1/fs03/shared_resources/1kG/shapeit/genetic_map/genetic_map_chr###_combined_b37.txt') + default='genetic_map_chr###_combined_b37.txt') arg_ref.add_argument('--ref-haps', type=str, metavar='FILENAME', help='Imputation reference .hap.gz file for shapeit and impute2. 
Can use "###" to indicate chromosome number (see default).', required=False, - default='/humgen/atgu1/fs03/shared_resources/1kG/shapeit/1000GP_Phase3_chr###.hap.gz') + default='1000GP_Phase3_chr###.hap.gz') arg_ref.add_argument('--ref-legs', type=str, metavar='FILENAME', help='Imputation reference .legend.gz file for shapeit and impute2. Can use "###" to indicate chromosome number (see default).', required=False, - default='/humgen/atgu1/fs03/shared_resources/1kG/shapeit/1000GP_Phase3_chr###.legend.gz') + default='1000GP_Phase3_chr###.legend.gz') arg_ref.add_argument('--ref-samps', type=str, metavar='FILENAME', help='Imputation reference .sample file for shapeit and impute2. Can use "###" to indicate chromosome number.', required=False, - default='/humgen/atgu1/fs03/shared_resources/1kG/shapeit/1000GP_Phase3.sample') + default='1000GP_Phase3.sample') ############ # @@ -296,7 +297,10 @@ help='Number of seconds to delay on start of cluster jobs', required=False, default=30) -arg_clust.add_argument('--full-pipe', + +parserjob = argparse.ArgumentParser(add_help=False) +arg_job = parserjob.add_argument_group('Job Submission Settings') +arg_job.add_argument('--full-pipe', action='store_true', help='Proceed through full imputation pipeline', required=False) diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 0990123..8e90f4c 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -36,7 +36,7 @@ import argparse from warnings import warn from textwrap import dedent -from args_impute import parserbase, parserbg, parsercluster +from args_impute import parserbase, parserbg, parsercluster, parserjob from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format, read_conf from blueprint import send_job, init_sendjob_dict, save_job, load_job, read_clust_conf unbuffer_stdout() @@ -49,7 +49,7 @@ parser = argparse.ArgumentParser(prog='bg_imp.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), - parents=[parserbase, parserbg, parsercluster]) + parents=[parserbase, parserbg, parsercluster, parserjob]) args, extra_args = parser.parse_known_args() @@ -180,7 +180,8 @@ print '\nCluster settings:' print '--sleep '+str(args.sleep) - +if args.full_pipe: + print '--full-pipe' ############# diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 151d871..7239008 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -28,7 +28,7 @@ import subprocess import argparse from textwrap import dedent -from args_impute import parserbase, parserimpute, parserref, parserchunk, parsercluster +from args_impute import parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob from py_helpers import unbuffer_stdout, file_len, link, find_exec, test_exec, read_conf from blueprint import send_job, read_clust_conf, init_sendjob_dict, save_job unbuffer_stdout() @@ -41,7 +41,7 @@ parser = argparse.ArgumentParser(prog='imp2_rel.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), - parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster]) + parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob]) args, extra_args = parser.parse_known_args() @@ -86,7 +86,8 @@ print '\nCluster settings:' print '--sleep '+str(args.sleep) - +if args.full_pipe: + print '--full-pipe' ############# diff --git a/bin/impute_rel.py b/bin/impute_rel.py index 254cfbe..f312917 100755 --- a/bin/impute_rel.py +++ b/bin/impute_rel.py @@ -27,7 +27,7 @@ import os import argparse -from args_impute import 
parserbase, parserphase, parserimpute, parserchunk, parserref, parserbg, parsercluster +from args_impute import parserbase, parserphase, parserimpute, parserchunk, parserref, arg_ref, parserbg, parsercluster from py_helpers import unbuffer_stdout from blueprint import send_job unbuffer_stdout() @@ -36,6 +36,15 @@ if not (('-h' in sys.argv) or ('--help' in sys.argv)): print '\n...Parsing arguments...' ############# + +arg_ref.add_argument('--ref-dir', + type=str, + metavar='DIRECTORY', + help='Directory containing imputation reference files (haps, legends, sample, and maps). ' + + 'Used as prefix for specifying full paths of --ref-maps, --ref-haps, --ref-legs, and --ref-samps', + required=False, + default=None) + parser = argparse.ArgumentParser(prog='impute_rel.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), @@ -43,10 +52,72 @@ args = parser.parse_args() +if args.ref_dir is not None: + # verify exists + assert os.path.isdir(args.ref_dir), "Failed to find imputation reference directory %s" % args.ref_dir + + # prepend to references accordingly + args.ref_maps = str(args.ref_dir) +'/' + args.ref_maps + args.ref_haps = str(args.ref_dir) +'/' + args.ref_haps + args.ref_legs = str(args.ref_dir) +'/' + args.ref_legs + args.ref_samps = str(args.ref_dir) +'/' + args.ref_samps + +# reference recommendation +def print_ref_rec(): + print '\nIf you do not have an imputation reference available, the 1000 Genomes' + print 'Phase 3 reference panel provided by IMPUTE is directly compatible with' + print 'picopili and broadly covers most major continental populations.' + print '\nDirect download:' + print 'wget https://mathgen.stats.ox.ac.uk/impute/1000GP_Phase3.tgz' + print '\nWARNING: download filesize is > 12 GB\n' + +# check these references exist +if not os.path.isfile(args.ref_maps.replace('###','1')): + print "Failed to verify genetic maps exist." + print_ref_rec() + raise IOError("No chr 1 genetic map: %s" % args.ref_maps.replace('###','1')) + +if not os.path.isfile(args.ref_haps.replace('###','1')): + print "Failed to verify reference haplotypes exist." + # print rec, since is possible have genetic map but not imputation panel + print_ref_rec() + raise IOError("No chr 1 reference haplotypes: %s" % args.ref_haps.replace('###','1')) + +if not os.path.isfile(args.ref_legs.replace('###','1')): + # not printing ref_rec here since at this point have verified haplotypes exist + raise IOError("Failed to verify reference legend files exist (tested for chr 1 at %s)" % args.ref_legs.replace('###','1')) + +if not os.path.isfile(args.ref_samps.replace('###','1')): + # not printing ref_rec here since at this point have verified haplotypes exist + raise IOError("Failed to verify reference sample file exists (tested for chr 1 at %s)" % args.ref_samps.replace('###','1')) + + +# more flexible handling for info file for shapeit, since could be external +if not os.path.isfile(args.ref_info.replace('###','1')): + + if args.ref_dir is not None and os.path.isfile(str(args.ref_dir) +'/' + args.ref_info.replace('###','1')): + args.ref_info = str(args.ref_dir) +'/' + args.ref_info + + else: + print "Reference information file for phasing not found (tested for chr 1: %s)." 
% args.ref_info.replace('###','1') + if args.ref_dir is not None: + print "Tried both relative path and in --ref-dir %s" % str(args.ref_dir) + + if args.ref_dir == "1000GP_Phase3_chr###.legend.gz": + print "For 1000 Genomes Phase 3 reference from IMPUTE the required file is " + print "the same as the reference legend.\n" + print "Maybe you wanted to add this\n?" + # verified above that the legend file exists + print "--ref-info %s\n" % args.ref_legs + + raise IOError("Failed to verify phasing info file exists (tested for chr 1 at %s)" % args.ref_info.replace('###','1')) + # TODO: full sanity check of the args here + + # print args print '\nBasic settings:' print '--bfile '+str(args.bfile) @@ -88,6 +159,7 @@ print '\nImputation Reference Files:' +print '--ref-dir '+str(args.ref_dir) print '--ref-maps '+str(args.ref_maps) print '--ref-haps '+str(args.ref_haps) print '--ref-legs '+str(args.ref_legs) diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 5943789..3e848fb 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -38,7 +38,7 @@ import os import subprocess import argparse -from args_impute import parserbase, parserphase, parserref, parsercluster +from args_impute import parserbase, parserphase, parserref, parsercluster, parserjob from py_helpers import unbuffer_stdout, link, find_exec, read_conf from blueprint import send_job unbuffer_stdout() @@ -52,7 +52,7 @@ parser = argparse.ArgumentParser(prog='shape_rel.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), - parents=[parserbase, parserphase, parserref, parsercluster]) + parents=[parserbase, parserphase, parserref, parsercluster, parserjob]) args, extra_args = parser.parse_known_args() @@ -92,7 +92,8 @@ print '--sleep '+str(args.sleep) print '--mem-req '+str(args.mem_req) print '--threads '+str(args.threads) - +if args.full_pipe: + print '--full-pipe' if str(args.addout) != '' and args.addout is not None: From a0c6d3e1387810fe9a00a19b68f14a8dd65c8d60 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Fri, 21 Oct 2016 20:52:17 +0200 Subject: [PATCH 24/48] admix logging; pass relatedness to reap as kinship --- bin/admix_rel.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 4c8ada3..13206b7 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -325,11 +325,12 @@ -############# -print '\n...Running Admixture on unrelated dataset...' -############# - if run_admix: + + ############# + print '\n...Running Admixture on unrelated dataset...' 
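For orientation, the run_admix branch being introduced here reduces to one decision: reuse an ADMIXTURE .P file supplied with --admix-p, or run ADMIXTURE on the unrelated set to create one. A minimal sketch of that logic, under stated assumptions: get_unrel_P() and its arguments are hypothetical names rather than picopili code, and the ADMIXTURE invocation simply mirrors the admix_call assembled just below.

import subprocess

def get_unrel_P(admixture_ex, npops, unrel_bfile=None, admix_p=None):
    # reuse a precomputed allele-frequency (.P) file if one was supplied
    if admix_p is not None and admix_p != "":
        return admix_p
    # otherwise estimate admixture in the unrelated subset;
    # ADMIXTURE writes <stem>.<K>.P and <stem>.<K>.Q in the working directory
    with open('admix_unrel.log', 'w') as log:
        subprocess.check_call([admixture_ex, unrel_bfile + '.bed', str(npops)],
                              stdout=log)
    return '%s.%d.P' % (unrel_bfile, int(npops))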
+ ############# + admix_call = [args.admixture_ex, str(args.unrel_bfile+'.bed'), str(args.npops), @@ -485,7 +486,7 @@ def maxpop(props, names, th): admix_project_call = [args.admixture_ex, - '-P', str(args.target_bfile+'.bed'), + '-P', str(args.target_bfile)+'.bed', str(args.npops), '-j'+str(args.multithread_cores)] admix_target_log = open(str('admix_'+args.out+'_target.log'), 'w') @@ -559,7 +560,7 @@ def maxpop(props, names, th): '-r', str(2), '-k', str(args.npops), '-m', - '-t', str(args.min_rel)] + '-t', str(float(args.min_rel/2.0))] reap_log = open(str('reap_' + args.out + '.log'), 'w') print str(' '.join(reap_call)) From c38c77ff708cd062d97ed3c64ead2b2a17ac3743 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Tue, 25 Oct 2016 11:28:26 -0400 Subject: [PATCH 25/48] fix curly braces in job templates --- bin/bg_imp.py | 16 +++++++++------- bin/gwas_rel.py | 24 ++++++++++++++---------- bin/imp2_rel.py | 18 +++++++++++------- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 8e90f4c..2b214e5 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -374,10 +374,10 @@ # best-guess job script for each chunk bg_templ = dedent("""\ - cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` - cchr=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` + cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}` + cchr=`awk -v a={task} 'NR==a+1{cbopen}print $1{cbclose}' {cfile}` - {plink_ex} --gen {gen_in} --sample {samp_in} --oxford-single-chr ${{cchr}} --oxford-pheno-name plink_pheno --hard-call-threshold {hard_call_th} --missing-code -9,NA,na --allow-no-sex --silent --memory 4000 --out {out_str} + {plink_ex} --gen {gen_in} --sample {samp_in} --oxford-single-chr ${cbopen}cchr{cbclose} --oxford-pheno-name plink_pheno --hard-call-threshold {hard_call_th} --missing-code -9,NA,na --allow-no-sex --silent --memory 4000 --out {out_str} sleep {sleep} # note: Mendel errors checked after --update-parents, see https://www.cog-genomics.org/plink2/order @@ -392,7 +392,7 @@ rm {out_str2}.bim rm {out_str2}.fam - {rs_ex} --chunk ${{cname}} --name {outdot} --imp-dir {imp_dir} --fam-trans {trans} + {rs_ex} --chunk ${cbopen}cname{cbclose} --name {outdot} --imp-dir {imp_dir} --fam-trans {trans} """) # get number of chunks @@ -403,8 +403,8 @@ "sleep": str(args.sleep), "cfile": str(outdot)+'.chunks.txt', "plink_ex": str(plink_ex), - "gen_in": str(imp_dir)+'/'+str(outdot)+'.imp.${cname}.gz', - "samp_in": str(shape_dir)+'/'+str(outdot)+'.chr${cchr}.phased.sample', + "gen_in": str(imp_dir)+'/'+str(outdot)+'.imp.${{cname}}.gz', + "samp_in": str(shape_dir)+'/'+str(outdot)+'.chr${{cchr}}.phased.sample', "hard_call_th": str(hard_call_th), "out_str": str(outdot)+'.bg.${cname}', "mendel_txt": str(mendel_txt), @@ -418,7 +418,9 @@ "outdot": str(outdot), "imp_dir": str(imp_dir), "idnum": str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam', - "trans": str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam.transl' + "trans": str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam.transl', + "cbopen":'{{', + "cbclose":'}}', } diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 9bb199e..4b26a09 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -149,7 +149,7 @@ clust_conf = read_clust_conf() # TODO: here - +# TODO: move to before logging @@ -400,21 +400,23 @@ def find_chunk(snpchrom, snpbp, last_chunk): ###################### # basic template, depending on model +# cbopen/cbclose are placeholders for real curly braces, +# to survive .format() here and in send_job if args.model == 'gee' or 
args.model == 'dfam': gwas_templ = dedent("""\ - cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` + cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}` {misc} - {gwas_ex} --bfile {bfile} --out {argout} --extract {outdot}.snps.${{cname}}.txt {optargs} + {gwas_ex} --bfile {bfile} --out {argout} --extract {outdot}.snps.${cbopen}cname{cbclose}.txt {optargs} """) elif args.model == 'gmmat' or args.model == 'gmmat-fam': gwas_templ = dedent("""\ - cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` - chrnum=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` + cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}` + chrnum=`awk -v a={task} 'NR==a+1{cbopen}print $1{cbclose}' {cfile}` - {plinkx} --bfile {bfile} --extract {outdot}.snps.${{cname}}.txt {optargs} --make-bed --out {outdot}.${{cname}} + {plinkx} --bfile {bfile} --extract {outdot}.snps.${cbopen}cname{cbclose}.txt {optargs} --make-bed --out {outdot}.${cbclose}cname{cbopen} - {rsc} --no-save --no-restore {gwas_ex} {outdot}.${{cname}} grm.{outdot}.loco_chr${{chrnum}}.rel.gz {covarsub} {outdot}.${{cname}} > {outdot}.${{cname}}.gmmat.R.log + {rsc} --no-save --no-restore {gwas_ex} {outdot}.${cbopen}cname{cbclose} grm.{outdot}.loco_chr${cbopen}chrnum{cbclose}.rel.gz {covarsub} {outdot}.${cbopen}cname{cbclose} > {outdot}.${cbopen}cname{cbclose}.gmmat.R.log """) # alternative template for GMMAT @@ -439,9 +441,9 @@ def find_chunk(snpchrom, snpbp, last_chunk): # model-specific arguments not passed for gmmat if args.model == 'gee' or args.model == 'dfam': if args.addout is not None: - gwasargs = gwasargs + ' --addout '+str(args.addout)+'.${cname}' + gwasargs = gwasargs + ' --addout '+str(args.addout)+'.${{cname}}' else: - gwasargs = gwasargs + ' --addout ${cname}' + gwasargs = gwasargs + ' --addout ${{cname}}' if args.covar is not None: gwasargs = gwasargs + ' --covar '+str(args.covar) if args.covar_number is not None: @@ -472,7 +474,9 @@ def find_chunk(snpchrom, snpbp, last_chunk): "optargs": str(gwasargs), "plinkx": str(plinkx), "covarsub": str(args.covar)+'.sub.txt', - "rsc": str(args.rscript_ex) + "rsc": str(args.rscript_ex), + "cbopen":'{{', + "cbclose":'}}', } nchunk = len(chunks.keys()) diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 7239008..23fda59 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -185,7 +185,7 @@ # with "chr_list" to get have adaptive chromosome list cmd_templ = dedent("""\ chrs=({chr_list}) - chrom=${{chrs[{task}-1]}} + chrom=${cbopen}chrs[{task}-1]{cbclose} {shape_ex} {bed} {map} {ref} {window} {duo_txt} {thread_str} {seed_str} {outmax} {shapelog} """) @@ -222,6 +222,8 @@ "seed_str": '--seed '+str(extra_args.shape_seed), "outmax": '--output-max '+str(outstem)+'.phased.haps '+str(outstem)+'.phased.sample', "shapelog": str(outstem)+'.shape.resub_'+str(num_chr)+'.log', + "cbopen":'{{', + "cbclose":'}}', } shape_cmd = cmd_templ.format(**jobdict) @@ -326,12 +328,12 @@ # job script imp_templ = dedent("""\ - cchr=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` - cstart=`awk -v a={task} 'NR==a+1{{print $2}}' {cfile}` - cend=`awk -v a={task} 'NR==a+1{{print $3}}' {cfile}` - cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` + cchr=`awk -v a={task} 'NR==a+1{cbopen}print $1{cbclose}' {cfile}` + cstart=`awk -v a={task} 'NR==a+1{cbopen}print $2{cbclose}' {cfile}` + cend=`awk -v a={task} 'NR==a+1{cbopen}print $3{cbclose}' {cfile}` + cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}` - {impute_ex} -use_prephased_g -known_haps_g {in_haps} -h {ref_haps} -l {ref_leg} -m {map} -int 
${{cstart}} ${{cend}} -buffer {buffer} -Ne {Ne} -allow_large_regions -o_gz -o {out} {seedtxt} + {impute_ex} -use_prephased_g -known_haps_g {in_haps} -h {ref_haps} -l {ref_leg} -m {map} -int ${cbopen}cstart{cbclose} ${cbopen}cend{cbclose} -buffer {buffer} -Ne {Ne} -allow_large_regions -o_gz -o {out} {seedtxt} """) # fill in template @@ -345,7 +347,9 @@ "Ne": str(args.Ne), "buffer": str(args.buffer), "out": str(outdot)+'.imp.${cname}', - "seedtxt": str(seedtxt) + "seedtxt": str(seedtxt), + "cbopen":'{{', + "cbclose":'}}', } From e2567de0599c14d66ee9442f55e6596c98dd9a5e Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 10 Nov 2016 11:52:23 -0500 Subject: [PATCH 26/48] bugfix args formatting in bg send_job --- bin/bg_imp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 2b214e5..81ac5ec 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -117,7 +117,7 @@ info_txt = '' else: # init, then add thresholds - info_txt = '--qual-scores '+str(imp_dir)+'/'+str(outdot)+'.imp.${cname}_info' +' 5 2 1' + info_txt = '--qual-scores '+str(imp_dir)+'/'+str(outdot)+'.imp.${{cname}}_info' +' 5 2 1' # minimum info if args.info_th >= 0.0 and args.info_th <= 1.0: info_txt = info_txt + ' --qual-threshold '+str(args.info_th) @@ -406,14 +406,14 @@ "gen_in": str(imp_dir)+'/'+str(outdot)+'.imp.${{cname}}.gz', "samp_in": str(shape_dir)+'/'+str(outdot)+'.chr${{cchr}}.phased.sample', "hard_call_th": str(hard_call_th), - "out_str": str(outdot)+'.bg.${cname}', + "out_str": str(outdot)+'.bg.${{cname}}', "mendel_txt": str(mendel_txt), "info_txt": str(info_txt), - "out_str2": str(outdot)+'.bg.tmp.${cname}', + "out_str2": str(outdot)+'.bg.tmp.${{cname}}', "maf_txt": str(maf_txt), "mac_txt": str(mac_txt), "geno_txt": str(geno_txt), - "out_str_filt": str(outdot)+'.bg.filtered.${cname}', + "out_str_filt": str(outdot)+'.bg.filtered.${{cname}}', "rs_ex": str(rs_ex), "outdot": str(outdot), "imp_dir": str(imp_dir), From 807f20f2d807ae09040a2d6083e7a7b8695bafa4 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 10 Nov 2016 11:53:30 -0500 Subject: [PATCH 27/48] bugfix arg format, adjust agg mem request in gwas_rel --- bin/gwas_rel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 4b26a09..466f239 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -414,7 +414,7 @@ def find_chunk(snpchrom, snpbp, last_chunk): cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}` chrnum=`awk -v a={task} 'NR==a+1{cbopen}print $1{cbclose}' {cfile}` - {plinkx} --bfile {bfile} --extract {outdot}.snps.${cbopen}cname{cbclose}.txt {optargs} --make-bed --out {outdot}.${cbclose}cname{cbopen} + {plinkx} --bfile {bfile} --extract {outdot}.snps.${cbopen}cname{cbclose}.txt {optargs} --make-bed --out {outdot}.${cbopen}cname{cbclose} {rsc} --no-save --no-restore {gwas_ex} {outdot}.${cbopen}cname{cbclose} grm.{outdot}.loco_chr${cbopen}chrnum{cbclose}.rel.gz {covarsub} {outdot}.${cbopen}cname{cbclose} > {outdot}.${cbopen}cname{cbclose}.gmmat.R.log """) @@ -568,7 +568,7 @@ def find_chunk(snpchrom, snpbp, last_chunk): send_job(jobname='agg_'+str(outdot), cmd=' '.join(agg_call), logname=agg_log, - mem=4000, + mem=8000, walltime=30, wait_name='gwas.chunks.'+str(outdot), wait_num=str(jobres).strip(), From 6361099006bd251407de3f6dbfc7bc7c2add50d9 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 10 Nov 2016 11:54:31 -0500 Subject: [PATCH 28/48] more detailed logging of missing/broken chunks in gwas agg --- bin/agg_gwas.py | 17 
+++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/bin/agg_gwas.py
index d456758..87c28ee 100755
--- a/bin/agg_gwas.py
+++ b/bin/agg_gwas.py
@@ -111,6 +111,9 @@

+###############
+print '\n...Checking for missing or incomplete chunks...'
+###############

 # read chunk def file
 chunks = {}
@@ -138,12 +141,15 @@

     # record chunks with no/partial/broken output
     if not os.path.isfile(ch_out):
+        print 'Output not found for %s' % str(ch_out)
         mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
-    elif file_len(ch_out) != file_len(str(outdot)+'.snps.'+str(chname)+'.txt'):
+    elif file_len(ch_out) < file_len(str(outdot)+'.snps.'+str(chname)+'.txt'):
+        print 'Output file %s is incomplete' % str(ch_out)
         mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
     else:
         ft = file_tail(ch_out)
         if len(ft.split()) != out_len:
+            print 'Last line of output file %s is incomplete' % str(ch_out)
             mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]

@@ -154,7 +160,7 @@
 ###############
 if len(mis_chunks) > 0:
     nummiss = len(mis_chunks)
-    print 'Missing results for %d GWAS jobs. Preparing to resubmit...' % nummiss
+    print '\nMissing results for %d GWAS jobs. Preparing to resubmit...' % nummiss

     # just missing chunks for task array
     # fail if already tried
@@ -244,7 +250,7 @@
     send_job(jobname='agg_'+str(outdot),
              cmd=' '.join(sys.argv[:]),
              logname=agg_log,
-             mem=24000,
+             mem=16000,
              walltime=30,
              wait_name='gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss),
              wait_num=str(jobres).strip(),
@@ -258,6 +264,7 @@

 ###############
 # if no missing chunks, proceed collecting info for aggregation
+print '\n...Loading auxiliary information...'
 ###############

 # chnames = chunks.keys()
@@ -339,7 +346,9 @@
     out_file.write('\t'.join(out_head) + '\n')
     filt_file.write('\t'.join(filt_head) + '\n')

-print 'starting chunk loop'
+###############
+print '\n...Aggregating GWAS results from chunks...'
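As an aside on the chunk checks added above: the three conditions (no output file, fewer result lines than the chunk's SNP list, truncated final line) amount to a single completeness test per chunk. A minimal sketch, reusing the file_len() and file_tail() helpers that py_helpers already provides; the is_complete() wrapper itself is hypothetical, not part of picopili.

import os
from py_helpers import file_len, file_tail

def is_complete(ch_out, snp_list, out_len):
    # no output written for this chunk
    if not os.path.isfile(ch_out):
        return False
    # fewer result lines than SNPs assigned to the chunk
    if file_len(ch_out) < file_len(snp_list):
        return False
    # job likely died mid-write: last line has the wrong number of fields
    if len(file_tail(ch_out).split()) != out_len:
        return False
    return True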
+###############
 # loop chunks to aggregate
 for ch in chnames:
     # open output file

From a9d7734f114070c72bb676df1a9c1339e6725558 Mon Sep 17 00:00:00 2001
From: rkwalters
Date: Thu, 10 Nov 2016 11:55:42 -0500
Subject: [PATCH 29/48] dont expect .eval output from pca fastmode

---
 bin/imus_pca.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/imus_pca.py
index 765aff2..5afbc39 100755
--- a/bin/imus_pca.py
+++ b/bin/imus_pca.py
@@ -459,7 +459,7 @@
     subprocess.check_call(["tar", "-zcvf",
                            args.out + '.pca_files.tar.gz',
                            args.bfile + '.pca.par',
-                           args.bfile + '.pca.eval.txt',
+#                           args.bfile + '.pca.eval.txt',
                            args.bfile + '.pca.snpw.txt',
                            args.bfile + '.pca.raw.txt',
                            args.bfile + '.pca.refpoplist.txt',
@@ -470,7 +470,7 @@
     # remove successfully zipped files
     subprocess.check_call(["rm",
                            args.bfile + '.pca.par',
-                           args.bfile + '.pca.eval.txt',
+#                           args.bfile + '.pca.eval.txt',
                            args.bfile + '.pca.snpw.txt',
                            args.bfile + '.pca.raw.txt',
                            args.bfile + '.pca.refpoplist.txt',

From 0a254d7f08069aa07b2cf116c37ce51aca1e3a5d Mon Sep 17 00:00:00 2001
From: rkwalters
Date: Tue, 29 Nov 2016 17:16:02 -0500
Subject: [PATCH 30/48] ensure allele frequency reported for intended allele

---
 bin/agg_gwas.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/bin/agg_gwas.py
index 87c28ee..5df1429 100755
--- a/bin/agg_gwas.py
+++ b/bin/agg_gwas.py
@@ -297,11 +297,17 @@
 print 'bim loaded'

 # frq.cc
-# for both: maf_a, maf_u, n_a, n_u
+# for both:
+# - maf_a = frq in affected (cases)
+# - maf_u = frq in unaffected (controls)
+# - n_a = number affected (cases)
+# - n_u = number unaffected (controls)
+# - freq_a1 = a1 used for freq
 maf_a_info = {}
 maf_u_info = {}
 n_a_info = {}
 n_u_info = {}
+freq_a1 = {}

 frq = open(args.freq_file, 'r')
 dumphead = frq.readline()
@@ -311,6 +317,7 @@
     maf_u_info[str(snp)] = mafu
     n_a_info[str(snp)] = int(nchra) / 2
     n_u_info[str(snp)] = int(nchru) / 2
+    freq_a1[str(snp)] = a1

 frq.close()
 print 'frq loaded'
@@ -378,8 +385,13 @@
         (chrom, snp, cm, bp, a1, a2, n, af2, scoretest, scorevar, p) = line.split()

         # get meta info
-        frqa = maf_a_info.pop(str(snp))
-        frqu = maf_u_info.pop(str(snp))
+        # verify use freq of correct allele
+        if str(frq_a1.pop(str(snp))) == str(a1):
+            frqa = maf_a_info.pop(str(snp))
+            frqu = maf_u_info.pop(str(snp))
+        else:
+            frqa = 1 - maf_a_info.pop(str(snp))
+            frqu = 1 - maf_u_info.pop(str(snp))
         na = n_a_info.pop(str(snp))
         nu = n_u_info.pop(str(snp))

From f72a8919928d50da243d971d62c8c76cd8d38c09 Mon Sep 17 00:00:00 2001
From: rkwalters
Date: Mon, 5 Dec 2016 18:02:29 -0500
Subject: [PATCH 31/48] fix specifying reference files with directory, imp_prep cluster

---
 bin/imp2_rel.py  | 19 ++++++++++++++++++-
 bin/imp_prep.pl  |  2 +-
 bin/shape_rel.py | 22 ++++++++++++++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/bin/imp2_rel.py
index 23fda59..e55a71b 100755
--- a/bin/imp2_rel.py
+++ b/bin/imp2_rel.py
@@ -42,7 +42,16 @@
                                  formatter_class=lambda prog:
                                  argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40),
                                  parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob])
-
+
+arg_ref.add_argument('--ref-dir',
+                    type=str,
+                    metavar='DIRECTORY',
+                    help='Directory containing imputation reference files (haps, legends, sample, and maps). 
' + + 'Used as prefix for specifying full paths of --ref-maps, --ref-haps, --ref-legs, and --ref-samps', + required=False, + default=None) + + args, extra_args = parser.parse_known_args() @@ -111,7 +120,15 @@ chunker_ex = rp_bin+'/chunk_snps.py' test_exec(chunker_ex) +if args.ref_dir is not None: + # verify exists + assert os.path.isdir(args.ref_dir), "Failed to find imputation reference directory %s" % args.ref_dir + # prepend to references accordingly + args.ref_maps = str(args.ref_dir) +'/' + args.ref_maps + args.ref_haps = str(args.ref_dir) +'/' + args.ref_haps + args.ref_legs = str(args.ref_dir) +'/' + args.ref_legs + args.ref_samps = str(args.ref_dir) +'/' + args.ref_samps # TODO: here diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index a707037..69becf5 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -57,7 +57,7 @@ ############################# my $ploc = &trans("p2loc"); -my $qloc = &trans("queue"); +my $qloc = &trans("cluster"); my $email = &trans("email"); my $email_on = 0; diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 3e848fb..c26163c 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -54,6 +54,15 @@ argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), parents=[parserbase, parserphase, parserref, parsercluster, parserjob]) +arg_ref.add_argument('--ref-dir', + type=str, + metavar='DIRECTORY', + help='Directory containing imputation reference files (haps, legends, sample, and maps). ' + + 'Used as prefix for specifying full paths of --ref-maps, --ref-haps, --ref-legs, and --ref-samps', + required=False, + default=None) + + args, extra_args = parser.parse_known_args() # other settings @@ -87,6 +96,7 @@ print '--ref-haps '+str(args.ref_haps) print '--ref-legs '+str(args.ref_legs) print '--ref-samps '+str(args.ref_samps) +print '--ref-dir '+str(args.ref_dir) print '\nJob Submission:' print '--sleep '+str(args.sleep) @@ -110,6 +120,18 @@ plinkx = find_exec('plink',key='p2loc') shapeit_ex = find_exec('shapeit',key='shloc') + +if args.ref_dir is not None: + # verify exists + assert os.path.isdir(args.ref_dir), "Failed to find imputation reference directory %s" % args.ref_dir + + # prepend to references accordingly + args.ref_maps = str(args.ref_dir) +'/' + args.ref_maps + args.ref_haps = str(args.ref_dir) +'/' + args.ref_haps + args.ref_legs = str(args.ref_dir) +'/' + args.ref_legs + args.ref_samps = str(args.ref_dir) +'/' + args.ref_samps + + # TODO: here From c59be8b209278700b9629b0c8187c1b82533af1c Mon Sep 17 00:00:00 2001 From: rkwalters Date: Mon, 5 Dec 2016 18:31:38 -0500 Subject: [PATCH 32/48] fix directory for shapeit ref info --- bin/imp2_rel.py | 2 +- bin/shape_rel.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index e55a71b..51a4afb 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -43,7 +43,7 @@ argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob]) -arg_ref.add_argument('--ref-dir', +parser.add_argument('--ref-dir', type=str, metavar='DIRECTORY', help='Directory containing imputation reference files (haps, legends, sample, and maps). 
' + diff --git a/bin/shape_rel.py b/bin/shape_rel.py index c26163c..eab62b1 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -54,7 +54,7 @@ argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), parents=[parserbase, parserphase, parserref, parsercluster, parserjob]) -arg_ref.add_argument('--ref-dir', +parser.add_argument('--ref-dir', type=str, metavar='DIRECTORY', help='Directory containing imputation reference files (haps, legends, sample, and maps). ' + @@ -130,6 +130,7 @@ args.ref_haps = str(args.ref_dir) +'/' + args.ref_haps args.ref_legs = str(args.ref_dir) +'/' + args.ref_legs args.ref_samps = str(args.ref_dir) +'/' + args.ref_samps + args.ref_info = str(args.ref_dir) +'/' + args.ref_info # TODO: here From e97bc58bc6cf574cccd7473783bfa7d79cc141c1 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Mon, 5 Dec 2016 19:58:06 -0500 Subject: [PATCH 33/48] remove unused refs to hmloc --- bin/checkflip_pico.pl | 1 - bin/checkpos_pico.pl | 1 - 2 files changed, 2 deletions(-) diff --git a/bin/checkflip_pico.pl b/bin/checkflip_pico.pl index 679a4b0..5c9bb08 100755 --- a/bin/checkflip_pico.pl +++ b/bin/checkflip_pico.pl @@ -51,7 +51,6 @@ sub trans { } my $sloc = &trans("sloc"); -my $hmloc = &trans("hmloc"); my $p2loc = &trans("p2loc"); diff --git a/bin/checkpos_pico.pl b/bin/checkpos_pico.pl index 5bd8116..6a162ce 100755 --- a/bin/checkpos_pico.pl +++ b/bin/checkpos_pico.pl @@ -52,7 +52,6 @@ sub trans { } my $sloc = &trans("sloc"); -my $hmloc = &trans("hmloc"); my $p2loc = &trans("p2loc"); From 5ee5edfc4e3b4bc0dced47825177845268843af8 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Mon, 5 Dec 2016 19:59:04 -0500 Subject: [PATCH 34/48] fix broken test_exec calls --- bin/imp2_rel.py | 2 +- bin/py_helpers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 51a4afb..bf8be27 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -118,7 +118,7 @@ # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) chunker_ex = rp_bin+'/chunk_snps.py' -test_exec(chunker_ex) +test_exec(chunker_ex,'picopili chunking script') if args.ref_dir is not None: # verify exists diff --git a/bin/py_helpers.py b/bin/py_helpers.py index f50c4fd..dd7c89b 100644 --- a/bin/py_helpers.py +++ b/bin/py_helpers.py @@ -101,7 +101,7 @@ def find_exec(prog, key=None): print "Failed to find config file %s. Will search for %s on path." 
% (str(conffile), str(prog)) exloc = find_from_path(str(prog),str(prog)) - test_exec(exloc) + test_exec(exloc,str(prog)) return exloc From 1ba6dd95f5e3d07f9022381f3f00d3ebc2ab7ab3 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Tue, 13 Dec 2016 23:47:48 -0500 Subject: [PATCH 35/48] more imputation job arg/templating fixes --- bin/imp2_rel.py | 29 ++++++++++++++--------------- bin/shape_rel.py | 12 ++++++------ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index bf8be27..8bf279e 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -28,7 +28,7 @@ import subprocess import argparse from textwrap import dedent -from args_impute import parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob +from args_impute import parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob, parserphase from py_helpers import unbuffer_stdout, file_len, link, find_exec, test_exec, read_conf from blueprint import send_job, read_clust_conf, init_sendjob_dict, save_job unbuffer_stdout() @@ -41,7 +41,7 @@ parser = argparse.ArgumentParser(prog='imp2_rel.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), - parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob]) + parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob, parserphase]) parser.add_argument('--ref-dir', type=str, @@ -161,7 +161,6 @@ haps_out = str(shape_dir)+'/'+str(outdot)+'.chr'+str(chrom)+'.phased.haps' samp_out = str(shape_dir)+'/'+str(outdot)+'.chr'+str(chrom)+'.phased.sample' - if not os.path.isfile(haps_out): bad_chr.append(chrom) elif not os.path.isfile(samp_out): @@ -180,7 +179,7 @@ exit(1) # else continue to resub print 'Preparing to resubmit...' 
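The recurring '${{...}}' and cbopen/cbclose edits in these job-template fixes all follow from one str.format() rule: literal braces must be written as '{{' / '}}', and because these templates are formatted twice (once when the calling script builds the command, and once more inside send_job, as the comment in gwas_rel.py notes), the shell's awk and variable braces must still be doubled after the first pass. A standalone sketch with a toy template, not one of the picopili job scripts:

from textwrap import dedent

# toy template; {task} is deliberately left for the second pass, and
# {cbopen}/{cbclose} are filled with doubled braces so they survive it too
templ = dedent("""\
    cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}`
    echo chunk ${cbopen}cname{cbclose}
    """)

# first pass (in the calling script): fills {cfile}, keeps {task}, leaves '{{'/'}}'
stage1 = templ.format(task='{task}', cfile='chunks.txt', cbopen='{{', cbclose='}}')

# second pass (inside the job submission layer): fills {task} and
# collapses '{{'/'}}' into the literal braces the shell script needs
stage2 = stage1.format(task='$SGE_TASK_ID')

print(stage2)
# cname=`awk -v a=$SGE_TASK_ID 'NR==a+1{print $4}' chunks.txt`
# echo chunk ${cname}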
- # note: assuming required shapeit args will be in extra_args + # note: assuming required shapeit args will be in args # if running under --full-pipe # TODO: add check on this # (mem_req, threads, no_duohmm, window, shape_seed) @@ -221,7 +220,7 @@ # manage additional arg pieces chrstem = str(args.bfile)+'.hg19.ch.fl.chr${chrom}' outstem = str(outdot)+'.chr${chrom}' - if extra_args.no_duohmm: + if args.no_duohmm: duo_txt = '' else: duo_txt = '--duohmm' @@ -233,10 +232,10 @@ "bed": '--input-bed '+str(chrstem)+'.bed '+str(chrstem)+'.bim '+str(chrstem)+'.fam', "map": '--input-map '+str(args.ref_maps).replace('###','${chrom}'), "ref": '--input-ref '+str(args.ref_haps).replace('###','${chrom}')+' '+str(args.ref_legs).replace('###','${chrom}')+' '+str(args.ref_samps).replace('###','${chrom}'), - "window": '--window '+str(extra_args.window), + "window": '--window '+str(args.window), "duo_txt": str(duo_txt), - "thread_str": '--thread '+str(extra_args.threads), - "seed_str": '--seed '+str(extra_args.shape_seed), + "thread_str": '--thread '+str(args.threads), + "seed_str": '--seed '+str(args.shape_seed), "outmax": '--output-max '+str(outstem)+'.phased.haps '+str(outstem)+'.phased.sample', "shapelog": str(outstem)+'.shape.resub_'+str(num_chr)+'.log', "cbopen":'{{', @@ -248,10 +247,10 @@ jobres = send_job(jobname='shape.'+str(outdot)+'.resub_'+str(num_chr), cmd=shape_cmd, logname='shape.'+str(outdot)+'.resub_'+str(num_chr)+'.sub.'+str(clust_conf['log_task_id'])+'.log', - mem=int(extra_args.mem_req)*1000, + mem=int(args.mem_req)*1000, walltime=30, njobs=int(num_chr), - threads=extra_args.threads, + threads=args.threads, sleep=args.sleep) print 'Pre-phasing jobs re-submitted for %d chromosomes.\n' % num_chr @@ -357,13 +356,13 @@ jobdict = {"task": "{task}", "cfile": str(outdot)+'.chunks.txt', "impute_ex": str(impute_ex), - "in_haps": str(shape_dir)+'/'+str(outdot)+'.chr${cchr}.phased.haps', - "ref_haps": str(args.ref_haps).replace('###','${cchr}'), - "ref_leg": str(args.ref_legs).replace('###','${cchr}'), - "map": str(args.ref_maps).replace('###','${cchr}'), + "in_haps": str(shape_dir)+'/'+str(outdot)+'.chr${{cchr}}.phased.haps', + "ref_haps": str(args.ref_haps).replace('###','${{cchr}}'), + "ref_leg": str(args.ref_legs).replace('###','${{cchr}}'), + "map": str(args.ref_maps).replace('###','${{cchr}}'), "Ne": str(args.Ne), "buffer": str(args.buffer), - "out": str(outdot)+'.imp.${cname}', + "out": str(outdot)+'.imp.${{cname}}', "seedtxt": str(seedtxt), "cbopen":'{{', "cbclose":'}}', diff --git a/bin/shape_rel.py b/bin/shape_rel.py index eab62b1..7e1391a 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -296,12 +296,12 @@ duo_txt = '--duohmm' # TODO: handle empty chromosomes -chrstem = str(args.bfile)+'.hg19.ch.fl.chr\$tasknum' -outstem = str(outdot)+'.chr\$tasknum' -map_arg = str(args.ref_maps).replace('###','\$tasknum') -hap_arg = str(args.ref_haps).replace('###','\$tasknum') -leg_arg = str(args.ref_legs).replace('###','\$tasknum') -samp_arg = str(args.ref_samps).replace('###','\$tasknum') +chrstem = str(args.bfile)+'.hg19.ch.fl.chr{task}' +outstem = str(outdot)+'.chr{task}' +map_arg = str(args.ref_maps).replace('###','{task}') +hap_arg = str(args.ref_haps).replace('###','{task}') +leg_arg = str(args.ref_legs).replace('###','{task}') +samp_arg = str(args.ref_samps).replace('###','{task}') shape_call = [shapeit_ex, '--input-bed', chrstem+'.bed', chrstem+'.bim', chrstem+'.fam', From 92173c095a4a29986c9915e614e1451c8dcfb99f Mon Sep 17 00:00:00 2001 From: rkwalters Date: Tue, 13 Dec 2016 23:48:47 
-0500 Subject: [PATCH 36/48] fix resub of array jobs with 1 task remaining --- bin/agg_gwas.py | 3 ++- bin/agg_imp.py | 3 ++- bin/bg_imp.py | 3 ++- bin/blueprint.py | 9 +++++---- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index 5df1429..a19bf7f 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -236,7 +236,8 @@ walltime=sendjob_dict['walltime'], njobs=sendjob_dict['njobs'], maxpar=sendjob_dict['maxpar'], - sleep=sendjob_dict['sleep']) + sleep=sendjob_dict['sleep'], + forcearray=True) print 'GWAS jobs resubmitted for %d chunks.\n' % nummiss diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 80a01a3..fc9bc0d 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -213,7 +213,8 @@ mem=sendjob_dict['mem'], walltime=sendjob_dict['walltime'], njobs=sendjob_dict['njobs'], - sleep=sendjob_dict['sleep']) + sleep=sendjob_dict['sleep'], + forcearray=True) print 'Best-guess jobs resubmitted for %d chunks.\n' % nummiss diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 81ac5ec..511e058 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -328,7 +328,8 @@ mem=sendjob_dict['mem'], walltime=sendjob_dict['walltime'], njobs=sendjob_dict['njobs'], - sleep=sendjob_dict['sleep']) + sleep=sendjob_dict['sleep'], + forcearray=True) print 'GWAS jobs resubmitted for %d chunks.\n' % nummiss diff --git a/bin/blueprint.py b/bin/blueprint.py index 352a555..62ffe48 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -31,7 +31,8 @@ def send_job(jobname, wait_num=None, cluster=None, sleep=30, - testonly=False): + testonly=False, + forcearray=False): # validate args if arrayfile is None and cmd is None: @@ -121,7 +122,7 @@ def send_job(jobname, # for single jobs - if cmd is not None and (njobs is None or njobs <= 1): + if cmd is not None and (njobs is None or njobs <= 1) and not forcearray: njobs = 1 tot_threads = int(threads) @@ -272,7 +273,7 @@ def send_job(jobname, sub_file.close() # finalize or remove optional lines - if njobs <= 1: + if njobs <= 1 and not forcearray: subprocess.check_call(['sed','-i','/^::PICO_ARRAY_ONLY::/d',str(sub_file.name)]) else: subprocess.check_call(['sed','-i','s/^::PICO_ARRAY_ONLY:://',str(sub_file.name)]) @@ -282,7 +283,7 @@ def send_job(jobname, else: subprocess.check_call(['sed','-i','s/^::PICO_THREAD_ONLY:://',str(sub_file.name)]) - if njobs <= 1 and threads <= 1: + if njobs <= 1 and not forcearray and threads <= 1: subprocess.check_call(['sed','-i','/^::PICO_THREADARRAY_ONLY::/d',str(sub_file.name)]) else: subprocess.check_call(['sed','-i','s/^::PICO_THREADARRAY_ONLY:://',str(sub_file.name)]) From d34f45110325199501e01b38fed18bf600543b9c Mon Sep 17 00:00:00 2001 From: rkwalters Date: Tue, 28 Feb 2017 19:09:45 -0500 Subject: [PATCH 37/48] force numeric freqs --- bin/agg_gwas.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index a19bf7f..f935d52 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -314,8 +314,8 @@ dumphead = frq.readline() for line in frq: (chrom, snp, a1, a2, mafa, mafu, nchra, nchru) = line.split() - maf_a_info[str(snp)] = mafa - maf_u_info[str(snp)] = mafu + maf_a_info[str(snp)] = float(mafa) + maf_u_info[str(snp)] = float(mafu) n_a_info[str(snp)] = int(nchra) / 2 n_u_info[str(snp)] = int(nchru) / 2 freq_a1[str(snp)] = a1 @@ -387,12 +387,12 @@ # get meta info # verify use freq of correct allele - if str(frq_a1.pop(str(snp))) == str(a1): - frqa = maf_a_info.pop(str(snp)) - frqu = maf_u_info.pop(str(snp)) + if str(freq_a1.pop(str(snp))) == str(a1): + 
frqa = float(maf_a_info.pop(str(snp))) + frqu = float(maf_u_info.pop(str(snp))) else: - frqa = 1 - maf_a_info.pop(str(snp)) - frqu = 1 - maf_u_info.pop(str(snp)) + frqa = 1 - float(maf_a_info.pop(str(snp))) + frqu = 1 - float(maf_u_info.pop(str(snp))) na = n_a_info.pop(str(snp)) nu = n_u_info.pop(str(snp)) From af818983a0178d0e7405e6ab132a856e7fb837e0 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Tue, 28 Feb 2017 19:10:39 -0500 Subject: [PATCH 38/48] improve file checks from args --- bin/admix_rel.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 13206b7..18b15a2 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -254,7 +254,6 @@ if args.admix_p is not None and args.admix_p != "": run_admix = False -else: assert os.path.isfile(args.admix_p), "Admixture .P file %s does not exist." % str(args.admix_p) if args.use_exemplars: @@ -285,8 +284,9 @@ # pca file if plot_pca: assert os.path.isfile(args.plot_admix_pca), "PCA file does not exist (%r)" % args.plot_admix_pca - assert '/' not in args.target_bfile, "--plot-admix-pca must specify only a file, not a path" +# assert '/' not in args.plot_admix_pca, "--plot-admix-pca must specify only a file, not a path" +# TODO: allow relative paths here (os.path.normpath() should solve this; see link() for pca file) # verify bfiles are files, not paths assert '/' not in args.target_bfile, "--target-bfile must specify only a file stem, not a path" @@ -320,8 +320,7 @@ # link pca file, if provided if not (args.plot_admix_pca==None or args.plot_admix_pca=="None"): - - link(str(wd+'/'+args.plot_admix_pca), str(args.plot_admix_pca), 'PCA file') + link(os.path.normpath(str(wd+'/'+args.plot_admix_pca)), os.path.basename(str(args.plot_admix_pca)), 'PCA file') From e4f2f265d8927b42c2aca41f3172f313ef1f7a42 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 1 Mar 2017 13:34:37 -0500 Subject: [PATCH 39/48] update default loc for admixture on Broad --- bin/config_pico.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/config_pico.pl b/bin/config_pico.pl index b9c7767..1a4fbfb 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -342,7 +342,7 @@ () "i2loc", "/psych/genetics_data/ripke/references_from_debakkerscratch/impute_v2/impute_v2/impute_2.2.7_beta", "liloc","/home/unix/sripke/liftover", "eloc","/humgen/atgu1/fs03/shared_resources/shared_software/EIG6.0beta_noreq/bin", - "admloc","/humgen/atgu1/fs03/shared_resources/shared_software/admixture_linux-1.23", + "admloc"," /humgen/atgu1/fs03/shared_resources/shared_software/admixture_linux-1.3.0", "reaploc","/humgen/atgu1/fs03/shared_resources/shared_software/REAP", "priloc","/humgen/atgu1/fs03/shared_resources/shared_software/PRIMUS_v1.8.0/bin", "rloc","/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.0.1/bin", From 9e57709f29ad6191e60ced530bc20d662f83f0b7 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 1 Mar 2017 13:53:31 -0500 Subject: [PATCH 40/48] prevent duplicating find/test execs --- bin/admix_rel.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 18b15a2..d22ba45 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -234,18 +234,24 @@ if args.admixture_ex is None or args.admixture_ex == "None": args.admixture_ex = find_exec('admixture', key='admloc') - -test_exec(args.admixture_ex, 'ADMIXTURE') +else: + test_exec(args.admixture_ex, 'ADMIXTURE') if args.rscript_ex is None or args.rscript_ex == "None": args.rscript_ex = 
find_exec('Rscript', key='rscloc') +else: + test_exec(args.rscript_ex, 'Rscript') if args.reap_ex is None or args.reap_ex == "None": args.reap_ex = find_exec('REAP', key='reaploc') +else: + test_exec(args.reap_ex, 'REAP') rp_bin = os.path.dirname(os.path.realpath(__file__)) Rplotibdx = rp_bin+'/plot_reap_ibd.Rscript' + + if plot_pca: Rplotpcax = rp_bin+'/plot_pca.Rscript' @@ -276,11 +282,6 @@ assert os.path.isfile(str(args.unrel_bfile)+'.fam'), "fam file for unrelated individuals %s does not exist." % str(args.unrel_bfile)+'.fam' -# verify executables -test_exec(plinkx, 'Plink') -test_exec(args.rscript_ex, 'Rscript') -test_exec(args.reap_ex, 'REAP') - # pca file if plot_pca: assert os.path.isfile(args.plot_admix_pca), "PCA file does not exist (%r)" % args.plot_admix_pca From 060f44db01d35b52d34d34d89f73051137b74681 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 2 Mar 2017 12:25:49 -0500 Subject: [PATCH 41/48] fix admixture plotting without exemplars --- bin/admix_rel.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index d22ba45..62f9da9 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -323,6 +323,8 @@ if not (args.plot_admix_pca==None or args.plot_admix_pca=="None"): link(os.path.normpath(str(wd+'/'+args.plot_admix_pca)), os.path.basename(str(args.plot_admix_pca)), 'PCA file') +# labels for populations are popA, popB, popC, ... +popnames = [str('pop'+ascii_uppercase[i]) for i in range(args.npops)] if run_admix: @@ -354,9 +356,6 @@ # - match population assignments to FID/IIDs # - write .pops file for target bfile, .pops.info file - # label for populations are popA, popB, popC, ... - popnames = [str('pop'+ascii_uppercase[i]) for i in range(args.npops)] - # define function returning popname or '-' based on largest proportion # Note: ties broken in favor of first pop listed in names (possible if th <= 0.5) def maxpop(props, names, th): @@ -714,8 +713,9 @@ def maxpop(props, names, th): glob(args.target_bfile+".*.admixture.plotinfo.txt") + \ [str(args.target_bfile)+".admixture.legend.txt"] + \ glob(args.out+".*.plot_admixture.log")) - - subprocess.check_call(["tar", "-zcvf", + + if args.use_exemplars: + subprocess.check_call(["tar", "-zcvf", str(args.out+'.plot_exemplar_files.tar.gz')] + \ glob(args.target_bfile+".*.exemplar.plotinfo.txt") + \ [str(args.target_bfile)+".exemplar.legend.txt"] + \ From 81714f8855b899078437e5743b063cde4f4fc2c1 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 3 Mar 2017 11:49:06 -0500 Subject: [PATCH 42/48] improve log of plotting cmds --- bin/admix_rel.py | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 62f9da9..26b27d7 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -580,12 +580,15 @@ def maxpop(props, names, th): ### IBD0/IBD1 points and density # plot_reap_ibd.Rscript has args r_ibd_log = open(str(args.out) + '.plot_ibd.log', 'w') -subprocess.check_call([Rplotibdx, - str('REAP_pairs_relatedness.txt'), - str(args.out), - str(args.min_rel)], - stderr=subprocess.STDOUT, - stdout=r_ibd_log) +plot_ibd_call = [Rplotibdx, + str('REAP_pairs_relatedness.txt'), + str(args.out), + str(args.min_rel)] +print str(' '.join(plot_ibd_call)) + +subprocess.check_call(plot_ibd_call, + stderr=subprocess.STDOUT, + stdout=r_ibd_log) r_ibd_log.close() print 'IBD plots: %s.IBD.png, %s.IBD_density.png' % (args.out, args.out) @@ -673,25 +676,31 @@ def maxpop(props, names, th): for i in 
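The popnames list and the maxpop() contract referenced in patch 41 (assign the population with the largest admixture proportion, or '-' when nothing clears the threshold, with ties going to the first population listed) can be sketched as a standalone example; the maxpop body here is illustrative, since only its interface appears in the diff:

from string import ascii_uppercase

def maxpop(props, names, th):
    # max() returns the first maximal element, so ties resolve to the
    # population listed first (possible whenever th <= 0.5)
    best = max(range(len(props)), key=lambda i: props[i])
    return names[best] if props[best] >= th else '-'

npops = 3
popnames = ['pop' + ascii_uppercase[i] for i in range(npops)]  # ['popA', 'popB', 'popC']
print(maxpop([0.93, 0.05, 0.02], popnames, 0.8))  # popA
print(maxpop([0.72, 0.20, 0.08], popnames, 0.8))  # - (unassigned)
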
xrange(args.npops): if args.use_exemplars: r_pca_ex_log = open(str(args.out) + '.' + popnames[i] + '.plot_exemplars.log', 'w') - subprocess.check_call([Rplotpcax, - str(args.plot_admix_pca), + plot_pca_exemp_call = [Rplotpcax, + str(os.path.basename(str(args.plot_admix_pca)), str(args.target_bfile) + '.' + popnames[i] + '.exemplar.plotinfo.txt', str(args.target_bfile) + '.exemplar.legend.txt', str(3), - str(args.out) + '.' + popnames[i] + '.exemplars'], - stderr=subprocess.STDOUT, - stdout=r_pca_ex_log) + str(args.out) + '.' + popnames[i] + '.exemplars'] + + print str(" ".join(plot_pca_exemp_call)) + subprocess.check_call(plot_pca_exemp_call, + stderr=subprocess.STDOUT, + stdout=r_pca_ex_log) r_pca_ex_log.close() r_pca_admix_log = open(str(args.out) + '.' + popnames[i] + '.plot_admixture.log', 'w') - subprocess.check_call([Rplotpcax, - str(args.plot_admix_pca), + plot_pca_admix_call = [Rplotpcax, + str(os.path.basename(str(args.plot_admix_pca)), str(args.target_bfile) + '.' + popnames[i] + '.admixture.plotinfo.txt', str(args.target_bfile) + '.admixture.legend.txt', str(3), - str(args.out) + '.' + popnames[i] + '.admixture'], - stderr=subprocess.STDOUT, - stdout=r_pca_admix_log) + str(args.out) + '.' + popnames[i] + '.admixture'] + + print str(" ".join(plot_pca_admix_call)) + subprocess.check_call(plot_pca_admix_call, + stderr=subprocess.STDOUT, + stdout=r_pca_admix_log) r_pca_admix_log.close() print 'PCA plots for %s: %s, %s (completed %d/%d populations)' % (popnames[i], str(args.out)+'.'+popnames[i]+'.exemplars.pca.pairs.png', str(args.out)+'.'+popnames[i]+'.exemplars.pca.pc##_pc##.png', i+1, args.npops) From f6b2bb051a6d3e7887bdba6dd1952ae959351a47 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 3 Mar 2017 12:04:25 -0500 Subject: [PATCH 43/48] fix mark for unplotted region --- bin/plot_reap_ibd.Rscript | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/plot_reap_ibd.Rscript b/bin/plot_reap_ibd.Rscript index ef33508..a86b237 100755 --- a/bin/plot_reap_ibd.Rscript +++ b/bin/plot_reap_ibd.Rscript @@ -14,7 +14,7 @@ if(length(commandArgs(TRUE)) > 2){ require(ggplot2) # unplotted region -tri = data.frame(x=c(1-minrel,1-((2/3)*minrel),1), y=c(0,(2/3)*minrel,0)) +tri = data.frame(x=c(1-minrel,1-(2*minrel),1), y=c(0,2*minrel,0)) # read data infile <- read.table(infile, header=TRUE, stringsAsFactors=F) From bc97a039dff931558e4481ad84106125a4b59a85 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 15 Mar 2017 13:30:50 -0400 Subject: [PATCH 44/48] fix typo --- bin/admix_rel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 26b27d7..b325d82 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -677,7 +677,7 @@ def maxpop(props, names, th): if args.use_exemplars: r_pca_ex_log = open(str(args.out) + '.' + popnames[i] + '.plot_exemplars.log', 'w') plot_pca_exemp_call = [Rplotpcax, - str(os.path.basename(str(args.plot_admix_pca)), + str(os.path.basename(str(args.plot_admix_pca))), str(args.target_bfile) + '.' + popnames[i] + '.exemplar.plotinfo.txt', str(args.target_bfile) + '.exemplar.legend.txt', str(3), @@ -691,7 +691,7 @@ def maxpop(props, names, th): r_pca_admix_log = open(str(args.out) + '.' + popnames[i] + '.plot_admixture.log', 'w') plot_pca_admix_call = [Rplotpcax, - str(os.path.basename(str(args.plot_admix_pca)), + str(os.path.basename(str(args.plot_admix_pca))), str(args.target_bfile) + '.' 
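Patch 42 changes each plotting call from an inline subprocess.check_call() to a named argument list that is echoed before being run, so the exact Rscript command line lands in the pipeline log. The same pattern as a reusable sketch (run_logged is an illustrative name, not a picopili function):

import subprocess

def run_logged(call, log_path):
    call = [str(c) for c in call]
    # echo the exact command line so it appears in the pipeline log
    print(' '.join(call))
    # capture the child's stdout and stderr together in one log file
    with open(log_path, 'w') as log:
        subprocess.check_call(call, stderr=subprocess.STDOUT, stdout=log)

# e.g. run_logged([Rplotibdx, 'REAP_pairs_relatedness.txt', args.out, args.min_rel],
#                 str(args.out) + '.plot_ibd.log')
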
+ popnames[i] + '.admixture.plotinfo.txt', str(args.target_bfile) + '.admixture.legend.txt', str(3), From e937bde498b383bc16ca18161847d8cd96fda053 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 15 Mar 2017 13:31:49 -0400 Subject: [PATCH 45/48] fix depends broken by Broad Anaconda (mostly for R plots) --- cluster_templates/broad_uger.sub.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cluster_templates/broad_uger.sub.sh b/cluster_templates/broad_uger.sub.sh index 597ad98..b1d43f0 100755 --- a/cluster_templates/broad_uger.sub.sh +++ b/cluster_templates/broad_uger.sub.sh @@ -23,6 +23,8 @@ sleep {sleep_time} # setup resources source /broad/software/scripts/useuse reuse -q Anaconda +reuse -q .curl-7.47.1 +reuse -q .cairo-1.14.2 # main command line {cmd_string} From e5dc9fc9ebf6a1fb9862cc10ebae53ce0d0ae4f6 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 15 Mar 2017 15:16:41 -0400 Subject: [PATCH 46/48] adaptive mem reqs for imp agg --- bin/bg_imp.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 511e058..754fcd2 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -467,11 +467,24 @@ agg_log = 'agg_imp.'+str(outdot)+'.sub.log' + # some dynamic adjustment of mem based on sample size population + # (empirically, seem to get ~2x sites from afr vs eur) + fam_n = file_len(str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam') + if fam_n > 3000: + agg_mem = 32000 + elif fam_n > 1000: + agg_mem = 16000 + else: + agg_mem = 8000 + + if "afr" in sys.argv[1:]: + agg_mem = 2*agg_mem + # TODO: consider queue/mem for agg send_job(jobname='agg.imp.'+str(outdot), cmd=next_call, logname=agg_log, - mem=8000, + mem=int(agg_mem), walltime=30, wait_name='bg.chunks.'+str(outdot), wait_num=str(jobres2).strip(), From 3114d1759451fc196e98386b52a28a03e5ff7611 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 15 Mar 2017 16:28:38 -0400 Subject: [PATCH 47/48] add option of arbitrary weights for filter ped --- bin/args_ped.py | 13 +++++++-- bin/filter_ped.py | 70 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 11 deletions(-) diff --git a/bin/args_ped.py b/bin/args_ped.py index 6d4642e..c26d6ff 100644 --- a/bin/args_ped.py +++ b/bin/args_ped.py @@ -56,7 +56,7 @@ # ############ parsergeno = argparse.ArgumentParser(add_help=False) -arg_geno = parsergeno.add_argument_group('Genotyping Rate (Optional)') +arg_geno = parsergeno.add_argument_group('Additional Weights (Optional)') arg_geno.add_argument('--geno', type=str, @@ -64,8 +64,15 @@ help='file with genotype missingness rate per individual ' + \ '(i.e. the .imiss file from plink --missing)', required=False, - default='NONE') - + default=None) +arg_geno.add_argument('--weight-file', + type=str, + metavar='FILE', + help='file with added weight per individual. Intentionally ' + \ + 'flexible for arbitrary weights. 
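The adaptive memory request added in patch 46 reduces to a small pure function. The thresholds below are copied from that hunk; the function name and the boolean flag are illustrative:

def agg_mem_mb(fam_n, afr_reference=False):
    # pick a memory request (MB) for the imputation-aggregation job from the
    # number of samples in the .fam file; roughly doubled for African-ancestry
    # data, which empirically yields about twice as many imputed sites
    if fam_n > 3000:
        mem = 32000
    elif fam_n > 1000:
        mem = 16000
    else:
        mem = 8000
    return 2 * mem if afr_reference else mem

assert agg_mem_mb(500) == 8000
assert agg_mem_mb(2500, afr_reference=True) == 32000
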
Assumes 3 columns: FID,' + \ + 'IID, and numeric weight.', + required=False, + default=None) ############ # diff --git a/bin/filter_ped.py b/bin/filter_ped.py index 6b103e6..777b2f2 100755 --- a/bin/filter_ped.py +++ b/bin/filter_ped.py @@ -82,7 +82,10 @@ print 'Using settings:' print '--input-ibd '+str(args.input_ibd) print '--bfile '+str(args.bfile) -print '--geno '+str(args.geno) +if args.geno is not None and args.geno != "None": + print '--geno '+str(args.geno) +if args.weight_file is not None and args.weight_file != "None": + print '--weight-file '+str(args.weight_file) print '--out '+str(args.out) print '--format '+str(args.format) print '--min-rel '+str(args.min_rel) @@ -93,7 +96,8 @@ print '--fam-con-weight '+str(args.fam_con_weight) print '--fam-miss-weight '+str(args.fam_miss_weight) print '--cross-fid-weight '+str(args.cross_fid_weight) -print '--geno-weight '+str(args.geno_weight) +if args.geno is not None and args.geno != "None": + print '--geno-weight '+str(args.geno_weight) print '--rand-weight '+str(args.rand_weight) print '--seed '+str(args.seed) @@ -106,9 +110,12 @@ assert os.path.isfile(args.input_ibd), "IBD/relatedness file does not exist (%r)" % args.input_ibd assert os.path.isfile(str(args.bfile)+'.fam'), "Plink fam file does not exist (%s)" % str(args.bfile)+'.fam' -if str(args.geno) != 'NONE': +if args.geno is not None and str(args.geno) != 'None': assert os.path.isfile(args.geno), "Missingness rate file does not exist (%r)" % args.geno +if args.weight_file is not None and str(args.weight_file) != 'None': + assert os.path.isfile(args.weight_file), "Weight file does not exist (%r)" % args.weight_file + print '\n' print '############' @@ -171,7 +178,7 @@ genorate = {} -if str(args.geno) == 'NONE': +if args.geno is None or str(args.geno) == 'None': print 'Skipping (no file provided).' for indiv in fam_info: genorate[indiv] = 1.0 @@ -201,8 +208,52 @@ if indiv in genorate: continue else: - warnings.warn('Genotyping rate not loaded for %s. Setting to zero.' % str(indiv)) - genofile[indiv] = 1.0 + warnings.warn('Genotyping rate not loaded for %s. Setting call rate to zero.' % str(indiv)) + genofile[indiv] = 0.0 + + + + + +############# +print '\n...Loading additional weight file...' +# Assume FID, IID, weight +# if no file, set to zero +############# + +misc_w = {} + +if args.weight_file is None or str(args.weight_file) == 'None': + print 'Skipping (no file provided).' + for indiv in fam_info: + misc_w[indiv] = 0.0 + +else: + wfile = open(str(args.weight_file), 'r') + + # read per individual, indexed by FID:IID + for line in wfile: + (fid, iid, weight_num) = line.split() + + # id key + ind_id = str(fid) + ':' + str(iid) + + # record + misc_w[ind_id] = float(weight_num) + + wfile.close() + + # check values present for all IDs + for indiv in fam_info: + if indiv in misc_w: + continue + else: + warnings.warn('No additional weight for %s. Setting to zero.' 
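The --weight-file handling in patch 47 reads whitespace-delimited FID, IID, weight rows, keys them as 'FID:IID', and falls back to a neutral 0.0 (after a warning) for anyone missing from the file. A compact sketch of the same lookup (load_weights is a hypothetical helper, not part of filter_ped.py):

def load_weights(path, fam_ids):
    # per-individual weights keyed by 'FID:IID'; individuals absent from the
    # file get 0.0, mirroring the default used in the patch
    weights = {}
    if path is not None and path != 'None':
        with open(path) as fh:
            for line in fh:
                fid, iid, w = line.split()
                weights[fid + ':' + iid] = float(w)
    return {ind: weights.get(ind, 0.0) for ind in fam_ids}

# e.g. misc_w = load_weights(args.weight_file, fam_info.keys())
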
% str(indiv)) + misc_w[indiv] = 0.0 + + + + @@ -504,7 +555,7 @@ def isFamPO(pair_info, fam_info): # define function to score preference for keeping each individual # lowest score will get deleted - def pref_score(ind_id, fam_dict, rel_dict, geno_dict, weight_dict): + def pref_score(ind_id, fam_dict, rel_dict, geno_dict, misc_dict, weight_dict): # init pref = 0.0 ind_id = str(ind_id) @@ -535,6 +586,9 @@ def pref_score(ind_id, fam_dict, rel_dict, geno_dict, weight_dict): # score geno rate pref += weight_dict['geno_rate'] * float(geno_dict[ind_id]) + # score added weight + pref += float(misc_dict[ind_id]) + return pref # loop removal until no cross-fid relationship left @@ -542,7 +596,7 @@ def pref_score(ind_id, fam_dict, rel_dict, geno_dict, weight_dict): while len(cross_id_list) > 0: # score each cross-FID related IID's prority for keep/remove - prefs = [pref_score(indiv, fam_info, iid_relatives, genorate, pref_weights) for indiv in cross_id_list] + prefs = [pref_score(indiv, fam_info, iid_relatives, genorate, misc_w, pref_weights) for indiv in cross_id_list] # breaks ties randomly if len(prefs) != len(set(prefs)): From 16cbf54818a843ef7c35b21d2cd41b1476a35f24 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 15 Mar 2017 19:00:17 -0400 Subject: [PATCH 48/48] log filter_ped warnings --- bin/filter_ped.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bin/filter_ped.py b/bin/filter_ped.py index 777b2f2..640f72d 100755 --- a/bin/filter_ped.py +++ b/bin/filter_ped.py @@ -37,8 +37,9 @@ import random import warnings from args_ped import parserbase, parsergeno, parseribd, parserweights -from py_helpers import unbuffer_stdout +from py_helpers import unbuffer_stdout, warn_format unbuffer_stdout() +warnings.formatwarning = warn_format ############# @@ -209,6 +210,7 @@ continue else: warnings.warn('Genotyping rate not loaded for %s. Setting call rate to zero.' % str(indiv)) + print 'Genotyping rate not loaded for %s. Setting call rate to zero.' % str(indiv) genofile[indiv] = 0.0 @@ -249,14 +251,10 @@ continue else: warnings.warn('No additional weight for %s. Setting to zero.' % str(indiv)) + print 'No additional weight for %s. Setting to zero.' % str(indiv) misc_w[indiv] = 0.0 - - - - - ############# print '\n...Parsing relatedness estimates...' # handle reap file format
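Patch 48 routes warnings through py_helpers.warn_format, whose body is not shown anywhere in this series; any callable matching the warnings.formatwarning signature will work, so a minimal stand-in (assumed, not the actual py_helpers implementation) might look like:

import warnings

def warn_format(message, category, filename, lineno, line=None):
    # minimal stand-in: keep each warning to a single readable line in the log
    return 'Warning: %s\n' % str(message)

warnings.formatwarning = warn_format
warnings.warn('No additional weight for FAM1:IND1. Setting to zero.')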