From e4b6b1cf2fbbd1cefd37f63d578e9485d005d6e7 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 28 Sep 2016 16:00:01 -0400 Subject: [PATCH 01/48] first version of portable blueprint --- bin/blueprint.py | 400 +++++++++++++++++++++ bin/imp_prep.pl | 42 +-- cluster_templates/broad_uger.array.sub.sh | 29 ++ cluster_templates/broad_uger.conf | 11 + cluster_templates/broad_uger.single.sub.sh | 27 ++ 5 files changed, 488 insertions(+), 21 deletions(-) create mode 100755 bin/blueprint.py create mode 100755 cluster_templates/broad_uger.array.sub.sh create mode 100644 cluster_templates/broad_uger.conf create mode 100755 cluster_templates/broad_uger.single.sub.sh diff --git a/bin/blueprint.py b/bin/blueprint.py new file mode 100755 index 0000000..70acd87 --- /dev/null +++ b/bin/blueprint.py @@ -0,0 +1,400 @@ +#! /usr/bin/env python + +#################################### +# blueprint.py +# written by Raymond Walters, September 2016 +""" +manages job submission on different cluster architectures +""" +# +#################################### + +import os +import subprocess +from textwrap import dedent +from py_helpers import read_conf, file_len + +def send_job(jobname, + arrayfile=None, + cmd=None, + logname=None, + logloc=None, + mem=None, + walltime=None, +# week=None, + njobs=None, + maxpar=10000, +# multi=None, + wait_file=None, + wait_name=None, + cluster=None, + sleep=30, + testonly=False): + + # validate args + if arrayfile is None and cmd is None: + raise ValueError("Require either array file or command.") + + elif arrayfile is not None and cmd is not None: + raise ValueError("Require either array file or command, not both.") + + + if logloc is None: + logloc = os.getcwd() + + if maxpar < 1: + maxpar = 10000 + + # get cluster queue name + if cluster is None: + conf_file = os.environ['HOME']+"/picopili.conf" + configs = read_conf(conf_file) + cluster = configs['queue'] + + # get queue template + pico_bin = os.path.dirname(os.path.realpath(__file__)) + clust_dir = os.path.dirname(pico_bin) + '/cluster_templates' + + assert os.path.isdir(clust_dir), "Unable to find cluster job submission template directory %s" % str(clust_dir) + + # load queue configuration info + # - submission syntax, queue names, job holds + clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') + + # setup memory args + if mem is None: + mem = 2000 + mem_mb = str(int(mem)) + mem_gb = str(int(mem)/1000) + + # queue picking from job length + if walltime is None: + walltime = 1 + queue_name = clust_conf['hour_q'] + elif walltime <= 1.0: + queue_name = clust_conf['hour_q'] + elif walltime <= 2.0: + queue_name = clust_conf['hour2_q'] + elif walltime <= 4.0: + queue_name = clust_conf['hour4_q'] + elif walltime <= 24.0: + queue_name = clust_conf['day_q'] + else: + queue_name = clust_conf['long_q'] + + # job dependencies + if wait_name is not None: + hold_str = clust_conf['hold_flag'] + ' ' + str(wait_name) + + elif wait_file is not None: + with open(wait_file, 'r') as wait_fi: + wait_name = wait_fi.readline() + hold_str = clust_conf['hold_flag'] + ' ' + str(wait_name) + + else: + hold_str = "" + + + + # template for single jobs + if cmd is not None and (njobs is None or njobs <= 1): + + with open(str(clust_dir)+'/'+str(cluster)+'.single.sub.sh','r') as single_templ: + templ = single_templ.read() + + njobs = 1 + + # log name + if logname is None: + logname = str(jobname)+'.sub.log' + + # command line + cmd_str = cmd + + # dummy task array args for dict + array_jobs = njobs + j_per_core = 1 + + + # template for array jobs + 
else:
+        with open(str(clust_dir)+'/'+str(cluster)+'.array.sub.sh','r') as array_templ:
+            templ = array_templ.read()
+
+        # setup indexing tasks
+        j_per_core = int(clust_conf['array_core'])
+        if j_per_core == 1:
+            task_index = str(clust_conf['task_id'])
+        else:
+            task_index = "${tid}"
+
+        # cmd or array file spec
+        if cmd is not None:
+            cmd_line = cmd.format(task=task_index)
+
+        else:
+            assert os.path.isfile(arrayfile), "Job array file %s not found." % str(arrayfile)
+
+            njobs = file_len(arrayfile)
+
+            cmd_tmp = dedent("""\
+                cline=`head -n {task} {fi} | tail -n 1`
+                echo $cline
+                $cline
+            """)
+            cmd_line = cmd_tmp.format(task=task_index, fi=arrayfile)
+
+        # parallelization of array jobs on a node
+        if j_per_core > 1:
+
+            from math import floor, ceil
+
+            # max simul tasks with memory limit on the node
+            node_mem = float(clust_conf['array_mem_mb'])
+            task_mem_lim = int(floor((node_mem-1.0)/float(mem)))
+
+            if task_mem_lim < 1:
+                task_mem_lim=1
+
+            if task_mem_lim > j_per_core:
+                task_mem_lim = j_per_core
+
+            # number of jobs to cover all tasks
+            array_jobs = int(ceil(float(njobs)/float(task_mem_lim)))
+
+            # setup to do task_mem_lim jobs on each node
+            # note: specified above that cmd_line uses ${tid} as task index
+            par_tmp = dedent("""\
+                # array index for this job
+                jj={job_index}
+
+                # number of jobs to run on node
+                nodej={nodej}
+
+                # total number of jobs to run in task array
+                maxj={njobs}
+
+                # task index of first task on this node
+                tid=$(($nodej * ($jj - 1) + 1))
+
+                # find index of last task for this node
+                # - from either node task limit (nodej)
+                #   or total number of tasks (maxj)
+                if [ $tid -le $(($maxj - $nodej + 1)) ]; then
+                    last_task=$(($tid + $nodej - 1))
+                else
+                    last_task=$(($maxj))
+                fi
+
+                # start the tasks
+                while [ $tid -le $last_task ]; do
+                    {cmd_line} &
+                    tid=$(($tid+1))
+                done
+
+                # let all tasks finish
+                wait
+            """)
+
+            cmd_str = par_tmp.format(njobs=str(njobs),
+                                     nodej=str(task_mem_lim),
+                                     job_index=str(clust_conf['task_id']),
+                                     cmd_line=cmd_line)
+
+
+        else:
+            array_jobs = njobs
+            cmd_str = cmd_line
+
+
+        # log name
+        if logname is None:
+            logname = str(jobname)+'.sub.'+str(clust_conf['log_task_id'])+'.log'
+
+
+
+    # fill in template
+    jobdict = {"job_name": str(jobname),
+               "cmd_string": cmd_str, # formatted elsewhere
+               "log_name": str(logname),
+               "mem_in_mb": str(mem_mb),
+               "mem_in_gb": str(mem_gb),
+               "wall_hours": str(walltime),
+               "njobs": str(njobs),
+               "array_jobs": str(array_jobs),
+               "array_max": str(maxpar),
+               "core_par": str(j_per_core),
+               "task_id": str(clust_conf['task_id']),
+               "log_task_id": str(clust_conf['log_task_id']),
+               "queue_name": str(queue_name),
+               "sleep_time": str(sleep)
+               }
+
+
+    # write job script
+    sub_file = open(str(jobname)+'.sub.sh','w')
+    sub_file.write(templ.format(**jobdict))
+    sub_file.close()
+
+    # command to run
+    if hold_str != "":
+        launch_str = clust_conf['sub_cmd']+' '+hold_str+' '+str(sub_file.name)
+    else:
+        launch_str = clust_conf['sub_cmd']+' '+str(sub_file.name)
+
+    # record
+    print launch_str
+
+    # run
+    if not testonly:
+        p = subprocess.Popen(launch_str.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
+        out, err = p.communicate()
+        print out
+        return(p.returncode)
+#
+#
+#        # manual error handling here because of Broad LD_LIBRARY_PATH warning
+#        if p.returncode != 0:
+#            if "LD_LIBRARY_PATH" in out:
+#                print out
+#            else:
+#                raise IOError("Job submission failed\nCode: %d\nError: %s\nOutput: %s\n" % (p.returncode, err, out))
+
+    return 0
+
+
+####################################
+#
+# Parse arguments from ricopili interface if invoked directly
+#
+####################################
+if __name__ == "__main__":
+
+    # conditional imports
+    import argparse
+
+    # setup arguments matching usage in imp_prep.pl
+    parser = argparse.ArgumentParser(prog='blueprint.py',
+                                     formatter_class=lambda prog:
+                                     argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40))
+
+    arg_core = parser.add_argument_group('Job Description')
+
+    arg_core.add_argument('--name','--na',
+#                         aliases=['--na'],
+                          type=str,
+                          help='job name',
+                          required=True)
+    arg_core.add_argument('--array',
+                          type=str,
+                          help='file containing command lines to be run',
+                          default=None,
+                          required=False)
+    arg_core.add_argument('--b','-b','--blueprint',
+#                         aliases=['--blueprint','--cmd'],
+                          type=str,
+                          help='command line to be run',
+                          default=None,
+                          required=False)
+
+
+    arg_old = parser.add_argument_group('Ricopili Backwards Compatibility')
+
+    arg_old.add_argument('--job','-j','--j',
+#                        aliases=['--j'],
+                         action='store_true',
+                         help='indicates ricopili call (repurposed)')
+    arg_old.add_argument('--noerr',
+                         action='store_true',
+                         help='no output to ./errandout (for ricopili compatibility)')
+    arg_old.add_argument('--direct','--di',
+#                        aliases=['--di'],
+                         action='store_true',
+                         help='start job without reading prefixes')
+
+    arg_req = parser.add_argument_group('Resource Requirements')
+
+    arg_req.add_argument('--mem',
+                         type=int,
+                         help='memory requirement for each job, in Mb',
+                         default=2000,
+                         required=False)
+    arg_req.add_argument('--walltime','--wa',
+#                        aliases=['--wa'],
+                         type=int,
+                         help='walltime for each job, in hours',
+                         default=1,
+                         required=False)
+#    arg_req.add_argument('--week',
+#                         type=int,
+#                         help='use week/long queues',
+#                         default=None,
+#                         required=False)
+    arg_req.add_argument('--njob',
+                         type=int,
+                         help='max number of jobs to be submitted',
+                         default=1000,
+                         required=False)
+    arg_req.add_argument('--maxpar',
+                         type=int,
+                         help='maximum number of jobs to run in parallel',
+                         default=10000,
+                         required=False)
+#    arg_req.add_argument('--multi',
+#                         type=str,
+#                         help='number of jobs to parallelize, and the number of threads to use for each parallel job (comma separated)',
+#                         default=None,
+#                         required=False)
+    arg_req.add_argument('--fwt',
+                         type=str,
+                         help='file listing job dependencies to wait for before launching job',
+                         default=None,
+                         required=False)
+    arg_req.add_argument('--wait-name',
+                         type=str,
+                         help='name of job dependency',
+                         default=None,
+                         required=False)
+
+    arg_test = parser.add_argument_group('Dev Testing')
+
+    arg_test.add_argument('--testonly',
+                          action='store_true',
+                          help='Skip job submission',
+                          default=False)
+
+
+    args = parser.parse_args()
+
+    # get queue
+    conf_file = os.environ['HOME']+"/picopili.conf"
+    configs = read_conf(conf_file)
+    queue = configs['queue']
+
+    # set logfile name
+    if args.noerr:
+        logloc = os.getcwd()+'/errandout/'
+    else:
+        logloc = os.getcwd()
+
+    # ignore arguments for direct
+    if args.direct:
+        args.njob=None
+        args.walltime=None
+        args.mem=None
+
+
+    send_job(jobname=args.name,
+             arrayfile=args.array,
+             cmd=args.b,
+             logloc=logloc,
+             mem=args.mem,
+             walltime=args.walltime,
+#             week=None,
+             njobs=args.njob,
+             maxpar=args.maxpar,
+#             multi=None,
+             wait_file=args.fwt,
+             wait_name=args.wait_name,
+             cluster=queue,
+             testonly=args.testonly)
+
diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl
index bc98b50..f576f60 100755
--- a/bin/imp_prep.pl
+++ b/bin/imp_prep.pl
@@ -186,7 +186,7 @@ sub trans {
 my $checkpos_script = "checkpos_pico.pl"; ### my.pipeline_tar
 my $checkflip_script = "checkflip_pico.pl"; ### my.pipeline_tar
 my $mutt_script = "mutt"; ### my.pipeline_tar
-my $blue_script = "blueprint_pico.pl"; ### my.pipeline_tar
+my $blue_script = "blueprint.py"; ### my.pipeline_tar
 
 push @test_scripts, $readref_script;
 push @test_scripts, $readrefsum_script;
@@ -572,27 +572,27 @@ sub send_jobarray {
     $command_line =~ s/--force1//;
 
-    my $wt_file = "$sjadir/blueprint_joblist_file-$sjaname.$outname";
+#    my $wt_file = "$sjadir/blueprint_joblist_file-$sjaname.$outname";
 
     chdir "$rootdir" or die "something strange";
 
-    if ($qloc eq "bsub") {
-	$wt_file =~ s/.*blueprint_joblist_file-//;
-    }
-
-    if ($qloc eq "slurm") {
-	$wt_file = "$sjadir/$jobfile.script.id";
-    }
-
-    if ($qloc eq "qsub") {
-	$wt_file = "$sjadir/j.$sjaname.$outname.id";
-    }
-    if ($qloc eq "qsub_c") {
-	$wt_file = "$sjadir/j.$sjaname.$outname.id";
-    }
-    if ($qloc eq "qsub_b") {
-	$wt_file = "$sjadir/j.$sjaname.$outname.id";
-    }
+#    if ($qloc eq "bsub") {
+#	$wt_file =~ s/.*blueprint_joblist_file-//;
+#    }
+#
+#    if ($qloc eq "slurm") {
+#	$wt_file = "$sjadir/$jobfile.script.id";
+#    }
+#
+#    if ($qloc eq "qsub") {
+#	$wt_file = "$sjadir/j.$sjaname.$outname.id";
+#    }
+#    if ($qloc eq "qsub_c") {
+#	$wt_file = "$sjadir/j.$sjaname.$outname.id";
+#    }
+#    if ($qloc eq "qsub_b") {
+#	$wt_file = "$sjadir/j.$sjaname.$outname.id";
+#    }
 
-
+    my $wt_name = "$sjaname.$outname";
 
     if ($serial) {
 	my $sys_re = "$command_line";
@@ -600,7 +600,7 @@ sub send_jobarray {
 	exit;
     }
     else {
-	my $sys_re = "$blue_script --njob $job_bn_th -b \"$command_line\" --wa 2 --di -j --fwt $wt_file --na _if_$outname";
+	my $sys_re = "$blue_script --njob $job_bn_th -b \"$command_line\" --wa 2 --di -j --wait-name $wt_name --na _if_$outname";
 	&mysystem ($sys_re);
     }
 
diff --git a/cluster_templates/broad_uger.array.sub.sh b/cluster_templates/broad_uger.array.sub.sh
new file mode 100755
index 0000000..4416e9c
--- /dev/null
+++ b/cluster_templates/broad_uger.array.sub.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+# wrapper script for job submission on Broad UGER cluster
+#
+# The -V below will provoke a warning that
+# LD_LIBRARY_PATH won't be used for security reasons;
+# this warning can be safely ignored
+
+#$ -j y
+#$ -cwd
+#$ -V
+#$ -N {job_name}
+#$ -o {log_name}
+#$ -q {queue_name}
+#$ -l m_mem_free={mem_in_gb}g,h_vmem={mem_in_gb}g
+#$ -t 1-{array_jobs}
+#$ -tc {array_max}
+
+# sleep option (for preventing race conditions on network file systems)
+sleep {sleep_time}
+
+# setup resources
+source /broad/software/scripts/useuse
+reuse -q Anaconda
+
+# main command line
+{cmd_string}
+
+# eof
diff --git a/cluster_templates/broad_uger.conf b/cluster_templates/broad_uger.conf
new file mode 100644
index 0000000..658eae0
--- /dev/null
+++ b/cluster_templates/broad_uger.conf
@@ -0,0 +1,11 @@
+hour_q short
+hour2_q short
+hour4_q long
+day_q long
+long_q long
+sub_cmd qsub
+log_task_id $TASK_ID
+task_id ${SGE_TASK_ID}
+hold_flag -hold_jid
+array_core 1
+array_mem_mb 128000
diff --git a/cluster_templates/broad_uger.single.sub.sh b/cluster_templates/broad_uger.single.sub.sh
new file mode 100755
index 0000000..42a1335
--- /dev/null
+++ b/cluster_templates/broad_uger.single.sub.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# wrapper script for job submission on Broad UGER cluster
+#
+# The -V below will provoke a warning that
+# LD_LIBRARY_PATH won't be used for security reasons;
+# this warning can be safely ignored
+
+#$ -j y
+#$ -cwd
+#$ -V
+#$ -N {job_name}
+#$ -o {log_name}
+#$ -q {queue_name}
+#$ -l m_mem_free={mem_in_gb}g,h_vmem={mem_in_gb}g
+
+# sleep option (for preventing race conditions on
network file systems) +sleep {sleep_time} + +# setup resources +source /broad/software/scripts/useuse +reuse -q Anaconda + +# main command line +{cmd_string} + +# eof From ab30a1231c61fd8d8470894349b147a159efa0d0 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 28 Sep 2016 16:55:15 -0400 Subject: [PATCH 02/48] remove old blueprint, move perl to config, port Utils.pm --- bin/blue_start_job.pl | 106 -- bin/blueprint_pico.pl | 1581 ----------------------------- bin/buigue_pico.pl | 33 +- bin/imp_prep.pl | 40 +- bin/lift_to_hg19.pl | 27 +- bin/{plague.pl => plague_pico.pl} | 40 +- bin/qc_rel.py | 4 +- bin/rp_perl/Utils.pm | 49 + docs/RICOPILI.md | 5 +- 9 files changed, 105 insertions(+), 1780 deletions(-) delete mode 100755 bin/blue_start_job.pl delete mode 100755 bin/blueprint_pico.pl rename bin/{plague.pl => plague_pico.pl} (85%) create mode 100755 bin/rp_perl/Utils.pm diff --git a/bin/blue_start_job.pl b/bin/blue_start_job.pl deleted file mode 100755 index 5706252..0000000 --- a/bin/blue_start_job.pl +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; - -my $version = "1.0.0"; -my $progname = $0; -$progname =~ s!^.*/!!; - -my $num = ""; -my $jobfile = ""; -my $parn = ""; - -use Getopt::Long; -GetOptions( - "help"=> \my $help, - "parn=s"=> \$parn, - "n=s"=> \$num, - "jobfile=s"=> \$jobfile, - ); - -use File::Basename; - - - -if ($help || $num eq "" || $jobfile eq ""){ - print "usage: $progname FILES - -version: $version - - options: - - --help print this message then quit - --n INT line to take as command (like this n100) - --jobfile STRING jobfile from which to read command - --parn INT number of parallel jobs (then multiplicated with n) - - - --n n100 - - - created by Stephan Ripke 2012 at MGH, Boston, MA - in the frame of the PGC -\n"; - exit 2; -} - -$num =~ s/n//; -$num = $num * 1; -if ($num == 0) { - print "--n doesn't make sense: $num after transformation\n"; -} - - -if ($parn ne "") { - - $parn = $parn * 1; - if ($parn == 0) { - print "--parn doesn't make sense: $num after transformation\n"; - } - - my $first_n = ($num-1) * $parn; - $first_n++; - - my $last_n = $num * $parn; - - my $lc = 1; - -# my @job_array; - die "$jobfile: ".$! unless open FILE, "< $jobfile"; - die "$jobfile.sub$num: ".$! unless open OUT, "> $jobfile.sub$num"; - while (my $cmd = ){ - chomp ($cmd); - - if ($lc >= $first_n && $lc <= $last_n) { -# push @job_array, $cmd; - print OUT $cmd." 
&\n"; - } - - - $lc++; - } - close FILE; - print OUT "wait\n"; - close OUT; - - system ("chmod u+x $jobfile.sub$num"); - -# exit; - system ("./$jobfile.sub$num"); - - exit; - -} - - - - -my $sys = `head -n $num $jobfile | tail -1`; -chomp ($sys); -system ($sys); -#print $sys."\n"; - - - - diff --git a/bin/blueprint_pico.pl b/bin/blueprint_pico.pl deleted file mode 100755 index 9de6391..0000000 --- a/bin/blueprint_pico.pl +++ /dev/null @@ -1,1581 +0,0 @@ -#!/usr/bin/env perl -use strict; - -my $version = "3.1.0"; -my $progname = $0; -$progname =~ s!^.*/!!; - -my $command_line = "$progname @ARGV"; - - - -#print "blueprint @ARGV\n"; - -############################# -# read config file -############################# - -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my %conf = (); - -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; - -sub trans { - my ($expr)=@_; - unless (exists $conf{$expr}) { - die "config file without entry: $expr\n"; - } - $conf{$expr}; -} - -my $qloc = &trans("queue"); - -my $qsub =0; -my $qsub_computerome =0; -my $qsub_broad =0; -if ($qloc eq "qsub"){ - $qsub = 1; -} -if ($qloc eq "qsub_c"){ - $qsub_computerome =1; -} -if ($qloc eq "qsub_b"){ - $qsub_broad =1; -} - -my $bsub =1 if ($qloc eq "bsub"); -my $msub =1 if ($qloc eq "msub"); -my $slurm =1 if ($qloc eq "slurm"); - - -my $startjob_script ="blue_start_job.pl"; - - -########################################## - -my $bp = ""; -my $sample="plink --bfile prefix --out QC2/prefix-qc2 --geno 0.1 --mind 0.1 --make-bed"; - -my $prefix="prefix"; - -my $jl_file="blueprint_joblist_file"; - -my $core=12; -my $inst=11; -my $walltime="1"; - -my $name_of_job=""; - -my $week = -1; -my $multi = -1; -my $threads = 4; -my $multi_str = ""; -my $errname="errandout"; -my $jmem = 2000; - -my $hhmm = ""; - -my $cores = 16; - -my $job_bn_th = 1000; -my $maxpar = 0; - -use Cwd; -my $rootdir = &Cwd::cwd(); - -my $usage = " -Usage : $progname [options] -b \"blueprint\" - -version: $version - -## --inst INT,INT 2 or 8 core, 16,24 gb number of instancies in one job, default=$core,$inst - --cores INT on LISA, how many cores per node (default $16) - --walltime walltime per job, default=$walltime - --week INT the first N jobs go to the week queue - --multi INT,INT the first N jobs getting multithreading frame, second number the number of threads, default 4 - --job launches job on cluster or serail comands on home - --mach loads module mach in jobs -# --R loads module R in jobs - --plink loads module plink in jobs - --fortran loads module fortran/intel in jobs - --blueprint STRING uses STRING containing prefix and launches commands - --prefix STRING uses STRING instead of prefix - --px STRING,INT,INT substitute STRING by a seq from INT to INT - --linech FILE,INT,STRING make own seq with fam-file and chunksize into STRING - --direct start job without reading prefixes - --name STRING name of job - --wt STRING wait for OK from joblist from this file - --fwt STRING wait for finished from joblist from this file - --serial not all the commands parallel - --errname STRING name of errandout-subdir, default: $errname - - --time HHMM time, when the job is eligible - - --mem INT memory requested for each job (bsub), default = $jmem - - --start directly start the commands, no jobs. 
- - --njob INT start INT jobs at a time - - --noerr noerrandout - --array FILE jobarray, much quicker submission of larger numbers - --maxpar INT max number of parallel jobs (this is for high IO jobs) - -sample of blueprint: $sample - - -outdated { - cores 2: 2 cores - cores 8: any eight (any RAM) - cores 12: strictly 8gb RAM - cores 16: strictly 16gb RAM - cores 24: strictly 24gb RAM -} - -new: changes between 8 and 12 cores, instancies always one less - - - writes output to a specific file $jl_file - - created by Stephan Ripke 2008 at MGH, Boston, MA - -"; - -#print "cd $ENV{PWD}\n"; - -my $NOJ=""; -my $jobarray=""; - - - -use Getopt::Long; -GetOptions( "blueprint=s"=> \$bp, - "job" => \my $job, - "serial" => \my $serial, - "week=i" => \$week, - "multi=s" => \$multi_str, - "inst=s" => \my $inst_str, - "cores=i" => \$cores, - "prefix=s"=> \$prefix, - "wt=s"=> \my $wait_file, - "fwt=s"=> \my $wait_fi_file, - "time=s"=> \$hhmm, - "errname=s"=> \$errname, - "walltime=i" => \$walltime, - "mem=i" => \$jmem, - "njob=i" => \$job_bn_th, - "maxpar=i" => \$maxpar, - "mach" => \my $mach, - "start" => \my $start, - "direct" => \my $direct, - "noerr" => \my $noerr, -# "R" => \my $R, - "plink" => \my $plink, - "fortran" => \my $fortran, - "px=s" => \my $pxarg, - "linech=s" => \my $linechunk_str, - "name=s" => \$NOJ, - "array=s" => \$jobarray, - "help!"=> \my $help ); - - -if ($help){ - print $usage; - exit 2; -} - -if ($bp eq "" && $jobarray eq ""){ - print $usage; - exit 2; -} - -#print "all is right\n"; - -if ($hhmm ne ""){ - $hhmm = "-a $hhmm"; -} - - -$bp =~ s/dollarsign/\$/; -#print "$bp\n"; -#exit; - -if ($NOJ eq ""){ - $NOJ = $bp; - $NOJ =~ s/^[\s]+//g; - my @cols= split /\s+/, $NOJ; - $cols[0] =~ s/[^a-zA-Z]//g; - $NOJ = $cols[0]; -} - -$name_of_job = "-N $NOJ"; - - - - -##################################### -# subroutine to count lines of a file -##################################### - -sub count_lines { - my ($file)=@_; - my $lc=0; - die "$file: ".$! unless open FILE, "< $file"; - while (){ - $lc++; - } - close FILE; - $lc; -} - - - - -#print "$pxstart\t$pxend\n"; -#print "core: $core\n"; - -my @blueprint_out; - - -############################# -# test, if running on server -############################# -#use Sys::Hostname; -#my $host = hostname(); -#my $lisa=0; -#$lisa=1 if ($host =~ m/sara/) ; - -#my $broad = 1 if ($ENV{"DOMAINNAME"} =~ /broadinstitute/); - - - -##################################### -# print array to file -#################################### - -sub a2file { - my ($file, @lines)=@_; - die $! unless open FILE, "> $file"; - foreach (@lines){ - print FILE $_; - } - close FILE; -} - - -################################################### -### system call with test if successfull -################################################### - -sub mysystem(){ - my ($systemstr)="@_"; - my $test_str = `$systemstr`; - push @blueprint_out, $test_str; -# print "$test_str"; -#system($systemstr); - my $status = ($? >> 8); - print "$systemstr\n->system call failed: $status\n" if ($status != 0); -} - - - -my $wait_str = ""; -my $addon_str = ""; - -my $count_jobs = 1 ; -if ($wait_file){ - - my $wc = &count_lines($wait_file); - my $skip_lines = $wc-50 ; - die $! 
unless open WF, "< $wait_file"; - $wait_str = "#PBS -W depend=afterok"; - - while (my $line = ) { - $count_jobs ++; - next if ($count_jobs < $skip_lines); - chomp($line); - $line =~ s/[^0-9]//g; - $wait_str .= ":$line"; - $addon_str .= ":$line"; - - } - close WF; - -# print "$wait_str\n"; - -} - -if ($wait_fi_file){ - if ($qsub == 1) { - - $count_jobs = 1 ; - die "$! <$wait_fi_file>;<$ENV{PWD}>" unless open WF, "< $wait_fi_file"; - my $wc = &count_lines($wait_fi_file); - my $skip_lines = $wc-50; - $wait_str = "#PBS -W depend=afterany"; - - while (my $line = ) { - $count_jobs ++; - next if ($count_jobs < $skip_lines); - chomp($line); - $line =~ s/[^0-9]//g; - $wait_str .= ":$line"; - -# $wait_str .= ":$line.batch1.irc.sara.nl"; - } - close WF; - } -} - - -my ($pxstring,$pxstart,$pxend) = split ',', $pxarg if ($pxarg); -my ($lc_file,$lc_size, $lc_str) = split ',', $linechunk_str if ($linechunk_str); -($core,$inst) = split ',', $inst_str if ($inst_str); - -$inst = 10000 if ($serial); - - -if ($lc_file) { - print "file:$lc_file\n"; - print "csize:$lc_size\n"; - my $temp_size = &count_lines($lc_file); - print "fsize:$temp_size\n"; - $pxstring = $lc_str; - $pxstart=0; - $pxend= sprintf "%d", $temp_size/$lc_size-1; - $pxend++ if ($temp_size % $lc_size != 0); - $pxarg = 1; -} - - - - -my $row_count=0; - -my @job_arr=(); - - - -if ($direct){ - push @job_arr, $bp; -} -else { - if ($jobarray eq "") { -# print "is it here?\n"; - while (<>){ - chomp; - my $line=$bp; - $line =~ s/$prefix/$_/g; - $line =~ s/"//g; -# print $line."\n"; - if ($pxarg){ - foreach my $subs ($pxstart..$pxend){ -# print "this is a a command: $subs\n"; - my $line2=$line; - $line2 =~ s/$pxstring/$subs/g; - push @job_arr, $line2; - } - } - else { - - push @job_arr, $line; -# print "2: this is a a command: $line\n"; - } - } - } -} - -#print "sleep\n"; -#sleep(3); - -my $module =""; -$module = "module load plink\n" if $plink; -$module .= "module load mach\n" if $mach; -$module .= "module load fortran/intel\n" if $fortran; -#$module .= "module load R\n" if $R; - - - - - - -if ($job){ - -# print "entering job\n"; - - - use File::Path; - my @created = mkpath( ## $created ? 
- $errname, - {verbose => 0, mode => 0750}, - ); - - - - - - if ($NOJ eq "prefix") { - $NOJ = $job_arr[0]; - $NOJ =~ s/^[\s]+//g; - my @cols= split /\s+/, $NOJ; - $cols[0] =~ s/[^a-zA-Z]//g; - $NOJ = $cols[0]; - } - -######################################## -### BROAD (bsub) -########################################## - - - if ($bsub) { -# print "entering bsub\n"; - ($multi,$threads) = split ',', $multi_str if ($multi_str ne ""); -# print "multi: $multi\n"; -# print "threads: $threads\n"; -# sleep(10); - $threads = 4 if ($threads eq ""); - - - my $wallmin = $walltime * 60; - $wallmin = 240 if ($wallmin > 240); - my $wallstr = "-W $wallmin"; - - - if ($wallmin < 10){ - $wallstr = "-app shortjobs"; -# $wallmin = 10 ; - } - - - - my $sla_deadline = ""; -# $sla_deadline = "-sla DEADLINEsla"; - - my $time = "hour"; - - if ($jobarray ne "") { - - print "starting job_array\n"; - - my $mem_str = ($jmem / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - my $Rusage = "-R \"rusage[mem=$mem_str]\""; - - if ($multi >= 0){ - my $threads_loc = $threads; - my $threads_loc_half = sprintf "%d",$threads / 2; - my $mem_loc = $mem_str * 1100; -# $Rusage = "-n $threads_loc,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\""; - $Rusage = "-n $threads_loc_half,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\" -M $mem_loc"; -# print "$cmd\n"; - - } - - if ($week > 0) { - $time = "week"; - $wallstr = "-W 10000"; - } - - -# my $pretext = ""; - my $pretext = "$sla_deadline $wallstr -q $time $Rusage"; - - my $sys = 'bsub -P unspecified--broadfolk '.$pretext.' -J "'.$NOJ.'[1-'.$job_bn_th.']" -o "errandout/'.$NOJ.'..%J.%I" '.$startjob_script.' --n n\$LSB_JOBINDEX --jobfile '.$jobarray; - - if ($maxpar > 0) { - -## change maxpar of running job to 100 -## bmod -J "%100" 123 - ## see here - ## http://www.ccs.miami.edu/hpc/lsf/7.0.6/admin/jobarrays.html - - - - $sys = 'bsub -P unspecified--broadfolk '.$pretext.' -J "'.$NOJ.'[1-'.$job_bn_th.']%'.$maxpar.'" -o "errandout/'.$NOJ.'..%J.%I" '.$startjob_script.' --n n\$LSB_JOBINDEX --jobfile '.$jobarray; - print "$sys\n"; - } -# exit; - -# print "sys: $sys\n"; - - my $acmd_file = "$jobarray.array_cmd"; - while (-e $acmd_file) { - $acmd_file .= ".a"; - } - die $! unless open AC, "> $acmd_file"; - print AC "$sys\n"; - close AC; -# exit; - &mysystem ($sys); - exit; - } - - my $job_bn=0; - my $job_bn_name=0; - my $job_bn_str = sprintf "%09d", $job_bn_name; - -# $NOJ = substr($NOJ,0,8); - my $dirname = ""; - $dirname = "$errname/" if ($errname ne "errandout"); - my $blue_n = 0 ; - my $blue_name = "$dirname"."blueprint_jobs_$NOJ.$blue_n"; - - while (-e "$blue_name") { - $blue_n++; - $blue_name = "$dirname"."blueprint_jobs_$NOJ.$blue_n"; - } - - die "$! $blue_name" unless open JOB, "> $blue_name"; - - my $week_count = 0; - my $multi_count = 0; - - my $zaehler = 2*$job_bn_th; - my $rand_th = $zaehler / @job_arr; - - foreach my $cmd (@job_arr){ - - - if ($noerr) { - $job_bn_str = "dump"; - } - else { - $job_bn_str = sprintf "%09d", $job_bn_name; - while (-e "$errname/$NOJ.$job_bn_str") { - $job_bn_name++; - $job_bn_str = sprintf "%09d", $job_bn_name; - } - } - - - - if ($start) { - &mysystem($cmd); - next; - } - - my $wt_str = ""; - if ($wait_fi_file) { -# my $substr_wf = substr($wait_fi_file,0,8); - my $substr_wf = $wait_fi_file; - $wt_str = '-w \'ended ("'.$substr_wf.'")\''; - } - - $time = "hour"; - - my $jmem_loc = $jmem; - - - - if ($week >= 0){ -# here change from priority to week!!! 
-# $time = "priority"; - $time = "week"; -# $time = "week -G deadline"; - $week = $week -1; -# $jmem_loc = $jmem * 2; - $jmem_loc = $jmem + 3000 ; -# $wallstr = '-W 2400'; - $wallstr = ''; - } - else { -# $time = "hour -G psychfolk"; - $time = "hour"; - } - - -#bsub -sla DEADLINEsla -q hou - - - my $mem_str = ($jmem_loc / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - - - my $Rusage = "-R \"rusage[mem=$mem_str]\""; - - - if ($multi >= 0){ - $multi = $multi -1; -# $jmem_loc = $jmem * 2; -# $Rusage = "-n 2,4 -R \"rusage[mem=$mem_str]span[hosts=1]\""; - -# my $threads_loc = $threads + 2; - my $threads_loc = $threads; - my $threads_loc_half = sprintf "%d",$threads / 2; - my $mem_loc = $mem_str * 1100; -# $Rusage = "-n $threads_loc,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\""; - $Rusage = "-n $threads_loc_half,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\" -M $mem_loc"; -# print "$cmd\n"; - - } - - next if (rand() > $rand_th); - - if ($week >= 0){ - $week_count++; -# next if ($week_count > 25); - } - - my $locname = $NOJ; - if ($multi >= 0){ - $multi_count++; -# $locname .= ".mu"; - } - - - - my $errstr = "$errname/$NOJ.$job_bn_str"; - if ($noerr) { - $errstr = "/dev/null"; - } - - my $jcmd = "bsub $sla_deadline $wt_str $wallstr $Rusage -J $locname -q $time -o $errstr \"$cmd\""; - - print JOB "$jcmd\n"; - - - - if (1) { -# &mysystem ("$jcmd "); - &mysystem ("$jcmd 2> /dev/null > /dev/null"); # 0113 - } - else { - print "$jcmd\n"; - } -# sleep(1); -# exit; - $job_bn++; - $job_bn_name++; - - last if ($job_bn > $job_bn_th); - - } - close JOB; - exit; - } - -######################################## -### MSSM (msub) -########################################## - - if ($msub) { -# print "entering bsub\n"; - ($multi,$threads) = split ',', $multi_str if ($multi_str ne ""); -# print "multi: $multi\n"; -# print "threads: $threads\n"; -# sleep(10); - $threads = 4 if ($threads eq ""); - - - my $wallmin = $walltime * 60; - $wallmin = 240 if ($walltime > 240); - my $wallstr = "-W $wallmin"; - - - if ($wallmin < 10){ - $wallstr = "-app shortjobs"; -# $wallmin = 10 ; - } - - - - my $sla_deadline = ""; -# $sla_deadline = "-sla DEADLINEsla"; - -# my $time = "hour"; - my $time = "scavenger"; ## (this is msub) - - if ($jobarray ne "") { - - print "starting job_array\n"; - - my $mem_str = ($jmem / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - my $Rusage = "-R \"rusage[mem=$mem_str]\""; - - if ($multi >= 0){ - my $threads_loc = $threads; - my $threads_loc_half = sprintf "%d",$threads / 2; - my $mem_loc = $mem_str * 1100; -# $Rusage = "-n $threads_loc,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\""; - $Rusage = "-n $threads_loc_half,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\" -M $mem_loc"; -# print "$cmd\n"; - - } - - - -# my $pretext = ""; - my $pretext = "$sla_deadline $wallstr -q $time $Rusage"; - my $sys = 'bsub '.$pretext.' -J "'.$NOJ.'[1-'.$job_bn_th.']" -o "errandout/'.$NOJ.'..%J.%I" '.$startjob_script.' --n n\$LSB_JOBINDEX --jobfile '.$jobarray; - - -# print "$sys\n"; - - my $acmd_file = "$jobarray.array_cmd"; - while (-e $acmd_file) { - $acmd_file .= ".a"; - } - die $! 
unless open AC, "> $acmd_file"; - print AC "$sys\n"; - close AC; -# exit; -# print "$sys\n"; - &mysystem ($sys); - exit; - } - - my $job_bn=0; - my $job_bn_name=0; - my $job_bn_str = sprintf "%09d", $job_bn_name; - -# $NOJ = substr($NOJ,0,8); - my $dirname = ""; - $dirname = "$errname/" if ($errname ne "errandout"); - my $blue_n = 0 ; - my $blue_name = "$dirname"."blueprint_jobs_$NOJ.$blue_n"; - - while (-e "$blue_name") { - $blue_n++; - $blue_name = "$dirname"."blueprint_jobs_$NOJ.$blue_n"; - } - - die "$! $blue_name" unless open JOB, "> $blue_name"; - - my $week_count = 0; - my $multi_count = 0; - - my $zaehler = 2*$job_bn_th; - my $rand_th = $zaehler / @job_arr; - - foreach my $cmd (@job_arr){ - - - if ($noerr) { - $job_bn_str = "dump"; - } - else { - $job_bn_str = sprintf "%09d", $job_bn_name; - while (-e "$errname/$NOJ.$job_bn_str") { - $job_bn_name++; - $job_bn_str = sprintf "%09d", $job_bn_name; - } - } - - - - if ($start) { -# print "cmd\n"; - &mysystem($cmd); - next; - } - - my $wt_str = ""; - if ($wait_fi_file) { -# my $substr_wf = substr($wait_fi_file,0,8); - my $substr_wf = $wait_fi_file; - $wt_str = '-w \'ended ("'.$substr_wf.'")\''; - } - - $time = "scavenger"; - - my $jmem_loc = $jmem; - - - - if ($week >= 0){ -# here change from priority to week!!! -# $time = "priority"; - $time = "week"; -# $time = "week -G deadline"; - $week = $week -1; -# $jmem_loc = $jmem * 2; - $jmem_loc = $jmem + 3000 ; -# $wallstr = '-W 2400'; - $wallstr = ''; - } - else { -# $time = "hour -G psychfolk"; -# $time = "hour"; - $time = "scavenger"; - } - - -#bsub -sla DEADLINEsla -q hou - - - my $mem_str = ($jmem_loc / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - - - my $Rusage = "-R \"rusage[mem=$mem_str]\""; - - - if ($multi >= 0){ - $multi = $multi -1; -# $jmem_loc = $jmem * 2; -# $Rusage = "-n 2,4 -R \"rusage[mem=$mem_str]span[hosts=1]\""; - -# my $threads_loc = $threads + 2; - my $threads_loc = $threads; - my $threads_loc_half = sprintf "%d",$threads / 2; - my $mem_loc = $mem_str * 1100; -# $Rusage = "-n $threads_loc,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\""; - $Rusage = "-n $threads_loc_half,$threads_loc -R \"rusage[mem=$mem_str]span[hosts=1]\" -M $mem_loc"; -# print "$cmd\n"; - - } - - next if (rand() > $rand_th); - - if ($week >= 0){ - $week_count++; -# next if ($week_count > 25); - } - - my $locname = $NOJ; - if ($multi >= 0){ - $multi_count++; -# $locname .= ".mu"; - } - - - - my $errstr = "$errname/$NOJ.$job_bn_str"; - if ($noerr) { - $errstr = "/dev/null"; - } - - my $jcmd = "bsub $sla_deadline $wt_str $wallstr $Rusage -J $locname -q $time -o $errstr \"$cmd\""; - - print JOB "$jcmd\n"; - - - - if (1) { -# &mysystem ("$jcmd "); -# print "$jcmd\n"; - &mysystem ("$jcmd 2> /dev/null > /dev/null"); # 0113 - } - else { - print "$jcmd\n"; - } -# sleep(1); -# exit; - $job_bn++; - $job_bn_name++; - - last if ($job_bn > $job_bn_th); - - } - close JOB; - exit; - } - - - - -######################################## -### SLURM -########################################## - - if ($slurm) { - -# my $job_n=0; - my $sum=0; - - my $jobname; - -# print "cmds: @job_arr\n"; -# print "cmds: $jobarray\n"; - - - my $wallstr = "$walltime:00:00"; - - if ($walltime == 0) { - $wallstr = "00:10:00"; - } - -# if ($start) { -# &mysystem($cmd); -# next; -# } - - - if ($week > 0){ -# my $walltime_loc = $walltime * 6; - $wallstr = "48:00:00"; - $week = $week - 1; - } - - $jobname="j.$job_bn_th.$NOJ"; - - my $jobfile = $jobarray.".script"; - - - my $mem_str = ($jmem / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); 
- -# print "here\n"; - - my ($multi,$threads) = split ',', $multi_str if ($multi_str ne ""); - - - if ($wait_fi_file){ - - $jobfile = $wait_fi_file.".script"; - - - - my $idn; - die $!."($wait_fi_file)" unless open FILE, "< $wait_fi_file"; - my $line = ; - my @cells = split /\s+/, $line; - $idn = $cells[3]; - close FILE; - - die "$! ($jobfile)" unless open JOB, "> $jobfile"; - print JOB "#!/bin/sh\n"; - print JOB "#SBATCH --job-name $NOJ\n"; - print JOB "#SBATCH --output errandout/$NOJ.-%j.out\n"; - print JOB "#SBATCH --ntasks 1\n"; - print JOB "#SBATCH --cpus-per-task 1\n"; - print JOB "#SBATCH --mem-per-cpu $mem_str"."g\n"; - print JOB "#SBATCH --time $wallstr\n"; - print JOB "#SBATCH --dependency afterany:$idn\n"; - - print JOB "$bp\n"; - close (JOB); - - - } - else { - - if ($jobarray eq "") { - $job_bn_th = 0; - $jobfile = "$NOJ.start"; - $jobarray = "$NOJ.scripts"; - die $! unless open JF, "> $jobarray"; - foreach (@job_arr) { - print JF "$_\n"; - $job_bn_th++; - } - close JF; - print $jobfile."\n"; - } -# exit; - - - - - - die "$! ($jobfile)" unless open JOB, "> $jobfile"; - print JOB "#!/bin/sh\n"; - print JOB "#SBATCH --job-name $NOJ\n"; - print JOB "#SBATCH --output errandout/$NOJ.-%j.out\n"; - - -# print JOB "#SBATCH --ntasks $job_bn_th\n"; - my $aend = $job_bn_th; - if ($aend > 1000) { - $aend = 1000; - } - - - my $ast = "#SBATCH --array=1-$aend\n"; - if ($maxpar > 0) { - $ast = "#SBATCH --array=1-$aend%maxpar\n"; - } - print JOB $ast; - - - - - if ($multi > 0) { - print JOB "#SBATCH --cpus-per-task $threads\n"; - } - else { - print JOB "#SBATCH --cpus-per-task 1\n"; - } - print JOB "#SBATCH --mem-per-cpu $mem_str"."g\n"; - print JOB "#SBATCH --time $wallstr\n"; - -# print JOB "dispatch -r $jobarray\n"; - print JOB "$startjob_script --n \$SLURM_ARRAY_TASK_ID --jobfile $jobarray\n"; - close (JOB); - } -# print "debug: $jobfile\n"; -# exit; - - &mysystem ("sbatch $jobfile > $jobfile.id"); -# print "send $jobfile to queue\n"; -# exit; - - -# &a2file ($jl_file."-".$NOJ, @blueprint_out); - - - exit; - - } - - -######################################## -### BROAD UGER -########################################## - - - if ($qsub_broad) { - - my $inst_n=0; - my $job_n=0; - my $sum=0; - - # my $cores=$inst; - # $cores=2 if ($cores < 2); - - my $jobname; - - my $mem_str = ($jmem / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - - - $jobname="j.$NOJ"; - - - unless ($wait_fi_file){ - if ($jobarray eq "") { - - if (@job_arr > 0) { - - die "$! ($jobname.scripts)" unless open SCR, "> $jobname.scripts"; - foreach (@job_arr) { - - print SCR "$_\n"; - - } - close SCR; - $jobarray = "$jobname.scripts"; - } - else { - print "Exit: no jobs to process\n"; - exit; - } - - print "wrote $jobname.scripts\n"; - $job_bn_th = @job_arr; -# print "sleep\n"; -# sleep (3); - - } - } - - - - - if ($jobarray ne "") { - - - my $wallstr = "$walltime:00:00"; - if ($walltime ==0){ - $wallstr = "0:10:00"; - } - -# $cores = 28; - - - print "starting job_array, $jobname\n"; - - - my $qlong_str = ""; - if ($walltime > 2) { - $qlong_str = "-q long"; -# $qlong_str = "-P sanctioned -q sanctioned"; - } - die "$! ($jobname)" unless open JOB, "> $jobname"; - - - print JOB "#PBS -lnodes=1:ppn=1\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $rootdir\n"; - print JOB "$startjob_script -n ".'$SGE_TASK_ID'." 
--jobfile $jobarray\n"; - close JOB; - - my $qsub_cmd = "qsub -l m_mem_free=".$mem_str."g,h_vmem=".$mem_str."g $qlong_str -v PATH,rp_perlpackages -t 1-$job_bn_th -e $rootdir/$errname/ -o $rootdir/$errname/ $name_of_job $hhmm $jobname"; - my $qsub_txt = "$qsub_cmd > $jobname.id"; - my $qsub_log = "echo $qsub_cmd > $jobname.log"; - - &mysystem ($qsub_log); - &mysystem ($qsub_txt); - exit; - } - - - - - if ($wait_fi_file){ - die "$! <$wait_fi_file>" unless open WF, "< $wait_fi_file"; - my $id_str = ; - chomp($id_str); - $id_str =~ s/Your job-array //; - $id_str =~ s/\..*//; - - - close WF; - - my $wallstr = "$walltime:00:00"; - - print "starting motherscript, depending on $id_str\n"; - - $jobname="j.$NOJ"; - die "$! ($jobname)" unless open JOB, "> $jobname"; - print JOB "#PBS -lnodes=1\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $ENV{PWD}\n"; - print JOB "$bp\n"; - close JOB; - - my $qsub_cmd = "qsub -v PATH,rp_perlpackages -l m_mem_free=".$mem_str."g,h_vmem=".$mem_str."g -hold_jid $id_str -e $ENV{PWD}/$errname/ -o $ENV{PWD}/$errname/ $name_of_job $hhmm $jobname"; - my $qsub_txt = "$qsub_cmd > $jobname.id"; - my $qsub_log = "echo $qsub_cmd > $jobname.log"; - - &mysystem ($qsub_log); - &mysystem ($qsub_txt); - exit; - } - - - - - - - - - - } -######################################## -### COMPUTEROME -########################################## - - - if ($qsub_computerome) { - - my $inst_n=0; - my $job_n=0; - my $sum=0; - - # my $cores=$inst; - # $cores=2 if ($cores < 2); - - my $jobname; - - my $mem_str = ($jmem / 1000) * 1; - $mem_str = 1 if ($mem_str < 1); - - - if ($jobarray ne "") { - - - my $wallstr = "$walltime:00:00"; - if ($walltime ==0){ - $wallstr = "0:10:00"; - } - -# $cores = 28; - - - print "starting job_array, $jobname\n"; - $jobname="j.$NOJ"; - - - die "$! ($jobname)" unless open JOB, "> $jobname"; - - - print JOB "#PBS -lnodes=1:ppn=1\n"; - print JOB "#PBS -lmem=".$mem_str."gb\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $rootdir\n"; - print JOB "startjob_script -n ".'$PBS_ARRAYID'." --jobfile $jobarray\n"; - close JOB; - - - my $qsub_txt = "qsub -V -t 1-$job_bn_th -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.id"; - my $qsub_log = "echo qsub -V -t 1-$job_bn_th -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.log"; - - &mysystem ($qsub_log); - &mysystem ($qsub_txt); - exit; - } - - - - - if ($wait_fi_file){ - die "$! <$wait_fi_file>" unless open WF, "< $wait_fi_file"; - my $id_str = ; - chomp($id_str); - close WF; - - my $wallstr = "$walltime:00:00"; - - print "starting motherscript, depending on $id_str\n"; - - $jobname="j.$NOJ"; - die "$! 
($jobname)" unless open JOB, "> $jobname"; - print JOB "#PBS -lnodes=1:ppn=1\n"; - print JOB "#PBS -lmem=".$mem_str."gb\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $ENV{PWD}\n"; - print JOB "$bp\n"; - close JOB; - - - my $qsub_txt = "qsub -V -W depend=afteranyarray:$id_str -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.id"; - my $qsub_log = "echo qsub -V -W depend=afteranyarray:$id_str -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.log"; - - &mysystem ($qsub_log); - &mysystem ($qsub_txt); - exit; - } - } - - -######################################## -### LISA -########################################## - - my $inst_n=0; - my $job_n=0; - my $sum=0; - -# my $cores=$inst; -# $cores=2 if ($cores < 2); - - my $jobname; - - - my $mem_str = ($jmem / 1000) * 1; - if ($mem_str < 8){ - $mem_str = 32; - } - else { - $mem_str = 64; - $cores =6; - - } - - - if ($jobarray ne "") { - - - my $wallstr = "$walltime:00:00"; - if ($walltime ==0){ - $wallstr = "0:10:00"; - } - - ## very complicated to get the ceiling of the job-N = number of nodes - my $numnode = ($job_bn_th-0.000001)/$cores; - $numnode=sprintf "%d",$numnode; - $numnode++; - - if ($numnode > 1000){ - $numnode = 1000; - } - - - print "starting job_array, $jobname\n"; - $jobname="j.$NOJ"; -# print "$ENV{PWD}"."\n"; -# print "$rootdir"."\n"; -# print "jpbname: $jobname"."\n"; -# exit; - - - die "$! ($jobname)" unless open JOB, "> $jobname"; - - - - if ($mem_str == 64) { -# print JOB "#PBS -lnodes=1:cores$cores:ppn=$cores:mem64gb\n"; - print JOB "#PBS -lnodes=1:mem64gb\n"; - } - else { -# print JOB "#PBS -lnodes=1:cores$cores:ppn=$cores\n"; - print JOB "#PBS -lnodes=1\n"; - } -# print JOB "#PBS -lmem=".$mem_str."gb\n"; -# } - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $rootdir\n"; - print JOB "$startjob_script --parn $cores -n ".'$PBS_ARRAYID'." --jobfile $jobarray\n"; - close JOB; - - #PBS -lnodes=1:cores16:ppn=16 -lwalltime=1:00:00 - #cd /home/gwas/pgc-samples/scz_sing/data-upload-sgChinese/rerun_0115 - #my.start_job -n $PBS_ARRAYID --jobfile array_test_2 - - -# my $qsub_txt = "qsub -t 1-$job_bn_th -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.id"; - my $qsub_cmd = "qsub -t 1-$numnode -e $errname/ -o $errname/ $name_of_job $hhmm $jobname"; - my $qsub_txt = "$qsub_cmd > $jobname.id"; - my $qsub_log = "echo $qsub_cmd > $jobname.log"; - -# print "$qsub_txt\n"; - - # exit; - &mysystem ($qsub_log); - &mysystem ($qsub_txt); -# sleep (3); - exit; - } - - - - - if ($wait_fi_file){ - die "$! <$wait_fi_file>" unless open WF, "< $wait_fi_file"; - my $id_str = ; - chomp($id_str); - close WF; - - my $wallstr = "$walltime:00:00"; - - print "starting motherscript, depending on $id_str\n"; - - $jobname="j.$NOJ"; - die "$! 
($jobname)" unless open JOB, "> $jobname"; - print JOB "#PBS -lnodes=1\n"; -# print JOB "#PBS -lmem=".$mem_str."gb\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $ENV{PWD}\n"; - print JOB "$bp\n"; - close JOB; - - #PBS -lnodes=1:cores16:ppn=16 -lwalltime=1:00:00 - #cd /home/gwas/pgc-samples/scz_sing/data-upload-sgChinese/rerun_0115 - #my.start_job -n $PBS_ARRAYID --jobfile array_test_2 - - - my $qsub_cmd = "qsub -W depend=afteranyarray:$id_str -e $errname/ -o $errname/ $name_of_job $hhmm $jobname"; - my $qsub_txt = "$qsub_cmd > $jobname.id"; - my $qsub_log = "echo $qsub_cmd > $jobname.log"; - - - - print "$qsub_txt\n"; - - # exit; - &mysystem ($qsub_txt); - &mysystem ($qsub_log); - exit; - } - - - - - - ##### - ## here without jobarray - ##### - - if (1) { - - $jobname="j.$NOJ"; - - die "$! ($jobname.jobarray)" unless open JOBA, "> $jobname.jobarray"; - foreach my $cmd (@job_arr){ - print JOBA "$cmd\n"; - } - close JOBA; - - - my $wallstr = "$walltime:00:00"; - if ($walltime ==0){ - $wallstr = "0:10:00"; - } - - $job_bn_th = @job_arr; - -# print "N: $job_bn_th\n"; -# exit; - # $cores = 16; - - ## very complicated to get the ceiling of the job-N = number of nodes - my $numnode = ($job_bn_th-0.000001)/$cores; - $numnode=sprintf "%d",$numnode; - $numnode++; - - - print "starting job_array, $jobname\n"; - $jobname="j.$NOJ"; -# print "$ENV{PWD}"."\n"; -# print "$rootdir"."\n"; -# print "jpbname: $jobname"."\n"; -# exit; - - - - - - die "$! ($jobname)" unless open JOB, "> $jobname"; - print JOB "#PBS -lnodes=1:cores$cores:ppn=$cores\n"; - print JOB "#PBS -lwalltime=$wallstr\n"; - print JOB "cd $rootdir\n"; - print JOB "$startjob_script --parn $cores -n ".'$PBS_ARRAYID'." --jobfile $jobname.jobarray\n"; - close JOB; - - #PBS -lnodes=1:cores16:ppn=16 -lwalltime=1:00:00 - #cd /home/gwas/pgc-samples/scz_sing/data-upload-sgChinese/rerun_0115 - #my.start_job -n $PBS_ARRAYID --jobfile array_test_2 - - -# my $qsub_txt = "qsub -t 1-$job_bn_th -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.id"; - my $qsub_txt = "qsub -t 1-$numnode -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.id"; - my $qsub_log = "echo qsub -t 1-$numnode -e $errname/ -o $errname/ $name_of_job $hhmm $jobname > $jobname.log"; - -# print "$qsub_txt\n"; - -# exit; - &mysystem ($qsub_txt); - &mysystem ($qsub_log); -# sleep (3); - exit; - } - - - - - - - - - - if (0) { - foreach my $cmd (@job_arr){ - - my $wallstr = "$walltime:00:00"; - - if ($walltime == 0) { - $wallstr = "00:10:00"; - } - - if ($start) { - &mysystem($cmd); - next; - } - - - if ($week > 0){ - my $walltime_loc = $walltime * 6; - $wallstr = "$walltime_loc:00:00"; - $week = $week - 1; - } - - - - if ($inst_n == 0){ - - my $rn = int(rand(2)); -# if ($rn == 0){ - -# $core = 8; -# $inst = 7 unless ($inst == 1); - -# } -# else { -# $core = 12; -# $inst = 11; -# } - - $jobname="j.$job_n.$NOJ"; -# print " $jobname\n"; - if ($qsub){ - die "$! 
($jobname)" unless open JOB, "> $jobname"; -# print JOB "#PBS -lnodes=1:cores$core\n"; - print JOB "#PBS -lnodes=1\n"; - - if (0) { - print JOB "#PBS -lnodes=1:cores$core\n" if ($core == 2); - print JOB "#PBS -lnodes=1:cores8\n" if ($core == 8); - print JOB "#PBS -lnodes=1:cores8:mem8gb\n" if ($core == 12); - print JOB "#PBS -lnodes=1:cores8:mem16gb\n" if ($core == 16); - print JOB "#PBS -lnodes=1:cores8:mem24gb\n" if ($core == 24); - } - - print JOB "#PBS -lwalltime=$wallstr\n"; - - - ##### as long as dependencies don't work - if (0) { - print JOB "$wait_str\n"; - } - - print JOB "$module\n"; - print JOB "cd $ENV{PWD}\n"; - } - } - if ($qsub){ - - if ($wait_fi_file){ - die $! unless open CMD, "> $NOJ.cmd"; - print CMD "$bp\n"; - close CMD; -# print "blueprint_addon --cmd $NOJ.cmd --fwt $wait_fi_file --out $NOJ\n"; - print JOB "blueprint_addon --cmd $NOJ.cmd --fwt $wait_fi_file --out $NOJ\n"; - - } - else { - print JOB $cmd; -# print JOB " &" unless ($serial); - print JOB " &" ; -# print JOB " &" unless ($cmd =~ /;$/); - print JOB "\n"; - } - - - - - $inst_n++; - $sum++; - if ($inst_n == $inst || $sum == @job_arr || $week > 0){ - print JOB "wait\n"; - close JOB; - $inst_n=0; - $job_n++; - - my $qsub_txt = "qsub -e $errname/ -o $errname/ $name_of_job $hhmm $jobname"; -# print "$qsub_txt\n"; -# &mysystem ("qsub $wait_str -e $errname/ -o $errname/ $name_of_job $jobname") if $lisa; - &mysystem ($qsub_txt) if $qsub; - last if ($job_n > 200); - - } - } - else { - &mysystem ($cmd); - } - } - - &a2file ($jl_file."-".$NOJ, @blueprint_out); - } -} - -else { - foreach (@job_arr){ - print $_."\n"; - - } - if ($qsub){ - my $wallstr = "$walltime:00:00"; - print "walltime: $wallstr\n"; - print "cores: $core\n"; - print "instancies: $inst\n"; - print "module: $module\n"; - print "dir: $ENV{PWD}\n"; - } -} diff --git a/bin/buigue_pico.pl b/bin/buigue_pico.pl index 92034ba..3812cfc 100755 --- a/bin/buigue_pico.pl +++ b/bin/buigue_pico.pl @@ -1,6 +1,15 @@ #!/usr/bin/env perl use strict; +############################# +# load utility functions +############################# + +use FindBin; +use lib "$FindBin::Bin"; +use rp_perl::Utils qw(trans); + + my $version = "1.0.0"; my $progname = $0; $progname =~ s!^.*/!!; @@ -10,31 +19,13 @@ # read config file ############################# -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my %conf = (); - -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; - -sub trans { - my ($expr)=@_; - unless (exists $conf{$expr}) { - die "config file without entry: $expr\n"; - } - $conf{$expr}; -} - my $liloc = &trans("liloc"); - - +my $perlpack = &trans("perlpack"); +use lib $perlpack; ##################################################### -use lib $ENV{rp_perlpackages}; +# use lib $ENV{rp_perlpackages}; diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index f576f60..d14a5ef 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -23,6 +23,14 @@ #### +############################# +# load utility functions +############################# + +use FindBin; +use lib "$FindBin::Bin"; +use Ricopili::Utils qw(trans); + my $version = "1.0.24"; my $progname = $0; $progname =~ s!^.*/!!; @@ -45,24 +53,6 @@ # read config file ############################# -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my %conf = (); - -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; - -sub trans { - 
my ($expr)=@_; - unless (exists $conf{$expr}) { - die "config file without entry: $expr\n"; - } - $conf{$expr}; -} - my $ploc = &trans("p2loc"); my $homedir = &trans("home"); my $qloc = &trans("queue"); @@ -289,12 +279,12 @@ sub trans { # "testing environment variable rp_perlpackages #################################### -print "testing environment variable rp_perlpackages....\n"; -unless (exists $ENV{rp_perlpackages}) { - print "Error: no environment variable for perl-packages, please re-install ricopili and make sure to follow all instructions\n"; - print "------------------------------------\n"; - exit; -} +# print "testing environment variable rp_perlpackages....\n"; +# unless (exists $ENV{rp_perlpackages}) { +# print "Error: no environment variable for perl-packages, please re-install ricopili and make sure to follow all instructions\n"; +# print "------------------------------------\n"; +# exit; +# } print "....all set....\n"; print "------------------------------------\n"; @@ -408,7 +398,7 @@ sub a2filenew_app { my $sjainfofile = "$loloc/impute_dir_info"; unless (-e $sjainfofile) { print "log-file ($sjainfofile) is not existing\n"; - print "please check loloc in ~/ricopili.conf\n"; + print "please check loloc in ~/picopili.conf\n"; exit; } #my $sjainfofile = "$homedir/impute_dir_info_35_test"; diff --git a/bin/lift_to_hg19.pl b/bin/lift_to_hg19.pl index 75ec534..782404b 100755 --- a/bin/lift_to_hg19.pl +++ b/bin/lift_to_hg19.pl @@ -28,31 +28,18 @@ #awk '{print $4,$2}' liftes > liftes.new #/fg/debakkerscratch/ripke/plink/1.08/src/plink --bfile ../cmc2_051310.8_toimpute --update-map liftes.new --make-bed - - ############################# -# read config file +# load utility functions ############################# -#print "host: ".$ENV{HOST}."\n"; -#exit; -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my %conf = (); +use FindBin; +use lib "$FindBin::Bin"; +use rp_perl::Utils qw(trans); -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; -sub trans { - my ($expr)=@_; - unless (exists $conf{$expr}) { - die "config file without entry: $expr\n"; - } - $conf{$expr}; -} +############################# +# read config file +############################# my $ploc = &trans("p2loc"); my $liloc = &trans("liloc"); diff --git a/bin/plague.pl b/bin/plague_pico.pl similarity index 85% rename from bin/plague.pl rename to bin/plague_pico.pl index fb0ee88..9825be5 100755 --- a/bin/plague.pl +++ b/bin/plague_pico.pl @@ -1,6 +1,13 @@ #!/usr/bin/env perl use strict; -BEGIN { push @INC, $ENV{rp_perlpackages}.'/Compress-Raw-Zlib-2.065/blib/lib' } + +############################# +# load utility functions +############################# + +use FindBin; +use lib "$FindBin::Bin"; +use rp_perl::Utils qw(trans); my $version = "1.0.0"; my $progname = $0; @@ -11,30 +18,14 @@ # read config file ############################# -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my %conf = (); - -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; - -sub trans { - my ($expr)=@_; - unless (exists $conf{$expr}) { - die "config file without entry: $expr\n"; - } - $conf{$expr}; -} - my $hmloc = &trans("hmloc"); - +my $perlpack = &trans("perlpack"); +use lib $perlpack; ##################################################### my $sc_file = "$hmloc/snp_platform_collection.txt.new.0815.gz"; +my $sc_file_0416 = 
"$hmloc/snp_platform_collection.txt.new.0416a.gz"; my $scol = 2; @@ -44,7 +35,9 @@ sub trans { version: $version - --scf STRING SNP collection file, default: $sc_file + --scf STRING SNP collection file + default: $sc_file + first checking this: $sc_file_0416 --scol INT column of SNPs, default = $scol --create STRING create new entry with name STRING -help print this message and exit @@ -94,7 +87,6 @@ sub split_line { my %bsnps=(); -# use lib $ENV{rp_perlpackages}.'/Compress-Raw-Zlib-2.065/blib/lib'; use Compress::Zlib ; ## read bim-file @@ -113,6 +105,10 @@ sub split_line { ## compare with snp-collection +if (-e $sc_file_0416) { + $sc_file = $sc_file_0416; +} + unless (-e $sc_file) { $sc_file = "$hmloc/snp_platform_collection.txt.new.0114.gz"; if (-e $sc_file) { diff --git a/bin/qc_rel.py b/bin/qc_rel.py index 9d994f1..e25c372 100755 --- a/bin/qc_rel.py +++ b/bin/qc_rel.py @@ -130,7 +130,7 @@ # get directory containing current script # (hack to get plague script location) rp_bin = os.path.dirname(os.path.realpath(__file__)) - plague_ex = rp_bin + '/plague.pl' + plague_ex = rp_bin + '/plague_pico.pl' ############# @@ -1063,4 +1063,4 @@ print '\n############' print '\n' print 'SUCCESS!\n' -exit(0) \ No newline at end of file +exit(0) diff --git a/bin/rp_perl/Utils.pm b/bin/rp_perl/Utils.pm new file mode 100755 index 0000000..7bf48d2 --- /dev/null +++ b/bin/rp_perl/Utils.pm @@ -0,0 +1,49 @@ +package rp_perl::Utils; + +###################### +# +# Adapted from ricopili (https://github.com/Nealelab/ricopili) +# Original code by Robert Karlsson (@robkar on github) +# +###################### + +use strict; +use warnings; + +BEGIN { + require Exporter; + our @ISA = qw(Exporter); + our @EXPORT_OK = qw(trans $conf_file); +} + +############################# +# read config file +############################# + +our $conf_file = $ENV{HOME}."/picopili.conf"; +my %conf = (); + +die $!."($conf_file)" unless open FILE, "< $conf_file"; +while (my $line = ){ + next if ($line =~ /^#/); + my @cells = split /\s+/, $line; + next unless ($#cells >= 1); + + # expand environment variables and '~' for home directory in conf entries + $cells[1] =~ s/^~/$ENV{HOME}/; + $cells[1] =~ s/\$\{(\w+)\}/$ENV{$1}/g; + $cells[1] =~ s/\$(\w+)/$ENV{$1}/g; + + $conf{$cells[0]} = $cells[1]; +} +close FILE; + +sub trans { + my ($expr) = @_; + unless (exists $conf{$expr}) { + die "config file without entry: $expr\n"; + } + $conf{$expr}; +} + +1; diff --git a/docs/RICOPILI.md b/docs/RICOPILI.md index df850e5..ef8e45e 100644 --- a/docs/RICOPILI.md +++ b/docs/RICOPILI.md @@ -1,13 +1,12 @@ The following scripts are adapted from ricopili (https://github.com/Nealelab/ricopili) with very minor changes: -* `blue_start_job.pl`, from `my.start_job` -* `blueprint_pico.pl`, from `blueprint` * `buigue_pico.pl`, from `buigue` * `checkflip_pico.pl`, from `checkflip4` * `checkpos_pico.pl`, from `checkpos6` * `config`, from `rp_config` * `lift_to_hg19.pl`, from `lift18219` -* `plague.pl`, from `plague_2` +* `plague_pico.pl`, from `plague_2` +* `./rp_perl/Utils.pm`, from `./Ricopili/Utils.pm` In addition, the following scripts are adapted from ricopili with more substantial changes as indicated: From 863840490466c06b0d05db1b57330d6a1bfe95cc Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 28 Sep 2016 20:17:13 -0400 Subject: [PATCH 03/48] separate and simplify config, move imp_prep logs --- CONFIG | 1 + bin/bin_check_pico | 7 + bin/checkflip_pico.pl | 1 - bin/checkpos_pico.pl | 1 - bin/config | 1127 ----------------------------------------- 
bin/config_pico.pl | 714 ++++++++++++++++++++++++++ bin/imp_prep.pl | 24 +- config | 1 - docs/RICOPILI.md | 3 +- 9 files changed, 731 insertions(+), 1148 deletions(-) create mode 120000 CONFIG create mode 100755 bin/bin_check_pico delete mode 100755 bin/config create mode 100755 bin/config_pico.pl delete mode 120000 config diff --git a/CONFIG b/CONFIG new file mode 120000 index 0000000..200ec28 --- /dev/null +++ b/CONFIG @@ -0,0 +1 @@ +./bin/config_pico.pl \ No newline at end of file diff --git a/bin/bin_check_pico b/bin/bin_check_pico new file mode 100755 index 0000000..819b0c4 --- /dev/null +++ b/bin/bin_check_pico @@ -0,0 +1,7 @@ +#!/usr/bin/perl +use strict; + + +### dud script to check whether search path is correct +### Jackie addition 01/22/14 + diff --git a/bin/checkflip_pico.pl b/bin/checkflip_pico.pl index aba1290..69e2809 100755 --- a/bin/checkflip_pico.pl +++ b/bin/checkflip_pico.pl @@ -50,7 +50,6 @@ sub trans { my $sloc = &trans("sloc"); my $hmloc = &trans("hmloc"); -#my $ploc = &trans("ploc"); my $p2loc = &trans("p2loc"); diff --git a/bin/checkpos_pico.pl b/bin/checkpos_pico.pl index 6d5fc5c..8f417d6 100755 --- a/bin/checkpos_pico.pl +++ b/bin/checkpos_pico.pl @@ -50,7 +50,6 @@ sub trans { my $sloc = &trans("sloc"); my $hmloc = &trans("hmloc"); -#my $ploc = &trans("ploc"); my $p2loc = &trans("p2loc"); diff --git a/bin/config b/bin/config deleted file mode 100755 index 709da3c..0000000 --- a/bin/config +++ /dev/null @@ -1,1127 +0,0 @@ -#!/usr/bin/env perl -use strict; -use File::Basename; -use Cwd; -use Data::Dumper; - -### Script to configure settings for ricopili pipeline -### Jackie Goldstein, Jan 2014 - - - - - -my $version = "2.0.0"; -my $progname = $0; - -$progname =~ s!^.*/!!; - -my $cdir = cwd(); -my $home = $ENV{HOME}; -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my $command_line = "$progname @ARGV"; - -############################# -# Ask user what cluster they're using -############################# -#my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"other",0); -#my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"computerome",0,"other",0); -my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"computerome",0,"co_ipsych",0,"other",0); -my @cluster_names = ("broad","mssm","genomedk","lisa","computerome","co_ipsych","other"); -print "Please enter your cluster name from the following options:\n"; -my $i = 1; -foreach (@cluster_names){ - print "\t($i) $_\n"; - $i += 1; -} -print "\n"; -my $cluster = "other"; -while (1) { - $cluster = lc <>; - chomp $cluster; - if (exists $clusters{$cluster}){$clusters{$cluster} = 1;last;} - else { - $cluster =~ s/(\)|\()//g; - if ($cluster >= 1 && $cluster <= $i){$cluster -= 1; $cluster = $cluster_names[$cluster];$clusters{$cluster} = 1;last;} - else { - print "Did not recognize option. Please enter a cluster name from the options below:\n"; - my $i = 1; - foreach (@cluster_names){ - print "\t($i) $_\n"; - $i += 1; - } - print "\n"; - my $cluster = "other"; - } - } -} -print "\nUsing the following cluster: $cluster\n\n"; - - - - - -############################# -# Determine the shell -############################# -my $shell = ''; -if (exists $ENV{SHELL}){$shell = basename($ENV{SHELL});} -if ($shell eq "bash-login-check"){$shell = "bash";} -if ($shell ne "bash" && $shell ne "tcsh") { - print "Warning! 
Shell not recognized: $shell\n"; - print "Please send email to rp_dev\@broadinstitute.org\n"; -} -print "Detected you are using the following shell: $shell\n\n"; - - -################################################### -### system call with test if successful -################################################### -sub mysystem(){ - my ($systemstr)="@_"; - system($systemstr); - my $status = ($? >> 8); - die "$systemstr\n->system call failed: $status" if ($status != 0); -} - - -################################################### -### Check if rp_bin already installed -################################################### -system("bin_check"); # dummy script that doesn't do anything -my $status_bin = ($? >> 8); -system("bin_check_pdfjam"); # dummy script that doesn't do anything -my $status_pdfjam = ($? >> 8); - - -if ($clusters{lisa} == 1) { - unless (-e "$home/.bash_profile") { - die $! unless open FILE, "> $home/.bash_profile"; - print FILE 'if [ -f ~/.bashrc ]; then '."\n"; - print FILE ' . ~/.bashrc'."\n"; - print FILE 'fi'."\n"; - close FILE; - } - unless (-e "$home/.bashrc") { - system "touch ~/.bashrc\n"; - } -} - -unless ($clusters{broad} == 1) { - -# print "$cdir/pdfjam\n"; - die $!."($cdir/pdfjam/pdfjam)" unless open FILE, "< $cdir/pdfjam/pdfjam"; - die $!."($cdir/pdfjam/pdfjam.ow)" unless open OUT, "> $cdir/pdfjam/pdfjam.ow"; - while (my $line = ){ - $line =~ s!/psych/genetics_data/ricopili_tmp!/scratch!; - print OUT "$line"; - } - close FILE; - close OUT; - system ("mv $cdir/pdfjam/pdfjam.ow $cdir/pdfjam/pdfjam"); - print "rewrote $cdir/pdfjam.ow\n"; - -} - - - - -if ($status_bin == 0 && $status_pdfjam == 0 && !(-e "install_true")) { - print "\n----------------------------------------------------\n"; - print "\n\nWarning: Ricopili is already installed.\n"; - print "Do you wish to uninstall Ricopili first (recommended)? \n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") { - print "\n----------------------------------------------------\n"; - print "----------------------------------------------------\n"; - print "----------------------------------------------------\n"; - - print "\nTo uninstall Ricopili, you need to remove the following paths from your default search path:\n"; - my @PATH = split(':',$ENV{PATH}); - foreach (@PATH) { - if ($_ =~ "rp_bin" || $_ =~ "rp_perlpackages") { - print "\t$_\n";} - } - print "If this seems incorrect, DO NOT continue with the uninstall instructions below!!!\n"; - print "If this is correct, please invoke the following 2 commands not preceeded by ##\n"; - print "after this, please restart ./rp_config\n"; - - my @PATH = split ":", $ENV{PATH}; - my @NEW_PATH = (); - foreach (@PATH) { - unless ($_ =~ "rp_bin") { - push @NEW_PATH, $_; - } - } - my $new_path = join ":", @NEW_PATH; - - my $i = 1; - # 1. 
Remove paths for this session - if ($shell eq "bash") { - print "\n----------------------------------------------------\n"; - print "## Please enter the following command to remove rp_bin from the search path for this session:\n\n"; - $i += 1; - print "\texport PATH=$new_path\n"; - } - elsif ($shell eq "tcsh") { - print "\n----------------------------------------------------\n"; - print "## Please enter the following command to remove rp_bin from the search path for this session:\n\n"; - $i += 1; - print "\tsetenv PATH $new_path\n"; - } - else { - print "\n----------------------------------------------------\n"; - print "## You will need to figure out how to change the current search path to the following for your shell:\n\n"; - $i += 1; - print "\t$new_path\n"; - } - # 2. Remove the path permanently from the search path - if ($clusters{broad} == 1) { - if (-e "$home/.my.bashrc") { - &mysystem("grep -v \"rp_bin\\|rp_perlpackages\" $home/.my.bashrc > my.bashrc_minus_rpbin.txt"); - &mysystem("cp $home/.my.bashrc my.bashrc.copy"); - print "\n----------------------------------------------------\n"; - print "\n$## To remove rp_bin permanently from the default search path in bash, run the following command:\n\n"; - $i += 1; - print "\tmv my.bashrc_minus_rpbin.txt $home/.my.bashrc\n\n"; - print "## which will delete the following lines from your $home/.my.bashrc file:\n"; - - - my @tmp_lines = `grep rp_bin $home/.my.bashrc`; - foreach (@tmp_lines) { - print "##### $_"; - } - @tmp_lines = `grep rp_perlpackages $home/.my.bashrc`; - foreach (@tmp_lines) { - print "##### $_"; - } - - print "\n## A copy of $home/.my.bashrc is available at my.bashrc.copy\n"; - } - if (-e "$home/.my.cshrc") { - &mysystem("grep -v \"rp_bin\\|rp_perlpackages\" $home/.my.cshrc > my.cshrc_minus_rpbin.txt"); - &mysystem("cp $home/.my.cshrc my.cshrc.copy"); - print "\n----------------------------------------------------\n"; - print "\n$i. 
To remove rp_bin permanently from the default csh or tcsh search path, run the following command:\n\n"; - $i += 1; - print "\tmv my.cshrc_minus_rpbin.txt $home/.my.cshrc\n\n"; - print "## which will delete the following lines from your $home/.my.cshrc file:\n"; - - - - my @tmp_lines = `grep rp_bin $home/.my.cshrc`; - foreach (@tmp_lines) { - print "##### $_"; - } - @tmp_lines = `grep rp_perlpackages $home/.my.cshrc`; - foreach (@tmp_lines) { - print "##### $_"; - } -## &mysystem("grep \"rp_bin\\|rp_perlpackages\" $home/.my.cshrc"); - - - - print "\n## A copy of $home/.my.cshrc is available at my.cshrc.copy\n"; - } - } - elsif ($clusters{genomedk} == 1 || $clusters{mssm} == 1 || $clusters{lisa} == 1 || $clusters{computerome} == 1 || $clusters{co_ipsych} == 1) { - if (-e "$home/.bashrc") { - &mysystem("grep -v \"rp_bin\\|rp_perlpackages\" $home/.bashrc > my.bashrc_minus_rpbin.txt"); - &mysystem("cp $home/.bashrc my.bashrc.copy"); - print "\n----------------------------------------------------\n"; - print "\n$## To remove rp_bin permanently from the default search path in bash, run the following command:\n\n"; - $i += 1; - print "\tmv my.bashrc_minus_rpbin.txt $home/.bashrc\n\n"; - print "## which will delete the following lines from your $home/.bashrc file:\n"; - - - - my @tmp_lines = `grep rp_bin $home/.bashrc`; - foreach (@tmp_lines) { - print "##### $_"; - } - @tmp_lines = `grep rp_perlpackages $home/.bashrc`; - foreach (@tmp_lines) { - print "##### $_"; - } -## &mysystem("grep \"rp_bin\\|rp_perlpackages\" $home/.bashrc"); - - - print "\n## A copy of $home/.bashrc is available at my.bashrc.copy\n"; - } - } - - - - - else { - print "\n----------------------------------------------------\n"; - print "\n## Remove the directories listed above from the same place where you permanently added the directories to the search path.\n"; - $i += 1; - } - print "\n\n"; - exit; - } - elsif ($answer eq "n") {&mysystem("touch install_true");last;} - else {print "Please answer with y or n.\n";} - } -}; - -################################################### -### Add rp_bin to default search path -################################################### -system("bin_check"); # dummy script that doesn't do anything -my $status_bin = ($? >> 8); -system("bin_check_pdfjam"); # dummy script that doesn't do anything -my $status_pdfjam = ($? >> 8); - - - - -# exit; - - -unless ($status_bin == 0 && $status_pdfjam == 0) { - my $bash = "$cdir/my.bashrc_rp_path"; - my $csh = "$cdir/my.cshrc_rp_path"; - - die $! unless open FILE, "> $bash"; - print FILE "\n\nPATH=$cdir:\$PATH\n"; - print FILE "PATH=$cdir/pdfjam:\$PATH\n"; - - if ($clusters{lisa}){ - print FILE "export rp_perlpackages=/home/gwas/perl_modules\n"; - } - if ($clusters{computerome}){ - print FILE "export rp_perlpackages=/home/people/sripke/rp_external_bins/perl_packages\n"; - } - if ($clusters{co_ipsych}){ - print FILE "export rp_perlpackages=/data/user_tools/rp_external_bins/perl_packages\n"; - } - if ($clusters{broad}){ - print FILE "export rp_perlpackages=/home/unix/sripke/perl_modules\n"; - } - close FILE; - - die $! 
unless open FILE, "> $csh"; - print FILE "\n\nset path=($cdir \$path)\n"; - print FILE "set path=($cdir/pdfjam \$path)\n"; - if ($clusters{broad}){ - print FILE "setenv rp_perlpackages /home/unix/sripke/perl_modules\n"; - } - close FILE; - - - - print "\n----------------------------------------------------\n"; - print "## Please run the following commands to permanently add rp_bin to the default search path and restart the configuration: \n\n"; - - - if ($clusters{broad}){ - my $i = 1; - - if (-e "$home/.my.bashrc") { - print "cat $bash >> ~/.my.bashrc\n"; - $i += 1; - } - if (-e "$home/.my.cshrc") { - print "cat $csh >> ~/.my.cshrc\n"; - $i += 1; - } - - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - print "export rp_perlpackages=/home/unix/sripke/perl_modules\n"; - $i += 1; - } - elsif ($shell eq "tcsh") { - print "set path=($cdir \$path)\n"; - $i += 1; - print "set path=($cdir/pdfjam \$path)\n"; - $i += 1; - print "setenv rp_perlpackages /home/unix/sripke/perl_modules\n"; - $i += 1; - } - - - - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - elsif ($clusters{genomedk}){ - my $i = 1; - - - if (-e "$home/.bashrc") { - print "cat $bash >> ~/.bashrc\n"; - $i += 1; - } - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - } - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - elsif ($clusters{lisa}){ - my $i = 1; - - if (-e "$home/.bashrc") { - print "cat $bash >> ~/.bashrc\n"; - $i += 1; - } - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - } - print "export rp_perlpackages=/home/gwas/perl_modules\n"; - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - elsif ($clusters{computerome}){ - my $i = 1; - - if (-e "$home/.bashrc") { - print "cat $bash >> ~/.bashrc\n"; - $i += 1; - } - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - } - print "export rp_perlpackages=/home/people/sripke/rp_external_bins/perl_packages\n"; - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - elsif ($clusters{co_ipsych}){ - my $i = 1; - - if (-e "$home/.bashrc") { - print "cat $bash >> ~/.bashrc\n"; - $i += 1; - } - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - } - print "export rp_perlpackages=/data/user_tools/rp_external_bins/perl_packages\n"; - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - - - - - elsif ($clusters{mssm}){ - my $i = 1; - - if (-e "$home/.bashrc") { - print "cat $bash >> ~/.bashrc\n"; - $i += 1; - } - if ($shell eq "bash") { - print "PATH=$cdir:\$PATH\n"; - $i += 1; - print "PATH=$cdir/pdfjam:\$PATH\n"; - $i += 1; - } - print "./rp_config\n\n"; - &mysystem("touch install_true"); - exit; - } - else { - print "You'll need to add the following paths to your default search path:\n"; - print "\t$cdir\n"; - print "\t$cdir/pdfjam\n\n"; - print "If you are using a bash shell, sample commands are located in this file: $bash\n"; - print "If you are using a tcsh shell, sample commands are located in this file: $csh\n"; - print "For example instructions, see http://www.cyberciti.biz/faq/unix-linux-adding-path/\n"; - print "If possible, add these paths permanently. 
Otherwise, you will need to do this everytime you start a new session.\n"; - print "After these directories have been added to the search path, rerun this script: ./rp_config\n"; - &mysystem("touch install_true"); - exit; - } -} - -print "Required directories found in search path:\n"; -print "\trp_bin/ -- success\n"; -print "\trp_bin/pdfjam/ -- success\n\n"; - -system("latex small2e > /dev/null"); # dummy script that doesn't do anything -my $status_latex = ($? >> 8); -unless ($status_latex) { - print "Detected pdflatex is installed.\n\n"; - &mysystem("rm small2e.*"); -} -else { - print "---------------------------------------\n\n"; - print "Error -- pdflatex is not installed!\n\n"; - - if ($clusters{genomedk} == 1){ - print "Run the following commands to add pdflatex to the default search path:\n"; - print "\techo \"source /com/extra/texlive/2014/load.sh\" >> ~/.bashrc\n"; - print "\tsource /com/extra/texlive/2014/load.sh\n\n"; -} - else { - print "Please install pdflatex by downloading the texlive package and following the installation instructions (https://www.tug.org/texlive/)\n\n"; - } - print "Rerun this script once pdflatex has been added to the default search path (./rp_config)\n\n"; - print "---------------------------------------\n"; - exit; -} - -### Make sure all perl packages are installed -### JG addition -- wrote this block before I saw you added something similar above -#if ($clusters{broad} == 1){ -# unless (exists $ENV{rp_perlpackages}) { -# print "Run the following commands to add rp_perlpackages as an environmental variable:\n"; -# print "echo \"export rp_perlpackages=/home/unix/sripke/perl_modules/\" >> ~/.my.bashrc\n"; -# print "echo \"setenv rp_perlpackages /home/unix/sripke/perl_modules/\" >> ~/.my.cshrc\n"; -# if ($shell eq "bash") { -# print "export rp_perlpackages=/home/unix/sripke/perl_modules/\n"; -# } -# if ($shell eq "tcsh") { -# print "setenv rp_perlpackages /home/unix/sripke/perl_modules/\n"; -# } - -# print "./rp_config\n\n"; -# exit; -# } -# else { print "Detected rp_perlpackages as an environmental variable.\n\n";} -#} -if ($clusters{genomedk} == 1){ - unless (exists $ENV{rp_perlpackages}) { - print "Run the following commands to add rp_perlpackages as an environmental variable:\n"; - print "echo \"export rp_perlpackages=/project/ricopili/perl_packages/\" >> ~/.bashrc\n"; - print "export rp_perlpackages=/project/ricopili/perl_packages/\n"; - print "./rp_config\n\n"; - exit; - } - else { print "Detected rp_perlpackages as an environmental variable.\n\n";} -} -if ($clusters{mssm} == 1){ - unless (exists $ENV{rp_perlpackages}) { - print "Run the following commands to add rp_perlpackages as an environmental variable:\n"; - print "echo \"export rp_perlpackages=/hpc/users/xripkes01/perl_modules/\" >> ~/.bashrc\n"; - print "export rp_perlpackages=/hpc/users/xripkes01/perl_modules/\n"; - print "./rp_config\n\n"; - exit; - } - else { print "Detected rp_perlpackages as an environmental variable.\n\n";} -} - -### Make sure lapack is installed -if ($clusters{genomedk} == 1){ - unless ($ENV{EXTRAS} =~ /lapack/) { - print "Run the following commands to add lapack to the default search path:\n"; - print "echo \"source /com/extra/lapack/3.5.0/load.sh\" >> ~/.bashrc\n"; - print "source /com/extra/lapack/3.5.0/load.sh\n"; - print "./rp_config\n\n"; - exit; - } - else { print "Detected lapack is installed.\n\n";} -} - -my $ans_ow = "y"; -if (-e $conf_file) { - print "Configuration file already exists at $conf_file\n"; - print "Do you wish to overwrite this file? 
(y/n)\n"; - while (1) { - $ans_ow = lc <>; - chomp $ans_ow; - if ($ans_ow eq "y") { - print "Rewriting configuration file. Making a backup to $conf_file.copy\n\n"; - &mysystem("cp $conf_file $conf_file.copy"); - last; - } - elsif ($ans_ow eq "n") {print "Not overwriting $conf_file.\n";last;} - else {print "Please answer with y or n.\n";} - } -}; - -my $cd = cwd(); -my $sloc = ""; -my $loloc = ""; -my $initials = ""; -my $conf_file = $ENV{HOME}."/ricopili.conf"; -my $hdir = $ENV{HOME}; -my $email = ""; -my @text = (); - -if ($ans_ow eq "y"){ -############################# -# make scratch directory -############################# -if ($clusters{broad} == 1) { - my $user_name = basename($ENV{HOME}); - $sloc = "/broad/hptmp/$user_name/"; - print "Do you want to use the following default scratch directory? (y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} -elsif ($clusters{lisa} == 1) { - $sloc = "/scratch/"; - print "Do you want to use the following default scratch directory? (y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} - -elsif ($clusters{computerome} == 1) { - $sloc = "/scratch/"; - print "Do you want to use the following default scratch directory? (y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} - -elsif ($clusters{co_ipsych} == 1) { - $sloc = "/data/scratch/"; - print "Do you want to use the following default scratch directory? (y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} - -elsif ($clusters{genomedk} == 1) { - $sloc = "/project/ricopili/scratch_dir/"; - print "Do you want to use the following default scratch directory? (y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} - - -elsif ($clusters{mssm} == 1) { - my $user_name = $ENV{USER}; - $sloc = "/sc/orga/scratch/$user_name/"; - print "Do you want to use the following default scratch directory? 
(y or n)\n"; - print "\t$sloc\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} -else { - print "Please enter a scratch directory to use:\n"; - $sloc = "$cd/tmp/"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; - $sloc = <>; - chomp $sloc; - $sloc =~ s/^~/$ENV{HOME}/g; - $sloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } - -} - -unless (-d $sloc) { - print "Making scratch directory: $sloc\n\n"; - &mysystem("mkdir $sloc"); -} -else { - print "Scratch directory already exists at $sloc\n"; -} -print "\n"; - -############################# -# write config file to home directory -############################# -print "Please enter your initials (2 letters):\n"; -while (1) { - $initials = lc <>; - chomp $initials; - if (length($initials) == 2) {last;} - else {print "Make sure initials are 2 letters!\n";} -} -print "\n"; - -print "Please enter your email address:\n"; -my $email = <>; -chomp $email; -print "\n"; - - - -my $defall = 0; - -if ($clusters{lisa} == 1) { - print "Do you want to use default values for the rest of the installation process? (y or n)\n"; - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using default values for the rest of the installation process\n\n"; $defall = 1;last;} - elsif ($answer eq "n") {print "Not using default values for the rest of the installation process\n\n"; $defall = 0;last;} - else {print "Please answer with y or n.\n";} - } -} - - - - - - - -my $home_dir = $ENV{HOME}; -$loloc = "$home_dir/"; -print "Do you want to use the following default directory to store your log files? 
(y or n)\n"; -print "\t$loloc\n"; -if ($defall == 0) { - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") {print "Using $loloc as the log directory.\n\n";last;} - elsif ($answer eq "n") {print "Please enter a log directory to use:\n"; - $loloc = <>; - chomp $loloc; - $loloc =~ s/^~/$ENV{HOME}/g; - $loloc =~ s/^\./$cd/g; - last;} - else {print "Please answer with y or n.\n";} - } -} - -#print "Please enter the directory you wish to store your log files in:\n"; -#while (1) { -# $loloc = <>; -# chomp $loloc; -# $loloc =~ s/^~/$ENV{HOME}/g; -# $loloc =~ s/^\./$cd/g; -# print "Using $loloc as the directory for log files.\n"; -# unless (-d $loloc) {&mysystem("mkdir $loloc");} -# last; -#} -print "\n"; - -my %longvar = ("ploc","PLINK", - "p2loc","PLINK2", - "shloc","SHAPEIT", - "i2loc","IMPUTE2", - "liloc","Liftover", - "eloc","Eigenstrat", - "rloc","R", - "rpac","Rpackages", - "hmloc","HapMap reference", - "meloc","METAL", - "ldloc","LDscore", -# "hvloc","HaploView" - ); - - -my %variables = ("ploc", "", - "p2loc","", - "shloc","", - "i2loc","", - "liloc","", - "eloc","", - "rloc","", - "rpac","", - "hmloc","", - "meloc","", -# "hvloc","", - ); - - - - - -if ($clusters{broad}){ - %variables = ("ploc", "/home/unix/sripke/plink_src/src/", - "p2loc","/home/unix/sripke/plink_src/plink_1.9_newest/", - "shloc","/home/unix/sripke/shapeit/", - "i2loc","/psych/genetics_data/ripke/references_from_debakkerscratch/impute_v2/impute_v2/impute_2.2.7_beta/", - "liloc","/home/unix/sripke/liftover/", - "eloc","/home/unix/sripke/eigensoft/bin", - "ldloc","/psych/genetics_data/ripke/ldsc/", - "rloc","broadinstitute", - "rpac","NA", - "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref/", - "meloc","/psych/genetics_data/ripke/references_from_debakkerscratch/metal/", -# "hvloc","/home/radon01/sripke/bakker_ripke/haploview/", - ); -} - -elsif ($clusters{lisa}){ - %variables = ("ploc", "/home/gwas/plink/1.08/src", - "p2loc","/home/gwas/plink2/plink_1.9_newest", - "shloc","/home/gwas/shapeit", - "i2loc","/home/gwas/bin_impute_v2/impute_v2.2.2_x86_64_static", - "liloc","/home/gwas/liftover", - "ldloc","/home/gwas/ldsc/", - "eloc","/home/gwas/eigensoft", - "rloc","/sara/sw/R-3.1.2/bin/", - "rpac","NA", - "hmloc","/home/gwas/pgc-samples/hapmap_ref/", - "meloc","/home/gwas/metal", -# "hvloc","./", - ); -} - - - -elsif ($clusters{computerome}){ - %variables = ("ploc", "/home/people/sripke/rp_external_bins/plink/", - "p2loc","/home/people/sripke/rp_external_bins/plink_1.9_newest/", - "shloc","/home/people/sripke/rp_external_bins/shapeit/", - "i2loc","/home/people/sripke/rp_external_bins/impute2/", - "liloc","/home/people/sripke/rp_external_bins/liftover/", - "ldloc","/home/people/sripke/rp_external_bins/ldsc/", - "eloc","/home/people/sripke/rp_external_bins/EIG6.0beta/", - "rloc","/services/tools/R-3.1.2/bin/", - "rpac","/home/people/sripke/rp_external_bins/Rpackages/", - "hmloc","/home/people/sripke/imputation_references/", - "meloc","/home/people/sripke/rp_external_bins/metal/", -# "hvloc","./", - ); -} - - -elsif ($clusters{co_ipsych}){ - %variables = ("ploc", "/data/tools/plink-1.07/", - "p2loc","/data/tools/plink2_sept2015/", - "shloc","/data/tools/shapeit_sept_2015/", - "i2loc","/data/tools/impute-2.3.2/", - "liloc","/data/user_tools/rp_external_bins/liftover/", - "ldloc","/data/user_tools/rp_external_bins/ldsc/", - "eloc","/data/tools/eigensoft-6.0.1/bin/", - "rloc","/data/tools/R-3.2.1/bin/", - "rpac","/data/user_tools/rp_external_bins/Rpackages/", - 
"hmloc","/data/user_tools/imputation_references/", - "meloc","/data/tools/metal-20110325/", -# "hvloc","./", - ); -} - -elsif ($clusters{genomedk}){ - %variables = ("ploc", "/project/ricopili/plink_src/", - "p2loc","/project/ricopili/plink_1.9_jul4/", - "shloc","/project/ricopili/3rd_bins/shapeit/", - "i2loc","/project/ricopili/3rd_bins/impute2/", - "liloc","/project/ricopili/3rd_bins/liftover/", - "eloc","/project/ricopili/3rd_bins/eigenstrat/bin/", - "rloc","/com/extra/R/3.1.0/bin", - "rpac","NA", - "hmloc","/project/ricopili/reference_dir/", - "meloc","/project/ricopili/3rd_bins/metal/", -# "hvloc","./", - ); -} - -elsif ($clusters{mssm}){ - %variables = ("ploc", "/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.07-src-sripke/", - "p2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.09-src-aug4/", - "shloc","/hpc/users/xripkes01/ricopili/3rd_binaries/shapeit/", - "i2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/impute2/", - "liloc","/hpc/users/xripkes01/ricopili/3rd_binaries/liftover/", - "eloc","/hpc/packages/minerva-common/eigensoft/5.0.1/bin/", - "rloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin/", - "rpac","NA", - "hmloc","/hpc/users/xripkes01/ricopili/reference_dir/", - "meloc","/hpc/users/xripkes01/ricopili/3rd_binaries/metal/", -# "hvloc","./", - ); -} - - - -foreach (keys %variables){ - - if ($variables{$_} eq "broadinstitute" && $longvar{$_} eq "R") { - print "You are running R on broad, took the default value\n\n"; - } - elsif ($variables{$_} eq "NA" && $longvar{$_} eq "Rpackages") { - print "assuming library rmeta is installed on standard R\n\n"; - } - else { - if ($variables{$_} ne '' && (-d $variables{$_})){ - print "For $longvar{$_}, do you want to use the default location (y or n)?\n\t$variables{$_}\n"; - if ($defall == 0) { - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") { - print "Using $variables{$_} for $longvar{$_}.\n\n"; - last; - } - elsif ($answer eq "n") {print "Please enter a new location to use for $longvar{$_}:\n"; - my $input = <>; - chomp $input; - $input =~ s/^~/$ENV{HOME}/g; - $input =~ s/^\./$cd/g; - unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} - print "\n"; - last;} - else {print "Please answer with y or n.\n";} - } - } - } - else { - while (1){ - print "not default value for:\n"; - print "Please enter a location for $longvar{$_}:\n"; - my $input = ""; - $input = <>; - chomp $input; - $input =~ s/^~/$ENV{HOME}/g; - $input =~ s/^\./$cd/g; - unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} - $variables{$_} = $input; - print "\n"; - last; - } - } - } -} - -foreach (keys %variables){ - push (@text, "$_ $variables{$_}"); -} - -push (@text, "home $home"); -push (@text, "sloc $sloc"); -push (@text, "init $initials"); -push (@text, "email $email"); -push (@text, "loloc $loloc"); - -### define queue depending on cluster -#if ($clusters{broad}){push (@text, "queue bsub")} - -if ($clusters{broad}){push (@text, "queue qsub_b")} -if ($clusters{lisa}){push (@text, "queue qsub")} -if ($clusters{computerome} || $clusters{co_ipsych}){push (@text, "queue qsub_c")} -if ($clusters{genomedk}){push (@text, "queue slurm")} -if ($clusters{mssm}){push (@text, "queue msub")} -} - -unless ( -e $conf_file && $ans_ow eq "n") { - die $! 
unless open FILE, "> $conf_file"; - foreach (@text) {print FILE "$_\n"}; - close FILE; -} - -############################# -# read ricopili.config file with default parameters -############################# -my %conf = (); ## hash with config parameters - -### Read config file -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - $conf{$cells[0]} = $cells[1]; -} -close FILE; - -print "\n"; - -############################# -# write pipeline status file to home directory -############################# - -my @log_file = ("$conf{loloc}/preimp_dir_info","$conf{loloc}/impute_dir_info","$conf{loloc}/pcaer_info","$conf{loloc}/idtager_info","$conf{loloc}/repqc2_info","$conf{loloc}/areator_info","$conf{loloc}/merge_caller_info","$conf{loloc}/postimp_navi_info"); - -foreach (@log_file) { - unless ( -e $_) { - print "Creating pipeline status file to $_\n"; - &mysystem("touch $_"); - } -} - -############################ -# check whether all binary directories exist -############################ -my @fail_path = (); -my %locs = ("ploc","","p2loc","","shloc","","i2loc","","liloc","","eloc","","rloc","","hmloc","","meloc","","ldloc","","rpac",""); - -die $!."($conf_file)" unless open FILE, "< $conf_file"; -while (my $line = ){ - my @cells = split /\s+/, $line; - my $path = $cells[1]; - my $variable = $cells[0]; - unless (-d $path) { - if (exists $locs{$variable}) {push(@fail_path,$variable)}; - } -} -close FILE; - -############################# -# print finish statement -############################# - -my $fail = 0; -if ($#fail_path != -1) { - - -# foreach (@fail_path) { -# unless ($_ eq "rloc" && $clusters{broad} == 1) { - - foreach my $confvar (@fail_path) { - if ($confvar eq "rloc" && $clusters{broad} == 1) { - next; - } - elsif ($confvar eq "rpac" && $clusters{lisa} != 1 && $clusters{other} != 1) { - next; - } - else{ - $fail += 1; - } - } - if ($fail != 0) { - print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) 
to edit the file paths listed in $hdir/ricopili.conf for the following variables:\n"; - foreach (@fail_path) { - unless ($_ eq "rloc" && $clusters{broad} == 1) { - print "\t$_\n"; - } - } - } - else { - print "Setup has been completed successfully!\n"; - print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; - &mysystem("rm install_true"); - &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); - } -} -else { - print "Setup has been completed successfully!\n"; - print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; - &mysystem("rm install_true"); - &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); -} - - - -my $hostname = $ENV{HOSTNAME}.'.'.$ENV{DOMAINNAME}; -if ($clusters{lisa} == 1) { - $hostname = "lisa.surfsara.nl"; -} -if ($clusters{computerome} == 1) { - $hostname = "computerome.cbs.dtu.dk"; -} - -if ($clusters{co_ipsych} == 1) { - $hostname = "ipsych.computerome.cbs.dtu.dk"; -} - - -print "-------------------------------------------------------------------\n"; -print "adding these commands to your ~/.bashrc can be very helpful\n(just copy/paste the following lines into ~/.bashrc)\n(you have to logout and login again for these to take effect)\n\n"; -print "## for colored output of ls:\n"; -print 'alias ls=\'ls --color=auto\''."\n\n"; -print "## for easy copy over to your local machine:\n"; -print 'alias c=\'sed "s#.*#scp '.$ENV{LOGNAME}.'@'.$hostname.':$(pwd)/& .#"\''."\n\n"; - - -print "## for list of cluster jobs:\n"; -if ($clusters{lisa} == 1 || $clusters{computerome} == 1 || $clusters{co_ipsych} == 1 || $clusters{broad} == 1) { - print 'alias q=\'qstat -u '.$ENV{LOGNAME}."\'\n\n"; -} -else { - print "alias q=\'bjobs -w\'\n\n"; - -} - -if ($clusters{computerome} == 1) { - print "## load queuing system by default:\n"; - print "module load torque\n\n"; - print "## different prompt:\n"; - print 'PS1="$USER@computerome.cbs.dtu.dk:"\'\w\'" "'."\n\n"; -} -elsif ($clusters{co_ipsych} == 1) { - print "## different prompt:\n"; - print 'PS1="$USER@ipsych.computerome.cbs.dtu.dk:"\'\w\'" "'."\n\n"; -} -else { - print "## different prompt:\n"; - print 'PS1="'.$ENV{USER}.'@'.$hostname.':"\'\w\'" "'."\n\n"; -} - - - -print "-------------------------------------------------------------------\n"; -exit; - - - -########## Done ########## diff --git a/bin/config_pico.pl b/bin/config_pico.pl new file mode 100755 index 0000000..7fc9563 --- /dev/null +++ b/bin/config_pico.pl @@ -0,0 +1,714 @@ +#!/usr/bin/env perl +use strict; +use File::Basename; +use Cwd; +use Cwd 'abs_path'; +use Data::Dumper; + +### Script to configure settings for picopili pipeline +### Jackie Goldstein, Jan 2014 + +### Adapted for picopili by Raymond Walters, Sept 2016 + +my $version = "2.0.0"; +my $progname = $0; + +$progname =~ s!^.*/!!; + +my $cdir = abs_path($0); +my $home = $ENV{HOME}; +my $conf_file = $ENV{HOME}."/picopili.conf"; +my $command_line = "$progname @ARGV"; + +print "\n"; +print "##############################\n"; +print "#\n"; +print "# Creating config file for picopili\n"; +print "# $conf_file\n"; +print "#\n"; +print "# Will index location of executables for other\n"; +print "# programs (e.g. plink), reference files, and\n"; +print "# job settings (e.g. 
email address for job logs).\n"; +print "#\n"; +print "# Default settings are available for clusters\n"; +print "# with existing ricopili configurations.\n"; +print "#\n"; +print "##############################\n"; + + + +############################# +# Ask user what cluster they're using +############################# +#my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"other",0); +#my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"computerome",0,"other",0); +my %clusters = ("broad",0,"mssm",0,"genomedk",0,"lisa",0,"computerome",0,"co_ipsych",0,"other",0); +my @cluster_names = ("broad","mssm","genomedk","lisa","computerome","co_ipsych","other"); +print "Please enter your cluster name from the following options:\n"; +my $i = 1; +foreach (@cluster_names){ + print "\t($i) $_\n"; + $i += 1; +} +print "\n"; +my $cluster = "other"; +while (1) { + $cluster = lc <>; + chomp $cluster; + if (exists $clusters{$cluster}){$clusters{$cluster} = 1;last;} + else { + $cluster =~ s/(\)|\()//g; + if ($cluster >= 1 && $cluster <= $i){$cluster -= 1; $cluster = $cluster_names[$cluster];$clusters{$cluster} = 1;last;} + else { + print "Did not recognize option. Please enter a cluster name from the options below:\n"; + my $i = 1; + foreach (@cluster_names){ + print "\t($i) $_\n"; + $i += 1; + } + print "\n"; + my $cluster = "other"; + } + } +} +print "\nUsing the following cluster: $cluster\n\n"; + + +################################################### +### system call with test if successful +################################################### +sub mysystem(){ + my ($systemstr)="@_"; + system($systemstr); + my $status = ($? >> 8); + die "$systemstr\n->system call failed: $status" if ($status != 0); +} + +################################################### +### Make sure lapack is installed +### specific to genomedk, and unclear if needed? +################################################### +# if ($clusters{genomedk} == 1){ +# unless ($ENV{EXTRAS} =~ /lapack/) { +# print "Run the following commands to add lapack to the default search path:\n"; +# print "echo \"source /com/extra/lapack/3.5.0/load.sh\" >> ~/.bashrc\n"; +# print "source /com/extra/lapack/3.5.0/load.sh\n"; +# print "./rp_config\n\n"; +# exit; +# } +# else { print "Detected lapack is installed.\n\n";} +#} + + +################################################### +### Check whether to overwrite existing config (if exists) +################################################### + +my $ans_ow = "y"; +if (-e $conf_file) { + print "Configuration file already exists at $conf_file\n"; + print "Do you wish to overwrite this file? (y/n)\n"; + while (1) { + $ans_ow = lc <>; + chomp $ans_ow; + if ($ans_ow eq "y") { + print "Rewriting configuration file. Making a backup to $conf_file.copy\n\n"; + &mysystem("cp $conf_file $conf_file.copy"); + last; + } + elsif ($ans_ow eq "n") {print "Not overwriting $conf_file.\n";last;} + else {print "Please answer with y or n.\n";} + } +}; + +my $cd = cwd(); +my $sloc = ""; +my $initials = ""; +my $email = ""; +my @text = (); + +if ($ans_ow eq "y"){ +############################# +# make scratch directory +############################# +if ($clusters{broad} == 1) { + my $user_name = basename($ENV{HOME}); + $sloc = "/broad/hptmp/$user_name/"; + print "Do you want to use the following default scratch directory? 
(y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} +elsif ($clusters{lisa} == 1) { + $sloc = "/scratch/"; + print "Do you want to use the following default scratch directory? (y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} + +elsif ($clusters{computerome} == 1) { + $sloc = "/scratch/"; + print "Do you want to use the following default scratch directory? (y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} + +elsif ($clusters{co_ipsych} == 1) { + $sloc = "/data/scratch/"; + print "Do you want to use the following default scratch directory? (y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} + +elsif ($clusters{genomedk} == 1) { + $sloc = "/project/ricopili/scratch_dir/"; + print "Do you want to use the following default scratch directory? (y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} + + +elsif ($clusters{mssm} == 1) { + my $user_name = $ENV{USER}; + $sloc = "/sc/orga/scratch/$user_name/"; + print "Do you want to use the following default scratch directory? 
(y or n)\n"; + print "\t$sloc\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } +} +else { + print "Please enter a scratch directory to use:\n"; + $sloc = "$cd/tmp/"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using $sloc as the scratch directory.\n\n";last;} + elsif ($answer eq "n") {print "Please enter a scratch directory to use:\n"; + $sloc = <>; + chomp $sloc; + $sloc =~ s/^~/$ENV{HOME}/g; + $sloc =~ s/^\./$cd/g; + last;} + else {print "Please answer with y or n.\n";} + } + +} + +unless (-d $sloc) { + print "Making scratch directory: $sloc\n\n"; + &mysystem("mkdir $sloc"); +} +else { + print "Scratch directory already exists at $sloc\n"; +} +print "\n"; + +############################# +# analyst info +############################# +print "Please enter your initials (2 letters):\n"; +while (1) { + $initials = lc <>; + chomp $initials; + if (length($initials) == 2) {last;} + else {print "Make sure initials are 2 letters!\n";} +} +print "\n"; + +print "Please enter your email address:\n"; +my $email = <>; +chomp $email; +print "\n"; + + + + +############################# +# allow default all remaining values on select platforms +############################# +my $defall = 0; + +if ($clusters{lisa} == 1 || $clusters{broad} == 1) { + print "Do you want to use default values for the rest of the installation process? (y or n)\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") {print "Using default values for the rest of the installation process\n\n"; $defall = 1;last;} + elsif ($answer eq "n") {print "Not using default values for the rest of the installation process\n\n"; $defall = 0;last;} + else {print "Please answer with y or n.\n";} + } +} + + + +print "\n"; + +my %longvar = ("p2loc","PLINK2", + "shloc","SHAPEIT", + "i2loc","IMPUTE2", + "liloc","Liftover", + "eloc","Eigenstrat", +# "rloc","R", + "hmloc","HapMap reference", + "perlpack","Perl packages (for Compress::Zlib)", + ); + + +my %variables = ("p2loc", "", + "shloc","", + "i2loc","", + "liloc","", + "eloc","", +# "rloc","", + "hmloc","", + "perlpack","", + ); + + +if ($clusters{broad}){ + %variables = ( + # "ploc", "/home/unix/sripke/plink_src/src/", + "p2loc","/home/unix/sripke/plink_src/plink_1.9_newest/", + "shloc","/home/unix/sripke/shapeit/", + "i2loc", "/psych/genetics_data/ripke/references_from_debakkerscratch/impute_v2/impute_v2/impute_2.2.7_beta/", + "liloc","/home/unix/sripke/liftover/", + "eloc","/home/unix/sripke/eigensoft/bin", +# "ldloc","/psych/genetics_data/ripke/ldsc/", +# "rloc","broadinstitute", +# "rpac","NA", + "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref/", +# "meloc","/psych/genetics_data/ripke/references_from_debakkerscratch/metal/", +# "hvloc","/home/radon01/sripke/bakker_ripke/haploview/", + "perlpack","/home/unix/sripke/perl_modules", + ); +} + +elsif ($clusters{lisa}){ + %variables = ( +# "ploc", "/home/gwas/plink/1.08/src", + "p2loc","/home/gwas/plink2/plink_1.9_newest", + "shloc","/home/gwas/shapeit", + "i2loc","/home/gwas/bin_impute_v2/impute_v2.2.2_x86_64_static", + "liloc","/home/gwas/liftover", +# "ldloc","/home/gwas/ldsc/", + "eloc","/home/gwas/eigensoft", +# "rloc","/sara/sw/R-3.1.2/bin/", +# "rpac","NA", 
+ "hmloc","/home/gwas/pgc-samples/hapmap_ref/", +# "meloc","/home/gwas/metal", +# "hvloc","./", + "perlpack","/home/gwas/perl_modules", + ); +} + + + +elsif ($clusters{computerome}){ + %variables = ( +# "ploc", "/home/people/sripke/rp_external_bins/plink/", + "p2loc","/home/people/sripke/rp_external_bins/plink_1.9_newest/", + "shloc","/home/people/sripke/rp_external_bins/shapeit/", + "i2loc","/home/people/sripke/rp_external_bins/impute2/", + "liloc","/home/people/sripke/rp_external_bins/liftover/", +# "ldloc","/home/people/sripke/rp_external_bins/ldsc/", + "eloc","/home/people/sripke/rp_external_bins/EIG6.0beta/", +# "rloc","/services/tools/R-3.1.2/bin/", +# "rpac","/home/people/sripke/rp_external_bins/Rpackages/", + "hmloc","/home/people/sripke/imputation_references/", +# "meloc","/home/people/sripke/rp_external_bins/metal/", +# "hvloc","./", + "perlpack","/home/people/sripke/rp_external_bins/perl_packages", + ); +} + + +elsif ($clusters{co_ipsych}){ + %variables = ( +# "ploc", "/data/tools/plink-1.07/", + "p2loc","/data/tools/plink2_sept2015/", + "shloc","/data/tools/shapeit_sept_2015/", + "i2loc","/data/tools/impute-2.3.2/", + "liloc","/data/user_tools/rp_external_bins/liftover/", +# "ldloc","/data/user_tools/rp_external_bins/ldsc/", + "eloc","/data/tools/eigensoft-6.0.1/bin/", +# "rloc","/data/tools/R-3.2.1/bin/", +# "rpac","/data/user_tools/rp_external_bins/Rpackages/", + "hmloc","/data/user_tools/imputation_references/", +# "meloc","/data/tools/metal-20110325/", +# "hvloc","./", + "perlpack","/data/user_tools/rp_external_bins/perl_packages", + ); +} + +elsif ($clusters{genomedk}){ + %variables = ( +# "ploc", "/project/ricopili/plink_src/", + "p2loc","/project/ricopili/plink_1.9_jul4/", + "shloc","/project/ricopili/3rd_bins/shapeit/", + "i2loc","/project/ricopili/3rd_bins/impute2/", + "liloc","/project/ricopili/3rd_bins/liftover/", + "eloc","/project/ricopili/3rd_bins/eigenstrat/bin/", +# "rloc","/com/extra/R/3.1.0/bin", +# "rpac","NA", + "hmloc","/project/ricopili/reference_dir/", +# "meloc","/project/ricopili/3rd_bins/metal/", +# "hvloc","./", + "perlpack","/project/ricopili/perl_packages/", + ); +} + +elsif ($clusters{mssm}){ + %variables = ( +# "ploc", "/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.07-src-sripke/", + "p2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.09-src-aug4/", + "shloc","/hpc/users/xripkes01/ricopili/3rd_binaries/shapeit/", + "i2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/impute2/", + "liloc","/hpc/users/xripkes01/ricopili/3rd_binaries/liftover/", + "eloc","/hpc/packages/minerva-common/eigensoft/5.0.1/bin/", +# "rloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin/", +# "rpac","NA", + "hmloc","/hpc/users/xripkes01/ricopili/reference_dir/", +# "meloc","/hpc/users/xripkes01/ricopili/3rd_binaries/metal/", +# "hvloc","./", + "perlpack","/hpc/users/xripkes01/perl_modules/", + ); +} + + + +foreach (keys %variables){ + + if ($variables{$_} eq "broadinstitute" && $longvar{$_} eq "R") { + print "You are running R on broad, took the default value\n\n"; + } + elsif ($variables{$_} eq "NA" && $longvar{$_} eq "Rpackages") { + print "assuming library rmeta is installed on standard R\n\n"; + } + else { + if ($variables{$_} ne '' && (-d $variables{$_})){ + print "For $longvar{$_}, do you want to use the default location (y or n)?\n\t$variables{$_}\n"; + if ($defall == 0) { + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") { + print "Using $variables{$_} for $longvar{$_}.\n\n"; + last; + } + elsif ($answer eq "n") {print "Please 
enter a new location to use for $longvar{$_}:\n"; + my $input = <>; + chomp $input; + $input =~ s/^~/$ENV{HOME}/g; + $input =~ s/^\./$cd/g; + unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} + print "\n"; + last;} + else {print "Please answer with y or n.\n";} + } + } + } + else { + while (1){ + unless($clusters{other} == 1){ + print "No default value available for:\n"; + } + print "Please enter a location for $longvar{$_}:\n"; + my $input = ""; + $input = <>; + chomp $input; + $input =~ s/^~/$ENV{HOME}/g; + $input =~ s/^\./$cd/g; + unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} + $variables{$_} = $input; + print "\n"; + last; + } + } + } +} + +foreach (keys %variables){ + push (@text, "$_ $variables{$_}"); +} + +push (@text, "sloc $sloc"); +push (@text, "init $initials"); +push (@text, "email $email"); + +### define queue depending on cluster +#if ($clusters{broad}){push (@text, "queue bsub")} + +if ($clusters{broad}){push (@text, "queue broad_uger")} +if ($clusters{lisa}){push (@text, "queue qsub")} +if ($clusters{computerome} || $clusters{co_ipsych}){push (@text, "queue qsub_c")} +if ($clusters{genomedk}){push (@text, "queue slurm")} +if ($clusters{mssm}){push (@text, "queue msub")} +} + +unless ( -e $conf_file && $ans_ow eq "n") { + die $! unless open FILE, "> $conf_file"; + foreach (@text) {print FILE "$_\n"}; + close FILE; +} + + + +############################# +# read ricopili.config file with default parameters +############################# +my %conf = (); ## hash with config parameters + +### Read config file +die $!."($conf_file)" unless open FILE, "< $conf_file"; +while (my $line = ){ + my @cells = split /\s+/, $line; + $conf{$cells[0]} = $cells[1]; +} +close FILE; + +print "\n"; + +############################ +# check whether all binary directories exist +############################ +my @fail_path = (); +my %locs = ( +# "ploc","", + "p2loc","", + "shloc","", + "i2loc","", + "liloc","", + "eloc","", +# "rloc","", + "hmloc","", +# "meloc","", +# "ldloc","", +# "rpac","", + "perlpack","" +); + +die $!."($conf_file)" unless open FILE, "< $conf_file"; +while (my $line = ){ + my @cells = split /\s+/, $line; + my $path = $cells[1]; + my $variable = $cells[0]; + unless (-d $path) { + if (exists $locs{$variable}) {push(@fail_path,$variable)}; + } +} +close FILE; + +############################# +# print finish statement +############################# + +my $fail = 0; +if ($#fail_path != -1) { + + +# foreach (@fail_path) { +# unless ($_ eq "rloc" && $clusters{broad} == 1) { + + foreach my $confvar (@fail_path) { + if ($confvar eq "rloc" && $clusters{broad} == 1) { + next; + } + elsif ($confvar eq "rpac" && $clusters{lisa} != 1 && $clusters{other} != 1) { + next; + } + else{ + $fail += 1; + } + } + if ($fail != 0) { + print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) 
to edit the file paths listed in $home/ricopili.conf for the following variables:\n"; + foreach (@fail_path) { + unless ($_ eq "rloc" && $clusters{broad} == 1) { + print "\t$_\n"; + } + } + } + else { + print "Setup has been completed successfully!\n"; + print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; + &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); + } +} +else { + print "Setup has been completed successfully!\n"; + print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; + &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); +} + + + + + + +################################################### +### Optional: Add bin to default search path +################################################### + +system("bin_check_pico"); # dummy script that doesn't do anything +my $status_bin = ($? >> 8); + + +if ($clusters{lisa} == 1) { + unless (-e "$home/.bash_profile") { + die $! unless open FILE, "> $home/.bash_profile"; + print FILE 'if [ -f ~/.bashrc ]; then '."\n"; + print FILE ' . ~/.bashrc'."\n"; + print FILE 'fi'."\n"; + close FILE; + } + unless (-e "$home/.bashrc") { + system "touch ~/.bashrc\n"; + } +} + + +if ($status_bin != 0) { + my $bash = "PATH=\$PATH:$cdir"; + my $csh = "set path=(\$path $cdir)"; + + print "\n----------------------------------------------------\n"; + print "## You will probably want to add picopili to the default search path.\n"; + + + # Determine the shell + my $shell = ''; + if (exists $ENV{SHELL}){$shell = basename($ENV{SHELL});} + if ($shell eq "bash-login-check"){$shell = "bash";} + if ($shell ne "bash" && $shell ne "tcsh") { + print "Warning! Shell not recognized: $shell\n"; + print "Please send email to rwalters\@broadinstitute.org\n"; + } + print "Detected you are using the following shell: $shell\n\n"; + + # provide commands, where possible + # perm tracks if a command is generated for .bashrc or equivalent + my $perm = 0; + if ($shell eq "bash"){ + print "To do this in bash, run the following command:\n"; + print "$bash\n"; + if ($clusters{broad}){ + if (-e "$home/.my.bashrc") { + print "echo \"$bash\" >> ~/.my.bashrc\n"; + $perm = 1; + } + } + else{ + if ($clusters{lisa} == 1){ + unless (-e "$home/.bashrc") { + print "touch ~/.bashrc\n"; + } + unless (-e "$home/.bash_profile") { + print "echo \"if [ -f ~/.bashrc ]; then \" > $home/.bash_profile"; + print "echo \" ~/.bashrc\" >> $home/.bash_profile"; + print "echo \"fi\" >> $home/.bash_profile"; + } + } + if (-e "$home/.bashrc") { + print "echo \"$bash\" >> ~/.bashrc\n"; + $perm = 1; + } + } + } + elsif ($shell eq "tcsh"){ + print "To do this in tcsh, run the following command:\n"; + print "$csh\n"; + if ($clusters{broad}){ + if (-e "$home/.my.cshrc") { + print "echo \"$csh\" >> ~/.my.cshrc\n"; + $perm = 1; + } + } + else{ + if (-e "$home/.cshrc") { + print "echo \"$csh\" >> ~/.cshrc\n"; + $perm = 1; + } + } + } + # else if shell not determined + else { + print "You'll want to add the following path:\n"; + print "\t$cdir\n"; + } + # additional instructions of not .bashrc equivalent provided + if ($perm == 0){ + print "If possible, add these paths permanently. 
Otherwise, you will need to do this everytime you start a new session.\n"; + print "For example instructions, see http://www.cyberciti.biz/faq/unix-linux-adding-path/\n"; + } +} +else{ + print "Successfully found picopili directory in search path!\n"; +} + + +exit; +########## Done ########## diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index d14a5ef..9134f09 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -29,14 +29,17 @@ use FindBin; use lib "$FindBin::Bin"; -use Ricopili::Utils qw(trans); +use rp_perl::Utils qw(trans); my $version = "1.0.24"; my $progname = $0; $progname =~ s!^.*/!!; my $command_line = "$progname @ARGV"; - +use Cwd; +use File::Path; +my $rootdir = &Cwd::cwd(); +my $sjainfotxt = "$rootdir\t$command_line"; my $jnum = 7; ### number of imputation job per node @@ -54,16 +57,13 @@ ############################# my $ploc = &trans("p2loc"); -my $homedir = &trans("home"); my $qloc = &trans("queue"); my $liloc = &trans("liloc"); my $email = &trans("email"); -my $loloc = &trans("loloc"); ############################################### -my $rootdir = ""; my $iname = "" ; my $suminfo = "infosum_pos"; @@ -395,14 +395,10 @@ sub a2filenew_app { } -my $sjainfofile = "$loloc/impute_dir_info"; +my $sjainfofile = "$rootdir/impute_dir_info.log"; unless (-e $sjainfofile) { - print "log-file ($sjainfofile) is not existing\n"; - print "please check loloc in ~/picopili.conf\n"; - exit; + &mysystem ("touch $sjainfofile"); } -#my $sjainfofile = "$homedir/impute_dir_info_35_test"; -my $sjainfotxt = ""; my $sjamulti = 0; @@ -619,12 +615,6 @@ sub send_jobarray { ############################################## -use Cwd; -use File::Path; -$rootdir = &Cwd::cwd(); -$sjainfotxt = "$rootdir\t$command_line"; - - unless (-e $impute_dir){ print "impute_dir is not existing, create one for you\n"; my @created = mkpath( ## $created ? 
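As a minimal usage sketch of the config plumbing introduced above (illustrative only, not a hunk in this patch series): a bin/ script pulls its settings from ~/picopili.conf through the new rp_perl::Utils module, where the conf file holds whitespace-separated "key value" pairs written by config_pico.pl and &trans() dies on any missing key. The keys p2loc and email are taken from the diffs above; the printed labels are arbitrary.

use strict;
use warnings;
use FindBin;
use lib "$FindBin::Bin";                  # assumes rp_perl/Utils.pm sits next to this script, as in bin/
use rp_perl::Utils qw(trans $conf_file);  # Utils.pm parses $HOME/picopili.conf when loaded

print "config read from: $conf_file\n";

my $p2loc = &trans("p2loc");              # e.g. the plink2 directory chosen during config_pico.pl
my $email = &trans("email");              # dies with "config file without entry: email" if the key is absent

print "plink2 location: $p2loc\n";
print "job notification email: $email\n";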
diff --git a/config b/config deleted file mode 120000 index bc60002..0000000 --- a/config +++ /dev/null @@ -1 +0,0 @@ -./bin/config \ No newline at end of file diff --git a/docs/RICOPILI.md b/docs/RICOPILI.md index ef8e45e..e71f047 100644 --- a/docs/RICOPILI.md +++ b/docs/RICOPILI.md @@ -3,9 +3,10 @@ The following scripts are adapted from ricopili (https://github.com/Nealelab/ric * `buigue_pico.pl`, from `buigue` * `checkflip_pico.pl`, from `checkflip4` * `checkpos_pico.pl`, from `checkpos6` -* `config`, from `rp_config` +* `config_pico.pl`, from `rp_config` * `lift_to_hg19.pl`, from `lift18219` * `plague_pico.pl`, from `plague_2` +* `bin_check_pico`, from `bin_check` * `./rp_perl/Utils.pm`, from `./Ricopili/Utils.pm` In addition, the following scripts are adapted from ricopili with more substantial changes as indicated: From 576e70f4d30f0c0a8289f93e51e0a7d4e3059f19 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 29 Sep 2016 13:13:40 -0400 Subject: [PATCH 04/48] migrate to using picopili.conf file --- bin/admix_rel.py | 35 ++++++++++++++--------------------- bin/agg_imp.py | 24 ++++++------------------ bin/args_pca.py | 4 ++-- bin/args_ped.py | 2 +- bin/bg_imp.py | 16 +++------------- bin/checkflip_pico.pl | 2 +- bin/checkpos_pico.pl | 2 +- bin/config_pico.pl | 4 ++-- bin/gwas_dfam.py | 15 +++++---------- bin/gwas_gee.py | 17 +++++++---------- bin/gwas_rel.py | 19 ++++--------------- bin/imp2_rel.py | 33 +++++++++------------------------ bin/imus_pca.py | 33 +++++++++++++++------------------ bin/py_helpers.py | 43 +++++++++++++++++++++++++++++++++++++++---- bin/qc_rel.py | 26 +++++++++----------------- bin/shape_rel.py | 17 +++-------------- bin/strict_qc.py | 23 +++++++---------------- 17 files changed, 128 insertions(+), 187 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index debf85a..5f29981 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -42,7 +42,7 @@ from string import ascii_uppercase from glob import glob from numpy import digitize -from py_helpers import unbuffer_stdout, file_len, test_exec, read_conf, find_from_path, link, gz_confirm +from py_helpers import unbuffer_stdout, file_len, test_exec, find_exec, link, gz_confirm unbuffer_stdout() @@ -196,35 +196,28 @@ print '--plot-admix-pca '+str(args.plot_admix_pca) - ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' +# find, check exists, executable ############# -### read plink loc from config -# not getting R here since ricopili.conf currently relies on platform info -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) +plinkx = find_exec('plink',key='p2loc') -plinkx = configs['p2loc']+"plink" +if args.rscript_ex == None or args.rscript_ex == "None": + args.rscript_ex = find_exec('Rscript', key='rscloc') +if args.admixture_ex == None or args.admixture_ex == "None": + args.admixture_ex = find_exec('admixture', key='admloc') -############# -print '\n...Checking dependencies...' 
-# check exists, executable -############# - -# get variables from path as needed -# - Rscript (if unspecified) -# - IBD plotting script -# - PCA plotting script (optional) -if args.rscript_ex == None or args.rscript_ex == "None": - args.rscript_ex = find_from_path('Rscript', 'Rscript') +if args.reap_ex == None or args.reap_ex == "None": + args.reap_ex = find_exec('REAP', key='reaploc') -Rplotibdx = find_from_path('plot_reap_ibd.Rscript', 'IBD plotting script') +rp_bin = os.path.dirname(os.path.realpath(__file__)) +Rplotibdx = rp_bin+'/plot_reap_ibd.Rscript' if plot_pca: - Rplotpcax = find_from_path('plot_pca.Rscript', 'PCA plotting script') + Rplotpcax = rp_bin+'/plot_pca.Rscript' + # verify executables test_exec(plinkx, 'Plink') diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 4f919c6..9e273c9 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -30,7 +30,7 @@ import os import subprocess from args_impute import * -from py_helpers import unbuffer_stdout, read_conf, file_len #, file_tail, link, warn_format +from py_helpers import unbuffer_stdout, find_exec, test_exec, file_len #, file_tail, link, warn_format unbuffer_stdout() # warnings.formatwarning = warn_format @@ -73,30 +73,18 @@ - - - ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' ############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -plink_ex = configs['p2loc']+"plink" +plink_ex = find_exec('plink', key='p2loc') # get directory containing current script # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) +uger_ex = str(rp_bin)+'/uger.sub.sh' - -############# -print '\n...Checking dependencies...' -############# - - - +test_exec(uger_ex) # TODO: here @@ -217,7 +205,7 @@ '-l', 'm_mem_free=8g,h_vmem=8g', '-N', 'agg.imp.'+str(outdot), '-o', agg_log, - str(rp_bin)+'/uger.sub.sh', + str(uger_ex), str(args.sleep), ' '.join(sys.argv[:])]) diff --git a/bin/args_pca.py b/bin/args_pca.py index 404d837..61d2353 100644 --- a/bin/args_pca.py +++ b/bin/args_pca.py @@ -138,7 +138,7 @@ # - relatedness threshhold for defining IMUS set # - Number of PCs to compute # - PCA output controls (directory, number of PCs to plot) -# - File paths for external software not provided by ~/ricopili.conf +# - File paths for external software not previously provided by ricopili.conf # ############ @@ -171,7 +171,7 @@ # arg_exloc.add_argument('--plink-ex', # type=str, # metavar='PATH', -# help='path to plink executable, read from ~/ricopili.conf if unspecified', +# help='path to plink executable, read from ~/picopili.conf if unspecified', # required=False) arg_exloc.add_argument('--rscript-ex', type=str, diff --git a/bin/args_ped.py b/bin/args_ped.py index ab166c1..6d4642e 100644 --- a/bin/args_ped.py +++ b/bin/args_ped.py @@ -187,7 +187,7 @@ ############ # # Software Executables -# Locations for software dependencies not in ricopili config file +# Locations for software dependencies not previously in ricopili config file # ############ parserexloc = argparse.ArgumentParser(add_help=False) diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 7f01610..77b77e3 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -35,7 +35,7 @@ import subprocess import warnings from args_impute import * -from py_helpers import unbuffer_stdout, read_conf, file_tail, link, warn_format +from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format unbuffer_stdout() warnings.formatwarning = warn_format @@ -181,26
+181,16 @@ ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' ############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -plink_ex = configs['p2loc']+"plink" +plink_ex = find_exec('plink',key='p2loc') # get directory containing current script # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) rs_ex = str(rp_bin)+'/rs_trans.py' -############# -print '\n...Checking dependencies...' -############# - - - # TODO: here diff --git a/bin/checkflip_pico.pl b/bin/checkflip_pico.pl index 69e2809..3bb3d08 100755 --- a/bin/checkflip_pico.pl +++ b/bin/checkflip_pico.pl @@ -30,7 +30,7 @@ ############################# -my $conf_file = $ENV{HOME}."/ricopili.conf"; +my $conf_file = $ENV{HOME}."/picopili.conf"; my %conf = (); die $!."($conf_file)" unless open FILE, "< $conf_file"; diff --git a/bin/checkpos_pico.pl b/bin/checkpos_pico.pl index 8f417d6..76a2661 100755 --- a/bin/checkpos_pico.pl +++ b/bin/checkpos_pico.pl @@ -30,7 +30,7 @@ # read config file ############################# -my $conf_file = $ENV{HOME}."/ricopili.conf"; +my $conf_file = $ENV{HOME}."/picopili.conf"; my %conf = (); die $!."($conf_file)" unless open FILE, "< $conf_file"; diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 7fc9563..7c9cd2b 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -518,7 +518,7 @@ () ############################# -# read ricopili.config file with default parameters +# read picopili.config file with default parameters ############################# my %conf = (); ## hash with config parameters @@ -585,7 +585,7 @@ () } } if ($fail != 0) { - print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) to edit the file paths listed in $home/ricopili.conf for the following variables:\n"; + print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) to edit the file paths listed in $home/picopili.conf for the following variables:\n"; foreach (@fail_path) { unless ($_ eq "rloc" && $clusters{broad} == 1) { print "\t$_\n"; diff --git a/bin/gwas_dfam.py b/bin/gwas_dfam.py index b121a92..7183dbd 100755 --- a/bin/gwas_dfam.py +++ b/bin/gwas_dfam.py @@ -41,7 +41,7 @@ import argparse # from glob import glob from args_gwas import * -from py_helpers import unbuffer_stdout, test_exec +from py_helpers import unbuffer_stdout, test_exec, find_exec # , read_conf, link unbuffer_stdout() @@ -93,21 +93,16 @@ print '--rplink-ex '+str(args.rplink_ex) -############## -#print '\n...Reading ricopili config file...' -############## -# -#### read plink loc from config -#conf_file = os.environ['HOME']+"/ricopili.conf" -#configs = read_conf(conf_file) - ############# print '\n...Checking dependencies...' 
# check exists, executable ############# -# verify executables +# R-compatible plink +if args.rplink_ex is None or args.rplink_ex == "None": + args.rplink_ex = find_exec('plink',key='rplloc') + test_exec(args.rplink_ex, 'Plink') # verify bfiles are files, not paths diff --git a/bin/gwas_gee.py b/bin/gwas_gee.py index ff06c23..3cf9209 100755 --- a/bin/gwas_gee.py +++ b/bin/gwas_gee.py @@ -44,7 +44,7 @@ from warnings import warn # from glob import glob from args_gwas import * -from py_helpers import unbuffer_stdout, test_exec, find_from_path, file_len +from py_helpers import unbuffer_stdout, test_exec, find_from_path, file_len, find_exec # , read_conf, link unbuffer_stdout() @@ -127,20 +127,17 @@ print '--port '+str(args.port) -############## -#print '\n...Reading ricopili config file...' -############## -# -#### read plink loc from config -#conf_file = os.environ['HOME']+"/ricopili.conf" -#configs = read_conf(conf_file) - - ############# print '\n...Checking dependencies...' # check exists, executable ############# +if args.rplink_ex is None or args.rplink_ex == "None": + args.rplink_ex = find_exec('plink',key='rplloc') + +if args.r_ex is None or args.r_ex == "None": + args.r_ex = find_exec('R',key='rloc') + # verify executables test_exec(args.rplink_ex, 'Plink') #if not args.rserve_active: diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 8479e49..01ae60a 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -27,7 +27,7 @@ import os from warnings import warn from args_gwas import * -from py_helpers import link, unbuffer_stdout, read_conf, find_from_path +from py_helpers import link, unbuffer_stdout, find_exec unbuffer_stdout() @@ -130,25 +130,14 @@ ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' ############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -plinkx = configs['p2loc']+"plink" +plinkx = find_exec('plink',key='p2loc') if args.model == 'gmmat' or args.model == 'gmmat-fam': if args.rscript_ex == None or args.rscript_ex == "None": - args.rscript_ex = find_from_path('Rscript', 'Rscript') - - -############# -print '\n...Checking dependencies...' -############# - - + args.rscript_ex = find_exec('Rscript', key='rscloc') # TODO: here diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index ba2b007..2951b60 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -27,7 +27,7 @@ import os import subprocess from args_impute import * -from py_helpers import unbuffer_stdout, file_len, link, read_conf +from py_helpers import unbuffer_stdout, file_len, link, find_exec, test_exec unbuffer_stdout() @@ -87,35 +87,18 @@ ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' ############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -impute_ex = configs['i2loc']+"impute2" -shapeit_ex = configs['shloc'] + '/bin/shapeit' +# from config +impute_ex = find_exec('impute2',key='i2loc') +shapeit_ex = find_exec('shapeit',key='shloc') # get directory containing current script # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) chunker_ex = rp_bin+'/chunk_snps.py' - - - -# directories -wd = os.getcwd() -shape_dir = wd + '/phase_chr' - - - - - -############# -print '\n...Checking dependencies...'
-############# - +test_exec(chunker_ex) # TODO: here @@ -124,7 +107,9 @@ # executables - +# directories +wd = os.getcwd() +shape_dir = wd + '/phase_chr' diff --git a/bin/imus_pca.py b/bin/imus_pca.py index 08f1ce1..6fee4bb 100755 --- a/bin/imus_pca.py +++ b/bin/imus_pca.py @@ -35,7 +35,7 @@ import subprocess import argparse from glob import glob -from py_helpers import read_conf, unbuffer_stdout, test_exec, find_from_path +from py_helpers import find_exec, unbuffer_stdout, test_exec from args_pca import * unbuffer_stdout() @@ -83,31 +83,28 @@ print '--npcs '+str(args.npcs) - -############# -print '\n...Reading ricopili config file...' -############# - -### read plink loc from config -# not getting R here since ricopili.conf currently relies on platform info -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -plinkx = configs['p2loc']+"plink" -smartpcax = configs['eloc']+"/smartpca" - ############# print '\n...Checking dependencies...' # check exists, executable ############# -# find required files -if args.rscript_ex == None or args.rscript_ex == "None": - args.rscript_ex = find_from_path("Rscript", 'Rscript') +# from config +plinkx = find_exec('plink',key='p2loc') +smartpcax = find_exec('smartpca',key='eloc') + -Rplotpcax = find_from_path("plot_pca.Rscript", 'PCA plotting script') +# if unspecified +if args.rscript_ex == None or args.rscript_ex == "None": + args.rscript_ex = find_exec("Rscript", key='rscloc') + +if args.primus_ex == None or args.primus_ex == "None": + args.primus_ex = find_exec("run_PRIMUS.pl", key='priloc') +# get directory containing current script +# (to get absolute path for scripts) +rp_bin = os.path.dirname(os.path.realpath(__file__)) +Rplotpcax = str(rp_bin)+'/plot_pca.Rscript' # test executables test_exec(args.primus_ex, 'PRIMUS') diff --git a/bin/py_helpers.py b/bin/py_helpers.py index 7cfe5c1..3173e66 100644 --- a/bin/py_helpers.py +++ b/bin/py_helpers.py @@ -40,7 +40,7 @@ def file_tail(fname, n=1): return str(result) -# read ricopili config file as dict +# read picopili config file as dict def read_conf(fname): configs = {} @@ -76,6 +76,34 @@ def find_from_path(fname, name): return file_ex + +# find executables from either config or path +def find_exec(prog, key=None): + + if key is not None: + import os + conffile = os.environ['HOME']+'/picopili.conf' + + if os.path.isfile(conffile): + configs = read_conf(conffile) + + if str(key) in configs: + exloc = configs[str(key)]+'/'+str(prog) + test_exec(exloc,str(prog)) + return(exloc) + + else: + print "Config file %s is missing extry %s for %s. Will search on path." % (str(conffile),str(key),str(prog)) + + else: + print "Failed to find config file %s. Will search for %s on path." % (str(conffile), str(prog)) + + exloc = find_from_path(str(prog),str(prog)) + test_exec(exloc) + return exloc + + + # symlink fromfile to tofile and verify def link(fromfile, tofile, name): @@ -125,13 +153,20 @@ def pp_send_mail(subj, fname): import os import subprocess + # get mail address from config file + configs = read_conf(os.environ['HOME']+"/picopili.conf") + addr = configs['email'] + + # don't send email + if addr is None or '@' not in str(addr): + print "Email turned off based on config file entry (%s)." 
% str(addr) + return 0 + # get email script email_script = pp_find_mail() if email_script == None: raise IOError("Unable to find 'mutt' or 'mail' in path to send email") - - # get mail address from config file - configs = read_conf(os.environ['HOME']+"/ricopili.conf") + # verify file before send if not os.path.isfile(fname): diff --git a/bin/qc_rel.py b/bin/qc_rel.py index e25c372..8c0c556 100755 --- a/bin/qc_rel.py +++ b/bin/qc_rel.py @@ -56,7 +56,7 @@ start_time = strftime("%H:%M:%S %d-%B-%Y") # from glob import glob from args_qc import * -from py_helpers import unbuffer_stdout, read_conf, test_exec, link, file_len, warn_format +from py_helpers import unbuffer_stdout, read_conf, test_exec, link, file_len, warn_format, find_exec unbuffer_stdout() warnings.formatwarning = warn_format @@ -114,33 +114,25 @@ print ' ' + ############# -print '\n...Reading ricopili config file...' +print '\n...Checking dependencies...' +# check exists, executable ############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" +### read config +conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) - -plinkx = configs['p2loc']+"plink" - analyst = configs['init'] +# find plink +plinkx = find_exec('plink',key='p2loc') + if not args.skip_platform: # get directory containing current script # (hack to get plague script location) rp_bin = os.path.dirname(os.path.realpath(__file__)) plague_ex = rp_bin + '/plague_pico.pl' - - -############# -print '\n...Checking dependencies...' -# check exists, executable -############# - -# verify executables -test_exec(plinkx, 'Plink') -if not args.skip_platform: test_exec(plague_ex, 'Platform guessing script') # TODO: verify plague works properly across platforms (primary concern is Compress::Zlib loading) diff --git a/bin/shape_rel.py b/bin/shape_rel.py index e8bca37..8f53448 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -44,7 +44,7 @@ # import random # import warnings from args_impute import * -from py_helpers import unbuffer_stdout, link, read_conf #, test_exec +from py_helpers import unbuffer_stdout, link, find_exec #, test_exec # file_len, read_conf, find_from_path, link, gz_confirm unbuffer_stdout() @@ -106,24 +106,13 @@ outdot = str(args.out) -############# -print '\n...Reading ricopili config file...' -############# - -### read plink, shapeit loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) - -plinkx = configs['p2loc']+"plink" -shapeit_ex = configs['shloc'] + '/bin/shapeit' - - ############# print '\n...Checking dependencies...' ############# - +plinkx = find_exec('plink',key='p2loc') +shapeit_ex = find_exec('shapeit',key='shloc') # TODO: here diff --git a/bin/strict_qc.py b/bin/strict_qc.py index 64e61fc..1e9a476 100755 --- a/bin/strict_qc.py +++ b/bin/strict_qc.py @@ -42,7 +42,7 @@ import subprocess import argparse from glob import glob -from py_helpers import file_len, read_conf, unbuffer_stdout, test_exec +from py_helpers import file_len, find_exec, unbuffer_stdout, test_exec from args_pca import * unbuffer_stdout() @@ -79,30 +79,21 @@ print '--ld_wind '+str(args.ld_wind) print '--all_chr '+str(args.all_chr) - -############# -print '\n...Reading ricopili config file...' -############# -### read plink loc from config -conf_file = os.environ['HOME']+"/ricopili.conf" -configs = read_conf(conf_file) +############# +print '\n...Checking dependencies...' 
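# Illustrative use of the find_exec() helper introduced in this patch (a minimal
# sketch; the variable names below are arbitrary, 'p2loc' is an existing picopili.conf key):
#   plinkx = find_exec('plink', key='p2loc')   # <p2loc dir>/plink if configured, else search $PATH
#   rscript_ex = find_exec('Rscript')          # no config key given, so $PATH search only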
+# check exists, executable +############# -plinkx = configs['p2loc']+"plink" +# plink +plinkx = find_exec('plink',key='p2loc') # get directory containing current script # (hack to help find ld region text file) rp_bin = os.path.dirname(os.path.realpath(__file__)) rp_dir = os.path.dirname(rp_bin) -############# -print '\n...Checking dependencies...' -# check exists, executable -############# - -# plink -test_exec(plinkx, 'Plink') # ld region file, if needed # try in rp_dir/lib/ in addition to cwd From de4f9d6a4bf297039c6ba2d33361f169821668a6 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 29 Sep 2016 14:56:11 -0400 Subject: [PATCH 05/48] added picopili configs w defaults, make email optional --- bin/config_pico.pl | 330 +++++++++++++++++++++++++-------------------- bin/imp_prep.pl | 74 +++++----- 2 files changed, 223 insertions(+), 181 deletions(-) diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 7c9cd2b..62013bc 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -279,14 +279,12 @@ () } print "\n"; -print "Please enter your email address:\n"; +print "Please enter your email address (optional, can enter \"None\"):\n"; my $email = <>; chomp $email; print "\n"; - - ############################# # allow default all remaining values on select platforms ############################# @@ -312,7 +310,12 @@ () "i2loc","IMPUTE2", "liloc","Liftover", "eloc","Eigenstrat", -# "rloc","R", + "admloc","ADMIXTURE", + "reaploc","REAP", + "priloc","PRIMUS", + "rloc","R", + "rscloc","Rscript", + "rplloc","R-enabled Plink (e.g. v1.07, or a dev build of 1.90)", "hmloc","HapMap reference", "perlpack","Perl packages (for Compress::Zlib)", ); @@ -323,7 +326,12 @@ () "i2loc","", "liloc","", "eloc","", -# "rloc","", + "admloc","", + "reaploc","", + "priloc","", + "rloc","", + "rscloc","", + "rplloc","", "hmloc","", "perlpack","", ); @@ -331,164 +339,204 @@ () if ($clusters{broad}){ %variables = ( + "p2loc","/home/unix/sripke/plink_src/plink_1.9_newest", + "shloc","/home/unix/sripke/shapeit/bin", + "i2loc", "/psych/genetics_data/ripke/references_from_debakkerscratch/impute_v2/impute_v2/impute_2.2.7_beta", + "liloc","/home/unix/sripke/liftover", + "eloc","/humgen/atgu1/fs03/shared_resources/shared_software/EIG6.0beta_noreq/bin", + "admloc","/humgen/atgu1/fs03/shared_resources/shared_software/admixture_linux-1.23", + "reaploc","/humgen/atgu1/fs03/shared_resources/shared_software/REAP", + "priloc","/humgen/atgu1/fs03/shared_resources/shared_software/PRIMUS_v1.8.0/bin", + "rloc","/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.0.1/bin", + "rscloc","/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.0.1/bin", + "rplloc","/home/unix/sripke/plink_src/src/", + "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref", + "perlpack","/home/unix/sripke/perl_modules", + ); +} + # "ploc", "/home/unix/sripke/plink_src/src/", - "p2loc","/home/unix/sripke/plink_src/plink_1.9_newest/", - "shloc","/home/unix/sripke/shapeit/", - "i2loc", "/psych/genetics_data/ripke/references_from_debakkerscratch/impute_v2/impute_v2/impute_2.2.7_beta/", - "liloc","/home/unix/sripke/liftover/", - "eloc","/home/unix/sripke/eigensoft/bin", # "ldloc","/psych/genetics_data/ripke/ldsc/", # "rloc","broadinstitute", # "rpac","NA", - "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref/", # "meloc","/psych/genetics_data/ripke/references_from_debakkerscratch/metal/", # "hvloc","/home/radon01/sripke/bakker_ripke/haploview/", - "perlpack","/home/unix/sripke/perl_modules", - ); -} + elsif ($clusters{lisa}){ 
%variables = ( -# "ploc", "/home/gwas/plink/1.08/src", "p2loc","/home/gwas/plink2/plink_1.9_newest", - "shloc","/home/gwas/shapeit", + "shloc","/home/gwas/shapeit/bin", "i2loc","/home/gwas/bin_impute_v2/impute_v2.2.2_x86_64_static", "liloc","/home/gwas/liftover", -# "ldloc","/home/gwas/ldsc/", "eloc","/home/gwas/eigensoft", -# "rloc","/sara/sw/R-3.1.2/bin/", -# "rpac","NA", - "hmloc","/home/gwas/pgc-samples/hapmap_ref/", -# "meloc","/home/gwas/metal", -# "hvloc","./", + "admloc","", + "reaploc","", + "priloc","", + "rloc","/sara/sw/R-3.1.2/bin", + "rscloc","/sara/sw/R-3.1.2/bin", + "rplloc","/home/gwas/plink/1.08/src", + "hmloc","/home/gwas/pgc-samples/hapmap_ref", "perlpack","/home/gwas/perl_modules", ); } +# "ploc", "/home/gwas/plink/1.08/src", +# "ldloc","/home/gwas/ldsc/", +# "rloc","/sara/sw/R-3.1.2/bin/", +# "rpac","NA", +# "meloc","/home/gwas/metal", +# "hvloc","./", elsif ($clusters{computerome}){ %variables = ( + "p2loc","/home/people/sripke/rp_external_bins/plink_1.9_newest", + "shloc","/home/people/sripke/rp_external_bins/shapeit/bin", + "i2loc","/home/people/sripke/rp_external_bins/impute2", + "liloc","/home/people/sripke/rp_external_bins/liftover", + "eloc","/home/people/sripke/rp_external_bins/EIG6.0beta", + "admloc","", + "reaploc","", + "priloc","", + "rloc","/services/tools/R-3.1.2/bin", + "rscloc","/services/tools/R-3.1.2/bin", + "rplloc","/services/tools/R-3.1.2/bin", + "hmloc","/home/people/sripke/imputation_references", + "perlpack","/home/people/sripke/rp_external_bins/perl_packages", + ); +} + # "ploc", "/home/people/sripke/rp_external_bins/plink/", - "p2loc","/home/people/sripke/rp_external_bins/plink_1.9_newest/", - "shloc","/home/people/sripke/rp_external_bins/shapeit/", - "i2loc","/home/people/sripke/rp_external_bins/impute2/", - "liloc","/home/people/sripke/rp_external_bins/liftover/", # "ldloc","/home/people/sripke/rp_external_bins/ldsc/", - "eloc","/home/people/sripke/rp_external_bins/EIG6.0beta/", # "rloc","/services/tools/R-3.1.2/bin/", # "rpac","/home/people/sripke/rp_external_bins/Rpackages/", - "hmloc","/home/people/sripke/imputation_references/", # "meloc","/home/people/sripke/rp_external_bins/metal/", # "hvloc","./", - "perlpack","/home/people/sripke/rp_external_bins/perl_packages", - ); -} elsif ($clusters{co_ipsych}){ %variables = ( -# "ploc", "/data/tools/plink-1.07/", - "p2loc","/data/tools/plink2_sept2015/", - "shloc","/data/tools/shapeit_sept_2015/", + "p2loc","/data/tools/plink2_sept2015", + "shloc","/data/tools/shapeit_sept_2015/bin", "i2loc","/data/tools/impute-2.3.2/", - "liloc","/data/user_tools/rp_external_bins/liftover/", + "liloc","/data/user_tools/rp_external_bins/liftover", + "eloc","/data/tools/eigensoft-6.0.1/bin", + "admloc","", + "reaploc","", + "priloc","", + "rloc","/data/tools/R-3.2.1/bin", + "rscloc","/data/tools/R-3.2.1/bin", + "rplloc","/data/tools/plink-1.07", + "hmloc","/data/user_tools/imputation_references", + "perlpack","/data/user_tools/rp_external_bins/perl_packages", + ); +} + +# "ploc", "/data/tools/plink-1.07/", # "ldloc","/data/user_tools/rp_external_bins/ldsc/", - "eloc","/data/tools/eigensoft-6.0.1/bin/", # "rloc","/data/tools/R-3.2.1/bin/", # "rpac","/data/user_tools/rp_external_bins/Rpackages/", - "hmloc","/data/user_tools/imputation_references/", # "meloc","/data/tools/metal-20110325/", # "hvloc","./", - "perlpack","/data/user_tools/rp_external_bins/perl_packages", - ); -} + elsif ($clusters{genomedk}){ %variables = ( + "p2loc","/project/ricopili/plink_1.9_jul4", + 
"shloc","/project/ricopili/3rd_bins/shapeit/bin", + "i2loc","/project/ricopili/3rd_bins/impute2", + "liloc","/project/ricopili/3rd_bins/liftover", + "eloc","/project/ricopili/3rd_bins/eigenstrat/bin", + "admloc","", + "reaploc","", + "priloc","", + "rloc","/com/extra/R/3.1.0/bin", + "rscloc","/com/extra/R/3.1.0/bin", + "rplloc","/project/ricopili/plink_src", + "hmloc","/project/ricopili/reference_dir", + "perlpack","/project/ricopili/perl_packages/", + ); +} + # "ploc", "/project/ricopili/plink_src/", - "p2loc","/project/ricopili/plink_1.9_jul4/", - "shloc","/project/ricopili/3rd_bins/shapeit/", - "i2loc","/project/ricopili/3rd_bins/impute2/", - "liloc","/project/ricopili/3rd_bins/liftover/", - "eloc","/project/ricopili/3rd_bins/eigenstrat/bin/", # "rloc","/com/extra/R/3.1.0/bin", # "rpac","NA", - "hmloc","/project/ricopili/reference_dir/", # "meloc","/project/ricopili/3rd_bins/metal/", # "hvloc","./", - "perlpack","/project/ricopili/perl_packages/", - ); -} + elsif ($clusters{mssm}){ %variables = ( + "p2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.09-src-aug4", + "shloc","/hpc/users/xripkes01/ricopili/3rd_binaries/shapeit/bin", + "i2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/impute2", + "liloc","/hpc/users/xripkes01/ricopili/3rd_binaries/liftover", + "eloc","/hpc/packages/minerva-common/eigensoft/5.0.1/bin", + "admloc","", + "reaploc","", + "priloc","", + "rloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin", + "rscloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin", + "rplloc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.07-src-sripke", + "hmloc","/hpc/users/xripkes01/ricopili/reference_dir", + "perlpack","/hpc/users/xripkes01/perl_modules", + ); +} + # "ploc", "/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.07-src-sripke/", - "p2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.09-src-aug4/", - "shloc","/hpc/users/xripkes01/ricopili/3rd_binaries/shapeit/", - "i2loc","/hpc/users/xripkes01/ricopili/3rd_binaries/impute2/", - "liloc","/hpc/users/xripkes01/ricopili/3rd_binaries/liftover/", - "eloc","/hpc/packages/minerva-common/eigensoft/5.0.1/bin/", # "rloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin/", # "rpac","NA", - "hmloc","/hpc/users/xripkes01/ricopili/reference_dir/", # "meloc","/hpc/users/xripkes01/ricopili/3rd_binaries/metal/", # "hvloc","./", - "perlpack","/hpc/users/xripkes01/perl_modules/", - ); -} - foreach (keys %variables){ - if ($variables{$_} eq "broadinstitute" && $longvar{$_} eq "R") { - print "You are running R on broad, took the default value\n\n"; - } - elsif ($variables{$_} eq "NA" && $longvar{$_} eq "Rpackages") { - print "assuming library rmeta is installed on standard R\n\n"; - } - else { if ($variables{$_} ne '' && (-d $variables{$_})){ - print "For $longvar{$_}, do you want to use the default location (y or n)?\n\t$variables{$_}\n"; + print "Default location for $longvar{$_} is: \n\t$variables{$_}\n\n"; if ($defall == 0) { - while (1) { - my $answer = lc <>; - chomp $answer; - if ($answer eq "y") { - print "Using $variables{$_} for $longvar{$_}.\n\n"; - last; - } - elsif ($answer eq "n") {print "Please enter a new location to use for $longvar{$_}:\n"; - my $input = <>; - chomp $input; - $input =~ s/^~/$ENV{HOME}/g; - $input =~ s/^\./$cd/g; - unless ( -d $input ){print "Not a valid directory. 
Please try again.\n";next;} - print "\n"; - last;} - else {print "Please answer with y or n.\n";} + print "Do you want to use this location (y or n)?\n"; + while (1) { + my $answer = lc <>; + chomp $answer; + if ($answer eq "y") { + print "Using $variables{$_} for $longvar{$_}.\n\n"; + last; + } + elsif ($answer eq "n") {print "Please enter a new location to use for $longvar{$_}:\n"; + my $input = <>; + chomp $input; + $input =~ s/^~/$ENV{HOME}/g; + $input =~ s/^\./$cd/g; + unless ( -d $input ){print "Not a valid directory. Please try again.\n";next; + } + print "\n"; + last; + } + else { + print "Please answer with y or n.\n"; + } } } } else { while (1){ unless($clusters{other} == 1){ - print "No default value available for:\n"; + print "No default value available for $longvar{$_}\n"; } - print "Please enter a location for $longvar{$_}:\n"; - my $input = ""; - $input = <>; - chomp $input; - $input =~ s/^~/$ENV{HOME}/g; - $input =~ s/^\./$cd/g; - unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} - $variables{$_} = $input; - print "\n"; - last; + print "Please enter a location:\n"; + my $input = ""; + $input = <>; + chomp $input; + $input =~ s/^~/$ENV{HOME}/g; + $input =~ s/^\./$cd/g; + unless ( -d $input ){print "Not a valid directory. Please try again.\n";next;} + $variables{$_} = $input; + print "\n"; + last; } } - } } foreach (keys %variables){ @@ -500,14 +548,17 @@ () push (@text, "email $email"); ### define queue depending on cluster -#if ($clusters{broad}){push (@text, "queue bsub")} if ($clusters{broad}){push (@text, "queue broad_uger")} -if ($clusters{lisa}){push (@text, "queue qsub")} -if ($clusters{computerome} || $clusters{co_ipsych}){push (@text, "queue qsub_c")} -if ($clusters{genomedk}){push (@text, "queue slurm")} -if ($clusters{mssm}){push (@text, "queue msub")} -} +if ($clusters{lisa}){push (@text, "queue lisa")} +if ($clusters{computerome}){push (@text, "queue computerome")} +if ($clusters{co_ipsych}){push (@text, "queue computerome_ipsych")} +if ($clusters{genomedk}){push (@text, "queue genomedk")} +if ($clusters{mssm}){push (@text, "queue mssm")} + + +} # end if block for getting conf file info + unless ( -e $conf_file && $ans_ow eq "n") { die $! unless open FILE, "> $conf_file"; @@ -518,7 +569,7 @@ () ############################# -# read picopili.config file with default parameters +# read picopili.conf file with default parameters ############################# my %conf = (); ## hash with config parameters @@ -537,18 +588,19 @@ () ############################ my @fail_path = (); my %locs = ( -# "ploc","", - "p2loc","", - "shloc","", - "i2loc","", - "liloc","", - "eloc","", -# "rloc","", - "hmloc","", -# "meloc","", -# "ldloc","", -# "rpac","", - "perlpack","" + "p2loc", "", + "shloc","", + "i2loc","", + "liloc","", + "eloc","", + "admloc","", + "reaploc","", + "priloc","", + "rloc","", + "rscloc","", + "rplloc","", + "hmloc","", + "perlpack","", ); die $!."($conf_file)" unless open FILE, "< $conf_file"; @@ -566,49 +618,30 @@ () # print finish statement ############################# -my $fail = 0; +my $email_on = 0; +if ($conf{'email'} =~ m/\@/) { + $email_on = 1; +} + if ($#fail_path != -1) { - -# foreach (@fail_path) { -# unless ($_ eq "rloc" && $clusters{broad} == 1) { + print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) 
to edit the file paths listed in $home/picopili.conf for the following variables:\n"; - foreach my $confvar (@fail_path) { - if ($confvar eq "rloc" && $clusters{broad} == 1) { - next; + foreach (@fail_path) { + print "\t$_\n"; } - elsif ($confvar eq "rpac" && $clusters{lisa} != 1 && $clusters{other} != 1) { - next; - } - else{ - $fail += 1; - } - } - if ($fail != 0) { - print "You will need to install the binaries as described here (https://sites.google.com/a/broadinstitute.org/ricopili/resources) and use a text editor (emacs,vim,etc.) to edit the file paths listed in $home/picopili.conf for the following variables:\n"; - foreach (@fail_path) { - unless ($_ eq "rloc" && $clusters{broad} == 1) { - print "\t$_\n"; - } - } - } - else { - print "Setup has been completed successfully!\n"; - print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; - &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); - } } else { - print "Setup has been completed successfully!\n"; - print "If you do not receive an email with the subject rp_config, please check your email address is entered correctly at $conf_file\n"; - &mysystem("echo \"Configuration for RP was successful.\" | mail -s rp_config $conf{'email'}"); + print "Configuration completed successfully! Settings are stored in $conf_file\n"; + + if ($email_on){ + print "If you do not receive an email with the subject picopili_config, please check your address is entered correctly at $conf_file\n"; + &mysystem("echo \"Configuration for picopili was successful.\" | mail -s picopili_config $conf{'email'}"); + } } - - - ################################################### ### Optional: Add bin to default search path ################################################### @@ -706,9 +739,10 @@ () } } else{ - print "Successfully found picopili directory in search path!\n"; + print "Successfully found picopili directory in search path!\n\n"; } +print "### Finished ###\n\n"; exit; ########## Done ########## diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index 9134f09..2c4f0bb 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -61,6 +61,10 @@ my $liloc = &trans("liloc"); my $email = &trans("email"); +my $email_on = 0; +if ($email =~ m/\@/){ + $email_on = 1; +} ############################################### @@ -183,7 +187,7 @@ push @test_scripts, $buigue_script; push @test_scripts, $checkpos_script; push @test_scripts, $checkflip_script; -push @test_scripts, $blue_script; +push @test_scripts, $blue_script; #push @test_scripts, $mutt_script ; @@ -232,42 +236,43 @@ +if($email_on){ + print ".......testing email program....\n"; -print ".......testing email program....\n"; - -my $err_scr = 0; -{ - my $scr_path = ''; + my $err_scr = 0; + { + my $scr_path = ''; - for my $path ( split /:/, $ENV{PATH} ) { - if ( -f "$path/$mutt_script" && -x _ ) { - print "$mutt_script\tfound in $path\n"; - $scr_path = "$path/$mutt_script"; - last; - } - } - unless ( $scr_path ) { + for my $path ( split /:/, $ENV{PATH} ) { + if ( -f "$path/$mutt_script" && -x _ ) { + print "$mutt_script\tfound in $path\n"; + $scr_path = "$path/$mutt_script"; + last; + } + } + unless ( $scr_path ) { - print "!!Warning!! : No $mutt_script command available, trying mail\n" ; + print "!!Warning!! 
: No $mutt_script command available, trying mail\n" ; - $mutt_script = "mail"; - for my $path ( split /:/, $ENV{PATH} ) { - if ( -f "$path/$mutt_script" && -x _ ) { - print "$mutt_script\tfound in $path\n"; - $scr_path = "$path/$mutt_script"; - last; + $mutt_script = "mail"; + for my $path ( split /:/, $ENV{PATH} ) { + if ( -f "$path/$mutt_script" && -x _ ) { + print "$mutt_script\tfound in $path\n"; + $scr_path = "$path/$mutt_script"; + last; + } + } + unless ( $scr_path ) { + $err_scr = 1; + print "!!Error!! : No $mutt_script command available\n" ; + } } - } - unless ( $scr_path ) { - $err_scr = 1; - print "!!Error!! : No $mutt_script command available\n" ; - } - } -} -die if $err_scr == 1; + } + die if $err_scr == 1; +} print "....all necessary binaries found....\n"; print "------------------------------------\n"; @@ -395,7 +400,7 @@ sub a2filenew_app { } -my $sjainfofile = "$rootdir/impute_dir_info.log"; +my $sjainfofile = "$rootdir/imp_prep_job_info.log"; unless (-e $sjainfofile) { &mysystem ("touch $sjainfofile"); } @@ -435,7 +440,9 @@ sub send_jobarray { print SUC $fini_message."\n"; close SUC; - &mysystem ('cat success_file | '.$mutt_script.' -s RP_pipeline_finished '.$email) ; + if($email_on){ + &mysystem ('cat success_file | '.$mutt_script.' -s RP_pipeline_finished '.$email) ; + } my $sjarow = $sjainfotxt."\t$sjaname\t$now"; &a2filenew_app("$sjainfofile",$sjarow); @@ -543,8 +550,9 @@ sub send_jobarray { print ERR $err_message."\n"; close ERR; - - &mysystem ('cat error_file | '.$mutt_script.' -s RP_pipeline_error '.$email) ; + if($email_on){ + &mysystem ('cat error_file | '.$mutt_script.' -s RP_pipeline_error '.$email) ; + } unless ($serial) { exit; From 36d046e7988c60db9c8117fdbc6c476fd4a4667f Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 30 Sep 2016 13:22:05 -0400 Subject: [PATCH 06/48] offer separate plague and buigue reference files --- .gitignore | 3 +- GET_REFS | 1 + bin/{bin_check_pico => bin_check_pico.pl} | 0 bin/buigue_pico.pl | 19 ++- bin/config_pico.pl | 77 ++++++++--- bin/get_refs.sh | 153 ++++++++++++++++++++++ bin/imp_prep.pl | 3 +- bin/lift_to_hg19.pl | 10 +- bin/plague_pico.pl | 5 +- 9 files changed, 242 insertions(+), 29 deletions(-) create mode 120000 GET_REFS rename bin/{bin_check_pico => bin_check_pico.pl} (100%) create mode 100755 bin/get_refs.sh diff --git a/.gitignore b/.gitignore index 094a0f3..53fddd1 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ bin/args_qc.pyc bin/args_gwas.pyc bin/args_chunks.pyc bin/args_impute.pyc - +lib/plague* +lib/buigue* diff --git a/GET_REFS b/GET_REFS new file mode 120000 index 0000000..0622e86 --- /dev/null +++ b/GET_REFS @@ -0,0 +1 @@ +./bin/get_refs.sh \ No newline at end of file diff --git a/bin/bin_check_pico b/bin/bin_check_pico.pl similarity index 100% rename from bin/bin_check_pico rename to bin/bin_check_pico.pl diff --git a/bin/buigue_pico.pl b/bin/buigue_pico.pl index 3812cfc..14d6c95 100755 --- a/bin/buigue_pico.pl +++ b/bin/buigue_pico.pl @@ -5,8 +5,10 @@ # load utility functions ############################# +use File::Basename; use FindBin; use lib "$FindBin::Bin"; +use Cwd 'abs_path'; use rp_perl::Utils qw(trans); @@ -14,14 +16,19 @@ my $progname = $0; $progname =~ s!^.*/!!; +my $picodir = dirname(dirname(abs_path($0))); ############################# # read config file ############################# my $liloc = &trans("liloc"); +my $liref = "$picodir/lib/buigue"; -my $perlpack = &trans("perlpack"); +my $perlpack; +BEGIN { + $perlpack = &trans("perlpack"); +} use lib $perlpack; 
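# Note on the BEGIN block above: `use lib` takes effect at compile time, so
# trans("perlpack") has to be assigned to $perlpack inside BEGIN for the path
# to already be set when `use lib $perlpack` is processed.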
##################################################### @@ -30,10 +37,10 @@ my @bu_files; -push @bu_files, "$liloc/snp.txt.pos.scz49.gz"; -push @bu_files, "$liloc/snp125.txt.pos.scz49.gz"; -push @bu_files, "$liloc/snp130.txt.pos.scz49.gz"; -push @bu_files, "$liloc/snp138.txt.pos.scz49.gz"; +push @bu_files, "$liref/snp.txt.pos.scz49.gz"; +push @bu_files, "$liref/snp125.txt.pos.scz49.gz"; +push @bu_files, "$liref/snp130.txt.pos.scz49.gz"; +push @bu_files, "$liref/snp138.txt.pos.scz49.gz"; my @li_files; push @li_files, "$liloc/hg16ToHg19.over.chain.gz"; @@ -59,7 +66,7 @@ guesses the build of a bim file out of ucsc snp file find here the helping files: - $liloc + $liref created by Stephan Ripke 2014 at MGH, Boston, MA in the frame of the PGC diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 62013bc..52d125a 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -15,7 +15,7 @@ $progname =~ s!^.*/!!; -my $cdir = abs_path($0); +my $cdir = dirname(abs_path($0)); my $home = $ENV{HOME}; my $conf_file = $ENV{HOME}."/picopili.conf"; my $command_line = "$progname @ARGV"; @@ -316,7 +316,6 @@ () "rloc","R", "rscloc","Rscript", "rplloc","R-enabled Plink (e.g. v1.07, or a dev build of 1.90)", - "hmloc","HapMap reference", "perlpack","Perl packages (for Compress::Zlib)", ); @@ -332,7 +331,6 @@ () "rloc","", "rscloc","", "rplloc","", - "hmloc","", "perlpack","", ); @@ -350,7 +348,6 @@ () "rloc","/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.0.1/bin", "rscloc","/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.0.1/bin", "rplloc","/home/unix/sripke/plink_src/src/", - "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref", "perlpack","/home/unix/sripke/perl_modules", ); } @@ -361,6 +358,7 @@ () # "rpac","NA", # "meloc","/psych/genetics_data/ripke/references_from_debakkerscratch/metal/", # "hvloc","/home/radon01/sripke/bakker_ripke/haploview/", +# "hmloc","/psych/genetics_data/ripke/references_outdated/hapmap_ref", elsif ($clusters{lisa}){ @@ -376,7 +374,6 @@ () "rloc","/sara/sw/R-3.1.2/bin", "rscloc","/sara/sw/R-3.1.2/bin", "rplloc","/home/gwas/plink/1.08/src", - "hmloc","/home/gwas/pgc-samples/hapmap_ref", "perlpack","/home/gwas/perl_modules", ); } @@ -387,6 +384,7 @@ () # "rpac","NA", # "meloc","/home/gwas/metal", # "hvloc","./", +# "hmloc","/home/gwas/pgc-samples/hapmap_ref", elsif ($clusters{computerome}){ @@ -402,7 +400,6 @@ () "rloc","/services/tools/R-3.1.2/bin", "rscloc","/services/tools/R-3.1.2/bin", "rplloc","/services/tools/R-3.1.2/bin", - "hmloc","/home/people/sripke/imputation_references", "perlpack","/home/people/sripke/rp_external_bins/perl_packages", ); } @@ -413,7 +410,7 @@ () # "rpac","/home/people/sripke/rp_external_bins/Rpackages/", # "meloc","/home/people/sripke/rp_external_bins/metal/", # "hvloc","./", - +# "hmloc","/home/people/sripke/imputation_references", elsif ($clusters{co_ipsych}){ %variables = ( @@ -428,7 +425,6 @@ () "rloc","/data/tools/R-3.2.1/bin", "rscloc","/data/tools/R-3.2.1/bin", "rplloc","/data/tools/plink-1.07", - "hmloc","/data/user_tools/imputation_references", "perlpack","/data/user_tools/rp_external_bins/perl_packages", ); } @@ -439,6 +435,7 @@ () # "rpac","/data/user_tools/rp_external_bins/Rpackages/", # "meloc","/data/tools/metal-20110325/", # "hvloc","./", +# "hmloc","/data/user_tools/imputation_references", elsif ($clusters{genomedk}){ @@ -454,7 +451,6 @@ () "rloc","/com/extra/R/3.1.0/bin", "rscloc","/com/extra/R/3.1.0/bin", "rplloc","/project/ricopili/plink_src", - "hmloc","/project/ricopili/reference_dir", 
"perlpack","/project/ricopili/perl_packages/", ); } @@ -464,6 +460,7 @@ () # "rpac","NA", # "meloc","/project/ricopili/3rd_bins/metal/", # "hvloc","./", +# "hmloc","/project/ricopili/reference_dir", elsif ($clusters{mssm}){ @@ -479,7 +476,6 @@ () "rloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin", "rscloc","/hpc/packages/minerva-common/R/2.15.3/lib64/R/bin", "rplloc","/hpc/users/xripkes01/ricopili/3rd_binaries/plink-1.07-src-sripke", - "hmloc","/hpc/users/xripkes01/ricopili/reference_dir", "perlpack","/hpc/users/xripkes01/perl_modules", ); } @@ -489,7 +485,7 @@ () # "rpac","NA", # "meloc","/hpc/users/xripkes01/ricopili/3rd_binaries/metal/", # "hvloc","./", - +# "hmloc","/hpc/users/xripkes01/ricopili/reference_dir", foreach (keys %variables){ @@ -599,7 +595,6 @@ () "rloc","", "rscloc","", "rplloc","", - "hmloc","", "perlpack","", ); @@ -646,7 +641,7 @@ () ### Optional: Add bin to default search path ################################################### -system("bin_check_pico"); # dummy script that doesn't do anything +system("bin_check_pico.pl"); # dummy script that doesn't do anything my $status_bin = ($? >> 8); @@ -742,7 +737,59 @@ () print "Successfully found picopili directory in search path!\n\n"; } -print "### Finished ###\n\n"; - + +################## +# +# Check whether reference files are present yet +# +################## + +my $picobin = dirname($cdir); +my $plaguedir = "$picobin/lib/plague"; +my $buiguedir = "$picobin/lib/buigue"; + +my $haveref = 0; +if (-e $plaguedir && -e $buiguedir){ + + my @ref_files; + my $refcc = 0; + push @ref_files, "$buiguedir/snp.txt.pos.scz49.gz"; + push @ref_files, "$buiguedir/snp125.txt.pos.scz49.gz"; + push @ref_files, "$buiguedir/snp130.txt.pos.scz49.gz"; + push @ref_files, "$buiguedir/snp138.txt.pos.scz49.gz"; + push @ref_files, "$plaguedir/snp_platform_collection.txt.new.0815.gz"; + push @ref_files, "$plaguedir/snp_platform_collection.txt.new.0416a.gz"; + push @ref_files, "$plaguedir/snp_platform_collection.txt.new.0114.gz"; + + foreach my $fi (@ref_files){ + if (-e $fi){ + $refcc++; + next; + }else{ + last; + } + } + + if ($refcc == scalar(@ref_files)){ + $haveref = 1; + } + +} + +if ($haveref == 0){ + print "\n----------------------------------------------------\n"; + print "References files from ricopili for guessing genome build and \n"; + print "genotyping platform have not been installed yet.\n\n"; + print "Please run:\n"; + print "\t$picobin/GET_REFS\n"; + +}else{ + print "Successfully found ricopili plague and buigue reference files!\n" +} + +if ($haveref == 1 && $statusbin == 0){ + print "\n### Finished ###\n\n"; +} + exit; ########## Done ########## diff --git a/bin/get_refs.sh b/bin/get_refs.sh new file mode 100755 index 0000000..3ce2d5a --- /dev/null +++ b/bin/get_refs.sh @@ -0,0 +1,153 @@ +#! /bin/sh + +########### +# +# get_refs.sh +# Retrives reference files +# +# - ricopili platform guessing (plague) files +# - ricopili build guessing (buigue) files +# +########### + +echo " " +echo "### External reference file downloader for picopili ###" +echo " " +echo "Picopili depends on a few curated reference files" +echo "from ricopili. If ricopili is installed on your" +echo "platform, will set up symbolic links to the required" +echo "files. Otherwise, will download the files." 
+echo " " + +echo "### BEGIN ###" +echo " " + +# setup +rp_conf="$HOME/ricopili.conf" +SERVER="https://personal.broadinstitute.org/rwalters/picopili_files/" +SCRIPT=$(readlink -f "$0") +BINLOC=$(dirname "$SCRIPT") +LIBLOC=`echo $(dirname "$BINLOC")"/lib"` +rp=false +li_done=false +hm_done=false + +if [ -d "$LIBLOC/buigue" ]; then + echo "WARNING: Found existing folder $LIBLOC/buigue. Contents may be overwritten." + echo "(pausing to allow cancel...)" + sleep 3 + echo "(continuing)" +else + mkdir "$LIBLOC/buigue" +fi + +if [ -d "$LIBLOC/plague" ]; then + echo "WARNING: Found existing folder $LIBLOC/plague. Contents may be overwritten." + echo "(pausing to allow cancel...)" + sleep 3 + echo "(continuing)" +else + mkdir "$LIBLOC/plague" +fi + + +# check/read config file +if [ -e "$rp_conf" ]; then + rp=true + echo "Found existing ricopili configuration. Reading..." + liloc=`awk '$1=="liloc"{print $2}' $rp_conf` + hmloc=`awk '$1=="hmloc"{print $2}' $rp_conf` +else + echo "No ricopili configuration found." +fi + +lifiles=("snp.txt.pos.scz49.gz" "snp125.txt.pos.scz49.gz" "snp130.txt.pos.scz49.gz" "snp138.txt.pos.scz49.gz" "last") +hmfiles=("snp_platform_collection.txt.new.0815.gz" "snp_platform_collection.txt.new.0416a.gz" "snp_platform_collection.txt.new.0114.gz" "last") + +# link creation from ricopili references +if [ "$rp" = 'true' ]; then + + if [ -d "$liloc" ]; then + + for finame in ${lifiles[@]}; do + + if [ "$finame" = "last" ]; then + li_done=true + else + echo "$liloc/$finame" + ln -sfn "$liloc/$finame" "$LIBLOC/buigue" || break + fi + done + fi + + if [ "$li_done" = 'false' ]; then + echo "Failed to link all files from liftover directory $liloc" + fi + + + if [ -d "$hmloc" ]; then + for finame in ${hmfiles[@]}; do + + if [ $finame == "last" ]; then + hm_done=true + else + echo "$hmloc/$finame" + ln -sfn "$hmloc/$finame" "$LIBLOC/plague" || break + fi + done + fi + + if [ "$hm_done" = 'false' ]; then + echo "Failed to link all platform references from directory $hmloc" + fi +fi + +if [ "$li_done" = 'false' ]; then + to_dl=true +elif [ "$hm_done" = 'false' ]; then + to_dl=true +else + to_dl=false +fi + +# wget external +if [ "$to_dl" = 'true' ]; then + + # warn of internet access + echo " " + echo "WARNING: Preparing to download reference files from:" + echo "$SERVER" + echo " " + echo "Expected total file size is ~275 MB, minus existing" + echo "files already linked/downloaded." + echo " " + echo "If you do not have web access, or if you do not want" + echo "to download these files now, please cancel now." + echo " " + echo "Will begin in 10 sec..." 
+ echo " " + sleep 10 + + for finame in ${lifiles[@]}; do + + if [ "$finame" = "last" ]; then + continue + else + wget "$SERVER/$finame" "$LIBLOC/buigue/$finame" + fi + done + for finame in ${hmfiles[@]}; do + + if [ "$finame" = "last" ]; then + continue + else + wget "$SERVER/$finame" "$LIBLOC/plague/$finame" + fi + done +fi + +echo " " +echo "### Finished ###" +echo " " + +# eof diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index 2c4f0bb..3c75dfb 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -58,7 +58,6 @@ my $ploc = &trans("p2loc"); my $qloc = &trans("queue"); -my $liloc = &trans("liloc"); my $email = &trans("email"); my $email_on = 0; @@ -177,6 +176,7 @@ my $readref_script = "readref_pico.pl"; ### my.pipeline_tar my $readrefsum_script = "readrefsum_pico.pl"; ### my.pipeline_tar my $buigue_script = "buigue_pico.pl"; ### my.pipeline_tar +my $lift_script = "lift_to_hg19.pl"; my $checkpos_script = "checkpos_pico.pl"; ### my.pipeline_tar my $checkflip_script = "checkflip_pico.pl"; ### my.pipeline_tar my $mutt_script = "mutt"; ### my.pipeline_tar @@ -185,6 +185,7 @@ push @test_scripts, $readref_script; push @test_scripts, $readrefsum_script; push @test_scripts, $buigue_script; +push @test_scripts, $lift_script; push @test_scripts, $checkpos_script; push @test_scripts, $checkflip_script; push @test_scripts, $blue_script; diff --git a/bin/lift_to_hg19.pl b/bin/lift_to_hg19.pl index 782404b..7ac1726 100755 --- a/bin/lift_to_hg19.pl +++ b/bin/lift_to_hg19.pl @@ -44,11 +44,11 @@ my $ploc = &trans("p2loc"); my $liloc = &trans("liloc"); -if ($ENV{SYS_TYPE} =~ /redhat_6/) { - print "running on gold\n"; - $liloc .= "64bit/"; - print "using $liloc\n"; -} +# if ($ENV{SYS_TYPE} =~ /redhat_6/) { +# print "running on gold\n"; +# $liloc .= "64bit/"; +# print "using $liloc\n"; +# } #exit; ####################################### diff --git a/bin/plague_pico.pl b/bin/plague_pico.pl index 9825be5..9cfea53 100755 --- a/bin/plague_pico.pl +++ b/bin/plague_pico.pl @@ -5,7 +5,9 @@ # load utility functions ############################# +use File::Basename; use FindBin; +use Cwd 'abs_path'; use lib "$FindBin::Bin"; use rp_perl::Utils qw(trans); @@ -13,12 +15,13 @@ my $progname = $0; $progname =~ s!^.*/!!; +my $picodir = dirname(dirname(abs_path($0))); ############################# # read config file ############################# -my $hmloc = &trans("hmloc"); +my $hmloc = "$picodir/lib/plague"; my $perlpack = &trans("perlpack"); use lib $perlpack; From 6f64c1f4faf61ba57b4acfa2d1858038483b9752 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 30 Sep 2016 20:28:11 -0400 Subject: [PATCH 07/48] start integrating blueprint.py, add threading and flexibility --- .gitignore | 1 + bin/blueprint.py | 91 ++++++++++----- bin/config_pico.pl | 12 +- bin/pca_rel.py | 109 +++++------------- bin/py_helpers.py | 4 +- bin/shape_rel.py | 55 ++++----- cluster_templates/broad_uger.conf | 3 +- cluster_templates/broad_uger.single.sub.sh | 27 ----- ...ad_uger.array.sub.sh => broad_uger.sub.sh} | 5 +- 9 files changed, 127 insertions(+), 180 deletions(-) delete mode 100755 cluster_templates/broad_uger.single.sub.sh rename cluster_templates/{broad_uger.array.sub.sh => broad_uger.sub.sh} (82%) diff --git a/.gitignore b/.gitignore index 53fddd1..f5ac793 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ bin/py_helpers.pyc +bin/blueprint.pyc bin/args_pca.pyc bin/args_ped.pyc bin/args_qc.pyc diff --git a/bin/blueprint.py b/bin/blueprint.py index 70acd87..5ccd47d 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -24,7 +24,7 @@ 
def send_job(jobname, # week=None, njobs=None, maxpar=10000, -# multi=None, + threads=None, wait_file=None, wait_name=None, cluster=None, @@ -41,6 +41,9 @@ def send_job(jobname, if logloc is None: logloc = os.getcwd() + + if not os.path.isdir(logloc): + os.mkdir(logloc) if maxpar < 1: maxpar = 10000 @@ -49,7 +52,7 @@ def send_job(jobname, if cluster is None: conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) - cluster = configs['queue'] + cluster = configs['cluster'] # get queue template pico_bin = os.path.dirname(os.path.realpath(__file__)) @@ -61,11 +64,22 @@ def send_job(jobname, # - submission syntax, queue names, job holds clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') + # basic template + with open(str(clust_dir)+'/'+str(cluster)+'.sub.sh','r') as single_templ: + templ = single_templ.read() + # setup memory args if mem is None: mem = 2000 mem_mb = str(int(mem)) - mem_gb = str(int(mem)/1000) + if int(mem) > 1000: + mem_gb = str(int(mem)/1000) + else: + mem_gb = str(1) + + # multithreading arguments + if threads is None: + threads = 1 # queue picking from job length if walltime is None: @@ -95,14 +109,15 @@ def send_job(jobname, hold_str = "" + # load base template + + - # template for single jobs + # for single jobs if cmd is not None and (njobs is None or njobs <= 1): - - with open(str(clust_dir)+'/'+str(cluster)+'.single.sub.sh','r') as single_templ: - templ = single_templ.read() - + njobs = 1 + tot_threads = int(threads) # log name if logname is None: @@ -116,13 +131,11 @@ def send_job(jobname, j_per_core = 1 - # template for array jobs + # for array jobs else: - with open(str(clust_dir)+'/'+str(cluster)+'.array.sub.sh','r') as array_templ: - templ = array_templ.read() # setup indexing tasks - j_per_core = int(clust_conf['array_core']) + j_per_core = int(clust_conf['j_per_node']) if j_per_core == 1: task_index = str(clust_conf['task_id']) else: @@ -131,11 +144,13 @@ def send_job(jobname, # cmd or array file spec if cmd is not None: cmd_line = cmd.format(task=task_index) + tot_threads = int(njobs)*int(threads) else: assert os.path.isfile(arrayfile), "Job array file %s not found." 
% str(arrayfile) njobs = file_len(arrayfile) + tot_threads = int(njobs)*int(threads) cmd_tmp = dedent("""\ cline=`head -n {task} {fi} | tail -n 1` @@ -150,14 +165,15 @@ def send_job(jobname, from math import floor, ceil # max simul tasks with memory limit - node_mem = float(clust_conf['array_core']) + node_mem = float(clust_conf['array_mem_mb']) task_mem_lim = floor((node_mem-1.0)/float(mem)) - if task_mem_lim < 1: - task_mem_lim=1 + # max simul tasks with threading + if task_mem_lim > floor(int(j_per_core)/int(threads)): + task_mem_lim = floor(int(j_per_core)/int(threads)) - if task_mem_lim > j_per_core: - task_mem_lim = j_per_core + if task_mem_lim < 1: + task_mem_lim=1 # number of jobs to cover all tasks array_jobs = ceil(float(njobs)/float(task_mem_lim)) @@ -215,9 +231,11 @@ def send_job(jobname, # fill in template jobdict = {"job_name": str(jobname), "cmd_string": cmd_str, # formatted elsewhere - "log_name": str(logname), + "log_name": str(logloc)+'/'+str(logname), "mem_in_mb": str(mem_mb), "mem_in_gb": str(mem_gb), + "threads": str(threads), + "total_threads": str(tot_threads), "wall_hours": str(walltime), "njobs": str(njobs), "array_jobs": str(array_jobs), @@ -226,7 +244,8 @@ def send_job(jobname, "task_id": str(clust_conf['task_id']), "log_task_id": str(clust_conf['log_task_id']), "queue_name": str(queue_name), - "sleep_time": str(sleep) + "sleep_time": str(sleep), + "project": str(clust_conf['project']) } @@ -235,6 +254,23 @@ def send_job(jobname, sub_file.write(templ.format(**jobdict)) sub_file.close() + # finalize or remove optional lines + if njobs <= 1: + subprocess.check_call(['sed','-i','/^::PICO_ARRAY_ONLY::/d',str(sub_file.name)]) + else: + subprocess.check_call(['sed','-i','s/^::PICO_ARRAY_ONLY:://',str(sub_file.name)]) + + if threads <= 1: + subprocess.check_call(['sed','-i','/^::PICO_THREAD_ONLY::/d',str(sub_file.name)]) + else: + subprocess.check_call(['sed','-i','s/^::PICO_THREAD_ONLY:://',str(sub_file.name)]) + + if njobs <= 1 and threads <= 1: + subprocess.check_call(['sed','-i','/^::PICO_THREADARRAY_ONLY::/d',str(sub_file.name)]) + else: + subprocess.check_call(['sed','-i','s/^::PICO_THREADARRAY_ONLY:://',str(sub_file.name)]) + + # command to run if hold_str != "": launch_str = clust_conf['sub_cmd']+' '+hold_str+' '+str(sub_file.name) @@ -250,16 +286,9 @@ def send_job(jobname, out, err = p.communicate() print out return(p.returncode) -# -# -# # manual error nhandling here because of Broad LD_LIBRARY_PATH warning -# if p.returncode != 0: -# if "LD_LIBRARY_PATH" in out: -# print out -# else: -# raise IOError("Job submission failed\nCode: %d\nError: %s\nOutput: %s\n" % p.returncode, err, out) - - return 0 + + else: + return 0 #################################### @@ -372,9 +401,9 @@ def send_job(jobname, # set logfile name if args.noerr: - logloc = os.getcwd()+'/errandout/' - else: logloc = os.getcwd() + else: + logloc = os.getcwd()+'/errandout/' # ignore arguments for direct if args.direct: diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 52d125a..9fb19df 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -545,12 +545,12 @@ () ### define queue depending on cluster -if ($clusters{broad}){push (@text, "queue broad_uger")} -if ($clusters{lisa}){push (@text, "queue lisa")} -if ($clusters{computerome}){push (@text, "queue computerome")} -if ($clusters{co_ipsych}){push (@text, "queue computerome_ipsych")} -if ($clusters{genomedk}){push (@text, "queue genomedk")} -if ($clusters{mssm}){push (@text, "queue mssm")} +if ($clusters{broad}){push (@text, "cluster 
broad_uger")} +if ($clusters{lisa}){push (@text, "cluster lisa")} +if ($clusters{computerome}){push (@text, "cluster computerome")} +if ($clusters{co_ipsych}){push (@text, "cluster computerome_ipsych")} +if ($clusters{genomedk}){push (@text, "cluster genomedk")} +if ($clusters{mssm}){push (@text, "cluster mssm")} } # end if block for getting conf file info diff --git a/bin/pca_rel.py b/bin/pca_rel.py index 942371d..7303b55 100755 --- a/bin/pca_rel.py +++ b/bin/pca_rel.py @@ -29,11 +29,11 @@ ### load requirements import argparse -import subprocess import os from math import ceil from args_pca import * from py_helpers import file_len, unbuffer_stdout +from blueprint import send_job unbuffer_stdout() @@ -166,32 +166,15 @@ strandambi_txt, allchr_txt]) -#strictqc_lsf = ' '.join(["bsub", -# "-q", 'hour', -# "-R", str('\"rusage[mem=2]\"'), -# "-J", str('strictqc_'+args.out), -# "-P", str('pico_'+args.out), -# "-o", str('strictqc_'+args.out+'.bsub.log'), -# "-r", -# str('\"'+strictqc_call+'\"')]) -# -#print strictqc_lsf -#if not args.test_sub: -# subprocess.check_call(strictqc_lsf, shell=True) - +send_job(jobname=str('strictqc_'+args.out), + arrayfile=None, + cmd=str(strictqc_call), + logname=str('strictqc_'+args.out+'.sub.log'), + mem=2000, + walltime=2, + sleep=0, + testonly=args.test_sub) -strictqc_uger = ' '.join(['qsub', - '-q', 'short', - '-l', 'm_mem_free=2g,h_vmem=2g', - '-N', str('strictqc_'+args.out), - '-o', str('strictqc_'+args.out+'.bsub.log'), - str(rp_bin)+'/uger.sub.sh', - str(0), - str(strictqc_call)]) - -print strictqc_uger -if not args.test_sub: - subprocess.check_call(strictqc_uger, shell=True) ##### # submit imus pca @@ -210,34 +193,15 @@ '--primus-ex', str(args.primus_ex) ]) -#imuspca_lsf = ' '.join(["bsub", -# "-w", str('\'ended(\"'+str('strictqc_'+args.out)+'\")\''), -# "-E", str('\"sleep '+str(args.sleep)+'\"'), -# "-q", 'week', -# "-R", str('\"rusage[mem='+str(imus_mem)+']\"'), -# "-J", str('imuspca_'+args.out), -# "-P", str('pico_'+args.out), -# "-o", str('imuspca_'+args.out+'.bsub.log'), -# "-r", -# str('\"'+imuspca_call+'\"')]) -# -#print imuspca_lsf -#if not args.test_sub: -# subprocess.check_call(imuspca_lsf, shell=True) - -imuspca_uger = ' '.join(['qsub', - '-hold_jid', str('strictqc_'+args.out), - '-q', 'long', - '-l', 'm_mem_free='+str(imus_mem)+'g,h_vmem='+str(imus_mem)+'g', - '-N', str('imuspca_'+args.out), - '-o', str('imuspca_'+args.out+'.bsub.log'), - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - str(imuspca_call)]) - -print imuspca_uger -if not args.test_sub: - subprocess.check_call(imuspca_uger, shell=True) +send_job(jobname=str('imuspca_'+args.out), + cmd=str(imuspca_call), + logname=str('imuspca_'+args.out+'.sub.log'), + mem=int(imus_mem)*1000, + walltime=168, # one week + wait_name=str('strictqc_'+args.out), + sleep=args.sleep, + testonly=args.test_sub) + ##### # submitting final file check @@ -250,37 +214,20 @@ else: pcaout = str(args.pcadir) + final_call = ' '.join(['final_file_check.py', '--filename', str(wd+'/'+pcaout+'/plots/'+args.out+'.pca.pairs.png'), '--taskname', str('pca_rel_'+args.out)]) -#final_lsf = ' '.join(["bsub", -# "-w", str('\'ended(\"'+str('imuspca_'+args.out)+'\")\''), -# "-E", str('\"sleep '+str(args.sleep)+'\"'), -# "-q", 'hour', -# "-J", str('checkfinal_'+args.out), -# "-P", str('pico_'+args.out), -# "-o", str('checkfinal_'+args.out+'.bsub.log'), -# "-r", -# str('\"'+final_call+'\"')]) -# -#print final_lsf -#if not args.test_sub: -# subprocess.check_call(final_lsf, shell=True) - - -final_uger = ' '.join(['qsub', - '-hold_jid', 
str('imuspca_'+args.out), - '-q', 'short', - '-N', str('checkfinal_'+args.out), - '-o', str('checkfinal_'+args.out+'.bsub.log'), - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - str(final_call)]) - -print final_uger -if not args.test_sub: - subprocess.check_call(final_uger, shell=True) +send_job(jobname=str('checkfinal_'+args.out), + arrayfile=None, + cmd=str(final_call), + logname=str('checkfinal_'+args.out+'.sub.log'), + mem=100, + walltime=1, + wait_name=str('imuspca_'+args.out), + sleep=str(args.sleep), + testonly=args.test_sub) ####### # Print completion message diff --git a/bin/py_helpers.py b/bin/py_helpers.py index 3173e66..f50c4fd 100644 --- a/bin/py_helpers.py +++ b/bin/py_helpers.py @@ -47,7 +47,9 @@ def read_conf(fname): with open(fname, 'r') as f: for line in f: - (key, val) = line.split() + # strips '#' comments at end of line + # otherwise allows aribtrary content (spaces, etc) + (key,val) = line.split('#',1)[0].rstrip().split(None,1) configs[str(key)] = val return configs diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 8f53448..82d6935 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -297,23 +297,21 @@ print ' '.join(shape_call)+'\n' - -uger_call = ' '.join(['qsub', - '-q','long', - '-N', 'shape.'+str(outdot), - '-l', 'm_mem_free='+str(args.mem_req)+'g,h_vmem='+str(args.mem_req)+'g', - '-pe','smp',str(args.threads), - '-t', '1-22', - '-o', '\'shape.'+str(outdot)+'.chr$TASK_ID.qsub.log\'', - str(rp_bin)+'/uger_array.sub.sh', - str(args.sleep), - ' '.join(shape_call)]) - -print uger_call -subprocess.check_call(uger_call, shell=True) - - - +# setup naming from task index +configs = read_conf(os.environ['HOME']+'/picopili.conf') +clust_confdir = os.path.dirname(str(rp_bin))+'/cluster_templates/' +clust_conf = read_conf(clust_confdir+str(configs['cluster']+'.conf')) +task_id = str(clust_conf['log_task_id']) + +# submit +send_job(jobname='shape.'+str(outdot), + cmd=' '.join(shape_call), + logname='shape.'+str(outdot)+'.chr'+task_id+'.sub.log', + mem=int(args.mem_req)*1000, + walltime=168, # week + njobs=22, + threads=int(args.threads), + sleep=str(args.sleep)) ### @@ -327,21 +325,16 @@ os.chdir(wd) next_call = str(rp_bin) + '/imp2_rel.py '+' '.join(sys.argv[1:]) - # TODO: consider queue/mem for agg - imp_log = 'imp_chunks.'+str(outdot)+'.qsub.log' - uger_imp = ' '.join(['qsub', - '-hold_jid','shape.'+str(outdot), - '-q', 'short', - '-l', 'm_mem_free=8g,h_vmem=8g', - '-N', 'imp.chunks.'+str(outdot), - '-o', imp_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - next_call]) - - print uger_imp + '\n' - subprocess.check_call(uger_imp, shell=True) + imp_log = 'imp_chunks.'+str(outdot)+'.sub.log' + # TODO: consider queue/mem + send_job(jobname='imp.chunks.'+str(outdot), + cmd=next_call, + logname=imp_log, + mem=8000, + walltime=2, + wait_name='shape.'+str(outdot), + sleep=str(args.sleep)) diff --git a/cluster_templates/broad_uger.conf b/cluster_templates/broad_uger.conf index 658eae0..3ed149c 100644 --- a/cluster_templates/broad_uger.conf +++ b/cluster_templates/broad_uger.conf @@ -7,5 +7,6 @@ sub_cmd qsub log_task_id $TASK_ID task_id ${SGE_TASK_ID} hold_flag -hold_jid -array_core 1 +j_per_node 1 array_mem_mb 128000 +project unspecified diff --git a/cluster_templates/broad_uger.single.sub.sh b/cluster_templates/broad_uger.single.sub.sh deleted file mode 100755 index 42a1335..0000000 --- a/cluster_templates/broad_uger.single.sub.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env bash - -# wrapper script for job submission on Broad UGER cluster -# -# The -V below 
above will provoke a warning that -# LD_LIBRARY_PATH won't be used for security reasons; -# this warning can be safely ignored - -#$ -j y -#$ -cwd -#$ -V -#$ -N {job_name} -#$ -o {log_name} -#$ -q {queue_name} -#$ -l m_mem_free={mem_in_gb}g,h_vmem={mem_in_gb}g - -# sleep option (for preventing race conditions on network file systems) -sleep {sleep_time} - -# setup resources -source /broad/software/scripts/useuse -reuse -q Anaconda - -# main command line -{cmd_string} - -# eof diff --git a/cluster_templates/broad_uger.array.sub.sh b/cluster_templates/broad_uger.sub.sh similarity index 82% rename from cluster_templates/broad_uger.array.sub.sh rename to cluster_templates/broad_uger.sub.sh index 4416e9c..597ad98 100755 --- a/cluster_templates/broad_uger.array.sub.sh +++ b/cluster_templates/broad_uger.sub.sh @@ -13,8 +13,9 @@ #$ -o {log_name} #$ -q {queue_name} #$ -l m_mem_free={mem_in_gb}g,h_vmem={mem_in_gb}g -#$ -t 1-{array_jobs} -#$ -tc {array_max} +::PICO_ARRAY_ONLY::#$ -t 1-{array_jobs} +::PICO_ARRAY_ONLY::#$ -tc {array_max} +::PICO_THREAD_ONLY::#$ -pe smp {threads} # sleep option (for preventing race conditions on network file systems) sleep {sleep_time} From e3051bb6cd3973a0b6af8cca89490d51110384c5 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 12:29:43 -0400 Subject: [PATCH 08/48] convert additional tasks to blueprint; minor logging changes --- bin/agg_gwas.py | 25 ++++--- bin/agg_imp.py | 25 +++---- bin/bg_imp.py | 49 ++++++------- bin/imp2_rel.py | 178 +++++++++++++++++++--------------------------- bin/imp_prep.pl | 10 +-- bin/impute_rel.py | 31 +++----- bin/shape_rel.py | 1 + 7 files changed, 131 insertions(+), 188 deletions(-) diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index 13fada3..f32326f 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -41,6 +41,7 @@ from math import log10, sqrt from args_gwas import * from py_helpers import unbuffer_stdout, file_len, file_tail +from blueprint import send_job # , read_conf, link unbuffer_stdout() @@ -216,19 +217,17 @@ print '\n...Replacing this agg job in the queue...' - agg_log = 'agg.'+str(outdot)+'.resub_'+str(nummiss)+'.qsub.log' - uger_agg = ' '.join(['qsub', - '-hold_jid','gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss), - '-q', 'long', - '-l', 'm_mem_free=24g,h_vmem=24g', - '-N', 'agg_'+str(outdot), - '-o', agg_log, - str(rp_bin)+'/uger.sub.sh', - str(10), # hardcoded since chunks shouldn't normally need a sleep argument - ' '.join(sys.argv[:])]) - - print uger_agg + '\n' - subprocess.check_call(uger_agg, shell=True) + # TODO: adjust memory setting here + + agg_log = 'agg.'+str(outdot)+'.resub_'+str(nummiss)+'.sub.log' + + send_job(jobname='agg_'+str(outdot), + cmd=' '.join(sys.argv[:]), + logname=agg_log, + mem=24000, + walltime=168, # week + wait_name='gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss), + sleep=10) print '\n############' print '\n' diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 9e273c9..10a0ad9 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -31,6 +31,7 @@ import subprocess from args_impute import * from py_helpers import unbuffer_stdout, find_exec, test_exec, file_len #, file_tail, link, warn_format +from blueprint import send_job unbuffer_stdout() # warnings.formatwarning = warn_format @@ -196,21 +197,17 @@ print '\n...Replacing this aggregation job in the queue...' 
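# The wait_name= arguments in these send_job() conversions are presumably what
# replace the hand-written '-hold_jid <jobname>' flags in the qsub calls being
# deleted below: blueprint.py builds the launch command from the cluster config
# (sub_cmd and hold_flag in cluster_templates/broad_uger.conf). A minimal
# illustration; all values here are made up:
sub_cmd, hold_flag = 'qsub', '-hold_jid'                        # as in broad_uger.conf
wait_name, sub_script = 'gwas.chunks.mystudy', 'mystudy.sub.sh' # hypothetical
launch_str = ' '.join([sub_cmd, hold_flag, wait_name, sub_script])
# launch_str == 'qsub -hold_jid gwas.chunks.mystudy mystudy.sub.sh'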
- # TODO: consider queue/mem for agg os.chdir(wd) - agg_log = 'agg_imp.'+str(outdot)+'.resub_'+str(nummiss)+'.qsub.log' - uger_agg = ' '.join(['qsub', - '-hold_jid','bg.chunks.'+str(outdot)+'.resub_'+str(nummiss), - '-q', 'long', - '-l', 'm_mem_free=8g,h_vmem=8g', - '-N', 'agg.imp.'+str(outdot), - '-o', agg_log, - str(uger_ex), - str(args.sleep), - ' '.join(sys.argv[:])]) - - print uger_agg + '\n' - subprocess.check_call(uger_agg, shell=True) + agg_log = 'agg_imp.'+str(outdot)+'.resub_'+str(nummiss)+'.sub.log' + + # TODO: consider queue/mem for agg + send_job(jobname='agg.imp.'+str(outdot), + cmd=' '.join(sys.argv[:]), + logname=agg_log, + mem=8000, + walltime=168, # week + wait_name='bg.chunks.'+str(outdot)+'.resub_'+str(nummiss), + sleep=args.sleep) print '\n############' print '\n' diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 77b77e3..7d99557 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -36,6 +36,7 @@ import warnings from args_impute import * from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format +from blueprint import send_job unbuffer_stdout() warnings.formatwarning = warn_format @@ -298,21 +299,17 @@ print '\n...Replacing this best-guess job in the queue...' - # TODO: consider queue/mem for agg os.chdir(wd) - bg_log = 'bg.'+str(outdot)+'.resub_'+str(nummiss)+'.qsub.log' - uger_bg = ' '.join(['qsub', - '-hold_jid','imp.chunks.'+str(outdot)+'.resub_'+str(nummiss), - '-q', 'short', - '-l', 'm_mem_free=4g,h_vmem=8g', - '-N', 'bg.chunks.'+str(outdot), - '-o', bg_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - ' '.join(sys.argv[:])]) - - print uger_bg + '\n' - subprocess.check_call(uger_bg, shell=True) + bg_log = 'bg.'+str(outdot)+'.resub_'+str(nummiss)+'.sub.log' + + # TODO: consider queue/mem for agg + send_job(jobname='bg.chunks.'+str(outdot), + cmd=' '.join(sys.argv[:]), + logname=bg_log, + mem=8000, + walltime=2, # week + wait_name='imp.chunks.'+str(outdot)+'.resub_'+str(nummiss), + sleep=args.sleep) print '\n############' print '\n' @@ -426,22 +423,16 @@ os.chdir(wd) next_call = str(rp_bin) + '/agg_imp.py '+' '.join(sys.argv[1:]) - # TODO: consider queue/mem for agg - agg_log = 'agg_imp.'+str(outdot)+'.qsub.log' - uger_agg = ' '.join(['qsub', - '-hold_jid','bg.chunks.'+str(outdot), - '-q', 'long', - '-l', 'm_mem_free=8g,h_vmem=8g', - '-N', 'agg.imp.'+str(outdot), - '-o', agg_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - next_call]) - - print uger_agg + '\n' - subprocess.check_call(uger_agg, shell=True) + agg_log = 'agg_imp.'+str(outdot)+'.sub.log' - + # TODO: consider queue/mem for agg + send_job(jobname='agg.imp.'+str(outdot), + cmd=next_call, + logname=agg_log, + mem=8000, + walltime=168, # week + wait_name='bg.chunks.'+str(outdot), + sleep=args.sleep) # finish print '\n############' diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 2951b60..381a58b 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -26,8 +26,10 @@ ### load requirements import os import subprocess +from textwrap import dedent from args_impute import * from py_helpers import unbuffer_stdout, file_len, link, find_exec +from blueprint import send_job unbuffer_stdout() @@ -90,6 +92,13 @@ print '\n...Checking dependencies...' 
############# +# get cluster configuration +# needed for specifying logfile names with clust_conf['log_task_id'] +conf_file = os.environ['HOME']+"/picopili.conf" +configs = read_conf(conf_file) +cluster = configs['cluster'] +clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') + # from config impute_ex = find_exec('impute2',key='i2loc') shapeit_ex = find_exec('shapeit',key='shloc') @@ -101,6 +110,8 @@ test_exec(chunker_ex) + + # TODO: here # .hg19.ch.fl.bim for chunking # imp. references @@ -138,8 +149,8 @@ bad_chr.append(chrom) -# TODO: resub shapeit if failed -# TODO: re-queue this job +# if any shapeit jobs failed, +# resubmit them and re-queue this job if bad_chr: num_chr = len(bad_chr) print 'Missing pre-phasing results for %d chromosomes.' % num_chr @@ -168,30 +179,14 @@ print 'Exiting...\n' exit(1) - # make submit script - # using this structure to get adaptive chromosome list - uger_phase_template = """#!/usr/bin/env sh - #$ -j y - #$ -cwd - #$ -V - #$ -N {jname} - #$ -q long - #$ -l m_mem_free={mem}g,h_vmem={mem}g - #$ -pe smp {threads} - #$ -t 1-{nchr} - #$ -o {outlog} - - source /broad/software/scripts/useuse - reuse -q Anaconda - sleep {sleep} - + # setup submit script + # with "chr_list" to get have adaptive chromosome list + cmd_templ = dedent("""\ chrs=({chr_list}) - chrom=${{chrs[${{SGE_TASK_ID}}-1]}} + chrom=${{chrs[{task}-1]}} - {shape_ex} {bed} {map} {ref} {window} {duo_txt} {thread_str} {seed_str} {outmax} {shapelog} - - # eof - """ + {shape_ex} {bed} {map} {ref} {window} {duo_txt} {thread_str} {seed_str} {outmax} {shapelog} + """) # shape_call = [shapeit_ex, # '--input-bed', chrstem+'.bed', chrstem+'.bim', chrstem+'.fam', @@ -203,21 +198,15 @@ # '--seed', str(args.shape_seed), # '--output-max', outstem+'.phased.haps', outstem+'.phased.sample', # '--output-log', outstem+'.shape.log'] - - # fill in template - chrstem = str(args.bfile)+'.hg19.ch.fl.chr${chrom}' - outstem = str(outdot)+'.chr${chrom}' + # manage duohmm arg if extra_args.no_duohmm: duo_txt = '' else: duo_txt = '--duohmm' - jobdict = {"jname": 'shape.'+str(outdot)+'.resub_'+str(num_chr), - "mem": str(extra_args.mem_req), - "threads": str(extra_args.threads), - "nchr": str(num_chr), - "outlog": 'shape.'+str(outdot)+'.resub_'+str(num_chr)+'.qsub.$TASK_ID.log', - "sleep": str(args.sleep), + + # fill in shapeit template + jobdict = {"task": "{task}", "chr_list": ' '.join(bad_chr), "shape_ex": str(shapeit_ex), "bed": '--input-bed '+str(chrstem)+'.bed '+str(chrstem)+'.bim '+str(chrstem)+'.fam', @@ -229,15 +218,19 @@ "seed_str": '--seed '+str(extra_args.shape_seed), "outmax": '--output-max '+str(outstem)+'.phased.haps '+str(outstem)+'.phased.sample', "shapelog": str(outstem)+'.shape.resub_'+str(num_chr)+'.log', - } - - uger_phase = open(uger_phase_name, 'w') - uger_phase.write(uger_phase_template.format(**jobdict)) - uger_phase.close() - + } + shape_cmd = cmd_templ.format(**jobdict) + # submit - print ' '.join(['qsub',uger_phase_name]) + '\n' - subprocess.check_call(' '.join(['qsub',uger_phase_name]), shell=True) + send_job(jobname='shape.'+str(outdot)+'.resub_'+str(num_chr), + cmd=shape_cmd, + logname='shape.'+str(outdot)+'.resub_'+str(num_chr)+'.sub.'+str(clust_conf['log_task_id'])+'.log', + mem=int(extra_args.mem_req)*1000, + walltime=168, # week + njobs=int(num_chr), + threads=extra_args.threads, + sleep=args.sleep) + print 'Pre-phasing jobs re-submitted for %d chromosomes.\n' % num_chr @@ -246,19 +239,15 @@ print '\n...Replacing this imputation job in the queue...' 
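# Note that jobdict maps "task" to the literal string "{task}", so the
# .format() call above leaves that placeholder intact; blueprint.py can then
# presumably substitute the scheduler's own task index (the task_id entry in
# the cluster .conf, e.g. ${SGE_TASK_ID} on Broad UGER). A small sketch of the
# assumed two-stage substitution, using a made-up template:
demo_templ = 'cstart=`awk -v a={task} {cfile}`'
step1 = demo_templ.format(task='{task}', cfile='demo.chunks.txt')
# step1 == 'cstart=`awk -v a={task} demo.chunks.txt`'   (placeholder preserved)
step2 = step1.format(task='${SGE_TASK_ID}')
# step2 == 'cstart=`awk -v a=${SGE_TASK_ID} demo.chunks.txt`'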
os.chdir(wd) - imp_log = 'imp_chunks.'+str(outdot)+'.qsub.log' - uger_imp = ' '.join(['qsub', - '-hold_jid','imp.chunks.'+str(outdot), - '-q', 'short', - '-l', 'm_mem_free=8g,h_vmem=8g', - '-N', 'imp.chunks.'+str(outdot), - '-o', imp_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - ' '.join(sys.argv[:])]) - - print uger_imp + '\n' - subprocess.check_call(uger_imp, shell=True) + imp_log = 'imp_chunks.'+str(outdot)+'.sub.log' + + send_job(jobname='imp.chunks.'+str(outdot), + cmd=' '.join(sys.argv[:]), + logname=imp_log, + mem=8000, + walltime=2, # week + wait_name='shape.'+str(outdot)+'.resub_'+str(num_chr), + sleep=args.sleep) print '\n############' print '\n' @@ -330,38 +319,18 @@ os.chdir(imp_dir) link(str(chunk_dir)+'/'+str(outdot)+'.chunks.txt', str(outdot)+'.chunks.txt', 'genomic chunk results') -uger_imp_template = """#!/usr/bin/env sh -#$ -j y -#$ -cwd -#$ -V -#$ -N {jname} -#$ -q short -#$ -l m_mem_free=8g,h_vmem=8g -#$ -t 1-{nchunk} -#$ -o {outlog} - -source /broad/software/scripts/useuse -reuse -q Anaconda -sleep {sleep} +# job script +imp_templ = dedent("""\ + cchr=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` + cstart=`awk -v a={task} 'NR==a+1{{print $2}}' {cfile}` + cend=`awk -v a={task} 'NR==a+1{{print $3}}' {cfile}` + cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` -cchr=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $1}}' {cfile}` -cstart=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $2}}' {cfile}` -cend=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $3}}' {cfile}` -cname=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $4}}' {cfile}` - -{impute_ex} -use_prephased_g -known_haps_g {in_haps} -h {ref_haps} -l {ref_leg} -m {map} -int ${{cstart}} ${{cend}} -buffer {buffer} -Ne {Ne} -allow_large_regions -o_gz -o {out} {seedtxt} - -# eof -""" - -# get number of chunks (-1 is for header) -nchunks = file_len(outdot+'.chunks.txt')-1 + {impute_ex} -use_prephased_g -known_haps_g {in_haps} -h {ref_haps} -l {ref_leg} -m {map} -int ${{cstart}} ${{cend}} -buffer {buffer} -Ne {Ne} -allow_large_regions -o_gz -o {out} {seedtxt} +""") # fill in template -jobdict = {"jname": 'imp.chunks.'+str(outdot), - "nchunk": str(nchunks), - "outlog": str('imp.chunks.'+str(outdot)+'.$TASK_ID.qsub.log'), - "sleep": str(args.sleep), +jobdict = {"task": "{task}", "cfile": str(outdot)+'.chunks.txt', "impute_ex": str(impute_ex), "in_haps": str(shape_dir)+'/'+str(outdot)+'.chr${cchr}.phased.haps', @@ -373,19 +342,23 @@ "out": str(outdot)+'.imp.${cname}', "seedtxt": str(seedtxt) } +cmd_imp = imp_templ.format(**jobdict) -uger_imp = open(str(outdot)+'.imp_chunks.sub.sh', 'w') -uger_imp.write(uger_imp_template.format(**jobdict)) -uger_imp.close() +# get number of chunks (-1 is for header) +nchunks = file_len(outdot+'.chunks.txt')-1 # submit -print ' '.join(['qsub',uger_imp.name]) + '\n' -subprocess.check_call(' '.join(['qsub',uger_imp.name]), shell=True) +send_job(jobname='imp.chunks.'+str(outdot), + cmd=cmd_imp, + logname=str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=8000, + walltime=2, + njobs=int(nchunks), + sleep=args.sleep) print 'Imputation jobs submitted for %d chunks.\n' % nchunks - ### # submit next imputation task ### @@ -397,23 +370,16 @@ os.chdir(wd) next_call = str(rp_bin) + '/bg_imp.py '+' '.join(sys.argv[1:]) - # TODO: consider queue/mem for agg - bg_log = 'bg_imp.'+str(outdot)+'.qsub.log' - uger_bg = ' '.join(['qsub', - '-hold_jid','imp.chunks.'+str(outdot), - '-q', 'short', - '-l', 'm_mem_free=4g,h_vmem=8g', - '-N', 'bg.chunks.'+str(outdot), - '-o', bg_log, - 
str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - next_call]) - - print uger_bg + '\n' - subprocess.check_call(uger_bg, shell=True) - - + bg_log = 'bg_imp.'+str(outdot)+'.sub.log' + # TODO: consider queue/mem for agg + send_job(jobname='bg.chunks.'+str(outdot), + cmd=next_call, + logname=bg_log, + mem=8000, + walltime=2, # week + wait_name='imp.chunks.'+str(outdot), + sleep=args.sleep) diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index 3c75dfb..1afc2ee 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -540,10 +540,10 @@ sub send_jobarray { $err_message .= "##### step $sjaname has been done repeatedly without any progress\n"; $err_message .= "##### imputation pipeline stopped: $command_line\n"; $err_message .= "##### $sjainfotxt\n"; - $err_message .= "##### if reason does not appear obvious\n"; - $err_message .= "##### have a look at the wiki page\n"; - $err_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n"; - $err_message .= "##### or contact the developers\n"; +# $err_message .= "##### if reason does not appear obvious\n"; +# $err_message .= "##### have a look at the wiki page\n"; +# $err_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n"; +# $err_message .= "##### or contact the developers\n"; $err_message .= "##################################################################\n"; print "$err_message\n"; @@ -552,7 +552,7 @@ sub send_jobarray { close ERR; if($email_on){ - &mysystem ('cat error_file | '.$mutt_script.' -s RP_pipeline_error '.$email) ; + &mysystem ('cat error_file | '.$mutt_script.' -s Picopili_pipeline_error '.$email) ; } unless ($serial) { diff --git a/bin/impute_rel.py b/bin/impute_rel.py index 13e79e3..332518c 100755 --- a/bin/impute_rel.py +++ b/bin/impute_rel.py @@ -25,9 +25,9 @@ ### load requirements import os -import subprocess from args_impute import * from py_helpers import unbuffer_stdout #, read_conf, file_tail, link, warn_format +from blueprint import send_job unbuffer_stdout() ############# @@ -121,12 +121,10 @@ ############# -print '\n...Checking dependencies...' +# print '\n...Checking dependencies...' 
############# - - # TODO: here @@ -138,24 +136,15 @@ rp_bin = os.path.dirname(os.path.realpath(__file__)) next_call = str(rp_bin) + '/shape_rel.py '+' '.join(sys.argv[1:])+' --full-pipe' -# TODO: consider queue/mem for agg -shape_log = 'shape.'+str(outdot)+'.qsub.log' -uger_shape = ' '.join(['qsub', - '-q', 'long', - '-l', 'm_mem_free='+str(args.mem_req)+'g,h_vmem='+str(args.mem_req)+'g', - '-N', 'shape.'+str(outdot), - '-o', shape_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - next_call]) - -print uger_shape + '\n' -subprocess.check_call(uger_shape, shell=True) - - -# TODO: here - +shape_log = 'shape.'+str(outdot)+'.sub.log' +# TODO: consider queue/mem +send_job(jobname='shape.'+str(outdot), + cmd=next_call, + logname=shape_log, + mem=int(args.mem_req * 1000), + walltime=168, # week + sleep=args.sleep) # finish diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 82d6935..92bd5e0 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -45,6 +45,7 @@ # import warnings from args_impute import * from py_helpers import unbuffer_stdout, link, find_exec #, test_exec +from blueprint import send_job # file_len, read_conf, find_from_path, link, gz_confirm unbuffer_stdout() From 5df6c382e3343fd389db3b31330c20fe89b848f4 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 13:33:57 -0400 Subject: [PATCH 09/48] blueprint remaining direct jobs (not resubs) --- bin/agg_imp.py | 3 - bin/bg_imp.py | 88 ++++++++++++++-------------- bin/gwas_rel.py | 148 +++++++++++++++++++----------------------------- 3 files changed, 100 insertions(+), 139 deletions(-) diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 10a0ad9..a1d411c 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -16,9 +16,6 @@ # #################################### -# TODO: enable failed chunk check - - import sys ############# diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 7d99557..34233bd 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -34,6 +34,7 @@ import os import subprocess import warnings +from textwrap import dedent from args_impute import * from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format from blueprint import send_job @@ -192,6 +193,12 @@ rp_bin = os.path.dirname(os.path.realpath(__file__)) rs_ex = str(rp_bin)+'/rs_trans.py' +# get cluster configuration +# needed for specifying logfile names with clust_conf['log_task_id'] +conf_file = os.environ['HOME']+"/picopili.conf" +configs = read_conf(conf_file) +cluster = configs['cluster'] +clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') # TODO: here @@ -334,51 +341,34 @@ print '\n...Generating best-guess genotypes...' 
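# The cluster lookup added above assumes ~/picopili.conf and
# cluster_templates/<cluster>.conf are plain whitespace-delimited key/value
# files with optional trailing '#' comments, matching the updated read_conf()
# in py_helpers.py. A minimal sketch of that parse on a made-up line:
line = 'hold_flag -hold_jid   # scheduler dependency flag\n'
key, val = line.split('#', 1)[0].rstrip().split(None, 1)
# key == 'hold_flag', val == '-hold_jid'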
###################### -# TODO: flex queue/mem reqs -uger_bg_template = """#!/usr/bin/env sh -#$ -j y -#$ -cwd -#$ -V -#$ -N {jname} -#$ -q short -#$ -l m_mem_free=4g,h_vmem=8g -#$ -t 1-{nchunk} -#$ -o {outlog} - -source /broad/software/scripts/useuse -reuse -q Anaconda -sleep {sleep} - -cname=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $4}}' {cfile}` -cchr=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $1}}' {cfile}` - -{plink_ex} --gen {gen_in} --sample {samp_in} --oxford-single-chr ${{cchr}} --oxford-pheno-name plink_pheno --hard-call-threshold {hard_call_th} --missing-code -9,NA,na --allow-no-sex --silent --memory 4000 --out {out_str} - -sleep {sleep} -# note: Mendel errors checked after --update-parents, see https://www.cog-genomics.org/plink2/order -{plink_ex} --bfile {out_str} {mendel_txt} --pheno {idnum} --mpheno 4 --update-parents {idnum} --allow-no-sex --make-bed --silent --memory 2000 --out {out_str2} -rm {out_str}.bed -rm {out_str}.bim -rm {out_str}.fam - -sleep {sleep} -{plink_ex} --bfile {out_str2} {maf_txt} {mac_txt} {geno_txt} {info_txt} --allow-no-sex --make-bed --silent --memory 2000 --out {out_str_filt} -rm {out_str2}.bed -rm {out_str2}.bim -rm {out_str2}.fam - -{rs_ex} --chunk ${{cname}} --name {outdot} --imp-dir {imp_dir} --fam-trans {trans} - -# eof -""" +# best-guess job script for each chunk +bg_templ = dedent("""\ + cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` + cchr=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` + + {plink_ex} --gen {gen_in} --sample {samp_in} --oxford-single-chr ${{cchr}} --oxford-pheno-name plink_pheno --hard-call-threshold {hard_call_th} --missing-code -9,NA,na --allow-no-sex --silent --memory 4000 --out {out_str} + + sleep {sleep} + # note: Mendel errors checked after --update-parents, see https://www.cog-genomics.org/plink2/order + {plink_ex} --bfile {out_str} {mendel_txt} --pheno {idnum} --mpheno 4 --update-parents {idnum} --allow-no-sex --make-bed --silent --memory 2000 --out {out_str2} + rm {out_str}.bed + rm {out_str}.bim + rm {out_str}.fam + + sleep {sleep} + {plink_ex} --bfile {out_str2} {maf_txt} {mac_txt} {geno_txt} {info_txt} --allow-no-sex --make-bed --silent --memory 2000 --out {out_str_filt} + rm {out_str2}.bed + rm {out_str2}.bim + rm {out_str2}.fam + {rs_ex} --chunk ${{cname}} --name {outdot} --imp-dir {imp_dir} --fam-trans {trans} +""") + # get number of chunks nchunks = len(chunks) # fill in template -jobdict = {"jname": 'bg.chunks.'+str(outdot), - "nchunk": str(nchunks), - "outlog": str('bg.chunks.'+str(outdot)+'.$TASK_ID.qsub.log'), +jobdict = {"task": "{task}", "sleep": str(args.sleep), "cfile": str(outdot)+'.chunks.txt', "plink_ex": str(plink_ex), @@ -400,16 +390,20 @@ "trans": str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam.transl' } -uger_bg = open(str(outdot)+'.bg_chunks.sub.sh', 'w') -uger_bg.write(uger_bg_template.format(**jobdict)) -uger_bg.close() +bg_cmd = bg_templ.format(**jobdict) -# submit -print ' '.join(['qsub',uger_bg.name]) + '\n' -subprocess.check_call(' '.join(['qsub',uger_bg.name]), shell=True) -print 'Best-guess jobs submitted for %d chunks.\n' % nchunks +# submit +# TODO: flex queue/mem reqs +send_job(jobname='bg.chunks.'+str(outdot), + cmd=bg_cmd, + logname=str('bg.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=8000, + walltime=2, + njobs=int(nchunks), + sleep=args.sleep) +print 'Best-guess jobs submitted for %d chunks.\n' % nchunks ### diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 01ae60a..8847d32 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -26,8 
+26,10 @@ import subprocess import os from warnings import warn +from textwrap import dedent from args_gwas import * from py_helpers import link, unbuffer_stdout, find_exec +from blueprint import send_job unbuffer_stdout() @@ -139,6 +141,13 @@ if args.rscript_ex == None or args.rscript_ex == "None": args.rscript_ex = find_exec('Rscript', key='rscloc') +# get cluster configuration +# needed for specifying logfile names with clust_conf['log_task_id'] +conf_file = os.environ['HOME']+"/picopili.conf" +configs = read_conf(conf_file) +cluster = configs['cluster'] +clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') + # TODO: here @@ -387,36 +396,26 @@ def find_chunk(snpchrom, snpbp, last_chunk): ###################### print '\n...Submitting GWAS for all chunks...' -###################### - -# gwas each chunk -# need to write submit script to include chunk name parsing # TODO: consider making queue/resources flexible +###################### +# basic template, depending on model if args.model == 'gee' or args.model == 'dfam': - uger_gwas_template = """#!/usr/bin/env sh -#$ -j y -#$ -cwd -#$ -V -#$ -N {jname} -#$ -q short -#$ -l m_mem_free=4g,h_vmem=4g -#$ -t 1-{nchunk} -#$ -tc 200 -#$ -o {outlog} + gwas_templ = dedent("""\ + cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` + {misc} + {gwas_ex} --bfile {bfile} --out {argout} --extract {outdot}.snps.${{cname}}.txt {optargs} + """) -source /broad/software/scripts/useuse -reuse -q Anaconda -sleep {sleep} - -cname=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $4}}' {cfile}` - -{misc} +elif args.model == 'gmmat' or args.model == 'gmmat-fam': + gwas_templ = dedent("""\ + cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` + chrnum=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` -{gwas_ex} --bfile {bfile} --out {argout} --extract {outdot}.snps.${{cname}}.txt {optargs} + {plinkx} --bfile {bfile} --extract {outdot}.snps.${{cname}}.txt {optargs} --make-bed --out {outdot}.${{cname}} -# eof -""" + {rsc} --no-save --no-restore {gwas_ex} {outdot}.${{cname}} grm.{outdot}.loco_chr${{chrnum}}.rel.gz {covarsub} {outdot}.${{cname}} > {outdot}.${{cname}}.gmmat.R.log + """) # alternative template for GMMAT # Rscript --no-save --no-restore @@ -426,34 +425,10 @@ def find_chunk(snpchrom, snpbp, last_chunk): # ../fgwa_eur_1KGp3_postimp.pca.txt (covariate file) # test1 (output name) # > test_gmm.log -elif args.model == 'gmmat' or args.model == 'gmmat-fam': - uger_gwas_template = """#!/usr/bin/env sh -#$ -j y -#$ -cwd -#$ -V -#$ -N {jname} -#$ -q short -#$ -l m_mem_free=4gi,h_vmem=4g -#$ -t 1-{nchunk} -#$ -tc 200 -#$ -o {outlog} - -source /broad/software/scripts/useuse -sleep {sleep} - -cname=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $4}}' {cfile}` -chrnum=`awk -v a=${{SGE_TASK_ID}} 'NR==a+1{{print $1}}' {cfile}` - -{plinkx} --bfile {bfile} --extract {outdot}.snps.${{cname}}.txt {optargs} --make-bed --out {outdot}.${{cname}} - -{rsc} --no-save --no-restore {gwas_ex} {outdot}.${{cname}} grm.{outdot}.loco_chr${{chrnum}}.rel.gz {covarsub} {outdot}.${{cname}} > {outdot}.${{cname}}.gmmat.R.log - -# eof -""" +# optional arguments gwasargs = '' - if args.pheno is not None: gwasargs = gwasargs + ' --pheno '+str(args.pheno) if args.keep is not None: @@ -461,7 +436,7 @@ def find_chunk(snpchrom, snpbp, last_chunk): if args.remove is not None: gwasargs = gwasargs + ' --remove '+str(args.remove) -# these args not passed for gmmat +# model-specific arguments not passed for gmmat if args.model == 'gee' or args.model == 'dfam': if args.addout is not None: gwasargs = 
gwasargs + ' --addout '+str(args.addout)+'.${cname}' @@ -474,16 +449,22 @@ def find_chunk(snpchrom, snpbp, last_chunk): gwasargs = gwasargs + ' --r-ex '+str(args.r_ex)+' --rplink-ex '+str(args.rplink_ex) +# model specific arguments for gee to specify Rserve port for each job +# targeting IANA range 49152-65535 +# (assuming here will be < 16k jobs; gwas_gee.py handles overflow check) +if args.model == 'gee': + misc_txt = 'rport=$((49151+{task}))' + gwasargs = str(gwasargs) +' --port $rport' +else: + misc_txt = '' + # TODO: pass through cleanup - -nchunk = len(chunks.keys()) -jobdict = {"jname": 'gwas.chunks.'+str(outdot), - "nchunk": str(nchunk), - "outlog": str('gwas.chunks.'+str(outdot)+'.$TASK_ID.qsub.log'), - "sleep": str(args.sleep), + +# fill in template +jobdict = {"task": "{task}", "cfile": chunk_file.name, - "misc": '', + "misc": str(misc_txt), "gwas_ex": str(gwas_ex), "bfile": str(args.bfile), "argout": str(args.out), @@ -494,20 +475,21 @@ def find_chunk(snpchrom, snpbp, last_chunk): "rsc": str(args.rscript_ex) } -# for gee, need to specify Rserve port for each job -# targeting IANA range 49152-65535 -# (assuming here will be < 16k jobs; gwas_gee.py handles overflow check) -if args.model == 'gee': - jobdict['misc'] = 'rport=$((49151+SGE_TASK_ID))' - jobdict['optargs'] = str(gwasargs) +' --port $rport' +gwas_cmd = gwas_templ.format(**jobdict) -uger_gwas = open(str(outdot)+'.gwas_chunks.sub.sh', 'w') -uger_gwas.write(uger_gwas_template.format(**jobdict)) -uger_gwas.close() +# submit job +nchunk = len(chunks.keys()) + +send_job(jobname='gwas.chunks.'+str(outdot), + cmd=gwas_cmd, + logname=str('gwas.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=4000, + walltime=2, + njobs=int(nchunk), + maxpar=200, + sleep=args.sleep) -print ' '.join(['qsub',uger_gwas.name]) + '\n' -subprocess.check_call(' '.join(['qsub',uger_gwas.name]), shell=True) print 'GWAS jobs submitted for %d chunks.\n' % nchunk @@ -548,7 +530,7 @@ def find_chunk(snpchrom, snpbp, last_chunk): else: info_file_txt = ['','','',''] -agg_log = 'agg.'+str(outdot)+'.qsub.log' +agg_log = 'agg.'+str(outdot)+'.sub.log' agg_call = [str(rp_bin)+'/agg_gwas.py', '--bfile',str(args.bfile), '--out',str(args.out), @@ -563,19 +545,14 @@ def find_chunk(snpchrom, snpbp, last_chunk): '--model',str(args.model)] agg_call = filter(None,agg_call) -uger_agg = ' '.join(['qsub', - '-hold_jid','gwas.chunks.'+str(outdot), - '-q', 'long', - '-l', 'm_mem_free=4g,h_vmem=4g', - '-N', 'agg_'+str(outdot), - '-o', agg_log, - str(rp_bin)+'/uger.sub.sh', - str(args.sleep), - ' '.join(agg_call)]) - -print uger_agg + '\n' -subprocess.check_call(uger_agg, shell=True) +send_job(jobname='agg_'+str(outdot), + cmd=' '.join(agg_call), + logname=agg_log, + mem=4000, + walltime=168, # week + wait_name='gwas.chunks.'+str(outdot), + sleep=args.sleep) # TODO: # queue summarization script (plots, etc) @@ -590,11 +567,4 @@ def find_chunk(snpchrom, snpbp, last_chunk): print 'All jobs submitted.\n' exit(0) -#uger_chunk = ' '.join(['qsub', -# '-hold_jid',str(name), -# '-q', 'short', -# '-N', str('chunk_'+out), -# '-o', chunk_log, -# str(rp_bin)+'/uger.sub.sh', -# str(sleep), -# str(chunk_call)]) +# eof From ea5e5358d68381958754c68afd861cc3017350a9 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 17:10:32 -0400 Subject: [PATCH 10/48] blueprint resubs, using pickled job info --- .gitignore | 1 + bin/agg_gwas.py | 81 ++++++++++++++++++++++++++--------------- bin/agg_imp.py | 76 +++++++++++++++++++++++++------------- bin/args_gwas.py | 2 +- 
bin/args_impute.py | 2 +- bin/bg_imp.py | 91 ++++++++++++++++++++++++++++++++++------------ bin/blueprint.py | 56 ++++++++++++++++++++++++++++ bin/gwas_rel.py | 19 +++++++++- bin/imp2_rel.py | 23 ++++++++++-- bin/pca_rel.py | 2 +- bin/shape_rel.py | 2 +- 11 files changed, 267 insertions(+), 88 deletions(-) diff --git a/.gitignore b/.gitignore index f5ac793..adcf4fe 100644 --- a/.gitignore +++ b/.gitignore @@ -6,5 +6,6 @@ bin/args_qc.pyc bin/args_gwas.pyc bin/args_chunks.pyc bin/args_impute.pyc +bin/test_debug lib/plague* lib/buigue* diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index f32326f..750f50a 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -179,39 +179,60 @@ tmp_chunk_file.close() print 'List of missing chunks: %s' % tmp_chunk_file.name - + + + ### # copy original submit script - # replace chunk list, name, number of tasks - orig_uger_file = open(str(outdot)+'.gwas_chunks.sub.sh', 'r') - new_uger_file = open(str(outdot)+'.gwas_chunks.resub_'+ str(nummiss)+'_chunks.sub.sh', 'w') + # replace chunk list, name, number of tasks, memory spec + # resubmit + ### + + # load pickle of job info + orig_job_conf = 'gwas.chunks.'+str(outdot)+'.pkl' - for line in orig_uger_file: - if '#$ -t ' in line: - new_uger_file.write('#$ -t 1-'+str(nummiss)+'\n') - continue -# elif '#$ -tc ' in line: -# if nummiss < 20: -# new_uger_file.write('#$ -tc 5 \n') -# elif nummiss < 50: -# new_uger_file.write('#$ -tc 10 \n') -# elif nummiss < 100: -# new_uger_file.write('#$ -tc 25 \n') -# else: -# new_uger_file.write('#$ -tc 40 \n') -# new_uger_file.write('#$ -tc 5 \n') - elif '#$ -l m_mem_free' in line: - new_uger_file.write('#$ -l m_mem_free=24g,h_vmem=24g \n') - else: - line=line.replace(args.chunk_file, tmp_chunk_file.name) - line=line.replace('.$TASK_ID.','.tmp'+str(nummiss)+'.$TASK_ID.') - line=line.replace('#$ -N gwas.chunks.'+str(outdot), '#$ -N gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss)) - new_uger_file.write(line) - - orig_uger_file.close() - new_uger_file.close() + if not os.path.isfile(orig_job_conf): + orig_job_file = str(outdot)+'.gwas_chunks.sub.sh' + raise IOError("Unable to find previous job configuration pickle %s.\ + \nRefer to previous submit script %s to modify/resubmit.\n" % (str(orig_job_conf),str(orig_job_file))) - print ' '.join(['qsub',new_uger_file.name]) + '\n' - subprocess.check_call(' '.join(['qsub',new_uger_file.name]), shell=True) + + cmd_templ, job_dict, sendjob_dict = load_job(orig_job_conf) + + # rename resub + sendjob_dict['jobname'] = 'gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss) + + sendjob_dict['logname'] = str('gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss)+'.'+str(clust_conf['log_task_id'])+'.sub.log') + + # increase memory and walltime + # TODO: consider how to scale mem/time here + oldmem = sendjob_dict['mem'] + sendjob_dict['mem'] = int(oldmem)*2 + + oldtime = sendjob_dict['walltime'] + sendjob_dict['walltime'] = int(oldtime)*4 + + # replace chunk file and set number of new jobs + sendjob_dict['njobs'] = int(nummiss) + + job_dict['cfile'] = tmp_chunk_file_name + + + # re-save new settings (primarily to track updating mem and walltime) + save_job(jfile=orig_job_conf, cmd_templ=cmd_templ, job_dict=job_dict, sendjob_dict=sendjob_dict) + + + # submit + gwas_cmd = cmd_templ.format(**job_dict) + + send_job(jobname=sendjob_dict['jobname'], + cmd=gwas_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + maxpar=sendjob_dict['maxpar'], + sleep=sendjob_dict['sleep']) + print 'GWAS 
jobs resubmitted for %d chunks.\n' % nummiss diff --git a/bin/agg_imp.py b/bin/agg_imp.py index a1d411c..0d670fb 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -28,7 +28,7 @@ import subprocess from args_impute import * from py_helpers import unbuffer_stdout, find_exec, test_exec, file_len #, file_tail, link, warn_format -from blueprint import send_job +from blueprint import send_job, load_job unbuffer_stdout() # warnings.formatwarning = warn_format @@ -80,9 +80,6 @@ # get directory containing current script # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) -uger_ex = +str(rp_bin)+'/uger.sub.sh' - -test_exec(uger_ex) # TODO: here @@ -166,32 +163,61 @@ print 'List of missing chunks: %s' % tmp_chunk_file.name + ### # copy original submit script - # replace chunk list, name, number of tasks - orig_uger_file = open(str(outdot)+'.bg_chunks.sub.sh', 'r') - new_uger_file = open(str(outdot)+'.bg_chunks.resub_'+ str(nummiss)+'_chunks.sub.sh', 'w') + # replace chunk list, name, number of tasks, memory spec + # resubmit + ### + + # load pickle of job info + orig_job_conf = 'bg.chunks.'+str(outdot)+'.pkl' + + if not os.path.isfile(orig_job_conf): + orig_job_file = str(outdot)+'.bg_chunks.sub.sh' + raise IOError("Unable to find previous job configuration pickle %s.\ + \nRefer to previous submit script %s to modify/resubmit.\n" % (str(orig_job_conf),str(orig_job_file))) + - for line in orig_uger_file: - if '#$ -t ' in line: - new_uger_file.write('#$ -t 1-'+str(nummiss)+'\n') - elif '#$ -l m_mem_free' in line: - new_uger_file.write('#$ -l m_mem_free=8g,h_vmem=8g \n') - elif '#$ -q short' in line: - new_uger_file.write('#$ -q long \n') - else: - line=line.replace(chunk_file_name, tmp_chunk_file_name) - line=line.replace('.$TASK_ID.','.tmp'+str(nummiss)+'.$TASK_ID.') - line=line.replace('#$ -N bg.chunks.'+str(outdot), '#$ -N bg.chunks.'+str(outdot)+'.resub_'+str(nummiss)) - new_uger_file.write(line) - - orig_uger_file.close() - new_uger_file.close() - - print ' '.join(['qsub',new_uger_file.name]) + '\n' - subprocess.check_call(' '.join(['qsub',new_uger_file.name]), shell=True) + cmd_templ, job_dict, sendjob_dict = load_job(orig_job_conf) + + # rename resub + sendjob_dict['jobname'] = 'bg.chunks.'+str(outdot)+'.resub_'+str(nummiss) + + sendjob_dict['logname'] = str('bg.chunks.'+str(outdot)+'.resub_'+str(nummiss)+'.'+str(clust_conf['log_task_id'])+'.sub.log') + + # increase memory and walltime + # TODO: consider how to scale mem/time here + oldmem = sendjob_dict['mem'] + sendjob_dict['mem'] = int(oldmem) + 4000 + + oldtime = sendjob_dict['walltime'] + sendjob_dict['walltime'] = int(oldtime)*4 + + # replace chunk file and set number of new jobs + sendjob_dict['njobs'] = int(nummiss) + + job_dict['cfile'] = tmp_chunk_file_name + + + # re-save new settings (primarily to track updating mem and walltime) + save_job(jfile=orig_job_conf, cmd_templ=cmd_templ, job_dict=job_dict, sendjob_dict=sendjob_dict) + + + # submit + bg_cmd = cmd_templ.format(**job_dict) + + send_job(jobname=sendjob_dict['jobname'], + cmd=bg_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + sleep=sendjob_dict['sleep']) + print 'Best-guess jobs resubmitted for %d chunks.\n' % nummiss + print '\n...Replacing this aggregation job in the queue...' 
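# The resubmission pattern above leans on blueprint.py's save_job()/load_job()
# helpers, which appear to round-trip three objects (command template, template
# dict, submission settings) through a single pickle file. A standalone sketch
# of that contract, with made-up contents:
import cPickle as pickle
demo = ('echo {task}', {'task': '{task}'}, {'mem': 4000, 'walltime': 2})
with open('demo_job.pkl', 'wb') as f:
    for obj in demo:
        pickle.dump(obj, f, -1)
with open('demo_job.pkl', 'rb') as f:
    cmd_templ_demo, job_dict_demo, sendjob_dict_demo = [pickle.load(f) for _ in range(3)]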
os.chdir(wd) diff --git a/bin/args_gwas.py b/bin/args_gwas.py index 49189b7..50627cc 100644 --- a/bin/args_gwas.py +++ b/bin/args_gwas.py @@ -190,7 +190,7 @@ arg_clust.add_argument('--sleep', type=int, metavar='SEC', - help='Number of seconds to delay on start of UGER jobs', + help='Number of seconds to delay on start of cluster jobs', required=False, default=30) arg_exloc.add_argument('--r-ex', diff --git a/bin/args_impute.py b/bin/args_impute.py index bda6621..de41a7b 100644 --- a/bin/args_impute.py +++ b/bin/args_impute.py @@ -294,7 +294,7 @@ arg_clust.add_argument('--sleep', type=int, metavar='SEC', - help='Number of seconds to delay on start of UGER jobs', + help='Number of seconds to delay on start of cluster jobs', required=False, default=30) arg_clust.add_argument('--full-pipe', diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 34233bd..5f84d35 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -37,7 +37,7 @@ from textwrap import dedent from args_impute import * from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format -from blueprint import send_job +from blueprint import send_job, init_sendjob_dict, save_job unbuffer_stdout() warnings.formatwarning = warn_format @@ -278,30 +278,59 @@ print 'List of missing chunks: %s' % tmp_chunk_file.name + ### # copy original submit script - # replace chunk list, name, number of tasks - orig_uger_file = open(str(outdot)+'.imp_chunks.sub.sh', 'r') - new_uger_file = open(str(outdot)+'.imp_chunks.resub_'+ str(nummiss)+'_chunks.sub.sh', 'w') + # replace chunk list, name, number of tasks, memory spec + # resubmit + ### + + # load pickle of job info + orig_job_conf = 'imp.chunks.'+str(outdot)+'.pkl' - for line in orig_uger_file: - if '#$ -t ' in line: - new_uger_file.write('#$ -t 1-'+str(nummiss)+'\n') - elif '#$ -l m_mem_free' in line: - new_uger_file.write('#$ -l m_mem_free=24g,h_vmem=24g \n') - elif '#$ -q short' in line: - new_uger_file.write('#$ -q long \n') - else: - line=line.replace(chunk_file_name, tmp_chunk_file_name) - line=line.replace('.$TASK_ID.','.tmp'+str(nummiss)+'.$TASK_ID.') - line=line.replace('#$ -N imp.chunks.'+str(outdot), '#$ -N imp.chunks.'+str(outdot)+'.resub_'+str(nummiss)) - new_uger_file.write(line) - - orig_uger_file.close() - new_uger_file.close() - - print ' '.join(['qsub',new_uger_file.name]) + '\n' - subprocess.check_call(' '.join(['qsub',new_uger_file.name]), shell=True) + if not os.path.isfile(orig_job_conf): + orig_job_file = str(outdot)+'.imp_chunks.sub.sh' + raise IOError("Unable to find previous job configuration pickle %s.\ + \nRefer to previous submit script %s to modify/resubmit.\n" % (str(orig_job_conf),str(orig_job_file))) + + + cmd_templ, job_dict, sendjob_dict = load_job(orig_job_conf) + + # rename resub + sendjob_dict['jobname'] = 'imp.chunks.'+str(outdot)+'.resub_'+str(nummiss) + + sendjob_dict['logname'] = str('imp.chunks.'+str(outdot)+'.resub_'+str(nummiss)+'.'+str(clust_conf['log_task_id'])+'.sub.log') + + # increase memory and walltime + # TODO: consider how to scale mem/time here + oldmem = sendjob_dict['mem'] + sendjob_dict['mem'] = int(oldmem)*2 + + oldtime = sendjob_dict['walltime'] + sendjob_dict['walltime'] = int(oldtime)*4 + + # replace chunk file and set number of new jobs + sendjob_dict['njobs'] = int(nummiss) + + job_dict['cfile'] = tmp_chunk_file_name + + + # re-save new settings (primarily to track updating mem and walltime) + save_job(jfile=orig_job_conf, cmd_templ=cmd_templ, job_dict=job_dict, sendjob_dict=sendjob_dict) + + + # submit + imp_cmd = 
cmd_templ.format(**job_dict) + + send_job(jobname=sendjob_dict['jobname'], + cmd=imp_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + sleep=sendjob_dict['sleep']) + print 'GWAS jobs resubmitted for %d chunks.\n' % nummiss + print '\n...Replacing this best-guess job in the queue...' @@ -367,7 +396,7 @@ # get number of chunks nchunks = len(chunks) -# fill in template +# info to fill in job template jobdict = {"task": "{task}", "sleep": str(args.sleep), "cfile": str(outdot)+'.chunks.txt', @@ -390,11 +419,25 @@ "trans": str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam.transl' } -bg_cmd = bg_templ.format(**jobdict) + +# store job information for possible resubs +job_store_file = 'bg.chunks.'+str(outdot)+'.pkl' + +clust_dict = init_sendjob_dict() +clust_dict['jobname'] = 'bg.chunks.'+str(outdot) +clust_dict['logname'] = str('bg.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log') +clust_dict['mem'] = 8000 +clust_dict['walltime'] = 2 +clust_dict['njobs'] = int(nchunks) +clust_dict['sleep'] = args.sleep + +save_job(jfile=job_store_file, cmd_templ=bg_templ, job_dict=jobdict, sendjob_dict=clust_dict) # submit # TODO: flex queue/mem reqs +bg_cmd = bg_templ.format(**jobdict) + send_job(jobname='bg.chunks.'+str(outdot), cmd=bg_cmd, logname=str('bg.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), diff --git a/bin/blueprint.py b/bin/blueprint.py index 5ccd47d..09074a7 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -291,6 +291,62 @@ def send_job(jobname, return 0 +#################################### +# +# Save / load job configurations +# +#################################### + +def init_sendjob_dict(): + + sendjob_dict = { + "jobname": None, +# "arrayfile": None, +# "cmd": None, + "logname": None, + "logloc": None, + "mem": None, + "walltime": None, + "njobs": None, + "maxpar": None, + "threads": None, + "wait_file": None, + "wait_name": None, +# "cluster": None, + "sleep": None, +# "testonly": None + } + + return sendjob_dict + + + +def save_job(jfile, cmd_templ, job_dict, sendjob_dict): + + import cPickle as pickle + + with open(jfile, 'wb') as pickle_out: + pickle.dump(cmd_templ, pickle_out, -1) + pickle.dump(job_dict, pickle_out, -1) + pickle.dump(sendjob_dict, pickle_out, -1) + + return 0 + + + +def load_job(jfile): + + import cPickle as pickle + + with open(jfile, 'rb') as pickle_in: + cmd_templ = pickle.load(pickle_in) + job_dict = pickle.load(pickle_in) + sendjob_dict = pickle.load(pickle_in) + + return cmd_templ, job_dict, sendjob_dict + + + #################################### # # Parse arguments from ricopili interface if invoked directly diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 8847d32..2a85f16 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -475,11 +475,26 @@ def find_chunk(snpchrom, snpbp, last_chunk): "rsc": str(args.rscript_ex) } -gwas_cmd = gwas_templ.format(**jobdict) +nchunk = len(chunks.keys()) + + +# store job information for possible resubs +job_store_file = 'gwas.chunks.'+str(outdot)+'.pkl' + +clust_dict = init_sendjob_dict() +clust_dict['jobname'] = 'gwas.chunks.'+str(outdot) +clust_dict['logname'] = str('gwas.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log') +clust_dict['mem'] = 4000 +clust_dict['walltime'] = 2 +clust_dict['njobs'] = int(nchunk) +clust_dict['maxpar'] = 200 +clust_dict['sleep'] = args.sleep + +save_job(jfile=job_store_file, cmd_templ=gwas_templ, job_dict=jobdict, sendjob_dict=clust_dict) # 
submit job -nchunk = len(chunks.keys()) +gwas_cmd = gwas_templ.format(**jobdict) send_job(jobname='gwas.chunks.'+str(outdot), cmd=gwas_cmd, diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 381a58b..025f475 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -169,8 +169,8 @@ os.chdir(shape_dir) # verify haven't already tried this resub - uger_phase_name = str(outdot)+'.shape.resub_'+str(num_chr)+'_chr.sub.sh' - if os.path.isfile(uger_phase_name): + phase_sub_name = 'shape.'+str(outdot)+'.resub_'+str(num_chr)+'.sub.sh' + if os.path.isfile(phase_sub_name): print '\n####################' print 'ERROR:' print 'Found previous attempt to resubmit %d failed chromosomes.' % int(num_chr) @@ -342,12 +342,29 @@ "out": str(outdot)+'.imp.${cname}', "seedtxt": str(seedtxt) } -cmd_imp = imp_templ.format(**jobdict) + # get number of chunks (-1 is for header) nchunks = file_len(outdot+'.chunks.txt')-1 + +# store job information for possible resubs +job_store_file = 'imp.chunks.'+str(outdot)+'.pkl' + +clust_dict = init_sendjob_dict() +clust_dict['jobname'] = 'imp.chunks.'+str(outdot) +clust_dict['logname'] = str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log') +clust_dict['mem'] = 8000 +clust_dict['walltime'] = 2 +clust_dict['njobs'] = int(nchunks) +clust_dict['sleep'] = args.sleep + +save_job(jfile=job_store_file, cmd_templ=imp_templ, job_dict=jobdict, sendjob_dict=clust_dict) + + # submit +cmd_imp = imp_templ.format(**jobdict) + send_job(jobname='imp.chunks.'+str(outdot), cmd=cmd_imp, logname=str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), diff --git a/bin/pca_rel.py b/bin/pca_rel.py index 7303b55..9969e57 100755 --- a/bin/pca_rel.py +++ b/bin/pca_rel.py @@ -38,7 +38,7 @@ # get directory containing current script -# (to get absolute path for uger wrapper) +# (to get absolute path for script directory) rp_bin = os.path.dirname(os.path.realpath(__file__)) ############# diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 92bd5e0..cc3ae4f 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -16,7 +16,7 @@ # 4) Split plink files by chr # - use shortened IDs # 5) Run SHAPEIT -# - using UGER to parallelize +# - parallelize on cluster # #################################### From f313ceefa666b8d49f3a6e9b84df1061add41f99 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 18:19:55 -0400 Subject: [PATCH 11/48] clean up py imports, add doc with dependencies --- bin/agg_gwas.py | 11 +++++------ bin/agg_imp.py | 9 +++++---- bin/args_chunks.py | 3 +-- bin/args_impute.py | 1 - bin/args_qc.py | 3 +-- bin/bg_imp.py | 12 ++++++------ bin/blueprint.py | 26 ++++++++++++++++++++++++++ bin/chunk_snps.py | 4 +--- bin/filter_ped.py | 7 +------ bin/final_file_check.py | 2 +- bin/gwas_dfam.py | 6 ++---- bin/gwas_gee.py | 4 +--- bin/gwas_rel.py | 8 ++++---- bin/imp2_rel.py | 13 ++++++++----- bin/impute_rel.py | 8 +++++--- bin/imus_pca.py | 2 +- bin/pca_rel.py | 2 +- bin/ped_confirm.py | 7 +------ bin/qc_rel.py | 3 +-- bin/shape_rel.py | 10 ++-------- bin/strict_qc.py | 4 ++-- docs/PYTHON.md | 34 ++++++++++++++++++++++++++++++++++ 22 files changed, 109 insertions(+), 70 deletions(-) create mode 100644 docs/PYTHON.md diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index 750f50a..c216701 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -36,13 +36,10 @@ import subprocess import argparse import gzip -# from warnings import warn -# from glob import glob from math import log10, sqrt -from args_gwas import * +from args_gwas import parserbase, parseragg from 
py_helpers import unbuffer_stdout, file_len, file_tail -from blueprint import send_job -# , read_conf, link +from blueprint import send_job, save_job, load_job, read_clust_conf unbuffer_stdout() @@ -103,7 +100,9 @@ # TODO: check dependencies - +# get cluster configuration +# needed for specifying logfile names with clust_conf['log_task_id'] +clust_conf = read_clust_conf() diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 0d670fb..5a285ca 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -26,11 +26,11 @@ ### load requirements import os import subprocess -from args_impute import * -from py_helpers import unbuffer_stdout, find_exec, test_exec, file_len #, file_tail, link, warn_format -from blueprint import send_job, load_job +import argparse +from args_impute import parserbase, parsercluster +from py_helpers import unbuffer_stdout, find_exec, file_len +from blueprint import send_job, load_job, save_job, read_clust_conf unbuffer_stdout() -# warnings.formatwarning = warn_format ############# if not (('-h' in sys.argv) or ('--help' in sys.argv)): @@ -80,6 +80,7 @@ # get directory containing current script # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) +clust_conf = read_clust_conf() # TODO: here diff --git a/bin/args_chunks.py b/bin/args_chunks.py index 2af8ac6..caf6e61 100644 --- a/bin/args_chunks.py +++ b/bin/args_chunks.py @@ -17,7 +17,6 @@ # imports import argparse -# import os @@ -96,4 +95,4 @@ 'Such chunks may occur due to sparse data (e.g. few SNPs ' + \ 'on the short arm of chr21) or could indicate bad chromosome build information.') -# eof \ No newline at end of file +# eof diff --git a/bin/args_impute.py b/bin/args_impute.py index de41a7b..e603462 100644 --- a/bin/args_impute.py +++ b/bin/args_impute.py @@ -18,7 +18,6 @@ # imports import argparse -# import os diff --git a/bin/args_qc.py b/bin/args_qc.py index c2fff9e..011acb7 100644 --- a/bin/args_qc.py +++ b/bin/args_qc.py @@ -17,7 +17,6 @@ # imports import argparse -# import os ############ @@ -215,4 +214,4 @@ help='Prevents setting mendelian errors to missing') -# eof \ No newline at end of file +# eof diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 5f84d35..ab08f04 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -32,12 +32,13 @@ ### load requirements import os -import subprocess import warnings +import argparse +from warnings import warn from textwrap import dedent -from args_impute import * -from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format -from blueprint import send_job, init_sendjob_dict, save_job +from args_impute import parserbase, parserbg, parsercluster +from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format, read_conf +from blueprint import send_job, init_sendjob_dict, save_job, load_job, read_clust_conf unbuffer_stdout() warnings.formatwarning = warn_format @@ -197,8 +198,7 @@ # needed for specifying logfile names with clust_conf['log_task_id'] conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) -cluster = configs['cluster'] -clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') +clust_conf = read_clust_conf() # TODO: here diff --git a/bin/blueprint.py b/bin/blueprint.py index 09074a7..1684d95 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -291,6 +291,32 @@ def send_job(jobname, return 0 +#################################### +# +# Get cluster configuration file +# +#################################### + +def read_clust_conf(): + + import os + + conf_file = 
os.environ['HOME']+"/picopili.conf" + configs = read_conf(conf_file) + cluster = configs['cluster'] + + pico_bin = os.path.dirname(os.path.realpath(__file__)) + clust_dir = os.path.dirname(pico_bin) + '/cluster_templates' + + assert os.path.isdir(clust_dir), "Unable to find cluster job submission template directory %s" % str(clust_dir) + + # load queue configuration info + # - submission syntax, queue names, job holds + clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') + + return clust_conf + + #################################### # # Save / load job configurations diff --git a/bin/chunk_snps.py b/bin/chunk_snps.py index 630f8d9..7a7601a 100755 --- a/bin/chunk_snps.py +++ b/bin/chunk_snps.py @@ -35,11 +35,9 @@ ############# import os -# import subprocess import argparse import copy -# from glob import glob -from args_chunks import * +from args_chunks import parserbase, parsersnpchunk from py_helpers import unbuffer_stdout, file_len, warn_format unbuffer_stdout() import warnings diff --git a/bin/filter_ped.py b/bin/filter_ped.py index 26b3e91..6b103e6 100755 --- a/bin/filter_ped.py +++ b/bin/filter_ped.py @@ -33,16 +33,11 @@ ### load requirements import os -# import subprocess import argparse -# from string import ascii_uppercase -# from glob import glob -# from numpy import digitize import random import warnings -from args_ped import * +from args_ped import parserbase, parsergeno, parseribd, parserweights from py_helpers import unbuffer_stdout -# file_len, test_exec, read_conf, find_from_path, link, gz_confirm unbuffer_stdout() diff --git a/bin/final_file_check.py b/bin/final_file_check.py index 57085e8..ed4ec9b 100755 --- a/bin/final_file_check.py +++ b/bin/final_file_check.py @@ -11,7 +11,7 @@ #################################### import argparse -from py_helpers import * +from py_helpers import file_check_email, unbuffer_stdout unbuffer_stdout() ### parse arguments diff --git a/bin/gwas_dfam.py b/bin/gwas_dfam.py index 7183dbd..6806060 100755 --- a/bin/gwas_dfam.py +++ b/bin/gwas_dfam.py @@ -39,10 +39,8 @@ import os import subprocess import argparse -# from glob import glob -from args_gwas import * +from args_gwas import parserbase,parsergwas,parsersoft from py_helpers import unbuffer_stdout, test_exec, find_exec -# , read_conf, link unbuffer_stdout() ############# @@ -179,4 +177,4 @@ print '\n############' print '\n' print 'SUCCESS!\n' -exit(0) \ No newline at end of file +exit(0) diff --git a/bin/gwas_gee.py b/bin/gwas_gee.py index 3cf9209..cc5b458 100755 --- a/bin/gwas_gee.py +++ b/bin/gwas_gee.py @@ -42,10 +42,8 @@ import subprocess import argparse from warnings import warn -# from glob import glob -from args_gwas import * +from args_gwas import parserbase, parsergwas, parsersoft from py_helpers import unbuffer_stdout, test_exec, find_from_path, file_len, find_exec -# , read_conf, link unbuffer_stdout() ############# diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 2a85f16..6fe0985 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -27,9 +27,9 @@ import os from warnings import warn from textwrap import dedent -from args_gwas import * -from py_helpers import link, unbuffer_stdout, find_exec -from blueprint import send_job +from args_gwas import parserbase, parsergwas, parserchunk, parseragg, parsersoft +from py_helpers import link, unbuffer_stdout, find_exec, read_conf +from blueprint import send_job, read_clust_conf, init_sendjob_dict, save_job unbuffer_stdout() @@ -146,7 +146,7 @@ conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) 
cluster = configs['cluster'] -clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') +clust_conf = read_clust_conf() # TODO: here diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 025f475..bd32661 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -26,10 +26,11 @@ ### load requirements import os import subprocess +import argparse from textwrap import dedent -from args_impute import * -from py_helpers import unbuffer_stdout, file_len, link, find_exec -from blueprint import send_job +from args_impute import parserbase, parserimpute, parserref, parserchunk, parsercluster +from py_helpers import unbuffer_stdout, file_len, link, find_exec, test_exec, read_conf +from blueprint import send_job, read_clust_conf, init_sendjob_dict, save_job unbuffer_stdout() @@ -97,7 +98,7 @@ conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) cluster = configs['cluster'] -clust_conf = read_conf(str(clust_dir)+'/'+str(cluster)+'.conf') +clust_conf = read_clust_conf() # from config impute_ex = find_exec('impute2',key='i2loc') @@ -199,7 +200,9 @@ # '--output-max', outstem+'.phased.haps', outstem+'.phased.sample', # '--output-log', outstem+'.shape.log'] - # manage duohmm arg + # manage additional arg pieces + chrstem = str(args.bfile)+'.hg19.ch.fl.chr${chrom}' + outstem = str(outdot)+'.chr${chrom}' if extra_args.no_duohmm: duo_txt = '' else: diff --git a/bin/impute_rel.py b/bin/impute_rel.py index 332518c..651c2ea 100755 --- a/bin/impute_rel.py +++ b/bin/impute_rel.py @@ -25,8 +25,10 @@ ### load requirements import os -from args_impute import * -from py_helpers import unbuffer_stdout #, read_conf, file_tail, link, warn_format +import argparse + +from args_impute import parserbase, parserphase, parserimpute, parserchunk, parserref, parserbg, parsercluster +from py_helpers import unbuffer_stdout from blueprint import send_job unbuffer_stdout() @@ -96,7 +98,7 @@ if args.hard_call_th is None: print '--bg-th '+str(args.bg_th) else: - print '--hard-call-th '+str(hard_call_th) + print '--hard-call-th '+str(args.hard_call_th) print '--info-th '+str(args.info_th) print '--max-info-th '+str(args.max_info_th) if args.keep_mendel: diff --git a/bin/imus_pca.py b/bin/imus_pca.py index 6fee4bb..765aff2 100755 --- a/bin/imus_pca.py +++ b/bin/imus_pca.py @@ -36,7 +36,7 @@ import argparse from glob import glob from py_helpers import find_exec, unbuffer_stdout, test_exec -from args_pca import * +from args_pca import parserbase, parserpca unbuffer_stdout() ############# diff --git a/bin/pca_rel.py b/bin/pca_rel.py index 9969e57..0edafd6 100755 --- a/bin/pca_rel.py +++ b/bin/pca_rel.py @@ -31,7 +31,7 @@ import argparse import os from math import ceil -from args_pca import * +from args_pca import parserbase, parsergrid, parserqc, parserpca from py_helpers import file_len, unbuffer_stdout from blueprint import send_job unbuffer_stdout() diff --git a/bin/ped_confirm.py b/bin/ped_confirm.py index 681c55e..b201a64 100755 --- a/bin/ped_confirm.py +++ b/bin/ped_confirm.py @@ -35,14 +35,9 @@ import subprocess import argparse import re -# from string import ascii_uppercase -# from glob import glob -# from numpy import digitize -# import random import warnings -from args_ped import * +from args_ped import parserbase, parseribd, parserexloc from py_helpers import unbuffer_stdout, test_exec -# file_len, read_conf, find_from_path, link, gz_confirm unbuffer_stdout() diff --git a/bin/qc_rel.py b/bin/qc_rel.py index 8c0c556..ed759c9 100755 --- a/bin/qc_rel.py +++ b/bin/qc_rel.py @@ -54,8 +54,7 @@ import warnings 
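# The explicit imports above replace the old star-imports from the args_* modules,
# so each script now declares which shared argument groups it actually uses. A
# sketch of how such fragments are typically combined, assuming (as argparse
# requires for parents) that parserbase, parserpca, etc. are ArgumentParser
# objects created with add_help=False:
import argparse
from args_pca import parserbase, parserpca
parser = argparse.ArgumentParser(prog='imus_pca.py',
                                 parents=[parserbase, parserpca])
args = parser.parse_args()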
from time import strftime start_time = strftime("%H:%M:%S %d-%B-%Y") -# from glob import glob -from args_qc import * +from args_qc import parserbase, parserqc, parsermendel, parsertag from py_helpers import unbuffer_stdout, read_conf, test_exec, link, file_len, warn_format, find_exec unbuffer_stdout() warnings.formatwarning = warn_format diff --git a/bin/shape_rel.py b/bin/shape_rel.py index cc3ae4f..43a9419 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -38,15 +38,9 @@ import os import subprocess import argparse -# from string import ascii_uppercase -# from glob import glob -# from numpy import digitize -# import random -# import warnings -from args_impute import * -from py_helpers import unbuffer_stdout, link, find_exec #, test_exec +from args_impute import parserbase, parserphase, parserref, parsercluster +from py_helpers import unbuffer_stdout, link, find_exec, read_conf from blueprint import send_job -# file_len, read_conf, find_from_path, link, gz_confirm unbuffer_stdout() diff --git a/bin/strict_qc.py b/bin/strict_qc.py index 1e9a476..1ead067 100755 --- a/bin/strict_qc.py +++ b/bin/strict_qc.py @@ -42,8 +42,8 @@ import subprocess import argparse from glob import glob -from py_helpers import file_len, find_exec, unbuffer_stdout, test_exec -from args_pca import * +from py_helpers import file_len, find_exec, unbuffer_stdout +from args_pca import parserbase, parserqc unbuffer_stdout() ############# diff --git a/docs/PYTHON.md b/docs/PYTHON.md new file mode 100644 index 0000000..a960b96 --- /dev/null +++ b/docs/PYTHON.md @@ -0,0 +1,34 @@ +### Python Dependencies + +Picopili is built from a combination of Python, Perl, R, and *nix shell scripts. + +Most scripts depend only on packages from the Python Standard Library +([[https://docs.python.org/2/library/]]). In addition, `admix_rel.py` depends +on numpy. We strongly support using Anaconda ([[https://www.continuum.io/downloads]]) +to manage Python package dependencies, but a barebones installation of Python 2.X + numpy +should be sufficient for picopili in most cases. + +Scripts are primarily tested under Python 2.7 and Anaconda 2.1.0, but should be broadly +compatible with most Python 2.X versions. If you encounter compaitibility issues, +please contact rwalters(at)broadinstitute.org and we would be happy to assist. 
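A quick way to confirm that the interpreter and numpy meet these requirements
(an illustrative check only, not a picopili script):

    # run with the same python interpreter that will be used for picopili
    import sys
    import numpy
    print sys.version_info     # expect a 2.x release, ideally 2.7
    print numpy.__version__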
+ +##### Full list of package dependencies + +* argparse +* cPickle +* copy +* distutils +* glob +* gzip +* math +* numpy +* os +* random +* re +* string +* subprocess +* sys +* textwrap +* time +* warnings + From c996ca8d61b8e5fa692dea3288772fee468c7060 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 20:05:39 -0400 Subject: [PATCH 12/48] compress multiline commands when parallelizing --- bin/blueprint.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/bin/blueprint.py b/bin/blueprint.py index 1684d95..e8e39ac 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -10,6 +10,7 @@ #################################### import os +import stat import subprocess from textwrap import dedent from py_helpers import read_conf, file_len @@ -178,6 +179,14 @@ def send_job(jobname, # number of jobs to cover all tasks array_jobs = ceil(float(njobs)/float(task_mem_lim)) + # convert multi-line command to script + if len(cmd_line.splitlines()) > 1: + tmp_script = open('temp_cmd.'+str(jobname)+'.sh','w') + tmp_script.write(cmd_line) + tmp_script.close() + os.chmod(tmp_script.name, stat.S_IEXEC) + cmd_line = './'+tmp_script.name + # setup to do task_mem_lim jobs on each node # note: specified above that cmd_line uses ${tid} as task index par_tmp = dedent("""\ From 3d46a80fbc6faf1316088eb45c969113d98ee228 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Sat, 8 Oct 2016 02:12:14 +0200 Subject: [PATCH 13/48] list dependencies; first draft lisa config --- bin/config_pico.pl | 2 +- cluster_templates/lisa.conf | 12 ++++++++++++ cluster_templates/lisa.sub.sh | 18 ++++++++++++++++++ docs/DEPENDS.md | 18 ++++++++++++++++++ docs/PYTHON.md | 1 + 5 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 cluster_templates/lisa.conf create mode 100755 cluster_templates/lisa.sub.sh create mode 100644 docs/DEPENDS.md diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 9fb19df..442c2bb 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -787,7 +787,7 @@ () print "Successfully found ricopili plague and buigue reference files!\n" } -if ($haveref == 1 && $statusbin == 0){ +if ($haveref == 1 && $status_bin == 0){ print "\n### Finished ###\n\n"; } diff --git a/cluster_templates/lisa.conf b/cluster_templates/lisa.conf new file mode 100644 index 0000000..e880e01 --- /dev/null +++ b/cluster_templates/lisa.conf @@ -0,0 +1,12 @@ +hour_q None +hour2_q None +hour4_q None +day_q None +long_q None +sub_cmd qsub -d $PWD +log_task_id ${PBS_ARRAYID} +task_id ${PBS_ARRAYID} +hold_flag -W depend=afterany: +j_per_node 16 +array_mem_mb 32000 +project unspecified diff --git a/cluster_templates/lisa.sub.sh b/cluster_templates/lisa.sub.sh new file mode 100755 index 0000000..47b08d4 --- /dev/null +++ b/cluster_templates/lisa.sub.sh @@ -0,0 +1,18 @@ +#PBS -lwalltime={wall_hours}:00:00 +#PBS -lnodes=1 +#PBS -S /bin/bash +#PBS -N {job_name} +#PBS -j oe +#PBS -o {log_name} +::PICO_ARRAY_ONLY::#PBS -t 1-{array_jobs} + +# sleep option (for preventing race conditions on network file systems) +sleep {sleep_time} + +# setup resources +use R + +# main command line +{cmd_string} + +# eof diff --git a/docs/DEPENDS.md b/docs/DEPENDS.md new file mode 100644 index 0000000..73edeef --- /dev/null +++ b/docs/DEPENDS.md @@ -0,0 +1,18 @@ +### Software Dependencies + +Picopili largely serves as a wrapper for existing major software +for analyzing genome-wide genotype data. A full list of dependencies +is given below, along with links to their respective sources. 
+ +* ADMIXTURE (https://www.genetics.ucla.edu/software/admixture/) +* EIGENSOFT (https://www.hsph.harvard.edu/alkes-price/software/) +* IMPUTE2 (https://mathgen.stats.ox.ac.uk/impute/impute_v2.html) +* liftOver (http://genome.sph.umich.edu/wiki/LiftOver) +* PLINK2 (https://www.cog-genomics.org/plink2) +* R-enabled plink (1.07 [http://pngu.mgh.harvard.edu/~purcell/plink/], or dev branch of plink2 [above]) +* PRIMUS (https://primus.gs.washington.edu/primusweb/) +* R (https://www.r-project.org/) +* REAP (faculty.washington.edu/tathornt/software/REAP/) +* Rserve (https://rforge.net/Rserve/) +* SHAPEIT (www.shapeit.fr/) + diff --git a/docs/PYTHON.md b/docs/PYTHON.md index a960b96..551becc 100644 --- a/docs/PYTHON.md +++ b/docs/PYTHON.md @@ -25,6 +25,7 @@ please contact rwalters(at)broadinstitute.org and we would be happy to assist. * os * random * re +* stat * string * subprocess * sys From 04bc27cb942d7a288cb5f9dad49a72442e2c2b7b Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Sat, 8 Oct 2016 02:44:29 +0200 Subject: [PATCH 14/48] cluster hold, working directory configs --- bin/blueprint.py | 7 ++++--- cluster_templates/broad_uger.conf | 2 +- cluster_templates/lisa.conf | 4 ++-- cluster_templates/lisa.sub.sh | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bin/blueprint.py b/bin/blueprint.py index e8e39ac..708a900 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -99,12 +99,12 @@ def send_job(jobname, # job dependencies if wait_name is not None: - hold_str = clust_conf['hold_flag'] + ' ' + str(wait_name) + hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name)) elif wait_file is not None: with open(wait_file, 'r') as wait_fi: wait_name = wait_fi.readline() - hold_str = clust_conf['hold_flag'] + ' ' + str(wait_name) + hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name)) else: hold_str = "" @@ -254,7 +254,8 @@ def send_job(jobname, "log_task_id": str(clust_conf['log_task_id']), "queue_name": str(queue_name), "sleep_time": str(sleep), - "project": str(clust_conf['project']) + "project": str(clust_conf['project']), + "workdir": os.getcwd() } diff --git a/cluster_templates/broad_uger.conf b/cluster_templates/broad_uger.conf index 3ed149c..5901a07 100644 --- a/cluster_templates/broad_uger.conf +++ b/cluster_templates/broad_uger.conf @@ -6,7 +6,7 @@ long_q long sub_cmd qsub log_task_id $TASK_ID task_id ${SGE_TASK_ID} -hold_flag -hold_jid +hold_flag -hold_jid {hold_name} j_per_node 1 array_mem_mb 128000 project unspecified diff --git a/cluster_templates/lisa.conf b/cluster_templates/lisa.conf index e880e01..cfabff3 100644 --- a/cluster_templates/lisa.conf +++ b/cluster_templates/lisa.conf @@ -3,10 +3,10 @@ hour2_q None hour4_q None day_q None long_q None -sub_cmd qsub -d $PWD +sub_cmd qsub log_task_id ${PBS_ARRAYID} task_id ${PBS_ARRAYID} -hold_flag -W depend=afterany: +hold_flag -W depend=afterany:{hold_name} j_per_node 16 array_mem_mb 32000 project unspecified diff --git a/cluster_templates/lisa.sub.sh b/cluster_templates/lisa.sub.sh index 47b08d4..9575147 100755 --- a/cluster_templates/lisa.sub.sh +++ b/cluster_templates/lisa.sub.sh @@ -10,7 +10,7 @@ sleep {sleep_time} # setup resources -use R +cd {workdir} # main command line {cmd_string} From fe1088c07b2738692a31a194689349db703613ed Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 7 Oct 2016 20:55:02 -0400 Subject: [PATCH 15/48] test of job number hold conditions --- bin/blueprint.py | 13 ++++++++----- bin/pca_rel.py | 34 ++++++++++++++++++---------------- bin/shape_rel.py | 17 
+++++++++-------- cluster_templates/lisa.conf | 2 +- 4 files changed, 36 insertions(+), 30 deletions(-) diff --git a/bin/blueprint.py b/bin/blueprint.py index 708a900..eeb2161 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -28,6 +28,7 @@ def send_job(jobname, threads=None, wait_file=None, wait_name=None, + wait_num=None, cluster=None, sleep=30, testonly=False): @@ -99,12 +100,12 @@ def send_job(jobname, # job dependencies if wait_name is not None: - hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name)) + hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name),hold_num=str(wait_num)) elif wait_file is not None: with open(wait_file, 'r') as wait_fi: wait_name = wait_fi.readline() - hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name)) + hold_str = clust_conf['hold_flag'].format(hold_name=str(wait_name),hold_num=str(wait_num)) else: hold_str = "" @@ -294,9 +295,11 @@ def send_job(jobname, if not testonly: p = subprocess.Popen(launch_str.split(), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) out, err = p.communicate() - print out - return(p.returncode) - + if p.returncode is None or p.returncode == 0: + return out + else + raise EnvironmentError((p.returncode,err)) + else: return 0 diff --git a/bin/pca_rel.py b/bin/pca_rel.py index 0edafd6..7f6238f 100755 --- a/bin/pca_rel.py +++ b/bin/pca_rel.py @@ -166,14 +166,14 @@ strandambi_txt, allchr_txt]) -send_job(jobname=str('strictqc_'+args.out), - arrayfile=None, - cmd=str(strictqc_call), - logname=str('strictqc_'+args.out+'.sub.log'), - mem=2000, - walltime=2, - sleep=0, - testonly=args.test_sub) +jobres = send_job(jobname=str('strictqc_'+args.out), + arrayfile=None, + cmd=str(strictqc_call), + logname=str('strictqc_'+args.out+'.sub.log'), + mem=2000, + walltime=2, + sleep=0, + testonly=args.test_sub) ##### @@ -193,14 +193,15 @@ '--primus-ex', str(args.primus_ex) ]) -send_job(jobname=str('imuspca_'+args.out), - cmd=str(imuspca_call), - logname=str('imuspca_'+args.out+'.sub.log'), - mem=int(imus_mem)*1000, - walltime=168, # one week - wait_name=str('strictqc_'+args.out), - sleep=args.sleep, - testonly=args.test_sub) +jobres2 = send_job(jobname=str('imuspca_'+args.out), + cmd=str(imuspca_call), + logname=str('imuspca_'+args.out+'.sub.log'), + mem=int(imus_mem)*1000, + walltime=168, # one week + wait_name=str('strictqc_'+args.out), + wait_num=str(jobres), + sleep=args.sleep, + testonly=args.test_sub) ##### @@ -226,6 +227,7 @@ mem=100, walltime=1, wait_name=str('imuspca_'+args.out), + wait_num=str(jobres2), sleep=str(args.sleep), testonly=args.test_sub) diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 43a9419..09e9d3e 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -299,14 +299,14 @@ task_id = str(clust_conf['log_task_id']) # submit -send_job(jobname='shape.'+str(outdot), - cmd=' '.join(shape_call), - logname='shape.'+str(outdot)+'.chr'+task_id+'.sub.log', - mem=int(args.mem_req)*1000, - walltime=168, # week - njobs=22, - threads=int(args.threads), - sleep=str(args.sleep)) +jobres = send_job(jobname='shape.'+str(outdot), + cmd=' '.join(shape_call), + logname='shape.'+str(outdot)+'.chr'+task_id+'.sub.log', + mem=int(args.mem_req)*1000, + walltime=168, # week + njobs=22, + threads=int(args.threads), + sleep=str(args.sleep)) ### @@ -329,6 +329,7 @@ mem=8000, walltime=2, wait_name='shape.'+str(outdot), + wait_num=jobres, sleep=str(args.sleep)) diff --git a/cluster_templates/lisa.conf b/cluster_templates/lisa.conf index cfabff3..1db3e36 100644 --- a/cluster_templates/lisa.conf +++ 
b/cluster_templates/lisa.conf @@ -6,7 +6,7 @@ long_q None sub_cmd qsub log_task_id ${PBS_ARRAYID} task_id ${PBS_ARRAYID} -hold_flag -W depend=afterany:{hold_name} +hold_flag -W depend=afterany:{hold_num} j_per_node 16 array_mem_mb 32000 project unspecified From 4f038852b905b91f8d7b58897c171a5340733da6 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Sat, 8 Oct 2016 03:36:04 +0200 Subject: [PATCH 16/48] allow job number holds; adjust long job walltimes --- bin/agg_gwas.py | 19 ++++++++++--------- bin/agg_imp.py | 17 +++++++++-------- bin/bg_imp.py | 32 +++++++++++++++++--------------- bin/blueprint.py | 4 ++-- bin/gwas_rel.py | 19 ++++++++++--------- bin/imp2_rel.py | 32 +++++++++++++++++--------------- bin/impute_rel.py | 2 +- bin/pca_rel.py | 6 +++--- bin/shape_rel.py | 4 ++-- 9 files changed, 71 insertions(+), 64 deletions(-) diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index c216701..d456758 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -223,14 +223,14 @@ # submit gwas_cmd = cmd_templ.format(**job_dict) - send_job(jobname=sendjob_dict['jobname'], - cmd=gwas_cmd, - logname=sendjob_dict['logname'], - mem=sendjob_dict['mem'], - walltime=sendjob_dict['walltime'], - njobs=sendjob_dict['njobs'], - maxpar=sendjob_dict['maxpar'], - sleep=sendjob_dict['sleep']) + jobres = send_job(jobname=sendjob_dict['jobname'], + cmd=gwas_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + maxpar=sendjob_dict['maxpar'], + sleep=sendjob_dict['sleep']) print 'GWAS jobs resubmitted for %d chunks.\n' % nummiss @@ -245,8 +245,9 @@ cmd=' '.join(sys.argv[:]), logname=agg_log, mem=24000, - walltime=168, # week + walltime=30, wait_name='gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss), + wait_num=str(jobres).strip(), sleep=10) print '\n############' diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 5a285ca..77c4e8c 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -207,13 +207,13 @@ # submit bg_cmd = cmd_templ.format(**job_dict) - send_job(jobname=sendjob_dict['jobname'], - cmd=bg_cmd, - logname=sendjob_dict['logname'], - mem=sendjob_dict['mem'], - walltime=sendjob_dict['walltime'], - njobs=sendjob_dict['njobs'], - sleep=sendjob_dict['sleep']) + jobres = send_job(jobname=sendjob_dict['jobname'], + cmd=bg_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + sleep=sendjob_dict['sleep']) print 'Best-guess jobs resubmitted for %d chunks.\n' % nummiss @@ -229,8 +229,9 @@ cmd=' '.join(sys.argv[:]), logname=agg_log, mem=8000, - walltime=168, # week + walltime=30, wait_name='bg.chunks.'+str(outdot)+'.resub_'+str(nummiss), + wait_num=str(jobres).strip(), sleep=args.sleep) print '\n############' diff --git a/bin/bg_imp.py b/bin/bg_imp.py index ab08f04..0990123 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -321,13 +321,13 @@ # submit imp_cmd = cmd_templ.format(**job_dict) - send_job(jobname=sendjob_dict['jobname'], - cmd=imp_cmd, - logname=sendjob_dict['logname'], - mem=sendjob_dict['mem'], - walltime=sendjob_dict['walltime'], - njobs=sendjob_dict['njobs'], - sleep=sendjob_dict['sleep']) + jobres = send_job(jobname=sendjob_dict['jobname'], + cmd=imp_cmd, + logname=sendjob_dict['logname'], + mem=sendjob_dict['mem'], + walltime=sendjob_dict['walltime'], + njobs=sendjob_dict['njobs'], + sleep=sendjob_dict['sleep']) print 'GWAS jobs resubmitted for %d chunks.\n' % nummiss @@ -345,6 +345,7 @@ mem=8000, walltime=2, # week 
wait_name='imp.chunks.'+str(outdot)+'.resub_'+str(nummiss), + wait_num=str(jobres).strip(), sleep=args.sleep) print '\n############' @@ -438,13 +439,13 @@ # TODO: flex queue/mem reqs bg_cmd = bg_templ.format(**jobdict) -send_job(jobname='bg.chunks.'+str(outdot), - cmd=bg_cmd, - logname=str('bg.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), - mem=8000, - walltime=2, - njobs=int(nchunks), - sleep=args.sleep) +jobres2 = send_job(jobname='bg.chunks.'+str(outdot), + cmd=bg_cmd, + logname=str('bg.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=8000, + walltime=2, + njobs=int(nchunks), + sleep=args.sleep) print 'Best-guess jobs submitted for %d chunks.\n' % nchunks @@ -467,8 +468,9 @@ cmd=next_call, logname=agg_log, mem=8000, - walltime=168, # week + walltime=30, wait_name='bg.chunks.'+str(outdot), + wait_num=str(jobres2).strip(), sleep=args.sleep) # finish diff --git a/bin/blueprint.py b/bin/blueprint.py index eeb2161..d2aba3e 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -297,8 +297,8 @@ def send_job(jobname, out, err = p.communicate() if p.returncode is None or p.returncode == 0: return out - else - raise EnvironmentError((p.returncode,err)) + else: + raise EnvironmentError((p.returncode,err,out)) else: return 0 diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 6fe0985..9bb199e 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -496,14 +496,14 @@ def find_chunk(snpchrom, snpbp, last_chunk): # submit job gwas_cmd = gwas_templ.format(**jobdict) -send_job(jobname='gwas.chunks.'+str(outdot), - cmd=gwas_cmd, - logname=str('gwas.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), - mem=4000, - walltime=2, - njobs=int(nchunk), - maxpar=200, - sleep=args.sleep) +jobres = send_job(jobname='gwas.chunks.'+str(outdot), + cmd=gwas_cmd, + logname=str('gwas.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=4000, + walltime=2, + njobs=int(nchunk), + maxpar=200, + sleep=args.sleep) print 'GWAS jobs submitted for %d chunks.\n' % nchunk @@ -565,8 +565,9 @@ def find_chunk(snpchrom, snpbp, last_chunk): cmd=' '.join(agg_call), logname=agg_log, mem=4000, - walltime=168, # week + walltime=30, wait_name='gwas.chunks.'+str(outdot), + wait_num=str(jobres).strip(), sleep=args.sleep) # TODO: diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index bd32661..151d871 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -225,14 +225,14 @@ shape_cmd = cmd_templ.format(**jobdict) # submit - send_job(jobname='shape.'+str(outdot)+'.resub_'+str(num_chr), - cmd=shape_cmd, - logname='shape.'+str(outdot)+'.resub_'+str(num_chr)+'.sub.'+str(clust_conf['log_task_id'])+'.log', - mem=int(extra_args.mem_req)*1000, - walltime=168, # week - njobs=int(num_chr), - threads=extra_args.threads, - sleep=args.sleep) + jobres = send_job(jobname='shape.'+str(outdot)+'.resub_'+str(num_chr), + cmd=shape_cmd, + logname='shape.'+str(outdot)+'.resub_'+str(num_chr)+'.sub.'+str(clust_conf['log_task_id'])+'.log', + mem=int(extra_args.mem_req)*1000, + walltime=30, + njobs=int(num_chr), + threads=extra_args.threads, + sleep=args.sleep) print 'Pre-phasing jobs re-submitted for %d chromosomes.\n' % num_chr @@ -250,6 +250,7 @@ mem=8000, walltime=2, # week wait_name='shape.'+str(outdot)+'.resub_'+str(num_chr), + wait_num=str(jobres).strip(), sleep=args.sleep) print '\n############' @@ -368,13 +369,13 @@ # submit cmd_imp = imp_templ.format(**jobdict) -send_job(jobname='imp.chunks.'+str(outdot), - cmd=cmd_imp, - 
logname=str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), - mem=8000, - walltime=2, - njobs=int(nchunks), - sleep=args.sleep) +jobres2 = send_job(jobname='imp.chunks.'+str(outdot), + cmd=cmd_imp, + logname=str('imp.chunks.'+str(outdot)+'.'+str(clust_conf['log_task_id'])+'.sub.log'), + mem=8000, + walltime=2, + njobs=int(nchunks), + sleep=args.sleep) print 'Imputation jobs submitted for %d chunks.\n' % nchunks @@ -399,6 +400,7 @@ mem=8000, walltime=2, # week wait_name='imp.chunks.'+str(outdot), + wait_num=str(jobres2).strip(), sleep=args.sleep) diff --git a/bin/impute_rel.py b/bin/impute_rel.py index 651c2ea..254cfbe 100755 --- a/bin/impute_rel.py +++ b/bin/impute_rel.py @@ -145,7 +145,7 @@ cmd=next_call, logname=shape_log, mem=int(args.mem_req * 1000), - walltime=168, # week + walltime=30, sleep=args.sleep) diff --git a/bin/pca_rel.py b/bin/pca_rel.py index 7f6238f..ac401ff 100755 --- a/bin/pca_rel.py +++ b/bin/pca_rel.py @@ -197,9 +197,9 @@ cmd=str(imuspca_call), logname=str('imuspca_'+args.out+'.sub.log'), mem=int(imus_mem)*1000, - walltime=168, # one week + walltime=30, wait_name=str('strictqc_'+args.out), - wait_num=str(jobres), + wait_num=str(jobres).strip(), sleep=args.sleep, testonly=args.test_sub) @@ -227,7 +227,7 @@ mem=100, walltime=1, wait_name=str('imuspca_'+args.out), - wait_num=str(jobres2), + wait_num=str(jobres2).strip(), sleep=str(args.sleep), testonly=args.test_sub) diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 09e9d3e..5943789 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -303,7 +303,7 @@ cmd=' '.join(shape_call), logname='shape.'+str(outdot)+'.chr'+task_id+'.sub.log', mem=int(args.mem_req)*1000, - walltime=168, # week + walltime=30, njobs=22, threads=int(args.threads), sleep=str(args.sleep)) @@ -329,7 +329,7 @@ mem=8000, walltime=2, wait_name='shape.'+str(outdot), - wait_num=jobres, + wait_num=str(jobres).strip(), sleep=str(args.sleep)) From 991b0968d273733641679b5546a8283d54187c58 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Sat, 8 Oct 2016 03:52:27 +0200 Subject: [PATCH 17/48] add R to lisa environ for primus --- cluster_templates/lisa.sub.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/cluster_templates/lisa.sub.sh b/cluster_templates/lisa.sub.sh index 9575147..8451e4d 100755 --- a/cluster_templates/lisa.sub.sh +++ b/cluster_templates/lisa.sub.sh @@ -11,6 +11,7 @@ sleep {sleep_time} # setup resources cd {workdir} +module load R # main command line {cmd_string} From 1e2ce65c9b9d633c211486ce9134ecd50f948205 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Thu, 13 Oct 2016 00:06:30 +0200 Subject: [PATCH 18/48] remove refs to unused files; improve ref downloads; avoid implicating Stephan for my .pl changes; misc bugfixes --- bin/admix_rel.py | 4 +-- bin/args_pca.py | 2 +- bin/blueprint.py | 2 +- bin/checkflip_pico.pl | 40 ++++------------------- bin/checkpos_pico.pl | 26 +++++---------- bin/get_refs.sh | 14 +++++--- bin/imp_prep.pl | 76 ++++++++++++++++++++++--------------------- bin/lift_to_hg19.pl | 4 +-- bin/uger.sub.sh | 22 ------------- bin/uger_array.sub.sh | 29 ----------------- 10 files changed, 70 insertions(+), 149 deletions(-) delete mode 100755 bin/uger.sub.sh delete mode 100755 bin/uger_array.sub.sh diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 5f29981..faa689e 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -144,13 +144,13 @@ metavar='PATH', help='path to ADMIXTURE executable', required=False, - default="/humgen/atgu1/fs03/shared_resources/shared_software/bin/admixture") + 
default=None) arg_exloc.add_argument('--reap-ex', type=str, metavar='PATH', help='path to REAP executable', required=False, - default="/humgen/atgu1/fs03/shared_resources/shared_software/bin/REAP") + default=None) args = parser.parse_args() diff --git a/bin/args_pca.py b/bin/args_pca.py index 61d2353..59442a0 100644 --- a/bin/args_pca.py +++ b/bin/args_pca.py @@ -190,7 +190,7 @@ # metavar='PATH', # help='path to smartpca executable', # required=False, -# default="/humgen/atgu1/fs03/shared_resources/shared_software/EIG6.0beta_noreq/bin/smartpca") +# default=None) diff --git a/bin/blueprint.py b/bin/blueprint.py index d2aba3e..e5535f6 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -492,7 +492,7 @@ def load_job(jfile): # get queue conf_file = os.environ['HOME']+"/picopili.conf" configs = read_conf(conf_file) - queue = configs['queue'] + queue = configs['cluster'] # set logfile name if args.noerr: diff --git a/bin/checkflip_pico.pl b/bin/checkflip_pico.pl index 3bb3d08..3eb333b 100755 --- a/bin/checkflip_pico.pl +++ b/bin/checkflip_pico.pl @@ -12,6 +12,8 @@ # # 01/14/10 # +# Adapted for Picopili by Raymond Walters, rwalters@broadinstitute.org +# # # # checks alelles of a bim-file (plink-binary-dataset) with reference-info (created with refinfo) @@ -65,10 +67,9 @@ sub trans { my $dfcol = 3; ## chr-col in reference -my $info_file = "HM3.info"; +my $info_file = ""; my $refdir = ""; -my $lisadir = "/home/gwas/pgc-samples/hapmap_ref/"; my $frq_th = .15; my $subdir = "flip_subdir"; @@ -82,7 +83,7 @@ sub trans { version: $version --refdir STRING location of reference-directory, default $refdir - --ploc STRING location of plink-binary (default is found at Broad) + --ploc STRING location of plink-binary (default is found from picopili.conf) default: $p2loc --info STRING other info-file (absolute path) -> overwrites --refdir --subdir STRING subdir, to put end-dataset into, default: $subdir @@ -103,21 +104,6 @@ sub trans { # --replace replace old dataset with new one - - for Broadies: the files are currently stored here (this script should be able to find them): - $refdir - - /fg/debakkerscratch/ripke/hapmap_ref/subchr/infosum.sorted - -here 1KG - /home/radon01/sripke/bakker_ripke/hapmap_ref/impute2_ref/1KG_Mar12/ALL_1000G_phase1integrated_feb2012_impute/subchr/sumfrq.eur - - - - on Lisa, th files are found here: $lisadir - - created by Stephan Ripke 2010: sripke\@broadinstitute.org - "; use Cwd; @@ -138,23 +124,11 @@ sub trans { die "$usage\n" if ($help); -if ($info_file eq "HM3.info") { - - unless (-e "$refdir/$info_file") { - if (-e "$lisadir/$info_file"){ - $refdir = $lisadir; - } - else { - print "check reference dir and permissions for <$refdir/$info_file>\n"; - exit; - } - } - - $info_file = "$refdir/$info_file"; +if ($info_file eq "") { + die "$usage\n"; } - else { - die "couldn't find $info_file" unless (-e $info_file); + die "couldn't find info-file $info_file" unless (-e $info_file); } if ($dcolstr) { diff --git a/bin/checkpos_pico.pl b/bin/checkpos_pico.pl index 76a2661..5bd8116 100755 --- a/bin/checkpos_pico.pl +++ b/bin/checkpos_pico.pl @@ -8,10 +8,13 @@ # # checkpos6 # -# created by Stephan Ripke, Broadinstitute, sripke@broadinstitute.org +# Created by Stephan Ripke, Broadinstitute, sripke@broadinstitute.org # # 01/14/10 # +# Adapted for Picopili by Raymond Walters, rwalters(at)broadinstitute.org +# +# # # # checks position bim-file (plink-binary-dataset) with dbsnp reference @@ -74,9 +77,7 @@ sub trans { my $subdir = "dbsnp_subdir"; my $home_dir = "$ENV{HOME}"; -my $dbsnp_file = 
"/psych/genetics_data/ripke/references_from_debakkerscratch/ref_db/sorted_dbsnp_positions_129_b36";## created from this one /humgen/gsa-hpprojects/GATK/data/dbsnp_129_b36.rod -my $dbsnp_file_lisa = "/home/gwas/1KG_reference/sorted_dbsnp_positions_129_b36"; ## including sed 's/\tY\t/\t24\t/' sorted_dbsnp_positions_129_b36 | sed 's/\tX\t/\t23\t/' | sed 's/MT/26/' - +my $dbsnp_file = ""; my $usage = " @@ -84,11 +85,8 @@ sub trans { version: $version - --dbsnp STRING dbSNP reference file (default is found at Broad) - default: $dbsnp_file - or: $dbsnp_file_lisa - HM3: /home/gwas/pgc-samples/hapmap_ref/infosum.annot.markers.sorted - --ploc STRING location of plink-binary (default is found at Broad) + --dbsnp STRING dbSNP reference file (created by readref) + --ploc STRING location of plink-binary (default read from picopili.conf) default: $p2loc --col INT,INT,INT snp-col,chr-col,kb-col in bim-file: default: $scol,$ccol,$kcol --dbcol INT,INT,INT snp-col,chr-col,kb-col in dbsnp-file: default: $dscol,$dccol,$dkcol @@ -101,8 +99,6 @@ sub trans { --exmulti, --nokeep and --subdir are in effect, as long as --ncreate is not switched - created by Stephan Ripke 2010: sripke\@broadinstitute.org - "; use Getopt::Long; @@ -130,13 +126,7 @@ sub trans { } unless (-e $dbsnp_file) { - if (-e $dbsnp_file_lisa) { - $dbsnp_file = $dbsnp_file_lisa; - } - else { - print "*** Error, dbSNP file not found\n"; - exit; - } + die "*** Error, dbSNP file not found\n"; } diff --git a/bin/get_refs.sh b/bin/get_refs.sh index 3ce2d5a..4231361 100755 --- a/bin/get_refs.sh +++ b/bin/get_refs.sh @@ -24,7 +24,7 @@ echo " " # setup rp_conf="$HOME/ricopili.conf" -SERVER="https://personal.broadinstitute.org/rwalters/picopili_files/" +SERVER="https://personal.broadinstitute.org/rwalters/picopili_files" SCRIPT=$(readlink -f "$0") BINLOC=$(dirname "$SCRIPT") LIBLOC=`echo $(dirname "$BINLOC")"/lib"` @@ -118,7 +118,7 @@ if [ "$to_dl" = 'true' ]; then echo "WARNING: Preparing to download reference files from:" echo "$SERVER" echo " " - echo "Expected total file size is ~275 MB, minus existing" + echo "Expected total file size is ~300 MB, minus existing" echo "files already linked/downloaded." echo " " echo "If you do not have web access, or if you do not want" @@ -133,7 +133,10 @@ if [ "$to_dl" = 'true' ]; then if [ "$finame" = "last" ]; then continue else - wget "$SERVER/$finame" "$LIBLOC/buigue/$finame" + echo " " + echo "Next file: $SERVER/$finame" + # wget --no-check-certificate "$SERVER/$finame" "$LIBLOC/buigue/$finame" + curl -o "$LIBLOC/buigue/$finame" "$SERVER/$finame" fi done for finame in ${hmfiles[@]}; do @@ -141,7 +144,10 @@ if [ "$to_dl" = 'true' ]; then if [ "$finame" = "last" ]; then continue else - wget "$SERVER/$finame" "$LIBLOC/plague/$finame" + echo " " + echo "Next file: $SERVER/$finame" + # wget --no-check-certificate "$SERVER/$finame" "$LIBLOC/plague/$finame" + curl -o "$LIBLOC/plague/$finame" "$SERVER/$finame" fi done fi diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index 1afc2ee..a707037 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -210,7 +210,7 @@ } } if ( $scr_path eq '') { - push @miss_scripts, "cp /home/unix/sripke/bin/$scr_name ./\n"; + push @miss_scripts, "$scr_name\n"; print "!!Error!! : No $scr_name command available\n" ; } @@ -219,18 +219,20 @@ if (@miss_scripts > 0) { - if (-e "get_scripts_on_broad.txt") { - print "please remove this file and restart: get_scripts_on_broad.txt\n"; - } - die $! 
unless open FILE1, "> get_scripts_on_broad.txt"; + +# if (-e "get_scripts_on_broad.txt") { +# print "please remove this file and restart: get_scripts_on_broad.txt\n"; +# } + die $! unless open FILE1, "> missing_picopili_scripts.txt"; foreach (@miss_scripts) { print FILE1 "$_"; } close FILE1; + die "Missing required scripts. See missing_picopili_scripts.txt\n"; - print "exiting now -> have a look at get_scripts_on_broad.txt\n"; - exit; +# print "exiting now -> have a look at get_scripts_on_broad.txt\n"; +# exit; } @@ -422,36 +424,36 @@ sub send_jobarray { $now =~ s/ /_/g; - if ($sjaname eq "finished") { - - my $fini_message ; - $fini_message .= "\n\n##################################################################\n"; - $fini_message .= "##### CONGRATULATIONS: \n"; - $fini_message .= "##### rp_pipeline finished successfully:\n"; - $fini_message .= "##### $sjainfotxt\n"; - $fini_message .= "##### now start with PCA (see README in subdir pcaer_sub/)\n"; - $fini_message .= "##### or directly with postimputation analysis\n"; - $fini_message .= "##### have a look at the wiki page\n"; - $fini_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n"; - $fini_message .= "##################################################################\n"; - print "$fini_message\n"; - - - die $! unless open SUC, "> success_file"; - print SUC $fini_message."\n"; - close SUC; - - if($email_on){ - &mysystem ('cat success_file | '.$mutt_script.' -s RP_pipeline_finished '.$email) ; - } - - my $sjarow = $sjainfotxt."\t$sjaname\t$now"; - &a2filenew_app("$sjainfofile",$sjarow); - - - exit; - - } +# if ($sjaname eq "finished") { +# +# my $fini_message ; +# $fini_message .= "\n\n##################################################################\n"; +# $fini_message .= "##### CONGRATULATIONS: \n"; +# $fini_message .= "##### rp_pipeline finished successfully:\n"; +# $fini_message .= "##### $sjainfotxt\n"; +# $fini_message .= "##### now start with PCA (see README in subdir pcaer_sub/)\n"; +# $fini_message .= "##### or directly with postimputation analysis\n"; +# $fini_message .= "##### have a look at the wiki page\n"; +# $fini_message .= "##### https://sites.google.com/a/broadinstitute.org/ricopili/\n"; +# $fini_message .= "##################################################################\n"; +# print "$fini_message\n"; +# +# +# die $! unless open SUC, "> success_file"; +# print SUC $fini_message."\n"; +# close SUC; +# +# if($email_on){ +# &mysystem ('cat success_file | '.$mutt_script.' 
-s RP_pipeline_finished '.$email) ; +# } +# +# my $sjarow = $sjainfotxt."\t$sjaname\t$now"; +# &a2filenew_app("$sjainfofile",$sjarow); +# +# +# exit; +# +# } chdir ($sjadir); diff --git a/bin/lift_to_hg19.pl b/bin/lift_to_hg19.pl index 7ac1726..e18d626 100755 --- a/bin/lift_to_hg19.pl +++ b/bin/lift_to_hg19.pl @@ -12,6 +12,8 @@ # # 01/14/10 # +# Adapted for Picopili by Raymond Walters, rwalters(at)broadinstitute.org +# # # # lifts a plink binary from hg18 to hg19 @@ -84,8 +86,6 @@ here a seletion of lilofiles: $liloc - created by Stephan Ripke 2010: sripke\@broadinstitute.org - "; diff --git a/bin/uger.sub.sh b/bin/uger.sub.sh deleted file mode 100755 index 494d9df..0000000 --- a/bin/uger.sub.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env bash -#$ -j y -#$ -cwd -#$ -V - -# wrapper script for job submission on Broad UGER cluster -# -# first parameter should be duration for 'sleep' before -# execution -# remainder of command line should be the job to be -# submitted (including all agruments) -# -# The -V flag above will provoke a warning that -# LD_LIBRARY_PATH won't be used for security reasons; -# this warning can be safely ignored - -source /broad/software/scripts/useuse -reuse -q Anaconda -sleep $1 -shift -"$@" -# eof diff --git a/bin/uger_array.sub.sh b/bin/uger_array.sub.sh deleted file mode 100755 index 8109e27..0000000 --- a/bin/uger_array.sub.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash -#$ -j y -#$ -cwd -#$ -V - -# wrapper script for job submission on Broad UGER cluster -# -# first parameter should be duration for 'sleep' before -# execution -# remainder of command line should be the job to be -# submitted (including all agruments) -# -# The -V flag above will provoke a warning that -# LD_LIBRARY_PATH won't be used for security reasons; -# this warning can be safely ignored - -# use for task arrays -# tasknum=$SGE_TASK_ID - -source /broad/software/scripts/useuse -reuse -q Anaconda -sleep $1 -shift - -inp="$@" -call=${inp//\$tasknum/$SGE_TASK_ID} -$call - -# eof From 0d25cb5528f4ef42e4b4b4cbe49277a60f475d45 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Thu, 13 Oct 2016 00:09:46 +0200 Subject: [PATCH 19/48] obfuscate email --- bin/checkflip_pico.pl | 2 +- bin/config_pico.pl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/checkflip_pico.pl b/bin/checkflip_pico.pl index 3eb333b..679a4b0 100755 --- a/bin/checkflip_pico.pl +++ b/bin/checkflip_pico.pl @@ -12,7 +12,7 @@ # # 01/14/10 # -# Adapted for Picopili by Raymond Walters, rwalters@broadinstitute.org +# Adapted for Picopili by Raymond Walters, rwalters(at)broadinstitute.org # # # diff --git a/bin/config_pico.pl b/bin/config_pico.pl index 442c2bb..b9c7767 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -673,7 +673,7 @@ () if ($shell eq "bash-login-check"){$shell = "bash";} if ($shell ne "bash" && $shell ne "tcsh") { print "Warning! 
Shell not recognized: $shell\n"; - print "Please send email to rwalters\@broadinstitute.org\n"; + print "Please send email to rwalters(at)broadinstitute.org\n"; } print "Detected you are using the following shell: $shell\n\n"; From 44df0df53e2caa833218ed61c060e652b3fad3d1 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 13 Oct 2016 13:24:33 -0400 Subject: [PATCH 20/48] add admixture projection option --- bin/admix_rel.py | 395 +++++++++++++++++++++++++++-------------------- 1 file changed, 230 insertions(+), 165 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index faa689e..4ef504d 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -203,26 +203,38 @@ plinkx = find_exec('plink',key='p2loc') -if args.rscript_ex == None or args.rscript_ex == "None": +if args.rscript_ex is None or args.rscript_ex == "None": args.rscript_ex = find_exec('Rscript', key='rscloc') -if args.admixture_ex == None or args.admixture_ex == "None": - args.admixture_ex = find_exec('admixture', key='admloc') - -if args.reap_ex == None or args.reap_ex == "None": +if args.reap_ex is None or args.reap_ex == "None": args.reap_ex = find_exec('REAP', key='reaploc') rp_bin = os.path.dirname(os.path.realpath(__file__)) Rplotibdx = rp_bin+'/plot_reap_ibd.Rscript' if plot_pca: - Rplotibdx = rp_bin+'/plot_pca.Rscript' + Rplotpcax = rp_bin+'/plot_pca.Rscript' + + +# either have admixture file, or need to run admixture +run_admix = True +if args.admix_p is not None and args.admix_q is not None and args.admix_p != "" and args.admix_q != "": + if args.admixture_ex is None or args.admixture_ex == "None": + args.admixture_ex = find_exec('admixture', key='admloc') + + test_exec(args.admixture_ex, 'ADMIXTURE') + + assert '/' not in args.unrel_bfile, "--unrel-bfile must specify only a file stem, not a path" + run_admix = False + +else: + assert os.path.isfile(args.admix_p), "Admixture .P file %s does not exist." % str(args.admix_p) + assert os.path.isfile(args.admix_q), "Admixture .P file %s does not exist." % str(args.admix_q) # verify executables test_exec(plinkx, 'Plink') test_exec(args.rscript_ex, 'Rscript') -test_exec(args.admixture_ex, 'ADMIXTURE') test_exec(args.reap_ex, 'REAP') # pca file @@ -231,7 +243,6 @@ assert '/' not in args.target_bfile, "--plot-admix-pca must specify only a file, not a path" # verify bfiles are files, not paths -assert '/' not in args.unrel_bfile, "--unrel-bfile must specify only a file stem, not a path" assert '/' not in args.target_bfile, "--target-bfile must specify only a file stem, not a path" @@ -272,136 +283,172 @@ print '\n...Running Admixture on unrelated dataset...' ############# -admix_call = [args.admixture_ex, - str(args.unrel_bfile+'.bed'), - str(args.npops), - '-j'+str(args.multithread_cores)] -admix_unrel_log = open(str('admix_'+args.out+'_unrel.log'), 'w') - -print str(' '.join(admix_call)) -print 'Logging to ' + admix_unrel_log.name + '\n' -subprocess.check_call(admix_call, stdout=admix_unrel_log) - -admix_unrel_log.close() +if run_admix: + admix_call = [args.admixture_ex, + str(args.unrel_bfile+'.bed'), + str(args.npops), + '-j'+str(args.multithread_cores)] + admix_unrel_log = open(str('admix_'+args.out+'_unrel.log'), 'w') + + print str(' '.join(admix_call)) + print 'Logging to ' + admix_unrel_log.name + '\n' + subprocess.check_call(admix_call, stdout=admix_unrel_log) + + admix_unrel_log.close() +if args.use_exemplars: -############# -print '\n...Selecting exemplars for each ancestral population...' 
-############# -# - identify population assignment (including "-") for each input individual -# - confirm whether there are enough IDs assigned to each populations -# - match population assignments to FID/IIDs -# - write .pops file for target bfile, .pops.info file - -# label for populations are popA, popB, popC, ... -popnames = [str('pop'+ascii_uppercase[i]) for i in range(args.npops)] - -# define function returning popname or '-' based on largest proportion -# Note: ties broken in favor of first pop listed in names (possible if th <= 0.5) -def maxpop(props, names, th): - whichmax = props.index(max(props)) - if props[whichmax] > th: - outpop = names[whichmax] - else: - outpop = '-' - return outpop - -# get list of selected pop for each individual in admixture results -ind_pops = [] -admix_pops_file = str(args.unrel_bfile+'.'+str(args.npops)+'.Q') -with open(admix_pops_file, 'r') as f: - # map() required to read probs as float instead of string - ind_pops = [maxpop(props=map(float,line.split()), names=popnames, th=args.prop_th) for line in f] - -# sanity check parsing -nfam = file_len(str(args.unrel_bfile+'.fam')) -if len(ind_pops) != nfam: - raise ValueError('Number of individuals parsed from admixture results (%d in %s) ' + \ - 'and fam file of unrelateds (%d in %s) do not match.' % (len(ind_pops), admix_pops_file, int(nfam), str(args.unrel_bfile+'.fam'))) - -# check have sufficient exemplars -popcounts = [ind_pops.count(popnames[i]) for i in range(args.npops)] -lackingpops = [popcounts[i] < args.min_exemplar for i in range(args.npops)] - -print 'Exemplars per population:' -for i in range(args.npops): - print str(popnames[i] + ': ' + str(popcounts[i])) -print 'Unassigned: '+str(ind_pops.count('-')) - -if any(lackingpops): - print '\n###########\n' - print 'ERROR: One or more populations with insufficient number of exemplars (<'+str(args.min_exemplar)+').' - print '\nConsider rerunning with fewer ancestral populations (here: '+str(args.npops)+'), \n' + \ - 'a looser threshold for selecting population exemplars (here: '+str(args.prop_th)+'), \n' + \ - 'or fewer required exemplars per ancestral population in the unrelated set ' + \ - '(here :'+str(args.min_exemplar)+').\n' - exit(1) - - -### match exemplar pop status with FID/IIDs, record in dict -pop_dict = {} - -# process fam file by line -ref_fam = open(str(args.unrel_bfile+'.fam'), 'r') -idnum=0 -for line in ref_fam: - # iterate line counter, used to get elements from ind_pops[] - idnum += 1 - - # read - (fid, iid, pat, mat, sex, phen) = line.split() - - # use FID:IID identifier as key to record pop status - bfile_id = fid +':'+ iid - pop_dict[bfile_id] = ind_pops[idnum-1] - -ref_fam.close() - - -### create pop file to match target fam file, pop info file -target_fam = open(str(args.target_bfile+'.fam'), 'r') -target_pop = open(str(args.target_bfile+'.pop'), 'w') -target_popinfo = open(str(args.target_bfile+'.pop.info'), 'w') - -for line in target_fam: - - # read - (targetfid, targetiid, pat, mat, sex, phen) = line.split() - target_id = targetfid +':'+ targetiid - - # check dict - if target_id in pop_dict: - target_pop.write(pop_dict[target_id] + '\n') - target_popinfo.write(targetfid + ' ' + targetiid + ' ' + target_id + ' unrel ' + pop_dict[target_id] + '\n') + ############# + print '\n...Selecting exemplars for each ancestral population...' 
+ ############# + # - identify population assignment (including "-") for each input individual + # - confirm whether there are enough IDs assigned to each populations + # - match population assignments to FID/IIDs + # - write .pops file for target bfile, .pops.info file + + # label for populations are popA, popB, popC, ... + popnames = [str('pop'+ascii_uppercase[i]) for i in range(args.npops)] + + # define function returning popname or '-' based on largest proportion + # Note: ties broken in favor of first pop listed in names (possible if th <= 0.5) + def maxpop(props, names, th): + whichmax = props.index(max(props)) + if props[whichmax] > th: + outpop = names[whichmax] + else: + outpop = '-' + return outpop + + # get list of selected pop for each individual in admixture results + ind_pops = [] + + if run_admix: + admix_pops_file = str(args.unrel_bfile+'.'+str(args.npops)+'.Q') else: - target_pop.write('-' + '\n') - target_popinfo.write(targetfid + ' ' + targetiid + ' ' + target_id + ' target ' + '-' + '\n') - - -target_fam.close() -target_pop.close() -target_popinfo.close() - + admix_pops_file = args.admix_q + + + with open(admix_pops_file, 'r') as f: + # map() required to read probs as float instead of string + ind_pops = [maxpop(props=map(float,line.split()), names=popnames, th=args.prop_th) for line in f] + + # sanity check parsing + nfam = file_len(str(args.unrel_bfile+'.fam')) + if len(ind_pops) != nfam: + raise ValueError('Number of individuals parsed from admixture results (%d in %s) ' + \ + 'and fam file of unrelateds (%d in %s) do not match.' % (len(ind_pops), admix_pops_file, int(nfam), str(args.unrel_bfile+'.fam'))) + + # check have sufficient exemplars + popcounts = [ind_pops.count(popnames[i]) for i in range(args.npops)] + lackingpops = [popcounts[i] < args.min_exemplar for i in range(args.npops)] + + print 'Exemplars per population:' + for i in range(args.npops): + print str(popnames[i] + ': ' + str(popcounts[i])) + print 'Unassigned: '+str(ind_pops.count('-')) + + if any(lackingpops): + print '\n###########\n' + print 'ERROR: One or more populations with insufficient number of exemplars (<'+str(args.min_exemplar)+').' 
+ print '\nConsider rerunning with fewer ancestral populations (here: '+str(args.npops)+'), \n' + \ + 'a looser threshold for selecting population exemplars (here: '+str(args.prop_th)+'), \n' + \ + 'or fewer required exemplars per ancestral population in the unrelated set ' + \ + '(here :'+str(args.min_exemplar)+').\n' + exit(1) + + + ### match exemplar pop status with FID/IIDs, record in dict + pop_dict = {} + + # process fam file by line + ref_fam = open(str(args.unrel_bfile+'.fam'), 'r') + idnum=0 + for line in ref_fam: + # iterate line counter, used to get elements from ind_pops[] + idnum += 1 + + # read + (fid, iid, pat, mat, sex, phen) = line.split() + + # use FID:IID identifier as key to record pop status + bfile_id = fid +':'+ iid + pop_dict[bfile_id] = ind_pops[idnum-1] + + ref_fam.close() + + + ### create pop file to match target fam file, pop info file + target_fam = open(str(args.target_bfile+'.fam'), 'r') + target_pop = open(str(args.target_bfile+'.pop'), 'w') + target_popinfo = open(str(args.target_bfile+'.pop.info'), 'w') + + for line in target_fam: + + # read + (targetfid, targetiid, pat, mat, sex, phen) = line.split() + target_id = targetfid +':'+ targetiid + + # check dict + if target_id in pop_dict: + target_pop.write(pop_dict[target_id] + '\n') + target_popinfo.write(targetfid + ' ' + targetiid + ' ' + target_id + ' unrel ' + pop_dict[target_id] + '\n') + else: + target_pop.write('-' + '\n') + target_popinfo.write(targetfid + ' ' + targetiid + ' ' + target_id + ' target ' + '-' + '\n') + + + target_fam.close() + target_pop.close() + target_popinfo.close() -############# -print '\n...Running supervised admixture analysis in target data...' -############# -admix_super_call = [args.admixture_ex, - str(args.target_bfile+'.bed'), - str(args.npops), - '-j'+str(args.multithread_cores), - '--supervised'] -admix_target_log = open(str('admix_'+args.out+'_target.log'), 'w') + ############# + print '\n...Running supervised admixture analysis in target data...' + ############# + + admix_super_call = [args.admixture_ex, + str(args.target_bfile+'.bed'), + str(args.npops), + '-j'+str(args.multithread_cores), + '--supervised'] + admix_target_log = open(str('admix_'+args.out+'_target.log'), 'w') + + print str(' '.join(admix_super_call)) + print 'Logging to ' + admix_target_log.name + '\n' + subprocess.check_call(admix_super_call, stdout=admix_target_log) + + admix_target_log.close() + -print str(' '.join(admix_super_call)) -print 'Logging to ' + admix_target_log.name + '\n' -subprocess.check_call(admix_super_call, stdout=admix_target_log) -admix_target_log.close() +# no exemplars, using projection instead +else: + + ############# + print '\n...Projecting admixture analysis to target data...' 
+ ############# + + ref_p_name = str(args.target_bfile)+'.'+str(args.npops)+'.P.in' + if run_admix: + link(str(args.unrel_bfile)+'.'+str(args.npops)+'.P', ref_p_name, 'admixture allele freqs') + else: + ref_p_in = str(args.admix_p) + link(wd+'/'+ref_p_in, ref_p_name,'input admixture allele freqs') + + admix_project_call = [args.admixture_ex, + '-P', str(args.target_bfile+'.bed'), + str(args.npops), + '-j'+str(args.multithread_cores)] + admix_target_log = open(str('admix_'+args.out+'_target.log'), 'w') + + print str(' '.join(admix_project_call)) + print 'Logging to ' + admix_target_log.name + '\n' + subprocess.check_call(admix_super_call, stdout=admix_target_log) + + admix_target_log.close() ############# @@ -426,7 +473,7 @@ def maxpop(props, names, th): target_fam_nam = str(args.target_bfile + '.fam') if not (file_len(target_Qfile_nam) == file_len(target_fam_nam)): - raise ValueError('Length of admixture proportions ouput (%s) does not match fam file (%s). ' + \ + raise ValueError('Length of admixture proportions output (%s) does not match fam file (%s). ' + \ 'Error during output?' % (target_Qfile_nam, target_fam_nam)) # paste together columns, should be in same order (based on ADMIXTURE's ouptut format) @@ -514,12 +561,15 @@ def maxpop(props, names, th): # setup file streams for plotinfo files pop_info_files = [] - exemp_info_files = [] + if args.use_exemplars: + exemp_info_files = [] for i in xrange(args.npops): pop_info_files.append( open(str(args.target_bfile) + '.' + popnames[i] + '.admixture.plotinfo.txt', 'w') ) pop_info_files[i].write('FID IID col pch layer\n') - exemp_info_files.append( open(str(args.target_bfile) + '.' + popnames[i] + '.exemplar.plotinfo.txt', 'w') ) - exemp_info_files[i].write('FID IID col pch layer\n') + + if args.use_exemplars: + exemp_info_files.append( open(str(args.target_bfile) + '.' 
+ popnames[i] + '.exemplar.plotinfo.txt', 'w') ) + exemp_info_files[i].write('FID IID col pch layer\n') # parse admixture proportions reap_mix_props = open(str(args.target_bfile + '.props.tmp.txt'), 'r') @@ -543,26 +593,29 @@ def maxpop(props, names, th): pop_info_files[i].write(' '.join([fid, iid, bin_col, str(1), str(in_bin)])+'\n') # exemplar info file: FID, IID, col, pch, layer - if joinid in pop_dict: - if pop_dict[joinid] == popnames[i]: - exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(exemplar_color)+'\"', str(exemplar_pch), str(3)]) + '\n') + if args.use_exemplars: + if joinid in pop_dict: + if pop_dict[joinid] == popnames[i]: + exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(exemplar_color)+'\"', str(exemplar_pch), str(3)]) + '\n') + else: + exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(ref_color)+'\"', str(ref_pch), str(2)]) + '\n') else: - exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(ref_color)+'\"', str(ref_pch), str(2)]) + '\n') - else: - exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(other_color)+'\"', str(other_pch), str(1)]) + '\n') + exemp_info_files[i].write(' '.join([fid, iid, '\"'+str(other_color)+'\"', str(other_pch), str(1)]) + '\n') # close plotinfo files for i in xrange(args.npops): pop_info_files[i].close() - exemp_info_files[i].close() + if args.use_exemplars: + exemp_info_files[i].close() # create legend files: col, pch, fill, text (either col/pch or fill should be NA) - exem_legend = open(str(args.target_bfile) + '.exemplar.legend.txt', 'w') - exem_legend.write('col pch fill text\n') - exem_legend.write(str(exemplar_color) + ' ' + str(exemplar_pch) + ' NA ' + '\"Population exemplar\"\n') - exem_legend.write(str(ref_color) + ' ' + str(ref_pch) + ' NA ' + '\"Reference set\"\n') - exem_legend.write(str(other_color) + ' ' + str(other_pch) + ' NA ' + '\"Non-reference set\"\n') - exem_legend.close() + if args.use_exemplars: + exem_legend = open(str(args.target_bfile) + '.exemplar.legend.txt', 'w') + exem_legend.write('col pch fill text\n') + exem_legend.write(str(exemplar_color) + ' ' + str(exemplar_pch) + ' NA ' + '\"Population exemplar\"\n') + exem_legend.write(str(ref_color) + ' ' + str(ref_pch) + ' NA ' + '\"Reference set\"\n') + exem_legend.write(str(other_color) + ' ' + str(other_pch) + ' NA ' + '\"Non-reference set\"\n') + exem_legend.close() prop_legend = open(str(args.target_bfile) + '.admixture.legend.txt', 'w') prop_legend.write('col pch fill text\n') @@ -572,16 +625,17 @@ def maxpop(props, names, th): ### generate plots for i in xrange(args.npops): - r_pca_ex_log = open(str(args.out) + '.' + popnames[i] + '.plot_exemplars.log', 'w') - subprocess.check_call([Rplotpcax, - str(args.plot_admix_pca), - str(args.target_bfile) + '.' + popnames[i] + '.exemplar.plotinfo.txt', - str(args.target_bfile) + '.exemplar.legend.txt', - str(3), - str(args.out) + '.' + popnames[i] + '.exemplars'], - stderr=subprocess.STDOUT, - stdout=r_pca_ex_log) - r_pca_ex_log.close() + if args.use_exemplars: + r_pca_ex_log = open(str(args.out) + '.' + popnames[i] + '.plot_exemplars.log', 'w') + subprocess.check_call([Rplotpcax, + str(args.plot_admix_pca), + str(args.target_bfile) + '.' + popnames[i] + '.exemplar.plotinfo.txt', + str(args.target_bfile) + '.exemplar.legend.txt', + str(3), + str(args.out) + '.' + popnames[i] + '.exemplars'], + stderr=subprocess.STDOUT, + stdout=r_pca_ex_log) + r_pca_ex_log.close() r_pca_admix_log = open(str(args.out) + '.' 
+ popnames[i] + '.plot_admixture.log', 'w') subprocess.check_call([Rplotpcax, @@ -612,7 +666,10 @@ def maxpop(props, names, th): str(args.out+'.plot_pca_files.tar.gz')] + \ glob(args.target_bfile+".*.admixture.plotinfo.txt") + \ [str(args.target_bfile)+".admixture.legend.txt"] + \ - glob(args.out+".*.plot_admixture.log") + \ + glob(args.out+".*.plot_admixture.log")) + + subprocess.check_call(["tar", "-zcvf", + str(args.out+'.plot_exemplar_files.tar.gz')] + \ glob(args.target_bfile+".*.exemplar.plotinfo.txt") + \ [str(args.target_bfile)+".exemplar.legend.txt"] + \ glob(args.out+".*.plot_exemplars.log") ) @@ -621,22 +678,27 @@ def maxpop(props, names, th): subprocess.check_call(['rm'] + glob(args.target_bfile+".*.admixture.plotinfo.txt")) subprocess.check_call(['rm'] + glob(args.target_bfile+".admixture.legend.txt")) subprocess.check_call(['rm'] + glob(args.out+".*.plot_admixture.log")) - subprocess.check_call(['rm'] + glob(args.target_bfile+".*.exemplar.plotinfo.txt")) - subprocess.check_call(['rm'] + glob(args.target_bfile+".exemplar.legend.txt")) - subprocess.check_call(['rm'] + glob(args.out+".*.plot_exemplars.log")) + + if args.use_exemplars: + subprocess.check_call(['rm'] + glob(args.target_bfile+".*.exemplar.plotinfo.txt")) + subprocess.check_call(['rm'] + glob(args.target_bfile+".exemplar.legend.txt")) + subprocess.check_call(['rm'] + glob(args.out+".*.plot_exemplars.log")) ### print '\nZipping Admixture output files:' ### + gz_confirm(str(args.target_bfile)+'.'+str(args.npops)+'.P', str(args.target_bfile)+'.'+str(args.npops)+'.P.gz', force=False) gz_confirm(str(args.target_bfile)+'.'+str(args.npops)+'.Q', str(args.target_bfile)+'.'+str(args.npops)+'.Q.gz', force=False) - gz_confirm(str(args.unrel_bfile)+'.'+str(args.npops)+'.P', - str(args.unrel_bfile)+'.'+str(args.npops)+'.P.gz', force=False) - gz_confirm(str(args.unrel_bfile)+'.'+str(args.npops)+'.Q', - str(args.unrel_bfile)+'.'+str(args.npops)+'.Q.gz', force=False) + + if run_admix: + gz_confirm(str(args.unrel_bfile)+'.'+str(args.npops)+'.P', + str(args.unrel_bfile)+'.'+str(args.npops)+'.P.gz', force=False) + gz_confirm(str(args.unrel_bfile)+'.'+str(args.npops)+'.Q', + str(args.unrel_bfile)+'.'+str(args.npops)+'.Q.gz', force=False) ### @@ -655,8 +717,11 @@ def maxpop(props, names, th): ### subprocess.check_call(['rm', '-v', str(args.target_bfile)+'.tmp_recode.tped', - str(args.target_bfile)+'.tmp_recode.tfam', - str(args.target_bfile)+'.props.tmp.txt']) + str(args.target_bfile)+'.tmp_recode.tfam']) + + if plot_pca: + subprocess.check_call(['rm', '-v', + str(args.target_bfile)+'.props.tmp.txt']) ### print '\nRemove if exist:' From e0512bf9e385b0b71ccf4a74bbaecebe94443c9a Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 13 Oct 2016 14:02:20 -0400 Subject: [PATCH 21/48] arguments for admix projection vs exemplars --- bin/admix_rel.py | 70 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 4ef504d..46e9882 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -65,8 +65,23 @@ type=str, metavar='FILESTEM', help='File stem for plink bed/bim/fam files ' + \ - 'with unrelated individuals to estimate admixture.', - required=True) + 'with unrelated individuals to estimate admixture.' + \ + 'Must specify either this or --admix-p.', + required=False) +arg_base.add_argument('--admix-p', + type=str, + metavar='FILE', + help='Admixture results .P file from sample of ' + \ + 'unrelated individuals. 
Can alternatively specify ' + \ + '--unrel-bfile to run this initial admixture.', + required=False) +arg_base.add_argument('--admix-q', + type=str, + metavar='FILE', + help='Admixture results .Q file from sample of ' + \ + 'unrelated individuals. Required only if using ' + \ + '--admix-p and --use-exemplars.', + required=False) arg_base.add_argument('--target-bfile', type=str, metavar='FILESTEM', @@ -89,13 +104,19 @@ arg_base.add_argument('--no-cleanup', action='store_true', help='skip cleanup of interim files') - arg_admix.add_argument('--npops', type=int, metavar='INT', help='Number of ancestral populations for admixture', required=False, default=4) +arg_admix.add_argument('--use-exemplars', + action='store_true', + help='Determine admixture in target sample based on ' + \ + 'supervised fit with a selection of population exemplars ' + \ + 'rather than a project of admixture solution in unrelateds. ' + \ + '(Required for ADMIXTURE version < 1.3). Requires --unrel-bfile, ' + \ + 'and if using --admix-p also requires specifying --admix-q.') arg_admix.add_argument('--prop-th', type=float, metavar='FLOAT', @@ -203,6 +224,11 @@ plinkx = find_exec('plink',key='p2loc') +if args.admixture_ex is None or args.admixture_ex == "None": + args.admixture_ex = find_exec('admixture', key='admloc') + +test_exec(args.admixture_ex, 'ADMIXTURE') + if args.rscript_ex is None or args.rscript_ex == "None": args.rscript_ex = find_exec('Rscript', key='rscloc') @@ -215,22 +241,33 @@ if plot_pca: Rplotpcax = rp_bin+'/plot_pca.Rscript' - -# either have admixture file, or need to run admixture +# check if running admixture for unrelateds run_admix = True +if args.admix_p is not None and args.admix_p != "": + run_admix = False + +else: + assert os.path.isfile(args.admix_p), "Admixture .P file %s does not exist." % str(args.admix_p) + + if args.use_exemplars: + assert os.path.isfile(args.admix_q), "Admixture .Q file %s does not exist." % str(args.admix_q) -if args.admix_p is not None and args.admix_q is not None and args.admix_p != "" and args.admix_q != "": - if args.admixture_ex is None or args.admixture_ex == "None": - args.admixture_ex = find_exec('admixture', key='admloc') - test_exec(args.admixture_ex, 'ADMIXTURE') +# check if have unrel-bfile if needed: +if args.unrel_bfile is None or args.unrel_bfile == "": - assert '/' not in args.unrel_bfile, "--unrel-bfile must specify only a file stem, not a path" - run_admix = False + if run_admix: + raise parser.error('Must specify either --unrel-bfile or --admix-p.') + + if args.use_exemplars: + raise parser.error('Must specify --unrel-bfile to define exemplars for --use-exemplars.') else: - assert os.path.isfile(args.admix_p), "Admixture .P file %s does not exist." % str(args.admix_p) - assert os.path.isfile(args.admix_q), "Admixture .P file %s does not exist." % str(args.admix_q) + assert '/' not in args.unrel_bfile, "--unrel-bfile must specify only a file stem, not a path" + assert os.path.isfile(str(args.unrel_bfile)+'.bed'), "bed file for unrelated individuals %s does not exist." % str(args.unrel_bfile)+'.bed' + assert os.path.isfile(str(args.unrel_bfile)+'.bim'), "bim file for unrelated individuals %s does not exist." % str(args.unrel_bfile)+'.bim' + assert os.path.isfile(str(args.unrel_bfile)+'.fam'), "fam file for unrelated individuals %s does not exist." 
% str(args.unrel_bfile)+'.fam' + # verify executables test_exec(plinkx, 'Plink') @@ -264,9 +301,10 @@ os.chdir(args.outdir) # link plink files (with verification) -link(str(wd+'/'+args.unrel_bfile+'.bed'), str(args.unrel_bfile+'.bed'), 'bed file for unrelated individuals') -link(str(wd+'/'+args.unrel_bfile+'.bim'), str(args.unrel_bfile+'.bim'), 'bim file for unrelated individuals') -link(str(wd+'/'+args.unrel_bfile+'.fam'), str(args.unrel_bfile+'.fam'), 'fam file for unrelated individuals') +if run_admix or args.use_exemplars: + link(str(wd+'/'+args.unrel_bfile+'.bed'), str(args.unrel_bfile+'.bed'), 'bed file for unrelated individuals') + link(str(wd+'/'+args.unrel_bfile+'.bim'), str(args.unrel_bfile+'.bim'), 'bim file for unrelated individuals') + link(str(wd+'/'+args.unrel_bfile+'.fam'), str(args.unrel_bfile+'.fam'), 'fam file for unrelated individuals') link(str(wd+'/'+args.target_bfile+'.bed'), str(args.target_bfile+'.bed'), 'bed file for target individuals') link(str(wd+'/'+args.target_bfile+'.bim'), str(args.target_bfile+'.bim'), 'bim file for target individuals') From 2e9b3c10ba6644d721f6830e4fb783b077fe9b2f Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Thu, 13 Oct 2016 20:34:15 +0200 Subject: [PATCH 22/48] admix logging and bugfix; blueprint big_mem option --- bin/admix_rel.py | 18 +++++++++++++----- bin/blueprint.py | 6 ++++++ cluster_templates/broad_uger.conf | 1 + cluster_templates/lisa.conf | 1 + cluster_templates/lisa.sub.sh | 2 +- 5 files changed, 22 insertions(+), 6 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 46e9882..4c8ada3 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -206,15 +206,23 @@ # print settings print 'Using settings:' -print '--unrel-bfile '+args.unrel_bfile +if args.unrel_bfile is not None and args.unrel_bfile != "": + print '--unrel-bfile '+args.unrel_bfile +if args.admix_p is not None and args.admix_p != "": + print '--admix-p '+args.admix_p print '--target-bfile '+args.target_bfile print '--out '+args.out print '--outdir '+args.outdir print '--npops '+str(args.npops) -print '--prop-th '+str(args.prop_th) -print '--min-exemplar '+str(args.min_exemplar) +if args.use_exemplars: + print '--min-exemplar '+str(args.min_exemplar) + if args.admix_q is not None and args.admix_q != "": + print '--admix-q '+args.admix_q + print '--prop-th '+str(args.prop_th) + print '--min-exemplar '+str(args.min_exemplar) print '--min-rel '+str(args.min_rel) -print '--plot-admix-pca '+str(args.plot_admix_pca) +if args.plot_admix_pca is not None and args.plot_admix_pca != "": + print '--plot-admix-pca '+str(args.plot_admix_pca) ############# @@ -484,7 +492,7 @@ def maxpop(props, names, th): print str(' '.join(admix_project_call)) print 'Logging to ' + admix_target_log.name + '\n' - subprocess.check_call(admix_super_call, stdout=admix_target_log) + subprocess.check_call(admix_project_call, stdout=admix_target_log) admix_target_log.close() diff --git a/bin/blueprint.py b/bin/blueprint.py index e5535f6..352a555 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -79,6 +79,11 @@ def send_job(jobname, else: mem_gb = str(1) + if mem > 30000: + mem_txt = str(clust_conf['big_mem_txt']) + else: + mem_txt = "" + # multithreading arguments if threads is None: threads = 1 @@ -244,6 +249,7 @@ def send_job(jobname, "log_name": str(logloc)+'/'+str(logname), "mem_in_mb": str(mem_mb), "mem_in_gb": str(mem_gb), + "big_mem_txt": str(mem_txt), "threads": str(threads), "total_threads": str(tot_threads), "wall_hours": str(walltime), diff --git 
a/cluster_templates/broad_uger.conf b/cluster_templates/broad_uger.conf index 5901a07..3c80210 100644 --- a/cluster_templates/broad_uger.conf +++ b/cluster_templates/broad_uger.conf @@ -9,4 +9,5 @@ task_id ${SGE_TASK_ID} hold_flag -hold_jid {hold_name} j_per_node 1 array_mem_mb 128000 +big_mem_txt None project unspecified diff --git a/cluster_templates/lisa.conf b/cluster_templates/lisa.conf index 1db3e36..8145e16 100644 --- a/cluster_templates/lisa.conf +++ b/cluster_templates/lisa.conf @@ -9,4 +9,5 @@ task_id ${PBS_ARRAYID} hold_flag -W depend=afterany:{hold_num} j_per_node 16 array_mem_mb 32000 +big_mem_txt :mem64gb project unspecified diff --git a/cluster_templates/lisa.sub.sh b/cluster_templates/lisa.sub.sh index 8451e4d..a2aab7c 100755 --- a/cluster_templates/lisa.sub.sh +++ b/cluster_templates/lisa.sub.sh @@ -1,5 +1,5 @@ #PBS -lwalltime={wall_hours}:00:00 -#PBS -lnodes=1 +#PBS -lnodes=1{big_mem_txt} #PBS -S /bin/bash #PBS -N {job_name} #PBS -j oe From c433f15b933007e335b5a850461c141ff1f070c0 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 13 Oct 2016 18:54:55 -0400 Subject: [PATCH 23/48] more platform agnostic impute reference args; adjust impute logging --- bin/agg_imp.py | 4 +-- bin/args_impute.py | 18 ++++++----- bin/bg_imp.py | 7 +++-- bin/imp2_rel.py | 7 +++-- bin/impute_rel.py | 74 +++++++++++++++++++++++++++++++++++++++++++++- bin/shape_rel.py | 7 +++-- 6 files changed, 98 insertions(+), 19 deletions(-) diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 77c4e8c..80a01a3 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -27,7 +27,7 @@ import os import subprocess import argparse -from args_impute import parserbase, parsercluster +from args_impute import parserbase, parsercluster, parserjob from py_helpers import unbuffer_stdout, find_exec, file_len from blueprint import send_job, load_job, save_job, read_clust_conf unbuffer_stdout() @@ -39,7 +39,7 @@ parser = argparse.ArgumentParser(prog='agg_imp.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), - parents=[parserbase, parsercluster]) + parents=[parserbase, parsercluster, parserjob]) args, extra_args = parser.parse_known_args() diff --git a/bin/args_impute.py b/bin/args_impute.py index e603462..057d27b 100644 --- a/bin/args_impute.py +++ b/bin/args_impute.py @@ -89,9 +89,10 @@ help='gzipped file of reference information, with columns ' + \ '"id","position","a0","a1", and $popname, where $popname' + \ 'contains the allele frequency for the "a1" allele. Can ' + \ - 'include "###" in place of chromosome number (as in default).', + 'include "###" in place of chromosome number. Expected format ' + \ + 'is from 1000GP_Phase3_chr###.legend.gz files from IMPUTE.', required=False, - default='/humgen/atgu1/fs03/shared_resources/1kG/shapeit/1000GP_Phase3_chr###.legend.gz') + default='1000GP_Phase3_chr###.legend.gz') arg_shape.add_argument('--window', type=float, metavar='FLOAT', @@ -197,25 +198,25 @@ metavar='FILENAME', help='Genomic maps. To specify files split by chromosome, use "###" to indicate chromosome number (see default).', required=False, - default='/humgen/atgu1/fs03/shared_resources/1kG/shapeit/genetic_map/genetic_map_chr###_combined_b37.txt') + default='genetic_map_chr###_combined_b37.txt') arg_ref.add_argument('--ref-haps', type=str, metavar='FILENAME', help='Imputation reference .hap.gz file for shapeit and impute2. 
Can use "###" to indicate chromosome number (see default).', required=False, - default='/humgen/atgu1/fs03/shared_resources/1kG/shapeit/1000GP_Phase3_chr###.hap.gz') + default='1000GP_Phase3_chr###.hap.gz') arg_ref.add_argument('--ref-legs', type=str, metavar='FILENAME', help='Imputation reference .legend.gz file for shapeit and impute2. Can use "###" to indicate chromosome number (see default).', required=False, - default='/humgen/atgu1/fs03/shared_resources/1kG/shapeit/1000GP_Phase3_chr###.legend.gz') + default='1000GP_Phase3_chr###.legend.gz') arg_ref.add_argument('--ref-samps', type=str, metavar='FILENAME', help='Imputation reference .sample file for shapeit and impute2. Can use "###" to indicate chromosome number.', required=False, - default='/humgen/atgu1/fs03/shared_resources/1kG/shapeit/1000GP_Phase3.sample') + default='1000GP_Phase3.sample') ############ # @@ -296,7 +297,10 @@ help='Number of seconds to delay on start of cluster jobs', required=False, default=30) -arg_clust.add_argument('--full-pipe', + +parserjob = argparse.ArgumentParser(add_help=False) +arg_job = parserjob.add_argument_group('Job Submission Settings') +arg_job.add_argument('--full-pipe', action='store_true', help='Proceed through full imputation pipeline', required=False) diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 0990123..8e90f4c 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -36,7 +36,7 @@ import argparse from warnings import warn from textwrap import dedent -from args_impute import parserbase, parserbg, parsercluster +from args_impute import parserbase, parserbg, parsercluster, parserjob from py_helpers import unbuffer_stdout, find_exec, file_tail, link, warn_format, read_conf from blueprint import send_job, init_sendjob_dict, save_job, load_job, read_clust_conf unbuffer_stdout() @@ -49,7 +49,7 @@ parser = argparse.ArgumentParser(prog='bg_imp.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), - parents=[parserbase, parserbg, parsercluster]) + parents=[parserbase, parserbg, parsercluster, parserjob]) args, extra_args = parser.parse_known_args() @@ -180,7 +180,8 @@ print '\nCluster settings:' print '--sleep '+str(args.sleep) - +if args.full_pipe: + print '--full-pipe' ############# diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 151d871..7239008 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -28,7 +28,7 @@ import subprocess import argparse from textwrap import dedent -from args_impute import parserbase, parserimpute, parserref, parserchunk, parsercluster +from args_impute import parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob from py_helpers import unbuffer_stdout, file_len, link, find_exec, test_exec, read_conf from blueprint import send_job, read_clust_conf, init_sendjob_dict, save_job unbuffer_stdout() @@ -41,7 +41,7 @@ parser = argparse.ArgumentParser(prog='imp2_rel.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), - parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster]) + parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob]) args, extra_args = parser.parse_known_args() @@ -86,7 +86,8 @@ print '\nCluster settings:' print '--sleep '+str(args.sleep) - +if args.full_pipe: + print '--full-pipe' ############# diff --git a/bin/impute_rel.py b/bin/impute_rel.py index 254cfbe..f312917 100755 --- a/bin/impute_rel.py +++ b/bin/impute_rel.py @@ -27,7 +27,7 @@ import os import argparse -from args_impute import 
parserbase, parserphase, parserimpute, parserchunk, parserref, parserbg, parsercluster +from args_impute import parserbase, parserphase, parserimpute, parserchunk, parserref, arg_ref, parserbg, parsercluster from py_helpers import unbuffer_stdout from blueprint import send_job unbuffer_stdout() @@ -36,6 +36,15 @@ if not (('-h' in sys.argv) or ('--help' in sys.argv)): print '\n...Parsing arguments...' ############# + +arg_ref.add_argument('--ref-dir', + type=str, + metavar='DIRECTORY', + help='Directory containing imputation reference files (haps, legends, sample, and maps). ' + + 'Used as prefix for specifying full paths of --ref-maps, --ref-haps, --ref-legs, and --ref-samps', + required=False, + default=None) + parser = argparse.ArgumentParser(prog='impute_rel.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), @@ -43,10 +52,72 @@ args = parser.parse_args() +if args.ref_dir is not None: + # verify exists + assert os.path.isdir(args.ref_dir), "Failed to find imputation reference directory %s" % args.ref_dir + + # prepend to references accordingly + args.ref_maps = str(args.ref_dir) +'/' + args.ref_maps + args.ref_haps = str(args.ref_dir) +'/' + args.ref_haps + args.ref_legs = str(args.ref_dir) +'/' + args.ref_legs + args.ref_samps = str(args.ref_dir) +'/' + args.ref_samps + +# reference recommendation +def print_ref_rec(): + print '\nIf you do not have an imputation reference available, the 1000 Genomes' + print 'Phase 3 reference panel provided by IMPUTE is directly compatible with' + print 'picopili and broadly covers most major continental populations.' + print '\nDirect download:' + print 'wget https://mathgen.stats.ox.ac.uk/impute/1000GP_Phase3.tgz' + print '\nWARNING: download filesize is > 12 GB\n' + +# check these references exist +if not os.path.isfile(args.ref_maps.replace('###','1')): + print "Failed to verify genetic maps exist." + print_ref_rec() + raise IOError("No chr 1 genetic map: %s" % args.ref_maps.replace('###','1')) + +if not os.path.isfile(args.ref_haps.replace('###','1')): + print "Failed to verify reference haplotypes exist." + # print rec, since is possible have genetic map but not imputation panel + print_ref_rec() + raise IOError("No chr 1 reference haplotypes: %s" % args.ref_haps.replace('###','1')) + +if not os.path.isfile(args.ref_legs.replace('###','1')): + # not printing ref_rec here since at this point have verified haplotypes exist + raise IOError("Failed to verify reference legend files exist (tested for chr 1 at %s)" % args.ref_legs.replace('###','1')) + +if not os.path.isfile(args.ref_samps.replace('###','1')): + # not printing ref_rec here since at this point have verified haplotypes exist + raise IOError("Failed to verify reference sample file exists (tested for chr 1 at %s)" % args.ref_samps.replace('###','1')) + + +# more flexible handling for info file for shapeit, since could be external +if not os.path.isfile(args.ref_info.replace('###','1')): + + if args.ref_dir is not None and os.path.isfile(str(args.ref_dir) +'/' + args.ref_info.replace('###','1')): + args.ref_info = str(args.ref_dir) +'/' + args.ref_info + + else: + print "Reference information file for phasing not found (tested for chr 1: %s)." 
% args.ref_info.replace('###','1') + if args.ref_dir is not None: + print "Tried both relative path and in --ref-dir %s" % str(args.ref_dir) + + if args.ref_dir == "1000GP_Phase3_chr###.legend.gz": + print "For 1000 Genomes Phase 3 reference from IMPUTE the required file is " + print "the same as the reference legend.\n" + print "Maybe you wanted to add this\n?" + # verified above that the legend file exists + print "--ref-info %s\n" % args.ref_legs + + raise IOError("Failed to verify phasing info file exists (tested for chr 1 at %s)" % args.ref_info.replace('###','1')) + # TODO: full sanity check of the args here + + # print args print '\nBasic settings:' print '--bfile '+str(args.bfile) @@ -88,6 +159,7 @@ print '\nImputation Reference Files:' +print '--ref-dir '+str(args.ref_dir) print '--ref-maps '+str(args.ref_maps) print '--ref-haps '+str(args.ref_haps) print '--ref-legs '+str(args.ref_legs) diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 5943789..3e848fb 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -38,7 +38,7 @@ import os import subprocess import argparse -from args_impute import parserbase, parserphase, parserref, parsercluster +from args_impute import parserbase, parserphase, parserref, parsercluster, parserjob from py_helpers import unbuffer_stdout, link, find_exec, read_conf from blueprint import send_job unbuffer_stdout() @@ -52,7 +52,7 @@ parser = argparse.ArgumentParser(prog='shape_rel.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), - parents=[parserbase, parserphase, parserref, parsercluster]) + parents=[parserbase, parserphase, parserref, parsercluster, parserjob]) args, extra_args = parser.parse_known_args() @@ -92,7 +92,8 @@ print '--sleep '+str(args.sleep) print '--mem-req '+str(args.mem_req) print '--threads '+str(args.threads) - +if args.full_pipe: + print '--full-pipe' if str(args.addout) != '' and args.addout is not None: From a0c6d3e1387810fe9a00a19b68f14a8dd65c8d60 Mon Sep 17 00:00:00 2001 From: Raymond Walters Date: Fri, 21 Oct 2016 20:52:17 +0200 Subject: [PATCH 24/48] admix logging; pass relatedness to reap as kinship --- bin/admix_rel.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 4c8ada3..13206b7 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -325,11 +325,12 @@ -############# -print '\n...Running Admixture on unrelated dataset...' -############# - if run_admix: + + ############# + print '\n...Running Admixture on unrelated dataset...' 
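For orientation, the run_admix branch being introduced here reduces to one decision: reuse an ADMIXTURE .P file supplied with --admix-p, or run ADMIXTURE on the unrelated set to create one. A minimal sketch of that logic, under stated assumptions: get_unrel_P() and its arguments are hypothetical names rather than picopili code, and the ADMIXTURE invocation simply mirrors the admix_call assembled just below.

import subprocess

def get_unrel_P(admixture_ex, npops, unrel_bfile=None, admix_p=None):
    # reuse a precomputed allele-frequency (.P) file if one was supplied
    if admix_p is not None and admix_p != "":
        return admix_p
    # otherwise estimate admixture in the unrelated subset;
    # ADMIXTURE writes <stem>.<K>.P and <stem>.<K>.Q in the working directory
    with open('admix_unrel.log', 'w') as log:
        subprocess.check_call([admixture_ex, unrel_bfile + '.bed', str(npops)],
                              stdout=log)
    return '%s.%d.P' % (unrel_bfile, int(npops))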
+ ############# + admix_call = [args.admixture_ex, str(args.unrel_bfile+'.bed'), str(args.npops), @@ -485,7 +486,7 @@ def maxpop(props, names, th): admix_project_call = [args.admixture_ex, - '-P', str(args.target_bfile+'.bed'), + '-P', str(args.target_bfile)+'.bed', str(args.npops), '-j'+str(args.multithread_cores)] admix_target_log = open(str('admix_'+args.out+'_target.log'), 'w') @@ -559,7 +560,7 @@ def maxpop(props, names, th): '-r', str(2), '-k', str(args.npops), '-m', - '-t', str(args.min_rel)] + '-t', str(float(args.min_rel/2.0))] reap_log = open(str('reap_' + args.out + '.log'), 'w') print str(' '.join(reap_call)) From c38c77ff708cd062d97ed3c64ead2b2a17ac3743 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Tue, 25 Oct 2016 11:28:26 -0400 Subject: [PATCH 25/48] fix curly braces in job templates --- bin/bg_imp.py | 16 +++++++++------- bin/gwas_rel.py | 24 ++++++++++++++---------- bin/imp2_rel.py | 18 +++++++++++------- 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 8e90f4c..2b214e5 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -374,10 +374,10 @@ # best-guess job script for each chunk bg_templ = dedent("""\ - cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` - cchr=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` + cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}` + cchr=`awk -v a={task} 'NR==a+1{cbopen}print $1{cbclose}' {cfile}` - {plink_ex} --gen {gen_in} --sample {samp_in} --oxford-single-chr ${{cchr}} --oxford-pheno-name plink_pheno --hard-call-threshold {hard_call_th} --missing-code -9,NA,na --allow-no-sex --silent --memory 4000 --out {out_str} + {plink_ex} --gen {gen_in} --sample {samp_in} --oxford-single-chr ${cbopen}cchr{cbclose} --oxford-pheno-name plink_pheno --hard-call-threshold {hard_call_th} --missing-code -9,NA,na --allow-no-sex --silent --memory 4000 --out {out_str} sleep {sleep} # note: Mendel errors checked after --update-parents, see https://www.cog-genomics.org/plink2/order @@ -392,7 +392,7 @@ rm {out_str2}.bim rm {out_str2}.fam - {rs_ex} --chunk ${{cname}} --name {outdot} --imp-dir {imp_dir} --fam-trans {trans} + {rs_ex} --chunk ${cbopen}cname{cbclose} --name {outdot} --imp-dir {imp_dir} --fam-trans {trans} """) # get number of chunks @@ -403,8 +403,8 @@ "sleep": str(args.sleep), "cfile": str(outdot)+'.chunks.txt', "plink_ex": str(plink_ex), - "gen_in": str(imp_dir)+'/'+str(outdot)+'.imp.${cname}.gz', - "samp_in": str(shape_dir)+'/'+str(outdot)+'.chr${cchr}.phased.sample', + "gen_in": str(imp_dir)+'/'+str(outdot)+'.imp.${{cname}}.gz', + "samp_in": str(shape_dir)+'/'+str(outdot)+'.chr${{cchr}}.phased.sample', "hard_call_th": str(hard_call_th), "out_str": str(outdot)+'.bg.${cname}', "mendel_txt": str(mendel_txt), @@ -418,7 +418,9 @@ "outdot": str(outdot), "imp_dir": str(imp_dir), "idnum": str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam', - "trans": str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam.transl' + "trans": str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam.transl', + "cbopen":'{{', + "cbclose":'}}', } diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 9bb199e..4b26a09 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -149,7 +149,7 @@ clust_conf = read_clust_conf() # TODO: here - +# TODO: move to before logging @@ -400,21 +400,23 @@ def find_chunk(snpchrom, snpbp, last_chunk): ###################### # basic template, depending on model +# cbopen/cbclose are placeholders for real curly braces, +# to survive .format() here and in send_job if args.model == 'gee' or 
args.model == 'dfam': gwas_templ = dedent("""\ - cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` + cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}` {misc} - {gwas_ex} --bfile {bfile} --out {argout} --extract {outdot}.snps.${{cname}}.txt {optargs} + {gwas_ex} --bfile {bfile} --out {argout} --extract {outdot}.snps.${cbopen}cname{cbclose}.txt {optargs} """) elif args.model == 'gmmat' or args.model == 'gmmat-fam': gwas_templ = dedent("""\ - cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` - chrnum=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` + cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}` + chrnum=`awk -v a={task} 'NR==a+1{cbopen}print $1{cbclose}' {cfile}` - {plinkx} --bfile {bfile} --extract {outdot}.snps.${{cname}}.txt {optargs} --make-bed --out {outdot}.${{cname}} + {plinkx} --bfile {bfile} --extract {outdot}.snps.${cbopen}cname{cbclose}.txt {optargs} --make-bed --out {outdot}.${cbclose}cname{cbopen} - {rsc} --no-save --no-restore {gwas_ex} {outdot}.${{cname}} grm.{outdot}.loco_chr${{chrnum}}.rel.gz {covarsub} {outdot}.${{cname}} > {outdot}.${{cname}}.gmmat.R.log + {rsc} --no-save --no-restore {gwas_ex} {outdot}.${cbopen}cname{cbclose} grm.{outdot}.loco_chr${cbopen}chrnum{cbclose}.rel.gz {covarsub} {outdot}.${cbopen}cname{cbclose} > {outdot}.${cbopen}cname{cbclose}.gmmat.R.log """) # alternative template for GMMAT @@ -439,9 +441,9 @@ def find_chunk(snpchrom, snpbp, last_chunk): # model-specific arguments not passed for gmmat if args.model == 'gee' or args.model == 'dfam': if args.addout is not None: - gwasargs = gwasargs + ' --addout '+str(args.addout)+'.${cname}' + gwasargs = gwasargs + ' --addout '+str(args.addout)+'.${{cname}}' else: - gwasargs = gwasargs + ' --addout ${cname}' + gwasargs = gwasargs + ' --addout ${{cname}}' if args.covar is not None: gwasargs = gwasargs + ' --covar '+str(args.covar) if args.covar_number is not None: @@ -472,7 +474,9 @@ def find_chunk(snpchrom, snpbp, last_chunk): "optargs": str(gwasargs), "plinkx": str(plinkx), "covarsub": str(args.covar)+'.sub.txt', - "rsc": str(args.rscript_ex) + "rsc": str(args.rscript_ex), + "cbopen":'{{', + "cbclose":'}}', } nchunk = len(chunks.keys()) diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 7239008..23fda59 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -185,7 +185,7 @@ # with "chr_list" to get have adaptive chromosome list cmd_templ = dedent("""\ chrs=({chr_list}) - chrom=${{chrs[{task}-1]}} + chrom=${cbopen}chrs[{task}-1]{cbclose} {shape_ex} {bed} {map} {ref} {window} {duo_txt} {thread_str} {seed_str} {outmax} {shapelog} """) @@ -222,6 +222,8 @@ "seed_str": '--seed '+str(extra_args.shape_seed), "outmax": '--output-max '+str(outstem)+'.phased.haps '+str(outstem)+'.phased.sample', "shapelog": str(outstem)+'.shape.resub_'+str(num_chr)+'.log', + "cbopen":'{{', + "cbclose":'}}', } shape_cmd = cmd_templ.format(**jobdict) @@ -326,12 +328,12 @@ # job script imp_templ = dedent("""\ - cchr=`awk -v a={task} 'NR==a+1{{print $1}}' {cfile}` - cstart=`awk -v a={task} 'NR==a+1{{print $2}}' {cfile}` - cend=`awk -v a={task} 'NR==a+1{{print $3}}' {cfile}` - cname=`awk -v a={task} 'NR==a+1{{print $4}}' {cfile}` + cchr=`awk -v a={task} 'NR==a+1{cbopen}print $1{cbclose}' {cfile}` + cstart=`awk -v a={task} 'NR==a+1{cbopen}print $2{cbclose}' {cfile}` + cend=`awk -v a={task} 'NR==a+1{cbopen}print $3{cbclose}' {cfile}` + cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}` - {impute_ex} -use_prephased_g -known_haps_g {in_haps} -h {ref_haps} -l {ref_leg} -m {map} -int 
${{cstart}} ${{cend}} -buffer {buffer} -Ne {Ne} -allow_large_regions -o_gz -o {out} {seedtxt} + {impute_ex} -use_prephased_g -known_haps_g {in_haps} -h {ref_haps} -l {ref_leg} -m {map} -int ${cbopen}cstart{cbclose} ${cbopen}cend{cbclose} -buffer {buffer} -Ne {Ne} -allow_large_regions -o_gz -o {out} {seedtxt} """) # fill in template @@ -345,7 +347,9 @@ "Ne": str(args.Ne), "buffer": str(args.buffer), "out": str(outdot)+'.imp.${cname}', - "seedtxt": str(seedtxt) + "seedtxt": str(seedtxt), + "cbopen":'{{', + "cbclose":'}}', } From e2567de0599c14d66ee9442f55e6596c98dd9a5e Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 10 Nov 2016 11:52:23 -0500 Subject: [PATCH 26/48] bugfix args formatting in bg send_job --- bin/bg_imp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 2b214e5..81ac5ec 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -117,7 +117,7 @@ info_txt = '' else: # init, then add thresholds - info_txt = '--qual-scores '+str(imp_dir)+'/'+str(outdot)+'.imp.${cname}_info' +' 5 2 1' + info_txt = '--qual-scores '+str(imp_dir)+'/'+str(outdot)+'.imp.${{cname}}_info' +' 5 2 1' # minimum info if args.info_th >= 0.0 and args.info_th <= 1.0: info_txt = info_txt + ' --qual-threshold '+str(args.info_th) @@ -406,14 +406,14 @@ "gen_in": str(imp_dir)+'/'+str(outdot)+'.imp.${{cname}}.gz', "samp_in": str(shape_dir)+'/'+str(outdot)+'.chr${{cchr}}.phased.sample', "hard_call_th": str(hard_call_th), - "out_str": str(outdot)+'.bg.${cname}', + "out_str": str(outdot)+'.bg.${{cname}}', "mendel_txt": str(mendel_txt), "info_txt": str(info_txt), - "out_str2": str(outdot)+'.bg.tmp.${cname}', + "out_str2": str(outdot)+'.bg.tmp.${{cname}}', "maf_txt": str(maf_txt), "mac_txt": str(mac_txt), "geno_txt": str(geno_txt), - "out_str_filt": str(outdot)+'.bg.filtered.${cname}', + "out_str_filt": str(outdot)+'.bg.filtered.${{cname}}', "rs_ex": str(rs_ex), "outdot": str(outdot), "imp_dir": str(imp_dir), From 807f20f2d807ae09040a2d6083e7a7b8695bafa4 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 10 Nov 2016 11:53:30 -0500 Subject: [PATCH 27/48] bugfix arg format, adjust agg mem request in gwas_rel --- bin/gwas_rel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/gwas_rel.py b/bin/gwas_rel.py index 4b26a09..466f239 100755 --- a/bin/gwas_rel.py +++ b/bin/gwas_rel.py @@ -414,7 +414,7 @@ def find_chunk(snpchrom, snpbp, last_chunk): cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}` chrnum=`awk -v a={task} 'NR==a+1{cbopen}print $1{cbclose}' {cfile}` - {plinkx} --bfile {bfile} --extract {outdot}.snps.${cbopen}cname{cbclose}.txt {optargs} --make-bed --out {outdot}.${cbclose}cname{cbopen} + {plinkx} --bfile {bfile} --extract {outdot}.snps.${cbopen}cname{cbclose}.txt {optargs} --make-bed --out {outdot}.${cbopen}cname{cbclose} {rsc} --no-save --no-restore {gwas_ex} {outdot}.${cbopen}cname{cbclose} grm.{outdot}.loco_chr${cbopen}chrnum{cbclose}.rel.gz {covarsub} {outdot}.${cbopen}cname{cbclose} > {outdot}.${cbopen}cname{cbclose}.gmmat.R.log """) @@ -568,7 +568,7 @@ def find_chunk(snpchrom, snpbp, last_chunk): send_job(jobname='agg_'+str(outdot), cmd=' '.join(agg_call), logname=agg_log, - mem=4000, + mem=8000, walltime=30, wait_name='gwas.chunks.'+str(outdot), wait_num=str(jobres).strip(), From 6361099006bd251407de3f6dbfc7bc7c2add50d9 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 10 Nov 2016 11:54:31 -0500 Subject: [PATCH 28/48] more detailed logging of missing/broken chunks in gwas agg --- bin/agg_gwas.py | 17 
+++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/bin/agg_gwas.py
index d456758..87c28ee 100755
--- a/bin/agg_gwas.py
+++ b/bin/agg_gwas.py
@@ -111,6 +111,9 @@

+###############
+print '\n...Checking for missing or incomplete chunks...'
+###############

 # read chunk def file
 chunks = {}
@@ -138,12 +141,15 @@

     # record chunks with no/partial/broken output
     if not os.path.isfile(ch_out):
+        print 'Output not found for %s' % str(ch_out)
         mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
-    elif file_len(ch_out) != file_len(str(outdot)+'.snps.'+str(chname)+'.txt'):
+    elif file_len(ch_out) < file_len(str(outdot)+'.snps.'+str(chname)+'.txt'):
+        print 'Output file %s is incomplete' % str(ch_out)
         mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]
     else:
         ft = file_tail(ch_out)
         if len(ft.split()) != out_len:
+            print 'Last line of output file %s is incomplete' % str(ch_out)
             mis_chunks[str(chname)] = [str(chrom), int(start), int(end)]

@@ -154,7 +160,7 @@
 ###############
 if len(mis_chunks) > 0:
     nummiss = len(mis_chunks)
-    print 'Missing results for %d GWAS jobs. Preparing to resubmit...' % nummiss
+    print '\nMissing results for %d GWAS jobs. Preparing to resubmit...' % nummiss

     # just missing chunks for task array
     # fail if already tried
@@ -244,7 +250,7 @@
     send_job(jobname='agg_'+str(outdot),
              cmd=' '.join(sys.argv[:]),
              logname=agg_log,
-             mem=24000,
+             mem=16000,
              walltime=30,
              wait_name='gwas.chunks.'+str(outdot)+'.resub_'+str(nummiss),
              wait_num=str(jobres).strip(),
@@ -258,6 +264,7 @@

 ###############
 # if no missing chunks, proceed collecting info for aggregation
+print '\n...Loading auxiliary information...'
 ###############

 # chnames = chunks.keys()
@@ -339,7 +346,9 @@
     out_file.write('\t'.join(out_head) + '\n')
     filt_file.write('\t'.join(filt_head) + '\n')

-print 'starting chunk loop'
+###############
+print '\n...Aggregating GWAS results from chunks...'
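As an aside on the chunk checks added above: the three conditions (no output file, fewer result lines than the chunk's SNP list, truncated final line) amount to a single completeness test per chunk. A minimal sketch, reusing the file_len() and file_tail() helpers that py_helpers already provides; the is_complete() wrapper itself is hypothetical, not part of picopili.

import os
from py_helpers import file_len, file_tail

def is_complete(ch_out, snp_list, out_len):
    # no output written for this chunk
    if not os.path.isfile(ch_out):
        return False
    # fewer result lines than SNPs assigned to the chunk
    if file_len(ch_out) < file_len(snp_list):
        return False
    # job likely died mid-write: last line has the wrong number of fields
    if len(file_tail(ch_out).split()) != out_len:
        return False
    return True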
+###############
 # loop chunks to aggregate
 for ch in chnames:
     # open output file

From a9d7734f114070c72bb676df1a9c1339e6725558 Mon Sep 17 00:00:00 2001
From: rkwalters
Date: Thu, 10 Nov 2016 11:55:42 -0500
Subject: [PATCH 29/48] dont expect .eval output from pca fastmode

---
 bin/imus_pca.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/imus_pca.py
index 765aff2..5afbc39 100755
--- a/bin/imus_pca.py
+++ b/bin/imus_pca.py
@@ -459,7 +459,7 @@
     subprocess.check_call(["tar", "-zcvf",
                            args.out + '.pca_files.tar.gz',
                            args.bfile + '.pca.par',
-                           args.bfile + '.pca.eval.txt',
+#                           args.bfile + '.pca.eval.txt',
                            args.bfile + '.pca.snpw.txt',
                            args.bfile + '.pca.raw.txt',
                            args.bfile + '.pca.refpoplist.txt',
@@ -470,7 +470,7 @@
     # remove successfully zipped files
     subprocess.check_call(["rm",
                            args.bfile + '.pca.par',
-                           args.bfile + '.pca.eval.txt',
+#                           args.bfile + '.pca.eval.txt',
                            args.bfile + '.pca.snpw.txt',
                            args.bfile + '.pca.raw.txt',
                            args.bfile + '.pca.refpoplist.txt',

From 0a254d7f08069aa07b2cf116c37ce51aca1e3a5d Mon Sep 17 00:00:00 2001
From: rkwalters
Date: Tue, 29 Nov 2016 17:16:02 -0500
Subject: [PATCH 30/48] ensure allele frequency reported for intended allele

---
 bin/agg_gwas.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/bin/agg_gwas.py
index 87c28ee..5df1429 100755
--- a/bin/agg_gwas.py
+++ b/bin/agg_gwas.py
@@ -297,11 +297,17 @@
 print 'bim loaded'

 # frq.cc
-# for both: maf_a, maf_u, n_a, n_u
+# for both:
+# - maf_a = frq in affected (cases)
+# - maf_u = frq in unaffected (controls)
+# - n_a = number affected (cases)
+# - n_u = number unaffected (controls)
+# - freq_a1 = a1 used for freq
 maf_a_info = {}
 maf_u_info = {}
 n_a_info = {}
 n_u_info = {}
+freq_a1 = {}

 frq = open(args.freq_file, 'r')
 dumphead = frq.readline()
@@ -311,6 +317,7 @@
     maf_u_info[str(snp)] = mafu
     n_a_info[str(snp)] = int(nchra) / 2
     n_u_info[str(snp)] = int(nchru) / 2
+    freq_a1[str(snp)] = a1

 frq.close()
 print 'frq loaded'
@@ -378,8 +385,13 @@
         (chrom, snp, cm, bp, a1, a2, n, af2, scoretest, scorevar, p) = line.split()

         # get meta info
-        frqa = maf_a_info.pop(str(snp))
-        frqu = maf_u_info.pop(str(snp))
+        # verify use freq of correct allele
+        if str(frq_a1.pop(str(snp))) == str(a1):
+            frqa = maf_a_info.pop(str(snp))
+            frqu = maf_u_info.pop(str(snp))
+        else:
+            frqa = 1 - maf_a_info.pop(str(snp))
+            frqu = 1 - maf_u_info.pop(str(snp))
         na = n_a_info.pop(str(snp))
         nu = n_u_info.pop(str(snp))

From f72a8919928d50da243d971d62c8c76cd8d38c09 Mon Sep 17 00:00:00 2001
From: rkwalters
Date: Mon, 5 Dec 2016 18:02:29 -0500
Subject: [PATCH 31/48] fix specifying reference files with directory, imp_prep cluster

---
 bin/imp2_rel.py  | 19 ++++++++++++++++++-
 bin/imp_prep.pl  |  2 +-
 bin/shape_rel.py | 22 ++++++++++++++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/bin/imp2_rel.py
index 23fda59..e55a71b 100755
--- a/bin/imp2_rel.py
+++ b/bin/imp2_rel.py
@@ -42,7 +42,16 @@
                                  formatter_class=lambda prog:
                                  argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40),
                                  parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob])
-
+
+arg_ref.add_argument('--ref-dir',
+                    type=str,
+                    metavar='DIRECTORY',
+                    help='Directory containing imputation reference files (haps, legends, sample, and maps). 
' + + 'Used as prefix for specifying full paths of --ref-maps, --ref-haps, --ref-legs, and --ref-samps', + required=False, + default=None) + + args, extra_args = parser.parse_known_args() @@ -111,7 +120,15 @@ chunker_ex = rp_bin+'/chunk_snps.py' test_exec(chunker_ex) +if args.ref_dir is not None: + # verify exists + assert os.path.isdir(args.ref_dir), "Failed to find imputation reference directory %s" % args.ref_dir + # prepend to references accordingly + args.ref_maps = str(args.ref_dir) +'/' + args.ref_maps + args.ref_haps = str(args.ref_dir) +'/' + args.ref_haps + args.ref_legs = str(args.ref_dir) +'/' + args.ref_legs + args.ref_samps = str(args.ref_dir) +'/' + args.ref_samps # TODO: here diff --git a/bin/imp_prep.pl b/bin/imp_prep.pl index a707037..69becf5 100755 --- a/bin/imp_prep.pl +++ b/bin/imp_prep.pl @@ -57,7 +57,7 @@ ############################# my $ploc = &trans("p2loc"); -my $qloc = &trans("queue"); +my $qloc = &trans("cluster"); my $email = &trans("email"); my $email_on = 0; diff --git a/bin/shape_rel.py b/bin/shape_rel.py index 3e848fb..c26163c 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -54,6 +54,15 @@ argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), parents=[parserbase, parserphase, parserref, parsercluster, parserjob]) +arg_ref.add_argument('--ref-dir', + type=str, + metavar='DIRECTORY', + help='Directory containing imputation reference files (haps, legends, sample, and maps). ' + + 'Used as prefix for specifying full paths of --ref-maps, --ref-haps, --ref-legs, and --ref-samps', + required=False, + default=None) + + args, extra_args = parser.parse_known_args() # other settings @@ -87,6 +96,7 @@ print '--ref-haps '+str(args.ref_haps) print '--ref-legs '+str(args.ref_legs) print '--ref-samps '+str(args.ref_samps) +print '--ref-dir '+str(args.ref_dir) print '\nJob Submission:' print '--sleep '+str(args.sleep) @@ -110,6 +120,18 @@ plinkx = find_exec('plink',key='p2loc') shapeit_ex = find_exec('shapeit',key='shloc') + +if args.ref_dir is not None: + # verify exists + assert os.path.isdir(args.ref_dir), "Failed to find imputation reference directory %s" % args.ref_dir + + # prepend to references accordingly + args.ref_maps = str(args.ref_dir) +'/' + args.ref_maps + args.ref_haps = str(args.ref_dir) +'/' + args.ref_haps + args.ref_legs = str(args.ref_dir) +'/' + args.ref_legs + args.ref_samps = str(args.ref_dir) +'/' + args.ref_samps + + # TODO: here From c59be8b209278700b9629b0c8187c1b82533af1c Mon Sep 17 00:00:00 2001 From: rkwalters Date: Mon, 5 Dec 2016 18:31:38 -0500 Subject: [PATCH 32/48] fix directory for shapeit ref info --- bin/imp2_rel.py | 2 +- bin/shape_rel.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index e55a71b..51a4afb 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -43,7 +43,7 @@ argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob]) -arg_ref.add_argument('--ref-dir', +parser.add_argument('--ref-dir', type=str, metavar='DIRECTORY', help='Directory containing imputation reference files (haps, legends, sample, and maps). 
' + diff --git a/bin/shape_rel.py b/bin/shape_rel.py index c26163c..eab62b1 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -54,7 +54,7 @@ argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), parents=[parserbase, parserphase, parserref, parsercluster, parserjob]) -arg_ref.add_argument('--ref-dir', +parser.add_argument('--ref-dir', type=str, metavar='DIRECTORY', help='Directory containing imputation reference files (haps, legends, sample, and maps). ' + @@ -130,6 +130,7 @@ args.ref_haps = str(args.ref_dir) +'/' + args.ref_haps args.ref_legs = str(args.ref_dir) +'/' + args.ref_legs args.ref_samps = str(args.ref_dir) +'/' + args.ref_samps + args.ref_info = str(args.ref_dir) +'/' + args.ref_info # TODO: here From e97bc58bc6cf574cccd7473783bfa7d79cc141c1 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Mon, 5 Dec 2016 19:58:06 -0500 Subject: [PATCH 33/48] remove unused refs to hmloc --- bin/checkflip_pico.pl | 1 - bin/checkpos_pico.pl | 1 - 2 files changed, 2 deletions(-) diff --git a/bin/checkflip_pico.pl b/bin/checkflip_pico.pl index 679a4b0..5c9bb08 100755 --- a/bin/checkflip_pico.pl +++ b/bin/checkflip_pico.pl @@ -51,7 +51,6 @@ sub trans { } my $sloc = &trans("sloc"); -my $hmloc = &trans("hmloc"); my $p2loc = &trans("p2loc"); diff --git a/bin/checkpos_pico.pl b/bin/checkpos_pico.pl index 5bd8116..6a162ce 100755 --- a/bin/checkpos_pico.pl +++ b/bin/checkpos_pico.pl @@ -52,7 +52,6 @@ sub trans { } my $sloc = &trans("sloc"); -my $hmloc = &trans("hmloc"); my $p2loc = &trans("p2loc"); From 5ee5edfc4e3b4bc0dced47825177845268843af8 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Mon, 5 Dec 2016 19:59:04 -0500 Subject: [PATCH 34/48] fix broken test_exec calls --- bin/imp2_rel.py | 2 +- bin/py_helpers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index 51a4afb..bf8be27 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -118,7 +118,7 @@ # (to get absolute path for scripts) rp_bin = os.path.dirname(os.path.realpath(__file__)) chunker_ex = rp_bin+'/chunk_snps.py' -test_exec(chunker_ex) +test_exec(chunker_ex,'picopili chunking script') if args.ref_dir is not None: # verify exists diff --git a/bin/py_helpers.py b/bin/py_helpers.py index f50c4fd..dd7c89b 100644 --- a/bin/py_helpers.py +++ b/bin/py_helpers.py @@ -101,7 +101,7 @@ def find_exec(prog, key=None): print "Failed to find config file %s. Will search for %s on path." 
% (str(conffile), str(prog)) exloc = find_from_path(str(prog),str(prog)) - test_exec(exloc) + test_exec(exloc,str(prog)) return exloc From 1ba6dd95f5e3d07f9022381f3f00d3ebc2ab7ab3 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Tue, 13 Dec 2016 23:47:48 -0500 Subject: [PATCH 35/48] more imputation job arg/templating fixes --- bin/imp2_rel.py | 29 ++++++++++++++--------------- bin/shape_rel.py | 12 ++++++------ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/bin/imp2_rel.py b/bin/imp2_rel.py index bf8be27..8bf279e 100755 --- a/bin/imp2_rel.py +++ b/bin/imp2_rel.py @@ -28,7 +28,7 @@ import subprocess import argparse from textwrap import dedent -from args_impute import parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob +from args_impute import parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob, parserphase from py_helpers import unbuffer_stdout, file_len, link, find_exec, test_exec, read_conf from blueprint import send_job, read_clust_conf, init_sendjob_dict, save_job unbuffer_stdout() @@ -41,7 +41,7 @@ parser = argparse.ArgumentParser(prog='imp2_rel.py', formatter_class=lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, max_help_position=40), - parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob]) + parents=[parserbase, parserimpute, parserref, parserchunk, parsercluster, parserjob, parserphase]) parser.add_argument('--ref-dir', type=str, @@ -161,7 +161,6 @@ haps_out = str(shape_dir)+'/'+str(outdot)+'.chr'+str(chrom)+'.phased.haps' samp_out = str(shape_dir)+'/'+str(outdot)+'.chr'+str(chrom)+'.phased.sample' - if not os.path.isfile(haps_out): bad_chr.append(chrom) elif not os.path.isfile(samp_out): @@ -180,7 +179,7 @@ exit(1) # else continue to resub print 'Preparing to resubmit...' 
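The recurring '${{...}}' and cbopen/cbclose edits in these job-template fixes all follow from one str.format() rule: literal braces must be written as '{{' / '}}', and because these templates are formatted twice (once when the calling script builds the command, and once more inside send_job, as the comment in gwas_rel.py notes), the shell's awk and variable braces must still be doubled after the first pass. A standalone sketch with a toy template, not one of the picopili job scripts:

from textwrap import dedent

# toy template; {task} is deliberately left for the second pass, and
# {cbopen}/{cbclose} are filled with doubled braces so they survive it too
templ = dedent("""\
    cname=`awk -v a={task} 'NR==a+1{cbopen}print $4{cbclose}' {cfile}`
    echo chunk ${cbopen}cname{cbclose}
    """)

# first pass (in the calling script): fills {cfile}, keeps {task}, leaves '{{'/'}}'
stage1 = templ.format(task='{task}', cfile='chunks.txt', cbopen='{{', cbclose='}}')

# second pass (inside the job submission layer): fills {task} and
# collapses '{{'/'}}' into the literal braces the shell script needs
stage2 = stage1.format(task='$SGE_TASK_ID')

print(stage2)
# cname=`awk -v a=$SGE_TASK_ID 'NR==a+1{print $4}' chunks.txt`
# echo chunk ${cname}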
- # note: assuming required shapeit args will be in extra_args + # note: assuming required shapeit args will be in args # if running under --full-pipe # TODO: add check on this # (mem_req, threads, no_duohmm, window, shape_seed) @@ -221,7 +220,7 @@ # manage additional arg pieces chrstem = str(args.bfile)+'.hg19.ch.fl.chr${chrom}' outstem = str(outdot)+'.chr${chrom}' - if extra_args.no_duohmm: + if args.no_duohmm: duo_txt = '' else: duo_txt = '--duohmm' @@ -233,10 +232,10 @@ "bed": '--input-bed '+str(chrstem)+'.bed '+str(chrstem)+'.bim '+str(chrstem)+'.fam', "map": '--input-map '+str(args.ref_maps).replace('###','${chrom}'), "ref": '--input-ref '+str(args.ref_haps).replace('###','${chrom}')+' '+str(args.ref_legs).replace('###','${chrom}')+' '+str(args.ref_samps).replace('###','${chrom}'), - "window": '--window '+str(extra_args.window), + "window": '--window '+str(args.window), "duo_txt": str(duo_txt), - "thread_str": '--thread '+str(extra_args.threads), - "seed_str": '--seed '+str(extra_args.shape_seed), + "thread_str": '--thread '+str(args.threads), + "seed_str": '--seed '+str(args.shape_seed), "outmax": '--output-max '+str(outstem)+'.phased.haps '+str(outstem)+'.phased.sample', "shapelog": str(outstem)+'.shape.resub_'+str(num_chr)+'.log', "cbopen":'{{', @@ -248,10 +247,10 @@ jobres = send_job(jobname='shape.'+str(outdot)+'.resub_'+str(num_chr), cmd=shape_cmd, logname='shape.'+str(outdot)+'.resub_'+str(num_chr)+'.sub.'+str(clust_conf['log_task_id'])+'.log', - mem=int(extra_args.mem_req)*1000, + mem=int(args.mem_req)*1000, walltime=30, njobs=int(num_chr), - threads=extra_args.threads, + threads=args.threads, sleep=args.sleep) print 'Pre-phasing jobs re-submitted for %d chromosomes.\n' % num_chr @@ -357,13 +356,13 @@ jobdict = {"task": "{task}", "cfile": str(outdot)+'.chunks.txt', "impute_ex": str(impute_ex), - "in_haps": str(shape_dir)+'/'+str(outdot)+'.chr${cchr}.phased.haps', - "ref_haps": str(args.ref_haps).replace('###','${cchr}'), - "ref_leg": str(args.ref_legs).replace('###','${cchr}'), - "map": str(args.ref_maps).replace('###','${cchr}'), + "in_haps": str(shape_dir)+'/'+str(outdot)+'.chr${{cchr}}.phased.haps', + "ref_haps": str(args.ref_haps).replace('###','${{cchr}}'), + "ref_leg": str(args.ref_legs).replace('###','${{cchr}}'), + "map": str(args.ref_maps).replace('###','${{cchr}}'), "Ne": str(args.Ne), "buffer": str(args.buffer), - "out": str(outdot)+'.imp.${cname}', + "out": str(outdot)+'.imp.${{cname}}', "seedtxt": str(seedtxt), "cbopen":'{{', "cbclose":'}}', diff --git a/bin/shape_rel.py b/bin/shape_rel.py index eab62b1..7e1391a 100755 --- a/bin/shape_rel.py +++ b/bin/shape_rel.py @@ -296,12 +296,12 @@ duo_txt = '--duohmm' # TODO: handle empty chromosomes -chrstem = str(args.bfile)+'.hg19.ch.fl.chr\$tasknum' -outstem = str(outdot)+'.chr\$tasknum' -map_arg = str(args.ref_maps).replace('###','\$tasknum') -hap_arg = str(args.ref_haps).replace('###','\$tasknum') -leg_arg = str(args.ref_legs).replace('###','\$tasknum') -samp_arg = str(args.ref_samps).replace('###','\$tasknum') +chrstem = str(args.bfile)+'.hg19.ch.fl.chr{task}' +outstem = str(outdot)+'.chr{task}' +map_arg = str(args.ref_maps).replace('###','{task}') +hap_arg = str(args.ref_haps).replace('###','{task}') +leg_arg = str(args.ref_legs).replace('###','{task}') +samp_arg = str(args.ref_samps).replace('###','{task}') shape_call = [shapeit_ex, '--input-bed', chrstem+'.bed', chrstem+'.bim', chrstem+'.fam', From 92173c095a4a29986c9915e614e1451c8dcfb99f Mon Sep 17 00:00:00 2001 From: rkwalters Date: Tue, 13 Dec 2016 23:48:47 
-0500 Subject: [PATCH 36/48] fix resub of array jobs with 1 task remaining --- bin/agg_gwas.py | 3 ++- bin/agg_imp.py | 3 ++- bin/bg_imp.py | 3 ++- bin/blueprint.py | 9 +++++---- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index 5df1429..a19bf7f 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -236,7 +236,8 @@ walltime=sendjob_dict['walltime'], njobs=sendjob_dict['njobs'], maxpar=sendjob_dict['maxpar'], - sleep=sendjob_dict['sleep']) + sleep=sendjob_dict['sleep'], + forcearray=True) print 'GWAS jobs resubmitted for %d chunks.\n' % nummiss diff --git a/bin/agg_imp.py b/bin/agg_imp.py index 80a01a3..fc9bc0d 100755 --- a/bin/agg_imp.py +++ b/bin/agg_imp.py @@ -213,7 +213,8 @@ mem=sendjob_dict['mem'], walltime=sendjob_dict['walltime'], njobs=sendjob_dict['njobs'], - sleep=sendjob_dict['sleep']) + sleep=sendjob_dict['sleep'], + forcearray=True) print 'Best-guess jobs resubmitted for %d chunks.\n' % nummiss diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 81ac5ec..511e058 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -328,7 +328,8 @@ mem=sendjob_dict['mem'], walltime=sendjob_dict['walltime'], njobs=sendjob_dict['njobs'], - sleep=sendjob_dict['sleep']) + sleep=sendjob_dict['sleep'], + forcearray=True) print 'GWAS jobs resubmitted for %d chunks.\n' % nummiss diff --git a/bin/blueprint.py b/bin/blueprint.py index 352a555..62ffe48 100755 --- a/bin/blueprint.py +++ b/bin/blueprint.py @@ -31,7 +31,8 @@ def send_job(jobname, wait_num=None, cluster=None, sleep=30, - testonly=False): + testonly=False, + forcearray=False): # validate args if arrayfile is None and cmd is None: @@ -121,7 +122,7 @@ def send_job(jobname, # for single jobs - if cmd is not None and (njobs is None or njobs <= 1): + if cmd is not None and (njobs is None or njobs <= 1) and not forcearray: njobs = 1 tot_threads = int(threads) @@ -272,7 +273,7 @@ def send_job(jobname, sub_file.close() # finalize or remove optional lines - if njobs <= 1: + if njobs <= 1 and not forcearray: subprocess.check_call(['sed','-i','/^::PICO_ARRAY_ONLY::/d',str(sub_file.name)]) else: subprocess.check_call(['sed','-i','s/^::PICO_ARRAY_ONLY:://',str(sub_file.name)]) @@ -282,7 +283,7 @@ def send_job(jobname, else: subprocess.check_call(['sed','-i','s/^::PICO_THREAD_ONLY:://',str(sub_file.name)]) - if njobs <= 1 and threads <= 1: + if njobs <= 1 and not forcearray and threads <= 1: subprocess.check_call(['sed','-i','/^::PICO_THREADARRAY_ONLY::/d',str(sub_file.name)]) else: subprocess.check_call(['sed','-i','s/^::PICO_THREADARRAY_ONLY:://',str(sub_file.name)]) From d34f45110325199501e01b38fed18bf600543b9c Mon Sep 17 00:00:00 2001 From: rkwalters Date: Tue, 28 Feb 2017 19:09:45 -0500 Subject: [PATCH 37/48] force numeric freqs --- bin/agg_gwas.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/agg_gwas.py b/bin/agg_gwas.py index a19bf7f..f935d52 100755 --- a/bin/agg_gwas.py +++ b/bin/agg_gwas.py @@ -314,8 +314,8 @@ dumphead = frq.readline() for line in frq: (chrom, snp, a1, a2, mafa, mafu, nchra, nchru) = line.split() - maf_a_info[str(snp)] = mafa - maf_u_info[str(snp)] = mafu + maf_a_info[str(snp)] = float(mafa) + maf_u_info[str(snp)] = float(mafu) n_a_info[str(snp)] = int(nchra) / 2 n_u_info[str(snp)] = int(nchru) / 2 freq_a1[str(snp)] = a1 @@ -387,12 +387,12 @@ # get meta info # verify use freq of correct allele - if str(frq_a1.pop(str(snp))) == str(a1): - frqa = maf_a_info.pop(str(snp)) - frqu = maf_u_info.pop(str(snp)) + if str(freq_a1.pop(str(snp))) == str(a1): + 
frqa = float(maf_a_info.pop(str(snp))) + frqu = float(maf_u_info.pop(str(snp))) else: - frqa = 1 - maf_a_info.pop(str(snp)) - frqu = 1 - maf_u_info.pop(str(snp)) + frqa = 1 - float(maf_a_info.pop(str(snp))) + frqu = 1 - float(maf_u_info.pop(str(snp))) na = n_a_info.pop(str(snp)) nu = n_u_info.pop(str(snp)) From af818983a0178d0e7405e6ab132a856e7fb837e0 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Tue, 28 Feb 2017 19:10:39 -0500 Subject: [PATCH 38/48] improve file checks from args --- bin/admix_rel.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 13206b7..18b15a2 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -254,7 +254,6 @@ if args.admix_p is not None and args.admix_p != "": run_admix = False -else: assert os.path.isfile(args.admix_p), "Admixture .P file %s does not exist." % str(args.admix_p) if args.use_exemplars: @@ -285,8 +284,9 @@ # pca file if plot_pca: assert os.path.isfile(args.plot_admix_pca), "PCA file does not exist (%r)" % args.plot_admix_pca - assert '/' not in args.target_bfile, "--plot-admix-pca must specify only a file, not a path" +# assert '/' not in args.plot_admix_pca, "--plot-admix-pca must specify only a file, not a path" +# TODO: allow relative paths here (os.path.normpath() should solve this; see link() for pca file) # verify bfiles are files, not paths assert '/' not in args.target_bfile, "--target-bfile must specify only a file stem, not a path" @@ -320,8 +320,7 @@ # link pca file, if provided if not (args.plot_admix_pca==None or args.plot_admix_pca=="None"): - - link(str(wd+'/'+args.plot_admix_pca), str(args.plot_admix_pca), 'PCA file') + link(os.path.normpath(str(wd+'/'+args.plot_admix_pca)), os.path.basename(str(args.plot_admix_pca)), 'PCA file') From e4f2f265d8927b42c2aca41f3172f313ef1f7a42 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 1 Mar 2017 13:34:37 -0500 Subject: [PATCH 39/48] update default loc for admixture on Broad --- bin/config_pico.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/config_pico.pl b/bin/config_pico.pl index b9c7767..1a4fbfb 100755 --- a/bin/config_pico.pl +++ b/bin/config_pico.pl @@ -342,7 +342,7 @@ () "i2loc", "/psych/genetics_data/ripke/references_from_debakkerscratch/impute_v2/impute_v2/impute_2.2.7_beta", "liloc","/home/unix/sripke/liftover", "eloc","/humgen/atgu1/fs03/shared_resources/shared_software/EIG6.0beta_noreq/bin", - "admloc","/humgen/atgu1/fs03/shared_resources/shared_software/admixture_linux-1.23", + "admloc"," /humgen/atgu1/fs03/shared_resources/shared_software/admixture_linux-1.3.0", "reaploc","/humgen/atgu1/fs03/shared_resources/shared_software/REAP", "priloc","/humgen/atgu1/fs03/shared_resources/shared_software/PRIMUS_v1.8.0/bin", "rloc","/broad/software/free/Linux/redhat_6_x86_64/pkgs/r_3.0.1/bin", From 9e57709f29ad6191e60ced530bc20d662f83f0b7 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 1 Mar 2017 13:53:31 -0500 Subject: [PATCH 40/48] prevent duplicating find/test execs --- bin/admix_rel.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 18b15a2..d22ba45 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -234,18 +234,24 @@ if args.admixture_ex is None or args.admixture_ex == "None": args.admixture_ex = find_exec('admixture', key='admloc') - -test_exec(args.admixture_ex, 'ADMIXTURE') +else: + test_exec(args.admixture_ex, 'ADMIXTURE') if args.rscript_ex is None or args.rscript_ex == "None": args.rscript_ex = 
find_exec('Rscript', key='rscloc') +else: + test_exec(args.rscript_ex, 'Rscript') if args.reap_ex is None or args.reap_ex == "None": args.reap_ex = find_exec('REAP', key='reaploc') +else: + test_exec(args.reap_ex, 'REAP') rp_bin = os.path.dirname(os.path.realpath(__file__)) Rplotibdx = rp_bin+'/plot_reap_ibd.Rscript' + + if plot_pca: Rplotpcax = rp_bin+'/plot_pca.Rscript' @@ -276,11 +282,6 @@ assert os.path.isfile(str(args.unrel_bfile)+'.fam'), "fam file for unrelated individuals %s does not exist." % str(args.unrel_bfile)+'.fam' -# verify executables -test_exec(plinkx, 'Plink') -test_exec(args.rscript_ex, 'Rscript') -test_exec(args.reap_ex, 'REAP') - # pca file if plot_pca: assert os.path.isfile(args.plot_admix_pca), "PCA file does not exist (%r)" % args.plot_admix_pca From 060f44db01d35b52d34d34d89f73051137b74681 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Thu, 2 Mar 2017 12:25:49 -0500 Subject: [PATCH 41/48] fix admixture plotting without exemplars --- bin/admix_rel.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index d22ba45..62f9da9 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -323,6 +323,8 @@ if not (args.plot_admix_pca==None or args.plot_admix_pca=="None"): link(os.path.normpath(str(wd+'/'+args.plot_admix_pca)), os.path.basename(str(args.plot_admix_pca)), 'PCA file') +# labels for populations are popA, popB, popC, ... +popnames = [str('pop'+ascii_uppercase[i]) for i in range(args.npops)] if run_admix: @@ -354,9 +356,6 @@ # - match population assignments to FID/IIDs # - write .pops file for target bfile, .pops.info file - # label for populations are popA, popB, popC, ... - popnames = [str('pop'+ascii_uppercase[i]) for i in range(args.npops)] - # define function returning popname or '-' based on largest proportion # Note: ties broken in favor of first pop listed in names (possible if th <= 0.5) def maxpop(props, names, th): @@ -714,8 +713,9 @@ def maxpop(props, names, th): glob(args.target_bfile+".*.admixture.plotinfo.txt") + \ [str(args.target_bfile)+".admixture.legend.txt"] + \ glob(args.out+".*.plot_admixture.log")) - - subprocess.check_call(["tar", "-zcvf", + + if args.use_exemplars: + subprocess.check_call(["tar", "-zcvf", str(args.out+'.plot_exemplar_files.tar.gz')] + \ glob(args.target_bfile+".*.exemplar.plotinfo.txt") + \ [str(args.target_bfile)+".exemplar.legend.txt"] + \ From 81714f8855b899078437e5743b063cde4f4fc2c1 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 3 Mar 2017 11:49:06 -0500 Subject: [PATCH 42/48] improve log of plotting cmds --- bin/admix_rel.py | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 62f9da9..26b27d7 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -580,12 +580,15 @@ def maxpop(props, names, th): ### IBD0/IBD1 points and density # plot_reap_ibd.Rscript has args r_ibd_log = open(str(args.out) + '.plot_ibd.log', 'w') -subprocess.check_call([Rplotibdx, - str('REAP_pairs_relatedness.txt'), - str(args.out), - str(args.min_rel)], - stderr=subprocess.STDOUT, - stdout=r_ibd_log) +plot_ibd_call = [Rplotibdx, + str('REAP_pairs_relatedness.txt'), + str(args.out), + str(args.min_rel)] +print str(' '.join(plot_ibd_call)) + +subprocess.check_call(plot_ibd_call, + stderr=subprocess.STDOUT, + stdout=r_ibd_log) r_ibd_log.close() print 'IBD plots: %s.IBD.png, %s.IBD_density.png' % (args.out, args.out) @@ -673,25 +676,31 @@ def maxpop(props, names, th): for i in 
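The popnames list and the maxpop() contract referenced in patch 41 (assign the population with the largest admixture proportion, or '-' when nothing clears the threshold, with ties going to the first population listed) can be sketched as a standalone example; the maxpop body here is illustrative, since only its interface appears in the diff:

from string import ascii_uppercase

def maxpop(props, names, th):
    # max() returns the first maximal element, so ties resolve to the
    # population listed first (possible whenever th <= 0.5)
    best = max(range(len(props)), key=lambda i: props[i])
    return names[best] if props[best] >= th else '-'

npops = 3
popnames = ['pop' + ascii_uppercase[i] for i in range(npops)]  # ['popA', 'popB', 'popC']
print(maxpop([0.93, 0.05, 0.02], popnames, 0.8))  # popA
print(maxpop([0.72, 0.20, 0.08], popnames, 0.8))  # - (unassigned)
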
xrange(args.npops): if args.use_exemplars: r_pca_ex_log = open(str(args.out) + '.' + popnames[i] + '.plot_exemplars.log', 'w') - subprocess.check_call([Rplotpcax, - str(args.plot_admix_pca), + plot_pca_exemp_call = [Rplotpcax, + str(os.path.basename(str(args.plot_admix_pca)), str(args.target_bfile) + '.' + popnames[i] + '.exemplar.plotinfo.txt', str(args.target_bfile) + '.exemplar.legend.txt', str(3), - str(args.out) + '.' + popnames[i] + '.exemplars'], - stderr=subprocess.STDOUT, - stdout=r_pca_ex_log) + str(args.out) + '.' + popnames[i] + '.exemplars'] + + print str(" ".join(plot_pca_exemp_call)) + subprocess.check_call(plot_pca_exemp_call, + stderr=subprocess.STDOUT, + stdout=r_pca_ex_log) r_pca_ex_log.close() r_pca_admix_log = open(str(args.out) + '.' + popnames[i] + '.plot_admixture.log', 'w') - subprocess.check_call([Rplotpcax, - str(args.plot_admix_pca), + plot_pca_admix_call = [Rplotpcax, + str(os.path.basename(str(args.plot_admix_pca)), str(args.target_bfile) + '.' + popnames[i] + '.admixture.plotinfo.txt', str(args.target_bfile) + '.admixture.legend.txt', str(3), - str(args.out) + '.' + popnames[i] + '.admixture'], - stderr=subprocess.STDOUT, - stdout=r_pca_admix_log) + str(args.out) + '.' + popnames[i] + '.admixture'] + + print str(" ".join(plot_pca_admix_call)) + subprocess.check_call(plot_pca_admix_call, + stderr=subprocess.STDOUT, + stdout=r_pca_admix_log) r_pca_admix_log.close() print 'PCA plots for %s: %s, %s (completed %d/%d populations)' % (popnames[i], str(args.out)+'.'+popnames[i]+'.exemplars.pca.pairs.png', str(args.out)+'.'+popnames[i]+'.exemplars.pca.pc##_pc##.png', i+1, args.npops) From f6b2bb051a6d3e7887bdba6dd1952ae959351a47 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Fri, 3 Mar 2017 12:04:25 -0500 Subject: [PATCH 43/48] fix mark for unplotted region --- bin/plot_reap_ibd.Rscript | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/plot_reap_ibd.Rscript b/bin/plot_reap_ibd.Rscript index ef33508..a86b237 100755 --- a/bin/plot_reap_ibd.Rscript +++ b/bin/plot_reap_ibd.Rscript @@ -14,7 +14,7 @@ if(length(commandArgs(TRUE)) > 2){ require(ggplot2) # unplotted region -tri = data.frame(x=c(1-minrel,1-((2/3)*minrel),1), y=c(0,(2/3)*minrel,0)) +tri = data.frame(x=c(1-minrel,1-(2*minrel),1), y=c(0,2*minrel,0)) # read data infile <- read.table(infile, header=TRUE, stringsAsFactors=F) From bc97a039dff931558e4481ad84106125a4b59a85 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 15 Mar 2017 13:30:50 -0400 Subject: [PATCH 44/48] fix typo --- bin/admix_rel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/admix_rel.py b/bin/admix_rel.py index 26b27d7..b325d82 100755 --- a/bin/admix_rel.py +++ b/bin/admix_rel.py @@ -677,7 +677,7 @@ def maxpop(props, names, th): if args.use_exemplars: r_pca_ex_log = open(str(args.out) + '.' + popnames[i] + '.plot_exemplars.log', 'w') plot_pca_exemp_call = [Rplotpcax, - str(os.path.basename(str(args.plot_admix_pca)), + str(os.path.basename(str(args.plot_admix_pca))), str(args.target_bfile) + '.' + popnames[i] + '.exemplar.plotinfo.txt', str(args.target_bfile) + '.exemplar.legend.txt', str(3), @@ -691,7 +691,7 @@ def maxpop(props, names, th): r_pca_admix_log = open(str(args.out) + '.' + popnames[i] + '.plot_admixture.log', 'w') plot_pca_admix_call = [Rplotpcax, - str(os.path.basename(str(args.plot_admix_pca)), + str(os.path.basename(str(args.plot_admix_pca))), str(args.target_bfile) + '.' 
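Patch 42 changes each plotting call from an inline subprocess.check_call() to a named argument list that is echoed before being run, so the exact Rscript command line lands in the pipeline log. The same pattern as a reusable sketch (run_logged is an illustrative name, not a picopili function):

import subprocess

def run_logged(call, log_path):
    call = [str(c) for c in call]
    # echo the exact command line so it appears in the pipeline log
    print(' '.join(call))
    # capture the child's stdout and stderr together in one log file
    with open(log_path, 'w') as log:
        subprocess.check_call(call, stderr=subprocess.STDOUT, stdout=log)

# e.g. run_logged([Rplotibdx, 'REAP_pairs_relatedness.txt', args.out, args.min_rel],
#                 str(args.out) + '.plot_ibd.log')
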
+ popnames[i] + '.admixture.plotinfo.txt', str(args.target_bfile) + '.admixture.legend.txt', str(3), From e937bde498b383bc16ca18161847d8cd96fda053 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 15 Mar 2017 13:31:49 -0400 Subject: [PATCH 45/48] fix depends broken by Broad Anaconda (mostly for R plots) --- cluster_templates/broad_uger.sub.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cluster_templates/broad_uger.sub.sh b/cluster_templates/broad_uger.sub.sh index 597ad98..b1d43f0 100755 --- a/cluster_templates/broad_uger.sub.sh +++ b/cluster_templates/broad_uger.sub.sh @@ -23,6 +23,8 @@ sleep {sleep_time} # setup resources source /broad/software/scripts/useuse reuse -q Anaconda +reuse -q .curl-7.47.1 +reuse -q .cairo-1.14.2 # main command line {cmd_string} From e5dc9fc9ebf6a1fb9862cc10ebae53ce0d0ae4f6 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 15 Mar 2017 15:16:41 -0400 Subject: [PATCH 46/48] adaptive mem reqs for imp agg --- bin/bg_imp.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/bin/bg_imp.py b/bin/bg_imp.py index 511e058..754fcd2 100755 --- a/bin/bg_imp.py +++ b/bin/bg_imp.py @@ -467,11 +467,24 @@ agg_log = 'agg_imp.'+str(outdot)+'.sub.log' + # some dynamic adjustment of mem based on sample size population + # (empirically, seem to get ~2x sites from afr vs eur) + fam_n = file_len(str(shape_dir)+'/'+str(args.bfile)+'.hg19.ch.fl.fam') + if fam_n > 3000: + agg_mem = 32000 + elif fam_n > 1000: + agg_mem = 16000 + else: + agg_mem = 8000 + + if "afr" in sys.argv[1:]: + agg_mem = 2*agg_mem + # TODO: consider queue/mem for agg send_job(jobname='agg.imp.'+str(outdot), cmd=next_call, logname=agg_log, - mem=8000, + mem=int(agg_mem), walltime=30, wait_name='bg.chunks.'+str(outdot), wait_num=str(jobres2).strip(), From 3114d1759451fc196e98386b52a28a03e5ff7611 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 15 Mar 2017 16:28:38 -0400 Subject: [PATCH 47/48] add option of arbitrary weights for filter ped --- bin/args_ped.py | 13 +++++++-- bin/filter_ped.py | 70 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 72 insertions(+), 11 deletions(-) diff --git a/bin/args_ped.py b/bin/args_ped.py index 6d4642e..c26d6ff 100644 --- a/bin/args_ped.py +++ b/bin/args_ped.py @@ -56,7 +56,7 @@ # ############ parsergeno = argparse.ArgumentParser(add_help=False) -arg_geno = parsergeno.add_argument_group('Genotyping Rate (Optional)') +arg_geno = parsergeno.add_argument_group('Additional Weights (Optional)') arg_geno.add_argument('--geno', type=str, @@ -64,8 +64,15 @@ help='file with genotype missingness rate per individual ' + \ '(i.e. the .imiss file from plink --missing)', required=False, - default='NONE') - + default=None) +arg_geno.add_argument('--weight-file', + type=str, + metavar='FILE', + help='file with added weight per individual. Intentionally ' + \ + 'flexible for arbitrary weights. 
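The adaptive memory request added in patch 46 reduces to a small pure function. The thresholds below are copied from that hunk; the function name and the boolean flag are illustrative:

def agg_mem_mb(fam_n, afr_reference=False):
    # pick a memory request (MB) for the imputation-aggregation job from the
    # number of samples in the .fam file; roughly doubled for African-ancestry
    # data, which empirically yields about twice as many imputed sites
    if fam_n > 3000:
        mem = 32000
    elif fam_n > 1000:
        mem = 16000
    else:
        mem = 8000
    return 2 * mem if afr_reference else mem

assert agg_mem_mb(500) == 8000
assert agg_mem_mb(2500, afr_reference=True) == 32000
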
Assumes 3 columns: FID,' + \ + 'IID, and numeric weight.', + required=False, + default=None) ############ # diff --git a/bin/filter_ped.py b/bin/filter_ped.py index 6b103e6..777b2f2 100755 --- a/bin/filter_ped.py +++ b/bin/filter_ped.py @@ -82,7 +82,10 @@ print 'Using settings:' print '--input-ibd '+str(args.input_ibd) print '--bfile '+str(args.bfile) -print '--geno '+str(args.geno) +if args.geno is not None and args.geno != "None": + print '--geno '+str(args.geno) +if args.weight_file is not None and args.weight_file != "None": + print '--weight-file '+str(args.weight_file) print '--out '+str(args.out) print '--format '+str(args.format) print '--min-rel '+str(args.min_rel) @@ -93,7 +96,8 @@ print '--fam-con-weight '+str(args.fam_con_weight) print '--fam-miss-weight '+str(args.fam_miss_weight) print '--cross-fid-weight '+str(args.cross_fid_weight) -print '--geno-weight '+str(args.geno_weight) +if args.geno is not None and args.geno != "None": + print '--geno-weight '+str(args.geno_weight) print '--rand-weight '+str(args.rand_weight) print '--seed '+str(args.seed) @@ -106,9 +110,12 @@ assert os.path.isfile(args.input_ibd), "IBD/relatedness file does not exist (%r)" % args.input_ibd assert os.path.isfile(str(args.bfile)+'.fam'), "Plink fam file does not exist (%s)" % str(args.bfile)+'.fam' -if str(args.geno) != 'NONE': +if args.geno is not None and str(args.geno) != 'None': assert os.path.isfile(args.geno), "Missingness rate file does not exist (%r)" % args.geno +if args.weight_file is not None and str(args.weight_file) != 'None': + assert os.path.isfile(args.weight_file), "Weight file does not exist (%r)" % args.weight_file + print '\n' print '############' @@ -171,7 +178,7 @@ genorate = {} -if str(args.geno) == 'NONE': +if args.geno is None or str(args.geno) == 'None': print 'Skipping (no file provided).' for indiv in fam_info: genorate[indiv] = 1.0 @@ -201,8 +208,52 @@ if indiv in genorate: continue else: - warnings.warn('Genotyping rate not loaded for %s. Setting to zero.' % str(indiv)) - genofile[indiv] = 1.0 + warnings.warn('Genotyping rate not loaded for %s. Setting call rate to zero.' % str(indiv)) + genofile[indiv] = 0.0 + + + + + +############# +print '\n...Loading additional weight file...' +# Assume FID, IID, weight +# if no file, set to zero +############# + +misc_w = {} + +if args.weight_file is None or str(args.weight_file) == 'None': + print 'Skipping (no file provided).' + for indiv in fam_info: + misc_w[indiv] = 0.0 + +else: + wfile = open(str(args.weight_file), 'r') + + # read per individual, indexed by FID:IID + for line in wfile: + (fid, iid, weight_num) = line.split() + + # id key + ind_id = str(fid) + ':' + str(iid) + + # record + misc_w[ind_id] = float(weight_num) + + wfile.close() + + # check values present for all IDs + for indiv in fam_info: + if indiv in misc_w: + continue + else: + warnings.warn('No additional weight for %s. Setting to zero.' 
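The --weight-file handling in patch 47 reads whitespace-delimited FID, IID, weight rows, keys them as 'FID:IID', and falls back to a neutral 0.0 (after a warning) for anyone missing from the file. A compact sketch of the same lookup (load_weights is a hypothetical helper, not part of filter_ped.py):

def load_weights(path, fam_ids):
    # per-individual weights keyed by 'FID:IID'; individuals absent from the
    # file get 0.0, mirroring the default used in the patch
    weights = {}
    if path is not None and path != 'None':
        with open(path) as fh:
            for line in fh:
                fid, iid, w = line.split()
                weights[fid + ':' + iid] = float(w)
    return {ind: weights.get(ind, 0.0) for ind in fam_ids}

# e.g. misc_w = load_weights(args.weight_file, fam_info.keys())
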
% str(indiv)) + misc_w[indiv] = 0.0 + + + + @@ -504,7 +555,7 @@ def isFamPO(pair_info, fam_info): # define function to score preference for keeping each individual # lowest score will get deleted - def pref_score(ind_id, fam_dict, rel_dict, geno_dict, weight_dict): + def pref_score(ind_id, fam_dict, rel_dict, geno_dict, misc_dict, weight_dict): # init pref = 0.0 ind_id = str(ind_id) @@ -535,6 +586,9 @@ def pref_score(ind_id, fam_dict, rel_dict, geno_dict, weight_dict): # score geno rate pref += weight_dict['geno_rate'] * float(geno_dict[ind_id]) + # score added weight + pref += float(misc_dict[ind_id]) + return pref # loop removal until no cross-fid relationship left @@ -542,7 +596,7 @@ def pref_score(ind_id, fam_dict, rel_dict, geno_dict, weight_dict): while len(cross_id_list) > 0: # score each cross-FID related IID's prority for keep/remove - prefs = [pref_score(indiv, fam_info, iid_relatives, genorate, pref_weights) for indiv in cross_id_list] + prefs = [pref_score(indiv, fam_info, iid_relatives, genorate, misc_w, pref_weights) for indiv in cross_id_list] # breaks ties randomly if len(prefs) != len(set(prefs)): From 16cbf54818a843ef7c35b21d2cd41b1476a35f24 Mon Sep 17 00:00:00 2001 From: rkwalters Date: Wed, 15 Mar 2017 19:00:17 -0400 Subject: [PATCH 48/48] log filter_ped warnings --- bin/filter_ped.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bin/filter_ped.py b/bin/filter_ped.py index 777b2f2..640f72d 100755 --- a/bin/filter_ped.py +++ b/bin/filter_ped.py @@ -37,8 +37,9 @@ import random import warnings from args_ped import parserbase, parsergeno, parseribd, parserweights -from py_helpers import unbuffer_stdout +from py_helpers import unbuffer_stdout, warn_format unbuffer_stdout() +warnings.formatwarning = warn_format ############# @@ -209,6 +210,7 @@ continue else: warnings.warn('Genotyping rate not loaded for %s. Setting call rate to zero.' % str(indiv)) + print 'Genotyping rate not loaded for %s. Setting call rate to zero.' % str(indiv) genofile[indiv] = 0.0 @@ -249,14 +251,10 @@ continue else: warnings.warn('No additional weight for %s. Setting to zero.' % str(indiv)) + print 'No additional weight for %s. Setting to zero.' % str(indiv) misc_w[indiv] = 0.0 - - - - - ############# print '\n...Parsing relatedness estimates...' # handle reap file format
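Patch 48 routes warnings through py_helpers.warn_format, whose body is not shown anywhere in this series; any callable matching the warnings.formatwarning signature will work, so a minimal stand-in (assumed, not the actual py_helpers implementation) might look like:

import warnings

def warn_format(message, category, filename, lineno, line=None):
    # minimal stand-in: keep each warning to a single readable line in the log
    return 'Warning: %s\n' % str(message)

warnings.formatwarning = warn_format
warnings.warn('No additional weight for FAM1:IND1. Setting to zero.')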