Adjust abnormal script. Add early exit to align script. Add comments …

…to everything. Cleanup when jobs fail in launch. svn r135
aidenlab · Aug 8, 2017 · 5e5ef74 · 5e5ef74
1 parent de6f226
commit 5e5ef74
Show file tree

Hide file tree

Showing 5 changed files with 143 additions and 79 deletions.
diff --git a/abnormal.awk b/abnormal.awk
@@ -22,14 +22,9 @@ BEGIN{
   else {
     # deal with read pair group
     tottot++;
-#    printme = 0;
     printme = 1;
     j = 0;
     for (i in c) {
-      # line contains ligation junction
-#      if (c[i] ~ /AAGCTAGCTT/) {
-#	printme = 1;
-#      }
       split(c[i], tmp);
       mapped[j] = tmp[3] ~ /^[1-9,X,Y,M][0-9,T]?$/ || tmp[3] ~ /^chr[1-9,X,Y,M][0-9,T]?$/;
       name[j] = tmp[1];
@@ -51,7 +46,8 @@ BEGIN{
             }
         }
     }
-    if (printme && bigdist >= 3) {
+#    printme = printme && (bigdist >= 3);
+    if (printme) {
       for (j = 0; j < len; j++) {
           printf("%s %d %s %d %d %s %s ", name[j], str[j], chr[j], pos[j], m[j], cig[j], seq[j]);
       }
@@ -66,3 +62,36 @@ BEGIN{
   # these happen no matter what, after the above processing
   c[count] = $0;
 }
+END {
+    printme = 1;
+    j = 0;
+    for (i in c) {
+      split(c[i], tmp);
+      mapped[j] = tmp[3] ~ /^[1-9,X,Y,M][0-9,T]?$/ || tmp[3] ~ /^chr[1-9,X,Y,M][0-9,T]?$/;
+      name[j] = tmp[1];
+      str[j] = tmp[2];
+      chr[j] = tmp[3];
+      pos[j] = tmp[4];
+      m[j] = tmp[5];
+      cig[j] = tmp[6];
+      seq[j] = tmp[10];
+      j++;
+    }
+    len = j;
+    bigdist = 0;
+    for (j=0; j<len; j++) {
+        printme = printme && mapped[j] && (m[j] >= 10);
+        for (k=j+1; k<len; k++) {
+            if (abs(pos[j]-pos[k]) >= 20000) {
+                bigdist++;
+            }
+        }
+    }
+#    printme = printme && (bigdist >= 3);
+    if (printme) {
+      for (j = 0; j < len; j++) {
+          printf("%s %d %s %d %d %s %s ", name[j], str[j], chr[j], pos[j], m[j], cig[j], seq[j]);
+      }
+      printf("\n");
+    }
+}
diff --git a/align_stable.sh b/align_stable.sh
@@ -1,10 +1,13 @@
 #!/bin/bash
 # Alignment script. Sets the reference genome and genome ID based on the input
-# arguments. Optional arguments are the queue for the alignment (default hour)
-# and the top-level directory (default current directory) 
+# arguments (default human, DpnII). Optional arguments are the queue for the alignment 
+# (default hour), key for menu entry, description, read end, early exit, 
+# and the top-level directory (default current directory).
+#
 # Splits the fastq files, creates jobs to align them, creates merge jobs that 
 # wait for the alignment to finish, and creates a final merge and final 
 # cleanup job (in case one of the jobs failed).
+#
 # If all is successful, takes the final merged file, removes name duplicates, 
 # removes PCR duplicates, and creates the hic job and stats job.  Final 
 # product will be hic file and stats file in the aligned directory.
@@ -30,9 +33,9 @@
 #             match any files with the read1str.
 shopt -s extglob
 
-splitsize=6000000 #adjust to match your needs.  40000000 = 1M reads per split
-read1str="_R1"
-read2str="_R2"
+splitsize=6000000 # adjust to match your needs.  40000000 = 1M reads per split
+read1str="_R1" # fastq files should look like filename_R1.fastq and filename_R2.fastq
+read2str="_R2" # if your fastq files look different, change this value
 
 # unique name for group in case we need to kill jobs
 groupname="/a"`date +%s`
@@ -44,20 +47,22 @@ queue="hour"
 site="DpnII"
 # genome ID, default to human, can also be set in options
 genomeID="hg19"
+# normally both read ends are aligned with long read aligner; if one end is short, this is set
 shortreadend=0
+# description, default empty
 about=""
 
 ## Read arguments
 usageHelp="Usage: ${0##*/} -g genomeID [-d topDir] [-q queue] [-s site] [-k key] [-a about] [-R end] [-r] [-e] [-h]"
-genomeHelp="   genomeID must be one of \"mm9\" (mouse), \"hg19\" (human), \"sCerS288c\" (yeast), or \"dMel\" (fly)"
+genomeHelp="   genomeID must be one of \"mm9\" (mouse), \"hg18\" \"hg19\" (human), \"sCerS288c\" (yeast), or \"dMel\" (fly)\n   alternatively, it can be the fasta file of the genome, but the BWA indices must already be created in the same directory"
 dirHelp="   [topDir] is the top level directory (default \"$topDir\")\n     [topDir]/fastq must contain the fastq files\n     [topDir]/splits will be created to contain the temporary split files\n     [topDir]/aligned will be created for the final alignment"
 queueHelp="   [queue] is the LSF queue for running alignments (default \"$queue\")"
 siteHelp="   [site] must be one of \"HindIII\", \"MseI\", \"NcoI\", \"DpnII\", \"MspI\", \"HinP1I\", \"StyD4I\", \"SaII\", \"NheI\", \"StyI\", \"XhoI\", \"NlaIII\", or \"merge\" (default \"$site\")"
 shortHelp2="   [end]: use the short read aligner on end, must be one of 1 or 2 "
 shortHelp="   -r: use the short read version of the aligner (default long read)"
 keyHelp="   -k: key for menu item to put this file under"
-exitHelp="   -e: early exit; align but don't combine sorted files"
-aboutHelp="    -a: enter description of experiment, enclosed in single quotes"
+exitHelp="   -e: early exit; align, sort, merge, and dedup (ends with merged_nodups.txt)"
+aboutHelp="   -a: enter description of experiment, enclosed in single quotes"
 helpHelp="   -h: print this help and exit"
 moreHelp="For more information, type:\n\tgroff -man -Tascii /broad/aidenlab/neva/neva_scripts/align.1 | less"
 
@@ -77,7 +82,7 @@ printHelpAndExit() {
     exit $1
 }
 
-while getopts "d:g:R:k:a:hreq:s:" opt; do
+while getopts "d:g:R:k:a:hrefq:s:" opt; do
     case $opt in
 	g) genomeID=$OPTARG ;;
 	h) printHelpAndExit 0;;
@@ -88,6 +93,7 @@ while getopts "d:g:R:k:a:hreq:s:" opt; do
 	R) shortreadend=$OPTARG ;;
 	r) shortread=1 ;;  #use short read aligner
 	e) earlyexit=1 ;;
+  f) fastq=1 ;;
   a) about=$OPTARG ;;
 	[?]) printHelpAndExit 1;;
     esac
@@ -100,11 +106,12 @@ case $genomeID in
     mm9) refSeq="/broad/aidenlab/references/Mus_musculus_assembly9_norandom.fasta";;
     hg19) refSeq="/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";;
 		sCerS288c) refSeq="/broad/aidenlab/references/sacCer3.fa";;
-    *)  echo "$usageHelp"
+    *)  refSeq=$genomeID;;
 	echo "$genomeHelp"
 	exit 1
 esac
 
+## Set ligation junction based on restriction enzyme
 case $site in
     HindIII) ligation="AAGCTAGCTT";;
     MseI)  ligation="TTATAA";;
@@ -115,6 +122,7 @@ case $site in
 				exit 1
 esac
 
+## If short read end is set, make sure it is 1 or 2
 case $shortreadend in
 		0) ;;
 		1) ;;
@@ -124,14 +132,17 @@ case $shortreadend in
 			 exit 1
 esac
 
+## Hard-coded directory here; in future versions, this should be packaged with script
 site_file="/broad/aidenlab/restriction_sites/${genomeID}_${site}.txt"
 splitdir=$topDir"/splits"
 fastqdir=$topDir"/fastq/*_R*.fastq"
 outputdir=$topDir"/aligned"
 read1=$splitdir"/*${read1str}*.fastq"
+## ARRAY holds the names of the jobs as they are submitted
 countjobs=0
 declare -a ARRAY
 
+## Check that fastq directory exists and has proper fastq files
 if [ ! -d "$topDir/fastq" ]; then
 		echo "Directory \"$topDir/fastq\" does not exist."
 		echo "Create \"$topDir/$fastq\" and put fastq files to be aligned there."
@@ -150,6 +161,7 @@ else
 		fi
 fi
 
+## If key option is given, check that the key exists in the menu properties file
 if [ ! -z $key ] 
     then
     if ! grep -q -m 1 $key /broad/aidenlab/neva/neva_scripts/hicInternalMenu.properties 
@@ -160,8 +172,6 @@ if [ ! -z $key ]
     fi
 fi
 
-
-
 ## Create output directory
 if [ -d "$outputdir" ]; then
     echo "Move or remove directory \"$outputdir\" before proceeding."
@@ -171,27 +181,26 @@ fi
 
 mkdir $outputdir
 
-
 ## Create split directory
 if [ -d "$splitdir" ]; then
     splitdirexists=1
 else
     mkdir $splitdir
 fi
 
-## Create temporary directory
+## Create temporary directory, used for sort later
 if [ ! -d "/broad/hptmp/neva" ]; then
     mkdir /broad/hptmp/neva
 fi
 
+## use LSF cluster on Broad
 source /broad/software/scripts/useuse
 reuse LSF
 
 echo -e "Aligning files matching $fastqdir\n in queue $queue to genome $genomeID"
 
-
-
 ## Split fastq files into smaller portions for parallelizing alignment 
+## Do this by creating a text script file for the job in "tmp" and then sending it to LSF
 if [ ! $splitdirexists ]; then
     echo "Created $splitdir and $outputdir.  Splitting files"
     for i in ${fastqdir}
@@ -210,6 +219,8 @@ if [ ! $splitdirexists ]; then
 			rm tmp
     done 
 
+
+    # Once split succeeds, rename the splits as fastq files
 		echo -e '#!/bin/bash -l' > tmp
 		echo -e "#BSUB -q priority" >> tmp
 		echo -e "#BSUB -o $topDir/lsf.out\n" >> tmp
@@ -224,6 +235,8 @@ if [ ! $splitdirexists ]; then
 		ARRAY[countjobs]="${groupname}move"
 		countjobs=$(( $countjobs + 1 ))
 
+    # for all the jobs that have been launched so far, create a condition in case
+    # any of them exit prematurely
 		for (( i=0; i < countjobs; i++ ))
 			do
 			if [ $i -eq 0 ]; then
@@ -245,9 +258,14 @@ if [ ! $splitdirexists ]; then
 		bsub < tmp		
 		rm tmp
 else
+    # No need to re-split fastqs if they already exist
     echo -e "Using already created files in $splitdir \n"
 fi
 
+## Launch job.  Once split is done, kill the cleanup job, then
+## set the parameters for the launch.  Concatenate the script
+## "launch.sh" to this script.  This will start the rest of the 
+## parallel jobs.
 echo -e '#!/bin/bash -l' > tmp2
 echo -e "#BSUB -q priority" >> tmp2
 echo -e "#BSUB -o $topDir/lsf.out\n" >> tmp2

diff --git a/chimeric.awk b/chimeric.awk
@@ -67,7 +67,7 @@ BEGIN{
 				split(c[j], tmp);
 				split(tmp[1],readname,"/");
 				read[j] = readname[2];
-				name[j] = readname[1];
+				name[j] = tmp[1];
 
 				# strand
 				str[j] = tmp[2];
@@ -81,6 +81,7 @@ BEGIN{
 				cigarstr[j] = tmp[6];
 				# sequence
 				seq[j] = tmp[10];
+        qual[j] = tmp[11];
 				# get rid of soft clipping to know correct position
 				if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
 					split(tmp[6], cigar, "S");
@@ -145,10 +146,10 @@ BEGIN{
 				if (mapped[read1] && mapped[read2]) {
 					count_norm++;
 					if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) {
-						print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2] > fname1;
+              print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2] > fname1;
 					}
 					else {
-						print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1] > fname1;
+              print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1] > fname1;
 					}
 				}
 				else {
@@ -185,7 +186,8 @@ BEGIN{
 				m[j] = tmp[5];
 				cigarstr[j] = tmp[6];
 				seq[j] = tmp[10];
-				name[j]=readname[1];
+        qual[j] = tmp[11];
+				name[j] = tmp[1];
 
 
 				if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
@@ -222,10 +224,10 @@ BEGIN{
 				count_reg++;
 				if (less_than(str[0],chr[0],pos[0],str[1],chr[1],pos[1])) {
 					# ideally we'll get rid of printing out cigar string at some point
-					print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1] > fname1;
+            print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1] > fname1;
 				}
 				else {
-					print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0] > fname1;
+            print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0] > fname1;
 				}
       }
       else {

diff --git a/cleanup.sh b/cleanup.sh
@@ -6,4 +6,4 @@ total=`ls -l aligned/merged_sort.txt | awk '{print $5}'`
 total2=`ls -l aligned/merged_nodups.txt aligned/dups.txt aligned/opt_dups.txt | awk '{sum = sum + $5}END{print sum}'`
 if [ $total -eq $total2 ]; then rm aligned/merged_sort.txt; else echo "Problem: The sum of merged_nodups and the dups files is not the same size as merged_sort.txt"; fi
 rm -r splits
-bzip2 fastq/*.fastq
+#bzip2 fastq/*.fastq