Skip to content

Commit

Permalink
Adjust abnormal script. Add early exit to align script. Add comments …
Browse files Browse the repository at this point in the history
…to everything. Cleanup when jobs fail in launch.

svn r135
  • Loading branch information
nchernia committed Aug 8, 2017
1 parent de6f226 commit 5e5ef74
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 79 deletions.
41 changes: 35 additions & 6 deletions abnormal.awk
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,9 @@ BEGIN{
else {
# deal with read pair group
tottot++;
# printme = 0;
printme = 1;
j = 0;
for (i in c) {
# line contains ligation junction
# if (c[i] ~ /AAGCTAGCTT/) {
# printme = 1;
# }
split(c[i], tmp);
mapped[j] = tmp[3] ~ /^[1-9,X,Y,M][0-9,T]?$/ || tmp[3] ~ /^chr[1-9,X,Y,M][0-9,T]?$/;
name[j] = tmp[1];
Expand All @@ -51,7 +46,8 @@ BEGIN{
}
}
}
if (printme && bigdist >= 3) {
# printme = printme && (bigdist >= 3);
if (printme) {
for (j = 0; j < len; j++) {
printf("%s %d %s %d %d %s %s ", name[j], str[j], chr[j], pos[j], m[j], cig[j], seq[j]);
}
Expand All @@ -66,3 +62,36 @@ BEGIN{
# these happen no matter what, after the above processing
c[count] = $0;
}
END {
printme = 1;
j = 0;
for (i in c) {
split(c[i], tmp);
mapped[j] = tmp[3] ~ /^[1-9,X,Y,M][0-9,T]?$/ || tmp[3] ~ /^chr[1-9,X,Y,M][0-9,T]?$/;
name[j] = tmp[1];
str[j] = tmp[2];
chr[j] = tmp[3];
pos[j] = tmp[4];
m[j] = tmp[5];
cig[j] = tmp[6];
seq[j] = tmp[10];
j++;
}
len = j;
bigdist = 0;
for (j=0; j<len; j++) {
printme = printme && mapped[j] && (m[j] >= 10);
for (k=j+1; k<len; k++) {
if (abs(pos[j]-pos[k]) >= 20000) {
bigdist++;
}
}
}
# printme = printme && (bigdist >= 3);
if (printme) {
for (j = 0; j < len; j++) {
printf("%s %d %s %d %d %s %s ", name[j], str[j], chr[j], pos[j], m[j], cig[j], seq[j]);
}
printf("\n");
}
}
50 changes: 34 additions & 16 deletions align_stable.sh
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
#!/bin/bash
# Alignment script. Sets the reference genome and genome ID based on the input
# arguments. Optional arguments are the queue for the alignment (default hour)
# and the top-level directory (default current directory)
# arguments (default human, DpnII). Optional arguments are the queue for the alignment
# (default hour), key for menu entry, description, read end, early exit,
# and the top-level directory (default current directory).
#
# Splits the fastq files, creates jobs to align them, creates merge jobs that
# wait for the alignment to finish, and creates a final merge and final
# cleanup job (in case one of the jobs failed).
#
# If all is successful, takes the final merged file, removes name duplicates,
# removes PCR duplicates, and creates the hic job and stats job. Final
# product will be hic file and stats file in the aligned directory.
Expand All @@ -30,9 +33,9 @@
# match any files with the read1str.
shopt -s extglob

splitsize=6000000 #adjust to match your needs. 40000000 = 1M reads per split
read1str="_R1"
read2str="_R2"
splitsize=6000000 # adjust to match your needs. 40000000 = 1M reads per split
read1str="_R1" # fastq files should look like filename_R1.fastq and filename_R2.fastq
read2str="_R2" # if your fastq files look different, change this value

# unique name for group in case we need to kill jobs
groupname="/a"`date +%s`
Expand All @@ -44,20 +47,22 @@ queue="hour"
site="DpnII"
# genome ID, default to human, can also be set in options
genomeID="hg19"
# normally both read ends are aligned with long read aligner; if one end is short, this is set
shortreadend=0
# description, default empty
about=""

## Read arguments
usageHelp="Usage: ${0##*/} -g genomeID [-d topDir] [-q queue] [-s site] [-k key] [-a about] [-R end] [-r] [-e] [-h]"
genomeHelp=" genomeID must be one of \"mm9\" (mouse), \"hg19\" (human), \"sCerS288c\" (yeast), or \"dMel\" (fly)"
genomeHelp=" genomeID must be one of \"mm9\" (mouse), \"hg18\" \"hg19\" (human), \"sCerS288c\" (yeast), or \"dMel\" (fly)\n alternatively, it can be the fasta file of the genome, but the BWA indices must already be created in the same directory"
dirHelp=" [topDir] is the top level directory (default \"$topDir\")\n [topDir]/fastq must contain the fastq files\n [topDir]/splits will be created to contain the temporary split files\n [topDir]/aligned will be created for the final alignment"
queueHelp=" [queue] is the LSF queue for running alignments (default \"$queue\")"
siteHelp=" [site] must be one of \"HindIII\", \"MseI\", \"NcoI\", \"DpnII\", \"MspI\", \"HinP1I\", \"StyD4I\", \"SaII\", \"NheI\", \"StyI\", \"XhoI\", \"NlaIII\", or \"merge\" (default \"$site\")"
shortHelp2=" [end]: use the short read aligner on end, must be one of 1 or 2 "
shortHelp=" -r: use the short read version of the aligner (default long read)"
keyHelp=" -k: key for menu item to put this file under"
exitHelp=" -e: early exit; align but don't combine sorted files"
aboutHelp=" -a: enter description of experiment, enclosed in single quotes"
exitHelp=" -e: early exit; align, sort, merge, and dedup (ends with merged_nodups.txt)"
aboutHelp=" -a: enter description of experiment, enclosed in single quotes"
helpHelp=" -h: print this help and exit"
moreHelp="For more information, type:\n\tgroff -man -Tascii /broad/aidenlab/neva/neva_scripts/align.1 | less"

Expand All @@ -77,7 +82,7 @@ printHelpAndExit() {
exit $1
}

while getopts "d:g:R:k:a:hreq:s:" opt; do
while getopts "d:g:R:k:a:hrefq:s:" opt; do
case $opt in
g) genomeID=$OPTARG ;;
h) printHelpAndExit 0;;
Expand All @@ -88,6 +93,7 @@ while getopts "d:g:R:k:a:hreq:s:" opt; do
R) shortreadend=$OPTARG ;;
r) shortread=1 ;; #use short read aligner
e) earlyexit=1 ;;
f) fastq=1 ;;
a) about=$OPTARG ;;
[?]) printHelpAndExit 1;;
esac
Expand All @@ -100,11 +106,12 @@ case $genomeID in
mm9) refSeq="/broad/aidenlab/references/Mus_musculus_assembly9_norandom.fasta";;
hg19) refSeq="/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";;
sCerS288c) refSeq="/broad/aidenlab/references/sacCer3.fa";;
*) echo "$usageHelp"
*) refSeq=$genomeID;;
echo "$genomeHelp"
exit 1
esac

## Set ligation junction based on restriction enzyme
case $site in
HindIII) ligation="AAGCTAGCTT";;
MseI) ligation="TTATAA";;
Expand All @@ -115,6 +122,7 @@ case $site in
exit 1
esac

## If short read end is set, make sure it is 1 or 2
case $shortreadend in
0) ;;
1) ;;
Expand All @@ -124,14 +132,17 @@ case $shortreadend in
exit 1
esac

## Hard-coded directory here; in future versions, this should be packaged with script
site_file="/broad/aidenlab/restriction_sites/${genomeID}_${site}.txt"
splitdir=$topDir"/splits"
fastqdir=$topDir"/fastq/*_R*.fastq"
outputdir=$topDir"/aligned"
read1=$splitdir"/*${read1str}*.fastq"
## ARRAY holds the names of the jobs as they are submitted
countjobs=0
declare -a ARRAY

## Check that fastq directory exists and has proper fastq files
if [ ! -d "$topDir/fastq" ]; then
echo "Directory \"$topDir/fastq\" does not exist."
echo "Create \"$topDir/$fastq\" and put fastq files to be aligned there."
Expand All @@ -150,6 +161,7 @@ else
fi
fi

## If key option is given, check that the key exists in the menu properties file
if [ ! -z $key ]
then
if ! grep -q -m 1 $key /broad/aidenlab/neva/neva_scripts/hicInternalMenu.properties
Expand All @@ -160,8 +172,6 @@ if [ ! -z $key ]
fi
fi



## Create output directory
if [ -d "$outputdir" ]; then
echo "Move or remove directory \"$outputdir\" before proceeding."
Expand All @@ -171,27 +181,26 @@ fi

mkdir $outputdir


## Create split directory
if [ -d "$splitdir" ]; then
splitdirexists=1
else
mkdir $splitdir
fi

## Create temporary directory
## Create temporary directory, used for sort later
if [ ! -d "/broad/hptmp/neva" ]; then
mkdir /broad/hptmp/neva
fi

## use LSF cluster on Broad
source /broad/software/scripts/useuse
reuse LSF

echo -e "Aligning files matching $fastqdir\n in queue $queue to genome $genomeID"



## Split fastq files into smaller portions for parallelizing alignment
## Do this by creating a text script file for the job in "tmp" and then sending it to LSF
if [ ! $splitdirexists ]; then
echo "Created $splitdir and $outputdir. Splitting files"
for i in ${fastqdir}
Expand All @@ -210,6 +219,8 @@ if [ ! $splitdirexists ]; then
rm tmp
done


# Once split succeeds, rename the splits as fastq files
echo -e '#!/bin/bash -l' > tmp
echo -e "#BSUB -q priority" >> tmp
echo -e "#BSUB -o $topDir/lsf.out\n" >> tmp
Expand All @@ -224,6 +235,8 @@ if [ ! $splitdirexists ]; then
ARRAY[countjobs]="${groupname}move"
countjobs=$(( $countjobs + 1 ))

# for all the jobs that have been launched so far, create a condition in case
# any of them exit prematurely
for (( i=0; i < countjobs; i++ ))
do
if [ $i -eq 0 ]; then
Expand All @@ -245,9 +258,14 @@ if [ ! $splitdirexists ]; then
bsub < tmp
rm tmp
else
# No need to re-split fastqs if they already exist
echo -e "Using already created files in $splitdir \n"
fi

## Launch job. Once split is done, kill the cleanup job, then
## set the parameters for the launch. Concatenate the script
## "launch.sh" to this script. This will start the rest of the
## parallel jobs.
echo -e '#!/bin/bash -l' > tmp2
echo -e "#BSUB -q priority" >> tmp2
echo -e "#BSUB -o $topDir/lsf.out\n" >> tmp2
Expand Down
14 changes: 8 additions & 6 deletions chimeric.awk
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ BEGIN{
split(c[j], tmp);
split(tmp[1],readname,"/");
read[j] = readname[2];
name[j] = readname[1];
name[j] = tmp[1];

# strand
str[j] = tmp[2];
Expand All @@ -81,6 +81,7 @@ BEGIN{
cigarstr[j] = tmp[6];
# sequence
seq[j] = tmp[10];
qual[j] = tmp[11];
# get rid of soft clipping to know correct position
if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
split(tmp[6], cigar, "S");
Expand Down Expand Up @@ -145,10 +146,10 @@ BEGIN{
if (mapped[read1] && mapped[read2]) {
count_norm++;
if (less_than(str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2])) {
print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2] > fname1;
print str[read1],chr[read1],pos[read1],str[read2],chr[read2],pos[read2],m[read1],cigarstr[read1],seq[read1],m[read2],cigarstr[read2],seq[read2],name[read1],name[read2] > fname1;
}
else {
print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1] > fname1;
print str[read2],chr[read2],pos[read2],str[read1],chr[read1],pos[read1],m[read2],cigarstr[read2],seq[read2],m[read1],cigarstr[read1],seq[read1],name[read2],name[read1] > fname1;
}
}
else {
Expand Down Expand Up @@ -185,7 +186,8 @@ BEGIN{
m[j] = tmp[5];
cigarstr[j] = tmp[6];
seq[j] = tmp[10];
name[j]=readname[1];
qual[j] = tmp[11];
name[j] = tmp[1];


if (str[j] == 0 && tmp[6] ~/^[0-9]+S/) {
Expand Down Expand Up @@ -222,10 +224,10 @@ BEGIN{
count_reg++;
if (less_than(str[0],chr[0],pos[0],str[1],chr[1],pos[1])) {
# ideally we'll get rid of printing out cigar string at some point
print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1] > fname1;
print str[0],chr[0],pos[0],str[1],chr[1],pos[1],m[0],cigarstr[0],seq[0],m[1],cigarstr[1],seq[1],name[0],name[1] > fname1;
}
else {
print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0] > fname1;
print str[1],chr[1],pos[1],str[0],chr[0],pos[0],m[1],cigarstr[1],seq[1],m[0],cigarstr[0],seq[0],name[1],name[0] > fname1;
}
}
else {
Expand Down
2 changes: 1 addition & 1 deletion cleanup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ total=`ls -l aligned/merged_sort.txt | awk '{print $5}'`
total2=`ls -l aligned/merged_nodups.txt aligned/dups.txt aligned/opt_dups.txt | awk '{sum = sum + $5}END{print sum}'`
if [ $total -eq $total2 ]; then rm aligned/merged_sort.txt; else echo "Problem: The sum of merged_nodups and the dups files is not the same size as merged_sort.txt"; fi
rm -r splits
bzip2 fastq/*.fastq
#bzip2 fastq/*.fastq
Loading

0 comments on commit 5e5ef74

Please sign in to comment.