Skip to content

Commit

Permalink
Added in key for properties menu
Browse files Browse the repository at this point in the history
LibraryComplexity reads files to calculate unique read pairs / duplicates
Name of optical dups file changed
Got rid of Matlab for producing graphs (affects launch, statistics, and relaunch), Java now reads the .m file directly and produces them in the Juicebox GUI
Mapq script now also eliminates intra fragment reads
svn r107
  • Loading branch information
nchernia committed Aug 8, 2017
1 parent b5fe5d3 commit 76470e0
Show file tree
Hide file tree
Showing 8 changed files with 161 additions and 166 deletions.
Binary file modified LibraryComplexity.class
Binary file not shown.
155 changes: 90 additions & 65 deletions LibraryComplexity.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,92 +2,117 @@
* Taken from PicardTools
*/
import java.text.NumberFormat;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.File;
class LibraryComplexity {

public static void main(String[] args) {
if (args.length != 2) {
System.out.println("Usage: java LibraryComplexity <reads> <unique reads>");
System.exit(0);
}
long readPairs = 0;
long uniqueReadPairs = 0;
try {
readPairs = Long.parseLong(args[0]);
uniqueReadPairs = Long.parseLong(args[1]);
}
catch (NumberFormatException e) {
System.err.println("Arguments must be integers");
System.exit(1);
}
long result;
try {
result = estimateLibrarySize(readPairs, uniqueReadPairs);
}
catch (NullPointerException e) {
System.err.println("Library complexity undefined when total = " + readPairs + " and unique = " + uniqueReadPairs);
return;
}
long result2 = readPairs*readPairs/(2*(readPairs-uniqueReadPairs));
System.out.println("Library complexity (new): " + NumberFormat.getInstance().format(result));
System.out.println("Library complexity (old): " + NumberFormat.getInstance().format(result2));

public static void main(String[] args) {
if (args.length != 1) {
System.out.println("Usage: java LibraryComplexity <directory>");
System.exit(0);
}
BufferedReader reader = null;
long readPairs = 0;
long uniqueReadPairs = 0;
long opticalDups = 0;
try {
File f = new File(args[0] + "/opt_dups.txt");
if (f.exists()) {
reader = new BufferedReader(new FileReader(f));
while (reader.readLine() != null) opticalDups++;
reader.close();
}

f = new File(args[0] + "/merged_nodups.txt");
if (f.exists()) {
reader = new BufferedReader(new FileReader(f));
while (reader.readLine() != null) uniqueReadPairs++;
reader.close();
}
f = new File(args[0] + "/dups.txt");
if (f.exists()) {
reader = new BufferedReader(new FileReader(args[0] + "/dups.txt"));
while (reader.readLine() != null) readPairs++;
reader.close();
readPairs += uniqueReadPairs;
}
} catch (IOException error) {
System.err.println("Problem counting lines in merged_nodups and dups");
System.exit(1);
}
long result;
try {
result = estimateLibrarySize(readPairs, uniqueReadPairs);
}
catch (NullPointerException e) {
System.err.println("Library complexity undefined when total = " + readPairs + " and unique = " + uniqueReadPairs);
return;
}
long result2 = readPairs*readPairs/(2*(readPairs-uniqueReadPairs));

System.out.println("Total reads after duplication removal: " + NumberFormat.getInstance().format(uniqueReadPairs));
System.out.println("Duplicate reads: " + NumberFormat.getInstance().format(readPairs - uniqueReadPairs));
System.out.println("Optical duplicates: " + NumberFormat.getInstance().format(opticalDups));
System.out.println("Library complexity (new): " + NumberFormat.getInstance().format(result));
System.out.println("Library complexity (old): " + NumberFormat.getInstance().format(result2));

}

/**
* Estimates the size of a library based on the number of paired end molecules
* observed and the number of unique pairs observed.
* <br>
* Based on the Lander-Waterman equation that states:<br>
* C/X = 1 - exp( -N/X )<br>
* where<br>
* X = number of distinct molecules in library<br>
* N = number of read pairs<br>
* C = number of distinct fragments observed in read pairs<br>
*/
public static Long estimateLibrarySize(final long readPairs,
final long uniqueReadPairs) {
final long readPairDuplicates = readPairs - uniqueReadPairs;
if (readPairs > 0 && readPairDuplicates > 0) {
/**
* Estimates the size of a library based on the number of paired end molecules
* observed and the number of unique pairs observed.
* <br>
* Based on the Lander-Waterman equation that states:<br>
* C/X = 1 - exp( -N/X )<br>
* where<br>
* X = number of distinct molecules in library<br>
* N = number of read pairs<br>
* C = number of distinct fragments observed in read pairs<br>
*/
public static Long estimateLibrarySize(final long readPairs,
final long uniqueReadPairs) {
final long readPairDuplicates = readPairs - uniqueReadPairs;
if (readPairs > 0 && readPairDuplicates > 0) {
long n = readPairs;
long c = uniqueReadPairs;

double m = 1.0, M = 100.0;

if (c >= n || f(m*c, c, n) < 0) {
throw new IllegalStateException("Invalid values for pairs and unique pairs: " + n + ", " + c);
throw new IllegalStateException("Invalid values for pairs and unique pairs: " + n + ", " + c);
}

while( f(M*c, c, n) >= 0 ) {
m = M;
M *= 10.0;
m = M;
M *= 10.0;
}

double r = (m+M)/2.0;
double u = f( r * c, c, n );
int i = 0;

while (u != 0 && i < 1000) {
if (u > 0) m = r;
else M = r;
r = (m+M)/2.0;
u = f( r * c, c, n );
i++;
if (u > 0) m = r;
else M = r;
r = (m+M)/2.0;
u = f( r * c, c, n );
i++;
}
if (i == 1000) {
System.err.println("Iterated 1000 times, returning estimate");
System.err.println("Iterated 1000 times, returning estimate");
}

return (long) (c * (m+M)/2.0);
}
else {
return null;
}
}

/** Method that is used in the computation of estimated library size. */
private static double f(double x, double c, double n) {
return c/x - 1 + Math.exp(-n/x);
else {
return null;
}

}

/** Method that is used in the computation of estimated library size. */
private static double f(double x, double c, double n) {
return c/x - 1 + Math.exp(-n/x);
}
}
11 changes: 11 additions & 0 deletions align_stable.sh
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,17 @@ else
fi
fi

if [ ! -z $key ]
then
if ! grep -q -m 1 $key /broad/aidenlab/neva/neva_scripts/hicInternalMenu.properties
then
echo "Cannot find key $key in hicInternalMenu.properties"
echo "Please use an existing key or omit the key option"
exit 1
fi
fi



## Create output directory
if [ -d "$outputdir" ]; then
Expand Down
2 changes: 1 addition & 1 deletion dups.awk
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ BEGIN {
# the variable "name" can be set via the -v flag
dupname=name"dups.txt";
nodupname=name"merged_nodups.txt";
optname=name"opt_dups.txt"; #added
optname=name"optdups.txt";
}
# strand, chromosome, fragment match previous; first position (sorted) within wobble1

Expand Down
6 changes: 3 additions & 3 deletions launch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,9 @@ if [ -z "$earlyexit" ]
echo -e "echo Splits done, launching other jobs." >> tmp
echo -e "bkill -J ${groupname}_clean2" >> tmp
echo "bsub -o $topDir/lsf.out -q $queue -g ${groupname}_clean -w \"done(${groupname}_kill)\" \"echo /broad/aidenlab/neva/neva_scripts/relaunch_dups.sh $flags > $topDir/relaunchme.sh\" " >> tmp
echo "bsub -o $topDir/lsf.out -q $queue -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}stats \"source /broad/software/scripts/useuse; reuse Java-1.6; reuse Matlab; awk -v thresh=1 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_0.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/stats_dups.txt $outputdir/dups.txt; export LC_ALL=en_US.UTF-8; cat $splitdir/*.res.txt | awk -f /broad/aidenlab/neva/neva_scripts/stats_sub.awk >> $outputdir/inter.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter.txt $outputdir/merged_0.txt; cat $splitdir/*_abnorm.sam > $outputdir/abnormal.sam; awk -f /broad/aidenlab/neva/neva_scripts/abnormal.awk $outputdir/abnormal.sam > $outputdir/abnormal.txt\" " >> tmp
echo -e "bsub -o $topDir/lsf.out -q $queue -R \"rusage[mem=4]\" -g $groupname -w \"done(${groupname}stats)\" -J ${groupname}hic \" source /broad/software/scripts/useuse; reuse Java-1.6; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_0.txt > $outputdir/merged_nodups_small.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small.txt $outputdir/merged_nodups.bin $genomeID ; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter.txt $outputdir/merged_nodups.bin $outputdir/inter.hic $genomeID ; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter.hic; rm $outputdir/merged_0.txt; rm $outputdir/merged_nodups_small.txt; rm $outputdir/merged_nodups.bin ; if [[ $topDir == *MiSeq* ]]; then mkdir $newdir; ln -s $outputdir/inter*.hic $newdir/. ; ln -s $outputdir/inter*.jpg $newdir/.; echo leaf0${newleaf} = ${key}, ${hicdir}, http://iwww.broadinstitute.org/igvdata/hic/files/${hicdir}/inter.hic, 50 >> /broad/aidenlab/hic_files/hicInternalMenu.properties; fi \" " >> tmp
echo -e "bsub -o $topDir/lsf.out -q $queue -R \"rusage[mem=4]\" -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}hic30 \"source /broad/software/scripts/useuse; reuse Java-1.6; reuse Matlab; awk -v thresh=30 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_30.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter_30.txt $outputdir/merged_30.txt; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_30.txt > $outputdir/merged_nodups_small_30.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small_30.txt $outputdir/merged_nodups_30.bin $genomeID; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter_30.txt $outputdir/merged_nodups_30.bin $outputdir/inter_30.hic $genomeID; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter_30.hic; rm $outputdir/merged_nodups_small_30.txt; rm $outputdir/merged_nodups_30.bin; rm $outputdir/merged_30.txt; \"" >> tmp
echo "bsub -o $topDir/lsf.out -q $queue -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}stats \"source /broad/software/scripts/useuse; reuse Java-1.6; awk -v thresh=1 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_0.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/stats_dups.txt $outputdir/dups.txt; export LC_ALL=en_US.UTF-8; cat $splitdir/*.res.txt | awk -f /broad/aidenlab/neva/neva_scripts/stats_sub.awk >> $outputdir/inter.txt; java -cp /broad/aidenlab/neva/neva_scripts/ LibraryComplexity $outputdir >> $outputdir/inter.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter.txt $outputdir/merged_0.txt; cat $splitdir/*_abnorm.sam > $outputdir/abnormal.sam; awk -f /broad/aidenlab/neva/neva_scripts/abnormal.awk $outputdir/abnormal.sam > $outputdir/abnormal.txt\" " >> tmp
echo -e "bsub -o $topDir/lsf.out -q $queue -R \"rusage[mem=4]\" -g $groupname -w \"done(${groupname}stats)\" -J ${groupname}hic \" source /broad/software/scripts/useuse; reuse Java-1.6; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_0.txt > $outputdir/merged_nodups_small.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small.txt $outputdir/merged_nodups.bin $genomeID ; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter.txt -g $outputdir/inter_hists.m $outputdir/merged_nodups.bin $outputdir/inter.hic $genomeID ; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter.hic; rm $outputdir/merged_0.txt; rm $outputdir/merged_nodups_small.txt; rm $outputdir/merged_nodups.bin ; if [[ $topDir == *MiSeq* ]]; then mkdir $newdir; ln -s $outputdir/inter*.hic $newdir/. ; echo leaf0${newleaf} = ${key}, ${hicdir}, http://iwww.broadinstitute.org/igvdata/hic/files/${hicdir}/inter.hic, 50 >> /broad/aidenlab/hic_files/hicInternalMenu.properties; fi \" " >> tmp
echo -e "bsub -o $topDir/lsf.out -q $queue -R \"rusage[mem=4]\" -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}hic30 \"source /broad/software/scripts/useuse; reuse Java-1.6; awk -v thresh=30 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_30.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter_30.txt $outputdir/merged_30.txt; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_30.txt > $outputdir/merged_nodups_small_30.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small_30.txt $outputdir/merged_nodups_30.bin $genomeID; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter_30.txt -g $outputdir/inter_30_hists.m $outputdir/merged_nodups_30.bin $outputdir/inter_30.hic $genomeID; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter_30.hic; rm $outputdir/merged_nodups_small_30.txt; rm $outputdir/merged_nodups_30.bin; rm $outputdir/merged_30.txt; \"" >> tmp
bsub < tmp
rm tmp

Expand Down
2 changes: 1 addition & 1 deletion mapq.awk
Original file line number Diff line number Diff line change
@@ -1 +1 @@
$9>=thresh && $12>=thresh{print}
$9>=thresh && $12>=thresh && !($2==$6 && $4==$8){print}
4 changes: 2 additions & 2 deletions split_rmdups.awk
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ END {
else {
waitstring=sprintf("%s || exit(%s)", waitstring, sname);
}
sysstring = sprintf("bsub -o %s -q %s -g %s -J %s_split -w \"done(%s_msplit*)\" \"cat %s/*_msplit*_dups.txt > %s/dups.txt;cat %s/*_msplit*_merged_nodups.txt > %s/merged_nodups.txt; \" ", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir);
sysstring = sprintf("bsub -o %s -q %s -g %s -J %s_split -w \"done(%s_msplit*)\" \"cat %s/*_msplit*_optdups.txt > %s/opt_dups.txt; cat %s/*_msplit*_dups.txt > %s/dups.txt;cat %s/*_msplit*_merged_nodups.txt > %s/merged_nodups.txt; \" ", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir, dir, dir);
system(sysstring);
sysstring = sprintf("bsub -o %s -q %s -g %s_skill -w \"%s\" \"bkill -g %s 0; bkill -g %skill 0\" ",outfile, queue, groupname, waitstring, groupname, groupname);
system(sysstring);
sysstring = sprintf("bsub -o %s -q %s -g %s -w \"done(%s_split)\" \"bkill -g %s_skill 0 ; rm %s/*_msplit*_dups.txt; rm %s/*_msplit*_merged_nodups.txt; rm %s/split* \"", outfile, queue, groupname, groupname, groupname, dir, dir, dir);
sysstring = sprintf("bsub -o %s -q %s -g %s -w \"done(%s_split)\" \"bkill -g %s_skill 0 ; rm %s/*_msplit*_optdups.txt; rm %s/*_msplit*_dups.txt; rm %s/*_msplit*_merged_nodups.txt; rm %s/split* \"", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir);
system(sysstring);


Expand Down
Loading

0 comments on commit 76470e0

Please sign in to comment.