Added in key for properties menu

LibraryComplexity reads files to calculate unique read pairs / duplicates Name of optical dups file changed Got rid of Matlab for producing graphs (affects launch, statistics, and relaunch), Java now reads the .m file directly and produces them in the Juicebox GUI Mapq script now also eliminates intra fragment reads svn r107
aidenlab · Aug 8, 2017 · 76470e0 · 76470e0
1 parent b5fe5d3
commit 76470e0
Show file tree

Hide file tree

Showing 8 changed files with 161 additions and 166 deletions.
diff --git a/LibraryComplexity.class b/LibraryComplexity.class
diff --git a/LibraryComplexity.java b/LibraryComplexity.java
@@ -2,92 +2,117 @@
  * Taken from PicardTools 
  */
 import java.text.NumberFormat;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.File;
 class LibraryComplexity {
-
-    public static void main(String[] args) {
-	if (args.length != 2) {
-	    System.out.println("Usage: java LibraryComplexity <reads> <unique reads>");
-	    System.exit(0);
-	}
-	long readPairs = 0;
-	long uniqueReadPairs = 0;
-	try {
-	    readPairs = Long.parseLong(args[0]);
-	    uniqueReadPairs = Long.parseLong(args[1]);
-	}
-	catch (NumberFormatException e) {
-	    System.err.println("Arguments must be integers");
-	    System.exit(1);
-	}
-	long result;
-	try {
-			result = estimateLibrarySize(readPairs, uniqueReadPairs);
-	}
-	catch (NullPointerException e) {
-			System.err.println("Library complexity undefined when total = " + readPairs + " and unique = " + uniqueReadPairs);
-			return;
-	}
-	long result2 = readPairs*readPairs/(2*(readPairs-uniqueReadPairs));
-	System.out.println("Library complexity (new): " + NumberFormat.getInstance().format(result));
-	System.out.println("Library complexity (old): " + NumberFormat.getInstance().format(result2));
-
+  public static void main(String[] args) {
+    if (args.length != 1) {
+      System.out.println("Usage: java LibraryComplexity <directory>");
+      System.exit(0);
+    }
+    BufferedReader reader = null;
+    long readPairs = 0;
+    long uniqueReadPairs = 0;
+    long opticalDups = 0;
+    try {
+      File f = new File(args[0] + "/opt_dups.txt");
+      if (f.exists()) {
+        reader = new BufferedReader(new FileReader(f));
+        while (reader.readLine() != null) opticalDups++;
+        reader.close();
+      }
+
+      f = new File(args[0] + "/merged_nodups.txt");
+      if (f.exists()) {
+        reader = new BufferedReader(new FileReader(f));
+        while (reader.readLine() != null) uniqueReadPairs++;
+        reader.close();				
+      }
+      f = new File(args[0] + "/dups.txt");
+      if (f.exists()) {
+        reader = new BufferedReader(new FileReader(args[0] + "/dups.txt"));
+        while (reader.readLine() != null) readPairs++;
+        reader.close();				
+        readPairs += uniqueReadPairs; 
+      }
+    } catch (IOException error) {
+      System.err.println("Problem counting lines in merged_nodups and dups");
+      System.exit(1);
+    }
+    long result;
+    try {
+      result = estimateLibrarySize(readPairs, uniqueReadPairs);
     }
+    catch (NullPointerException e) {
+      System.err.println("Library complexity undefined when total = " + readPairs + " and unique = " + uniqueReadPairs);
+      return;
+    }
+    long result2 = readPairs*readPairs/(2*(readPairs-uniqueReadPairs));
+
+    System.out.println("Total reads after duplication removal: " + NumberFormat.getInstance().format(uniqueReadPairs));
+    System.out.println("Duplicate reads: " + NumberFormat.getInstance().format(readPairs - uniqueReadPairs));
+    System.out.println("Optical duplicates: " + NumberFormat.getInstance().format(opticalDups));
+    System.out.println("Library complexity (new): " + NumberFormat.getInstance().format(result));
+    System.out.println("Library complexity (old): " + NumberFormat.getInstance().format(result2));
+
+  }
 
-/**
- * Estimates the size of a library based on the number of paired end molecules 
- * observed and the number of unique pairs observed.
- * <br>
- * Based on the Lander-Waterman equation that states:<br>
- *     C/X = 1 - exp( -N/X )<br>
- * where<br>
- *     X = number of distinct molecules in library<br>
- *     N = number of read pairs<br>
- *     C = number of distinct fragments observed in read pairs<br>
- */
-    public static Long estimateLibrarySize(final long readPairs, 
-					   final long uniqueReadPairs) {
-	final long readPairDuplicates = readPairs - uniqueReadPairs;
-	
-	if (readPairs > 0 && readPairDuplicates > 0) {
+  /**
+   * Estimates the size of a library based on the number of paired end molecules 
+   * observed and the number of unique pairs observed.
+   * <br>
+   * Based on the Lander-Waterman equation that states:<br>
+   *     C/X = 1 - exp( -N/X )<br>
+   * where<br>
+   *     X = number of distinct molecules in library<br>
+   *     N = number of read pairs<br>
+   *     C = number of distinct fragments observed in read pairs<br>
+   */
+  public static Long estimateLibrarySize(final long readPairs, 
+                                         final long uniqueReadPairs) {
+    final long readPairDuplicates = readPairs - uniqueReadPairs;
+    
+    if (readPairs > 0 && readPairDuplicates > 0) {
 	    long n = readPairs;
 	    long c = uniqueReadPairs;
 
 	    double m = 1.0, M = 100.0;
 
 	    if (c >= n || f(m*c, c, n) < 0) {
-		throw new IllegalStateException("Invalid values for pairs and unique pairs: " + n + ", " + c);
+        throw new IllegalStateException("Invalid values for pairs and unique pairs: " + n + ", " + c);
 	    }
-
+      
 	    while( f(M*c, c, n) >= 0 ) {
-		m = M;
-		M *= 10.0;
+        m = M;
+        M *= 10.0;
 	    }
-
+      
 	    double r = (m+M)/2.0;
 	    double u = f( r * c, c, n );
 	    int i = 0;
-
+      
 	    while (u != 0 && i < 1000) {
-					if (u > 0) m = r;
-					else M = r;
-					r = (m+M)/2.0;
-					u = f( r * c, c, n );
-					i++;
+        if (u > 0) m = r;
+        else M = r;
+        r = (m+M)/2.0;
+        u = f( r * c, c, n );
+        i++;
 	    }
 	    if (i == 1000) {
-					System.err.println("Iterated 1000 times, returning estimate");
+        System.err.println("Iterated 1000 times, returning estimate");
 	    }
 
 	    return (long) (c * (m+M)/2.0);
-	}
-	else {
-	    return null;
-	}
     }
-
-    /** Method that is used in the computation of estimated library size. */
-    private static double f(double x, double c, double n) {
-	return c/x - 1 + Math.exp(-n/x);
+    else {
+	    return null;
     }
-
+  }
+
+  /** Method that is used in the computation of estimated library size. */
+  private static double f(double x, double c, double n) {
+    return c/x - 1 + Math.exp(-n/x);
+  }
 }    
diff --git a/align_stable.sh b/align_stable.sh
@@ -145,6 +145,17 @@ else
 		fi
 fi
 
+if [ ! -z $key ] 
+    then
+    if ! grep -q -m 1 $key /broad/aidenlab/neva/neva_scripts/hicInternalMenu.properties 
+        then 
+        echo "Cannot find key $key in hicInternalMenu.properties"
+        echo "Please use an existing key or omit the key option"
+        exit 1
+    fi
+fi
+
+
 
 ## Create output directory
 if [ -d "$outputdir" ]; then

diff --git a/dups.awk b/dups.awk
@@ -30,7 +30,7 @@ BEGIN {
 	# the variable "name" can be set via the -v flag
 	dupname=name"dups.txt";
 	nodupname=name"merged_nodups.txt";
-	optname=name"opt_dups.txt"; #added
+	optname=name"optdups.txt"; 
 }
 # strand, chromosome, fragment match previous; first position (sorted) within wobble1
 

diff --git a/launch.sh b/launch.sh
@@ -193,9 +193,9 @@ if [ -z "$earlyexit" ]
 		echo -e "echo Splits done, launching other jobs." >> tmp
 		echo -e "bkill -J ${groupname}_clean2" >> tmp
 		echo "bsub -o $topDir/lsf.out -q $queue -g ${groupname}_clean -w \"done(${groupname}_kill)\" \"echo /broad/aidenlab/neva/neva_scripts/relaunch_dups.sh $flags > $topDir/relaunchme.sh\" " >> tmp
-		echo "bsub -o $topDir/lsf.out -q $queue -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}stats \"source /broad/software/scripts/useuse; reuse Java-1.6; reuse Matlab; awk -v thresh=1 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_0.txt;   /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/stats_dups.txt $outputdir/dups.txt; export LC_ALL=en_US.UTF-8; cat $splitdir/*.res.txt | awk -f /broad/aidenlab/neva/neva_scripts/stats_sub.awk >> $outputdir/inter.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter.txt $outputdir/merged_0.txt; cat $splitdir/*_abnorm.sam > $outputdir/abnormal.sam; awk -f /broad/aidenlab/neva/neva_scripts/abnormal.awk $outputdir/abnormal.sam > $outputdir/abnormal.txt\" " >> tmp
-		echo -e "bsub -o $topDir/lsf.out  -q $queue -R \"rusage[mem=4]\" -g $groupname -w  \"done(${groupname}stats)\" -J ${groupname}hic \" source /broad/software/scripts/useuse; reuse Java-1.6; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_0.txt > $outputdir/merged_nodups_small.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small.txt $outputdir/merged_nodups.bin $genomeID ; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter.txt $outputdir/merged_nodups.bin $outputdir/inter.hic $genomeID ; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter.hic;  rm $outputdir/merged_0.txt; rm $outputdir/merged_nodups_small.txt; rm $outputdir/merged_nodups.bin ; if [[ $topDir == *MiSeq* ]]; then mkdir $newdir; ln -s $outputdir/inter*.hic $newdir/. ; ln -s $outputdir/inter*.jpg $newdir/.; echo leaf0${newleaf} = ${key}, ${hicdir}, http://iwww.broadinstitute.org/igvdata/hic/files/${hicdir}/inter.hic, 50 >> /broad/aidenlab/hic_files/hicInternalMenu.properties;  fi  \" " >> tmp
-		echo -e "bsub -o $topDir/lsf.out  -q $queue -R \"rusage[mem=4]\" -g $groupname -w  \"done(${groupname}_split)\" -J ${groupname}hic30 \"source /broad/software/scripts/useuse; reuse Java-1.6; reuse Matlab; awk -v thresh=30 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_30.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter_30.txt $outputdir/merged_30.txt; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_30.txt > $outputdir/merged_nodups_small_30.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small_30.txt $outputdir/merged_nodups_30.bin $genomeID; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter_30.txt $outputdir/merged_nodups_30.bin $outputdir/inter_30.hic $genomeID; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter_30.hic; rm $outputdir/merged_nodups_small_30.txt; rm $outputdir/merged_nodups_30.bin; rm $outputdir/merged_30.txt; \"" >> tmp
+		echo "bsub -o $topDir/lsf.out -q $queue -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}stats \"source /broad/software/scripts/useuse; reuse Java-1.6; awk -v thresh=1 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_0.txt;   /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/stats_dups.txt $outputdir/dups.txt; export LC_ALL=en_US.UTF-8; cat $splitdir/*.res.txt | awk -f /broad/aidenlab/neva/neva_scripts/stats_sub.awk >> $outputdir/inter.txt; java -cp /broad/aidenlab/neva/neva_scripts/ LibraryComplexity $outputdir >> $outputdir/inter.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter.txt $outputdir/merged_0.txt; cat $splitdir/*_abnorm.sam > $outputdir/abnormal.sam; awk -f /broad/aidenlab/neva/neva_scripts/abnormal.awk $outputdir/abnormal.sam > $outputdir/abnormal.txt\" " >> tmp
+		echo -e "bsub -o $topDir/lsf.out  -q $queue -R \"rusage[mem=4]\" -g $groupname -w  \"done(${groupname}stats)\" -J ${groupname}hic \" source /broad/software/scripts/useuse; reuse Java-1.6; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_0.txt > $outputdir/merged_nodups_small.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small.txt $outputdir/merged_nodups.bin $genomeID ; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter.txt -g $outputdir/inter_hists.m $outputdir/merged_nodups.bin $outputdir/inter.hic $genomeID ; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter.hic;  rm $outputdir/merged_0.txt; rm $outputdir/merged_nodups_small.txt; rm $outputdir/merged_nodups.bin ; if [[ $topDir == *MiSeq* ]]; then mkdir $newdir; ln -s $outputdir/inter*.hic $newdir/. ; echo leaf0${newleaf} = ${key}, ${hicdir}, http://iwww.broadinstitute.org/igvdata/hic/files/${hicdir}/inter.hic, 50 >> /broad/aidenlab/hic_files/hicInternalMenu.properties;  fi  \" " >> tmp
+		echo -e "bsub -o $topDir/lsf.out  -q $queue -R \"rusage[mem=4]\" -g $groupname -w  \"done(${groupname}_split)\" -J ${groupname}hic30 \"source /broad/software/scripts/useuse; reuse Java-1.6; awk -v thresh=30 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_30.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter_30.txt $outputdir/merged_30.txt; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_30.txt > $outputdir/merged_nodups_small_30.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small_30.txt $outputdir/merged_nodups_30.bin $genomeID; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter_30.txt -g $outputdir/inter_30_hists.m $outputdir/merged_nodups_30.bin $outputdir/inter_30.hic $genomeID; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter_30.hic; rm $outputdir/merged_nodups_small_30.txt; rm $outputdir/merged_nodups_30.bin; rm $outputdir/merged_30.txt; \"" >> tmp
 		bsub < tmp
 		rm tmp
 

diff --git a/mapq.awk b/mapq.awk
@@ -1 +1 @@
-$9>=thresh && $12>=thresh{print}
+$9>=thresh && $12>=thresh && !($2==$6 && $4==$8){print}
diff --git a/split_rmdups.awk b/split_rmdups.awk
@@ -34,11 +34,11 @@ END {
 	else {
 		waitstring=sprintf("%s || exit(%s)", waitstring, sname);
 	}
-	sysstring = sprintf("bsub -o %s -q %s -g %s -J %s_split -w \"done(%s_msplit*)\" \"cat %s/*_msplit*_dups.txt > %s/dups.txt;cat %s/*_msplit*_merged_nodups.txt > %s/merged_nodups.txt; \" ", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir);
+	sysstring = sprintf("bsub -o %s -q %s -g %s -J %s_split -w \"done(%s_msplit*)\" \"cat %s/*_msplit*_optdups.txt > %s/opt_dups.txt;  cat %s/*_msplit*_dups.txt > %s/dups.txt;cat %s/*_msplit*_merged_nodups.txt > %s/merged_nodups.txt; \" ", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir, dir, dir);
 	system(sysstring);
 	sysstring = sprintf("bsub -o %s -q %s -g %s_skill -w \"%s\" \"bkill -g %s 0; bkill -g %skill 0\" ",outfile, queue, groupname, waitstring, groupname, groupname);
 	system(sysstring);
- 	sysstring = sprintf("bsub -o %s -q %s -g %s -w \"done(%s_split)\" \"bkill -g %s_skill 0 ; rm %s/*_msplit*_dups.txt; rm %s/*_msplit*_merged_nodups.txt; rm %s/split* \"", outfile, queue, groupname, groupname, groupname, dir, dir, dir);
+ 	sysstring = sprintf("bsub -o %s -q %s -g %s -w \"done(%s_split)\" \"bkill -g %s_skill 0 ; rm %s/*_msplit*_optdups.txt; rm %s/*_msplit*_dups.txt; rm %s/*_msplit*_merged_nodups.txt; rm %s/split* \"", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir);
 	system(sysstring);
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		$9>=thresh && $12>=thresh{print}
		$9>=thresh && $12>=thresh && !($2==$6 && $4==$8){print}