From 76470e0af5ef7d3544d14e5ae90e252ae93cdd70 Mon Sep 17 00:00:00 2001
From: nchernia <nchernia@gmail.com>
Date: Wed, 17 Apr 2013 10:13:00 -0400
Subject: [PATCH] Added in key for properties menu LibraryComplexity reads
 files to calculate unique read pairs / duplicates Name of optical dups file
 changed Got rid of Matlab for producing graphs (affects launch, statistics,
 and relaunch), Java now reads the .m file directly and produces them in the
 Juicebox GUI Mapq script now also eliminates intra fragment reads svn r107

---
 LibraryComplexity.class | Bin 2339 -> 3092 bytes
 LibraryComplexity.java  | 155 +++++++++++++++++++++++-----------------
 align_stable.sh         |  11 +++
 dups.awk                |   2 +-
 launch.sh               |   6 +-
 mapq.awk                |   2 +-
 split_rmdups.awk        |   4 +-
 statistics.pl           | 147 ++++++++++++++-----------------------
 8 files changed, 161 insertions(+), 166 deletions(-)
diff --git a/LibraryComplexity.class b/LibraryComplexity.class
index 2bffa8fdf9143743455a95fa19224a0871d7de07..4d4123bd8847ef5e5d189cb64744db763cb82143 100644
GIT binary patch
literal 3092
zcmb7GTTmO<8U7Ahw5w$iSb!`$1aBOS1$Hb#UCR*1!8UP=4aNzMF}c`Tqy?{$b`@!5
zgL`k2G;y1z7q^ZZm$aSE(5E(bXMm|^()OX9PM`AF$M&gt>qDp0nKWbi9bJqlxHCn=
zIcNX#|KETAcmH(n*B=1r!+Sao;$#RT7}an}#~4oQe0WC1vns}Qs5mK?Ga8<g`&k`9
zOvvS&ib)mE>uACYI>LBSE-%UDyoL)JUY0i#DlCaTr6HvwiL?$I(<){(T$FH*hD-Id
zL;lrinAMR%RxYjvPeV?_D;n~_!O1d<gea)+H54^m*0C28!NJb*U2w3xL&FuRWlqOD
zuB!N)iq8uK4>_*m9~KD2yUz+J$Go&H&@|$>_Niia%Fds$rZS``S<7(+qVX3-E?JkY
zM8<Mw661c}ac7>8f{o+8m6{#3awUEhUnoU^p|a?dK#N`Qovh{CBhFOb%FmBGR|(o0
zKXIbF25-b8nt+(5UGbrzq3$68ecUVNQ}%I3S`n>?AM>)gj6LW0^ZTSV2Kq6e;)@2p
zgf9zpKUc73>;Y3=nHwa`Lun^(r+hCze^|v=415(|Gw>>26EI4yIbNcyZR>w~dbH?d
z(stg!>-aidPIx*0Lb{kM?DOY*18?9>X$Zlt;TyEI3U;(OJ#FXh^s}}_VSz~XLTV`!
zrQA2=@|J;bNtm}WK*mJY&d=EC3$9nyA`nWfom;9%!(&g+rR<#Vc&>qO%NX>Y&U;K;
z)=YUt*JqZ@3{zS#9oJk3YFcjEl!ykN#&H$jG4Ne{PoQJX_fy48=CtRyzMa3XMWCk=
zSSqOI!YsOJdzxy}=9P=KYx<sVWy~i{8L#UC;gYVR>%3C5Nfp?Aro@nETneT&O?+ud
z#z|Sy5i@URy~|c+z*O;l13$nG1MlFbz?Pv3lT8+i4iKt!jLxU1tqNe5$lEtDGVa<}
zx~0sfmtH2#O9P+5X9aew1z|Xov1hCdtK7G1qOfP!rHM}3<XU8Sr@g$Hvz$ErVO%S|
zmIzW~-CpVSFdZdI+I+mXx7TD_*oD3H#4qMunWt)}82BOHRdLI}kFY?mHx#l!^O}8Q
zQ<qrNY_v`Ka~<ROe8IP~?02t7_O^2OvE)y)m3=m}ZDrYl_baQ0ETHBjQ-*V(End^M
zHDf@LP048yAC^NQXytOYn-+L%GxiM#mG+`~&G*Wnaz@lnh>9vgQ&GV$l;+m+j7~*C
zrFTmssb!5;?y?e9lQkp9n?QXk;}y6E>^$2tycSG3huCQuZf9j)OIl6{w8z(ttFkd`
zR+l^8x99vsd3GN6@|<}B4Kubs>=t~>O%ZE*)uRoZ^gPH4D#^5Mri%<}snW(cB3ssT
zREB4b8>5zgQF=F*lNsSOg~|8C!P4tM;mV;2fhTx+n7iO!-?I#Hn+JhIT!STI2q8R)
z!#tC<?jGMG2KP|U2g-YBx`Vn%U<pcK1@&SibR?8iLdoD=1Sju8om__26X;t)D57&&
zLc=nQ0d1SMgvOhQ-$j^QO_STSWi&6d!uKp8@&_2vpl&6Di>v?ObIV4k*8c}eA(Zk!
zsJ4wzhpSetxBot8bj?uR)U6|ntN)Z{no)&kY#Gq*AQsuih$xH*sVmsNgol#aBDx23
z-c?{D4^y?dgpSA#9-<0Q3=hR_o2JtWlajWRO;Vz+DzTG{D|l4$c9!iXd(Hi>^1YON
z07q8^A^3p%-IaUhXHyOF$~}|0c7GdnXcsLaimjqkBt#$DNs;BVfZbdhSVjiFei{+s
zhfgb-5ycj4<5N4I2GEA1q>N)LQix#&58*Z5yv3SaKnH$-9k_!>@F8~M&v+D{pi}7B
zCBo<uE!ab?aS=neFwrAAu}>uMDO%7g_TzCefj)5#`^7X4h}V%6KgU7wNA!!2Fu-rB
zLwJhTB@m9RVh}+UgDQ?FilWjUGzAVc1^$iU#*d*LRdK9wRK?II*rl)+_?|5Hf>@+$
zEo_#C4%1F59ii(BLPQmA__!k(G?Q9Wzut=Oej}<ygV%3iYfO)7*Jk-24PINpp71-j
z(0vzstGyC=i~)`^I~|dI^29vIs#r$iH!)(=O5S)Ek(kl1$Bb6=&h{INtAB~{A=QdL
zuRk0MujB4-4jhQYn!`74;;FXCmcU0pMN&R4K#zs{^+l-MjQ=J7YD6rq{<b9hV>HMT
z-_PQsl9JG@)cW=cKK;IUpD`_G|2#7aoJ2iFnd?)`^BD8{3}?_dV}Axyn80P6!&OW&
z<1g~F<t6+C=kYVVj9<aR@5u3c*2ITM<FEW~{TplJ@3@G6VOE6Tma^A{&?>=HrcF@-
z0btk8d*}!RM0?<ord5_%fOJ}+%lCOXX{^a0e@t<_%WIm6DOb9@6&y(YlBC+H=ksZp
Iv^t#l4<)JSfB*mh

delta 1307
zcmZ9LO;B4^6vuy$e7u*(Bfu+!loApuN{B@Yg<|+<DW$XpiuQvlsI>u}F@*#gl3J@y
zHq}Kp>O5xZjGa-(&aiM{sTvqNQ+3ge<HC(=b){}}>BbSyBkjQWX5KyL-v2r0p7THF
zt^_|d3*Z0y^?jg|w{%+R_0dPa#(>Tt4(qZR)H%YC%25Xy);OjUWyCo=u5v=<q)r9B
z8l#RJbCz+9Q;wYQNL=SD(|AVbSx!648I7~v!Ko6Wb1=#|l}U~B8qev3c-}kMTmrp=
zC5gtAbKkVi3%scElFEhm|D0A|mZT&Fds3P4$9PiZqCtWgg)aMn5-wz|#N6_sNPj+)
zx}3KnrLD@W!5o&sD<oB3HArzuA$%!uCDFc+$Ryh*^642XJF=KfCvwNG&RR>k)MCb9
z-U(cPB%92qtxRq?lFl#ZA~RMbmC0F2E4ysaMYq8MX@y27nqA>)u#K{y-QsF*%&=&X
zVM)OsNo3|C4|5ub7+fZ6|KJK$s7Q<)d4nswX8+*|yAIN0m%ArD1~DxAqI+NYRbE%Q
zX7C1Y+Be<BG5>bj<BOT3f^R92UA9VkZFGDaKAX#?GReNyQ3aQk&5GrMIGj%`<SjXg
zwvO6ko<2L|4cqT{_S*g4+l8o1gY&uN7j{wDCt66PBGMCEM_Cg^p-onA=@1`J(Js1}
zDx=ajpI~3b?t4C>@-AhjtBBjRfyX!K8;ZF)H}DR|+>RJprMlO17tQpEiCatx;ie7r
zBIP}vRieGBVE3q)TE}lz7O4sssWwefnnWqGOT5;?;ZzMZ;U+7U?h*U4NA>|R+n_Gs
zDH0U5WW595{Sa4o;)ONJ2+&LmD%~7r2t!VsV$V<`t0qPJ@S|f2QX}&opi=m$5}#V}
zGKIfgbO_Q-Q2av@p`Ky($mBJUq*0n3W`zh}vX}i5aStAIQ|w(T9q!$36qU}3ag_uA
z(Cn7GWWXG1?U>$LsE-=~E#O_bMO{!2XgB8N4|s2E(BS{z7OnRPO+6~VQOXN=9lnIL
zs(@A^Y@NtAL5Z=`^X7+`K_jjQjT)Nf<HqXdFG1PN*6`S3+#mFBTT2rg?4R_u!v6s|
zk5^qcmDSB}Yt3rc?;lg{boFuRk9coIn|H#!yTKE8lsi&&iPH<ziQa*S1HxRFFzB$>
zBkaWKr<WN9<Xs%%nlSmcF!>%w`GnyZw;ACJj`JNSc)&@1WRzbS<2O$67Zd!=X~oAm
h_S>J8x2f`^wkZ6$JQ!-3{amixIlt4gdwMIS_<z)-_=f-h

diff --git a/LibraryComplexity.java b/LibraryComplexity.java
index c7e9234..3fe23d5 100644
--- a/LibraryComplexity.java
+++ b/LibraryComplexity.java
@@ -2,92 +2,117 @@
  * Taken from PicardTools 
  */
 import java.text.NumberFormat;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.File;
 class LibraryComplexity {
-
-    public static void main(String[] args) {
-	if (args.length != 2) {
-	    System.out.println("Usage: java LibraryComplexity <reads> <unique reads>");
-	    System.exit(0);
-	}
-	long readPairs = 0;
-	long uniqueReadPairs = 0;
-	try {
-	    readPairs = Long.parseLong(args[0]);
-	    uniqueReadPairs = Long.parseLong(args[1]);
-	}
-	catch (NumberFormatException e) {
-	    System.err.println("Arguments must be integers");
-	    System.exit(1);
-	}
-	long result;
-	try {
-			result = estimateLibrarySize(readPairs, uniqueReadPairs);
-	}
-	catch (NullPointerException e) {
-			System.err.println("Library complexity undefined when total = " + readPairs + " and unique = " + uniqueReadPairs);
-			return;
-	}
-	long result2 = readPairs*readPairs/(2*(readPairs-uniqueReadPairs));
-	System.out.println("Library complexity (new): " + NumberFormat.getInstance().format(result));
-	System.out.println("Library complexity (old): " + NumberFormat.getInstance().format(result2));
-	
+  public static void main(String[] args) {
+    if (args.length != 1) {
+      System.out.println("Usage: java LibraryComplexity <directory>");
+      System.exit(0);
+    }
+    BufferedReader reader = null;
+    long readPairs = 0;
+    long uniqueReadPairs = 0;
+    long opticalDups = 0;
+    try {
+      File f = new File(args[0] + "/opt_dups.txt");
+      if (f.exists()) {
+        reader = new BufferedReader(new FileReader(f));
+        while (reader.readLine() != null) opticalDups++;
+        reader.close();
+      }
+    
+      f = new File(args[0] + "/merged_nodups.txt");
+      if (f.exists()) {
+        reader = new BufferedReader(new FileReader(f));
+        while (reader.readLine() != null) uniqueReadPairs++;
+        reader.close();				
+      }
+      f = new File(args[0] + "/dups.txt");
+      if (f.exists()) {
+        reader = new BufferedReader(new FileReader(args[0] + "/dups.txt"));
+        while (reader.readLine() != null) readPairs++;
+        reader.close();				
+        readPairs += uniqueReadPairs; 
+      }
+    } catch (IOException error) {
+      System.err.println("Problem counting lines in merged_nodups and dups");
+      System.exit(1);
+    }
+    long result;
+    try {
+      result = estimateLibrarySize(readPairs, uniqueReadPairs);
     }
+    catch (NullPointerException e) {
+      System.err.println("Library complexity undefined when total = " + readPairs + " and unique = " + uniqueReadPairs);
+      return;
+    }
+    long result2 = readPairs*readPairs/(2*(readPairs-uniqueReadPairs));
+    
+    System.out.println("Total reads after duplication removal: " + NumberFormat.getInstance().format(uniqueReadPairs));
+    System.out.println("Duplicate reads: " + NumberFormat.getInstance().format(readPairs - uniqueReadPairs));
+    System.out.println("Optical duplicates: " + NumberFormat.getInstance().format(opticalDups));
+    System.out.println("Library complexity (new): " + NumberFormat.getInstance().format(result));
+    System.out.println("Library complexity (old): " + NumberFormat.getInstance().format(result2));
+		
+  }
 
-/**
- * Estimates the size of a library based on the number of paired end molecules 
- * observed and the number of unique pairs observed.
- * <br>
- * Based on the Lander-Waterman equation that states:<br>
- *     C/X = 1 - exp( -N/X )<br>
- * where<br>
- *     X = number of distinct molecules in library<br>
- *     N = number of read pairs<br>
- *     C = number of distinct fragments observed in read pairs<br>
- */
-    public static Long estimateLibrarySize(final long readPairs, 
-					   final long uniqueReadPairs) {
-	final long readPairDuplicates = readPairs - uniqueReadPairs;
-	
-	if (readPairs > 0 && readPairDuplicates > 0) {
+  /**
+   * Estimates the size of a library based on the number of paired end molecules 
+   * observed and the number of unique pairs observed.
+   * <br>
+   * Based on the Lander-Waterman equation that states:<br>
+   *     C/X = 1 - exp( -N/X )<br>
+   * where<br>
+   *     X = number of distinct molecules in library<br>
+   *     N = number of read pairs<br>
+   *     C = number of distinct fragments observed in read pairs<br>
+   */
+  public static Long estimateLibrarySize(final long readPairs, 
+                                         final long uniqueReadPairs) {
+    final long readPairDuplicates = readPairs - uniqueReadPairs;
+    
+    if (readPairs > 0 && readPairDuplicates > 0) {
 	    long n = readPairs;
 	    long c = uniqueReadPairs;
 	    
 	    double m = 1.0, M = 100.0;
 	    
 	    if (c >= n || f(m*c, c, n) < 0) {
-		throw new IllegalStateException("Invalid values for pairs and unique pairs: " + n + ", " + c);
+        throw new IllegalStateException("Invalid values for pairs and unique pairs: " + n + ", " + c);
 	    }
-
+      
 	    while( f(M*c, c, n) >= 0 ) {
-		m = M;
-		M *= 10.0;
+        m = M;
+        M *= 10.0;
 	    }
-
+      
 	    double r = (m+M)/2.0;
 	    double u = f( r * c, c, n );
 	    int i = 0;
-
+      
 	    while (u != 0 && i < 1000) {
-					if (u > 0) m = r;
-					else M = r;
-					r = (m+M)/2.0;
-					u = f( r * c, c, n );
-					i++;
+        if (u > 0) m = r;
+        else M = r;
+        r = (m+M)/2.0;
+        u = f( r * c, c, n );
+        i++;
 	    }
 	    if (i == 1000) {
-					System.err.println("Iterated 1000 times, returning estimate");
+        System.err.println("Iterated 1000 times, returning estimate");
 	    }
 	    
 	    return (long) (c * (m+M)/2.0);
-	}
-	else {
-	    return null;
-	}
     }
-    
-    /** Method that is used in the computation of estimated library size. */
-    private static double f(double x, double c, double n) {
-	return c/x - 1 + Math.exp(-n/x);
+    else {
+	    return null;
     }
-    
+  }
+  
+  /** Method that is used in the computation of estimated library size. */
+  private static double f(double x, double c, double n) {
+    return c/x - 1 + Math.exp(-n/x);
+  }
 }    
diff --git a/align_stable.sh b/align_stable.sh
index 34e5247..1d32500 100755
--- a/align_stable.sh
+++ b/align_stable.sh
@@ -145,6 +145,17 @@ else
 		fi
 fi
 
+if [ ! -z $key ] 
+    then
+    if ! grep -q -m 1 $key /broad/aidenlab/neva/neva_scripts/hicInternalMenu.properties 
+        then 
+        echo "Cannot find key $key in hicInternalMenu.properties"
+        echo "Please use an existing key or omit the key option"
+        exit 1
+    fi
+fi
+
+
 
 ## Create output directory
 if [ -d "$outputdir" ]; then
diff --git a/dups.awk b/dups.awk
index 4818474..3afe2b5 100644
--- a/dups.awk
+++ b/dups.awk
@@ -30,7 +30,7 @@ BEGIN {
 	# the variable "name" can be set via the -v flag
 	dupname=name"dups.txt";
 	nodupname=name"merged_nodups.txt";
-	optname=name"opt_dups.txt"; #added
+	optname=name"optdups.txt"; 
 }
 # strand, chromosome, fragment match previous; first position (sorted) within wobble1
 
diff --git a/launch.sh b/launch.sh
index f1adad8..054424d 100644
--- a/launch.sh
+++ b/launch.sh
@@ -193,9 +193,9 @@ if [ -z "$earlyexit" ]
 		echo -e "echo Splits done, launching other jobs." >> tmp
 		echo -e "bkill -J ${groupname}_clean2" >> tmp
 		echo "bsub -o $topDir/lsf.out -q $queue -g ${groupname}_clean -w \"done(${groupname}_kill)\" \"echo /broad/aidenlab/neva/neva_scripts/relaunch_dups.sh $flags > $topDir/relaunchme.sh\" " >> tmp
-		echo "bsub -o $topDir/lsf.out -q $queue -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}stats \"source /broad/software/scripts/useuse; reuse Java-1.6; reuse Matlab; awk -v thresh=1 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_0.txt;   /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/stats_dups.txt $outputdir/dups.txt; export LC_ALL=en_US.UTF-8; cat $splitdir/*.res.txt | awk -f /broad/aidenlab/neva/neva_scripts/stats_sub.awk >> $outputdir/inter.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter.txt $outputdir/merged_0.txt; cat $splitdir/*_abnorm.sam > $outputdir/abnormal.sam; awk -f /broad/aidenlab/neva/neva_scripts/abnormal.awk $outputdir/abnormal.sam > $outputdir/abnormal.txt\" " >> tmp
-		echo -e "bsub -o $topDir/lsf.out  -q $queue -R \"rusage[mem=4]\" -g $groupname -w  \"done(${groupname}stats)\" -J ${groupname}hic \" source /broad/software/scripts/useuse; reuse Java-1.6; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_0.txt > $outputdir/merged_nodups_small.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small.txt $outputdir/merged_nodups.bin $genomeID ; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter.txt $outputdir/merged_nodups.bin $outputdir/inter.hic $genomeID ; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter.hic;  rm $outputdir/merged_0.txt; rm $outputdir/merged_nodups_small.txt; rm $outputdir/merged_nodups.bin ; if [[ $topDir == *MiSeq* ]]; then mkdir $newdir; ln -s $outputdir/inter*.hic $newdir/. ; ln -s $outputdir/inter*.jpg $newdir/.; echo leaf0${newleaf} = ${key}, ${hicdir}, http://iwww.broadinstitute.org/igvdata/hic/files/${hicdir}/inter.hic, 50 >> /broad/aidenlab/hic_files/hicInternalMenu.properties;  fi  \" " >> tmp
-		echo -e "bsub -o $topDir/lsf.out  -q $queue -R \"rusage[mem=4]\" -g $groupname -w  \"done(${groupname}_split)\" -J ${groupname}hic30 \"source /broad/software/scripts/useuse; reuse Java-1.6; reuse Matlab; awk -v thresh=30 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_30.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter_30.txt $outputdir/merged_30.txt; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_30.txt > $outputdir/merged_nodups_small_30.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small_30.txt $outputdir/merged_nodups_30.bin $genomeID; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter_30.txt $outputdir/merged_nodups_30.bin $outputdir/inter_30.hic $genomeID; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter_30.hic; rm $outputdir/merged_nodups_small_30.txt; rm $outputdir/merged_nodups_30.bin; rm $outputdir/merged_30.txt; \"" >> tmp
+		echo "bsub -o $topDir/lsf.out -q $queue -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}stats \"source /broad/software/scripts/useuse; reuse Java-1.6; awk -v thresh=1 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_0.txt;   /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/stats_dups.txt $outputdir/dups.txt; export LC_ALL=en_US.UTF-8; cat $splitdir/*.res.txt | awk -f /broad/aidenlab/neva/neva_scripts/stats_sub.awk >> $outputdir/inter.txt; java -cp /broad/aidenlab/neva/neva_scripts/ LibraryComplexity $outputdir >> $outputdir/inter.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter.txt $outputdir/merged_0.txt; cat $splitdir/*_abnorm.sam > $outputdir/abnormal.sam; awk -f /broad/aidenlab/neva/neva_scripts/abnormal.awk $outputdir/abnormal.sam > $outputdir/abnormal.txt\" " >> tmp
+		echo -e "bsub -o $topDir/lsf.out  -q $queue -R \"rusage[mem=4]\" -g $groupname -w  \"done(${groupname}stats)\" -J ${groupname}hic \" source /broad/software/scripts/useuse; reuse Java-1.6; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_0.txt > $outputdir/merged_nodups_small.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small.txt $outputdir/merged_nodups.bin $genomeID ; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter.txt -g $outputdir/inter_hists.m $outputdir/merged_nodups.bin $outputdir/inter.hic $genomeID ; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter.hic;  rm $outputdir/merged_0.txt; rm $outputdir/merged_nodups_small.txt; rm $outputdir/merged_nodups.bin ; if [[ $topDir == *MiSeq* ]]; then mkdir $newdir; ln -s $outputdir/inter*.hic $newdir/. ; echo leaf0${newleaf} = ${key}, ${hicdir}, http://iwww.broadinstitute.org/igvdata/hic/files/${hicdir}/inter.hic, 50 >> /broad/aidenlab/hic_files/hicInternalMenu.properties;  fi  \" " >> tmp
+		echo -e "bsub -o $topDir/lsf.out  -q $queue -R \"rusage[mem=4]\" -g $groupname -w  \"done(${groupname}_split)\" -J ${groupname}hic30 \"source /broad/software/scripts/useuse; reuse Java-1.6; awk -v thresh=30 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_30.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter_30.txt $outputdir/merged_30.txt; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_30.txt > $outputdir/merged_nodups_small_30.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small_30.txt $outputdir/merged_nodups_30.bin $genomeID; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter_30.txt -g $outputdir/inter_30_hists.m $outputdir/merged_nodups_30.bin $outputdir/inter_30.hic $genomeID; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter_30.hic; rm $outputdir/merged_nodups_small_30.txt; rm $outputdir/merged_nodups_30.bin; rm $outputdir/merged_30.txt; \"" >> tmp
 		bsub < tmp
 		rm tmp
 
diff --git a/mapq.awk b/mapq.awk
index a9c3c4c..838a5e7 100644
--- a/mapq.awk
+++ b/mapq.awk
@@ -1 +1 @@
-$9>=thresh && $12>=thresh{print}
+$9>=thresh && $12>=thresh && !($2==$6 && $4==$8){print}
diff --git a/split_rmdups.awk b/split_rmdups.awk
index eb7346d..d4a702a 100644
--- a/split_rmdups.awk
+++ b/split_rmdups.awk
@@ -34,11 +34,11 @@ END {
 	else {
 		waitstring=sprintf("%s || exit(%s)", waitstring, sname);
 	}
-	sysstring = sprintf("bsub -o %s -q %s -g %s -J %s_split -w \"done(%s_msplit*)\" \"cat %s/*_msplit*_dups.txt > %s/dups.txt;cat %s/*_msplit*_merged_nodups.txt > %s/merged_nodups.txt; \" ", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir);
+	sysstring = sprintf("bsub -o %s -q %s -g %s -J %s_split -w \"done(%s_msplit*)\" \"cat %s/*_msplit*_optdups.txt > %s/opt_dups.txt;  cat %s/*_msplit*_dups.txt > %s/dups.txt;cat %s/*_msplit*_merged_nodups.txt > %s/merged_nodups.txt; \" ", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir, dir, dir);
 	system(sysstring);
 	sysstring = sprintf("bsub -o %s -q %s -g %s_skill -w \"%s\" \"bkill -g %s 0; bkill -g %skill 0\" ",outfile, queue, groupname, waitstring, groupname, groupname);
 	system(sysstring);
- 	sysstring = sprintf("bsub -o %s -q %s -g %s -w \"done(%s_split)\" \"bkill -g %s_skill 0 ; rm %s/*_msplit*_dups.txt; rm %s/*_msplit*_merged_nodups.txt; rm %s/split* \"", outfile, queue, groupname, groupname, groupname, dir, dir, dir);
+ 	sysstring = sprintf("bsub -o %s -q %s -g %s -w \"done(%s_split)\" \"bkill -g %s_skill 0 ; rm %s/*_msplit*_optdups.txt; rm %s/*_msplit*_dups.txt; rm %s/*_msplit*_merged_nodups.txt; rm %s/split* \"", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir);
 	system(sysstring);
 
 
diff --git a/statistics.pl b/statistics.pl
index d9ee0a5..39612fd 100755
--- a/statistics.pl
+++ b/statistics.pl
@@ -101,6 +101,7 @@
 my $true_dangling_intra_small = 0;
 my $true_dangling_intra_large = 0;
 my $true_dangling_inter = 0;
+my $total_current = 0;
 # logspace bins
 my @bins = (10,12,15,19,23,28,35,43,53,66,81,100,123,152,187,231,285,351,433,534,658,811,1000,1233,1520,1874,2310,2848,3511,4329,5337,6579,8111,10000,12328,15199,18738,23101,28480,35112,43288,53367,65793,81113,100000,123285,151991,187382,231013,284804,351119,432876,533670,657933,811131,1000000,1232847,1519911,1873817,2310130,2848036,3511192,4328761,5336699,6579332,8111308,10000000,12328467,15199111,18738174,23101297,28480359,35111917,43287613,53366992,65793322,81113083,100000000,123284674,151991108,187381742,231012970,284803587,351119173,432876128,533669923,657933225,811130831,1000000000,1232846739,1519911083,1873817423,2310129700,2848035868,3511191734,4328761281,5336699231,6579332247,8111308308,10000000000);
 
@@ -114,52 +115,10 @@
 }
 close(FILE);
 
-my($fname, $fdir) = fileparse($infile);
-my $tot_dups;
-my $opt_dups;
-
-if ($fname eq "merged_0.txt") {
-	my $line_counts;
-	my @tmp;
-	my $testname = ${fdir} . "dups.txt";
-	if (-e $testname) {
-    $line_counts = `wc -l ${fdir}dups.txt`;
-		@tmp = split(' ', $line_counts);
-		$tot_dups = $tmp[0];
-	}
-	else {
-		$tot_dups = 0;
-	}
-	$testname = ${fdir} . "opt_dups.txt";
-	if (-e $testname) {
-	  $line_counts = `wc -l ${fdir}opt_dups.txt`;
-		@tmp = split(' ', $line_counts);
-		$opt_dups = $tmp[0];
-	}
-  else {
-		$opt_dups = 0;
-	}
-	$testname = ${fdir} . "merged_nodups.txt";
-	if (-e $testname) {
-	  $line_counts = `wc -l ${fdir}merged_nodups.txt`;
-		@tmp = split(' ', $line_counts);
-		$total = $tmp[0];
-	}
-	else {
-		$total = 0;
-	}
-}
-else {
-	$tot_dups = 0;
-	$opt_dups = 0;
-}
-
-print "tot" . $total . " " . $tot_dups . " " . $opt_dups . "\n";
-
 # read in infile and calculate statistics
 open FILE, $infile or die $!;
 while (<FILE>) {
-	#$total++;
+	$total_current++;
 	my @record = split;
 
 	my $num_records = scalar(@record);
@@ -300,25 +259,17 @@
 	}
 }
 close(FILE);
-open FILE, " >> $stats_file", or die $!;
-print FILE "Total reads after duplication removal: " . commify($total) . "\n";
-print FILE "Duplicate reads: " . commify($tot_dups)  . "\n"; 
-my $tot_reads = $total + $tot_dups + $opt_dups;
-my $result = `java -cp /broad/aidenlab/neva/neva_scripts LibraryComplexity $tot_reads $total`;
-print FILE "$result";
 
-if ($total==0) {
-	$total=1;
+my($statsfilename, $directories, $suffix)= fileparse($stats_file, qr/\.[^.]*/);
+my $histsfile = $directories . $statsfilename . "_hists.m";
+my $statssupfile = $directories . $statsfilename . "_supp.txt";
+
+open FILE, " >> $stats_file", or die $!;
+print FILE "Total reads in current file: " . commify($total_current) . "\n";
+if ($total_current==0) {
+	$total_current=1;
 }
-printf FILE "Ligations: %s (%0.2f\%)\n", commify($ligation), $ligation*100/$total;
-printf FILE "Dangling: %s (%0.2f\%)\n", commify($dangling), $dangling*100/$total;
-printf FILE "  Very small: %s (%0.2f\%)\n", commify($very_small_dangling), $very_small_dangling*100/$total;
-printf FILE "  Small: %s (%0.2f\%)\n", commify($small_dangling),$small_dangling*100/$total;
-printf FILE "  Large: %s (%0.2f\%)\n", commify($large_dangling),$large_dangling*100/$total;
-printf FILE "  Inter: %s (%0.2f\%)\n", commify($inter_dangling),$inter_dangling*100/$total;
-printf FILE "  True Small: %s (%0.2f\%)\n", commify($true_dangling_intra_small),$true_dangling_intra_small*100/$total;
-printf FILE "  True Large: %s (%0.2f\%)\n", commify($true_dangling_intra_large),$true_dangling_intra_large*100/$total;
-printf FILE "  True Inter: %s (%0.2f\%)\n", commify($true_dangling_inter),$true_dangling_inter*100/$total;
+printf FILE "Ligations: %s (%0.2f\%)\n", commify($ligation), $ligation*100/$total_current;
 if ($five_prime_end + $three_prime_end > 0) {
 	printf FILE "Five prime: %s (%0.2f\%)\n", commify($five_prime_end), $five_prime_end*100/($five_prime_end + $three_prime_end);
 	printf FILE "Three prime: %s (%0.2f\%)\n", commify($three_prime_end), $three_prime_end*100/($five_prime_end + $three_prime_end);
@@ -327,11 +278,11 @@
 	printf FILE "Five prime: $five_prime_end (%0.2f\%)\n", 0;
 	printf FILE "Three prime: $three_prime_end (%0.2f\%)\n", 0;
 }
-printf FILE "Inter: %s (%0.2f\%)\n", commify($inter), $inter*100/$total;
-printf FILE "Intra: %s (%0.2f\%)\n", commify($intra), $intra*100/$total;
-printf FILE "Small: %s (%0.2f\%)\n", commify($small), $small*100/$total;
-printf FILE "Large: %s (%0.2f\%)\n", commify($large), $large*100/$total;
-printf FILE "Very small: %s (%0.2f\%)\n", commify($very_small), $very_small*100/$total;
+printf FILE "Inter: %s (%0.2f\%)\n", commify($inter), $inter*100/$total_current;
+printf FILE "Intra: %s (%0.2f\%)\n", commify($intra), $intra*100/$total_current;
+printf FILE "Small: %s (%0.2f\%)\n", commify($small), $small*100/$total_current;
+printf FILE "Large: %s (%0.2f\%)\n", commify($large), $large*100/$total_current;
+printf FILE "Very small: %s (%0.2f\%)\n", commify($very_small), $very_small*100/$total_current;
 if ($large > 0) {
 	printf FILE "Inner: %s (%0.2f\%) \n", commify($inner), $inner*100/$large;
 	printf FILE "Outer: %s (%0.2f\%) \n", commify($outer), $outer*100/$large;
@@ -339,44 +290,52 @@
 	printf FILE "Right: %s (%0.2f\%) \n", commify($right), $right*100/$large;
 }
 close FILE;
-my($statsfilename, $directories, $suffix)= fileparse($stats_file, qr/\.[^.]*/);
-my $histsfile = $directories . $statsfilename . "_hists.m";
+
+open FILE, " > $statssupfile", or die $!;
+printf FILE "Dangling: %s (%0.2f\%)\n", commify($dangling), $dangling*100/$total_current;
+printf FILE "  Very small: %s (%0.2f\%)\n", commify($very_small_dangling), $very_small_dangling*100/$total_current;
+printf FILE "  Small: %s (%0.2f\%)\n", commify($small_dangling),$small_dangling*100/$total_current;
+printf FILE "  Large: %s (%0.2f\%)\n", commify($large_dangling),$large_dangling*100/$total_current;
+printf FILE "  Inter: %s (%0.2f\%)\n", commify($inter_dangling),$inter_dangling*100/$total_current;
+printf FILE "  True Small: %s (%0.2f\%)\n", commify($true_dangling_intra_small),$true_dangling_intra_small*100/$total_current;
+printf FILE "  True Large: %s (%0.2f\%)\n", commify($true_dangling_intra_large),$true_dangling_intra_large*100/$total_current;
+printf FILE "  True Inter: %s (%0.2f\%)\n", commify($true_dangling_inter),$true_dangling_inter*100/$total_current;
+close FILE;
+
 open FILE, "> $histsfile", or die $!; 
-print FILE "A = [";
+print FILE "A = [\n";
 for (my $i=1; $i <= 2000; $i++) {
 	my $tmp =	 $hindIII{$i} || 0;
 	print FILE "$tmp ";
 }
-print FILE "];\n";
-print FILE "B = [";
+print FILE "\n];\n";
+print FILE "B = [\n";
 for (my $i=0; $i <= 200; $i++) {
 	my $tmp = $mapQ{$i} || 0;
 	my $tmp2 = $mapQ_intra{$i} || 0;
 	my $tmp3 = $mapQ_inter{$i} || 0;
-	print FILE "$tmp,$tmp2,$tmp3; ";
+	print FILE "$tmp $tmp2 $tmp3\n ";
 }
-print FILE "];\n";
-print FILE "D = [";
+print FILE "\n];\n";
+print FILE "D = [\n";
 for (my $i=0; $i < scalar(@bins); $i++) {
 	my $tmp = $innerM{$i} || 0;
-	print FILE "$tmp,";
+	print FILE "$tmp ";
 	$tmp = $outerM{$i} || 0;
-	print FILE "$tmp,";
+	print FILE "$tmp ";
 	$tmp = $rightM{$i} || 0;
-	print FILE "$tmp,";
+	print FILE "$tmp ";
 	$tmp = $leftM{$i} || 0;
-	print FILE "$tmp;\n";
+	print FILE "$tmp\n";
 }
 
-print FILE "];";
-print FILE "x = [";
+print FILE "\n];";
+print FILE "x = [\n";
 for (my $i=0; $i < scalar(@bins); $i++) {
-	print FILE "$bins[$i],";
+	print FILE "$bins[$i] ";
 }
-print FILE "];";
-$str = "\'$outdir\'";
-
-system('matlab -nodisplay -r "addpath(\'/broad/aidenlab/neva/neva_scripts/\'); addpath( ' . ${str} . ');' . ${statsfilename} . '_hists; plotstats (A,B,D,x, \'' . ${directories} . ${statsfilename} . '\'); quit;" > /dev/null');
+print FILE "\n];\n";
+close FILE;
 
 # Find distance to nearest HindIII restriction site
 sub distHindIII {
@@ -394,16 +353,16 @@ sub distHindIII {
 	}
 	my $dist2 = abs($_[2] - $chromosomes{$_[1]}[$index]);
 	
-	if ($index == 0) {
-		if (!($_[2] >= 0 && $_[2] <= $chromosomes{$_[1]}[$index])) {
-	    print(STDERR "Problem not " . $_[2] . " >= 0 && <= " . $chromosomes{$_[1]}[$index] . " " . $_[1] . " index = " . $index . " chr = " . $_[1] . "\n");
-		}
-	}
-	else {
-		if (!($_[2] >= $chromosomes{$_[1]}[$index-1] && $_[2] <= $chromosomes{$_[1]}[$index])) {
-	    print(STDERR "Problem not " . $_[2] . " >= " . $chromosomes{$_[1]}[$index-1] . " && <= " . $chromosomes{$_[1]}[$index] . " index = " . $index . " chr = " . $_[1] . "\n");
-		}
-	}
+#	if ($index == 0) {
+#		if (!($_[2] >= 0 && $_[2] <= $chromosomes{$_[1]}[$index])) {
+#	    print(STDERR "Problem not " . $_[2] . " >= 0 && <= " . $chromosomes{$_[1]}[$index] . " " . $_[1] . " index = " . $index . " chr = " . $_[1] . "\n");
+#		}
+#	}
+#	else {
+#		if (!($_[2] >= $chromosomes{$_[1]}[$index-1] && $_[2] <= $chromosomes{$_[1]}[$index])) {
+#	    print(STDERR "Problem not " . $_[2] . " >= " . $chromosomes{$_[1]}[$index-1] . " && <= " . $chromosomes{$_[1]}[$index] . " index = " . $index . " chr = " . $_[1] . "\n");
+#		}
+#	}
 	# get minimum value -- if (dist1 <= dist2), it's dist1, else dist2
 	my $retval = $dist1 <= $dist2 ? $dist1 : $dist2; 
 	# get which end of the fragment this is, 3' or 5' (depends on strand)