From 76470e0af5ef7d3544d14e5ae90e252ae93cdd70 Mon Sep 17 00:00:00 2001 From: nchernia Date: Wed, 17 Apr 2013 10:13:00 -0400 Subject: [PATCH] Added in key for properties menu LibraryComplexity reads files to calculate unique read pairs / duplicates Name of optical dups file changed Got rid of Matlab for producing graphs (affects launch, statistics, and relaunch), Java now reads the .m file directly and produces them in the Juicebox GUI Mapq script now also eliminates intra fragment reads svn r107 --- LibraryComplexity.class | Bin 2339 -> 3092 bytes LibraryComplexity.java | 155 +++++++++++++++++++++++----------------- align_stable.sh | 11 +++ dups.awk | 2 +- launch.sh | 6 +- mapq.awk | 2 +- split_rmdups.awk | 4 +- statistics.pl | 147 ++++++++++++++----------------------- 8 files changed, 161 insertions(+), 166 deletions(-) diff --git a/LibraryComplexity.class b/LibraryComplexity.class index 2bffa8fdf9143743455a95fa19224a0871d7de07..4d4123bd8847ef5e5d189cb64744db763cb82143 100644 GIT binary patch literal 3092 zcmb7GTTmO<8U7Ahw5w$iSb!`$1aBOS1$Hb#UCR*1!8UP=4aNzMF}c`Tqy?{$b`@!5 zgL`k2G;y1z7q^ZZm$aSE(5E(bXMm|^()OX9PM`AF$M>>qDp0nKWbi9bJqlxHCn= zIcNX#|KETAcmH(n*B=1r!+Sao;$#RT7}an}#~4oQe0WC1vns}Qs5mK?Ga8uACYI>LBSE-%UDyoL)JUY0i#DlCaTr6HvwiL?$I(<){(T$FH*hD-Id zL;lrinAMR%RxYjvPeV?_D;n~_!O1d_*m9~KD2yUz+J$Go&H&@|$>_Niia%Fds$rZS``S<7(+qVX3-E?JkY zM88f{o+8m6{#3awUEhUnoU^p|a?dK#N`Qovh{CBhFOb%FmBGR|(o0 zKXIbF25-b8nt+(5UGbrzq3$68ecUVNQ}%I3S`n>?AM>)gj6LW0^ZTSV2Kq6e;)@2p zgf9zpKUc73>;Y3=nHwa`Lun^(r+hCze^|v=415(|Gw>>26EI4yIbNcyZR>w~dbH?d z(stg!>-aidPIx*0Lb{kM?DOY*18?9>X$Zlt;TyEI3U;(OJ#FXh^s}}_VSz~XLTV`! zrQA2=@|J;bNtm}WK*mJY&d=EC3$9nyA`nWfom;9%!(&g+rR<#Vc&>qO%NX>Y&U;K; z)=YUt*JqZ@3{zS#9oJk3YFcjEl!ykN#&H$jG4Ne{PoQJX_fy48=CtRyzMa3XMWCk= zSSqOI!YsOJdzxy}=9P=KYx%3C5Nfp?Aro@nETneT&O?+ud z#z|Sy5i@URy~|c+z*O;l13$nG1MlFbz?Pv3lT8+i4iKt!jLxU1tqNe5$lEtDGVa<} zx~0sfmtH2#O9P+5X9aew1z|Xov1hCdtK7G1qOfP!rHM}39vgQ&GV$l;+m+j7~*C zrFTmssb!5;?y?e9lQkp9n?QXk;}y6E>^$2tycSG3huCQuZf9j)OIl6{w8z(ttFkd` zR+l^8x99vsd3GN6@|<}B4Kubs>=t~>O%ZE*)uRoZ^gPH4D#^5Mri%<}snW(cB3ssT zREB4b8>5zgQF=F*lNsSOg~|8C!P4tM;mV;2fhTx+n7iO!-?I#Hn+JhIT!STI2q8R) z!#tCv?ObIV4k*8c}eA(Zk! zsJ4wzhpSetxBot8bj?uR)U6|ntN)Z{no)&kY#Gq*AQsuih$xH*sVmsNgol#aBDx23 z-c?{D4^y?dgpSA#9-<0Q3=hR_o2JtWlajWRO;Vz+DzTG{D|l4$c9!iXd(Hi>^1YON z07q8^A^3p%-IaUhXHyOF$~}|0c7GdnXcsLaimjqkBt#$DNs;BVfZbdhSVjiFei{+s zhfgb-5ycj4<5N4I2GEA1q>N)LQix#&58*Z5yv3SaKnH$-9k_!>@F8~M&v+D{pi}7B zCBouMDO%7g_TzCefj)5#`^7X4h}V%6KgU7wNA!!2Fu-rB zLwJhTB@m9RVh}+UgDQ?FilWjUGzAVc1^$iU#*d*LRdK9wRK?II*rl)+_?|5Hf>@+$ zEo_#C4%1F59ii(BLPQmA__!k(G?Q9Wzut=Oej}HMT z-_PQsl9JG@)cW=cKK;IUpD`_G|2#7aoJ2iFnd?)`^BD8{3}?_dV}Axyn80P6!&OW& z<1g~F=HrcF@- z0btk8d*}!RM0?u}F@*#gl3J@y zHq}Kp>O5xZjGa-(&aiM{sTvqNQ+3geBbSyBkjQWX5KyL-v2r0p7THF zt^_|d3*Z0y^?jg|w{%+R_0dPa#(>Tt4(qZR)H%YC%25Xy);OjUWyCo=u5v=k)MCb9 z-U(cPB%92qtxRq?lFl#ZA~RMbmC0F2E4ysaMYq8MX@y27nqA>)u#K{y-QsF*%&=&X zVM)OsNo3|C4|5ub7+fZ6|KJK$s7Q<)d4nswX8+*|yAIN0m%ArD1~DxAqI+NYRbE%Q zX7C1Y+Bebj@-AhjtBBjRfyX!K8;ZF)H}DR|+>RJprMlO17tQpEiCatx;ie7r zBIP}vRieGBVE3q)TE}lz7O4sssWwefnnWqGOT5;?;ZzMZ;U+7U?h*U4NA>|R+n_Gs zDH0U5WW595{Sa4o;)ONJ2+&LmD%~7r2t!VsV$V<`t0qPJ@S|f2QX}&opi=m$5}#V} zGKIfgbO_Q-Q2av@p`Ky($mBJUq*0n3W`zh}vX}i5aStAIQ|w(T9q!$36qU}3ag_uA z(Cn7GWWXG1?U>$LsE-=~E#O_bMO{!2XgB8N4|s2E(BS{z7OnRPO+6~VQOXN=9lnIL zs(@A^Y@NtAL5Z=`^X7+`K_jjQjT)Nf zBkaWKr%w`GnyZw;ACJj`JNSc)&@1WRzbS<2O$67Zd!=X~oAm h_S>J8x2f`^wkZ6$JQ!-3{amixIlt4gdwMIS_ "); - System.exit(0); - } - long readPairs = 0; - long uniqueReadPairs = 0; - try { - readPairs = Long.parseLong(args[0]); - uniqueReadPairs = Long.parseLong(args[1]); - } - catch (NumberFormatException e) { - System.err.println("Arguments must be integers"); - System.exit(1); - } - long result; - try { - result = estimateLibrarySize(readPairs, uniqueReadPairs); - } - catch (NullPointerException e) { - System.err.println("Library complexity undefined when total = " + readPairs + " and unique = " + uniqueReadPairs); - return; - } - long result2 = readPairs*readPairs/(2*(readPairs-uniqueReadPairs)); - System.out.println("Library complexity (new): " + NumberFormat.getInstance().format(result)); - System.out.println("Library complexity (old): " + NumberFormat.getInstance().format(result2)); - + public static void main(String[] args) { + if (args.length != 1) { + System.out.println("Usage: java LibraryComplexity "); + System.exit(0); + } + BufferedReader reader = null; + long readPairs = 0; + long uniqueReadPairs = 0; + long opticalDups = 0; + try { + File f = new File(args[0] + "/opt_dups.txt"); + if (f.exists()) { + reader = new BufferedReader(new FileReader(f)); + while (reader.readLine() != null) opticalDups++; + reader.close(); + } + + f = new File(args[0] + "/merged_nodups.txt"); + if (f.exists()) { + reader = new BufferedReader(new FileReader(f)); + while (reader.readLine() != null) uniqueReadPairs++; + reader.close(); + } + f = new File(args[0] + "/dups.txt"); + if (f.exists()) { + reader = new BufferedReader(new FileReader(args[0] + "/dups.txt")); + while (reader.readLine() != null) readPairs++; + reader.close(); + readPairs += uniqueReadPairs; + } + } catch (IOException error) { + System.err.println("Problem counting lines in merged_nodups and dups"); + System.exit(1); + } + long result; + try { + result = estimateLibrarySize(readPairs, uniqueReadPairs); } + catch (NullPointerException e) { + System.err.println("Library complexity undefined when total = " + readPairs + " and unique = " + uniqueReadPairs); + return; + } + long result2 = readPairs*readPairs/(2*(readPairs-uniqueReadPairs)); + + System.out.println("Total reads after duplication removal: " + NumberFormat.getInstance().format(uniqueReadPairs)); + System.out.println("Duplicate reads: " + NumberFormat.getInstance().format(readPairs - uniqueReadPairs)); + System.out.println("Optical duplicates: " + NumberFormat.getInstance().format(opticalDups)); + System.out.println("Library complexity (new): " + NumberFormat.getInstance().format(result)); + System.out.println("Library complexity (old): " + NumberFormat.getInstance().format(result2)); + + } -/** - * Estimates the size of a library based on the number of paired end molecules - * observed and the number of unique pairs observed. - *
- * Based on the Lander-Waterman equation that states:
- * C/X = 1 - exp( -N/X )
- * where
- * X = number of distinct molecules in library
- * N = number of read pairs
- * C = number of distinct fragments observed in read pairs
- */ - public static Long estimateLibrarySize(final long readPairs, - final long uniqueReadPairs) { - final long readPairDuplicates = readPairs - uniqueReadPairs; - - if (readPairs > 0 && readPairDuplicates > 0) { + /** + * Estimates the size of a library based on the number of paired end molecules + * observed and the number of unique pairs observed. + *
+ * Based on the Lander-Waterman equation that states:
+ * C/X = 1 - exp( -N/X )
+ * where
+ * X = number of distinct molecules in library
+ * N = number of read pairs
+ * C = number of distinct fragments observed in read pairs
+ */ + public static Long estimateLibrarySize(final long readPairs, + final long uniqueReadPairs) { + final long readPairDuplicates = readPairs - uniqueReadPairs; + + if (readPairs > 0 && readPairDuplicates > 0) { long n = readPairs; long c = uniqueReadPairs; double m = 1.0, M = 100.0; if (c >= n || f(m*c, c, n) < 0) { - throw new IllegalStateException("Invalid values for pairs and unique pairs: " + n + ", " + c); + throw new IllegalStateException("Invalid values for pairs and unique pairs: " + n + ", " + c); } - + while( f(M*c, c, n) >= 0 ) { - m = M; - M *= 10.0; + m = M; + M *= 10.0; } - + double r = (m+M)/2.0; double u = f( r * c, c, n ); int i = 0; - + while (u != 0 && i < 1000) { - if (u > 0) m = r; - else M = r; - r = (m+M)/2.0; - u = f( r * c, c, n ); - i++; + if (u > 0) m = r; + else M = r; + r = (m+M)/2.0; + u = f( r * c, c, n ); + i++; } if (i == 1000) { - System.err.println("Iterated 1000 times, returning estimate"); + System.err.println("Iterated 1000 times, returning estimate"); } return (long) (c * (m+M)/2.0); - } - else { - return null; - } } - - /** Method that is used in the computation of estimated library size. */ - private static double f(double x, double c, double n) { - return c/x - 1 + Math.exp(-n/x); + else { + return null; } - + } + + /** Method that is used in the computation of estimated library size. */ + private static double f(double x, double c, double n) { + return c/x - 1 + Math.exp(-n/x); + } } diff --git a/align_stable.sh b/align_stable.sh index 34e5247..1d32500 100755 --- a/align_stable.sh +++ b/align_stable.sh @@ -145,6 +145,17 @@ else fi fi +if [ ! -z $key ] + then + if ! grep -q -m 1 $key /broad/aidenlab/neva/neva_scripts/hicInternalMenu.properties + then + echo "Cannot find key $key in hicInternalMenu.properties" + echo "Please use an existing key or omit the key option" + exit 1 + fi +fi + + ## Create output directory if [ -d "$outputdir" ]; then diff --git a/dups.awk b/dups.awk index 4818474..3afe2b5 100644 --- a/dups.awk +++ b/dups.awk @@ -30,7 +30,7 @@ BEGIN { # the variable "name" can be set via the -v flag dupname=name"dups.txt"; nodupname=name"merged_nodups.txt"; - optname=name"opt_dups.txt"; #added + optname=name"optdups.txt"; } # strand, chromosome, fragment match previous; first position (sorted) within wobble1 diff --git a/launch.sh b/launch.sh index f1adad8..054424d 100644 --- a/launch.sh +++ b/launch.sh @@ -193,9 +193,9 @@ if [ -z "$earlyexit" ] echo -e "echo Splits done, launching other jobs." >> tmp echo -e "bkill -J ${groupname}_clean2" >> tmp echo "bsub -o $topDir/lsf.out -q $queue -g ${groupname}_clean -w \"done(${groupname}_kill)\" \"echo /broad/aidenlab/neva/neva_scripts/relaunch_dups.sh $flags > $topDir/relaunchme.sh\" " >> tmp - echo "bsub -o $topDir/lsf.out -q $queue -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}stats \"source /broad/software/scripts/useuse; reuse Java-1.6; reuse Matlab; awk -v thresh=1 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_0.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/stats_dups.txt $outputdir/dups.txt; export LC_ALL=en_US.UTF-8; cat $splitdir/*.res.txt | awk -f /broad/aidenlab/neva/neva_scripts/stats_sub.awk >> $outputdir/inter.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter.txt $outputdir/merged_0.txt; cat $splitdir/*_abnorm.sam > $outputdir/abnormal.sam; awk -f /broad/aidenlab/neva/neva_scripts/abnormal.awk $outputdir/abnormal.sam > $outputdir/abnormal.txt\" " >> tmp - echo -e "bsub -o $topDir/lsf.out -q $queue -R \"rusage[mem=4]\" -g $groupname -w \"done(${groupname}stats)\" -J ${groupname}hic \" source /broad/software/scripts/useuse; reuse Java-1.6; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_0.txt > $outputdir/merged_nodups_small.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small.txt $outputdir/merged_nodups.bin $genomeID ; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter.txt $outputdir/merged_nodups.bin $outputdir/inter.hic $genomeID ; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter.hic; rm $outputdir/merged_0.txt; rm $outputdir/merged_nodups_small.txt; rm $outputdir/merged_nodups.bin ; if [[ $topDir == *MiSeq* ]]; then mkdir $newdir; ln -s $outputdir/inter*.hic $newdir/. ; ln -s $outputdir/inter*.jpg $newdir/.; echo leaf0${newleaf} = ${key}, ${hicdir}, http://iwww.broadinstitute.org/igvdata/hic/files/${hicdir}/inter.hic, 50 >> /broad/aidenlab/hic_files/hicInternalMenu.properties; fi \" " >> tmp - echo -e "bsub -o $topDir/lsf.out -q $queue -R \"rusage[mem=4]\" -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}hic30 \"source /broad/software/scripts/useuse; reuse Java-1.6; reuse Matlab; awk -v thresh=30 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_30.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter_30.txt $outputdir/merged_30.txt; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_30.txt > $outputdir/merged_nodups_small_30.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small_30.txt $outputdir/merged_nodups_30.bin $genomeID; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter_30.txt $outputdir/merged_nodups_30.bin $outputdir/inter_30.hic $genomeID; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter_30.hic; rm $outputdir/merged_nodups_small_30.txt; rm $outputdir/merged_nodups_30.bin; rm $outputdir/merged_30.txt; \"" >> tmp + echo "bsub -o $topDir/lsf.out -q $queue -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}stats \"source /broad/software/scripts/useuse; reuse Java-1.6; awk -v thresh=1 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_0.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/stats_dups.txt $outputdir/dups.txt; export LC_ALL=en_US.UTF-8; cat $splitdir/*.res.txt | awk -f /broad/aidenlab/neva/neva_scripts/stats_sub.awk >> $outputdir/inter.txt; java -cp /broad/aidenlab/neva/neva_scripts/ LibraryComplexity $outputdir >> $outputdir/inter.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter.txt $outputdir/merged_0.txt; cat $splitdir/*_abnorm.sam > $outputdir/abnormal.sam; awk -f /broad/aidenlab/neva/neva_scripts/abnormal.awk $outputdir/abnormal.sam > $outputdir/abnormal.txt\" " >> tmp + echo -e "bsub -o $topDir/lsf.out -q $queue -R \"rusage[mem=4]\" -g $groupname -w \"done(${groupname}stats)\" -J ${groupname}hic \" source /broad/software/scripts/useuse; reuse Java-1.6; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_0.txt > $outputdir/merged_nodups_small.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small.txt $outputdir/merged_nodups.bin $genomeID ; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter.txt -g $outputdir/inter_hists.m $outputdir/merged_nodups.bin $outputdir/inter.hic $genomeID ; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter.hic; rm $outputdir/merged_0.txt; rm $outputdir/merged_nodups_small.txt; rm $outputdir/merged_nodups.bin ; if [[ $topDir == *MiSeq* ]]; then mkdir $newdir; ln -s $outputdir/inter*.hic $newdir/. ; echo leaf0${newleaf} = ${key}, ${hicdir}, http://iwww.broadinstitute.org/igvdata/hic/files/${hicdir}/inter.hic, 50 >> /broad/aidenlab/hic_files/hicInternalMenu.properties; fi \" " >> tmp + echo -e "bsub -o $topDir/lsf.out -q $queue -R \"rusage[mem=4]\" -g $groupname -w \"done(${groupname}_split)\" -J ${groupname}hic30 \"source /broad/software/scripts/useuse; reuse Java-1.6; awk -v thresh=30 -f /broad/aidenlab/neva/neva_scripts/mapq.awk $outputdir/merged_nodups.txt > $outputdir/merged_30.txt; /broad/aidenlab/neva/neva_scripts/statistics.pl -s $site_file -d $outputdir -l $ligation -o $outputdir/inter_30.txt $outputdir/merged_30.txt; cut -d' ' -f1,2,3,4,5,6,7,8 $outputdir/merged_30.txt > $outputdir/merged_nodups_small_30.txt; /broad/aidenlab/HiCTools/hictools pairsToBin $outputdir/merged_nodups_small_30.txt $outputdir/merged_nodups_30.bin $genomeID; /broad/aidenlab/HiCTools/hictools pre -f $site_file -s $outputdir/inter_30.txt -g $outputdir/inter_30_hists.m $outputdir/merged_nodups_30.bin $outputdir/inter_30.hic $genomeID; /broad/aidenlab/HiCTools/hictools addNorm $outputdir/inter_30.hic; rm $outputdir/merged_nodups_small_30.txt; rm $outputdir/merged_nodups_30.bin; rm $outputdir/merged_30.txt; \"" >> tmp bsub < tmp rm tmp diff --git a/mapq.awk b/mapq.awk index a9c3c4c..838a5e7 100644 --- a/mapq.awk +++ b/mapq.awk @@ -1 +1 @@ -$9>=thresh && $12>=thresh{print} +$9>=thresh && $12>=thresh && !($2==$6 && $4==$8){print} diff --git a/split_rmdups.awk b/split_rmdups.awk index eb7346d..d4a702a 100644 --- a/split_rmdups.awk +++ b/split_rmdups.awk @@ -34,11 +34,11 @@ END { else { waitstring=sprintf("%s || exit(%s)", waitstring, sname); } - sysstring = sprintf("bsub -o %s -q %s -g %s -J %s_split -w \"done(%s_msplit*)\" \"cat %s/*_msplit*_dups.txt > %s/dups.txt;cat %s/*_msplit*_merged_nodups.txt > %s/merged_nodups.txt; \" ", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir); + sysstring = sprintf("bsub -o %s -q %s -g %s -J %s_split -w \"done(%s_msplit*)\" \"cat %s/*_msplit*_optdups.txt > %s/opt_dups.txt; cat %s/*_msplit*_dups.txt > %s/dups.txt;cat %s/*_msplit*_merged_nodups.txt > %s/merged_nodups.txt; \" ", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir, dir, dir); system(sysstring); sysstring = sprintf("bsub -o %s -q %s -g %s_skill -w \"%s\" \"bkill -g %s 0; bkill -g %skill 0\" ",outfile, queue, groupname, waitstring, groupname, groupname); system(sysstring); - sysstring = sprintf("bsub -o %s -q %s -g %s -w \"done(%s_split)\" \"bkill -g %s_skill 0 ; rm %s/*_msplit*_dups.txt; rm %s/*_msplit*_merged_nodups.txt; rm %s/split* \"", outfile, queue, groupname, groupname, groupname, dir, dir, dir); + sysstring = sprintf("bsub -o %s -q %s -g %s -w \"done(%s_split)\" \"bkill -g %s_skill 0 ; rm %s/*_msplit*_optdups.txt; rm %s/*_msplit*_dups.txt; rm %s/*_msplit*_merged_nodups.txt; rm %s/split* \"", outfile, queue, groupname, groupname, groupname, dir, dir, dir, dir); system(sysstring); diff --git a/statistics.pl b/statistics.pl index d9ee0a5..39612fd 100755 --- a/statistics.pl +++ b/statistics.pl @@ -101,6 +101,7 @@ my $true_dangling_intra_small = 0; my $true_dangling_intra_large = 0; my $true_dangling_inter = 0; +my $total_current = 0; # logspace bins my @bins = (10,12,15,19,23,28,35,43,53,66,81,100,123,152,187,231,285,351,433,534,658,811,1000,1233,1520,1874,2310,2848,3511,4329,5337,6579,8111,10000,12328,15199,18738,23101,28480,35112,43288,53367,65793,81113,100000,123285,151991,187382,231013,284804,351119,432876,533670,657933,811131,1000000,1232847,1519911,1873817,2310130,2848036,3511192,4328761,5336699,6579332,8111308,10000000,12328467,15199111,18738174,23101297,28480359,35111917,43287613,53366992,65793322,81113083,100000000,123284674,151991108,187381742,231012970,284803587,351119173,432876128,533669923,657933225,811130831,1000000000,1232846739,1519911083,1873817423,2310129700,2848035868,3511191734,4328761281,5336699231,6579332247,8111308308,10000000000); @@ -114,52 +115,10 @@ } close(FILE); -my($fname, $fdir) = fileparse($infile); -my $tot_dups; -my $opt_dups; - -if ($fname eq "merged_0.txt") { - my $line_counts; - my @tmp; - my $testname = ${fdir} . "dups.txt"; - if (-e $testname) { - $line_counts = `wc -l ${fdir}dups.txt`; - @tmp = split(' ', $line_counts); - $tot_dups = $tmp[0]; - } - else { - $tot_dups = 0; - } - $testname = ${fdir} . "opt_dups.txt"; - if (-e $testname) { - $line_counts = `wc -l ${fdir}opt_dups.txt`; - @tmp = split(' ', $line_counts); - $opt_dups = $tmp[0]; - } - else { - $opt_dups = 0; - } - $testname = ${fdir} . "merged_nodups.txt"; - if (-e $testname) { - $line_counts = `wc -l ${fdir}merged_nodups.txt`; - @tmp = split(' ', $line_counts); - $total = $tmp[0]; - } - else { - $total = 0; - } -} -else { - $tot_dups = 0; - $opt_dups = 0; -} - -print "tot" . $total . " " . $tot_dups . " " . $opt_dups . "\n"; - # read in infile and calculate statistics open FILE, $infile or die $!; while () { - #$total++; + $total_current++; my @record = split; my $num_records = scalar(@record); @@ -300,25 +259,17 @@ } } close(FILE); -open FILE, " >> $stats_file", or die $!; -print FILE "Total reads after duplication removal: " . commify($total) . "\n"; -print FILE "Duplicate reads: " . commify($tot_dups) . "\n"; -my $tot_reads = $total + $tot_dups + $opt_dups; -my $result = `java -cp /broad/aidenlab/neva/neva_scripts LibraryComplexity $tot_reads $total`; -print FILE "$result"; -if ($total==0) { - $total=1; +my($statsfilename, $directories, $suffix)= fileparse($stats_file, qr/\.[^.]*/); +my $histsfile = $directories . $statsfilename . "_hists.m"; +my $statssupfile = $directories . $statsfilename . "_supp.txt"; + +open FILE, " >> $stats_file", or die $!; +print FILE "Total reads in current file: " . commify($total_current) . "\n"; +if ($total_current==0) { + $total_current=1; } -printf FILE "Ligations: %s (%0.2f\%)\n", commify($ligation), $ligation*100/$total; -printf FILE "Dangling: %s (%0.2f\%)\n", commify($dangling), $dangling*100/$total; -printf FILE " Very small: %s (%0.2f\%)\n", commify($very_small_dangling), $very_small_dangling*100/$total; -printf FILE " Small: %s (%0.2f\%)\n", commify($small_dangling),$small_dangling*100/$total; -printf FILE " Large: %s (%0.2f\%)\n", commify($large_dangling),$large_dangling*100/$total; -printf FILE " Inter: %s (%0.2f\%)\n", commify($inter_dangling),$inter_dangling*100/$total; -printf FILE " True Small: %s (%0.2f\%)\n", commify($true_dangling_intra_small),$true_dangling_intra_small*100/$total; -printf FILE " True Large: %s (%0.2f\%)\n", commify($true_dangling_intra_large),$true_dangling_intra_large*100/$total; -printf FILE " True Inter: %s (%0.2f\%)\n", commify($true_dangling_inter),$true_dangling_inter*100/$total; +printf FILE "Ligations: %s (%0.2f\%)\n", commify($ligation), $ligation*100/$total_current; if ($five_prime_end + $three_prime_end > 0) { printf FILE "Five prime: %s (%0.2f\%)\n", commify($five_prime_end), $five_prime_end*100/($five_prime_end + $three_prime_end); printf FILE "Three prime: %s (%0.2f\%)\n", commify($three_prime_end), $three_prime_end*100/($five_prime_end + $three_prime_end); @@ -327,11 +278,11 @@ printf FILE "Five prime: $five_prime_end (%0.2f\%)\n", 0; printf FILE "Three prime: $three_prime_end (%0.2f\%)\n", 0; } -printf FILE "Inter: %s (%0.2f\%)\n", commify($inter), $inter*100/$total; -printf FILE "Intra: %s (%0.2f\%)\n", commify($intra), $intra*100/$total; -printf FILE "Small: %s (%0.2f\%)\n", commify($small), $small*100/$total; -printf FILE "Large: %s (%0.2f\%)\n", commify($large), $large*100/$total; -printf FILE "Very small: %s (%0.2f\%)\n", commify($very_small), $very_small*100/$total; +printf FILE "Inter: %s (%0.2f\%)\n", commify($inter), $inter*100/$total_current; +printf FILE "Intra: %s (%0.2f\%)\n", commify($intra), $intra*100/$total_current; +printf FILE "Small: %s (%0.2f\%)\n", commify($small), $small*100/$total_current; +printf FILE "Large: %s (%0.2f\%)\n", commify($large), $large*100/$total_current; +printf FILE "Very small: %s (%0.2f\%)\n", commify($very_small), $very_small*100/$total_current; if ($large > 0) { printf FILE "Inner: %s (%0.2f\%) \n", commify($inner), $inner*100/$large; printf FILE "Outer: %s (%0.2f\%) \n", commify($outer), $outer*100/$large; @@ -339,44 +290,52 @@ printf FILE "Right: %s (%0.2f\%) \n", commify($right), $right*100/$large; } close FILE; -my($statsfilename, $directories, $suffix)= fileparse($stats_file, qr/\.[^.]*/); -my $histsfile = $directories . $statsfilename . "_hists.m"; + +open FILE, " > $statssupfile", or die $!; +printf FILE "Dangling: %s (%0.2f\%)\n", commify($dangling), $dangling*100/$total_current; +printf FILE " Very small: %s (%0.2f\%)\n", commify($very_small_dangling), $very_small_dangling*100/$total_current; +printf FILE " Small: %s (%0.2f\%)\n", commify($small_dangling),$small_dangling*100/$total_current; +printf FILE " Large: %s (%0.2f\%)\n", commify($large_dangling),$large_dangling*100/$total_current; +printf FILE " Inter: %s (%0.2f\%)\n", commify($inter_dangling),$inter_dangling*100/$total_current; +printf FILE " True Small: %s (%0.2f\%)\n", commify($true_dangling_intra_small),$true_dangling_intra_small*100/$total_current; +printf FILE " True Large: %s (%0.2f\%)\n", commify($true_dangling_intra_large),$true_dangling_intra_large*100/$total_current; +printf FILE " True Inter: %s (%0.2f\%)\n", commify($true_dangling_inter),$true_dangling_inter*100/$total_current; +close FILE; + open FILE, "> $histsfile", or die $!; -print FILE "A = ["; +print FILE "A = [\n"; for (my $i=1; $i <= 2000; $i++) { my $tmp = $hindIII{$i} || 0; print FILE "$tmp "; } -print FILE "];\n"; -print FILE "B = ["; +print FILE "\n];\n"; +print FILE "B = [\n"; for (my $i=0; $i <= 200; $i++) { my $tmp = $mapQ{$i} || 0; my $tmp2 = $mapQ_intra{$i} || 0; my $tmp3 = $mapQ_inter{$i} || 0; - print FILE "$tmp,$tmp2,$tmp3; "; + print FILE "$tmp $tmp2 $tmp3\n "; } -print FILE "];\n"; -print FILE "D = ["; +print FILE "\n];\n"; +print FILE "D = [\n"; for (my $i=0; $i < scalar(@bins); $i++) { my $tmp = $innerM{$i} || 0; - print FILE "$tmp,"; + print FILE "$tmp "; $tmp = $outerM{$i} || 0; - print FILE "$tmp,"; + print FILE "$tmp "; $tmp = $rightM{$i} || 0; - print FILE "$tmp,"; + print FILE "$tmp "; $tmp = $leftM{$i} || 0; - print FILE "$tmp;\n"; + print FILE "$tmp\n"; } -print FILE "];"; -print FILE "x = ["; +print FILE "\n];"; +print FILE "x = [\n"; for (my $i=0; $i < scalar(@bins); $i++) { - print FILE "$bins[$i],"; + print FILE "$bins[$i] "; } -print FILE "];"; -$str = "\'$outdir\'"; - -system('matlab -nodisplay -r "addpath(\'/broad/aidenlab/neva/neva_scripts/\'); addpath( ' . ${str} . ');' . ${statsfilename} . '_hists; plotstats (A,B,D,x, \'' . ${directories} . ${statsfilename} . '\'); quit;" > /dev/null'); +print FILE "\n];\n"; +close FILE; # Find distance to nearest HindIII restriction site sub distHindIII { @@ -394,16 +353,16 @@ sub distHindIII { } my $dist2 = abs($_[2] - $chromosomes{$_[1]}[$index]); - if ($index == 0) { - if (!($_[2] >= 0 && $_[2] <= $chromosomes{$_[1]}[$index])) { - print(STDERR "Problem not " . $_[2] . " >= 0 && <= " . $chromosomes{$_[1]}[$index] . " " . $_[1] . " index = " . $index . " chr = " . $_[1] . "\n"); - } - } - else { - if (!($_[2] >= $chromosomes{$_[1]}[$index-1] && $_[2] <= $chromosomes{$_[1]}[$index])) { - print(STDERR "Problem not " . $_[2] . " >= " . $chromosomes{$_[1]}[$index-1] . " && <= " . $chromosomes{$_[1]}[$index] . " index = " . $index . " chr = " . $_[1] . "\n"); - } - } +# if ($index == 0) { +# if (!($_[2] >= 0 && $_[2] <= $chromosomes{$_[1]}[$index])) { +# print(STDERR "Problem not " . $_[2] . " >= 0 && <= " . $chromosomes{$_[1]}[$index] . " " . $_[1] . " index = " . $index . " chr = " . $_[1] . "\n"); +# } +# } +# else { +# if (!($_[2] >= $chromosomes{$_[1]}[$index-1] && $_[2] <= $chromosomes{$_[1]}[$index])) { +# print(STDERR "Problem not " . $_[2] . " >= " . $chromosomes{$_[1]}[$index-1] . " && <= " . $chromosomes{$_[1]}[$index] . " index = " . $index . " chr = " . $_[1] . "\n"); +# } +# } # get minimum value -- if (dist1 <= dist2), it's dist1, else dist2 my $retval = $dist1 <= $dist2 ? $dist1 : $dist2; # get which end of the fragment this is, 3' or 5' (depends on strand)