From ea53e2bcdb2f8c064ac61a9176f054af82a2c340 Mon Sep 17 00:00:00 2001 From: Paul Li Date: Thu, 9 Nov 2017 19:19:26 -0700 Subject: [PATCH] change default relative abundnace to ROLLUP_DOC --- src/filterGottcha.pl | 21 +++++++++++---------- src/gottcha.pl | 8 ++++---- src/profileGottcha.pl | 26 +++++++++++++++++++------- 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/src/filterGottcha.pl b/src/filterGottcha.pl index 563fc9e..4380309 100755 --- a/src/filterGottcha.pl +++ b/src/filterGottcha.pl @@ -193,7 +193,7 @@ sub exportAbundances { my $wantedRank = $taxAbbrRevLookup{ $_[0]->{TAXLEVEL} }; #my @headers = @{ $_[0]->{HEADERS} }; - my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "NORM_COV" ); + my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "UREF_DOC" ); # Headers print $OUTFILE $_[0]->{TAXLEVEL}."\t".join("\t", @headers)."\n"; @@ -235,7 +235,7 @@ sub displayRollups { # "P" => "PHYLUM", # ); - my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "NORM_COV"); + my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "NORM_COV", "ROLLUP_DOC"); foreach my $taxAbbr (@taxAbbrs) { print "\n\n".$taxAbbrLookup{$taxAbbr}."\t".join("\t", @headers)."\n"; @@ -250,7 +250,7 @@ sub displayRollups { # ARGS: \%inputOptions, \%ancestry sub outputLineage { - my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "NORM_COV"); + my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "NORM_COV", "ROLLUP_DOC"); #my %taxLevel = ( "STRAIN" => "90", # "SPECIES" => "80", @@ -328,7 +328,10 @@ sub filterTable { # $test_state = 1; # ; #} - + + # update rollup DOC at strain level + $_[1]->{$strain}->{ROLLUP_DOC} = $_[1]->{$strain}->{TOTAL_BP_MAPPED}/$_[1]->{$strain}->{UNIQUE_DB_LENGTH}; + # Filter Stage1: Coverage Threshold delete $_[1]->{$strain} if ( @@ -376,6 +379,7 @@ sub filterTable { $ancestry{SS}->{$strain}->{HIT_COUNT} += $_[1]->{$strain}->{HIT_COUNT}; $ancestry{SS}->{$strain}->{HIT_COUNT_PLASMID} += $_[1]->{$strain}->{HIT_COUNT_PLASMID}; $ancestry{SS}->{$strain}->{READ_COUNT} += $_[1]->{$strain}->{READ_COUNT}; + $ancestry{SS}->{$strain}->{ROLLUP_DOC} += $_[1]->{$strain}->{ROLLUP_DOC}; } my $sum = 0; foreach my $strain (@strains) { @@ -387,7 +391,6 @@ sub filterTable { foreach my $strain (@strains) { my $norm_cov = $ancestry{SS}->{$strain}->{LINEAR_DOC} / $sum; $ancestry{SS}->{$strain}->{NORM_COV} = $norm_cov; - $max = $norm_cov unless ($max > $norm_cov); } @@ -422,7 +425,7 @@ sub filterTable { $ancestry{S}->{$species}->{HIT_COUNT} += $_[1]->{$org}->{HIT_COUNT}; $ancestry{S}->{$species}->{HIT_COUNT_PLASMID} += $_[1]->{$org}->{HIT_COUNT_PLASMID}; $ancestry{S}->{$species}->{READ_COUNT} += $_[1]->{$org}->{READ_COUNT}; - + $ancestry{S}->{$species}->{ROLLUP_DOC} += $_[1]->{$org}->{ROLLUP_DOC}; } #ORG @@ -455,8 +458,6 @@ sub filterTable { ########################## # HIGHER ROLL-UP ########################## - - { my @taxAbbr = ("S", "G", "F", "O", "C", "P"); foreach my $taxAbbrIdx (1..5) { @@ -486,6 +487,7 @@ sub filterTable { $ancestry{$currTaxAbbr}->{$currRankName}->{HIT_COUNT} += $ancestry{$prevTaxAbbr}->{$prevRankName}->{HIT_COUNT}; $ancestry{$currTaxAbbr}->{$currRankName}->{HIT_COUNT_PLASMID} += $ancestry{$prevTaxAbbr}->{$prevRankName}->{HIT_COUNT_PLASMID}; $ancestry{$currTaxAbbr}->{$currRankName}->{READ_COUNT} += $ancestry{$prevTaxAbbr}->{$prevRankName}->{READ_COUNT}; + $ancestry{$currTaxAbbr}->{$currRankName}->{ROLLUP_DOC} += $ancestry{$prevTaxAbbr}->{$prevRankName}->{ROLLUP_DOC}; } else { die "*FATAL*: Could not place $prevRankName [$prevTaxAbbr] into parent [$currTaxAbbr]\n"; @@ -747,5 +749,4 @@ sub usage { HELP print "$HELP\n"; exit 1; -} - +} \ No newline at end of file diff --git a/src/gottcha.pl b/src/gottcha.pl index 5a7379b..8ab1dca 100644 --- a/src/gottcha.pl +++ b/src/gottcha.pl @@ -36,7 +36,7 @@ use strict; # environment setup -my $ver = "1.0b"; +my $ver = "1.0c"; $ENV{PATH} = "$RealBin:$RealBin/../ext/bin:$ENV{PATH}"; $ENV{PERL5LIB} = "$RealBin/../ext/lib/perl5:$ENV{PERL5LIB}"; @@ -87,7 +87,7 @@ my $PREFIX = defined $opt{prefix} ? $opt{prefix} : $fn; my $OUTDIR = defined $opt{outdir} ? $opt{outdir} : "."; my $TMPDIR = "${PREFIX}_temp"; -my $RELABU = defined $opt{relAbu} ? $opt{relAbu} : "LINEAR_DOC"; +my $RELABU = defined $opt{relAbu} ? $opt{relAbu} : "ROLLUP_DOC"; my $MODE = defined $opt{mode} ? $opt{mode} : "summary"; my $BWAMETHOD = "mem"; my $BWA_OPT = defined $opt{bwaOpt} ? $opt{bwaOpt} : "-k 30 -T 0 -B 100 -O 100 -E 100"; @@ -542,9 +542,9 @@ sub usage { [default: ] --relAbu|r The field will be used to calculate relative abundance. You can specify one of the following - fields: "LINEAR_LENGTH", "TOTAL_BP_MAPPED", + fields: "ROLLUP_DOC", "LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "LINEAR_DOC". - [default: LINEAR_DOC] + [default: ROLLUP_DOC] --mode|m You can specify one of the output mode: "summary" : this mode will report a summary of profiling result to *.gottcha.tsv file. diff --git a/src/profileGottcha.pl b/src/profileGottcha.pl index 049734c..64203f2 100755 --- a/src/profileGottcha.pl +++ b/src/profileGottcha.pl @@ -991,7 +991,7 @@ sub computeData { ."UNIQUE_DB_LENGTH\t". "FULL_REFDB_LENGTH\t". "LINEAR_COV\t" ."HIT_COUNT\t". "HIT_COUNT_PLASMID\t". "READ_COUNT\t" ."FULL_HIT_COUNT\t". "TOTAL_BP_MAPPED\t". "LINEAR_DOC\t" - ."UREF_DOC\t". "UREF_CMAX\t". "FRAC_HITS_POSSIBLE\t" + ."UREF_DOC\t". "ROLLUP_DOC\t". "UREF_CMAX\t". "FRAC_HITS_POSSIBLE\t" ."FRAC_BASES_POSSIBLE\t". "MEAN_HIT_LENGTH\t". "MEAN_LINEAR_HIT_LENGTH\t" #------------------------------------------------------------------------------------------ ."best_SUBRANK\t" @@ -1041,6 +1041,7 @@ sub computeData { my $fullRefDBlength = $hitTree->{$_[2]}->{$rankName}->{REFSIZE}; # total bases in the COMPLETE reference (not just the unique bases) my $linearDOC = sprintf("%.15f", $totalBpMapped / $linearLength); my $uRefDOC = sprintf("%.15f", $totalBpMapped / $uniqueDBlength); + my $rollupDOC = $hitTree->{$_[2]}->{$rankName}->{ROLLUPDOC}; # total rollup uRefDOC # Max Coverage of RefDB Possible in Sample = Cmax = L0/l0 = (Total Possible Bases) / (unique refDB Bases) my $uRefCmax = sprintf("%.15f", $inputBases/$uniqueDBlength); # Fraction of Total Hits Possible = uN = Ni/N0 = (HIT_COUNT, genome i) / (Total Potential Hits) @@ -1097,6 +1098,7 @@ sub computeData { .$totalBpMapped."\t" .$linearDOC."\t" .$uRefDOC."\t" + .$rollupDOC."\t" .$uRefCmax."\t" .$fracHitsPossible."\t" .$fracBasesPossible."\t" @@ -1282,8 +1284,9 @@ sub computeRepliconData { ."UNIQUE_DB_LENGTH\t". "FULL_REFDB_LENGTH\t". "LINEAR_COV\t" ."HIT_COUNT\t". "HIT_COUNT_PLASMID\t". "READ_COUNT\t" ."FULL_HIT_COUNT\t". "TOTAL_BP_MAPPED\t". "LINEAR_DOC\t" - ."UREF_DOC\t". "UREF_CMAX\t". "FRAC_HITS_POSSIBLE\t" - ."FRAC_BASES_POSSIBLE\t". "MEAN_HIT_LENGTH\t". "MEAN_LINEAR_HIT_LENGTH\t" + ."UREF_DOC\t". "ROLLUP_DOC\t". "UREF_CMAX\t" + ."FRAC_HITS_POSSIBLE\t". "FRAC_BASES_POSSIBLE\t". "MEAN_HIT_LENGTH\t" + ."MEAN_LINEAR_HIT_LENGTH\t" #------------------------------------------------------------------------------------------ ."CONTIG_COUNT\t". "CONTIG_MEAN_LEN\t". "CONTIG_STDEV_LEN\t" ."CONTIG_MINLEN\t". "CONTIG_MAXLEN\t". "CONTIG_HISTO(LEN:FREQ)\t" @@ -1323,6 +1326,7 @@ sub computeRepliconData { my $linearDOC = sprintf("%.4f", $totalBpMapped / $_[0]->{N_O_COV_HREF}->{$gi}->{LINLEN}); my $uniqueDOC = sprintf("%.4f", $totalBpMapped / $uniqueDBlength); + my $rollupDOC = sprintf("%.4f", $totalBpMapped / $uniqueDBlength); my $mean_hit_length = sprintf("%.4f", $totalBpMapped / $_[0]->{GI2MERGEDFRAGS_HREF}->{$gi}->{HITCOUNT}); my $mean_linear_hit_length = @@ -1365,6 +1369,7 @@ sub computeRepliconData { .$totalBpMapped."\t" # total BP mapped .$linearDOC."\t" # linear depth-of-coverage .$uniqueDOC."\t" # unique genome depth-of-coverage + .$rollupDOC."\t" # unique genome depth-of-coverage for rollup .$uRefCmax."\t" # max coverage of unique DB given sample input .$fracHitsPossible."\t" # fraction of all possible hits .$fracBasesPossible."\t" # fraction of all possible bases @@ -1574,7 +1579,8 @@ sub genTree { my $readcount = $_[1]->{$gi}->{READCOUNT}; my $fullReadHits = 0; #$_[2]->{$gi}->{FULLREADHITS}; my $totalBPmapped = $_[2]->{$gi}->{TOTALBPMAPPED}; - + my $rollupDOC = sprintf("%.4f", $totalBPmapped/$usize); + $hitTree{GI}->{$gi}->{REFSIZE} = $refsize; $hitTree{GI}->{$gi}->{USIZE} = $usize; $hitTree{GI}->{$gi}->{LINLEN} = $linlen; @@ -1582,6 +1588,7 @@ sub genTree { $hitTree{GI}->{$gi}->{HITCOUNT} = $hitcount; $hitTree{GI}->{$gi}->{HITCOUNT_P} = $hitcountP; $hitTree{GI}->{$gi}->{READCOUNT} = $readcount; + $hitTree{GI}->{$gi}->{ROLLUPDOC} = $rollupDOC; $hitTree{GI}->{$gi}->{FULLREADHITS} = $fullReadHits; $hitTree{GI}->{$gi}->{MAPPED} = $totalBPmapped; $hitTree{GI}->{$gi}->{HISTO} = $giHisto{$gi}; @@ -1628,6 +1635,7 @@ sub genTree { $hitTree{SS}->{$strainName}->{HITCOUNT} += $hitcount; $hitTree{SS}->{$strainName}->{HITCOUNT_P} += $hitcountP; $hitTree{SS}->{$strainName}->{READCOUNT} += $readcount; + $hitTree{SS}->{$strainName}->{ROLLUPDOC} += $rollupDOC; $hitTree{SS}->{$strainName}->{MAPPED} += $totalBPmapped; $hitTree{SS}->{$strainName}->{FULLREADHITS} += $fullReadHits; $hitTree{SS}->{$strainName}->{CHILDREN}->{NAMES}->{$gi} = (); @@ -1639,6 +1647,7 @@ sub genTree { $hitTree{S}->{$speciesName}->{HITCOUNT} += $hitcount; $hitTree{S}->{$speciesName}->{HITCOUNT_P} += $hitcountP; $hitTree{S}->{$speciesName}->{READCOUNT} += $readcount; + $hitTree{S}->{$speciesName}->{ROLLUPDOC} += $rollupDOC; $hitTree{S}->{$speciesName}->{MAPPED} += $totalBPmapped; $hitTree{S}->{$speciesName}->{FULLREADHITS} += $fullReadHits; $hitTree{S}->{$speciesName}->{CHILDREN}->{NAMES}->{$strainName} = (); @@ -1650,6 +1659,7 @@ sub genTree { $hitTree{G}->{$genusName}->{HITCOUNT} += $hitcount; $hitTree{G}->{$genusName}->{HITCOUNT_P} += $hitcountP; $hitTree{G}->{$genusName}->{READCOUNT} += $readcount; + $hitTree{G}->{$genusName}->{ROLLUPDOC} += $rollupDOC; $hitTree{G}->{$genusName}->{MAPPED} += $totalBPmapped; $hitTree{G}->{$genusName}->{FULLREADHITS} += $fullReadHits; $hitTree{G}->{$genusName}->{CHILDREN}->{NAMES}->{$speciesName} = (); @@ -1661,6 +1671,7 @@ sub genTree { $hitTree{F}->{$familyName}->{HITCOUNT} += $hitcount; $hitTree{F}->{$familyName}->{HITCOUNT_P} += $hitcountP; $hitTree{F}->{$familyName}->{READCOUNT} += $readcount; + $hitTree{F}->{$familyName}->{ROLLUPDOC} += $rollupDOC; $hitTree{F}->{$familyName}->{MAPPED} += $totalBPmapped; $hitTree{F}->{$familyName}->{FULLREADHITS} += $fullReadHits; $hitTree{F}->{$familyName}->{CHILDREN}->{NAMES}->{$genusName} = (); @@ -1672,6 +1683,7 @@ sub genTree { $hitTree{O}->{$orderName}->{HITCOUNT} += $hitcount; $hitTree{O}->{$orderName}->{HITCOUNT_P} += $hitcountP; $hitTree{O}->{$orderName}->{READCOUNT} += $readcount; + $hitTree{O}->{$orderName}->{ROLLUPDOC} += $rollupDOC; $hitTree{O}->{$orderName}->{MAPPED} += $totalBPmapped; $hitTree{O}->{$orderName}->{FULLREADHITS} += $fullReadHits; $hitTree{O}->{$orderName}->{CHILDREN}->{NAMES}->{$familyName} = (); @@ -1683,6 +1695,7 @@ sub genTree { $hitTree{C}->{$className}->{HITCOUNT} += $hitcount; $hitTree{C}->{$className}->{HITCOUNT_P} += $hitcountP; $hitTree{C}->{$className}->{READCOUNT} += $readcount; + $hitTree{C}->{$className}->{ROLLUPDOC} += $rollupDOC; $hitTree{C}->{$className}->{MAPPED} += $totalBPmapped; $hitTree{C}->{$className}->{FULLREADHITS} += $fullReadHits; $hitTree{C}->{$className}->{CHILDREN}->{NAMES}->{$orderName} = (); @@ -1694,6 +1707,7 @@ sub genTree { $hitTree{P}->{$phylumName}->{HITCOUNT} += $hitcount; $hitTree{P}->{$phylumName}->{HITCOUNT_P} += $hitcountP; $hitTree{P}->{$phylumName}->{READCOUNT} += $readcount; + $hitTree{P}->{$phylumName}->{ROLLUPDOC} += $rollupDOC; $hitTree{P}->{$phylumName}->{MAPPED} += $totalBPmapped; $hitTree{P}->{$phylumName}->{FULLREADHITS} += $fullReadHits; $hitTree{P}->{$phylumName}->{CHILDREN}->{NAMES}->{$className} = (); @@ -2288,8 +2302,7 @@ sub mergeOverlappingHits { # Record no. of actual hits to the reference GI $gi2mergedFrags{$gi}->{HITCOUNT} = $_[0]->{$gi}->{HITCOUNT}; $gi2mergedFrags{$gi}->{HITCOUNT_P} = $_[0]->{$gi}->{HITCOUNT_P}; - $gi2mergedFrags{$gi}->{READCOUNT} = $_[0]->{$gi}->{READCOUNT}; - + $gi2mergedFrags{$gi}->{READCOUNT} = $_[0]->{$gi}->{READCOUNT}; } #GI return \%gi2mergedFrags; @@ -2611,7 +2624,6 @@ sub parseSAM { $MAPPED_PLASMID_RAWREADS_CNT++ if $uniqueGIlengths->{$gi}->{PLASMID}; $readlist->{$1} = 1; } - } #FIELDS[2] else { print " **Unrecognized mapping entry!** [".$fields[2]."]\n";