Skip to content

Commit

Permalink
change default relative abundnace to ROLLUP_DOC
Browse files Browse the repository at this point in the history
  • Loading branch information
Paul Li committed Nov 10, 2017
1 parent f2bd774 commit ea53e2b
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 21 deletions.
21 changes: 11 additions & 10 deletions src/filterGottcha.pl
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ sub exportAbundances {

my $wantedRank = $taxAbbrRevLookup{ $_[0]->{TAXLEVEL} };
#my @headers = @{ $_[0]->{HEADERS} };
my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "NORM_COV" );
my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "UREF_DOC" );

# Headers
print $OUTFILE $_[0]->{TAXLEVEL}."\t".join("\t", @headers)."\n";
Expand Down Expand Up @@ -235,7 +235,7 @@ sub displayRollups {
# "P" => "PHYLUM",
# );

my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "NORM_COV");
my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "NORM_COV", "ROLLUP_DOC");

foreach my $taxAbbr (@taxAbbrs) {
print "\n\n".$taxAbbrLookup{$taxAbbr}."\t".join("\t", @headers)."\n";
Expand All @@ -250,7 +250,7 @@ sub displayRollups {
# ARGS: \%inputOptions, \%ancestry
sub outputLineage {

my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "NORM_COV");
my @headers = ("LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "HIT_COUNT_PLASMID", "READ_COUNT", "LINEAR_DOC", "NORM_COV", "ROLLUP_DOC");

#my %taxLevel = ( "STRAIN" => "90",
# "SPECIES" => "80",
Expand Down Expand Up @@ -328,7 +328,10 @@ sub filterTable {
# $test_state = 1;
# <STDIN>;
#}


# update rollup DOC at strain level
$_[1]->{$strain}->{ROLLUP_DOC} = $_[1]->{$strain}->{TOTAL_BP_MAPPED}/$_[1]->{$strain}->{UNIQUE_DB_LENGTH};

# Filter Stage1: Coverage Threshold
delete $_[1]->{$strain}
if (
Expand Down Expand Up @@ -376,6 +379,7 @@ sub filterTable {
$ancestry{SS}->{$strain}->{HIT_COUNT} += $_[1]->{$strain}->{HIT_COUNT};
$ancestry{SS}->{$strain}->{HIT_COUNT_PLASMID} += $_[1]->{$strain}->{HIT_COUNT_PLASMID};
$ancestry{SS}->{$strain}->{READ_COUNT} += $_[1]->{$strain}->{READ_COUNT};
$ancestry{SS}->{$strain}->{ROLLUP_DOC} += $_[1]->{$strain}->{ROLLUP_DOC};
}
my $sum = 0;
foreach my $strain (@strains) {
Expand All @@ -387,7 +391,6 @@ sub filterTable {
foreach my $strain (@strains) {
my $norm_cov = $ancestry{SS}->{$strain}->{LINEAR_DOC} / $sum;
$ancestry{SS}->{$strain}->{NORM_COV} = $norm_cov;

$max = $norm_cov unless ($max > $norm_cov);
}

Expand Down Expand Up @@ -422,7 +425,7 @@ sub filterTable {
$ancestry{S}->{$species}->{HIT_COUNT} += $_[1]->{$org}->{HIT_COUNT};
$ancestry{S}->{$species}->{HIT_COUNT_PLASMID} += $_[1]->{$org}->{HIT_COUNT_PLASMID};
$ancestry{S}->{$species}->{READ_COUNT} += $_[1]->{$org}->{READ_COUNT};

$ancestry{S}->{$species}->{ROLLUP_DOC} += $_[1]->{$org}->{ROLLUP_DOC};

} #ORG

Expand Down Expand Up @@ -455,8 +458,6 @@ sub filterTable {
##########################
# HIGHER ROLL-UP
##########################


{
my @taxAbbr = ("S", "G", "F", "O", "C", "P");
foreach my $taxAbbrIdx (1..5) {
Expand Down Expand Up @@ -486,6 +487,7 @@ sub filterTable {
$ancestry{$currTaxAbbr}->{$currRankName}->{HIT_COUNT} += $ancestry{$prevTaxAbbr}->{$prevRankName}->{HIT_COUNT};
$ancestry{$currTaxAbbr}->{$currRankName}->{HIT_COUNT_PLASMID} += $ancestry{$prevTaxAbbr}->{$prevRankName}->{HIT_COUNT_PLASMID};
$ancestry{$currTaxAbbr}->{$currRankName}->{READ_COUNT} += $ancestry{$prevTaxAbbr}->{$prevRankName}->{READ_COUNT};
$ancestry{$currTaxAbbr}->{$currRankName}->{ROLLUP_DOC} += $ancestry{$prevTaxAbbr}->{$prevRankName}->{ROLLUP_DOC};
}
else {
die "*FATAL*: Could not place $prevRankName [$prevTaxAbbr] into parent [$currTaxAbbr]\n";
Expand Down Expand Up @@ -747,5 +749,4 @@ sub usage {
HELP
print "$HELP\n";
exit 1;
}

}
8 changes: 4 additions & 4 deletions src/gottcha.pl
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
use strict;

# environment setup
my $ver = "1.0b";
my $ver = "1.0c";
$ENV{PATH} = "$RealBin:$RealBin/../ext/bin:$ENV{PATH}";
$ENV{PERL5LIB} = "$RealBin/../ext/lib/perl5:$ENV{PERL5LIB}";

Expand Down Expand Up @@ -87,7 +87,7 @@
my $PREFIX = defined $opt{prefix} ? $opt{prefix} : $fn;
my $OUTDIR = defined $opt{outdir} ? $opt{outdir} : ".";
my $TMPDIR = "${PREFIX}_temp";
my $RELABU = defined $opt{relAbu} ? $opt{relAbu} : "LINEAR_DOC";
my $RELABU = defined $opt{relAbu} ? $opt{relAbu} : "ROLLUP_DOC";
my $MODE = defined $opt{mode} ? $opt{mode} : "summary";
my $BWAMETHOD = "mem";
my $BWA_OPT = defined $opt{bwaOpt} ? $opt{bwaOpt} : "-k 30 -T 0 -B 100 -O 100 -E 100";
Expand Down Expand Up @@ -542,9 +542,9 @@ sub usage {
[default: <INPUT_FILENAME_PREFIX>]
--relAbu|r <STRING> The field will be used to calculate relative
abundance. You can specify one of the following
fields: "LINEAR_LENGTH", "TOTAL_BP_MAPPED",
fields: "ROLLUP_DOC", "LINEAR_LENGTH", "TOTAL_BP_MAPPED",
"HIT_COUNT", "LINEAR_DOC".
[default: LINEAR_DOC]
[default: ROLLUP_DOC]
--mode|m <STRING> You can specify one of the output mode:
"summary" : this mode will report a summary of
profiling result to *.gottcha.tsv file.
Expand Down
26 changes: 19 additions & 7 deletions src/profileGottcha.pl
Original file line number Diff line number Diff line change
Expand Up @@ -991,7 +991,7 @@ sub computeData {
."UNIQUE_DB_LENGTH\t". "FULL_REFDB_LENGTH\t". "LINEAR_COV\t"
."HIT_COUNT\t". "HIT_COUNT_PLASMID\t". "READ_COUNT\t"
."FULL_HIT_COUNT\t". "TOTAL_BP_MAPPED\t". "LINEAR_DOC\t"
."UREF_DOC\t". "UREF_CMAX\t". "FRAC_HITS_POSSIBLE\t"
."UREF_DOC\t". "ROLLUP_DOC\t". "UREF_CMAX\t". "FRAC_HITS_POSSIBLE\t"
."FRAC_BASES_POSSIBLE\t". "MEAN_HIT_LENGTH\t". "MEAN_LINEAR_HIT_LENGTH\t"
#------------------------------------------------------------------------------------------
."best_SUBRANK\t"
Expand Down Expand Up @@ -1041,6 +1041,7 @@ sub computeData {
my $fullRefDBlength = $hitTree->{$_[2]}->{$rankName}->{REFSIZE}; # total bases in the COMPLETE reference (not just the unique bases)
my $linearDOC = sprintf("%.15f", $totalBpMapped / $linearLength);
my $uRefDOC = sprintf("%.15f", $totalBpMapped / $uniqueDBlength);
my $rollupDOC = $hitTree->{$_[2]}->{$rankName}->{ROLLUPDOC}; # total rollup uRefDOC
# Max Coverage of RefDB Possible in Sample = Cmax = L0/l0 = (Total Possible Bases) / (unique refDB Bases)
my $uRefCmax = sprintf("%.15f", $inputBases/$uniqueDBlength);
# Fraction of Total Hits Possible = uN = Ni/N0 = (HIT_COUNT, genome i) / (Total Potential Hits)
Expand Down Expand Up @@ -1097,6 +1098,7 @@ sub computeData {
.$totalBpMapped."\t"
.$linearDOC."\t"
.$uRefDOC."\t"
.$rollupDOC."\t"
.$uRefCmax."\t"
.$fracHitsPossible."\t"
.$fracBasesPossible."\t"
Expand Down Expand Up @@ -1282,8 +1284,9 @@ sub computeRepliconData {
."UNIQUE_DB_LENGTH\t". "FULL_REFDB_LENGTH\t". "LINEAR_COV\t"
."HIT_COUNT\t". "HIT_COUNT_PLASMID\t". "READ_COUNT\t"
."FULL_HIT_COUNT\t". "TOTAL_BP_MAPPED\t". "LINEAR_DOC\t"
."UREF_DOC\t". "UREF_CMAX\t". "FRAC_HITS_POSSIBLE\t"
."FRAC_BASES_POSSIBLE\t". "MEAN_HIT_LENGTH\t". "MEAN_LINEAR_HIT_LENGTH\t"
."UREF_DOC\t". "ROLLUP_DOC\t". "UREF_CMAX\t"
."FRAC_HITS_POSSIBLE\t". "FRAC_BASES_POSSIBLE\t". "MEAN_HIT_LENGTH\t"
."MEAN_LINEAR_HIT_LENGTH\t"
#------------------------------------------------------------------------------------------
."CONTIG_COUNT\t". "CONTIG_MEAN_LEN\t". "CONTIG_STDEV_LEN\t"
."CONTIG_MINLEN\t". "CONTIG_MAXLEN\t". "CONTIG_HISTO(LEN:FREQ)\t"
Expand Down Expand Up @@ -1323,6 +1326,7 @@ sub computeRepliconData {

my $linearDOC = sprintf("%.4f", $totalBpMapped / $_[0]->{N_O_COV_HREF}->{$gi}->{LINLEN});
my $uniqueDOC = sprintf("%.4f", $totalBpMapped / $uniqueDBlength);
my $rollupDOC = sprintf("%.4f", $totalBpMapped / $uniqueDBlength);
my $mean_hit_length =
sprintf("%.4f", $totalBpMapped / $_[0]->{GI2MERGEDFRAGS_HREF}->{$gi}->{HITCOUNT});
my $mean_linear_hit_length =
Expand Down Expand Up @@ -1365,6 +1369,7 @@ sub computeRepliconData {
.$totalBpMapped."\t" # total BP mapped
.$linearDOC."\t" # linear depth-of-coverage
.$uniqueDOC."\t" # unique genome depth-of-coverage
.$rollupDOC."\t" # unique genome depth-of-coverage for rollup
.$uRefCmax."\t" # max coverage of unique DB given sample input
.$fracHitsPossible."\t" # fraction of all possible hits
.$fracBasesPossible."\t" # fraction of all possible bases
Expand Down Expand Up @@ -1574,14 +1579,16 @@ sub genTree {
my $readcount = $_[1]->{$gi}->{READCOUNT};
my $fullReadHits = 0; #$_[2]->{$gi}->{FULLREADHITS};
my $totalBPmapped = $_[2]->{$gi}->{TOTALBPMAPPED};

my $rollupDOC = sprintf("%.4f", $totalBPmapped/$usize);

$hitTree{GI}->{$gi}->{REFSIZE} = $refsize;
$hitTree{GI}->{$gi}->{USIZE} = $usize;
$hitTree{GI}->{$gi}->{LINLEN} = $linlen;
$hitTree{GI}->{$gi}->{COV} = $cov;
$hitTree{GI}->{$gi}->{HITCOUNT} = $hitcount;
$hitTree{GI}->{$gi}->{HITCOUNT_P} = $hitcountP;
$hitTree{GI}->{$gi}->{READCOUNT} = $readcount;
$hitTree{GI}->{$gi}->{ROLLUPDOC} = $rollupDOC;
$hitTree{GI}->{$gi}->{FULLREADHITS} = $fullReadHits;
$hitTree{GI}->{$gi}->{MAPPED} = $totalBPmapped;
$hitTree{GI}->{$gi}->{HISTO} = $giHisto{$gi};
Expand Down Expand Up @@ -1628,6 +1635,7 @@ sub genTree {
$hitTree{SS}->{$strainName}->{HITCOUNT} += $hitcount;
$hitTree{SS}->{$strainName}->{HITCOUNT_P} += $hitcountP;
$hitTree{SS}->{$strainName}->{READCOUNT} += $readcount;
$hitTree{SS}->{$strainName}->{ROLLUPDOC} += $rollupDOC;
$hitTree{SS}->{$strainName}->{MAPPED} += $totalBPmapped;
$hitTree{SS}->{$strainName}->{FULLREADHITS} += $fullReadHits;
$hitTree{SS}->{$strainName}->{CHILDREN}->{NAMES}->{$gi} = ();
Expand All @@ -1639,6 +1647,7 @@ sub genTree {
$hitTree{S}->{$speciesName}->{HITCOUNT} += $hitcount;
$hitTree{S}->{$speciesName}->{HITCOUNT_P} += $hitcountP;
$hitTree{S}->{$speciesName}->{READCOUNT} += $readcount;
$hitTree{S}->{$speciesName}->{ROLLUPDOC} += $rollupDOC;
$hitTree{S}->{$speciesName}->{MAPPED} += $totalBPmapped;
$hitTree{S}->{$speciesName}->{FULLREADHITS} += $fullReadHits;
$hitTree{S}->{$speciesName}->{CHILDREN}->{NAMES}->{$strainName} = ();
Expand All @@ -1650,6 +1659,7 @@ sub genTree {
$hitTree{G}->{$genusName}->{HITCOUNT} += $hitcount;
$hitTree{G}->{$genusName}->{HITCOUNT_P} += $hitcountP;
$hitTree{G}->{$genusName}->{READCOUNT} += $readcount;
$hitTree{G}->{$genusName}->{ROLLUPDOC} += $rollupDOC;
$hitTree{G}->{$genusName}->{MAPPED} += $totalBPmapped;
$hitTree{G}->{$genusName}->{FULLREADHITS} += $fullReadHits;
$hitTree{G}->{$genusName}->{CHILDREN}->{NAMES}->{$speciesName} = ();
Expand All @@ -1661,6 +1671,7 @@ sub genTree {
$hitTree{F}->{$familyName}->{HITCOUNT} += $hitcount;
$hitTree{F}->{$familyName}->{HITCOUNT_P} += $hitcountP;
$hitTree{F}->{$familyName}->{READCOUNT} += $readcount;
$hitTree{F}->{$familyName}->{ROLLUPDOC} += $rollupDOC;
$hitTree{F}->{$familyName}->{MAPPED} += $totalBPmapped;
$hitTree{F}->{$familyName}->{FULLREADHITS} += $fullReadHits;
$hitTree{F}->{$familyName}->{CHILDREN}->{NAMES}->{$genusName} = ();
Expand All @@ -1672,6 +1683,7 @@ sub genTree {
$hitTree{O}->{$orderName}->{HITCOUNT} += $hitcount;
$hitTree{O}->{$orderName}->{HITCOUNT_P} += $hitcountP;
$hitTree{O}->{$orderName}->{READCOUNT} += $readcount;
$hitTree{O}->{$orderName}->{ROLLUPDOC} += $rollupDOC;
$hitTree{O}->{$orderName}->{MAPPED} += $totalBPmapped;
$hitTree{O}->{$orderName}->{FULLREADHITS} += $fullReadHits;
$hitTree{O}->{$orderName}->{CHILDREN}->{NAMES}->{$familyName} = ();
Expand All @@ -1683,6 +1695,7 @@ sub genTree {
$hitTree{C}->{$className}->{HITCOUNT} += $hitcount;
$hitTree{C}->{$className}->{HITCOUNT_P} += $hitcountP;
$hitTree{C}->{$className}->{READCOUNT} += $readcount;
$hitTree{C}->{$className}->{ROLLUPDOC} += $rollupDOC;
$hitTree{C}->{$className}->{MAPPED} += $totalBPmapped;
$hitTree{C}->{$className}->{FULLREADHITS} += $fullReadHits;
$hitTree{C}->{$className}->{CHILDREN}->{NAMES}->{$orderName} = ();
Expand All @@ -1694,6 +1707,7 @@ sub genTree {
$hitTree{P}->{$phylumName}->{HITCOUNT} += $hitcount;
$hitTree{P}->{$phylumName}->{HITCOUNT_P} += $hitcountP;
$hitTree{P}->{$phylumName}->{READCOUNT} += $readcount;
$hitTree{P}->{$phylumName}->{ROLLUPDOC} += $rollupDOC;
$hitTree{P}->{$phylumName}->{MAPPED} += $totalBPmapped;
$hitTree{P}->{$phylumName}->{FULLREADHITS} += $fullReadHits;
$hitTree{P}->{$phylumName}->{CHILDREN}->{NAMES}->{$className} = ();
Expand Down Expand Up @@ -2288,8 +2302,7 @@ sub mergeOverlappingHits {
# Record no. of actual hits to the reference GI
$gi2mergedFrags{$gi}->{HITCOUNT} = $_[0]->{$gi}->{HITCOUNT};
$gi2mergedFrags{$gi}->{HITCOUNT_P} = $_[0]->{$gi}->{HITCOUNT_P};
$gi2mergedFrags{$gi}->{READCOUNT} = $_[0]->{$gi}->{READCOUNT};

$gi2mergedFrags{$gi}->{READCOUNT} = $_[0]->{$gi}->{READCOUNT};
} #GI

return \%gi2mergedFrags;
Expand Down Expand Up @@ -2611,7 +2624,6 @@ sub parseSAM {
$MAPPED_PLASMID_RAWREADS_CNT++ if $uniqueGIlengths->{$gi}->{PLASMID};
$readlist->{$1} = 1;
}

} #FIELDS[2]
else {
print " **Unrecognized mapping entry!** [".$fields[2]."]\n";
Expand Down

0 comments on commit ea53e2b

Please sign in to comment.