Skip to content

Commit

Permalink
Add auxTargetFile option to quant (ref. COMBINE-lab/pufferfish#8)
Browse files Browse the repository at this point in the history
  • Loading branch information
Rob Patro committed Dec 31, 2019
1 parent 09822cf commit 7a37e8b
Show file tree
Hide file tree
Showing 10 changed files with 88 additions and 51 deletions.
4 changes: 4 additions & 0 deletions include/AlignmentLibrary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,8 @@ for (auto& txp : transcripts_) {
pmf, DistributionSpace::LINEAR);
}

salmon::utils::markAuxiliaryTargets(salmonOpts.jointLog, salmonOpts.auxTargetFile, transcripts_);

// Start parsing the alignments
NullFragmentFilter<FragT>* nff = nullptr;
bool onlyProcessAmbiguousAlignments = false;
Expand Down Expand Up @@ -248,6 +250,7 @@ for (auto& txp : transcripts_) {

alnMod_.reset(new AlignmentModel(1.0, salmonOpts.numErrorBins));
alnMod_->setLogger(salmonOpts.jointLog);
salmon::utils::markAuxiliaryTargets(salmonOpts.jointLog, salmonOpts.auxTargetFile, transcripts_);
}

EQBuilderT& equivalenceClassBuilder() { return eqBuilder_; }
Expand Down Expand Up @@ -502,6 +505,7 @@ for (auto& txp : transcripts_) {
}

private:

void setTranscriptLengthClasses_(std::vector<uint32_t>& lengths,
size_t nbins) {
auto n = lengths.size();
Expand Down
2 changes: 1 addition & 1 deletion include/ProgramOptionsGenerator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class ProgramOptionsGenerator{
po::options_description getAdvancedOptions(int32_t& numBiasSamples, SalmonOpts& sopt);
po::options_description getHiddenOptions(SalmonOpts& sopt);
po::options_description getTestingOptions(SalmonOpts& sopt);
po::options_description getDeprecatedOptions(SalmonOpts& sopt);
po::options_description getDeprecatedOptions(SalmonOpts& /*sopt*/);
};

}
Expand Down
2 changes: 2 additions & 0 deletions include/ReadExperiment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ class ReadExperiment {
}
}

salmon::utils::markAuxiliaryTargets(sopt.jointLog, sopt.auxTargetFile, transcripts_);

// Create the cluster forest for this set of transcripts
clusters_.reset(new ClusterForest(transcripts_.size(), transcripts_));
}
Expand Down
1 change: 1 addition & 0 deletions include/SalmonDefaults.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ namespace defaults {
constexpr const double decoyThreshold{1.0};
const std::string hitFilterPolicyStr{"AFTER"};
constexpr const double minAlnProb{1e-5};
const std::string auxTargetFile{""};

// FMD-specific options
constexpr const int fmdMinSeedLength{19};
Expand Down
1 change: 1 addition & 0 deletions include/SalmonOpts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ struct SalmonOpts {
// required for a read to be considered mapped.

std::string hitFilterPolicyStr{salmon::defaults::hitFilterPolicyStr};
std::string auxTargetFile{salmon::defaults::auxTargetFile};
pufferfish::util::HitFilterPolicy hitFilterPolicy{pufferfish::util::HitFilterPolicy::FILTER_AFTER_CHAINING};

bool splitSpanningSeeds; // Attempt to split seeds that span multiple
Expand Down
10 changes: 10 additions & 0 deletions include/SalmonUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ extern "C" {
template <typename EqBuilderT> class ReadExperiment;
class LibraryFormat;
class FragmentLengthDistribution;
class Transcript;

namespace salmon {
namespace utils {
Expand Down Expand Up @@ -122,6 +123,15 @@ bool readEquivCounts(boost::filesystem::path& eqFilePathString,
std::vector<std::vector<double>>& auxs_vals,
std::vector<uint32_t>& eqclass_counts );

/**
* @param log : The logger to which we should write any messages
* @param auxTargetFile : The name of the file containing any auxiliary target names
* @param transcripts : The list of transcript objects
*
* If the auxTargetFile is not empty (i.e. if the file exists)
**/
void markAuxiliaryTargets(std::shared_ptr<spdlog::logger> log, const std::string& auxTargetFile, std::vector<Transcript>& transcripts);

/*
template <typename AbundanceVecT, typename ReadExpT>
Eigen::VectorXd updateEffectiveLengths(
Expand Down
55 changes: 9 additions & 46 deletions include/Transcript.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,35 +41,29 @@ class Transcript {
EffectiveLength(-1.0), id(std::numeric_limits<uint32_t>::max()),
logPerBasePrior_(salmon::math::LOG_0), priorMass_(salmon::math::LOG_0),
mass_(salmon::math::LOG_0), sharedCount_(0.0),
avgMassBias_(salmon::math::LOG_0), active_(false) {
avgMassBias_(salmon::math::LOG_0), active_(false), skipBiasCorrection_(false) {
uniqueCount_.store(0);
totalCount_.store(0); // thanks @come-raczy
lastUpdate_.store(0);
lastTimestepUpdated_.store(0);
cachedEffectiveLength_.store(salmon::math::LOG_0);
}

Transcript(size_t idIn, const char* name, uint32_t len, double alpha = 0.05)
: RefName(name), RefLength(len), CompleteLength(len),
EffectiveLength(-1.0), id(idIn), logPerBasePrior_(std::log(alpha)),
priorMass_(std::log(alpha * len)), mass_(salmon::math::LOG_0),
sharedCount_(0.0), avgMassBias_(salmon::math::LOG_0), active_(false) {
sharedCount_(0.0), avgMassBias_(salmon::math::LOG_0), active_(false), skipBiasCorrection_(false) {
uniqueCount_.store(0);
totalCount_.store(0); // thanks @come-raczy
lastUpdate_.store(0);
lastTimestepUpdated_.store(0);
cachedEffectiveLength_.store(std::log(static_cast<double>(RefLength)));
}

Transcript(size_t idIn, const char* name, double len, bool /*updateEffLength*/, double alpha = 0.05)
: RefName(name), RefLength(len), CompleteLength(len),
EffectiveLength(len), id(idIn), logPerBasePrior_(std::log(alpha)),
priorMass_(std::log(alpha * len)), mass_(salmon::math::LOG_0),
sharedCount_(0.0), avgMassBias_(salmon::math::LOG_0), active_(false) {
sharedCount_(0.0), avgMassBias_(salmon::math::LOG_0), active_(false), skipBiasCorrection_(false) {
uniqueCount_.store(0);
totalCount_.store(0); // thanks @come-raczy
lastUpdate_.store(0);
lastTimestepUpdated_.store(0);
cachedEffectiveLength_.store(std::log(len));
}

Expand All @@ -96,10 +90,8 @@ class Transcript {

uniqueCount_.store(other.uniqueCount_);
totalCount_.store(other.totalCount_.load());
lastTimestepUpdated_.store(other.lastTimestepUpdated_.load());
sharedCount_.store(other.sharedCount_.load());
mass_.store(other.mass_.load());
lastUpdate_.store(other.lastUpdate_.load());
cachedEffectiveLength_.store(other.cachedEffectiveLength_.load());
lengthClassIndex_ = other.lengthClassIndex_;
logPerBasePrior_ = other.logPerBasePrior_;
Expand All @@ -108,6 +100,7 @@ class Transcript {
hasAnchorFragment_.store(other.hasAnchorFragment_.load());
active_ = other.active_;
isDecoy_ = other.isDecoy_;
skipBiasCorrection_ = other.skipBiasCorrection_;
}

Transcript& operator=(Transcript&& other) {
Expand All @@ -128,10 +121,8 @@ class Transcript {

uniqueCount_.store(other.uniqueCount_);
totalCount_.store(other.totalCount_.load());
lastTimestepUpdated_.store(other.lastTimestepUpdated_.load());
sharedCount_.store(other.sharedCount_.load());
mass_.store(other.mass_.load());
lastUpdate_.store(other.lastUpdate_.load());
cachedEffectiveLength_.store(other.cachedEffectiveLength_.load());
lengthClassIndex_ = other.lengthClassIndex_;
logPerBasePrior_ = other.logPerBasePrior_;
Expand All @@ -140,6 +131,7 @@ class Transcript {
hasAnchorFragment_.store(other.hasAnchorFragment_.load());
active_ = other.active_;
isDecoy_ = other.isDecoy_;
skipBiasCorrection_ = other.skipBiasCorrection_;
return *this;
}

Expand Down Expand Up @@ -198,13 +190,6 @@ class Transcript {
salmon::utils::incLoop(sharedCount_, sc);
}

inline void setLastTimestepUpdated(uint64_t currentTimestep) {
uint64_t oldTimestep = lastTimestepUpdated_;
if (currentTimestep > oldTimestep) {
lastTimestepUpdated_ = currentTimestep;
}
}

inline void addBias(double bias) {
salmon::utils::incLoopLog(avgMassBias_, bias);
}
Expand Down Expand Up @@ -284,30 +269,7 @@ class Transcript {
cachedEffectiveLength_.store(cel);
}

/**
* If we should update the effective length, then do it and cache the result.
* Otherwise, return the cached result.
*/
/*
double getLogEffectiveLength(const FragmentLengthDistribution& fragLengthDist,
size_t currObs, size_t burnInObs, bool
forceUpdate=false) { if (forceUpdate or (lastUpdate_ == 0) or (currObs -
lastUpdate_ >= 250000) or (lastUpdate_ < burnInObs and currObs > burnInObs)) {
// compute new number
lastUpdate_.store(currObs);
double cel = computeLogEffectiveLength(fragLengthDist);
cachedEffectiveLength_.store(cel);
//priorMass_ = cel + logPerBasePrior_;
return cachedEffectiveLength_.load();
} else {
// return cached number
return cachedEffectiveLength_.load();
}
}
*/

double perBasePrior() { return std::exp(logPerBasePrior_); }
inline size_t lastTimestepUpdated() { return lastTimestepUpdated_.load(); }

void lengthClassIndex(uint32_t ind) { lengthClassIndex_ = ind; }
uint32_t lengthClassIndex() const { return lengthClassIndex_; }
Expand Down Expand Up @@ -528,6 +490,9 @@ class Transcript {

void computePolyAPositions() { computePolyAPositions_(); }

void setSkipBiasCorrection(bool skip) { skipBiasCorrection_ = skip; }
bool skipBiasCorrection() const { return skipBiasCorrection_; }

std::string RefName;
uint32_t RefLength;
uint32_t CompleteLength;
Expand Down Expand Up @@ -707,13 +672,10 @@ class Transcript {

std::atomic<size_t> uniqueCount_;
std::atomic<size_t> totalCount_;
// The most recent timestep at which this transcript's mass was updated.
std::atomic<size_t> lastTimestepUpdated_;
double priorMass_;
tbb::atomic<double> mass_;
tbb::atomic<double> sharedCount_;
tbb::atomic<double> cachedEffectiveLength_;
tbb::atomic<size_t> lastUpdate_;
tbb::atomic<double> avgMassBias_;
uint32_t lengthClassIndex_;
double logPerBasePrior_;
Expand All @@ -723,6 +685,7 @@ class Transcript {
std::atomic<bool> hasAnchorFragment_{false};
bool active_;
bool isDecoy_{false};
bool skipBiasCorrection_{false};

bool reduceGCMemory_{false};
double gcFracLen_{0.0};
Expand Down
11 changes: 8 additions & 3 deletions src/ProgramOptionsGenerator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,15 @@ namespace salmon {
"transcript identifier and the \"gene_id\" is assumed to contain the "
"corresponding "
"gene identifier.")
("auxTargetFile", po::value(&(sopt.auxTargetFile))->default_value(salmon::defaults::auxTargetFile),
"A file containing a list of \"auxiliary\" targets. These are valid targets (i.e., not decoys) to "
"which fragments are allowed to map and be assigned, and which will be quantified, but for which "
"auxiliary models like sequence-specific and fragment-GC bias correction should not be applied."
)
("meta", po::bool_switch(&(sopt.meta))->default_value(salmon::defaults::metaMode),
"If you're using Salmon on a metagenomic dataset, consider setting this "
"flag to disable parts of the "
"abundance estimation model that make less sense for metagenomic data.");
"flag to disable parts of the abundance estimation model that make less "
"sense for metagenomic data.");

// use rich eq classes by default
sopt.noRichEqClasses = salmon::defaults::noRichEqClasses;
Expand Down Expand Up @@ -796,7 +801,7 @@ namespace salmon {
return testing;
}

po::options_description ProgramOptionsGenerator::getDeprecatedOptions(SalmonOpts& sopt) {
po::options_description ProgramOptionsGenerator::getDeprecatedOptions(SalmonOpts& /*sopt*/) {
using std::string;
using std::vector;

Expand Down
1 change: 0 additions & 1 deletion src/SalmonQuantifyAlignments.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,6 @@ void processMiniBatch(AlignmentLibraryT<FragT>& alnLib,

double newMass = logForgettingMass + aln->logProb;
transcript.addMass(newMass);
transcript.setLastTimestepUpdated(currentMinibatchTimestep);

// ---- Collect seq-specific bias samples ------ //
auto getCIGARLength = [](bam_seq_t* s) -> uint32_t {
Expand Down
52 changes: 52 additions & 0 deletions src/SalmonUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "TryableSpinLock.hpp"
#include "UnpairedRead.hpp"
#include "TranscriptGroup.hpp"
#include "Transcript.hpp"

#include "spdlog/fmt/fmt.h"
#include "spdlog/fmt/ostr.h"
Expand Down Expand Up @@ -2197,6 +2198,53 @@ bool readEquivCounts(boost::filesystem::path& eqFilePathString,
return true;
}

/**
* @param log : The logger to which we should write any messages
* @param auxTargetFile : The name of the file containing any auxiliary target names
* @param transcripts : The list of transcript objects
*
* If the auxTargetFile is not empty (i.e. if the file exists)
**/
void markAuxiliaryTargets(std::shared_ptr<spdlog::logger> log, const std::string& auxTargetFile, std::vector<Transcript>& transcripts) {
namespace bfs = boost::filesystem;
// If the aux file is empty or doesn't exist
if (auxTargetFile == "") {
return;
} else if (!bfs::exists(auxTargetFile)) {
log->warn("The auxiliary target file {}, does not exist. No targets will be treated as auxiliary.",
auxTargetFile);
return;
}

std::ifstream auxFile(auxTargetFile);
if (!auxFile.good()) {
log->warn("Could not open the auxiliary target file {}. No targets will be treated as auxiliary.",
auxTargetFile);
return;
}

spp::sparse_hash_set<std::string> auxTargetNames;
std::string tname;
while (auxFile >> tname) { auxTargetNames.insert(tname); }

log->info("Parsed {:n} auxiliary targets from {}", auxTargetNames.size(), auxTargetFile);

size_t numAuxFound = 0;
for (auto& txp : transcripts) {
bool isAux = auxTargetNames.contains(txp.RefName);
txp.setSkipBiasCorrection(isAux);
numAuxFound += isAux ? 1 : 0;
}

if (numAuxFound != auxTargetNames.size()) {
log->warn("While {:n} auxiliary target names were found in {}, only {:n} were actually found "
"among tanscripts in the index. Please make sure that the names in {} match the "
"transcript names in the index as expected.", auxTargetNames.size(), auxTargetFile,
numAuxFound, auxTargetFile);
}
auxFile.close();
}

/**
* Computes (and returns) new effective lengths for the transcripts
* based on the current abundance estimates (alphas) and the current
Expand Down Expand Up @@ -2504,6 +2552,10 @@ int contextSize = outsideContext + insideContext;
// Get the transcript
const auto& txp = transcripts[it];

// If this transcript is in the list of transcripts for which the user
// has requested we skip bias correction, then do so
if (txp.skipBiasCorrection()) { continue; }

// Get the reference length and the
// "initial" effective length (not considering any biases)
int32_t refLen = static_cast<int32_t>(txp.RefLength);
Expand Down

0 comments on commit 7a37e8b

Please sign in to comment.