diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..e9ab196 --- /dev/null +++ b/.clang-format @@ -0,0 +1,28 @@ +# This file is part of preseq +# +# Copyright (C) 2024: Andrew D. Smith +# +# Authors: Andrew D. Smith +# +# This is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. + +BasedOnStyle: LLVM +ColumnLimit: 80 +IndentWidth: 2 +AlwaysBreakAfterReturnType: TopLevel +ContinuationIndentWidth: 2 +ConstructorInitializerIndentWidth: 2 +BraceWrapping: + BeforeElse: true + BeforeCatch: true +BreakBeforeBraces: Custom +BreakConstructorInitializers: AfterColon +SpacesBeforeTrailingComments: 2 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml new file mode 100644 index 0000000..874410b --- /dev/null +++ b/.github/workflows/clang-format.yml @@ -0,0 +1,39 @@ +# This file is part of preseq +# +# Copyright (C) 2024: Andrew D. Smith +# +# Authors: Andrew D. Smith +# +# This is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. 
+ +name: Source formatting with clang-format + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + workflow_dispatch: + +jobs: + clang-format: + runs-on: ubuntu-24.04 + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install dependencies + run: sudo apt-get install -y clang-format + + - name: Run clang-format + run: | + clang-format --dry-run -Werror $(git ls-files '*.*pp') diff --git a/.github/workflows/cpplint.yml b/.github/workflows/cpplint.yml new file mode 100644 index 0000000..24627c4 --- /dev/null +++ b/.github/workflows/cpplint.yml @@ -0,0 +1,52 @@ +# This file is part of preseq +# +# Copyright (C) 2024: Andrew D. Smith +# +# Authors: Andrew D. Smith +# +# This is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. + +name: Linting with cpplint + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + workflow_dispatch: + +jobs: + cpplint: + runs-on: ubuntu-24.04 + strategy: + matrix: + python-version: ["3.12"] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Python setup ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install cpplint + run: | + python -m pip install --upgrade pip + pip install cpplint + + - name: Check cpplint version + run: cpplint --version + + - name: Run cpplint + run: | + cpplint --quiet --repository=. 
$(git ls-files '*.*pp') diff --git a/CPPLINT.cfg b/CPPLINT.cfg new file mode 100644 index 0000000..52bd4ce --- /dev/null +++ b/CPPLINT.cfg @@ -0,0 +1,28 @@ +# This file is part of preseq +# +# Copyright (C) 2024: Andrew D. Smith +# +# Authors: Andrew D. Smith +# +# This is free software: you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. + +set noparent +filter=-runtime/references +filter=-build/include_subdir +filter=-build/include_order +filter=-build/c++11 +filter=-build/c++17 +# Formatting below handled by clang-format +filter=-whitespace/line_length +filter=-whitespace/newline +filter=-readability/braces +filter=-whitespace/semicolon +filter=-whitespace/indent diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 0000000..c4c9ee6 --- /dev/null +++ b/autogen.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# +# Run 'autoreconf -i' to generate 'configure', 'Makefile.in', etc. +# +# The first time this is run on a new cloned git repo the configure +# script will not be present, only the configure.ac and +# Makefile.am. The rest must be generated by `autoreconf -i`. +# +# If you are working with a distribution (file ending with ".tar.gz" +# or similar) then this script should not be needed, and should not be +# present, as all the files should already exist. You should only run +# this script if you know what you are doing with autoreconf. +# +# This script will only work with an argument to confirm the help +# message has been read. 
+ +runautoreconf() { + autoreconf -i; +} + +if test -d .git && test "$(basename "${PWD}")" = "preseq" +then + runautoreconf + exit 0 +else + echo " It seems you are either attempting to run this script " + echo " from the wrong directory, or in a source tree that was " + echo " not obtained by cloning the preseq git repo. " + echo " " + echo " ./autogen.sh generates the configure script. " + echo " " + echo " Only run this if you know what you are doing with " + echo " autoreconf and are simply avoiding doing that. If you " + echo " just want to use the software, download a release and " + echo " this script will not be needed. " + exit 1 +fi diff --git a/src/continued_fraction.cpp b/src/continued_fraction.cpp index 2a10dfb..00709c1 100644 --- a/src/continued_fraction.cpp +++ b/src/continued_fraction.cpp @@ -19,15 +19,16 @@ #include "continued_fraction.hpp" -#include -#include +#include #include +#include +#include -using std::vector; -using std::min; +using std::fabs; using std::isfinite; +using std::min; using std::pow; -using std::fabs; +using std::vector; // ADS: the std::pow function is used frequently to get (-1)^x for // integer x. 
This doesn't make sense, and should be replaced at some @@ -43,30 +44,29 @@ using std::fabs; */ static void quotdiff_algorithm(const vector &ps_coeffs, vector &cf_coeffs) { - - const size_t depth = ps_coeffs.size(); // degree of power series + const size_t depth = ps_coeffs.size(); // degree of power series // q_table[0] never used, and undefined - vector > q_table(depth, vector(depth + 1, 0.0)); + vector> q_table(depth, vector(depth + 1, 0.0)); // q_table[1][j] = ratio of ps coefficients for (size_t j = 0; j < depth - 1; j++) - q_table[1][j] = ps_coeffs[j + 1]/ps_coeffs[j]; + q_table[1][j] = ps_coeffs[j + 1] / ps_coeffs[j]; // e_table[0] is always 0 - vector > e_table(depth, vector(depth + 1, 0.0)); + vector> e_table(depth, vector(depth + 1, 0.0)); // e_table[1] follows the general recurrence (same as in loop below) for (size_t j = 0; j < depth - 1; j++) - e_table[1][j] = q_table[1][j+1] - q_table[1][j] + e_table[0][j+1]; + e_table[1][j] = q_table[1][j + 1] - q_table[1][j] + e_table[0][j + 1]; // using intial values of E(i)(j)'s and Q(i)(j)'s, fill rest of the // q table and e table for (size_t i = 2; i < depth; i++) { - for (size_t j = 0; j < depth; j++) - q_table[i][j] = q_table[i-1][j+1]*e_table[i-1][j+1]/e_table[i-1][j]; + q_table[i][j] = + q_table[i - 1][j + 1] * e_table[i - 1][j + 1] / e_table[i - 1][j]; for (size_t j = 0; j < depth; j++) - e_table[i][j] = q_table[i][j+1] - q_table[i][j] + e_table[i-1][j+1]; + e_table[i][j] = q_table[i][j + 1] - q_table[i][j] + e_table[i - 1][j + 1]; } cf_coeffs.resize(depth); @@ -74,10 +74,9 @@ quotdiff_algorithm(const vector &ps_coeffs, vector &cf_coeffs) { cf_coeffs[0] = ps_coeffs[0]; // set remaining CF coefficients from e and q table values for (size_t i = 1; i < depth; ++i) - cf_coeffs[i] = (i % 2 == 0) ? -e_table[i/2][0] : -q_table[(i + 1)/2][0]; + cf_coeffs[i] = (i % 2 == 0) ? 
-e_table[i / 2][0] : -q_table[(i + 1) / 2][0]; } - /* compute CF coeffs when upper_offset > 0 above the diagonal; this * means degree of polynomial in numerator of Pade approximant is * greater than degree of polynomial in the denominator @@ -86,7 +85,6 @@ static void quotdiff_above_diagonal(const vector &ps_coeffs, const size_t offset, vector &cf_coeffs, vector &offset_coeffs) { - // get the high order PS coeffs for approximation by CF vector high_ps_coeffs(begin(ps_coeffs) + offset, end(ps_coeffs)); @@ -98,23 +96,20 @@ quotdiff_above_diagonal(const vector &ps_coeffs, const size_t offset, offset_coeffs.resize(offset); } - // calculate CF coeffs when lower_offset > 0 static void quotdiff_below_diagonal(const vector &ps_coeffs, const size_t offset, vector &cf_coeffs, vector &offset_coeffs) { - // need to work with reciprocal series g = 1/f, then invert vector recip_ps_coeffs(ps_coeffs.size()); - recip_ps_coeffs[0] = 1.0/ps_coeffs[0]; + recip_ps_coeffs[0] = 1.0 / ps_coeffs[0]; for (size_t i = 1; i < ps_coeffs.size(); ++i) { - double x = 0.0; for (size_t j = 0; j < i; ++j) - x += ps_coeffs[i - j]*recip_ps_coeffs[j]; + x += ps_coeffs[i - j] * recip_ps_coeffs[j]; - recip_ps_coeffs[i] = -x/ps_coeffs[0]; + recip_ps_coeffs[i] = -x / ps_coeffs[0]; } // qd to compute cf_coeffs using remaining coeffs @@ -127,7 +122,6 @@ quotdiff_below_diagonal(const vector &ps_coeffs, const size_t offset, offset_coeffs.resize(offset); } - /* decrease degree of CF keeping coeffs equal to original */ void decrease_degree(const size_t decrement, ContinuedFraction &the_cf) { @@ -137,7 +131,6 @@ decrease_degree(const size_t decrement, ContinuedFraction &the_cf) { the_cf.degree -= decrement; } - void truncate_degree(const size_t n_terms, ContinuedFraction &the_cf) { if (the_cf.degree < n_terms) { @@ -150,15 +143,14 @@ truncate_degree(const size_t n_terms, ContinuedFraction &the_cf) { } } - -ContinuedFraction::ContinuedFraction(const vector &ps_cf, - const int di, const size_t dg) : 
+ContinuedFraction::ContinuedFraction(const vector &ps_cf, const int di, + const size_t dg) : ps_coeffs(ps_cf), diagonal_idx(di), degree(dg) { if (diagonal_idx == 0) quotdiff_algorithm(ps_coeffs, cf_coeffs); else if (diagonal_idx > 0) quotdiff_above_diagonal(ps_coeffs, diagonal_idx, cf_coeffs, offset_coeffs); - else // if (cont_frac_estimate.lower_offset > 0) { + else // if (cont_frac_estimate.lower_offset > 0) { quotdiff_below_diagonal(ps_coeffs, -diagonal_idx, cf_coeffs, offset_coeffs); // NOTE: negative sign "-" (-diagonal_idx > 0) for below diagonal } @@ -168,23 +160,21 @@ ContinuedFraction::ContinuedFraction(const vector &ps_cf, static double get_rescale_value(const double numerator, const double denominator) { - static const double tolerance = 1e-20; // magic + static const double tolerance = 1e-20; // magic const double rescale_val = fabs(numerator) + fabs(denominator); - if (rescale_val > 1.0/tolerance) - return 1.0/rescale_val; + if (rescale_val > 1.0 / tolerance) + return 1.0 / rescale_val; else if (rescale_val < tolerance) - return 1.0/rescale_val; + return 1.0 / rescale_val; return 1.0; } - /* calculate ContinuedFraction approx when there is no offset uses euler's * recursion */ static double -evaluate_on_diagonal(const vector &cf_coeffs, - const double val, const size_t depth) { - +evaluate_on_diagonal(const vector &cf_coeffs, const double val, + const size_t depth) { // initialize double current_num = 0.0; double prev_num1 = cf_coeffs[0]; @@ -195,10 +185,9 @@ evaluate_on_diagonal(const vector &cf_coeffs, double prev_denom2 = 1.0; for (size_t i = 1; i < min(cf_coeffs.size(), depth); i++) { - // calculate current values - current_num = prev_num1 + cf_coeffs[i]*val*prev_num2; - current_denom = prev_denom1 + cf_coeffs[i]*val*prev_denom2; + current_num = prev_num1 + cf_coeffs[i] * val * prev_num2; + current_denom = prev_denom1 + cf_coeffs[i] * val * prev_denom2; // update previous values prev_num2 = prev_num1; @@ -219,49 +208,44 @@ 
evaluate_on_diagonal(const vector &cf_coeffs, prev_denom1 *= rescale_val; prev_denom2 *= rescale_val; } - return current_num/current_denom; + return current_num / current_denom; } static double evaluate_power_series(const vector &ps_coeffs, const double val) { double x = 0.0; for (size_t i = 0; i < ps_coeffs.size(); i++) - x += ps_coeffs[i]*pow(val, i); + x += ps_coeffs[i] * pow(val, i); return x; } - /* evaluate CF when upper_offset > 0 using Euler's recursion */ static double evaluate_above_diagonal(const vector &cf_coeffs, - const vector &offset_coeffs, - const double val, const size_t depth) { - + const vector &offset_coeffs, const double val, + const size_t depth) { const double cf_part = evaluate_on_diagonal(cf_coeffs, val, depth - offset_coeffs.size()); const double ps_part = evaluate_power_series(offset_coeffs, val); - return ps_part + pow(val, offset_coeffs.size())*cf_part; + return ps_part + pow(val, offset_coeffs.size()) * cf_part; } - // calculate ContinuedFraction approx when lower_offdiag > 0 static double evaluate_below_diagonal(const vector &cf_coeffs, - const vector &offset_coeffs, - const double val, const size_t depth) { - + const vector &offset_coeffs, const double val, + const size_t depth) { const double cf_part = evaluate_on_diagonal(cf_coeffs, val, depth - offset_coeffs.size()); const double ps_part = evaluate_power_series(offset_coeffs, val); // recall that if lower_offset > 0, we are working with 1/f, invert approx - return 1.0/(ps_part + pow(val, offset_coeffs.size())*cf_part); + return 1.0 / (ps_part + pow(val, offset_coeffs.size()) * cf_part); } - // evaluate CF at a given point double ContinuedFraction::operator()(const double val) const { @@ -273,9 +257,8 @@ ContinuedFraction::operator()(const double val) const { return evaluate_on_diagonal(cf_coeffs, val, degree); } - -std::ostream& -operator<<(std::ostream& the_stream, const ContinuedFraction &cf) { +std::ostream & +operator<<(std::ostream &the_stream, const ContinuedFraction &cf) { 
using std::ios_base; using std::setw; @@ -285,17 +268,16 @@ operator<<(std::ostream& the_stream, const ContinuedFraction &cf) { the_stream << "OFFSET_COEFFS" << '\t' << "PS_COEFFS" << '\n'; const size_t offset = cf.offset_coeffs.size(); for (size_t i = 0; i < offset; ++i) - the_stream << setw(12) << cf.offset_coeffs[i] << '\t' - << setw(12) << cf.ps_coeffs[i] << '\n'; + the_stream << setw(12) << cf.offset_coeffs[i] << '\t' << setw(12) + << cf.ps_coeffs[i] << '\n'; the_stream << "CF_COEFFS" << '\n'; for (size_t i = 0; i < cf.cf_coeffs.size(); ++i) - the_stream << setw(12) << cf.cf_coeffs[i] << '\t' - << setw(12) << cf.ps_coeffs[i + offset] << '\n'; + the_stream << setw(12) << cf.cf_coeffs[i] << '\t' << setw(12) + << cf.ps_coeffs[i + offset] << '\n'; the_stream.flags(orig_flags); return the_stream; } - // estimate yields by evaluating the CF at given points void ContinuedFraction::extrapolate_distinct(const double max_value, @@ -304,7 +286,7 @@ ContinuedFraction::extrapolate_distinct(const double max_value, estimates.clear(); estimates.push_back(0); for (double t = step_size; t <= max_value; t += step_size) - estimates.push_back(t*operator()(t)); + estimates.push_back(t * operator()(t)); } //////////////////////////////////////////////////////////////////////// @@ -321,7 +303,6 @@ const double CFA::search_step_size = 0.05; */ bool check_yield_estimates_stability(const vector &estimates) { - // require estimates are non-negative and finite for (size_t i = 0; i < estimates.size(); ++i) if (!std::isfinite(estimates[i]) || estimates[i] < 0.0) @@ -334,7 +315,7 @@ check_yield_estimates_stability(const vector &estimates) { // require negative second derivative for (size_t i = 2; i < estimates.size(); ++i) - if (estimates[i-1] - estimates[i-2] < estimates[i] - estimates[i-1]) + if (estimates[i - 1] - estimates[i - 2] < estimates[i] - estimates[i - 1]) return false; return !estimates.empty(); @@ -346,7 +327,6 @@ check_yield_estimates_stability(const vector &estimates) { */ 
ContinuedFraction CFA::optimal_cont_frac_distinct(const vector &counts_hist) const { - // we expect to use an underestimate, but this is dealt with outside // by ensuring we have an even number of max terms @@ -355,7 +335,7 @@ CFA::optimal_cont_frac_distinct(const vector &counts_hist) const { vector ps_coeffs; for (size_t j = 1; j <= max_terms; j++) - ps_coeffs.push_back(counts_hist[j]*pow(-1.0, j + 1)); + ps_coeffs.push_back(counts_hist[j] * pow(-1.0, j + 1)); ContinuedFraction full_cf(ps_coeffs, diagonal_idx, max_terms); @@ -367,13 +347,13 @@ CFA::optimal_cont_frac_distinct(const vector &counts_hist) const { return full_cf; } else { - // if max terms >= 7, start at 7 and check increasing cont frac's for (size_t i = 7 + (max_terms % 2 == 0); i <= max_terms; i += 2) { ContinuedFraction trunc_cf(full_cf); truncate_degree(i, trunc_cf); vector estimates; - trunc_cf.extrapolate_distinct(search_max_val, search_step_size, estimates); + trunc_cf.extrapolate_distinct(search_max_val, search_step_size, + estimates); if (check_yield_estimates_stability(estimates)) return trunc_cf; } diff --git a/src/continued_fraction.hpp b/src/continued_fraction.hpp index 33c4689..185b184 100644 --- a/src/continued_fraction.hpp +++ b/src/continued_fraction.hpp @@ -17,20 +17,20 @@ * along with this program. If not, see . 
*/ -#ifndef CONTINUED_FRACTION_HPP -#define CONTINUED_FRACTION_HPP +#ifndef SRC_CONTINUED_FRACTION_HPP_ +#define SRC_CONTINUED_FRACTION_HPP_ -#include -#include +#include #include #include -#include +#include +#include struct ContinuedFraction { // Constructors ContinuedFraction() : diagonal_idx(0), degree(0ul) {} - ContinuedFraction(const std::vector &ps_cf, - const int di, const size_t dg); + ContinuedFraction(const std::vector &ps_cf, const int di, + const size_t dg); // Evaluate the continued fraction double operator()(const double val) const; @@ -40,12 +40,11 @@ struct ContinuedFraction { // Evaluate the continued fraction estimating distinct // along a curve from 0 to max_value - void - extrapolate_distinct(const double max_value, const double step_size, - std::vector &estimates) const; + void extrapolate_distinct(const double max_value, const double step_size, + std::vector &estimates) const; - bool is_valid() const {return !cf_coeffs.empty();} - size_t return_degree() const {return degree;} + bool is_valid() const { return !cf_coeffs.empty(); } + size_t return_degree() const { return degree; } std::vector ps_coeffs; std::vector cf_coeffs; @@ -54,14 +53,15 @@ struct ContinuedFraction { size_t degree; }; - // get continued fraction with lower degree -void decrease_degree(const size_t decrement, ContinuedFraction &cf); -void truncate_degree(const size_t truncated_degree, ContinuedFraction &cf); +// get continued fraction with lower degree +void +decrease_degree(const size_t decrement, ContinuedFraction &cf); +void +truncate_degree(const size_t truncated_degree, ContinuedFraction &cf); std::ostream & operator<<(std::ostream &out, const ContinuedFraction &cf); - class ContinuedFractionApproximation { public: ContinuedFractionApproximation(const int di, const size_t mt) : @@ -71,12 +71,11 @@ class ContinuedFractionApproximation { ContinuedFraction optimal_cont_frac_distinct(const std::vector &counts_hist) const; - int get_diagonal() const {return diagonal_idx;} + 
int get_diagonal() const { return diagonal_idx; } private: - - int diagonal_idx; // the diagonal to work with for estimates - size_t max_terms; // the maximum number of terms to try for a CF + int diagonal_idx; // the diagonal to work with for estimates + size_t max_terms; // the maximum number of terms to try for a CF /* note: these never change */ static const size_t min_allowed_degree; @@ -84,12 +83,11 @@ class ContinuedFractionApproximation { // largest value to search for lowerbound and stability static const double search_max_val; - //step size for search of lowerbound and stability + // step size for search of lowerbound and stability static const double search_step_size; - }; bool check_yield_estimates_stability(const std::vector &estimates); -#endif +#endif // SRC_CONTINUED_FRACTION_HPP_ diff --git a/src/load_data_for_complexity.cpp b/src/load_data_for_complexity.cpp index 7012cbf..1e05891 100644 --- a/src/load_data_for_complexity.cpp +++ b/src/load_data_for_complexity.cpp @@ -19,25 +19,32 @@ #include "load_data_for_complexity.hpp" +#include "GenomicRegion.hpp" +#include "MappedRead.hpp" + +#include + +#include // std::min +#include +#include #include +#include #include -#include #include -#include - -#include "GenomicRegion.hpp" -#include "MappedRead.hpp" +#include // std::swap -using std::string; -using std::vector; -using std::priority_queue; -using std::min; +using std::cerr; using std::endl; using std::max; -using std::cerr; -using std::unordered_map; -using std::runtime_error; +using std::min; using std::mt19937; +using std::priority_queue; +using std::runtime_error; +using std::size_t; +using std::string; +using std::uint64_t; +using std::unordered_map; +using std::vector; ////////////////////////////////////////////////////////////////////// // Data imputation @@ -47,7 +54,7 @@ static bool update_pe_duplicate_counts_hist(const GenomicRegion &curr_gr, const GenomicRegion &prev_gr, vector &counts_hist, - size_t ¤t_count){ + size_t ¤t_count) { // 
check if reads are sorted if (curr_gr.same_chrom(prev_gr) && curr_gr.get_start() < prev_gr.get_start() && @@ -60,49 +67,42 @@ update_pe_duplicate_counts_hist(const GenomicRegion &curr_gr, if (!curr_gr.same_chrom(prev_gr) || curr_gr.get_start() != prev_gr.get_start() || curr_gr.get_end() != prev_gr.get_end()) { - // histogram is too small, resize if (counts_hist.size() < current_count + 1) counts_hist.resize(current_count + 1, 0.0); ++counts_hist[current_count]; current_count = 1; } - else // next read is same, update current_count + else // next read is same, update current_count ++current_count; return true; } - static void update_se_duplicate_counts_hist(const GenomicRegion &curr_gr, const GenomicRegion &prev_gr, const string input_file_name, vector &counts_hist, - size_t ¤t_count){ + size_t ¤t_count) { // check if reads are sorted - if (curr_gr.same_chrom(prev_gr) && - curr_gr.get_start() < prev_gr.get_start()) - throw runtime_error("locations unsorted in: " - + input_file_name); + if (curr_gr.same_chrom(prev_gr) && curr_gr.get_start() < prev_gr.get_start()) + throw runtime_error("locations unsorted in: " + input_file_name); if (!curr_gr.same_chrom(prev_gr) || - curr_gr.get_start() != prev_gr.get_start()) + curr_gr.get_start() != prev_gr.get_start()) { // next read is new, update counts_hist to include current_count - { - // histogram is too small, resize - if(counts_hist.size() < current_count + 1) - counts_hist.resize(current_count + 1, 0.0); - ++counts_hist[current_count]; - current_count = 1; - } - else // next read is same, update current_count + // histogram is too small, resize + if (counts_hist.size() < current_count + 1) + counts_hist.resize(current_count + 1, 0.0); + ++counts_hist[current_count]; + current_count = 1; + } + else // next read is same, update current_count ++current_count; } - - -/////comparison function for priority queue///////////////// +// comparison function for priority queue /**************** FOR CLARITY BELOW WHEN COMPARING READS 
*************/ static inline bool @@ -123,47 +123,39 @@ end_greater(const GenomicRegion &a, const GenomicRegion &b) { } /******************************************************************************/ - struct GenomicRegionOrderChecker { bool operator()(const GenomicRegion &prev, const GenomicRegion &gr) const { return start_check(prev, gr); } - static bool - start_check(const GenomicRegion &prev, const GenomicRegion &gr) { - return (chrom_greater(prev, gr) - || (prev.same_chrom(gr) && start_greater(prev, gr)) - || (prev.same_chrom(gr) && same_start(prev, gr) - && end_greater(prev, gr))); + static bool start_check(const GenomicRegion &prev, const GenomicRegion &gr) { + return ( + chrom_greater(prev, gr) || + (prev.same_chrom(gr) && start_greater(prev, gr)) || + (prev.same_chrom(gr) && same_start(prev, gr) && end_greater(prev, gr))); } }; - -typedef priority_queue, - GenomicRegionOrderChecker> ReadPQ; - +typedef priority_queue, + GenomicRegionOrderChecker> + ReadPQ; static bool -is_ready_to_pop(const ReadPQ &pq, - const GenomicRegion &gr, +is_ready_to_pop(const ReadPQ &pq, const GenomicRegion &gr, const size_t max_width) { return !pq.top().same_chrom(gr) || - pq.top().get_end() + max_width < gr.get_start(); + pq.top().get_end() + max_width < gr.get_start(); } - static void -empty_pq(GenomicRegion &curr_gr, GenomicRegion &prev_gr, - size_t ¤t_count, vector &counts_hist, - ReadPQ &read_pq, const string &input_file_name) { - +empty_pq(GenomicRegion &curr_gr, GenomicRegion &prev_gr, size_t ¤t_count, + vector &counts_hist, ReadPQ &read_pq, + const string &input_file_name) { curr_gr = read_pq.top(); read_pq.pop(); // update counts hist - const bool UPDATE_SUCCESS = - update_pe_duplicate_counts_hist(curr_gr, prev_gr, counts_hist, - current_count); + const bool UPDATE_SUCCESS = update_pe_duplicate_counts_hist( + curr_gr, prev_gr, counts_hist, current_count); if (!UPDATE_SUCCESS) { std::ostringstream oss; oss << "reads unsorted in: " << input_file_name << "\n" @@ -175,7 
+167,6 @@ empty_pq(GenomicRegion &curr_gr, GenomicRegion &prev_gr, prev_gr = curr_gr; } - /* * This code is used to deal with read data in BAM format. */ @@ -184,13 +175,11 @@ empty_pq(GenomicRegion &curr_gr, GenomicRegion &prev_gr, #include "htslib_wrapper_deprecated.hpp" size_t -load_counts_BAM_se(const string &input_file_name, - vector &counts_hist) { +load_counts_BAM_se(const string &input_file_name, vector &counts_hist) { const string mapper = "general"; SAMReader_deprecated sam_reader(input_file_name, mapper); - if(!sam_reader) - throw runtime_error("problem opening input file " - + input_file_name); + if (!sam_reader) + throw runtime_error("problem opening input file " + input_file_name); SAMRecord samr; sam_reader >> samr; @@ -206,17 +195,13 @@ load_counts_BAM_se(const string &input_file_name, while (sam_reader >> samr) { // only convert mapped and primary reads if (samr.is_primary && samr.is_mapped) { - // ignore unmapped reads & secondary alignments if (!(samr.is_mapping_paired) || - (samr.is_mapping_paired && samr.is_Trich)){ - //only count unpaired reads or the left mate of paired reads - + (samr.is_mapping_paired && samr.is_Trich)) { + // only count unpaired reads or the left mate of paired reads curr_mr = samr.mr; - update_se_duplicate_counts_hist(curr_mr.r, prev_mr.r, - input_file_name, - counts_hist, - current_count); + update_se_duplicate_counts_hist(curr_mr.r, prev_mr.r, input_file_name, + counts_hist, current_count); // update number of reads and prev read ++n_reads; @@ -239,7 +224,6 @@ static bool merge_mates(const size_t suffix_len, const size_t range, const GenomicRegion &one, const GenomicRegion &two, GenomicRegion &merged, int &len) { - assert(one.same_chrom(two)); const size_t read_start = min(one.get_start(), two.get_start()); const size_t read_end = max(one.get_end(), two.get_end()); @@ -264,43 +248,38 @@ merge_mates(const size_t suffix_len, const size_t range, } inline static bool -same_read(const size_t suffix_len, - const MappedRead 
&a, const MappedRead &b) { +same_read(const size_t suffix_len, const MappedRead &a, const MappedRead &b) { const string sa(a.r.get_name()); const string sb(b.r.get_name()); bool SAME_NAME = false; - if(sa == sb) + if (sa == sb) SAME_NAME = true; return (SAME_NAME && a.r.same_chrom(b.r)); } // return true if the genomic region is null static inline bool -GenomicRegionIsNull(const GenomicRegion &gr){ +GenomicRegionIsNull(const GenomicRegion &gr) { GenomicRegion null_gr; - if(gr == null_gr) + if (gr == null_gr) return true; return false; } - static void empty_pq(GenomicRegion &prev_gr, - priority_queue, - GenomicRegionOrderChecker> &read_pq, - const string &input_file_name, - vector &counts_hist, + priority_queue, + GenomicRegionOrderChecker> &read_pq, + const string &input_file_name, vector &counts_hist, size_t ¤t_count) { - GenomicRegion curr_gr = read_pq.top(); read_pq.pop(); // check if reads are sorted if (curr_gr.same_chrom(prev_gr) && - curr_gr.get_start() < prev_gr.get_start() - && curr_gr.get_end() < prev_gr.get_end()) { + curr_gr.get_start() < prev_gr.get_start() && + curr_gr.get_end() < prev_gr.get_end()) { std::ostringstream oss; oss << "reads unsorted in: " << input_file_name << "\n" << "prev = \t" << prev_gr << "\n" @@ -313,9 +292,8 @@ empty_pq(GenomicRegion &prev_gr, current_count = 1; else { std::ostringstream oss; - bool UPDATE_HIST = - update_pe_duplicate_counts_hist(curr_gr, prev_gr, - counts_hist, current_count); + bool UPDATE_HIST = update_pe_duplicate_counts_hist( + curr_gr, prev_gr, counts_hist, current_count); if (!UPDATE_HIST) { oss << "locations unsorted in: " << input_file_name << "\n" << "prev = \t" << prev_gr << "\n" @@ -326,21 +304,16 @@ empty_pq(GenomicRegion &prev_gr, prev_gr = curr_gr; } - size_t -load_counts_BAM_pe(const bool VERBOSE, - const string &input_file_name, +load_counts_BAM_pe(const bool VERBOSE, const string &input_file_name, const size_t MAX_SEGMENT_LENGTH, - const size_t MAX_READS_TO_HOLD, - size_t &n_paired, - size_t 
&n_mates, - vector &counts_hist) { - + const size_t MAX_READS_TO_HOLD, size_t &n_paired, + size_t &n_mates, vector &counts_hist) { const string mapper = "general"; SAMReader_deprecated sam_reader(input_file_name, mapper); // check sam_reader - if(!sam_reader) + if (!sam_reader) throw runtime_error("problem opening input file " + input_file_name); SAMRecord samr; @@ -357,34 +330,31 @@ load_counts_BAM_pe(const bool VERBOSE, GenomicRegion prev_gr; std::priority_queue, - GenomicRegionOrderChecker> read_pq; + GenomicRegionOrderChecker> + read_pq; unordered_map dangling_mates; while (sam_reader >> samr) { - // only convert mapped and primary reads if (samr.is_primary && samr.is_mapped) { ++n_mates; // deal with paired-end stuff if (samr.is_mapping_paired) { - const size_t name_len = samr.mr.r.get_name().size() - suffix_len; const string read_name(samr.mr.r.get_name().substr(0, name_len)); if (dangling_mates.find(read_name) != dangling_mates.end()) { // other end is in dangling mates, merge the two mates - if(same_read(suffix_len, samr.mr, - dangling_mates[read_name].mr)) { + if (same_read(suffix_len, samr.mr, dangling_mates[read_name].mr)) { if (samr.is_Trich) std::swap(samr, dangling_mates[read_name]); GenomicRegion merged; int len = 0; - const bool MERGE_SUCCESS = - merge_mates(suffix_len, MAX_SEGMENT_LENGTH, - dangling_mates[read_name].mr.r, samr.mr.r, - merged, len); + const bool MERGE_SUCCESS = merge_mates( + suffix_len, MAX_SEGMENT_LENGTH, dangling_mates[read_name].mr.r, + samr.mr.r, merged, len); // merge success! if (MERGE_SUCCESS && len >= 0 && len <= static_cast(MAX_SEGMENT_LENGTH)) { @@ -394,8 +364,8 @@ load_counts_BAM_pe(const bool VERBOSE, else { // informative error message! 
if (VERBOSE) { - cerr << "problem merging read " - << read_name << ", splitting read" << endl + cerr << "problem merging read " << read_name + << ", splitting read" << endl << samr.mr << "\t" << samr.is_mapping_paired << endl << dangling_mates[read_name].mr << "\t" << dangling_mates[read_name].is_mapping_paired << endl @@ -415,7 +385,7 @@ load_counts_BAM_pe(const bool VERBOSE, n_unpaired += 2; } } - else // didn't find read in dangling_mates, store for later + else // didn't find read in dangling_mates, store for later dangling_mates[read_name] = samr; } else { @@ -423,36 +393,37 @@ load_counts_BAM_pe(const bool VERBOSE, ++n_unpaired; } - // dangling mates is too large, flush dangling_mates of reads // on different chroms and too far away if (dangling_mates.size() > MAX_READS_TO_HOLD) { unordered_map tmp; for (unordered_map::iterator itr = - dangling_mates.begin(); itr != dangling_mates.end(); ++itr) { - if (itr->second.mr.r.get_chrom() != samr.mr.r.get_chrom() - || (itr->second.mr.r.get_chrom() == samr.mr.r.get_chrom() - && itr->second.mr.r.get_end() - + MAX_SEGMENT_LENGTH < samr.mr.r.get_start())) { - if(itr->second.seg_len >= 0) { + dangling_mates.begin(); + itr != dangling_mates.end(); ++itr) { + if (itr->second.mr.r.get_chrom() != samr.mr.r.get_chrom() || + (itr->second.mr.r.get_chrom() == samr.mr.r.get_chrom() && + itr->second.mr.r.get_end() + MAX_SEGMENT_LENGTH < + samr.mr.r.get_start())) { + if (itr->second.seg_len >= 0) { read_pq.push(itr->second.mr.r); ++n_unpaired; } } - else tmp[itr->first] = itr->second; + else + tmp[itr->first] = itr->second; } std::swap(tmp, dangling_mates); tmp.clear(); } - // now empty the priority queue if (!(read_pq.empty()) && is_ready_to_pop(read_pq, samr.mr.r, MAX_SEGMENT_LENGTH)) { - //begin emptying priority queue + // begin emptying priority queue while (!(read_pq.empty()) && is_ready_to_pop(read_pq, samr.mr.r, MAX_SEGMENT_LENGTH)) { - empty_pq(prev_gr, read_pq, input_file_name, counts_hist, current_count); + 
empty_pq(prev_gr, read_pq, input_file_name, counts_hist, + current_count); } } @@ -468,8 +439,8 @@ load_counts_BAM_pe(const bool VERBOSE, ++n_unpaired; } - //final iteration - while(!read_pq.empty()) + // final iteration + while (!read_pq.empty()) empty_pq(prev_gr, read_pq, input_file_name, counts_hist, current_count); if (counts_hist.size() < current_count + 1) @@ -490,12 +461,10 @@ load_counts_BAM_pe(const bool VERBOSE, #endif - /* this code is for BED file input */ size_t -load_counts_BED_se(const string input_file_name, - vector &counts_hist) { +load_counts_BED_se(const string input_file_name, vector &counts_hist) { // resize vals_hist counts_hist.clear(); counts_hist.resize(2, 0.0); @@ -518,40 +487,34 @@ load_counts_BED_se(const string input_file_name, } // to account for the last read compared to the one before it. - if(counts_hist.size() < current_count + 1) + if (counts_hist.size() < current_count + 1) counts_hist.resize(current_count + 1, 0.0); ++counts_hist[current_count]; return n_reads; } - size_t -load_counts_BED_pe(const string input_file_name, - vector &counts_hist) { - +load_counts_BED_pe(const string input_file_name, vector &counts_hist) { // resize vals_hist counts_hist.clear(); counts_hist.resize(2, 0.0); std::ifstream in(input_file_name.c_str()); if (!in) - throw runtime_error("problem opening file: " - + input_file_name); + throw runtime_error("problem opening file: " + input_file_name); GenomicRegion curr_gr, prev_gr; if (!(in >> prev_gr)) - throw runtime_error("problem opening file: " - + input_file_name); + throw runtime_error("problem opening file: " + input_file_name); size_t n_reads = 1; size_t current_count = 1; - //read in file and compare each gr with the one before it + // read in file and compare each gr with the one before it while (in >> curr_gr) { - const bool UPDATE_SUCCESS = - update_pe_duplicate_counts_hist(curr_gr, prev_gr, - counts_hist, current_count); + const bool UPDATE_SUCCESS = update_pe_duplicate_counts_hist( + curr_gr, 
prev_gr, counts_hist, current_count); if (!UPDATE_SUCCESS) throw runtime_error("reads unsorted in " + input_file_name); @@ -566,13 +529,11 @@ load_counts_BED_pe(const string input_file_name, ++counts_hist[current_count]; return n_reads; - } /* text file input */ size_t load_counts(const string &input_file_name, vector &counts_hist) { - std::ifstream in(input_file_name); if (!in) throw runtime_error("problem opening file: " + input_file_name); @@ -580,7 +541,6 @@ load_counts(const string &input_file_name, vector &counts_hist) { size_t n_counts = 0; string buffer; while (getline(in, buffer)) { - if (find(begin(buffer), end(buffer), '\r') != end(buffer)) throw runtime_error("carriage returns in values file " "(suggests dos or mac formatting)"); @@ -598,30 +558,27 @@ load_counts(const string &input_file_name, vector &counts_hist) { n_counts += count; } else if (val != 0) - throw runtime_error("problem reading file at line " - + toa(n_counts + 1)); + throw runtime_error("problem reading file at line " + + toa(n_counts + 1)); } in.peek(); } return n_counts; } - -//returns number of reads from file containing counts histogram +// returns number of reads from file containing counts histogram size_t load_histogram(const string &filename, vector &counts_hist) { - counts_hist.clear(); std::ifstream in(filename.c_str()); - if (!in) //if file doesn't open + if (!in) // if file doesn't open throw runtime_error("could not open histogram: " + filename); size_t n_reads = 0; size_t line_count = 0ul, prev_read_count = 0ul; string buffer; while (getline(in, buffer)) { - if (find(begin(buffer), end(buffer), '\r') != end(buffer)) throw runtime_error("carriage returns in histogram file " "(suggests dos or mac formatting)"); @@ -643,76 +600,64 @@ load_histogram(const string &filename, vector &counts_hist) { counts_hist[read_count] = frequency; if (read_count == 0ul) { throw runtime_error("counts histograms may not " - "include an entry for zero"); + "include an entry for zero"); } 
prev_read_count = read_count; - n_reads += static_cast(read_count*frequency); + n_reads += static_cast(read_count * frequency); } return n_reads; } - ///////////////////////////////////////////////////////// // Loading coverage counts //////////////////////////////////////////////////////// - // probabilistically split genomic regions into mutiple // genomic regions of width equal to bin_size static void -SplitGenomicRegion(const GenomicRegion &inputGR, - mt19937 &generator, const size_t bin_size, - vector &outputGRs){ - +SplitGenomicRegion(const GenomicRegion &inputGR, mt19937 &generator, + const size_t bin_size, vector &outputGRs) { outputGRs.clear(); GenomicRegion gr(inputGR); - double frac = static_cast(gr.get_start() % bin_size)/bin_size; + double frac = static_cast(gr.get_start() % bin_size) / bin_size; const size_t width = gr.get_width(); // ADS: this seems like a bunch of duplicated code just for a single // function difference - std::uniform_real_distribution dist(0.0,1.0); + std::uniform_real_distribution dist(0.0, 1.0); if (dist(generator) > frac) { - gr.set_start(std::floor(static_cast(gr.get_start())/ - bin_size)*bin_size); + gr.set_start(std::floor(static_cast(gr.get_start()) / bin_size) * + bin_size); gr.set_end(gr.get_start() + width); } else { - gr.set_start(std::ceil(static_cast(gr.get_start())/ - bin_size)*bin_size); + gr.set_start(std::ceil(static_cast(gr.get_start()) / bin_size) * + bin_size); gr.set_end(gr.get_start() + width); } - for(size_t i = 0; i < gr.get_width(); i += bin_size){ - + for (size_t i = 0; i < gr.get_width(); i += bin_size) { const size_t curr_start = gr.get_start() + i; - const size_t curr_end - = std::min(gr.get_end(), curr_start + bin_size); - frac = static_cast(curr_end - curr_start)/bin_size; + const size_t curr_end = std::min(gr.get_end(), curr_start + bin_size); + frac = static_cast(curr_end - curr_start) / bin_size; - if(dist(generator) <= frac){ - GenomicRegion binned_gr(gr.get_chrom(), curr_start, - curr_start + 
bin_size, - gr.get_name(), gr.get_score(), - gr.get_strand()); + if (dist(generator) <= frac) { + GenomicRegion binned_gr(gr.get_chrom(), curr_start, curr_start + bin_size, + gr.get_name(), gr.get_score(), gr.get_strand()); outputGRs.push_back(binned_gr); } } } - // split a mapped read into multiple genomic regions // based on the number of bases in each static void -SplitMappedRead(const bool VERBOSE, - const MappedRead &inputMR, - mt19937 &generator, - const size_t bin_size, - vector &outputGRs){ - +SplitMappedRead(const bool VERBOSE, const MappedRead &inputMR, + mt19937 &generator, const size_t bin_size, + vector &outputGRs) { outputGRs.clear(); size_t covered_bases = 0; @@ -727,14 +672,14 @@ SplitMappedRead(const bool VERBOSE, // if we reach the end of a bin, probabilistically create a binned read // with probability proportional to the number of covered bases if (read_iterator % bin_size == bin_size - 1) { - const double frac = static_cast(covered_bases)/bin_size; - std::uniform_real_distribution dist(0.0,1.0); + const double frac = static_cast(covered_bases) / bin_size; + std::uniform_real_distribution dist(0.0, 1.0); if (dist(generator) <= frac) { const size_t curr_start = read_iterator - (read_iterator % bin_size); const size_t curr_end = curr_start + bin_size; - const GenomicRegion binned_gr(inputMR.r.get_chrom(), curr_start, curr_end, - inputMR.r.get_name(), inputMR.r.get_score(), - inputMR.r.get_strand()); + const GenomicRegion binned_gr( + inputMR.r.get_chrom(), curr_start, curr_end, inputMR.r.get_name(), + inputMR.r.get_score(), inputMR.r.get_strand()); outputGRs.push_back(binned_gr); } total_covered_bases += covered_bases; @@ -744,8 +689,8 @@ SplitMappedRead(const bool VERBOSE, read_iterator++; } - const double frac = static_cast(covered_bases)/bin_size; - std::uniform_real_distribution dist(0.0,1.0); + const double frac = static_cast(covered_bases) / bin_size; + std::uniform_real_distribution dist(0.0, 1.0); if (dist(generator) <= frac) { const 
size_t curr_start = read_iterator - (read_iterator % bin_size); const size_t curr_end = curr_start + bin_size; @@ -756,17 +701,12 @@ SplitMappedRead(const bool VERBOSE, } } - size_t -load_coverage_counts_MR(const bool VERBOSE, - const string input_file_name, - const unsigned long int seed, - const size_t bin_size, - const size_t max_width, - vector &coverage_hist) { - +load_coverage_counts_MR(const bool VERBOSE, const string input_file_name, + const uint64_t seed, const size_t bin_size, + const size_t max_width, vector &coverage_hist) { srand(time(0) + getpid()); - //Runif runif(rand()); + // Runif runif(rand()); std::mt19937 generator(seed); std::ifstream in(input_file_name.c_str()); @@ -786,10 +726,8 @@ load_coverage_counts_MR(const bool VERBOSE, size_t current_count = 1; do { - if (mr.r.get_width() > max_width) - throw runtime_error("Encountered read of width " + - toa(mr.r.get_width()) + + throw runtime_error("Encountered read of width " + toa(mr.r.get_width()) + "max_width set too small"); vector splitGRs; @@ -805,30 +743,24 @@ load_coverage_counts_MR(const bool VERBOSE, // remove Genomic Regions from the priority queue if (splitGRs.size() > 0) while (!PQ.empty() && is_ready_to_pop(PQ, splitGRs.back(), max_width)) - empty_pq(curr_gr, prev_gr, current_count, - coverage_hist, PQ, input_file_name); - - } - while (in >> mr); + empty_pq(curr_gr, prev_gr, current_count, coverage_hist, PQ, + input_file_name); + } while (in >> mr); // done adding reads, now spit the rest out while (!PQ.empty()) - empty_pq(curr_gr, prev_gr, current_count, - coverage_hist, PQ, input_file_name); + empty_pq(curr_gr, prev_gr, current_count, coverage_hist, PQ, + input_file_name); return n_reads; } - size_t -load_coverage_counts_GR(const string input_file_name, - const unsigned long int seed, - const size_t bin_size, - const size_t max_width, +load_coverage_counts_GR(const string input_file_name, const uint64_t seed, + const size_t bin_size, const size_t max_width, vector &coverage_hist) { - 
srand(time(0) + getpid()); - //Runif runif(rand()); + // Runif runif(rand()); std::mt19937 generator(seed); std::ifstream in(input_file_name.c_str()); @@ -848,28 +780,26 @@ load_coverage_counts_GR(const string input_file_name, size_t current_count = 1; do { - vector splitGRs; SplitGenomicRegion(inputGR, generator, bin_size, splitGRs); // add split Genomic Regions to the priority queue - for(size_t i = 0; i < splitGRs.size(); i++) + for (size_t i = 0; i < splitGRs.size(); i++) PQ.push(splitGRs[i]); if (splitGRs.size() > 0) { // remove Genomic Regions from the priority queue while (!PQ.empty() && is_ready_to_pop(PQ, splitGRs.back(), max_width)) - empty_pq(curr_gr, prev_gr, current_count, - coverage_hist, PQ, input_file_name); + empty_pq(curr_gr, prev_gr, current_count, coverage_hist, PQ, + input_file_name); } n_reads++; - } - while (in >> inputGR); + } while (in >> inputGR); // done adding reads, now spit the rest out while (!PQ.empty()) - empty_pq(curr_gr, prev_gr, current_count, - coverage_hist, PQ, input_file_name); + empty_pq(curr_gr, prev_gr, current_count, coverage_hist, PQ, + input_file_name); return n_reads; } diff --git a/src/load_data_for_complexity.hpp b/src/load_data_for_complexity.hpp index c6e9616..1dc70fb 100644 --- a/src/load_data_for_complexity.hpp +++ b/src/load_data_for_complexity.hpp @@ -17,58 +17,51 @@ * along with this program. If not, see . 
*/ -#ifndef LOAD_DATA_FOR_COMPLEXITY_HPP -#define LOAD_DATA_FOR_COMPLEXITY_HPP +#ifndef SRC_LOAD_DATA_FOR_COMPLEXITY_HPP_ +#define SRC_LOAD_DATA_FOR_COMPLEXITY_HPP_ +#include +#include #include #include -#include -size_t -load_coverage_counts_MR(const bool VERBOSE, - const std::string input_file_name, - const unsigned long int seed, - const size_t bin_size, - const size_t max_width, +std::size_t +load_coverage_counts_MR(const bool VERBOSE, const std::string input_file_name, + const std::uint64_t seed, const std::size_t bin_size, + const std::size_t max_width, std::vector &coverage_hist); - -size_t +std::size_t load_coverage_counts_GR(const std::string input_file_name, - const unsigned long int seed, - const size_t bin_size, - const size_t max_width, + const std::uint64_t seed, const std::size_t bin_size, + const std::size_t max_width, std::vector &coverage_hist); - -size_t +std::size_t load_histogram(const std::string &filename, std::vector &counts_hist); -size_t -load_counts(const std::string &input_file_name, std::vector &counts_hist); +std::size_t +load_counts(const std::string &input_file_name, + std::vector &counts_hist); -size_t +std::size_t load_counts_BED_pe(const std::string input_file_name, std::vector &counts_hist); -size_t +std::size_t load_counts_BED_se(const std::string input_file_name, std::vector &counts_hist); #ifdef HAVE_HTSLIB -size_t -load_counts_BAM_pe(const bool VERBOSE, - const std::string &input_file_name, - const size_t MAX_SEGMENT_LENGTH, - const size_t MAX_READS_TO_HOLD, - size_t &n_paired, - size_t &n_mates, - std::vector &counts_hist); +std::size_t +load_counts_BAM_pe(const bool VERBOSE, const std::string &input_file_name, + const std::size_t MAX_SEGMENT_LENGTH, + const std::size_t MAX_READS_TO_HOLD, std::size_t &n_paired, + std::size_t &n_mates, std::vector &counts_hist); -size_t +std::size_t load_counts_BAM_se(const std::string &input_file_name, std::vector &counts_hist); -#endif // HAVE_HTSLIB - +#endif // HAVE_HTSLIB -#endif +#endif 
// SRC_LOAD_DATA_FOR_COMPLEXITY_HPP_ diff --git a/src/moment_sequence.cpp b/src/moment_sequence.cpp index 9f8934e..f10e67c 100644 --- a/src/moment_sequence.cpp +++ b/src/moment_sequence.cpp @@ -20,85 +20,86 @@ #include "moment_sequence.hpp" -#include +#include +#include #include -#include -#include +#include #include #include -#include -#include +#include +#include +#include // std::swap +#include -using std::string; -using std::vector; +using std::cerr; using std::endl; +using std::find_if; +using std::isfinite; +using std::isinf; using std::max; -using std::cerr; using std::setprecision; +using std::string; using std::swap; -using std::find_if; using std::transform; -using std::isfinite; -using std::isinf; +using std::vector; -void -LU_decomp(vector > &A, vector &P) { - - const size_t N = A.size(); - double absA; - size_t i, j, k; - - P.clear(); - for (size_t x = 0; x <= N; x++) - P.push_back(x); - - for (i = 0; i < N; i++) { - double maxA = 0.0; - size_t imax = i; - - for (k = i; k < N; k++) - if ((absA = fabs(A[k][i])) > maxA) { - maxA = absA; - imax = k; - } - - if (imax != i) { - //pivoting P - size_t j = P[i]; - P[i] = P[imax]; - P[imax] = j; - - //pivoting rows of A - vector ptr(A[i]); - A[i] = A[imax]; - A[imax] = ptr; - - //counting pivots starting from N (for determinant) - P[N]++; - } - - for (j = i + 1; j < N; j++) { - A[j][i] /= A[i][i]; - - for (k = i + 1; k < N; k++) - A[j][k] -= A[j][i] * A[i][k]; - } +void +LU_decomp(vector> &A, vector &P) { + const size_t N = A.size(); + double absA; + size_t i, j, k; + + P.clear(); + for (size_t x = 0; x <= N; x++) + P.push_back(x); + + for (i = 0; i < N; i++) { + double maxA = 0.0; + size_t imax = i; + + for (k = i; k < N; k++) + if ((absA = fabs(A[k][i])) > maxA) { + maxA = absA; + imax = k; + } + + if (imax != i) { + // pivoting P + size_t j = P[i]; + P[i] = P[imax]; + P[imax] = j; + + // pivoting rows of A + vector ptr(A[i]); + A[i] = A[imax]; + A[imax] = ptr; + + // counting pivots starting from N (for 
determinant) + P[N]++; } -} -double -LU_determinant(vector > &A, vector &P) { + for (j = i + 1; j < N; j++) { + A[j][i] /= A[i][i]; - const size_t N = A.size(); + for (k = i + 1; k < N; k++) + A[j][k] -= A[j][i] * A[i][k]; + } + } +} - double det = A[0][0]; +double +LU_determinant(vector> &A, vector &P) { + const size_t N = A.size(); - for (size_t i = 1; i < N; ++i) - det *= A[i][i]; + double det = A[0][0]; - if ((P[N] - N) % 2 == 0) return det; + for (size_t i = 1; i < N; ++i) + det *= A[i][i]; - return -det; + if ((P[N] - N) % 2 == 0) + return det; + + return -det; } ///////////////////////////////////////////////////// @@ -106,45 +107,43 @@ LU_determinant(vector > &A, vector &P) { // ensure moment sequence is positive definite // truncate moment sequence to ensure pos def size_t -ensure_pos_def_mom_seq(vector &moments, - const double tolerance, +ensure_pos_def_mom_seq(vector &moments, const double tolerance, const bool VERBOSE) { - const size_t min_hankel_dim = 1; size_t hankel_dim = 2; - if (moments.size() < 2*hankel_dim) { + if (moments.size() < 2 * hankel_dim) { if (VERBOSE) cerr << "too few moments" << endl; return min_hankel_dim; } bool ACCEPT_HANKEL = true; - while (ACCEPT_HANKEL && (2*hankel_dim - 1 < moments.size())) { - - vector > hankel_mat(hankel_dim, vector(hankel_dim, 0.0)); + while (ACCEPT_HANKEL && (2 * hankel_dim - 1 < moments.size())) { + vector> hankel_mat(hankel_dim, + vector(hankel_dim, 0.0)); for (size_t c_idx = 0; c_idx < hankel_dim; c_idx++) for (size_t r_idx = 0; r_idx < hankel_dim; r_idx++) - hankel_mat[c_idx][r_idx] = moments[c_idx + r_idx]; + hankel_mat[c_idx][r_idx] = moments[c_idx + r_idx]; vector perm; LU_decomp(hankel_mat, perm); const double hankel_mat_det = LU_determinant(hankel_mat, perm); - vector > shift_hankel_matrix(hankel_dim, vector(hankel_dim, 0.0)); + vector> shift_hankel_matrix(hankel_dim, + vector(hankel_dim, 0.0)); for (size_t c_idx = 0; c_idx < hankel_dim; c_idx++) for (size_t r_idx = 0; r_idx < hankel_dim; r_idx++) 
- shift_hankel_matrix[c_idx][r_idx] = moments[c_idx + r_idx + 1]; + shift_hankel_matrix[c_idx][r_idx] = moments[c_idx + r_idx + 1]; vector s_perm; LU_decomp(shift_hankel_matrix, s_perm); - const double shift_hankel_mat_det = LU_determinant(shift_hankel_matrix, s_perm); + const double shift_hankel_mat_det = + LU_determinant(shift_hankel_matrix, s_perm); if (VERBOSE) { - cerr << "dim" << '\t' - << "hankel_det" << '\t' - << "shifted_hankel_det" << endl; - cerr << hankel_dim << '\t' - << hankel_mat_det << '\t' + cerr << "dim" << '\t' << "hankel_det" << '\t' << "shifted_hankel_det" + << endl; + cerr << hankel_dim << '\t' << hankel_mat_det << '\t' << shift_hankel_mat_det << endl; } @@ -155,7 +154,7 @@ ensure_pos_def_mom_seq(vector &moments, else { ACCEPT_HANKEL = false; hankel_dim--; - moments.resize(2*hankel_dim); + moments.resize(2 * hankel_dim); return hankel_dim; } } @@ -170,7 +169,6 @@ ensure_pos_def_mom_seq(vector &moments, // truncate if non-positive element found static void check_three_term_relation(vector &a, vector &b) { - // abort if first entry is zero or smaller if (a[0] <= 0.0) { a.clear(); @@ -179,8 +177,8 @@ check_three_term_relation(vector &a, vector &b) { for (size_t i = 0; i < b.size(); i++) // ADS: some strange logic here - if (b[i] <= 0.0 || !isfinite(b[i]) || - a[i + 1] <= 0.0 || !isfinite(a[i + 1])) { + if (b[i] <= 0.0 || !isfinite(b[i]) || a[i + 1] <= 0.0 || + !isfinite(a[i + 1])) { b.resize(i); a.resize(i + 1); break; @@ -191,7 +189,6 @@ check_three_term_relation(vector &a, vector &b) { // truncate at first non-positive element if found static void check_moment_sequence(vector &obs_moms) { - if (obs_moms[0] <= 0.0 || !isfinite(obs_moms[0])) obs_moms.clear(); @@ -203,30 +200,29 @@ check_moment_sequence(vector &obs_moms) { } } - void MomentSequence::unmodified_Chebyshev(const bool VERBOSE) { - - const size_t n_points = static_cast(floor(moments.size()/2)); + const size_t n_points = static_cast(floor(moments.size() / 2)); vector a(n_points, 0.0); 
vector b(n_points - 1, 0.0); - vector< vector > sigma(2*n_points, vector(2*n_points, 0.0)); + vector> sigma(2 * n_points, vector(2 * n_points, 0.0)); // initialization - a[0] = moments[1]/moments[0]; + a[0] = moments[1] / moments[0]; // sigma[-1][l] = 0 - for (size_t l = 0; l < 2*n_points; l++) + for (size_t l = 0; l < 2 * n_points; l++) sigma[0][l] = moments[l]; for (size_t k = 1; k <= n_points; k++) { - for (size_t l = k; l < 2*n_points - k; l++) { - sigma[k][l] = sigma[k-1][l+1] - a[k-1]*sigma[k-1][l]; + for (size_t l = k; l < 2 * n_points - k; l++) { + sigma[k][l] = sigma[k - 1][l + 1] - a[k - 1] * sigma[k - 1][l]; if (k > 1) - sigma[k][l] -= b[k-2]*sigma[k-2][l]; + sigma[k][l] -= b[k - 2] * sigma[k - 2][l]; } if (k != n_points) { - a[k] = sigma[k][k+1]/sigma[k][k] - sigma[k-1][k]/sigma[k-1][k-1]; - b[k-1] = sigma[k][k]/sigma[k-1][k-1]; + a[k] = + sigma[k][k + 1] / sigma[k][k] - sigma[k - 1][k] / sigma[k - 1][k - 1]; + b[k - 1] = sigma[k][k] / sigma[k - 1][k - 1]; } } @@ -239,27 +235,27 @@ void MomentSequence::full_3term_recurrence(const bool VERBOSE, vector &full_alpha, vector &full_beta) { - - const size_t n_points = static_cast(floor(moments.size()/2)); + const size_t n_points = static_cast(floor(moments.size() / 2)); vector a(n_points, 0.0); vector b(n_points - 1, 0.0); - vector< vector > sigma(2*n_points, vector(2*n_points, 0.0)); + vector> sigma(2 * n_points, vector(2 * n_points, 0.0)); // initialization - a[0] = moments[1]/moments[0]; + a[0] = moments[1] / moments[0]; // sigma[-1][l] = 0 - for (size_t l = 0; l < 2*n_points; l++) + for (size_t l = 0; l < 2 * n_points; l++) sigma[0][l] = moments[l]; for (size_t k = 1; k <= n_points; k++) { - for (size_t l = k; l < 2*n_points - k; l++) { - sigma[k][l] = sigma[k-1][l+1] - a[k-1]*sigma[k-1][l]; + for (size_t l = k; l < 2 * n_points - k; l++) { + sigma[k][l] = sigma[k - 1][l + 1] - a[k - 1] * sigma[k - 1][l]; if (k > 1) - sigma[k][l] -= b[k-2]*sigma[k-2][l]; + sigma[k][l] -= b[k - 2] * sigma[k - 2][l]; } if (k 
!= n_points) { - a[k] = sigma[k][k+1]/sigma[k][k] - sigma[k-1][k]/sigma[k-1][k-1]; - b[k-1] = sigma[k][k]/sigma[k-1][k-1]; + a[k] = + sigma[k][k + 1] / sigma[k][k] - sigma[k - 1][k] / sigma[k - 1][k - 1]; + b[k - 1] = sigma[k][k] / sigma[k - 1][k - 1]; } } @@ -267,7 +263,6 @@ MomentSequence::full_3term_recurrence(const bool VERBOSE, full_beta.swap(b); } - //////////////////////////////////////////////////// // Constructor @@ -282,7 +277,6 @@ MomentSequence::MomentSequence(const vector &obs_moms) : unmodified_Chebyshev(false); } - ///////////////////////////////////////////////////// // Quadrature Methods @@ -293,7 +287,6 @@ MomentSequence::MomentSequence(const vector &obs_moms) : static void QRiteration(vector &alpha, vector &beta, vector &weights) { - // initialize variables vector sin_theta(alpha.size(), 0.0); vector cos_theta(alpha.size(), 0.0); @@ -316,39 +309,38 @@ QRiteration(vector &alpha, vector &beta, z_bar[0] = z[0]; for (size_t j = 0; j < alpha.size() - 1; j++) { - // for d and b_bar, j here is j-1 in G&W if (d[j] == 0.0 && b_bar[j] == 0.0) { sin_theta[j] = 0.0; cos_theta[j] = 1.0; } else { - sin_theta[j] = d[j]/sqrt(d[j]*d[j] + b_bar[j]*b_bar[j]); - cos_theta[j] = b_bar[j]/sqrt(d[j]*d[j] + b_bar[j]*b_bar[j]); + sin_theta[j] = d[j] / sqrt(d[j] * d[j] + b_bar[j] * b_bar[j]); + cos_theta[j] = b_bar[j] / sqrt(d[j] * d[j] + b_bar[j] * b_bar[j]); } - a[j] = (a_bar[j]*cos_theta[j]*cos_theta[j] + - 2*b_tilde[j]*cos_theta[j]*sin_theta[j] + - alpha[j+1]*sin_theta[j]*sin_theta[j]); + a[j] = (a_bar[j] * cos_theta[j] * cos_theta[j] + + 2 * b_tilde[j] * cos_theta[j] * sin_theta[j] + + alpha[j + 1] * sin_theta[j] * sin_theta[j]); - a_bar[j+1] = (a_bar[j]*sin_theta[j]*sin_theta[j] - - 2*b_tilde[j]*cos_theta[j]*sin_theta[j] + - alpha[j+1]*cos_theta[j]*cos_theta[j]); + a_bar[j + 1] = (a_bar[j] * sin_theta[j] * sin_theta[j] - + 2 * b_tilde[j] * cos_theta[j] * sin_theta[j] + + alpha[j + 1] * cos_theta[j] * cos_theta[j]); if (j != 0) - b[j-1] = sqrt(d[j]*d[j] + 
b_bar[j]*b_bar[j]); + b[j - 1] = sqrt(d[j] * d[j] + b_bar[j] * b_bar[j]); - b_bar[j+1] = - ((a_bar[j] - alpha[j+1])*sin_theta[j]*cos_theta[j] + - b_tilde[j]*(sin_theta[j]*sin_theta[j] - cos_theta[j]*cos_theta[j])); + b_bar[j + 1] = ((a_bar[j] - alpha[j + 1]) * sin_theta[j] * cos_theta[j] + + b_tilde[j] * (sin_theta[j] * sin_theta[j] - + cos_theta[j] * cos_theta[j])); - b_tilde[j+1] = -beta[j+1]*cos_theta[j]; + b_tilde[j + 1] = -beta[j + 1] * cos_theta[j]; - d[j+1] = beta[j+1]*sin_theta[j]; + d[j + 1] = beta[j + 1] * sin_theta[j]; - z[j] = z_bar[j]*cos_theta[j] + weights[j+1]*sin_theta[j]; + z[j] = z_bar[j] * cos_theta[j] + weights[j + 1] * sin_theta[j]; - z_bar[j+1] = z_bar[j]*sin_theta[j] - weights[j+1]*cos_theta[j]; + z_bar[j + 1] = z_bar[j] * sin_theta[j] - weights[j + 1] * cos_theta[j]; } // last entries set equal to final "holding" values @@ -361,22 +353,18 @@ QRiteration(vector &alpha, vector &beta, swap(weights, z); } - static bool check_positivity(const vector &v) { return find_if(begin(v), end(v), - [](const double x) {return x <= 0.0 || isinf(x);}) == end(v); + [](const double x) { return x <= 0.0 || isinf(x); }) == end(v); } - bool MomentSequence::Lower_quadrature_rules(const bool VERBOSE, - const size_t n_points, - const double tol, + const size_t n_points, const double tol, const size_t max_iter, vector &points, vector &weights) { - // make sure that points.size() will be less than n_points vector a(alpha); a.resize((n_points < alpha.size()) ? n_points : alpha.size()); @@ -420,7 +408,7 @@ MomentSequence::Lower_quadrature_rules(const bool VERBOSE, // square entries in the weights vector transform(begin(weights), end(weights), begin(weights), - [](const double x) {return x*x;}); + [](const double x) { return x * x; }); return points_are_positive; } diff --git a/src/moment_sequence.hpp b/src/moment_sequence.hpp index 0f7ccbb..1898d74 100644 --- a/src/moment_sequence.hpp +++ b/src/moment_sequence.hpp @@ -17,47 +17,40 @@ * along with this program. 
If not, see . */ -#ifndef MOMENT_SEQUENCE_HPP -#define MOMENT_SEQUENCE_HPP +#ifndef SRC_MOMENT_SEQUENCE_HPP_ +#define SRC_MOMENT_SEQUENCE_HPP_ -#include -#include #include +#include +#include // test Hankel moment matrix to ensure the moment sequence // is positive definite -size_t ensure_pos_def_mom_seq(std::vector &moments, - const double tolerance, - const bool VERBOSE); +std::size_t +ensure_pos_def_mom_seq(std::vector &moments, const double tolerance, + const bool VERBOSE); struct MomentSequence { - - // Constructors MomentSequence() {} - MomentSequence(const std::vector &obs_moms); + explicit MomentSequence(const std::vector &obs_moms); - MomentSequence(const std::vector &a, - const std::vector &b): + MomentSequence(const std::vector &a, const std::vector &b) : alpha(a), beta(b) {}; - - // Estimate 3-term recurrence // these will be removed from the header when they are tested void unmodified_Chebyshev(const bool VERBOSE); void full_3term_recurrence(const bool VERBOSE, - std::vector &full_alpha, - std::vector &full_beta); - - // quadrature rules using QR on Jacobi matrix - bool Lower_quadrature_rules(const bool VERBOSE, - const size_t n_points, - const double tolerance, - const size_t max_iter, - std::vector &points, - std::vector &weights); + std::vector &full_alpha, + std::vector &full_beta); + // quadrature rules using QR on Jacobi matrix + bool Lower_quadrature_rules(const bool VERBOSE, const std::size_t n_points, + const double tolerance, + const std::size_t max_iter, + std::vector &points, + std::vector &weights); std::vector moments; // 3-term recurrence @@ -65,5 +58,4 @@ struct MomentSequence { std::vector beta; }; - -#endif +#endif // SRC_MOMENT_SEQUENCE_HPP_ diff --git a/src/preseq.cpp b/src/preseq.cpp index 349a3fa..4611a50 100644 --- a/src/preseq.cpp +++ b/src/preseq.cpp @@ -20,86 +20,92 @@ * along with this program. If not, see . 
*/ -#include -#include -#include +#include + +#include "continued_fraction.hpp" +#include "load_data_for_complexity.hpp" +#include "moment_sequence.hpp" + +#include +#include +#include +#include + #include #include -#include -#include #include +#include +#include +#include #include +#include #include +#include #include -#include - -#include -#include -#include -#include - -#include "continued_fraction.hpp" -#include "load_data_for_complexity.hpp" -#include "moment_sequence.hpp" +#include +#include -using std::string; -using std::min; -using std::vector; -using std::endl; using std::cerr; +using std::endl; using std::isfinite; - -using std::setprecision; -using std::unordered_map; +using std::max; +using std::min; +using std::mt19937; using std::runtime_error; +using std::setprecision; +using std::string; using std::to_string; -using std::mt19937; -using std::max; - -static const string preseq_version = "3.2.0"; +using std::uint64_t; +using std::unordered_map; +using std::vector; -template T +template +T get_counts_from_hist(const vector &h) { T c = 0.0; for (size_t i = 0; i < h.size(); ++i) - c += i*h[i]; + c += i * h[i]; return c; } -template T -median_from_sorted_vector (const vector sorted_data, - const size_t stride, const size_t n) { - - if (n == 0 || sorted_data.empty()) return 0.0; +template +T +median_from_sorted_vector(const vector sorted_data, const size_t stride, + const size_t n) { + if (n == 0 || sorted_data.empty()) + return 0.0; const size_t lhs = (n - 1) / 2; const size_t rhs = n / 2; - if (lhs == rhs) return sorted_data[lhs * stride]; + if (lhs == rhs) + return sorted_data[lhs * stride]; return (sorted_data[lhs * stride] + sorted_data[rhs * stride]) / 2.0; } -template T -quantile_from_sorted_vector (const vector sorted_data, - const size_t stride, const size_t n, - const double f) { +template +T +quantile_from_sorted_vector(const vector sorted_data, const size_t stride, + const size_t n, const double f) { const double index = f * (n - 1); - const 
size_t lhs = (int)index; + const size_t lhs = static_cast(index); const double delta = index - lhs; - if (n == 0 || sorted_data.empty()) return 0.0; + if (n == 0 || sorted_data.empty()) + return 0.0; - if (lhs == n - 1) return sorted_data[lhs * stride]; + if (lhs == n - 1) + return sorted_data[lhs * stride]; - return (1 - delta) * sorted_data[lhs * stride] - + delta * sorted_data[(lhs + 1) * stride]; + return (1 - delta) * sorted_data[lhs * stride] + + delta * sorted_data[(lhs + 1) * stride]; } // Confidence interval stuff static void -median_and_ci(vector estimates, // by val so we can sort them +median_and_ci(vector estimates, // by val so we can sort them const double ci_level, double &median_estimate, double &lower_ci_estimate, double &upper_ci_estimate) { assert(!estimates.empty()); @@ -110,17 +116,16 @@ median_and_ci(vector estimates, // by val so we can sort them const size_t N = estimates.size(); median_estimate = median_from_sorted_vector(estimates, 1, N); - lower_ci_estimate = quantile_from_sorted_vector(estimates, 1, N, alpha/2); - upper_ci_estimate = quantile_from_sorted_vector(estimates, 1, N, 1.0 - alpha/2); + lower_ci_estimate = quantile_from_sorted_vector(estimates, 1, N, alpha / 2); + upper_ci_estimate = + quantile_from_sorted_vector(estimates, 1, N, 1.0 - alpha / 2); } static void -vector_median_and_ci(const vector > &bootstrap_estimates, - const double ci_level, - vector &yield_estimates, +vector_median_and_ci(const vector> &bootstrap_estimates, + const double ci_level, vector &yield_estimates, vector &lower_ci_lognorm, vector &upper_ci_lognorm) { - yield_estimates.clear(); lower_ci_lognorm.clear(); upper_ci_lognorm.clear(); @@ -129,14 +134,13 @@ vector_median_and_ci(const vector > &bootstrap_estimates, const size_t n_est = bootstrap_estimates.size(); vector estimates_row(n_est, 0.0); for (size_t i = 0; i < bootstrap_estimates[0].size(); i++) { - // estimates is in wrong order, work locally on const val for (size_t k = 0; k < n_est; ++k) 
estimates_row[k] = bootstrap_estimates[k][i]; double median_estimate, lower_ci_estimate, upper_ci_estimate; - median_and_ci(estimates_row, ci_level, median_estimate, - lower_ci_estimate, upper_ci_estimate); + median_and_ci(estimates_row, ci_level, median_estimate, lower_ci_estimate, + upper_ci_estimate); sort(begin(estimates_row), end(estimates_row)); yield_estimates.push_back(median_estimate); @@ -147,10 +151,9 @@ vector_median_and_ci(const vector > &bootstrap_estimates, template void -multinomial(mt19937 &gen, const vector &mult_probs, - uint_type trials, vector &result) { - - typedef std::binomial_distribution binom_dist; +multinomial(mt19937 &gen, const vector &mult_probs, uint_type trials, + vector &result) { + typedef std::binomial_distribution binom_dist; result.clear(); result.resize(mult_probs.size()); @@ -160,12 +163,11 @@ multinomial(mt19937 &gen, const vector &mult_probs, auto r(begin(result)); auto p(begin(mult_probs)); - while (p != end(mult_probs)) { // iterate to sample for each category + while (p != end(mult_probs)) { // iterate to sample for each category + *r = binom_dist(trials, (*p) / remaining_prob)(gen); // take the sample - *r = binom_dist(trials, (*p)/remaining_prob)(gen); // take the sample - - remaining_prob -= *p++; // update remaining probability mass - trials -= *r++; // update remaining trials needed + remaining_prob -= *p++; // update remaining probability mass + trials -= *r++; // update remaining trials needed } if (trials > 0) @@ -175,8 +177,7 @@ multinomial(mt19937 &gen, const vector &mult_probs, // Lanczos approximation for gamma function for x >= 0.5 - essentially an // approximation for (x-1)! 
static double -factorial (double x) { - +factorial(double x) { // constants double LogRootTwoPi = 0.9189385332046727; double Euler = 2.71828182845904523536028747135; @@ -184,21 +185,19 @@ factorial (double x) { // Approximation for factorial is actually x-1 x -= 1.0; - vector lanczos { - 0.99999999999980993227684700473478, - 676.520368121885098567009190444019, - -1259.13921672240287047156078755283, - 771.3234287776530788486528258894, - -176.61502916214059906584551354, - 12.507343278686904814458936853, - -0.13857109526572011689554707, - 9.984369578019570859563e-6, - 1.50563273514931155834e-7 - }; + vector lanczos{0.99999999999980993227684700473478, + 676.520368121885098567009190444019, + -1259.13921672240287047156078755283, + 771.3234287776530788486528258894, + -176.61502916214059906584551354, + 12.507343278686904814458936853, + -0.13857109526572011689554707, + 9.984369578019570859563e-6, + 1.50563273514931155834e-7}; double Ag = lanczos[0]; - for (size_t k=1; k < lanczos.size(); k++) + for (size_t k = 1; k < lanczos.size(); k++) Ag += lanczos[k] / (x + k); double term1 = (x + 0.5) * log((x + 7.5) / Euler); @@ -219,15 +218,13 @@ void resample_hist(mt19937 &gen, const vector &vals_hist_distinct_counts, const vector &distinct_counts_hist, vector &out_hist) { - const size_t hist_size = distinct_counts_hist.size(); - vector sample_distinct_counts_hist(hist_size, 0); + vector sample_distinct_counts_hist(hist_size, 0); - unsigned int distinct = + const uint32_t distinct = accumulate(begin(distinct_counts_hist), end(distinct_counts_hist), 0.0); - multinomial(gen, distinct_counts_hist, distinct, - sample_distinct_counts_hist); + multinomial(gen, distinct_counts_hist, distinct, sample_distinct_counts_hist); out_hist.clear(); out_hist.resize(vals_hist_distinct_counts.back() + 1, 0.0); @@ -241,10 +238,10 @@ resample_hist(mt19937 &gen, const vector &vals_hist_distinct_counts, // N total sample size; S the total number of distincts // n sub sample size static double 
-interpolate_distinct(const vector &hist, const size_t N, - const size_t S, const size_t n) { - - const double denom = factorial(N + 1) - factorial(n + 1) - factorial(N - n + 1); +interpolate_distinct(const vector &hist, const size_t N, const size_t S, + const size_t n) { + const double denom = + factorial(N + 1) - factorial(n + 1) - factorial(N - n + 1); vector numer(hist.size(), 0); for (size_t i = 1; i < hist.size(); i++) { @@ -253,46 +250,39 @@ interpolate_distinct(const vector &hist, const size_t N, numer[i] = 0; } else { - - const double x = (factorial(N - i + 1) - factorial(n + 1) - - factorial(N - i - n + 1)); - numer[i] = exp(x - denom)*hist[i]; + const double x = + (factorial(N - i + 1) - factorial(n + 1) - factorial(N - i - n + 1)); + numer[i] = exp(x - denom) * hist[i]; } } return S - accumulate(begin(numer), end(numer), 0); } - static void extrapolate_curve(const ContinuedFraction &the_cf, - const double initial_distinct, - const double vals_sum, - const double initial_sample_size, - const double step_size, - const double max_sample_size, - vector &estimates) { - + const double initial_distinct, const double vals_sum, + const double initial_sample_size, const double step_size, + const double max_sample_size, vector &estimates) { double curr_samp_sz = initial_sample_size; while (curr_samp_sz < max_sample_size) { - const double fold = (curr_samp_sz - vals_sum)/vals_sum; + const double fold = (curr_samp_sz - vals_sum) / vals_sum; assert(fold >= 0.0); - estimates.push_back(initial_distinct + fold*the_cf(fold)); + estimates.push_back(initial_distinct + fold * the_cf(fold)); curr_samp_sz += step_size; } } void extrap_bootstrap(const bool VERBOSE, const bool allow_defects, - const unsigned long int seed, - const vector &orig_hist, + const uint64_t seed, const vector &orig_hist, const size_t n_bootstraps, const size_t orig_max_terms, const int diagonal, const double bin_step_size, const double max_extrap, const size_t max_iter, - vector > &bootstrap_estimates) 
{ + vector> &bootstrap_estimates) { // clear returning vectors bootstrap_estimates.clear(); - //setup rng + // setup rng srand(time(0) + getpid()); mt19937 rng(seed); @@ -308,10 +298,10 @@ extrap_bootstrap(const bool VERBOSE, const bool allow_defects, distinct_orig_hist.push_back(orig_hist[i]); } - for (size_t iter = 0; (iter < max_iter && bootstrap_estimates.size() < n_bootstraps); ++iter) { - + for (size_t iter = 0; + (iter < max_iter && bootstrap_estimates.size() < n_bootstraps); ++iter) { if (VERBOSE && iter > 0 && iter % 72 == 0) - cerr << endl; // bootstrap success progress only 72 char wide + cerr << endl; // bootstrap success progress only 72 char wide vector yield_vector; vector hist; @@ -327,8 +317,8 @@ extrap_bootstrap(const bool VERBOSE, const bool allow_defects, const size_t distinct = accumulate(begin(hist), end(hist), 0.0); size_t curr_sample_sz = bin_step_size; while (curr_sample_sz < sample_vals_sum) { - yield_vector.push_back(interpolate_distinct(hist, sample_vals_sum, - distinct, curr_sample_sz)); + yield_vector.push_back( + interpolate_distinct(hist, sample_vals_sum, distinct, curr_sample_sz)); curr_sample_sz += bin_step_size; } @@ -345,32 +335,30 @@ extrap_bootstrap(const bool VERBOSE, const bool allow_defects, bool successful_bootstrap = false; // defect mode, simple extrapolation if (allow_defects) { - vector ps_coeffs; for (size_t j = 1; j <= max_terms; j++) - ps_coeffs.push_back(hist[j]*std::pow(-1.0, j + 1)); + ps_coeffs.push_back(hist[j] * std::pow(-1.0, j + 1)); const ContinuedFraction defect_cf(ps_coeffs, diagonal, max_terms); extrapolate_curve(defect_cf, initial_distinct, sample_vals_sum, - curr_sample_sz, bin_step_size, - max_extrap, yield_vector); + curr_sample_sz, bin_step_size, max_extrap, + yield_vector); // no checking of curve in defect mode bootstrap_estimates.push_back(yield_vector); successful_bootstrap = true; } else { - // refit curve for lower bound const ContinuedFractionApproximation lower_cfa(diagonal, max_terms); - 
const ContinuedFraction lower_cf(lower_cfa.optimal_cont_frac_distinct(hist)); + const ContinuedFraction lower_cf( + lower_cfa.optimal_cont_frac_distinct(hist)); // extrapolate the curve start if (lower_cf.is_valid()) { - extrapolate_curve(lower_cf, initial_distinct, sample_vals_sum, - curr_sample_sz, bin_step_size, - max_extrap, yield_vector); + curr_sample_sz, bin_step_size, max_extrap, + yield_vector); // sanity check if (check_yield_estimates_stability(yield_vector)) { bootstrap_estimates.push_back(yield_vector); @@ -390,12 +378,10 @@ extrap_bootstrap(const bool VERBOSE, const bool allow_defects, static bool extrap_single_estimate(const bool VERBOSE, const bool allow_defects, - vector &hist, - size_t max_terms, const int diagonal, - const double step_size, + vector &hist, size_t max_terms, + const int diagonal, const double step_size, const double max_extrap, vector &yield_estimate) { - yield_estimate.clear(); const double vals_sum = get_counts_from_hist(hist); @@ -406,8 +392,8 @@ extrap_single_estimate(const bool VERBOSE, const bool allow_defects, size_t step = static_cast(step_size); size_t sample = static_cast(step_size); for (; sample < upper_limit; sample += step) - yield_estimate.push_back(interpolate_distinct(hist, upper_limit, - initial_distinct, sample)); + yield_estimate.push_back( + interpolate_distinct(hist, upper_limit, initial_distinct, sample)); // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE size_t first_zero = 1; @@ -422,29 +408,28 @@ extrap_single_estimate(const bool VERBOSE, const bool allow_defects, max_terms = max_terms - (max_terms % 2 == 1); if (allow_defects) { - vector ps_coeffs; for (size_t j = 1; j <= max_terms; j++) - ps_coeffs.push_back(hist[j]*std::pow(-1.0, j + 1)); + ps_coeffs.push_back(hist[j] * std::pow(-1.0, j + 1)); const ContinuedFraction defect_cf(ps_coeffs, diagonal, max_terms); - extrapolate_curve(defect_cf, initial_distinct, vals_sum, - sample, step_size, max_extrap, yield_estimate); + extrapolate_curve(defect_cf, 
initial_distinct, vals_sum, sample, step_size, + max_extrap, yield_estimate); if (VERBOSE) cerr << defect_cf << endl; // NO FAIL! defect mode doesn't care about failure } else { - const ContinuedFractionApproximation lower_cfa(diagonal, max_terms); - const ContinuedFraction lower_cf(lower_cfa.optimal_cont_frac_distinct(hist)); + const ContinuedFraction lower_cf( + lower_cfa.optimal_cont_frac_distinct(hist)); // extrapolate curve if (lower_cf.is_valid()) { - extrapolate_curve(lower_cf, initial_distinct, vals_sum, - sample, step_size, max_extrap, yield_estimate); + extrapolate_curve(lower_cf, initial_distinct, vals_sum, sample, step_size, + max_extrap, yield_estimate); } else { // FAIL! @@ -459,24 +444,23 @@ extrap_single_estimate(const bool VERBOSE, const bool allow_defects, return true; } - static double GoodToulmin2xExtrap(const vector &counts_hist) { double two_fold_extrap = 0.0; for (size_t i = 0; i < counts_hist.size(); i++) - two_fold_extrap += pow(-1.0, i + 1)*counts_hist[i]; + two_fold_extrap += pow(-1.0, i + 1) * counts_hist[i]; return two_fold_extrap; } - static void -write_predicted_complexity_curve(const string &outfile, - const double c_level, const double step_size, +write_predicted_complexity_curve(const string &outfile, const double c_level, + const double step_size, const vector &yield_estimates, const vector &yield_lower_ci_lognorm, const vector &yield_upper_ci_lognorm) { std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); out << "TOTAL_READS\tEXPECTED_DISTINCT\t" @@ -488,47 +472,41 @@ write_predicted_complexity_curve(const string &outfile, out << 0 << '\t' << 0 << '\t' << 0 << '\t' << 0 << endl; for (size_t i = 0; i < yield_estimates.size(); ++i) - out << (i + 1)*step_size << '\t' - << yield_estimates[i] << '\t' - << yield_lower_ci_lognorm[i] << '\t' - << yield_upper_ci_lognorm[i] << endl; + out << (i + 1) * step_size << '\t' << yield_estimates[i] << '\t' + << yield_lower_ci_lognorm[i] << '\t' << yield_upper_ci_lognorm[i] + << endl; } - // ADS: functions same, header different (above and this one) static void -write_predicted_coverage_curve(const string &outfile, - const double c_level, +write_predicted_coverage_curve(const string &outfile, const double c_level, const double base_step_size, const size_t bin_size, const vector &cvrg_estimates, const vector &cvrg_lower_ci_lognorm, const vector &cvrg_upper_ci_lognorm) { std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); out << "TOTAL_BASES\tEXPECTED_COVERED_BASES\t" - << "LOWER_" << 100*c_level << "%CI\t" - << "UPPER_" << 100*c_level << "%CI" << endl; + << "LOWER_" << 100 * c_level << "%CI\t" + << "UPPER_" << 100 * c_level << "%CI" << endl; out.setf(std::ios_base::fixed, std::ios_base::floatfield); out.precision(1); out << 0 << '\t' << 0 << '\t' << 0 << '\t' << 0 << endl; for (size_t i = 0; i < cvrg_estimates.size(); ++i) - out << (i + 1)*base_step_size << '\t' - << cvrg_estimates[i]*bin_size << '\t' - << cvrg_lower_ci_lognorm[i]*bin_size << '\t' - << cvrg_upper_ci_lognorm[i]*bin_size << endl; + out << (i + 1) * base_step_size << '\t' << cvrg_estimates[i] * bin_size + << '\t' << cvrg_lower_ci_lognorm[i] * bin_size << '\t' + << cvrg_upper_ci_lognorm[i] * bin_size << endl; } - static int lc_extrap(const int argc, const char **argv) { - try { - static const size_t min_required_counts = 4; static const string min_required_counts_error_message = "max count before zero is less than min required count (" + @@ -542,7 +520,7 @@ lc_extrap(const int argc, const char **argv) { size_t n_bootstraps = 100; int diagonal = 0; double c_level = 0.95; - unsigned long int seed = 408; + uint64_t seed = 408; /* FLAGS */ bool VERBOSE = false; @@ -557,49 +535,57 @@ lc_extrap(const int argc, const char **argv) { size_t MAX_SEGMENT_LENGTH = 5000; #endif - string description = "Extrapolate the complexity of a library. This is the approach \ - described in Daley & Smith (2013). The method applies rational \ - function approximation via continued fractions with the \ - original goal of estimating the number of distinct reads that a \ - sequencing library would yield upon deeper sequencing. This \ - method has been used for many different purposes since then."; + string description = + R"(Extrapolate the complexity of a library. This is the approach \ +described in Daley & Smith (2013). 
The method applies rational \ +function approximation via continued fractions with the \ +original goal of estimating the number of distinct reads that a \ +sequencing library would yield upon deeper sequencing. This \ +method has been used for many different purposes since then. +)"; /********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/ OptionParser opt_parse(strip_path(argv[1]), description, ""); opt_parse.add_opt("output", 'o', "yield output file (default: stdout)", - false , outfile); - opt_parse.add_opt("extrap",'e',"maximum extrapolation", false, max_extrap); - opt_parse.add_opt("step",'s',"extrapolation step size", false, step_size); - opt_parse.add_opt("boots",'n',"number of bootstraps", false, n_bootstraps); - opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, c_level); - opt_parse.add_opt("terms",'x',"maximum terms in estimator", false, orig_max_terms); + false, outfile); + opt_parse.add_opt("extrap", 'e', "maximum extrapolation", false, + max_extrap); + opt_parse.add_opt("step", 's', "extrapolation step size", false, step_size); + opt_parse.add_opt("boots", 'n', "number of bootstraps", false, + n_bootstraps); + opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, + c_level); + opt_parse.add_opt("terms", 'x', "maximum terms in estimator", false, + orig_max_terms); opt_parse.add_opt("verbose", 'v', "print more info", false, VERBOSE); #ifdef HAVE_HTSLIB - opt_parse.add_opt("bam", 'B', "input is in BAM format", - false, BAM_FORMAT_INPUT); - opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging " + opt_parse.add_opt("bam", 'B', "input is in BAM format", false, + BAM_FORMAT_INPUT); + opt_parse.add_opt("seg_len", 'l', + "maximum segment length when merging " "paired end bam reads", false, MAX_SEGMENT_LENGTH); #endif - opt_parse.add_opt("pe", 'P', "input is paired end read file", - false, PAIRED_END); - opt_parse.add_opt("vals", 'V', - "input is a text file containing only the observed counts", 
- false, VALS_INPUT); + opt_parse.add_opt("pe", 'P', "input is paired end read file", false, + PAIRED_END); + opt_parse.add_opt( + "vals", 'V', "input is a text file containing only the observed counts", + false, VALS_INPUT); opt_parse.add_opt("hist", 'H', "input is a text file containing the observed histogram", false, HIST_INPUT); opt_parse.add_opt("quick", 'Q', "quick mode (no bootstraps) for confidence intervals", false, SINGLE_ESTIMATE); - opt_parse.add_opt("defects", 'D', "no testing for defects", false, allow_defects); - opt_parse.add_opt("seed", 'r', "seed for random number generator", - false, seed); + opt_parse.add_opt("defects", 'D', "no testing for defects", false, + allow_defects); + opt_parse.add_opt("seed", 'r', "seed for random number generator", false, + seed); opt_parse.set_show_defaults(); vector leftover_args; // ADS: suspect bug below; "-about" isn't working. - opt_parse.parse(argc-1, argv+1, leftover_args); + opt_parse.parse(argc - 1, argv + 1, leftover_args); if (argc == 2 || opt_parse.help_requested()) { cerr << opt_parse.help_message() << endl; return EXIT_SUCCESS; @@ -640,10 +626,9 @@ lc_extrap(const int argc, const char **argv) { const size_t MAX_READS_TO_HOLD = 5000000; size_t n_paired = 0; size_t n_mates = 0; - n_reads = load_counts_BAM_pe(VERBOSE, input_file_name, - MAX_SEGMENT_LENGTH, - MAX_READS_TO_HOLD, n_paired, - n_mates, counts_hist); + n_reads = + load_counts_BAM_pe(VERBOSE, input_file_name, MAX_SEGMENT_LENGTH, + MAX_READS_TO_HOLD, n_paired, n_mates, counts_hist); if (VERBOSE) { cerr << "MERGED PAIRED END READS = " << n_paired << endl; cerr << "MATES PROCESSED = " << n_mates << endl; @@ -660,7 +645,7 @@ lc_extrap(const int argc, const char **argv) { cerr << "PAIRED_END_BED_INPUT" << endl; n_reads = load_counts_BED_pe(input_file_name, counts_hist); } - else { // default is single end bed file + else { // default is single end bed file if (VERBOSE) cerr << "BED_INPUT" << endl; n_reads = load_counts_BED_se(input_file_name, 
counts_hist); @@ -681,7 +666,7 @@ lc_extrap(const int argc, const char **argv) { const size_t distinct_counts = std::count_if(begin(counts_hist), end(counts_hist), - [](const double x) {return x > 0.0;}); + [](const double x) { return x > 0.0; }); if (VERBOSE) cerr << "TOTAL READS = " << n_reads << endl @@ -708,7 +693,7 @@ lc_extrap(const int argc, const char **argv) { // const size_t total_reads = get_counts_from_hist(counts_hist); - //assert(total_reads == n_reads); // ADS: why commented out? + // assert(total_reads == n_reads); // ADS: why commented out? // check that min required count is satisfied if (orig_max_terms < min_required_counts) @@ -719,17 +704,17 @@ lc_extrap(const int argc, const char **argv) { vector yield_estimates; if (SINGLE_ESTIMATE) { - - const bool single_estimate_success = - extrap_single_estimate(VERBOSE, allow_defects, counts_hist, orig_max_terms, - diagonal, step_size, max_extrap, yield_estimates); + const bool single_estimate_success = extrap_single_estimate( + VERBOSE, allow_defects, counts_hist, orig_max_terms, diagonal, + step_size, max_extrap, yield_estimates); // IF FAILURE, EXIT if (!single_estimate_success) throw runtime_error("single estimate failed, run " "full mode for estimates"); std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); out << "TOTAL_READS\tEXPECTED_DISTINCT" << endl; @@ -738,15 +723,15 @@ lc_extrap(const int argc, const char **argv) { out << 0 << '\t' << 0 << endl; for (size_t i = 0; i < yield_estimates.size(); ++i) - out << (i + 1)*step_size << '\t' << yield_estimates[i] << endl; + out << (i + 1) * step_size << '\t' << yield_estimates[i] << endl; } else { if (VERBOSE) cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl; - const size_t max_iter = 100*n_bootstraps; + const size_t max_iter = 100 * n_bootstraps; - vector > bootstrap_estimates; + vector> bootstrap_estimates; extrap_bootstrap(VERBOSE, allow_defects, seed, counts_hist, n_bootstraps, orig_max_terms, diagonal, step_size, max_extrap, max_iter, bootstrap_estimates); @@ -786,9 +771,7 @@ lc_extrap(const int argc, const char **argv) { static int gc_extrap(const int argc, const char **argv) { - try { - const size_t MIN_REQUIRED_COUNTS = 4; int diagonal = 0; @@ -801,58 +784,57 @@ gc_extrap(const int argc, const char **argv) { bool SINGLE_ESTIMATE = false; double max_extrap = 1.0e12; size_t n_bootstraps = 100; - unsigned long int seed = 408; + uint64_t seed = 408; bool allow_defects = false; bool NO_SEQUENCE = false; double c_level = 0.95; - const string description = + const string description = R"( "Extrapolate the size of the covered genome by mapped reads. This \ approach is described in Daley & Smith (2014). The method is the \ same as for lc_extrap: using rational function approximation to \ a power-series expansion for the number of \"unobserved\" bases \ in the initial sample. 
The gc_extrap method is adapted to deal \ - with individual nucleotides rather than distinct reads."; + with individual nucleotides rather than distinct reads.)"; // ********* GET COMMAND LINE ARGUMENTS FOR GC EXTRAP ********** - OptionParser opt_parse(strip_path(argv[1]), description, - ""); - opt_parse.add_opt("output", 'o', "coverage yield output file (default: stdout)", - false , outfile); - opt_parse.add_opt("max_width", 'w', "max fragment length, " + OptionParser opt_parse(strip_path(argv[1]), description, ""); + opt_parse.add_opt("output", 'o', + "coverage yield output file (default: stdout)", false, + outfile); + opt_parse.add_opt("max_width", 'w', + "max fragment length, " "set equal to read length for single end reads", false, max_width); - opt_parse.add_opt("bin_size", 'b', "bin size", - false, bin_size); - opt_parse.add_opt("extrap",'e',"maximum extrapolation in base pairs", + opt_parse.add_opt("bin_size", 'b', "bin size", false, bin_size); + opt_parse.add_opt("extrap", 'e', "maximum extrapolation in base pairs", false, max_extrap); - opt_parse.add_opt("step",'s',"step size in bases between extrapolations", + opt_parse.add_opt("step", 's', "step size in bases between extrapolations", false, base_step_size); - opt_parse.add_opt("bootstraps",'n',"number of bootstraps", - false, n_bootstraps); - opt_parse.add_opt("cval", 'c', "level for confidence intervals", - false, c_level); - opt_parse.add_opt("terms",'x',"maximum number of terms", - false, orig_max_terms); - opt_parse.add_opt("verbose", 'v', "print more information", - false, VERBOSE); + opt_parse.add_opt("bootstraps", 'n', "number of bootstraps", false, + n_bootstraps); + opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, + c_level); + opt_parse.add_opt("terms", 'x', "maximum number of terms", false, + orig_max_terms); + opt_parse.add_opt("verbose", 'v', "print more information", false, VERBOSE); opt_parse.add_opt("bed", 'B', "input is in bed format without sequence information", 
false, NO_SEQUENCE); - opt_parse.add_opt("quick",'Q', + opt_parse.add_opt("quick", 'Q', "quick mode: run gc_extrap without " "bootstrapping for confidence intervals", false, SINGLE_ESTIMATE); opt_parse.add_opt("defects", 'D', "defects mode to extrapolate without testing for defects", false, allow_defects); - opt_parse.add_opt("seed", 'r', "seed for random number generator", - false, seed); + opt_parse.add_opt("seed", 'r', "seed for random number generator", false, + seed); opt_parse.set_show_defaults(); vector leftover_args; - opt_parse.parse(argc-1, argv+1, leftover_args); + opt_parse.parse(argc - 1, argv + 1, leftover_args); if (argc == 2 || opt_parse.help_requested()) { cerr << opt_parse.help_message() << endl; return EXIT_SUCCESS; @@ -886,8 +868,8 @@ gc_extrap(const int argc, const char **argv) { else { if (VERBOSE) cerr << "MAPPED READ FORMAT" << endl; - n_reads = load_coverage_counts_MR(VERBOSE, input_file_name, seed, bin_size, - max_width, coverage_hist); + n_reads = load_coverage_counts_MR(VERBOSE, input_file_name, seed, + bin_size, max_width, coverage_hist); } const double total_bins = get_counts_from_hist(coverage_hist); @@ -895,8 +877,8 @@ gc_extrap(const int argc, const char **argv) { const double distinct_bins = accumulate(coverage_hist.begin(), coverage_hist.end(), 0.0); - const double avg_bins_per_read = total_bins/n_reads; - double bin_step_size = base_step_size/bin_size; + const double avg_bins_per_read = total_bins / n_reads; + double bin_step_size = base_step_size / bin_size; const size_t max_observed_count = coverage_hist.size() - 1; @@ -914,8 +896,8 @@ gc_extrap(const int argc, const char **argv) { << "TOTAL BINS = " << total_bins << endl << "BINS PER READ = " << avg_bins_per_read << endl << "DISTINCT BINS = " << distinct_bins << endl - << "TOTAL BASES = " << total_bins*bin_size << endl - << "TOTAL COVERED BASES = " << distinct_bins*bin_size << endl + << "TOTAL BASES = " << total_bins * bin_size << endl + << "TOTAL COVERED BASES = " << 
distinct_bins * bin_size << endl << "MAX COVERAGE COUNT = " << max_observed_count << endl << "COUNTS OF 1 = " << coverage_hist[1] << endl; @@ -946,18 +928,17 @@ gc_extrap(const int argc, const char **argv) { vector coverage_estimates; if (SINGLE_ESTIMATE) { - - bool SINGLE_ESTIMATE_SUCCESS = - extrap_single_estimate(VERBOSE, allow_defects, coverage_hist, orig_max_terms, diagonal, - bin_step_size, max_extrap/bin_size, - coverage_estimates); + bool SINGLE_ESTIMATE_SUCCESS = extrap_single_estimate( + VERBOSE, allow_defects, coverage_hist, orig_max_terms, diagonal, + bin_step_size, max_extrap / bin_size, coverage_estimates); // IF FAILURE, EXIT if (!SINGLE_ESTIMATE_SUCCESS) throw runtime_error("SINGLE ESTIMATE FAILED, NEED TO RUN IN " "FULL MODE FOR ESTIMATES"); std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); out << "TOTAL_BASES\tEXPECTED_DISTINCT" << endl; @@ -967,36 +948,33 @@ gc_extrap(const int argc, const char **argv) { out << 0 << '\t' << 0 << endl; for (size_t i = 0; i < coverage_estimates.size(); ++i) - out << (i + 1)*base_step_size << '\t' - << coverage_estimates[i]*bin_size << endl; + out << (i + 1) * base_step_size << '\t' + << coverage_estimates[i] * bin_size << endl; } else { - if (VERBOSE) cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl; - const size_t max_iter = 10*n_bootstraps; + const size_t max_iter = 10 * n_bootstraps; - vector > bootstrap_estimates; - extrap_bootstrap(VERBOSE, allow_defects, seed, coverage_hist, n_bootstraps, orig_max_terms, - diagonal, bin_step_size, max_extrap/bin_size, - max_iter, bootstrap_estimates); + vector> bootstrap_estimates; + extrap_bootstrap(VERBOSE, allow_defects, seed, coverage_hist, + n_bootstraps, orig_max_terms, diagonal, bin_step_size, + max_extrap / bin_size, max_iter, bootstrap_estimates); if (VERBOSE) cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl; vector 
coverage_upper_ci_lognorm, coverage_lower_ci_lognorm; - vector_median_and_ci(bootstrap_estimates, c_level, - coverage_estimates, coverage_lower_ci_lognorm, + vector_median_and_ci(bootstrap_estimates, c_level, coverage_estimates, + coverage_lower_ci_lognorm, coverage_upper_ci_lognorm); if (VERBOSE) cerr << "[WRITING OUTPUT]" << endl; - write_predicted_coverage_curve(outfile, c_level, base_step_size, - bin_size, coverage_estimates, - coverage_lower_ci_lognorm, - coverage_upper_ci_lognorm); - + write_predicted_coverage_curve( + outfile, c_level, base_step_size, bin_size, coverage_estimates, + coverage_lower_ci_lognorm, coverage_upper_ci_lognorm); } } catch (runtime_error &e) { @@ -1010,65 +988,60 @@ gc_extrap(const int argc, const char **argv) { return EXIT_SUCCESS; } - - //////////////////////////////////////////////////////////////////////// ///// C_CURVE BELOW HERE static int c_curve(const int argc, const char **argv) { - try { - bool VERBOSE = false; bool PAIRED_END = false; bool HIST_INPUT = false; bool VALS_INPUT = false; - unsigned long int seed = 408; + uint64_t seed = 408; string outfile; size_t upper_limit = 0; double step_size = 1e6; - #ifdef HAVE_HTSLIB bool BAM_FORMAT_INPUT = false; size_t MAX_SEGMENT_LENGTH = 5000; #endif - const string description = - "Generate the complexity curve for data. This does not extrapolate, \ - but instead resamples from the given data."; + const string description = R"( +Generate the complexity curve for data. 
This does not extrapolate, \ +but instead resamples from the given data.)"; /********** GET COMMAND LINE ARGUMENTS FOR C_CURVE ***********/ OptionParser opt_parse(strip_path(argv[1]), description, ""); opt_parse.add_opt("output", 'o', "yield output file (default: stdout)", - false , outfile); - opt_parse.add_opt("step",'s',"step size in extrapolations", - false, step_size); - opt_parse.add_opt("verbose", 'v', "print more information", - false, VERBOSE); - opt_parse.add_opt("pe", 'P', "input is paired end read file", - false, PAIRED_END); + false, outfile); + opt_parse.add_opt("step", 's', "step size in extrapolations", false, + step_size); + opt_parse.add_opt("verbose", 'v', "print more information", false, VERBOSE); + opt_parse.add_opt("pe", 'P', "input is paired end read file", false, + PAIRED_END); opt_parse.add_opt("hist", 'H', "input is a text file containing the observed histogram", false, HIST_INPUT); - opt_parse.add_opt("vals", 'V', - "input is a text file containing only the observed counts", - false, VALS_INPUT); + opt_parse.add_opt( + "vals", 'V', "input is a text file containing only the observed counts", + false, VALS_INPUT); #ifdef HAVE_HTSLIB - opt_parse.add_opt("bam", 'B', "input is in BAM format", - false, BAM_FORMAT_INPUT); - opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging " + opt_parse.add_opt("bam", 'B', "input is in BAM format", false, + BAM_FORMAT_INPUT); + opt_parse.add_opt("seg_len", 'l', + "maximum segment length when merging " "paired end bam reads", false, MAX_SEGMENT_LENGTH); #endif - opt_parse.add_opt("seed", 'r', "seed for random number generator", - false, seed); + opt_parse.add_opt("seed", 'r', "seed for random number generator", false, + seed); opt_parse.set_show_defaults(); vector leftover_args; - opt_parse.parse(argc-1, argv+1, leftover_args); + opt_parse.parse(argc - 1, argv + 1, leftover_args); if (argc == 2 || opt_parse.help_requested()) { cerr << opt_parse.help_message() << endl; return EXIT_SUCCESS; @@ 
-1089,7 +1062,7 @@ c_curve(const int argc, const char **argv) { /******************************************************************/ // Setup the random number generator - srand(time(0) + getpid()); //give the random fxn a new seed + srand(time(0) + getpid()); // give the random fxn a new seed mt19937 rng(seed); vector counts_hist; @@ -1113,13 +1086,12 @@ c_curve(const int argc, const char **argv) { const size_t MAX_READS_TO_HOLD = 5000000; size_t n_paired = 0; size_t n_mates = 0; - n_reads = load_counts_BAM_pe(VERBOSE, input_file_name, - MAX_SEGMENT_LENGTH, MAX_READS_TO_HOLD, - n_paired, n_mates, counts_hist); + n_reads = + load_counts_BAM_pe(VERBOSE, input_file_name, MAX_SEGMENT_LENGTH, + MAX_READS_TO_HOLD, n_paired, n_mates, counts_hist); if (VERBOSE) cerr << "MERGED PAIRED END READS = " << n_paired << endl << "MATES PROCESSED = " << n_mates << endl; - } else if (BAM_FORMAT_INPUT) { if (VERBOSE) @@ -1132,7 +1104,7 @@ c_curve(const int argc, const char **argv) { cerr << "PAIRED_END_BED_INPUT" << endl; n_reads = load_counts_BED_pe(input_file_name, counts_hist); } - else { // default is single end bed file + else { // default is single end bed file if (VERBOSE) cerr << "BED_INPUT" << endl; n_reads = load_counts_BED_se(input_file_name, counts_hist); @@ -1146,7 +1118,7 @@ c_curve(const int argc, const char **argv) { const size_t distinct_counts = std::count_if(begin(counts_hist), end(counts_hist), - [](const double x) {return x > 0.0;}); + [](const double x) { return x > 0.0; }); if (VERBOSE) cerr << "TOTAL READS = " << n_reads << endl @@ -1166,14 +1138,16 @@ c_curve(const int argc, const char **argv) { } if (upper_limit == 0) - upper_limit = n_reads; //set upper limit to equal the number of molecules + upper_limit = n_reads; // set upper limit to equal the number of + // molecules - //handles output of c_curve + // handles output of c_curve std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); 
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); - //prints the complexity curve + // prints the complexity curve out << "total_reads" << "\t" << "distinct_reads" << endl; out << 0 << '\t' << 0 << endl; for (size_t i = step_size; i <= upper_limit; i += step_size) { @@ -1195,16 +1169,13 @@ c_curve(const int argc, const char **argv) { return EXIT_SUCCESS; } - ///////////////////////////////////////////////////////// ///////////////////////////////////////////////////////// // BOUND_UNOBS: bounding n_0 static int bound_pop(const int argc, const char **argv) { - try { - bool VERBOSE = false; bool PAIRED_END = false; bool HIST_INPUT = false; @@ -1223,48 +1194,56 @@ bound_pop(const int argc, const char **argv) { size_t n_bootstraps = 500; double c_level = 0.95; size_t max_iter = 100; - unsigned long int seed = 408; + uint64_t seed = 408; - const string description = - "Estimate the size of the underlying population based on counts \ - of observed species in an initial sample."; + const string description = R"( +Estimate the size of the underlying population based on counts \ +of observed species in an initial sample.)"; /********** GET COMMAND LINE ARGUMENTS FOR BOUND_POP ***********/ - OptionParser opt_parse(strip_path(argv[1]), - description, ""); - opt_parse.add_opt("output", 'o', "species richness output file " - "(default: stdout)", false , outfile); - opt_parse.add_opt("max_num_points",'p',"maximum number of points in " - "quadrature estimates", false, max_num_points); - opt_parse.add_opt("tolerance", 't', "numerical tolerance", - false, tolerance); - opt_parse.add_opt("bootstraps", 'n', "number of bootstraps", - false, n_bootstraps); - opt_parse.add_opt("clevel", 'c', "level for confidence intervals", - false, c_level); - opt_parse.add_opt("verbose", 'v', "print more information", - false, VERBOSE); - opt_parse.add_opt("pe", 'P', "input is paired end read file", - false, PAIRED_END); - opt_parse.add_opt("hist", 'H', "input is a text file 
containing the " - "observed histogram", false, HIST_INPUT); - opt_parse.add_opt("vals", 'V', "input is a text file containing only the " - "observed duplicate counts", false, VALS_INPUT); + OptionParser opt_parse(strip_path(argv[1]), description, ""); + opt_parse.add_opt("output", 'o', + "species richness output file " + "(default: stdout)", + false, outfile); + opt_parse.add_opt("max_num_points", 'p', + "maximum number of points in " + "quadrature estimates", + false, max_num_points); + opt_parse.add_opt("tolerance", 't', "numerical tolerance", false, + tolerance); + opt_parse.add_opt("bootstraps", 'n', "number of bootstraps", false, + n_bootstraps); + opt_parse.add_opt("clevel", 'c', "level for confidence intervals", false, + c_level); + opt_parse.add_opt("verbose", 'v', "print more information", false, VERBOSE); + opt_parse.add_opt("pe", 'P', "input is paired end read file", false, + PAIRED_END); + opt_parse.add_opt("hist", 'H', + "input is a text file containing the " + "observed histogram", + false, HIST_INPUT); + opt_parse.add_opt("vals", 'V', + "input is a text file containing only the " + "observed duplicate counts", + false, VALS_INPUT); #ifdef HAVE_HTSLIB - opt_parse.add_opt("bam", 'B', "input is in BAM format", - false, BAM_FORMAT_INPUT); - opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging " + opt_parse.add_opt("bam", 'B', "input is in BAM format", false, + BAM_FORMAT_INPUT); + opt_parse.add_opt("seg_len", 'l', + "maximum segment length when merging " "paired end bam reads", false, MAX_SEGMENT_LENGTH); #endif - opt_parse.add_opt("quick", 'Q', "quick mode, estimate without bootstrapping", - false, QUICK_MODE); - opt_parse.add_opt("seed", 'r', "seed for random number generator", - false, seed); + opt_parse.add_opt("quick", 'Q', + "quick mode, estimate without bootstrapping", false, + QUICK_MODE); + opt_parse.add_opt("seed", 'r', "seed for random number generator", false, + seed); opt_parse.set_show_defaults(); vector leftover_args; - 
opt_parse.parse(argc-1, argv+1, leftover_args); + opt_parse.parse(argc - 1, argv + 1, leftover_args); if (argc == 2 || opt_parse.help_requested()) { cerr << opt_parse.help_message() << endl; return EXIT_SUCCESS; @@ -1305,10 +1284,9 @@ bound_pop(const int argc, const char **argv) { const size_t MAX_READS_TO_HOLD = 5000000; size_t n_paired = 0; size_t n_mates = 0; - n_obs = load_counts_BAM_pe(VERBOSE, input_file_name, - MAX_SEGMENT_LENGTH, - MAX_READS_TO_HOLD, n_paired, - n_mates, counts_hist); + n_obs = + load_counts_BAM_pe(VERBOSE, input_file_name, MAX_SEGMENT_LENGTH, + MAX_READS_TO_HOLD, n_paired, n_mates, counts_hist); if (VERBOSE) { cerr << "MERGED PAIRED END READS = " << n_paired << endl; cerr << "MATES PROCESSED = " << n_mates << endl; @@ -1325,22 +1303,22 @@ bound_pop(const int argc, const char **argv) { cerr << "PAIRED_END_BED_INPUT" << endl; n_obs = load_counts_BED_pe(input_file_name, counts_hist); } - else { // default is single end bed file + else { // default is single end bed file if (VERBOSE) cerr << "BED_INPUT" << endl; n_obs = load_counts_BED_se(input_file_name, counts_hist); } - const double distinct_obs = accumulate(begin(counts_hist), end(counts_hist), 0.0); + const double distinct_obs = + accumulate(begin(counts_hist), end(counts_hist), 0.0); vector measure_moments; // mu_r = (r + 1)! n_{r+1} / n_1 size_t idx = 1; while (counts_hist[idx] > 0 && idx <= counts_hist.size()) { // idx + 1 because function calculates (x-1)! 
- measure_moments.push_back(exp(factorial(idx + 1) + - log(counts_hist[idx]) - - log(counts_hist[1]))); + measure_moments.push_back( + exp(factorial(idx + 1) + log(counts_hist[idx]) - log(counts_hist[1]))); if (!isfinite(measure_moments.back())) { measure_moments.pop_back(); break; @@ -1364,12 +1342,11 @@ bound_pop(const int argc, const char **argv) { cerr << std::setprecision(16) << measure_moments[i] << endl; } - if (QUICK_MODE) { - if (measure_moments.size() < 2*max_num_points) - max_num_points = static_cast(floor(measure_moments.size()/2)); + if (measure_moments.size() < 2 * max_num_points) + max_num_points = static_cast(floor(measure_moments.size() / 2)); else - measure_moments.resize(2*max_num_points); + measure_moments.resize(2 * max_num_points); size_t n_points = 0; n_points = ensure_pos_def_mom_seq(measure_moments, tolerance, VERBOSE); if (VERBOSE) @@ -1394,14 +1371,14 @@ bound_pop(const int argc, const char **argv) { } vector points, weights; - obs_mom_seq.Lower_quadrature_rules(VERBOSE, n_points, tolerance, - max_iter, points, weights); + obs_mom_seq.Lower_quadrature_rules(VERBOSE, n_points, tolerance, max_iter, + points, weights); // renormalize if needed const double weights_sum = accumulate(begin(weights), end(weights), 0.0); if (weights_sum != 1.0) for (size_t i = 0; i < weights.size(); i++) - weights[i] = weights[i]/weights_sum; + weights[i] = weights[i] / weights_sum; if (VERBOSE) { cerr << "points = " << endl; @@ -1418,7 +1395,7 @@ bound_pop(const int argc, const char **argv) { double estimated_unobs = 0.0; for (size_t i = 0; i < weights.size(); i++) - estimated_unobs += counts_hist[1]*weights[i]/points[i]; + estimated_unobs += counts_hist[1] * weights[i] / points[i]; if (estimated_unobs > 0.0) estimated_unobs += distinct_obs; @@ -1428,7 +1405,8 @@ bound_pop(const int argc, const char **argv) { } std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() 
? std::cout.rdbuf() : of.rdbuf()); out.setf(std::ios_base::fixed, std::ios_base::floatfield); @@ -1436,14 +1414,12 @@ bound_pop(const int argc, const char **argv) { out << "quadrature_estimated_unobs" << '\t' << "n_points" << endl; out << estimated_unobs << '\t' << n_points << endl; - } // NOT QUICK MODE, BOOTSTRAP else { - vector quad_estimates; - //setup rng + // setup rng srand(time(0) + getpid()); mt19937 rng(seed); @@ -1457,27 +1433,29 @@ bound_pop(const int argc, const char **argv) { distinct_counts_hist.push_back(counts_hist[i]); } - for (size_t iter = 0; iter < max_iter && quad_estimates.size() < n_bootstraps; ++iter) { + for (size_t iter = 0; + iter < max_iter && quad_estimates.size() < n_bootstraps; ++iter) { if (VERBOSE) cerr << "iter=" << "\t" << iter << endl; vector sample_hist; - resample_hist(rng, counts_hist_distinct_counts, - distinct_counts_hist, sample_hist); + resample_hist(rng, counts_hist_distinct_counts, distinct_counts_hist, + sample_hist); - const double sampled_distinct = accumulate(begin(sample_hist), end(sample_hist), 0.0); + const double sampled_distinct = + accumulate(begin(sample_hist), end(sample_hist), 0.0); // initialize moments, 0th moment is 1 vector bootstrap_moments(1, 1.0); // moments[r] = (r + 1)! 
n_{r+1} / n_1 - for (size_t i = 0; i < 2*max_num_points; i++) { - bootstrap_moments.push_back(exp(factorial(i + 3) + - log(sample_hist[i + 2]) - - log(sample_hist[1])) ); + for (size_t i = 0; i < 2 * max_num_points; i++) { + bootstrap_moments.push_back(exp( + factorial(i + 3) + log(sample_hist[i + 2]) - log(sample_hist[1]))); } size_t n_points = 0; - n_points = ensure_pos_def_mom_seq(bootstrap_moments, tolerance, VERBOSE); + n_points = + ensure_pos_def_mom_seq(bootstrap_moments, tolerance, VERBOSE); n_points = min(n_points, max_num_points); if (VERBOSE) cerr << "n_points = " << n_points << endl; @@ -1489,15 +1467,16 @@ bound_pop(const int argc, const char **argv) { max_iter, points, weights); // renormalize if needed - const double weights_sum = accumulate(begin(weights), end(weights), 0.0); + const double weights_sum = + accumulate(begin(weights), end(weights), 0.0); if (weights_sum != 1.0) for (size_t i = 0; i < weights.size(); i++) - weights[i] = weights[i]/weights_sum; + weights[i] = weights[i] / weights_sum; double estimated_unobs = 0.0; for (size_t i = 0; i < weights.size(); i++) - estimated_unobs += counts_hist[1]*weights[i]/points[i]; + estimated_unobs += counts_hist[1] * weights[i] / points[i]; if (estimated_unobs > 0.0) estimated_unobs += sampled_distinct; @@ -1542,20 +1521,20 @@ bound_pop(const int argc, const char **argv) { } double median_estimate, lower_ci, upper_ci; - median_and_ci(quad_estimates, c_level, median_estimate, - lower_ci, upper_ci); + median_and_ci(quad_estimates, c_level, median_estimate, lower_ci, + upper_ci); std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); out.setf(std::ios_base::fixed, std::ios_base::floatfield); out.precision(1); - out << "median_estimated_unobs" << '\t' - << "lower_ci" << '\t' << "upper_ci" << endl; - out << median_estimate << '\t' - << lower_ci << '\t' << upper_ci << endl; + out << "median_estimated_unobs" << '\t' << "lower_ci" << '\t' + << "upper_ci" << endl; + out << median_estimate << '\t' << lower_ci << '\t' << upper_ci << endl; } } catch (runtime_error &e) { @@ -1571,9 +1550,7 @@ bound_pop(const int argc, const char **argv) { static int pop_size(const int argc, const char **argv) { - try { - static const size_t min_required_counts = 4; static const string min_required_counts_error_message = "max count before zero is less than min required count (" + @@ -1588,7 +1565,7 @@ pop_size(const int argc, const char **argv) { size_t n_bootstraps = 100; int diagonal = 0; double c_level = 0.95; - unsigned long int seed = 408; + uint64_t seed = 408; /* FLAGS */ bool VERBOSE = false; @@ -1614,38 +1591,44 @@ pop_size(const int argc, const char **argv) { OptionParser opt_parse(strip_path(argv[1]), description, ""); opt_parse.add_opt("output", 'o', "yield output file (default: stdout)", - false , outfile); - opt_parse.add_opt("extrap",'e',"maximum extrapolation", false, max_extrap); - opt_parse.add_opt("steps",'s',"number of steps", false, n_desired_steps); - opt_parse.add_opt("boots",'n',"number of bootstraps", false, n_bootstraps); - opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, c_level); - opt_parse.add_opt("terms", 'x', "maximum terms in estimator", false, orig_max_terms); + false, outfile); + opt_parse.add_opt("extrap", 'e', "maximum extrapolation", false, + max_extrap); + opt_parse.add_opt("steps", 's', "number of steps", false, n_desired_steps); + opt_parse.add_opt("boots", 'n', "number of bootstraps", false, + n_bootstraps); + opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, + c_level); + opt_parse.add_opt("terms", 'x', 
"maximum terms in estimator", false, + orig_max_terms); opt_parse.add_opt("verbose", 'v', "print more info", false, VERBOSE); #ifdef HAVE_HTSLIB - opt_parse.add_opt("bam", 'B', "input is in BAM format", - false, BAM_FORMAT_INPUT); - opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging " + opt_parse.add_opt("bam", 'B', "input is in BAM format", false, + BAM_FORMAT_INPUT); + opt_parse.add_opt("seg_len", 'l', + "maximum segment length when merging " "paired end bam reads", false, MAX_SEGMENT_LENGTH); #endif - opt_parse.add_opt("pe", 'P', "input is paired end read file", - false, PAIRED_END); - opt_parse.add_opt("vals", 'V', - "input is a text file containing only the observed counts", - false, VALS_INPUT); + opt_parse.add_opt("pe", 'P', "input is paired end read file", false, + PAIRED_END); + opt_parse.add_opt( + "vals", 'V', "input is a text file containing only the observed counts", + false, VALS_INPUT); opt_parse.add_opt("hist", 'H', "input is a text file containing the observed histogram", false, HIST_INPUT); opt_parse.add_opt("quick", 'Q', "quick mode (no bootstraps) for confidence intervals", false, SINGLE_ESTIMATE); - opt_parse.add_opt("defects", 'D', "no testing for defects", false, allow_defects); - opt_parse.add_opt("seed", 'r', "seed for random number generator", - false, seed); + opt_parse.add_opt("defects", 'D', "no testing for defects", false, + allow_defects); + opt_parse.add_opt("seed", 'r', "seed for random number generator", false, + seed); opt_parse.set_show_defaults(); vector leftover_args; // ADS: suspect bug below; "-about" isn't working. 
- opt_parse.parse(argc-1, argv+1, leftover_args); + opt_parse.parse(argc - 1, argv + 1, leftover_args); if (argc == 2 || opt_parse.help_requested()) { cerr << opt_parse.help_message() << endl; return EXIT_SUCCESS; @@ -1686,10 +1669,9 @@ pop_size(const int argc, const char **argv) { const size_t MAX_READS_TO_HOLD = 5000000; size_t n_paired = 0; size_t n_mates = 0; - n_reads = load_counts_BAM_pe(VERBOSE, input_file_name, - MAX_SEGMENT_LENGTH, - MAX_READS_TO_HOLD, n_paired, - n_mates, counts_hist); + n_reads = + load_counts_BAM_pe(VERBOSE, input_file_name, MAX_SEGMENT_LENGTH, + MAX_READS_TO_HOLD, n_paired, n_mates, counts_hist); if (VERBOSE) { cerr << "MERGED PAIRED END READS = " << n_paired << endl; cerr << "MATES PROCESSED = " << n_mates << endl; @@ -1706,7 +1688,7 @@ pop_size(const int argc, const char **argv) { cerr << "PAIRED_END_BED_INPUT" << endl; n_reads = load_counts_BED_pe(input_file_name, counts_hist); } - else { // default is single end bed file + else { // default is single end bed file if (VERBOSE) cerr << "BED_INPUT" << endl; n_reads = load_counts_BED_se(input_file_name, counts_hist); @@ -1726,13 +1708,13 @@ pop_size(const int argc, const char **argv) { orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1); if (max_extrap < 1.0) - max_extrap = 1000000000*distinct_reads; + max_extrap = 1000000000 * distinct_reads; if (step_size < 1.0) - step_size = (max_extrap - distinct_reads)/n_desired_steps; + step_size = (max_extrap - distinct_reads) / n_desired_steps; const size_t distinct_counts = std::count_if(begin(counts_hist), end(counts_hist), - [](const double x) {return x > 0.0;}); + [](const double x) { return x > 0.0; }); if (VERBOSE) cerr << "TOTAL READS = " << n_reads << endl @@ -1759,7 +1741,7 @@ pop_size(const int argc, const char **argv) { // const size_t total_reads = get_counts_from_hist(counts_hist); - //assert(total_reads == n_reads); // ADS: why commented out? + // assert(total_reads == n_reads); // ADS: why commented out? 
// check that min required count is satisfied if (orig_max_terms < min_required_counts) @@ -1771,17 +1753,17 @@ pop_size(const int argc, const char **argv) { vector yield_estimates; if (SINGLE_ESTIMATE) { - - const bool single_estimate_success = - extrap_single_estimate(VERBOSE, allow_defects, counts_hist, orig_max_terms, - diagonal, step_size, max_extrap, yield_estimates); + const bool single_estimate_success = extrap_single_estimate( + VERBOSE, allow_defects, counts_hist, orig_max_terms, diagonal, + step_size, max_extrap, yield_estimates); // IF FAILURE, EXIT if (!single_estimate_success) throw runtime_error("single estimate failed, run " "full mode for estimates"); std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf()); out << "TOTAL_READS\tEXPECTED_DISTINCT" << endl; @@ -1790,15 +1772,15 @@ pop_size(const int argc, const char **argv) { out << 0 << '\t' << 0 << endl; for (size_t i = 0; i < yield_estimates.size(); ++i) - out << (i + 1)*step_size << '\t' << yield_estimates[i] << endl; + out << (i + 1) * step_size << '\t' << yield_estimates[i] << endl; } else { if (VERBOSE) cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl; - const size_t max_iter = 100*n_bootstraps; + const size_t max_iter = 100 * n_bootstraps; - vector > bootstrap_estimates; + vector> bootstrap_estimates; extrap_bootstrap(VERBOSE, allow_defects, seed, counts_hist, n_bootstraps, orig_max_terms, diagonal, step_size, max_extrap, max_iter, bootstrap_estimates); @@ -1816,7 +1798,8 @@ pop_size(const int argc, const char **argv) { cerr << "[WRITING OUTPUT]" << endl; std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() ? 
std::cout.rdbuf() : of.rdbuf()); out.setf(std::ios_base::fixed, std::ios_base::floatfield); @@ -1827,13 +1810,12 @@ pop_size(const int argc, const char **argv) { throw runtime_error("problem with number of estimates in pop_size"); const bool converged = - (yield_estimates[n_ests] - yield_estimates[n_ests-1] < 1.0); + (yield_estimates[n_ests] - yield_estimates[n_ests - 1] < 1.0); - out << "pop_size_estimate" << '\t' - << "lower_ci" << '\t' << "upper_ci" << endl; - out << yield_estimates.back() << '\t' - << yield_lower_ci_lognorm.back() << '\t' - << yield_upper_ci_lognorm.back(); + out << "pop_size_estimate" << '\t' << "lower_ci" << '\t' << "upper_ci" + << endl; + out << yield_estimates.back() << '\t' << yield_lower_ci_lognorm.back() + << '\t' << yield_upper_ci_lognorm.back(); if (!converged) out << "\tnot_converged"; out << endl; @@ -1850,52 +1832,59 @@ pop_size(const int argc, const char **argv) { return EXIT_SUCCESS; } +static string +usage_message() { + std::ostringstream oss; + oss << "preseq: a program for analyzing library complexity\n" + "Version: "; + oss << VERSION; + oss << "\n\n" + "Usage: preseq [OPTIONS]\n\n" + ": c_curve generate complexity curve for a library\n" + " lc_extrap predict the yield for future experiments\n" + " gc_extrap predict genome coverage low input\n" + " sequencing experiments\n" + " bound_pop lower bound on population size\n" + " pop_size estimate number of unique species\n"; + return oss.str(); +} + int main(const int argc, const char **argv) { - - static const string usage_message = - "preseq: a program for analyzing library complexity\n" - "Version: " + preseq_version + "\n\n" - "Usage: preseq [OPTIONS]\n\n" - ": c_curve generate complexity curve for a library\n" - " lc_extrap predict the yield for future experiments\n" - " gc_extrap predict genome coverage low input\n" - " sequencing experiments\n" - " bound_pop lower bound on population size\n" - " pop_size estimate number of unique species\n"; + // static string usage_message 
= + // "preseq: a program for analyzing library complexity\n" + // "Version: " + + // preseq_version + + // "\n\n" + // "Usage: preseq [OPTIONS]\n\n" + // ": c_curve generate complexity curve for a library\n" + // " lc_extrap predict the yield for future experiments\n" + // " gc_extrap predict genome coverage low input\n" + // " sequencing experiments\n" + // " bound_pop lower bound on population size\n" + // " pop_size estimate number of unique species\n"; if (argc < 2) - cerr << usage_message << endl; + cerr << usage_message() << endl; else if (strcmp(argv[1], "lc_extrap") == 0) { - return lc_extrap(argc, argv); - } else if (strcmp(argv[1], "c_curve") == 0) { - return c_curve(argc, argv); - } else if (strcmp(argv[1], "gc_extrap") == 0) { - return gc_extrap(argc, argv); - } else if (strcmp(argv[1], "bound_pop") == 0) { - return bound_pop(argc, argv); - } else if (strcmp(argv[1], "pop_size") == 0) { - return pop_size(argc, argv); - } else { cerr << "unrecognized command: " << argv[1] << endl << usage_message << endl; return EXIT_SUCCESS; - } } diff --git a/src/to-mr.cpp b/src/to-mr.cpp index 140b066..c9d4f99 100644 --- a/src/to-mr.cpp +++ b/src/to-mr.cpp @@ -19,61 +19,60 @@ #include -#include -#include -#include +#include #include -#include +#include #include -#include +#include +#include +#include +#include "MappedRead.hpp" #include "OptionParser.hpp" -#include "smithlab_utils.hpp" -#include "smithlab_os.hpp" #include "htslib_wrapper_deprecated.hpp" -#include "MappedRead.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" -using std::string; -using std::vector; -using std::cout; using std::cerr; +using std::cout; using std::endl; using std::ifstream; using std::max; using std::min; using std::runtime_error; -using std::unordered_map; +using std::string; using std::swap; - +using std::unordered_map; +using std::vector; /********Below are functions for merging pair-end reads********/ static void fill_overlap(const bool pos_str, const MappedRead &mr, 
const size_t start, const size_t end, const size_t offset, string &seq, string &scr) { - const size_t a = pos_str ? (start - mr.r.get_start()) : (mr.r.get_end() - end); - const size_t b = pos_str ? (end - mr.r.get_start()) : (mr.r.get_end() - start); + const size_t a = + pos_str ? (start - mr.r.get_start()) : (mr.r.get_end() - end); + const size_t b = + pos_str ? (end - mr.r.get_start()) : (mr.r.get_end() - start); copy(mr.seq.begin() + a, mr.seq.begin() + b, seq.begin() + offset); copy(mr.scr.begin() + a, mr.scr.begin() + b, scr.begin() + offset); } static void -merge_mates(const size_t suffix_len, const size_t range, - const MappedRead &one, const MappedRead &two, - MappedRead &merged, int &len) { - +merge_mates(const size_t suffix_len, const size_t range, const MappedRead &one, + const MappedRead &two, MappedRead &merged, int &len) { const bool pos_str = one.r.pos_strand(); const size_t overlap_start = max(one.r.get_start(), two.r.get_start()); const size_t overlap_end = min(one.r.get_end(), two.r.get_end()); - const size_t one_left = pos_str ? - one.r.get_start() : max(overlap_end, one.r.get_start()); + const size_t one_left = + pos_str ? one.r.get_start() : max(overlap_end, one.r.get_start()); const size_t one_right = pos_str ? min(overlap_start, one.r.get_end()) : one.r.get_end(); - const size_t two_left = pos_str ? - max(overlap_end, two.r.get_start()) : two.r.get_start(); - const size_t two_right = pos_str ? - two.r.get_end() : min(overlap_start, two.r.get_end()); + const size_t two_left = + pos_str ? max(overlap_end, two.r.get_start()) : two.r.get_start(); + const size_t two_right = + pos_str ? two.r.get_end() : min(overlap_start, two.r.get_end()); len = pos_str ? 
(two_right - one_left) : (one_right - two_left); @@ -107,9 +106,11 @@ merge_mates(const size_t suffix_len, const size_t range, // use the mate with the most info to fill in the overlap if (info_one >= info_two) - fill_overlap(pos_str, one, overlap_start, overlap_end, lim_one, seq, scr); + fill_overlap(pos_str, one, overlap_start, overlap_end, lim_one, seq, + scr); else - fill_overlap(pos_str, two, overlap_start, overlap_end, lim_one, seq, scr); + fill_overlap(pos_str, two, overlap_start, overlap_end, lim_one, seq, + scr); } } @@ -125,8 +126,7 @@ merge_mates(const size_t suffix_len, const size_t range, } inline static bool -same_read(const size_t suffix_len, - const MappedRead &a, const MappedRead &b) { +same_read(const size_t suffix_len, const MappedRead &a, const MappedRead &b) { const string sa(a.r.get_name()); const string sb(b.r.get_name()); return std::equal(sa.begin(), sa.end() - suffix_len, sb.begin()); @@ -149,9 +149,7 @@ get_read_name(const SAMRecord &aln, const size_t suffix_len) { int main(int argc, const char **argv) { - try { - string outfile; string mapper = "general"; size_t max_frag_len = 1000; @@ -164,17 +162,15 @@ main(int argc, const char **argv) { "Convert the SAM/BAM output from " "to MethPipe mapped read format", "sam/bam_file"); - opt_parse.add_opt("output", 'o', "Name of output file", - false, outfile); + opt_parse.add_opt("output", 'o', "Name of output file", false, outfile); // opt_parse.add_opt("mapper", 'm', // "Original mapper: bismark, bs_seeker or general", // true, mapper); opt_parse.add_opt("suff", 's', "read name suffix length (default: 1)", false, suffix_len); - opt_parse.add_opt("max-frag", 'L', "maximum allowed insert size", - false, max_frag_len); - opt_parse.add_opt("verbose", 'v', "print more information", - false, VERBOSE); + opt_parse.add_opt("max-frag", 'L', "maximum allowed insert size", false, + max_frag_len); + opt_parse.add_opt("verbose", 'v', "print more information", false, VERBOSE); vector leftover_args; 
opt_parse.parse(argc, argv, leftover_args); @@ -199,12 +195,13 @@ main(int argc, const char **argv) { /****************** END COMMAND LINE OPTIONS *****************/ std::ofstream of; - if (!outfile.empty()) of.open(outfile.c_str()); + if (!outfile.empty()) + of.open(outfile.c_str()); std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf()); if (VERBOSE) cerr << "[input file: " << mapped_reads_file << "]" << endl - << "[output file: " - << (outfile.empty() ? "stdout" : outfile) << "]" << endl; + << "[output file: " << (outfile.empty() ? "stdout" : outfile) << "]" + << endl; SAMReader_deprecated sam_reader(mapped_reads_file, mapper); unordered_map dangling_mates; @@ -223,15 +220,16 @@ main(int argc, const char **argv) { MappedRead merged; int len = 0; - merge_mates(suffix_len, max_frag_len, - dangling_mates[read_name].mr, aln.mr, merged, len); + merge_mates(suffix_len, max_frag_len, dangling_mates[read_name].mr, + aln.mr, merged, len); if (len <= static_cast(max_frag_len)) out << merged << endl; else if (len > 0) out << dangling_mates[read_name].mr << endl << aln.mr << endl; dangling_mates.erase(read_name); } - else dangling_mates[read_name] = aln; + else + dangling_mates[read_name] = aln; // flush dangling_mates if (dangling_mates.size() > max_dangling) { @@ -239,12 +237,14 @@ main(int argc, const char **argv) { for (auto &&mates : dangling_mates) if (mates.second.mr.r.get_chrom() < aln.mr.r.get_chrom() || (mates.second.mr.r.get_chrom() == aln.mr.r.get_chrom() && - mates.second.mr.r.get_end() + max_frag_len < aln.mr.r.get_start())) { + mates.second.mr.r.get_end() + max_frag_len < + aln.mr.r.get_start())) { if (!mates.second.is_Trich) revcomp(mates.second.mr); out << mates.second.mr << endl; } - else tmp[mates.first] = mates.second; + else + tmp[mates.first] = mates.second; swap(tmp, dangling_mates); } }