Skip to content

Commit

Permalink
Rework diversity command to use new tagging filters
Browse files Browse the repository at this point in the history
  • Loading branch information
lczech committed May 31, 2024
1 parent 6a50159 commit 3806574
Show file tree
Hide file tree
Showing 7 changed files with 294 additions and 388 deletions.
546 changes: 215 additions & 331 deletions src/commands/analyze/diversity.cpp

Large diffs are not rendered by default.

21 changes: 8 additions & 13 deletions src/commands/analyze/diversity.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact:
Lucas Czech <[email protected]>
Department of Plant Biology, Carnegie Institution For Science
260 Panama Street, Stanford, CA 94305, USA
Lucas Czech <[email protected]>
University of Copenhagen, Globe Institute, Section for GeoGenetics
Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
*/

#include "CLI/CLI.hpp"

#include "options/diversity_processor.hpp"
#include "options/file_output.hpp"
#include "options/poolsizes.hpp"
#include "options/table_output.hpp"
Expand All @@ -51,17 +52,11 @@ class DiversityOptions
VariantFilterNumericalOptions filter_numerical;
VariantTransformSubsampleOptions transform_subsample;
WindowOptions window;
PoolsizesOptions poolsizes;

// Using defaults from PoPoolation, see Variance-sliding.pl
CliOption<std::string> measure = "all";
// CliOption<size_t> min_count = 2;
// CliOption<size_t> min_read_depth = 4;
// CliOption<size_t> max_read_depth = 1000000;
// CliOption<double> min_read_depth_fraction = 0.6;

// Compatibility
CliOption<bool> with_popoolation_bugs = false;
// Specific settings
DiversityProcessorOptions diversity_processor;
CliOption<bool> no_extra_columns = false;
CliOption<bool> no_nan_windows = false;
CliOption<bool> popoolation_format = false;

// Output options
Expand Down
72 changes: 36 additions & 36 deletions src/commands/analyze/fst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ void setup_fst( CLI::App& app )
// -------------------------------------------------------------------------

// Fst Processor
options->fst_processor.add_fst_processor_opts_to_app( sub, true, "Settings" );
options->fst_processor.add_fst_processor_opts_to_app( sub, true );

// Settings: Write pi tables
options->write_pi_tables.option = sub->add_flag(
Expand Down Expand Up @@ -273,8 +273,8 @@ void prepare_output_files_(
(*target) << "chrom" << sep_char << "start" << sep_char << "end";

// Print the extra column headers if needed.
// Need to make sure that this is the same order as used in print_total_stats_()
// and print_sample_stats_(). Change all occurrences if needed.
// Need to make sure that this is the same order as used in write_total_stats_()
// and write_sample_stats_(). Change all occurrences if needed.
if( !options.no_extra_columns.value ) {
(*target) << sep_char << "total.masked";
(*target) << sep_char << "total.missing";
Expand Down Expand Up @@ -307,10 +307,10 @@ void prepare_output_files_(
}

// -------------------------------------------------------------------------
// print_total_stats_
// write_total_stats_
// -------------------------------------------------------------------------

void print_total_stats_(
void write_total_stats_(
FstOptions const& options,
genesis::population::VariantFilterStats const& variant_filter_stats,
std::shared_ptr<genesis::utils::BaseOutputTarget>& target
Expand Down Expand Up @@ -342,10 +342,10 @@ void print_total_stats_(
}

// -------------------------------------------------------------------------
// print_sample_stats_
// write_sample_stats_
// -------------------------------------------------------------------------

void print_sample_stats_(
void write_sample_stats_(
FstOptions const& options,
genesis::population::SampleCountsFilterStats const& sample_counts_filter_stats,
std::shared_ptr<genesis::utils::BaseOutputTarget>& target
Expand All @@ -369,7 +369,7 @@ void print_sample_stats_(
}

// -------------------------------------------------------------------------
// print_pairwise_value_list_
// write_pairwise_value_list_
// -------------------------------------------------------------------------

/**
Expand All @@ -378,7 +378,7 @@ void print_sample_stats_(
* This is meant for whole genome computations, where we have a single value per pair of samples.
* This function then prints these values, in rows of the form "Sample1 Sample2 value".
*/
void print_pairwise_value_list_(
void write_pairwise_value_list_(
FstOptions const& options,
std::string const& value_name,
::genesis::population::FstPoolProcessor const& processor,
Expand All @@ -400,8 +400,8 @@ void print_pairwise_value_list_(
(*target) << "first" << sep_char << "second";

// Print the additional column headers, if needed.
// Need to make sure that this is the same order as used in print_total_stats_()
// and print_sample_stats_(). Change all occurrences if needed.
// Need to make sure that this is the same order as used in write_total_stats_()
// and write_sample_stats_(). Change all occurrences if needed.
if( !options.no_extra_columns.value ) {
(*target) << sep_char << "total.masked";
(*target) << sep_char << "total.missing";
Expand All @@ -421,16 +421,16 @@ void print_pairwise_value_list_(
(*target) << sample_names[pair.first] << sep_char << sample_names[pair.second];

// Print the extra columns as needed.
print_total_stats_( options, processor.get_filter_stats(), target );
print_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
write_total_stats_( options, processor.get_filter_stats(), target );
write_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );

// Now print the actual value we want.
(*target) << sep_char << values[i] << "\n";
}
}

// -------------------------------------------------------------------------
// print_pairwise_value_matrix_
// write_pairwise_value_matrix_
// -------------------------------------------------------------------------

/**
Expand All @@ -440,7 +440,7 @@ void print_pairwise_value_list_(
* This function then prints these values, as a symmetrical matrix, with the samples in the rows
* and columns.
*/
void print_pairwise_value_matrix_(
void write_pairwise_value_matrix_(
FstOptions const& options,
std::string const& value_name,
std::vector<double> const& values,
Expand Down Expand Up @@ -470,10 +470,10 @@ void print_pairwise_value_matrix_(
}

// -------------------------------------------------------------------------
// print_output_line_
// write_output_line_
// -------------------------------------------------------------------------

void print_output_line_(
void write_output_line_(
FstOptions const& options,
::genesis::population::FstPoolProcessor const& processor,
VariantWindowView const& window,
Expand All @@ -492,11 +492,11 @@ void print_output_line_(
(*target) << window.chromosome();
(*target) << sep_char << window.first_position();
(*target) << sep_char << window.last_position();
print_total_stats_( options, processor.get_filter_stats(), target );
write_total_stats_( options, processor.get_filter_stats(), target );

// Write the per-pair FST values in the correct order.
for( size_t i = 0; i < values.size(); ++i ) {
print_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
write_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
if( std::isfinite( values[i] ) ) {
(*target) << sep_char << values[i];
} else {
Expand All @@ -507,15 +507,15 @@ void print_output_line_(
}

// -------------------------------------------------------------------------
// print_fst_files_
// write_fst_files_
// -------------------------------------------------------------------------

/**
* @brief Print the fst values for all sample pairs, for a given window.
*
* This function is called once per window, and writes out the per-pair results to the output file.
*/
void print_to_output_files_(
void write_to_output_files_(
FstOptions const& options,
::genesis::population::FstPoolProcessor const& processor,
VariantWindowView const& window,
Expand Down Expand Up @@ -559,15 +559,15 @@ void print_to_output_files_(
++state.skipped_count;
} else {
++state.used_count;
print_output_line_( options, processor, window, window_fst, state.fst_output_target );
write_output_line_( options, processor, window, window_fst, state.fst_output_target );
if( options.write_pi_tables.value ) {
print_output_line_(
write_output_line_(
options, processor, window, std::get<0>(*pi_vectors), state.pi_within_output_target
);
print_output_line_(
write_output_line_(
options, processor, window, std::get<1>(*pi_vectors), state.pi_between_output_target
);
print_output_line_(
write_output_line_(
options, processor, window, std::get<2>(*pi_vectors), state.pi_total_output_target
);
}
Expand All @@ -576,29 +576,29 @@ void print_to_output_files_(
// For whole genome, and then also for all-to-all, we produce special tables on top.
if( state.is_whole_genome ) {
internal_check( options.window.get_num_windows() == 1, "Whole genome with multiple windows" );
print_pairwise_value_list_( options, "fst", processor, window_fst, sample_pairs );
write_pairwise_value_list_( options, "fst", processor, window_fst, sample_pairs );
if( options.write_pi_tables.value ) {
print_pairwise_value_list_(
write_pairwise_value_list_(
options, "pi-within", processor, std::get<0>(*pi_vectors), sample_pairs
);
print_pairwise_value_list_(
write_pairwise_value_list_(
options, "pi-between", processor, std::get<1>(*pi_vectors), sample_pairs
);
print_pairwise_value_list_(
write_pairwise_value_list_(
options, "pi-total", processor, std::get<2>(*pi_vectors), sample_pairs
);
}

if( state.is_all_to_all ) {
print_pairwise_value_matrix_( options, "fst", window_fst, sample_pairs );
write_pairwise_value_matrix_( options, "fst", window_fst, sample_pairs );
if( options.write_pi_tables.value ) {
print_pairwise_value_matrix_(
write_pairwise_value_matrix_(
options, "pi-within", std::get<0>(*pi_vectors), sample_pairs
);
print_pairwise_value_matrix_(
write_pairwise_value_matrix_(
options, "pi-between", std::get<1>(*pi_vectors), sample_pairs
);
print_pairwise_value_matrix_(
write_pairwise_value_matrix_(
options, "pi-total", std::get<2>(*pi_vectors), sample_pairs
);
}
Expand Down Expand Up @@ -676,7 +676,7 @@ void run_fst( FstOptions const& options )
// Reset all calculator accumulators to zero for this window.
processor.reset();

// Compute diversity over samples.
// Compute FST between pairs of samples.
size_t processed_cnt = 0;
for( auto const& variant : window ) {
internal_check(
Expand All @@ -691,8 +691,8 @@ void run_fst( FstOptions const& options )
"Number of processed positions does not equal filter sum"
);

// Print the output to all files that we want.
print_to_output_files_( options, processor, window, sample_pairs, state );
// Write the output to all files that we want.
write_to_output_files_( options, processor, window, sample_pairs, state );
}
internal_check(
state.used_count + state.skipped_count == options.window.get_num_windows(),
Expand Down
4 changes: 2 additions & 2 deletions src/commands/analyze/fst.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class FstOptions

// Specific settings
FstProcessorOptions fst_processor;
CliOption<bool> write_pi_tables = false;
CliOption<bool> write_pi_tables = false;
CliOption<bool> no_extra_columns = false;
CliOption<bool> no_nan_windows = false;
CliOption<bool> no_nan_windows = false;

// Output options
TableOutputOptions table_output;
Expand Down
10 changes: 10 additions & 0 deletions src/options/diversity_processor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,16 @@ genesis::population::DiversityPoolProcessor DiversityProcessorOptions::get_diver
) const {
using namespace genesis::population;

// User input check
if( no_theta_pi.value && no_theta_watterson.value && no_tajima_d.value ) {
throw CLI::ValidationError(
no_theta_pi.option->get_name() + ", " +
no_theta_watterson.option->get_name() + ", " +
no_tajima_d.option->get_name(),
"All diversity metrics have been deactivate. Nothing is being computed."
);
}

// Get and check the pool sizes.
auto const pool_sizes = poolsizes.get_pool_sizes( sample_names );
internal_check(
Expand Down
20 changes: 18 additions & 2 deletions src/options/diversity_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,21 @@ class DiversityProcessorOptions
genesis::population::SampleCountsFilterNumericalParams filter_params
) const;

bool compute_theta_pi() const
{
return ! no_theta_pi.value;
}

bool compute_theta_watterson() const
{
return ! no_theta_watterson.value;
}

bool compute_tajima_d() const
{
return ! no_tajima_d.value;
}

// -------------------------------------------------------------------------
// Option Members
// -------------------------------------------------------------------------
Expand All @@ -98,11 +113,12 @@ class DiversityProcessorOptions
CliOption<bool> no_tajima_d = false;

// General settings
PoolsizesOptions poolsizes;
WindowAverageOptions window_average_policy;
PoolsizesOptions poolsizes;
WindowAverageOptions window_average_policy;

// Remnant settings. These are simply re-used from the numerical filters now.
// Kept here for reference, just in case they are needed for later.
// Using defaults from PoPoolation, see Variance-sliding.pl
// CliOption<size_t> min_count = 2;
// CliOption<size_t> min_read_depth = 4;
// CliOption<size_t> max_read_depth = 1000000;
Expand Down
9 changes: 5 additions & 4 deletions src/options/variant_transform_subsample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ void VariantTransformSubsampleOptions::add_subsample_opts_to_app(
"--subsample-max-read-depth",
max_read_depth_.value,
"If provided, the nucleotide counts of each sample are subsampled so that they do not "
"exceed this given maximum total read depth (sum of the four nucleotide counts, as well as "
"the any `N` and deleted `D` counts). "
"exceed this given maximum total read depth (sum of the four nucleotide counts `ACGT`, "
"as well as the any `N` and deleted `D` counts). "
"If they are below this value anyway, they are not changed. "
"This transformation is useful to limit the maximum read depth. For instance, the diversity "
"estimators for Theta Pi and Theta Watterson have terms that depend on read depth. "
Expand All @@ -63,8 +63,9 @@ void VariantTransformSubsampleOptions::add_subsample_opts_to_app(
"Furthermore, a very low Tajima's D, usually indicative of a selective sweep, may be found "
"as an artifact in highly covered regions, as such regions have just more sequencing errors. "
"To avoid these kinds of biases we recommend to subsample to an uniform read depth. "
// "This transformation is applied after any filters, so that, e.g., filters high read depth "
// "remove any unwanted positions first. See `--subsample-method` for the subsampling method."
"This transformation is applied after the numerical filters, so that, e.g., filters for "
"high read depth are able to remove any unwanted positions first. "
"See `--subsample-method` for the subsampling method."
);
max_read_depth_.option->group( group );

Expand Down

0 comments on commit 3806574

Please sign in to comment.