diff --git a/src/commands/analyze/diversity.cpp b/src/commands/analyze/diversity.cpp index bed4d1d..c5ca6be 100644 --- a/src/commands/analyze/diversity.cpp +++ b/src/commands/analyze/diversity.cpp @@ -74,6 +74,9 @@ void setup_diversity( CLI::App& app ) options->filter_numerical.sample_min_count.option->required(); options->filter_numerical.sample_min_count.option->check( CLI::PositiveNumber ); + // Also offer subsampling options for this command. + options->transform_subsample.add_subsample_opts_to_app( sub ); + // Settings for the windowing. options->window.add_window_opts_to_app( sub, true ); @@ -620,6 +623,12 @@ void run_diversity( DiversityOptions const& options ) filter.only_snps = true; filter = options.filter_numerical.get_sample_filter( filter ).first; + // TODO right now, the numerical filters are applied in the diversity calculator, instead + // of on the stream beforehand - this will change soon once we support variant filters + // with masking properly. but in the meantime, that means that the subsampling will be + // applied _before_ the max coverage... need to fix asap! + options.transform_subsample.add_subsample_transformation( options.variant_input ); + // Get all samples names from the input file. auto const& sample_names = options.variant_input.sample_names(); diff --git a/src/commands/analyze/diversity.hpp b/src/commands/analyze/diversity.hpp index a263bba..1036aab 100644 --- a/src/commands/analyze/diversity.hpp +++ b/src/commands/analyze/diversity.hpp @@ -3,7 +3,7 @@ /* grenedalf - Genome Analyses of Differential Allele Frequencies - Copyright (C) 2020-2023 Lucas Czech + Copyright (C) 2020-2024 Lucas Czech This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -31,6 +31,7 @@ #include "options/table_output.hpp" #include "options/variant_filter_numerical.hpp" #include "options/variant_input.hpp" +#include "options/variant_transform_subsample.hpp" #include "options/window.hpp" #include "tools/cli_option.hpp" @@ -46,10 +47,11 @@ class DiversityOptions public: // Input options - VariantInputOptions variant_input; - VariantFilterNumericalOptions filter_numerical; - WindowOptions window; - PoolsizesOptions poolsizes; + VariantInputOptions variant_input; + VariantFilterNumericalOptions filter_numerical; + VariantTransformSubsampleOptions transform_subsample; + WindowOptions window; + PoolsizesOptions poolsizes; // Using defaults from PoPoolation, see Variance-sliding.pl CliOption measure = "all"; diff --git a/src/options/variant_transform_subsample.cpp b/src/options/variant_transform_subsample.cpp new file mode 100644 index 0000000..1a33a49 --- /dev/null +++ b/src/options/variant_transform_subsample.cpp @@ -0,0 +1,137 @@ +/* + grenedalf - Genome Analyses of Differential Allele Frequencies + Copyright (C) 2020-2024 Lucas Czech + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + Contact: + Lucas Czech + Department of Plant Biology, Carnegie Institution For Science + 260 Panama Street, Stanford, CA 94305, USA +*/ + +#include "options/variant_transform_subsample.hpp" + +#include "options/global.hpp" +#include "tools/misc.hpp" + +#include "genesis/population/functions/subsample.hpp" +#include "genesis/population/functions/variant_input_stream.hpp" +#include "genesis/utils/text/string.hpp" + +#include +#include + +// ================================================================================================= +// Setup Functions +// ================================================================================================= + +void VariantTransformSubsampleOptions::add_subsample_opts_to_app( + CLI::App* sub, + std::string const& group +) { + // Correct setup check. + internal_check( + max_coverage_.option == nullptr, + "Cannot use the same VariantTransformSubsampleOptions object multiple times." + ); + + // Rename samples option. + max_coverage_.option = sub->add_option( + "--subsample-max-coverage", + max_coverage_.value, + "If provided, the nucleotide counts of each sample are subsampled so that they do not " + "exceed this given maximum total coverage (sum of the four nucleotide counts). " + "If they are below this value anyway, they are not changed. " + "This transformation is useful to limit the maximum coverage. For instance, the diversity " + "estimators for Theta Pi and Theta Watterson have terms that depend on coverage. " + "In particular when merging samples such as with `--sample-group-merge-table-file`, " + "having an upper limit can hence avoid long compute times." + // "This transformation is applied after any filters, so that, e.g., filters high coverage " + // "remove any unwanted positions first. See `--subsample-method` for the subsampling method." + ); + max_coverage_.option->group( group ); + + // Add option for sample name filter. + method_.option = sub->add_option( + "--subsample-method", + method_.value, + "When using `--subsample-max-coverage`, decide which method to use. The default `subscale` " + "simply re-scales the base counts to the given max coverage, and hence maintains the " + "allele frequencies (within integer precision). We recommend to use this to subsample to, " + "e.g., a max coverage of 10,000, which is a good compromise in most cases." + "The two alternative options re-sample instead, with and without replacement, by drawing " + "from a multinomial or multivariate hypergeometric distribution, respectively, based on " + "the original counts of the sample." + ); + method_.option->group( group ); + method_.option->needs( max_coverage_.option ); + method_.option->transform( + CLI::IsMember({ + "subscale", "subsample-with-replacement", "subsample-without-replacement" + }, CLI::ignore_case ) + ); +} + +// ================================================================================================= +// Run Functions +// ================================================================================================= + +// ------------------------------------------------------------------------- +// add_transform_subsample_filter +// ------------------------------------------------------------------------- + +void VariantTransformSubsampleOptions::add_subsample_transformation( + VariantInputOptions const& variant_input +) const { + using namespace genesis::population; + + // Nothing to do. + if( ! max_coverage_.option || ! *max_coverage_.option || max_coverage_.value == 0 ) { + return; + } + + // Add a transformation to the variant stream. + // We need to use a bool return value in the lambdas, + // as that's expected by the VariantInputOptions. + auto const method = genesis::utils::to_lower( method_.value ); + auto const max_coverage = max_coverage_.value; + if( method == "subscale" ) { + variant_input.add_combined_filter_and_transforms( + [ max_coverage ]( Variant& variant ){ + transform_subscale( variant, max_coverage ); + return true; + } + ); + } else if( method == "subsample-with-replacement" ) { + variant_input.add_combined_filter_and_transforms( + [ max_coverage ]( Variant& variant ){ + transform_subsample_with_replacement( variant, max_coverage ); + return true; + } + ); + } else if( method == "subsample-without-replacement" ) { + variant_input.add_combined_filter_and_transforms( + [ max_coverage ]( Variant& variant ){ + transform_subsample_without_replacement( variant, max_coverage ); + return true; + } + ); + } else { + throw CLI::ValidationError( + method_.option->get_name() + "(" + method_.value + ")", + "Invalid value." + ); + } +} diff --git a/src/options/variant_transform_subsample.hpp b/src/options/variant_transform_subsample.hpp new file mode 100644 index 0000000..213f671 --- /dev/null +++ b/src/options/variant_transform_subsample.hpp @@ -0,0 +1,89 @@ +#ifndef GRENEDALF_OPTIONS_VARIANT_TRANSFORM_SUBSAMPLE_H_ +#define GRENEDALF_OPTIONS_VARIANT_TRANSFORM_SUBSAMPLE_H_ + +/* + grenedalf - Genome Analyses of Differential Allele Frequencies + Copyright (C) 2020-2024 Lucas Czech + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + Contact: + Lucas Czech + Department of Plant Biology, Carnegie Institution For Science + 260 Panama Street, Stanford, CA 94305, USA +*/ + +#include "CLI/CLI.hpp" + +#include "tools/cli_option.hpp" +#include "options/variant_input.hpp" + +#include "genesis/population/streams/variant_input_stream.hpp" +#include "genesis/population/variant.hpp" + +#include +#include + +// ================================================================================================= +// Variant Transform Subsample Options +// ================================================================================================= + +/** + * @brief + */ +class VariantTransformSubsampleOptions +{ +public: + + // ------------------------------------------------------------------------- + // Constructor and Rule of Five + // ------------------------------------------------------------------------- + + VariantTransformSubsampleOptions() = default; + ~VariantTransformSubsampleOptions() = default; + + VariantTransformSubsampleOptions( VariantTransformSubsampleOptions const& ) = default; + VariantTransformSubsampleOptions( VariantTransformSubsampleOptions&& ) = default; + + VariantTransformSubsampleOptions& operator= ( VariantTransformSubsampleOptions const& ) = default; + VariantTransformSubsampleOptions& operator= ( VariantTransformSubsampleOptions&& ) = default; + + // ------------------------------------------------------------------------- + // Setup Functions + // ------------------------------------------------------------------------- + + void add_subsample_opts_to_app( + CLI::App* sub, + std::string const& group = "Sample Subsampling" + ); + + // ------------------------------------------------------------------------- + // Run Functions + // ------------------------------------------------------------------------- + + void add_subsample_transformation( VariantInputOptions const& variant_input ) const; + + // ------------------------------------------------------------------------- + // Option Members + // ------------------------------------------------------------------------- + +private: + + // Sample subsample transform + CliOption max_coverage_ = 0; + CliOption method_ = "subscale"; + +}; + +#endif // include guard