Skip to content

Commit

Permalink
Add base count subsampling functions
Browse files Browse the repository at this point in the history
  • Loading branch information
lczech committed Mar 6, 2024
1 parent f416add commit 91b2221
Show file tree
Hide file tree
Showing 8 changed files with 330 additions and 49 deletions.
99 changes: 96 additions & 3 deletions lib/genesis/population/functions/subsample.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "genesis/population/functions/subsample.hpp"

#include "genesis/population/functions/functions.hpp"
#include "genesis/utils/math/distribution.hpp"

#include <cassert>
#include <cmath>
Expand All @@ -43,7 +44,7 @@ namespace population {
// Subscaling
// =================================================================================================

void transform_subscale_max_coverage(
void transform_subscale(
BaseCounts& sample,
size_t max_coverage
) {
Expand All @@ -61,6 +62,11 @@ void transform_subscale_max_coverage(
size_t g_count = sample.g_count * scale;
size_t t_count = sample.t_count * scale;

// For completeness, we also scale the n and d counts, but they do not influence our counts,
// as we do not want them to dominate.
sample.n_count *= scale;
sample.d_count *= scale;

// Now we might have some remainder due to the integer rounding, which we want to proportionally
// distribute across the numbers, so that the largest count gets most. We only processed four
// numbers, so the remainder of the rounding is less than 4.
Expand Down Expand Up @@ -138,18 +144,105 @@ void transform_subscale_max_coverage(
}

// Now set the values of the sample to our computed counts.
assert( a_count + c_count + g_count + t_count == max_coverage );
sample.a_count = a_count;
sample.c_count = c_count;
sample.g_count = g_count;
sample.t_count = t_count;
}

void transform_subscale_max_coverage(
void transform_subscale(
Variant& variant,
size_t max_coverage
) {
// Call the transformation for each sample in the variant.
for( auto& sample : variant.samples ) {
transform_subscale( sample, max_coverage );
}
}

// =================================================================================================
// Subsampling
// =================================================================================================

/**
* @brief Local helper function to avoid code duplication. Takes the sampler (with or without
* replacement) and performs the resampling of base counts.
*/
template<typename Sampler>
void transform_subsample_(
BaseCounts& sample,
size_t max_coverage,
Sampler sampler
) {
// Get the total sum. If this does not exceed the max, we are done already.
// We do not want the n and d counts to influce the total coverage here.
size_t const total_sum = sample.a_count + sample.c_count + sample.g_count + sample.t_count;
if( total_sum <= max_coverage ) {
return;
}

// Make a random draw from a multinomial distrubiton with the counts.
// Here, we also take n and d into account for the resampling.
auto const new_counts = sampler(
std::vector<size_t>{{
sample.a_count,
sample.c_count,
sample.g_count,
sample.t_count,
sample.n_count,
sample.d_count
}},
max_coverage
);
assert( new_counts.size() == 6 );

// Set the sample counts
sample.a_count = new_counts[0];
sample.c_count = new_counts[1];
sample.g_count = new_counts[2];
sample.t_count = new_counts[3];
sample.n_count = new_counts[4];
sample.d_count = new_counts[5];
}

void transform_subsample_with_replacement(
BaseCounts& sample,
size_t max_coverage
) {
// Call the local helper function template, to avoid code duplication.
return transform_subsample_<std::vector<size_t>(*)(std::vector<size_t> const&, size_t)>(
sample, max_coverage, utils::multinomial_distribution
);
}

void transform_subsample_with_replacement(
Variant& variant,
size_t max_coverage
) {
// Call the transformation for each sample in the variant.
for( auto& sample : variant.samples ) {
transform_subsample_with_replacement( sample, max_coverage );
}
}

void transform_subsample_without_replacement(
BaseCounts& sample,
size_t max_coverage
) {
// Call the local helper function template, to avoid code duplication.
return transform_subsample_<std::vector<size_t>(*)(std::vector<size_t> const&, size_t)>(
sample, max_coverage, utils::multivariate_hypergeometric_distribution
);
}

void transform_subsample_without_replacement(
Variant& variant,
size_t max_coverage
) {
// Call the transformation for each sample in the variant.
for( auto& sample : variant.samples ) {
transform_subscale_max_coverage( sample, max_coverage );
transform_subsample_without_replacement( sample, max_coverage );
}
}

Expand Down
65 changes: 62 additions & 3 deletions lib/genesis/population/functions/subsample.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,77 @@ namespace population {
* This is for instance useful when computing diversity estimators, which have a runtime and memory
* cost that depends on the coverage. Hence, subscaling can reduce the overall runtime and memory
* usage, without significantly altering the results.
*
* @see transform_subsample_with_replacement()
* @see transform_subsample_without_replacement()
*/
void transform_subscale(
BaseCounts& sample,
size_t max_coverage
);

/**
* @copydoc transform_subscale( BaseCounts&, size_t )
*
* This overload acts on all Variant::samples in the given @p variant.
*/
void transform_subscale(
Variant& variant,
size_t max_coverage
);

/**
* @brief Transform a BaseCounts @p sample by subsampling the nucleotide counts (`A`, `C`, `G`, `T`)
* _with_ replacement to sum up to @p max if @p max_coverage is exceeded for the sample.
*
* If the sum of nucleotide counts (that is, BaseCounts::a_count, BaseCounts::c_count,
* BaseCounts::g_count, and BaseCounts::t_count) exceeds the given @p max_coverage, the counts
* are resampled _with_ replacement so that their sum is the given @p max_coverage. This uses
* @link ::genesis::utils::multinomial_distribution() multinomial_distribution()@endlink
* for the sampling. If the count sum is below, nothing is done.
*
* @see transform_subscale()
* @see transform_subsample_without_replacement()
*/
void transform_subsample_with_replacement(
BaseCounts& sample,
size_t max_coverage
);

/**
* @copydoc transform_subsample_with_replacement( BaseCounts&, size_t )
*
* This overload acts on all Variant::samples in the given @p variant.
*/
void transform_subsample_with_replacement(
Variant& variant,
size_t max_coverage
);

/**
* @brief Transform a BaseCounts @p sample by subsampling the nucleotide counts (`A`, `C`, `G`, `T`)
* _without_ replacement to sum up to @p max if @p max_coverage is exceeded for the sample.
*
* If the sum of nucleotide counts (that is, BaseCounts::a_count, BaseCounts::c_count,
* BaseCounts::g_count, and BaseCounts::t_count) exceeds the given @p max_coverage, the counts
* are resampled _without_ replacement so that their sum is the given @p max_coverage. This uses
* @link ::genesis::utils::multivariate_hypergeometric_distribution() multivariate_hypergeometric_distribution()@endlink
* for the sampling. If the count sum is below, nothing is done.
*
* @see transform_subscale()
* @see transform_subsample_with_replacement()
*/
void transform_subscale_max_coverage(
void transform_subsample_without_replacement(
BaseCounts& sample,
size_t max_coverage
);

/**
* @copydoc transform_subscale_max_coverage( BaseCounts&, size_t, bool )
* @copydoc transform_subsample_without_replacement( BaseCounts&, size_t )
*
* This overload acts on all Variant::samples in the given @p variant.
*/
void transform_subscale_max_coverage(
void transform_subsample_without_replacement(
Variant& variant,
size_t max_coverage
);
Expand Down
39 changes: 37 additions & 2 deletions lib/genesis/population/functions/variant_input_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,20 @@

#include "genesis/population/functions/variant_input_stream.hpp"

#include "genesis/population/functions/subsample.hpp"

#include <cassert>
#include <stdexcept>
#include <unordered_set>

namespace genesis {
namespace population {

// =================================================================================================
// Transforms & Filters
// Sample Name Filter
// =================================================================================================

std::vector<bool> make_sample_filter(
std::vector<bool> make_sample_name_filter(
std::vector<std::string> const& sample_names,
std::vector<std::string> const& names_filter,
bool inverse_filter
Expand Down Expand Up @@ -133,6 +136,38 @@ std::function<void(Variant&)> make_variant_input_stream_sample_name_filter_trans
};
}

// =================================================================================================
// Sample Subsetting / Subsampling
// =================================================================================================

std::function<void(Variant&)> make_variant_input_stream_sample_subsampling_transform(
size_t max_coverage,
SubsamplingMethod method
) {
// Simply return a function object that applies the transformation.
switch( method ) {
case SubsamplingMethod::kSubscale: {
return [ max_coverage ]( Variant& variant ){
transform_subscale( variant, max_coverage );
};
}
case SubsamplingMethod::kSubsampleWithReplacement: {
return [ max_coverage ]( Variant& variant ){
transform_subsample_with_replacement( variant, max_coverage );
};
}
case SubsamplingMethod::kSubsampleWithoutReplacement: {
return [ max_coverage ]( Variant& variant ){
transform_subsample_without_replacement( variant, max_coverage );
};
}
}

throw std::invalid_argument(
"Invalid method provided for make_variant_input_stream_sample_subsampling_transform()"
);
}

// =================================================================================================
// Observers
// =================================================================================================
Expand Down
56 changes: 53 additions & 3 deletions lib/genesis/population/functions/variant_input_stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace genesis {
namespace population {

// =================================================================================================
// Transforms & Filters
// Sample Name Filter
// =================================================================================================

/**
Expand All @@ -63,7 +63,7 @@ namespace population {
* the filtering might be wrong), and that the names in the @p names_filter actually appear in the
* @p sample_names.
*/
std::vector<bool> make_sample_filter(
std::vector<bool> make_sample_name_filter(
std::vector<std::string> const& sample_names,
std::vector<std::string> const& names_filter,
bool inverse_filter = false
Expand All @@ -74,7 +74,7 @@ std::vector<bool> make_sample_filter(
*
* The function expects a bool vector indicating which samples within a Variant to keep.
* The vector needs to have the same length as the Variant has samples. It can be created for instance
* with make_sample_filter() based on sample names. However, as Variant does not store the names
* with make_sample_name_filter() based on sample names. However, as Variant does not store the names
* itself, those might need to be accessed from the VariantInputStream data() function,
* which yields a VariantInputStreamData object.
*
Expand All @@ -86,6 +86,56 @@ std::function<void(Variant&)> make_variant_input_stream_sample_name_filter_trans
std::vector<bool> const& sample_filter
);

// =================================================================================================
// Sample Subsetting / Subsampling
// =================================================================================================

/**
* @brief Select which method to use for reducing the max coverage of a BaseCounts sample or a
* Variant.
*
* See make_variant_input_stream_sample_subsampling_transform() for usage.
*/
enum class SubsamplingMethod
{
/**
* @brief Use transform_subscale()
*/
kSubscale,

/**
* @brief Use transform_subsample_with_replacement()
*/
kSubsampleWithReplacement,

/**
* @brief Use transform_subsample_without_replacement()
*/
kSubsampleWithoutReplacement
};

/**
* @brief Create a Variant transformation function that subscales or subsamples the base counts
* to be below a given @p max_coverage.
*
* This is intended to be used as a transformation on a VariantInputStream, see
* @link ::genesis::utils::GenericInputStream::add_transform() add_transform()@endlink for details.
* The function creates a transformation function to be used on a stream, and subsamples or
* subscales the BaseCounts of each Variant in the stream, so that @p max_coverage is not
* exceeded. This is useful for instance when computing the pool sequencing diversity estimators,
* which have computational terms depending on coverage; reducing high coverage can hence help
* to improve computational time.
*
* By default, we use SubsamplingMethod::kSubscale, which is the closest to a lossless reduction
* of the coverage that can be achieved with integer counts. The two other methods instead
* resample from a distribution based on the given counts of the Variant, and can hence also be
* used to create in-silico alternative populations based on the original sample.
*/
std::function<void(Variant&)> make_variant_input_stream_sample_subsampling_transform(
size_t max_coverage,
SubsamplingMethod method = SubsamplingMethod::kSubscale
);

// =================================================================================================
// Observers
// =================================================================================================
Expand Down
Loading

0 comments on commit 91b2221

Please sign in to comment.