From 57b8f6762314c3021c08bea79e0d5fa6e6747603 Mon Sep 17 00:00:00 2001 From: Lucas Czech Date: Wed, 13 Mar 2024 14:48:11 -0300 Subject: [PATCH] Refine subsampling option and functions --- CMakeLists.txt | 2 +- libs/genesis | 2 +- src/options/variant_filter_region.cpp | 1 + src/options/variant_transform_subsample.cpp | 15 ++++++++++----- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c50976f..9f1755f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,7 +168,7 @@ include( "${CMAKE_CURRENT_LIST_DIR}/tools/cmake/DownloadDependency.cmake" ) # These are replaced by tools/cmake/update_dependencies.sh to the hashes that are currently checked out. # Thus, do not replace the hashes manually! SET( CLI11_COMMIT_HASH "5cb3efabce007c3a0230e4cc2e27da491c646b6c" ) #CLI11_COMMIT_HASH# -SET( genesis_COMMIT_HASH "91b2221df7921f080d362bbca134708fd962c864" ) #genesis_COMMIT_HASH# +SET( genesis_COMMIT_HASH "5aeac184fc0b5d78b72e944309a8d688d85fc581" ) #genesis_COMMIT_HASH# # Call the github download function, which takes four arguments: # - LIBPATH : Path to the libracy dir where dependencies are stored. diff --git a/libs/genesis b/libs/genesis index 91b2221..5aeac18 160000 --- a/libs/genesis +++ b/libs/genesis @@ -1 +1 @@ -Subproject commit 91b2221df7921f080d362bbca134708fd962c864 +Subproject commit 5aeac184fc0b5d78b72e944309a8d688d85fc581 diff --git a/src/options/variant_filter_region.cpp b/src/options/variant_filter_region.cpp index 4af198b..ed61c70 100644 --- a/src/options/variant_filter_region.cpp +++ b/src/options/variant_filter_region.cpp @@ -30,6 +30,7 @@ #include "genesis/population/formats/genome_region_reader.hpp" #include "genesis/population/formats/gff_reader.hpp" #include "genesis/population/formats/map_bim_reader.hpp" +#include "genesis/population/formats/vcf_common.hpp" #include "genesis/population/formats/vcf_input_stream.hpp" #include "genesis/population/functions/filter_transform.hpp" #include "genesis/population/functions/functions.hpp" diff --git a/src/options/variant_transform_subsample.cpp b/src/options/variant_transform_subsample.cpp index 1a33a49..d50f6cc 100644 --- a/src/options/variant_transform_subsample.cpp +++ b/src/options/variant_transform_subsample.cpp @@ -48,16 +48,21 @@ void VariantTransformSubsampleOptions::add_subsample_opts_to_app( ); // Rename samples option. + // See https://www.kofler.or.at/bioinformatic/wp-content/uploads/2018/07/pooledAnalysis_part1.pdf max_coverage_.option = sub->add_option( "--subsample-max-coverage", max_coverage_.value, "If provided, the nucleotide counts of each sample are subsampled so that they do not " - "exceed this given maximum total coverage (sum of the four nucleotide counts). " + "exceed this given maximum total coverage (sum of the four nucleotide counts, as well as " + "the any `N` and deleted `D` counts). " "If they are below this value anyway, they are not changed. " "This transformation is useful to limit the maximum coverage. For instance, the diversity " "estimators for Theta Pi and Theta Watterson have terms that depend on coverage. " "In particular when merging samples such as with `--sample-group-merge-table-file`, " - "having an upper limit can hence avoid long compute times." + "having an upper limit can hence avoid long compute times. " + "Furthermore, a very low Tajima's D, usually indicative of a selective sweep, may be found " + "as an artifact in highly covered regions, as such regions have just more sequencing errors. " + "To avoid these kinds of biases we recommend to subsample to an uniform coverage. " // "This transformation is applied after any filters, so that, e.g., filters high coverage " // "remove any unwanted positions first. See `--subsample-method` for the subsampling method." ); @@ -110,21 +115,21 @@ void VariantTransformSubsampleOptions::add_subsample_transformation( if( method == "subscale" ) { variant_input.add_combined_filter_and_transforms( [ max_coverage ]( Variant& variant ){ - transform_subscale( variant, max_coverage ); + subscale_counts( variant, max_coverage ); return true; } ); } else if( method == "subsample-with-replacement" ) { variant_input.add_combined_filter_and_transforms( [ max_coverage ]( Variant& variant ){ - transform_subsample_with_replacement( variant, max_coverage ); + subsample_counts_with_replacement( variant, max_coverage ); return true; } ); } else if( method == "subsample-without-replacement" ) { variant_input.add_combined_filter_and_transforms( [ max_coverage ]( Variant& variant ){ - transform_subsample_without_replacement( variant, max_coverage ); + subsample_counts_without_replacement( variant, max_coverage ); return true; } );