From 3806574513c25ea0e8f47372e53c630cc6a5754f Mon Sep 17 00:00:00 2001
From: Lucas Czech <luc.czech@gmail.com>
Date: Fri, 31 May 2024 17:27:56 +0200
Subject: [PATCH] Rework diversity command to use new tagging filters

---
 src/commands/analyze/diversity.cpp          | 546 ++++++++------------
 src/commands/analyze/diversity.hpp          |  21 +-
 src/commands/analyze/fst.cpp                |  72 +--
 src/commands/analyze/fst.hpp                |   4 +-
 src/options/diversity_processor.cpp         |  10 +
 src/options/diversity_processor.hpp         |  20 +-
 src/options/variant_transform_subsample.cpp |   9 +-
 7 files changed, 294 insertions(+), 388 deletions(-)
diff --git a/src/commands/analyze/diversity.cpp b/src/commands/analyze/diversity.cpp
index 46e03e2..32b466c 100644
--- a/src/commands/analyze/diversity.cpp
+++ b/src/commands/analyze/diversity.cpp
@@ -38,10 +38,8 @@
 
 #include <cassert>
 
-// Keep it simple...
 using namespace genesis::population;
 using namespace genesis::utils;
-using VariantWindowView = WindowView<genesis::population::Variant>;
 
 // =================================================================================================
 //      Setup
@@ -53,8 +51,8 @@ void setup_diversity( CLI::App& app )
     auto options = std::make_shared<DiversityOptions>();
     auto sub = app.add_subcommand(
         "diversity",
-        "Compute pool-sequencing corrected diversity measures Theta Pi, "
-        "Theta Watterson, and Tajima's D."
+        "Compute pool-sequencing corrected diversity measures "
+        "Theta Pi, Theta Watterson, and Tajima's D."
     );
 
     // -------------------------------------------------------------------------
@@ -66,12 +64,14 @@ void setup_diversity( CLI::App& app )
     // options->variant_input.add_sample_name_opts_to_app( sub );
     // options->variant_input.add_region_filter_opts_to_app( sub );
 
-    // Individual settings for numerical filtering.
-    // We do not add the SNP filters here as user options, as we want to do our own filtering later
-    // on, so that we can keep track of covered invariant sites for the relative theta computation.
-    // We also do not add total filters here, as diversity is computed per-sample.
+    // We do not add the only-SNP filter here as a user option, as we always filter out invariant
+    // (non-SNP) sites below anyway. They do not contribute to the diversity.
     options->filter_numerical.add_sample_count_filter_opts_to_app( sub );
     options->filter_numerical.add_sample_read_depth_filter_opts_to_app( sub );
+    options->filter_numerical.add_total_read_depth_filter_opts_to_app( sub );
+    options->filter_numerical.add_total_snp_filter_opts_to_app( sub, false, true );
+    options->filter_numerical.add_total_snp_count_opts_to_app( sub );
+    options->filter_numerical.add_total_freq_filter_opts_to_app( sub );
 
     // We always need the min count option, and it cannot be 0.
     options->filter_numerical.sample_min_count.option->required();
@@ -87,78 +87,27 @@ void setup_diversity( CLI::App& app )
     //     Settings
     // -------------------------------------------------------------------------
 
-    // Settings: Pool sizes
-    options->poolsizes.add_poolsizes_opt_to_app( sub );
+    // Diversity Processor
+    options->diversity_processor.add_diversity_processor_opts_to_app( sub );
 
-    // Measure
-    options->measure.option = sub->add_option(
-        "--measure",
-        options->measure.value,
-        "Diversity measure to compute."
+    // Settings: Omit Empty Windows
+    options->no_extra_columns.option = sub->add_flag(
+        "--no-extra-columns",
+        options->no_extra_columns.value,
+        "Do not output the extra columns containing counts for each position and sample pair "
+        "that summarize the effects of the filtering. Only the window coordinates and the fst values "
+        "are printed in that case."
     );
-    options->measure.option->group( "Settings" );
-    options->measure.option->transform(
-        CLI::IsMember({ "all", "theta-pi", "theta-watterson", "tajimas-d" }, CLI::ignore_case )
-    );
-
-    // Below options are deactivated, as we use the more standardized numerical filters instead now.
-    // However, we currently to not apply them in the input stream, as we want to keep track
-    // of invariant positions, so that for file formats that contain data for all variants,
-    // we can accurately compute the relative Theta values.
-
-    // // Minimum allele count
-    // options->min_count.option = sub->add_option(
-    //     "--min-allele-count",
-    //     options->min_count.value,
-    //     "Minimum allele count needed for a base count (`ACGT`) to be considered. Bases below this "
-    //     "count are filtered out. Used for the identification of SNPs."
-    // )->group( "Settings" );
-    //
-    // // Minimum read depth
-    // options->min_read_depth.option = sub->add_option(
-    //     "--min-read-depth",
-    //     options->min_read_depth.value,
-    //     "Minimum read depth of a site. Sites with a lower read depth will not be considered "
-    //     "for SNP identification and read depth estimation."
-    // )->group( "Settings" );
-    //
-    // // Maximum read depth
-    // options->max_read_depth.option = sub->add_option(
-    //     "--max-read-depth",
-    //     options->max_read_depth.value,
-    //     "Maximum read depth used for SNP identification. Read depth in ALL populations has to be lower "
-    //     "or equal to this threshold, otherwise no SNP will be called."
-    // )->group( "Settings" );
-
-    // TODO reactivate, but per sample, instead of for all populations
-
-    // TODO
-    // add tajima d method, and other normalization stuff
-
-    // Minimum read depth fraction
-    // options->min_read_depth_fraction.option = sub->add_option(
-    //     "--min-read-depth-fraction",
-    //     options->min_read_depth_fraction.value,
-    //     "Minimum read depth fraction of a window being between `--min-read-depth` and `--max-read-depth` "
-    //     "in ALL populations that needs to be reached in order to compute the diversity measures."
-    // )->group( "Settings" );
-
-    // -------------------------------------------------------------------------
-    //     Formatting and Compatibility
-    // -------------------------------------------------------------------------
-
-    // Add table output options.
-    auto sep_opt = options->table_output.add_separator_char_opt_to_app( sub );
-    auto nan_opt = options->table_output.add_na_entry_opt_to_app( sub );
-
-    // Add an option to purposely activate the PoPoolation bugs that we discovered.
-    options->with_popoolation_bugs.option = sub->add_flag(
-        "--popoolation-corrected-tajimas-d",
-        options->with_popoolation_bugs.value,
-        "If set, use the pool-sequencing correction term for Tajima's D as implemented in "
-        "PoPoolation <=1.2.2, which contains two major bugs that alter numerical results. "
-        "We here offer to imitate these bugs for comparability with PoPoolation."
-    )->group( "Compatibility" );
+    options->no_extra_columns.option->group( "Settings" );
+
+    // // Settings: Omit Empty Windows
+    // options->no_nan_windows.option = sub->add_flag(
+    //     "--no-nan-windows",
+    //     options->no_nan_windows.value,
+    //     "Do not output windows where all values are n/a. This is can be relevant "
+    //     "with small window sizes (or individual positions), to reduce output clutter."
+    // );
+    // options->no_nan_windows.option->group( "Settings" );
 
     // PoPoolation output format
     options->popoolation_format.option = sub->add_flag(
@@ -167,17 +116,21 @@ void setup_diversity( CLI::App& app )
         "If set, instead of writing one output table for all measures and all samples, "
         "write the results in separate files for each sample and for each measure of "
         "Theta Pi, Theta Watterson, and Tajima's D, following the format of PoPoolation."
-    )->group( "Compatibility" );
+    )->group( "Settings" );
+
+    // -------------------------------------------------------------------------
+    //     Output
+    // -------------------------------------------------------------------------
+
+    // Add table output options.
+    auto sep_opt = options->table_output.add_separator_char_opt_to_app( sub );
+    auto nan_opt = options->table_output.add_na_entry_opt_to_app( sub );
 
     // Exclude separator char option and na entry in PoPoolation compatibility mode,
     // as we have to use their values in that case.
     sep_opt->excludes( options->popoolation_format.option );
     nan_opt->excludes( options->popoolation_format.option );
 
-    // -------------------------------------------------------------------------
-    //     Output
-    // -------------------------------------------------------------------------
-
     // Output
     options->file_output.add_default_output_opts_to_app( sub );
     options->file_output.add_file_compress_opt_to_app( sub );
@@ -201,13 +154,13 @@ void setup_diversity( CLI::App& app )
 // =================================================================================================
 
 // -------------------------------------------------------------------------
-//     DiversityOutputData
+//     Diversity Command State
 // -------------------------------------------------------------------------
 
 /**
  * @brief Collect all output_data that we might need in one place.
  */
-struct DiversityOutputData
+struct DiversityCommandState
 {
     // Which values are actually being computed?
     bool compute_theta_pi;
@@ -217,6 +170,7 @@ struct DiversityOutputData
     // Store the column separator and nan entry, so that we do not need to look them up every time.
     char sep_char;
     std::string na_entry;
+    bool write_extra_columns;
 
     // We need all pointers, for our format and for the PoPoolation format,
     // in order to avoid code duplication in the output.
@@ -233,32 +187,27 @@ struct DiversityOutputData
 /**
  * @brief Prepare output file and write fixed header fields.
  */
-DiversityOutputData prepare_output_data_(
+DiversityCommandState prepare_output_data_(
     DiversityOptions const& options,
     std::vector<std::string> const& sample_names
 ) {
-    DiversityOutputData output_data;
+    DiversityCommandState output;
 
     // Get the measures to compute. At least one of them will be active.
-    auto const measure = to_lower( options.measure.value );
-    output_data.compute_theta_pi = ( measure == "all" || measure == "theta-pi" );
-    output_data.compute_theta_wa = ( measure == "all" || measure == "theta-watterson" );
-    output_data.compute_tajima_d = ( measure == "all" || measure == "tajimas-d" );
-    assert(
-        output_data.compute_theta_pi ||
-        output_data.compute_theta_wa ||
-        output_data.compute_tajima_d
-    );
+    output.compute_theta_pi = options.diversity_processor.compute_theta_pi();
+    output.compute_theta_wa = options.diversity_processor.compute_theta_watterson();
+    output.compute_tajima_d = options.diversity_processor.compute_tajima_d();
 
     // Get the separator char to use for table entries.
-    output_data.sep_char = options.table_output.get_separator_char();
-    output_data.na_entry = options.table_output.get_na_entry();
+    output.sep_char = options.table_output.get_separator_char();
+    output.na_entry = options.table_output.get_na_entry();
+    output.write_extra_columns = !options.no_extra_columns.value;
 
     // -----------------------------------
     //     Settings checks
     // -----------------------------------
 
-    if( output_data.compute_tajima_d ) {
+    if( output.compute_tajima_d ) {
         if( options.filter_numerical.sample_min_count.value != 2 ) {
             throw CLI::ValidationError(
                 options.filter_numerical.sample_min_count.option->get_name(),
@@ -284,17 +233,17 @@ DiversityOutputData prepare_output_data_(
 
     if( options.popoolation_format.value ) {
         for( auto const& sample_name : sample_names ) {
-            if( output_data.compute_theta_pi ) {
+            if( output.compute_theta_pi ) {
                 options.file_output.check_output_files_nonexistence(
                     "diversity-" + sample_name + "-theta-pi", "csv"
                 );
             }
-            if( output_data.compute_theta_wa ) {
+            if( output.compute_theta_wa ) {
                 options.file_output.check_output_files_nonexistence(
                     "diversity-" + sample_name + "-theta-watterson", "csv"
                 );
             }
-            if( output_data.compute_tajima_d ) {
+            if( output.compute_tajima_d ) {
                 options.file_output.check_output_files_nonexistence(
                     "diversity-" + sample_name + "-tajimas-d", "csv"
                 );
@@ -312,22 +261,22 @@ DiversityOutputData prepare_output_data_(
 
         // Open the three PoPoolation file formats for each sample.
         for( auto const& sample_name : sample_names ) {
-            if( output_data.compute_theta_pi ) {
-                output_data.popoolation_theta_pi_ofss.emplace_back(
+            if( output.compute_theta_pi ) {
+                output.popoolation_theta_pi_ofss.emplace_back(
                     options.file_output.get_output_target(
                         "diversity-" + sample_name + "-theta-pi", "csv"
                     )
                 );
             }
-            if( output_data.compute_theta_wa ) {
-                output_data.popoolation_theta_wa_ofss.emplace_back(
+            if( output.compute_theta_wa ) {
+                output.popoolation_theta_wa_ofss.emplace_back(
                     options.file_output.get_output_target(
                         "diversity-" + sample_name + "-theta-watterson", "csv"
                     )
                 );
             }
-            if( output_data.compute_tajima_d ) {
-                output_data.popoolation_tajima_d_ofss.emplace_back(
+            if( output.compute_tajima_d ) {
+                output.popoolation_tajima_d_ofss.emplace_back(
                     options.file_output.get_output_target(
                         "diversity-" + sample_name + "-tajimas-d", "csv"
                     )
@@ -342,95 +291,59 @@ DiversityOutputData prepare_output_data_(
     } else {
 
         // Open and prepare our table format
-        output_data.table_ofs = options.file_output.get_output_target( "diversity", "csv" );
-        (*output_data.table_ofs) << "chrom" << output_data.sep_char << "start" << output_data.sep_char << "end";
+        output.table_ofs = options.file_output.get_output_target( "diversity", "csv" );
+        (*output.table_ofs) << "chrom" << output.sep_char << "start" << output.sep_char << "end";
+
+        // Print the extra column headers if needed.
+        if( output.write_extra_columns ) {
+            (*output.table_ofs) << output.sep_char << "total.masked";
+            (*output.table_ofs) << output.sep_char << "total.missing";
+            (*output.table_ofs) << output.sep_char << "total.empty";
+            (*output.table_ofs) << output.sep_char << "total.numeric";
+            (*output.table_ofs) << output.sep_char << "total.invariant";
+            (*output.table_ofs) << output.sep_char << "total.passed";
+        }
 
         // Make the header per-sample fields.
         std::vector<std::string> fields;
-
-        // fields.push_back( "variant_count" );
-        // fields.push_back( "read_depth_count" );
-        fields.push_back( "snp_count" );
-        fields.push_back( "read_depth_fraction" );
-
-        if( output_data.compute_theta_pi ) {
-            // fields.push_back( "theta_pi_abs" );
+        if( output.write_extra_columns ) {
+            fields.push_back( "missing" );
+            fields.push_back( "numeric" );
+            fields.push_back( "passed" );
+        }
+        if( output.compute_theta_pi ) {
             fields.push_back( "theta_pi" );
         }
-        if( output_data.compute_theta_wa ) {
-            // fields.push_back( "theta_watterson_abs" );
+        if( output.compute_theta_wa ) {
             fields.push_back( "theta_watterson" );
         }
-        if( output_data.compute_tajima_d ) {
+        if( output.compute_tajima_d ) {
             fields.push_back( "tajimas_d" );
         }
 
         // Write all fields for all samples.
         for( auto const& sample : sample_names ) {
             for( auto const& field : fields ) {
-                (*output_data.table_ofs) << output_data.sep_char << sample << "." << field;
+                (*output.table_ofs) << output.sep_char << sample << "." << field;
             }
         }
-        (*output_data.table_ofs) << "\n";
+        (*output.table_ofs) << "\n";
     }
 
-    return output_data;
-}
-
-// -------------------------------------------------------------------------
-//     get_diversity_calculators_
-// -------------------------------------------------------------------------
-
-/**
- * @brief Get the calculators for all samples.
- */
-std::vector<genesis::population::DiversityPoolCalculator> get_diversity_calculators_(
-    DiversityOptions const& options,
-    SampleCountsFilterNumericalParams const& filter,
-    std::vector<std::string> const& sample_names,
-    std::vector<size_t> const& pool_sizes,
-    DiversityOutputData const& output_data
-) {
-    // Prepare pool settings for each sample.
-    // We here re-use the numerical filter settings as provided by the user.
-    DiversityPoolSettings pool_settings;
-    pool_settings.min_count    = filter.min_count;
-    pool_settings.min_read_depth = filter.min_read_depth;
-    pool_settings.max_read_depth = filter.max_read_depth;
-    pool_settings.tajima_denominator_policy
-        = options.with_popoolation_bugs.value
-        ? TajimaDenominatorPolicy::kWithPopoolationBugs
-        : TajimaDenominatorPolicy::kProvidedMinReadDepth
-    ;
-
-    // TODO
-    // add the other options here
-
-    std::vector<DiversityPoolCalculator> sample_diversity_calculators;
-    for( size_t i = 0; i < sample_names.size(); ++i ) {
-        sample_diversity_calculators.emplace_back(
-            pool_settings, pool_sizes[i]
-        );
-        sample_diversity_calculators.back().enable_theta_pi( output_data.compute_theta_pi );
-        sample_diversity_calculators.back().enable_theta_watterson( output_data.compute_theta_wa );
-        sample_diversity_calculators.back().enable_tajima_d( output_data.compute_tajima_d );
-    }
-    return sample_diversity_calculators;
+    return output;
 }
 
 // -------------------------------------------------------------------------
-//     write_output_popoolation_
+//     write_output_popoolation_line_
 // -------------------------------------------------------------------------
 
 /**
  * @brief Write output for PoPoolation style tables
  */
-void write_output_popoolation_(
-    std::vector<std::string> const& sample_names,
+void write_output_popoolation_line_(
     VariantWindowView const& window,
-    std::vector<DiversityPoolCalculator> const& sample_diversity_calculators,
-    std::vector<SampleCountsFilterStats> const& sample_filter_stats,
-    DiversityOutputData& output_data
+    genesis::population::DiversityPoolProcessor const& processor,
+    DiversityCommandState& output
 ) {
     using namespace genesis::population;
 
@@ -441,17 +354,16 @@ void write_output_popoolation_(
     auto write_popoolation_line_ = [](
         std::shared_ptr<genesis::utils::BaseOutputTarget>& ofs,
         VariantWindowView const& window,
-        SampleCountsFilterStats const& stats,
-        DiversityPoolCalculator::Result const& results,
+        VariantFilterCategoryStats const& variant_stats,
+        SampleCountsFilterCategoryStats const& sample_stats,
         double value
     ) {
-        (void) results; // TODO
-        // internal_check(
-        //     stats[SampleCountsFilterTag::kPassed] == results.processed_count,
-        //     "stats[SampleCountsFilterTag::kPassed] != results.processed_count"
-        // );
+        // TODO Check if that is actually the correct way to do it.
+        // This is a bit messy, as our approach of handling sample vs total filters differs
+        // from PoPoolation, but this is as close as we can get, I think.
         auto const read_depth = static_cast<double>(
-            stats[SampleCountsFilterTag::kPassed] + stats[SampleCountsFilterTag::kNotSnp]
+            variant_stats[VariantFilterTagCategory::kInvariant] +
+            sample_stats[SampleCountsFilterTagCategory::kPassed]
         );
         auto const window_width = static_cast<double>( window.width() );
         auto const read_depth_fraction = read_depth / window_width;
@@ -459,7 +371,7 @@ void write_output_popoolation_(
         // Write fixed columns.
         (*ofs) << window.chromosome();
         (*ofs) << "\t" << anchor_position( window, WindowAnchorType::kIntervalMidpoint );
-        (*ofs) << "\t" << stats[SampleCountsFilterTag::kPassed];
+        (*ofs) << "\t" << sample_stats[SampleCountsFilterTagCategory::kPassed];
         (*ofs) << "\t" << std::fixed << std::setprecision( 3 ) << read_depth_fraction;
         if( std::isfinite( value ) ) {
             (*ofs) << "\t" << std::fixed << std::setprecision( 9 ) << value;
@@ -469,135 +381,158 @@ void write_output_popoolation_(
         (*ofs) << "\n";
     };
 
+    // Get the results for the whole set of calculators.
+    auto const results = processor.get_result( window.width() );
+    auto const total_stats = variant_filter_stats_category_counts(
+        processor.get_filter_stats()
+    );
+
     // Write to all individual files for each sample and each value.
-    for( size_t i = 0; i < sample_names.size(); ++i ) {
-        auto const& div_calc = sample_diversity_calculators[i];
-        auto const div_calc_result = div_calc.get_result( window.width() );
+    for( size_t i = 0; i < processor.calculators().size(); ++i ) {
+        auto const& calculator = processor.calculators()[i];
+        auto const sample_stats = sample_counts_filter_stats_category_counts(
+            calculator.get_filter_stats()
+        );
 
         // Theta Pi
-        if( output_data.compute_theta_pi ) {
+        if( output.compute_theta_pi ) {
             write_popoolation_line_(
-                output_data.popoolation_theta_pi_ofss[i],
+                output.popoolation_theta_pi_ofss[i],
                 window,
-                sample_filter_stats[i],
-                div_calc_result,
-                div_calc_result.theta_pi
+                total_stats,
+                sample_stats,
+                results[i].theta_pi
             );
         }
 
         // Theta Watterson
-        if( output_data.compute_theta_wa ) {
+        if( output.compute_theta_wa ) {
             write_popoolation_line_(
-                output_data.popoolation_theta_wa_ofss[i],
+                output.popoolation_theta_wa_ofss[i],
                 window,
-                sample_filter_stats[i],
-                div_calc_result,
-                div_calc_result.theta_watterson
+                total_stats,
+                sample_stats,
+                results[i].theta_watterson
             );
         }
 
         // Tajima's D
-        if( output_data.compute_tajima_d ) {
+        if( output.compute_tajima_d ) {
             write_popoolation_line_(
-                output_data.popoolation_tajima_d_ofss[i],
+                output.popoolation_tajima_d_ofss[i],
                 window,
-                sample_filter_stats[i],
-                div_calc_result,
-                div_calc_result.tajima_d
+                total_stats,
+                sample_stats,
+                results[i].tajima_d
             );
         }
     }
 }
 
 // -------------------------------------------------------------------------
-//     write_output_table_
+//     write_output_table_line_
 // -------------------------------------------------------------------------
 
 /**
  * @brief Write output for our table format
  */
-void write_output_table_(
-    std::vector<std::string> const& sample_names,
+void write_output_table_line_(
     VariantWindowView const& window,
-    std::vector<DiversityPoolCalculator> const& sample_diversity_calculators,
-    std::vector<SampleCountsFilterStats> const& sample_filter_stats,
-    DiversityOutputData& output_data
+    genesis::population::DiversityPoolProcessor const& processor,
+    DiversityCommandState& output
 ) {
     // Shorthand
     using namespace genesis::population;
-    auto& table_ofs = output_data.table_ofs;
+    auto& table_ofs = output.table_ofs;
+    auto const sep_char = output.sep_char;
 
     // Helper function to write a field value to one of the tables.
     auto write_table_field_ = [&]( double value ){
         if( std::isfinite( value ) ) {
-            (*table_ofs) << output_data.sep_char;
+            (*table_ofs) << sep_char;
             (*table_ofs) << std::defaultfloat << std::setprecision( 9 ) << value;
         } else {
-            (*table_ofs) << output_data.sep_char << output_data.na_entry;
+            (*table_ofs) << sep_char << output.na_entry;
         }
     };
 
     // Write fixed columns.
     (*table_ofs) << window.chromosome();
-    (*table_ofs) << output_data.sep_char << window.first_position();
-    (*table_ofs) << output_data.sep_char << window.last_position();
+    (*table_ofs) << sep_char << window.first_position();
+    (*table_ofs) << sep_char << window.last_position();
+    if( output.write_extra_columns ) {
+        // Summarize the exact statistics into their category summaries.
+        // Good enough for the user output here. We do not need to know which exact filter
+        // failed each time; it is good enough that it was a numerical one etc.
+        auto const total_stats = variant_filter_stats_category_counts(
+            processor.get_filter_stats()
+        );
+        // Print the counters.
+        // This obviously requires the column header line to exactly match this order.
+        // Change both if needed, throughout this file.
+        (*table_ofs) << sep_char << total_stats[ VariantFilterTagCategory::kMasked ];
+        (*table_ofs) << sep_char << total_stats[ VariantFilterTagCategory::kMissingInvalid ];
+        (*table_ofs) << sep_char << total_stats[ VariantFilterTagCategory::kSamplesFailed ];
+        (*table_ofs) << sep_char << total_stats[ VariantFilterTagCategory::kNumeric ];
+        (*table_ofs) << sep_char << total_stats[ VariantFilterTagCategory::kInvariant ];
+        (*table_ofs) << sep_char << total_stats[ VariantFilterTagCategory::kPassed ];
+    }
+
+    // Get the results for the whole set of calculators.
+    auto const results = processor.get_result( window.width() );
+
+    // Write to all individual files for each sample and each value.
+    for( size_t i = 0; i < processor.calculators().size(); ++i ) {
+    }
 
     // Write the per-pair diversity values in the correct order.
-    for( size_t i = 0; i < sample_names.size(); ++i ) {
-        auto const read_depth = sample_filter_stats[i][SampleCountsFilterTag::kPassed] + sample_filter_stats[i][SampleCountsFilterTag::kNotSnp];
-        auto const window_width = static_cast<double>( window.width() );
-        auto const read_depth_fraction = static_cast<double>( read_depth ) / window_width;
-
-        auto const& div_calc = sample_diversity_calculators[i];
-        auto const div_calc_result = div_calc.get_result( read_depth );
-        // auto const div_calc_result = div_calc.get_result( window.width() );
-
-        // Meta info per sample
-        // (*table_ofs) << output_data.sep_char << div_calc.variant_count;
-        // (*table_ofs) << output_data.sep_char << div_calc.read_depth_count;
-        (*table_ofs) << output_data.sep_char << "TODO"; // TODO sample_filter_stats[SampleCountsFilterTag::kPassed]; // div_calc_result.processed_count;
-        (*table_ofs) << output_data.sep_char << std::fixed << std::setprecision( 3 )
-                     << read_depth_fraction;
-
-        // Values
-        if( output_data.compute_theta_pi ) {
-            // write_table_field_( div_calc_result.theta_pi_absolute );
-            write_table_field_( div_calc_result.theta_pi );
+    for( size_t i = 0; i < processor.calculators().size(); ++i ) {
+        auto const& calculator = processor.calculators()[i];
+        auto const sample_stats = sample_counts_filter_stats_category_counts(
+            calculator.get_filter_stats()
+        );
+
+        // Write all columns. The extra ones need to match the ones written in the header,
+        // so if either needs updating, both need to be changed.
+        if( output.write_extra_columns ) {
+            (*table_ofs) << sep_char << sample_stats[ SampleCountsFilterTagCategory::kMissingInvalid ];
+            (*table_ofs) << sep_char << sample_stats[ SampleCountsFilterTagCategory::kNumeric ];
+            (*table_ofs) << sep_char << sample_stats[ SampleCountsFilterTagCategory::kPassed ];
         }
-        if( output_data.compute_theta_wa ) {
-            // write_table_field_( div_calc_result.theta_watterson_absolute );
-            write_table_field_( div_calc_result.theta_watterson );
+        if( output.compute_theta_pi ) {
+            write_table_field_( results[i].theta_pi );
         }
-        if( output_data.compute_tajima_d ) {
-            write_table_field_( div_calc_result.tajima_d );
+        if( output.compute_theta_wa ) {
+            write_table_field_( results[i].theta_watterson );
+        }
+        if( output.compute_tajima_d ) {
+            write_table_field_( results[i].tajima_d );
         }
     }
     (*table_ofs) << "\n";
 }
 
 // -------------------------------------------------------------------------
-//     write_output_
+//     write_output_line_
 // -------------------------------------------------------------------------
 
 /**
- * @brief Write output to output_data
+ * @brief Write output to output
  */
-void write_output_(
+void write_output_line_(
     DiversityOptions const& options,
-    std::vector<std::string> const& sample_names,
     VariantWindowView const& window,
-    std::vector<DiversityPoolCalculator> const& sample_diversity_calculators,
-    std::vector<SampleCountsFilterStats> const& sample_filter_stats,
-    DiversityOutputData& output_data
+    genesis::population::DiversityPoolProcessor const& processor,
+    DiversityCommandState& output
 ) {
     // Write the data, depending on the format.
     if( options.popoolation_format.value ) {
-        write_output_popoolation_(
-            sample_names, window, sample_diversity_calculators, sample_filter_stats, output_data
+        write_output_popoolation_line_(
+            window, processor, output
         );
     } else {
-        write_output_table_(
-            sample_names, window, sample_diversity_calculators, sample_filter_stats, output_data
+        write_output_table_line_(
+            window, processor, output
         );
     }
 }
@@ -609,122 +544,71 @@ void write_output_(
 void run_diversity( DiversityOptions const& options )
 {
     // -------------------------------------------------------------------------
-    //     Settings
+    //     Options Preparation
     // -------------------------------------------------------------------------
 
-    // Warn the user about the PoPoolation bugs.
-    if( options.with_popoolation_bugs.value ) {
-        LOG_WARN << "Option `" << options.with_popoolation_bugs.option->get_name()
-                 << "` was activated. Note that this yields numerically different results "
-                 << "from the intended measure as described by the publication of Kofler et al, "
-                 << "and that this option is provided solely for comparability with results "
-                 << "obtained from PoPoolation.";
-    }
-
-    // Add a filter for just SNPs, per sample. Invariant sites don't change the absolute diversity.
-    // Right now, we however use the invariant sites to compute relative Theta, and so we do not
-    // want to filter them out beforehand. This is instead done in the actual processing,
-    // sample by sample. Bit slower, but not by much.
-    // SampleCountsFilterNumericalParams snp_filter;
-    // snp_filter.only_snps = true;
-    // options.variant_input.add_combined_filter_and_transforms(
-    //     [snp_filter]( genesis::population::Variant& variant ){
-    //         return genesis::population::filter_base_counts( variant, snp_filter );
-    //     }
-    // );
+    // Before accessing the variant input, we need to add the filters to it.
+    // We use a variant filter that always filters out non-SNP positions here.
+    // There might still be samples where a position is invariant,
+    // but that's okay, as that will be caught by the diversity computation anyway.
+    options.variant_input.add_combined_filter_and_transforms(
+        options.filter_numerical.make_sample_filter()
+    );
+    genesis::population::VariantFilterNumericalParams total_filter;
+    total_filter.only_snps = true;
+    options.variant_input.add_combined_filter_and_transforms(
+        options.filter_numerical.make_total_filter( total_filter )
+    );
 
-    // Prepare the base counts filter. We default the snps filter,
-    // and then override everything else with the user provided values.
-    SampleCountsFilterNumericalParams filter_params;
-    filter_params.only_snps = true;
-    filter_params = options.filter_numerical.get_sample_filter_params( filter_params ).first;
+    // TODO Previously, we used SampleCountsFilterNumericalParams::only_snps here instead of the
+    // only_snps filter for the total Variant. This is what we documented now, and the old
+    // behaviour can be achieved by simply running each sample indivudally. But maybe there is a
+    // better way, where this can be made a user option instead?
 
-    // TODO right now, the numerical filters are applied in the diversity calculator, instead
-    // of on the stream beforehand - this will change soon once we support variant filters
-    // with masking properly. but in the meantime, that means that the subsampling will be
-    // applied _before_ the max read_depth... need to fix asap!
+    // Lastly, apply the subsampling. It is important that this happens after the above numercial
+    // filters above, as otherwise we might subsample to a lower read depth, and then want to apply
+    // a read depth filter, which would not work any more.
     options.transform_subsample.add_subsample_transformation( options.variant_input );
 
-    // Get all samples names from the input file.
+    // Get all samples names from the input file, and make a processor for them.
     auto const& sample_names = options.variant_input.sample_names();
-
-    // Get the pool sizes.
-    auto const pool_sizes = options.poolsizes.get_pool_sizes( sample_names );
-    internal_check(
-        pool_sizes.size() == sample_names.size(),
-        "Inconsistent number of pool sizes and samples."
+    auto processor = options.diversity_processor.get_diversity_pool_processor(
+        sample_names, options.filter_numerical.get_sample_filter_params().first
     );
 
     // Prepare file output_data and print headers.
     auto output_data = prepare_output_data_( options, sample_names );
 
-    // Prepare pool settings for each sample.
-    auto sample_diversity_calculators = get_diversity_calculators_(
-        options, filter_params, sample_names, pool_sizes, output_data
-    );
-    std::vector<SampleCountsFilterStats> sample_filter_stats{ sample_names.size() };
-
     // -------------------------------------------------------------------------
     //     Main Loop
     // -------------------------------------------------------------------------
 
-    // Iterate the file and compute per-window diversitye measures.
+    // Iterate the file and compute per-window diversity measures.
     // We run the samples in parallel, storing their results before writing to the output file.
-    // For now, we compute all of them, in not the very most efficient way, but the easiest.
     auto window_it = options.window.get_variant_window_view_stream( options.variant_input );
     for( auto cur_it = window_it->begin(); cur_it != window_it->end(); ++cur_it ) {
-        auto& window = *cur_it;
+        auto const& window = *cur_it;
 
         // Reset all calculator accumulators to zero for this window.
-        for( size_t i = 0; i < sample_diversity_calculators.size(); ++i ) {
-            sample_diversity_calculators[i].reset();
-            sample_filter_stats[i].clear();
-        }
+        processor.reset();
 
         // Compute diversity over samples.
-        // #pragma omp parallel for
-        for( auto& variant : window ) {
+        size_t processed_cnt = 0;
+        for( auto const& variant : window ) {
             internal_check(
                 variant.samples.size() == sample_names.size(),
                 "Inconsistent number of samples in input file."
             );
-
-            // TODO refine the below to apply the filtering beforehand?!
-            // that would make computing relative theta more tricky though.
-
-            // Compute diversity for each sample.
-            for( size_t i = 0; i < sample_names.size(); ++i ) {
-                if( apply_sample_counts_filter_numerical(
-                    variant.samples[i], filter_params, sample_filter_stats[i] )
-                ) {
-                    sample_diversity_calculators[i].process( variant.samples[i] );
-                }
-            }
-
-            // It does not help here to parallelize, at least with the current layout...
-            // We lose too much to the synchronization overhead.
-            // if( sample_names.size() > 1 ) {
-            //     global_options.thread_pool()->parallel_for(
-            //         0, sample_names.size(),
-            //         [&]( size_t i ){
-            //             // Currently, we need to do some filtering here...
-            //             if( filter_base_counts( variant.samples[i], filter, sample_filter_stats[i] )) {
-            //                 sample_diversity_calculators[i].process( variant.samples[i] );
-            //             }
-            //         }
-            //     ).wait();
-            // } else if( sample_names.size() == 1 ) {
-            //     if( filter_base_counts( variant.samples[0], filter, sample_filter_stats[0] )) {
-            //         sample_diversity_calculators[0].process( variant.samples[0] );
-            //     }
-            // }
+            processor.process( variant );
+            ++processed_cnt;
         }
+        internal_check(
+            processed_cnt == processor.get_filter_stats().sum(),
+            "Number of processed positions does not equal filter sum"
+        );
 
         // Write the output to files.
-        write_output_(
-            options, sample_names, window,
-            sample_diversity_calculators, sample_filter_stats, output_data
-        );
+        write_output_line_( options, window, processor, output_data );
     }
 
     // Final user output.
diff --git a/src/commands/analyze/diversity.hpp b/src/commands/analyze/diversity.hpp
index d591ddb..4d3fa8b 100644
--- a/src/commands/analyze/diversity.hpp
+++ b/src/commands/analyze/diversity.hpp
@@ -19,13 +19,14 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
     Contact:
-    Lucas Czech <lczech@carnegiescience.edu>
-    Department of Plant Biology, Carnegie Institution For Science
-    260 Panama Street, Stanford, CA 94305, USA
+    Lucas Czech <lucas.czech@sund.ku.dk>
+    University of Copenhagen, Globe Institute, Section for GeoGenetics
+    Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
 */
 
 #include "CLI/CLI.hpp"
 
+#include "options/diversity_processor.hpp"
 #include "options/file_output.hpp"
 #include "options/poolsizes.hpp"
 #include "options/table_output.hpp"
@@ -51,17 +52,11 @@ class DiversityOptions
     VariantFilterNumericalOptions    filter_numerical;
     VariantTransformSubsampleOptions transform_subsample;
     WindowOptions                    window;
-    PoolsizesOptions                 poolsizes;
 
-    // Using defaults from PoPoolation, see Variance-sliding.pl
-    CliOption<std::string> measure = "all";
-    // CliOption<size_t> min_count = 2;
-    // CliOption<size_t> min_read_depth = 4;
-    // CliOption<size_t> max_read_depth = 1000000;
-    // CliOption<double> min_read_depth_fraction = 0.6;
-
-    // Compatibility
-    CliOption<bool> with_popoolation_bugs = false;
+    // Specific settings
+    DiversityProcessorOptions diversity_processor;
+    CliOption<bool> no_extra_columns   = false;
+    CliOption<bool> no_nan_windows     = false;
     CliOption<bool> popoolation_format = false;
 
     // Output options
diff --git a/src/commands/analyze/fst.cpp b/src/commands/analyze/fst.cpp
index d6fb9d2..608bc11 100644
--- a/src/commands/analyze/fst.cpp
+++ b/src/commands/analyze/fst.cpp
@@ -102,7 +102,7 @@ void setup_fst( CLI::App& app )
     // -------------------------------------------------------------------------
 
     // Fst Processor
-    options->fst_processor.add_fst_processor_opts_to_app( sub, true, "Settings" );
+    options->fst_processor.add_fst_processor_opts_to_app( sub, true );
 
     // Settings: Write pi tables
     options->write_pi_tables.option = sub->add_flag(
@@ -273,8 +273,8 @@ void prepare_output_files_(
         (*target) << "chrom" << sep_char << "start" << sep_char << "end";
 
         // Print the extra column headers if needed.
-        // Need to make sure that this is the same order as used in print_total_stats_()
-        // and print_sample_stats_(). Change all occurrences if needed.
+        // Need to make sure that this is the same order as used in write_total_stats_()
+        // and write_sample_stats_(). Change all occurrences if needed.
         if( !options.no_extra_columns.value ) {
             (*target) << sep_char << "total.masked";
             (*target) << sep_char << "total.missing";
@@ -307,10 +307,10 @@ void prepare_output_files_(
 }
 
 // -------------------------------------------------------------------------
-//     print_total_stats_
+//     write_total_stats_
 // -------------------------------------------------------------------------
 
-void print_total_stats_(
+void write_total_stats_(
     FstOptions const& options,
     genesis::population::VariantFilterStats const& variant_filter_stats,
     std::shared_ptr<genesis::utils::BaseOutputTarget>& target
@@ -342,10 +342,10 @@ void print_total_stats_(
 }
 
 // -------------------------------------------------------------------------
-//     print_sample_stats_
+//     write_sample_stats_
 // -------------------------------------------------------------------------
 
-void print_sample_stats_(
+void write_sample_stats_(
     FstOptions const& options,
     genesis::population::SampleCountsFilterStats const& sample_counts_filter_stats,
     std::shared_ptr<genesis::utils::BaseOutputTarget>& target
@@ -369,7 +369,7 @@ void print_sample_stats_(
 }
 
 // -------------------------------------------------------------------------
-//     print_pairwise_value_list_
+//     write_pairwise_value_list_
 // -------------------------------------------------------------------------
 
 /**
@@ -378,7 +378,7 @@ void print_sample_stats_(
  * This is meant for whole genome computations, where we have a single value per pair of samples.
  * This function then prints these values, in rows of the form "Sample1 Sample2 value".
  */
-void print_pairwise_value_list_(
+void write_pairwise_value_list_(
     FstOptions const& options,
     std::string const& value_name,
     ::genesis::population::FstPoolProcessor const& processor,
@@ -400,8 +400,8 @@ void print_pairwise_value_list_(
     (*target) << "first" << sep_char << "second";
 
     // Print the additional column headers, if needed.
-    // Need to make sure that this is the same order as used in print_total_stats_()
-    // and print_sample_stats_(). Change all occurrences if needed.
+    // Need to make sure that this is the same order as used in write_total_stats_()
+    // and write_sample_stats_(). Change all occurrences if needed.
     if( !options.no_extra_columns.value ) {
         (*target) << sep_char << "total.masked";
         (*target) << sep_char << "total.missing";
@@ -421,8 +421,8 @@ void print_pairwise_value_list_(
         (*target) << sample_names[pair.first] << sep_char << sample_names[pair.second];
 
         // Print the extra columns as needed.
-        print_total_stats_( options, processor.get_filter_stats(), target );
-        print_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
+        write_total_stats_( options, processor.get_filter_stats(), target );
+        write_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
 
         // Now print the actual value we want.
         (*target) << sep_char << values[i] << "\n";
@@ -430,7 +430,7 @@ void print_pairwise_value_list_(
 }
 
 // -------------------------------------------------------------------------
-//     print_pairwise_value_matrix_
+//     write_pairwise_value_matrix_
 // -------------------------------------------------------------------------
 
 /**
@@ -440,7 +440,7 @@ void print_pairwise_value_list_(
  * This function then prints these values, as a symmetrical matrix, with the samples in the rows
  * and columns.
  */
-void print_pairwise_value_matrix_(
+void write_pairwise_value_matrix_(
     FstOptions const& options,
     std::string const& value_name,
     std::vector<double> const& values,
@@ -470,10 +470,10 @@ void print_pairwise_value_matrix_(
 }
 
 // -------------------------------------------------------------------------
-//     print_output_line_
+//     write_output_line_
 // -------------------------------------------------------------------------
 
-void print_output_line_(
+void write_output_line_(
     FstOptions const& options,
     ::genesis::population::FstPoolProcessor const& processor,
     VariantWindowView const& window,
@@ -492,11 +492,11 @@ void print_output_line_(
     (*target) << window.chromosome();
     (*target) << sep_char << window.first_position();
     (*target) << sep_char << window.last_position();
-    print_total_stats_( options, processor.get_filter_stats(), target );
+    write_total_stats_( options, processor.get_filter_stats(), target );
 
     // Write the per-pair FST values in the correct order.
     for( size_t i = 0; i < values.size(); ++i ) {
-        print_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
+        write_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
         if( std::isfinite( values[i] ) ) {
             (*target) << sep_char << values[i];
         } else {
@@ -507,7 +507,7 @@ void print_output_line_(
 }
 
 // -------------------------------------------------------------------------
-//     print_fst_files_
+//     write_fst_files_
 // -------------------------------------------------------------------------
 
 /**
@@ -515,7 +515,7 @@ void print_output_line_(
  *
  * This function is called once per window, and writes out the per-pair results to the output file.
  */
-void print_to_output_files_(
+void write_to_output_files_(
     FstOptions const& options,
     ::genesis::population::FstPoolProcessor const& processor,
     VariantWindowView const& window,
@@ -559,15 +559,15 @@ void print_to_output_files_(
         ++state.skipped_count;
     } else {
         ++state.used_count;
-        print_output_line_( options, processor, window, window_fst, state.fst_output_target );
+        write_output_line_( options, processor, window, window_fst, state.fst_output_target );
         if( options.write_pi_tables.value ) {
-            print_output_line_(
+            write_output_line_(
                 options, processor, window, std::get<0>(*pi_vectors), state.pi_within_output_target
             );
-            print_output_line_(
+            write_output_line_(
                 options, processor, window, std::get<1>(*pi_vectors), state.pi_between_output_target
             );
-            print_output_line_(
+            write_output_line_(
                 options, processor, window, std::get<2>(*pi_vectors), state.pi_total_output_target
             );
         }
@@ -576,29 +576,29 @@ void print_to_output_files_(
     // For whole genome, and then also for all-to-all, we produce special tables on top.
     if( state.is_whole_genome ) {
         internal_check( options.window.get_num_windows() == 1, "Whole genome with multiple windows" );
-        print_pairwise_value_list_( options, "fst", processor, window_fst, sample_pairs );
+        write_pairwise_value_list_( options, "fst", processor, window_fst, sample_pairs );
         if( options.write_pi_tables.value ) {
-            print_pairwise_value_list_(
+            write_pairwise_value_list_(
                 options, "pi-within",  processor, std::get<0>(*pi_vectors), sample_pairs
             );
-            print_pairwise_value_list_(
+            write_pairwise_value_list_(
                 options, "pi-between", processor, std::get<1>(*pi_vectors), sample_pairs
             );
-            print_pairwise_value_list_(
+            write_pairwise_value_list_(
                 options, "pi-total",   processor, std::get<2>(*pi_vectors), sample_pairs
             );
         }
 
         if( state.is_all_to_all ) {
-            print_pairwise_value_matrix_( options, "fst", window_fst, sample_pairs );
+            write_pairwise_value_matrix_( options, "fst", window_fst, sample_pairs );
             if( options.write_pi_tables.value ) {
-                print_pairwise_value_matrix_(
+                write_pairwise_value_matrix_(
                     options, "pi-within",  std::get<0>(*pi_vectors), sample_pairs
                 );
-                print_pairwise_value_matrix_(
+                write_pairwise_value_matrix_(
                     options, "pi-between", std::get<1>(*pi_vectors), sample_pairs
                 );
-                print_pairwise_value_matrix_(
+                write_pairwise_value_matrix_(
                     options, "pi-total",   std::get<2>(*pi_vectors), sample_pairs
                 );
             }
@@ -676,7 +676,7 @@ void run_fst( FstOptions const& options )
         // Reset all calculator accumulators to zero for this window.
         processor.reset();
 
-        // Compute diversity over samples.
+        // Compute FST between pairs of samples.
         size_t processed_cnt = 0;
         for( auto const& variant : window ) {
             internal_check(
@@ -691,8 +691,8 @@ void run_fst( FstOptions const& options )
             "Number of processed positions does not equal filter sum"
         );
 
-        // Print the output to all files that we want.
-        print_to_output_files_( options, processor, window, sample_pairs, state );
+        // Write the output to all files that we want.
+        write_to_output_files_( options, processor, window, sample_pairs, state );
     }
     internal_check(
         state.used_count + state.skipped_count == options.window.get_num_windows(),
diff --git a/src/commands/analyze/fst.hpp b/src/commands/analyze/fst.hpp
index c6568b1..66ef693 100644
--- a/src/commands/analyze/fst.hpp
+++ b/src/commands/analyze/fst.hpp
@@ -53,9 +53,9 @@ class FstOptions
 
     // Specific settings
     FstProcessorOptions fst_processor;
-    CliOption<bool>     write_pi_tables = false;
+    CliOption<bool>     write_pi_tables  = false;
     CliOption<bool>     no_extra_columns = false;
-    CliOption<bool>     no_nan_windows = false;
+    CliOption<bool>     no_nan_windows   = false;
 
     // Output options
     TableOutputOptions table_output;
diff --git a/src/options/diversity_processor.cpp b/src/options/diversity_processor.cpp
index 73e1c3b..2133245 100644
--- a/src/options/diversity_processor.cpp
+++ b/src/options/diversity_processor.cpp
@@ -172,6 +172,16 @@ genesis::population::DiversityPoolProcessor DiversityProcessorOptions::get_diver
 ) const {
     using namespace genesis::population;
 
+    // User input check
+    if( no_theta_pi.value && no_theta_watterson.value && no_tajima_d.value ) {
+        throw CLI::ValidationError(
+            no_theta_pi.option->get_name() + ", " +
+            no_theta_watterson.option->get_name() + ", " +
+            no_tajima_d.option->get_name(),
+            "All diversity metrics have been deactivate. Nothing is being computed."
+        );
+    }
+
     // Get and check the pool sizes.
     auto const pool_sizes = poolsizes.get_pool_sizes( sample_names );
     internal_check(
diff --git a/src/options/diversity_processor.hpp b/src/options/diversity_processor.hpp
index 0a9307b..5bbc26c 100644
--- a/src/options/diversity_processor.hpp
+++ b/src/options/diversity_processor.hpp
@@ -85,6 +85,21 @@ class DiversityProcessorOptions
         genesis::population::SampleCountsFilterNumericalParams filter_params
     ) const;
 
+    bool compute_theta_pi() const
+    {
+        return ! no_theta_pi.value;
+    }
+
+    bool compute_theta_watterson() const
+    {
+        return ! no_theta_watterson.value;
+    }
+
+    bool compute_tajima_d() const
+    {
+        return ! no_tajima_d.value;
+    }
+
     // -------------------------------------------------------------------------
     //     Option Members
     // -------------------------------------------------------------------------
@@ -98,11 +113,12 @@ class DiversityProcessorOptions
     CliOption<bool> no_tajima_d        = false;
 
     // General settings
-    PoolsizesOptions       poolsizes;
-    WindowAverageOptions   window_average_policy;
+    PoolsizesOptions     poolsizes;
+    WindowAverageOptions window_average_policy;
 
     // Remnant settings. These are simply re-used from the numerical filters now.
     // Kept here for reference, just in case they are needed for later.
+    // Using defaults from PoPoolation, see Variance-sliding.pl
     // CliOption<size_t> min_count = 2;
     // CliOption<size_t> min_read_depth = 4;
     // CliOption<size_t> max_read_depth = 1000000;
diff --git a/src/options/variant_transform_subsample.cpp b/src/options/variant_transform_subsample.cpp
index f938dbc..7c6d2d6 100644
--- a/src/options/variant_transform_subsample.cpp
+++ b/src/options/variant_transform_subsample.cpp
@@ -53,8 +53,8 @@ void VariantTransformSubsampleOptions::add_subsample_opts_to_app(
         "--subsample-max-read-depth",
         max_read_depth_.value,
         "If provided, the nucleotide counts of each sample are subsampled so that they do not "
-        "exceed this given maximum total read depth (sum of the four nucleotide counts, as well as "
-        "the any `N` and deleted `D` counts). "
+        "exceed this given maximum total read depth (sum of the four nucleotide counts `ACGT`, "
+        "as well as the any `N` and deleted `D` counts). "
         "If they are below this value anyway, they are not changed. "
         "This transformation is useful to limit the maximum read depth. For instance, the diversity "
         "estimators for Theta Pi and Theta Watterson have terms that depend on read depth. "
@@ -63,8 +63,9 @@ void VariantTransformSubsampleOptions::add_subsample_opts_to_app(
         "Furthermore, a very low Tajima's D, usually indicative of a selective sweep, may be found "
         "as an artifact in highly covered regions, as such regions have just more sequencing errors. "
         "To avoid these kinds of biases we recommend to subsample to an uniform read depth. "
-        // "This transformation is applied after any filters, so that, e.g., filters high read depth "
-        // "remove any unwanted positions first. See `--subsample-method` for the subsampling method."
+        "This transformation is applied after the numerical filters, so that, e.g., filters for "
+        "high read depth are able to remove any unwanted positions first. "
+        "See `--subsample-method` for the subsampling method."
     );
     max_read_depth_.option->group( group );