Rework diversity command to use new tagging filters

lczech · May 31, 2024 · 3806574 · 3806574
1 parent 6a50159
commit 3806574
Show file tree

Hide file tree

Showing 7 changed files with 294 additions and 388 deletions.
diff --git a/src/commands/analyze/diversity.cpp b/src/commands/analyze/diversity.cpp
diff --git a/src/commands/analyze/diversity.hpp b/src/commands/analyze/diversity.hpp
@@ -19,13 +19,14 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
     Contact:
-    Lucas Czech <[email protected]>
-    Department of Plant Biology, Carnegie Institution For Science
-    260 Panama Street, Stanford, CA 94305, USA
+    Lucas Czech <[email protected]>
+    University of Copenhagen, Globe Institute, Section for GeoGenetics
+    Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
 */
 
 #include "CLI/CLI.hpp"
 
+#include "options/diversity_processor.hpp"
 #include "options/file_output.hpp"
 #include "options/poolsizes.hpp"
 #include "options/table_output.hpp"
@@ -51,17 +52,11 @@ class DiversityOptions
     VariantFilterNumericalOptions    filter_numerical;
     VariantTransformSubsampleOptions transform_subsample;
     WindowOptions                    window;
-    PoolsizesOptions                 poolsizes;
 
-    // Using defaults from PoPoolation, see Variance-sliding.pl
-    CliOption<std::string> measure = "all";
-    // CliOption<size_t> min_count = 2;
-    // CliOption<size_t> min_read_depth = 4;
-    // CliOption<size_t> max_read_depth = 1000000;
-    // CliOption<double> min_read_depth_fraction = 0.6;
-
-    // Compatibility
-    CliOption<bool> with_popoolation_bugs = false;
+    // Specific settings
+    DiversityProcessorOptions diversity_processor;
+    CliOption<bool> no_extra_columns   = false;
+    CliOption<bool> no_nan_windows     = false;
     CliOption<bool> popoolation_format = false;
 
     // Output options

diff --git a/src/commands/analyze/fst.cpp b/src/commands/analyze/fst.cpp
@@ -102,7 +102,7 @@ void setup_fst( CLI::App& app )
     // -------------------------------------------------------------------------
 
     // Fst Processor
-    options->fst_processor.add_fst_processor_opts_to_app( sub, true, "Settings" );
+    options->fst_processor.add_fst_processor_opts_to_app( sub, true );
 
     // Settings: Write pi tables
     options->write_pi_tables.option = sub->add_flag(
@@ -273,8 +273,8 @@ void prepare_output_files_(
         (*target) << "chrom" << sep_char << "start" << sep_char << "end";
 
         // Print the extra column headers if needed.
-        // Need to make sure that this is the same order as used in print_total_stats_()
-        // and print_sample_stats_(). Change all occurrences if needed.
+        // Need to make sure that this is the same order as used in write_total_stats_()
+        // and write_sample_stats_(). Change all occurrences if needed.
         if( !options.no_extra_columns.value ) {
             (*target) << sep_char << "total.masked";
             (*target) << sep_char << "total.missing";
@@ -307,10 +307,10 @@ void prepare_output_files_(
 }
 
 // -------------------------------------------------------------------------
-//     print_total_stats_
+//     write_total_stats_
 // -------------------------------------------------------------------------
 
-void print_total_stats_(
+void write_total_stats_(
     FstOptions const& options,
     genesis::population::VariantFilterStats const& variant_filter_stats,
     std::shared_ptr<genesis::utils::BaseOutputTarget>& target
@@ -342,10 +342,10 @@ void print_total_stats_(
 }
 
 // -------------------------------------------------------------------------
-//     print_sample_stats_
+//     write_sample_stats_
 // -------------------------------------------------------------------------
 
-void print_sample_stats_(
+void write_sample_stats_(
     FstOptions const& options,
     genesis::population::SampleCountsFilterStats const& sample_counts_filter_stats,
     std::shared_ptr<genesis::utils::BaseOutputTarget>& target
@@ -369,7 +369,7 @@ void print_sample_stats_(
 }
 
 // -------------------------------------------------------------------------
-//     print_pairwise_value_list_
+//     write_pairwise_value_list_
 // -------------------------------------------------------------------------
 
 /**
@@ -378,7 +378,7 @@ void print_sample_stats_(
  * This is meant for whole genome computations, where we have a single value per pair of samples.
  * This function then prints these values, in rows of the form "Sample1 Sample2 value".
  */
-void print_pairwise_value_list_(
+void write_pairwise_value_list_(
     FstOptions const& options,
     std::string const& value_name,
     ::genesis::population::FstPoolProcessor const& processor,
@@ -400,8 +400,8 @@ void print_pairwise_value_list_(
     (*target) << "first" << sep_char << "second";
 
     // Print the additional column headers, if needed.
-    // Need to make sure that this is the same order as used in print_total_stats_()
-    // and print_sample_stats_(). Change all occurrences if needed.
+    // Need to make sure that this is the same order as used in write_total_stats_()
+    // and write_sample_stats_(). Change all occurrences if needed.
     if( !options.no_extra_columns.value ) {
         (*target) << sep_char << "total.masked";
         (*target) << sep_char << "total.missing";
@@ -421,16 +421,16 @@ void print_pairwise_value_list_(
         (*target) << sample_names[pair.first] << sep_char << sample_names[pair.second];
 
         // Print the extra columns as needed.
-        print_total_stats_( options, processor.get_filter_stats(), target );
-        print_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
+        write_total_stats_( options, processor.get_filter_stats(), target );
+        write_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
 
         // Now print the actual value we want.
         (*target) << sep_char << values[i] << "\n";
     }
 }
 
 // -------------------------------------------------------------------------
-//     print_pairwise_value_matrix_
+//     write_pairwise_value_matrix_
 // -------------------------------------------------------------------------
 
 /**
@@ -440,7 +440,7 @@ void print_pairwise_value_list_(
  * This function then prints these values, as a symmetrical matrix, with the samples in the rows
  * and columns.
  */
-void print_pairwise_value_matrix_(
+void write_pairwise_value_matrix_(
     FstOptions const& options,
     std::string const& value_name,
     std::vector<double> const& values,
@@ -470,10 +470,10 @@ void print_pairwise_value_matrix_(
 }
 
 // -------------------------------------------------------------------------
-//     print_output_line_
+//     write_output_line_
 // -------------------------------------------------------------------------
 
-void print_output_line_(
+void write_output_line_(
     FstOptions const& options,
     ::genesis::population::FstPoolProcessor const& processor,
     VariantWindowView const& window,
@@ -492,11 +492,11 @@ void print_output_line_(
     (*target) << window.chromosome();
     (*target) << sep_char << window.first_position();
     (*target) << sep_char << window.last_position();
-    print_total_stats_( options, processor.get_filter_stats(), target );
+    write_total_stats_( options, processor.get_filter_stats(), target );
 
     // Write the per-pair FST values in the correct order.
     for( size_t i = 0; i < values.size(); ++i ) {
-        print_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
+        write_sample_stats_( options, processor.calculators()[i]->get_filter_stats(), target );
         if( std::isfinite( values[i] ) ) {
             (*target) << sep_char << values[i];
         } else {
@@ -507,15 +507,15 @@ void print_output_line_(
 }
 
 // -------------------------------------------------------------------------
-//     print_fst_files_
+//     write_fst_files_
 // -------------------------------------------------------------------------
 
 /**
  * @brief Print the fst values for all sample pairs, for a given window.
  *
  * This function is called once per window, and writes out the per-pair results to the output file.
  */
-void print_to_output_files_(
+void write_to_output_files_(
     FstOptions const& options,
     ::genesis::population::FstPoolProcessor const& processor,
     VariantWindowView const& window,
@@ -559,15 +559,15 @@ void print_to_output_files_(
         ++state.skipped_count;
     } else {
         ++state.used_count;
-        print_output_line_( options, processor, window, window_fst, state.fst_output_target );
+        write_output_line_( options, processor, window, window_fst, state.fst_output_target );
         if( options.write_pi_tables.value ) {
-            print_output_line_(
+            write_output_line_(
                 options, processor, window, std::get<0>(*pi_vectors), state.pi_within_output_target
             );
-            print_output_line_(
+            write_output_line_(
                 options, processor, window, std::get<1>(*pi_vectors), state.pi_between_output_target
             );
-            print_output_line_(
+            write_output_line_(
                 options, processor, window, std::get<2>(*pi_vectors), state.pi_total_output_target
             );
         }
@@ -576,29 +576,29 @@ void print_to_output_files_(
     // For whole genome, and then also for all-to-all, we produce special tables on top.
     if( state.is_whole_genome ) {
         internal_check( options.window.get_num_windows() == 1, "Whole genome with multiple windows" );
-        print_pairwise_value_list_( options, "fst", processor, window_fst, sample_pairs );
+        write_pairwise_value_list_( options, "fst", processor, window_fst, sample_pairs );
         if( options.write_pi_tables.value ) {
-            print_pairwise_value_list_(
+            write_pairwise_value_list_(
                 options, "pi-within",  processor, std::get<0>(*pi_vectors), sample_pairs
             );
-            print_pairwise_value_list_(
+            write_pairwise_value_list_(
                 options, "pi-between", processor, std::get<1>(*pi_vectors), sample_pairs
             );
-            print_pairwise_value_list_(
+            write_pairwise_value_list_(
                 options, "pi-total",   processor, std::get<2>(*pi_vectors), sample_pairs
             );
         }
 
         if( state.is_all_to_all ) {
-            print_pairwise_value_matrix_( options, "fst", window_fst, sample_pairs );
+            write_pairwise_value_matrix_( options, "fst", window_fst, sample_pairs );
             if( options.write_pi_tables.value ) {
-                print_pairwise_value_matrix_(
+                write_pairwise_value_matrix_(
                     options, "pi-within",  std::get<0>(*pi_vectors), sample_pairs
                 );
-                print_pairwise_value_matrix_(
+                write_pairwise_value_matrix_(
                     options, "pi-between", std::get<1>(*pi_vectors), sample_pairs
                 );
-                print_pairwise_value_matrix_(
+                write_pairwise_value_matrix_(
                     options, "pi-total",   std::get<2>(*pi_vectors), sample_pairs
                 );
             }
@@ -676,7 +676,7 @@ void run_fst( FstOptions const& options )
         // Reset all calculator accumulators to zero for this window.
         processor.reset();
 
-        // Compute diversity over samples.
+        // Compute FST between pairs of samples.
         size_t processed_cnt = 0;
         for( auto const& variant : window ) {
             internal_check(
@@ -691,8 +691,8 @@ void run_fst( FstOptions const& options )
             "Number of processed positions does not equal filter sum"
         );
 
-        // Print the output to all files that we want.
-        print_to_output_files_( options, processor, window, sample_pairs, state );
+        // Write the output to all files that we want.
+        write_to_output_files_( options, processor, window, sample_pairs, state );
     }
     internal_check(
         state.used_count + state.skipped_count == options.window.get_num_windows(),

diff --git a/src/commands/analyze/fst.hpp b/src/commands/analyze/fst.hpp
@@ -53,9 +53,9 @@ class FstOptions
 
     // Specific settings
     FstProcessorOptions fst_processor;
-    CliOption<bool>     write_pi_tables = false;
+    CliOption<bool>     write_pi_tables  = false;
     CliOption<bool>     no_extra_columns = false;
-    CliOption<bool>     no_nan_windows = false;
+    CliOption<bool>     no_nan_windows   = false;
 
     // Output options
     TableOutputOptions table_output;

diff --git a/src/options/diversity_processor.cpp b/src/options/diversity_processor.cpp
@@ -172,6 +172,16 @@ genesis::population::DiversityPoolProcessor DiversityProcessorOptions::get_diver
 ) const {
     using namespace genesis::population;
 
+    // User input check
+    if( no_theta_pi.value && no_theta_watterson.value && no_tajima_d.value ) {
+        throw CLI::ValidationError(
+            no_theta_pi.option->get_name() + ", " +
+            no_theta_watterson.option->get_name() + ", " +
+            no_tajima_d.option->get_name(),
+            "All diversity metrics have been deactivate. Nothing is being computed."
+        );
+    }
+
     // Get and check the pool sizes.
     auto const pool_sizes = poolsizes.get_pool_sizes( sample_names );
     internal_check(

diff --git a/src/options/diversity_processor.hpp b/src/options/diversity_processor.hpp
@@ -85,6 +85,21 @@ class DiversityProcessorOptions
         genesis::population::SampleCountsFilterNumericalParams filter_params
     ) const;
 
+    bool compute_theta_pi() const
+    {
+        return ! no_theta_pi.value;
+    }
+
+    bool compute_theta_watterson() const
+    {
+        return ! no_theta_watterson.value;
+    }
+
+    bool compute_tajima_d() const
+    {
+        return ! no_tajima_d.value;
+    }
+
     // -------------------------------------------------------------------------
     //     Option Members
     // -------------------------------------------------------------------------
@@ -98,11 +113,12 @@ class DiversityProcessorOptions
     CliOption<bool> no_tajima_d        = false;
 
     // General settings
-    PoolsizesOptions       poolsizes;
-    WindowAverageOptions   window_average_policy;
+    PoolsizesOptions     poolsizes;
+    WindowAverageOptions window_average_policy;
 
     // Remnant settings. These are simply re-used from the numerical filters now.
     // Kept here for reference, just in case they are needed for later.
+    // Using defaults from PoPoolation, see Variance-sliding.pl
     // CliOption<size_t> min_count = 2;
     // CliOption<size_t> min_read_depth = 4;
     // CliOption<size_t> max_read_depth = 1000000;

diff --git a/src/options/variant_transform_subsample.cpp b/src/options/variant_transform_subsample.cpp
@@ -53,8 +53,8 @@ void VariantTransformSubsampleOptions::add_subsample_opts_to_app(
         "--subsample-max-read-depth",
         max_read_depth_.value,
         "If provided, the nucleotide counts of each sample are subsampled so that they do not "
-        "exceed this given maximum total read depth (sum of the four nucleotide counts, as well as "
-        "the any `N` and deleted `D` counts). "
+        "exceed this given maximum total read depth (sum of the four nucleotide counts `ACGT`, "
+        "as well as the any `N` and deleted `D` counts). "
         "If they are below this value anyway, they are not changed. "
         "This transformation is useful to limit the maximum read depth. For instance, the diversity "
         "estimators for Theta Pi and Theta Watterson have terms that depend on read depth. "
@@ -63,8 +63,9 @@ void VariantTransformSubsampleOptions::add_subsample_opts_to_app(
         "Furthermore, a very low Tajima's D, usually indicative of a selective sweep, may be found "
         "as an artifact in highly covered regions, as such regions have just more sequencing errors. "
         "To avoid these kinds of biases we recommend to subsample to an uniform read depth. "
-        // "This transformation is applied after any filters, so that, e.g., filters high read depth "
-        // "remove any unwanted positions first. See `--subsample-method` for the subsampling method."
+        "This transformation is applied after the numerical filters, so that, e.g., filters for "
+        "high read depth are able to remove any unwanted positions first. "
+        "See `--subsample-method` for the subsampling method."
     );
     max_read_depth_.option->group( group );