diff --git a/src/commands/convert/sync.cpp b/src/commands/convert/sync.cpp index 697f438..2f09914 100644 --- a/src/commands/convert/sync.cpp +++ b/src/commands/convert/sync.cpp @@ -101,6 +101,15 @@ void setup_sync( CLI::App& app ) ); options->gapless.option->group( "Settings" ); + // For user convenience, we also check if the general settings option for making + // the stream gapless is given, to improve the option description here. + auto make_gapless_opt = sub->get_option_no_throw( "--make-gapless" ); + if( make_gapless_opt ) { + auto descr = options->gapless.option->get_description(); + descr += " Note: This option is an alias for the more general `--make-gapless` option."; + options->gapless.option->description( descr ); + } + // Guess Reference Base options->guess_ref_base.option = sub->add_flag( "--guess-reference-base", diff --git a/src/options/variant_input.cpp b/src/options/variant_input.cpp index 3cbe0bb..0211b77 100644 --- a/src/options/variant_input.cpp +++ b/src/options/variant_input.cpp @@ -150,6 +150,26 @@ void VariantInputOptions::add_input_files_opts_to_app( CLI::IsMember( enum_map_keys( multi_file_contribution_type_map_ ), CLI::ignore_case ) ); + // Make gapless stream. If renamed, change it in the `sync` command as well, + // where we use the option name to improve the option description there. + make_gapless_stream_.option = sub->add_flag( + "--make-gapless", + make_gapless_stream_.value, + "By default, we only operate on the positions for which there is data. " + "In particular, positions that are absent in the input are completely ignored; " + "they do not even show up in the `missing` column of output tables. " + "This is because for the statistics, data being absend or (marked as) missing " + "is merely a sementic distinction, but it does not change the results. " + "However, it might make processing with downstream tools easier if the output contains " + "all positions, for instance when using `single` windows. 
" + "With this option, all absent positions are filled in as missing data, so that they " + "show up in the `missing` column and as entries in single windows. " + "If a referene genome or dictionary is given, this might also include positions beyond " + "where there is input data, up until the length of each chromosome. " + "Note that this can lead to large ouput tables when processing single positions." + ); + make_gapless_stream_.option->group( group ); + // Hidden options to set the Generic Input Stream block sizes for speed. // First for the main block size of the stream that is collecing all Variants, @@ -680,6 +700,11 @@ void VariantInputOptions::conditionally_make_gapless_stream_() const return; } + // Also if the user provided the gapless option here. + if( make_gapless_stream_.value ) { + gapless_stream_ = true; + } + // Now we check if we want to make a gapless stream, e.g., by demand from a command. // If so, we further check if any references are given, which we then want to use // for the stream in order to properly get the chromosome lengths. diff --git a/src/options/variant_input.hpp b/src/options/variant_input.hpp index ce8694d..b8986e4 100644 --- a/src/options/variant_input.hpp +++ b/src/options/variant_input.hpp @@ -155,6 +155,16 @@ class VariantInputOptions gapless_stream_ = value; } + /** + * @brief Return whether the stream is made gapless due to some setting. + * + * This option is set to its correct value only after get_stream() has been called. + */ + bool gapless_stream() const + { + return gapless_stream_; + } + /** * @brief Transformations and filters for individual input sources. * @@ -320,6 +330,7 @@ class VariantInputOptions // to avoid code repetition when adding and processing them here. std::vector> input_files_; CliOption multi_file_loci_set_ = "union"; + CliOption make_gapless_stream_ = false; // Hidden options to set the Generic Input Stream block size for speed. 
CliOption iterator_block_size_ = 8192; diff --git a/src/options/window.cpp b/src/options/window.cpp index b149570..f58601c 100644 --- a/src/options/window.cpp +++ b/src/options/window.cpp @@ -326,7 +326,7 @@ std::unique_ptr WindowOptions::get_variant_window_stream( } case WindowType::kSingle: { result = genesis::utils::make_unique( - get_variant_window_stream_single_( input_stream ) + get_variant_window_stream_single_( input_stream, variant_input.gapless_stream() ) ); break; } @@ -413,7 +413,7 @@ std::unique_ptr WindowOptions::get_variant_window_view_ case WindowType::kSingle: { result = genesis::utils::make_unique( make_window_view_stream( - get_variant_window_stream_single_( input_stream ) + get_variant_window_stream_single_( input_stream, variant_input.gapless_stream() ) ) ); break; @@ -598,8 +598,19 @@ WindowOptions::get_variant_window_stream_queue_( WindowOptions::VariantPositionWindowStream WindowOptions::get_variant_window_stream_single_( - genesis::population::VariantInputStream& input + genesis::population::VariantInputStream& input, + bool is_gapless_stream ) const { + // If our input is meant to be gapless, we also do not want to filter for passing positions here. + // That will produce huge output, but well, it's what the user wants :-) + // As far as we can tell, this is the only type of window that needs this; + // all other window types behave exactly the same, except that they also are counting + // the missing positions properly when a gapless stream is given. 
+ if( is_gapless_stream ) { + return genesis::population::make_default_position_window_stream( + input.begin(), input.end() + ); + } return genesis::population::make_passing_variant_position_window_stream( input.begin(), input.end() ); diff --git a/src/options/window.hpp b/src/options/window.hpp index dc73f71..1f45f37 100644 --- a/src/options/window.hpp +++ b/src/options/window.hpp @@ -243,7 +243,8 @@ class WindowOptions ) const; VariantPositionWindowStream get_variant_window_stream_single_( - genesis::population::VariantInputStream& input + genesis::population::VariantInputStream& input, + bool is_gapless_stream ) const; VariantRegionWindowStream get_variant_window_stream_regions_(