From 9a549f160cc8b9bfcf70b7a1456063b7481b45a9 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Tue, 30 Apr 2019 10:15:51 +0100 Subject: [PATCH 01/17] when reference with alternate haplotypes is detected, use bwa mem with bwakit post-processing --- lib/npg_pipeline/function/seq_alignment.pm | 27 +++++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index 76b22a43b..bea24755a 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -40,6 +40,7 @@ Readonly::Scalar my $DEFAULT_RNA_ANALYSIS => q{tophat2}; Readonly::Array my @RNA_ANALYSES => qw{tophat2 star hisat2}; Readonly::Scalar my $TARGET_REGIONS_DIR_NAME => q{target}; Readonly::Scalar my $REFERENCE_ABSENT => q{REFERENCE_NOT_AVAILABLE}; +Readonly::Scalar my $JS_DIR => q{/software/solexa/pkg/bwakit/bwakit-0.7.15/}; =head2 phix_reference @@ -272,6 +273,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) run_lane_ss_fq2 => $fq2_filepath, seqchksum_orig_file => $seqchksum_orig_file, s2_input_format => $self->s1_s2_intfile_format, + markdup_method => q[samtools], }; my $p4_ops = { prune => [], @@ -316,8 +318,12 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) # There will be a new exception to the use of "aln": if you specify a reference # with alt alleles e.g. GRCh38_full_analysis_set_plus_decoy_hla, then we will use - # bwa's "mem" - $bwa = ($do_target_alignment and $self->_is_alt_reference($dp)) ? 'bwa_mem' : $bwa; + # bwa's "mem" with post-processing using the bwa-postalt.js script from bwakit + if($do_target_alignment and (my $alt_ref = $self->_alt_reference($dp))) { + $p4_param_vals->{alignment_method} = $bwa = 'bwa_mem_bwakit'; + $p4_param_vals->{fa_alt_path} = $alt_ref; + $p4_param_vals->{js_dir} = $JS_DIR; + } my $skip_target_markdup_metrics = (not $spike_tag and not $do_target_alignment); @@ -411,7 +417,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) } else { push @{$p4_ops->{prune}}, 'foptgt.*samtools_stats_F0.*00_bait.*-'; # confirm hyphen - push @{$p4_ops->{splice}}, 'ssfqc_tee_ssfqc:straight_through1:-foptgt_bamsort_coord:', 'foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk'; + push @{$p4_ops->{splice}}, 'ssfqc_tee_ssfqc:straight_through1:-foptgt_000_fixmate:', 'foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk'; # the fixmate node only works for mardkup_method samtools (pending p4 node id uniqueness bug fix) } my $p4_local_assignments = {}; @@ -480,6 +486,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) bwa_aln => q[bwa0_6], bwa_aln_se => q[bwa0_6], bwa_mem => q[bwa0_6], + bwa_mem_bwakit => q[bwa0_6], ); my %ref_suffix = ( picard => q{.dict}, @@ -503,6 +510,14 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) if(not $self->is_paired_read) { $p4_param_vals->{bwa_mem_p_flag} = undef; } + + # if a ".alt" file exists in the "alignment_reference_genome" directory, set "alt_ctg_dir" to the + # directory and change "alignment_method" to "bwa_mem_bwakit" +# if($p4_param_vals->{alignment_method} eq q[bwa0_6] and -f "$p4_param_vals->{alignment_reference_genome} . q[.alt]") { +# my($filename, $dirs) = fileparse($p4_param_vals->{alignment_reference_genome}); +# $p4_param_vals->{alt_ctg_dir} = $dirs; +# $p4_param_vals->{alignment_method} = q[bwa_mem_bwakit]; +# } } if($human_split) { @@ -850,14 +865,14 @@ sub _default_human_split_ref { return $human_ref; } -sub _is_alt_reference { +sub _alt_reference { my ($self, $dp) = @_; my $ref = $self->_ref($dp, q{bwa0_6}); if ($ref) { $ref .= q{.alt}; - return -e $ref; + if(-e $ref) { return $ref; } + else { return; } } - return; } sub _p4_stage2_params_path { From 5b065438f004559c5c9449cab637b37e67fda831 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Mon, 13 May 2019 15:04:24 +0100 Subject: [PATCH 02/17] remove hard-coding of path to javascript script location for bwakit - instead specify using environment variables or general_values config update tests which use the new _js_scripts_dir attribute --- lib/npg_pipeline/function/seq_alignment.pm | 16 ++++++++++++++-- t/20-function-seq_alignment.t | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index bea24755a..3569f9f73 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -40,7 +40,6 @@ Readonly::Scalar my $DEFAULT_RNA_ANALYSIS => q{tophat2}; Readonly::Array my @RNA_ANALYSES => qw{tophat2 star hisat2}; Readonly::Scalar my $TARGET_REGIONS_DIR_NAME => q{target}; Readonly::Scalar my $REFERENCE_ABSENT => q{REFERENCE_NOT_AVAILABLE}; -Readonly::Scalar my $JS_DIR => q{/software/solexa/pkg/bwakit/bwakit-0.7.15/}; =head2 phix_reference @@ -96,6 +95,19 @@ sub _build__num_cpus { $self->general_values_conf()->{'seq_alignment_slots'} || $NUM_SLOTS); } +has '_js_scripts_dir' => ( isa => 'Str', + is => 'ro', + required => 0, + lazy_build => 1, + ); +sub _build__js_scripts_dir { + my $self = shift; + return $ENV{'NPG_PIPELINE_JS_SCRIPTS_DIR'} + || $self->general_values_conf()->{'js_scripts_directory'} + || $ENV{'NPG_PIPELINE_SCRIPTS_DIR'} + || $self->general_values_conf()->{'scripts_directory'}; +} + has '_do_gbs_plex_analysis' => ( isa => 'Bool', is => 'rw', default => 0, @@ -322,7 +334,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) if($do_target_alignment and (my $alt_ref = $self->_alt_reference($dp))) { $p4_param_vals->{alignment_method} = $bwa = 'bwa_mem_bwakit'; $p4_param_vals->{fa_alt_path} = $alt_ref; - $p4_param_vals->{js_dir} = $JS_DIR; + $p4_param_vals->{js_dir} = $self->_js_scripts_dir; } diff --git a/t/20-function-seq_alignment.t b/t/20-function-seq_alignment.t index 216d04f4c..ecbcb5308 100644 --- a/t/20-function-seq_alignment.t +++ b/t/20-function-seq_alignment.t @@ -537,6 +537,7 @@ subtest 'test 4' => sub { my $target_file = "$ref_dir/Homo_sapiens/GRCh38_full_analysis_set_plus_decoy_hla/all/target/Homo_sapiens.GRCh38_full_analysis_set_plus_decoy_hla.fa"; local $ENV{'NPG_CACHED_SAMPLESHEET_FILE'} = q[t/data/hiseqx/samplesheet_16839.csv]; + local $ENV{'NPG_PIPELINE_JS_SCRIPTS_DIR'} = q[t/bin/]; my $hsx_gen; lives_ok { @@ -1091,6 +1092,7 @@ subtest 'test 11' => sub { close $fhss or warn "Failed to close $new_ss"; # new samplesheet has one chromium sample in lane 1 local $ENV{'NPG_CACHED_SAMPLESHEET_FILE'} = $new_ss; + local $ENV{'NPG_PIPELINE_JS_SCRIPTS_DIR'} = q[t/bin/]; my $chromium_gen; lives_ok { From a9393de37e37ee1965442ff8f746185a75787fa7 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Thu, 8 Aug 2019 16:26:53 +0100 Subject: [PATCH 03/17] add markdup_method parameter to parameter list --- t/20-function-seq_alignment.t | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/t/20-function-seq_alignment.t b/t/20-function-seq_alignment.t index 61da1a814..7cca8bd9f 100644 --- a/t/20-function-seq_alignment.t +++ b/t/20-function-seq_alignment.t @@ -818,7 +818,8 @@ subtest 'test 6' => sub { "af_metrics"=> "20268_1#1_bam_alignment_filter_metrics.json", "run_lane_ss_fq1"=> $dir."/160704_MS3_20268_A_MS4000667-300V2/Data/Intensities/BAM_basecalls_20160712-154117/no_cal/archive/lane1/plex1/.npg_cache_10000/20268_1#1_1.fastq", "bait_regions_file"=> $dir .'/baits/Human_all_exon_V5/1000Genomes_hs37d5/S04380110-CTR.interval_list', - "recal_dir"=> $dir .'/160704_MS3_20268_A_MS4000667-300V2/Data/Intensities/BAM_basecalls_20160712-154117/no_cal' + "recal_dir"=> $dir .'/160704_MS3_20268_A_MS4000667-300V2/Data/Intensities/BAM_basecalls_20160712-154117/no_cal', + "markdup_method" => "samtools" },], 'ops' => { 'prune' => ['fop.*_bmd_multiway:calibration_pu-', From d86ee2f2fd611e3573053eb58e12533e70e5add9 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Tue, 27 Aug 2019 10:42:27 +0100 Subject: [PATCH 04/17] only do bwakit postporcessing when bwakit attribute is true (default false) --- lib/npg_pipeline/base/options.pm | 15 +++++++++++++++ lib/npg_pipeline/function/seq_alignment.pm | 2 +- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/lib/npg_pipeline/base/options.pm b/lib/npg_pipeline/base/options.pm index df84a21d6..6ea284e43 100644 --- a/lib/npg_pipeline/base/options.pm +++ b/lib/npg_pipeline/base/options.pm @@ -205,6 +205,21 @@ has q{p4s2_aligner_intfile} => ( documentation => q{Forces p4 stage2 to create an intermediate file when doing alignments.}, ); +=head2 bwakit + +Tells p4 stage2 (seq_alignment) to postprocess alignments with bwakit +script (when alignments done with bwa mem using a reference with alternate +haplotypes) + +=cut + +has q{bwakit} => ( + isa => q{Bool}, + is => q{ro}, + default => 0, + documentation => q{postprocess alignments with bwakit}, +); + =head2 align_tag0 Toggles alignment of tag#0 in secondary analysis diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index 707803a86..7dac7d7ee 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -328,7 +328,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) # There will be a new exception to the use of "aln": if you specify a reference # with alt alleles e.g. GRCh38_full_analysis_set_plus_decoy_hla, then we will use # bwa's "mem" with post-processing using the bwa-postalt.js script from bwakit - if($do_target_alignment and (my $alt_ref = $self->_alt_reference($dp))) { + if($do_target_alignment and $self->bwakit and (my $alt_ref = $self->_alt_reference($dp))) { $p4_param_vals->{alignment_method} = $bwa = 'bwa_mem_bwakit'; $p4_param_vals->{fa_alt_path} = $alt_ref; $p4_param_vals->{js_dir} = $self->_js_scripts_dir; From cca944a34c493a84761699fd0f05b0a0f2b0211b Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Tue, 10 Sep 2019 16:38:22 +0100 Subject: [PATCH 05/17] add bwakit_enable and markdup_method attributes --- lib/npg_pipeline/product/release.pm | 44 +++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/lib/npg_pipeline/product/release.pm b/lib/npg_pipeline/product/release.pm index 37851603b..2f0a1302c 100644 --- a/lib/npg_pipeline/product/release.pm +++ b/lib/npg_pipeline/product/release.pm @@ -543,6 +543,50 @@ sub bqsr_known_sites { return @known_sites; } +=head2 bwakit_enable + + Arg [1] : npg_pipeline::product + + Example : $obj->bwakit_enable($product) + Description: Return true if bwakit's postalt processing is to be run on the product. + + Returntype : Bool + +=cut + +sub bwakit_enable { + my ($self, $product) = @_; + + my $rpt = $product->rpt_list(); + my $name = $product->file_name_root(); + + if ($self->find_study_config($product)->{bwakit}->{enable}) { + $self->info("Product $name, $rpt is for bwakit postalt processing"); + return 1; + } + + $self->info("Product $name, $rpt is NOT for bwakit postalt processing"); + + return 0; +} + +=head2 markdup_method + + Arg [1] : npg_pipeline::product + + Example : $obj->markdup_method($product); + Description: Return mark duplicate method, + the value might be undefined. + + Returntype : Str + +=cut + +sub markdup_method { + my ($self, $product) = @_; + return $self->find_study_config($product)->{markdup_method}; +} + =head2 staging_deletion_delay Arg [1] : npg_pipeline::product From 7bcec0877ce682cccb0e3b61a45a0762d588dbc6 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Tue, 10 Sep 2019 16:44:27 +0100 Subject: [PATCH 06/17] check if using patterned flowcell, set pixel distance for optical duplicate marking appropriately allow markdup method to be set from product_release.yml (default biobambam) allow bwakit postalt processing to be switched on from either pipeline flag or product_release.yml entry (alt haplotype reference required) fix spike tag splice to work with all markdup methods --- lib/npg_pipeline/function/seq_alignment.pm | 33 ++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index 7dac7d7ee..402d5715b 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -37,6 +37,9 @@ Readonly::Scalar my $REFERENCE_ARRAY_ANALYSIS_IDX => q{3}; Readonly::Scalar my $REFERENCE_ARRAY_TVERSION_IDX => q{2}; Readonly::Scalar my $DEFAULT_RNA_ANALYSIS => q{tophat2}; Readonly::Array my @RNA_ANALYSES => qw{tophat2 star hisat2}; +Readonly::Scalar my $PFC_MARKDUP_OPT_DIST => q{2500}; # distance in pixels for optical duplicate detection on patterned flowcells +Readonly::Scalar my $NON_PFC_MARKDUP_OPT_DIST => q{100}; # distance in pixels for optical duplicate detection on non-patterned flowcells +Readonly::Scalar my $MARKDUP_DEFAULT => q{biobambam}; =head2 phix_reference @@ -195,6 +198,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) my $archive_path = $self->archive_path; my $dp_archive_path = $dp->path($archive_path); my $recal_path= $self->recalibrated_path; #? + my $uses_patterned_flowcell = $self->uses_patterned_flowcell; my $qc_out_path = $dp->qc_out_path($archive_path); my $cache10k_path = $dp->short_files_cache_path($archive_path); @@ -233,12 +237,14 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) my $fq1_filepath = File::Spec->catdir($cache10k_path, $dp->file_name(ext => 'fastq', suffix => '1')); my $fq2_filepath = File::Spec->catdir($cache10k_path, $dp->file_name(ext => 'fastq', suffix => '2')); my $seqchksum_orig_file = File::Spec->catdir($dp_archive_path, $dp->file_name(ext => 'orig.seqchksum')); + my $markdup_method = $self->markdup_method($dp) or $MARKDUP_DEFAULT; $self->debug(qq{ rpt_list: $rpt_list}); $self->debug(qq{ reference_genome: $reference_genome}); $self->debug(qq{ is_tag_zero_product: $is_tag_zero_product}); $self->debug(qq{ is_pool: $is_pool}); $self->debug(qq{ dp_archive_path: $dp_archive_path}); + $self->debug(qq{ uses_patterned_flowcell: $uses_patterned_flowcell}); $self->debug(qq{ cache10k_path: $cache10k_path}); $self->debug(qq{ bfs_input_file: $bfs_input_file}); $self->debug(qq{ af_input_file: $af_input_file}); @@ -282,7 +288,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) run_lane_ss_fq2 => $fq2_filepath, seqchksum_orig_file => $seqchksum_orig_file, s2_input_format => $self->s1_s2_intfile_format, - markdup_method => q[samtools], + markdup_method => $markdup_method, }; my $p4_ops = { prune => [], @@ -326,12 +332,18 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) $self->info(qq{ do_target_alignment for $name_root is } . ($do_target_alignment?q[TRUE]:q[FALSE])); # There will be a new exception to the use of "aln": if you specify a reference - # with alt alleles e.g. GRCh38_full_analysis_set_plus_decoy_hla, then we will use - # bwa's "mem" with post-processing using the bwa-postalt.js script from bwakit - if($do_target_alignment and $self->bwakit and (my $alt_ref = $self->_alt_reference($dp))) { - $p4_param_vals->{alignment_method} = $bwa = 'bwa_mem_bwakit'; - $p4_param_vals->{fa_alt_path} = $alt_ref; - $p4_param_vals->{js_dir} = $self->_js_scripts_dir; + # with alt alleles e.g. GRCh38_full_analysis_set_plus_decoy_hla and bwakit postalt + # processing is enabled, then we will use bwa's "mem" with post-processing using + # the bwa-postalt.js script from bwakit + if($do_target_alignment and ($self->bwakit or $self->bwakit_enable($dp))) { # two ways to specify bwakit? + if((my $alt_ref = $self->_alt_reference($dp))) { + $p4_param_vals->{alignment_method} = $bwa = 'bwa_mem_bwakit'; + $p4_param_vals->{fa_alt_path} = $alt_ref; + $p4_param_vals->{js_dir} = $self->_js_scripts_dir; + } + else { + $self->info(q[bwakit postal processing specified, but no alternate haplotypes in reference]); + } } @@ -433,7 +445,12 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) } else { push @{$p4_ops->{prune}}, 'foptgt.*samtools_stats_F0.*00_bait.*-'; # confirm hyphen - push @{$p4_ops->{splice}}, 'ssfqc_tee_ssfqc:straight_through1:-foptgt_000_fixmate:', 'foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk'; # the fixmate node only works for mardkup_method samtools (pending p4 node id uniqueness bug fix) + if($markdup_method eq q[samtools) { + push @{$p4_ops->{splice}}, 'ssfqc_tee_ssfqc:straight_through1:-foptgt_000_fixmate:', 'foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk'; # the fixmate node only works for mardkup_method samtools (pending p4 node id uniqueness bug fix) + } + else { + push @{$p4_ops->{splice}}, 'ssfqc_tee_ssfqc:straight_through1:-foptgt_000_bamsort_coord:', 'foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk'; + } } my $p4_local_assignments = {}; From 4188d1ca5dac02dbadc4fe45b654e43ad93bc00d Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Wed, 11 Sep 2019 11:50:28 +0100 Subject: [PATCH 07/17] fixes for seq_slignment, its tests and test data --- lib/npg_pipeline/function/seq_alignment.pm | 7 +-- t/20-function-seq_alignment.t | 44 +++++++++++++------ .../config/seq_alignment/general_values.ini | 9 ++++ .../config/seq_alignment/product_release.yml | 12 +++++ 4 files changed, 55 insertions(+), 17 deletions(-) create mode 100644 t/data/release/config/seq_alignment/general_values.ini create mode 100644 t/data/release/config/seq_alignment/product_release.yml diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index 402d5715b..61f38dcd9 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -21,7 +21,8 @@ use npg_pipeline::cache::reference; use npg_pipeline::function::definition; extends q{npg_pipeline::base}; -with q{npg_pipeline::function::util}; +with qw{ npg_pipeline::function::util + npg_pipeline::product::release }; our $VERSION = '0'; @@ -237,7 +238,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) my $fq1_filepath = File::Spec->catdir($cache10k_path, $dp->file_name(ext => 'fastq', suffix => '1')); my $fq2_filepath = File::Spec->catdir($cache10k_path, $dp->file_name(ext => 'fastq', suffix => '2')); my $seqchksum_orig_file = File::Spec->catdir($dp_archive_path, $dp->file_name(ext => 'orig.seqchksum')); - my $markdup_method = $self->markdup_method($dp) or $MARKDUP_DEFAULT; + my $markdup_method = ($self->markdup_method($dp) or $MARKDUP_DEFAULT); $self->debug(qq{ rpt_list: $rpt_list}); $self->debug(qq{ reference_genome: $reference_genome}); @@ -445,7 +446,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) } else { push @{$p4_ops->{prune}}, 'foptgt.*samtools_stats_F0.*00_bait.*-'; # confirm hyphen - if($markdup_method eq q[samtools) { + if($markdup_method eq q[samtools]) { push @{$p4_ops->{splice}}, 'ssfqc_tee_ssfqc:straight_through1:-foptgt_000_fixmate:', 'foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk'; # the fixmate node only works for mardkup_method samtools (pending p4 node id uniqueness bug fix) } else { diff --git a/t/20-function-seq_alignment.t b/t/20-function-seq_alignment.t index 7cca8bd9f..0f67dd619 100644 --- a/t/20-function-seq_alignment.t +++ b/t/20-function-seq_alignment.t @@ -183,6 +183,7 @@ subtest 'test 1' => sub { verbose => 0, repository => $dir, force_phix_split => 0, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; @@ -258,6 +259,7 @@ subtest 'test 1' => sub { verbose => 0, repository => $dir, force_phix_split => 1, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object (forcing on phix split)'; @@ -323,7 +325,8 @@ subtest 'test 2' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2014}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($rna_gen->id_run, 13066, 'id_run inferred correctly'); @@ -381,7 +384,8 @@ subtest 'test 2' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2017}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($rna_gen->id_run, 17550, 'id_run inferred correctly'); @@ -464,7 +468,8 @@ subtest 'test 2' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2018}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; @@ -497,7 +502,8 @@ subtest 'test 3' => sub { runfolder_path => $runfolder_path, recalibrated_path => "$bc_path/no_cal", timestamp => q{2015}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($se_gen->id_run, 18472, 'id_run inferred correctly'); @@ -551,7 +557,8 @@ subtest 'test 4' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2015}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($hsx_gen->id_run, 16839, 'id_run inferred correctly'); @@ -653,7 +660,8 @@ subtest 'test 5' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2015}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($hs_gen->id_run, 16807, 'id_run inferred correctly'); @@ -734,6 +742,7 @@ subtest 'test 6' => sub { timestamp => q{2016}, repository => $dir, verbose => 1, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; @@ -819,7 +828,7 @@ subtest 'test 6' => sub { "run_lane_ss_fq1"=> $dir."/160704_MS3_20268_A_MS4000667-300V2/Data/Intensities/BAM_basecalls_20160712-154117/no_cal/archive/lane1/plex1/.npg_cache_10000/20268_1#1_1.fastq", "bait_regions_file"=> $dir .'/baits/Human_all_exon_V5/1000Genomes_hs37d5/S04380110-CTR.interval_list', "recal_dir"=> $dir .'/160704_MS3_20268_A_MS4000667-300V2/Data/Intensities/BAM_basecalls_20160712-154117/no_cal', - "markdup_method" => "samtools" + "markdup_method" => "biobambam" },], 'ops' => { 'prune' => ['fop.*_bmd_multiway:calibration_pu-', @@ -857,7 +866,8 @@ subtest 'test 7' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2015}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($ms_gen->id_run, 16850, 'id_run inferred correctly'); @@ -930,7 +940,8 @@ subtest 'test 8' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2015}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($hs_gen->id_run, 16756, 'id_run inferred correctly'); @@ -1006,7 +1017,8 @@ subtest 'test 9' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2015}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($ms_gen->id_run, 16866, 'id_run inferred correctly'); @@ -1082,7 +1094,8 @@ subtest 'test 10' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2016}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($ms_gen->id_run, 20990, 'id_run (20990) inferred correctly'); @@ -1157,7 +1170,8 @@ subtest 'test 11' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2015}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($chromium_gen->id_run, 16839, 'id_run inferred correctly'); @@ -1230,7 +1244,8 @@ subtest 'test 12' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2017}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ) } 'no error creating an object'; is ($ms_gen->id_run, 24135, 'id_run inferred correctly'); @@ -1302,7 +1317,8 @@ subtest 'generate compositions only' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2017}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ); apply_all_roles($ms_gen, 'npg_pipeline::runfolder_scaffold'); $ms_gen->create_product_level(); diff --git a/t/data/release/config/seq_alignment/general_values.ini b/t/data/release/config/seq_alignment/general_values.ini new file mode 100644 index 000000000..d38367e6a --- /dev/null +++ b/t/data/release/config/seq_alignment/general_values.ini @@ -0,0 +1,9 @@ +executor_type=lsf +publish2irods_max_errors=10 +single_plex_decode_max_no_calls=6 +illumina2bam_memory=4000 +qc_adapter_cpu=3 +p4_stage1_memory=20000 +p4_stage1_slots=8 +p4_stage1_i2b_thread_count=8 +p4_stage1_split_threads_count=4 diff --git a/t/data/release/config/seq_alignment/product_release.yml b/t/data/release/config/seq_alignment/product_release.yml new file mode 100644 index 000000000..50044cad4 --- /dev/null +++ b/t/data/release/config/seq_alignment/product_release.yml @@ -0,0 +1,12 @@ +--- +default: + bqsr: + enable: true + known-sites: + - dbsnp_138.hg38 + - Mills_and_1000G_gold_standard.indels.hg38 + +study: + - study_id: "1713" + data_deletion: + staging_deletion_delay: 10 From de9afc817d9f5d9707bc0ee5194ae0d5bddfef3b Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Fri, 13 Sep 2019 14:02:42 +0100 Subject: [PATCH 08/17] set markdup optical distance parameter --- lib/npg_pipeline/function/seq_alignment.pm | 6 +++++- t/20-function-seq_alignment.t | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index 61f38dcd9..3f1ed6128 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -239,6 +239,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) my $fq2_filepath = File::Spec->catdir($cache10k_path, $dp->file_name(ext => 'fastq', suffix => '2')); my $seqchksum_orig_file = File::Spec->catdir($dp_archive_path, $dp->file_name(ext => 'orig.seqchksum')); my $markdup_method = ($self->markdup_method($dp) or $MARKDUP_DEFAULT); + my $markdup_optical_distance = ($uses_patterned_flowcell? $PFC_MARKDUP_OPT_DIST: $NON_PFC_MARKDUP_OPT_DIST); $self->debug(qq{ rpt_list: $rpt_list}); $self->debug(qq{ reference_genome: $reference_genome}); @@ -290,6 +291,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) seqchksum_orig_file => $seqchksum_orig_file, s2_input_format => $self->s1_s2_intfile_format, markdup_method => $markdup_method, + markdup_optical_distance_value => $markdup_optical_distance, }; my $p4_ops = { prune => [], @@ -594,12 +596,14 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) nonconsented_humansplit => $nchs, do_gbs_plex => $do_gbs_plex, do_rna => $do_rna, + human_split => ($human_split ? $human_split : q[none]), + markdup_method => $markdup_method, + markdup_optical_distance => $markdup_optical_distance, ); while (my ($text, $value) = each %info) { $self->info(qq[ $text is ] . ($value ? q[true] : q[false])); } - $self->info(q[ human_split is ] . ($human_split ? $human_split : q[none])); $self->info(q[ p4 parameters written to ] . $param_vals_fname); $self->info(q[ Using p4 template alignment_wtsi_stage2_] . $nchs_template_label . q[template.json]); diff --git a/t/20-function-seq_alignment.t b/t/20-function-seq_alignment.t index 0f67dd619..da154d8f4 100644 --- a/t/20-function-seq_alignment.t +++ b/t/20-function-seq_alignment.t @@ -828,7 +828,8 @@ subtest 'test 6' => sub { "run_lane_ss_fq1"=> $dir."/160704_MS3_20268_A_MS4000667-300V2/Data/Intensities/BAM_basecalls_20160712-154117/no_cal/archive/lane1/plex1/.npg_cache_10000/20268_1#1_1.fastq", "bait_regions_file"=> $dir .'/baits/Human_all_exon_V5/1000Genomes_hs37d5/S04380110-CTR.interval_list', "recal_dir"=> $dir .'/160704_MS3_20268_A_MS4000667-300V2/Data/Intensities/BAM_basecalls_20160712-154117/no_cal', - "markdup_method" => "biobambam" + "markdup_method" => "biobambam", + "markdup_optical_distance_value" => 100, },], 'ops' => { 'prune' => ['fop.*_bmd_multiway:calibration_pu-', From 02a6f1981c6614658ace38b84b6fc44f18be85ea Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Fri, 13 Sep 2019 14:58:43 +0100 Subject: [PATCH 09/17] correct p4 parameters logging --- lib/npg_pipeline/function/seq_alignment.pm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index 3f1ed6128..cddb93e90 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -596,14 +596,14 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) nonconsented_humansplit => $nchs, do_gbs_plex => $do_gbs_plex, do_rna => $do_rna, - human_split => ($human_split ? $human_split : q[none]), - markdup_method => $markdup_method, - markdup_optical_distance => $markdup_optical_distance, ); while (my ($text, $value) = each %info) { $self->info(qq[ $text is ] . ($value ? q[true] : q[false])); } + $self->info(q[ human_split is ] . ($human_split ? $human_split : q[none])); + $self->info(q[ markdup_method is ] . ($markdup_method ? $markdup_method : q[unspecified])); + $self->info(q[ markdup_optical_distance is ] . ($markdup_optical_distance ? $markdup_optical_distance : q[unspecified])); $self->info(q[ p4 parameters written to ] . $param_vals_fname); $self->info(q[ Using p4 template alignment_wtsi_stage2_] . $nchs_template_label . q[template.json]); From 6d9d5ecff42451854ff84b9de6780d514093491b Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Tue, 17 Sep 2019 13:24:23 +0100 Subject: [PATCH 10/17] remove dead code --- lib/npg_pipeline/function/seq_alignment.pm | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index cddb93e90..747be46b5 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -546,14 +546,6 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) if(not $self->is_paired_read) { $p4_param_vals->{bwa_mem_p_flag} = undef; } - - # if a ".alt" file exists in the "alignment_reference_genome" directory, set "alt_ctg_dir" to the - # directory and change "alignment_method" to "bwa_mem_bwakit" -# if($p4_param_vals->{alignment_method} eq q[bwa0_6] and -f "$p4_param_vals->{alignment_reference_genome} . q[.alt]") { -# my($filename, $dirs) = fileparse($p4_param_vals->{alignment_reference_genome}); -# $p4_param_vals->{alt_ctg_dir} = $dirs; -# $p4_param_vals->{alignment_method} = q[bwa_mem_bwakit]; -# } } if($human_split) { @@ -864,11 +856,6 @@ sub _p4_stage2_params_path { my $path = $self->recalibrated_path; -# temporarily dump all p4s2 params files in no_cal (ignore position) -# if($self->is_multiplexed_lane($position)) { -# $path .= q[/lane] . $position; -# } - return $path; } From ec07523bca6a7b505d07be891d5f37a4523fee2d Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Wed, 18 Sep 2019 17:27:11 +0100 Subject: [PATCH 11/17] tests for markdup_method setting in product_release.yml --- t/20-function-seq_alignment.t | 56 ++++++++++++++++++- t/data/hiseq/samplesheet_16807.csv | 2 +- .../config/seq_alignment/product_release.yml | 27 ++++++--- 3 files changed, 75 insertions(+), 10 deletions(-) diff --git a/t/20-function-seq_alignment.t b/t/20-function-seq_alignment.t index 30a8f6c6b..2e1e18b61 100644 --- a/t/20-function-seq_alignment.t +++ b/t/20-function-seq_alignment.t @@ -1,6 +1,6 @@ use strict; use warnings; -use Test::More tests => 14; +use Test::More tests => 15; use Test::Exception; use Test::Deep; use Test::Warn; @@ -1250,6 +1250,7 @@ subtest 'test 12' => sub { ) } 'no error creating an object'; is ($ms_gen->id_run, 24135, 'id_run inferred correctly'); + make_path "$bc_path/archive/tileviz"; apply_all_roles($ms_gen, 'npg_pipeline::runfolder_scaffold'); $ms_gen->create_product_level(); @@ -1340,4 +1341,57 @@ subtest 'generate compositions only' => sub { } }; +subtest 'product_release_tests' => sub { + plan tests => 92; + + my %test_runs = ( + 16850 => { platform => 'miseq', runfolder_name => '150710_MS2_16850_A_MS3014507-500V2', markdup_method => 'samtools', }, + 16866 => { platform => 'miseq', runfolder_name => '150713_MS8_16866_A_MS3734403-300V2', markdup_method => 'samtools', }, + 20990 => { platform => 'miseq', runfolder_name => '161010_MS5_20990_A_MS4548606-300V2', markdup_method => 'picard', }, + 24135 => { platform => 'miseq', runfolder_name => '171020_MS5_24135_A_MS5476963-300V2', markdup_method => 'samtools', }, + 16756 => { platform => 'hiseq', runfolder_name => '150701_HS36_16756_B_C711RANXX', markdup_method => 'samtools', }, + 16803 => { platform => 'hiseq', runfolder_name => '150706_HS21_16803_B_HGLFHADXX', markdup_method => 'samtools', }, + 16807 => { platform => 'hiseq', runfolder_name => '150707_HS38_16807_A_C7U2YANXX', markdup_method => 'samtools', }, + 20268 => { platform => 'hiseq', runfolder_name => '160704_MS3_20268_A_MS4000667-300V2', markdup_method => 'biobambam', }, + 16839 => { platform => 'hiseqx', runfolder_name => '150709_HX4_16839_A_H7MHWCCXX', markdup_method => 'samtools', }, + ); + + for my $run (keys %test_runs) { + my $run_details = $test_runs{$run}; + my $runfolder_path = join q[/], $dir, $run_details->{runfolder_name}; + my $bc_path = join q[/], $runfolder_path, 'Data/Intensities/BAM_basecalls_17760704-123456/no_cal'; + `mkdir -p $bc_path`; + my $cache_dir = join q[/], $runfolder_path, "Data/Intensities/BAM_basecalls_17760704-123456/metadata_cache_$run"; + `mkdir -p $cache_dir`; + + copy("t/data/$run_details->{platform}/${run}_RunInfo.xml", "$runfolder_path/RunInfo.xml") or die 'Copy failed'; + copy("t/data/run_params/runParameters.miseq.xml", "$runfolder_path/runParameters.xml") + or die "runParameters.xml copy failed"; + + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = qq[t/data/$run_details->{platform}/samplesheet_${run}.csv]; + + my $sa_gen; + lives_ok { + $sa_gen = npg_pipeline::function::seq_alignment->new( + run_folder => $run_details->{runfolder_name}, + runfolder_path => $runfolder_path, + recalibrated_path => $bc_path, + timestamp => q{1776}, + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', + ) + } 'no error creating an object'; + is ($sa_gen->id_run, $run, 'id_run inferred correctly'); + + my $dps; + lives_ok { $dps = $sa_gen->products->{data_products} } "no error finding data products for run $run"; + for my $i (0..$#{$dps}) { + my $markdup_method = $sa_gen->markdup_method($dps->[$i]); + + is ($markdup_method, $run_details->{markdup_method}, "markdup_method for entry $i for run $run should be inferred as $markdup_method"); + } + } + +}; + 1; diff --git a/t/data/hiseq/samplesheet_16807.csv b/t/data/hiseq/samplesheet_16807.csv index b14ad0dea..a7d3925ca 100644 --- a/t/data/hiseq/samplesheet_16807.csv +++ b/t/data/hiseq/samplesheet_16807.csv @@ -1,4 +1,4 @@ [Data],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, Lane,Sample_ID,Sample_Name,GenomeFolder,Index,bait_name,default_library_type,default_tag_sequence,email_addresses,email_addresses_of_followers,email_addresses_of_managers,email_addresses_of_owners,is_control,is_pool,lane_id,lane_priority,library_name,organism,organism_taxon_id,project_cost_code,project_id,project_name,qc_state,request_id,required_insert_size_range,sample_accession_number,sample_cohort,sample_common_name,sample_consent_withdrawn,sample_description,sample_donor_id,sample_id,sample_name,sample_public_name,sample_reference_genome,sample_supplier_name,spiked_phix_tag_index,study_accession_number,study_alignments_in_bam,study_contains_nonconsented_human,study_contains_nonconsented_xahuman,study_description,study_id,study_name,study_reference_genome,study_separate_y_chromosome_data,study_title,tag_index, 6,14241974,ERS758334,,ATCACGTT,,qPCR only,ATCACGTT,ky1@sanger.ac.uk ncb@sanger.ac.uk,,ncb@sanger.ac.uk,ky1@sanger.ac.uk,0,0,,, 14241974,,9606,S4077,1769,Genome-wide CRISPR gRNA screens with Mouse v2 library 2,,,from:190 to:200,ERS758334,,Homo Sapien,,,,2318403,3081STDY6127081,HT29C3_1,Homo_sapiens (1000Genomes_hs37d5),,888,ERP005600,1,0,0,PCR products were generated using genome-wide CRISPR libraries as template DNA. Subsequently%2C PCR products are pooled and subjected to Illumina library preparation. The library will be sequenced by HiSeq2500 RapidRun. %0A%0AThis data is part of a pre-publication release. For information on the proper use of pre-publication data shared by the Wellcome Trust Sanger Institute (including details of any publication moratoria)%2C please see http%3A%2F%2Fwww.sanger.ac.uk%2Fdatasharing%2F submitted for publication. For information on the proper use of data shared by the Wellcome Trust Sanger Institute (including information on acknowledgement)%2C please see http%3A%2F%2Fwww.sanger.ac.uk%2Fdatasharing%2F,3081,ILC CRISPR gRNA library validation,Homo_sapiens (1000Genomes_hs37d5),,CRISPR gRNA library validation,1, -6,14241975,ERS758335,,CGATGTTT,,qPCR only,CGATGTTT,ky1@sanger.ac.uk ncb@sanger.ac.uk,,ncb@sanger.ac.uk,ky1@sanger.ac.uk,0,0,,, 14241975,,9606,S4077,1769,Genome-wide CRISPR gRNA screens with Mouse v2 library 2,,,from:190 to:200,ERS758335,,Homo Sapien,,,,2318404,3081STDY6127082,HT29C3_2,Homo_sapiens (1000Genomes_hs37d5),,888,ERP005600,1,0,0,PCR products were generated using genome-wide CRISPR libraries as template DNA. Subsequently%2C PCR products are pooled and subjected to Illumina library preparation. The library will be sequenced by HiSeq2500 RapidRun. %0A%0AThis data is part of a pre-publication release. For information on the proper use of pre-publication data shared by the Wellcome Trust Sanger Institute (including details of any publication moratoria)%2C please see http%3A%2F%2Fwww.sanger.ac.uk%2Fdatasharing%2F submitted for publication. For information on the proper use of data shared by the Wellcome Trust Sanger Institute (including information on acknowledgement)%2C please see http%3A%2F%2Fwww.sanger.ac.uk%2Fdatasharing%2F,3081,ILC CRISPR gRNA library validation,Homo_sapiens (1000Genomes_hs37d5),,CRISPR gRNA library validation,2, +6,14241975,ERS758335,,CGATGTTT,,qPCR only,CGATGTTT,ky1@sanger.ac.uk ncb@sanger.ac.uk,,ncb@sanger.ac.uk,ky1@sanger.ac.uk,0,0,,, 14241975,,9606,S4077,1769,Genome-wide CRISPR gRNA screens with Mouse v2 library 2,,,from:190 to:200,ERS758335,,Homo Sapien,,,,2318404,3081STDY6127082,HT29C3_2,Homo_sapiens (GRCh38_full_analysis_set_plus_decoy_hla),,888,ERP005600,1,0,0,PCR products were generated using genome-wide CRISPR libraries as template DNA. Subsequently%2C PCR products are pooled and subjected to Illumina library preparation. The library will be sequenced by HiSeq2500 RapidRun. %0A%0AThis data is part of a pre-publication release. For information on the proper use of pre-publication data shared by the Wellcome Trust Sanger Institute (including details of any publication moratoria)%2C please see http%3A%2F%2Fwww.sanger.ac.uk%2Fdatasharing%2F submitted for publication. For information on the proper use of data shared by the Wellcome Trust Sanger Institute (including information on acknowledgement)%2C please see http%3A%2F%2Fwww.sanger.ac.uk%2Fdatasharing%2F,3081,ILC CRISPR gRNA library validation,Homo_sapiens (1000Genomes_hs37d5),,CRISPR gRNA library validation,2, diff --git a/t/data/release/config/seq_alignment/product_release.yml b/t/data/release/config/seq_alignment/product_release.yml index 50044cad4..c3123054a 100644 --- a/t/data/release/config/seq_alignment/product_release.yml +++ b/t/data/release/config/seq_alignment/product_release.yml @@ -1,12 +1,23 @@ --- default: - bqsr: - enable: true - known-sites: - - dbsnp_138.hg38 - - Mills_and_1000G_gold_standard.indels.hg38 + markdup_method: "samtools" study: - - study_id: "1713" - data_deletion: - staging_deletion_delay: 10 + - study_id: "3573" + markdup_method: "samtools" + - study_id: "3710" + markdup_method: "samtools" + - study_id: "4416" + markdup_method: "picard" + - study_id: "4825" + markdup_method: "samtools" + - study_id: "3188" + markdup_method: "samtools" + - study_id: "2846" + markdup_method: "samtools" + - study_id: "3081" + markdup_method: "samtools" + - study_id: "3948" + markdup_method: "biobambam" + - study_id: "3597" + markdup_method: "samtools" From 2de4e6ec3164ca2e4d2908245e5002e866b38342 Mon Sep 17 00:00:00 2001 From: "Martin O. Pollard" Date: Mon, 9 Sep 2019 11:24:08 +0100 Subject: [PATCH 12/17] Add necessary code for BWA MEM 2 --- lib/npg_pipeline/function/seq_alignment.pm | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index e69d22b34..28e8c1920 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -479,15 +479,17 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) } } else { + # Parse the reference genome for the product my ($organism, $strain, $tversion, $analysis) = npg_tracking::data::reference->new(($self->repository ? (q(repository)=>$self->repository) : ()))->parse_reference_genome($l->reference_genome); - $p4_param_vals->{bwa_executable} = q[bwa0_6]; + # if a non-standard aligner is specified in ref string select it $p4_param_vals->{alignment_method} = ($analysis || $bwa); my %methods_to_aligners = ( bwa_aln => q[bwa0_6], bwa_aln_se => q[bwa0_6], bwa_mem => q[bwa0_6], + bwa_mem2 => q[bwa0_6], ); my %ref_suffix = ( picard => q{.dict}, @@ -499,6 +501,13 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) $aligner = $methods_to_aligners{$aligner}; } + # BWA MEM2 requires a different executable + if ($p4_param_vals->{alignment_method} eq q[bwa_mem2]) { + $p4_param_vals->{bwa_executable} = q[bwa-mem2]; + } else { + $p4_param_vals->{bwa_executable} = q[bwa0_6]; + } + if($do_target_alignment) { $p4_param_vals->{alignment_reference_genome} = $self->_ref($dp, $aligner); } if(exists $ref_suffix{$aligner}) { $p4_param_vals->{alignment_reference_genome} .= $ref_suffix{$aligner}; From 067c2e3554f5d80ec4533931254c7b1904dc2df2 Mon Sep 17 00:00:00 2001 From: "Martin O. Pollard" Date: Wed, 9 Oct 2019 14:34:17 +0100 Subject: [PATCH 13/17] Add test and improve names of seq_alignment tests --- Changes | 2 + MANIFEST | 1 + t/20-function-seq_alignment.t | 61 +++++++++++++++++---- t/data/miseq/samplesheet_24135_bwa_mem2.csv | 4 ++ 4 files changed, 57 insertions(+), 11 deletions(-) create mode 100644 t/data/miseq/samplesheet_24135_bwa_mem2.csv diff --git a/Changes b/Changes index 416d2d0de..9994b0ce0 100644 --- a/Changes +++ b/Changes @@ -1,6 +1,8 @@ LIST OF CHANGES --------------- + - add BWA MEM2 support to sequence_align function + release 57.4.0 - change genotype qc check to cram input - LSF array indexes fix for jobs dealign with chunked data diff --git a/MANIFEST b/MANIFEST index b10e784dc..6cdfe3f09 100644 --- a/MANIFEST +++ b/MANIFEST @@ -248,6 +248,7 @@ t/data/miseq/samplesheet_16850.csv t/data/miseq/samplesheet_16866.csv t/data/miseq/samplesheet_20990.csv t/data/miseq/samplesheet_24135.csv +t/data/miseq/samplesheet_24135_bwa_mem2.csv t/data/novaseq/180709_A00538_0010_BH3FCMDRXX/Data/Intensities/BAM_basecalls_20180805-013153/metadata_cache_26291/samplesheet_26291.csv t/data/novaseq/180709_A00538_0010_BH3FCMDRXX/Data/Intensities/BAM_basecalls_20180805-013153/metadata_cache_26291/samplesheet_26291_updated.csv t/data/novaseq/180709_A00538_0010_BH3FCMDRXX/Data/Intensities/BAM_basecalls_20180805-013153/no_cal/26291#0_p4s2_pv_in.json diff --git a/t/20-function-seq_alignment.t b/t/20-function-seq_alignment.t index a2f08a251..01714800f 100644 --- a/t/20-function-seq_alignment.t +++ b/t/20-function-seq_alignment.t @@ -1,6 +1,6 @@ use strict; use warnings; -use Test::More tests => 14; +use Test::More tests => 15; use Test::Exception; use Test::Deep; use Test::Warn; @@ -156,7 +156,7 @@ sub _find { return $d; } -subtest 'test 1' => sub { +subtest 'basic functionality' => sub { plan tests => 33; local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/rna_seq/samplesheet_12597.csv]; @@ -295,7 +295,7 @@ subtest 'test 1' => sub { is ($d->command, $command, 'correct non-multiplex lane args generated'); }; -subtest 'test 2' => sub { +subtest 'RNASeq analysis' => sub { plan tests => 22; ##RNASeq library 13066_8 library_type = Illumina cDNA protocol @@ -628,7 +628,7 @@ subtest 'test 4' => sub { is ($d->command, $command, 'command for HiSeqX run 16839 lane 7 tag 0'); }; -subtest 'test 5' => sub { +subtest 'Newer flowcell' => sub { plan tests => 5; ##HiSeq, run 16807_6 (newer flowcell) @@ -702,7 +702,7 @@ subtest 'test 5' => sub { is ($d->command, $command, 'command for HiSeq run 16807 lane 6 tag 0'); }; -subtest 'test 6' => sub { +subtest 'MiSeq WES baits' => sub { plan tests => 7; ##MiSeq, run 20268_1 (newer flowcell) - WITH bait added to samplesheet for lane 1 @@ -832,7 +832,7 @@ subtest 'test 6' => sub { }; -subtest 'test 7' => sub { +subtest 'cycle count over threshold' => sub { plan tests => 5; ##MiSeq, run 16850_1 (cycle count over threshold (currently >= 101)) @@ -904,7 +904,7 @@ subtest 'test 7' => sub { is ($d->command, $command, 'command for MiSeq run 16850 lane 1 tag 0'); }; -subtest 'test 8' => sub { +subtest 'nonconsented human split, no target alignment' => sub { plan tests => 5; ##MiSeq, run 16756_1 (nonconsented human split, no target alignment) @@ -980,7 +980,7 @@ subtest 'test 8' => sub { is ($d->command, $command, 'command for run 16756 lane 1 tag 0'); }; -subtest 'test 9' => sub { +subtest 'nonconsented human split, target alignment' => sub { plan tests => 5; ##MiSeq, run 16866_1 (nonconsented human split, target alignment) @@ -1056,7 +1056,7 @@ subtest 'test 9' => sub { is ($d->command, $command, 'command for run 16866 lane 1 tag 0'); }; -subtest 'test 10' => sub { +subtest 'no target alignment, no human split' => sub { plan tests => 5; ##MiSeq, run 20990_1 (no target alignment, no human split) @@ -1117,7 +1117,7 @@ subtest 'test 10' => sub { is ($d->command, $command, 'command for run 20990 lane 1 tag 2'); }; -subtest 'test 11' => sub { +subtest 'chromium' => sub { plan tests => 5; ##HiSeqX, run 16839_1 @@ -1203,7 +1203,7 @@ subtest 'test 11' => sub { is ($d->command, $command, 'command for run 16839 lane 1 tag 0'); }; -subtest 'test 12' => sub { +subtest 'miseq' => sub { plan tests => 11; my $runfolder = q{171020_MS5_24135_A_MS5476963-300V2}; @@ -1320,4 +1320,43 @@ subtest 'generate compositions only' => sub { } }; +subtest 'BWA MEM 2 test' => sub { + plan tests => 4; + + my $runfolder = q{171020_MS5_24135_A_MS5476963-300V2}; + my $runfolder_path = join q[/], $dir, 'compositions', $runfolder; + my $bc_path = join q[/], $runfolder_path, 'Data/Intensities/BAM_basecalls_20171127-134427/no_cal'; + make_path $bc_path; + my $cache_dir = join q[/], $runfolder_path, 'Data/Intensities/BAM_basecalls_20171127-134427/metadata_cache_24135'; + make_path $cache_dir; + make_path "$bc_path/lane1"; + make_path "$bc_path/archive/tileviz"; + + copy('t/data/miseq/24135_RunInfo.xml', "$runfolder_path/RunInfo.xml") or die 'Copy failed'; + copy('t/data/run_params/runParameters.miseq.xml', "$runfolder_path/runParameters.xml") + or die 'Copy failed'; + + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = q[t/data/miseq/samplesheet_24135_bwa_mem2.csv]; + + my $ms_gen = npg_pipeline::function::seq_alignment->new( + run_folder => $runfolder, + runfolder_path => $runfolder_path, + recalibrated_path => $bc_path, + timestamp => q{2017}, + repository => $dir + ); + apply_all_roles($ms_gen, 'npg_pipeline::runfolder_scaffold'); + $ms_gen->create_product_level(); + + my $da = $ms_gen->generate('analysis_pipeline'); + ok (($da and (@{$da} == 3)), 'three definitions returned'); + my $d = _find($da, 1, 1); + isa_ok ($d, 'npg_pipeline::function::definition'); + ok (!$d->excluded, 'step not excluded'); + + my $l = st::api::lims->new(id_run => 24135, position => 1, tag_index => 2); + my $analysis = $ms_gen->_analysis($l->reference_genome, '24135:1:2'); + ok ($analysis eq "bwa_mem2", 'run 24135 lane 1 tag 2 Analysis is BWA MEM 2'); +}; + 1; diff --git a/t/data/miseq/samplesheet_24135_bwa_mem2.csv b/t/data/miseq/samplesheet_24135_bwa_mem2.csv new file mode 100644 index 000000000..688edcba7 --- /dev/null +++ b/t/data/miseq/samplesheet_24135_bwa_mem2.csv @@ -0,0 +1,4 @@ +[Data],,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, +Sample_ID,Sample_Name,GenomeFolder,Index,Index2,bait_name,default_library_type,default_tag_sequence,default_tagtwo_sequence,email_addresses,email_addresses_of_followers,email_addresses_of_managers,email_addresses_of_owners,is_control,is_pool,lane_id,lane_priority,library_name,organism,organism_taxon_id,project_cost_code,project_id,project_name,purpose,qc_state,request_id,required_insert_size_range,sample_accession_number,sample_cohort,sample_common_name,sample_consent_withdrawn,sample_description,sample_donor_id,sample_id,sample_name,sample_public_name,sample_reference_genome,sample_supplier_name,spiked_phix_tag_index,study_accession_number,study_alignments_in_bam,study_contains_nonconsented_human,study_contains_nonconsented_xahuman,study_description,study_id,study_name,study_reference_genome,study_separate_y_chromosome_data,study_title,tag_index,gbs_plex_name, +20314922,mxPCR7155682,,ATCACGTT,AGGCGAAG,,GbS standard,ATCACGTT,AGGCGAAG,blah@sanger.ac.uk blah2@sanger.ac.uk,blah@sanger.ac.uk,blah2@sanger.ac.uk,blah2@sanger.ac.uk,0,0,20315681,0,20314922,,,S0910,,,standard,,,from:100 to:1000,,,,0,,mxPCR7155682,3373451,mxPCR7155682,,,HAPMAP5265538,,EGAS00001002541,1,0,0,This data is part of a pre-publication release. For information on the proper use of pre-publication data shared by the Wellcome Trust Sanger Institute (including details of any publication moratoria)%2C please see http%3A%2F%2Fwww.sanger.ac.uk%2Fdatasharing%2F%0D%0A,4825,Multiplex PCR R%26D for Agena%2FFluidigm replacement,Homo_sapiens (GRCh38_15),0,Flexible multiplex PCR for genotyping-by-sequencing,1,Hs_MajorQC, +20314923,mxPCR7155683,,CGATGTTT,AGGCGAAG [bwa_mem2],,GnT MDA,CGATGTTT,AGGCGAAG,blah@sanger.ac.uk blah2@sanger.ac.uk,blah@sanger.ac.uk,blah2@sanger.ac.uk,blah2@sanger.ac.uk,0,0,20315681,0,20314923,,,S0910,,,standard,,,from:100 to:1000,,,,0,,mxPCR7155683,3373452,mxPCR7155683,,,HAPMAP5265539,,EGAS00001002541,1,0,0,This data is part of a pre-publication release. For information on the proper use of pre-publication data shared by the Wellcome Trust Sanger Institute (including details of any publication moratoria)%2C please see http%3A%2F%2Fwww.sanger.ac.uk%2Fdatasharing%2F%0D%0A,4825,Multiplex PCR R%26D for Agena%2FFluidigm replacement,Homo_sapiens (GRCh38_15) [bwa_mem2],0,Flexible multiplex PCR for genotyping-by-sequencing,2,Hs_MajorQC, From 5a5b4ea36892e42b09a18a75494653401450a4b0 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Tue, 22 Oct 2019 11:07:24 +0100 Subject: [PATCH 14/17] fix broken test file --- t/20-function-seq_alignment.t | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/t/20-function-seq_alignment.t b/t/20-function-seq_alignment.t index 485cba12e..d1fa58b93 100644 --- a/t/20-function-seq_alignment.t +++ b/t/20-function-seq_alignment.t @@ -1391,6 +1391,7 @@ subtest 'product_release_tests' => sub { is ($markdup_method, $run_details->{markdup_method}, "markdup_method for entry $i for run $run should be inferred as $markdup_method"); } } +}; subtest 'BWA MEM 2 test' => sub { plan tests => 4; @@ -1415,7 +1416,8 @@ subtest 'BWA MEM 2 test' => sub { runfolder_path => $runfolder_path, recalibrated_path => $bc_path, timestamp => q{2017}, - repository => $dir + repository => $dir, + conf_path => 't/data/release/config/seq_alignment', ); apply_all_roles($ms_gen, 'npg_pipeline::runfolder_scaffold'); $ms_gen->create_product_level(); From 2e66125e22c5c41ab5b78f4732b14894651c8247 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Tue, 22 Oct 2019 13:25:48 +0100 Subject: [PATCH 15/17] correct typo in log message --- lib/npg_pipeline/function/seq_alignment.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index bbcd7b68a..6cef7b93e 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -346,7 +346,7 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) $p4_param_vals->{js_dir} = $self->_js_scripts_dir; } else { - $self->info(q[bwakit postal processing specified, but no alternate haplotypes in reference]); + $self->info(q[bwakit postalt processing specified, but no alternate haplotypes in reference]); } } From 0636e31a9327f9b1fb8ff0de7e48ad676320e289 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Wed, 23 Oct 2019 11:58:01 +0100 Subject: [PATCH 16/17] add -f to rm command when removing intermediate cram files to avoid error and job failure when no intermediate crams are present --- lib/npg_pipeline/function/remove_intermediate_data.pm | 2 +- t/20-function-remove_intermediate_data.t | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/npg_pipeline/function/remove_intermediate_data.pm b/lib/npg_pipeline/function/remove_intermediate_data.pm index 92492dd0c..136a47b03 100644 --- a/lib/npg_pipeline/function/remove_intermediate_data.pm +++ b/lib/npg_pipeline/function/remove_intermediate_data.pm @@ -51,7 +51,7 @@ sub create { my $recal_path = $self->recalibrated_path; if(not $recal_path) { $self->logcroak('unable to determine recalibrated path for intermediate data deletion'); } - my $command = sprintf q[rm -v %s/*.cram], $recal_path; + my $command = sprintf q[rm -fv %s/*.cram], $recal_path; return [ npg_pipeline::function::definition->new( diff --git a/t/20-function-remove_intermediate_data.t b/t/20-function-remove_intermediate_data.t index 20b07af73..c1b1a670a 100644 --- a/t/20-function-remove_intermediate_data.t +++ b/t/20-function-remove_intermediate_data.t @@ -11,7 +11,7 @@ my $o=npg_pipeline::function::remove_intermediate_data->new( ); my $defs = $o->create; -my $expected_command = q[rm -v /a/recalibrated/path/no_cal/*.cram]; +my $expected_command = q[rm -fv /a/recalibrated/path/no_cal/*.cram]; ok ($defs && @{$defs} == 1, 'array of 1 definition is returned'); my $def = $defs->[0]; From a0a815d36541206d249a00f9d433048c166718e0 Mon Sep 17 00:00:00 2001 From: Kevin Lewis Date: Wed, 30 Oct 2019 09:48:53 +0000 Subject: [PATCH 17/17] extend spike_tag splice to "picard" markdup_method re-instate the removal of markdup from spike tag processing --- lib/npg_pipeline/function/seq_alignment.pm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/npg_pipeline/function/seq_alignment.pm b/lib/npg_pipeline/function/seq_alignment.pm index 6cef7b93e..d10b04368 100644 --- a/lib/npg_pipeline/function/seq_alignment.pm +++ b/lib/npg_pipeline/function/seq_alignment.pm @@ -452,11 +452,11 @@ sub _alignment_command { ## no critic (Subroutines::ProhibitExcessComplexity) } else { push @{$p4_ops->{prune}}, 'foptgt.*samtools_stats_F0.*00_bait.*-'; # confirm hyphen - if($markdup_method eq q[samtools]) { - push @{$p4_ops->{splice}}, 'ssfqc_tee_ssfqc:straight_through1:-foptgt_000_fixmate:', 'foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk'; # the fixmate node only works for mardkup_method samtools (pending p4 node id uniqueness bug fix) + if($markdup_method eq q[samtools] or $markdup_method eq q[picard]) { + push @{$p4_ops->{splice}}, 'ssfqc_tee_ssfqc:straight_through1:-foptgt_000_fixmate:', 'foptgt_000_markdup', 'foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk'; # the fixmate node only works for mardkup_method samtools (pending p4 node id uniqueness bug fix) } else { - push @{$p4_ops->{splice}}, 'ssfqc_tee_ssfqc:straight_through1:-foptgt_000_bamsort_coord:', 'foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk'; + push @{$p4_ops->{splice}}, 'ssfqc_tee_ssfqc:straight_through1:-foptgt_000_bamsort_coord:', 'foptgt_000_bammarkduplicates', 'foptgt_seqchksum_file:-scs_cmp_seqchksum:outputchk'; } }