diff --git a/bin/npg_samplesheet_generator_NovaSeqXSeries b/bin/npg_samplesheet_generator_NovaSeqXSeries index b96df3b3..3bf9ee5f 100755 --- a/bin/npg_samplesheet_generator_NovaSeqXSeries +++ b/bin/npg_samplesheet_generator_NovaSeqXSeries @@ -27,26 +27,46 @@ npg_samplesheet_generator_NovaSeqXSeries C - displays help message and exists - C - variant calling mode, defaults to C, other valid options - C and C C - LIMS batch identifier, optional C - NPG run ID, optional; if supplied, the record for this run should exists in the run tracking database + + C - + an optional list of read lengths, forward first, reverse + second; currently both default to 151 + + C - + an optional list of index read lengths for the first and, + optionally, second index read; if not given, computed from + the length of barcodes recorded in LIMS + C - a boolean option, false by default; if set, the DRAGEN germline analysis section is added to the file if suitable samples are present - C - an option to keep FASTQ files for aligned data, false - default - C - DRAGEN analysis can deal with a limited - number of distinct configurations. Set this attribute if - processing not on-board. + + C - variant calling mode, defaults to C, other valid options + C and C + + C - + an option to keep FASTQ files for aligned data, false by + default + + C - + DRAGEN analysis can deal with a limited number of distinct + configurations. Set this attribute if processing off-board. + + C - + DRAGEN software version to use, optional + + C - + name of the samplesheet CSV file, optional =head1 DESCRIPTION -Samplesheet generation to initiate DRAGEN analysis of data sequenced -on the NovaSeq Series X Illumina instrument. +Generates a samplesheet for the NovaSeq Series X Illumina instrument and +DRAGEN analysis. =head1 EXIT STATUS @@ -56,37 +76,13 @@ on the NovaSeq Series X Illumina instrument. =head1 CONFIGURATION -Access to the ml warehouse database is required. +Access to both ml warehouse and npg tracking database is required. =head1 DEPENDENCIES =over -=item strict - -=item warnings - -=item Carp - -=item Text::CSV - -=item Readonly - -=item List::MoreUtils - -=item List::Util - -=item Getopt::Long - -=item Pod::Usage - -=item DateTime - -=item Data::UUID - -=item st::api::lims - -=item npg_tracking::Schema +=item npg::samplesheet::novaseq_xseries =back diff --git a/lib/npg/samplesheet/novaseq_xseries.pm b/lib/npg/samplesheet/novaseq_xseries.pm index abed8cf5..27fb6853 100755 --- a/lib/npg/samplesheet/novaseq_xseries.pm +++ b/lib/npg/samplesheet/novaseq_xseries.pm @@ -8,7 +8,6 @@ use Carp; use Text::CSV; use List::MoreUtils qw(any none uniq); use List::Util qw(first max); -use Getopt::Long; use Pod::Usage; use DateTime; use Data::UUID; @@ -87,7 +86,7 @@ L ( 'run should exists in the run tracking database', ); + =head2 batch_id LIMS batch ID, an optional attribute. If not set, the C attribute @@ -158,9 +158,60 @@ sub _build_batch_id { return $batch_id; } + +=head2 index_read_length + +An array containing the length of the first and the second (if applicable) +indexing read. + +If not set, is computed as the longest first and second barcode as reported +by the LIMS system. + +=cut + +has 'index_read_length' => ( + 'isa' => 'ArrayRef', + 'is' => 'ro', + 'lazy_build' => 1, + 'required' => 0, + 'documentation' => 'An array containing the length of the first and the ' . + 'second (if applicable) indexing read..', +); +sub _build_index_read_length { + my $self = shift; + my $index1_length = max ( + map { length $_->[$LIST_INDEX_TAG1] } $self->products + ); + my $index2_length = max ( + map { length $_->[$LIST_INDEX_TAG2] } $self->products + ); + return [$index1_length, $index2_length]; +} + + +=head2 read_length + +An array containing the length of the forward and the reverse read. +If not set, is currently hardcoded as [151, 151]. + +=cut + +has 'read_length' => ( + 'isa' => 'ArrayRef', + 'is' => 'ro', + 'lazy_build' => 1, + 'required' => 0, + 'documentation' => 'An array containing the length of the forward and ' . + 'reverse read', +); +sub _build_read_length { + return [$READ1_LENGTH, $READ2_LENGTH]; +} + + =head2 align -A boolean option, false by default; if set, the DRAGEN germline iand/or +A boolean option, false by default; if set, the DRAGEN germline and/or RNA analysis is added to the samplesheet if suitable samples are present. =cut @@ -174,6 +225,7 @@ has 'align' => ( 'suitable samples are present.', ); + =head2 keep_fastq A boolean option to keep FASTQ files for aligned data, false by default. @@ -188,6 +240,7 @@ has 'keep_fastq' => ( 'by default.', ); + =head2 varcall Variant calling mode, defaults to C, other valid options are @@ -204,6 +257,7 @@ has 'varcall' => ( 'options are SmallVariantCaller and AllVariantCallers', ); + =head2 dragen_max_number_of_configs =cut @@ -218,12 +272,6 @@ has 'dragen_max_number_of_configs' => ( 'processing not on-board', ); -has '_current_number_of_configs' => ( - 'isa' => 'Int', - 'is' => 'rw', - 'default' => 0, - 'required' => 0, -); =head2 npg_tracking_schema @@ -241,6 +289,7 @@ sub _build_npg_tracking_schema { return npg_tracking::Schema->connect(); } + =head2 mlwh_schema DBIx Schema object for the mlwh database. @@ -257,6 +306,7 @@ sub _build_mlwh_schema { return WTSI::DNAP::Warehouse::Schema->connect(); } + =head2 run DBIx object for a row in the run table of the tracking database. @@ -287,6 +337,7 @@ sub _build_run { return $run; } + =head2 lims An attribute, an array of C type objects. @@ -311,6 +362,7 @@ sub _build_lims { )->children()]; } + =head2 run_name =cut @@ -328,7 +380,8 @@ sub _build_run_name { my $run_name; if ($self->has_id_run()) { # Embed instrument's Sanger network name and slot - $run_name = sprintf '%s_%s_%s', $self->id_run, $self->run->instrument->name, $self->get_instrument_side; + $run_name = sprintf '%s_%s_%s', $self->id_run, + $self->run->instrument->name, $self->get_instrument_side; } else { # Run is not tracked, generate a placeholder ID my $ug = Data::UUID->new(); @@ -340,6 +393,7 @@ sub _build_run_name { return $run_name; } + =head2 file_name =cut @@ -369,18 +423,12 @@ sub _build_file_name { return $file_name; } -has '_all_lines' => ( - 'isa' => 'ArrayRef', - 'is' => 'ro', - 'default' => sub { return []; } , - 'required' => 0, -); -sub _add_line { - my ($self, @columns) = @_; - push @{$self->_all_lines()}, @columns ? \@columns : []; - return; -} +=head2 products + +A list of products as given by LIMS, a read-only accessor. + +=cut has '_products' => ( 'isa' => 'ArrayRef', @@ -418,22 +466,6 @@ sub _build__products { return \@products; } -has '_index_reads_length' => ( - 'isa' => 'ArrayRef', - 'is' => 'ro', - 'lazy_build' => 1, - 'required' => 0, -); -sub _build__index_reads_length { - my $self = shift; - my $index1_length = max ( - map { length $_->[$LIST_INDEX_TAG1] } $self->products - ); - my $index2_length = max ( - map { length $_->[$LIST_INDEX_TAG2] } $self->products - ); - return [$index1_length, $index2_length]; -} =head2 process @@ -504,6 +536,41 @@ sub process { return; } +=head2 add_common_headers + +Adds the top-level header section. + +=cut + +sub add_common_headers { + my $self = shift; + + my ($index1_length, $index2_length) = @{$self->index_read_length()}; + $self->_add_line('[Header]'); + $self->_add_line(q[FileFormatVersion], 2); + $self->_add_line(q[RunName], $self->run_name); + $self->_add_line(qw(InstrumentPlatform NovaSeqXSeries)); + $self->_add_line(qw(InstrumentType NovaSeqXPlus)); + $self->_add_line(); + + # Reads section + $self->_add_line('[Reads]'); + $self->_add_line(q[Read1Cycles], $self->read_length()->[0]); + $self->_add_line(q[Read2Cycles], $self->read_length()->[1]); + if ($index1_length) { + $self->_add_line('Index1Cycles', $index1_length); + if ($index2_length) { + $self->_add_line('Index2Cycles', $index2_length); + } + } + + $self->_add_line(); + $self->_add_line('[Sequencing_Settings]'); + + return; +} + + =head2 add_bclconvert_section Adds BCLConvert_Settings and BCLConvert_Data sections. @@ -513,49 +580,33 @@ has not been added. The latter happens if the number of unique configurations for BCLConvert exceeds the maximum number of allowed configurations. In this case no further analysis sections should be added to the samplesheet. +The OverrideCycles column specifies the sequencing and indexing cycles to be +used when processing the sequencing data. Must adhere to the following +requirements: + +- Must be same number of fields (delimited by semicolon) as sequencing and + indexing reads specified in RunInfo.xml or 'Reads' section. + +- Indexing reads are specified with 'I', + sequencing reads are specified with 'Y', + UMI cycles are specified with 'U', + and trimmed reads are specified with 'N'. + +- The number of cycles specified for each read must equal the number of cycles + specified for that read in the RunInfo.xml file. + +- Only one 'Y' or 'I' sequence can be specified per read. + =cut sub add_bclconvert_section { my $self = shift; - my ($index1_length, $index2_length) = @{$self->_index_reads_length()}; + my ($index1_length, $index2_length) = @{$self->index_read_length()}; my @lines = (); push @lines, ['[BCLConvert_Settings]']; push @lines, [q[SoftwareVersion], $self->dragen_software_version]; - - # Not clear what CLI analysis option thie corresponds to. - # Looks likely to be a list of lanes to run a tag collision check. - # According to @srl, bcl-covert tries to correct one error by default - # but it checks the tags allow this, i.e. that they all differ by at least - # 3 bases, if they don't it disables the error correction - # $add_line->(qw(CombinedIndexCollisionCheck 1;3;4;6)); - - # CreateFastqForIndexReads might be an option. Do we need these files? - # $add_line->(qw(CreateFastqForIndexReads 1)); - # When 1 will be appropriate for this trim? - # $add_line->(qw(TrimUMI 0)); - # dragen is the other compression options push @lines, [qw(FastqCompressionFormat gzip)]; - - # Barcode mismatch tolerances, the default is 1. - # These settings can be omitted. - #if ($index1_length) { - # $add_line->(qw(BarcodeMismatchesIndex1 1)); - # if ($index2_length) { - # $add_line->(qw(BarcodeMismatchesIndex2 1)); - # } - #} - - # Adapter trimming settings. The sequence of the Read 1 (or 2) adapter - # to be masked or trimmed. To trim multiple adapters, separate the sequences - # with a plus sign (+) indicating independent adapters that must be - # independently assessed for masking or trimming for each read. - # Characters must be A, C, G, or T. - # It seems that this settign can also be a column in teh data section - # - # $add_line->(qw(AdapterRead1 SOME)); - # $add_line->(qw(AdapterRead2 OTHER)); - push @lines, []; push @lines, ['[BCLConvert_Data]']; @@ -569,17 +620,6 @@ sub add_bclconvert_section { } push @lines, \@data_header; - # Override Cycles - Specifies the sequencing and indexing cycles to be used - # when processing the sequencing data. Must adhere to the - # following requirements: - # - Must be same number of fields (delimited by semicolon) as sequencing and - # indexing reads specified in RunInfo.xml or Reads section. - # - Indexing reads are specified with I, sequencing reads are specified with - # Y, UMI cycles are specified with U, and trimmed reads are specified with N. - # - The number of cycles specified for each read must equal the number of - # cycles specified for that read in the RunInfo.xml file. - # - Only one Y or I sequence can be specified per read. - my $index_override = sub { my ($max_length, $barcode) = @_; my $i_cycles_number = length $barcode; @@ -604,7 +644,7 @@ sub add_bclconvert_section { if ($index1_length) { my $i7 = $product->[$LIST_INDEX_TAG1]; push @product_data, $i7; - push @override_cycles, q[Y] . $READ1_LENGTH; + push @override_cycles, q[Y] . $self->read_length()->[0]; push @override_cycles, $index_override->($index1_length, $i7); if ($index2_length) { @@ -613,7 +653,7 @@ sub add_bclconvert_section { push @override_cycles, $index_override->($index2_length, $i5); } - push @override_cycles, q[Y] . $READ2_LENGTH; + push @override_cycles, q[Y] . $self->read_length()->[1]; } my $override_cycles_string = join q[;], @override_cycles; # Might be an empty string ... @@ -637,43 +677,108 @@ sub add_bclconvert_section { return scalar @lines; } -=head2 add_common_headers -Adds the top-level header section. +=head2 add_germline_section + +Conditionally adds the DragenGermline_Settings and DragenGermline_Data +sections. =cut -sub add_common_headers { +sub add_germline_section { my $self = shift; - my ($index1_length, $index2_length) = @{$self->_index_reads_length()}; - $self->_add_line('[Header]'); - $self->_add_line(q[FileFormatVersion], 2); - $self->_add_line(q[RunName], $self->run_name); - $self->_add_line(qw(InstrumentPlatform NovaSeqXSeries)); - # NovaSeqxPlus or NovaSeqX. - # If the run id is given, this should come from the tracking database - # when we fix the type there. - $self->_add_line(qw(InstrumentType NovaSeqXPlus)); - $self->_add_line(); + if (none { $self->varcall eq $_ } @VAR_CALL_MODES) { + croak 'Uknown mode for variang calling - ' . $self->varcall; + } - # Reads section - $self->_add_line('[Reads]'); - $self->_add_line(q[Read1Cycles], $READ1_LENGTH); - $self->_add_line(q[Read2Cycles], $READ2_LENGTH); - if ($index1_length) { - $self->_add_line('Index1Cycles', $index1_length); - if ($index2_length) { - $self->_add_line('Index2Cycles', $index2_length); + my @to_align = (); + my @ref_matches = keys %REFERENCE_MAPING; + my $distinct_configs = {}; + + foreach my $p ( $self->products() ) { + + my $r = $p->[$LIST_INDEX_REF]; + my $lib_type = $p->[$LIST_INDEX_LIBTYPE]; + + if ( can_do_alignment($r) && !(do_ref_rna_alignment_test($r) || + do_libtype_rna_alignment_test($lib_type) || + do_libtype_tenx_test($lib_type)) ) { + + my $match = first { $r =~ /$_/xms} @ref_matches; + if ($self->_can_add_sample($distinct_configs, $match)) { + # TODO Are all variant calling modes compatible with all + # references? + push @to_align, [$p->[1], $REFERENCE_MAPING{$match}, $self->varcall]; + } } } - $self->_add_line(); - $self->_add_line('[Sequencing_Settings]'); + if (@to_align) { - return; + $self->_add_line('[DragenGermline_Settings]'); + $self->_add_line(q[SoftwareVersion], $self->dragen_software_version); + $self->_add_line(qw(MapAlignOutFormat cram)); + # Accepted values are true or false. Not clear whether this can be + # set per sample. + $self->_add_line(q(KeepFastq), $self->keep_fastq ? 'TRUE' : 'FALSE'); + $self->_add_line(); + + $self->_add_line(qw([DragenGermline_Data])); + $self->_add_line(qw(Sample_ID ReferenceGenomeDir VariantCallingMode)); + $self->_add_samples(@to_align); + } + + return scalar @to_align; } + +=head2 add_rna_section + +Conditionally adds the DragenRNA_Settings and DragenRNA_Data sections. + +=cut + +sub add_rna_section { + my $self = shift; + + my @to_align = (); + my @ref_matches = keys %REFERENCE_MAPING; + my $distinct_configs = {}; + + foreach my $p ( $self->products() ) { + + my $r = $p->[$LIST_INDEX_REF]; + my $lib_type = $p->[$LIST_INDEX_LIBTYPE]; + + if ( can_do_alignment($r) && ( + do_ref_rna_alignment_test($r) || + do_libtype_rna_alignment_test($lib_type)) ) { + + my $match = first { $r =~ /$_/xms} @ref_matches; + if ($self->_can_add_sample($distinct_configs, $match)) { + push @to_align, [$p->[1], $REFERENCE_MAPING{$match}]; + } + } + } + + if (@to_align) { + $self->_add_line('[DragenRNA_Settings]'); + $self->_add_line(q(SoftwareVersion), $self->dragen_software_version); + $self->_add_line(qw(MapAlignOutFormat cram)); + $self->_add_line(q(KeepFastq), $self->keep_fastq ? 'TRUE' : 'FALSE'); + $self->_add_line(qw(RnaPipelineMode FullPipeline)); + $self->_add_line(); + + $self->_add_line('[DragenRNA_Data]'); + $self->_add_line(qw(Sample_ID ReferenceGenomeDir)); + $self->_add_samples(@to_align); + } + + return scalar @to_align; +} + + =head2 can_do_alignment =cut @@ -683,6 +788,7 @@ sub can_do_alignment { return $r && !($r =~ /Not suitable/xmsi); } + =head2 do_ref_rna_alignment_test =cut @@ -692,6 +798,7 @@ sub do_ref_rna_alignment_test { return any { $r =~ /$_/xmsi } @RNA_ANALYSES_REFS; } + =head2 do_libtype_rna_alignment_test =cut @@ -701,6 +808,7 @@ sub do_libtype_rna_alignment_test { return any { $lt =~ /$_/xmsi } @RNA_ANALYSES_LIB_TYPES; } + =head2 do_libtype_tenx_test =cut @@ -710,6 +818,7 @@ sub do_libtype_tenx_test { return any { $lt =~ /$_/xmsi } @TENX_ANALYSES_LIB_TYPES; } + =head2 get_instrument_side Consult run tags to determine which slot/side of the instrument this run is @@ -727,6 +836,32 @@ sub get_instrument_side { return $side; } +################################################################## +# Private attributes and methods # +################################################################## + +# A writable counter. +has '_current_number_of_configs' => ( + 'isa' => 'Int', + 'is' => 'rw', + 'default' => 0, + 'required' => 0, +); + + +has '_all_lines' => ( + 'isa' => 'ArrayRef', + 'is' => 'ro', + 'default' => sub { return []; } , + 'required' => 0, +); + + +sub _add_line { + my ($self, @columns) = @_; + push @{$self->_all_lines()}, @columns ? \@columns : []; + return; +} sub _add_samples { my ($self, @samples) = @_; @@ -742,6 +877,7 @@ sub _add_samples { return; } + sub _can_add_sample { my ($self, $distinct_configs, $config) = @_; @@ -762,104 +898,6 @@ sub _can_add_sample { return 1; } -=head2 add_germline_section - -Conditionally adds the DragenGermline_Settings and DragenGermline_Data -sections. - -=cut - -sub add_germline_section { - my $self = shift; - - if (none { $self->varcall eq $_ } @VAR_CALL_MODES) { - croak 'Uknown mode for variang calling - ' . $self->varcall; - } - - my @to_align = (); - my @ref_matches = keys %REFERENCE_MAPING; - my $distinct_configs = {}; - - foreach my $p ( $self->products() ) { - - my $r = $p->[$LIST_INDEX_REF]; - my $lib_type = $p->[$LIST_INDEX_LIBTYPE]; - - if ( can_do_alignment($r) && !(do_ref_rna_alignment_test($r) || - do_libtype_rna_alignment_test($lib_type) || - do_libtype_tenx_test($lib_type)) ) { - - my $match = first { $r =~ /$_/xms} @ref_matches; - if ($self->_can_add_sample($distinct_configs, $match)) { - # TODO Are all variant calling modes compatible with all - # references? - push @to_align, [$p->[1], $REFERENCE_MAPING{$match}, $self->varcall]; - } - } - } - - if (@to_align) { - - $self->_add_line('[DragenGermline_Settings]'); - $self->_add_line(q[SoftwareVersion], $self->dragen_software_version); - $self->_add_line(qw(MapAlignOutFormat cram)); - # Accepted values are true or false. Not clear whether this can be - # set per sample. - $self->_add_line(q(KeepFastq), $self->keep_fastq ? 'TRUE' : 'FALSE'); - $self->_add_line(); - - $self->_add_line(qw([DragenGermline_Data])); - $self->_add_line(qw(Sample_ID ReferenceGenomeDir VariantCallingMode)); - $self->_add_samples(@to_align); - } - - return scalar @to_align; -} - -=head2 add_rna_section - -Conditionally adds the DragenRNA_Settings and DragenRNA_Data sections. - -=cut - -sub add_rna_section { - my $self = shift; - - my @to_align = (); - my @ref_matches = keys %REFERENCE_MAPING; - my $distinct_configs = {}; - - foreach my $p ( $self->products() ) { - - my $r = $p->[$LIST_INDEX_REF]; - my $lib_type = $p->[$LIST_INDEX_LIBTYPE]; - - if ( can_do_alignment($r) && - (do_ref_rna_alignment_test($r) || do_libtype_rna_alignment_test($lib_type)) ) { - - my $match = first { $r =~ /$_/xms} @ref_matches; - if ($self->_can_add_sample($distinct_configs, $match)) { - push @to_align, [$p->[1], $REFERENCE_MAPING{$match}]; - } - } - } - - if (@to_align) { - $self->_add_line('[DragenRNA_Settings]'); - $self->_add_line(q(SoftwareVersion), $self->dragen_software_version); - $self->_add_line(qw(MapAlignOutFormat cram)); - $self->_add_line(q(KeepFastq), $self->keep_fastq ? 'TRUE' : 'FALSE'); - $self->_add_line(qw(RnaPipelineMode FullPipeline)); - $self->_add_line(); - - $self->_add_line('[DragenRNA_Data]'); - $self->_add_line(qw(Sample_ID ReferenceGenomeDir)); - $self->_add_samples(@to_align); - } - - return scalar @to_align; -} - __PACKAGE__->meta->make_immutable; 1; @@ -892,8 +930,6 @@ __END__ =item List::Util -=item Getopt::Long - =item Pod::Usage =item DateTime diff --git a/t/47-npg_samplesheet_novaseq_xseries.t b/t/47-npg_samplesheet_novaseq_xseries.t index ddc3ec98..b1229d6d 100644 --- a/t/47-npg_samplesheet_novaseq_xseries.t +++ b/t/47-npg_samplesheet_novaseq_xseries.t @@ -118,7 +118,7 @@ subtest 'create the generator object, test simple attributes' => sub { }; subtest 'generate a samplesheet' => sub { - plan tests => 6; + plan tests => 8; my $file_name = '47995_NVX1_A_ssbatch98292.csv'; my $compare_file_root = @@ -131,6 +131,9 @@ subtest 'generate a samplesheet' => sub { file_name => $file_name ); + is_deeply ($g->index_read_length(), [8,8], 'correct lengths of index reads'); + is_deeply ($g->read_length(), [151,151], 'correct lengths of reads'); + # The code creates a new samplesheet in the working directory. # This will be changed in future. $g->process();