From 3540afed4a5bdc70c56043b361ada4192376d74e Mon Sep 17 00:00:00 2001 From: Marina Gourtovaia Date: Tue, 14 Nov 2023 16:02:34 +0000 Subject: [PATCH] Added tests for merging by library. Also refactored method itself. --- lib/st/api/lims.pm | 145 ++++++++++++++++++--------- t/40-st-lims-merge.t | 230 +++++++++++++++++++++++++++++-------------- 2 files changed, 253 insertions(+), 122 deletions(-) diff --git a/lib/st/api/lims.pm b/lib/st/api/lims.pm index 1b3fa7bd..f8c913b6 100644 --- a/lib/st/api/lims.pm +++ b/lib/st/api/lims.pm @@ -912,84 +912,135 @@ sub aggregate_xlanes { =head2 aggregate_libraries +Given a list of lane-level st::api::lims objects, finds their children, which +can be merged and analysed together across all or some of the lanes. + +All argument lane objects should belong to the same run. It is assumed that +they all use the same st::api::lims driver type. + +Returns two lists of objects, one for merged entities and one for singletons, +which are wrapped into a dictionary. Either of these lists can be empty. Both +of the lists are guaranteed not to be empty at the same time. + +Tag zero objects are neither added nor explicitly removed. Objects for +spiked-in controls (if present) are added to the list of singletons. + +The lists of singletons and merges are sorted. For a given set of input lane +st::api::lims objects the same lists are always returned. + +Criteria for entities to be eligible for a merge: +* they are not controls, +* they belong to the same library, +* they share the same tag index, therefore belong to different lanes, +* they belong to the same study. + +This method can be used both as instance and as a class method. + + my $all_lims = st::api::lims->aggregate_libraries($run_lims->children()); + for my $l (@{$all_lims->{'singles'}}) { + print 'No merge for ' . $l->to_string; + } + for my $l (@{$all_lims->{'merges'}}) { + print 'Merged entity ' . $l->to_string; + } + =cut + sub aggregate_libraries() { my ($self, $lane_lims_array) = @_; - my @id_runs = uniq map { $_->id_run } @{$lane_lims_array}; - if (@id_runs != 1) { - croak 'Multiple run IDs in a potential library merge'; - } + # This restriction might be lifted in future. + _check_value_is_unique('id_run', 'run IDs', $lane_lims_array); - my $all_lims_objects = {}; my $lims_objects_by_library = {}; - + my @singles = (); my @all_single_lims_objs = map { $_->is_pool ? $_->children() : $_ } @{$lane_lims_array}; - foreach my $obj (@all_single_lims_objs) { - if ($obj->is_control()) { # Do not merge spiked PhiX libraries. - push @{$all_lims_objects->{'single'}}, $obj; + if ($obj->is_control()) { + push @singles, $obj; } else { push @{$lims_objects_by_library->{$obj->library_id}}, $obj; } } - my $merge_set = {}; - - # Do not use $self for this to retain ability to use this method as a class - # method. + # Get the common st::api::lims driver arguments, which will be used + # to create objects for merged entities. + # Do not use $self for copying the driver arguments in order to retain + # ability to use this method as a class method. my %init = %{$lane_lims_array->[0]->_driver_arguments()}; + delete $init{position}; + delete $init{id_run}; + my $merges = {}; + my $lane_set_delim = q[,]; foreach my $library_id (keys %{$lims_objects_by_library}) { my @lib_lims = @{$lims_objects_by_library->{$library_id}}; if (@lib_lims == 1) { - push @{$all_lims_objects->{'single'}}, $lib_lims[0]; + push @singles, @lib_lims; } else { - my @study_ids = uniq map { $_->study_id } @lib_lims; - if (@study_ids != 1) { - croak 'Multiple studies in a potential merge'; - } - my @tag_indexes = - uniq - map { defined $_->tag_index ? $_->tag_index : 'undefined' } - @lib_lims; - if (@tag_indexes != 1) { - croak 'Inconsistent tag indexes in a potential merge'; - } - my @lanes = uniq map {$_->position} @lib_lims; - if (@lanes != @lib_lims) { - croak 'Intra-lane merge is detected in a potential merge'; - } - my $lane_set = join q[,], @lanes; - $all_lims_objects->{'merges'}->{$lane_set}->{$tag_indexes[0]} = - __PACKAGE__->new( - %init, - rpt_list => npg_tracking::glossary::rpt->deflate_rpts(\@lib_lims) - ); + _check_merge_correctness(\@lib_lims); + my $lane_set = join $lane_set_delim, + sort { $a <=> $b } map { $_->position } @lib_lims; + my $tag_index = $lib_lims[0]->tag_index; + $merges->{$lane_set}->{$tag_index} = __PACKAGE__->new( + %init, + rpt_list => npg_tracking::glossary::rpt->deflate_rpts(\@lib_lims) + ); } } - # Lane sets should not intersect. - my @positions = - map { (split /,/smx, $_) } - keys %{$all_lims_objects->{'merges'}}; - if (@positions != uniq @positions) { + # Lane sets should not intersect. Error if a lane belongs to multiple sets. + my @all_lanes_in_merged_sets = map { (split /$lane_set_delim/smx, $_) } + keys %{$merges}; + if (@all_lanes_in_merged_sets != uniq @all_lanes_in_merged_sets) { croak 'No clean split between lanes in potential merges'; } - # Within each set sort LIMS objects by tag index and keep - # the sorted list. - foreach my $lane_set ( keys %{$all_lims_objects->{'merges'}} ) { - my @sorted_lims_onjects = - map {$all_lims_objects->{'merges'}->{$lane_set}->{$_}} - (sort { $a <=> $b } keys %{$all_lims_objects->{'merges'}->{$lane_set}}); - $all_lims_objects->{'merges'}->{$lane_set} = \@sorted_lims_onjects; + my $all_lims_objects = {}; + # Arrange in a predictable consistent orger. + foreach my $lane_set ( sort keys %{$merges} ) { + my @tag_indexes = sort { $a <=> $b } keys %{$merges->{$lane_set}}; + push @{$all_lims_objects->{'merges'}}, + (map { $merges->{$lane_set}->{$_} } @tag_indexes); + } + $all_lims_objects->{'singles'} = [ + sort { $a->position <=> $b->position || $a->tag_index <=> $b->tag_index} + @singles + ]; + + # Sanity check. + my $num_objects = @{$all_lims_objects->{'merges'}} + + @{$all_lims_objects->{'singles'}}; + if ($num_objects == 0) { + croak 'Invalid aggregation by library, no objects returned'; } return $all_lims_objects; } +sub _check_merge_correctness{ + my $lib_lims = shift; + _check_value_is_unique('study_id', 'studies', $lib_lims); + _check_value_is_unique('tag_index', 'tag indexes', $lib_lims); + my @lanes = uniq map {$_->position} @{$lib_lims}; + if (@lanes != @{$lib_lims}) { + croak 'Intra-lane merge is detected in a potential merge'; + } + return; +} + +sub _check_value_is_unique { + my ($method_name, $property_name, $objects) = @_; + my @values = uniq + map { defined $_->$method_name ? $_->$method_name : 'undefined' } + @{$objects}; + if (@values != 1) { + croak "Multiple $property_name in a potential library merge"; + } + return; +} + =head2 create_tag_zero_object Using id_run and position values of this object, creates and returns diff --git a/t/40-st-lims-merge.t b/t/40-st-lims-merge.t index 01bc15e3..965c446d 100644 --- a/t/40-st-lims-merge.t +++ b/t/40-st-lims-merge.t @@ -1,12 +1,13 @@ use strict; use warnings; -use Test::More tests => 4; +use Test::More tests => 5; use Test::Exception; +use List::MoreUtils qw/all none/; use_ok('st::api::lims'); subtest 'Aggregation across lanes for pools' => sub { - plan tests => 85; + plan tests => 82; local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = 't/data/test40_lims/samplesheet_novaseq4lanes.csv'; @@ -81,81 +82,11 @@ subtest 'Aggregation across lanes for pools' => sub { 'rpt list for tag 21 object'); is ($tag_first->rpt_list, '25846:1:1;25846:2:1;25846:3:1;25846:4:1', 'rpt list for tag 1 object'); + ok ((none {defined $_->id_run} ($tag_zero, $tag_spiked, $tag_first, $tag_last)), + "id_run not defined"); - my $expected = { - '25846:1:0;25846:2:0;25846:3:0;25846:4:0' => { - 'sample_id' => undef, - 'sample_name' => undef, - 'sample_common_name' => 'Homo sapiens', - 'study_id' => 5318, - 'study_name' => 'NovaSeq testing', - 'reference_genome' => 'Homo_sapiens (1000Genomes_hs37d5 + ensembl_75_transcriptome)', - 'library_id' => undef, - 'library_name' => undef, - 'library_type' => 'Standard', - 'default_tag_sequence' => undef, - 'study_alignments_in_bam' => 1, - 'study_contains_nonconsented_human' => 0 - }, - '25846:1:888;25846:2:888;25846:3:888;25846:4:888' => { - 'sample_id' => '1255141', - 'sample_name' => 'phiX_for_spiked_buffers', - 'sample_common_name' => undef, - 'study_id' => 198, - 'study_name' => 'Illumina Controls', - 'reference_genome' => undef, - 'library_id' => '17883061', - 'library_name' => '17883061', - 'library_type' => undef, - 'default_tag_sequence' => 'ACAACGCAATC', - 'study_alignments_in_bam' => 1, - 'study_contains_nonconsented_human' => 0 - }, - '25846:1:21;25846:2:21;25846:3:21;25846:4:21' => { - 'sample_id' => '3681772', - 'sample_name' => '5318STDY7462477', - 'sample_common_name' => 'Homo sapiens', - 'study_id' => 5318, - 'study_name' => 'NovaSeq testing', - 'reference_genome' => 'Homo_sapiens (1000Genomes_hs37d5 + ensembl_75_transcriptome)', - 'library_id' => '21059089', - 'library_name' => '21059089', - 'library_type' => 'Standard', - 'default_tag_sequence' => 'TCGAGCGT', - 'study_alignments_in_bam' => 1, - 'study_contains_nonconsented_human' => 0 - }, - '25846:1:1;25846:2:1;25846:3:1;25846:4:1' => { - 'sample_id' => '3681752', - 'sample_name' => '5318STDY7462457', - 'sample_common_name' => 'Homo sapiens', - 'study_id' => 5318, - 'study_name' => 'NovaSeq testing', - 'reference_genome' => 'Homo_sapiens (1000Genomes_hs37d5 + ensembl_75_transcriptome)', - 'library_id' => '21059039', - 'library_name' => '21059039', - 'library_type' => 'Standard', - 'default_tag_sequence' => 'ATCACGTT', - 'study_alignments_in_bam' => 1, - 'study_contains_nonconsented_human' => 0 - } - }; + _compare_properties([$tag_first, $tag_last, $tag_zero, $tag_spiked]); - for my $o (($tag_zero, $tag_spiked, $tag_first, $tag_last)) { - my $rpt_list = $o->rpt_list; - ok (!defined $o->id_run, "id_run not defined for $rpt_list"); - for my $method ( qw/ - sample_id sample_name sample_common_name - study_id study_name reference_genome - library_id library_name library_type - default_tag_sequence - /) { - is ($o->$method, $expected->{$rpt_list}->{$method}, "$method for $rpt_list"); - } - ok ($o->study_alignments_in_bam, "alignment true for $rpt_list"); - ok (!$o->study_contains_nonconsented_human, "nonconsented_human false for $rpt_list"); - } - ok ($tag_spiked->is_phix_spike, 'is phix spike'); ok (!$tag_first->is_phix_spike, 'is not phix spike'); ok (!$tag_zero->is_phix_spike, 'is not phix spike'); @@ -241,4 +172,153 @@ subtest 'Aggregation across lanes for a tag' => sub { } }; +subtest 'Aggregation by library' => sub { + plan tests => 68; + + local $ENV{NPG_CACHED_SAMPLESHEET_FILE} = + 't/data/test40_lims/samplesheet_novaseq4lanes.csv'; + # All lanes of this runs can be merged together. + # The pool contains a spiked-in control. + + my $id_run = 25846; + my @lane_lims = st::api::lims->new(id_run => $id_run)->children; + my $lims = st::api::lims->aggregate_libraries(\@lane_lims); + + for my $key_name (qw/singles merges/) { + ok (exists $lims->{$key_name}, "entry for $key_name exists"); + is (ref $lims->{$key_name}, 'ARRAY', 'the value is an array'); + } + is (keys %{$lims}, 2, 'no unexpected keys'); + is (@{$lims->{'singles'}}, 4, 'list of singles contains 4 objects'); + is (@{$lims->{'merges'}}, 21, 'list of merges contains 21 objects'); + + ok ( (all { $_->is_control } @{$lims->{'singles'}}), + 'all singles are spiked-in controls'); + ok ((none { defined $_->rpt_list } @{$lims->{'singles'}}), + 'rpt_list value is not defined for singles'); + ok ((all { $_->id_run == $id_run } @{$lims->{'singles'}}), + 'id_run is set correctly for singles'); + ok ((all { $_->tag_index == 888 } @{$lims->{'singles'}}), + 'tag_index is set correctly for singles'); + is ('1,2,3,4', join(q[,], map { $_->position } @{$lims->{'singles'}}), + 'objects are ordered in position acsending order'); + + my @rpt_lists = map { $_->rpt_list } @{$lims->{'merges'}}; + my @expected_rpt_lists = _generate_rpt_lists($id_run, [(1 .. 4)], [(1 .. 21)]); + is_deeply (\@rpt_lists, \@expected_rpt_lists, + 'merges list - correct object, correct sort order'); + + for my $method_name (qw/id_run position tag_index/) { + ok ((none { defined $_->$method_name } @{$lims->{'merges'}}), + "$method_name is not defined"); + } + ok ((all { $_->driver_type eq 'samplesheet' } @{$lims->{'merges'}}), + 'driver type is correct'); + + _compare_properties([$lims->{'merges'}->[0], $lims->{'merges'}->[20]]); + + $lims = st::api::lims->aggregate_libraries([$lane_lims[0], $lane_lims[2]]); + is (@{$lims->{'singles'}}, 2, 'list of singles contains 2 objects'); + is (@{$lims->{'merges'}}, 21, 'list of merges contains 21 objects'); + @rpt_lists = map { $_->rpt_list } @{$lims->{'merges'}}; + @expected_rpt_lists = _generate_rpt_lists($id_run, [1, 3], [(1 .. 21)]); + is_deeply (\@rpt_lists, \@expected_rpt_lists, + 'merges list - correct object, correct sort order'); + _compare_properties([$lims->{'merges'}->[0], $lims->{'merges'}->[20]]); +}; + +sub _generate_rpt_lists { + my ($id_run, $positions, $tag_indexes) = @_; + my @expected_rpt_lists = (); + foreach my $tag_index (@{$tag_indexes}) { + my @rpt_list = (); + for my $position (@{$positions}) { + push @rpt_list, join q[:], $id_run, $position, $tag_index; + } + push @expected_rpt_lists, join q[;], @rpt_list; + } + return @expected_rpt_lists; +} + +sub _compare_properties { + my $lims_objects = shift; + + my $expected_props = [ + { + 'sample_id' => '3681752', + 'sample_name' => '5318STDY7462457', + 'sample_common_name' => 'Homo sapiens', + 'study_id' => 5318, + 'study_name' => 'NovaSeq testing', + 'reference_genome' => 'Homo_sapiens (1000Genomes_hs37d5 + ensembl_75_transcriptome)', + 'library_id' => '21059039', + 'library_name' => '21059039', + 'library_type' => 'Standard', + 'default_tag_sequence' => 'ATCACGTT', + 'study_alignments_in_bam' => 1, + 'study_contains_nonconsented_human' => 0 + }, + { + 'sample_id' => '3681772', + 'sample_name' => '5318STDY7462477', + 'sample_common_name' => 'Homo sapiens', + 'study_id' => 5318, + 'study_name' => 'NovaSeq testing', + 'reference_genome' => 'Homo_sapiens (1000Genomes_hs37d5 + ensembl_75_transcriptome)', + 'library_id' => '21059089', + 'library_name' => '21059089', + 'library_type' => 'Standard', + 'default_tag_sequence' => 'TCGAGCGT', + 'study_alignments_in_bam' => 1, + 'study_contains_nonconsented_human' => 0 + }, + { + 'sample_id' => undef, + 'sample_name' => undef, + 'sample_common_name' => 'Homo sapiens', + 'study_id' => 5318, + 'study_name' => 'NovaSeq testing', + 'reference_genome' => 'Homo_sapiens (1000Genomes_hs37d5 + ensembl_75_transcriptome)', + 'library_id' => undef, + 'library_name' => undef, + 'library_type' => 'Standard', + 'default_tag_sequence' => undef, + 'study_alignments_in_bam' => 1, + 'study_contains_nonconsented_human' => 0 + }, + { + 'sample_id' => '1255141', + 'sample_name' => 'phiX_for_spiked_buffers', + 'sample_common_name' => undef, + 'study_id' => 198, + 'study_name' => 'Illumina Controls', + 'reference_genome' => undef, + 'library_id' => '17883061', + 'library_name' => '17883061', + 'library_type' => undef, + 'default_tag_sequence' => 'ACAACGCAATC', + 'study_alignments_in_bam' => 1, + 'study_contains_nonconsented_human' => 0 + } + ]; + + my $num_objects = @{$lims_objects}; + for my $i ((0 .. $num_objects-1)) { + my $o = $lims_objects->[$i]; + my $rpt_list = $o->rpt_list; + my $expected = $expected_props->[$i]; + for my $method ( qw/ + sample_id sample_name sample_common_name + study_id study_name reference_genome + library_id library_name library_type + default_tag_sequence study_alignments_in_bam + /) { + is ($o->$method, $expected->{$method}, "$method for $rpt_list"); + } + ok (!$o->study_contains_nonconsented_human, "nonconsented_human false for $rpt_list"); + } + + return; +} + 1;