diff --git a/Changes b/Changes index b45ce77f..bdad7a33 100644 --- a/Changes +++ b/Changes @@ -1,5 +1,18 @@ LIST OF CHANGES +release 101.5.0 (2024-09-04) + - Moved the parser for the LIMS reference genome notation from + npg_tracking::data::reference::find to a new package + npg_tracking::data::reference::util, thus providing a stand-alone + implementation of the parser. + + Removed dependency of st::api::lims on npg_tracking::data::reference::find + since the latter brings in the dependency on npg_tracking::data::reference::list, + which requires that the reference repository root is defined and exists. + Instead used the reference genome notation parser from + npg_tracking::data::reference::util. + - fixes issues with tests for npg_seq_pipeline 68.5.0 and higher + release 101.4.0 (2024-08-30) - Added species_from_reference_genome method to st::api::lims. If the reference_genome value is defined, this new method returns the name of the diff --git a/MANIFEST b/MANIFEST index 6af3e082..9ad25f29 100644 --- a/MANIFEST +++ b/MANIFEST @@ -202,6 +202,7 @@ lib/npg_tracking/data/reference.pm lib/npg_tracking/data/reference/find.pm lib/npg_tracking/data/reference/info.pm lib/npg_tracking/data/reference/list.pm +lib/npg_tracking/data/reference/util.pm lib/npg_tracking/data/snv.pm lib/npg_tracking/data/snv/find.pm lib/npg_tracking/data/transcriptome.pm @@ -344,6 +345,7 @@ t/10-primer_panel.t t/10-reference-find.t t/10-reference-info.t t/10-reference-list.t +t/10-reference-util.t t/10-reference.t t/10-snv.t t/10-transcriptome.t diff --git a/lib/npg_tracking/data/reference/find.pm b/lib/npg_tracking/data/reference/find.pm index 2e96d48b..bb6f23a4 100644 --- a/lib/npg_tracking/data/reference/find.pm +++ b/lib/npg_tracking/data/reference/find.pm @@ -7,6 +7,7 @@ use File::Spec::Functions qw(catfile); use Readonly; use npg_tracking::util::abs_path qw(abs_path); +use npg_tracking::data::reference::util qw(parse_reference_genome_name); use npg_tracking::data::reference::info; use 
# Parses the LIMS notation for a reference genome name.
#
# Recognised forms:
#   'Organism (Strain)'
#   'Organism (Strain + Transcriptome_version)'
# either of which may be followed by ' [analysis]', for example
#   'Homo_sapiens (1000Genomes_hs37d5 + ensembl_release_75) [star]'
#
# Argument: the reference genome name, a string.
#
# Returns:
#   - (organism, strain) when neither optional part is present;
#   - (organism, strain, transcriptome_version, analysis) when at least one
#     optional part is present, the absent one being undef;
#   - nothing (bare return: undef in scalar context, an empty list in list
#     context) when the string does not fit the pattern.
#
# Croaks if the argument is false, i.e. undefined or an empty string
# (a literal '0' is rejected in the same way).
#
# The pattern is not anchored at either end, so trailing text that does not
# fit the notation (for instance an unterminated ' [star') is ignored rather
# than causing the whole name to be rejected.
sub parse_reference_genome_name {
  my $reference_genome = shift;

  $reference_genome or croak 'Reference genome name is not defined';

  # Named groups: organism, strain, optional transcriptome version (after
  # a plus sign inside the round brackets), optional analysis (inside
  # square brackets after the round brackets).
  my $matched = $reference_genome =~ m{
    (?<organism>\S+) \s+
    [(]
      (?<strain>\S+)
      (?: \s+ [+] \s+ (?<tversion>\S+) )?
    [)]
    (?: \s+ [[] (?<analysis>\S+) []] )?
  }smx;

  # Capture variables keep the values of the last successful match, which
  # might have happened in the caller. They must not be read after a
  # failed match.
  $matched or return;

  my $organism = $LAST_PAREN_MATCH{'organism'};
  my $strain   = $LAST_PAREN_MATCH{'strain'};
  my $tversion = $LAST_PAREN_MATCH{'tversion'};
  my $analysis = $LAST_PAREN_MATCH{'analysis'};

  if ($organism && $strain) {
    my @components = ($organism, $strain);
    # Both optional slots are returned together, so that the position of
    # each value in the list is fixed.
    if ($tversion || $analysis) {
      push @components, $tversion, $analysis;
    }
    return @components;
  }

  # Bare return on purpose: existing callers rely on getting undef in
  # scalar context and an empty list in list context.
  return;
}
+ +NPG is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. + +=cut diff --git a/lib/st/api/lims.pm b/lib/st/api/lims.pm index 281ff7ab..bfadd4c6 100644 --- a/lib/st/api/lims.pm +++ b/lib/st/api/lims.pm @@ -12,7 +12,7 @@ use Class::Load qw/load_class/; use npg_tracking::util::types; use npg_tracking::glossary::rpt; use npg_tracking::glossary::composition::factory::rpt_list; -use npg_tracking::data::reference::find; +use npg_tracking::data::reference::util qw/parse_reference_genome_name/; our $VERSION = '0'; @@ -622,10 +622,9 @@ sub species_from_reference_genome { my $self = shift; if ($self->reference_genome) { - my @genome_as_array = npg_tracking::data::reference::find - ->parse_reference_genome($self->reference_genome); - if (@genome_as_array) { - return $genome_as_array[0]; + my @gname_components = parse_reference_genome_name($self->reference_genome); + if (@gname_components) { + return $gname_components[0]; } } return; @@ -1340,7 +1339,7 @@ __END__ =item npg_tracking::glossary::composition::component::illumina -=item npg_tracking::data::reference::find +=item npg_tracking::data::reference::util =back diff --git a/t/10-reference-find.t b/t/10-reference-find.t index dd2bb964..cbabd066 100644 --- a/t/10-reference-find.t +++ b/t/10-reference-find.t @@ -1,6 +1,6 @@ use strict; use warnings; -use Test::More tests => 52; +use Test::More tests => 44; use Test::Exception; use File::Spec::Functions qw(catfile); use Cwd qw(cwd); @@ -204,20 +204,7 @@ 
use_ok('npg_tracking::data::reference::find'); } { - no warnings 'uninitialized'; my $ruser = Moose::Meta::Class->create_anon_class( - roles => [qw/npg_tracking::data::reference::find/]) - ->new_object({ repository => $transcriptome_repos }); - is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome)])),'Homo_sapiens|1000Genomes_hs37d5|ensembl_74_transcriptome|','transcriptome ref genome parsing ok with correct format'); - is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome) [star]])),'Homo_sapiens|1000Genomes_hs37d5|ensembl_74_transcriptome|star','transcriptome ref genome parsing ok with aligner 1/2'); - is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5) [star]])),q[Homo_sapiens|1000Genomes_hs37d5||star],'transcriptome ref genome parsing ok with aligner 2/2'); - is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5)])),q[Homo_sapiens|1000Genomes_hs37d5],'transcriptome ref genome parsing ok without aligner'); - is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5 + )])), q[],'transcriptome ref genome parsing ok - returns empty with missing transcriptome version'); - is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5 ensembl_74_transcriptome)])),q[],'transcriptome ref genome parsing ok - returns empty with missing delimiter'); - is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome])),q[],'transcriptome ref genome parsing ok - returns empty with missing bracket'); - is(join(q[|],$ruser->parse_reference_genome(q(Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome) [star))),'Homo_sapiens|1000Genomes_hs37d5|ensembl_74_transcriptome|','transcriptome ref genome parsing ok - aligner ignored with missing square bracket'); - - $ruser = Moose::Meta::Class->create_anon_class( roles => 
# Tests for parse_reference_genome_name, the stand-alone parser of the LIMS
# reference genome notation exported by npg_tracking::data::reference::util.
use strict;
use warnings;
use Test::More tests => 14;
use Test::Exception;

my @imports = qw/parse_reference_genome_name/;
use_ok('npg_tracking::data::reference::util', @imports);
can_ok('npg_tracking::data::reference::util', @imports);

# The expected message is spelled out. An empty pattern would accept any
# exception, including 'Undefined subroutine', which is what these calls
# raise if the import above has failed.
my $no_name_error = qr/Reference genome name is not defined/;
throws_ok { parse_reference_genome_name() }
  $no_name_error, 'error if the input is undefined';
throws_ok { parse_reference_genome_name(q[]) }
  $no_name_error, 'error if the input is an empty string';

# Well-formed names.
my @ref = parse_reference_genome_name(q[Salmonella_enterica (Enteritidis_P125109)]);
is_deeply (\@ref, [qw(Salmonella_enterica Enteritidis_P125109)],
  'species and strain are returned');

@ref = parse_reference_genome_name(
  q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome)]
);
is_deeply (\@ref, ['Homo_sapiens', '1000Genomes_hs37d5',
                   'ensembl_74_transcriptome', undef],
  'transcriptome ref genome parsing ok with correct format');
@ref = parse_reference_genome_name(
  q{Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome) [star}
);
is_deeply (\@ref, ['Homo_sapiens', '1000Genomes_hs37d5',
                   'ensembl_74_transcriptome', undef],
  'aligner ignored due to a missing square bracket');

is (join(q[|], parse_reference_genome_name(
  q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome) [star]])
  ),'Homo_sapiens|1000Genomes_hs37d5|ensembl_74_transcriptome|star',
  'transcriptome ref genome parsing ok with aligner');
@ref = parse_reference_genome_name(q[Homo_sapiens (1000Genomes_hs37d5) [star]]);
is_deeply (\@ref, ['Homo_sapiens', '1000Genomes_hs37d5', undef, 'star'],
  'transcriptome ref genome parsing ok with aligner and no transcriptome');

# Names that do not fit the pattern: no error, nothing is returned.
# is() puts its arguments in scalar context, so the parser's bare return
# is seen here as undef.
lives_ok {
  parse_reference_genome_name(q[Salmonella_enterica Enteritidis_P125109])
} 'incorrect string pattern does not cause an error';
is (parse_reference_genome_name(q[Salmonella_enterica Enteritidis_P125109]),
  undef, 'no brackets - wrong pattern, undefined value is returned');
is (parse_reference_genome_name(q[Homo_sapiens (1000Genomes_hs37d5 + )]), undef,
  'missing transcriptome version - wrong pattern');
is (parse_reference_genome_name(
  q[Homo_sapiens (1000Genomes_hs37d5 ensembl_74_transcriptome)]
  ), undef, 'missing transcriptome delimiter - wrong pattern');
is (parse_reference_genome_name(
  q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome]), undef,
  'missing bracket - wrong pattern');

1;