Skip to content

Commit

Permalink
Merge pull request #844 from mgcam/utils4refs
Browse files Browse the repository at this point in the history
Created a stand alone ref. genome name parser.
  • Loading branch information
jmtcsngr authored Sep 4, 2024
2 parents 8ca8508 + a586196 commit 570f068
Show file tree
Hide file tree
Showing 7 changed files with 194 additions and 39 deletions.
11 changes: 11 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
LIST OF CHANGES

- Moved the parser for the LIMS reference genome notation from
npg_tracking::data::reference::find to a new package
npg_tracking::data::reference::util, thus providing a stand-alone
implementation of the parser.

Removed dependency of st::api::lims on npg_tracking::data::reference::find
since the latter brings in the dependency on npg_tracking::data::reference::list,
which requires that the reference repository root is defined and exists.
Instead used the reference genome notation parser from
npg_tracking::data::reference::util.

release 101.4.0 (2024-08-30)
- Added species_from_reference_genome method to st::api::lims. If the
reference_genome value is defined, this new method returns the name of the
Expand Down
2 changes: 2 additions & 0 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ lib/npg_tracking/data/reference.pm
lib/npg_tracking/data/reference/find.pm
lib/npg_tracking/data/reference/info.pm
lib/npg_tracking/data/reference/list.pm
lib/npg_tracking/data/reference/util.pm
lib/npg_tracking/data/snv.pm
lib/npg_tracking/data/snv/find.pm
lib/npg_tracking/data/transcriptome.pm
Expand Down Expand Up @@ -344,6 +345,7 @@ t/10-primer_panel.t
t/10-reference-find.t
t/10-reference-info.t
t/10-reference-list.t
t/10-reference-util.t
t/10-reference.t
t/10-snv.t
t/10-transcriptome.t
Expand Down
21 changes: 2 additions & 19 deletions lib/npg_tracking/data/reference/find.pm
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use File::Spec::Functions qw(catfile);
use Readonly;

use npg_tracking::util::abs_path qw(abs_path);
use npg_tracking::data::reference::util qw(parse_reference_genome_name);
use npg_tracking::data::reference::info;
use npg_tracking::util::messages;
use npg_tracking::glossary::rpt;
Expand Down Expand Up @@ -386,25 +387,7 @@ sub parse_reference_genome {
my ($self, $reference_genome) = @_;
$reference_genome ||= $self->reference_genome;
if ($reference_genome) {
my ($organism, $strain, $tversion, $analysis, @array);
## allows for transcriptome version and analysis e.g. 'Homo_sapiens (1000Genomes_hs37d5 + ensembl_release_75) [star]'
$organism = '(?<organism>\S+)\s+';
$strain = '(?<strain>\S+)';
$tversion = '(?:\s+\+\s+(?<tversion>\S+))?';
$analysis = '(?:\s+[[](?<analysis>\S+)[]])?';
$reference_genome =~ qr{$organism [(] $strain $tversion [)] $analysis}smx;
$organism = $LAST_PAREN_MATCH{'organism'};
$strain = $LAST_PAREN_MATCH{'strain'};
$tversion = $LAST_PAREN_MATCH{'tversion'};
$analysis = $LAST_PAREN_MATCH{'analysis'};
if ($organism && $strain) {
if ($tversion || $analysis) {
@array = ($organism, $strain, $tversion, $analysis);
} else {
@array = ($organism, $strain);
}
return @array;
}
return parse_reference_genome_name($reference_genome);
}
return;
}
Expand Down
119 changes: 119 additions & 0 deletions lib/npg_tracking/data/reference/util.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package npg_tracking::data::reference::util;

use strict;
use warnings;
use English qw(-no_match_vars);
use Carp;
use Exporter qw(import);

our @EXPORT_OK = qw(parse_reference_genome_name);

our $VERSION = '0';

# Parses the LIMS notation for a reference genome, for example
#   'Homo_sapiens (1000Genomes_hs37d5)'
#   'Homo_sapiens (1000Genomes_hs37d5 + ensembl_release_75) [star]'
#
# Argument: the reference genome name, a non-empty string.
#
# Returns, in list context:
#   - (organism, strain) when neither optional component is present;
#   - (organism, strain, transcriptome version, analysis) when at least
#     one optional component is present, the absent one being undef;
#   - an empty list when the input does not conform to the pattern
#     (the bare return gives an undefined value in scalar context).
#
# Croaks if the argument is undefined, empty or otherwise false.
sub parse_reference_genome_name {
  my $reference_genome = shift;

  $reference_genome or croak 'Reference genome name is not defined';

  # Building blocks of the pattern. They are joined under the /x flag,
  # so the literal spaces between them in the qr{} below are ignored.
  my $organism_re = '(?<organism>\S+)\s+';
  my $strain_re   = '(?<strain>\S+)';
  my $tversion_re = '(?:\s+\+\s+(?<tversion>\S+))?';
  my $analysis_re = '(?:\s+[[](?<analysis>\S+)[]])?';

  # The named captures are read only after a successful match. A failed
  # match leaves %LAST_PAREN_MATCH untouched, in which case it may still
  # hold captures of an unrelated earlier match performed by the caller.
  if ($reference_genome =~
        qr{$organism_re [(] $strain_re $tversion_re [)] $analysis_re}smx) {

    my $organism = $LAST_PAREN_MATCH{'organism'};
    my $strain   = $LAST_PAREN_MATCH{'strain'};
    my $tversion = $LAST_PAREN_MATCH{'tversion'};
    my $analysis = $LAST_PAREN_MATCH{'analysis'};

    if ($organism && $strain) {
      my @components = ($organism, $strain);
      # Both optional slots are appended together so that the position
      # of each component in the returned list is fixed.
      if ($tversion || $analysis) {
        push @components, $tversion, $analysis;
      }
      return @components;
    }
  }

  # To be compatible with the existing code, it is essential to use
  # a bare return here: the caller gets an empty list in list context
  # and an undefined value in scalar context.
  return;
}

1;

__END__
=head1 NAME
npg_tracking::data::reference::util
=head1 SYNOPSIS
=head1 DESCRIPTION
A collection of simple utility functions in support of the reference finder.
=head1 SUBROUTINES/METHODS
=head2 parse_reference_genome_name
Parses the LIMS notation for a reference genome, returns a list containing
an organism, strain (genome version) and, optionally, a transcriptome
version and/or a word indicating the type of analysis to be run.
Returns an undefined value (an empty list in list context) if the input does
not conform to the expected pattern. Errors if the input string is undefined or empty.
use npg_tracking::data::reference::util qw(parse_reference_genome_name);
parse_reference_genome_name(q[]); # errors
parse_reference_genome_name(); # errors
my $name = 'Homo_sapiens (1000Genomes_hs37d5)';
my @a = parse_reference_genome_name($name);
print join q[, ], @a; # prints Homo_sapiens, 1000Genomes_hs37d5
# 'ensembl_release_75' defines the transcriptome to use
# 'star' defines an aligner to use
$name = 'Homo_sapiens (1000Genomes_hs37d5 + ensembl_release_75) [star]';
@a = parse_reference_genome_name($name);
print join q[, ], @a;
# prints Homo_sapiens, 1000Genomes_hs37d5, ensembl_release_75, star
$name = 'Homo_sapiens 1000Genomes_hs37d5';
@a = parse_reference_genome_name($name); # no error, @a is empty
=head1 DIAGNOSTICS
=head1 CONFIGURATION AND ENVIRONMENT
=head1 DEPENDENCIES
=head1 INCOMPATIBILITIES
=head1 BUGS AND LIMITATIONS
=head1 AUTHOR
Marina Gourtovaia
=head1 LICENSE AND COPYRIGHT
Copyright (C) 2024 Genome Research Ltd.
This file is part of NPG.
NPG is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
=cut
11 changes: 5 additions & 6 deletions lib/st/api/lims.pm
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use Class::Load qw/load_class/;
use npg_tracking::util::types;
use npg_tracking::glossary::rpt;
use npg_tracking::glossary::composition::factory::rpt_list;
use npg_tracking::data::reference::find;
use npg_tracking::data::reference::util qw/parse_reference_genome_name/;

our $VERSION = '0';

Expand Down Expand Up @@ -622,10 +622,9 @@ sub species_from_reference_genome {
my $self = shift;

if ($self->reference_genome) {
my @genome_as_array = npg_tracking::data::reference::find
->parse_reference_genome($self->reference_genome);
if (@genome_as_array) {
return $genome_as_array[0];
my @gname_components = parse_reference_genome_name($self->reference_genome);
if (@gname_components) {
return $gname_components[0];
}
}
return;
Expand Down Expand Up @@ -1340,7 +1339,7 @@ __END__
=item npg_tracking::glossary::composition::component::illumina
=item npg_tracking::data::reference::find
=item npg_tracking::data::reference::util
=back
Expand Down
15 changes: 1 addition & 14 deletions t/10-reference-find.t
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use strict;
use warnings;
use Test::More tests => 52;
use Test::More tests => 44;
use Test::Exception;
use File::Spec::Functions qw(catfile);
use Cwd qw(cwd);
Expand Down Expand Up @@ -204,20 +204,7 @@ use_ok('npg_tracking::data::reference::find');
}

{
no warnings 'uninitialized';
my $ruser = Moose::Meta::Class->create_anon_class(
roles => [qw/npg_tracking::data::reference::find/])
->new_object({ repository => $transcriptome_repos });
is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome)])),'Homo_sapiens|1000Genomes_hs37d5|ensembl_74_transcriptome|','transcriptome ref genome parsing ok with correct format');
is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome) [star]])),'Homo_sapiens|1000Genomes_hs37d5|ensembl_74_transcriptome|star','transcriptome ref genome parsing ok with aligner 1/2');
is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5) [star]])),q[Homo_sapiens|1000Genomes_hs37d5||star],'transcriptome ref genome parsing ok with aligner 2/2');
is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5)])),q[Homo_sapiens|1000Genomes_hs37d5],'transcriptome ref genome parsing ok without aligner');
is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5 + )])), q[],'transcriptome ref genome parsing ok - returns empty with missing transcriptome version');
is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5 ensembl_74_transcriptome)])),q[],'transcriptome ref genome parsing ok - returns empty with missing delimiter');
is(join(q[|],$ruser->parse_reference_genome(q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome])),q[],'transcriptome ref genome parsing ok - returns empty with missing bracket');
is(join(q[|],$ruser->parse_reference_genome(q(Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome) [star))),'Homo_sapiens|1000Genomes_hs37d5|ensembl_74_transcriptome|','transcriptome ref genome parsing ok - aligner ignored with missing square bracket');

$ruser = Moose::Meta::Class->create_anon_class(
roles => [qw/npg_tracking::data::reference::find/])
->new_object({ 'repository' => $transcriptome_repos, 'aligner' => 'fasta' });
my $lims = Test::MockObject->new();
Expand Down
54 changes: 54 additions & 0 deletions t/10-reference-util.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
use strict;
use warnings;
use Test::More tests => 14;
use Test::Exception;

# Tests for the stand-alone parser of the LIMS reference genome notation.

my @imports = qw/parse_reference_genome_name/;
use_ok('npg_tracking::data::reference::util', @imports);
can_ok('npg_tracking::data::reference::util', @imports);

# The expected error message is matched explicitly. An empty pattern
# does not pin the message and would not reliably distinguish a thrown
# error from no error at all.
throws_ok { parse_reference_genome_name() }
  qr/Reference genome name is not defined/,
  'error if the input is undefined';
throws_ok { parse_reference_genome_name(q[]) }
  qr/Reference genome name is not defined/,
  'error if the input is an empty string';

# Well-formed names.
my @ref = parse_reference_genome_name(q[Salmonella_enterica (Enteritidis_P125109)]);
is_deeply (\@ref, [qw(Salmonella_enterica Enteritidis_P125109)],
  'species and strain are returned');

@ref = parse_reference_genome_name(
  q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome)]
);
is_deeply (\@ref, ['Homo_sapiens', '1000Genomes_hs37d5',
                   'ensembl_74_transcriptome', undef],
  'transcriptome ref genome parsing ok with correct format');
@ref = parse_reference_genome_name(
  q{Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome) [star}
);
is_deeply (\@ref, ['Homo_sapiens', '1000Genomes_hs37d5',
                   'ensembl_74_transcriptome', undef],
  'aligner ignored due to a missing square bracket');

is (join(q[|], parse_reference_genome_name(
  q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome) [star]])
  ),'Homo_sapiens|1000Genomes_hs37d5|ensembl_74_transcriptome|star',
  'transcriptome ref genome parsing ok with aligner');
@ref = parse_reference_genome_name(q[Homo_sapiens (1000Genomes_hs37d5) [star]]);
is_deeply (\@ref, ['Homo_sapiens', '1000Genomes_hs37d5', undef, 'star'],
  'transcriptome ref genome parsing ok with aligner and no transcriptome');

# Names that do not conform to the pattern: no error, nothing is returned.
lives_ok {
  parse_reference_genome_name(q[Salmonella_enterica Enteritidis_P125109])
} 'incorrect string pattern does not cause an error';
is (parse_reference_genome_name(q[Salmonella_enterica Enteritidis_P125109]),
  undef, 'no brackets - wrong pattern, undefined value is returned');
is (parse_reference_genome_name(q[Homo_sapiens (1000Genomes_hs37d5 + )]), undef,
  'missing transcriptome version - wrong pattern');
is (parse_reference_genome_name(
  q[Homo_sapiens (1000Genomes_hs37d5 ensembl_74_transcriptome)]
  ), undef, 'missing transcriptome delimiter - wrong pattern');
is (parse_reference_genome_name(
  q[Homo_sapiens (1000Genomes_hs37d5 + ensembl_74_transcriptome]), undef,
  'missing bracket - wrong pattern');

1;

0 comments on commit 570f068

Please sign in to comment.