Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate most of the code about the run folder in npg_tracking::illumina::run::folder role #812

Merged
merged 5 commits into from
Apr 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,7 @@ lib/npg_tracking/glossary/tag.pm
lib/npg_tracking/illumina/run.pm
lib/npg_tracking/illumina/runfolder.pm
lib/npg_tracking/illumina/run/folder.pm
lib/npg_tracking/illumina/run/folder/location.pm
lib/npg_tracking/illumina/run/long_info.pm
lib/npg_tracking/illumina/run/short_info.pm
lib/npg_tracking/Schema.pm
lib/npg_tracking/Schema/Result/Annotation.pm
lib/npg_tracking/Schema/Result/Designation.pm
Expand Down Expand Up @@ -410,7 +408,6 @@ t/50-decorator.t
t/60-illumina-runfolder.t
t/60-illumina-run-folder.t
t/60-illumina-run-long_info.t
t/60-illumina-run-short_info.t
t/60-illumina-run.t
t/60-util-mailer.t
t/70-bin-npg_status2file.t
Expand Down
247 changes: 174 additions & 73 deletions lib/npg_tracking/illumina/run/folder.pm
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
package npg_tracking::illumina::run::folder;

use Moose::Role;
use Moose::Meta::Class;
use File::Spec::Functions qw(splitdir catfile catdir);
use Moose::Util::TypeConstraints;
use File::Spec::Functions qw/splitdir catfile catdir/;
use Carp;
use Cwd qw/getcwd/;
use Try::Tiny;
use Readonly;
use Math::Random::Secure qw/irand/;
use List::Util qw/first/;

use npg_tracking::util::types;
use npg_tracking::util::abs_path qw/abs_path/;
use npg_tracking::Schema;
use npg_tracking::glossary::lane;
use npg_tracking::illumina::run::folder::location;
use npg_tracking::util::config qw/get_config_staging_areas/;

our $VERSION = '0';

with q{npg_tracking::illumina::run};
with 'npg_tracking::illumina::run';


# Top-level directory where instruments create runfolders
Readonly::Scalar my $INCOMING_DIR => q{/incoming/};
Expand All @@ -37,6 +40,14 @@ Readonly::Scalar my $PP_ARCHIVE_DIR => q{pp_archive};
Readonly::Scalar our $SUMMARY_LINK => q{Latest_Summary};
Readonly::Scalar my $QC_DIR => q{qc};

my $config=get_config_staging_areas();
# The prod. value of prefix is '/export/esa-sv-*' in Feb. 2024
# Example prod. run folder path
# /export/esa-sv-20240201-01/IL_seq_data/incoming/20240124_LH00210_0016_B225GWVLT3
Readonly::Scalar my $STAGING_AREAS_PREFIX => $config->{'prefix'} || q();
Readonly::Scalar my $FOLDER_PATH_PREFIX_GLOB_PATTERN =>
"$STAGING_AREAS_PREFIX/IL*/*/";

Readonly::Hash my %NPG_PATH => (
q{runfolder_path} => 'Path to and including the run folder',
q{dragen_analysis_path} => 'Path to the DRAGEN analysis directory',
Expand All @@ -50,10 +61,91 @@ Readonly::Hash my %NPG_PATH => (
q{qc_path} => 'Path directory with top level QC data',
);

has q{id_run} => (
isa => q{NpgTrackingRunId},
is => q{ro},
required => 0,
lazy_build => 1,
documentation => 'Integer identifier for a sequencing run',
);
sub _build_id_run {
my ($self) = @_;

my $id_run;

if ($self->npg_tracking_schema()) {
if (!$self->has_run_folder()) {
$self->run_folder(); # Force the build
}
my $rs = $self->npg_tracking_schema()->resultset('Run')
->search({folder_name => $self->run_folder()});
if ($rs->count == 1) {
$id_run = $rs->next()->id_run();
}
}

# When no id_run is set, attempt to parse an id_run from the experiment name
# recorded in the Illumina XML file.
# We embed additional information in NovaSeqX samplesheets which have no
# meaning here. See L<Samplesheet generator|npg::samplesheet::novaseq_xseries>
if ( !$id_run && $self->can('experiment_name') && $self->experiment_name() ) {
($id_run, undef) = $self->experiment_name() =~ m{
\A
[\s]*
([\d]+) # id_run
([\w\d\s]*) # instrument name or other embedded info
\Z
}xms;
}

if( !$id_run ) {
croak q[Unable to identify id_run with data provided];
}

return $id_run;
}


my $run_folder_subtype_name = __PACKAGE__.q(::folder);
subtype $run_folder_subtype_name
=> as 'Str'
=> where { splitdir($_)==1 };

has q{run_folder} => (
isa => $run_folder_subtype_name,
is => q{ro},
lazy_build => 1,
documentation => 'Directory name of the run folder',
);
sub _build_run_folder {
my ($self) = @_;
($self->subpath or $self->has_id_run)
or croak 'Need a path or id_run to work out a run_folder';
return first {$_ ne q()} reverse File::Spec->splitdir($self->runfolder_path);
}


has q{npg_tracking_schema} => (
isa => q{Maybe[npg_tracking::Schema]},
is => q{ro},
lazy_build => 1,
);
sub _build_npg_tracking_schema {
my $schema;
try {
$schema = npg_tracking::Schema->connect();
} catch {
carp qq{Unable to connect to NPG tracking DB for faster globs.\n};
};
return $schema;
}


foreach my $path_attr ( keys %NPG_PATH ) {
has $path_attr => (
isa => q{Str},
is => q{ro},
predicate => 'has_' . $path_attr,
lazy_build => 1,
documentation => $NPG_PATH{$path_attr},
);
Expand All @@ -80,41 +172,36 @@ sub set_bam_basecall_path {
return $self->bam_basecall_path;
}

has q{npg_tracking_schema} => (
isa => q{Maybe[npg_tracking::Schema]},
is => q{ro},
lazy_build => 1,
);
sub _build_npg_tracking_schema {
my $schema;
try {
$schema = npg_tracking::Schema->connect();
} catch {
warn qq{WARNING: Unable to connect to NPG tracking DB for faster globs.\n};
};
return $schema;
}

sub _build_runfolder_path {
my ($self) = @_;

my $path = $self->_get_path_from_given_path();
$path && return $path;

my $db_runfolder_name;
my $runfolder_name;
if ( $self->can('run_folder') and $self->has_run_folder ) {
$runfolder_name = $self->run_folder;
my $path;
my $runfolder_name = $self->has_run_folder ? $self->run_folder : undef;

# Try to use one of paths (if any) supplied via a constructor to figure out
# the location of the run folder directory. This method examines the
# directory structure looking for subdirectories, which normally exist in
# the Illumina run folder.
if ($self->subpath()) {
$path = _get_path_from_given_path($self->subpath());
}

if ( $self->npg_tracking_schema() and
$self->can(q(id_run)) and $self->id_run() ) {
# Try to get the run folder name and glob from the database and then glob
# for the run folder directory. Limit this search to run folders that
# are known to be on staging.
if ((not $path) and $self->npg_tracking_schema()) {
# The code below needs run ID, so 'id_run' will be built if not given.
if (not $self->tracking_run->is_tag_set(q(staging))) {
croak sprintf 'NPG tracking reports run %i no longer on staging',
$self->id_run;
}
$db_runfolder_name = $self->tracking_run->folder_name;
my $db_runfolder_name = $self->tracking_run->folder_name;
if ($db_runfolder_name) {
if ($runfolder_name and ($db_runfolder_name ne $runfolder_name)) {
# Probably this is an error. Warn for now.
carp sprintf 'Inconsistent db and given run folder name: %s, %s',
$db_runfolder_name, $runfolder_name;
}
if (my $gpath = $self->tracking_run->folder_path_glob) {
$path = $self->_get_path_from_glob_pattern(
catfile($gpath, $db_runfolder_name)
Expand All @@ -123,21 +210,23 @@ sub _build_runfolder_path {
}
}

if ( (not $path) and $runfolder_name ) {
# Try to use the runfolder name, if set via the constructor, and the
# staging area prefix from the 'npg_tracking' configuration file to
# glob the file system. This is the most expensive file system glob,
# so doing this as the last resort.
if ((not $path) and $runfolder_name) {
$path = $self->_get_path_from_glob_pattern(
$self->_folder_path_glob_pattern() . $runfolder_name
);
}

if ( $db_runfolder_name and $runfolder_name and
($db_runfolder_name ne $runfolder_name) ) {
carp sprintf 'Inconsistent db and given run folder name: %s, %s',
$db_runfolder_name, $runfolder_name;
}
# Most likely, the code execution will not advance this far without $path
# being computed. In case of problems an error will be raised by one of
# the methods called above. Returning an undefined path will trigger an
# error since the 'runfolder_path' attribute is defined as a string.
# Raising an error here to help with deciphering error messages.

if (not $path) {
croak 'Failed to infer runfolder_path';
}
$path or croak 'Failed to infer runfolder_path';

return $path;
}
Expand Down Expand Up @@ -244,20 +333,19 @@ has q{subpath} => (
);
sub _build_subpath {
my $self = shift;
my $path;

foreach my $path_method ( qw/ recalibrated_path
basecall_path
intensity_path
archive_path
runfolder_path / ) {
my $has_path_method = q{has_} . $path_method;
if ($self->$has_path_method()) {
$path = $self->$path_method();
last;
return $self->$path_method();
}
}

return $path;
return;
}

#############
Expand All @@ -266,13 +354,8 @@ sub _build_subpath {
has q{_folder_path_glob_pattern} => (
isa => q{Str},
is => q{ro},
lazy_build => 1,
default => $FOLDER_PATH_PREFIX_GLOB_PATTERN,
);
sub _build__folder_path_glob_pattern {
my $test_dir = $ENV{TEST_DIR} || q{};
return $test_dir .
$npg_tracking::illumina::run::folder::location::FOLDER_PATH_PREFIX_GLOB_PATTERN;
}

sub _infer_analysis_path {
my ($path, $distance) = @_;
Expand Down Expand Up @@ -316,25 +399,19 @@ sub _get_path_from_glob_pattern {
}

sub _get_path_from_given_path {
my ($self) = @_;
my ($subpath) = @_;

$self->subpath or return;

my @subpath = splitdir( $self->subpath );
while (@subpath) {
my $path = catdir(@subpath);
if ( -d $path # path of all remaining parts of _given_path (subpath)
and
-d catdir($path, $CONFIG_DIR) # does this directory have a Config Directory
and
-d catdir($path, $DATA_DIR) # a runfolder is likely to have a Data directory
) {
return $path;
my @dirs = splitdir($subpath);
while (@dirs) {
my $path = catdir(@dirs);
# a runfolder has to have a Data directory
if (-d $path and -d catdir($path, $DATA_DIR)) {
return $path;
}
pop @subpath;
pop @dirs;
}

croak q{Nothing looks like a run_folder in any given subpath};
croak qq{Nothing looks like a run folder in any subpath of $subpath};
}

sub _get_analysis_path_from_glob {
Expand Down Expand Up @@ -389,20 +466,42 @@ recalibrated directory, which will be used to construct other paths from.

=head1 SUBROUTINES/METHODS

=head2 runfolder_path
=head2 id_run

=head2 bam_basecall
An attribute, NPG run identifier. If the value is not supplied, an attempt
to build it is made.

=head2 set_bam_basecall_path
If access to a run tracking database is available and the database contains
the run record and the run folder name is defined in the database record and
the run_folder attribute is defined or can be built, then its value is used
to retrieve the id_run value from the database.

If 'experiment_name' accessor is provided by the class that inherits from
this role, then, in the absence of a database record, an attempt is made to parse
out run ID from the value returned by the 'experiment_name' accessor. See
npg_tracking::illumina::run::long_info for the implementation of this accessor.

=head2 run_folder

Sets and returns bam_basecall_path. Error if this attribute has
already been set.
An attribute, run folder name, can be set in the constructor or lazy-built.

$obj->set_bam_basecall_path();
print $obj->bam_basecall_path(); # BAM_basecalls_SOME-RANDOM-NUMBER
=head2 npg_tracking_schema

$obj->set_bam_basecall_path(20190122);
print $obj->bam_basecall_path(); # BAM_basecalls_20190122
npg_tracking::Schema db handle object, which is allowed to be assigned an
undefined value. An attempt to build this attribute is made. In case of a
failure an undefined value is assigned.

=head2 runfolder_path

=head2 bam_basecall_path

=head2 set_bam_basecall_path

Sets and returns bam_basecall_path. Error if this attribute has
already been set.

$obj->set_bam_basecall_path();
print $obj->bam_basecall_path(); # BAM_basecalls_SOME-RANDOM-NUMBER

=head2 analysis_path

Expand Down Expand Up @@ -442,7 +541,7 @@ Might be undefined.

=item Moose::Role

=item Moose::Meta::Class
=item Moose::Util::TypeConstraints

=item Carp

Expand All @@ -456,6 +555,8 @@ Might be undefined.

=item Math::Random::Secure

=item List::Util

=back

=head1 INCOMPATIBILITIES
Expand Down
Loading
Loading