Skip to content

Commit

Permalink
Merge pull request #458 from wtsi-npg/devel
Browse files Browse the repository at this point in the history
Merge devel to master for release 2.49.0
  • Loading branch information
jmtcsngr authored Apr 22, 2024
2 parents 238e5f4 + 4f8add1 commit 14575aa
Show file tree
Hide file tree
Showing 105 changed files with 4,615 additions and 1,321 deletions.
8 changes: 8 additions & 0 deletions Changes
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
Unreleased

Release 2.49.0
- PacBio iRODS data - set QC state metadata when it is safe to do so.
- Allow non deplexed Revio cells to publish now
- Analysis loading changes post SMRT Link v13
Remove very old test data and update some tests. Add new test data and minor
code change to support loading of data from deplexing jobs in SMRT Link v13+
(very rare as only done when deplexing on instrument is incorrect).

Release 2.48.0
- Add sequencing_control.subreads.bam and index file to PacBio run
archiving. These files can be used to regenerate QC metrics.
Expand Down
127 changes: 61 additions & 66 deletions MANIFEST

Large diffs are not rendered by default.

9 changes: 1 addition & 8 deletions bin/npg_pacbio_analysis_monitor.pl
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,11 @@

Readonly::Scalar my $DEFAULT_INTERVAL_DAYS => 14;
Readonly::Scalar my $DEFAULT_OLDER_THAN_DAYS => 0;
Readonly::Scalar my $DEFAULT_SUBDIR => 0;

my $api_uri;
my $collection;
my $debug;
my $interval = $DEFAULT_INTERVAL_DAYS;
my $is_sub = $DEFAULT_SUBDIR;
my $log4perl_config;
my $older_than = $DEFAULT_OLDER_THAN_DAYS;
my $pipeline_name;
Expand All @@ -38,7 +36,6 @@
pod2usage(-verbose => 2, -exitval => 0);
},
'interval=i' => \$interval,
'is_sub' => \$is_sub,
'logconf=s' => \$log4perl_config,
'older-than|older_than=i' => \$older_than,
'pipeline-name|pipeline_name=s' => \$pipeline_name,
Expand All @@ -62,7 +59,6 @@

my @init_args = (interval => $interval,
irods => $irods,
is_sub => $is_sub,
mlwh_schema => $wh_schema,
older_than => $older_than,
);
Expand Down Expand Up @@ -110,7 +106,7 @@ =head1 NAME
=head1 SYNOPSIS
npg_pacbio_analysis_monitor
[--collection <path>] [--debug] [--interval days] [--is_sub]
[--collection <path>] [--debug] [--interval days]
[--logconf <path>] [--older-than days] [--pipeline_name <name>]
[--task_name <name>] [--api-uri] [--verbose]
Expand All @@ -120,9 +116,6 @@ =head1 SYNOPSIS
--debug Enable debug level logging. Optional, defaults to
false.
--help Display help.
--is_sub Data to be loaded is in sub-directories of the
main analysis output directory. Optional, defaults
to false.
--interval Interval of time in days for analysis loading.
Optional, defaults to 14.
Expand Down
45 changes: 45 additions & 0 deletions lib/WTSI/NPG/HTS/PacBio/Annotator.pm
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,51 @@ sub make_tag_metadata {
return $self->_make_multi_value_metadata(\@run_records, $method_attr);
}

=head2 make_qc_metadata
Arg [n] PacBio run database records,
List[WTSI::DNAP::Warehouse::Schema::Result::PacBioRun].
+
Example : my @avus = $ann->make_qc_metadata(@run_records);
Description: Return QC outcome AVU metadata for a single product.
An empty list is returned if the input list contains
either no records or multiple records or the only record
is not linked to a record in the pac_bio_product_metrics
table.
This method should be called in the context of a single
iRODS object. If, according to a record in the pac_bio_run
table, a well contains multiple samples, but in practice
no deplexing was done, when trying to establish data
provenance we might get multiple pac_bio_run table rows.
Opting out of assigning a QC outcome in this case is
a conscious conservative decision that was made at the
time of writing (March 2024).
Returntype : List[HashRef]
=cut

sub make_qc_metadata {
  my ($self, @run_records) = @_;

  # Collects at most one QC state AVU. The array itself is returned on
  # every path so that callers see the same value in either context.
  my @qc_avus = ();

  # A QC outcome is assigned only when exactly one run record is given.
  return @qc_avus if @run_records != 1;

  my ($run_record) = @run_records;
  my @linked_metrics = $run_record->pac_bio_product_metrics()->all();

  # Having no linked product record is not unheard of, exactly one linked
  # record is the normal case, several linked records most likely indicate
  # an error. Only the normal case yields metadata.
  return @qc_avus if @linked_metrics != 1;

  # An undefined outcome means there is nothing to report.
  my $outcome = $linked_metrics[0]->qc();
  if (defined $outcome) {
    push @qc_avus, $self->make_avu($QC_STATE, $outcome);
  }

  return @qc_avus;
}

sub _make_multi_value_metadata {
my ($self, $objs, $method_attr) = @_;
# The method_attr argument is a map of method name to attribute name
Expand Down
3 changes: 2 additions & 1 deletion lib/WTSI/NPG/HTS/PacBio/MetaUpdater.pm
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ sub update_secondary_metadata {
try {
my @run_records = $self->find_pacbio_runs(
$id_run, $well, $tag_id, $plate_number);
my @secondary_avus = $self->make_secondary_metadata(@run_records);
my @secondary_avus = map { $self->$_(@run_records) }
qw/make_secondary_metadata make_qc_metadata/;
$obj->update_secondary_metadata(@secondary_avus);

$self->info("Updated metadata on '$path' ",
Expand Down
14 changes: 2 additions & 12 deletions lib/WTSI/NPG/HTS/PacBio/Sequel/AnalysisMonitor.pm
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@ has 'pipeline_name' =>
(isa => 'Str',
is => 'ro',
required => 1,
default => 'pbsmrtpipe.pipelines.sa3_ds_barcode2',
default => 'cromwell.workflows.pb_demux_ccs',
documentation => 'A specified pipeline name to identify relevant jobs');

has 'task_name' =>
(isa => 'Str',
is => 'ro',
required => 1,
default => 'barcoding.tasks.lima-0',
default => 'call-lima/execution',
documentation => 'A specified task name to identify relevant output directories');

has 'job_root' =>
Expand All @@ -41,13 +41,6 @@ has 'job_root' =>
default => 'cromwell-job',
documentation => 'Root directory for job processing output');

has 'is_sub' =>
(isa => 'Bool',
is => 'ro',
required => 0,
default => 0,
documentation => 'Data to be loaded is in the sub-directories of main output directory');


=head2 publish_analysed_cells
Expand Down Expand Up @@ -119,9 +112,6 @@ sub _publish_analysis_path {
if ($self->dest_collection) {
push @init_args, dest_collection => $self->dest_collection;
}
if ($self->is_sub == 1 ) {
push @init_args, is_oninstrument => 1;
}

my $publisher = WTSI::NPG::HTS::PacBio::Sequel::AnalysisPublisher->new(@init_args);

Expand Down
25 changes: 18 additions & 7 deletions lib/WTSI/NPG/HTS/PacBio/Sequel/AnalysisPublisher.pm
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ our $SMT_METADATA_SET = q{hifi_reads.consensusreadset};

# Location of source metadata file
our $ENTRY_DIR = 'entry-points';
our $OUTPUT_DIR = 'outputs';

# Generic moviename file prefix
our $MOVIENAME_PATTERN = 'm[0-9a-z]+_\d+_\d+';
Expand All @@ -40,8 +41,8 @@ our $MOVIENAME_PATTERN = 'm[0-9a-z]+_\d+_\d+';
our $WELL_DIRECTORY_PATTERN = '\d+_[A-Z]\d+$';

# Additional sequence filenames permitted for loading
our @FNAME_PERMITTED = qw[fail_reads removed ccs hifi_reads fl_transcripts sequencing_control.subreads];
our @FNAME_NON_DEPLEXED = qw[unassigned removed sequencing_control.subreads];
our @FNAME_PERMITTED = qw[fail_reads removed ccs hifi_reads fl_transcripts sequencing_control.subreads unbarcoded];
our @FNAME_NON_DEPLEXED = qw[unassigned removed sequencing_control.subreads unbarcoded];
our @FNAME_FAILED = qw[fail_reads];

# Data processing level
Expand Down Expand Up @@ -323,7 +324,8 @@ sub list_files {
}
}
} else {
@files = $self->list_directory($self->runfolder_path, filter => $type);
@files = $self->list_directory
($self->runfolder_path, filter => $type, recurse => 1);
}

return \@files;
Expand Down Expand Up @@ -369,25 +371,34 @@ has '_metadata' =>
sub _build_metadata{
my ($self) = @_;

my $entry_path = catdir($self->analysis_path, $ENTRY_DIR);
my $entry_path = catdir($self->analysis_path, $ENTRY_DIR);
my $output_path = catdir($self->analysis_path, $OUTPUT_DIR);

my @metafiles;
if ($self->is_oninstrument == 1 && ! -d $entry_path && $self->is_smtwelve == 1) {
if (-d $output_path && ! -d $entry_path) {
# Analysis is always cell based (there is no merged cell analysis), so every
# metafile should have the correct run name, well and plate number - just pick one.
my @files = $self->list_directory
($output_path, filter => $METADATA_SET .q[.]. $METADATA_FORMAT . q[$]);
push @metafiles, $files[0];
} elsif ($self->is_oninstrument == 1 && ! -d $entry_path && $self->is_smtwelve == 1) {
# Revio
@metafiles = $self->list_directory
($self->analysis_path,
filter => $self->movie_pattern .q[.]. $SMT_METADATA_SET .q[.]. $METADATA_FORMAT .q[$],
recurse => 1)
} elsif ($self->is_oninstrument == 1 && ! -d $entry_path) {
# Sequel IIe - as will never be upgraded from ICS v11
@metafiles = $self->list_directory
($self->analysis_path,
filter => $self->movie_pattern .q[.]. $METADATA_SET .q[.]. $METADATA_FORMAT .q[$])
} else {
} elsif (-d $entry_path) {
@metafiles = $self->list_directory
($entry_path, filter => $METADATA_FORMAT . q[$], recurse => 1);
}

if (@metafiles != 1) {
$self->logcroak('Expect one xml file in '. $self->analysis_path . ' (entry_dir)');
$self->logcroak('Expect one xml file in '. $self->analysis_path);
}
return WTSI::NPG::HTS::PacBio::Sequel::MetaXMLParser->new->parse_file
($metafiles[0], $METADATA_PREFIX);
Expand Down
27 changes: 17 additions & 10 deletions lib/WTSI/NPG/HTS/PacBio/Sequel/RunPublisher.pm
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,11 @@ our $SEQUENCE_INDEX_FORMAT = 'pbi';
our $SEQUENCE_PRODUCT = 'subreads';
our $SEQUENCE_AUXILIARY = 'scraps';

# CCS Sequence file types - Sequel IIe
# CCS Sequence file types
our $CCS_SEQUENCE_PRODUCT = 'reads';
our $HIFI_SEQUENCE_PRODUCT = 'hifi_reads';
our $HIFIUB_SEQUENCE_PRODUCT = 'unbarcoded.hifi_reads';

# CCS Sequence file types - Revio
our $REV_HIFIUB_SEQ_PRODUCT = 'hifi_reads.unassigned';

our $REV_HIFIUB_SEQ_PRODUCT = 'hifi_reads.unassigned';

## Processing types
our $OFFINSTRUMENT = 'OffInstrument';
Expand All @@ -43,6 +40,7 @@ our $ONINSTRUMENTHO = 'OnInstrumentHifiOnly';
our $ONINSTRUMENTDP = 'OnInstrumentDeplex';
our $ONINSTRUMENTSR = 'OnInstrumentPlusSubreads';
our $ONINST_REVIO1 = 'OnInstrumentRevioOne';
our $ONINST_REVIO2 = 'OnInstrumentRevioTwo';

# Generic file prefix
our $FILE_PREFIX_PATTERN = 'm[0-9a-z]+_\d+_\d+';
Expand Down Expand Up @@ -97,6 +95,11 @@ sub _build_movie_pattern {
.q{[.]}. $REV_HIFIUB_SEQ_PRODUCT .q{[.]}. $SEQUENCE_FILE_FORMAT
.q{$}, undef, 1)->[0] ) {
$revio++;
} elsif ( defined $self->list_files($smrt_name, $REVIO_PREFIX_PATTERN
.q{[.]}. $HIFI_SEQUENCE_PRODUCT .q{[.]}. $SEQUENCE_FILE_FORMAT
.q{$}, undef, 1)->[0] ) {
## non deplexed data now supported for Revio
$revio++;
}
last if $revio > 0;
}
Expand Down Expand Up @@ -166,7 +169,8 @@ sub publish_files {
($num_files_cell, $num_processed_cell, $num_errors_cell) =
$self->_publish_on_instrument_cell($smrt_name, $process_type);
}
elsif ($process_type eq $ONINST_REVIO1) {
elsif (($process_type eq $ONINST_REVIO1) ||
($process_type eq $ONINST_REVIO2)) {
($num_files_cell, $num_processed_cell, $num_errors_cell) =
$self->_publish_revio_instrument_cell($smrt_name, $process_type);
}
Expand Down Expand Up @@ -211,7 +215,7 @@ sub _processing_type {
}
elsif (defined $self->list_files($smrt_name, $self->_movie_pattern .q{[.]}.
$HIFI_SEQUENCE_PRODUCT .q{[.]}. $SEQUENCE_FILE_FORMAT .q{$})->[0]) {
$type = $ONINSTRUMENTHO;
$type = ($self->_is_onrevio) ? $ONINST_REVIO2 : $ONINSTRUMENTHO;
}
elsif (defined $self->list_files($smrt_name, $self->_movie_pattern .q{[.]}.
$HIFIUB_SEQUENCE_PRODUCT .q{[.]}. $SEQUENCE_FILE_FORMAT .q{$})->[0]) {
Expand Down Expand Up @@ -325,8 +329,7 @@ sub _publish_revio_instrument_cell {

my $pub_xml = q[sts];

my ($nfb, $npb, $neb) = $self->_publish_deplexed_files
($smrt_name);
my ($nfb, $npb, $neb) = $self->_publish_deplexed_files($smrt_name);
my ($nfx, $npx, $nex) = $self->publish_xml_files
($smrt_name, $pub_xml);
my ($nfa, $npa, $nea) = $self->publish_aux_files
Expand Down Expand Up @@ -641,6 +644,9 @@ sub publish_image_archive {
q{hifi_reads.lima_summary.txt|summary.json|fail_reads.json|hifi_reads.json|}.
q{ccs_report.json|fail_reads.unassigned.json|hifi_reads.unassigned.json};
}
elsif ($process_type eq $ONINST_REVIO2) {
$file_types = q{ccs_report.txt|summary.json|ccs_report.json};
}

my $file_pattern1 = $self->_movie_pattern .q{.}. $file_types . q{$};
my $file_count = scalar split m/[|]/msx, $file_types;
Expand All @@ -649,7 +655,8 @@ sub publish_image_archive {
push @runfolder_files, @{$runfolder_file1};

# Optional 5mC report file
my $fmc_pattern = ($process_type eq $ONINST_REVIO1) ?
my $fmc_pattern =
(($process_type eq $ONINST_REVIO1) || ($process_type eq $ONINST_REVIO2)) ?
q{fail_reads.5mc_report.json|hifi_reads.5mc_report.json} : q{5mc_report.json};
my $file_pattern2 = $self->_movie_pattern .q{.}. $fmc_pattern .q{$};
my $runfolder_file2 = $self->list_files($smrt_name,$file_pattern2);
Expand Down
2 changes: 1 addition & 1 deletion t/data/mlwh_json/pacbio.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"version":"1.0","products":[{"irods_data_relative_path":"lima_output.lbc5--lbc5.bam","id_product":"6948c60e9ee9117255f1123e2c403013596339eda68b6b8e8867bc132023955d","irods_root_collection":"/testZone/home/irods/RunPublisherTest.XXXXX.0/publish_files/2_B01/","pipeline_name":"npg-prod","seq_platform_name":"pacbio"}, {"irods_data_relative_path":"lima_output.lbc12--lbc12.bam","id_product":"40ff7f2193f3b515a6c69ab284622c935097a63ccbf3eaf09cc39b5ff44468af","irods_root_collection":"/testZone/home/irods/RunPublisherTest.XXXXX.0/publish_files/2_B01/","pipeline_name":"npg-prod","seq_platform_name":"pacbio"}]}
{"version":"1.0","products":[{"irods_data_relative_path":"lima_output.lbc5--lbc5.bam","id_product":"6948c60e9ee9117255f1123e2c403013596339eda68b6b8e8867bc132023955d","irods_root_collection":"/testZone/home/irods/RunPublisherTest.XXXXX.0/publish_files/2_B01/","pipeline_name":"npg-prod","seq_platform_name":"pacbio"}, {"irods_data_relative_path":"lima_output.lbc12--lbc12.bam","id_product":"40ff7f2193f3b515a6c69ab284622c935097a63ccbf3eaf09cc39b5ff44468af","irods_root_collection":"/testZone/home/irods/RunPublisherTest.XXXXX.0/publish_files/2_B01/","pipeline_name":"npg-prod","seq_platform_name":"pacbio"}]}
1 change: 1 addition & 0 deletions t/data/mlwh_json/pacbio2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"version":"1.0","products":[{"id_product":"04bd5036fff6b0037535221ba755725b14ef0f4f44a3ac0030f899edaae2e391","irods_root_collection":"/testZone/home/irods/RunPublisherTest.XXXXX.0/publish_sequence_files/1_A01/","irods_data_relative_path":"m84098_240322_112047_s1.bc2048--bc2048.bam","pipeline_name":"npg-prod","seq_platform_name":"pacbio"}]}
Loading

0 comments on commit 14575aa

Please sign in to comment.