Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Back end support for pool metrics #224

Merged
merged 18 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 43 additions & 2 deletions lang_qc/db/helper/wells.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, 2023 Genome Research Ltd.
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
#
# Authors:
# Marina Gourtovaia <[email protected]>
Expand All @@ -21,6 +21,7 @@

import logging
from datetime import date, datetime, timedelta
from statistics import mean, stdev
from typing import ClassVar, List

from pydantic import BaseModel, ConfigDict, Field
Expand All @@ -33,11 +34,13 @@
)
from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
from lang_qc.db.qc_schema import QcState, QcStateDict, QcType
from lang_qc.models.pacbio.qc_data import QCPoolMetrics, SampleDeplexingStats
from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellSummary
from lang_qc.models.pager import PagedResponse
from lang_qc.models.qc_flow_status import QcFlowStatusEnum
from lang_qc.models.qc_state import QcState as QcStateModel
from lang_qc.util.errors import EmptyListOfRunNamesError, RunNotFoundError
from lang_qc.util.type_checksum import PacBioWellSHA256

"""
This package is using an undocumented feature of Pydantic, type
Expand All @@ -64,7 +67,7 @@ class WellWh(BaseModel):
# The TestClient seems to be keeping these instances alive and changing them.

def get_mlwh_well_by_product_id(
self, id_product: str
self, id_product: PacBioWellSHA256
) -> PacBioRunWellMetrics | None:
"""
Returns a well row record from the well metrics table or
Expand All @@ -77,6 +80,44 @@ def get_mlwh_well_by_product_id(
)
).scalar_one_or_none()

def get_metrics_by_well_product_id(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other QC metrics in lang_qc/models/pacbio/qc_data.py have class methods to self-populate themselves. It might be reasonable to move this code to such a class method.

self, id_product: PacBioWellSHA256
) -> QCPoolMetrics | None:
well = self.get_mlwh_well_by_product_id(id_product)
if well:
product_metrics = well.pac_bio_product_metrics
if len(product_metrics) == 1:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One product might still have a tag and trigger demultiplexing. Let's change to a condition that checks whether the well.demultiplex_mode value contains the Instrument substring.

return None

cov: float | None
if any(p.hifi_num_reads is None for p in product_metrics):
cov = None
else:
hifi_reads = [prod.hifi_num_reads for prod in product_metrics]
cov = stdev(hifi_reads) / mean(hifi_reads) * 100

return QCPoolMetrics(
pool_coeff_of_variance=cov,
products=[
SampleDeplexingStats(
id_product=prod.id_pac_bio_product,
tag1_name=prod.pac_bio_run.tag_identifier,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The product table might not be linked to the pac_bio_run table. It is fine to have either nothing or n/a, but @ces asks for a reason to be clearly displayed. I wonder whether a comment field should be added to this object.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose I should have a deliberately unlinked test case to go with this possibility.

Given that this is a data integrity problem, I wonder whether the right thing to do is to return an error to the client (say catch the exception in the controller) and render something like "data unlinked, inform NPG"? That's more immediately informative than leaving empty holes for them to guess at the significance.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree

tag2_name=prod.pac_bio_run.tag2_identifier,
hifi_read_bases=prod.hifi_read_bases,
hifi_num_reads=prod.hifi_num_reads,
hifi_read_length_mean=prod.hifi_read_length_mean,
hifi_bases_percent=prod.hifi_bases_percent,
percentage_total_reads=(
prod.hifi_num_reads / well.hifi_num_reads * 100
if well.hifi_num_reads
else None
),
)
for prod in product_metrics
],
)
return None

def recent_completed_wells(self) -> List[PacBioRunWellMetrics]:
"""
Get recent not QC-ed completed wells from the mlwh database.
Expand Down
31 changes: 27 additions & 4 deletions lang_qc/endpoints/pacbio_well.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, 2023 Genome Research Ltd.
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
#
# Authors:
# Adam Blanchet
Expand Down Expand Up @@ -37,6 +37,7 @@
from lang_qc.db.mlwh_connection import get_mlwh_db
from lang_qc.db.qc_connection import get_qc_db
from lang_qc.db.qc_schema import User
from lang_qc.models.pacbio.qc_data import QCPoolMetrics
from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellFull
from lang_qc.models.qc_flow_status import QcFlowStatusEnum
from lang_qc.models.qc_state import QcState, QcStateBasic
Expand All @@ -46,7 +47,7 @@
InvalidDictValueError,
RunNotFoundError,
)
from lang_qc.util.type_checksum import ChecksumSHA256
from lang_qc.util.type_checksum import ChecksumSHA256, PacBioWellSHA256

"""
A collection of API endpoints that are specific to the PacBio sequencing
Expand Down Expand Up @@ -173,7 +174,7 @@ def get_wells_in_run(
response_model=PacBioWellFull,
)
def get_seq_metrics(
id_product: ChecksumSHA256,
id_product: PacBioWellSHA256,
mlwhdb_session: Session = Depends(get_mlwh_db),
qcdb_session: Session = Depends(get_qc_db),
) -> PacBioWellFull:
Expand All @@ -185,6 +186,28 @@ def get_seq_metrics(
return PacBioWellFull(db_well=mlwh_well, qc_state=qc_state)


@router.get(
    "/products/{id_product}/seq_level/pool",
    summary="Get sample (deplexing) metrics for a multiplexed well product by the well ID",
    responses={
        status.HTTP_404_NOT_FOUND: {"description": "Product not found"},
        status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"},
    },
    response_model=QCPoolMetrics,
)
def get_product_metrics(
    id_product: PacBioWellSHA256, mlwhdb_session: Session = Depends(get_mlwh_db)
) -> QCPoolMetrics:
    """
    Returns pool (deplexing) metrics for the well identified by `id_product`.

    The metrics are looked up via `WellWh.get_metrics_by_well_product_id`.
    When that lookup yields nothing, a 404 (Not Found) response is returned
    to the client.
    """

    pool_metrics = WellWh(
        mlwh_session=mlwhdb_session
    ).get_metrics_by_well_product_id(id_product)
    if pool_metrics is not None:
        return pool_metrics

    # Use the named constant, as in the `responses` declaration above,
    # rather than a bare integer literal.
    raise HTTPException(
        status_code=status.HTTP_404_NOT_FOUND,
        detail="Well does not have any pool metrics",
    )


@router.post(
"/products/{id_product}/qc_claim",
summary="Claim the well to start QC",
Expand All @@ -210,7 +233,7 @@ def get_seq_metrics(
status_code=status.HTTP_201_CREATED,
)
def claim_qc(
id_product: ChecksumSHA256,
id_product: PacBioWellSHA256,
user: User = Depends(check_user),
qcdb_session: Session = Depends(get_qc_db),
mlwhdb_session: Session = Depends(get_mlwh_db),
Expand Down
31 changes: 30 additions & 1 deletion lang_qc/models/pacbio/qc_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, 2023 Genome Research Ltd.
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
#
# Authors:
# Marina Gourtovaia <[email protected]>
Expand All @@ -23,6 +23,7 @@
from pydantic import BaseModel, ConfigDict, Field

from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
from lang_qc.util.type_checksum import PacBioProductSHA256


# Pydantic prohibits us from defining these as @classmethod or @staticmethod
Expand Down Expand Up @@ -153,3 +154,31 @@ def from_orm(cls, obj: PacBioRunWellMetrics):
qc_data[name]["value"] = getattr(obj, name, None)

return cls.model_validate(qc_data)


class SampleDeplexingStats(BaseModel):
"""
A representation of metrics for one product, some direct from the DB and others inferred

For a long time tag2_name was null and tag1_name was silently used at both ends of the sequence.
As a result tag2_name will be None for most data in or before 2024.
"""

id_product: PacBioProductSHA256
tag1_name: str | None
tag2_name: str | None
mgcam marked this conversation as resolved.
Show resolved Hide resolved
hifi_read_bases: int | None
hifi_num_reads: int | None
hifi_read_length_mean: float | None
hifi_bases_percent: float | None
percentage_total_reads: float | None


class QCPoolMetrics(BaseModel):
    """
    Metrics for a pool of products: a single pool-level statistic and a list
    of per-product deplexing metrics.
    """

    # The attribute name is part of the API and is kept as is; only the
    # human-readable text uses the standard statistical term "coefficient
    # of variation".
    pool_coeff_of_variance: float | None = Field(
        title="Coefficient of variation for reads in the pool",
        description=(
            "Standard deviation of the read counts of the products in the pool, "
            "expressed as a percentage of their mean; null when not available"
        ),
    )
    products: list[SampleDeplexingStats] = Field(
        title="List of products and their metrics"
    )
18 changes: 18 additions & 0 deletions lang_qc/util/type_checksum.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,21 @@ def validate(cls, v, _):

def __repr__(self):
return f"ChecksumSHA256({super().__repr__()})"


class PacBioWellSHA256(ChecksumSHA256):
    """
    SHA256 checksum that identifies one well: it is generated from the
    coordinates of a single well on a plate in a PacBio run.
    """


class PacBioProductSHA256(ChecksumSHA256):
    """
    SHA256 checksum that identifies a PacBio product. It is generated from
    the combination of run, well, plate and any tags required for deplexing,
    see `npg_id_generation.pac_bio.PacBioEntity`.
    Tags are part of the checksum input only when samples are multiplexed.
    """
Loading