Skip to content

Commit

Permalink
Merge pull request #224 from nerdstrike/render_product_metrics
Browse files Browse the repository at this point in the history
Back end support for pool metrics
  • Loading branch information
mgcam authored Jun 14, 2024
2 parents 0b225e7 + d8b755d commit 6516d9b
Show file tree
Hide file tree
Showing 9 changed files with 464 additions and 51 deletions.
53 changes: 51 additions & 2 deletions lang_qc/db/helper/wells.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, 2023 Genome Research Ltd.
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
#
# Authors:
# Marina Gourtovaia <[email protected]>
Expand All @@ -21,6 +21,7 @@

import logging
from datetime import date, datetime, timedelta
from statistics import mean, stdev
from typing import ClassVar, List

from pydantic import BaseModel, ConfigDict, Field
Expand All @@ -33,11 +34,13 @@
)
from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
from lang_qc.db.qc_schema import QcState, QcStateDict, QcType
from lang_qc.models.pacbio.qc_data import QCPoolMetrics, SampleDeplexingStats
from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellSummary
from lang_qc.models.pager import PagedResponse
from lang_qc.models.qc_flow_status import QcFlowStatusEnum
from lang_qc.models.qc_state import QcState as QcStateModel
from lang_qc.util.errors import EmptyListOfRunNamesError, RunNotFoundError
from lang_qc.util.type_checksum import PacBioWellSHA256

"""
This package is using an undocumented feature of Pydantic, type
Expand All @@ -64,7 +67,7 @@ class WellWh(BaseModel):
# The TestClient seems to be keeping these instances alive and changing them.

def get_mlwh_well_by_product_id(
self, id_product: str
self, id_product: PacBioWellSHA256
) -> PacBioRunWellMetrics | None:
"""
Returns a well row record from the well metrics table or
Expand All @@ -77,6 +80,52 @@ def get_mlwh_well_by_product_id(
)
).scalar_one_or_none()

def get_metrics_by_well_product_id(
    self, id_product: PacBioWellSHA256
) -> QCPoolMetrics | None:
    """
    Returns deplexing metrics for the pool sequenced in a well, or `None`.

    `None` is returned when no well row is found for `id_product`, or when
    the well's `demultiplex_mode` is unset or does not contain the string
    "Instrument".

    The returned object contains one `SampleDeplexingStats` entry per
    product metrics row linked to the well and the pool coefficient of
    variance, i.e. the standard deviation of the per-product HiFi read
    counts expressed as a percentage of their mean. The coefficient is
    `None` when it cannot be computed:
      - the pool has fewer than two products,
      - the HiFi read count is undefined for any of the products,
      - the mean of the HiFi read counts is zero.

    Raises an `Exception` if any of the product metrics rows has no linked
    LIMS data (`pac_bio_run`).
    """
    well = self.get_mlwh_well_by_product_id(id_product)
    if not (
        well and well.demultiplex_mode and "Instrument" in well.demultiplex_mode
    ):
        return None

    product_metrics = well.pac_bio_product_metrics
    lib_lims_data = [
        product.pac_bio_run
        for product in product_metrics
        if product.pac_bio_run is not None
    ]
    if len(lib_lims_data) != len(product_metrics):
        raise Exception("Partially linked LIMS data or no linked LIMS data")

    hifi_reads = [prod.hifi_num_reads for prod in product_metrics]
    cov: float | None = None
    # statistics.stdev() needs at least two data points, so a pool of one
    # (or an empty pool) has no coefficient of variance.
    if len(hifi_reads) > 1 and all(num is not None for num in hifi_reads):
        reads_mean = mean(hifi_reads)
        # A zero mean (no reads for any product) would cause a division
        # by zero; the coefficient is undefined in this case.
        if reads_mean:
            cov = stdev(hifi_reads) / reads_mean * 100

    # The length check above guarantees that the two lists are aligned.
    sample_stats = [
        SampleDeplexingStats(
            id_product=prod.id_pac_bio_product,
            tag1_name=lims.tag_identifier,
            tag2_name=lims.tag2_identifier,
            deplexing_barcode=prod.barcode4deplexing,
            hifi_read_bases=prod.hifi_read_bases,
            hifi_num_reads=prod.hifi_num_reads,
            hifi_read_length_mean=prod.hifi_read_length_mean,
            hifi_bases_percent=prod.hifi_bases_percent,
            # Left undefined if either the well or the product read
            # count is unset or zero.
            percentage_total_reads=(
                prod.hifi_num_reads / well.hifi_num_reads * 100
                if (well.hifi_num_reads and prod.hifi_num_reads)
                else None
            ),
        )
        for prod, lims in zip(product_metrics, lib_lims_data)
    ]

    return QCPoolMetrics(pool_coeff_of_variance=cov, products=sample_stats)

def recent_completed_wells(self) -> List[PacBioRunWellMetrics]:
"""
Get recent not QC-ed completed wells from the mlwh database.
Expand Down
5 changes: 5 additions & 0 deletions lang_qc/db/mlwh_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -639,6 +639,11 @@ class PacBioProductMetrics(Base):
hifi_read_length_mean = Column(
mysqlINTEGER(unsigned=True), nullable=True, comment="The mean HiFi read length"
)
barcode4deplexing = Column(
mysqlVARCHAR(62),
nullable=True,
comment="The barcode recorded in producing deplexed metrics for this product",
)
barcode_quality_score_mean = Column(
mysqlSMALLINT(unsigned=True),
nullable=True,
Expand Down
31 changes: 27 additions & 4 deletions lang_qc/endpoints/pacbio_well.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, 2023 Genome Research Ltd.
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
#
# Authors:
# Adam Blanchet
Expand Down Expand Up @@ -37,6 +37,7 @@
from lang_qc.db.mlwh_connection import get_mlwh_db
from lang_qc.db.qc_connection import get_qc_db
from lang_qc.db.qc_schema import User
from lang_qc.models.pacbio.qc_data import QCPoolMetrics
from lang_qc.models.pacbio.well import (
PacBioPagedWells,
PacBioWellFull,
Expand All @@ -51,7 +52,7 @@
MissingLimsDataError,
RunNotFoundError,
)
from lang_qc.util.type_checksum import ChecksumSHA256
from lang_qc.util.type_checksum import ChecksumSHA256, PacBioWellSHA256

"""
A collection of API endpoints that are specific to the PacBio sequencing
Expand Down Expand Up @@ -204,7 +205,7 @@ def get_well_lims_info(
response_model=PacBioWellFull,
)
def get_seq_metrics(
id_product: ChecksumSHA256,
id_product: PacBioWellSHA256,
mlwhdb_session: Session = Depends(get_mlwh_db),
qcdb_session: Session = Depends(get_qc_db),
) -> PacBioWellFull:
Expand All @@ -216,6 +217,28 @@ def get_seq_metrics(
return PacBioWellFull(db_well=mlwh_well, qc_state=qc_state)


@router.get(
    "/products/{id_product}/seq_level/pool",
    summary="Get sample (deplexing) metrics for a multiplexed well product by the well ID",
    responses={
        status.HTTP_404_NOT_FOUND: {"description": "Product not found"},
        status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"},
    },
    response_model=QCPoolMetrics,
)
def get_product_metrics(
    id_product: PacBioWellSHA256, mlwhdb_session: Session = Depends(get_mlwh_db)
) -> QCPoolMetrics:
    # Delegates the lookup to the WellWh helper, which gives back None
    # when it has no pool metrics to report for this well product ID.
    well_helper = WellWh(mlwh_session=mlwhdb_session)
    pool_metrics = well_helper.get_metrics_by_well_product_id(id_product)
    if pool_metrics is not None:
        return pool_metrics

    # Absence of pool metrics is reported to the client as 'not found'.
    raise HTTPException(
        status_code=404, detail="Well does not have any pool metrics"
    )


@router.post(
"/products/{id_product}/qc_claim",
summary="Claim the well to start QC",
Expand All @@ -241,7 +264,7 @@ def get_seq_metrics(
status_code=status.HTTP_201_CREATED,
)
def claim_qc(
id_product: ChecksumSHA256,
id_product: PacBioWellSHA256,
user: User = Depends(check_user),
qcdb_session: Session = Depends(get_qc_db),
mlwhdb_session: Session = Depends(get_mlwh_db),
Expand Down
32 changes: 31 additions & 1 deletion lang_qc/models/pacbio/qc_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022, 2023 Genome Research Ltd.
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
#
# Authors:
# Marina Gourtovaia <[email protected]>
Expand All @@ -23,6 +23,7 @@
from pydantic import BaseModel, ConfigDict, Field

from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
from lang_qc.util.type_checksum import PacBioProductSHA256


# Pydantic prohibits us from defining these as @classmethod or @staticmethod
Expand Down Expand Up @@ -153,3 +154,32 @@ def from_orm(cls, obj: PacBioRunWellMetrics):
qc_data[name]["value"] = getattr(obj, name, None)

return cls.model_validate(qc_data)


class SampleDeplexingStats(BaseModel):
    """
    Metrics for a single product of a pool. Some values are taken directly
    from the database, others are inferred.

    For a long time tag2_name was null and tag1_name was silently used at
    both ends of the sequence. As a result, tag2_name will be None for most
    data in or before 2024.
    """

    # Identifier of the product these metrics belong to.
    id_product: PacBioProductSHA256
    # Names of the tags of the sample; see the class documentation for
    # the reason why the second one is frequently undefined.
    tag1_name: str | None
    tag2_name: str | None
    # The barcode recorded when the deplexed metrics were produced.
    deplexing_barcode: str | None
    # HiFi metrics for this product.
    hifi_read_bases: int | None
    hifi_num_reads: int | None
    hifi_read_length_mean: float | None
    hifi_bases_percent: float | None
    # HiFi reads of this product as a percentage of HiFi reads of the well.
    percentage_total_reads: float | None


class QCPoolMetrics(BaseModel):
    """
    Metrics for a pool: a pool-level coefficient of variance together with
    the metrics of the individual products of the pool.
    """

    # Undefined (None) when the coefficient cannot be given for the pool.
    pool_coeff_of_variance: float | None = Field(
        title="Coefficient of variance for reads in the pool",
        description="Percentage of the standard deviation w.r.t. mean, when pool is more than one",
    )
    # One entry per product of the pool.
    products: list[SampleDeplexingStats] = Field(
        title="List of products and their metrics"
    )
18 changes: 18 additions & 0 deletions lang_qc/util/type_checksum.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,21 @@ def validate(cls, v, _):

def __repr__(self):
return f"ChecksumSHA256({super().__repr__()})"


class PacBioWellSHA256(ChecksumSHA256):
    """
    The checksum type for identifiers that are generated from the
    coordinates of a single well on a plate in a PacBio run.

    Nothing is added to or changed in the parent class; the distinct
    name documents which kind of checksum is expected.
    """


class PacBioProductSHA256(ChecksumSHA256):
    """
    The checksum type for identifiers that are generated from the
    combination of run, well, plate and any tags required for deplexing,
    see `npg_id_generation.pac_bio.PacBioEntity`. Tags contribute to the
    checksum only when samples are multiplexed.

    Nothing is added to or changed in the parent class; the distinct
    name documents which kind of checksum is expected.
    """
Loading

0 comments on commit 6516d9b

Please sign in to comment.