Merge pull request #224 from nerdstrike/render_product_metrics

Back end support for pool metrics
wtsi-npg · Jun 14, 2024 · 6516d9b · 6516d9b
2 parents 0b225e7 + d8b755d
commit 6516d9b
Show file tree

Hide file tree

Showing 9 changed files with 464 additions and 51 deletions.
diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, 2023 Genome Research Ltd.
+# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
 #
 # Authors:
 #   Marina Gourtovaia <[email protected]>
@@ -21,6 +21,7 @@
 
 import logging
 from datetime import date, datetime, timedelta
+from statistics import mean, stdev
 from typing import ClassVar, List
 
 from pydantic import BaseModel, ConfigDict, Field
@@ -33,11 +34,13 @@
 )
 from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
 from lang_qc.db.qc_schema import QcState, QcStateDict, QcType
+from lang_qc.models.pacbio.qc_data import QCPoolMetrics, SampleDeplexingStats
 from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellSummary
 from lang_qc.models.pager import PagedResponse
 from lang_qc.models.qc_flow_status import QcFlowStatusEnum
 from lang_qc.models.qc_state import QcState as QcStateModel
 from lang_qc.util.errors import EmptyListOfRunNamesError, RunNotFoundError
+from lang_qc.util.type_checksum import PacBioWellSHA256
 
 """
 This package is using an undocumented feature of Pydantic, type
@@ -64,7 +67,7 @@ class WellWh(BaseModel):
     # The TestClient seems to be keeping these instances alive and changing them.
 
     def get_mlwh_well_by_product_id(
-        self, id_product: str
+        self, id_product: PacBioWellSHA256
     ) -> PacBioRunWellMetrics | None:
         """
         Returns a well row record from the well metrics table or
@@ -77,6 +80,52 @@ def get_mlwh_well_by_product_id(
             )
         ).scalar_one_or_none()
 
+    def get_metrics_by_well_product_id(
+        self, id_product: PacBioWellSHA256
+    ) -> QCPoolMetrics | None:
+        """
+        Returns per-sample deplexing metrics for the pool sequenced in a well.
+
+        None is returned if no well row is found for `id_product`, or if the
+        well's `demultiplex_mode` is unset or does not contain "Instrument".
+
+        The coefficient of variance of the pool is the standard deviation of
+        the products' HiFi read counts expressed as a percentage of their
+        mean. It is None when it cannot be computed: fewer than two products,
+        a product without a read count, or a mean read count of zero.
+
+        An exception is raised if any product of the well has no linked
+        LIMS data.
+        """
+        well = self.get_mlwh_well_by_product_id(id_product)
+        if not (
+            well and well.demultiplex_mode and "Instrument" in well.demultiplex_mode
+        ):
+            return None
+
+        product_metrics = well.pac_bio_product_metrics
+        lib_lims_data = [
+            product.pac_bio_run
+            for product in product_metrics
+            if product.pac_bio_run is not None
+        ]
+        if len(lib_lims_data) != len(product_metrics):
+            raise Exception("Partially linked LIMS data or no linked LIMS data")
+
+        hifi_reads = [prod.hifi_num_reads for prod in product_metrics]
+        cov: float | None = None
+        # statistics.stdev() needs at least two data points and the ratio is
+        # undefined for a zero mean, so the value stays None in these cases
+        # instead of raising StatisticsError or ZeroDivisionError.
+        if len(hifi_reads) > 1 and all(reads is not None for reads in hifi_reads):
+            mean_reads = mean(hifi_reads)
+            if mean_reads:
+                cov = stdev(hifi_reads) / mean_reads * 100
+
+        # After the length check above every product has LIMS data, so the
+        # two lists are aligned and can be walked in lockstep.
+        sample_stats = [
+            SampleDeplexingStats(
+                id_product=prod.id_pac_bio_product,
+                tag1_name=lims.tag_identifier,
+                tag2_name=lims.tag2_identifier,
+                deplexing_barcode=prod.barcode4deplexing,
+                hifi_read_bases=prod.hifi_read_bases,
+                hifi_num_reads=prod.hifi_num_reads,
+                hifi_read_length_mean=prod.hifi_read_length_mean,
+                hifi_bases_percent=prod.hifi_bases_percent,
+                percentage_total_reads=(
+                    prod.hifi_num_reads / well.hifi_num_reads * 100
+                    if (well.hifi_num_reads and prod.hifi_num_reads)
+                    else None
+                ),
+            )
+            for prod, lims in zip(product_metrics, lib_lims_data)
+        ]
+
+        return QCPoolMetrics(pool_coeff_of_variance=cov, products=sample_stats)
+
     def recent_completed_wells(self) -> List[PacBioRunWellMetrics]:
         """
         Get recent not QC-ed completed wells from the mlwh database.

diff --git a/lang_qc/db/mlwh_schema.py b/lang_qc/db/mlwh_schema.py
@@ -639,6 +639,11 @@ class PacBioProductMetrics(Base):
     hifi_read_length_mean = Column(
         mysqlINTEGER(unsigned=True), nullable=True, comment="The mean HiFi read length"
     )
+    barcode4deplexing = Column(
+        mysqlVARCHAR(62),
+        nullable=True,
+        comment="The barcode recorded in producing deplexed metrics for this product",
+    )
     barcode_quality_score_mean = Column(
         mysqlSMALLINT(unsigned=True),
         nullable=True,

diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, 2023 Genome Research Ltd.
+# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
 #
 # Authors:
 #   Adam Blanchet
@@ -37,6 +37,7 @@
 from lang_qc.db.mlwh_connection import get_mlwh_db
 from lang_qc.db.qc_connection import get_qc_db
 from lang_qc.db.qc_schema import User
+from lang_qc.models.pacbio.qc_data import QCPoolMetrics
 from lang_qc.models.pacbio.well import (
     PacBioPagedWells,
     PacBioWellFull,
@@ -51,7 +52,7 @@
     MissingLimsDataError,
     RunNotFoundError,
 )
-from lang_qc.util.type_checksum import ChecksumSHA256
+from lang_qc.util.type_checksum import ChecksumSHA256, PacBioWellSHA256
 
 """
 A collection of API endpoints that are specific to the PacBio sequencing
@@ -204,7 +205,7 @@ def get_well_lims_info(
     response_model=PacBioWellFull,
 )
 def get_seq_metrics(
-    id_product: ChecksumSHA256,
+    id_product: PacBioWellSHA256,
     mlwhdb_session: Session = Depends(get_mlwh_db),
     qcdb_session: Session = Depends(get_qc_db),
 ) -> PacBioWellFull:
@@ -216,6 +217,28 @@ def get_seq_metrics(
     return PacBioWellFull(db_well=mlwh_well, qc_state=qc_state)
 
 
+@router.get(
+    "/products/{id_product}/seq_level/pool",
+    summary="Get sample (deplexing) metrics for a multiplexed well product by the well ID",
+    responses={
+        status.HTTP_404_NOT_FOUND: {"description": "Product not found"},
+        status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"},
+    },
+    response_model=QCPoolMetrics,
+)
+def get_product_metrics(
+    id_product: PacBioWellSHA256,
+    mlwhdb_session: Session = Depends(get_mlwh_db),
+) -> QCPoolMetrics:
+    # Delegates to the warehouse helper, which yields None when there are no
+    # pool metrics to report for this well product ID.
+    well_helper = WellWh(mlwh_session=mlwhdb_session)
+    pool_metrics = well_helper.get_metrics_by_well_product_id(id_product)
+    if pool_metrics is not None:
+        return pool_metrics
+
+    # No metrics available: report this to the client as "not found".
+    raise HTTPException(
+        status_code=status.HTTP_404_NOT_FOUND,
+        detail="Well does not have any pool metrics",
+    )
+
+
 @router.post(
     "/products/{id_product}/qc_claim",
     summary="Claim the well to start QC",
@@ -241,7 +264,7 @@ def get_seq_metrics(
     status_code=status.HTTP_201_CREATED,
 )
 def claim_qc(
-    id_product: ChecksumSHA256,
+    id_product: PacBioWellSHA256,
     user: User = Depends(check_user),
     qcdb_session: Session = Depends(get_qc_db),
     mlwhdb_session: Session = Depends(get_mlwh_db),

diff --git a/lang_qc/models/pacbio/qc_data.py b/lang_qc/models/pacbio/qc_data.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, 2023 Genome Research Ltd.
+# Copyright (c) 2022, 2023, 2024 Genome Research Ltd.
 #
 # Authors:
 #   Marina Gourtovaia <[email protected]>
@@ -23,6 +23,7 @@
 from pydantic import BaseModel, ConfigDict, Field
 
 from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
+from lang_qc.util.type_checksum import PacBioProductSHA256
 
 
 # Pydantic prohibits us from defining these as @classmethod or @staticmethod
@@ -153,3 +154,32 @@ def from_orm(cls, obj: PacBioRunWellMetrics):
                     qc_data[name]["value"] = getattr(obj, name, None)
 
         return cls.model_validate(qc_data)
+
+
+class SampleDeplexingStats(BaseModel):
+    """
+    A representation of metrics for one product, some direct from the DB and others inferred
+
+    For a long time tag2_name was null and tag1_name was silently used at both ends of the sequence.
+    As a result tag2_name will be None for most data in or before 2024.
+    """
+
+    # None of the fields declares a default, so every one of them has to be
+    # passed explicitly when an instance is created, even when the value is None.
+
+    # Product ID; WellWh.get_metrics_by_well_product_id fills it from
+    # `id_pac_bio_product` of the product metrics row.
+    id_product: PacBioProductSHA256
+    # Filled from `tag_identifier` and `tag2_identifier` of the linked LIMS row.
+    tag1_name: str | None
+    tag2_name: str | None
+    # Filled from the `barcode4deplexing` column of the product metrics row.
+    deplexing_barcode: str | None
+    # The four HiFi values are copied from the same-named product metrics columns.
+    hifi_read_bases: int | None
+    hifi_num_reads: int | None
+    hifi_read_length_mean: float | None
+    hifi_bases_percent: float | None
+    # The product's HiFi read count as a percentage of the well's HiFi read
+    # count; None when either count is missing or zero.
+    percentage_total_reads: float | None
+
+
+# Pool-level deplexing summary: one aggregate value plus the per-product
+# statistics. Used as the response model of the
+# "/products/{id_product}/seq_level/pool" endpoint.
+class QCPoolMetrics(BaseModel):
+    # Standard deviation of the products' HiFi read counts divided by their
+    # mean, times 100; None when it cannot be computed (for example, when a
+    # product has no read count).
+    pool_coeff_of_variance: float | None = Field(
+        title="Coefficient of variance for reads in the pool",
+        description="Percentage of the standard deviation w.r.t. mean, when pool is more than one",
+    )
+    # One SampleDeplexingStats entry per product of the well.
+    products: list[SampleDeplexingStats] = Field(
+        title="List of products and their metrics"
+    )
diff --git a/lang_qc/util/type_checksum.py b/lang_qc/util/type_checksum.py
@@ -40,3 +40,21 @@ def validate(cls, v, _):
 
     def __repr__(self):
         return f"ChecksumSHA256({super().__repr__()})"
+
+
+class PacBioWellSHA256(ChecksumSHA256):
+    """
+    SHA256 checksum identifying a single well on a plate in a PacBio run;
+    it is generated from the coordinates of that well.
+    """
+
+
+class PacBioProductSHA256(ChecksumSHA256):
+    """
+    SHA256 checksum identifying a PacBio product. It is generated from the
+    combination of run, well, plate and any tags required for deplexing,
+    see `npg_id_generation.pac_bio.PacBioEntity`.
+    The tags contribute to the checksum only when samples are multiplexed.
+    """