Merge pull request #231 from mgcam/better_error_propagation

Moved QCPoolMetrics model generation to the model itself.
wtsi-npg · Jun 17, 2024 · 58c53bf · 58c53bf
2 parents efff421 + 221fa08
commit 58c53bf
Show file tree

Hide file tree

Showing 4 changed files with 113 additions and 80 deletions.
diff --git a/lang_qc/db/helper/wells.py b/lang_qc/db/helper/wells.py
@@ -21,7 +21,6 @@
 
 import logging
 from datetime import date, datetime, timedelta
-from statistics import mean, stdev
 from typing import ClassVar, List
 
 from pydantic import BaseModel, ConfigDict, Field
@@ -34,7 +33,6 @@
 )
 from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
 from lang_qc.db.qc_schema import QcState, QcStateDict, QcType
-from lang_qc.models.pacbio.qc_data import QCPoolMetrics, SampleDeplexingStats
 from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellSummary
 from lang_qc.models.pager import PagedResponse
 from lang_qc.models.qc_flow_status import QcFlowStatusEnum
@@ -80,52 +78,6 @@ def get_mlwh_well_by_product_id(
             )
         ).scalar_one_or_none()
 
-    def get_metrics_by_well_product_id(
-        self, id_product: PacBioWellSHA256
-    ) -> QCPoolMetrics | None:
-        well = self.get_mlwh_well_by_product_id(id_product)
-        if well and well.demultiplex_mode and "Instrument" in well.demultiplex_mode:
-
-            product_metrics = well.pac_bio_product_metrics
-            lib_lims_data = [
-                product.pac_bio_run
-                for product in product_metrics
-                if product.pac_bio_run is not None
-            ]
-            if len(lib_lims_data) != len(product_metrics):
-                raise Exception("Partially linked LIMS data or no linked LIMS data")
-
-            cov: float | None
-            if any(p.hifi_num_reads is None for p in product_metrics):
-                cov = None
-            else:
-                hifi_reads = [prod.hifi_num_reads for prod in product_metrics]
-                cov = stdev(hifi_reads) / mean(hifi_reads) * 100
-
-            sample_stats = []
-            for (i, prod) in enumerate(product_metrics):
-                sample_stats.append(
-                    SampleDeplexingStats(
-                        id_product=prod.id_pac_bio_product,
-                        tag1_name=lib_lims_data[i].tag_identifier,
-                        tag2_name=lib_lims_data[i].tag2_identifier,
-                        deplexing_barcode=prod.barcode4deplexing,
-                        hifi_read_bases=prod.hifi_read_bases,
-                        hifi_num_reads=prod.hifi_num_reads,
-                        hifi_read_length_mean=prod.hifi_read_length_mean,
-                        hifi_bases_percent=prod.hifi_bases_percent,
-                        percentage_total_reads=(
-                            prod.hifi_num_reads / well.hifi_num_reads * 100
-                            if (well.hifi_num_reads and prod.hifi_num_reads)
-                            else None
-                        ),
-                    )
-                )
-
-            return QCPoolMetrics(pool_coeff_of_variance=cov, products=sample_stats)
-
-        return None
-
     def recent_completed_wells(self) -> List[PacBioRunWellMetrics]:
         """
         Get recent not QC-ed completed wells from the mlwh database.

diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py
@@ -222,20 +222,21 @@ def get_seq_metrics(
     summary="Get sample (deplexing) metrics for a multiplexed well product by the well ID",
     responses={
         status.HTTP_404_NOT_FOUND: {"description": "Product not found"},
+        status.HTTP_409_CONFLICT: {"description": "Missing or incomplete LIMS data"},
         status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"},
     },
     response_model=QCPoolMetrics,
 )
 def get_product_metrics(
     id_product: PacBioWellSHA256, mlwhdb_session: Session = Depends(get_mlwh_db)
 ) -> QCPoolMetrics:
-    metrics = WellWh(mlwh_session=mlwhdb_session).get_metrics_by_well_product_id(
-        id_product
-    )
-    if metrics is None:
-        raise HTTPException(
-            status_code=404, detail="Well does not have any pool metrics"
-        )
+    # Incomplete LIMS data is reported to the client as 409 Conflict.
+    mlwh_well = _find_well_product_or_error(id_product, mlwhdb_session)
+    try:
+        metrics = QCPoolMetrics(db_well=mlwh_well)
+    except MissingLimsDataError as err:
+        raise HTTPException(409, detail=str(err)) from err
+
     return metrics
 
 

diff --git a/lang_qc/models/pacbio/qc_data.py b/lang_qc/models/pacbio/qc_data.py
@@ -20,9 +20,14 @@
 # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <http://www.gnu.org/licenses/>
 
-from pydantic import BaseModel, ConfigDict, Field
+from statistics import mean, stdev
+from typing import Any
+
+from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic.dataclasses import dataclass
 
 from lang_qc.db.mlwh_schema import PacBioRunWellMetrics
+from lang_qc.util.errors import MissingLimsDataError
 from lang_qc.util.type_checksum import PacBioProductSHA256
 
 
@@ -175,11 +180,72 @@ class SampleDeplexingStats(BaseModel):
     percentage_total_reads: float | None
 
 
-class QCPoolMetrics(BaseModel):
+@dataclass(kw_only=True, frozen=True)
+class QCPoolMetrics:
+    # Pool-level and per-product deplexing metrics; derived from db_well if given.
+    db_well: PacBioRunWellMetrics = Field(init_var=True)
     pool_coeff_of_variance: float | None = Field(
         title="Coefficient of variance for reads in the pool",
         description="Percentage of the standard deviation w.r.t. mean, when pool is more than one",
     )
     products: list[SampleDeplexingStats] = Field(
         title="List of products and their metrics"
     )
+
+    @model_validator(mode="before")
+    @classmethod
+    def pre_root(cls, values: Any) -> dict[str, Any]:
+        """
+        Derives pool_coeff_of_variance and products from the mlwh well row
+        given as db_well; without db_well returns the keyword arguments as is.
+
+        `values` is expected to be a pydantic_core ArgsKwargs object, see
+        https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi
+        Raises ValueError if db_well is None and MissingLimsDataError if a
+        product of a well deplexed on the instrument has no linked LIMS data.
+        """
+        db_well_key_name = "db_well"
+        kwargs = values.kwargs or {}  # kwargs may be None when none were given
+        if db_well_key_name not in kwargs:
+            return kwargs
+
+        well: PacBioRunWellMetrics = kwargs[db_well_key_name]
+        if well is None:
+            raise ValueError(f"None {db_well_key_name} value is not allowed.")
+
+        cov: float | None = None
+        sample_stats = []
+
+        if well.demultiplex_mode and "Instrument" in well.demultiplex_mode:
+            product_metrics = well.pac_bio_product_metrics
+            if any(p.pac_bio_run is None for p in product_metrics):
+                raise MissingLimsDataError(
+                    "Partially linked LIMS data or no linked LIMS data"
+                )
+
+            hifi_reads = [p.hifi_num_reads for p in product_metrics]
+            # stdev() needs two or more values; a zero mean cannot be a divisor.
+            if len(hifi_reads) > 1 and None not in hifi_reads:
+                avg = mean(hifi_reads)
+                cov = stdev(hifi_reads) / avg * 100 if avg else None
+
+            for prod in product_metrics:
+                sample_stats.append(
+                    SampleDeplexingStats(
+                        id_product=prod.id_pac_bio_product,
+                        tag1_name=prod.pac_bio_run.tag_identifier,
+                        tag2_name=prod.pac_bio_run.tag2_identifier,
+                        deplexing_barcode=prod.barcode4deplexing,
+                        hifi_read_bases=prod.hifi_read_bases,
+                        hifi_num_reads=prod.hifi_num_reads,
+                        hifi_read_length_mean=prod.hifi_read_length_mean,
+                        hifi_bases_percent=prod.hifi_bases_percent,
+                        percentage_total_reads=(
+                            prod.hifi_num_reads / well.hifi_num_reads * 100
+                            if (well.hifi_num_reads and prod.hifi_num_reads)
+                            else None
+                        ),
+                    )
+                )
+
+        return {"pool_coeff_of_variance": cov, "products": sample_stats}
diff --git a/tests/test_pac_bio_qc_data_well.py b/tests/test_pac_bio_qc_data_well.py
@@ -2,7 +2,8 @@
 from npg_id_generation.pac_bio import PacBioEntity
 
 from lang_qc.db.helper.wells import WellWh
-from lang_qc.models.pacbio.qc_data import QCDataWell
+from lang_qc.models.pacbio.qc_data import QCDataWell, QCPoolMetrics
+from lang_qc.util.errors import MissingLimsDataError
 from tests.fixtures.sample_data import multiplexed_run, simplex_run
 
 
@@ -103,47 +104,60 @@ def test_creating_qc_data_well(mlwhdb_test_session, mlwhdb_load_runs):
 
 
 def test_pool_metrics_from_single_sample_well(mlwhdb_test_session, simplex_run):
-    helper = WellWh(session=mlwhdb_test_session)
+
     id = PacBioEntity(
         run_name=simplex_run.pac_bio_run_name,
         well_label=simplex_run.well_label,
         plate_number=simplex_run.plate_number,
     ).hash_product_id()
+    helper = WellWh(session=mlwhdb_test_session)
+    row = helper.get_mlwh_well_by_product_id(id)
 
-    metrics = helper.get_metrics_by_well_product_id(id)
-    assert metrics is None, "Got no metrics for a one-sample well"
+    metric = QCPoolMetrics(db_well=row)
+    assert metric.pool_coeff_of_variance is None
+    assert metric.products == []
 
 
 def test_pool_metrics_from_well(mlwhdb_test_session, multiplexed_run):
-    helper = WellWh(session=mlwhdb_test_session)
+
     id = PacBioEntity(run_name="RUN", well_label="B1", plate_number=1).hash_product_id()
-    metrics = helper.get_metrics_by_well_product_id(id)
+    helper = WellWh(session=mlwhdb_test_session)
+    row = helper.get_mlwh_well_by_product_id(id)
+    metrics_via_db = QCPoolMetrics(db_well=row)
+    metrics_direct = QCPoolMetrics(
+        pool_coeff_of_variance=metrics_via_db.pool_coeff_of_variance,
+        products=metrics_via_db.products,
+    )
 
-    assert metrics, "Two samples means we get a metrics response"
-    assert (
-        int(metrics.pool_coeff_of_variance) == 47
-    ), "Variance between 20 and 10 is ~47%"
+    for metrics in [metrics_via_db, metrics_direct]:
+        assert (
+            int(metrics.pool_coeff_of_variance) == 47
+        ), "Variance between 20 and 10 is ~47%"
 
-    assert metrics.products[0].hifi_read_bases == 100
-    assert (
-        metrics.products[1].hifi_read_bases == 900
-    ), "hifi read base counts are faithfully copied"
+        assert metrics.products[0].hifi_read_bases == 100
+        assert (
+            metrics.products[1].hifi_read_bases == 900
+        ), "hifi read base counts are faithfully copied"
+
+        assert (
+            int(metrics.products[0].percentage_total_reads) == 33
+        ), "10 of 30 reads is 33.3%"
+        assert (
+            int(metrics.products[1].percentage_total_reads) == 66
+        ), "20 of 30 reads is 66.6%"
 
-    assert (
-        int(metrics.products[0].percentage_total_reads) == 33
-    ), "10 of 30 reads is 33.3%"
-    assert (
-        int(metrics.products[1].percentage_total_reads) == 66
-    ), "20 of 30 reads is 66.6%"
 
+def test_errors_instantiating_pool_metrics(mlwhdb_test_session):
 
-def test_pool_metrics_from_well(mlwhdb_test_session):
+    with pytest.raises(ValueError, match=r"None db_well value is not allowed."):
+        QCPoolMetrics(db_well=None)
 
     id = PacBioEntity(
         run_name="TRACTION-RUN-1140", well_label="C1", plate_number=2
     ).hash_product_id()
     helper = WellWh(session=mlwhdb_test_session)
+    row = helper.get_mlwh_well_by_product_id(id)
     with pytest.raises(
-        Exception, match=r"Partially linked LIMS data or no linked LIMS data"
+        MissingLimsDataError, match=r"Partially linked LIMS data or no linked LIMS data"
     ):
-        helper.get_metrics_by_well_product_id(id)
+        QCPoolMetrics(db_well=row)