-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Back end support for pool metrics #224
Changes from 11 commits
59c2502
486faca
2b9dc32
3b3c32a
b092e19
4382aa7
7f329ff
3c9b9bb
7a55cc6
686481b
2b6ae77
3df42fa
11be2e0
6e5472a
2f9be8a
372fc56
579138b
d8b755d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# Copyright (c) 2022, 2023 Genome Research Ltd. | ||
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd. | ||
# | ||
# Authors: | ||
# Marina Gourtovaia <[email protected]> | ||
|
@@ -21,6 +21,7 @@ | |
|
||
import logging | ||
from datetime import date, datetime, timedelta | ||
from statistics import mean, stdev | ||
from typing import ClassVar, List | ||
|
||
from pydantic import BaseModel, ConfigDict, Field | ||
|
@@ -33,11 +34,13 @@ | |
) | ||
from lang_qc.db.mlwh_schema import PacBioRunWellMetrics | ||
from lang_qc.db.qc_schema import QcState, QcStateDict, QcType | ||
from lang_qc.models.pacbio.qc_data import QCPoolMetrics, SampleDeplexingStats | ||
from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellSummary | ||
from lang_qc.models.pager import PagedResponse | ||
from lang_qc.models.qc_flow_status import QcFlowStatusEnum | ||
from lang_qc.models.qc_state import QcState as QcStateModel | ||
from lang_qc.util.errors import EmptyListOfRunNamesError, RunNotFoundError | ||
from lang_qc.util.type_checksum import PacBioWellSHA256 | ||
|
||
""" | ||
This package is using an undocumented feature of Pydantic, type | ||
|
@@ -64,7 +67,7 @@ class WellWh(BaseModel): | |
# The TestClient seems to be keeping these instances alive and changing them. | ||
|
||
def get_mlwh_well_by_product_id( | ||
self, id_product: str | ||
self, id_product: PacBioWellSHA256 | ||
) -> PacBioRunWellMetrics | None: | ||
""" | ||
Returns a well row record from the well metrics table or | ||
|
@@ -77,6 +80,44 @@ def get_mlwh_well_by_product_id( | |
) | ||
).scalar_one_or_none() | ||
|
||
def get_metrics_by_well_product_id( | ||
self, id_product: PacBioWellSHA256 | ||
) -> QCPoolMetrics | None: | ||
well = self.get_mlwh_well_by_product_id(id_product) | ||
if well: | ||
product_metrics = well.pac_bio_product_metrics | ||
if len(product_metrics) == 1: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. One product might still have a tag and trigger demultiplexing. Let's change to a condition that checks whether the product has a tag. |
||
return None | ||
|
||
cov: float | None | ||
if any(p.hifi_num_reads is None for p in product_metrics): | ||
cov = None | ||
else: | ||
hifi_reads = [prod.hifi_num_reads for prod in product_metrics] | ||
cov = stdev(hifi_reads) / mean(hifi_reads) * 100 | ||
|
||
return QCPoolMetrics( | ||
pool_coeff_of_variance=cov, | ||
products=[ | ||
SampleDeplexingStats( | ||
id_product=prod.id_pac_bio_product, | ||
tag1_name=prod.pac_bio_run.tag_identifier, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The product table might not be linked to the `pac_bio_run` table. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I suppose I should have a deliberately unlinked test case to go with this possibility. Given that this is a data integrity problem, I wonder whether the right thing to do is to return an error to the client (say, catch the exception in the controller) and render something like "data unlinked, inform NPG"? That's more immediately informative than leaving empty holes for them to guess at the significance. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agree |
||
tag2_name=prod.pac_bio_run.tag2_identifier, | ||
hifi_read_bases=prod.hifi_read_bases, | ||
hifi_num_reads=prod.hifi_num_reads, | ||
hifi_read_length_mean=prod.hifi_read_length_mean, | ||
hifi_bases_percent=prod.hifi_bases_percent, | ||
percentage_total_reads=( | ||
prod.hifi_num_reads / well.hifi_num_reads * 100 | ||
if well.hifi_num_reads | ||
else None | ||
), | ||
) | ||
for prod in product_metrics | ||
], | ||
) | ||
return None | ||
|
||
def recent_completed_wells(self) -> List[PacBioRunWellMetrics]: | ||
""" | ||
Get recent not QC-ed completed wells from the mlwh database. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# Copyright (c) 2022, 2023 Genome Research Ltd. | ||
# Copyright (c) 2022, 2023, 2024 Genome Research Ltd. | ||
# | ||
# Authors: | ||
# Marina Gourtovaia <[email protected]> | ||
|
@@ -23,6 +23,7 @@ | |
from pydantic import BaseModel, ConfigDict, Field | ||
|
||
from lang_qc.db.mlwh_schema import PacBioRunWellMetrics | ||
from lang_qc.util.type_checksum import PacBioProductSHA256 | ||
|
||
|
||
# Pydantic prohibits us from defining these as @classmethod or @staticmethod | ||
|
@@ -153,3 +154,31 @@ def from_orm(cls, obj: PacBioRunWellMetrics): | |
qc_data[name]["value"] = getattr(obj, name, None) | ||
|
||
return cls.model_validate(qc_data) | ||
|
||
|
||
class SampleDeplexingStats(BaseModel): | ||
""" | ||
A representation of metrics for one product, some direct from the DB and others inferred | ||
|
||
For a long time tag2_name was null and tag1_name was silently used at both ends of the sequence. | ||
As a result tag2_name will be None for most data in or before 2024. | ||
""" | ||
|
||
id_product: PacBioProductSHA256 | ||
tag1_name: str | None | ||
tag2_name: str | None | ||
mgcam marked this conversation as resolved.
Show resolved
Hide resolved
|
||
hifi_read_bases: int | None | ||
hifi_num_reads: int | None | ||
hifi_read_length_mean: float | None | ||
hifi_bases_percent: float | None | ||
percentage_total_reads: float | None | ||
|
||
|
||
class QCPoolMetrics(BaseModel): | ||
pool_coeff_of_variance: float | None = Field( | ||
title="Coefficient of variance for reads in the pool", | ||
description="Percentage of the standard deviation w.r.t. mean, when pool is more than one", | ||
) | ||
products: list[SampleDeplexingStats] = Field( | ||
title="List of products and their metrics" | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Other QC metrics in
lang_qc/models/pacbio/qc_data.py
have class methods to populate themselves. It might be reasonable to move this code to such a class method.