From 5c77509fa5be9f2c542a2c561d5eb420b3ccbb4b Mon Sep 17 00:00:00 2001 From: mgcam Date: Sat, 8 Jun 2024 23:53:34 +0100 Subject: [PATCH 1/5] Added a model for PB library LIMS data. Reimplemented the PacBioExperiment class, removed from_orm method, replaced it by a pre-init hook. --- lang_qc/models/pacbio/experiment.py | 160 +++++++++++++++++++++------- lang_qc/models/pacbio/well.py | 4 +- tests/test_pac_bio_experiment.py | 47 +++++--- 3 files changed, 155 insertions(+), 56 deletions(-) diff --git a/lang_qc/models/pacbio/experiment.py b/lang_qc/models/pacbio/experiment.py index 14eeb7c..76bf598 100644 --- a/lang_qc/models/pacbio/experiment.py +++ b/lang_qc/models/pacbio/experiment.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023 Genome Research Ltd. +# Copyright (c) 2023, 2024 Genome Research Ltd. # # Authors: # Marina Gourtovaia @@ -19,14 +19,96 @@ # You should have received a copy of the GNU General Public License along with # this program. If not, see . -from typing import List +from typing import Any -from pydantic import BaseModel, ConfigDict, Field +from pydantic import Field, model_validator +from pydantic.dataclasses import dataclass from lang_qc.db.mlwh_schema import PacBioRun -class PacBioExperiment(BaseModel): +@dataclass(kw_only=True, frozen=True) +class PacBioLibrary: + """ + This model represents LIMS data associated with a PacBio library. + + The fields of the model can be assigned directly via the constructor. + However, if the `db_library` field, a single row of the PacBioRun table + class, is set via the constructor, the rest of the fields are populated + using this database row object, while any other information passed to the + constructor is disregarded. + + The `db_library` field is not present in the model instance that is + returned by the constructor. + """ + + db_library: PacBioRun = Field(init_var=True) + + study_id: str = Field( + title="LIMS-specific study identifier", + ) + study_name: str = Field( + title="Study name", + ) + sample_id: str = Field( + title="LIMS-specific Sample identifier", + ) + sample_name: str = Field( + title="Sample name", + ) + tag_sequence: list = Field( + title="Tag sequence", + description=""" + Tag sequences as a list. An empty list for a non-indexed library. + """, + ) + library_type: str | None = Field( + default=None, + title="Library type", + ) + pool_name: str | None = Field( + default=None, + title="Pool name", + description=""" + The pac_bio_library_tube_barcode from TRACTION, AKA pool name + """, + ) + + @model_validator(mode="before") + def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: + """ + Populates the fields of this object with information available + in the LIMS system. Errors if the `db_library` attribute is not + set via the constructor. + """ + + # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi + if "db_library" not in values.kwargs: + return values.kwargs + db_row: PacBioRun = values.kwargs["db_library"] + if db_row is None: + raise ValueError("None db_library value is not allowed.") + + assigned = dict() + study = db_row.study + assigned["study_name"] = study.name + assigned["study_id"] = study.id_study_lims + sample = db_row.sample + assigned["sample_name"] = sample.name + assigned["sample_id"] = sample.id_sample_lims + assigned["library_type"] = db_row.pipeline_id_lims + assigned["pool_name"] = db_row.pac_bio_library_tube_barcode + assigned["tag_sequence"] = [] + if tag := db_row.tag_sequence: + assigned["tag_sequence"].append(tag) + if tag := db_row.tag2_sequence: + assigned["tag_sequence"].append(tag) + + return assigned + + +@dataclass(kw_only=True, frozen=True) +class PacBioExperiment: """ A response model that contains laboratory tracking information about the PacBio wells and samples prior to the start of the @@ -43,6 +125,8 @@ class PacBioExperiment(BaseModel): (library). """ + db_libraries: list[PacBioRun] = Field(init_var=True) + study_id: list = Field( title="Study identifier", description=""" @@ -50,21 +134,21 @@ class PacBioExperiment(BaseModel): an unlikely case of multiple studies). """, ) - study_name: str = Field( + study_name: str | None = Field( default=None, title="Study name", description=""" Study name, is not set in case of multiple studies. """, ) - sample_id: str = Field( + sample_id: str | None = Field( default=None, title="Sample identifier", description=""" Sample identifier, is not set in case of multiple samples. """, ) - sample_name: str = Field( + sample_name: str | None = Field( default=None, title="Sample name", description=""" @@ -94,59 +178,57 @@ class PacBioExperiment(BaseModel): unlikely case of multiple library types. """, ) - pool_name: str = Field( + pool_name: str | None = Field( default=None, title="Pool name", description=""" The pac_bio_library_tube_barcode from TRACTION, AKA pool name """, ) - model_config = ConfigDict(from_attributes=True, extra="forbid") - @classmethod - def from_orm(cls, lims_db_rows: List[PacBioRun]): + @model_validator(mode="before") + def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: """ - A factory method, creates an instance of the PacBioLimsData class. - Should be given a non-empty list of PacBioRun table row objects as - an argument. + Populates the fields of this object with information available + in the LIMS system. + Errors if the `db_libraries` attribute is not set via the constructor. """ + lims_db_rows: list[PacBioRun] = values.kwargs["db_libraries"] num_samples = len(lims_db_rows) if num_samples == 0: - raise Exception("Cannot create PacBioLimsData object, no data.") - if any(row is None for row in lims_db_rows): - raise Exception("Cannot create PacBioLimsData object, None row.") + raise ValueError("Empty db_libraries list is not allowed.") + + lib_objects = [PacBioLibrary(db_library=row) for row in lims_db_rows] - # Using sets for some data instead of lists because we do not - # want repetitions. lims_data = { "num_samples": num_samples, - "study_id": set(), - "library_type": set(), "tag_sequence": [], } - study_name = None - for row in lims_db_rows: - lims_data["study_id"].add(row.study.id_study_lims) - lims_data["library_type"].add(row.pipeline_id_lims) - study_name = row.study.name - if pool_name := row.pac_bio_library_tube_barcode: - lims_data["pool_name"] = pool_name - if num_samples == 1: - if tag := row.tag_sequence: - lims_data["tag_sequence"].append(tag) - if tag := row.tag2_sequence: - lims_data["tag_sequence"].append(tag) - lims_data["sample_id"] = row.sample.id_sample_lims - lims_data["sample_name"] = row.sample.name - lims_data["study_name"] = row.study.name + lims_data["study_id"] = {o.study_id for o in lib_objects} # returns a set + lims_data["library_type"] = { + o.library_type if o.library_type is not None else "UNKNOWN" + for o in lib_objects + } + + pool_names = {o.pool_name for o in lib_objects} + if len(pool_names) > 1: + raise ValueError("Multiple pool names.") + lims_data["pool_name"] = pool_names.pop() + + o = lib_objects[0] + if num_samples == 1: + lims_data["tag_sequence"] = o.tag_sequence + lims_data["sample_id"] = o.sample_id + lims_data["sample_name"] = o.sample_name + lims_data["study_name"] = o.study_name if len(lims_data["study_id"]) == 1: - lims_data["study_name"] = study_name + lims_data["study_name"] = o.study_name - # Convert sets back to lists and sort so that the list items are + # Convert sets back to lists and sort so that the items are # in a predictable order. for key in ("library_type", "study_id"): lims_data[key] = sorted(lims_data[key]) - return cls.model_validate(lims_data) + return lims_data diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index d2047a5..d0d34cb 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -223,6 +223,8 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: assigned["metrics"] = QCDataWell.from_orm(mlwh_db_row) experiment_info = mlwh_db_row.get_experiment_info() if len(experiment_info): - assigned["experiment_tracking"] = PacBioExperiment.from_orm(experiment_info) + assigned["experiment_tracking"] = PacBioExperiment( + db_libraries=experiment_info + ) return assigned diff --git a/tests/test_pac_bio_experiment.py b/tests/test_pac_bio_experiment.py index ff97954..775693c 100644 --- a/tests/test_pac_bio_experiment.py +++ b/tests/test_pac_bio_experiment.py @@ -2,7 +2,19 @@ from sqlalchemy import select from lang_qc.db.mlwh_schema import PacBioRun -from lang_qc.models.pacbio.experiment import PacBioExperiment +from lang_qc.models.pacbio.experiment import PacBioExperiment, PacBioLibrary + + +def test_creating_library_object(mlwhdb_test_session, mlwhdb_load_runs): + + l = PacBioLibrary( + study_id="1", + sample_id="1", + study_name="st_name", + sample_name="sa_name", + tag_sequence=[], + ) + assert l.study_id == "1" def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): @@ -17,7 +29,13 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): ) well_row = mlwhdb_test_session.execute(query).scalars().one() - lims = PacBioExperiment.from_orm([well_row]) + with pytest.raises(Exception, match=r"Empty db_libraries list is not allowed."): + PacBioExperiment(db_libraries=[]) + + with pytest.raises(ValueError, match=r"None db_library value is not allowed."): + PacBioExperiment(db_libraries=[well_row, None]) + + lims = PacBioExperiment(db_libraries=[well_row]) assert lims.num_samples == 1 assert lims.study_id == ["6457"] assert lims.study_name == "Tree of Life - ASG" @@ -34,7 +52,7 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): ) well_row = mlwhdb_test_session.execute(query).scalars().one() - lims = PacBioExperiment.from_orm([well_row]) + lims = PacBioExperiment(db_libraries=[well_row]) assert lims.num_samples == 1 assert lims.study_id == ["5901"] assert lims.study_name == "DTOL_Darwin Tree of Life" @@ -51,7 +69,7 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): ) well_rows = mlwhdb_test_session.execute(query).scalars().all() - lims = PacBioExperiment.from_orm(well_rows) + lims = PacBioExperiment(db_libraries=well_rows) assert lims.num_samples == 40 assert lims.study_id == ["7069"] assert lims.study_name == "Alternative Enzymes 2022 microbial genomes" @@ -68,7 +86,7 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): ) well_rows = mlwhdb_test_session.execute(query).scalars().all() - lims = PacBioExperiment.from_orm(well_rows) + lims = PacBioExperiment(db_libraries=well_rows) assert lims.num_samples == 3 assert lims.study_id == ["5901", "6457"] assert lims.study_name is None @@ -85,7 +103,14 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): ) well_rows = mlwhdb_test_session.execute(query).scalars().all() - lims = PacBioExperiment.from_orm(well_rows) + with pytest.raises(ValueError, match=r"Multiple pool names."): + PacBioExperiment(db_libraries=well_rows) + + for row in well_rows: + row.pac_bio_library_tube_barcode = "AXCTYW" + mlwhdb_test_session.commit() + + lims = PacBioExperiment(db_libraries=well_rows) assert lims.num_samples == 42 assert lims.study_id == ["6457", "7069"] assert lims.study_name is None @@ -93,13 +118,3 @@ def test_creating_experiment_object(mlwhdb_test_session, mlwhdb_load_runs): assert lims.sample_name is None assert lims.library_type == ["PacBio_Ultra_Low_Input", "Pacbio_HiFi_mplx"] assert lims.tag_sequence == [] - - with pytest.raises( - Exception, match=r"Cannot create PacBioLimsData object, no data" - ): - PacBioExperiment.from_orm([]) - - with pytest.raises( - Exception, match=r"Cannot create PacBioLimsData object, None row" - ): - PacBioExperiment.from_orm([well_row, None]) From fbff9a74dac11d80972c88ef4b991b60b3d824a3 Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 11 Jun 2024 12:08:13 +0100 Subject: [PATCH 2/5] Created an extendable declarative base class ... for mlwh ORM classes so that common methods can be implemented. Customised __repr__ method for one of db classes. --- lang_qc/db/mlwh_schema.py | 36 ++++++++++++++++++++++++++++++++--- tests/test_mlwh_db_classes.py | 24 +++++++++++++++++++++++ 2 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 tests/test_mlwh_db_classes.py diff --git a/lang_qc/db/mlwh_schema.py b/lang_qc/db/mlwh_schema.py index 395916f..c0796b1 100644 --- a/lang_qc/db/mlwh_schema.py +++ b/lang_qc/db/mlwh_schema.py @@ -25,9 +25,30 @@ from sqlalchemy.dialects.mysql import SMALLINT as mysqlSMALLINT from sqlalchemy.dialects.mysql import TINYINT as mysqlTINYINT from sqlalchemy.dialects.mysql import VARCHAR as mysqlVARCHAR -from sqlalchemy.orm import declarative_base, relationship +from sqlalchemy.orm import DeclarativeBase, relationship -Base = declarative_base() + +class Base(DeclarativeBase): + """ + A base class for declarative class definitions for the ml warehouse database. + """ + + def get_row_description(self, fields: list[str]) -> str: + """ + Returns a printable representation of the database table row. Interprets + a list of strings given as the `fields` argument as a list of column + names. Combines the name of the class, names of the given columns + and respective values into a row description. The columns for which + the row has a NULL value are omitted from the description. + """ + + pairs = [] + for name in fields: + value = self.__getattribute__(name) + if value is not None: + pairs.append(f"{name}={value}") + description = ", ".join(pairs) + return f"{self.__module__}.{self.__class__.__name__}: {description}" class Sample(Base): @@ -538,7 +559,16 @@ class PacBioRunWellMetrics(Base): "PacBioProductMetrics", back_populates="pac_bio_run_well_metrics" ) - def get_experiment_info(self): + """Custom or customised methods are added below""" + + def __repr__(self): + """Returns a printable representation of the database row""" + + return self.get_row_description( + ["pac_bio_run_name", "well_label", "plate_number", "id_pac_bio_product"] + ) + + def get_experiment_info(self) -> list[PacBioRun]: """Returns a list of PacBioRun mlwh database rows. Returns LIMS information about the PacBio experiment diff --git a/tests/test_mlwh_db_classes.py b/tests/test_mlwh_db_classes.py new file mode 100644 index 0000000..be0b89a --- /dev/null +++ b/tests/test_mlwh_db_classes.py @@ -0,0 +1,24 @@ +from sqlalchemy import select + +from lang_qc.db.mlwh_schema import PacBioRunWellMetrics + +"""Tests for custom and customised ORM methods""" + + +def test_pac_bio_well_metrics_repr(mlwhdb_test_session, mlwhdb_load_runs): + id1 = "cf18bd66e0f0895ea728c1d08103c62d3de8a57a5f879cee45f7b0acc028aa61" + id2 = "513c674f489b106c6af716dd0d210826ff03b7648d50888839c3722ca1b10dbf" + data = { + id1: f"pac_bio_run_name=TRACTION-RUN-92, well_label=A1, id_pac_bio_product={id1}", + id2: f"pac_bio_run_name=TRACTION-RUN-1140, well_label=A1, plate_number=2, id_pac_bio_product={id2}", + } + + for id in data.keys(): + query = select(PacBioRunWellMetrics).where( + PacBioRunWellMetrics.id_pac_bio_product == id + ) + db_row = mlwhdb_test_session.execute(query).scalar_one() + assert ( + db_row.__repr__() + == "lang_qc.db.mlwh_schema.PacBioRunWellMetrics: " + data[id] + ) From ed54cabcfbe7c70b4facaf1550397e9d73acf920 Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 11 Jun 2024 13:17:13 +0100 Subject: [PATCH 3/5] Added a model representing libraries in a well. --- lang_qc/models/pacbio/well.py | 40 ++++++++++++++++++++-- tests/test_pb_well_models.py | 62 +++++++++++++++++++++++++++++++++-- 2 files changed, 96 insertions(+), 6 deletions(-) diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index d0d34cb..e5bf0d8 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -27,7 +27,7 @@ from pydantic.dataclasses import dataclass from lang_qc.db.mlwh_schema import PacBioRunWellMetrics -from lang_qc.models.pacbio.experiment import PacBioExperiment +from lang_qc.models.pacbio.experiment import PacBioExperiment, PacBioLibrary from lang_qc.models.pacbio.qc_data import QCDataWell from lang_qc.models.pager import PagedResponse from lang_qc.models.qc_state import QcState @@ -132,9 +132,10 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: """ # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi - mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] - assert mlwh_db_row + if "db_well" not in values.kwargs: + raise ValueError("None db_well value is not allowed.") + mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] column_names = [column.key for column in PacBioRunWellMetrics.__table__.columns] assigned = dict() @@ -175,6 +176,39 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: return assigned +@dataclass(kw_only=True, frozen=True) +class PacBioWellLibraries(PacBioWell): + """A response model binding together basic PacBio well and LIMS data for + the libraries, which were sequenced in this well. + """ + + libraries: list[PacBioLibrary] = Field( + title="A list of `PacBioLibrary` objects", + description=""" + A list of `PacBioLibrary` objects. Each member of the list represents + a library, which was sequenced in this well. If the object is created + by supplying the `db_well` attribute via the constructor, the list + is never empty. The list is not sorted. + """, + ) + + @model_validator(mode="before") + def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: + + assigned = super().pre_root(values) + mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] + lims_data = mlwh_db_row.get_experiment_info() + if len(lims_data) == 0: + raise ValueError( + f"No LIMS data retrieved for {mlwh_db_row.__repr__()} " + "on account of partially linked or unlinked product data." + ) + + assigned["libraries"] = [PacBioLibrary(db_library=row) for row in lims_data] + + return assigned + + class PacBioPagedWells(PagedResponse, extra="forbid"): """A response model for paged data about PacBio wells.""" diff --git a/tests/test_pb_well_models.py b/tests/test_pb_well_models.py index 12d64a4..ed80b75 100644 --- a/tests/test_pb_well_models.py +++ b/tests/test_pb_well_models.py @@ -1,10 +1,16 @@ +import pytest from npg_id_generation.pac_bio import PacBioEntity from sqlalchemy.orm import Session from lang_qc.db.helper.qc import get_qc_states_by_id_product_list from lang_qc.db.helper.wells import WellWh from lang_qc.db.mlwh_schema import PacBioRunWellMetrics -from lang_qc.models.pacbio.well import PacBioWellFull, PacBioWellSummary +from lang_qc.models.pacbio.experiment import PacBioLibrary +from lang_qc.models.pacbio.well import ( + PacBioWellFull, + PacBioWellLibraries, + PacBioWellSummary, +) from tests.conftest import compare_dates from tests.fixtures.well_data import load_data4well_retrieval, load_dicts_and_users @@ -116,9 +122,13 @@ def test_create_full_model( assert pb_well.experiment_tracking is None -def test_create_summary_model( +def test_create_summary_and_library_models( mlwhdb_test_session, qcdb_test_session, load_data4well_retrieval, mlwhdb_load_runs ): + + with pytest.raises(ValueError, match=r"None db_well value is not allowed."): + PacBioWellSummary(plate_number=3) + (well_row, qc_state) = _prepare_data( mlwhdb_test_session, qcdb_test_session, "TRACTION-RUN-92", "A1" ) @@ -126,6 +136,9 @@ def test_create_summary_model( _examine_well_model_a1(pb_well, well_row.id_pac_bio_product) assert pb_well.study_names == ["Tree of Life - ASG"] + pb_well = PacBioWellLibraries(db_well=well_row) + _examine_well_model_a1(pb_well, well_row.id_pac_bio_product) + (well_row, qc_state) = _prepare_data( mlwhdb_test_session, qcdb_test_session, "TRACTION_RUN_1", "B1" ) @@ -140,7 +153,7 @@ def test_create_summary_model( _examine_well_model_c1(pb_well, well_row.id_pac_bio_product) -def test_create_summary_model_study_info( +def test_create_summary_and_library_models_lims_info( mlwhdb_test_session, qcdb_test_session, load_data4well_retrieval, mlwhdb_load_runs ): # Well with two samples, none is linked to LIMS @@ -150,6 +163,9 @@ def test_create_summary_model_study_info( pb_well = PacBioWellSummary(db_well=well_row) assert pb_well.study_names == [] + with pytest.raises(ValueError, match=r"No LIMS data retrieved"): + PacBioWellLibraries(db_well=well_row) + # Fully linked wells with one sample (well_row, qc_state) = _prepare_data( mlwhdb_test_session, qcdb_test_session, "TRACTION-RUN-1162", "C1" @@ -163,6 +179,19 @@ def test_create_summary_model_study_info( pb_well = PacBioWellSummary(db_well=well_row) assert pb_well.study_names == ["DTOL_Darwin Tree of Life"] + pb_well = PacBioWellLibraries(db_well=well_row) + assert len(pb_well.libraries) == 1 + expected_lib = PacBioLibrary( + study_id="5901", + study_name="DTOL_Darwin Tree of Life", + sample_id="9463663", + sample_name="DTOL14290946", + tag_sequence=["CTCAGCATACGAGTAT"], + library_type="Pacbio_HiFi", + pool_name="TRAC-2-7128", + ) + assert pb_well.libraries[0] == expected_lib + # A fully linked well with multiple samples, all belonging to the same study (well_row, qc_state) = _prepare_data( mlwhdb_test_session, qcdb_test_session, "TRACTION-RUN-1140", "B1", 1 @@ -180,6 +209,30 @@ def test_create_summary_model_study_info( "ToL_Blaxter_ Reference Genomes_ DNA", ] + pb_well = PacBioWellLibraries(db_well=well_row) + assert len(pb_well.libraries) == 4 + libs = {lib.sample_id: lib for lib in pb_well.libraries} + expected_lib = PacBioLibrary( + study_id="6771", + study_name="ToL_Blaxter_ Reference Genomes_ DNA", + sample_id="8657549", + sample_name="6771STDY13618009", + tag_sequence=["CTGCGATCACGAGTAT"], + library_type="Pacbio_HiFi", + pool_name="TRAC-2-7676", + ) + assert libs["8657549"] == expected_lib + expected_lib = PacBioLibrary( + study_id="5901", + study_name="DTOL_Darwin Tree of Life", + sample_id="9463590", + sample_name="DTOL14291044", + tag_sequence=["TCTGCATCATGAGTAT"], + library_type="Pacbio_HiFi", + pool_name="TRAC-2-7676", + ) + assert libs["9463590"] == expected_lib + # A partially linked well with three samples, which belong to two studies. # The LIMS link for one of the samples is deleted so that two other samples # belong to the same study. @@ -188,3 +241,6 @@ def test_create_summary_model_study_info( ) pb_well = PacBioWellSummary(db_well=well_row) assert pb_well.study_names == [] + + with pytest.raises(ValueError, match=r"No LIMS data retrieved"): + PacBioWellLibraries(db_well=well_row) From 00df2edb15ea38c985b6c759677cf700374ef702 Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 11 Jun 2024 18:59:28 +0100 Subject: [PATCH 4/5] Added an end point for well library data. --- lang_qc/endpoints/pacbio_well.py | 33 +++++++++++- lang_qc/models/pacbio/well.py | 3 +- lang_qc/util/errors.py | 7 +++ tests/endpoints/test_well_libraries.py | 70 ++++++++++++++++++++++++++ tests/test_pb_well_models.py | 5 +- 5 files changed, 114 insertions(+), 4 deletions(-) create mode 100644 tests/endpoints/test_well_libraries.py diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py index f9d4957..d2a5a61 100644 --- a/lang_qc/endpoints/pacbio_well.py +++ b/lang_qc/endpoints/pacbio_well.py @@ -37,13 +37,18 @@ from lang_qc.db.mlwh_connection import get_mlwh_db from lang_qc.db.qc_connection import get_qc_db from lang_qc.db.qc_schema import User -from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellFull +from lang_qc.models.pacbio.well import ( + PacBioPagedWells, + PacBioWellFull, + PacBioWellLibraries, +) from lang_qc.models.qc_flow_status import QcFlowStatusEnum from lang_qc.models.qc_state import QcState, QcStateBasic from lang_qc.util.auth import check_user from lang_qc.util.errors import ( InconsistentInputError, InvalidDictValueError, + MissingLimsDataError, RunNotFoundError, ) from lang_qc.util.type_checksum import ChecksumSHA256 @@ -163,6 +168,32 @@ def get_wells_in_run( return response +@router.get( + "/wells/{id_product}/libraries", + summary="Get well summary and LIMS data for all libraries", + responses={ + status.HTTP_404_NOT_FOUND: {"description": "Well product does not exist"}, + status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"}, + status.HTTP_409_CONFLICT: {"description": "Missing or incomplete LIMS data"}, + }, + response_model=PacBioWellLibraries, +) +def get_well_lims_info( + id_product: ChecksumSHA256, + mlwhdb_session: Session = Depends(get_mlwh_db), +) -> PacBioWellLibraries: + + db_well = _find_well_product_or_error(id_product, mlwhdb_session) + well_libraries: PacBioWellLibraries + try: + well_libraries = PacBioWellLibraries(db_well=db_well) + except MissingLimsDataError as err: + # 409 - Request conflicts with the current state of the server. + raise HTTPException(409, detail=str(err)) + + return well_libraries + + @router.get( "/products/{id_product}/seq_level", summary="Get full sequencing QC metrics and state for a product", diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index e5bf0d8..e809805 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -31,6 +31,7 @@ from lang_qc.models.pacbio.qc_data import QCDataWell from lang_qc.models.pager import PagedResponse from lang_qc.models.qc_state import QcState +from lang_qc.util.errors import MissingLimsDataError def get_field_names(cls): @@ -199,7 +200,7 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: mlwh_db_row: PacBioRunWellMetrics = values.kwargs["db_well"] lims_data = mlwh_db_row.get_experiment_info() if len(lims_data) == 0: - raise ValueError( + raise MissingLimsDataError( f"No LIMS data retrieved for {mlwh_db_row.__repr__()} " "on account of partially linked or unlinked product data." ) diff --git a/lang_qc/util/errors.py b/lang_qc/util/errors.py index 21dab57..aeac0ef 100644 --- a/lang_qc/util/errors.py +++ b/lang_qc/util/errors.py @@ -25,3 +25,10 @@ class EmptyListOfRunNamesError(Exception): class RunNotFoundError(Exception): """Exception to be used when no well metrics data for a run is found.""" + + +class MissingLimsDataError(Exception): + """ + Exception to be used when product LIMS data is not available + or partially missing. + """ diff --git a/tests/endpoints/test_well_libraries.py b/tests/endpoints/test_well_libraries.py new file mode 100644 index 0000000..d35e476 --- /dev/null +++ b/tests/endpoints/test_well_libraries.py @@ -0,0 +1,70 @@ +from fastapi.testclient import TestClient +from sqlalchemy import select + +# from lang_qc.db.mlwh_schema import PacBioRunWellMetrics + + +def test_well_libraries(test_client: TestClient, mlwhdb_load_runs): + """Test retrieval of LIMS library data for a well.""" + + response = test_client.get(f"/pacbio/wells/malformed/libraries") + assert response.status_code == 422 + + id_product = "aaa8bd66e0f0895ea728c1d08103c62d3de8a57a5f879cee45f7b0acc028aa61" + response = test_client.get(f"/pacbio/wells/{id_product}/libraries") + assert response.status_code == 404 + + # Partially linked well + id_product = "26928ba6ec2a00c04dd6c7c68008ec9436e3979a384b9f708dc371c99f272e17" + response = test_client.get(f"/pacbio/wells/{id_product}/libraries") + assert response.status_code == 409 + assert response.json()["detail"] == "".join( + [ + "No LIMS data retrieved for lang_qc.db.mlwh_schema.PacBioRunWellMetrics:", + " pac_bio_run_name=TRACTION-RUN-1140, well_label=C1, plate_number=2,", + " id_pac_bio_product=26928ba6ec2a00c04dd6c7c68008ec9436e3979a384b9f708dc371c99f272e17", + " on account of partially linked or unlinked product data.", + ] + ) + + # Fully linked well + id_product = "513c674f489b106c6af716dd0d210826ff03b7648d50888839c3722ca1b10dbf" + response = test_client.get(f"/pacbio/wells/{id_product}/libraries") + assert response.status_code == 200 + expected_response = { + "id_product": "513c674f489b106c6af716dd0d210826ff03b7648d50888839c3722ca1b10dbf", + "label": "A1", + "plate_number": 2, + "run_name": "TRACTION-RUN-1140", + "run_start_time": "2024-02-23T10:28:12", + "run_complete_time": "2024-02-25T20:53:05", + "well_start_time": "2024-02-24T14:25:12", + "well_complete_time": "2024-02-26T00:27:52", + "run_status": "Complete", + "well_status": "Complete", + "instrument_name": "84093", + "instrument_type": "Revio", + "qc_state": None, + "libraries": [ + { + "study_id": "5901", + "study_name": "DTOL_Darwin Tree of Life", + "sample_id": "9478726", + "sample_name": "DTOL14523243", + "tag_sequence": ["ATCTGCACGTGAGTAT"], + "library_type": "Pacbio_HiFi", + "pool_name": "TRAC-2-7677", + }, + { + "study_id": "5901", + "study_name": "DTOL_Darwin Tree of Life", + "sample_id": "9518398", + "sample_name": "DTOL14180244", + "tag_sequence": ["ATGTACTAGTGAGTAT"], + "library_type": "Pacbio_HiFi", + "pool_name": "TRAC-2-7677", + }, + ], + } + + assert response.json() == expected_response diff --git a/tests/test_pb_well_models.py b/tests/test_pb_well_models.py index ed80b75..ce5560d 100644 --- a/tests/test_pb_well_models.py +++ b/tests/test_pb_well_models.py @@ -11,6 +11,7 @@ PacBioWellLibraries, PacBioWellSummary, ) +from lang_qc.util.errors import MissingLimsDataError from tests.conftest import compare_dates from tests.fixtures.well_data import load_data4well_retrieval, load_dicts_and_users @@ -163,7 +164,7 @@ def test_create_summary_and_library_models_lims_info( pb_well = PacBioWellSummary(db_well=well_row) assert pb_well.study_names == [] - with pytest.raises(ValueError, match=r"No LIMS data retrieved"): + with pytest.raises(MissingLimsDataError, match=r"No LIMS data retrieved"): PacBioWellLibraries(db_well=well_row) # Fully linked wells with one sample @@ -242,5 +243,5 @@ def test_create_summary_and_library_models_lims_info( pb_well = PacBioWellSummary(db_well=well_row) assert pb_well.study_names == [] - with pytest.raises(ValueError, match=r"No LIMS data retrieved"): + with pytest.raises(MissingLimsDataError, match=r"No LIMS data retrieved"): PacBioWellLibraries(db_well=well_row) From fd3e9ef00b758a50ac6a7ab39463fa88f752de2b Mon Sep 17 00:00:00 2001 From: mgcam Date: Wed, 12 Jun 2024 15:01:54 +0100 Subject: [PATCH 5/5] Dropped direct calls to __repr__() Also made the helper function of the parent class 'private'. --- lang_qc/db/mlwh_schema.py | 4 ++-- lang_qc/models/pacbio/well.py | 2 +- tests/test_mlwh_db_classes.py | 7 +++---- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/lang_qc/db/mlwh_schema.py b/lang_qc/db/mlwh_schema.py index c0796b1..c1cbff0 100644 --- a/lang_qc/db/mlwh_schema.py +++ b/lang_qc/db/mlwh_schema.py @@ -33,7 +33,7 @@ class Base(DeclarativeBase): A base class for declarative class definitions for the ml warehouse database. """ - def get_row_description(self, fields: list[str]) -> str: + def _get_row_description(self, fields: list[str]) -> str: """ Returns a printable representation of the database table row. Interprets a list of strings given as the `fields` argument as a list of column @@ -564,7 +564,7 @@ class PacBioRunWellMetrics(Base): def __repr__(self): """Returns a printable representation of the database row""" - return self.get_row_description( + return self._get_row_description( ["pac_bio_run_name", "well_label", "plate_number", "id_pac_bio_product"] ) diff --git a/lang_qc/models/pacbio/well.py b/lang_qc/models/pacbio/well.py index e809805..00926da 100644 --- a/lang_qc/models/pacbio/well.py +++ b/lang_qc/models/pacbio/well.py @@ -201,7 +201,7 @@ def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: lims_data = mlwh_db_row.get_experiment_info() if len(lims_data) == 0: raise MissingLimsDataError( - f"No LIMS data retrieved for {mlwh_db_row.__repr__()} " + f"No LIMS data retrieved for {str(mlwh_db_row)} " "on account of partially linked or unlinked product data." ) diff --git a/tests/test_mlwh_db_classes.py b/tests/test_mlwh_db_classes.py index be0b89a..5b832fe 100644 --- a/tests/test_mlwh_db_classes.py +++ b/tests/test_mlwh_db_classes.py @@ -18,7 +18,6 @@ def test_pac_bio_well_metrics_repr(mlwhdb_test_session, mlwhdb_load_runs): PacBioRunWellMetrics.id_pac_bio_product == id ) db_row = mlwhdb_test_session.execute(query).scalar_one() - assert ( - db_row.__repr__() - == "lang_qc.db.mlwh_schema.PacBioRunWellMetrics: " + data[id] - ) + expected_string = "lang_qc.db.mlwh_schema.PacBioRunWellMetrics: " + data[id] + assert db_row.__repr__() == expected_string + assert str(db_row) == expected_string