Merge pull request #229 from mgcam/well_libs

Well libs
wtsi-npg · Jun 12, 2024 · 0b225e7 · 0b225e7
2 parents 9d68ec0 + fd3e9ef
commit 0b225e7
Show file tree

Hide file tree

Showing 9 changed files with 418 additions and 66 deletions.
diff --git a/lang_qc/db/mlwh_schema.py b/lang_qc/db/mlwh_schema.py
@@ -25,9 +25,30 @@
 from sqlalchemy.dialects.mysql import SMALLINT as mysqlSMALLINT
 from sqlalchemy.dialects.mysql import TINYINT as mysqlTINYINT
 from sqlalchemy.dialects.mysql import VARCHAR as mysqlVARCHAR
-from sqlalchemy.orm import declarative_base, relationship
+from sqlalchemy.orm import DeclarativeBase, relationship
 
-Base = declarative_base()
+
+class Base(DeclarativeBase):
+    """
+    A base class for declarative class definitions for the ml warehouse database.
+    """
+
+    def _get_row_description(self, fields: list[str]) -> str:
+        """
+        Returns a printable representation of the database table row. Interprets
+        a list of strings given as the `fields` argument as a list of column
+        names. Combines the name of the class, names of the given columns
+        and respective values into a row description. The columns for which
+        the row has a NULL value are omitted from the description.
+        """
+
+        pairs = []
+        for name in fields:
+            value = self.__getattribute__(name)
+            if value is not None:
+                pairs.append(f"{name}={value}")
+        description = ", ".join(pairs)
+        return f"{self.__module__}.{self.__class__.__name__}: {description}"
 
 
 class Sample(Base):
@@ -538,7 +559,16 @@ class PacBioRunWellMetrics(Base):
         "PacBioProductMetrics", back_populates="pac_bio_run_well_metrics"
     )
 
-    def get_experiment_info(self):
+    """Custom or customised methods are added below"""
+
+    def __repr__(self):
+        """Returns a printable representation of the database row"""
+
+        return self._get_row_description(
+            ["pac_bio_run_name", "well_label", "plate_number", "id_pac_bio_product"]
+        )
+
+    def get_experiment_info(self) -> list[PacBioRun]:
         """Returns a list of PacBioRun mlwh database rows.
 
         Returns LIMS information about the PacBio experiment

diff --git a/lang_qc/endpoints/pacbio_well.py b/lang_qc/endpoints/pacbio_well.py
@@ -37,13 +37,18 @@
 from lang_qc.db.mlwh_connection import get_mlwh_db
 from lang_qc.db.qc_connection import get_qc_db
 from lang_qc.db.qc_schema import User
-from lang_qc.models.pacbio.well import PacBioPagedWells, PacBioWellFull
+from lang_qc.models.pacbio.well import (
+    PacBioPagedWells,
+    PacBioWellFull,
+    PacBioWellLibraries,
+)
 from lang_qc.models.qc_flow_status import QcFlowStatusEnum
 from lang_qc.models.qc_state import QcState, QcStateBasic
 from lang_qc.util.auth import check_user
 from lang_qc.util.errors import (
     InconsistentInputError,
     InvalidDictValueError,
+    MissingLimsDataError,
     RunNotFoundError,
 )
 from lang_qc.util.type_checksum import ChecksumSHA256
@@ -163,6 +168,32 @@ def get_wells_in_run(
     return response
 
 
+@router.get(
+    "/wells/{id_product}/libraries",
+    summary="Get well summary and LIMS data for all libraries",
+    responses={
+        status.HTTP_404_NOT_FOUND: {"description": "Well product does not exist"},
+        status.HTTP_422_UNPROCESSABLE_ENTITY: {"description": "Invalid product ID"},
+        status.HTTP_409_CONFLICT: {"description": "Missing or incomplete LIMS data"},
+    },
+    response_model=PacBioWellLibraries,
+)
+def get_well_lims_info(
+    id_product: ChecksumSHA256,
+    mlwhdb_session: Session = Depends(get_mlwh_db),
+) -> PacBioWellLibraries:
+
+    db_well = _find_well_product_or_error(id_product, mlwhdb_session)
+    well_libraries: PacBioWellLibraries
+    try:
+        well_libraries = PacBioWellLibraries(db_well=db_well)
+    except MissingLimsDataError as err:
+        # 409 - Request conflicts with the current state of the server.
+        raise HTTPException(409, detail=str(err))
+
+    return well_libraries
+
+
 @router.get(
     "/products/{id_product}/seq_level",
     summary="Get full sequencing QC metrics and state for a product",

diff --git a/lang_qc/models/pacbio/experiment.py b/lang_qc/models/pacbio/experiment.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 Genome Research Ltd.
+# Copyright (c) 2023, 2024 Genome Research Ltd.
 #
 # Authors:
 #   Marina Gourtovaia <[email protected]>
@@ -19,14 +19,96 @@
 # You should have received a copy of the GNU General Public License along with
 # this program. If not, see <http://www.gnu.org/licenses/>.
 
-from typing import List
+from typing import Any
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import Field, model_validator
+from pydantic.dataclasses import dataclass
 
 from lang_qc.db.mlwh_schema import PacBioRun
 
 
-class PacBioExperiment(BaseModel):
+@dataclass(kw_only=True, frozen=True)
+class PacBioLibrary:
+    """
+    This model represents LIMS data associated with a PacBio library.
+
+    The fields of the model can be assigned directly via the constructor.
+    However, if the `db_library` field, a single row of the PacBioRun table
+    class, is set via the constructor, the rest of the fields are populated
+    using this database row object, while any other information passed to the
+    constructor is disregarded.
+
+    The `db_library` field is not present in the model instance that is
+    returned by the constructor.
+    """
+
+    db_library: PacBioRun = Field(init_var=True)
+
+    study_id: str = Field(
+        title="LIMS-specific study identifier",
+    )
+    study_name: str = Field(
+        title="Study name",
+    )
+    sample_id: str = Field(
+        title="LIMS-specific Sample identifier",
+    )
+    sample_name: str = Field(
+        title="Sample name",
+    )
+    tag_sequence: list = Field(
+        title="Tag sequence",
+        description="""
+        Tag sequences as a list. An empty list for a non-indexed library.
+        """,
+    )
+    library_type: str | None = Field(
+        default=None,
+        title="Library type",
+    )
+    pool_name: str | None = Field(
+        default=None,
+        title="Pool name",
+        description="""
+        The pac_bio_library_tube_barcode from TRACTION, AKA pool name
+        """,
+    )
+
+    @model_validator(mode="before")
+    def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]:
+        """
+        Populates the fields of this object with information available
+        in the LIMS system if the `db_library` attribute is set via the
+        constructor. Errors if the `db_library` value is None.
+        """
+
+        # https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi
+        if "db_library" not in values.kwargs:
+            return values.kwargs
+        db_row: PacBioRun = values.kwargs["db_library"]
+        if db_row is None:
+            raise ValueError("None db_library value is not allowed.")
+
+        assigned = dict()
+        study = db_row.study
+        assigned["study_name"] = study.name
+        assigned["study_id"] = study.id_study_lims
+        sample = db_row.sample
+        assigned["sample_name"] = sample.name
+        assigned["sample_id"] = sample.id_sample_lims
+        assigned["library_type"] = db_row.pipeline_id_lims
+        assigned["pool_name"] = db_row.pac_bio_library_tube_barcode
+        assigned["tag_sequence"] = []
+        if tag := db_row.tag_sequence:
+            assigned["tag_sequence"].append(tag)
+            if tag := db_row.tag2_sequence:
+                assigned["tag_sequence"].append(tag)
+
+        return assigned
+
+
+@dataclass(kw_only=True, frozen=True)
+class PacBioExperiment:
     """
     A response model that contains laboratory tracking information
     about the PacBio wells and samples prior to the start of the
@@ -43,28 +125,30 @@ class PacBioExperiment(BaseModel):
     (library).
     """
 
+    db_libraries: list[PacBioRun] = Field(init_var=True)
+
     study_id: list = Field(
         title="Study identifier",
         description="""
         Study identifiers as a sorted list of unique strings (to cover
         an unlikely case of multiple studies).
         """,
     )
-    study_name: str = Field(
+    study_name: str | None = Field(
         default=None,
         title="Study name",
         description="""
         Study name, is not set in case of multiple studies.
         """,
     )
-    sample_id: str = Field(
+    sample_id: str | None = Field(
         default=None,
         title="Sample identifier",
         description="""
         Sample identifier, is not set in case of multiple samples.
         """,
     )
-    sample_name: str = Field(
+    sample_name: str | None = Field(
         default=None,
         title="Sample name",
         description="""
@@ -94,59 +178,57 @@ class PacBioExperiment(BaseModel):
         unlikely case of multiple library types.
         """,
     )
-    pool_name: str = Field(
+    pool_name: str | None = Field(
         default=None,
         title="Pool name",
         description="""
         The pac_bio_library_tube_barcode from TRACTION, AKA pool name
         """,
     )
-    model_config = ConfigDict(from_attributes=True, extra="forbid")
 
-    @classmethod
-    def from_orm(cls, lims_db_rows: List[PacBioRun]):
+    @model_validator(mode="before")
+    def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]:
         """
-        A factory method, creates an instance of the PacBioLimsData class.
-        Should be given a non-empty list of PacBioRun table row objects as
-        an argument.
+        Populates the fields of this object with information available
+        in the LIMS system.
+        Errors if the `db_libraries` attribute is not set via the constructor.
         """
 
+        lims_db_rows: list[PacBioRun] = values.kwargs["db_libraries"]
         num_samples = len(lims_db_rows)
         if num_samples == 0:
-            raise Exception("Cannot create PacBioLimsData object, no data.")
-        if any(row is None for row in lims_db_rows):
-            raise Exception("Cannot create PacBioLimsData object, None row.")
+            raise ValueError("Empty db_libraries list is not allowed.")
+
+        lib_objects = [PacBioLibrary(db_library=row) for row in lims_db_rows]
 
-        # Using sets for some data instead of lists because we do not
-        # want repetitions.
         lims_data = {
             "num_samples": num_samples,
-            "study_id": set(),
-            "library_type": set(),
             "tag_sequence": [],
         }
-        study_name = None
-        for row in lims_db_rows:
-            lims_data["study_id"].add(row.study.id_study_lims)
-            lims_data["library_type"].add(row.pipeline_id_lims)
-            study_name = row.study.name
-            if pool_name := row.pac_bio_library_tube_barcode:
-                lims_data["pool_name"] = pool_name
-            if num_samples == 1:
-                if tag := row.tag_sequence:
-                    lims_data["tag_sequence"].append(tag)
-                    if tag := row.tag2_sequence:
-                        lims_data["tag_sequence"].append(tag)
-                lims_data["sample_id"] = row.sample.id_sample_lims
-                lims_data["sample_name"] = row.sample.name
-                lims_data["study_name"] = row.study.name
 
+        lims_data["study_id"] = {o.study_id for o in lib_objects}  # returns a set
+        lims_data["library_type"] = {
+            o.library_type if o.library_type is not None else "UNKNOWN"
+            for o in lib_objects
+        }
+
+        pool_names = {o.pool_name for o in lib_objects}
+        if len(pool_names) > 1:
+            raise ValueError("Multiple pool names.")
+        lims_data["pool_name"] = pool_names.pop()
+
+        o = lib_objects[0]
+        if num_samples == 1:
+            lims_data["tag_sequence"] = o.tag_sequence
+            lims_data["sample_id"] = o.sample_id
+            lims_data["sample_name"] = o.sample_name
+            lims_data["study_name"] = o.study_name
         if len(lims_data["study_id"]) == 1:
-            lims_data["study_name"] = study_name
+            lims_data["study_name"] = o.study_name
 
-        # Convert sets back to lists and sort so that the list items are
+        # Convert sets back to lists and sort so that the items are
         # in a predictable order.
         for key in ("library_type", "study_id"):
             lims_data[key] = sorted(lims_data[key])
 
-        return cls.model_validate(lims_data)
+        return lims_data