-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #229 from mgcam/well_libs
Well libs
- Loading branch information
Showing
9 changed files
with
418 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# Copyright (c) 2023 Genome Research Ltd. | ||
# Copyright (c) 2023, 2024 Genome Research Ltd. | ||
# | ||
# Authors: | ||
# Marina Gourtovaia <[email protected]> | ||
|
@@ -19,14 +19,96 @@ | |
# You should have received a copy of the GNU General Public License along with | ||
# this program. If not, see <http://www.gnu.org/licenses/>. | ||
|
||
from typing import List | ||
from typing import Any | ||
|
||
from pydantic import BaseModel, ConfigDict, Field | ||
from pydantic import Field, model_validator | ||
from pydantic.dataclasses import dataclass | ||
|
||
from lang_qc.db.mlwh_schema import PacBioRun | ||
|
||
|
||
class PacBioExperiment(BaseModel): | ||
@dataclass(kw_only=True, frozen=True) | ||
class PacBioLibrary: | ||
""" | ||
This model represents LIMS data associated with a PacBio library. | ||
The fields of the model can be assigned directly via the constructor. | ||
However, if the `db_library` field (an instance of the PacBioRun table | ||
class, i.e. a single database row) is set via the constructor, the rest | ||
of the fields are populated using this database row object, while any | ||
other information passed to the constructor is disregarded. | ||
The `db_library` field is not present in the model instance that is | ||
returned by the constructor. | ||
""" | ||
|
||
db_library: PacBioRun = Field(init_var=True) | ||
|
||
study_id: str = Field( | ||
title="LIMS-specific study identifier", | ||
) | ||
study_name: str = Field( | ||
title="Study name", | ||
) | ||
sample_id: str = Field( | ||
title="LIMS-specific Sample identifier", | ||
) | ||
sample_name: str = Field( | ||
title="Sample name", | ||
) | ||
tag_sequence: list = Field( | ||
title="Tag sequence", | ||
description=""" | ||
Tag sequences as a list. An empty list for a non-indexed library. | ||
""", | ||
) | ||
library_type: str | None = Field( | ||
default=None, | ||
title="Library type", | ||
) | ||
pool_name: str | None = Field( | ||
default=None, | ||
title="Pool name", | ||
description=""" | ||
The pac_bio_library_tube_barcode from TRACTION, AKA pool name | ||
""", | ||
) | ||
|
||
@model_validator(mode="before") | ||
def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: | ||
""" | ||
Populates the fields of this object with information available | ||
in the LIMS system when the `db_library` attribute is set via the | ||
constructor; otherwise the given values are returned unchanged. | ||
Errors if `db_library` is set to None. | ||
""" | ||
|
||
# https://github.com/pydantic/pydantic-core/blob/main/python/pydantic_core/_pydantic_core.pyi | ||
if "db_library" not in values.kwargs: | ||
return values.kwargs | ||
db_row: PacBioRun = values.kwargs["db_library"] | ||
if db_row is None: | ||
raise ValueError("None db_library value is not allowed.") | ||
|
||
assigned = dict() | ||
study = db_row.study | ||
assigned["study_name"] = study.name | ||
assigned["study_id"] = study.id_study_lims | ||
sample = db_row.sample | ||
assigned["sample_name"] = sample.name | ||
assigned["sample_id"] = sample.id_sample_lims | ||
assigned["library_type"] = db_row.pipeline_id_lims | ||
assigned["pool_name"] = db_row.pac_bio_library_tube_barcode | ||
assigned["tag_sequence"] = [] | ||
if tag := db_row.tag_sequence: | ||
assigned["tag_sequence"].append(tag) | ||
if tag := db_row.tag2_sequence: | ||
assigned["tag_sequence"].append(tag) | ||
|
||
return assigned | ||
|
||
|
||
@dataclass(kw_only=True, frozen=True) | ||
class PacBioExperiment: | ||
""" | ||
A response model that contains laboratory tracking information | ||
about the PacBio wells and samples prior to the start of the | ||
|
@@ -43,28 +125,30 @@ class PacBioExperiment(BaseModel): | |
(library). | ||
""" | ||
|
||
db_libraries: list[PacBioRun] = Field(init_var=True) | ||
|
||
study_id: list = Field( | ||
title="Study identifier", | ||
description=""" | ||
Study identifiers as a sorted list of unique strings (to cover | ||
an unlikely case of multiple studies). | ||
""", | ||
) | ||
study_name: str = Field( | ||
study_name: str | None = Field( | ||
default=None, | ||
title="Study name", | ||
description=""" | ||
Study name, is not set in case of multiple studies. | ||
""", | ||
) | ||
sample_id: str = Field( | ||
sample_id: str | None = Field( | ||
default=None, | ||
title="Sample identifier", | ||
description=""" | ||
Sample identifier, is not set in case of multiple samples. | ||
""", | ||
) | ||
sample_name: str = Field( | ||
sample_name: str | None = Field( | ||
default=None, | ||
title="Sample name", | ||
description=""" | ||
|
@@ -94,59 +178,57 @@ class PacBioExperiment(BaseModel): | |
unlikely case of multiple library types. | ||
""", | ||
) | ||
pool_name: str = Field( | ||
pool_name: str | None = Field( | ||
default=None, | ||
title="Pool name", | ||
description=""" | ||
The pac_bio_library_tube_barcode from TRACTION, AKA pool name | ||
""", | ||
) | ||
model_config = ConfigDict(from_attributes=True, extra="forbid") | ||
|
||
@classmethod | ||
def from_orm(cls, lims_db_rows: List[PacBioRun]): | ||
@model_validator(mode="before") | ||
def pre_root(cls, values: dict[str, Any]) -> dict[str, Any]: | ||
""" | ||
A factory method, creates an instance of the PacBioLimsData class. | ||
Should be given a non-empty list of PacBioRun table row objects as | ||
an argument. | ||
Populates the fields of this object with information available | ||
in the LIMS system. | ||
Errors if the `db_libraries` attribute is not set via the constructor. | ||
""" | ||
|
||
lims_db_rows: list[PacBioRun] = values.kwargs["db_libraries"] | ||
num_samples = len(lims_db_rows) | ||
if num_samples == 0: | ||
raise Exception("Cannot create PacBioLimsData object, no data.") | ||
if any(row is None for row in lims_db_rows): | ||
raise Exception("Cannot create PacBioLimsData object, None row.") | ||
raise ValueError("Empty db_libraries list is not allowed.") | ||
|
||
lib_objects = [PacBioLibrary(db_library=row) for row in lims_db_rows] | ||
|
||
# Using sets for some data instead of lists because we do not | ||
# want repetitions. | ||
lims_data = { | ||
"num_samples": num_samples, | ||
"study_id": set(), | ||
"library_type": set(), | ||
"tag_sequence": [], | ||
} | ||
study_name = None | ||
for row in lims_db_rows: | ||
lims_data["study_id"].add(row.study.id_study_lims) | ||
lims_data["library_type"].add(row.pipeline_id_lims) | ||
study_name = row.study.name | ||
if pool_name := row.pac_bio_library_tube_barcode: | ||
lims_data["pool_name"] = pool_name | ||
if num_samples == 1: | ||
if tag := row.tag_sequence: | ||
lims_data["tag_sequence"].append(tag) | ||
if tag := row.tag2_sequence: | ||
lims_data["tag_sequence"].append(tag) | ||
lims_data["sample_id"] = row.sample.id_sample_lims | ||
lims_data["sample_name"] = row.sample.name | ||
lims_data["study_name"] = row.study.name | ||
|
||
lims_data["study_id"] = {o.study_id for o in lib_objects} # returns a set | ||
lims_data["library_type"] = { | ||
o.library_type if o.library_type is not None else "UNKNOWN" | ||
for o in lib_objects | ||
} | ||
|
||
pool_names = {o.pool_name for o in lib_objects} | ||
if len(pool_names) > 1: | ||
raise ValueError("Multiple pool names.") | ||
lims_data["pool_name"] = pool_names.pop() | ||
|
||
o = lib_objects[0] | ||
if num_samples == 1: | ||
lims_data["tag_sequence"] = o.tag_sequence | ||
lims_data["sample_id"] = o.sample_id | ||
lims_data["sample_name"] = o.sample_name | ||
lims_data["study_name"] = o.study_name | ||
if len(lims_data["study_id"]) == 1: | ||
lims_data["study_name"] = study_name | ||
lims_data["study_name"] = o.study_name | ||
|
||
# Convert sets back to lists and sort so that the list items are | ||
# Convert sets back to lists and sort so that the items are | ||
# in a predictable order. | ||
for key in ("library_type", "study_id"): | ||
lims_data[key] = sorted(lims_data[key]) | ||
|
||
return cls.model_validate(lims_data) | ||
return lims_data |
Oops, something went wrong.