From 7ed860b98e2c14a2bff31a74eeccbeca904c38d4 Mon Sep 17 00:00:00 2001 From: Keith James Date: Thu, 18 Jan 2024 09:29:11 +0000 Subject: [PATCH 1/2] Avoid mutation of the plate_number public attribute This change decouples PacBioEntity ID generation from the public Pydantic API of the class. This avoids surprising mutation of the plate_number attribute during validation. A small number of tests checked for the mutation behaviour, so these are also changed. Remove redundant @classmethod decorators because the Pydantic @field_validator already returns a class method. Add exclusion directories to .flake8 to avoid false positives when run locally. --- .flake8 | 1 + npg_id_generation/pac_bio.py | 47 +++++++++++++++++++++++++----------- tests/test_hashing.py | 43 +++++++++++++++++++-------------- 3 files changed, 59 insertions(+), 32 deletions(-) diff --git a/.flake8 b/.flake8 index c201160..c0bbb24 100644 --- a/.flake8 +++ b/.flake8 @@ -2,3 +2,4 @@ max-line-length = 100 ignore = E251, E265, E261, E302, W503 per-file-ignores = __init__.py:F401 +exclude = .idea .git .github .venv .vscode venv diff --git a/npg_id_generation/pac_bio.py b/npg_id_generation/pac_bio.py index 67b1a50..82e3e54 100644 --- a/npg_id_generation/pac_bio.py +++ b/npg_id_generation/pac_bio.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, 2023 Genome Research Ltd. +# Copyright (c) 2022, 2023, 2024 Genome Research Ltd. # # Authors: # Adam Blanchet @@ -20,11 +20,12 @@ # You should have received a copy of the GNU General Public License along with # this program. If not, see . +import io +import re from hashlib import sha256 from typing import Optional -from pydantic import BaseModel, Field, field_validator, ConfigDict -import re +from pydantic import BaseModel, ConfigDict, Field, field_validator def concatenate_tags(tags: list[str]): @@ -55,7 +56,9 @@ class PacBioEntity(BaseModel): We are not using this explicit sort for now since it adds to the execution time. - Order the attributes alphabetically! + Order the attributes alphabetically to maintain order in the output + of model_to_json(). The hash_product_id() method does not depend on + the Pydantic ordering, however (it uses a custom JSON serializer). """ model_config = ConfigDict(extra="forbid") @@ -69,7 +72,8 @@ class PacBioEntity(BaseModel): Plate number is a positive integer and is relevant for Revio instruments only, thus it defaults to None. To be backward-compatible with Revio product IDs generated so far, - when the value of this attribute is 1, we reset it to undefined. + when the value of this attribute is 1, it is ignored when serializing + to generate an ID. """, ) tags: Optional[str] = Field( @@ -84,14 +88,12 @@ class PacBioEntity(BaseModel): ) @field_validator("run_name", "well_label", "tags") - @classmethod def attributes_are_non_empty_strings(cls, v): if (v is not None) and (v == ""): raise ValueError("Cannot be an empty string") return v @field_validator("well_label") - @classmethod def well_label_conforms_to_pattern(cls, v): if not re.match("^[A-Z][1-9][0-9]?$", v): raise ValueError( @@ -99,13 +101,7 @@ def well_label_conforms_to_pattern(cls, v): ) return v - @field_validator("plate_number") - @classmethod - def plate_number_default(cls, v): - return None if (v is None) or (v == 1) else v - @field_validator("tags") - @classmethod def tags_have_correct_characters(cls, v): if (v is not None) and (not re.match("^[ACGT]+(,[ACGT]+)*$", v)): raise ValueError( @@ -116,4 +112,27 @@ def tags_have_correct_characters(cls, v): def hash_product_id(self): """Generate a sha256sum for the PacBio Entity""" - return sha256(self.model_dump_json(exclude_none=True).encode()).hexdigest() + # Avoid using Pydantic's model_to_json() method as it requires somewhat + # complicated setup with decorators to create our backwards-compatible JSON + # serialization. + + # The JSON is built with StringIO to ensure the order of the attributes and + # that it's faster than using json.dumps() with sort_keys=True (timeit + # estimates that this is ~4x faster. + json = io.StringIO() + json.write('{"run_name":"') + json.write(self.run_name) + json.write('","well_label":"') + json.write(self.well_label) + json.write('"') + if self.plate_number is not None and self.plate_number > 1: + json.write(',"plate_number":') + json.write(str(self.plate_number)) + json.write('"') + if self.tags is not None: + json.write(',"tags":"') + json.write(self.tags) + json.write('"') + json.write("}") + + return sha256(json.getvalue().encode("utf-8")).hexdigest() diff --git a/tests/test_hashing.py b/tests/test_hashing.py index 1ba69b5..d579d6d 100644 --- a/tests/test_hashing.py +++ b/tests/test_hashing.py @@ -41,17 +41,14 @@ def test_whitespace(): def test_different_ways_to_create_object(): - results = [] - results.append(PacBioEntity(run_name="MARATHON", well_label="A1").hash_product_id()) - results.append(PacBioEntity(well_label="A1", run_name="MARATHON").hash_product_id()) - results.append( - PacBioEntity(run_name="MARATHON", well_label="A1", tags=None).hash_product_id() - ) - results.append( + results = [ + PacBioEntity(run_name="MARATHON", well_label="A1").hash_product_id(), + PacBioEntity(well_label="A1", run_name="MARATHON").hash_product_id(), + PacBioEntity(run_name="MARATHON", well_label="A1", tags=None).hash_product_id(), PacBioEntity.model_validate_json( '{"run_name": "MARATHON", "well_label": "A1"}' - ).hash_product_id() - ) + ).hash_product_id(), + ] assert len(set(results)) == 1 with pytest.raises(ValidationError) as excinfo: @@ -140,13 +137,13 @@ def test_plate_number_defaults(): e3 = PacBioEntity( run_name="MARATHON", well_label="A1", tags="TAGC", plate_number=None ) - assert e1.plate_number is None + assert e1.plate_number == 1 assert e2.plate_number is None assert e3.plate_number is None - assert e1.model_dump_json(exclude_none=True) == e2.model_dump_json( + assert e1.model_dump_json(exclude_none=True) != e2.model_dump_json( exclude_none=True ) - assert e1.model_dump_json(exclude_none=True) == e3.model_dump_json( + assert e1.model_dump_json(exclude_none=True) != e3.model_dump_json( exclude_none=True ) assert e1.hash_product_id() == e2.hash_product_id() @@ -154,9 +151,9 @@ def test_plate_number_defaults(): e1 = PacBioEntity(run_name="MARATHON", well_label="A1", plate_number=1) e2 = PacBioEntity(run_name="MARATHON", well_label="A1") - assert e1.plate_number is None + assert e1.plate_number == 1 assert e2.plate_number is None - assert e1.model_dump_json() == e2.model_dump_json() + assert e1.model_dump_json() != e2.model_dump_json() assert e1.hash_product_id() == e2.hash_product_id() @@ -198,12 +195,15 @@ def test_multiple_plates_make_difference(): def test_expected_hashes(): """Test against expected hashes.""" + # plate_number absent (historical runs) or plate_number == 1 + p1_sha256 = "cda15311f706217e31e32d42d524bc35662a6fc15623ce5abe6a31ed741801ae" + # plate_number == 2 + p2_sha256 = "7ca9d350c9b14f0883ac568220b8e5d97148a4eeb41d9de00b5733299d7bcd89" test_cases = [ - ( - '{"run_name": "MARATHON", "well_label": "A1"}', - "cda15311f706217e31e32d42d524bc35662a6fc15623ce5abe6a31ed741801ae", - ), + ('{"run_name": "MARATHON", "well_label": "A1"}', p1_sha256), + ('{"run_name": "MARATHON", "well_label": "A1", "plate_number": 1}', p1_sha256), + ('{"run_name": "MARATHON", "well_label": "A1", "plate_number": 2}', p2_sha256), ( '{"run_name": "SEMI-MARATHON", "well_label": "D1"}', "b55417615e458c23049cc84822531a435c0c4069142f0e1d5e4378d48d9f7bd2", @@ -242,3 +242,10 @@ def test_tags_not_sorted(): != pb_entities[1].hash_product_id() != pb_entities[2].hash_product_id() ) + + +def test_regression_github_issue19(): + # https://github.com/wtsi-npg/npg_id_generation/issues/19 + + e1 = PacBioEntity(run_name="MARATHON", well_label="A1", tags="ACGT", plate_number=1) + assert e1.plate_number == 1, "Plate number should be 1 and not None" From 24287f1d62d38844d9c67d04930526dd4419dfeb Mon Sep 17 00:00:00 2001 From: mgcam Date: Fri, 2 Feb 2024 14:12:31 +0000 Subject: [PATCH 2/2] Updated the changelog for release 5.0.0 --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd3f2d6..e42887d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,19 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [5.0.0] + +### [Changed] + + - Stopped the mutation of the `plate_number` attribute at the time the + PacBioEntity object instance is created. The value of the `plate_number` + attribute is now stored and retrieved as supplied. As a consequence, the + JSON string returned by the `model_dump_json` method of this Pydantic + object is now different when `plate_namber` value is 1. + - To retain backwards compatibility, the `hash_product_id` method is + reimplemented since it can no longer use for ID generation the string + returned by the `model_dump_json` method. + ## [4.0.1] ### [Changed]