Skip to content

Commit

Permalink
Fixed a bug in generating the product ID.
Browse files Browse the repository at this point in the history
An existing bug in constructing a JSON representation for
a Pydantic model when the plate_number attribute is 2 or
larger resulted in incorrect IDs being generated. This came
to light while testing https://github.com/wtsi-npg/npg_langqc
against release 5.0.0 of this package.

The fix includes a return to using Pydantic models' built-in
model_dump_json method in order to mitigate the risk that
comes manual generation of JSON strings.
  • Loading branch information
mgcam committed Feb 19, 2024
1 parent c46f834 commit ec28b17
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 31 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [5.0.1]

### [Fixed]
- An existing bug in constructing a JSON representation for a Pydantic model
when the plate_number attribute is 2 or larger resulted in incorrect IDs
being generated. The fix includes a return to using Pydantic models' built-in
model_dump_json method in order to mitigate the risk that comes with manual
generation of JSON strings.

## [5.0.0]

### [Changed]
Expand Down
32 changes: 6 additions & 26 deletions npg_id_generation/pac_bio.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.

import io
import re
from hashlib import sha256
from typing import Optional
Expand Down Expand Up @@ -57,8 +56,7 @@ class PacBioEntity(BaseModel):
execution time.
Order the attributes alphabetically to maintain order in the output
of model_to_json(). The hash_product_id() method does not depend on
the Pydantic ordering, however (it uses a custom JSON serializer).
of model_dump_json().
"""
model_config = ConfigDict(extra="forbid")

Expand Down Expand Up @@ -112,27 +110,9 @@ def tags_have_correct_characters(cls, v):
def hash_product_id(self):
"""Generate a sha256sum for the PacBio Entity"""

# Avoid using Pydantic's model_to_json() method as it requires somewhat
# complicated setup with decorators to create our backwards-compatible JSON
# serialization.

# The JSON is built with StringIO to ensure the order of the attributes and
# that it's faster than using json.dumps() with sort_keys=True (timeit
# estimates that this is ~4x faster.
json = io.StringIO()
json.write('{"run_name":"')
json.write(self.run_name)
json.write('","well_label":"')
json.write(self.well_label)
json.write('"')
if self.plate_number is not None and self.plate_number > 1:
json.write(',"plate_number":')
json.write(str(self.plate_number))
json.write('"')
if self.tags is not None:
json.write(',"tags":"')
json.write(self.tags)
json.write('"')
json.write("}")

return sha256(json.getvalue().encode("utf-8")).hexdigest()
json = self.model_dump_json(exclude_none=True)
else:
json = self.model_dump_json(exclude_none=True, exclude=["plate_number"])

return sha256(json.encode()).hexdigest()
20 changes: 15 additions & 5 deletions tests/test_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,14 +196,24 @@ def test_multiple_plates_make_difference():
def test_expected_hashes():
"""Test against expected hashes."""
# plate_number absent (historical runs) or plate_number == 1
p1_sha256 = "cda15311f706217e31e32d42d524bc35662a6fc15623ce5abe6a31ed741801ae"
id1_sha256 = "cda15311f706217e31e32d42d524bc35662a6fc15623ce5abe6a31ed741801ae"
# plate_number == 2
p2_sha256 = "7ca9d350c9b14f0883ac568220b8e5d97148a4eeb41d9de00b5733299d7bcd89"
id2_sha256 = "9d883922744395bccb226069cc2b77eac867778e2e31c8c1ac3651ed2a83c059"
id3_sha256 = "dc77c4a7f34d84afbb895fcaee72fc8bead9dac20e8d3a9614091d9dd4519acd"
id4_sha256 = "e40d98a697a5ffcb09f7ed62597216094198c0d804be100bbb9f6050482182e3"

test_cases = [
('{"run_name": "MARATHON", "well_label": "A1"}', p1_sha256),
('{"run_name": "MARATHON", "well_label": "A1", "plate_number": 1}', p1_sha256),
('{"run_name": "MARATHON", "well_label": "A1", "plate_number": 2}', p2_sha256),
('{"run_name": "MARATHON", "well_label": "A1"}', id1_sha256),
('{"run_name": "MARATHON", "well_label": "A1", "plate_number": 1}', id1_sha256),
('{"run_name": "MARATHON", "well_label": "A1", "plate_number": 2}', id2_sha256),
(
'{"run_name": "TRACTION_RUN_16", "well_label": "A1", "plate_number": 2}',
id3_sha256,
),
(
'{"run_name": "TRACTION_RUN_16", "well_label": "A1", "plate_number": 3}',
id4_sha256,
),
(
'{"run_name": "SEMI-MARATHON", "well_label": "D1"}',
"b55417615e458c23049cc84822531a435c0c4069142f0e1d5e4378d48d9f7bd2",
Expand Down

0 comments on commit ec28b17

Please sign in to comment.