From 6b6dd4030450687b9e827d68cb9a7a4daaa06571 Mon Sep 17 00:00:00 2001 From: mgcam Date: Tue, 25 Jul 2023 17:39:33 +0100 Subject: [PATCH 1/3] Added plate_number to the PacBio entity attributes. --- bin/generate_pac_bio_id | 22 ++++++++++-- npg_id_generation/pac_bio.py | 31 +++++++++++++--- tests/test_hashing.py | 69 ++++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 6 deletions(-) diff --git a/bin/generate_pac_bio_id b/bin/generate_pac_bio_id index 0b7396d..fca5a27 100755 --- a/bin/generate_pac_bio_id +++ b/bin/generate_pac_bio_id @@ -23,7 +23,9 @@ import argparse from npg_id_generation.pac_bio import PacBioEntity, concatenate_tags parser = argparse.ArgumentParser( - description="Generates a 64 character product id for a pac bio product.", + description=""" + Generates a 64-character product ID for a PacBio sequencing product. + """, formatter_class=argparse.RawTextHelpFormatter, ) @@ -36,6 +38,17 @@ parser.add_argument( help="The name of the run to which the product belongs.", ) +parser.add_argument( + "-p", + "--plate_number", + "--plate-number", + required=False, + type=int, + help=""" +Plate number, relevant for Revio instruments only. + """, +) + parser.add_argument( "-w", "--well_label", @@ -49,6 +62,7 @@ A well label should fit the pattern A1. 
) parser.add_argument( + "-t", "--tag", type=str, required=False, @@ -69,7 +83,11 @@ separator = "," def main(): tags = concatenate_tags(args.tag) - entity = PacBioEntity(run_name=args.run_name, well_label=args.well_label, tags=tags) + entity = PacBioEntity( + run_name=args.run_name, + plate_number=args.plate_number, + well_label=args.well_label, + tags=tags) print(f"{entity.hash_product_id()}\n") diff --git a/npg_id_generation/pac_bio.py b/npg_id_generation/pac_bio.py index b014519..104d4d6 100644 --- a/npg_id_generation/pac_bio.py +++ b/npg_id_generation/pac_bio.py @@ -45,12 +45,31 @@ def concatenate_tags(tags: list[str]): class PacBioEntity(BaseModel, extra=Extra.forbid): """A PacBio class for product ID generation.""" - # Order the attributes alphabetically, to allow for interoperability - # with a possible Perl API. - # Alternatively the sorting could be achieved with json.dumps()'s - # sort_keys argument. See https://docs.python.org/3/library/json.html#basic-usage + """ + Pydantic's current default is to serialize attributes in the order + they are listed. If this behaviour changes, we can restore it by + using json.dumps()'s sort_keys argument, see + https://docs.python.org/3/library/json.html#basic-usage + + We are not using this explicit sort for now since it adds to the + execution time. + + Order the attributes alphabetically! + """ + run_name: str = Field(title="Pac Bio run name as in LIMS") well_label: str = Field(title="Pac Bio well label") + plate_number: int = Field( + default=None, + ge=1, + title="Pac Bio plate number", + description=""" + Plate number is a positive integer and is relevant for Revio + instruments only, thus it defaults to None. + To be backward-compatible with Revio product IDs generated so far, + when the value of this attribute is 1, we reset it to undefined. 
+ """, + ) tags: str = Field( default=None, title="A string representing tag or tags", @@ -76,6 +95,10 @@ def well_label_conforms_to_pattern(cls, v): ) return v + @validator("plate_number") + def plate_number_default(cls, v): + return None if (v is None) or (v == 1) else v + @validator("tags") def tags_have_correct_characters(cls, v): if (v is not None) and (not re.match("^[ACGT]+(,[ACGT]+)*$", v)): diff --git a/tests/test_hashing.py b/tests/test_hashing.py index e3ed46f..835c3d0 100644 --- a/tests/test_hashing.py +++ b/tests/test_hashing.py @@ -129,6 +129,75 @@ def test_tags_have_correct_characters(): ) +def test_plate_number_validation(): + + for n in [-1, 0]: + with pytest.raises(ValidationError) as excinfo: + PacBioEntity(run_name="MARATHON", well_label="A1", plate_number=n) + assert "ensure this value is greater than or equal to 1" in str(excinfo.value) + + +def test_plate_number_defaults(): + """Test backwards compatibility for the plate number""" + + e1 = PacBioEntity(run_name="MARATHON", well_label="A1", tags="TAGC", plate_number=1) + e2 = PacBioEntity(run_name="MARATHON", well_label="A1", tags="TAGC") + e3 = PacBioEntity( + run_name="MARATHON", well_label="A1", tags="TAGC", plate_number=None + ) + assert e1.plate_number is None + assert e2.plate_number is None + assert e3.plate_number is None + assert e1.json(exclude_none=True) == e2.json(exclude_none=True) + assert e1.json(exclude_none=True) == e3.json(exclude_none=True) + assert e1.hash_product_id() == e2.hash_product_id() + assert e1.hash_product_id() == e3.hash_product_id() + + e1 = PacBioEntity(run_name="MARATHON", well_label="A1", plate_number=1) + e2 = PacBioEntity(run_name="MARATHON", well_label="A1") + assert e1.plate_number is None + assert e2.plate_number is None + assert e1.json() == e2.json() + assert e1.hash_product_id() == e2.hash_product_id() + + +def test_multiple_plates_make_difference(): + + id_1 = PacBioEntity( + run_name="MARATHON", well_label="A1", tags="ACGT" + ).hash_product_id() + 
id_2 = PacBioEntity( + run_name="MARATHON", well_label="A1", tags="ACGT", plate_number=2 + ).hash_product_id() + id_3 = PacBioEntity( + run_name="MARATHON", well_label="A1", tags="ACGT", plate_number=3 + ).hash_product_id() + assert id_1 != id_2 + assert id_3 != id_2 + + id_1 = PacBioEntity(run_name="MARATHON", well_label="A1").hash_product_id() + id_2 = PacBioEntity( + run_name="MARATHON", well_label="A1", plate_number=2 + ).hash_product_id() + id_3 = PacBioEntity( + run_name="MARATHON", well_label="A1", plate_number=3 + ).hash_product_id() + assert id_1 != id_2 + assert id_3 != id_2 + + json = PacBioEntity(run_name="MARATHON", well_label="A1", plate_number=2).json( + exclude_none=True + ) + assert json == '{"run_name": "MARATHON", "well_label": "A1", "plate_number": 2}' + json = PacBioEntity( + run_name="MARATHON", well_label="A1", tags="ACTGG", plate_number=2 + ).json(exclude_none=True) + assert ( + json + == '{"run_name": "MARATHON", "well_label": "A1", "plate_number": 2, "tags": "ACTGG"}' + ) + + def test_expected_hashes(): """Test against expected hashes.""" From bdd5f06d5330fb20486d2fc227b3309a36599f16 Mon Sep 17 00:00:00 2001 From: mgcam Date: Thu, 27 Jul 2023 09:32:40 +0100 Subject: [PATCH 2/3] Update the documentation --- README.md | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index d82716f..4bc4462 100644 --- a/README.md +++ b/README.md @@ -20,22 +20,37 @@ package implements a Python API. The attributes of objects are sequencing platform specific. The generator for the PacBio platform is implemented by the `PacBioEntity` class. 
-Examles of generating IDs for PacBio data: +Examples of generating IDs for PacBio data from Python code: ``` -from npg_id_generation.main import PacBioEntity +from npg_id_generation.pac_bio import PacBioEntity # from a JSON string via a class method test_case = '{"run_name": "MARATHON","well_label": "D1"}' print(PacBioEntity.parse_raw(test_case, content_type="json").hash_product_id()) # by setting object's attributes -print(PacBioEntity(run_name="MARATHON", well_label="D1").hash_product_id() +print(PacBioEntity(run_name="MARATHON", well_label="D1").hash_product_id()) +print(PacBioEntity( + run_name="MARATHON", + well_label="D1", + plate_number=2 + ).hash_product_id() +) # sample-specific indentifier # for multiple tags a sorted comma-separated list of tagscan be used print(PacBioEntity(run_name="MARATHON", well_label="D1", tags="AAGTACGT").hash_product_id() -``` +``` + +The npg_id_generation package also contains a script, `generate_pac_bio_id`, +which can be called from the command line. The script outputs the generated +ID to the STDOUT stream. Use the `--help` option to find out details. + +``` +# Using the script in the Perl code: +my $id = `generate_pac_bio_id --run_name 'MARATHON' --well_label 'D1'`; +``` All generators should conform to a few simple rules: @@ -60,6 +75,15 @@ o5 = PacBioEntity.parse_raw('{"well_label": "l1", "run_name": "r1"}', content_t o6 = PacBioEntity.parse_raw('{"well_label": "l1","run_name": "r1", "tags": null}', content_type="json") ``` +In addition, to maintain backwards compatibility for PacBio Revio products, +the following two objects should generate the same ID, meaning that the +value of 1 for the plate number attribute is disregarded. + +``` +o1 = PacBioEntity(run_name="r1", well_label="l1") +o2 = PacBioEntity(run_name="r1", well_label="l1", plate_number=1) +``` + The algorithm used for generation of identifiers can be replicated in Perl; -on identical input data it gives identical results. 
However, we cannot guarantee -that this parity will always be maintained in future. +on identical input data it gives identical results. However, we cannot +guarantee that this parity will always be maintained in future. From f76aee662cc8ebe3c3bfbc08638a3d1d31479275 Mon Sep 17 00:00:00 2001 From: mgcam Date: Thu, 27 Jul 2023 12:44:46 +0100 Subject: [PATCH 3/3] Update CHANGELOG.md for release 4.0.0 --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f933b03..b7f9d0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [4.0.0] + +### Added + + - Add an extra PacBio entity attribute - plate_number. + ## [3.0.0] ### Changed