Skip to content

Commit

Permalink
Merge pull request #14 from wtsi-npg/devel
Browse files Browse the repository at this point in the history
pull from devel to master to create release 4.0.0
  • Loading branch information
mgcam authored Jul 27, 2023
2 parents 2c1443e + f76aee6 commit db30694
Show file tree
Hide file tree
Showing 5 changed files with 152 additions and 12 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [4.0.0]

### Added

- Add an extra PacBio entity attribute - plate_number.

## [3.0.0]

### Changed
Expand Down
36 changes: 30 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,37 @@ package implements a Python API. The attributes of objects are sequencing
platform specific. The generator for the PacBio platform is implemented by the
`PacBioEntity` class.

Examples of generating IDs for PacBio data:
Examples of generating IDs for PacBio data from Python code:

```
from npg_id_generation.main import PacBioEntity
from npg_id_generation.pac_bio import PacBioEntity
# from a JSON string via a class method
test_case = '{"run_name": "MARATHON","well_label": "D1"}'
print(PacBioEntity.parse_raw(test_case, content_type="json").hash_product_id())
# by setting object's attributes
print(PacBioEntity(run_name="MARATHON", well_label="D1").hash_product_id()
print(PacBioEntity(run_name="MARATHON", well_label="D1").hash_product_id())
print(PacBioEntity(
run_name="MARATHON",
well_label="D1",
plate_number=2
).hash_product_id()
)
# sample-specific identifier
# for multiple tags a sorted comma-separated list of tags can be used
print(PacBioEntity(run_name="MARATHON", well_label="D1", tags="AAGTACGT").hash_product_id())
```
```

The npg_id_generation package also contains a script, `generate_pac_bio_id`,
which can be called from the command line. The script outputs the generated
ID to the STDOUT stream. Use the `--help` option to find out details.

```
# Using the script in the Perl code:
my $id = `generate_pac_bio_id --run_name 'MARATHON' --well_label 'D1'`;
```

All generators should conform to a few simple rules:

Expand All @@ -60,6 +75,15 @@ o5 = PacBioEntity.parse_raw('{"well_label": "l1", "run_name": "r1"}', content_t
o6 = PacBioEntity.parse_raw('{"well_label": "l1","run_name": "r1", "tags": null}', content_type="json")
```

In addition, to maintain backwards compatibility for PacBio Revio products,
the following two objects should generate the same ID, meaning that the
value of 1 for the plate number attribute is disregarded.

```
o1 = PacBioEntity(run_name="r1", well_label="l1")
o2 = PacBioEntity(run_name="r1", well_label="l1", plate_number=1)
```

The algorithm used for generation of identifiers can be replicated in Perl;
on identical input data it gives identical results. However, we cannot guarantee
that this parity will always be maintained in future.
on identical input data it gives identical results. However, we cannot
guarantee that this parity will always be maintained in future.
22 changes: 20 additions & 2 deletions bin/generate_pac_bio_id
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ import argparse
from npg_id_generation.pac_bio import PacBioEntity, concatenate_tags

parser = argparse.ArgumentParser(
description="Generates a 64 character product id for a pac bio product.",
description="""
Generates a 64-character product ID for a PacBio sequencing product.
""",
formatter_class=argparse.RawTextHelpFormatter,
)

Expand All @@ -36,6 +38,17 @@ parser.add_argument(
help="The name of the run to which the product belongs.",
)

parser.add_argument(
"-p",
"--plate_number",
"--plate-number",
required=False,
type=int,
help="""
Plate number, relevant for Revio instruments only.
""",
)

parser.add_argument(
"-w",
"--well_label",
Expand All @@ -49,6 +62,7 @@ A well label should fit the pattern A1.
)

parser.add_argument(
"-t",
"--tag",
type=str,
required=False,
Expand All @@ -69,7 +83,11 @@ separator = ","

def main():
tags = concatenate_tags(args.tag)
entity = PacBioEntity(run_name=args.run_name, well_label=args.well_label, tags=tags)
entity = PacBioEntity(
run_name=args.run_name,
plate_number=args.plate_number,
well_label=args.well_label,
tags=tags)
print(f"{entity.hash_product_id()}\n")


Expand Down
31 changes: 27 additions & 4 deletions npg_id_generation/pac_bio.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,31 @@ def concatenate_tags(tags: list[str]):
class PacBioEntity(BaseModel, extra=Extra.forbid):
"""A PacBio class for product ID generation."""

# Order the attributes alphabetically, to allow for interoperability
# with a possible Perl API.
# Alternatively the sorting could be achieved with json.dumps()'s
# sort_keys argument. See https://docs.python.org/3/library/json.html#basic-usage
"""
Pydantic's current default is to serialize attributes in the order
they are listed. If this behaviour changes, we can restore it by
using the sort_keys argument of json.dumps(), see
https://docs.python.org/3/library/json.html#basic-usage
We are not using this explicit sort for now since it adds to the
execution time.
Order the attributes alphabetically!
"""

run_name: str = Field(title="Pac Bio run name as in LIMS")
well_label: str = Field(title="Pac Bio well label")
plate_number: int = Field(
default=None,
ge=1,
title="Pac Bio plate number",
description="""
Plate number is a positive integer and is relevant for Revio
instruments only, thus it defaults to None.
To be backward-compatible with Revio product IDs generated so far,
when the value of this attribute is 1, we reset it to undefined.
""",
)
tags: str = Field(
default=None,
title="A string representing tag or tags",
Expand All @@ -76,6 +95,10 @@ def well_label_conforms_to_pattern(cls, v):
)
return v

@validator("plate_number")
def plate_number_default(cls, v):
    """Normalise the plate number so that a value of 1 becomes None.

    Returns None when the value is None or equal to 1, otherwise returns
    the value unchanged.
    """
    if v is None or v == 1:
        return None
    return v

@validator("tags")
def tags_have_correct_characters(cls, v):
if (v is not None) and (not re.match("^[ACGT]+(,[ACGT]+)*$", v)):
Expand Down
69 changes: 69 additions & 0 deletions tests/test_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,75 @@ def test_tags_have_correct_characters():
)


def test_plate_number_validation():
# Plate numbers -1 and 0 must be rejected when constructing a PacBioEntity.

for n in [-1, 0]:
# Construction is expected to raise ValidationError for each value.
with pytest.raises(ValidationError) as excinfo:
PacBioEntity(run_name="MARATHON", well_label="A1", plate_number=n)
# The captured error text must mention the lower bound of 1.
assert "ensure this value is greater than or equal to 1" in str(excinfo.value)


def test_plate_number_defaults():
    """Verify that a plate number of 1 is treated as if it were not set."""

    # With tags: plate_number=1, omitted and explicit None are equivalent.
    common = {"run_name": "MARATHON", "well_label": "A1", "tags": "TAGC"}
    plate_one = PacBioEntity(**common, plate_number=1)
    plate_omitted = PacBioEntity(**common)
    plate_none = PacBioEntity(**common, plate_number=None)
    for entity in (plate_one, plate_omitted, plate_none):
        assert entity.plate_number is None
    for other in (plate_omitted, plate_none):
        assert plate_one.json(exclude_none=True) == other.json(exclude_none=True)
    for other in (plate_omitted, plate_none):
        assert plate_one.hash_product_id() == other.hash_product_id()

    # Without tags: the same equivalence holds, also for the full JSON.
    plate_one = PacBioEntity(run_name="MARATHON", well_label="A1", plate_number=1)
    plate_omitted = PacBioEntity(run_name="MARATHON", well_label="A1")
    assert plate_one.plate_number is None
    assert plate_omitted.plate_number is None
    assert plate_one.json() == plate_omitted.json()
    assert plate_one.hash_product_id() == plate_omitted.hash_product_id()


def test_multiple_plates_make_difference():
    """Check that plate numbers above 1 change the ID and appear in the JSON."""

    # Tagged product: only the plate number varies between the entities.
    tagged = {"run_name": "MARATHON", "well_label": "A1", "tags": "ACGT"}
    no_plate_id = PacBioEntity(**tagged).hash_product_id()
    plate_two_id = PacBioEntity(**tagged, plate_number=2).hash_product_id()
    plate_three_id = PacBioEntity(**tagged, plate_number=3).hash_product_id()
    assert no_plate_id != plate_two_id
    assert plate_three_id != plate_two_id

    # Untagged product: the same comparison.
    untagged = {"run_name": "MARATHON", "well_label": "A1"}
    no_plate_id = PacBioEntity(**untagged).hash_product_id()
    plate_two_id = PacBioEntity(**untagged, plate_number=2).hash_product_id()
    plate_three_id = PacBioEntity(**untagged, plate_number=3).hash_product_id()
    assert no_plate_id != plate_two_id
    assert plate_three_id != plate_two_id

    # The serialised form lists plate_number after well_label and before tags.
    serialized = PacBioEntity(**untagged, plate_number=2).json(exclude_none=True)
    assert (
        serialized
        == '{"run_name": "MARATHON", "well_label": "A1", "plate_number": 2}'
    )
    serialized = PacBioEntity(
        run_name="MARATHON", well_label="A1", tags="ACTGG", plate_number=2
    ).json(exclude_none=True)
    assert (
        serialized
        == '{"run_name": "MARATHON", "well_label": "A1", "plate_number": 2, "tags": "ACTGG"}'
    )


def test_expected_hashes():
"""Test against expected hashes."""

Expand Down

0 comments on commit db30694

Please sign in to comment.