Skip to content

Commit

Permalink
Merge pull request #10 from wtsi-npg/devel
Browse files Browse the repository at this point in the history
pull from devel to master to create release 2.0.0
  • Loading branch information
mgcam authored Apr 21, 2023
2 parents 9a6823f + c88f836 commit c0a85ba
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 52 deletions.
1 change: 1 addition & 0 deletions bin/generate_pac_bio_id
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
# @author Michael Kubiak <[email protected]>

import argparse

from npg_id_generation.pac_bio import PacBioEntity

parser = argparse.ArgumentParser(
Expand Down
28 changes: 13 additions & 15 deletions npg_id_generation/pac_bio.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Copyright (c) 2022 Genome Research Ltd.
# Copyright (c) 2022, 2023 Genome Research Ltd.
#
# Author: Adam Blanchet <[email protected]>
# Authors:
# Adam Blanchet <[email protected]>
# Michael Kubiak <[email protected]>
# Marina Gourtovaia <[email protected]>
#
# This file is part of npg_id_generation.
#
Expand All @@ -18,14 +21,15 @@
# this program. If not, see <http://www.gnu.org/licenses/>.

from hashlib import sha256

from pydantic import BaseModel, Extra, Field, validator


class PacBioEntity(BaseModel, extra=Extra.forbid):
"""A PacBio entity class for ID generation."""
"""A PacBio class for product ID generation."""

# Order these alphabetically, to allow for interoperability with
# a possible Perl API.
# Order the attributes alphabetically, to allow for interoperability
# with a possible Perl API.
# Alternatively the sorting could be achieved with json.dumps()'s
# sort_keys argument. See https://docs.python.org/3/library/json.html#basic-usage
run_name: str = Field(title="Pac Bio run name as in LIMS")
Expand All @@ -34,8 +38,10 @@ class PacBioEntity(BaseModel, extra=Extra.forbid):
default=None,
title="A string representing tag or tags",
description="""
A string representing a single tag (index) sequence or a comma-separated
list of multiple tags. It is important to order multiple tags consistently.
A string representing a single barcode index sequence (tag) or
a comma-separated list of multiple tags. The order of tags in
the list is meaningful for the purpose of product identification,
therefore it should not be changed by the code of this class.
""",
)

Expand All @@ -45,14 +51,6 @@ def attributes_are_non_empty_strings(cls, v):
raise ValueError("Cannot be an empty string")
return v

@validator("tags")
def sort_tags(cls, v):
if v is None:
return v
tags = v.split(",")
tags.sort()
return ",".join(tags)

def hash_product_id(self):
"""Generate a sha256sum for the PacBio Entity"""

Expand Down
51 changes: 14 additions & 37 deletions tests/test_hashing.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,8 @@
# Copyright (c) 2022 Genome Research Ltd.
#
# Author: Adam Blanchet <[email protected]>
#
# This file is part of npg_id_generation.
#
# npg_langqc is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.

"""Tests checking the hashing behaviour of objects."""

import pytest
from pydantic import ValidationError

from npg_id_generation.pac_bio import PacBioEntity


Expand Down Expand Up @@ -138,25 +120,20 @@ def test_expected_hashes():
)


def test_tags_sorted():
"""Test that tags are automatically sorted alphabetically before id generation"""

pb_entity_1 = PacBioEntity(
run_name="MARATHON", well_label="A1", tags="TCGA,ACGT,TGAC,AACG"
)
assert pb_entity_1.tags == "AACG,ACGT,TCGA,TGAC"
def test_tags_not_sorted():
"""Test that tags are not changed prior to id generation"""

pb_entity_2 = PacBioEntity(
run_name="MARATHON", well_label="A1", tags="ACGT,AACG,TGAC,TCGA"
)

pb_entity_3 = PacBioEntity.parse_raw(
'{"run_name": "MARATHON", "well_label": "A1", "tags": "TGAC,TCGA,AACG,ACGT"}'
)
run = "MARATHON"
well = "A1"
# Each of these strings contains the same set of tags;
# only the order of the tags differs.
tags_strings = ["TCGA,ACGT,TGAC,AACG", "ACGT,AACG,TGAC,TCGA", "TGAC,TCGA,AACG,ACGT"]
pb_entities = []
for tag_string in tags_strings:
pb_entities.append(PacBioEntity(run_name=run, well_label=well, tags=tag_string))

assert pb_entity_1.tags == pb_entity_2.tags == pb_entity_3.tags
assert (
pb_entity_3.hash_product_id()
== pb_entity_2.hash_product_id()
== pb_entity_3.hash_product_id()
pb_entities[0].hash_product_id()
!= pb_entities[1].hash_product_id()
!= pb_entities[2].hash_product_id()
)

0 comments on commit c0a85ba

Please sign in to comment.