Merge pull request #10 from wtsi-npg/devel

pull from devel to master to create release 2.0.0
wtsi-npg · Apr 21, 2023 · c0a85ba · c0a85ba
2 parents 9a6823f + c88f836
commit c0a85ba
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 52 deletions.
diff --git a/bin/generate_pac_bio_id b/bin/generate_pac_bio_id
@@ -19,6 +19,7 @@
 # @author Michael Kubiak <[email protected]>
 
 import argparse
+
 from npg_id_generation.pac_bio import PacBioEntity
 
 parser = argparse.ArgumentParser(

diff --git a/npg_id_generation/pac_bio.py b/npg_id_generation/pac_bio.py
@@ -1,6 +1,9 @@
-# Copyright (c) 2022 Genome Research Ltd.
+# Copyright (c) 2022, 2023 Genome Research Ltd.
 #
-# Author: Adam Blanchet <[email protected]>
+# Authors:
+#   Adam Blanchet <[email protected]>
+#   Michael Kubiak <[email protected]>
+#   Marina Gourtovaia <[email protected]>
 #
 # This file is part of npg_id_generation.
 #
@@ -18,14 +21,15 @@
 # this program. If not, see <http://www.gnu.org/licenses/>.
 
 from hashlib import sha256
+
 from pydantic import BaseModel, Extra, Field, validator
 
 
 class PacBioEntity(BaseModel, extra=Extra.forbid):
-    """A PacBio entity class for ID generation."""
+    """A PacBio class for product ID generation."""
 
-    # Order these alphabetically, to allow for interoperability with
-    # a possible Perl API.
+    # Order the attributes alphabetically, to allow for interoperability
+    # with a possible Perl API.
     # Alternatively the sorting could be achieved with json.dumps()'s
     # sort_keys argument. See https://docs.python.org/3/library/json.html#basic-usage
     run_name: str = Field(title="Pac Bio run name as in LIMS")
@@ -34,8 +38,10 @@ class PacBioEntity(BaseModel, extra=Extra.forbid):
         default=None,
         title="A string representing tag or tags",
         description="""
-        A string representing a single tag (index) sequence or a comma-separated
-        list of multiple tags. It is important to order multiple tags consistently.
+        A string representing a single barcode index sequence (tag) or
+        a comma-separated list of multiple tags. The order of tags in
+        the list is meaningful for the purpose of product identification,
+        therefore it should not be changed by the code of this class.
         """,
     )
 
@@ -45,14 +51,6 @@ def attributes_are_non_empty_strings(cls, v):
             raise ValueError("Cannot be an empty string")
         return v
 
-    @validator("tags")
-    def sort_tags(cls, v):
-        if v is None:
-            return v
-        tags = v.split(",")
-        tags.sort()
-        return ",".join(tags)
-
     def hash_product_id(self):
         """Generate a sha256sum for the PacBio Entity"""
 

diff --git a/tests/test_hashing.py b/tests/test_hashing.py
@@ -1,26 +1,8 @@
-# Copyright (c) 2022 Genome Research Ltd.
-#
-# Author: Adam Blanchet <[email protected]>
-#
-# This file is part of npg_id_generation.
-#
-# npg_langqc is free software: you can redistribute it and/or modify it under
-# the terms of the GNU General Public License as published by the Free Software
-# Foundation; either version 3 of the License, or (at your option) any later
-# version.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
-# details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program. If not, see <http://www.gnu.org/licenses/>.
-
 """Tests checking the hashing behaviour of objects."""
 
 import pytest
 from pydantic import ValidationError
+
 from npg_id_generation.pac_bio import PacBioEntity
 
 
@@ -138,25 +120,20 @@ def test_expected_hashes():
         )
 
 
-def test_tags_sorted():
-    """Test that tags are automatically sorted alphabetically before id generation"""
-
-    pb_entity_1 = PacBioEntity(
-        run_name="MARATHON", well_label="A1", tags="TCGA,ACGT,TGAC,AACG"
-    )
-    assert pb_entity_1.tags == "AACG,ACGT,TCGA,TGAC"
+def test_tags_not_sorted():
+    """Test that tags are not changed prior to id generation"""
 
-    pb_entity_2 = PacBioEntity(
-        run_name="MARATHON", well_label="A1", tags="ACGT,AACG,TGAC,TCGA"
-    )
-
-    pb_entity_3 = PacBioEntity.parse_raw(
-        '{"run_name": "MARATHON", "well_label": "A1", "tags": "TGAC,TCGA,AACG,ACGT"}'
-    )
+    run = "MARATHON"
+    well = "A1"
+    # Tags in these strings are the same, the difference is
+    # in the order.
+    tags_strings = ["TCGA,ACGT,TGAC,AACG", "ACGT,AACG,TGAC,TCGA", "TGAC,TCGA,AACG,ACGT"]
+    pb_entities = []
+    for tag_string in tags_strings:
+        pb_entities.append(PacBioEntity(run_name=run, well_label=well, tags=tag_string))
 
-    assert pb_entity_1.tags == pb_entity_2.tags == pb_entity_3.tags
     assert (
-        pb_entity_3.hash_product_id()
-        == pb_entity_2.hash_product_id()
-        == pb_entity_3.hash_product_id()
+        len({pb_entity.hash_product_id() for pb_entity in pb_entities})
+        == len(pb_entities)
+        # Chained != would never compare the first id with the last one.
     )