Merge pull request #5 from wtsi-npg/devel

pull from devel to master to create the first release
wtsi-npg · Oct 26, 2022 · 4643c89 · 4643c89
2 parents 8f77dd2 + 40af322
commit 4643c89
Show file tree

Hide file tree

Showing 8 changed files with 295 additions and 130 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -26,10 +26,6 @@ jobs:
   test:
     runs-on: ubuntu-latest
 
-    strategy:
-      matrix:
-        mysql-version: ["5.7", "8.0"]
-
     steps:
       - uses: actions/checkout@v3
 
@@ -46,6 +42,7 @@ jobs:
         run: |
           poetry env use '3.10'
           poetry install
+          poetry self add "poetry-dynamic_versioning[plugin]"
 
       - name: Run pytest
         run: |

diff --git a/README.md b/README.md
@@ -1,4 +1,65 @@
 # npg_id_generation
 
-An API used to generate product IDs, which are hashes of the JSON representation of an object.
+An API used to generate product IDs, which are hashes of the JSON representation
+of an object.
 
+For different sequencing platforms different sets of identifiers might be used to
+fully describe the origin of data. For reasons of efficiency and interoperability
+between different systems it is sometimes desirable to be able to use a single
+identifier, which will be unique not only within data for a single platform,
+but also between different platforms.
+
+At the Sanger Institute, run ID, lane number and numerical tag index are used
+as identifiers for the Illumina platform. Historically, the first algorithm for
+generating unique identifiers was implemented in Perl for the Illumina platform,
+see [documentation](https://github.com/wtsi-npg/npg_tracking/blob/master/lib/npg_tracking/glossary/composition.pm
+).
+
+Later a need to have a similar API for other sequencing platforms arose. This
+package implements a Python API. The attributes of objects are sequencing
+platform specific. The generator for the PacBio platform is implemented by the
+`PacBioEntity` class.
+
+Examples of generating IDs for PacBio data:
+
+```
+from npg_id_generation.pac_bio import PacBioEntity
+
+# from a JSON string via a class method
+test_case = '{"run_name": "MARATHON","well_label": "D1"}'
+print(PacBioEntity.parse_raw(test_case, content_type="json").hash_product_id())
+
+# by setting object's attributes
+print(PacBioEntity(run_name="MARATHON", well_label="D1").hash_product_id())
+
+# sample-specific identifier
+# for multiple tags a sorted comma-separated list of tags can be used
+print(PacBioEntity(run_name="MARATHON", well_label="D1", tags="AAGTACGT").hash_product_id())
+``` 
+
+All generators should conform to a few simple rules:
+
+1. Uniqueness of the ID should be guaranteed.
+2. The ID should be a 64-character string.
+3. It should be possible to generate an ID from a JSON string.
+4. The value of the ID should **not** depend on the order of attributes given
+   to the constructor or the order of keys used in JSON.
+5. The value of the ID should **not** depend on the amount of whitespace in
+   the input JSON.
+6. The value of the ID should **not** depend on whether the undefined values
+   of attributes are explicitly set.
+
+The examples below clarify the rules. Objects `o1` - `o6` should generate the same ID.
+
+```
+o1 = PacBioEntity(run_name="r1", well_label="l1")
+o2 = PacBioEntity(run_name="r1", well_label="l1", tags = None)
+o3 = PacBioEntity(well_label="l1", run_name="r1", )
+o4 = PacBioEntity.parse_raw('{"run_name": "r1","well_label": "l1"}', content_type="json")
+o5 = PacBioEntity.parse_raw('{"well_label": "l1",  "run_name": "r1"}', content_type="json")
+o6 = PacBioEntity.parse_raw('{"well_label": "l1","run_name": "r1", "tags": null}', content_type="json")
+```
+
+The algorithm used for generation of identifiers can be replicated in Perl;
+on identical input data it gives identical results. However, we cannot guarantee
+that this parity will always be maintained in future.
diff --git a/bin/generate_pac_bio_id b/bin/generate_pac_bio_id
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2022 Genome Research Ltd. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# @author Michael Kubiak <[email protected]>
+
+import argparse
+from npg_id_generation.pac_bio import PacBioEntity
+
+parser = argparse.ArgumentParser(
+    description="A script to generate a product id for a pac bio product from a given run and well"
+)
+
+parser.add_argument(
+    "run_name", type=str, help="The name of the run to which the product belongs"
+)
+
+parser.add_argument("well_label", type=str, help="The well label")
+
+args = parser.parse_args()
+
+
+def main():
+    print(
+        f"{PacBioEntity(run_name=args.run_name, well_label=args.well_label).hash_product_id()}\n"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/npg_id_generation/__init__.py b/npg_id_generation/__init__.py
@@ -1 +0,0 @@
-from .main import PacBioEntity

diff --git a/npg_id_generation/main.py → npg_id_generation/pac_bio.py b/npg_id_generation/main.py → npg_id_generation/pac_bio.py
@@ -18,22 +18,35 @@
 # this program. If not, see <http://www.gnu.org/licenses/>.
 
 from hashlib import sha256
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Extra, Field, validator
 
 
-class PacBioEntity(BaseModel):
-    """A PacBio entity class used to create hashes."""
+class PacBioEntity(BaseModel, extra=Extra.forbid):
+    """A PacBio entity class for ID generation."""
 
     # Order these alphabetically, to allow for interoperability with
     # a possible Perl API.
     # Alternatively the sorting could be achieved with json.dumps()'s
     # sort_keys argument. See https://docs.python.org/3/library/json.html#basic-usage
-    run_name: str
-    well_label: str
-    tags: str = Field(default=None)
+    run_name: str = Field(title="Pac Bio run name as in LIMS")
+    well_label: str = Field(title="Pac Bio well label")
+    tags: str = Field(
+        default=None,
+        title="A string representing tag or tags",
+        description="""
+        A string representing a single tag (index) sequence or a comma-separated
+        list of multiple tags. It is important to order multiple tags consistently.
+        """,
+    )
+
+    @validator("run_name", "well_label", "tags")
+    def attributes_are_non_empty_strings(cls, v):
+        if (v is not None) and (v == ""):
+            raise ValueError("Cannot be an empty string")
+        return v
 
     def hash_product_id(self):
-        """Generate a sha256sum for the PacBioWell"""
+        """Generate a sha256sum for the PacBio Entity"""
 
         return sha256(
             self.json(exclude_none=True, separators=(",", ":")).encode()