From 739701a8e27fada45596365493939eb4c71e5e79 Mon Sep 17 00:00:00 2001
From: Keith James <kdj@sanger.ac.uk>
Date: Mon, 20 May 2024 15:02:31 +0100
Subject: [PATCH] Fix and simplify the Illumina split_name function

---
 src/npg_irods/illumina.py              | 39 +++++--------
 tests/illumina/test_metadata_update.py | 76 +++++++++++++++++++++++++-
 2 files changed, 87 insertions(+), 28 deletions(-)

diff --git a/src/npg_irods/illumina.py b/src/npg_irods/illumina.py
index 2d9ebc7..1d23cf5 100644
--- a/src/npg_irods/illumina.py
+++ b/src/npg_irods/illumina.py
@@ -275,17 +275,21 @@ def split_name(name: str) -> tuple[str, str]:
     Extending this method to handle new types of file:
 
     If your new file can be handled by the pathlib API then you do not need to extend
-    the capabilities of this function. Otherwise, you will need to add an additional
-    regular expression to parse the stem from the file name.
+    the capabilities of this function. Otherwise, you will need to add a regular
+    expression to parse the stem from the file name.
 
     """
 
-    # Handle this form:
+    # The separator after the stem varies between files ("_" or "."); normalise to ".".
+    n1 = re.sub(r"_F0x", ".F0x", name)
+    normalised = re.sub(r"_quality_", ".quality_", n1)
+
+    # Handle this form (normalised):
     #
-    # 9930555.ACXX.paired158.550b751b96_F0x900.stats
-    # 9930555.ACXX.paired158.550b751b96_F0xB00.stats
-    # 9930555.ACXX.paired158.550b751b96_F0xF04_target.stats
-    if match := re.match(r"(\d+\.\w+\.\w+\.\w+)(_F0x\d+.*)$", name):
+    # 9930555.ACXX.paired158.550b751b96.F0x900.stats
+    # 9930555.ACXX.paired158.550b751b96.F0xB00.stats
+    # 9930555.ACXX.paired158.550b751b96.F0xF04_target.stats
+    if match := re.match(r"(\d+\.\w+\.\w+\.\w+)(\.F0x[A-F0-9]+.*)$", normalised):
         stem, suffixes = match.groups()
 
     # Handle this form:
@@ -295,26 +299,9 @@ def split_name(name: str) -> tuple[str, str]:
     elif match := re.match(r"(\d+\.\w+\.\w+\.\w+)(.*)$", name):
         stem, suffixes = match.groups()
 
-    # Handle this form:
-    #
-    # [prefix]_F0x900.stats
-    # [prefix]_F0xB00.stats
-    # [prefix]_F0xF04_target.stats
-    # [prefix]_F0xF04_target_autosome.stats
-    elif match := re.match(r"([\w#]+)(_F0x\d+.*)$", name):
-        stem, suffixes = match.groups()
-
-    # Handle this form:
-    #
-    # [prefix]_quality_cycle_caltable.txt
-    # [prefix]_quality_cycle_surv.txt
-    # [prefix]_quality_error.txt
-    elif match := re.match(r"([\w#]+)(_quality_\.txt)$", name):
-        stem, suffixes = match.groups()
-
     else:
-        p = PurePath(name)
-        stem = without_suffixes(p)
+        p = PurePath(normalised)
+        stem = without_suffixes(p).as_posix()
         suffixes = "".join(p.suffixes)
 
     return stem, suffixes
diff --git a/tests/illumina/test_metadata_update.py b/tests/illumina/test_metadata_update.py
index ddd0ee0..c22f948 100644
--- a/tests/illumina/test_metadata_update.py
+++ b/tests/illumina/test_metadata_update.py
@@ -21,7 +21,7 @@
 from pytest import mark as m
 
 from helpers import history_in_meta
-from npg_irods.illumina import Component, ensure_secondary_metadata_updated
+from npg_irods.illumina import Component, ensure_secondary_metadata_updated, split_name
 from npg_irods.metadata.common import SeqConcept, SeqSubset
 from npg_irods.metadata.lims import (
     TrackedSample,
@@ -31,7 +31,7 @@
 )
 
 
-class TestIlluminaComponent:
+class TestIlluminaAPI:
     @m.context("When a component AVU is available")
     @m.it("Can be used to construct a Component")
     def test_make_component_from_avu(self):
@@ -57,6 +57,78 @@ def test_make_component_from_avu(self):
         assert c.tag_index == 1
         assert c.subset == SeqSubset.HUMAN
 
+    @m.context("When parsing names of Illumina data objects")
+    @m.it("Can split the name into a prefix and a suffix")
+    def test_split_name(self):
+        for base in [
+            "12345",  # expt, not multiplexed
+            "12345_phix",  # control, not multiplexed
+            "12345#1",  # expt, multiplexed
+            "12345_phix#1",  # control, multiplexed
+            "12345_1#1",  # expt with lane, multiplexed
+            "12345_phix_1#1",  # control with lane, multiplexed
+        ]:
+            assert split_name(f"{base}.cram") == (base, ".cram")
+            assert split_name(f"{base}.cram.crai") == (base, ".cram.crai")
+            assert split_name(f"{base}_F0x900.stats") == (base, ".F0x900.stats")
+            assert split_name(f"{base}_F0xB00.stats") == (base, ".F0xB00.stats")
+            assert split_name(f"{base}_F0xF04_target.stats") == (
+                base,
+                ".F0xF04_target.stats",
+            )
+            assert split_name(f"{base}_F0xF04_target_autosome.stats") == (
+                base,
+                ".F0xF04_target_autosome.stats",
+            )
+            assert split_name(f"{base}_F0xB00.samtools_stats.json") == (
+                base,
+                ".F0xB00.samtools_stats.json",
+            )
+            assert split_name(f"{base}_quality_cycle_caltable.txt") == (
+                base,
+                ".quality_cycle_caltable.txt",
+            )
+            assert split_name(f"{base}_quality_cycle_surv.txt") == (
+                base,
+                ".quality_cycle_surv.txt",
+            )
+            assert split_name(f"{base}_quality_error.txt") == (
+                base,
+                ".quality_error.txt",
+            )
+
+    @m.context("When parsing names of library-merged Illumina data objects")
+    @m.it("Can split the name into a prefix and a suffix")
+    def test_split_name_library_merge(self):
+        assert split_name("9930555.ACXX.paired158.550b751b96.cram") == (
+            "9930555.ACXX.paired158.550b751b96",
+            ".cram",
+        )
+        assert split_name("9930555.ACXX.paired158.550b751b96.cram.crai") == (
+            "9930555.ACXX.paired158.550b751b96",
+            ".cram.crai",
+        )
+        assert split_name("9930555.ACXX.paired158.550b751b96_F0x900.stats") == (
+            "9930555.ACXX.paired158.550b751b96",
+            ".F0x900.stats",
+        )
+        assert split_name("9930555.ACXX.paired158.550b751b96_F0xB00.stats") == (
+            "9930555.ACXX.paired158.550b751b96",
+            ".F0xB00.stats",
+        )
+        assert split_name("9930555.ACXX.paired158.550b751b96_F0xF04_target.stats") == (
+            "9930555.ACXX.paired158.550b751b96",
+            ".F0xF04_target.stats",
+        )
+        assert split_name("9930555.ACXX.paired158.550b751b96.flagstat") == (
+            "9930555.ACXX.paired158.550b751b96",
+            ".flagstat",
+        )
+        assert split_name("9930555.ACXX.paired158.550b751b96.g.vcf.gz") == (
+            "9930555.ACXX.paired158.550b751b96",
+            ".g.vcf.gz",
+        )
+
 
 class TestIlluminaMetadataUpdate:
     @m.context("When the data are not multiplexed")