From 739701a8e27fada45596365493939eb4c71e5e79 Mon Sep 17 00:00:00 2001 From: Keith James Date: Mon, 20 May 2024 15:02:31 +0100 Subject: [PATCH] Fix and simplify the Illumina split_name function --- src/npg_irods/illumina.py | 39 +++++-------- tests/illumina/test_metadata_update.py | 76 +++++++++++++++++++++++++- 2 files changed, 87 insertions(+), 28 deletions(-) diff --git a/src/npg_irods/illumina.py b/src/npg_irods/illumina.py index 2d9ebc7..1d23cf5 100644 --- a/src/npg_irods/illumina.py +++ b/src/npg_irods/illumina.py @@ -275,17 +275,21 @@ def split_name(name: str) -> tuple[str, str]: Extending this method to handle new types of file: If your new file can be handled by the pathlib API then you do not need to extend - the capabilities of this function. Otherwise, you will need to add an additional - regular expression to parse the stem from the file name. + the capabilities of this function. Otherwise, you will need to add a regular + expression to parse the stem from the file name. """ - # Handle this form: + # The dot separator used in the stem is not consistent across all files. + n1 = re.sub(r"_F0x", ".F0x", name) + normalised = re.sub(r"_quality_", ".quality_", n1) + + # Handle this form (normalised): # - # 9930555.ACXX.paired158.550b751b96_F0x900.stats - # 9930555.ACXX.paired158.550b751b96_F0xB00.stats - # 9930555.ACXX.paired158.550b751b96_F0xF04_target.stats - if match := re.match(r"(\d+\.\w+\.\w+\.\w+)(_F0x\d+.*)$", name): + # 9930555.ACXX.paired158.550b751b96.F0x900.stats + # 9930555.ACXX.paired158.550b751b96.F0xB00.stats + # 9930555.ACXX.paired158.550b751b96.F0xF04_target.stats + if match := re.match(r"(\d+\.\w+\.\w+\.\w+)(\.F0x[A-F0-9]+.*)$", normalised): stem, suffixes = match.groups() # Handle this form: @@ -295,26 +299,9 @@ def split_name(name: str) -> tuple[str, str]: elif match := re.match(r"(\d+\.\w+\.\w+\.\w+)(.*)$", name): stem, suffixes = match.groups() - # Handle this form: - # - # [prefix]_F0x900.stats - # [prefix]_F0xB00.stats - # [prefix]_F0xF04_target.stats - # [prefix]_F0xF04_target_autosome.stats - elif match := re.match(r"([\w#]+)(_F0x\d+.*)$", name): - stem, suffixes = match.groups() - - # Handle this form: - # - # [prefix]_quality_cycle_caltable.txt - # [prefix]_quality_cycle_surv.txt - # [prefix]_quality_error.txt - elif match := re.match(r"([\w#]+)(_quality_\.txt)$", name): - stem, suffixes = match.groups() - else: - p = PurePath(name) - stem = without_suffixes(p) + p = PurePath(normalised) + stem = without_suffixes(p).as_posix() suffixes = "".join(p.suffixes) return stem, suffixes diff --git a/tests/illumina/test_metadata_update.py b/tests/illumina/test_metadata_update.py index ddd0ee0..c22f948 100644 --- a/tests/illumina/test_metadata_update.py +++ b/tests/illumina/test_metadata_update.py @@ -21,7 +21,7 @@ from pytest import mark as m from helpers import history_in_meta -from npg_irods.illumina import Component, ensure_secondary_metadata_updated +from npg_irods.illumina import Component, ensure_secondary_metadata_updated, split_name from npg_irods.metadata.common import SeqConcept, SeqSubset from npg_irods.metadata.lims import ( TrackedSample, @@ -31,7 +31,7 @@ ) -class TestIlluminaComponent: +class TestIlluminaAPI: @m.context("When a component AVU is available") @m.it("Can be used to construct a Component") def test_make_component_from_avu(self): @@ -57,6 +57,78 @@ def test_make_component_from_avu(self): assert c.tag_index == 1 assert c.subset == SeqSubset.HUMAN + @m.context("When parsing names of Illumina data objects") + @m.it("Can split the name into a prefix and a suffix") + def test_split_name(self): + for base in [ + "12345", # expt, not multiplexed + "12345_phix", # control, not multiplexed + "12345#1", # expt, multiplexed + "12345_phix#1", # control, multiplexed + "12345_1#1", # expt with lane, multiplexed + "12345_phix_1#1", # control with lane, multiplexed + ]: + assert split_name(f"{base}.cram") == (base, ".cram") + assert split_name(f"{base}.cram.crai") == (base, ".cram.crai") + assert split_name(f"{base}_F0x900.stats") == (base, ".F0x900.stats") + assert split_name(f"{base}_F0xB00.stats") == (base, ".F0xB00.stats") + assert split_name(f"{base}_F0xF04_target.stats") == ( + base, + ".F0xF04_target.stats", + ) + assert split_name(f"{base}_F0xF04_target_autosome.stats") == ( + base, + ".F0xF04_target_autosome.stats", + ) + assert split_name(f"{base}_F0xB00.samtools_stats.json") == ( + base, + ".F0xB00.samtools_stats.json", + ) + assert split_name(f"{base}_quality_cycle_caltable.txt") == ( + base, + ".quality_cycle_caltable.txt", + ) + assert split_name(f"{base}_quality_cycle_surv.txt") == ( + base, + ".quality_cycle_surv.txt", + ) + assert split_name(f"{base}_quality_error.txt") == ( + base, + ".quality_error.txt", + ) + + @m.context("When parsing names of library-merged Illumina data objects") + @m.it("Can split the name into a prefix and a suffix") + def test_split_name_library_merge(self): + assert split_name("9930555.ACXX.paired158.550b751b96.cram") == ( + "9930555.ACXX.paired158.550b751b96", + ".cram", + ) + assert split_name("9930555.ACXX.paired158.550b751b96.cram.crai") == ( + "9930555.ACXX.paired158.550b751b96", + ".cram.crai", + ) + assert split_name("9930555.ACXX.paired158.550b751b96_F0x900.stats") == ( + "9930555.ACXX.paired158.550b751b96", + ".F0x900.stats", + ) + assert split_name("9930555.ACXX.paired158.550b751b96_F0xB00.stats") == ( + "9930555.ACXX.paired158.550b751b96", + ".F0xB00.stats", + ) + assert split_name("9930555.ACXX.paired158.550b751b96_F0xF04_target.stats") == ( + "9930555.ACXX.paired158.550b751b96", + ".F0xF04_target.stats", + ) + assert split_name("9930555.ACXX.paired158.550b751b96.flagstat") == ( + "9930555.ACXX.paired158.550b751b96", + ".flagstat", + ) + assert split_name("9930555.ACXX.paired158.550b751b96.g.vcf.gz") == ( + "9930555.ACXX.paired158.550b751b96", + ".g.vcf.gz", + ) + class TestIlluminaMetadataUpdate: @m.context("When the data are not multiplexed")