Skip to content

Commit

Permalink
Merge pull request #312 from kjsanger/bug/illumina-split-name
Browse files Browse the repository at this point in the history
Fix and simplify the Illumina split_name function
  • Loading branch information
kjsanger authored May 20, 2024
2 parents 0d12ad5 + 739701a commit f46c0e4
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 28 deletions.
39 changes: 13 additions & 26 deletions src/npg_irods/illumina.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,17 +275,21 @@ def split_name(name: str) -> tuple[str, str]:
Extending this method to handle new types of file:
If your new file can be handled by the pathlib API then you do not need to extend
the capabilities of this function. Otherwise, you will need to add an additional
regular expression to parse the stem from the file name.
the capabilities of this function. Otherwise, you will need to add a regular
expression to parse the stem from the file name.
"""

# Handle this form:
# The separator between the stem and the suffixes is not consistent across all
# files (some use "_" where others use "."), so normalise it to a dot first.
n1 = re.sub(r"_F0x", ".F0x", name)
normalised = re.sub(r"_quality_", ".quality_", n1)

# Handle this form (normalised):
#
# 9930555.ACXX.paired158.550b751b96_F0x900.stats
# 9930555.ACXX.paired158.550b751b96_F0xB00.stats
# 9930555.ACXX.paired158.550b751b96_F0xF04_target.stats
if match := re.match(r"(\d+\.\w+\.\w+\.\w+)(_F0x\d+.*)$", name):
# 9930555.ACXX.paired158.550b751b96.F0x900.stats
# 9930555.ACXX.paired158.550b751b96.F0xB00.stats
# 9930555.ACXX.paired158.550b751b96.F0xF04_target.stats
if match := re.match(r"(\d+\.\w+\.\w+\.\w+)(\.F0x[A-F0-9]+.*)$", normalised):
stem, suffixes = match.groups()

# Handle this form:
Expand All @@ -295,26 +299,9 @@ def split_name(name: str) -> tuple[str, str]:
elif match := re.match(r"(\d+\.\w+\.\w+\.\w+)(.*)$", name):
stem, suffixes = match.groups()

# Handle this form:
#
# [prefix]_F0x900.stats
# [prefix]_F0xB00.stats
# [prefix]_F0xF04_target.stats
# [prefix]_F0xF04_target_autosome.stats
elif match := re.match(r"([\w#]+)(_F0x\d+.*)$", name):
stem, suffixes = match.groups()

# Handle this form:
#
# [prefix]_quality_cycle_caltable.txt
# [prefix]_quality_cycle_surv.txt
# [prefix]_quality_error.txt
elif match := re.match(r"([\w#]+)(_quality_\.txt)$", name):
stem, suffixes = match.groups()

else:
p = PurePath(name)
stem = without_suffixes(p)
p = PurePath(normalised)
stem = without_suffixes(p).as_posix()
suffixes = "".join(p.suffixes)

return stem, suffixes
Expand Down
76 changes: 74 additions & 2 deletions tests/illumina/test_metadata_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from pytest import mark as m

from helpers import history_in_meta
from npg_irods.illumina import Component, ensure_secondary_metadata_updated
from npg_irods.illumina import Component, ensure_secondary_metadata_updated, split_name
from npg_irods.metadata.common import SeqConcept, SeqSubset
from npg_irods.metadata.lims import (
TrackedSample,
Expand All @@ -31,7 +31,7 @@
)


class TestIlluminaComponent:
class TestIlluminaAPI:
@m.context("When a component AVU is available")
@m.it("Can be used to construct a Component")
def test_make_component_from_avu(self):
Expand All @@ -57,6 +57,78 @@ def test_make_component_from_avu(self):
assert c.tag_index == 1
assert c.subset == SeqSubset.HUMAN

@m.context("When parsing names of Illumina data objects")
@m.it("Can split the name into a prefix and a suffix")
def test_split_name(self):
    """Check split_name for every stem variant combined with every suffix form.

    Each case pairs the suffix as it appears in the data object name with the
    suffix that split_name is expected to return alongside the unchanged stem.
    """
    stems = (
        "12345",  # expt, not multiplexed
        "12345_phix",  # control, not multiplexed
        "12345#1",  # expt, multiplexed
        "12345_phix#1",  # control, multiplexed
        "12345_1#1",  # expt with lane, multiplexed
        "12345_phix_1#1",  # control with lane, multiplexed
    )

    # (suffix as written in the object name, suffix expected from split_name)
    cases = (
        (".cram", ".cram"),
        (".cram.crai", ".cram.crai"),
        ("_F0x900.stats", ".F0x900.stats"),
        ("_F0xB00.stats", ".F0xB00.stats"),
        ("_F0xF04_target.stats", ".F0xF04_target.stats"),
        ("_F0xF04_target_autosome.stats", ".F0xF04_target_autosome.stats"),
        ("_F0xB00.samtools_stats.json", ".F0xB00.samtools_stats.json"),
        ("_quality_cycle_caltable.txt", ".quality_cycle_caltable.txt"),
        ("_quality_cycle_surv.txt", ".quality_cycle_surv.txt"),
        ("_quality_error.txt", ".quality_error.txt"),
    )

    for stem in stems:
        for written, expected in cases:
            assert split_name(f"{stem}{written}") == (stem, expected)

@m.context("When parsing names of library-merged Illumina data objects")
@m.it("Can split the name into a prefix and a suffix")
def test_split_name_library_merge(self):
    """Check split_name on library-merge style names.

    Every name below shares the same four-part, dot-separated stem; only the
    trailing suffixes differ.
    """
    merged_stem = "9930555.ACXX.paired158.550b751b96"

    # Data object name -> suffixes expected from split_name, in checking order.
    expected_suffixes = {
        "9930555.ACXX.paired158.550b751b96.cram": ".cram",
        "9930555.ACXX.paired158.550b751b96.cram.crai": ".cram.crai",
        "9930555.ACXX.paired158.550b751b96_F0x900.stats": ".F0x900.stats",
        "9930555.ACXX.paired158.550b751b96_F0xB00.stats": ".F0xB00.stats",
        "9930555.ACXX.paired158.550b751b96_F0xF04_target.stats": (
            ".F0xF04_target.stats"
        ),
        "9930555.ACXX.paired158.550b751b96.flagstat": ".flagstat",
        "9930555.ACXX.paired158.550b751b96.g.vcf.gz": ".g.vcf.gz",
    }

    for name, suffixes in expected_suffixes.items():
        assert split_name(name) == (merged_stem, suffixes)


class TestIlluminaMetadataUpdate:
@m.context("When the data are not multiplexed")
Expand Down

0 comments on commit f46c0e4

Please sign in to comment.