diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..73fcfa6 --- /dev/null +++ b/.prettierignore @@ -0,0 +1 @@ +devcontainer.json diff --git a/README.md b/README.md index be02c73..63bd962 100644 --- a/README.md +++ b/README.md @@ -173,14 +173,12 @@ In folder `anglerfish_????_??_??_?????/` ## Anglerfish Explore (Experimental) -`anglerfish explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on what adapter types are present, which index lenghts are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running: +`anglerfish-explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on what adapter types are present, which index lenghts are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running, e.g. for help text: ```shell -python anglerfish/explore/cli.py +anglerfish-explore --help ``` -inside the anglerfish directory. - ## Credits The Anglerfish code was written by [@remiolsen](https://github.com/remiolsen) but it would not exist without the contributions of [@FranBonath](https://github.com/FranBonath), [@taborsak](https://github.com/taborsak), [@ssjunnebo](https://github.com/ssjunnebo) and Carl Rubin. diff --git a/anglerfish/config/adaptors.yaml b/anglerfish/config/adaptors.yaml index e7a1380..510b3e9 100644 --- a/anglerfish/config/adaptors.yaml +++ b/anglerfish/config/adaptors.yaml @@ -1,24 +1,29 @@ # More adaptors can be added manually, following the format below. -# The position of an index within an adaptor is represented by the delimiter "-NNN-". +# The position of an index within an adaptor is represented by the delimiter "". +# The position and length of the UMI within an adaptor is represented by the delimiter "" where # is the length of the UMI. # The indexes themselves are represented in the sample sheet. # Ilumina unique dual indexes, see https://web.archive.org/web/20231129095351/https://support-docs.illumina.com/SHARE/AdapterSequences/Content/SHARE/AdapterSeq/Illumina_DNA/IlluminaUDIndexes.htm illumina_ud: - i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-TCGTCGGCAGCGTC - i7: CAAGCAGAAGACGGCATACGAGAT-NNN-GTCTCGTGGGCTCGG + i5: AATGATACGGCGACCACCGAGATCTACACTCGTCGGCAGCGTC + i7: CAAGCAGAAGACGGCATACGAGATGTCTCGTGGGCTCGG truseq: i5: AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT - i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC-NNN-ATCTCGTATGCCGTCTTCTGCTTG + i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCTCGTATGCCGTCTTCTGCTTG truseq_dual: - i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-ACACTCTTTCCCTACACGACGCTCTTCCGATCT - i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC-NNN-ATCTCGTATGCCGTCTTCTGCTTG + i5: AATGATACGGCGACCACCGAGATCTACACACACTCTTTCCCTACACGACGCTCTTCCGATCT + i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCTCGTATGCCGTCTTCTGCTTG + +truseq_umi: + i5: AATGATACGGCGACCACCGAGATCTACACACACTCTTTCCCTACACGACGCTCTTCCGATCT + i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCTCGTATGCCGTCTTCTGCTTG nextera_legacy: i5: AATGATACGGCGACCACCGAGATCTACACGCCTCCCTCGCGCCATCAG - i7: CAAGCAGAAGACGGCATACGAGAT-NNN-CGGTCTGCCTTGCCAGCCCGCTCAG + i7: CAAGCAGAAGACGGCATACGAGATCGGTCTGCCTTGCCAGCCCGCTCAG nextera_dual: - i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-GTCTCGTGGGCTCGG - i7: CAAGCAGAAGACGGCATACGAGAT-NNN-ATCTCGTATGCCGTCTTCTGCTTG + i5: AATGATACGGCGACCACCGAGATCTACACGTCTCGTGGGCTCGG + i7: CAAGCAGAAGACGGCATACGAGATATCTCGTATGCCGTCTTCTGCTTG diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index 8db7b66..69e7ea1 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -1,21 +1,81 @@ import importlib import os +import re import yaml +idelim = re.compile(r"\") +udelim = re.compile(r"(\)") +ulen = re.compile(r"\") + class Adaptor: - def __init__(self, adaptors, delim, adaptor, i7_index=None, i5_index=None): - self.i7 = AdaptorPart(adaptors[adaptor]["i7"], adaptor, delim, i7_index) - self.i5 = AdaptorPart(adaptors[adaptor]["i5"], adaptor, delim, i5_index) - self.name = f"{adaptor}" + def __init__(self, adaptors, delim, adaptor_type, i7_index=None, i5_index=None): + self.i5 = AdaptorPart( + adaptors[adaptor_type]["i5"], adaptor_type, delim, i5_index + ) + self.i7 = AdaptorPart( + adaptors[adaptor_type]["i7"], adaptor_type, delim, i7_index + ) + self.i5_index = i5_index + self.i7_index = i7_index + self.i5_umi = re.findall(udelim, self.i5.sequence) + self.i5_umi_before = 0 + self.i5_umi_after = 0 + self.i7_umi = re.findall(udelim, self.i7.sequence) + self.i7_umi_before = 0 + self.i7_umi_after = 0 + self.name = f"{adaptor_type}" self.delim = delim - def get_i7_mask(self, insert_Ns=True): - return self.i7.get_mask(insert_Ns) + if len(self.i5_umi) > 1 or len(self.i7_umi) > 1: + raise UserWarning( + f"Adaptor {adaptor_type} has more than one UMI in either i5 or i7. This is not supported." + ) + # Check if UMI is before or after i5 index + if len(self.i5_umi) > 0 and ">" + self.i5_umi[0] in self.i5.sequence: + self.i5_umi_after = int(re.search(ulen, self.i5_umi[0]).group(1)) + elif len(self.i5_umi) > 0 and self.i5_umi[0] + "<" in self.i5.sequence: + self.i5_umi_before = int(re.search(ulen, self.i5_umi[0]).group(1)) + elif len(self.i5_umi) > 0: + raise UserWarning( + f"Adaptor {adaptor_type} has UMI but it does not flank an index. This is not supported." + ) + # Check if UMI is before or after i7 index + if len(self.i7_umi) > 0 and ">" + self.i7_umi[0] in self.i7.sequence: + self.i7_umi_after = int(re.search(ulen, self.i7_umi[0]).group(1)) + elif len(self.i7_umi) > 0 and self.i7_umi[0] + "<" in self.i7.sequence: + self.i7_umi_before = int(re.search(ulen, self.i7_umi[0]).group(1)) + elif len(self.i7_umi) > 0: + raise UserWarning( + f"Adaptor {adaptor_type} has UMI but it does not flank an index. This is not supported." + ) def get_i5_mask(self, insert_Ns=True): - return self.i5.get_mask(insert_Ns) + ilen = len(self.i5_index) if self.i5_index is not None and insert_Ns else 0 + ulen = max(self.i5_umi_after, self.i5_umi_before) if insert_Ns else 0 + # Test if the index is specified in the adaptor sequence when it shouldn't be + if has_match(idelim, self.i5.sequence) and self.i5_index is None and insert_Ns: + raise UserWarning("Adaptor has i5 but no sequence was specified") + if self.i5_index is not None or not insert_Ns: + new_i5 = re.sub(idelim, "N" * ilen, self.i5.sequence) + new_i5 = re.sub(udelim, "N" * ulen, new_i5) + return new_i5 + else: + return self.i5.sequence + + def get_i7_mask(self, insert_Ns=True): + ilen = len(self.i7_index) if self.i7_index is not None and insert_Ns else 0 + ulen = max(self.i7_umi_after, self.i7_umi_before) if insert_Ns else 0 + # Test if the index is specified in the adaptor sequence when it shouldn't be + if has_match(idelim, self.i7.sequence) and self.i7_index is None and insert_Ns: + raise UserWarning("Adaptor has i7 but no sequence was specified") + if self.i7_index is not None or not insert_Ns: + new_i7 = re.sub(idelim, "N" * ilen, self.i7.sequence) + new_i7 = re.sub(udelim, "N" * ulen, new_i7) + return new_i7 + else: + return self.i7.sequence def get_fastastring(self, insert_Ns=True): fasta_i5 = f">{self.name}_i5\n{self.get_i5_mask(insert_Ns)}\n" @@ -24,6 +84,7 @@ def get_fastastring(self, insert_Ns=True): class AdaptorPart: + # This class is used either the i5 or i7 adaptor def __init__(self, sequence, name, delim, index): self.sequence = sequence self.name = name @@ -49,6 +110,14 @@ def get_mask(self, insert_Ns): return self.sequence +# General function to check if a string contains a pattern +def has_match(delim, seq): + match = re.search(delim, seq) + if match is None: + return False + return True + + # Fetch all adaptors def load_adaptors(raw=False): p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml") @@ -63,7 +132,7 @@ def load_adaptors(raw=False): adaptors = [] for adaptor in adaptors_raw: adaptors.append( - Adaptor(adaptors_raw, "-NNN-", adaptor, i7_index=None, i5_index=None) + Adaptor(adaptors_raw, "N", adaptor, i7_index=None, i5_index=None) ) return adaptors diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index db43cac..f6220ea 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -12,14 +12,16 @@ log = logging.getLogger("anglerfish") -def parse_cs(cs_string, index, max_distance): +def parse_cs(cs_string, index, umi_before=0, umi_after=0): """ Parses the CS string of a paf alignment and matches it to the given index using a max Levenshtein distance - TODO / idea: Do something big-brained with ONT squigglies """ nt = re.compile("\*n([atcg])") nts = "".join(re.findall(nt, cs_string)) - + if umi_before > 0: + nts = nts[umi_before:] + if umi_after > 0: + nts = nts[:-umi_after] # Allow for mismatches return nts, lev.distance(index.lower(), nts) @@ -53,7 +55,7 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=1): subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True) -def parse_paf_lines(paf, min_qual=10, complex_identifier=False): +def parse_paf_lines(paf, min_qual=1, complex_identifier=False): """ Read and parse one paf alignment lines. Returns a dict with the import values for later use @@ -185,14 +187,24 @@ def cluster_matches( i5_seq = adaptor.i5.index if i5_reversed and i5_seq is not None: i5_seq = str(Seq(i5_seq).reverse_complement()) - fi5, d1 = parse_cs(i5["cs"], i5_seq, max_distance) + fi5, d1 = parse_cs( + i5["cs"], + i5_seq, + adaptor.i5_umi_before, + adaptor.i5_umi_after, + ) except AttributeError: d1 = 0 # presumably it's single index, so no i5 i7_seq = adaptor.i7.index if i7_reversed and i7_seq is not None: i7_seq = str(Seq(i7_seq).reverse_complement()) - fi7, d2 = parse_cs(i7["cs"], i7_seq, max_distance) + fi7, d2 = parse_cs( + i7["cs"], + i7_seq, + adaptor.i7_umi_before, + adaptor.i7_umi_after, + ) dists.append(d1 + d2) index_min = min(range(len(dists)), key=dists.__getitem__) diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index b62c53f..f01c9cc 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -8,8 +8,12 @@ from anglerfish.demux.adaptor import Adaptor, load_adaptors -delim = "-NNN-" +idelim = re.compile(r"\") +udelim = re.compile(r"(\)") +ulen = re.compile(r"\") adaptors = load_adaptors(raw=True) +# This is some leftover ugliness from a merge conflict to reconcile the old and new adaptor classes +delim = "" @dataclass diff --git a/setup.py b/setup.py index 0b50d3f..3cf86dd 100644 --- a/setup.py +++ b/setup.py @@ -38,6 +38,7 @@ entry_points={ "console_scripts": [ "anglerfish=anglerfish.anglerfish:anglerfish", + "anglerfish-explore=anglerfish.explore.cli:main", ], }, zip_safe=False,