Merge pull request #72 from remiolsen/umi_support

Add UMI support
NationalGenomicsInfrastructure · Feb 13, 2024 · 580fc97 · 580fc97
2 parents 7c9b87b + 4b027d8
commit 580fc97
Show file tree

Hide file tree

Showing 7 changed files with 118 additions and 28 deletions.
diff --git a/.prettierignore b/.prettierignore
@@ -0,0 +1 @@
+devcontainer.json
diff --git a/README.md b/README.md
@@ -173,14 +173,12 @@ In folder `anglerfish_????_??_??_?????/`
 
 ## Anglerfish Explore (Experimental)
 
-`anglerfish explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on what adapter types are present, which index lenghts are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running:
+`anglerfish-explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on what adapter types are present, which index lenghts are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running, e.g. for help text:
 
 ```shell
-python anglerfish/explore/cli.py
+anglerfish-explore --help
 ```
 
-inside the anglerfish directory.
-
 ## Credits
 
 The Anglerfish code was written by [@remiolsen](https://github.com/remiolsen) but it would not exist without the contributions of [@FranBonath](https://github.com/FranBonath), [@taborsak](https://github.com/taborsak), [@ssjunnebo](https://github.com/ssjunnebo) and Carl Rubin.

diff --git a/anglerfish/config/adaptors.yaml b/anglerfish/config/adaptors.yaml
@@ -1,24 +1,29 @@
 # More adaptors can be added manually, following the format below.
-# The position of an index within an adaptor is represented by the delimiter "-NNN-".
+# The position of an index within an adaptor is represented by the delimiter "<N>".
+# The position and length of the UMI within an adaptor is represented by the delimiter "<U#>" where # is the length of the UMI.
 # The indexes themselves are represented in the sample sheet.
 
 # Ilumina unique dual indexes, see https://web.archive.org/web/20231129095351/https://support-docs.illumina.com/SHARE/AdapterSequences/Content/SHARE/AdapterSeq/Illumina_DNA/IlluminaUDIndexes.htm
 illumina_ud:
-  i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-TCGTCGGCAGCGTC
-  i7: CAAGCAGAAGACGGCATACGAGAT-NNN-GTCTCGTGGGCTCGG
+  i5: AATGATACGGCGACCACCGAGATCTACAC<N>TCGTCGGCAGCGTC
+  i7: CAAGCAGAAGACGGCATACGAGAT<N>GTCTCGTGGGCTCGG
 
 truseq:
   i5: AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
-  i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC-NNN-ATCTCGTATGCCGTCTTCTGCTTG
+  i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC<N>ATCTCGTATGCCGTCTTCTGCTTG
 
 truseq_dual:
-  i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-ACACTCTTTCCCTACACGACGCTCTTCCGATCT
-  i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC-NNN-ATCTCGTATGCCGTCTTCTGCTTG
+  i5: AATGATACGGCGACCACCGAGATCTACAC<N>ACACTCTTTCCCTACACGACGCTCTTCCGATCT
+  i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC<N>ATCTCGTATGCCGTCTTCTGCTTG
+
+truseq_umi:
+  i5: AATGATACGGCGACCACCGAGATCTACAC<N>ACACTCTTTCCCTACACGACGCTCTTCCGATCT
+  i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC<N><U9>ATCTCGTATGCCGTCTTCTGCTTG
 
 nextera_legacy:
   i5: AATGATACGGCGACCACCGAGATCTACACGCCTCCCTCGCGCCATCAG
-  i7: CAAGCAGAAGACGGCATACGAGAT-NNN-CGGTCTGCCTTGCCAGCCCGCTCAG
+  i7: CAAGCAGAAGACGGCATACGAGAT<N>CGGTCTGCCTTGCCAGCCCGCTCAG
 
 nextera_dual:
-  i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-GTCTCGTGGGCTCGG
-  i7: CAAGCAGAAGACGGCATACGAGAT-NNN-ATCTCGTATGCCGTCTTCTGCTTG
+  i5: AATGATACGGCGACCACCGAGATCTACAC<N>GTCTCGTGGGCTCGG
+  i7: CAAGCAGAAGACGGCATACGAGAT<N>ATCTCGTATGCCGTCTTCTGCTTG
diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py
@@ -1,21 +1,81 @@
 import importlib
 import os
+import re
 
 import yaml
 
+idelim = re.compile(r"\<N\>")
+udelim = re.compile(r"(\<U\d+\>)")
+ulen = re.compile(r"\<U(\d+)\>")
+
 
 class Adaptor:
-    def __init__(self, adaptors, delim, adaptor, i7_index=None, i5_index=None):
-        self.i7 = AdaptorPart(adaptors[adaptor]["i7"], adaptor, delim, i7_index)
-        self.i5 = AdaptorPart(adaptors[adaptor]["i5"], adaptor, delim, i5_index)
-        self.name = f"{adaptor}"
+    def __init__(self, adaptors, delim, adaptor_type, i7_index=None, i5_index=None):
+        self.i5 = AdaptorPart(
+            adaptors[adaptor_type]["i5"], adaptor_type, delim, i5_index
+        )
+        self.i7 = AdaptorPart(
+            adaptors[adaptor_type]["i7"], adaptor_type, delim, i7_index
+        )
+        self.i5_index = i5_index
+        self.i7_index = i7_index
+        self.i5_umi = re.findall(udelim, self.i5.sequence)
+        self.i5_umi_before = 0
+        self.i5_umi_after = 0
+        self.i7_umi = re.findall(udelim, self.i7.sequence)
+        self.i7_umi_before = 0
+        self.i7_umi_after = 0
+        self.name = f"{adaptor_type}"
         self.delim = delim
 
-    def get_i7_mask(self, insert_Ns=True):
-        return self.i7.get_mask(insert_Ns)
+        if len(self.i5_umi) > 1 or len(self.i7_umi) > 1:
+            raise UserWarning(
+                f"Adaptor {adaptor_type} has more than one UMI in either i5 or i7. This is not supported."
+            )
+        # Check if UMI is before or after i5 index
+        if len(self.i5_umi) > 0 and ">" + self.i5_umi[0] in self.i5.sequence:
+            self.i5_umi_after = int(re.search(ulen, self.i5_umi[0]).group(1))
+        elif len(self.i5_umi) > 0 and self.i5_umi[0] + "<" in self.i5.sequence:
+            self.i5_umi_before = int(re.search(ulen, self.i5_umi[0]).group(1))
+        elif len(self.i5_umi) > 0:
+            raise UserWarning(
+                f"Adaptor {adaptor_type} has UMI but it does not flank an index. This is not supported."
+            )
+        # Check if UMI is before or after i7 index
+        if len(self.i7_umi) > 0 and ">" + self.i7_umi[0] in self.i7.sequence:
+            self.i7_umi_after = int(re.search(ulen, self.i7_umi[0]).group(1))
+        elif len(self.i7_umi) > 0 and self.i7_umi[0] + "<" in self.i7.sequence:
+            self.i7_umi_before = int(re.search(ulen, self.i7_umi[0]).group(1))
+        elif len(self.i7_umi) > 0:
+            raise UserWarning(
+                f"Adaptor {adaptor_type} has UMI but it does not flank an index. This is not supported."
+            )
 
     def get_i5_mask(self, insert_Ns=True):
-        return self.i5.get_mask(insert_Ns)
+        ilen = len(self.i5_index) if self.i5_index is not None and insert_Ns else 0
+        ulen = max(self.i5_umi_after, self.i5_umi_before) if insert_Ns else 0
+        # Test if the index is specified in the adaptor sequence when it shouldn't be
+        if has_match(idelim, self.i5.sequence) and self.i5_index is None and insert_Ns:
+            raise UserWarning("Adaptor has i5 but no sequence was specified")
+        if self.i5_index is not None or not insert_Ns:
+            new_i5 = re.sub(idelim, "N" * ilen, self.i5.sequence)
+            new_i5 = re.sub(udelim, "N" * ulen, new_i5)
+            return new_i5
+        else:
+            return self.i5.sequence
+
+    def get_i7_mask(self, insert_Ns=True):
+        ilen = len(self.i7_index) if self.i7_index is not None and insert_Ns else 0
+        ulen = max(self.i7_umi_after, self.i7_umi_before) if insert_Ns else 0
+        # Test if the index is specified in the adaptor sequence when it shouldn't be
+        if has_match(idelim, self.i7.sequence) and self.i7_index is None and insert_Ns:
+            raise UserWarning("Adaptor has i7 but no sequence was specified")
+        if self.i7_index is not None or not insert_Ns:
+            new_i7 = re.sub(idelim, "N" * ilen, self.i7.sequence)
+            new_i7 = re.sub(udelim, "N" * ulen, new_i7)
+            return new_i7
+        else:
+            return self.i7.sequence
 
     def get_fastastring(self, insert_Ns=True):
         fasta_i5 = f">{self.name}_i5\n{self.get_i5_mask(insert_Ns)}\n"
@@ -24,6 +84,7 @@ def get_fastastring(self, insert_Ns=True):
 
 
 class AdaptorPart:
+    # This class is used either the i5 or i7 adaptor
     def __init__(self, sequence, name, delim, index):
         self.sequence = sequence
         self.name = name
@@ -49,6 +110,14 @@ def get_mask(self, insert_Ns):
             return self.sequence
 
 
+# General function to check if a string contains a pattern
+def has_match(delim, seq):
+    match = re.search(delim, seq)
+    if match is None:
+        return False
+    return True
+
+
 # Fetch all adaptors
 def load_adaptors(raw=False):
     p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml")
@@ -63,7 +132,7 @@ def load_adaptors(raw=False):
     adaptors = []
     for adaptor in adaptors_raw:
         adaptors.append(
-            Adaptor(adaptors_raw, "-NNN-", adaptor, i7_index=None, i5_index=None)
+            Adaptor(adaptors_raw, "N", adaptor, i7_index=None, i5_index=None)
         )
 
     return adaptors
diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py
@@ -12,14 +12,16 @@
 log = logging.getLogger("anglerfish")
 
 
-def parse_cs(cs_string, index, max_distance):
+def parse_cs(cs_string, index, umi_before=0, umi_after=0):
     """
     Parses the CS string of a paf alignment and matches it to the given index using a max Levenshtein distance
-    TODO / idea: Do something big-brained with ONT squigglies
     """
     nt = re.compile("\*n([atcg])")
     nts = "".join(re.findall(nt, cs_string))
-
+    if umi_before > 0:
+        nts = nts[umi_before:]
+    if umi_after > 0:
+        nts = nts[:-umi_after]
     # Allow for mismatches
     return nts, lev.distance(index.lower(), nts)
 
@@ -53,7 +55,7 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=1):
         subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True)
 
 
-def parse_paf_lines(paf, min_qual=10, complex_identifier=False):
+def parse_paf_lines(paf, min_qual=1, complex_identifier=False):
     """
     Read and parse one paf alignment lines.
     Returns a dict with the import values for later use
@@ -185,14 +187,24 @@ def cluster_matches(
                 i5_seq = adaptor.i5.index
                 if i5_reversed and i5_seq is not None:
                     i5_seq = str(Seq(i5_seq).reverse_complement())
-                fi5, d1 = parse_cs(i5["cs"], i5_seq, max_distance)
+                fi5, d1 = parse_cs(
+                    i5["cs"],
+                    i5_seq,
+                    adaptor.i5_umi_before,
+                    adaptor.i5_umi_after,
+                )
             except AttributeError:
                 d1 = 0  # presumably it's single index, so no i5
 
             i7_seq = adaptor.i7.index
             if i7_reversed and i7_seq is not None:
                 i7_seq = str(Seq(i7_seq).reverse_complement())
-            fi7, d2 = parse_cs(i7["cs"], i7_seq, max_distance)
+            fi7, d2 = parse_cs(
+                i7["cs"],
+                i7_seq,
+                adaptor.i7_umi_before,
+                adaptor.i7_umi_after,
+            )
             dists.append(d1 + d2)
 
         index_min = min(range(len(dists)), key=dists.__getitem__)

diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py
@@ -8,8 +8,12 @@
 
 from anglerfish.demux.adaptor import Adaptor, load_adaptors
 
-delim = "-NNN-"
+idelim = re.compile(r"\<N\>")
+udelim = re.compile(r"(\<U\d+\>)")
+ulen = re.compile(r"\<U(\d+)\>")
 adaptors = load_adaptors(raw=True)
+# This is some leftover ugliness from a merge conflict to reconcile the old and new adaptor classes
+delim = "<N>"
 
 
 @dataclass

diff --git a/setup.py b/setup.py
@@ -38,6 +38,7 @@
     entry_points={
         "console_scripts": [
             "anglerfish=anglerfish.anglerfish:anglerfish",
+            "anglerfish-explore=anglerfish.explore.cli:main",
         ],
     },
     zip_safe=False,