Skip to content

Commit

Permalink
Merge pull request #72 from remiolsen/umi_support
Browse files Browse the repository at this point in the history
Add UMI support
  • Loading branch information
remiolsen authored Feb 13, 2024
2 parents 7c9b87b + 4b027d8 commit 580fc97
Show file tree
Hide file tree
Showing 7 changed files with 118 additions and 28 deletions.
1 change: 1 addition & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
devcontainer.json
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,14 +173,12 @@ In folder `anglerfish_????_??_??_?????/`

## Anglerfish Explore (Experimental)

`anglerfish explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on what adapter types are present, which index lenghts are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running:
`anglerfish-explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on what adapter types are present, which index lenghts are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running, e.g. for help text:

```shell
python anglerfish/explore/cli.py
anglerfish-explore --help
```

inside the anglerfish directory.

## Credits

The Anglerfish code was written by [@remiolsen](https://github.com/remiolsen) but it would not exist without the contributions of [@FranBonath](https://github.com/FranBonath), [@taborsak](https://github.com/taborsak), [@ssjunnebo](https://github.com/ssjunnebo) and Carl Rubin.
Expand Down
23 changes: 14 additions & 9 deletions anglerfish/config/adaptors.yaml
Original file line number Diff line number Diff line change
@@ -1,24 +1,29 @@
# More adaptors can be added manually, following the format below.
# The position of an index within an adaptor is represented by the delimiter "-NNN-".
# The position of an index within an adaptor is represented by the delimiter "<N>".
# The position and length of the UMI within an adaptor is represented by the delimiter "<U#>" where # is the length of the UMI.
# The indexes themselves are represented in the sample sheet.

# Ilumina unique dual indexes, see https://web.archive.org/web/20231129095351/https://support-docs.illumina.com/SHARE/AdapterSequences/Content/SHARE/AdapterSeq/Illumina_DNA/IlluminaUDIndexes.htm
illumina_ud:
i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-TCGTCGGCAGCGTC
i7: CAAGCAGAAGACGGCATACGAGAT-NNN-GTCTCGTGGGCTCGG
i5: AATGATACGGCGACCACCGAGATCTACAC<N>TCGTCGGCAGCGTC
i7: CAAGCAGAAGACGGCATACGAGAT<N>GTCTCGTGGGCTCGG

truseq:
i5: AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC-NNN-ATCTCGTATGCCGTCTTCTGCTTG
i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC<N>ATCTCGTATGCCGTCTTCTGCTTG

truseq_dual:
i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-ACACTCTTTCCCTACACGACGCTCTTCCGATCT
i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC-NNN-ATCTCGTATGCCGTCTTCTGCTTG
i5: AATGATACGGCGACCACCGAGATCTACAC<N>ACACTCTTTCCCTACACGACGCTCTTCCGATCT
i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC<N>ATCTCGTATGCCGTCTTCTGCTTG

truseq_umi:
i5: AATGATACGGCGACCACCGAGATCTACAC<N>ACACTCTTTCCCTACACGACGCTCTTCCGATCT
i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC<N><U9>ATCTCGTATGCCGTCTTCTGCTTG

nextera_legacy:
i5: AATGATACGGCGACCACCGAGATCTACACGCCTCCCTCGCGCCATCAG
i7: CAAGCAGAAGACGGCATACGAGAT-NNN-CGGTCTGCCTTGCCAGCCCGCTCAG
i7: CAAGCAGAAGACGGCATACGAGAT<N>CGGTCTGCCTTGCCAGCCCGCTCAG

nextera_dual:
i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-GTCTCGTGGGCTCGG
i7: CAAGCAGAAGACGGCATACGAGAT-NNN-ATCTCGTATGCCGTCTTCTGCTTG
i5: AATGATACGGCGACCACCGAGATCTACAC<N>GTCTCGTGGGCTCGG
i7: CAAGCAGAAGACGGCATACGAGAT<N>ATCTCGTATGCCGTCTTCTGCTTG
85 changes: 77 additions & 8 deletions anglerfish/demux/adaptor.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,81 @@
import importlib
import os
import re

import yaml

idelim = re.compile(r"\<N\>")
udelim = re.compile(r"(\<U\d+\>)")
ulen = re.compile(r"\<U(\d+)\>")


class Adaptor:
def __init__(self, adaptors, delim, adaptor, i7_index=None, i5_index=None):
self.i7 = AdaptorPart(adaptors[adaptor]["i7"], adaptor, delim, i7_index)
self.i5 = AdaptorPart(adaptors[adaptor]["i5"], adaptor, delim, i5_index)
self.name = f"{adaptor}"
def __init__(self, adaptors, delim, adaptor_type, i7_index=None, i5_index=None):
self.i5 = AdaptorPart(
adaptors[adaptor_type]["i5"], adaptor_type, delim, i5_index
)
self.i7 = AdaptorPart(
adaptors[adaptor_type]["i7"], adaptor_type, delim, i7_index
)
self.i5_index = i5_index
self.i7_index = i7_index
self.i5_umi = re.findall(udelim, self.i5.sequence)
self.i5_umi_before = 0
self.i5_umi_after = 0
self.i7_umi = re.findall(udelim, self.i7.sequence)
self.i7_umi_before = 0
self.i7_umi_after = 0
self.name = f"{adaptor_type}"
self.delim = delim

def get_i7_mask(self, insert_Ns=True):
return self.i7.get_mask(insert_Ns)
if len(self.i5_umi) > 1 or len(self.i7_umi) > 1:
raise UserWarning(
f"Adaptor {adaptor_type} has more than one UMI in either i5 or i7. This is not supported."
)
# Check if UMI is before or after i5 index
if len(self.i5_umi) > 0 and ">" + self.i5_umi[0] in self.i5.sequence:
self.i5_umi_after = int(re.search(ulen, self.i5_umi[0]).group(1))
elif len(self.i5_umi) > 0 and self.i5_umi[0] + "<" in self.i5.sequence:
self.i5_umi_before = int(re.search(ulen, self.i5_umi[0]).group(1))
elif len(self.i5_umi) > 0:
raise UserWarning(
f"Adaptor {adaptor_type} has UMI but it does not flank an index. This is not supported."
)
# Check if UMI is before or after i7 index
if len(self.i7_umi) > 0 and ">" + self.i7_umi[0] in self.i7.sequence:
self.i7_umi_after = int(re.search(ulen, self.i7_umi[0]).group(1))
elif len(self.i7_umi) > 0 and self.i7_umi[0] + "<" in self.i7.sequence:
self.i7_umi_before = int(re.search(ulen, self.i7_umi[0]).group(1))
elif len(self.i7_umi) > 0:
raise UserWarning(
f"Adaptor {adaptor_type} has UMI but it does not flank an index. This is not supported."
)

def get_i5_mask(self, insert_Ns=True):
return self.i5.get_mask(insert_Ns)
ilen = len(self.i5_index) if self.i5_index is not None and insert_Ns else 0
ulen = max(self.i5_umi_after, self.i5_umi_before) if insert_Ns else 0
# Test if the index is specified in the adaptor sequence when it shouldn't be
if has_match(idelim, self.i5.sequence) and self.i5_index is None and insert_Ns:
raise UserWarning("Adaptor has i5 but no sequence was specified")
if self.i5_index is not None or not insert_Ns:
new_i5 = re.sub(idelim, "N" * ilen, self.i5.sequence)
new_i5 = re.sub(udelim, "N" * ulen, new_i5)
return new_i5
else:
return self.i5.sequence

def get_i7_mask(self, insert_Ns=True):
ilen = len(self.i7_index) if self.i7_index is not None and insert_Ns else 0
ulen = max(self.i7_umi_after, self.i7_umi_before) if insert_Ns else 0
# Test if the index is specified in the adaptor sequence when it shouldn't be
if has_match(idelim, self.i7.sequence) and self.i7_index is None and insert_Ns:
raise UserWarning("Adaptor has i7 but no sequence was specified")
if self.i7_index is not None or not insert_Ns:
new_i7 = re.sub(idelim, "N" * ilen, self.i7.sequence)
new_i7 = re.sub(udelim, "N" * ulen, new_i7)
return new_i7
else:
return self.i7.sequence

def get_fastastring(self, insert_Ns=True):
fasta_i5 = f">{self.name}_i5\n{self.get_i5_mask(insert_Ns)}\n"
Expand All @@ -24,6 +84,7 @@ def get_fastastring(self, insert_Ns=True):


class AdaptorPart:
# This class is used either the i5 or i7 adaptor
def __init__(self, sequence, name, delim, index):
self.sequence = sequence
self.name = name
Expand All @@ -49,6 +110,14 @@ def get_mask(self, insert_Ns):
return self.sequence


# General function to check if a string contains a pattern
def has_match(delim, seq):
match = re.search(delim, seq)
if match is None:
return False
return True


# Fetch all adaptors
def load_adaptors(raw=False):
p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml")
Expand All @@ -63,7 +132,7 @@ def load_adaptors(raw=False):
adaptors = []
for adaptor in adaptors_raw:
adaptors.append(
Adaptor(adaptors_raw, "-NNN-", adaptor, i7_index=None, i5_index=None)
Adaptor(adaptors_raw, "N", adaptor, i7_index=None, i5_index=None)
)

return adaptors
24 changes: 18 additions & 6 deletions anglerfish/demux/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,16 @@
log = logging.getLogger("anglerfish")


def parse_cs(cs_string, index, max_distance):
def parse_cs(cs_string, index, umi_before=0, umi_after=0):
"""
Parses the CS string of a paf alignment and matches it to the given index using a max Levenshtein distance
TODO / idea: Do something big-brained with ONT squigglies
"""
nt = re.compile("\*n([atcg])")
nts = "".join(re.findall(nt, cs_string))

if umi_before > 0:
nts = nts[umi_before:]
if umi_after > 0:
nts = nts[:-umi_after]
# Allow for mismatches
return nts, lev.distance(index.lower(), nts)

Expand Down Expand Up @@ -53,7 +55,7 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=1):
subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True)


def parse_paf_lines(paf, min_qual=10, complex_identifier=False):
def parse_paf_lines(paf, min_qual=1, complex_identifier=False):
"""
Read and parse one paf alignment lines.
Returns a dict with the import values for later use
Expand Down Expand Up @@ -185,14 +187,24 @@ def cluster_matches(
i5_seq = adaptor.i5.index
if i5_reversed and i5_seq is not None:
i5_seq = str(Seq(i5_seq).reverse_complement())
fi5, d1 = parse_cs(i5["cs"], i5_seq, max_distance)
fi5, d1 = parse_cs(
i5["cs"],
i5_seq,
adaptor.i5_umi_before,
adaptor.i5_umi_after,
)
except AttributeError:
d1 = 0 # presumably it's single index, so no i5

i7_seq = adaptor.i7.index
if i7_reversed and i7_seq is not None:
i7_seq = str(Seq(i7_seq).reverse_complement())
fi7, d2 = parse_cs(i7["cs"], i7_seq, max_distance)
fi7, d2 = parse_cs(
i7["cs"],
i7_seq,
adaptor.i7_umi_before,
adaptor.i7_umi_after,
)
dists.append(d1 + d2)

index_min = min(range(len(dists)), key=dists.__getitem__)
Expand Down
6 changes: 5 additions & 1 deletion anglerfish/demux/samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@

from anglerfish.demux.adaptor import Adaptor, load_adaptors

delim = "-NNN-"
idelim = re.compile(r"\<N\>")
udelim = re.compile(r"(\<U\d+\>)")
ulen = re.compile(r"\<U(\d+)\>")
adaptors = load_adaptors(raw=True)
# This is some leftover ugliness from a merge conflict to reconcile the old and new adaptor classes
delim = "<N>"


@dataclass
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
entry_points={
"console_scripts": [
"anglerfish=anglerfish.anglerfish:anglerfish",
"anglerfish-explore=anglerfish.explore.cli:main",
],
},
zip_safe=False,
Expand Down

0 comments on commit 580fc97

Please sign in to comment.