Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add UMI support #72

Merged
merged 11 commits into from
Feb 13, 2024
1 change: 1 addition & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
devcontainer.json
23 changes: 14 additions & 9 deletions anglerfish/config/adaptors.yaml
Original file line number Diff line number Diff line change
@@ -1,24 +1,29 @@
# More adaptors can be added manually, following the format below.
# The position of an index within an adaptor is represented by the delimiter "-NNN-".
# The position of an index within an adaptor is represented by the delimiter "<N>".
# The position and length of the UMI within an adaptor is represented by the delimiter "<U#>" where # is the length of the UMI.
# The indexes themselves are represented in the sample sheet.

# Ilumina unique dual indexes, see https://web.archive.org/web/20231129095351/https://support-docs.illumina.com/SHARE/AdapterSequences/Content/SHARE/AdapterSeq/Illumina_DNA/IlluminaUDIndexes.htm
illumina_ud:
i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-TCGTCGGCAGCGTC
i7: CAAGCAGAAGACGGCATACGAGAT-NNN-GTCTCGTGGGCTCGG
i5: AATGATACGGCGACCACCGAGATCTACAC<N>TCGTCGGCAGCGTC
i7: CAAGCAGAAGACGGCATACGAGAT<N>GTCTCGTGGGCTCGG

truseq:
i5: AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC-NNN-ATCTCGTATGCCGTCTTCTGCTTG
i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC<N>ATCTCGTATGCCGTCTTCTGCTTG

truseq_dual:
i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-ACACTCTTTCCCTACACGACGCTCTTCCGATCT
i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC-NNN-ATCTCGTATGCCGTCTTCTGCTTG
i5: AATGATACGGCGACCACCGAGATCTACAC<N>ACACTCTTTCCCTACACGACGCTCTTCCGATCT
i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC<N>ATCTCGTATGCCGTCTTCTGCTTG

truseq_umi:
i5: AATGATACGGCGACCACCGAGATCTACAC<N>ACACTCTTTCCCTACACGACGCTCTTCCGATCT
i7: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC<N><U9>ATCTCGTATGCCGTCTTCTGCTTG

nextera_legacy:
i5: AATGATACGGCGACCACCGAGATCTACACGCCTCCCTCGCGCCATCAG
i7: CAAGCAGAAGACGGCATACGAGAT-NNN-CGGTCTGCCTTGCCAGCCCGCTCAG
i7: CAAGCAGAAGACGGCATACGAGAT<N>CGGTCTGCCTTGCCAGCCCGCTCAG

nextera_dual:
i5: AATGATACGGCGACCACCGAGATCTACAC-NNN-GTCTCGTGGGCTCGG
i7: CAAGCAGAAGACGGCATACGAGAT-NNN-ATCTCGTATGCCGTCTTCTGCTTG
i5: AATGATACGGCGACCACCGAGATCTACAC<N>GTCTCGTGGGCTCGG
i7: CAAGCAGAAGACGGCATACGAGAT<N>ATCTCGTATGCCGTCTTCTGCTTG
85 changes: 77 additions & 8 deletions anglerfish/demux/adaptor.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,81 @@
import importlib
import os
import re

import yaml

idelim = re.compile(r"\<N\>")
udelim = re.compile(r"(\<U\d+\>)")
ulen = re.compile(r"\<U(\d+)\>")


class Adaptor:
def __init__(self, adaptors, delim, adaptor, i7_index=None, i5_index=None):
self.i7 = AdaptorPart(adaptors[adaptor]["i7"], adaptor, delim, i7_index)
self.i5 = AdaptorPart(adaptors[adaptor]["i5"], adaptor, delim, i5_index)
self.name = f"{adaptor}"
def __init__(self, adaptors, delim, adaptor_type, i7_index=None, i5_index=None):
self.i5 = AdaptorPart(
adaptors[adaptor_type]["i5"], adaptor_type, delim, i5_index
)
self.i7 = AdaptorPart(
adaptors[adaptor_type]["i7"], adaptor_type, delim, i7_index
)
self.i5_index = i5_index
self.i7_index = i7_index
self.i5_umi = re.findall(udelim, self.i5.sequence)
self.i5_umi_before = 0
self.i5_umi_after = 0
self.i7_umi = re.findall(udelim, self.i7.sequence)
self.i7_umi_before = 0
self.i7_umi_after = 0
self.name = f"{adaptor_type}"
self.delim = delim

def get_i7_mask(self, insert_Ns=True):
return self.i7.get_mask(insert_Ns)
if len(self.i5_umi) > 1 or len(self.i7_umi) > 1:
raise UserWarning(
f"Adaptor {adaptor_type} has more than one UMI in either i5 or i7. This is not supported."
)
# Check if UMI is before or after i5 index
if len(self.i5_umi) > 0 and ">" + self.i5_umi[0] in self.i5.sequence:
self.i5_umi_after = int(re.search(ulen, self.i5_umi[0]).group(1))
elif len(self.i5_umi) > 0 and self.i5_umi[0] + "<" in self.i5.sequence:
self.i5_umi_before = int(re.search(ulen, self.i5_umi[0]).group(1))
elif len(self.i5_umi) > 0:
raise UserWarning(
f"Adaptor {adaptor_type} has UMI but it does not flank an index. This is not supported."
)
# Check if UMI is before or after i7 index
if len(self.i7_umi) > 0 and ">" + self.i7_umi[0] in self.i7.sequence:
self.i7_umi_after = int(re.search(ulen, self.i7_umi[0]).group(1))
elif len(self.i7_umi) > 0 and self.i7_umi[0] + "<" in self.i7.sequence:
self.i7_umi_before = int(re.search(ulen, self.i7_umi[0]).group(1))
elif len(self.i7_umi) > 0:
raise UserWarning(
f"Adaptor {adaptor_type} has UMI but it does not flank an index. This is not supported."
)

def get_i5_mask(self, insert_Ns=True):
return self.i5.get_mask(insert_Ns)
ilen = len(self.i5_index) if self.i5_index is not None and insert_Ns else 0
ulen = max(self.i5_umi_after, self.i5_umi_before) if insert_Ns else 0
# Test if the index is specified in the adaptor sequence when it shouldn't be
if has_match(idelim, self.i5.sequence) and self.i5_index is None and insert_Ns:
raise UserWarning("Adaptor has i5 but no sequence was specified")
if self.i5_index is not None or not insert_Ns:
new_i5 = re.sub(idelim, "N" * ilen, self.i5.sequence)
new_i5 = re.sub(udelim, "N" * ulen, new_i5)
return new_i5
else:
return self.i5.sequence

def get_i7_mask(self, insert_Ns=True):
ilen = len(self.i7_index) if self.i7_index is not None and insert_Ns else 0
ulen = max(self.i7_umi_after, self.i7_umi_before) if insert_Ns else 0
# Test if the index is specified in the adaptor sequence when it shouldn't be
if has_match(idelim, self.i7.sequence) and self.i7_index is None and insert_Ns:
raise UserWarning("Adaptor has i7 but no sequence was specified")
if self.i7_index is not None or not insert_Ns:
new_i7 = re.sub(idelim, "N" * ilen, self.i7.sequence)
new_i7 = re.sub(udelim, "N" * ulen, new_i7)
return new_i7
else:
return self.i7.sequence

def get_fastastring(self, insert_Ns=True):
fasta_i5 = f">{self.name}_i5\n{self.get_i5_mask(insert_Ns)}\n"
Expand All @@ -24,6 +84,7 @@ def get_fastastring(self, insert_Ns=True):


class AdaptorPart:
# This class is used either the i5 or i7 adaptor
def __init__(self, sequence, name, delim, index):
self.sequence = sequence
self.name = name
Expand All @@ -49,6 +110,14 @@ def get_mask(self, insert_Ns):
return self.sequence


# General function to check if a string contains a pattern
def has_match(delim, seq):
match = re.search(delim, seq)
if match is None:
return False
return True


# Fetch all adaptors
def load_adaptors(raw=False):
p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml")
Expand All @@ -63,7 +132,7 @@ def load_adaptors(raw=False):
adaptors = []
for adaptor in adaptors_raw:
adaptors.append(
Adaptor(adaptors_raw, "-NNN-", adaptor, i7_index=None, i5_index=None)
Adaptor(adaptors_raw, "N", adaptor, i7_index=None, i5_index=None)
)

return adaptors
24 changes: 18 additions & 6 deletions anglerfish/demux/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,16 @@
log = logging.getLogger("anglerfish")


def parse_cs(cs_string, index, max_distance):
def parse_cs(cs_string, index, umi_before=0, umi_after=0):
"""
Parses the CS string of a paf alignment and matches it to the given index using a max Levenshtein distance
TODO / idea: Do something big-brained with ONT squigglies
"""
nt = re.compile("\*n([atcg])")
nts = "".join(re.findall(nt, cs_string))

if umi_before > 0:
nts = nts[umi_before:]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this really work? If there are substitutions/deletions/insertions in the alignment both the reference and query nucleotide will be in the cs string right?

For example:
cg:Z:6M2D21M cs:Z::1*at:2*ac:1-ac:21
Or am I missing something?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It works if you add the correct number of N's into the template sequence. So we need to know that the index region is e.g., exactly 10 nt index + 9 nt UMI long. That way we can read the sequence directly from the CS string. Unrelated example:

cs:Z::33*nt*nc*nc*ng*ng*na*ng*na:9

Insertions and deletions will of course affect this perfect picture, but I don't think too terribly.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair enough. I think we could use the fact that we're looking specifically for cases where there is an n to the left inside the *na* section. But we can leave that for later

if umi_after > 0:
nts = nts[:-umi_after]
# Allow for mismatches
return nts, lev.distance(index.lower(), nts)

Expand Down Expand Up @@ -53,7 +55,7 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=1):
subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True)


def parse_paf_lines(paf, min_qual=10, complex_identifier=False):
def parse_paf_lines(paf, min_qual=1, complex_identifier=False):
"""
Read and parse one paf alignment lines.
Returns a dict with the import values for later use
Expand Down Expand Up @@ -185,14 +187,24 @@ def cluster_matches(
i5_seq = adaptor.i5.index
if i5_reversed and i5_seq is not None:
i5_seq = str(Seq(i5_seq).reverse_complement())
fi5, d1 = parse_cs(i5["cs"], i5_seq, max_distance)
fi5, d1 = parse_cs(
i5["cs"],
i5_seq,
adaptor.i5_umi_before,
adaptor.i5_umi_after,
)
except AttributeError:
d1 = 0 # presumably it's single index, so no i5

i7_seq = adaptor.i7.index
if i7_reversed and i7_seq is not None:
i7_seq = str(Seq(i7_seq).reverse_complement())
fi7, d2 = parse_cs(i7["cs"], i7_seq, max_distance)
fi7, d2 = parse_cs(
i7["cs"],
i7_seq,
adaptor.i7_umi_before,
adaptor.i7_umi_after,
)
dists.append(d1 + d2)

index_min = min(range(len(dists)), key=dists.__getitem__)
Expand Down
6 changes: 5 additions & 1 deletion anglerfish/demux/samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@

from anglerfish.demux.adaptor import Adaptor, load_adaptors

delim = "-NNN-"
idelim = re.compile(r"\<N\>")
udelim = re.compile(r"(\<U\d+\>)")
ulen = re.compile(r"\<U(\d+)\>")
adaptors = load_adaptors(raw=True)
# This is some leftover ugliness from a merge conflict to reconcile the old and new adaptor classes
delim = "<N>"


@dataclass
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
entry_points={
"console_scripts": [
"anglerfish=anglerfish.anglerfish:anglerfish",
"anglerfish-explore=anglerfish.explore.cli:main",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice! Although I don't know what the effect of this is. Does it make the explore command executable or add it to the path?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With conda, and I guess with most package managers, both! So now this will work:

% anglerfish-explore --help
Usage: anglerfish-explore [OPTIONS]

Options:
  -f, --fastq TEXT                Fastq file to align  [required]
...

I'm thinking maybe for a 1.0 release we can unify these commands. I'm open to having them as subcommands, eg. anglerfish run and anglerfish explore

],
},
zip_safe=False,
Expand Down
Loading