objectify alignment
kedhammar committed Jun 14, 2024
1 parent b9c3fdd commit fcc034a
Showing 4 changed files with 178 additions and 204 deletions.
11 changes: 6 additions & 5 deletions anglerfish/anglerfish.py
@@ -15,9 +15,10 @@

from .demux.adaptor import Adaptor
from .demux.demux import (
Alignment,
categorize_matches,
cluster_matches,
layout_matches,
parse_paf_lines,
parse_reads_alns_from_paf,
run_minimap2,
write_demuxedfastq,
)
@@ -138,12 +139,12 @@ def run_demux(args):
for i in f:
num_fq += 1
num_fq = int(num_fq / 4)
paf_entries = parse_paf_lines(align_path)
reads_alns: dict[str, list[Alignment]] = parse_reads_alns_from_paf(align_path)

# Make stats
log.info(f" Searching for adaptor hits in {adaptor_bc_name}")
fragments, singletons, concats, unknowns = layout_matches(
adaptor_name + "_i5", adaptor_name + "_i7", paf_entries
fragments, singletons, concats, unknowns = categorize_matches(
adaptor_name + "_i5", adaptor_name + "_i7", reads_alns
)
stats = AlignmentStat(adaptor_bc_name)
stats.compute_pafstats(num_fq, fragments, singletons, concats, unknowns)
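
To illustrate the renamed API, here is a minimal sketch of the new call sequence, mirroring run_demux above; the PAF path and adaptor name are hypothetical placeholders:

from anglerfish.demux.demux import (
    Alignment,
    categorize_matches,
    parse_reads_alns_from_paf,
)

align_path = "alignments/truseq.paf"  # placeholder path
adaptor_name = "truseq"               # placeholder adaptor name

# Map each read name to its list of Alignment objects...
reads_alns: dict[str, list[Alignment]] = parse_reads_alns_from_paf(align_path)

# ...then bin reads by how their i5/i7 adaptor hits are laid out.
fragments, singletons, concats, unknowns = categorize_matches(
    adaptor_name + "_i5", adaptor_name + "_i7", reads_alns
)
print(f"{len(fragments)} fragments, {len(singletons)} singletons")
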
202 changes: 111 additions & 91 deletions anglerfish/demux/demux.py
@@ -75,83 +75,103 @@ def run_minimap2(
subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True)


def parse_paf_lines(
paf_path: str, min_qual: int = 1, complex_identifier: bool = False
) -> dict[str, list[dict]]:
class Alignment:
"""
Read and parse one paf alignment lines.
Returns a dict with the import values for later use.
Class instantiated from one paf alignment entry.
If complex_identifier is True (default False), the keys will be on the form
"{read}_{i5_or_i7}_{strand_str}".
Retains the important values for later use.
"""
entries: dict = {}

def __init__(self, paf_line: str):
paf_cols = paf_line.split()

# Unpack cols to vars for type annotation
self.read_name: str = paf_cols[0]
self.adapter_name: str = paf_cols[5]
self.read_len: int = int(paf_cols[1]) # read length
self.read_start: int = int(paf_cols[2]) # start alignment on read
self.read_end: int = int(paf_cols[3]) # end alignment on read
self.read_strand: str = paf_cols[4]
self.cg: str = paf_cols[-2] # cigar string
self.cs: str = paf_cols[-1] # cigar diff string
self.q: int = int(paf_cols[11]) # Q score
self.iseq: str | None = None
self.sample: str | None = None


def parse_reads_alns_from_paf(
paf_path: str, min_qual: int = 1, complex_identifier: bool = False
) -> dict[str, list[Alignment]]:
"""
Parse a .paf file into a dict, mapping reads to their respective alignment objects.
Outputs:
reads_alns = {
"read1": [
aln_read1_adaptor1_i5,
aln_read1_adaptor1_i7,
aln_read1_adaptor2_i5,
...
],
"read2": [
aln_read2_adaptor1_i5,
aln_read2_adaptor1_i7,
aln_read2_adaptor2_i5,
...
],
...
}
complex_identifier = False (default)
--> The keys will be of the form "{read}".
complex_identifier = True
--> The keys will be of the form "{read}_{i5_or_i7}_{strand_str}".
"""
reads_alns: dict = {}
with open(paf_path) as paf:
for paf_line in paf:
paf_cols = paf_line.split()
try:
# TODO: objectify this

# Unpack cols to vars for type annotation
read: str = paf_cols[0]
adapter: str = paf_cols[5]
rlen: int = int(paf_cols[1]) # read length
rstart: int = int(paf_cols[2]) # start alignment on read
rend: int = int(paf_cols[3]) # end alignment on read
strand: str = paf_cols[4]
cg: str = paf_cols[-2] # cigar string
cs: str = paf_cols[-1] # cigar diff string
q: int = int(paf_cols[11]) # Q score
iseq: str | None = None
sample: str | None = None
# Parse paf line into Alignment object
aln = Alignment(paf_line)

# Determine identifier
if complex_identifier:
i5_or_i7 = adapter.split("_")[-1]
if strand == "+":
strand_str = "positive"
else:
strand_str = "negative"
key = f"{read}_{i5_or_i7}_{strand_str}"
else:
key = read
i5_or_i7 = aln.adapter_name.split("_")[-1]
strand_str = "positive" if aln.read_strand == "+" else "negative"
key = (
f"{aln.read_name}_{i5_or_i7}_{strand_str}"
if complex_identifier
else aln.read_name
)

except IndexError:
log.debug(f"Could not find all paf columns: {read}")
log.debug(f"Could not find all paf columns: {paf_line.strip()}")
continue

if q < min_qual:
log.debug(f"Low quality alignment: {read}")
if aln.q < min_qual:
log.debug(f"Low quality alignment: {aln.read_name}")
continue

# Compile entry
entry = {
"read": read,
"adapter": adapter,
"rlen": rlen,
"rstart": rstart,
"rend": rend,
"strand": strand,
"cg": cg,
"cs": cs,
"q": q,
"iseq": iseq,
"sample": sample,
}

if key in entries.keys():
entries[key].append(entry)
if key in reads_alns.keys():
reads_alns[key].append(aln)
else:
entries[key] = [entry]
reads_alns[key] = [aln]

return entries
return reads_alns
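
For reference, here is a sketch of how one PAF entry maps onto the new Alignment fields. The record below is invented, and it assumes minimap2 output in which the cg and cs tags are the last two columns:

paf_line = (
    "read1 1050 12 36 + truseq_i7 24 0 24 22 24 60 "
    "NM:i:2 cg:Z:24M cs:Z::24"
)
aln = Alignment(paf_line)

assert aln.read_name == "read1"         # col 1: read name
assert aln.read_len == 1050             # col 2: read length
assert aln.read_start == 12             # col 3: alignment start on read
assert aln.read_end == 36               # col 4: alignment end on read
assert aln.read_strand == "+"           # col 5: strand
assert aln.adapter_name == "truseq_i7"  # col 6: adaptor (target) name
assert aln.q == 60                      # col 12: mapping quality
assert aln.cg == "cg:Z:24M"             # second-to-last col: cigar tag
assert aln.cs == "cs:Z::24"             # last col: cs difference tag
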


def layout_matches(
i5_name: str, i7_name: str, paf_entries: dict[str, list[dict]]
) -> tuple[dict, dict, dict, dict]:
def categorize_matches(
i5_name: str, i7_name: str, reads_alns: dict[str, list[Alignment]]
) -> tuple[
dict[str, list[Alignment]],
dict[str, list[Alignment]],
dict[str, list[Alignment]],
dict[str, list[Alignment]],
]:
"""
Search the parsed paf alignments and layout possible Illumina library fragments
Search the parsed paf alignments and layout possible Illumina library fragments.
Returns dicts:
- fragments. Reads with one I7 and one I5
- singletons. Reads that only match either I5 or I7 adaptors
@@ -163,33 +183,33 @@ def layout_matches(
singletons = {}
concats = {}
unknowns = {}
for read, entry_list in paf_entries.items():
sorted_entries = []
for k in range(len(entry_list) - 1):
entry_i = entry_list[k]
entry_j = entry_list[k + 1]
for read, alns in reads_alns.items():
sorted_alns = []
for i in range(len(alns) - 1):
aln_i: Alignment = alns[i]
aln_j: Alignment = alns[i + 1]
if (
entry_i["adapter"] != entry_j["adapter"]
and (entry_i["adapter"] == i5_name and entry_j["adapter"] == i7_name)
or (entry_j["adapter"] == i5_name and entry_i["adapter"] == i7_name)
aln_i.adapter_name != aln_j.adapter_name
and (aln_i.adapter_name == i5_name and aln_j.adapter_name == i7_name)
or (aln_j.adapter_name == i5_name and aln_i.adapter_name == i7_name)
):
if entry_i in sorted_entries:
sorted_entries.append(entry_j)
if aln_i in sorted_alns:
sorted_alns.append(aln_j)
else:
sorted_entries.extend([entry_i, entry_j])
if len(entry_list) == 1:
singletons[read] = entry_list
elif len(sorted_entries) == 2:
fragments[read] = sorted(sorted_entries, key=lambda l: l["rstart"])
elif len(sorted_entries) > 2:
concats[read] = sorted(sorted_entries, key=lambda l: l["rstart"])
sorted_alns.extend([aln_i, aln_j])
if len(alns) == 1:
singletons[read] = alns
elif len(sorted_alns) == 2:
fragments[read] = sorted(sorted_alns, key=lambda aln: aln.read_start)
elif len(sorted_alns) > 2:
concats[read] = sorted(sorted_alns, key=lambda aln: aln.read_start)
log.debug(
f"Concatenated fragment: {read}, found: {[(i['adapter'],i['rstart']) for i in sorted_entries]}"
f"Concatenated fragment: {read}, found: {[(aln.adapter_name,aln.read_start) for aln in sorted_alns]}"
)
else:
unknowns[read] = entry_list
unknowns[read] = alns
log.debug(
f"Unknown fragment: {read}, found: {[(i['adapter'],i['rstart']) for i in entry_list]}"
f"Unknown fragment: {read}, found: {[(aln.adapter_name,aln.read_start) for aln in alns]}"
)
# TODO: add minimum insert size
return (fragments, singletons, concats, unknowns)
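
As a sketch of how categorize_matches bins reads, using the class above with invented coordinates (make_aln is a hypothetical helper, not part of the module):

def make_aln(read: str, adapter: str, start: int, end: int) -> Alignment:
    # Build a minimal, well-formed PAF line for illustration only.
    line = f"{read} 500 {start} {end} + {adapter} 24 0 24 22 24 60 cg:Z:24M cs:Z::24"
    return Alignment(line)

reads_alns = {
    # One i5 hit and one i7 hit: a proper fragment.
    "read_a": [make_aln("read_a", "truseq_i5", 5, 30),
               make_aln("read_a", "truseq_i7", 450, 480)],
    # A single adaptor hit: a singleton.
    "read_b": [make_aln("read_b", "truseq_i7", 10, 40)],
}
fragments, singletons, concats, unknowns = categorize_matches(
    "truseq_i5", "truseq_i7", reads_alns
)
assert "read_a" in fragments and "read_b" in singletons
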
@@ -208,19 +228,19 @@ def cluster_matches(
unmatched_bed = []
for read, alignments in matches.items():
if (
alignments[0]["adapter"][-2:] == "i5"
and alignments[1]["adapter"][-2:] == "i7"
alignments[0].adapter_name[-2:] == "i5"
and alignments[1].adapter_name[-2:] == "i7"
):
i5 = alignments[0]
i7 = alignments[1]
i5_aln = alignments[0]
i7_aln = alignments[1]
elif (
alignments[1]["adapter"][-2:] == "i5"
and alignments[0]["adapter"][-2:] == "i7"
alignments[1].adapter_name[-2:] == "i5"
and alignments[0].adapter_name[-2:] == "i7"
):
i5 = alignments[1]
i7 = alignments[0]
i5_aln = alignments[1]
i7_aln = alignments[0]
else:
log.debug(" {read} has no valid illumina fragment")
log.debug(f" {read} has no valid illumina fragment")
continue

dists = []
@@ -232,7 +252,7 @@
if i5_reversed and i5_seq is not None:
i5_seq = str(Seq(i5_seq).reverse_complement())
fi5, d1 = parse_cs(
i5["cs"],
i5_aln.cs,
i5_seq,
adaptor.i5.len_umi_before_index,
adaptor.i5.len_umi_after_index,
@@ -245,7 +265,7 @@
if i7_reversed and i7_seq is not None:
i7_seq = str(Seq(i7_seq).reverse_complement())
fi7, d2 = parse_cs(
i7["cs"],
i7_aln.cs,
i7_seq,
adaptor.i7.len_umi_before_index,
adaptor.i7.len_umi_after_index,
@@ -259,8 +279,8 @@
# Test if two samples in the sheet are equidistant to the i5/i7
if len([i for i, j in enumerate(dists) if j == dists[index_min]]) > 1:
continue
start_insert = min(i5["rend"], i7["rend"])
end_insert = max(i7["rstart"], i5["rstart"])
start_insert = min(i5_aln.read_end, i7_aln.read_end)
end_insert = max(i7_aln.read_start, i5_aln.read_start)
if end_insert - start_insert < 10:
continue
if dists[index_min] > max_distance:
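
The insert boundaries above are taken from the inner edges of the two adaptor hits; a short worked example of the minimum-insert guard, with invented coordinates:

# i5 aligns at read positions 5-30, i7 at 450-480 (invented).
i5_read_start, i5_read_end = 5, 30
i7_read_start, i7_read_end = 450, 480

start_insert = min(i5_read_end, i7_read_end)    # 30, inner edge of the left hit
end_insert = max(i7_read_start, i5_read_start)  # 450, inner edge of the right hit
assert end_insert - start_insert == 420         # >= 10 bp, so the read is kept
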
29 changes: 13 additions & 16 deletions anglerfish/explore/explore.py
@@ -6,7 +6,7 @@
import pandas as pd

from anglerfish.demux.adaptor import Adaptor, load_adaptors
from anglerfish.demux.demux import parse_paf_lines, run_minimap2
from anglerfish.demux.demux import Alignment, parse_reads_alns_from_paf, run_minimap2
from anglerfish.explore.entropy import calculate_relative_entropy

logging.basicConfig(level=logging.INFO)
@@ -44,13 +44,13 @@ def run_explore(
log.info(f"Run uuid {run_uuid}")

adaptors = cast(list[Adaptor], load_adaptors())
alignments: list[tuple[Adaptor, str]] = []
adaptors_and_aln_paths: list[tuple[Adaptor, str]] = []

# Map all reads against all adaptors
for adaptor in adaptors:
# Align
aln_path = os.path.join(outdir, f"{adaptor.name}.paf")
alignments.append((adaptor, aln_path))
adaptors_and_aln_paths.append((adaptor, aln_path))
if os.path.exists(aln_path) and use_existing:
log.info(f"Skipping {adaptor.name} as alignment already exists")
continue
@@ -70,21 +70,18 @@
# Parse alignments
entries: dict = {}
adaptors_included = []
for adaptor, aln_path in alignments:
for adaptor, aln_path in adaptors_and_aln_paths:
log.info(f"Parsing {adaptor.name}")
aln_dict_with_lists: dict[str, list[dict]] = parse_paf_lines(
reads_alns: dict[str, list[Alignment]] = parse_reads_alns_from_paf(
aln_path, complex_identifier=True
)

# Choose only the highest scoring alignment for each combination of read, adaptor end and strand
aln_dict = dict(
[
(aln_name, max(aln_dicts, key=lambda x: x["q"]))
for aln_name, aln_dicts in aln_dict_with_lists.items()
]
)

df = pd.DataFrame.from_dict(aln_dict, orient="index")
reads_to_highest_q_aln_dict: dict[str, dict] = {}
for aln_name, alns in reads_alns.items():
highest_q_aln = max(alns, key=lambda aln: aln.q)
reads_to_highest_q_aln_dict[aln_name] = vars(highest_q_aln)
df = pd.DataFrame.from_dict(reads_to_highest_q_aln_dict, orient="index")
nr_good_hits = {}

# Match Insert Match = mim
@@ -125,7 +122,7 @@
insert_thres_high = insert_thres_high

requirements = (
(df_mim["adapter"] == f"{adaptor.name}_{adaptor_end_name}")
(df_mim["adapter_name"] == f"{adaptor.name}_{adaptor_end_name}")
& (df_mim["match_1_len"] >= (before_thres))
& (df_mim["insert_len"] >= insert_thres_low)
& (df_mim["insert_len"] <= insert_thres_high)
@@ -155,8 +152,8 @@

# Filter multiple hits per read and adaptor end
df_good_hits = df_good_hits.sort_values(
by=["read", "match_1_len"], ascending=False
).drop_duplicates(subset=["read", "adapter"], keep="first")
by=["read_name", "match_1_len"], ascending=False
).drop_duplicates(subset=["read_name", "adapter_name"], keep="first")

if adaptor.name not in entries.keys():
entries[adaptor.name] = {}
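
Because the explore step now builds its DataFrame from vars() on Alignment objects, the columns follow the attribute names (read_name, adapter_name, read_start, ...) rather than the old dict keys. A minimal sketch of the highest-q selection with two toy alignments:

import pandas as pd

alns = [
    Alignment("read1 500 5 30 + truseq_i7 24 0 24 22 24 60 cg:Z:24M cs:Z::24"),
    Alignment("read1 500 6 29 + truseq_i7 24 0 24 20 24 42 cg:Z:24M cs:Z::24"),
]

# Keep only the highest-scoring alignment for the read, then tabulate it.
best = max(alns, key=lambda aln: aln.q)
df = pd.DataFrame.from_dict({"read1": vars(best)}, orient="index")

assert df.loc["read1", "q"] == 60
assert "adapter_name" in df.columns  # renamed from the old "adapter" key
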