Skip to content

Commit

Permalink
Add matching unknown indices to samplesheet
Browse files Browse the repository at this point in the history
  • Loading branch information
remiolsen committed May 14, 2024
1 parent 4e6c8cd commit e43d8f4
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 9 deletions.
39 changes: 35 additions & 4 deletions anglerfish/anglerfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from collections import Counter
from itertools import groupby

import Levenshtein as lev
import numpy as np
import pkg_resources

Expand Down Expand Up @@ -242,11 +243,41 @@ def run_demux(args):

# Top unmatched indexes
nomatch_count = Counter([x[3] for x in no_matches])
if args.max_unknowns is None:
if args.max_unknowns == 0:
args.max_unknowns = len([sample for sample in ss]) + 10
report.add_unmatched_stat(
nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name
)

# We search for the closest sample in the samplesheet to the list of unknowns
top_unknowns = []
for i in nomatch_count.most_common(args.max_unknowns):
sample_dists = [
(
lev.distance(
i[0], f"{x.adaptor.i7_index}+{x.adaptor.i5_index}".lower()
),
x.sample_name,
)
for x in ss
]
closest_sample = min(sample_dists, key=lambda x: x[0])
# If the distance is more than half the index length, we remove it
if closest_sample[0] >= (len(i[0]) / 2) + 1:
closest_sample = (closest_sample[0], None)
else:
# We might have two samples with the same distance
all_min = [
x[1]
for x in sample_dists
if x[0] == closest_sample[0] and x[1] != closest_sample[1]
]
# This list might be too long, so we truncate it
if len(all_min) > 4:
all_min = all_min[:4]
all_min.append("...")
if all_min:
closest_sample = (closest_sample[0], ";".join(all_min))

top_unknowns.append([i[0], i[1], closest_sample[1]])
report.add_unmatched_stat(top_unknowns, ont_barcode, adaptor_name)

# Check if there were samples in the samplesheet without adaptor alignments and add them to report
for entry in ss:
Expand Down
15 changes: 10 additions & 5 deletions anglerfish/demux/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@


class Report:
unmatch_header = ["index", "num_reads", "ont_barcode"]
unmatch_header = ["index", "num_reads", "ont_barcode", "closest_match"]

def __init__(self, run_name, uuid, version):
self.run_name = run_name
Expand Down Expand Up @@ -44,8 +44,8 @@ def write_report(self, outdir):
uhead = getattr(Report, "unmatch_header")
f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab
for key, unmatch in self.unmatched_stats.items():
for idx, mnum in unmatch:
f.write(f"{idx}\t{mnum}\t{key[0]}\n")
for idx, mnum, closest in unmatch:
f.write(f"{idx}\t{mnum}\t{key[0]}\t{closest}\n")
log.debug(
f"Wrote anglerfish_stats.txt to {outdir}, size: {os.path.getsize(os.path.join(outdir, 'anglerfish_stats.txt'))} bytes"
)
Expand Down Expand Up @@ -75,9 +75,14 @@ def write_json(self, outdir):
dict(zip(getattr(SampleStat, "header"), slist))
)
for key, unmatch in self.unmatched_stats.items():
for idx, mnum in unmatch:
for idx, mnum, closest in unmatch:
json_out["undetermined"].append(
dict(zip(getattr(Report, "unmatch_header"), [idx, mnum, key[0]]))
dict(
zip(
getattr(Report, "unmatch_header"),
[idx, mnum, key[0], closest],
)
)
)
with open(os.path.join(outdir, "anglerfish_stats.json"), "w") as f:
f.write(json.dumps(json_out, indent=2, sort_keys=True))
Expand Down

0 comments on commit e43d8f4

Please sign in to comment.