From e43d8f4b1cca82c561e2fb6fe417b7df3df0eb09 Mon Sep 17 00:00:00 2001 From: Remi-Andre Olsen Date: Tue, 14 May 2024 13:20:02 +0200 Subject: [PATCH] Add matching unknown indices to samplesheet --- anglerfish/anglerfish.py | 39 ++++++++++++++++++++++++++++++++++---- anglerfish/demux/report.py | 15 ++++++++++----- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py index bf206a6..20b8096 100755 --- a/anglerfish/anglerfish.py +++ b/anglerfish/anglerfish.py @@ -9,6 +9,7 @@ from collections import Counter from itertools import groupby +import Levenshtein as lev import numpy as np import pkg_resources @@ -242,11 +243,41 @@ def run_demux(args): # Top unmatched indexes nomatch_count = Counter([x[3] for x in no_matches]) - if args.max_unknowns is None: + if args.max_unknowns == 0: args.max_unknowns = len([sample for sample in ss]) + 10 - report.add_unmatched_stat( - nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name - ) + + # We search for the closest sample in the samplesheet to the list of unknowns + top_unknowns = [] + for i in nomatch_count.most_common(args.max_unknowns): + sample_dists = [ + ( + lev.distance( + i[0], f"{x.adaptor.i7_index}+{x.adaptor.i5_index}".lower() + ), + x.sample_name, + ) + for x in ss + ] + closest_sample = min(sample_dists, key=lambda x: x[0]) + # If the distance is more than half the index length, we remove it + if closest_sample[0] >= (len(i[0]) / 2) + 1: + closest_sample = (closest_sample[0], None) + else: + # We might have two samples with the same distance + all_min = [ + x[1] + for x in sample_dists + if x[0] == closest_sample[0] and x[1] != closest_sample[1] + ] + # This list might be too long, so we truncate it + if len(all_min) > 4: + all_min = all_min[:4] + all_min.append("...") + if all_min: + closest_sample = (closest_sample[0], ";".join(all_min)) + + top_unknowns.append([i[0], i[1], closest_sample[1]]) + report.add_unmatched_stat(top_unknowns, ont_barcode, adaptor_name) # Check if there were samples in the samplesheet without adaptor alignments and add them to report for entry in ss: diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py index 46923e0..ec433e7 100644 --- a/anglerfish/demux/report.py +++ b/anglerfish/demux/report.py @@ -8,7 +8,7 @@ class Report: - unmatch_header = ["index", "num_reads", "ont_barcode"] + unmatch_header = ["index", "num_reads", "ont_barcode", "closest_match"] def __init__(self, run_name, uuid, version): self.run_name = run_name @@ -44,8 +44,8 @@ def write_report(self, outdir): uhead = getattr(Report, "unmatch_header") f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab for key, unmatch in self.unmatched_stats.items(): - for idx, mnum in unmatch: - f.write(f"{idx}\t{mnum}\t{key[0]}\n") + for idx, mnum, closest in unmatch: + f.write(f"{idx}\t{mnum}\t{key[0]}\t{closest}\n") log.debug( f"Wrote anglerfish_stats.txt to {outdir}, size: {os.path.getsize(os.path.join(outdir, 'anglerfish_stats.txt'))} bytes" ) @@ -75,9 +75,14 @@ def write_json(self, outdir): dict(zip(getattr(SampleStat, "header"), slist)) ) for key, unmatch in self.unmatched_stats.items(): - for idx, mnum in unmatch: + for idx, mnum, closest in unmatch: json_out["undetermined"].append( - dict(zip(getattr(Report, "unmatch_header"), [idx, mnum, key[0]])) + dict( + zip( + getattr(Report, "unmatch_header"), + [idx, mnum, key[0], closest], + ) + ) ) with open(os.path.join(outdir, "anglerfish_stats.json"), "w") as f: f.write(json.dumps(json_out, indent=2, sort_keys=True))