Add matching unknown indices to samplesheet

NationalGenomicsInfrastructure · May 14, 2024 · e43d8f4 · e43d8f4
1 parent 4e6c8cd
commit e43d8f4
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 9 deletions.
diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py
@@ -9,6 +9,7 @@
 from collections import Counter
 from itertools import groupby
 
+import Levenshtein as lev
 import numpy as np
 import pkg_resources
 
@@ -242,11 +243,41 @@ def run_demux(args):
 
         # Top unmatched indexes
         nomatch_count = Counter([x[3] for x in no_matches])
-        if args.max_unknowns is None:
+        if args.max_unknowns == 0:
             args.max_unknowns = len([sample for sample in ss]) + 10
-        report.add_unmatched_stat(
-            nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name
-        )
+
+        # We search for the closest sample in the samplesheet to the list of unknowns
+        top_unknowns = []
+        for i in nomatch_count.most_common(args.max_unknowns):
+            sample_dists = [
+                (
+                    lev.distance(
+                        i[0], f"{x.adaptor.i7_index}+{x.adaptor.i5_index}".lower()
+                    ),
+                    x.sample_name,
+                )
+                for x in ss
+            ]
+            closest_sample = min(sample_dists, key=lambda x: x[0])
+            # If the distance is more than half the index length, we remove it
+            if closest_sample[0] >= (len(i[0]) / 2) + 1:
+                closest_sample = (closest_sample[0], None)
+            else:
+                # We might have two samples with the same distance
+                all_min = [
+                    x[1]
+                    for x in sample_dists
+                    if x[0] == closest_sample[0] and x[1] != closest_sample[1]
+                ]
+                # This list might be too long, so we truncate it
+                if len(all_min) > 4:
+                    all_min = all_min[:4]
+                    all_min.append("...")
+                if all_min:
+                    closest_sample = (closest_sample[0], ";".join(all_min))
+
+            top_unknowns.append([i[0], i[1], closest_sample[1]])
+        report.add_unmatched_stat(top_unknowns, ont_barcode, adaptor_name)
 
     # Check if there were samples in the samplesheet without adaptor alignments and add them to report
     for entry in ss:

diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py
@@ -8,7 +8,7 @@
 
 
 class Report:
-    unmatch_header = ["index", "num_reads", "ont_barcode"]
+    unmatch_header = ["index", "num_reads", "ont_barcode", "closest_match"]
 
     def __init__(self, run_name, uuid, version):
         self.run_name = run_name
@@ -44,8 +44,8 @@ def write_report(self, outdir):
             uhead = getattr(Report, "unmatch_header")
             f.write(f"\n{chr(9).join(uhead)}\n")  # chr(9) = tab
             for key, unmatch in self.unmatched_stats.items():
-                for idx, mnum in unmatch:
-                    f.write(f"{idx}\t{mnum}\t{key[0]}\n")
+                for idx, mnum, closest in unmatch:
+                    f.write(f"{idx}\t{mnum}\t{key[0]}\t{closest}\n")
         log.debug(
             f"Wrote anglerfish_stats.txt to {outdir}, size: {os.path.getsize(os.path.join(outdir, 'anglerfish_stats.txt'))} bytes"
         )
@@ -75,9 +75,14 @@ def write_json(self, outdir):
                 dict(zip(getattr(SampleStat, "header"), slist))
             )
         for key, unmatch in self.unmatched_stats.items():
-            for idx, mnum in unmatch:
+            for idx, mnum, closest in unmatch:
                 json_out["undetermined"].append(
-                    dict(zip(getattr(Report, "unmatch_header"), [idx, mnum, key[0]]))
+                    dict(
+                        zip(
+                            getattr(Report, "unmatch_header"),
+                            [idx, mnum, key[0], closest],
+                        )
+                    )
                 )
         with open(os.path.join(outdir, "anglerfish_stats.json"), "w") as f:
             f.write(json.dumps(json_out, indent=2, sort_keys=True))