diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py index 7befff2..da881a5 100755 --- a/anglerfish/anglerfish.py +++ b/anglerfish/anglerfish.py @@ -9,6 +9,7 @@ from collections import Counter from itertools import groupby +import Levenshtein as lev import numpy as np import pkg_resources @@ -47,7 +48,6 @@ def run_demux(args): if args.debug: log.setLevel(logging.DEBUG) run_uuid = str(uuid.uuid4()) - os.mkdir(args.out_fastq) ss = SampleSheet(args.samplesheet, args.ont_barcodes) version = pkg_resources.get_distribution("bio-anglerfish").version report = Report(args.run_name, run_uuid, version) @@ -103,7 +103,12 @@ def run_demux(args): adaptors_sorted[(entry.adaptor.name, entry.ont_barcode)].append( (entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq)) ) - + if os.path.exists(args.out_fastq): + raise FileExistsError( + f"Output folder '{args.out_fastq}' already exists. Please remove it or specify another --run_name" + ) + else: + os.mkdir(args.out_fastq) out_fastqs = [] for key, sample in adaptors_sorted.items(): adaptor_name, ont_barcode = key @@ -205,6 +210,7 @@ def run_demux(args): f" Lenient mode: Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least {args.lenient_factor} times more matches" ) no_matches, matches = flipped[best_flip] + flipped_i7, flipped_i5 = flips[best_flip].values() else: log.info( f" Lenient mode: using original index orientation for {adaptor_name}" @@ -262,11 +268,41 @@ def run_demux(args): # Top unmatched indexes nomatch_count = Counter([x[3] for x in no_matches]) - if args.max_unknowns is None: + if args.max_unknowns == 0: args.max_unknowns = len([sample for sample in ss]) + 10 - report.add_unmatched_stat( - nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name - ) + + # We search for the closest sample in the samplesheet to the list of unknowns + top_unknowns = [] + for i in nomatch_count.most_common(args.max_unknowns): + sample_dists = [ + ( + lev.distance( + i[0], f"{x.adaptor.i7_index}+{x.adaptor.i5_index}".lower() + ), + x.sample_name, + ) + for x in ss + ] + closest_sample = min(sample_dists, key=lambda x: x[0]) + # If the distance is more than half the index length, we remove it + if closest_sample[0] >= (len(i[0]) / 2) + 1: + closest_sample = (closest_sample[0], None) + else: + # We might have two samples with the same distance + all_min = [ + x[1] + for x in sample_dists + if x[0] == closest_sample[0] and x[1] != closest_sample[1] + ] + # This list might be too long, so we truncate it + if len(all_min) > 4: + all_min = all_min[:4] + all_min.append("...") + if all_min: + closest_sample = (closest_sample[0], ";".join(all_min)) + + top_unknowns.append([i[0], i[1], closest_sample[1]]) + report.add_unmatched_stat(top_unknowns, ont_barcode, adaptor_name) # Check if there were samples in the samplesheet without adaptor alignments and add them to report for entry in ss: diff --git a/anglerfish/cli.py b/anglerfish/cli.py index 7c28194..9e87279 100644 --- a/anglerfish/cli.py +++ b/anglerfish/cli.py @@ -271,6 +271,9 @@ def run( ) utcnow = dt.datetime.now(dt.timezone.utc) runname = utcnow.strftime(f"{args.run_name}_%Y_%m_%d_%H%M%S") + if run_name != "anglerfish": + runname = args.run_name + assert os.path.exists(args.out_fastq), f"Output folder '{args.out_fastq}' not found" assert os.path.exists( args.samplesheet diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py index e833ebc..49e8b67 100644 --- a/anglerfish/demux/report.py +++ b/anglerfish/demux/report.py @@ -8,7 +8,7 @@ class Report: - unmatch_header = ["index", "num_reads", "ont_barcode"] + unmatch_header = ["index", "num_reads", "ont_barcode", "closest_match"] def __init__(self, run_name, uuid, version): self.run_name = run_name @@ -44,8 +44,8 @@ def write_report(self, outdir): uhead = getattr(Report, "unmatch_header") f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab for key, unmatch in self.unmatched_stats.items(): - for idx, mnum in unmatch: - f.write(f"{idx}\t{mnum}\t{key[0]}\n") + for idx, mnum, closest in unmatch: + f.write(f"{idx}\t{mnum}\t{key[0]}\t{closest}\n") log.debug( f"Wrote anglerfish_stats.txt to {outdir}, size: {os.path.getsize(os.path.join(outdir, 'anglerfish_stats.txt'))} bytes" ) @@ -75,9 +75,14 @@ def write_json(self, outdir): dict(zip(getattr(SampleStat, "header"), slist)) ) for key, unmatch in self.unmatched_stats.items(): - for idx, mnum in unmatch: + for idx, mnum, closest in unmatch: json_out["undetermined"].append( - dict(zip(getattr(Report, "unmatch_header"), [idx, mnum, key[0]])) + dict( + zip( + getattr(Report, "unmatch_header"), + [idx, mnum, key[0], closest], + ) + ) ) with open(os.path.join(outdir, "anglerfish_stats.json"), "w") as f: f.write(json.dumps(json_out, indent=2, sort_keys=True))