diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py
index 6184fe5..58a401a 100755
--- a/anglerfish/anglerfish.py
+++ b/anglerfish/anglerfish.py
@@ -76,6 +76,7 @@ def run_demux(args):
         paf_entries = parse_paf_lines(aln_path)
 
         # Make stats
+        log.info(f" Searching for adaptor hits in {adaptor_bc_name}")
         fragments, singletons, concats, unknowns = layout_matches(adaptor_name+"_i5",adaptor_name+"_i7",paf_entries)
         stats = AlignmentStat(adaptor_bc_name)
         stats.compute_pafstats(num_fq, fragments, singletons, concats, unknowns)
@@ -83,18 +84,31 @@ def run_demux(args):
 
         # Demux
         no_matches, matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance)
-        flipped = False
-        if args.lenient: # Try reverse complementing the I5 index and choose the best match
-            rc_no_matches, rc_matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance, i5_reversed=True)
-            if len(rc_matches) * 0.2 > len(matches):
-                log.info(f"Reversing I5 index for adaptor {adaptor_name} found at least 80% more matches")
-                no_matches = rc_no_matches
-                matches = rc_matches
-                flipped = True
+        flipped_i7 = False; flipped_i5 = False
+        if args.lenient: # Try reverse complementing the I5 and/or i7 indices and choose the best match
+            flips = {
+                "i7": {"i7_reversed": True, "i5_reversed": False},
+                "i5": {"i7_reversed": False, "i5_reversed": True},
+                "i7+i5": {"i7_reversed": True, "i5_reversed": True}
+            }
+            flipped = {}
+            for flip, rev in flips.items():
+                rc_no_matches, rc_matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance, **rev)
+                flipped[flip] = (rc_matches, rc_no_matches, len(rc_matches))
+            best_flip = max(flipped, key=lambda k: flipped[k][2])
+            # There are no barcode flips with unambiguously more matches, so we abort
+            if sorted([i[2] for i in flipped.values()])[-1] == sorted([i[2] for i in flipped.values()])[-2]:
+                log.info("Could not find any barcode reverse complements with unambiguously more matches")
+            elif flipped[best_flip][2] * 0.2 > len(matches):
+                log.info(f"Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least 80% more matches")
+                matches, no_matches, _ = flipped[best_flip]
+                flipped_i7, flipped_i5 = flips[best_flip].values()
+            else:
+                log.info(f"Using original index orientation for {adaptor_name}")
 
         for k, v in groupby(sorted(matches,key=lambda x: x[3]), key=lambda y: y[3]):
-            ## To avoid collisions in fastq filenames, we add the ONT barcode to the sample name
+            # To avoid collisions in fastq filenames, we add the ONT barcode to the sample name
             fq_prefix = k
             if ont_barcode:
                 fq_prefix = ont_barcode+"-"+fq_prefix
 
@@ -110,13 +124,15 @@ def run_demux(args):
             rmean = np.round(np.mean(rlens),2)
             rstd = np.round(np.std(rlens),2)
 
-            sample_stat = SampleStat(k, len(sample_dict.keys()), rmean, rstd, flipped, ont_barcode)
+            sample_stat = SampleStat(k, len(sample_dict.keys()), rmean, rstd, flipped_i7, flipped_i5, ont_barcode)
             report.add_sample_stat(sample_stat)
             if not args.skip_demux:
                 write_demuxedfastq(sample_dict, fastq_path, fq_name)
 
         # Top unmatched indexes
         nomatch_count = Counter([x[3] for x in no_matches])
+        if args.max_unknowns is None:
+            args.max_unknowns = sum(1 for _ in ss) + 10
         report.add_unmatched_stat(nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name)
 
     # Check if there were samples in the samplesheet without adaptor alignments and add them to report
@@ -140,7 +156,7 @@ def anglerfish():
     parser.add_argument('--skip_demux', '-c', action='store_true', help='Only do BC counting and not demuxing')
     parser.add_argument('--skip_fastqc', '-f', action='store_true', help=argparse.SUPPRESS)
     parser.add_argument('--max-distance', '-m', type=int, help='Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2')
-    parser.add_argument('--max-unknowns', '-u', type=int, default=10, help='Maximum number of unknown indices to show in the output (default: 10)')
+    parser.add_argument('--max-unknowns', '-u', type=int, help='Maximum number of unknown indices to show in the output (default: length of samplesheet + 10)')
     parser.add_argument('--run_name', '-r', default='anglerfish', help='Name of the run (default: anglerfish)')
    parser.add_argument('--lenient', '-l', action='store_true', help='Will try reverse complementing the I5 index and choose the best match. USE WITH EXTREME CAUTION!')
     parser.add_argument('--ont_barcodes', '-n', action='store_true', help='Will assume the samplesheet refers to a single ONT run prepped with a barcoding kit. And will treat each barcode separately')
diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py
index c403d33..7154287 100644
--- a/anglerfish/demux/demux.py
+++ b/anglerfish/demux/demux.py
@@ -94,6 +94,5 @@ def layout_matches(i5_name, i7_name, paf_entries):
       - unknowns. Any other reads
     """
-    log.info(" Searching for adaptor hits")
     fragments = {}; singletons = {}; concats = {}; unknowns = {}
     for read, entry_list in paf_entries.items():
         sorted_entries = []
@@ -119,7 +118,7 @@ def layout_matches(i5_name, i7_name, paf_entries):
     return (fragments, singletons, concats, unknowns)
 
 
-def cluster_matches(sample_adaptor, matches, max_distance, i5_reversed=False):
+def cluster_matches(sample_adaptor, matches, max_distance, i7_reversed=False, i5_reversed=False):
 
     # Only illumina fragments
     matched = {}; matched_bed = []; unmatched_bed = []
@@ -146,8 +145,12 @@ def cluster_matches(sample_adaptor, matches, max_distance, i5_reversed=False):
                     i5_seq = str(Seq(i5_seq).reverse_complement())
                 fi5, d1 = parse_cs(i5['cs'], i5_seq, max_distance)
             except AttributeError:
-                d1 = 0 # presumably there it's single index
-            fi7, d2 = parse_cs(i7['cs'], adaptor.i7_index, max_distance)
+                d1 = 0 # presumably it's single index, so no i5
+
+            i7_seq = adaptor.i7_index
+            if i7_reversed and i7_seq is not None:
+                i7_seq = str(Seq(i7_seq).reverse_complement())
+            fi7, d2 = parse_cs(i7['cs'], i7_seq, max_distance)
 
             dists.append(d1+d2)
         index_min = min(range(len(dists)), key=dists.__getitem__)
diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py
index c21d6b0..57d50b3 100644
--- a/anglerfish/demux/report.py
+++ b/anglerfish/demux/report.py
@@ -31,7 +31,7 @@ def write_report(self, outdir):
                 f.write(f"{j[0]}\t{i} ({j[1]*100:.2f}%)\n")
         f.write("\n{}\n".format("\t".join(getattr(SampleStat, "header"))))
         for sample in self.sample_stats:
-            f.write(f"{sample.sample_name}\t{sample.num_reads}\t{sample.mean_read_len}\t{sample.std_read_len}\t{sample.i5_reversed}\t{sample.ont_barcode}\n")
+            f.write(f"{sample.sample_name}\t{sample.num_reads}\t{sample.mean_read_len}\t{sample.std_read_len}\t{sample.i7_reversed}\t{sample.i5_reversed}\t{sample.ont_barcode}\n")
         uhead = getattr(Report, 'unmatch_header')
         f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab
         for key, unmatch in self.unmatched_stats.items():
@@ -50,7 +50,7 @@ def write_json(self, outdir):
         for astat in self.aln_stats:
             json_out["paf_stats"].append(astat.paf_stats)
         for sample in self.sample_stats:
-            slist = [sample.sample_name, sample.num_reads, sample.mean_read_len, sample.std_read_len, sample.i5_reversed, sample.ont_barcode]
+            slist = [sample.sample_name, sample.num_reads, sample.mean_read_len, sample.std_read_len, sample.i7_reversed, sample.i5_reversed, sample.ont_barcode]
             json_out["sample_stats"].append(dict(zip(getattr(SampleStat, "header"),slist)))
         for key, unmatch in self.unmatched_stats.items():
             for idx, mnum in unmatch:
@@ -61,7 +61,7 @@ def write_dataframe(self,outdir,samplesheet):
         """Write a dataframe of the stats to a csv file.
         TODO: This needs be cleaned up and made more robust.
         Especially lock in / decouple from upstream the header names and order:
-        sample_name, num_reads, mean_read_len, std_read_len, i5_reversed, ont_barcode, adaptor_name, i7_index, i5_index
+        sample_name, num_reads, mean_read_len, std_read_len, i7_reversed, i5_reversed, ont_barcode, adaptor_name, i7_index, i5_index
         """
         out_list = []
         for sample in self.sample_stats:
@@ -116,12 +116,14 @@ class SampleStat:
     num_reads: int
     mean_read_len: float
     std_read_len: float
+    i7_reversed: bool
     i5_reversed: bool
     ont_barcode: str = None
 
     header: ClassVar[list] = ["sample_name",
                 "#reads", # We specify this for historical reasons
                 "mean_read_len",
                 "std_read_len",
+                "i7_reversed",
                 "i5_reversed",
                 "ont_barcode"]