Changed: lenient will try all BC orientations

NationalGenomicsInfrastructure · Nov 27, 2023 · cfcfe05 · cfcfe05
1 parent 55a956b
commit cfcfe05
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 18 deletions.
diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py
@@ -76,25 +76,40 @@ def run_demux(args):
         paf_entries = parse_paf_lines(aln_path)
 
         # Make stats
+        log.info(f" Searching for adaptor hits in {adaptor_bc_name}")
         fragments, singletons, concats, unknowns = layout_matches(adaptor_name+"_i5",adaptor_name+"_i7",paf_entries)
         stats = AlignmentStat(adaptor_bc_name)
         stats.compute_pafstats(num_fq, fragments, singletons, concats, unknowns)
         report.add_alignment_stat(stats)
 
         # Demux
         no_matches, matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance)
-        flipped = False
-        if args.lenient: # Try reverse complementing the I5 index and choose the best match
-            rc_no_matches, rc_matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance, i5_reversed=True)
-            if len(rc_matches) * 0.2 > len(matches):
-                log.info(f"Reversing I5 index for adaptor {adaptor_name} found at least 80% more matches")
-                no_matches = rc_no_matches
-                matches = rc_matches
-                flipped = True
+        flipped_i7 = False; flipped_i5 = False
+        if args.lenient: # Try reverse complementing the I5 and/or i7 indices and choose the best match
+            flips = {
+                    "i7": {"i7_reversed": True, "i5_reversed": False},
+                    "i5": {"i7_reversed": False, "i5_reversed": True},
+                    "i7+i5": {"i7_reversed": True, "i5_reversed": True}
+            }
+            flipped = {}
+            for flip, rev in flips.items():
+                rc_no_matches, rc_matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance, **rev)
+                flipped[flip] = (rc_matches, rc_no_matches, len(rc_matches))
+            best_flip = max(zip(flipped.values(), flipped.keys()))[1]
+            print(best_flip, flipped[best_flip][2])
+            # There are no barcode flips with unambiguously more matches, so we abort
+            if sorted([i[2] for i in flipped.values()])[-1] == sorted([i[2] for i in flipped.values()])[-2]:
+                log.info(f"Could not find any barcode reverse complements with unambiguously more matches")
+            elif flipped[best_flip][2] * 0.2 > len(matches):
+                log.info(f"Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least 80% more matches")
+                matches, no_matches, _ = flipped[best_flip]
+                flipped_i7, flipped_i5 = flips[best_flip].values()
+            else:
+                log.info(f"Using original index orientation for {adaptor_name}")
 
         for k, v in groupby(sorted(matches,key=lambda x: x[3]), key=lambda y: y[3]):
 
-            ## To avoid collisions in fastq filenames, we add the ONT barcode to the sample name
+            # To avoid collisions in fastq filenames, we add the ONT barcode to the sample name
             fq_prefix = k
             if ont_barcode:
                 fq_prefix = ont_barcode+"-"+fq_prefix
@@ -110,13 +125,15 @@ def run_demux(args):
             rmean = np.round(np.mean(rlens),2)
             rstd = np.round(np.std(rlens),2)
 
-            sample_stat = SampleStat(k, len(sample_dict.keys()), rmean, rstd, flipped, ont_barcode)
+            sample_stat = SampleStat(k, len(sample_dict.keys()), rmean, rstd, flipped_i7, flipped_i5, ont_barcode)
             report.add_sample_stat(sample_stat)
             if not args.skip_demux:
                 write_demuxedfastq(sample_dict, fastq_path, fq_name)
 
         # Top unmatched indexes
         nomatch_count = Counter([x[3] for x in no_matches])
+        if args.max_unknowns == None:
+            args.max_unknowns = len([sample for sample in ss]) + 10
         report.add_unmatched_stat(nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name)
 
     # Check if there were samples in the samplesheet without adaptor alignments and add them to report
@@ -140,7 +157,7 @@ def anglerfish():
     parser.add_argument('--skip_demux', '-c', action='store_true', help='Only do BC counting and not demuxing')
     parser.add_argument('--skip_fastqc', '-f', action='store_true', help=argparse.SUPPRESS)
     parser.add_argument('--max-distance', '-m', type=int, help='Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2')
-    parser.add_argument('--max-unknowns', '-u', type=int, default=10, help='Maximum number of unknown indices to show in the output (default: 10)')
+    parser.add_argument('--max-unknowns', '-u', type=int, help='Maximum number of unknown indices to show in the output (default: length of samplesheet + 10)')
     parser.add_argument('--run_name', '-r', default='anglerfish', help='Name of the run (default: anglerfish)')
     parser.add_argument('--lenient', '-l', action='store_true', help='Will try reverse complementing the I5 index and choose the best match. USE WITH EXTREME CAUTION!')
     parser.add_argument('--ont_barcodes', '-n', action='store_true', help='Will assume the samplesheet refers to a single ONT run prepped with a barcoding kit. And will treat each barcode separately')

diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py
@@ -94,7 +94,6 @@ def layout_matches(i5_name, i7_name, paf_entries):
         - unknowns. Any other reads
     """
 
-    log.info(" Searching for adaptor hits")
     fragments = {}; singletons = {}; concats = {}; unknowns = {}
     for read, entry_list in paf_entries.items():
         sorted_entries = []
@@ -119,7 +118,7 @@ def layout_matches(i5_name, i7_name, paf_entries):
     return (fragments, singletons, concats, unknowns)
 
 
-def cluster_matches(sample_adaptor, matches, max_distance, i5_reversed=False):
+def cluster_matches(sample_adaptor, matches, max_distance, i7_reversed=False, i5_reversed=False):
 
     # Only illumina fragments
     matched = {}; matched_bed = []; unmatched_bed = []
@@ -146,8 +145,12 @@ def cluster_matches(sample_adaptor, matches, max_distance, i5_reversed=False):
                     i5_seq = str(Seq(i5_seq).reverse_complement())
                 fi5, d1 = parse_cs(i5['cs'], i5_seq, max_distance)
             except AttributeError:
-                d1 = 0 # presumably there it's single index
-            fi7, d2 = parse_cs(i7['cs'], adaptor.i7_index, max_distance)
+                d1 = 0 # presumably it's single index, so no i5
+
+            i7_seq = adaptor.i7_index
+            if i7_reversed and i7_seq is not None:
+                i7_seq = str(Seq(i7_seq).reverse_complement())
+            fi7, d2 = parse_cs(i7['cs'], i7_seq, max_distance)
             dists.append(d1+d2)
 
         index_min = min(range(len(dists)), key=dists.__getitem__)

diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py
@@ -31,7 +31,7 @@ def write_report(self, outdir):
                     f.write(f"{j[0]}\t{i} ({j[1]*100:.2f}%)\n")
             f.write("\n{}\n".format("\t".join(getattr(SampleStat, "header"))))
             for sample in self.sample_stats:
-                f.write(f"{sample.sample_name}\t{sample.num_reads}\t{sample.mean_read_len}\t{sample.std_read_len}\t{sample.i5_reversed}\t{sample.ont_barcode}\n")
+                f.write(f"{sample.sample_name}\t{sample.num_reads}\t{sample.mean_read_len}\t{sample.std_read_len}\t{sample.i7_reversed}\t{sample.i5_reversed}\t{sample.ont_barcode}\n")
             uhead = getattr(Report, 'unmatch_header')
             f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab
             for key, unmatch in self.unmatched_stats.items():
@@ -50,7 +50,7 @@ def write_json(self, outdir):
         for astat in self.aln_stats:
             json_out["paf_stats"].append(astat.paf_stats)
         for sample in self.sample_stats:
-            slist = [sample.sample_name, sample.num_reads, sample.mean_read_len, sample.std_read_len, sample.i5_reversed, sample.ont_barcode]
+            slist = [sample.sample_name, sample.num_reads, sample.mean_read_len, sample.std_read_len, sample.i7_reversed, sample.i5_reversed, sample.ont_barcode]
             json_out["sample_stats"].append(dict(zip(getattr(SampleStat, "header"),slist)))
         for key, unmatch in self.unmatched_stats.items():
             for idx, mnum in unmatch:
@@ -61,7 +61,7 @@ def write_json(self, outdir):
     def write_dataframe(self,outdir,samplesheet):
         """Write a dataframe of the stats to a csv file.
             TODO: This needs be cleaned up and made more robust. Especially lock in / decouple from upstream the header names and order:
-            sample_name, num_reads, mean_read_len, std_read_len, i5_reversed, ont_barcode, adaptor_name, i7_index, i5_index 
+            sample_name, num_reads, mean_read_len, std_read_len, i7_reversed, i5_reversed, ont_barcode, adaptor_name, i7_index, i5_index
         """
         out_list = []
         for sample in self.sample_stats:
@@ -116,12 +116,14 @@ class SampleStat:
     num_reads: int
     mean_read_len: float
     std_read_len: float
+    i7_reversed: bool
     i5_reversed: bool
     ont_barcode: str = None
     header: ClassVar[list] = ["sample_name",
                               "#reads", # We specify this for historical reasons
                               "mean_read_len",
                               "std_read_len",
+                              "i7_reversed",
                               "i5_reversed",
                               "ont_barcode"]