Skip to content

Commit

Permalink
Changed: lenient will try all BC orientations
Browse files Browse the repository at this point in the history
  • Loading branch information
remiolsen committed Nov 27, 2023
1 parent 55a956b commit cfcfe05
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 18 deletions.
39 changes: 28 additions & 11 deletions anglerfish/anglerfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,25 +76,40 @@ def run_demux(args):
paf_entries = parse_paf_lines(aln_path)

# Make stats
log.info(f" Searching for adaptor hits in {adaptor_bc_name}")
fragments, singletons, concats, unknowns = layout_matches(adaptor_name+"_i5",adaptor_name+"_i7",paf_entries)
stats = AlignmentStat(adaptor_bc_name)
stats.compute_pafstats(num_fq, fragments, singletons, concats, unknowns)
report.add_alignment_stat(stats)

# Demux
no_matches, matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance)
flipped = False
if args.lenient: # Try reverse complementing the I5 index and choose the best match
rc_no_matches, rc_matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance, i5_reversed=True)
if len(rc_matches) * 0.2 > len(matches):
log.info(f"Reversing I5 index for adaptor {adaptor_name} found at least 80% more matches")
no_matches = rc_no_matches
matches = rc_matches
flipped = True
flipped_i7 = False; flipped_i5 = False
if args.lenient: # Try reverse complementing the I5 and/or i7 indices and choose the best match
flips = {
"i7": {"i7_reversed": True, "i5_reversed": False},
"i5": {"i7_reversed": False, "i5_reversed": True},
"i7+i5": {"i7_reversed": True, "i5_reversed": True}
}
flipped = {}
for flip, rev in flips.items():
rc_no_matches, rc_matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance, **rev)
flipped[flip] = (rc_matches, rc_no_matches, len(rc_matches))
best_flip = max(zip(flipped.values(), flipped.keys()))[1]
print(best_flip, flipped[best_flip][2])
# There are no barcode flips with unambiguously more matches, so we abort
if sorted([i[2] for i in flipped.values()])[-1] == sorted([i[2] for i in flipped.values()])[-2]:
log.info(f"Could not find any barcode reverse complements with unambiguously more matches")
elif flipped[best_flip][2] * 0.2 > len(matches):
log.info(f"Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least 80% more matches")
matches, no_matches, _ = flipped[best_flip]
flipped_i7, flipped_i5 = flips[best_flip].values()
else:
log.info(f"Using original index orientation for {adaptor_name}")

for k, v in groupby(sorted(matches,key=lambda x: x[3]), key=lambda y: y[3]):

## To avoid collisions in fastq filenames, we add the ONT barcode to the sample name
# To avoid collisions in fastq filenames, we add the ONT barcode to the sample name
fq_prefix = k
if ont_barcode:
fq_prefix = ont_barcode+"-"+fq_prefix
Expand All @@ -110,13 +125,15 @@ def run_demux(args):
rmean = np.round(np.mean(rlens),2)
rstd = np.round(np.std(rlens),2)

sample_stat = SampleStat(k, len(sample_dict.keys()), rmean, rstd, flipped, ont_barcode)
sample_stat = SampleStat(k, len(sample_dict.keys()), rmean, rstd, flipped_i7, flipped_i5, ont_barcode)
report.add_sample_stat(sample_stat)
if not args.skip_demux:
write_demuxedfastq(sample_dict, fastq_path, fq_name)

# Top unmatched indexes
nomatch_count = Counter([x[3] for x in no_matches])
if args.max_unknowns == None:
args.max_unknowns = len([sample for sample in ss]) + 10
report.add_unmatched_stat(nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name)

# Check if there were samples in the samplesheet without adaptor alignments and add them to report
Expand All @@ -140,7 +157,7 @@ def anglerfish():
parser.add_argument('--skip_demux', '-c', action='store_true', help='Only do BC counting and not demuxing')
parser.add_argument('--skip_fastqc', '-f', action='store_true', help=argparse.SUPPRESS)
parser.add_argument('--max-distance', '-m', type=int, help='Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2')
parser.add_argument('--max-unknowns', '-u', type=int, default=10, help='Maximum number of unknown indices to show in the output (default: 10)')
parser.add_argument('--max-unknowns', '-u', type=int, help='Maximum number of unknown indices to show in the output (default: length of samplesheet + 10)')
parser.add_argument('--run_name', '-r', default='anglerfish', help='Name of the run (default: anglerfish)')
parser.add_argument('--lenient', '-l', action='store_true', help='Will try reverse complementing the I5 index and choose the best match. USE WITH EXTREME CAUTION!')
parser.add_argument('--ont_barcodes', '-n', action='store_true', help='Will assume the samplesheet refers to a single ONT run prepped with a barcoding kit. And will treat each barcode separately')
Expand Down
11 changes: 7 additions & 4 deletions anglerfish/demux/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,6 @@ def layout_matches(i5_name, i7_name, paf_entries):
- unknowns. Any other reads
"""

log.info(" Searching for adaptor hits")
fragments = {}; singletons = {}; concats = {}; unknowns = {}
for read, entry_list in paf_entries.items():
sorted_entries = []
Expand All @@ -119,7 +118,7 @@ def layout_matches(i5_name, i7_name, paf_entries):
return (fragments, singletons, concats, unknowns)


def cluster_matches(sample_adaptor, matches, max_distance, i5_reversed=False):
def cluster_matches(sample_adaptor, matches, max_distance, i7_reversed=False, i5_reversed=False):

# Only illumina fragments
matched = {}; matched_bed = []; unmatched_bed = []
Expand All @@ -146,8 +145,12 @@ def cluster_matches(sample_adaptor, matches, max_distance, i5_reversed=False):
i5_seq = str(Seq(i5_seq).reverse_complement())
fi5, d1 = parse_cs(i5['cs'], i5_seq, max_distance)
except AttributeError:
d1 = 0 # presumably there it's single index
fi7, d2 = parse_cs(i7['cs'], adaptor.i7_index, max_distance)
d1 = 0 # presumably it's single index, so no i5

i7_seq = adaptor.i7_index
if i7_reversed and i7_seq is not None:
i7_seq = str(Seq(i7_seq).reverse_complement())
fi7, d2 = parse_cs(i7['cs'], i7_seq, max_distance)
dists.append(d1+d2)

index_min = min(range(len(dists)), key=dists.__getitem__)
Expand Down
8 changes: 5 additions & 3 deletions anglerfish/demux/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def write_report(self, outdir):
f.write(f"{j[0]}\t{i} ({j[1]*100:.2f}%)\n")
f.write("\n{}\n".format("\t".join(getattr(SampleStat, "header"))))
for sample in self.sample_stats:
f.write(f"{sample.sample_name}\t{sample.num_reads}\t{sample.mean_read_len}\t{sample.std_read_len}\t{sample.i5_reversed}\t{sample.ont_barcode}\n")
f.write(f"{sample.sample_name}\t{sample.num_reads}\t{sample.mean_read_len}\t{sample.std_read_len}\t{sample.i7_reversed}\t{sample.i5_reversed}\t{sample.ont_barcode}\n")
uhead = getattr(Report, 'unmatch_header')
f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab
for key, unmatch in self.unmatched_stats.items():
Expand All @@ -50,7 +50,7 @@ def write_json(self, outdir):
for astat in self.aln_stats:
json_out["paf_stats"].append(astat.paf_stats)
for sample in self.sample_stats:
slist = [sample.sample_name, sample.num_reads, sample.mean_read_len, sample.std_read_len, sample.i5_reversed, sample.ont_barcode]
slist = [sample.sample_name, sample.num_reads, sample.mean_read_len, sample.std_read_len, sample.i7_reversed, sample.i5_reversed, sample.ont_barcode]
json_out["sample_stats"].append(dict(zip(getattr(SampleStat, "header"),slist)))
for key, unmatch in self.unmatched_stats.items():
for idx, mnum in unmatch:
Expand All @@ -61,7 +61,7 @@ def write_json(self, outdir):
def write_dataframe(self,outdir,samplesheet):
"""Write a dataframe of the stats to a csv file.
TODO: This needs be cleaned up and made more robust. Especially lock in / decouple from upstream the header names and order:
sample_name, num_reads, mean_read_len, std_read_len, i5_reversed, ont_barcode, adaptor_name, i7_index, i5_index
sample_name, num_reads, mean_read_len, std_read_len, i7_reversed, i5_reversed, ont_barcode, adaptor_name, i7_index, i5_index
"""
out_list = []
for sample in self.sample_stats:
Expand Down Expand Up @@ -116,12 +116,14 @@ class SampleStat:
num_reads: int
mean_read_len: float
std_read_len: float
i7_reversed: bool
i5_reversed: bool
ont_barcode: str = None
header: ClassVar[list] = ["sample_name",
"#reads", # We specify this for historical reasons
"mean_read_len",
"std_read_len",
"i7_reversed",
"i5_reversed",
"ont_barcode"]

Expand Down

0 comments on commit cfcfe05

Please sign in to comment.