diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py
index 30d56a5..0705933 100755
--- a/anglerfish/anglerfish.py
+++ b/anglerfish/anglerfish.py
@@ -95,41 +95,40 @@ def run_demux(args):
         exit()
     log.debug(f"Samplesheet bc_dist == {min_distance}")
 
-    # Get samplesheet map
-    samplesheet_map: dict[tuple[str, str], list[tuple[str, Adaptor, str]]] = (
-        ss.map_adaptor_barcode_pairs_to_sample_info()
-    )
+    # Get adaptor-barcode combinations from samplesheet
+    adaptor_barcode_sets = ss.get_adaptor_barcode_sets()
 
-    # Iterate across samplesheet map and process samples
+    # Iterate across samplesheet rows conforming to the adaptor-barcode combinations
     out_fastqs = []
-    for key, sample_maps in samplesheet_map.items():
-        adaptor_name, ont_barcode = key
+    for adaptor_name, ont_barcode in adaptor_barcode_sets:
+
+        subset_rows = ss.subset_rows(adaptor_name=adaptor_name, ont_barcode=ont_barcode)
 
         # Grab the fastq path of the first sample
-        fastq_path = sample_maps[0][2]
+        fastq_path = subset_rows[0].fastq
+        fastq_files = glob.glob(fastq_path)
 
         # If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name
         if ont_barcode:
             adaptor_bc_name = f"{adaptor_name}_{ont_barcode}"
         else:
             adaptor_bc_name = adaptor_name
-        fastq_files = glob.glob(fastq_path)
-
+
         # Align
         alignment_path: str = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf")
         adaptor_fasta_path: str = os.path.join(args.out_fastq, f"{adaptor_name}.fasta")
         with open(adaptor_fasta_path, "w") as f:
             f.write(ss.get_fastastring(adaptor_name))
-        for fq in fastq_files:
-            run_minimap2(fq, adaptor_fasta_path, alignment_path, args.threads)
+        for fq_file in fastq_files:
+            run_minimap2(fq_file, adaptor_fasta_path, alignment_path, args.threads)
 
         # Easy line count in input fastq files
-        num_fq = 0
-        for fq in fastq_files:
-            with gzip.open(fq, "rb") as f:
+        num_fq_lines = 0
+        for fq_file in fastq_files:
+            with gzip.open(fq_file, "rb") as f:
                 for i in f:
-                    num_fq += 1
-        num_fq = int(num_fq / 4)
+                    num_fq_lines += 1
+        num_fq = int(num_fq_lines / 4)
         reads_to_alns: dict[str, list[Alignment]] = map_reads_to_alns(alignment_path)
 
         # Make stats
@@ -159,7 +158,7 @@ def run_demux(args):
                 f" Force reverse complementing {args.force_rc} index for adaptor {adaptor_name}. Lenient mode is disabled"
Lenient mode is disabled" ) no_matches, matches = cluster_matches( - samplesheet_map[key], + subset_rows, fragments, args.max_distance, **flips[args.force_rc], @@ -176,7 +175,7 @@ def run_demux(args): spawn = pool.apply_async( cluster_matches, args=( - samplesheet_map[key], + subset_rows, fragments, args.max_distance, rev["i7_reversed"], @@ -215,7 +214,7 @@ def run_demux(args): no_matches, matches = flipped["original"] else: no_matches, matches = cluster_matches( - samplesheet_map[key], fragments, args.max_distance + subset_rows, fragments, args.max_distance ) out_pool = [] diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index aba5ef4..dba293c 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -10,7 +10,7 @@ from Bio.Seq import Seq from Bio.SeqIO.QualityIO import FastqGeneralIterator -from anglerfish.demux.adaptor import Adaptor +from anglerfish.demux.samplesheet import SampleSheetEntry log = logging.getLogger("anglerfish") @@ -217,7 +217,7 @@ def categorize_matches( def cluster_matches( - sample_adaptor: list[tuple[str, Adaptor, str]], + rows: list[SampleSheetEntry], matches: dict, max_distance: int, i7_reversed: bool = False, @@ -247,29 +247,30 @@ def cluster_matches( dists = [] fi5 = "" fi7 = "" - for _, adaptor, _ in sample_adaptor: - if adaptor.i5.index_seq is not None: - i5_seq = adaptor.i5.index_seq + for row in rows: + + if row.adaptor.i5.index_seq is not None: + i5_seq = row.adaptor.i5.index_seq if i5_reversed and i5_seq is not None: i5_seq = str(Seq(i5_seq).reverse_complement()) fi5, d1 = parse_cs( i5_aln.cs, i5_seq, - adaptor.i5.len_umi_before_index, - adaptor.i5.len_umi_after_index, + row.adaptor.i5.len_umi_before_index, + row.adaptor.i5.len_umi_after_index, ) else: d1 = 0 - if adaptor.i7.index_seq is not None: - i7_seq = adaptor.i7.index_seq + if row.adaptor.i7.index_seq is not None: + i7_seq = row.adaptor.i7.index_seq if i7_reversed and i7_seq is not None: i7_seq = str(Seq(i7_seq).reverse_complement()) fi7, d2 = parse_cs( i7_aln.cs, i7_seq, - adaptor.i7.len_umi_before_index, - adaptor.i7.len_umi_after_index, + row.adaptor.i7.len_umi_before_index, + row.adaptor.i7.len_umi_after_index, ) else: d2 = 0 @@ -286,15 +287,15 @@ def cluster_matches( continue if dists[index_min] > max_distance: # Find only full length i7(+i5) adaptor combos. 
-            if len(fi7) + len(fi5) == len(adaptor.i7.index_seq or "") + len(
-                adaptor.i5.index_seq or ""
+            if len(fi7) + len(fi5) == len(row.adaptor.i7.index_seq or "") + len(
+                row.adaptor.i5.index_seq or ""
             ):
                 fi75 = "+".join([i for i in [fi7, fi5] if not i == ""])
                 unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."])
             continue
         matched[read] = alignments
         matched_bed.append(
-            [read, start_insert, end_insert, sample_adaptor[index_min][0], "999", "."]
+            [read, start_insert, end_insert, rows[index_min].sample_name, "999", "."]
         )
     log.debug(f" Matched {len(matched)} reads, unmatched {len(unmatched_bed)} reads")
     return unmatched_bed, matched_bed
diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py
index 4ee8ba1..ffa741f 100644
--- a/anglerfish/demux/samplesheet.py
+++ b/anglerfish/demux/samplesheet.py
@@ -160,37 +160,31 @@ def get_fastastring(self, adaptor_name: str | None = None) -> str:
 
         return outstr
 
-    def map_adaptor_barcode_pairs_to_sample_info(
+    def get_adaptor_barcode_sets(
         self,
-    ) -> dict[tuple[str, str], list[tuple[str, Adaptor, str]]]:
-        """Create a map of unique adaptor-barcode pairings to a list of tuples
-        each containing the necessary information to process a given sample.
-
-        samplesheet_map = {
-            ( adaptor_name_str, ont_barcode_str ) : [  # Unique adaptor-barcode pairing
-                (sample1_name_str, Adaptor, fastq1_str),  # Info to process sample1
-                (sample2_name_str, Adaptor, fastq2_str),  # Info to process sample2
-                ...
-            ],
-            ...
-        }
-        """
+    ) -> list[tuple[str, str]]:
+        """Return a list of the unique adaptor-barcode pairings in the samplesheet."""
 
         # Get a set corresponding to the unique pairings of adaptors and ONT barcodes in the samplesheet
-        adaptor_barcode_tuples: set[tuple[str, str]] = set(
-            [(entry.adaptor.name, entry.ont_barcode) for entry in self]
+        adaptor_barcode_sets: list[tuple[str, str]] = list(
+            set([(row.adaptor.name, row.ont_barcode) for row in self.rows])
         )
 
-        # Map the unique adaptor-barcode pairings to a list of tuples containing the sample name, Adaptor object, and fastq path
-        samplesheet_map: dict[tuple[str, str], list[tuple[str, Adaptor, str]]] = dict(
-            [(i, []) for i in adaptor_barcode_tuples]
-        )
-        for entry in self:
-            samplesheet_map[(entry.adaptor.name, entry.ont_barcode)].append(
-                (entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq))
-            )
+        return adaptor_barcode_sets
+
+    def subset_rows(
+        self, adaptor_name: str, ont_barcode: str | None
+    ) -> list[SampleSheetEntry]:
+        """Return the samplesheet rows matching the given adaptor name and ONT barcode."""
+
+        subset_rows = []
+
+        for row in self.rows:
+            if row.adaptor.name == adaptor_name and row.ont_barcode == ont_barcode:
+                subset_rows.append(row)
+
+        return subset_rows
 
-        return samplesheet_map
 
     def __iter__(self):
         return iter(self.rows)
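
A minimal usage sketch of the reworked SampleSheet interface, mirroring the loop in run_demux(). The import path, the two new methods, and the SampleSheetEntry attributes (sample_name, adaptor, ont_barcode, fastq) are taken from this patch; the helper name iter_demux_groups is hypothetical and only included for illustration.

from anglerfish.demux.samplesheet import SampleSheet


def iter_demux_groups(ss: SampleSheet) -> None:
    """Walk a samplesheet the way run_demux() does after this patch."""
    # One demultiplexing pass per unique (adaptor name, ONT barcode) pairing
    for adaptor_name, ont_barcode in ss.get_adaptor_barcode_sets():
        # Rows sharing that pairing are processed together
        rows = ss.subset_rows(adaptor_name=adaptor_name, ont_barcode=ont_barcode)
        for row in rows:
            print(row.sample_name, row.adaptor.name, row.ont_barcode, row.fastq)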