wip simplify samplesheet iteration

kedhammar · Jul 1, 2024 · d2eacdc · d2eacdc
1 parent 6685fe8
commit d2eacdc
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 59 deletions.
diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py
@@ -95,41 +95,40 @@ def run_demux(args):
         exit()
     log.debug(f"Samplesheet bc_dist == {min_distance}")
 
-    # Get samplesheet map
-    samplesheet_map: dict[tuple[str, str], list[tuple[str, Adaptor, str]]] = (
-        ss.map_adaptor_barcode_pairs_to_sample_info()
-    )
+    # Get adaptor-barcode combinations from samplesheet
+    adaptor_barcode_sets = ss.get_adaptor_barcode_sets()
 
-    # Iterate across samplesheet map and process samples
+    # Iterate across samplesheet rows conforming to the adaptor-barcode combinations
     out_fastqs = []
-    for key, sample_maps in samplesheet_map.items():
-        adaptor_name, ont_barcode = key
+    for adaptor_name, ont_barcode in adaptor_barcode_sets:
+
+        subset_rows = ss.subset_rows(adaptor_name=adaptor_name, ont_barcode=ont_barcode)
 
         # Grab the fastq path of the first sample
-        fastq_path = sample_maps[0][2]
+        fastq_path = subset_rows[0].fastq
+        fastq_files = glob.glob(fastq_path) 
 
         # If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name
         if ont_barcode:
             adaptor_bc_name = f"{adaptor_name}_{ont_barcode}"
         else:
             adaptor_bc_name = adaptor_name
-        fastq_files = glob.glob(fastq_path)
-
+
         # Align
         alignment_path: str = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf")
         adaptor_fasta_path: str = os.path.join(args.out_fastq, f"{adaptor_name}.fasta")
         with open(adaptor_fasta_path, "w") as f:
             f.write(ss.get_fastastring(adaptor_name))
-        for fq in fastq_files:
-            run_minimap2(fq, adaptor_fasta_path, alignment_path, args.threads)
+        for fq_file in fastq_files:
+            run_minimap2(fq_file, adaptor_fasta_path, alignment_path, args.threads)
 
         # Easy line count in input fastq files
-        num_fq = 0
-        for fq in fastq_files:
-            with gzip.open(fq, "rb") as f:
+        num_fq_lines = 0
+        for fq_file in fastq_files:
+            with gzip.open(fq_file, "rb") as f:
                 for i in f:
-                    num_fq += 1
-        num_fq = int(num_fq / 4)
+                    num_fq_lines += 1
+        num_fq = int(num_fq_lines / 4)
         reads_to_alns: dict[str, list[Alignment]] = map_reads_to_alns(alignment_path)
 
         # Make stats
@@ -159,7 +158,7 @@ def run_demux(args):
                 f" Force reverse complementing {args.force_rc} index for adaptor {adaptor_name}. Lenient mode is disabled"
             )
             no_matches, matches = cluster_matches(
-                samplesheet_map[key],
+                subset_rows,
                 fragments,
                 args.max_distance,
                 **flips[args.force_rc],
@@ -176,7 +175,7 @@ def run_demux(args):
                 spawn = pool.apply_async(
                     cluster_matches,
                     args=(
-                        samplesheet_map[key],
+                        subset_rows,
                         fragments,
                         args.max_distance,
                         rev["i7_reversed"],
@@ -215,7 +214,7 @@ def run_demux(args):
                 no_matches, matches = flipped["original"]
         else:
             no_matches, matches = cluster_matches(
-                samplesheet_map[key], fragments, args.max_distance
+                subset_rows, fragments, args.max_distance
             )
 
         out_pool = []

diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py
@@ -10,7 +10,7 @@
 from Bio.Seq import Seq
 from Bio.SeqIO.QualityIO import FastqGeneralIterator
 
-from anglerfish.demux.adaptor import Adaptor
+from anglerfish.demux.samplesheet import SampleSheetEntry
 
 log = logging.getLogger("anglerfish")
 
@@ -217,7 +217,7 @@ def categorize_matches(
 
 
 def cluster_matches(
-    sample_adaptor: list[tuple[str, Adaptor, str]],
+    rows: list[SampleSheetEntry],
     matches: dict,
     max_distance: int,
     i7_reversed: bool = False,
@@ -247,29 +247,30 @@ def cluster_matches(
         dists = []
         fi5 = ""
         fi7 = ""
-        for _, adaptor, _ in sample_adaptor:
-            if adaptor.i5.index_seq is not None:
-                i5_seq = adaptor.i5.index_seq
+        for row in rows:
+
+            if row.adaptor.i5.index_seq is not None:
+                i5_seq = row.adaptor.i5.index_seq
                 if i5_reversed and i5_seq is not None:
                     i5_seq = str(Seq(i5_seq).reverse_complement())
                 fi5, d1 = parse_cs(
                     i5_aln.cs,
                     i5_seq,
-                    adaptor.i5.len_umi_before_index,
-                    adaptor.i5.len_umi_after_index,
+                    row.adaptor.i5.len_umi_before_index,
+                    row.adaptor.i5.len_umi_after_index,
                 )
             else:
                 d1 = 0
 
-            if adaptor.i7.index_seq is not None:
-                i7_seq = adaptor.i7.index_seq
+            if row.adaptor.i7.index_seq is not None:
+                i7_seq = row.adaptor.i7.index_seq
                 if i7_reversed and i7_seq is not None:
                     i7_seq = str(Seq(i7_seq).reverse_complement())
                 fi7, d2 = parse_cs(
                     i7_aln.cs,
                     i7_seq,
-                    adaptor.i7.len_umi_before_index,
-                    adaptor.i7.len_umi_after_index,
+                    row.adaptor.i7.len_umi_before_index,
+                    row.adaptor.i7.len_umi_after_index,
                 )
             else:
                 d2 = 0
@@ -286,15 +287,15 @@ def cluster_matches(
             continue
         if dists[index_min] > max_distance:
             # Find only full length i7(+i5) adaptor combos. Basically a list of "known unknowns"
-            if len(fi7) + len(fi5) == len(adaptor.i7.index_seq or "") + len(
-                adaptor.i5.index_seq or ""
+            if len(fi7) + len(fi5) == len(row.adaptor.i7.index_seq or "") + len(
+                row.adaptor.i5.index_seq or ""
             ):
                 fi75 = "+".join([i for i in [fi7, fi5] if not i == ""])
                 unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."])
             continue
         matched[read] = alignments
         matched_bed.append(
-            [read, start_insert, end_insert, sample_adaptor[index_min][0], "999", "."]
+            [read, start_insert, end_insert, dists[index_min][0], "999", "."]
         )
     log.debug(f" Matched {len(matched)} reads, unmatched {len(unmatched_bed)} reads")
     return unmatched_bed, matched_bed

diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py
@@ -160,37 +160,31 @@ def get_fastastring(self, adaptor_name: str | None = None) -> str:
 
         return outstr
 
-    def map_adaptor_barcode_pairs_to_sample_info(
+    def get_adaptor_barcode_sets(
         self,
-    ) -> dict[tuple[str, str], list[tuple[str, Adaptor, str]]]:
-        """Create a map of unique adaptor-barcode pairings to a list of tuples
-        each containing the necessary information to process a given sample.
-
-            samplesheet_map = {
-                adaptor_name_str, ont_barcode_str ) : [    # Unique adaptor-barcode pairing
-                    (sample1_name_str, Adaptor, fastq1_str),  # Info to process sample1
-                    (sample2_name_str, Adaptor, fastq2_str),  # Info to process sample2
-                    ...
-                ],
-                ...
-            }
-        """
+    ) -> list[tuple[str, str]]:
+        """Return a set of unique adaptor-barcode pairings in the samplesheet."""
 
         # Get a set corresponding to the unique pairings of adaptors and ONT barcodes in the samplesheet
-        adaptor_barcode_tuples: set[tuple[str, str]] = set(
-            [(entry.adaptor.name, entry.ont_barcode) for entry in self]
+        adaptor_barcode_sets: list[tuple[str, str]] = list(
+            set([(row.adaptor.name, row.ont_barcode) for row in self.rows])
         )
 
-        # Map the unique adaptor-barcode pairings to a list of tuples containing the sample name, Adaptor object, and fastq path
-        samplesheet_map: dict[tuple[str, str], list[tuple[str, Adaptor, str]]] = dict(
-            [(i, []) for i in adaptor_barcode_tuples]
-        )
-        for entry in self:
-            samplesheet_map[(entry.adaptor.name, entry.ont_barcode)].append(
-                (entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq))
-            )
+        return adaptor_barcode_sets
+
+    def subset_rows(self, adaptor_name: str, ont_barcode: str|None) -> list[SampleSheetEntry]:
+        """Return a subset of samplesheet rows based on logical criteria."""
+
+        subset_rows = []
+
+        for row in self.rows:
+            if row.adaptor.name == adaptor_name and ont_barcode == row.ont_barcode:
+                subset_rows.append(row)
+            else:
+                continue
+
+        return subset_rows
 
-        return samplesheet_map
 
     def __iter__(self):
         return iter(self.rows)