propagate changes to samplesheet.py

kedhammar · May 24, 2024 · 7ae19dc · 7ae19dc
1 parent f117a5d
commit 7ae19dc
Show file tree

Hide file tree

Showing 2 changed files with 52 additions and 42 deletions.
diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py
@@ -18,17 +18,18 @@ class Adaptor:
     def __init__(
         self,
         name: str,
-        i5_token: str,
-        i7_token: str,
+        adaptors: dict,
         i7_index: str | None = None,
         i5_index: str | None = None,
     ):
         self.name: str = name
+        self.i5_token = (adaptors[name]["i5"],)
+        self.i7_token = (adaptors[name]["i7"],)
         self.index_token: str = INDEX_TOKEN
 
         # i5 attributes
         self.i5 = AdaptorPart(
-            sequence_token=i5_token,
+            sequence_token=self.i5_token,
             name=name,
             index=i5_index,
         )
@@ -39,7 +40,7 @@ def __init__(
 
         # i7 attributes
         self.i7 = AdaptorPart(
-            sequence_token=i7_token,
+            sequence_token=self.i7_token,
             name=name,
             index=i7_index,
         )
@@ -183,11 +184,5 @@ def load_adaptors(raw: bool = False) -> list[Adaptor] | dict:
     else:
         adaptors = []
         for adaptor_name in adaptors_dict:
-            adaptors.append(
-                Adaptor(
-                    name=adaptor_name,
-                    i5_token=adaptors_dict[adaptor_name]["i5"],
-                    i7_token=adaptors_dict[adaptor_name]["i7"],
-                )
-            )
+            adaptors.append(Adaptor(name=adaptor_name, adaptors=adaptors_dict))
         return adaptors
diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py
@@ -8,12 +8,7 @@
 
 from anglerfish.demux.adaptor import Adaptor, load_adaptors
 
-idelim = re.compile(r"\<N\>")
-udelim = re.compile(r"(\<U\d+\>)")
-ulen = re.compile(r"\<U(\d+)\>")
 adaptors = load_adaptors(raw=True)
-# This is some leftover ugliness from a merge conflict to reconcile the old and new adaptor classes
-delim = "<N>"
 
 
 @dataclass
@@ -25,10 +20,14 @@ class SampleSheetEntry:
 
 
 class SampleSheet:
-    def __init__(self, input_csv, ont_bc):
-        # Read samplesheet in format:
-        # sample_name, adaptors, i7_index(-i5_index), fastq_path
-        # If we are demuxing a run with ONT barcodes, we will have to assume fastq files are located in "barcode##" folders
+    def __init__(self, input_csv: str, ont_barcodes_enabled: bool):
+        """Read samplesheet in format:
+
+            sample_name, adaptors, i7_index(-i5_index), fastq_path
+
+        If we are demuxing a run with ONT barcodes, we will have to assume
+        fastq files are located in "barcode##" folders.
+        """
 
         self.samplesheet = []
         try:
@@ -40,54 +39,68 @@ def __init__(self, input_csv, ont_bc):
                 fieldnames=["sample_name", "adaptors", "index", "fastq_path"],
                 dialect=dialect,
             )
-            rn = 1
+            row_number = 1
 
             test_globs = {}
             for row in data:
                 if row["adaptors"] not in adaptors:
                     raise UserWarning(
                         f"'{row['adaptors']}' not in the list of valid adaptors: {adaptors.keys()}"
                     )
-                i7i5 = row["index"].split("-")
-                i7 = i7i5[0]
-                i5 = None
-                if len(i7i5) > 1:
-                    i5 = i7i5[1]
+                i7i5_split = row["index"].split("-")
+                i7_index = i7i5_split[0]
+                if len(i7i5_split) > 1:
+                    i5_index = i7i5_split[1]
+                else:
+                    i5_index = None
 
                 sample_name = row["sample_name"]
                 test_globs[row["fastq_path"]] = glob.glob(row["fastq_path"])
 
-                bc_re = re.compile(r"\/(barcode\d\d|unclassified)\/")
-                ont_barcode = None
-                if ont_bc:
-                    ob = re.findall(bc_re, row["fastq_path"])
+                barcode_dir_pattern = re.compile(r"\/(barcode\d\d|unclassified)\/")
+
+                if ont_barcodes_enabled:
+                    barcode_dir_match = re.findall(
+                        barcode_dir_pattern, row["fastq_path"]
+                    )
                     assert (
-                        len(ob) > 0 and len(ob[0][-1]) > 0
+                        len(barcode_dir_match) > 0 and len(barcode_dir_match[0][-1]) > 0
                     ), "ONT barcode not found in fastq path. In ONT barcode mode (-n), fastq files must be located in barcode## folders"
-                    ont_barcode = ob[0]
+                    ont_barcode = barcode_dir_match[0]
+                else:
+                    ont_barcode = None
 
                 ss_entry = SampleSheetEntry(
                     sample_name,
-                    Adaptor(adaptors, delim, row["adaptors"], i7, i5),
+                    Adaptor(
+                        name=row["adaptors"],
+                        adaptors=adaptors,
+                        i5_index=i5_index,
+                        i7_index=i7_index,
+                    ),
                     row["fastq_path"],
                     ont_barcode,
                 )
                 self.samplesheet.append(ss_entry)
-                rn += 1
+                row_number += 1
 
-            # Explanation: Don't mess around with the globs too much. Don't refer to the same file twice but using globs,
-            # e.g, ./input.fastq and ./[i]nput.fastq
+            # Explanation: Don't mess around with the globs too much.
+            # Don't refer to the same file twice but using globs, e.g, ./input.fastq and ./[i]nput.fastq
             for a, b in combinations(test_globs.values(), 2):
                 if len(set(a) & set(b)) > 0:
                     raise UserWarning(
-                        "Fastq paths are inconsistent. Please check samplesheet"
+                        "Fastq paths are inconsistent. Please check samplesheet."
                     )
 
-            if not ont_bc and len(set([v[0] for v in test_globs.values()])) > 1:
+            if (
+                not ont_barcodes_enabled
+                and len(set([v[0] for v in test_globs.values()])) > 1
+            ):
                 raise UserWarning(
-                    """Found several different fastq files in samplesheet. Please carefully check any glob patterns. 
-                                  If you are using ONT barcodes, please specify the --ont_barcodes flag. Or if you are trying to input several 
-                                  sets of fastqs into anglerfish, please run anglerfish separately for each set."""
+                    "Found several different fastq files in samplesheet. Please carefully check any glob patterns."
+                    + " If you are using ONT barcodes, please specify the --ont_barcodes flag."
+                    + " Or if you are trying to input several sets of fastqs into anglerfish,"
+                    + " please run anglerfish separately for each set."
                 )
 
         except:
@@ -96,7 +109,9 @@ def __init__(self, input_csv, ont_bc):
             csvfile.close()
 
     def minimum_bc_distance(self):
-        """Compute the minimum edit distance between all barcodes in samplesheet, or within each ONT barcode group"""
+        """Compute the minimum edit distance between all barcodes in samplesheet,
+        or within each ONT barcode group.
+        """
 
         ss_by_bc = {}
         testset = {}