naming, annotation and structuring
kedhammar committed May 24, 2024
1 parent c25a5c3 commit a589b41
Showing 1 changed file with 36 additions and 14 deletions: anglerfish/anglerfish.py
@@ -40,6 +40,7 @@
```
"""


def run_demux(args):
multiprocessing.set_start_method("spawn")

@@ -54,29 +55,49 @@ def run_demux(args):
log.info(f" version {version}")
log.info(f" arguments {vars(args)}")
log.info(f" run uuid {run_uuid}")
bc_dist = ss.minimum_bc_distance()
min_distance = ss.minimum_bc_distance()
if args.max_distance is None:
if bc_dist > 1:
if min_distance > 1:
args.max_distance = 2
else:
args.max_distance = 1
log.info(f"Using maximum edit distance of {args.max_distance}")
if args.max_distance >= bc_dist:
if args.max_distance >= min_distance:
log.error(
f" Edit distance of barcodes in samplesheet are less than the minimum specified {args.max_distance}>={bc_dist}"
f" Edit distance of barcodes in samplesheet are less than the minimum specified {args.max_distance}>={min_distance}"
)
exit()
log.debug(f"Samplesheet bc_dist == {bc_dist}")
log.debug(f"Samplesheet bc_dist == {min_distance}")
if args.threads > MAX_PROCESSES:
log.warning(
f" Setting threads to {MAX_PROCESSES} as the maximum number of processes is {MAX_PROCESSES}"
)
args.threads = MAX_PROCESSES

# Sort the adaptors by type and size
adaptors_t = [(entry.adaptor.name, entry.ont_barcode) for entry in ss]
adaptor_set = set(adaptors_t)
adaptors_sorted = dict([(i, []) for i in adaptor_set])
## Sort the adaptors by type and size

# Get a list of tuples with the adaptor name and ONT barcode
adaptor_tuples: list[tuple[str, str]] = [
(entry.adaptor.name, entry.ont_barcode) for entry in ss
]

# Convert to set to enforce uniqueness
adaptor_set: set[tuple[str, str]] = set(adaptor_tuples)

# Create a dictionary with the adaptors as keys and an empty list as value
adaptors_sorted: dict[tuple[str, str], list] = dict([(i, []) for i in adaptor_set])

# Populate the dictionary values with sample-specific information
"""
adaptors_sorted = {
( adaptor_name, ont_barcode ) : [
(sample_name, adaptor, fastq),
(sample_name, adaptor, fastq),
...
],
...
}
"""
for entry in ss:
adaptors_sorted[(entry.adaptor.name, entry.ont_barcode)].append(
(entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq))
@@ -87,18 +108,19 @@ def run_demux(args):
adaptor_name, ont_barcode = key
fastq_path = sample[0][2]
# If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name
adaptor_bc_name = adaptor_name
if ont_barcode:
adaptor_bc_name = adaptor_name + "_" + ont_barcode
adaptor_bc_name = f"{adaptor_name}_{ont_barcode}"
else:
adaptor_bc_name = adaptor_name
fastq_files = glob.glob(fastq_path)

# Align
aln_path = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf")
align_path = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf")
adaptor_path = os.path.join(args.out_fastq, f"{adaptor_name}.fasta")
with open(adaptor_path, "w") as f:
f.write(ss.get_fastastring(adaptor_name))
for fq in fastq_files:
run_minimap2(fq, adaptor_path, aln_path, args.threads)
run_minimap2(fq, adaptor_path, align_path, args.threads)

# Easy line count in input fastq files
num_fq = 0
@@ -107,7 +129,7 @@
for i in f:
num_fq += 1
num_fq = int(num_fq / 4)
paf_entries = parse_paf_lines(aln_path)
paf_entries = parse_paf_lines(align_path)

# Make stats
log.info(f" Searching for adaptor hits in {adaptor_bc_name}")
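For context on the `min_distance` / `max_distance` check in the diff above: the allowed edit distance has to stay strictly below the smallest pairwise distance between barcodes, otherwise a read could match more than one barcode within tolerance. Below is a minimal standalone sketch of that rationale, not the anglerfish implementation; the `levenshtein` helper and the example barcode sequences are illustrative stand-ins for what `ss.minimum_bc_distance()` provides from the samplesheet.

```python
from itertools import combinations


def levenshtein(a: str, b: str) -> int:
    """Classic dynamic-programming edit distance."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(
                min(
                    prev[j] + 1,                # deletion
                    curr[j - 1] + 1,            # insertion
                    prev[j - 1] + (ca != cb),   # substitution or match
                )
            )
        prev = curr
    return prev[-1]


# Illustrative barcode sequences; a real samplesheet supplies these
barcodes = [
    "AAGAAAGTTGTCGGTGTCTTTGTG",
    "TCGATTCCGTTTGTAGTCGTCTGT",
    "GAGTCTTGTGTCCCAGTTACCAGG",
]

min_distance = min(levenshtein(a, b) for a, b in combinations(barcodes, 2))

# Same default rule as in run_demux above
max_distance = 2 if min_distance > 1 else 1

# Same sanity check as in run_demux: the tolerance must stay below the
# minimum pairwise barcode distance, or demultiplexing would be ambiguous
if max_distance >= min_distance:
    raise SystemExit("max_distance must be smaller than the minimum barcode distance")

print(f"minimum pairwise barcode distance: {min_distance}")
print(f"using max edit distance: {max_distance}")
```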

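And a minimal standalone sketch of the `adaptors_sorted` grouping that the new comments and structure docstring in this commit describe: samplesheet rows keyed by `(adaptor_name, ont_barcode)`. The `Row` namedtuple and the example values are hypothetical stand-ins for anglerfish's samplesheet entries, which carry full adaptor objects and fastq globs rather than plain strings.

```python
from collections import namedtuple

# Hypothetical stand-in for a samplesheet entry, reduced to plain strings
Row = namedtuple("Row", ["sample_name", "adaptor_name", "ont_barcode", "fastq"])

ss = [
    Row("sample1", "truseq", "barcode01", "/data/bc01.fastq.gz"),
    Row("sample2", "truseq", "barcode01", "/data/bc01.fastq.gz"),
    Row("sample3", "truseq", "barcode02", "/data/bc02.fastq.gz"),
]

# Unique (adaptor name, ONT barcode) combinations become the dictionary keys
adaptor_set: set[tuple[str, str]] = {(r.adaptor_name, r.ont_barcode) for r in ss}
adaptors_sorted: dict[tuple[str, str], list] = {key: [] for key in adaptor_set}

# Each key collects the samples sharing that adaptor/barcode combination,
# matching the structure sketched in the docstring above
for r in ss:
    adaptors_sorted[(r.adaptor_name, r.ont_barcode)].append(
        (r.sample_name, r.adaptor_name, r.fastq)
    )

for (adaptor_name, ont_barcode), samples in sorted(adaptors_sorted.items()):
    adaptor_bc_name = f"{adaptor_name}_{ont_barcode}" if ont_barcode else adaptor_name
    print(f"{adaptor_bc_name}: {len(samples)} sample(s)")
```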