Skip to content

Commit

Permalink
propagate changes to samplesheet.py
Browse files: browse the repository at this point in the history
  • Loading branch information
kedhammar committed May 24, 2024
1 parent f117a5d commit 7ae19dc
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 42 deletions.
17 changes: 6 additions & 11 deletions anglerfish/demux/adaptor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,18 @@ class Adaptor:
def __init__(
self,
name: str,
i5_token: str,
i7_token: str,
adaptors: dict,
i7_index: str | None = None,
i5_index: str | None = None,
):
self.name: str = name
self.i5_token = (adaptors[name]["i5"],)
self.i7_token = (adaptors[name]["i7"],)
self.index_token: str = INDEX_TOKEN

# i5 attributes
self.i5 = AdaptorPart(
sequence_token=i5_token,
sequence_token=self.i5_token,
name=name,
index=i5_index,
)
Expand All @@ -39,7 +40,7 @@ def __init__(

# i7 attributes
self.i7 = AdaptorPart(
sequence_token=i7_token,
sequence_token=self.i7_token,
name=name,
index=i7_index,
)
Expand Down Expand Up @@ -183,11 +184,5 @@ def load_adaptors(raw: bool = False) -> list[Adaptor] | dict:
else:
adaptors = []
for adaptor_name in adaptors_dict:
adaptors.append(
Adaptor(
name=adaptor_name,
i5_token=adaptors_dict[adaptor_name]["i5"],
i7_token=adaptors_dict[adaptor_name]["i7"],
)
)
adaptors.append(Adaptor(name=adaptor_name, adaptors=adaptors_dict))
return adaptors
77 changes: 46 additions & 31 deletions anglerfish/demux/samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,7 @@

from anglerfish.demux.adaptor import Adaptor, load_adaptors

idelim = re.compile(r"\<N\>")
udelim = re.compile(r"(\<U\d+\>)")
ulen = re.compile(r"\<U(\d+)\>")
adaptors = load_adaptors(raw=True)
# This is some leftover ugliness from a merge conflict to reconcile the old and new adaptor classes
delim = "<N>"


@dataclass
Expand All @@ -25,10 +20,14 @@ class SampleSheetEntry:


class SampleSheet:
def __init__(self, input_csv, ont_bc):
# Read samplesheet in format:
# sample_name, adaptors, i7_index(-i5_index), fastq_path
# If we are demuxing a run with ONT barcodes, we will have to assume fastq files are located in "barcode##" folders
def __init__(self, input_csv: str, ont_barcodes_enabled: bool):
"""Read samplesheet in format:
sample_name, adaptors, i7_index(-i5_index), fastq_path
If we are demuxing a run with ONT barcodes, we will have to assume
fastq files are located in "barcode##" folders.
"""

self.samplesheet = []
try:
Expand All @@ -40,54 +39,68 @@ def __init__(self, input_csv, ont_bc):
fieldnames=["sample_name", "adaptors", "index", "fastq_path"],
dialect=dialect,
)
rn = 1
row_number = 1

test_globs = {}
for row in data:
if row["adaptors"] not in adaptors:
raise UserWarning(
f"'{row['adaptors']}' not in the list of valid adaptors: {adaptors.keys()}"
)
i7i5 = row["index"].split("-")
i7 = i7i5[0]
i5 = None
if len(i7i5) > 1:
i5 = i7i5[1]
i7i5_split = row["index"].split("-")
i7_index = i7i5_split[0]
if len(i7i5_split) > 1:
i5_index = i7i5_split[1]
else:
i5_index = None

sample_name = row["sample_name"]
test_globs[row["fastq_path"]] = glob.glob(row["fastq_path"])

bc_re = re.compile(r"\/(barcode\d\d|unclassified)\/")
ont_barcode = None
if ont_bc:
ob = re.findall(bc_re, row["fastq_path"])
barcode_dir_pattern = re.compile(r"\/(barcode\d\d|unclassified)\/")

if ont_barcodes_enabled:
barcode_dir_match = re.findall(
barcode_dir_pattern, row["fastq_path"]
)
assert (
len(ob) > 0 and len(ob[0][-1]) > 0
len(barcode_dir_match) > 0 and len(barcode_dir_match[0][-1]) > 0
), "ONT barcode not found in fastq path. In ONT barcode mode (-n), fastq files must be located in barcode## folders"
ont_barcode = ob[0]
ont_barcode = barcode_dir_match[0]
else:
ont_barcode = None

ss_entry = SampleSheetEntry(
sample_name,
Adaptor(adaptors, delim, row["adaptors"], i7, i5),
Adaptor(
name=row["adaptors"],
adaptors=adaptors,
i5_index=i5_index,
i7_index=i7_index,
),
row["fastq_path"],
ont_barcode,
)
self.samplesheet.append(ss_entry)
rn += 1
row_number += 1

# Explanation: Don't mess around with the globs too much. Don't refer to the same file twice but using globs,
# e.g, ./input.fastq and ./[i]nput.fastq
# Explanation: Keep the glob patterns simple.
# Don't refer to the same file twice through different globs, e.g. ./input.fastq and ./[i]nput.fastq
for a, b in combinations(test_globs.values(), 2):
if len(set(a) & set(b)) > 0:
raise UserWarning(
"Fastq paths are inconsistent. Please check samplesheet"
"Fastq paths are inconsistent. Please check samplesheet."
)

if not ont_bc and len(set([v[0] for v in test_globs.values()])) > 1:
if (
not ont_barcodes_enabled
and len(set([v[0] for v in test_globs.values()])) > 1
):
raise UserWarning(
"""Found several different fastq files in samplesheet. Please carefully check any glob patterns.
If you are using ONT barcodes, please specify the --ont_barcodes flag. Or if you are trying to input several
sets of fastqs into anglerfish, please run anglerfish separately for each set."""
"Found several different fastq files in samplesheet. Please carefully check any glob patterns."
+ " If you are using ONT barcodes, please specify the --ont_barcodes flag."
+ " Or if you are trying to input several sets of fastqs into anglerfish,"
+ " please run anglerfish separately for each set."
)

except:
Expand All @@ -96,7 +109,9 @@ def __init__(self, input_csv, ont_bc):
csvfile.close()

def minimum_bc_distance(self):
"""Compute the minimum edit distance between all barcodes in samplesheet, or within each ONT barcode group"""
"""Compute the minimum edit distance between all barcodes in samplesheet,
or within each ONT barcode group.
"""

ss_by_bc = {}
testset = {}
Expand Down

0 comments on commit 7ae19dc

Please sign in to comment.