Skip to content

Commit

Permalink
wip simplify samplesheet iteration
Browse files Browse the repository at this point in the history
  • Loading branch information
kedhammar committed Jul 1, 2024
1 parent 6685fe8 commit d2eacdc
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 59 deletions.
39 changes: 19 additions & 20 deletions anglerfish/anglerfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,41 +95,40 @@ def run_demux(args):
exit()
log.debug(f"Samplesheet bc_dist == {min_distance}")

# Get samplesheet map
samplesheet_map: dict[tuple[str, str], list[tuple[str, Adaptor, str]]] = (
ss.map_adaptor_barcode_pairs_to_sample_info()
)
# Get adaptor-barcode combinations from samplesheet
adaptor_barcode_sets = ss.get_adaptor_barcode_sets()

# Iterate across samplesheet map and process samples
# Iterate across samplesheet rows conforming to the adaptor-barcode combinations
out_fastqs = []
for key, sample_maps in samplesheet_map.items():
adaptor_name, ont_barcode = key
for adaptor_name, ont_barcode in adaptor_barcode_sets:

subset_rows = ss.subset_rows(adaptor_name=adaptor_name, ont_barcode=ont_barcode)

# Grab the fastq path of the first sample
fastq_path = sample_maps[0][2]
fastq_path = subset_rows[0].fastq
fastq_files = glob.glob(fastq_path)

# If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name
if ont_barcode:
adaptor_bc_name = f"{adaptor_name}_{ont_barcode}"
else:
adaptor_bc_name = adaptor_name
fastq_files = glob.glob(fastq_path)


# Align
alignment_path: str = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf")
adaptor_fasta_path: str = os.path.join(args.out_fastq, f"{adaptor_name}.fasta")
with open(adaptor_fasta_path, "w") as f:
f.write(ss.get_fastastring(adaptor_name))
for fq in fastq_files:
run_minimap2(fq, adaptor_fasta_path, alignment_path, args.threads)
for fq_file in fastq_files:
run_minimap2(fq_file, adaptor_fasta_path, alignment_path, args.threads)

# Easy line count in input fastq files
num_fq = 0
for fq in fastq_files:
with gzip.open(fq, "rb") as f:
num_fq_lines = 0
for fq_file in fastq_files:
with gzip.open(fq_file, "rb") as f:
for i in f:
num_fq += 1
num_fq = int(num_fq / 4)
num_fq_lines += 1
num_fq = int(num_fq_lines / 4)
reads_to_alns: dict[str, list[Alignment]] = map_reads_to_alns(alignment_path)

# Make stats
Expand Down Expand Up @@ -159,7 +158,7 @@ def run_demux(args):
f" Force reverse complementing {args.force_rc} index for adaptor {adaptor_name}. Lenient mode is disabled"
)
no_matches, matches = cluster_matches(
samplesheet_map[key],
subset_rows,
fragments,
args.max_distance,
**flips[args.force_rc],
Expand All @@ -176,7 +175,7 @@ def run_demux(args):
spawn = pool.apply_async(
cluster_matches,
args=(
samplesheet_map[key],
subset_rows,
fragments,
args.max_distance,
rev["i7_reversed"],
Expand Down Expand Up @@ -215,7 +214,7 @@ def run_demux(args):
no_matches, matches = flipped["original"]
else:
no_matches, matches = cluster_matches(
samplesheet_map[key], fragments, args.max_distance
subset_rows, fragments, args.max_distance
)

out_pool = []
Expand Down
29 changes: 15 additions & 14 deletions anglerfish/demux/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from Bio.Seq import Seq
from Bio.SeqIO.QualityIO import FastqGeneralIterator

from anglerfish.demux.adaptor import Adaptor
from anglerfish.demux.samplesheet import SampleSheetEntry

log = logging.getLogger("anglerfish")

Expand Down Expand Up @@ -217,7 +217,7 @@ def categorize_matches(


def cluster_matches(
sample_adaptor: list[tuple[str, Adaptor, str]],
rows: list[SampleSheetEntry],
matches: dict,
max_distance: int,
i7_reversed: bool = False,
Expand Down Expand Up @@ -247,29 +247,30 @@ def cluster_matches(
dists = []
fi5 = ""
fi7 = ""
for _, adaptor, _ in sample_adaptor:
if adaptor.i5.index_seq is not None:
i5_seq = adaptor.i5.index_seq
for row in rows:

if row.adaptor.i5.index_seq is not None:
i5_seq = row.adaptor.i5.index_seq
if i5_reversed and i5_seq is not None:
i5_seq = str(Seq(i5_seq).reverse_complement())
fi5, d1 = parse_cs(
i5_aln.cs,
i5_seq,
adaptor.i5.len_umi_before_index,
adaptor.i5.len_umi_after_index,
row.adaptor.i5.len_umi_before_index,
row.adaptor.i5.len_umi_after_index,
)
else:
d1 = 0

if adaptor.i7.index_seq is not None:
i7_seq = adaptor.i7.index_seq
if row.adaptor.i7.index_seq is not None:
i7_seq = row.adaptor.i7.index_seq
if i7_reversed and i7_seq is not None:
i7_seq = str(Seq(i7_seq).reverse_complement())
fi7, d2 = parse_cs(
i7_aln.cs,
i7_seq,
adaptor.i7.len_umi_before_index,
adaptor.i7.len_umi_after_index,
row.adaptor.i7.len_umi_before_index,
row.adaptor.i7.len_umi_after_index,
)
else:
d2 = 0
Expand All @@ -286,15 +287,15 @@ def cluster_matches(
continue
if dists[index_min] > max_distance:
# Find only full length i7(+i5) adaptor combos. Basically a list of "known unknowns"
if len(fi7) + len(fi5) == len(adaptor.i7.index_seq or "") + len(
adaptor.i5.index_seq or ""
if len(fi7) + len(fi5) == len(row.adaptor.i7.index_seq or "") + len(
row.adaptor.i5.index_seq or ""
):
fi75 = "+".join([i for i in [fi7, fi5] if not i == ""])
unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."])
continue
matched[read] = alignments
matched_bed.append(
[read, start_insert, end_insert, sample_adaptor[index_min][0], "999", "."]
[read, start_insert, end_insert, dists[index_min][0], "999", "."]
)
log.debug(f" Matched {len(matched)} reads, unmatched {len(unmatched_bed)} reads")
return unmatched_bed, matched_bed
Expand Down
44 changes: 19 additions & 25 deletions anglerfish/demux/samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,37 +160,31 @@ def get_fastastring(self, adaptor_name: str | None = None) -> str:

return outstr

def map_adaptor_barcode_pairs_to_sample_info(
def get_adaptor_barcode_sets(
self,
) -> dict[tuple[str, str], list[tuple[str, Adaptor, str]]]:
"""Create a map of unique adaptor-barcode pairings to a list of tuples
each containing the necessary information to process a given sample.
samplesheet_map = {
adaptor_name_str, ont_barcode_str ) : [ # Unique adaptor-barcode pairing
(sample1_name_str, Adaptor, fastq1_str), # Info to process sample1
(sample2_name_str, Adaptor, fastq2_str), # Info to process sample2
...
],
...
}
"""
) -> list[tuple[str, str]]:
"""Return a set of unique adaptor-barcode pairings in the samplesheet."""

# Get a set corresponding to the unique pairings of adaptors and ONT barcodes in the samplesheet
adaptor_barcode_tuples: set[tuple[str, str]] = set(
[(entry.adaptor.name, entry.ont_barcode) for entry in self]
adaptor_barcode_sets: list[tuple[str, str]] = list(
set([(row.adaptor.name, row.ont_barcode) for row in self.rows])
)

# Map the unique adaptor-barcode pairings to a list of tuples containing the sample name, Adaptor object, and fastq path
samplesheet_map: dict[tuple[str, str], list[tuple[str, Adaptor, str]]] = dict(
[(i, []) for i in adaptor_barcode_tuples]
)
for entry in self:
samplesheet_map[(entry.adaptor.name, entry.ont_barcode)].append(
(entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq))
)
return adaptor_barcode_sets

def subset_rows(self, adaptor_name: str, ont_barcode: str|None) -> list[SampleSheetEntry]:
"""Return a subset of samplesheet rows based on logical criteria."""

subset_rows = []

for row in self.rows:
if row.adaptor.name == adaptor_name and ont_barcode == row.ont_barcode:
subset_rows.append(row)
else:
continue

return subset_rows

return samplesheet_map

def __iter__(self):
return iter(self.rows)
Expand Down

0 comments on commit d2eacdc

Please sign in to comment.