diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml index 360bbe2..a2304e9 100644 --- a/.github/workflows/lint-code.yml +++ b/.github/workflows/lint-code.yml @@ -63,6 +63,60 @@ jobs: # Configured in pyprojet.toml run: mypy . + # Use pipreqs to check for missing dependencies + pipreqs-check: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pipreqs + pip install -r requirements.txt + + - name: Run pipreqs + run: | + pipreqs --savepath pipreqs.txt 2>&1 | tee pipreqs_output.log + if grep -q 'WARNING: Package .* does not exist or network problems' pipreqs_output.log; then + missing_packages=$(grep 'WARNING: Package .* does not exist or network problems' pipreqs_output.log | sed -E 's/.*Package "(.*)" does not exist.*/\1/') + echo "ERROR: Add unresolved packages to requirements. Missing package(s): $missing_packages. Example: ' @ git+https://github.com//.git'" + exit 1 + fi + + - name: Compare requirements + run: | + # Extract and sort package names + awk -F'(=|==|>|>=|<|<=| @ )' '{print $1}' requirements.txt | tr '[:upper:]' '[:lower:]' | sort -u > requirements.compare + awk -F'(=|==|>|>=|<|<=| @ )' '{print $1}' pipreqs.txt | tr '[:upper:]' '[:lower:]' | sort -u > pipreqs.compare + + # Compare package lists + if cmp -s requirements.compare pipreqs.compare + then + echo "Requirements are the same" + + exit 0 + else + echo "Requirements are different" + echo "" + + echo "=== current requirements.txt ===" + echo "" + cat requirements.compare + echo "" + + echo "=== pipreqs requirements ===" + echo "" + cat pipreqs.compare + + exit 1 + fi + # Use Prettier to check various file formats prettier: runs-on: ubuntu-latest diff --git a/README.md b/README.md index e4ffa20..e91a8b5 100644 --- a/README.md +++ b/README.md @@ -187,3 +187,13 @@ Also, the [Anglerfish logo](docs/Anglerfish_logo.svg) was designed by [@FranBona

+ +### Contributors + +- [@remiolsen](https://github.com/remiolsen) +- [@FranBonath](https://github.com/FranBonath) +- [@taborsak](https://github.com/taborsak) +- [@ssjunnebo](https://github.com/ssjunnebo) +- Carl Rubin +- [@alneberg](https://github.com/alneberg) +- [@kedhammar](https://github.com/kedhammar) diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py index 4dfa150..3d6b582 100755 --- a/anglerfish/anglerfish.py +++ b/anglerfish/anglerfish.py @@ -7,17 +7,17 @@ import sys import uuid from collections import Counter +from importlib.metadata import version as get_version from itertools import groupby import Levenshtein as lev import numpy as np -import pkg_resources -from .demux.adaptor import Adaptor from .demux.demux import ( + Alignment, + categorize_matches, cluster_matches, - layout_matches, - parse_paf_lines, + map_reads_to_alns, run_minimap2, write_demuxedfastq, ) @@ -44,19 +44,42 @@ def run_demux(args): + # Boilerplate multiprocessing.set_start_method("spawn") + version = get_version("bio-anglerfish") + run_uuid = str(uuid.uuid4()) + # Parse arguments if args.debug: log.setLevel(logging.DEBUG) - run_uuid = str(uuid.uuid4()) - ss = SampleSheet(args.samplesheet, args.ont_barcodes) - version = pkg_resources.get_distribution("bio-anglerfish").version - report = Report(args.run_name, run_uuid, version) + + if args.threads > MAX_PROCESSES: + log.warning( + f" Setting threads to {MAX_PROCESSES} as the maximum number of processes is {MAX_PROCESSES}" + ) + args.threads = MAX_PROCESSES + + if os.path.exists(args.out_fastq): + raise FileExistsError( + f"Output folder '{args.out_fastq}' already exists. Please remove it or specify another --run_name" + ) + else: + os.mkdir(args.out_fastq) + + # Print logo and info sys.stderr.write(anglerfish_logo) log.info(f" version {version}") log.info(f" arguments {vars(args)}") log.info(f" run uuid {run_uuid}") + + # Instantiate samplesheet and report + ss = SampleSheet(args.samplesheet, args.ont_barcodes) + report = Report(args.run_name, run_uuid, version) + + # Calculate the minimum edit distance between indices in the samplesheet min_distance = ss.minimum_bc_distance() + + # Determine which edit distance to use for index matching if args.max_distance is None: # Default: Set the maximum distance for barcode matching to 0, 1 or 2 # depending on the smallest detected edit distance between indices in the samplesheet @@ -70,88 +93,55 @@ def run_demux(args): ) exit() log.debug(f"Samplesheet bc_dist == {min_distance}") - if args.threads > MAX_PROCESSES: - log.warning( - f" Setting threads to {MAX_PROCESSES} as the maximum number of processes is {MAX_PROCESSES}" - ) - args.threads = MAX_PROCESSES - ## Sort the adaptors by type and size - - # Get a list of tuples with the adaptor name and ONT barcode - adaptor_tuples: list[tuple[str, str]] = [ - (entry.adaptor.name, entry.ont_barcode) for entry in ss - ] - - # Convert to set to enforce uniqueness - adaptor_set: set[tuple[str, str]] = set(adaptor_tuples) - - # Create a dictionary with the adaptors as keys and an empty list as value - adaptors_sorted: dict[tuple[str, str], list[tuple[str, Adaptor, str]]] = dict( - [(i, []) for i in adaptor_set] - ) - - # Populate the dictionary values with sample-specific information - """ - adaptors_sorted = { - adaptor_name_str, ont_barcode_str ) : [ - (sample_name_str, Adaptor, fastq_str), - (sample_name_str, Adaptor, fastq_str), - ... - ], - ... - } - """ - for entry in ss: - adaptors_sorted[(entry.adaptor.name, entry.ont_barcode)].append( - (entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq)) - ) - if os.path.exists(args.out_fastq): - raise FileExistsError( - f"Output folder '{args.out_fastq}' already exists. Please remove it or specify another --run_name" - ) - else: - os.mkdir(args.out_fastq) + # Get adaptor-barcode combinations from samplesheet + adaptor_barcode_sets = ss.get_adaptor_barcode_sets() + + # Iterate across samplesheet rows conforming to the adaptor-barcode combinations out_fastqs = [] - for key, sample in adaptors_sorted.items(): - adaptor_name, ont_barcode = key - fastq_path = sample[0][2] + for adaptor_name, ont_barcode in adaptor_barcode_sets: + subset_rows = ss.subset_rows(adaptor_name=adaptor_name, ont_barcode=ont_barcode) + + # Grab the fastq files for the current entries + assert all([subset_rows[0].fastq == row.fastq for row in subset_rows]) + fastq_path = subset_rows[0].fastq + fastq_files = glob.glob(fastq_path) + # If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name if ont_barcode: adaptor_bc_name = f"{adaptor_name}_{ont_barcode}" else: adaptor_bc_name = adaptor_name - fastq_files = glob.glob(fastq_path) # Align - align_path = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf") - adaptor_path = os.path.join(args.out_fastq, f"{adaptor_name}.fasta") - with open(adaptor_path, "w") as f: + alignment_path: str = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf") + adaptor_fasta_path: str = os.path.join(args.out_fastq, f"{adaptor_name}.fasta") + with open(adaptor_fasta_path, "w") as f: f.write(ss.get_fastastring(adaptor_name)) - for fq in fastq_files: - run_minimap2(fq, adaptor_path, align_path, args.threads) + for fq_file in fastq_files: + run_minimap2(fq_file, adaptor_fasta_path, alignment_path, args.threads) # Easy line count in input fastq files - num_fq = 0 - for fq in fastq_files: - with gzip.open(fq, "rb") as f: + num_fq_lines = 0 + for fq_file in fastq_files: + with gzip.open(fq_file, "rb") as f: for i in f: - num_fq += 1 - num_fq = int(num_fq / 4) - paf_entries = parse_paf_lines(align_path) + num_fq_lines += 1 + num_fq = int(num_fq_lines / 4) + reads_to_alns: dict[str, list[Alignment]] = map_reads_to_alns(alignment_path) # Make stats log.info(f" Searching for adaptor hits in {adaptor_bc_name}") - fragments, singletons, concats, unknowns = layout_matches( - adaptor_name + "_i5", adaptor_name + "_i7", paf_entries + fragments, singletons, concats, unknowns = categorize_matches( + i5_name=f"{adaptor_name}_i5", + i7_name=f"{adaptor_name}_i7", + reads_to_alns=reads_to_alns, ) stats = AlignmentStat(adaptor_bc_name) stats.compute_pafstats(num_fq, fragments, singletons, concats, unknowns) report.add_alignment_stat(stats) # Demux - no_matches = [] - matches = [] flipped_i7 = False flipped_i5 = False flips = { @@ -164,8 +154,8 @@ def run_demux(args): log.info( f" Force reverse complementing {args.force_rc} index for adaptor {adaptor_name}. Lenient mode is disabled" ) - no_matches, matches = cluster_matches( - adaptors_sorted[key], + unmatched_bed, matched_bed = cluster_matches( + subset_rows, fragments, args.max_distance, **flips[args.force_rc], @@ -182,7 +172,7 @@ def run_demux(args): spawn = pool.apply_async( cluster_matches, args=( - adaptors_sorted[key], + subset_rows, fragments, args.max_distance, rev["i7_reversed"], @@ -203,7 +193,7 @@ def run_demux(args): log.warning( " Lenient mode: Could not find any barcode reverse complements with unambiguously more matches. Using original index orientation for all adaptors. Please study the results carefully!" ) - no_matches, matches = flipped["original"] + unmatched_bed, matched_bed = flipped["original"] elif ( best_flip != "None" and len(flipped[best_flip][1]) @@ -212,20 +202,22 @@ def run_demux(args): log.info( f" Lenient mode: Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least {args.lenient_factor} times more matches" ) - no_matches, matches = flipped[best_flip] + unmatched_bed, matched_bed = flipped[best_flip] flipped_i7, flipped_i5 = flips[best_flip].values() else: log.info( f" Lenient mode: using original index orientation for {adaptor_name}" ) - no_matches, matches = flipped["original"] + unmatched_bed, matched_bed = flipped["original"] else: - no_matches, matches = cluster_matches( - adaptors_sorted[key], fragments, args.max_distance + unmatched_bed, matched_bed = cluster_matches( + subset_rows, fragments, args.max_distance ) out_pool = [] - for k, v in groupby(sorted(matches, key=lambda x: x[3]), key=lambda y: y[3]): + for k, v in groupby( + sorted(matched_bed, key=lambda x: x[3]), key=lambda y: y[3] + ): # To avoid collisions in fastq filenames, we add the ONT barcode to the sample name fq_prefix = k if ont_barcode: @@ -270,7 +262,7 @@ def run_demux(args): ) # Top unmatched indexes - nomatch_count = Counter([x[3] for x in no_matches]) + nomatch_count = Counter([x[3] for x in unmatched_bed]) if args.max_unknowns == 0: args.max_unknowns = len([sample for sample in ss]) + 10 diff --git a/anglerfish/cli.py b/anglerfish/cli.py index 9e87279..2f45438 100644 --- a/anglerfish/cli.py +++ b/anglerfish/cli.py @@ -2,9 +2,9 @@ import datetime as dt import os from enum import Enum +from importlib.metadata import version as get_version from typing import Optional -import pkg_resources import typer from typing_extensions import Annotated @@ -23,7 +23,7 @@ class IndexOrientations(str, Enum): def version_callback(value: bool): if value: - print(f'anglerfish {pkg_resources.get_distribution("bio-anglerfish").version}') + print(f'anglerfish {get_version("bio-anglerfish")}') raise typer.Exit() diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index 4f3ddda..abf5d31 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -10,7 +10,7 @@ from Bio.Seq import Seq from Bio.SeqIO.QualityIO import FastqGeneralIterator -from anglerfish.demux.adaptor import Adaptor +from anglerfish.demux.samplesheet import SampleSheetEntry log = logging.getLogger("anglerfish") @@ -40,8 +40,9 @@ def parse_cs( else: bases_spanning_index_mask = bases_spanning_mask # Return the index and the Levenshtein distance between it and the presumed index region of the read - return bases_spanning_index_mask, lev.distance( - index_seq.lower(), bases_spanning_index_mask + return ( + bases_spanning_index_mask, + lev.distance(index_seq.lower(), bases_spanning_index_mask), ) @@ -75,83 +76,103 @@ def run_minimap2( subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True) -def parse_paf_lines( - paf_path: str, min_qual: int = 1, complex_identifier: bool = False -) -> dict[str, list[dict]]: +class Alignment: + """ + Class instantiated from one paf alignment entry. + + Retains the important values for later use. """ - Read and parse one paf alignment lines. - Returns a dict with the import values for later use. - If complex_identifier is True (default False), the keys will be on the form - "{read}_{i5_or_i7}_{strand_str}". + def __init__(self, paf_line: str): + paf_cols = paf_line.split() + + # Unpack cols to vars for type annotation + self.read_name: str = paf_cols[0] + self.read_len: int = int(paf_cols[1]) # read length + self.read_start: int = int(paf_cols[2]) # start alignment on read + self.read_end: int = int(paf_cols[3]) # end alignment on read + self.read_strand: str = paf_cols[4] + self.adapter_name: str = paf_cols[5] + + self.q: int = int(paf_cols[11]) # Q score + + self.cg: str = paf_cols[-2] # cigar string + self.cs: str = paf_cols[-1] # cigar diff string + + +def map_reads_to_alns( + paf_path: str, min_qual: int = 1, complex_identifier: bool = False +) -> dict[str, list[Alignment]]: + """ + Parse a .paf file into a dict, mapping reads to their respective alignment objects. + + Outputs: + + reads_to_alns = { + "read1": [ + aln_read1_adaptor1_i5, + aln_read1_adaptor1_i7, + aln_read1_adaptor2_i5, + ... + ], + "read2": [ + aln_read1_adaptor1_i5, + aln_read1_adaptor1_i7, + aln_read1_adaptor2_i5, + ... + ], + ... + } + + complex_identifier = False (default) + --> The keys will be on the form "{read_name}". + + complex_identifier = True + --> The keys will be on the form "{read_name}_{i5_or_i7}_{strand_str}". """ - entries: dict = {} + reads_to_alns: dict = {} with open(paf_path) as paf: for paf_line in paf: - paf_cols = paf_line.split() try: - # TODO: objectify this - - # Unpack cols to vars for type annotation - read: str = paf_cols[0] - adapter: str = paf_cols[5] - rlen: int = int(paf_cols[1]) # read length - rstart: int = int(paf_cols[2]) # start alignment on read - rend: int = int(paf_cols[3]) # end alignment on read - strand: str = paf_cols[4] - cg: str = paf_cols[-2] # cigar string - cs: str = paf_cols[-1] # cigar diff string - q: int = int(paf_cols[11]) # Q score - iseq: str | None = None - sample: str | None = None + # Parse paf line into Alignment object + aln = Alignment(paf_line) # Determine identifier - if complex_identifier: - i5_or_i7 = adapter.split("_")[-1] - if strand == "+": - strand_str = "positive" - else: - strand_str = "negative" - key = f"{read}_{i5_or_i7}_{strand_str}" - else: - key = read + i5_or_i7 = aln.adapter_name.split("_")[-1] + strand_str = "positive" if aln.read_strand == "+" else "negative" + key = ( + f"{aln.read_name}_{i5_or_i7}_{strand_str}" + if complex_identifier + else aln.read_name + ) except IndexError: - log.debug(f"Could not find all paf columns: {read}") + log.debug(f"Could not find all paf columns: {aln.read_name}") continue - if q < min_qual: - log.debug(f"Low quality alignment: {read}") + if aln.q < min_qual: + log.debug(f"Low quality alignment: {aln.read_name}") continue - # Compile entry - entry = { - "read": read, - "adapter": adapter, - "rlen": rlen, - "rstart": rstart, - "rend": rend, - "strand": strand, - "cg": cg, - "cs": cs, - "q": q, - "iseq": iseq, - "sample": sample, - } - - if key in entries.keys(): - entries[key].append(entry) + if key in reads_to_alns.keys(): + reads_to_alns[key].append(aln) else: - entries[key] = [entry] + reads_to_alns[key] = [aln] - return entries + return reads_to_alns -def layout_matches( - i5_name: str, i7_name: str, paf_entries: dict[str, list[dict]] -) -> tuple[dict, dict, dict, dict]: +def categorize_matches( + i5_name: str, i7_name: str, reads_to_alns: dict[str, list[Alignment]] +) -> tuple[ + dict[str, list[Alignment]], + dict[str, list[Alignment]], + dict[str, list[Alignment]], + dict[str, list[Alignment]], +]: """ - Search the parsed paf alignments and layout possible Illumina library fragments + Search the parsed paf alignments and layout possible Illumina library fragments, + Returns dicts: - fragments. Reads with one I7 and one I5 - singletons. Reads with that only match either I5 or I7 adaptors @@ -163,119 +184,171 @@ def layout_matches( singletons = {} concats = {} unknowns = {} - for read, entry_list in paf_entries.items(): - sorted_entries = [] - for k in range(len(entry_list) - 1): - entry_i = entry_list[k] - entry_j = entry_list[k + 1] + for read, alns in reads_to_alns.items(): + sorted_alns = [] + for i in range(len(alns) - 1): + aln_i: Alignment = alns[i] + aln_j: Alignment = alns[i + 1] if ( - entry_i["adapter"] != entry_j["adapter"] - and (entry_i["adapter"] == i5_name and entry_j["adapter"] == i7_name) - or (entry_j["adapter"] == i5_name and entry_i["adapter"] == i7_name) + aln_i.adapter_name != aln_j.adapter_name + and (aln_i.adapter_name == i5_name and aln_j.adapter_name == i7_name) + or (aln_j.adapter_name == i5_name and aln_i.adapter_name == i7_name) ): - if entry_i in sorted_entries: - sorted_entries.append(entry_j) + if aln_i in sorted_alns: + sorted_alns.append(aln_j) else: - sorted_entries.extend([entry_i, entry_j]) - if len(entry_list) == 1: - singletons[read] = entry_list - elif len(sorted_entries) == 2: - fragments[read] = sorted(sorted_entries, key=lambda l: l["rstart"]) - elif len(sorted_entries) > 2: - concats[read] = sorted(sorted_entries, key=lambda l: l["rstart"]) + sorted_alns.extend([aln_i, aln_j]) + if len(alns) == 1: + singletons[read] = alns + elif len(sorted_alns) == 2: + fragments[read] = sorted(sorted_alns, key=lambda aln: aln.read_start) + elif len(sorted_alns) > 2: + concats[read] = sorted(sorted_alns, key=lambda aln: aln.read_start) log.debug( - f"Concatenated fragment: {read}, found: {[(i['adapter'],i['rstart']) for i in sorted_entries]}" + f"Concatenated fragment: {read}, found: {[(aln.adapter_name,aln.read_start) for aln in sorted_alns]}" ) else: - unknowns[read] = entry_list + unknowns[read] = alns log.debug( - f"Unknown fragment: {read}, found: {[(i['adapter'],i['rstart']) for i in entry_list]}" + f"Unknown fragment: {read}, found: {[(aln.adapter_name,aln.read_start) for aln in alns]}" ) # TODO: add minimum insert size return (fragments, singletons, concats, unknowns) def cluster_matches( - sample_adaptor: list[tuple[str, Adaptor, str]], - matches: dict, + entries: list[SampleSheetEntry], + matches: dict[str, list[Alignment]], max_distance: int, i7_reversed: bool = False, i5_reversed: bool = False, ) -> tuple[list, list]: - # Only illumina fragments - matched = {} + """Return the BED coordinates of unmatching and matching reads. + + Inputs: + - matches: dict of reads to their respective alignments + - entries: samplesheet entries for a given adaptor-barcode combination + - max_distance: maximum allowed Levenshtein distance between index read and index sequence + - i7_reversed: boolean indicating whether the i7 index is reversed + - i5_reversed: boolean indicating whether the i5 index is reversed + + Outputs: + ( + unmatched_bed: list of bed coordinates for unmatched reads + matched_bed: list of bed coordinates for matched reads + ) + """ + + # Instantiate lists of bed coordinates to fill matched_bed = [] unmatched_bed = [] + + # Iterate over each read and it's alignments for read, alignments in matches.items(): + # Determine which alignment is i5 and which is i7 if ( - alignments[0]["adapter"][-2:] == "i5" - and alignments[1]["adapter"][-2:] == "i7" + alignments[0].adapter_name[-2:] == "i5" + and alignments[1].adapter_name[-2:] == "i7" ): - i5 = alignments[0] - i7 = alignments[1] + i5_aln = alignments[0] + i7_aln = alignments[1] elif ( - alignments[1]["adapter"][-2:] == "i5" - and alignments[0]["adapter"][-2:] == "i7" + alignments[1].adapter_name[-2:] == "i5" + and alignments[0].adapter_name[-2:] == "i7" ): - i5 = alignments[1] - i7 = alignments[0] + i5_aln = alignments[1] + i7_aln = alignments[0] else: - log.debug(" {read} has no valid illumina fragment") + log.debug(f" {read} has no valid illumina fragment") continue - dists = [] - fi5 = "" - fi7 = "" - for _, adaptor, _ in sample_adaptor: - if adaptor.i5.index_seq is not None: - i5_seq = adaptor.i5.index_seq - if i5_reversed and i5_seq is not None: - i5_seq = str(Seq(i5_seq).reverse_complement()) - fi5, d1 = parse_cs( - i5["cs"], + entries_index_dists = [] + # Iterate over sample sheet entries and collect the distances between their index and the read's index + for entry in entries: + # Parse i5 index read + if entry.adaptor.i5.index_seq is not None: + if i5_reversed: + i5_seq = str(Seq(entry.adaptor.i5.index_seq).reverse_complement()) + else: + i5_seq = entry.adaptor.i5.index_seq + + i5_index_read, i5_index_dist = parse_cs( + i5_aln.cs, i5_seq, - adaptor.i5.len_umi_before_index, - adaptor.i5.len_umi_after_index, + entry.adaptor.i5.len_umi_before_index, + entry.adaptor.i5.len_umi_after_index, ) else: - d1 = 0 - - if adaptor.i7.index_seq is not None: - i7_seq = adaptor.i7.index_seq - if i7_reversed and i7_seq is not None: - i7_seq = str(Seq(i7_seq).reverse_complement()) - fi7, d2 = parse_cs( - i7["cs"], + i5_index_read = "" + i5_index_dist = 0 + + # Parse i7 index read + if entry.adaptor.i7.index_seq is not None: + if i7_reversed: + i7_seq = str(Seq(entry.adaptor.i7.index_seq).reverse_complement()) + else: + i7_seq = entry.adaptor.i7.index_seq + + i7_index_read, i7_index_dist = parse_cs( + i7_aln.cs, i7_seq, - adaptor.i7.len_umi_before_index, - adaptor.i7.len_umi_after_index, + entry.adaptor.i7.len_umi_before_index, + entry.adaptor.i7.len_umi_after_index, ) else: - d2 = 0 + i7_index_read = "" + i7_index_dist = 0 - dists.append(d1 + d2) + entries_index_dists.append(i5_index_dist + i7_index_dist) + + # Find the list idx of the minimum distance between the read's index and the indices of the samples in the sheet + entries_min_index_dist_loc = min( + range(len(entries_index_dists)), key=entries_index_dists.__getitem__ + ) + entry_min_index_dist = entries[entries_min_index_dist_loc] - index_min = min(range(len(dists)), key=dists.__getitem__) - # Test if two samples in the sheet is equidistant to the i5/i7 - if len([i for i, j in enumerate(dists) if j == dists[index_min]]) > 1: + # If several samples in the sheet are equidistant from the read, skip the read + if entries_index_dists.count(min(entries_index_dists)) > 1: continue - start_insert = min(i5["rend"], i7["rend"]) - end_insert = max(i7["rstart"], i5["rstart"]) + + # Get the coordinates of the read insert + start_insert = min(i5_aln.read_end, i7_aln.read_end) + end_insert = max(i7_aln.read_start, i5_aln.read_start) + + # If the read insert length is too short, skip the read if end_insert - start_insert < 10: continue - if dists[index_min] > max_distance: - # Find only full length i7(+i5) adaptor combos. Basically a list of "known unknowns" - if len(fi7) + len(fi5) == len(adaptor.i7.index_seq or "") + len( - adaptor.i5.index_seq or "" - ): - fi75 = "+".join([i for i in [fi7, fi5] if not i == ""]) + + # If the read is too far from the closest sample in the sheet + if entries_index_dists[entries_min_index_dist_loc] > max_distance: + # If the read has sensible lengths, add it to the unmatched beds + if len(i7_index_read) + len(i5_index_read) == len( + entry.adaptor.i7.index_seq or "" + ) + len(entry.adaptor.i5.index_seq or ""): + fi75 = "+".join( + [i for i in [i7_index_read, i5_index_read] if not i == ""] + ) unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."]) - continue - matched[read] = alignments - matched_bed.append( - [read, start_insert, end_insert, sample_adaptor[index_min][0], "999", "."] - ) - log.debug(f" Matched {len(matched)} reads, unmatched {len(unmatched_bed)} reads") + # Otherwise, skip the read + else: + continue + else: + # Add the read to the matched beds + matched_bed.append( + [ + read, + start_insert, + end_insert, + entry_min_index_dist.sample_name, + "999", + ".", + ] + ) + + log.debug( + f" Matched {len(matched_bed)} reads, unmatched {len(unmatched_bed)} reads" + ) + return unmatched_bed, matched_bed diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index 00c4766..1998f33 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -30,9 +30,8 @@ def __init__(self, input_csv: str, ont_barcodes_enabled: bool): fastq files are located in "barcode##" folders. """ - self.samplesheet = [] - try: - csvfile = open(input_csv) + self.rows = [] + with open(input_csv) as csvfile: csv_first_line: str = csvfile.readline() dialect = csv.Sniffer().sniff(csv_first_line, ",;\t") csvfile.seek(0) @@ -83,7 +82,7 @@ def __init__(self, input_csv: str, ont_barcodes_enabled: bool): row["fastq_path"], ont_barcode, ) - self.samplesheet.append(ss_entry) + self.rows.append(ss_entry) row_number += 1 # Explanation: Don't mess around with the globs too much. @@ -105,18 +104,13 @@ def __init__(self, input_csv: str, ont_barcodes_enabled: bool): + " please run anglerfish separately for each set." ) - except: - raise - finally: - csvfile.close() - def minimum_bc_distance(self) -> int: """Compute the minimum edit distance between all barcodes in samplesheet, or within each ONT barcode group. """ ont_bc_to_adaptors: dict = {} - for entry in self.samplesheet: + for entry in self.rows: if entry.ont_barcode in ont_bc_to_adaptors: ont_bc_to_adaptors[entry.ont_barcode].append(entry.adaptor) else: @@ -152,7 +146,7 @@ def minimum_bc_distance(self) -> int: def get_fastastring(self, adaptor_name: str | None = None) -> str: fastas = {} - for entry in self.samplesheet: + for entry in self.rows: if entry.adaptor.name == adaptor_name or adaptor_name is None: fastas[entry.adaptor.name + "_i7"] = entry.adaptor.i7.get_mask() fastas[entry.adaptor.name + "_i5"] = entry.adaptor.i5.get_mask() @@ -165,8 +159,35 @@ def get_fastastring(self, adaptor_name: str | None = None) -> str: return outstr + def get_adaptor_barcode_sets( + self, + ) -> list[tuple[str, str | None]]: + """Return a set of unique adaptor-barcode pairings in the samplesheet.""" + + # Get a set corresponding to the unique pairings of adaptors and ONT barcodes in the samplesheet + adaptor_barcode_sets: list[tuple[str, str | None]] = list( + set([(row.adaptor.name, row.ont_barcode) for row in self.rows]) + ) + + return adaptor_barcode_sets + + def subset_rows( + self, adaptor_name: str, ont_barcode: str | None + ) -> list[SampleSheetEntry]: + """Return a subset of samplesheet rows based on logical criteria.""" + + subset_rows = [] + + for row in self.rows: + if row.adaptor.name == adaptor_name and ont_barcode == row.ont_barcode: + subset_rows.append(row) + else: + continue + + return subset_rows + def __iter__(self): - return iter(self.samplesheet) + return iter(self.rows) def __next__(self): pass diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index 0579b61..e39396a 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -6,7 +6,7 @@ import pandas as pd from anglerfish.demux.adaptor import Adaptor, load_adaptors -from anglerfish.demux.demux import parse_paf_lines, run_minimap2 +from anglerfish.demux.demux import Alignment, map_reads_to_alns, run_minimap2 from anglerfish.explore.entropy import calculate_relative_entropy logging.basicConfig(level=logging.INFO) @@ -44,13 +44,13 @@ def run_explore( log.info(f"Run uuid {run_uuid}") adaptors = cast(list[Adaptor], load_adaptors()) - alignments: list[tuple[Adaptor, str]] = [] + adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] # Map all reads against all adaptors for adaptor in adaptors: # Align aln_path = os.path.join(outdir, f"{adaptor.name}.paf") - alignments.append((adaptor, aln_path)) + adaptors_and_aln_paths.append((adaptor, aln_path)) if os.path.exists(aln_path) and use_existing: log.info(f"Skipping {adaptor.name} as alignment already exists") continue @@ -70,21 +70,18 @@ def run_explore( # Parse alignments entries: dict = {} adaptors_included = [] - for adaptor, aln_path in alignments: + for adaptor, aln_path in adaptors_and_aln_paths: log.info(f"Parsing {adaptor.name}") - aln_dict_with_lists: dict[str, list[dict]] = parse_paf_lines( + reads_alns: dict[str, list[Alignment]] = map_reads_to_alns( aln_path, complex_identifier=True ) # Choose only the highest scoring alignment for each combination of read, adaptor end and strand - aln_dict = dict( - [ - (aln_name, max(aln_dicts, key=lambda x: x["q"])) - for aln_name, aln_dicts in aln_dict_with_lists.items() - ] - ) - - df = pd.DataFrame.from_dict(aln_dict, orient="index") + reads_to_highest_q_aln_dict: dict[str, dict] = {} + for aln_name, alns in reads_alns.items(): + highest_q_aln = max(alns, key=lambda aln: aln.q) + reads_to_highest_q_aln_dict[aln_name] = vars(highest_q_aln) + df = pd.DataFrame.from_dict(reads_to_highest_q_aln_dict, orient="index") nr_good_hits = {} # Match Insert Match = mim @@ -125,7 +122,7 @@ def run_explore( insert_thres_high = insert_thres_high requirements = ( - (df_mim["adapter"] == f"{adaptor.name}_{adaptor_end_name}") + (df_mim["adapter_name"] == f"{adaptor.name}_{adaptor_end_name}") & (df_mim["match_1_len"] >= (before_thres)) & (df_mim["insert_len"] >= insert_thres_low) & (df_mim["insert_len"] <= insert_thres_high) @@ -155,8 +152,8 @@ def run_explore( # Filter multiple hits per read and adaptor end df_good_hits = df_good_hits.sort_values( - by=["read", "match_1_len"], ascending=False - ).drop_duplicates(subset=["read", "adapter"], keep="first") + by=["read_name", "match_1_len"], ascending=False + ).drop_duplicates(subset=["read_name", "adapter_name"], keep="first") if adaptor.name not in entries.keys(): entries[adaptor.name] = {} diff --git a/requirements.txt b/requirements.txt index 8b0f07b..b4f7ef2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,10 @@ +bio biopython==1.83 -levenshtein==0.23.0 numpy==1.26.3 pandas==2.1.4 pytest==8.2.1 +python_levenshtein==0.23.0 pyyaml==6.0.1 -typer==0.9.0 \ No newline at end of file +setuptools +typer==0.9.0 +typing_extensions diff --git a/tests/test_anglerfish/test_demux/test_demux.py b/tests/test_anglerfish/test_demux/test_demux.py index ffeb254..ed8124b 100644 --- a/tests/test_anglerfish/test_demux/test_demux.py +++ b/tests/test_anglerfish/test_demux/test_demux.py @@ -88,6 +88,7 @@ def fixture(): def test_run_minimap2(fixture): + """Check that the function runs successfully, not the output.""" # Test alignment on single read to_test.run_minimap2( fastq_in=fixture["testdata_single"], @@ -97,10 +98,6 @@ def test_run_minimap2(fixture): minimap_b=1, ) - expected = "0ad8bdb6-e009-43c5-95b1-d381e699f983\t418\t302\t374\t+\ttruseq_i7\t67\t0\t67\t51\t64\t25\tNM:i:23\tms:i:275\tAS:i:266\tnn:i:10\ttp:A:P\tcm:i:5\ts1:i:29\ts2:i:0\tde:f:0.2388\trl:i:0\tcg:Z:11M2D33M7I21M\tcs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17\n0ad8bdb6-e009-43c5-95b1-d381e699f983\t418\t45\t110\t+\ttruseq_i5\t58\t0\t58\t56\t66\t38\tNM:i:10\tms:i:313\tAS:i:305\tnn:i:0\ttp:A:P\tcm:i:10\ts1:i:37\ts2:i:0\tde:f:0.0667\trl:i:0\tcg:Z:15M1D6M7I3M1I33M\tcs:Z::15-a*cg:5+tcccgat:3+g:33\n" - received = open(fixture["paf_single"]).read() - assert expected == received - # Create aligntment from multiple reads to_test.run_minimap2( fastq_in=fixture["testdata_multiple"], @@ -111,108 +108,44 @@ def test_run_minimap2(fixture): ) -def test_parse_paf_lines_simple(fixture): - paf_lines_simple = to_test.parse_paf_lines(fixture["paf_single"]) - expected_simple = { - "0ad8bdb6-e009-43c5-95b1-d381e699f983": [ - { - "read": "0ad8bdb6-e009-43c5-95b1-d381e699f983", - "adapter": "truseq_i7", - "rlen": 418, - "rstart": 302, - "rend": 374, - "strand": "+", - "cg": "cg:Z:11M2D33M7I21M", - "cs": "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17", - "q": 25, - "iseq": None, - "sample": None, - }, - { - "read": "0ad8bdb6-e009-43c5-95b1-d381e699f983", - "adapter": "truseq_i5", - "rlen": 418, - "rstart": 45, - "rend": 110, - "strand": "+", - "cg": "cg:Z:15M1D6M7I3M1I33M", - "cs": "cs:Z::15-a*cg:5+tcccgat:3+g:33", - "q": 38, - "iseq": None, - "sample": None, - }, - ] - } - assert paf_lines_simple == expected_simple - - -def test_parse_paf_lines_complex(fixture): - paf_lines_complex = to_test.parse_paf_lines(fixture["paf_single"]) - expected_complex = { - "0ad8bdb6-e009-43c5-95b1-d381e699f983": [ - { - "read": "0ad8bdb6-e009-43c5-95b1-d381e699f983", - "adapter": "truseq_i7", - "rlen": 418, - "rstart": 302, - "rend": 374, - "strand": "+", - "cg": "cg:Z:11M2D33M7I21M", - "cs": "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17", - "q": 25, - "iseq": None, - "sample": None, - }, - { - "read": "0ad8bdb6-e009-43c5-95b1-d381e699f983", - "adapter": "truseq_i5", - "rlen": 418, - "rstart": 45, - "rend": 110, - "strand": "+", - "cg": "cg:Z:15M1D6M7I3M1I33M", - "cs": "cs:Z::15-a*cg:5+tcccgat:3+g:33", - "q": 38, - "iseq": None, - "sample": None, - }, - ] - } - assert paf_lines_complex == expected_complex +def test_map_reads_to_alns(fixture): + reads_alns = to_test.map_reads_to_alns(fixture["paf_single"]) + + for read_name, alns in reads_alns.items(): + assert read_name == "0ad8bdb6-e009-43c5-95b1-d381e699f983" + for aln in alns: + if aln.adapter_name == "truseq_i7": + assert ( + aln.cs + == "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" + ) + else: + assert aln.cs == "cs:Z::15-a*cg:5+tcccgat:3+g:33" def test_parse_cs(fixture): - paf_lines_simple = to_test.parse_paf_lines(fixture["paf_single"]) - - for read_name, alignments in paf_lines_simple.items(): - for alignment in alignments: - cs_string = alignment["cs"] - cs_parsed = to_test.parse_cs( - cs_string=cs_string, - index_seq=fixture["index"], - umi_before=0, - umi_after=0, - ) + test_cs_str = ( + "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" + ) + expected_cs_parsed = ("taacttggtc", 0) - if alignment["adapter"] == "truseq_i7": - # Perfect match to index - assert cs_parsed[0] == fixture["index"].lower() - assert cs_parsed[1] == 0 - elif alignment["adapter"] == "truseq_i5": - # No index, distance the length of all concatenated substitutions of the cs string - assert cs_parsed[0] == "" - assert cs_parsed[1] == 10 - else: - raise AssertionError("Case not covered.") + cs_parsed = to_test.parse_cs( + cs_string=test_cs_str, + index_seq=fixture["index"], + umi_before=0, + umi_after=0, + ) + + assert cs_parsed == expected_cs_parsed -def test_layout_matches(fixture): +def test_categorize_matches(fixture): i5_name = "truseq_i5" i7_name = "truseq_i7" - paf_entries = to_test.parse_paf_lines(fixture["paf_multiple"]) + reads_alns = to_test.map_reads_to_alns(fixture["paf_multiple"]) - layout = to_test.layout_matches( - i5_name=i5_name, i7_name=i7_name, paf_entries=paf_entries + layout = to_test.categorize_matches( + i5_name=i5_name, i7_name=i7_name, reads_to_alns=reads_alns ) fragments, singletons, concats, unknowns = layout