diff --git a/.github/workflows/anglerfish.yml b/.github/workflows/anglerfish.yml
index 41643c4..0871cc5 100644
--- a/.github/workflows/anglerfish.yml
+++ b/.github/workflows/anglerfish.yml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.10.2]
+        python-version: [3.12]

     steps:
       # Checkout code and install miniconda + environment
@@ -37,10 +37,10 @@ jobs:
       - shell: bash -l {0}
         name: Run anglerfish with test data
         run: |
-          anglerfish run -s test/samples.csv
+          anglerfish run -s testdata/samples.csv

       # Run anglerfish explore
       - shell: bash -l {0}
         name: Run anglerfish explore
         run: |
-          anglerfish explore -f test/BC18_P14351_1001.fastq.gz -o test/explore_output
+          anglerfish explore -f testdata/BC18_P14351_1001.fastq.gz -o explore_output
diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml
index 572ff68..360bbe2 100644
--- a/.github/workflows/lint-code.yml
+++ b/.github/workflows/lint-code.yml
@@ -16,7 +16,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.12"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -34,7 +34,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.12"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -51,17 +51,17 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.10"
+          python-version: "3.12"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install mypy

       # Start by installing type stubs
       - name: mypy --> Install stubs
-        run: echo -e "y" | mypy --install-types **/*.py || exit 0
+        run: echo -e "y" | mypy --install-types . || exit 0

       - name: mypy --> Static type checking
         # Configured in pyprojet.toml
-        run: mypy **/*.py
+        run: mypy .

   # Use Prettier to check various file formats
   prettier:
diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml
index 346e673..d667841 100644
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -11,10 +11,10 @@ jobs:
       - uses: actions/checkout@v4
         name: Check out source-code repository

-      - name: Set up Python 3.10
+      - name: Set up Python 3.12
         uses: actions/setup-python@v4
         with:
-          python-version: 3.10.10
+          python-version: 3.12

       - name: Install python dependencies
         run: |
diff --git a/.github/workflows/test-code.yml b/.github/workflows/test-code.yml
new file mode 100644
index 0000000..a3fec89
--- /dev/null
+++ b/.github/workflows/test-code.yml
@@ -0,0 +1,44 @@
+name: test-code
+on: [push, pull_request]
+
+# Cancel if a newer run is started
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  run_pytest:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: [3.12]
+
+    steps:
+      # Checkout code and install miniconda + environment
+      - uses: actions/checkout@v4
+      - uses: mamba-org/setup-micromamba@v1
+        with:
+          init-shell: bash
+          create-args: >-
+            python=${{ matrix.python-version }}
+            pip
+          environment-file: environment.yml
+          environment-name: anglerfish-dev
+
+      # Install Anglerfish
+      - shell: bash -l {0}
+        name: Install Anglerfish
+        run: |
+          python -m pip install .
+
+      # Install Pytest
+      - shell: bash -l {0}
+        name: Install Pytest
+        run: |
+          python -m pip install pytest
+
+      # Run Pytest
+      - shell: bash -l {0}
+        name: Run Pytest
+        run: |
+          pytest .
diff --git a/.gitignore b/.gitignore
index 17aab89..5d3d0d4 100755
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,11 @@
+*.egg-info
 *.pyc
 *~
-*.egg-info
+.*_cache
 .DS_Store
 .benchmarks
-.*_cache
-node_modules
+.ignoredir
 .vscode
+__pycache__
 build
+node_modules
diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py
index 0ce1337..4dfa150 100755
--- a/anglerfish/anglerfish.py
+++ b/anglerfish/anglerfish.py
@@ -13,6 +13,7 @@
 import numpy as np
 import pkg_resources

+from .demux.adaptor import Adaptor
 from .demux.demux import (
     cluster_matches,
     layout_matches,
@@ -86,14 +87,16 @@ def run_demux(args):
     adaptor_set: set[tuple[str, str]] = set(adaptor_tuples)

     # Create a dictionary with the adaptors as keys and an empty list as value
-    adaptors_sorted: dict[tuple[str, str], list] = dict([(i, []) for i in adaptor_set])
+    adaptors_sorted: dict[tuple[str, str], list[tuple[str, Adaptor, str]]] = dict(
+        [(i, []) for i in adaptor_set]
+    )

     # Populate the dictionary values with sample-specific information
     """
     adaptors_sorted = {
-        ( adaptor_name, ont_barcode ) : [
-            (sample_name, adaptor, fastq),
-            (sample_name, adaptor, fastq),
+        ( adaptor_name_str, ont_barcode_str ) : [
+            (sample_name_str, Adaptor, fastq_str),
+            (sample_name_str, Adaptor, fastq_str),
             ...
         ],
         ...
@@ -168,7 +171,7 @@ def run_demux(args):
             **flips[args.force_rc],
         )
         flipped_i7, flipped_i5 = flips[args.force_rc].values()
-    elif args.lenient:  # Try reverse complementing the I5 and/or i7 indices and choose the best match
+    elif args.lenient:  # Try reverse complementing the i5 and/or i7 indices and choose the best match
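Note: the new annotation on `adaptors_sorted` encodes the structure spelled out in the docstring above. A minimal runnable sketch of that shape (the adaptor definition, sample names, and fastq paths here are hypothetical; real adaptor definitions live in config/adaptors.yaml):

```python
from anglerfish.demux.adaptor import Adaptor

# Hypothetical adaptor definition, using the <N> index token.
raw_adaptors = {"toy": {"i5": "AAA", "i7": "AAA<N>CCC"}}
toy = Adaptor("toy", raw_adaptors, i5_index=None, i7_index="TTT")

# (adaptor_name, ont_barcode) -> [(sample_name, Adaptor, fastq), ...]
adaptors_sorted: dict[tuple[str, str], list[tuple[str, Adaptor, str]]] = {
    ("toy", "barcode01"): [
        ("sample_a", toy, "sample_a.fastq.gz"),
        ("sample_b", toy, "sample_b.fastq.gz"),
    ],
}
```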
diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py
index 21b6a06..68e5c1f 100644
--- a/anglerfish/demux/adaptor.py
+++ b/anglerfish/demux/adaptor.py
@@ -63,10 +63,36 @@ class AdaptorPart:
     """This class is used for the i5 or i7 adaptor."""

     def __init__(self, sequence_token: str, name: str, index_seq: str | None):
+        ## Type declaration of attributes to be assigned upon instantiation
+        # Attributes from arguments
+        self.name: str
+        self.sequence_token: str
+        self.index_seq: str | None
+
+        # Index attributes
+        self.has_index: bool
+        self.len_index: int | None
+        self.len_before_index: int | None
+        self.len_after_index: int | None
+
+        # UMI attributes
+        self.has_umi: bool
+        self.len_umi: int | None
+        self.len_umi_before_index: int | None
+        self.len_umi_after_index: int | None
+
+        # Length attributes
+        self.len_total: int | None
+        self.len_constant: int
+
+        # Instantiation outsourced to private method
+        self._setup(sequence_token, name, index_seq)
+
+    def _setup(self, sequence_token: str, name: str, index_seq: str | None):
         # Assign attributes from args
-        self.sequence_token: str = sequence_token
-        self.name: str = name
-        self.index_seq: str | None = index_seq
+        self.sequence_token = sequence_token
+        self.name = name
+        self.index_seq = index_seq

         # Index bool and len
         if has_match(INDEX_TOKEN, self.sequence_token):
@@ -76,6 +102,10 @@ def __init__(self, sequence_token: str, name: str, index_seq: str | None):
             self.len_index = len(index_seq) if index_seq else None

         else:
+            if self.index_seq is not None:
+                raise UserWarning(
+                    "Index sequence specified, but no index token found in adaptor sequence."
+                )
             self.has_index = False
             self.len_index = 0

@@ -87,21 +117,13 @@ def __init__(self, sequence_token: str, name: str, index_seq: str | None):
             )
         elif len(umi_tokens) == 1:
             self.has_umi = True
-            self.len_umi = int(
-                re.search(UMI_LENGTH_TOKEN, self.sequence_token).group(1)
-            )
+            umi_token_search = re.search(UMI_LENGTH_TOKEN, self.sequence_token)
+            assert isinstance(umi_token_search, re.Match)
+            self.len_umi = int(umi_token_search.group(1))
         else:
             self.has_umi = False
             self.len_umi = 0

-        # Type declaration of attributes to be assigned
-        self.len_before_index: int | None
-        self.len_after_index: int | None
-        self.len_umi_before_index: int | None
-        self.len_umi_after_index: int | None
-        self.len_total: int | None
-        self.len_constant: int
-
         # Lengths
         if self.has_index and self.has_umi:
             # Index and UMI
@@ -149,7 +171,12 @@ def __init__(self, sequence_token: str, name: str, index_seq: str | None):
             self.len_before_index = None
             self.len_after_index = None

-        self.len_total = len(self.get_mask(insert_Ns=True)) if self.index_seq else None
+        if (
+            self.has_index is True and self.index_seq is not None
+        ) or self.has_index is False:
+            self.len_total = len(self.get_mask(insert_Ns=True))
+        else:
+            self.len_total = None
         self.len_constant = len(self.get_mask(insert_Ns=False))

     def get_mask(self, insert_Ns: bool = True) -> str:
@@ -165,11 +192,12 @@ def get_mask(self, insert_Ns: bool = True) -> str:
             else 0
         )

-        umi_mask_length = (
-            max(self.len_umi_after_index, self.len_umi_before_index)
-            if insert_Ns and self.has_umi
-            else 0
-        )
+        if insert_Ns and self.has_umi:
+            assert self.len_umi_before_index is not None
+            assert self.len_umi_after_index is not None
+            umi_mask_length = max(self.len_umi_after_index, self.len_umi_before_index)
+        else:
+            umi_mask_length = 0

         # Test if the index is specified in the adaptor sequence when it shouldn't be
         if (
@@ -189,7 +217,7 @@ def get_mask(self, insert_Ns: bool = True) -> str:
             return self.sequence_token


-def has_match(pattern: re.Pattern, query: str) -> bool:
+def has_match(pattern: re.Pattern | str, query: str) -> bool:
     """General function to check if a string contains a pattern."""
     match = re.search(pattern, query)
     if match is None:
@@ -209,6 +237,8 @@ def validate_adaptors(adaptors_dict: dict):
                 f"Adaptor {adaptor_name} has an invalid sequence for {i}: {sequence_token}. Does not conform to the pattern {VALID_SEQUENCE_TOKEN_PATTERN}."
             )

+    return True
+

 def load_adaptors(raw: bool = False) -> list[Adaptor] | dict:
     """Fetch all adaptors.
@@ -226,7 +256,7 @@ def load_adaptors(raw: bool = False) -> list[Adaptor] | dict:
         adaptors_dict = yaml.safe_load(f)

     # Validate input
-    validate_adaptors(adaptors_dict)
+    assert validate_adaptors(adaptors_dict) is True

     # Optionally, return raw dict
     if raw:
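Note: two of the behavioral changes in adaptor.py are easy to check in isolation. A quick sketch (assuming the package is installed; the `<N>` token syntax is the one exercised by the new tests further down):

```python
import re

from anglerfish.demux.adaptor import AdaptorPart, has_match

# The widened signature mirrors re.search(), which accepts both
# compiled patterns and plain strings.
assert has_match(re.compile("<N>"), "ATCG<N>ATC") is True
assert has_match("<N>", "ATCG<N>ATC") is True

# The new guard in _setup(): passing an index sequence for an adaptor
# whose sequence token contains no index token now fails fast.
try:
    AdaptorPart(sequence_token="ATCG", name="no_index_token", index_seq="AAA")
except UserWarning as warning:
    print(warning)  # "Index sequence specified, but no index token found..."
```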
diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py
index 48faef6..4f3ddda 100644
--- a/anglerfish/demux/demux.py
+++ b/anglerfish/demux/demux.py
@@ -4,49 +4,69 @@
 import os
 import re
 import subprocess
+from typing import cast

 import Levenshtein as lev
 from Bio.Seq import Seq
 from Bio.SeqIO.QualityIO import FastqGeneralIterator

+from anglerfish.demux.adaptor import Adaptor
+
 log = logging.getLogger("anglerfish")


-def parse_cs(cs_string, index, umi_before=0, umi_after=0):
+def parse_cs(
+    cs_string: str,
+    index_seq: str,
+    umi_before: int | None = 0,
+    umi_after: int | None = 0,
+) -> tuple[str, int]:
     """
-    Parses the CS string of a paf alignment and matches it to the given index using a max Levenshtein distance
+    Given a cs string, an index sequence, and optional UMI lengths:
+
+    - Parse the cs string to find the suspected index region of the read.
+    - Return a tuple of the given index sequence and its Levenshtein distance
+      to the parsed index region of the read.
     """
-    nt = re.compile(r"\*n([atcg])")
-    nts = "".join(re.findall(nt, cs_string))
+    # Create pattern for a substitution from "n" in the adaptor to a base in the read
+    n_subbed_pattern = re.compile(r"\*n([atcg])")
+    # Concatenate all n-to-base substitutions to yield the sequence of the read spanning the mask
+    bases_spanning_mask = "".join(re.findall(n_subbed_pattern, cs_string))

+    # Trim away any UMIs
     if umi_before is not None and umi_before > 0:
-        nts = nts[umi_before:]
-    if umi_after is not None and umi_after > 0:
-        nts = nts[:-umi_after]
-    # Allow for mismatches
-    return nts, lev.distance(index.lower(), nts)
+        bases_spanning_index_mask = bases_spanning_mask[umi_before:]
+    elif umi_after is not None and umi_after > 0:
+        bases_spanning_index_mask = bases_spanning_mask[:-umi_after]
+    else:
+        bases_spanning_index_mask = bases_spanning_mask
+
+    # Return the index and the Levenshtein distance between it and the presumed index region of the read
+    return bases_spanning_index_mask, lev.distance(
+        index_seq.lower(), bases_spanning_index_mask
+    )


-def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=1):
+def run_minimap2(
+    fastq_in: str,
+    index_file: str,
+    output_paf: str,
+    threads: int,
+    minimap_b: int = 1,
+):
     """
     Runs Minimap2
     """
-    cmd = [
+    cmd: list[str] = [
         "minimap2",
-        "--cs",
-        "-m8",
-        "-k",
-        "10",
-        "-w",
-        "5",
-        "-A",
-        "6",
-        "-B",
-        str(minimap_b),
-        "-c",
-        "-t",
-        str(threads),
-        indexfile,
-        fastq_in,
+        "--cs",  # Output the cs tag (short)
+        "-c",  # Output cigar string in .paf
+        *["-A", "6"],  # Matching score
+        *["-B", str(minimap_b)],  # Mismatch penalty
+        *["-k", "10"],  # k-mer size
+        *["-m", "8"],  # Minimal chaining score
+        *["-t", str(threads)],  # Number of threads
+        *["-w", "5"],  # Minimizer window size
+        index_file,  # Target
+        fastq_in,  # Query
     ]

     run_log = f"{output_paf}.log"
@@ -55,59 +75,81 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=1):
         subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True)


-def parse_paf_lines(paf, min_qual=1, complex_identifier=False):
+def parse_paf_lines(
+    paf_path: str, min_qual: int = 1, complex_identifier: bool = False
+) -> dict[str, list[dict]]:
     """
     Read and parse one paf alignment lines.
-    Returns a dict with the import values for later use
-    "{read}_{i5_or_i7}_{strand_str}"
+    Returns a dict with the important values for later use.
+
+    If complex_identifier is True (default False), the keys will be on the form
+    "{read}_{i5_or_i7}_{strand_str}".
     """
-    entries = {}
-    with open(paf) as paf:
+    entries: dict = {}
+    with open(paf_path) as paf:
         for paf_line in paf:
-            aln = paf_line.split()
+            paf_cols = paf_line.split()
             try:
                 # TODO: objectify this
-                entry = {
-                    "read": aln[0],
-                    "adapter": aln[5],
-                    "rlen": int(aln[1]),  # read length
-                    "rstart": int(aln[2]),  # start alignment on read
-                    "rend": int(aln[3]),  # end alignment on read
-                    "strand": aln[4],
-                    "cg": aln[-2],  # cigar string
-                    "cs": aln[-1],  # cs string
-                    "q": int(aln[11]),  # Q score
-                    "iseq": None,
-                    "sample": None,
-                }
-                read = entry["read"]
+
+                # Unpack cols to vars for type annotation
+                read: str = paf_cols[0]
+                adapter: str = paf_cols[5]
+                rlen: int = int(paf_cols[1])  # read length
+                rstart: int = int(paf_cols[2])  # start alignment on read
+                rend: int = int(paf_cols[3])  # end alignment on read
+                strand: str = paf_cols[4]
+                cg: str = paf_cols[-2]  # cigar string
+                cs: str = paf_cols[-1]  # cigar diff string
+                q: int = int(paf_cols[11])  # Q score
+                iseq: str | None = None
+                sample: str | None = None
+
+                # Determine identifier
                 if complex_identifier:
-                    i5_or_i7 = entry["adapter"].split("_")[-1]
-                    if entry["strand"] == "+":
+                    i5_or_i7 = adapter.split("_")[-1]
+                    if strand == "+":
                         strand_str = "positive"
                     else:
                         strand_str = "negative"
-                    ix = f"{read}_{i5_or_i7}_{strand_str}"
+                    key = f"{read}_{i5_or_i7}_{strand_str}"
                 else:
-                    ix = read
+                    key = read
+
             except IndexError:
                 log.debug(f"Could not find all paf columns: {read}")
                 continue

-            if entry["q"] < min_qual:
+            if q < min_qual:
                 log.debug(f"Low quality alignment: {read}")
                 continue

-            if ix in entries.keys():
-                entries[ix].append(entry)
+            # Compile entry
+            entry = {
+                "read": read,
+                "adapter": adapter,
+                "rlen": rlen,
+                "rstart": rstart,
+                "rend": rend,
+                "strand": strand,
+                "cg": cg,
+                "cs": cs,
+                "q": q,
+                "iseq": iseq,
+                "sample": sample,
+            }
+
+            if key in entries.keys():
+                entries[key].append(entry)
             else:
-                entries[ix] = [entry]
+                entries[key] = [entry]

     return entries


-def layout_matches(i5_name, i7_name, paf_entries):
+def layout_matches(
+    i5_name: str, i7_name: str, paf_entries: dict[str, list[dict]]
+) -> tuple[dict, dict, dict, dict]:
     """
     Search the parsed paf alignments and layout possible Illumina library fragments
     Returns dicts:
@@ -154,15 +196,17 @@ def layout_matches(i5_name, i7_name, paf_entries):


 def cluster_matches(
-    sample_adaptor, matches, max_distance, i7_reversed=False, i5_reversed=False
-):
+    sample_adaptor: list[tuple[str, Adaptor, str]],
+    matches: dict,
+    max_distance: int,
+    i7_reversed: bool = False,
+    i5_reversed: bool = False,
+) -> tuple[list, list]:
     # Only illumina fragments
     matched = {}
     matched_bed = []
     unmatched_bed = []
     for read, alignments in matches.items():
-        i5 = False
-        i7 = False
         if (
             alignments[0]["adapter"][-2:] == "i5"
             and alignments[1]["adapter"][-2:] == "i7"
@@ -183,7 +227,7 @@ def cluster_matches(
             fi5 = ""
             fi7 = ""
             for _, adaptor, _ in sample_adaptor:
-                try:
+                if adaptor.i5.index_seq is not None:
                     i5_seq = adaptor.i5.index_seq
                     if i5_reversed and i5_seq is not None:
                         i5_seq = str(Seq(i5_seq).reverse_complement())
@@ -193,18 +237,22 @@ def cluster_matches(
                         adaptor.i5.len_umi_before_index,
                         adaptor.i5.len_umi_after_index,
                     )
-                except AttributeError:
-                    d1 = 0  # presumably it's single index, so no i5
-
-                i7_seq = adaptor.i7.index_seq
-                if i7_reversed and i7_seq is not None:
-                    i7_seq = str(Seq(i7_seq).reverse_complement())
-                fi7, d2 = parse_cs(
-                    i7["cs"],
-                    i7_seq,
-                    adaptor.i7.len_umi_before_index,
-                    adaptor.i7.len_umi_after_index,
-                )
+                else:
+                    d1 = 0
+
+                if adaptor.i7.index_seq is not None:
+                    i7_seq = adaptor.i7.index_seq
+                    if i7_reversed and i7_seq is not None:
+                        i7_seq = str(Seq(i7_seq).reverse_complement())
+                    fi7, d2 = parse_cs(
+                        i7["cs"],
+                        i7_seq,
+                        adaptor.i7.len_umi_before_index,
+                        adaptor.i7.len_umi_after_index,
+                    )
+                else:
+                    d2 = 0
+
                 dists.append(d1 + d2)

             index_min = min(range(len(dists)), key=dists.__getitem__)
@@ -231,7 +279,9 @@ def cluster_matches(
     return unmatched_bed, matched_bed


-def write_demuxedfastq(beds, fastq_in, fastq_out):
+def write_demuxedfastq(
+    beds: dict[str, list], fastq_in: os.PathLike, fastq_out: os.PathLike
+) -> int:
     """
     Intended for multiprocessing
     Take a set of coordinates in bed format [[seq1, start, end, ..][seq2, ..]]
@@ -239,12 +289,15 @@ def write_demuxedfastq(beds, fastq_in, fastq_out):

     Return: PID of the process
     """
+
     gz_buf = 131072
-    fq_files = glob.glob(fastq_in)
+    fq_files = cast(list[str], glob.glob(fastq_in))
+    assert len(fq_files) > 0, f"No fastq files found looking for {fastq_in}."
     for fq in fq_files:
         with subprocess.Popen(
             ["gzip", "-c", "-d", fq], stdout=subprocess.PIPE, bufsize=gz_buf
         ) as fzi:
+            assert isinstance(fzi, subprocess.Popen)
             fi = io.TextIOWrapper(fzi.stdout, write_through=True)
             with open(fastq_out, "ab") as ofile:
                 with subprocess.Popen(
@@ -254,6 +307,7 @@ def write_demuxedfastq(beds, fastq_in, fastq_out):
                     bufsize=gz_buf,
                     close_fds=False,
                 ) as oz:
+                    assert isinstance(oz, subprocess.Popen)
                     for title, seq, qual in FastqGeneralIterator(fi):
                         new_title = title.split()
                         if new_title[0] not in beds.keys():
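Note: the renamed parse_cs() is pure string manipulation, so its contract is easy to show on a fabricated cs tag. The `*n<base>` operations are substitutions against the N-mask of the adaptor, and their concatenation is compared to the expected index:

```python
from anglerfish.demux.demux import parse_cs

# Fabricated cs tag: 33 matched bases, four bases read across the
# N-mask (t, a, a, c), then 24 matched bases.
cs_string = "cs:Z::33*nt*na*na*nc:24"

bases, distance = parse_cs(
    cs_string=cs_string, index_seq="TAAC", umi_before=0, umi_after=0
)
assert bases == "taac"
assert distance == 0  # perfect match to the expected index
```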
""" - ont_bc_to_adaptors = {} + ont_bc_to_adaptors: dict = {} for entry in self.samplesheet: if entry.ont_barcode in ont_bc_to_adaptors: ont_bc_to_adaptors[entry.ont_barcode].append(entry.adaptor) else: ont_bc_to_adaptors[entry.ont_barcode] = [entry.adaptor] - testset = {} + testset: dict = {} for ont_barcode, adaptors in ont_bc_to_adaptors.items(): testset[ont_barcode] = [] for adaptor in adaptors: @@ -148,7 +150,7 @@ def minimum_bc_distance(self) -> int: min_distances_all_barcodes.append(min(distances_within_barcode)) return min(min_distances_all_barcodes) - def get_fastastring(self, adaptor_name: str = None) -> str: + def get_fastastring(self, adaptor_name: str | None = None) -> str: fastas = {} for entry in self.samplesheet: if entry.adaptor.name == adaptor_name or adaptor_name is None: diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index 25e9c1a..0579b61 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -1,10 +1,11 @@ import logging import os import uuid +from typing import cast import pandas as pd -from anglerfish.demux.adaptor import load_adaptors +from anglerfish.demux.adaptor import Adaptor, load_adaptors from anglerfish.demux.demux import parse_paf_lines, run_minimap2 from anglerfish.explore.entropy import calculate_relative_entropy @@ -13,17 +14,17 @@ def run_explore( - fastq, - outdir, - threads, - use_existing, - good_hit_threshold, - insert_thres_low, - insert_thres_high, - minimap_b, - min_hits_per_adaptor, - umi_threshold, - kmer_length, + fastq: str, + outdir: str, + threads: int, + use_existing: bool, + good_hit_threshold: float, + insert_thres_low: int, + insert_thres_high: int, + minimap_b: int, + min_hits_per_adaptor: int, + umi_threshold: float, + kmer_length: int, ): # Setup a run directory run_uuid = str(uuid.uuid4()) @@ -42,36 +43,45 @@ def run_explore( log.info("Running anglerfish explore") log.info(f"Run uuid {run_uuid}") - adaptors = load_adaptors() - alignments = [] + adaptors = cast(list[Adaptor], load_adaptors()) + alignments: list[tuple[Adaptor, str]] = [] # Map all reads against all adaptors for adaptor in adaptors: - adaptor_name = adaptor.name - # Align - aln_path = os.path.join(outdir, f"{adaptor_name}.paf") + aln_path = os.path.join(outdir, f"{adaptor.name}.paf") alignments.append((adaptor, aln_path)) if os.path.exists(aln_path) and use_existing: - log.info(f"Skipping {adaptor_name} as alignment already exists") + log.info(f"Skipping {adaptor.name} as alignment already exists") continue - adaptor_path = os.path.join(outdir, f"{adaptor_name}.fasta") + adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") with open(adaptor_path, "w") as f: f.write(adaptor.get_fastastring(insert_Ns=False)) - log.info(f"Aligning {adaptor_name}") - run_minimap2(fastq, adaptor_path, aln_path, threads, minimap_b) + log.info(f"Aligning {adaptor.name}") + run_minimap2( + fastq_in=fastq, + index_file=adaptor_path, + output_paf=aln_path, + threads=threads, + minimap_b=minimap_b, + ) # Parse alignments - entries = {} + entries: dict = {} adaptors_included = [] for adaptor, aln_path in alignments: log.info(f"Parsing {adaptor.name}") - aln_dict_with_lists = parse_paf_lines(aln_path, complex_identifier=True) + aln_dict_with_lists: dict[str, list[dict]] = parse_paf_lines( + aln_path, complex_identifier=True + ) # Choose only the highest scoring alignment for each combination of read, adaptor end and strand aln_dict = dict( - [(k, max(v, key=lambda x: x["q"])) for k, v in aln_dict_with_lists.items()] + [ + (aln_name, 
diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py
index 25e9c1a..0579b61 100644
--- a/anglerfish/explore/explore.py
+++ b/anglerfish/explore/explore.py
@@ -1,10 +1,11 @@
 import logging
 import os
 import uuid
+from typing import cast

 import pandas as pd

-from anglerfish.demux.adaptor import load_adaptors
+from anglerfish.demux.adaptor import Adaptor, load_adaptors
 from anglerfish.demux.demux import parse_paf_lines, run_minimap2
 from anglerfish.explore.entropy import calculate_relative_entropy

@@ -13,17 +14,17 @@


 def run_explore(
-    fastq,
-    outdir,
-    threads,
-    use_existing,
-    good_hit_threshold,
-    insert_thres_low,
-    insert_thres_high,
-    minimap_b,
-    min_hits_per_adaptor,
-    umi_threshold,
-    kmer_length,
+    fastq: str,
+    outdir: str,
+    threads: int,
+    use_existing: bool,
+    good_hit_threshold: float,
+    insert_thres_low: int,
+    insert_thres_high: int,
+    minimap_b: int,
+    min_hits_per_adaptor: int,
+    umi_threshold: float,
+    kmer_length: int,
 ):
     # Setup a run directory
     run_uuid = str(uuid.uuid4())
@@ -42,36 +43,45 @@ def run_explore(
     log.info("Running anglerfish explore")
     log.info(f"Run uuid {run_uuid}")

-    adaptors = load_adaptors()
-    alignments = []
+    adaptors = cast(list[Adaptor], load_adaptors())
+    alignments: list[tuple[Adaptor, str]] = []

     # Map all reads against all adaptors
     for adaptor in adaptors:
-        adaptor_name = adaptor.name
-        # Align
-        aln_path = os.path.join(outdir, f"{adaptor_name}.paf")
+        aln_path = os.path.join(outdir, f"{adaptor.name}.paf")
         alignments.append((adaptor, aln_path))
         if os.path.exists(aln_path) and use_existing:
-            log.info(f"Skipping {adaptor_name} as alignment already exists")
+            log.info(f"Skipping {adaptor.name} as alignment already exists")
             continue
-        adaptor_path = os.path.join(outdir, f"{adaptor_name}.fasta")
+        adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta")
         with open(adaptor_path, "w") as f:
             f.write(adaptor.get_fastastring(insert_Ns=False))
-        log.info(f"Aligning {adaptor_name}")
-        run_minimap2(fastq, adaptor_path, aln_path, threads, minimap_b)
+        log.info(f"Aligning {adaptor.name}")
+        run_minimap2(
+            fastq_in=fastq,
+            index_file=adaptor_path,
+            output_paf=aln_path,
+            threads=threads,
+            minimap_b=minimap_b,
+        )

     # Parse alignments
-    entries = {}
+    entries: dict = {}
     adaptors_included = []
     for adaptor, aln_path in alignments:
         log.info(f"Parsing {adaptor.name}")
-        aln_dict_with_lists = parse_paf_lines(aln_path, complex_identifier=True)
+        aln_dict_with_lists: dict[str, list[dict]] = parse_paf_lines(
+            aln_path, complex_identifier=True
+        )

         # Choose only the highest scoring alignment for each combination of read, adaptor end and strand
         aln_dict = dict(
-            [(k, max(v, key=lambda x: x["q"])) for k, v in aln_dict_with_lists.items()]
+            [
+                (aln_name, max(aln_dicts, key=lambda x: x["q"]))
+                for aln_name, aln_dicts in aln_dict_with_lists.items()
+            ]
         )

         df = pd.DataFrame.from_dict(aln_dict, orient="index")
@@ -105,6 +115,9 @@ def run_explore(
             ["i5", "i7"], [adaptor.i5, adaptor.i7]
         ):
             if adaptor_end.has_index:
+                assert adaptor_end.len_before_index is not None
+                assert adaptor_end.len_after_index is not None
+
                 # Alignment thresholds
                 before_thres = round(adaptor_end.len_before_index * good_hit_threshold)
                 after_thres = round(adaptor_end.len_after_index * good_hit_threshold)
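Note: the expanded comprehension in explore.py keeps, for each read/adaptor-end/strand key, only the alignment with the highest mapping quality. On toy entries (only the "q" field matters here):

```python
aln_dict_with_lists: dict[str, list[dict]] = {
    "read1_i7_positive": [{"q": 10}, {"q": 60}, {"q": 25}],
    "read1_i5_positive": [{"q": 40}],
}

aln_dict = dict(
    [
        (aln_name, max(aln_dicts, key=lambda x: x["q"]))
        for aln_name, aln_dicts in aln_dict_with_lists.items()
    ]
)
assert aln_dict["read1_i7_positive"] == {"q": 60}
```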
diff --git a/requirements.txt b/requirements.txt
index 3262cac..8b0f07b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,6 @@ biopython==1.83
 levenshtein==0.23.0
 numpy==1.26.3
 pandas==2.1.4
+pytest==8.2.1
 pyyaml==6.0.1
 typer==0.9.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 8c43b7a..06d9aed 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,7 @@
     author_email="remi-andre.olsen@scilifelab.se",
     url="https://github.com/remiolsen/anglerfish",
     license="MIT",
-    python_requires=">=3.10",
+    python_requires=">=3.12",
     packages=find_packages(),
     package_data={"": ["config/adaptors.yaml"]},
     # Read requirements.txt
diff --git a/test/samples.csv b/test/samples.csv
deleted file mode 100755
index dcdca6e..0000000
--- a/test/samples.csv
+++ /dev/null
@@ -1 +0,0 @@
-P14351_1001,truseq,TAACTTGGTC,test/BC18_P14351_1001.fastq.gz
diff --git a/test/BC18_P14351_1001.fastq.gz b/testdata/BC18_P14351_1001.fastq.gz
similarity index 100%
rename from test/BC18_P14351_1001.fastq.gz
rename to testdata/BC18_P14351_1001.fastq.gz
diff --git a/testdata/explore/explore_testdata.fastq b/testdata/explore/explore_testdata.fastq
new file mode 100644
index 0000000..4794c8d
--- /dev/null
+++ b/testdata/explore/explore_testdata.fastq
@@ -0,0 +1,4 @@
+@truseq-read_10_C-index_5G runid=test sampleid=Pools read=1 ch=1 start_time=2024-05-31T13:06:00Z
+AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCCCCCCCCCCGATCGGAAGAGCACACGTCTGAACTCCAGTCACGGGGGATCTCGTATGCCGTCTTCTGCTTG
++
+%%$$&(-6.8((%&$%$$%((%'$.(%%#$$*,-,#&(+%',+'$$+&&+'*'$%$#$$&**)(%%$)$#%&%$$%%$%$%%)%0$"%%&('&*#$',**''&%%$#&(&'($&))%##&096*5019=.
diff --git a/testdata/explore/illumina_ud.fasta b/testdata/explore/illumina_ud.fasta
new file mode 100644
index 0000000..8b221b4
--- /dev/null
+++ b/testdata/explore/illumina_ud.fasta
@@ -0,0 +1,4 @@
+>truseq_i5
+AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
+>truseq_i7
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCTCGTATGCCGTCTTCTGCTTG
\ No newline at end of file
diff --git a/testdata/explore/paf_alignment_dissected.txt b/testdata/explore/paf_alignment_dissected.txt
new file mode 100644
index 0000000..c87b471
--- /dev/null
+++ b/testdata/explore/paf_alignment_dissected.txt
@@ -0,0 +1,3 @@
+qname qlen qstart qend s tname tlen tstart tend nmatch n_aln_block qmap edit_distance_to_ref system_score aln_score n_ambiguous_bases aln_type n_minimizers_in_aln ? ? per_base_dvrgnc len_cnsctv_hits cigar_string cigar_diff
+truseq_i5 58 0 58 + truseq-read_10_C-index_5G 130 0 58 58 58 60 NM:i:0 ms:i:348 AS:i:348 nn:i:0 tp:A:P cm:i:16 s1:i:54 s2:i:0 de:f:0 rl:i:0 cg:Z:58M cs:Z::58
+truseq_i7 57 0 57 + truseq-read_10_C-index_5G 130 68 130 57 62 60 NM:i:5 ms:i:333 AS:i:328 nn:i:0 tp:A:P cm:i:15 s1:i:54 s2:i:0 de:f:0.0172 rl:i:0 cg:Z:33M5D24M cs:Z::33-ggggg:24
diff --git a/testdata/samples.csv b/testdata/samples.csv
new file mode 100755
index 0000000..82fc084
--- /dev/null
+++ b/testdata/samples.csv
@@ -0,0 +1 @@
+P14351_1001,truseq,TAACTTGGTC,testdata/BC18_P14351_1001.fastq.gz
+ """ + + def test_should_fail(self): + """Specifying an index on an adaptor without an index should raise a UserWarning.""" + try: + to_test.AdaptorPart( + sequence_token="ATCG", name="should_fail", index_seq="AAA" + ) + except UserWarning as e: + assert e + else: + raise AssertionError("UserWarning not raised") + + def test_simple(self): + adaptor_part = to_test.AdaptorPart( + sequence_token="ATCG", name="simple", index_seq=None + ) + assert adaptor_part.sequence_token == "ATCG" + assert adaptor_part.name == "simple" + assert adaptor_part.index_seq is None + assert adaptor_part.has_index is False + assert adaptor_part.len_index == 0 + assert adaptor_part.has_umi is False + assert adaptor_part.len_umi == 0 + assert adaptor_part.len_before_index is None + assert adaptor_part.len_after_index is None + assert adaptor_part.len_umi_before_index is None + assert adaptor_part.len_umi_after_index is None + assert adaptor_part.len_total == 4 + assert adaptor_part.len_constant == 4 + assert adaptor_part.get_mask(insert_Ns=True) == "ATCG" + assert adaptor_part.get_mask(insert_Ns=False) == "ATCG" + + def test_unknown_index(self): + adaptor_part = to_test.AdaptorPart( + sequence_token="ATCGATC", name="unknown_index", index_seq=None + ) + assert adaptor_part.sequence_token == "ATCGATC" + assert adaptor_part.name == "unknown_index" + assert adaptor_part.index_seq is None + assert adaptor_part.has_index is True + assert adaptor_part.len_index is None + assert adaptor_part.has_umi is False + assert adaptor_part.len_umi == 0 + assert adaptor_part.len_before_index == 4 + assert adaptor_part.len_after_index == 3 + assert adaptor_part.len_umi_before_index == 0 + assert adaptor_part.len_umi_after_index == 0 + assert adaptor_part.len_total is None + assert adaptor_part.len_constant == 7 + try: + adaptor_part.get_mask(insert_Ns=True) + except UserWarning as e: + assert e + else: + raise AssertionError("UserWarning not raised") + assert adaptor_part.get_mask(insert_Ns=False) == "ATCGATC" + + def test_known_index(self): + adaptor_part = to_test.AdaptorPart( + sequence_token="ATCGATC", name="known_index", index_seq="GGG" + ) + assert adaptor_part.sequence_token == "ATCGATC" + assert adaptor_part.name == "known_index" + assert adaptor_part.index_seq == "GGG" + assert adaptor_part.has_index is True + assert adaptor_part.len_index == 3 + assert adaptor_part.has_umi is False + assert adaptor_part.len_umi == 0 + assert adaptor_part.len_before_index == 4 + assert adaptor_part.len_after_index == 3 + assert adaptor_part.len_umi_before_index == 0 + assert adaptor_part.len_umi_after_index == 0 + assert adaptor_part.len_total == 10 + assert adaptor_part.len_constant == 7 + assert adaptor_part.get_mask(insert_Ns=True) == "ATCGNNNATC" + assert adaptor_part.get_mask(insert_Ns=False) == "ATCGATC" + + def test_unknown_index_umi(self): + adaptor_part = to_test.AdaptorPart( + sequence_token="ATCGATC", name="unknown_index_umi", index_seq=None + ) + assert adaptor_part.sequence_token == "ATCGATC" + assert adaptor_part.name == "unknown_index_umi" + assert adaptor_part.index_seq is None + assert adaptor_part.has_index is True + assert adaptor_part.len_index is None + assert adaptor_part.has_umi is True + assert adaptor_part.len_umi == 5 + assert adaptor_part.len_before_index == 4 + assert adaptor_part.len_after_index == 8 + assert adaptor_part.len_umi_before_index == 0 + assert adaptor_part.len_umi_after_index == 5 + assert adaptor_part.len_total is None + assert adaptor_part.len_constant == 7 + try: + 
adaptor_part.get_mask(insert_Ns=True) + except UserWarning as e: + assert e + else: + raise AssertionError("UserWarning not raised") + assert adaptor_part.get_mask(insert_Ns=False) == "ATCGATC" + + def test_known_index_umi(self): + adaptor_part = to_test.AdaptorPart( + sequence_token="ATCGATC", name="known_index_umi", index_seq="GGG" + ) + assert adaptor_part.sequence_token == "ATCGATC" + assert adaptor_part.name == "known_index_umi" + assert adaptor_part.index_seq == "GGG" + assert adaptor_part.has_index is True + assert adaptor_part.len_index == 3 + assert adaptor_part.has_umi is True + assert adaptor_part.len_umi == 5 + assert adaptor_part.len_before_index == 4 + assert adaptor_part.len_after_index == 8 + assert adaptor_part.len_umi_before_index == 0 + assert adaptor_part.len_umi_after_index == 5 + assert adaptor_part.len_total == 15 + assert adaptor_part.len_constant == 7 + assert adaptor_part.get_mask(insert_Ns=True) == "ATCGNNNNNNNNATC" + assert adaptor_part.get_mask(insert_Ns=False) == "ATCGATC" + + def test_umi_known_index(self): + adaptor_part = to_test.AdaptorPart( + sequence_token="ATCGATC", name="umi_known_index", index_seq="GGG" + ) + assert adaptor_part.sequence_token == "ATCGATC" + assert adaptor_part.name == "umi_known_index" + assert adaptor_part.index_seq == "GGG" + assert adaptor_part.has_index is True + assert adaptor_part.len_index == 3 + assert adaptor_part.has_umi is True + assert adaptor_part.len_umi == 5 + assert adaptor_part.len_before_index == 9 + assert adaptor_part.len_after_index == 3 + assert adaptor_part.len_umi_before_index == 5 + assert adaptor_part.len_umi_after_index == 0 + assert adaptor_part.len_total == 15 + assert adaptor_part.len_constant == 7 + assert adaptor_part.get_mask(insert_Ns=True) == "ATCGNNNNNNNNATC" + assert adaptor_part.get_mask(insert_Ns=False) == "ATCGATC" + + +class TestAdaptor: + def test_adaptor(self): + adaptors = { + "simple_and_index_umi": { + "i5": "AAA", + "i7": "AAACCC", + } + } + + adaptor = to_test.Adaptor( + "simple_and_index_umi", adaptors, i5_index=None, i7_index="TTT" + ) + assert adaptor.name == "simple_and_index_umi" + assert isinstance(adaptor.i5, to_test.AdaptorPart) + assert isinstance(adaptor.i7, to_test.AdaptorPart) + assert ( + adaptor.get_fastastring(insert_Ns=True) + == "\n".join( + [ + ">simple_and_index_umi_i5", + "AAA", + ">simple_and_index_umi_i7", + "AAANNNNNNNCCC", + ] + ) + + "\n" + ) + assert ( + adaptor.get_fastastring(insert_Ns=False) + == "\n".join( + [ + ">simple_and_index_umi_i5", + "AAA", + ">simple_and_index_umi_i7", + "AAACCC", + ] + ) + + "\n" + ) diff --git a/tests/test_anglerfish/test_demux/test_demux.py b/tests/test_anglerfish/test_demux/test_demux.py new file mode 100644 index 0000000..ffeb254 --- /dev/null +++ b/tests/test_anglerfish/test_demux/test_demux.py @@ -0,0 +1,224 @@ +import gzip +import tempfile +from pathlib import Path + +import pytest + +from anglerfish.demux import demux as to_test + + +@pytest.fixture(scope="module") +def fixture(): + f"""Fixture for all tests within {__file__}. + + Creates a temporary directory with... + - truseq.fasta: A fasta file with the truseq i7 and i5 adaptors. + - testdata_single.fastq.gz: A single read subset from the testdata with a perfect index match (TAACTTGGTC) to the truseq i7 adaptor. + - testdata_multiple.fastq.gz: Fabricated test data intended to test the categorization of alignments. + + Expected files to be created... + - test_single.paf: The output of minimap2 run on testdata_single.fastq.gz and truseq.fasta. 
diff --git a/tests/test_anglerfish/test_demux/test_demux.py b/tests/test_anglerfish/test_demux/test_demux.py
new file mode 100644
index 0000000..ffeb254
--- /dev/null
+++ b/tests/test_anglerfish/test_demux/test_demux.py
@@ -0,0 +1,224 @@
+import gzip
+import tempfile
+from pathlib import Path
+
+import pytest
+
+from anglerfish.demux import demux as to_test
+
+
+@pytest.fixture(scope="module")
+def fixture():
+    f"""Fixture for all tests within {__file__}.
+
+    Creates a temporary directory with...
+    - truseq.fasta: A fasta file with the truseq i7 and i5 adaptors.
+    - testdata_single.fastq.gz: A single read subset from the testdata with a perfect index match (TAACTTGGTC) to the truseq i7 adaptor.
+    - testdata_multiple.fastq.gz: Fabricated test data intended to test the categorization of alignments.
+
+    Expected files to be created...
+    - test_single.paf: The output of minimap2 run on testdata_single.fastq.gz and truseq.fasta.
+    - test_multiple.paf: The output of minimap2 run on testdata_multiple.fastq.gz and truseq.fasta.
+    """
+    tmp = tempfile.TemporaryDirectory()
+    tmp_path = Path(tmp.name)
+
+    # Create testdata of single read
+    with gzip.open(tmp_path / "testdata_single.fastq.gz", "wb") as f:
+        contents = "\n".join(
+            [
+                "@0ad8bdb6-e009-43c5-95b1-d381e699f983 runid=7d98ab9ee7f513f92fd5e382dc3dd563b956e4fa sampleid=Pools read=7283 ch=21 start_time=2020-01-17T15:19:22Z",
+                "CGTTGTAGTTCACCAAACCCAACAACCTAGATAGGCCGCACCTAAAATGATACGGCGACCGCGAGATCCCGATTCTGACACTCTTTCCCTACACGACGCTCTTCCGATCTCACAAAATTATTAGGCATGGTGGTGCATGCCTGTGGTCCCAGCTACTTGGGAGGCTGAGGCAGGGAATCGCTTGAACCCAGGAGGCAGAGGTTGCAGTGAGCCAGACCAGCGCCACTGTACTCCAGCCTGGCAACAAGCAGGAACTCCGTTTCAAAACAAGATCTAGGGTGGTTTTCCCAAGTTAGTCAGGAGATCGGAAGAGCACGTCGGAACTCCAGTCACTAACTTGGTCAACCTAGAAATCTGATGCCGTCTTCTGCTTGTTAGGTGCTGGCCTATCTGGGTGTTGGGTTTGGTTGTATAACGT",
+                "+",
+                "&&##$$'$()'$$&$&%%%%%%%(*340++1','',,)-*669:98731/0+&&(%&$$$%(',,4$$,&;>@:2$-')&&25<<>/4.,'*$,--**6<;<>)1...2-&%+///+++*%'*1B:93;:8618'/&%,%'()344:=:'*6*5$*39090-/4/4)537-/166+)%,'(%'02)5-5@A:/;75&'3?=9=--=;.,/030'9%&&((48977;<=@;3445%'.2<=;51/(.,),*7*%&(,(()&'/01*)'%%$%%(%%$%$",
+                "",
+            ]
+        )
+        f.write(contents.encode())
+
+    # Create testdata of multiple different reads
+    dummy_read = "ATCG" * 10
+    truseq_i5 = "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"
+    truseq_index = "TAACTTGGTC"
+    truseq_i7 = (
+        f"GATCGGAAGAGCACACGTCTGAACTCCAGTCAC{truseq_index}ATCTCGTATGCCGTCTTCTGCTTG"
+    )
+
+    test_reads = {
+        "fragment_read": truseq_i5 + dummy_read + truseq_i7,
+        "singleton_i7_read": dummy_read + truseq_i7,
+        "singleton_i5_read": truseq_i5 + dummy_read,
+        "concat_read": truseq_i5 + truseq_i5 + dummy_read + truseq_i7 + truseq_i7,
+        "unknown_read": truseq_i5 + dummy_read + truseq_i5,
+    }
+
+    with gzip.open(tmp_path / "testdata_multiple.fastq.gz", "ab") as f:
+        for read_name, read_sequence in test_reads.items():
+            contents = "\n".join(
+                [f"@{read_name}", read_sequence, "+", len(read_sequence) * "%", ""]
+            )
+            f.write(contents.encode())
+
+    # Create adaptor fasta file
+    with open(tmp_path / "truseq.fasta", "w") as f:
+        contents = "\n".join(
+            [
+                ">truseq_i7",
+                "GATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG",
+                ">truseq_i5",
+                "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT",
+                "",
+            ]
+        )
+        f.write(contents)
+
+    fixture = {
+        "tmp_path": tmp_path,
+        "index_file": tmp_path / "truseq.fasta",
+        "index": truseq_index,
+        "testdata_single": tmp_path / "testdata_single.fastq.gz",
+        "testdata_multiple": tmp_path / "testdata_multiple.fastq.gz",
+        "paf_single": tmp_path / "test_single.paf",
+        "paf_multiple": tmp_path / "test_multiple.paf",
+    }
+
+    yield fixture
+
+    tmp.cleanup()
+
+
+def test_run_minimap2(fixture):
+    # Test alignment on single read
+    to_test.run_minimap2(
+        fastq_in=fixture["testdata_single"],
+        index_file=fixture["index_file"],
+        output_paf=fixture["paf_single"],
+        threads=1,
+        minimap_b=1,
+    )
+
+    expected = "0ad8bdb6-e009-43c5-95b1-d381e699f983\t418\t302\t374\t+\ttruseq_i7\t67\t0\t67\t51\t64\t25\tNM:i:23\tms:i:275\tAS:i:266\tnn:i:10\ttp:A:P\tcm:i:5\ts1:i:29\ts2:i:0\tde:f:0.2388\trl:i:0\tcg:Z:11M2D33M7I21M\tcs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17\n0ad8bdb6-e009-43c5-95b1-d381e699f983\t418\t45\t110\t+\ttruseq_i5\t58\t0\t58\t56\t66\t38\tNM:i:10\tms:i:313\tAS:i:305\tnn:i:0\ttp:A:P\tcm:i:10\ts1:i:37\ts2:i:0\tde:f:0.0667\trl:i:0\tcg:Z:15M1D6M7I3M1I33M\tcs:Z::15-a*cg:5+tcccgat:3+g:33\n"
+    received = open(fixture["paf_single"]).read()
+    assert expected == received
+
+    # Create alignment from multiple reads
+    to_test.run_minimap2(
+        fastq_in=fixture["testdata_multiple"],
+        index_file=fixture["index_file"],
+        output_paf=fixture["paf_multiple"],
+        threads=1,
+        minimap_b=1,
+    )
+
+
+def test_parse_paf_lines_simple(fixture):
+    paf_lines_simple = to_test.parse_paf_lines(fixture["paf_single"])
+    expected_simple = {
+        "0ad8bdb6-e009-43c5-95b1-d381e699f983": [
+            {
+                "read": "0ad8bdb6-e009-43c5-95b1-d381e699f983",
+                "adapter": "truseq_i7",
+                "rlen": 418,
+                "rstart": 302,
+                "rend": 374,
+                "strand": "+",
+                "cg": "cg:Z:11M2D33M7I21M",
+                "cs": "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17",
+                "q": 25,
+                "iseq": None,
+                "sample": None,
+            },
+            {
+                "read": "0ad8bdb6-e009-43c5-95b1-d381e699f983",
+                "adapter": "truseq_i5",
+                "rlen": 418,
+                "rstart": 45,
+                "rend": 110,
+                "strand": "+",
+                "cg": "cg:Z:15M1D6M7I3M1I33M",
+                "cs": "cs:Z::15-a*cg:5+tcccgat:3+g:33",
+                "q": 38,
+                "iseq": None,
+                "sample": None,
+            },
+        ]
+    }
+    assert paf_lines_simple == expected_simple
+
+
+def test_parse_paf_lines_complex(fixture):
+    paf_lines_complex = to_test.parse_paf_lines(fixture["paf_single"])
+    expected_complex = {
+        "0ad8bdb6-e009-43c5-95b1-d381e699f983": [
+            {
+                "read": "0ad8bdb6-e009-43c5-95b1-d381e699f983",
+                "adapter": "truseq_i7",
+                "rlen": 418,
+                "rstart": 302,
+                "rend": 374,
+                "strand": "+",
+                "cg": "cg:Z:11M2D33M7I21M",
+                "cs": "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17",
+                "q": 25,
+                "iseq": None,
+                "sample": None,
+            },
+            {
+                "read": "0ad8bdb6-e009-43c5-95b1-d381e699f983",
+                "adapter": "truseq_i5",
+                "rlen": 418,
+                "rstart": 45,
+                "rend": 110,
+                "strand": "+",
+                "cg": "cg:Z:15M1D6M7I3M1I33M",
+                "cs": "cs:Z::15-a*cg:5+tcccgat:3+g:33",
+                "q": 38,
+                "iseq": None,
+                "sample": None,
+            },
+        ]
+    }
+    assert paf_lines_complex == expected_complex
+
+
+def test_parse_cs(fixture):
+    paf_lines_simple = to_test.parse_paf_lines(fixture["paf_single"])
+
+    for read_name, alignments in paf_lines_simple.items():
+        for alignment in alignments:
+            cs_string = alignment["cs"]
+            cs_parsed = to_test.parse_cs(
+                cs_string=cs_string,
+                index_seq=fixture["index"],
+                umi_before=0,
+                umi_after=0,
+            )
+
+            if alignment["adapter"] == "truseq_i7":
+                # Perfect match to index
+                assert cs_parsed[0] == fixture["index"].lower()
+                assert cs_parsed[1] == 0
+            elif alignment["adapter"] == "truseq_i5":
+                # No index; the distance equals the length of all concatenated substitutions of the cs string
+                assert cs_parsed[0] == ""
+                assert cs_parsed[1] == 10
+            else:
+                raise AssertionError("Case not covered.")
+
+
+def test_layout_matches(fixture):
+    i5_name = "truseq_i5"
+    i7_name = "truseq_i7"
+    paf_entries = to_test.parse_paf_lines(fixture["paf_multiple"])
+
+    layout = to_test.layout_matches(
+        i5_name=i5_name, i7_name=i7_name, paf_entries=paf_entries
+    )
+    fragments, singletons, concats, unknowns = layout
+
+    # Assert reads were categorized as expected
+    assert len(fragments["fragment_read"]) == 2
+    assert len(singletons["singleton_i5_read"]) == 1
+    assert len(singletons["singleton_i7_read"]) == 1
+    assert len(concats["concat_read"]) == 4
+    assert len(unknowns["unknown_read"]) == 2
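Note: the new test suite is what the test-code workflow runs via `pytest .`. The same thing can be invoked programmatically through pytest's documented entry point; a minimal sketch for local runs, assuming it is executed from the repository root:

```python
import sys

import pytest

if __name__ == "__main__":
    # Equivalent in spirit to the CI step `pytest .`
    sys.exit(pytest.main(["-q", "tests"]))
```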