diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..1c10fd3
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,33 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile
+{
+  "name": "Anglerfish",
+  "build": {
+    // Sets the run context to one level up instead of the .devcontainer folder.
+    "context": "..",
+    // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename.
+    "dockerfile": "../Dockerfile",
+    // Use devcontainer target
+    "target": "devcontainer",
+  },
+  "initializeCommand": "echo 'Workaround, see github https://github.com/microsoft/vscode-remote-release/issues/9302#issuecomment-1854476541'",
+  "features": {},
+  "customizations": {
+    "vscode": {
+      "extensions": [
+        "esbenp.prettier-vscode",
+        "wholroyd.jinja",
+        "ms-python.python",
+        "charliermarsh.ruff@2024.2.0",
+        "ms-azuretools.vscode-docker",
+      ],
+    },
+  },
+  // Use 'forwardPorts' to make a list of ports inside the container available locally.
+  // "forwardPorts": [],
+  // "postCreateCommand": "pip3 install -e .",
+  // Configure tool-specific properties.
+  // "customizations": {},
+  // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root.
+  // "remoteUser": "devcontainer"
+}
diff --git a/.github/workflows/anglerfish.yml b/.github/workflows/anglerfish.yml
index 0fed1f1..0bef153 100644
--- a/.github/workflows/anglerfish.yml
+++ b/.github/workflows/anglerfish.yml
@@ -19,6 +19,7 @@ jobs:
           python=${{ matrix.python-version }} pip
           environment-file: environment.yml
+          environment-name: anglerfish-dev

       # Install Anglerfish
       - shell: bash -l {0}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 09aaac5..4162f50 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 # .pre-commit-config.yaml
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.6
+    rev: v0.1.13
    hooks:
      - id: ruff
      - id: ruff-format
diff --git a/Dockerfile b/Dockerfile
index 445b676..53aad9f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,59 @@
-FROM mambaorg/micromamba
+FROM mambaorg/micromamba as base
 LABEL author="Remi-Andre Olsen" \
     description="Anglerfish development version" \
     maintainer="remi-andre.olsen@scilifelab.se"
 USER root
+
+# Check for arm64 architecture to install minimap2
+ARG TARGETARCH
+ENV MINIMAP_VERSION=2.26
+RUN if [ "$TARGETARCH" = "arm64" ]; then \
+    # Install compilation tools for minimap2
+    apt-get update;\
+    apt-get install -y curl build-essential libz-dev;\
+    # Download minimap2
+    curl -L https://github.com/lh3/minimap2/archive/refs/tags/v${MINIMAP_VERSION}.tar.gz | tar -zxvf - ;\
+    # Compile minimap2
+    cd minimap2-${MINIMAP_VERSION};\
+    make arm_neon=1 aarch64=1; \
+    # Add to path
+    mv minimap2 /usr/local/bin/;\
+    fi
+
 COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /
-RUN micromamba env create -n anglerfish && micromamba install -y -n anglerfish -f /environment.yml && micromamba clean --all --yes
-ENV PATH /opt/conda/envs/anglerfish/bin:$PATH
+COPY --chown=$MAMBA_USER:$MAMBA_USER requirements.txt /
+
+# Remove minimap2 from environment.yml for arm64
+RUN if [ "$TARGETARCH" = "arm64" ]; then \
+    grep -v 'minimap2' /environment.yml > /environment.tmp.yml ;\
+    else \
+    cp /environment.yml /environment.tmp.yml ;\
+    fi ;\
+    chown $MAMBA_USER:$MAMBA_USER /environment.tmp.yml

 # Add source files to the container
 ADD . /usr/src/anglerfish
 WORKDIR /usr/src/anglerfish
-RUN eval "$(micromamba shell hook --shell bash)" && micromamba activate anglerfish && python -m pip install .[dev]
+
+# Activate the environment
+ARG MAMBA_DOCKERFILE_ACTIVATE=1
+RUN micromamba install -y -n base -f /environment.tmp.yml && micromamba clean --all --yes
+
+#####
+# Devcontainer
+#####
+FROM base as devcontainer
+
+# Useful tools for devcontainer
+RUN apt-get update;\
+    apt-get install -y git vim
+RUN eval "$(micromamba shell hook --shell bash)" && python -m pip install -e .[dev]
+
+#####
+# Main
+#####
+FROM base as main
 USER $MAMBA_USER
+RUN eval "$(micromamba shell hook --shell bash)" && python -m pip install .[dev]
\ No newline at end of file
diff --git a/README.md b/README.md
index 7de2e87..be02c73 100644
--- a/README.md
+++ b/README.md
@@ -33,12 +33,25 @@ conda install -c bioconda anglerfish
 pip install --upgrade --force-reinstall git+https://github.com/remiolsen/anglerfish.git
 ```

+### For Arm64 processors (e.g. Apple M2)
+
+Anglerfish depends on minimap2, which needs to be [compiled from source](https://github.com/lh3/minimap2?tab=readme-ov-file#installation) for Arm64 processors.
+When minimap2 is compiled and available on `$PATH`, anglerfish can be installed with pip:
+
+```shell
+pip install bio-anglerfish
+```
+
+Additionally, if Docker is your cup of tea, the Dockerfile supplied in this repository should also work on both arm64 and x86 processors.
+
 ## Source development

 1. [Install miniconda](https://docs.conda.io/en/latest/miniconda.html).
 2. Set up repo clone with editable install

+For x86 processors (e.g. Intel/AMD):
+
 ```
 git clone https://github.com/remiolsen/anglerfish.git
 cd anglerfish
@@ -49,6 +62,18 @@ conda activate anglerfish-dev
 pip install -e ".[dev]"
 ```

+For Arm64 processors (e.g. Apple M1/M2), first [compile and install minimap2 manually](https://github.com/lh3/minimap2?tab=readme-ov-file#installation), then:
+
+```
+git clone https://github.com/remiolsen/anglerfish.git
+cd anglerfish
+# Create the anglerfish conda environment (but remove minimap2)
+conda env create -f <(grep -v minimap2 environment.yml)
+# Install anglerfish
+conda activate anglerfish-dev
+pip install -e ".[dev]"
+```
+
 3. (Optional) Install pre-commit to prevent committing code that will fail linting

 ```
@@ -146,6 +171,16 @@ In folder `anglerfish_????_??_??_?????/`

 - `anglerfish_stats.txt` Barcode statistics from anglerfish run
 - `anglerfish_stats.json` Machine readable anglerfish statistics

+## Anglerfish Explore (Experimental)
+
+`anglerfish explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on which adaptor types are present, which index lengths are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running:
+
+```shell
+python anglerfish/explore/cli.py
+```
+
+inside the anglerfish directory.
+
 ## Credits

 The Anglerfish code was written by [@remiolsen](https://github.com/remiolsen) but it would not exist without the contributions of [@FranBonath](https://github.com/FranBonath), [@taborsak](https://github.com/taborsak), [@ssjunnebo](https://github.com/ssjunnebo) and Carl Rubin.
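The next file, `anglerfish/demux/adaptor.py`, factors the old samplesheet-bound `Adaptor` class out into a reusable `Adaptor`/`AdaptorPart` pair. A minimal usage sketch, based only on the code that follows (the actual output depends on what the packaged `config/adaptors.yaml` contains):

```python
# Sketch of the new adaptor API introduced in anglerfish/demux/adaptor.py.
from anglerfish.demux.adaptor import load_adaptors

# load_adaptors() parses the packaged config/adaptors.yaml into Adaptor
# objects; load_adaptors(raw=True) would return the plain YAML dict instead.
for adaptor in load_adaptors():
    # has_index() is True when the sequence contains the "-NNN-" index delimiter
    print(adaptor.name, adaptor.i7.has_index(), adaptor.i5.has_index())
    # insert_Ns=False strips the delimiter entirely, which is what
    # `anglerfish explore` writes out as its minimap2 reference
    print(adaptor.get_fastastring(insert_Ns=False))
```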
diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py
new file mode 100644
index 0000000..8db7b66
--- /dev/null
+++ b/anglerfish/demux/adaptor.py
@@ -0,0 +1,69 @@
+import importlib.resources
+import os
+
+import yaml
+
+
+class Adaptor:
+    def __init__(self, adaptors, delim, adaptor, i7_index=None, i5_index=None):
+        self.i7 = AdaptorPart(adaptors[adaptor]["i7"], adaptor, delim, i7_index)
+        self.i5 = AdaptorPart(adaptors[adaptor]["i5"], adaptor, delim, i5_index)
+        self.name = f"{adaptor}"
+        self.delim = delim
+
+    def get_i7_mask(self, insert_Ns=True):
+        return self.i7.get_mask(insert_Ns)
+
+    def get_i5_mask(self, insert_Ns=True):
+        return self.i5.get_mask(insert_Ns)
+
+    def get_fastastring(self, insert_Ns=True):
+        fasta_i5 = f">{self.name}_i5\n{self.get_i5_mask(insert_Ns)}\n"
+        fasta_i7 = f">{self.name}_i7\n{self.get_i7_mask(insert_Ns)}\n"
+        return fasta_i5 + fasta_i7
+
+
+class AdaptorPart:
+    def __init__(self, sequence, name, delim, index):
+        self.sequence = sequence
+        self.name = name
+        self.delim = delim
+        self.index = index
+
+    def has_index(self):
+        return self.sequence.find(self.delim) > -1
+
+    def len_before_index(self):
+        return self.sequence.find(self.delim)
+
+    def len_after_index(self):
+        return len(self.sequence) - self.sequence.find(self.delim) - len(self.delim)
+
+    def get_mask(self, insert_Ns):
+        if self.has_index():
+            if not insert_Ns:
+                return self.sequence.replace(self.delim, "")
+            else:
+                return self.sequence.replace(self.delim, "N" * len(self.index))
+        else:
+            return self.sequence
+
+
+# Fetch all adaptors
+def load_adaptors(raw=False):
+    p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml")
+    assert isinstance(p, os.PathLike)
+
+    adaptors_raw = []
+    with open(p) as f:
+        adaptors_raw = yaml.safe_load(f)
+
+    if raw:
+        return adaptors_raw
+    adaptors = []
+    for adaptor in adaptors_raw:
+        adaptors.append(
+            Adaptor(adaptors_raw, "-NNN-", adaptor, i7_index=None, i5_index=None)
+        )
+
+    return adaptors
diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py
index 444bb01..db43cac 100644
--- a/anglerfish/demux/demux.py
+++ b/anglerfish/demux/demux.py
@@ -24,7 +24,7 @@ def parse_cs(cs_string, index, max_distance):
     return nts, lev.distance(index.lower(), nts)


-def run_minimap2(fastq_in, indexfile, output_paf, threads):
+def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=1):
     """
     Runs Minimap2
     """
@@ -36,9 +36,10 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads):
         "10",
         "-w",
         "5",
-        "-B1",
-        "-A6",
-        "--dual=no",
+        "-A",
+        "6",
+        "-B",
+        str(minimap_b),
         "-c",
         "-t",
         str(threads),
@@ -46,14 +47,18 @@
         fastq_in,
     ]

-    with open(output_paf, "ab") as ofile:
-        subprocess.run(cmd, stdout=ofile, check=True)
+    run_log = f"{output_paf}.log"
+    with open(output_paf, "ab") as ofile, open(run_log, "ab") as log_file:
+        p1 = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=log_file)
+        subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True)


-def parse_paf_lines(paf, min_qual=10):
+def parse_paf_lines(paf, min_qual=10, complex_identifier=False):
     """
     Read and parse one paf alignment lines.
     Returns a dict with the import values for later use
+    If complex_identifier is True (default False), the keys will be of the form
+    "{read}_{i5_or_i7}_{strand_str}"
     """
     entries = {}
     with open(paf) as paf:
@@ -62,17 +67,28 @@
             try:
                 # TODO: objectify this
                 entry = {
+                    "read": aln[0],
                     "adapter": aln[5],
                     "rlen": int(aln[1]),  # read length
                     "rstart": int(aln[2]),  # start alignment on read
                     "rend": int(aln[3]),  # end alignment on read
                     "strand": aln[4],
+                    "cg": aln[-2],  # cigar string
                     "cs": aln[-1],  # cs string
                     "q": int(aln[11]),  # Q score
                     "iseq": None,
                     "sample": None,
                 }
-                read = aln[0]
+                read = entry["read"]
+                if complex_identifier:
+                    i5_or_i7 = entry["adapter"].split("_")[-1]
+                    if entry["strand"] == "+":
+                        strand_str = "positive"
+                    else:
+                        strand_str = "negative"
+                    ix = f"{read}_{i5_or_i7}_{strand_str}"
+                else:
+                    ix = read
             except IndexError:
                 log.debug(f"Could not find all paf columns: {read}")
                 continue
@@ -81,10 +97,10 @@
                 log.debug(f"Low quality alignment: {read}")
                 continue

-            if read in entries.keys():
-                entries[read].append(entry)
+            if ix in entries.keys():
+                entries[ix].append(entry)
             else:
-                entries[read] = [entry]
+                entries[ix] = [entry]

     return entries
@@ -166,14 +182,14 @@ def cluster_matches(
         fi7 = ""
         for _, adaptor, _ in sample_adaptor:
             try:
-                i5_seq = adaptor.i5_index
+                i5_seq = adaptor.i5.index
                 if i5_reversed and i5_seq is not None:
                     i5_seq = str(Seq(i5_seq).reverse_complement())
                 fi5, d1 = parse_cs(i5["cs"], i5_seq, max_distance)
             except AttributeError:
                 d1 = 0  # presumably it's single index, so no i5

-            i7_seq = adaptor.i7_index
+            i7_seq = adaptor.i7.index
             if i7_reversed and i7_seq is not None:
                 i7_seq = str(Seq(i7_seq).reverse_complement())
             fi7, d2 = parse_cs(i7["cs"], i7_seq, max_distance)
@@ -189,8 +205,8 @@
             continue
         if dists[index_min] > max_distance:
             # Find only full length i7(+i5) adaptor combos. Basically a list of "known unknowns"
-            if len(fi7) + len(fi5) == len(adaptor.i7_index or "") + len(
-                adaptor.i5_index or ""
+            if len(fi7) + len(fi5) == len(adaptor.i7.index or "") + len(
+                adaptor.i5.index or ""
             ):
                 fi75 = "+".join([i for i in [fi7, fi5] if not i == ""])
                 unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."])
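The `complex_identifier` flag added above only changes how alignments are keyed, which is what lets `anglerfish explore` keep one best hit per read, adaptor end and strand. A rough illustration (the PAF path is a hypothetical file produced by `run_minimap2` above):

```python
# Illustration of the two keying modes of parse_paf_lines();
# "alignments.paf" is a placeholder path, not a file shipped with the repo.
from anglerfish.demux.demux import parse_paf_lines

# Default: one list of hits per read name, e.g. entries["read1"]
simple = parse_paf_lines("alignments.paf")

# complex_identifier=True: keys become "{read}_{i5_or_i7}_{strand_str}",
# e.g. entries["read1_i7_positive"], so hits for the i5 and i7 ends and for
# each strand of the same read no longer collapse into one list.
complex_keys = parse_paf_lines("alignments.paf", complex_identifier=True)
```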
diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py
index 456ff3b..46923e0 100644
--- a/anglerfish/demux/report.py
+++ b/anglerfish/demux/report.py
@@ -97,8 +97,8 @@ def write_dataframe(self, outdir, samplesheet):
             sen_dict = asdict(sentry)
             if sen_dict["sample_name"] == s_dict["sample_name"]:
                 s_dict["adaptor_name"] = sen_dict["adaptor"].name
-                s_dict["i7_index"] = sen_dict["adaptor"].i7_index
-                s_dict["i5_index"] = sen_dict["adaptor"].i5_index
+                s_dict["i7_index"] = sen_dict["adaptor"].i7.index
+                s_dict["i5_index"] = sen_dict["adaptor"].i5.index
         out_list.append(s_dict)
     for key, unmatch in self.unmatched_stats.items():
         for unmatch_sample in unmatch:
diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py
index ed0c2d6..b62c53f 100644
--- a/anglerfish/demux/samplesheet.py
+++ b/anglerfish/demux/samplesheet.py
@@ -1,19 +1,15 @@
 import csv
 import glob
-import importlib.resources
-import os
 import re
 from dataclasses import dataclass
 from itertools import combinations

 import Levenshtein as lev
-import yaml

-p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml")
-assert isinstance(p, os.PathLike)
-with open(p) as stream:
-    adaptors = yaml.safe_load(stream)
+from anglerfish.demux.adaptor import Adaptor, load_adaptors

 delim = "-NNN-"
+adaptors = load_adaptors(raw=True)


 @dataclass
@@ -24,32 +20,6 @@ class SampleSheetEntry:
     ont_barcode: str


-class Adaptor:
-    def __init__(self, adaptor, i7_index=None, i5_index=None):
-        self.i5 = adaptors[adaptor]["i5"]
-        self.i7 = adaptors[adaptor]["i7"]
-        self.i5_index = i5_index
-        self.i7_index = i7_index
-        self.name = f"{adaptor}_len{len(i7_index)}"
-
-        if delim in self.i5 and i5_index is None:
-            raise UserWarning("Adaptor has i5 but no sequence was specified")
-        if delim in self.i7 and i7_index is None:
-            raise UserWarning("Adaptor has i7 but no sequence was specified")
-
-    def get_i5_mask(self):
-        if delim in self.i5:
-            return self.i5.replace(delim, "N" * len(self.i5_index))
-        else:
-            return self.i5
-
-    def get_i7_mask(self):
-        if delim in self.i7:
-            return self.i7.replace(delim, "N" * len(self.i7_index))
-        else:
-            return self.i7
-
-
 class SampleSheet:
     def __init__(self, input_csv, ont_bc):
         # Read samplesheet in format:
@@ -83,7 +53,7 @@ def __init__(self, input_csv, ont_bc):
                 sample_name = row["sample_name"]
                 test_globs[row["fastq_path"]] = glob.glob(row["fastq_path"])

-                bc_re = re.compile("\/(barcode\d\d|unclassified)\/")
+                bc_re = re.compile(r"\/(barcode\d\d|unclassified)\/")
                 ont_barcode = None
                 if ont_bc:
                     ob = re.findall(bc_re, row["fastq_path"])
@@ -94,7 +64,7 @@ def __init__(self, input_csv, ont_bc):

                 ss_entry = SampleSheetEntry(
                     sample_name,
-                    Adaptor(row["adaptors"], i7, i5),
+                    Adaptor(adaptors, delim, row["adaptors"], i7, i5),
                     row["fastq_path"],
                     ont_barcode,
                 )
@@ -135,10 +105,10 @@ def minimum_bc_distance(self):
         for ont_barcode, adaptors in ss_by_bc.items():
             testset[ont_barcode] = []
             for adaptor in adaptors:
-                if adaptor.i5_index is not None:
-                    testset[ont_barcode].append(adaptor.i5_index + adaptor.i7_index)
+                if adaptor.i5.has_index():
+                    testset[ont_barcode].append(adaptor.i5.index + adaptor.i7.index)
                 else:
-                    testset[ont_barcode].append(adaptor.i7_index)
+                    testset[ont_barcode].append(adaptor.i7.index)

         fq_distances = []
         for ont_barcode, adaptors in testset.items():
@@ -148,7 +118,9 @@ def minimum_bc_distance(self):
                 else:
                     for a, b in [i for i in combinations(adaptors, 2)]:
                         dist = lev.distance(a, b)
-                        assert dist > 0, f"""There is one or more identical barcodes in the input samplesheet.
+                        assert (
+                            dist > 0
+                        ), f"""There are one or more identical barcodes in the input samplesheet.
                         First one found: {a}. If these exist in different ONT barcodes, please specify the --ont_barcodes flag."""
                         distances.append(dist)
             fq_distances.append(min(distances))
diff --git a/anglerfish/explore/__init__.py b/anglerfish/explore/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/anglerfish/explore/cli.py b/anglerfish/explore/cli.py
new file mode 100644
index 0000000..1cceab5
--- /dev/null
+++ b/anglerfish/explore/cli.py
@@ -0,0 +1,100 @@
+import click
+
+from anglerfish.explore.explore import run_explore
+
+
+@click.command()
+@click.option("-f", "--fastq", required=True, help="Fastq file to align")
+@click.option("-o", "--outdir", required=True, help="Output directory")
+@click.option(
+    "-t",
+    "--threads",
+    default=4,
+    type=int,
+    help="Number of threads specified to minimap2",
+)
+@click.option(
+    "-e",
+    "--use-existing",
+    is_flag=True,
+    help="Use existing alignments if found in the specified output directory.",
+)
+@click.option(
+    "-g",
+    "--good_hit_threshold",
+    default=0.9,
+    type=float,
+    help="Fraction of adaptor bases immediately before and immediately after index insert required to match perfectly for a hit to be considered a good hit (default=0.9).",
+)
+@click.option(
+    "-i",
+    "--insert_thres_low",
+    default=4,
+    type=int,
+    help="Lower threshold for index(+UMI) insert length, with value included (default=4).",
+)
+@click.option(
+    "-j",
+    "--insert_thres_high",
+    default=30,
+    type=int,
+    help="Upper threshold for index(+UMI) insert length, with value included (default=30).",
+)
+@click.option(
+    "-B",
+    "--minimap_b",
+    default=4,
+    type=int,
+    help="Minimap2 -B parameter, mismatch penalty (default=4).",
+)
+@click.option(
+    "-m",
+    "--min_hits_per_adaptor",
+    default=50,
+    type=int,
+    help="Minimum number of good hits for an adaptor to be included in the analysis (default=50).",
+)
+@click.option(
+    "-u",
+    "--umi_threshold",
+    default=11,
+    type=float,
+    help="Minimum number of bases in insert to perform entropy calculation (default=11).",
+)
+@click.option(
+    "-k",
+    "--kmer_length",
+    default=2,
+    type=int,
+    help="Length of k-mers to use for entropy calculation (default=2).",
+)
+def main(
+    fastq,
+    outdir,
+    threads,
+    use_existing,
+    good_hit_threshold,
+    insert_thres_low,
+    insert_thres_high,
+    minimap_b,
+    min_hits_per_adaptor,
+    umi_threshold,
+    kmer_length,
+):
+    run_explore(
+        fastq,
+        outdir,
+        threads,
+        use_existing,
+        good_hit_threshold,
+        insert_thres_low,
+        insert_thres_high,
+        minimap_b,
+        min_hits_per_adaptor,
+        umi_threshold,
+        kmer_length,
+    )
+
+
+if __name__ == "__main__":
+    main()
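For reference, this is roughly the `run_explore` call the CLI above wires up, with the option defaults spelled out. A sketch only; the input and output paths are placeholders:

```python
# Direct call equivalent to `python anglerfish/explore/cli.py` with defaults.
from anglerfish.explore.explore import run_explore

run_explore(
    fastq="pool.fastq.gz",   # hypothetical input pool
    outdir="explore_out",    # created if it does not already exist
    threads=4,
    use_existing=False,      # True reuses .paf files found in outdir
    good_hit_threshold=0.9,
    insert_thres_low=4,
    insert_thres_high=30,
    minimap_b=4,
    min_hits_per_adaptor=50,
    umi_threshold=11,
    kmer_length=2,
)
```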
diff --git a/anglerfish/explore/entropy.py b/anglerfish/explore/entropy.py
new file mode 100644
index 0000000..9a6bea9
--- /dev/null
+++ b/anglerfish/explore/entropy.py
@@ -0,0 +1,63 @@
+import itertools
+from collections import deque
+
+import numpy as np
+
+
+# Rolling window implementation
+def _window(seq, n=3):
+    win = deque(maxlen=n)
+    for char in seq:
+        win.append(char)
+        if len(win) >= n:
+            yield tuple(win)
+
+
+def _entropy(pk, qk):
+    """
+    Kullback-Leibler divergence (relative entropy)
+    Reference:
+    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html
+    """
+    # To avoid zeroes in pk, add a tiny fraction of observed values
+    pk = pk + 1e-2
+
+    if abs(np.sum(pk) - 1) > 1e-5:
+        pk = pk / np.sum(pk)
+    return np.sum(pk * np.log(pk / qk))
+
+
+# Each sequence gives a matrix of position and k-mer occurrence.
+def _seq_to_count_matrix(dna_seq, kmer_positions, max=50, k=2):
+    """max is the length of the prefix that will be considered"""
+    if len(dna_seq) < max:
+        max = len(dna_seq)
+
+    matrix = np.zeros((4**k, max - k + 1))
+    for i, word in enumerate(_window(dna_seq[:max], n=k)):
+        matrix[kmer_positions[word], i] += 1
+    return matrix
+
+
+def _count_for_seqs(seqs, kmer_positions, insert_length, k=2):
+    seqs = [seq for seq in seqs if len(seq) == insert_length]
+    matrices = [_seq_to_count_matrix(seq, kmer_positions, max=50, k=k) for seq in seqs]
+    stacked = np.stack(matrices)
+    sum_a = np.sum(stacked, axis=0)
+    even_dist = np.ones(4**k) / (4**k)
+    entropies = [_entropy(a, even_dist) for a in sum_a.T]
+    return entropies
+
+
+def _extract_inserts_from_df(df):
+    mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$"
+    return df.cs.str.extract(mim_re_cs, expand=False).str.upper().tolist()
+
+
+def calculate_relative_entropy(df_good_hits, kmer_length, insert_length):
+    all_kmers = itertools.product("ACTG", repeat=kmer_length)
+    kmer_positions = dict(
+        (kmer, position) for kmer, position in zip(all_kmers, range(4**kmer_length))
+    )
+    seqs = _extract_inserts_from_df(df_good_hits)
+    entropies = _count_for_seqs(seqs, kmer_positions, insert_length, k=kmer_length)
+    return entropies
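To make the entropy helper concrete, here is a toy run of `calculate_relative_entropy`. The cs tags are fabricated, but they follow the "match+insert:match" shape the regex expects; real values come out of minimap2 via `parse_paf_lines`:

```python
# Toy demonstration of calculate_relative_entropy() on two invented hits.
import pandas as pd

from anglerfish.explore.entropy import calculate_relative_entropy

df_good_hits = pd.DataFrame(
    {
        "cs": [
            "cs:Z::12+acgtacgtac:11",  # 10 bp insert
            "cs:Z::13+acgtcgatcg:10",  # 10 bp insert
        ]
    }
)

# One KL divergence per 2-mer position across the 10 bp inserts, measured
# against a uniform distribution; values near zero hint at random bases
# (UMI-like), larger values at a fixed index sequence.
print(calculate_relative_entropy(df_good_hits, kmer_length=2, insert_length=10))
```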
diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py
new file mode 100644
index 0000000..1fd9cf8
--- /dev/null
+++ b/anglerfish/explore/explore.py
@@ -0,0 +1,200 @@
+import logging
+import os
+import uuid
+
+import pandas as pd
+
+from anglerfish.demux.adaptor import load_adaptors
+from anglerfish.demux.demux import parse_paf_lines, run_minimap2
+from anglerfish.explore.entropy import calculate_relative_entropy
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("explore")
+
+
+def run_explore(
+    fastq,
+    outdir,
+    threads,
+    use_existing,
+    good_hit_threshold,
+    insert_thres_low,
+    insert_thres_high,
+    minimap_b,
+    min_hits_per_adaptor,
+    umi_threshold,
+    kmer_length,
+):
+    # Set up a run directory
+    run_uuid = str(uuid.uuid4())
+    try:
+        os.mkdir(outdir)
+    except FileExistsError:
+        log.info(f"Output directory {outdir} already exists")
+        if not use_existing:
+            log.error(
+                f"Output directory {outdir} already exists, please use --use-existing to continue"
+            )
+            exit(1)
+        else:
+            pass
+
+    log.info("Running anglerfish explore")
+    log.info(f"Run uuid {run_uuid}")
+
+    adaptors = load_adaptors()
+    alignments = []
+
+    # Map all reads against all adaptors
+    for adaptor in adaptors:
+        adaptor_name = adaptor.name
+
+        # Align
+        aln_path = os.path.join(outdir, f"{adaptor_name}.paf")
+        alignments.append((adaptor, aln_path))
+        if os.path.exists(aln_path) and use_existing:
+            log.info(f"Skipping {adaptor_name} as alignment already exists")
+            continue
+        adaptor_path = os.path.join(outdir, f"{adaptor_name}.fasta")
+        with open(adaptor_path, "w") as f:
+            f.write(adaptor.get_fastastring(insert_Ns=False))
+
+        log.info(f"Aligning {adaptor_name}")
+        run_minimap2(fastq, adaptor_path, aln_path, threads, minimap_b)
+
+    # Parse alignments
+    entries = {}
+    adaptors_included = []
+    for adaptor, aln_path in alignments:
+        log.info(f"Parsing {adaptor.name}")
+        aln_dict_with_lists = parse_paf_lines(aln_path, complex_identifier=True)
+
+        # Choose only the highest scoring alignment for each combination of read, adaptor end and strand
+        aln_dict = dict(
+            [(k, max(v, key=lambda x: x["q"])) for k, v in aln_dict_with_lists.items()]
+        )
+
+        df = pd.DataFrame.from_dict(aln_dict, orient="index")
+        nr_good_hits = {}
+        for adaptor_end_name, adaptor_end in zip(
+            ["i5", "i7"], [adaptor.i5, adaptor.i7]
+        ):
+            if adaptor_end.has_index():
+                # Match Insert Match = mim
+                # The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert
+                # The cg string allows for mismatches within the matching strings but no insertions or deletions
+                # All cg string matches are also cs string matches (subset) but not vice versa
+                mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$"
+                mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$"
+                df_mim = df[df.cs.str.match(mim_re_cs)]
+
+                # Extract the match lengths
+                match_col_df = df_mim.cg.str.extract(mim_re_cg).rename(
+                    {0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1
+                )
+                match_col_df = match_col_df.astype(
+                    {
+                        "match_1_len": "int32",
+                        "insert_len": "int32",
+                        "match_2_len": "int32",
+                    }
+                )
+
+                df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df
+
+                # Alignment thresholds
+                before_thres = round(
+                    adaptor_end.len_before_index() * good_hit_threshold
+                )
+                after_thres = round(adaptor_end.len_after_index() * good_hit_threshold)
+
+                requirements = (
+                    (df_mim["match_1_len"] >= (before_thres))
+                    & (df_mim["insert_len"] >= insert_thres_low)
+                    & (df_mim["insert_len"] <= insert_thres_high)
+                    & (df_mim["match_2_len"] >= (after_thres))
+                )
+                df_good_hits = df_mim[requirements]
+
+                median_insert_length = df_good_hits["insert_len"].median()
+                insert_lengths = df_good_hits["insert_len"].value_counts()
+            else:
+                m_re_cs = r"^cs:Z::([1-9][0-9]*)$"
+                df_good_hits = df[df.cs.str.match(m_re_cs)]
+                match_col_df = df_good_hits.cs.str.extract(m_re_cs).rename(
+                    {0: "match_1_len"}, axis=1
+                )
+                match_col_df = match_col_df.astype({"match_1_len": "int32"})
+
+                df_good_hits.loc[
+                    match_col_df.index, match_col_df.columns
+                ] = match_col_df
+
+                thres = round(
+                    (adaptor_end.len_before_index() + adaptor_end.len_after_index())
+                    * good_hit_threshold
+                )
+                df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres]
+
+                median_insert_length = None
+                insert_lengths = None
+
+            # Filter multiple hits per read and adaptor end
+            df_good_hits = df_good_hits.sort_values(
+                by=["read", "match_1_len"], ascending=False
+            ).drop_duplicates(subset=["read", "adapter"], keep="first")
+
+            if adaptor.name not in entries.keys():
+                entries[adaptor.name] = {}
+            entries[adaptor.name][adaptor_end_name] = df_good_hits
+
+            nr_good_hits[adaptor_end_name] = len(df_good_hits)
+            log.info(
+                f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits."
+            )
+
+        if min(nr_good_hits.values()) >= min_hits_per_adaptor:
+            log.info(f"Adaptor {adaptor.name} is included in the analysis")
+            adaptors_included.append(adaptor)
+        else:
+            log.info(f"Adaptor {adaptor.name} is excluded from the analysis")
+
+    # Print insert length info for adaptor types included in the analysis
+    for adaptor in adaptors_included:
+        for adaptor_end_name, adaptor_end in zip(
+            ["i5", "i7"], [adaptor.i5, adaptor.i7]
+        ):
+            df_good_hits = entries[adaptor.name][adaptor_end_name]
+            if adaptor_end.has_index():
+                median_insert_length = df_good_hits["insert_len"].median()
+                if median_insert_length > umi_threshold:
+                    # Calculate entropies here
+                    entropies = calculate_relative_entropy(
+                        df_good_hits, kmer_length, median_insert_length
+                    )
+                    entropy_file = os.path.join(
+                        outdir, f"{adaptor.name}_{adaptor_end_name}.entropy.csv"
+                    )
+                    pd.Series(entropies).to_csv(entropy_file, float_format="%.2f")
+                    log.info(
+                        f"{adaptor.name}:{adaptor_end_name} relative entropy for k={kmer_length} over the index saved to {entropy_file}"
+                    )
+                insert_lengths = df_good_hits["insert_len"].value_counts()
+                log.info(
+                    f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}"
+                )
+                histogram_file = os.path.join(
+                    outdir, f"{adaptor.name}_{adaptor_end_name}.hist.csv"
+                )
+                insert_lengths[sorted(insert_lengths.index)].to_csv(histogram_file)
+                log.info(
+                    f"{adaptor.name}:{adaptor_end_name} insert length histogram saved to {histogram_file}"
+                )
+            else:
+                median_insert_length = None
+                insert_lengths = None
+                log.info(
+                    f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits (no insert length since no index)"
+                )
diff --git a/environment.yml b/environment.yml
index 26fd280..ec2d96d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -4,10 +4,9 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - conda-forge::biopython=1.79
-  - conda-forge::python-levenshtein=0.23.0
-  - conda-forge::numpy>=1.22.0
-  - bioconda::minimap2=2.20
-  - conda-forge::pyyaml=6.0
+  - bioconda::minimap2=2.26
+  - pip
+  - pip:
+      - -r requirements.txt
   - conda-forge::pre-commit
   - conda-forge::prettier
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..db39464
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+biopython==1.83
+click==8.1.7
+levenshtein==0.23.0
+numpy==1.26.3
+pandas==2.1.4
+pyyaml==6.0.1
\ No newline at end of file
diff --git a/setup.py b/setup.py
index bba0426..0b50d3f 100644
--- a/setup.py
+++ b/setup.py
@@ -32,12 +32,8 @@
     python_requires=">=3.10",
     packages=find_packages(),
    package_data={"": ["config/adaptors.yaml"]},
-    install_requires=[
-        "python-levenshtein==0.23.0",
-        "biopython==1.79",
-        "numpy>=1.22.0",
-        "pyyaml==6.0",
-    ],
+    # Read requirements.txt
+    install_requires=[x.strip() for x in open("requirements.txt").readlines()],
     extras_require={"dev": ["ruff", "mypy", "editorconfig-checker"]},
     entry_points={
         "console_scripts": [