From ff353fe0386856a230d69d7ec100a4d217ff7616 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 16 Jan 2024 10:49:33 +0100 Subject: [PATCH 01/34] Working dockerfile for arm64 --- Dockerfile | 41 +++++++++++++++++++++++++++++++++++++---- environment.yml | 9 ++++----- requirements.txt | 4 ++++ setup.py | 8 ++------ 4 files changed, 47 insertions(+), 15 deletions(-) create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile index c79f571..a8b2449 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,12 +5,45 @@ LABEL author="Remi-Andre Olsen" \ maintainer="remi-andre.olsen@scilifelab.se" USER root + +# Check for arm64 architecture to install minimap2 +ARG TARGETARCH +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + # Install compliation tools for minimap2 + apt-get update;\ + apt-get install -y curl build-essential libz-dev;\ + fi + +ENV MINIMAP_VERSION=2.26 + +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + # Download minimap2 + curl -L https://github.com/lh3/minimap2/archive/refs/tags/v${MINIMAP_VERSION}.tar.gz | tar -zxvf - ;\ + fi + +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + # Compile minimap2 + cd minimap2-${MINIMAP_VERSION};\ + make arm_neon=1 aarch64=1; \ + # Add to path + mv minimap2 /usr/local/bin/;\ + fi + COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml / -RUN micromamba env create -n anglerfish && micromamba install -y -n anglerfish -f /environment.yml && micromamba clean --all --yes -ENV PATH /opt/conda/envs/anglerfish/bin:$PATH +COPY --chown=$MAMBA_USER:$MAMBA_USER requirements.txt / + +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + grep -v 'minimap2' /environment.yml > /environment.tmp.yml ;\ + else \ + cp /environment.yml /environment.tmp.yml ;\ + fi ;\ + chown $MAMBA_USER:$MAMBA_USER /environment.tmp.yml # Add source files to the container ADD . /usr/src/anglerfish WORKDIR /usr/src/anglerfish -RUN eval "$(micromamba shell hook --shell bash)" && micromamba activate anglerfish && python -m pip install . -USER $MAMBA_USER + +RUN micromamba install -y -n base -f /environment.tmp.yml && micromamba clean --all --yes + +RUN eval "$(micromamba shell hook --shell bash)" && micromamba activate base && python -m pip install . +USER $MAMBA_USER \ No newline at end of file diff --git a/environment.yml b/environment.yml index f695b3d..b36521a 100644 --- a/environment.yml +++ b/environment.yml @@ -3,8 +3,7 @@ channels: - conda-forge - bioconda dependencies: - - conda-forge::biopython=1.79 - - conda-forge::python-levenshtein=0.23.0 - - conda-forge::numpy=1.22.0 - - bioconda::minimap2=2.20 - - conda-forge::pyyaml=6.0 + - bioconda::minimap2=2.26 + - pip + - pip: + - -r requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9fd76d3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +numpy==1.26.3 +biopython==1.83 +levenshtein==0.23.0 +pyyaml==6.0.1 \ No newline at end of file diff --git a/setup.py b/setup.py index 72b5591..e1bbfad 100644 --- a/setup.py +++ b/setup.py @@ -30,12 +30,8 @@ python_requires=">=3.7", packages = find_packages(), package_data = {"":["config/adaptors.yaml"]}, - install_requires=[ - 'python-levenshtein==0.23.0', - 'biopython==1.79', - 'numpy==1.22.0', - 'pyyaml==6.0' - ], + # Read requirements.txt + install_requires=[x.strip() for x in open("requirements.txt").readlines()], entry_points={ "console_scripts": [ "anglerfish=anglerfish.anglerfish:anglerfish", From f306ecdc0dcc554c2068bd8789b14ca236ec427d Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 16 Jan 2024 11:59:15 +0100 Subject: [PATCH 02/34] Add back pre-commit and prettier to conda env --- environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 3a1e3ed..dfdf6bd 100644 --- a/environment.yml +++ b/environment.yml @@ -8,3 +8,5 @@ dependencies: - pip - pip: - -r requirements.txt + - conda-forge::pre-commit + - conda-forge::prettier \ No newline at end of file From 5609e0db09350bc0e40b694930a173cc966d2517 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 16 Jan 2024 12:02:23 +0100 Subject: [PATCH 03/34] Ruff formatting of environment.yml --- environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index dfdf6bd..ec2d96d 100644 --- a/environment.yml +++ b/environment.yml @@ -7,6 +7,6 @@ dependencies: - bioconda::minimap2=2.26 - pip - pip: - - -r requirements.txt + - -r requirements.txt - conda-forge::pre-commit - - conda-forge::prettier \ No newline at end of file + - conda-forge::prettier From 62b57eb280732199b27ae3d55094f72baced2ea2 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 16 Jan 2024 12:32:43 +0100 Subject: [PATCH 04/34] Basic click interface to explore main --- anglerfish/explore/__init__.py | 0 anglerfish/explore/explore.py | 79 ++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 anglerfish/explore/__init__.py create mode 100644 anglerfish/explore/explore.py diff --git a/anglerfish/explore/__init__.py b/anglerfish/explore/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py new file mode 100644 index 0000000..eeaaca1 --- /dev/null +++ b/anglerfish/explore/explore.py @@ -0,0 +1,79 @@ +import click + + +@click.command() +@click.option("-f", "--fastq", required=True, help="Fastq file to align") +@click.option("-a", "--adaptors_file", required=True, help="YAML file with adaptors") +@click.option("-o", "--outdir", required=True, help="Output directory") +@click.option("-t", "--threads", default=1, type=int, help="Number of threads") +@click.option( + "-n", "--no_overwrite", is_flag=True, help="Do not overwrite existing alignments" +) +@click.option( + "-g", + "--good_hit_threshold", + default=0.9, + type=float, + help="Fraction of bases before and after index insert required to match perfectly for a hit to be considered a good hit. Default=0.9", +) +@click.option( + "-i", + "--insert_thres_low", + default=4, + type=int, + help="Lower threshold for insert length, with value included", +) +@click.option( + "-j", + "--insert_thres_high", + default=30, + type=int, + help="Upper threshold for insert length, with value included", +) +@click.option( + "-B", + "--minimap_B", + default=4, + type=int, + help="Minimap2 -B parameter, mismatch penalty", +) +@click.option( + "-m", + "--min_hits_per_adaptor", + default=50, + type=int, + help="Minimum number of good hits for an adaptor to be included in the analysis", +) +@click.option( + "-u", + "--umi_threshold", + default=11, + type=float, + help="Minimum number of bases in insert to perform entropy calculation", +) +@click.option( + "-k", + "--kmer_length", + default=2, + type=int, + help="Length of k-mers to use for entropy calculation", +) +def main( + fastq, + adaptors_file, + outdir, + threads, + no_overwrite, + good_hit_threshold, + insert_thres_low, + insert_thres_high, + minimap_B, + min_hits_per_adaptor, + umi_threshold, + kmer_length, +): + pass + + +if __name__ == "__main__": + main() From eff8a9e13d8ae32afcaa905ac994b360024b2187 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 17 Jan 2024 08:09:48 +0000 Subject: [PATCH 05/34] New AdaptorPart class and moved Adaptor to separate module --- .pre-commit-config.yaml | 4 ++- anglerfish/demux/adaptor.py | 43 +++++++++++++++++++++++++++++++++ anglerfish/demux/demux.py | 8 +++--- anglerfish/demux/report.py | 4 +-- anglerfish/demux/samplesheet.py | 42 ++++++++------------------------ 5 files changed, 62 insertions(+), 39 deletions(-) create mode 100644 anglerfish/demux/adaptor.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1c09ed2..78e2f10 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ # .pre-commit-config.yaml repos: - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.6 + rev: v0.1.13 hooks: - id: ruff - id: ruff-format @@ -9,6 +9,8 @@ repos: rev: "v1.7.1" hooks: - id: mypy + args: [--ignore-missing-imports] + additional_dependencies: [types-PyYAML==6.0.12.12] - repo: https://github.com/pre-commit/mirrors-prettier rev: "v4.0.0-alpha.8" hooks: diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py new file mode 100644 index 0000000..6840490 --- /dev/null +++ b/anglerfish/demux/adaptor.py @@ -0,0 +1,43 @@ +class Adaptor: + def __init__(self, adaptors, delim, adaptor, i7_index=None, i5_index=None): + self.i7 = AdaptorPart(adaptors[adaptor]["i7"], adaptor, delim, i7_index) + self.i5 = AdaptorPart(adaptors[adaptor]["i5"], adaptor, delim, i5_index) + self.name = f"{adaptor}" + self.delim = delim + + def get_i7_mask(self, insert_Ns=True): + return self.i7.get_mask(insert_Ns) + + def get_i5_mask(self, insert_Ns=True): + return self.i5.get_mask(insert_Ns) + + def get_fastastring(self, insert_Ns=True): + fasta_i5 = f">{self.name}_i5\n{self.get_i5_mask(insert_Ns)}\n" + fasta_i7 = f">{self.name}_i7\n{self.get_i7_mask(insert_Ns)}\n" + return fasta_i5 + fasta_i7 + + +class AdaptorPart: + def __init__(self, sequence, name, delim, index): + self.sequence = sequence + self.name = name + self.delim = delim + self.index = index + + def has_index(self): + return self.sequence.find(self.delim) > -1 + + def len_before_index(self): + return self.sequence.find(self.delim) + + def len_after_index(self): + return len(self.sequence) - self.sequence.find(self.delim) - len(self.delim) + + def get_mask(self, insert_Ns): + if self.has_index(): + if not insert_Ns: + return self.sequence.replace(self.delim, "") + else: + return self.sequence.replace(self.delim, "N" * len(self.index)) + else: + return self.sequence diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index 4d2ecb9..fe8b5e7 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -167,7 +167,7 @@ def cluster_matches( except AttributeError: d1 = 0 # presumably it's single index, so no i5 - i7_seq = adaptor.i7_index + i7_seq = adaptor.i7.index if i7_reversed and i7_seq is not None: i7_seq = str(Seq(i7_seq).reverse_complement()) fi7, d2 = parse_cs(i7["cs"], i7_seq, max_distance) @@ -186,14 +186,14 @@ def cluster_matches( if dists[index_min] > max_distance: log.debug(f" No match {fi7}-{fi5}") # Find only full length i7(+i5) adaptor combos. Basically a list of "known unknowns" - if len(fi7) + len(fi5) == len(adaptor.i7_index or "") + len( - adaptor.i5_index or "" + if len(fi7) + len(fi5) == len(adaptor.i7.index or "") + len( + adaptor.i5.index or "" ): fi75 = "+".join([i for i in [fi7, fi5] if not i == ""]) unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."]) continue matched[read] = alignments - log.debug(f" Matched {read} to {adaptor.i7_index}-{adaptor.i5_index}") + log.debug(f" Matched {read} to {adaptor.i7.index}-{adaptor.i5.index}") matched_bed.append( [read, start_insert, end_insert, sample_adaptor[index_min][0], "999", "."] ) diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py index 03dba2c..5e92901 100644 --- a/anglerfish/demux/report.py +++ b/anglerfish/demux/report.py @@ -88,8 +88,8 @@ def write_dataframe(self, outdir, samplesheet): sen_dict = asdict(sentry) if sen_dict["sample_name"] == s_dict["sample_name"]: s_dict["adaptor_name"] = sen_dict["adaptor"].name - s_dict["i7_index"] = sen_dict["adaptor"].i7_index - s_dict["i5_index"] = sen_dict["adaptor"].i5_index + s_dict["i7_index"] = sen_dict["adaptor"].i7.index + s_dict["i5_index"] = sen_dict["adaptor"].i5.index out_list.append(s_dict) for key, unmatch in self.unmatched_stats.items(): for unmatch_sample in unmatch: diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index ed0c2d6..b31fc31 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -9,6 +9,8 @@ import Levenshtein as lev import yaml +from anglerfish.demux.adaptor import Adaptor + p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml") assert isinstance(p, os.PathLike) with open(p) as stream: @@ -24,32 +26,6 @@ class SampleSheetEntry: ont_barcode: str -class Adaptor: - def __init__(self, adaptor, i7_index=None, i5_index=None): - self.i5 = adaptors[adaptor]["i5"] - self.i7 = adaptors[adaptor]["i7"] - self.i5_index = i5_index - self.i7_index = i7_index - self.name = f"{adaptor}_len{len(i7_index)}" - - if delim in self.i5 and i5_index is None: - raise UserWarning("Adaptor has i5 but no sequence was specified") - if delim in self.i7 and i7_index is None: - raise UserWarning("Adaptor has i7 but no sequence was specified") - - def get_i5_mask(self): - if delim in self.i5: - return self.i5.replace(delim, "N" * len(self.i5_index)) - else: - return self.i5 - - def get_i7_mask(self): - if delim in self.i7: - return self.i7.replace(delim, "N" * len(self.i7_index)) - else: - return self.i7 - - class SampleSheet: def __init__(self, input_csv, ont_bc): # Read samplesheet in format: @@ -83,7 +59,7 @@ def __init__(self, input_csv, ont_bc): sample_name = row["sample_name"] test_globs[row["fastq_path"]] = glob.glob(row["fastq_path"]) - bc_re = re.compile("\/(barcode\d\d|unclassified)\/") + bc_re = re.compile(r"\/(barcode\d\d|unclassified)\/") ont_barcode = None if ont_bc: ob = re.findall(bc_re, row["fastq_path"]) @@ -94,7 +70,7 @@ def __init__(self, input_csv, ont_bc): ss_entry = SampleSheetEntry( sample_name, - Adaptor(row["adaptors"], i7, i5), + Adaptor(adaptors, delim, row["adaptors"], i7, i5), row["fastq_path"], ont_barcode, ) @@ -135,10 +111,10 @@ def minimum_bc_distance(self): for ont_barcode, adaptors in ss_by_bc.items(): testset[ont_barcode] = [] for adaptor in adaptors: - if adaptor.i5_index is not None: - testset[ont_barcode].append(adaptor.i5_index + adaptor.i7_index) + if adaptor.i5.has_index(): + testset[ont_barcode].append(adaptor.i5.index + adaptor.i7.index) else: - testset[ont_barcode].append(adaptor.i7_index) + testset[ont_barcode].append(adaptor.i7.index) fq_distances = [] for ont_barcode, adaptors in testset.items(): @@ -148,7 +124,9 @@ def minimum_bc_distance(self): else: for a, b in [i for i in combinations(adaptors, 2)]: dist = lev.distance(a, b) - assert dist > 0, f"""There is one or more identical barcodes in the input samplesheet. + assert ( + dist > 0 + ), f"""There is one or more identical barcodes in the input samplesheet. First one found: {a}. If these exist in different ONT barcodes, please specify the --ont_barcodes flag.""" distances.append(dist) fq_distances.append(min(distances)) From 25126ac6caff519834a37167534ad3a21ef2fdd1 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 17 Jan 2024 08:14:47 +0000 Subject: [PATCH 06/34] Working devcontainer --- .devcontainer/dev.Dockerfile | 50 +++++++++++++++++++++++++++++++++ .devcontainer/devcontainer.json | 29 +++++++++++++++++++ environment.yml | 2 +- 3 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 .devcontainer/dev.Dockerfile create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/dev.Dockerfile b/.devcontainer/dev.Dockerfile new file mode 100644 index 0000000..1302fe2 --- /dev/null +++ b/.devcontainer/dev.Dockerfile @@ -0,0 +1,50 @@ +FROM mambaorg/micromamba + +LABEL author="Remi-Andre Olsen" \ + description="Anglerfish development version" \ + maintainer="remi-andre.olsen@scilifelab.se" + +USER root + +# Useful tools for devcontainer +RUN apt-get update;\ + apt-get install -y git + +# Check for arm64 architecture to install minimap2 +ARG TARGETARCH +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + # Install compliation tools for minimap2 + apt-get update;\ + apt-get install -y curl build-essential libz-dev;\ + fi + + +USER $MAMBA_USER +ENV MINIMAP_VERSION=2.26 + +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + # Download minimap2 + curl -L https://github.com/lh3/minimap2/archive/refs/tags/v${MINIMAP_VERSION}.tar.gz | tar -zxvf - ;\ + fi + +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + # Compile minimap2 + cd minimap2-${MINIMAP_VERSION};\ + make arm_neon=1 aarch64=1; \ + # Add to PATH + echo "export PATH=$(pwd):\$PATH" >> ~/.bashrc ;\ + fi + +# Add requirements files to the container +COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/ +COPY --chown=$MAMBA_USER:$MAMBA_USER requirements.txt /tmp/ + +RUN if [ "$TARGETARCH" = "arm64" ]; then \ + grep -v 'minimap2' /tmp/environment.yml > /tmp/environment.tmp.yml ;\ + else \ + cp /tmp/environment.yml /tmp/environment.tmp.yml ;\ + fi + +# Activate the environment +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +RUN micromamba install -y -f /tmp/environment.tmp.yml && micromamba clean --all --yes \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..435dfd0 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,29 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile +{ + "name": "Anglerfish", + "build": { + // Sets the run context to one level up instead of the .devcontainer folder. + "context": "..", + // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. + "dockerfile": "dev.Dockerfile", + }, + "features": {}, + "customizations": { + "vscode": { + "extensions": [ + "esbenp.prettier-vscode", + "wholroyd.jinja", + "ms-python.python", + "charliermarsh.ruff@v2024.2.0", + ], + }, + }, + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + "postCreateCommand": "pip3 install -e .", + // Configure tool-specific properties. + // "customizations": {}, + // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "devcontainer" +} diff --git a/environment.yml b/environment.yml index ec2d96d..dde02fd 100644 --- a/environment.yml +++ b/environment.yml @@ -1,5 +1,5 @@ # Development environment for anglerfish -name: anglerfish-dev +name: base channels: - conda-forge - bioconda From 7fa07f433368dc59c74f9587818f263b88a0380c Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 17 Jan 2024 08:39:36 +0000 Subject: [PATCH 07/34] Added click as requirement --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9fd76d3..a0f4dc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ -numpy==1.26.3 biopython==1.83 +click==8.1.7 levenshtein==0.23.0 +numpy==1.26.3 pyyaml==6.0.1 \ No newline at end of file From 2ee69982c0df3ff670c846c05b5574eb76f0c5ca Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 17 Jan 2024 08:40:01 +0000 Subject: [PATCH 08/34] Load adaptors function --- anglerfish/demux/adaptor.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index 6840490..8db7b66 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -1,3 +1,9 @@ +import importlib +import os + +import yaml + + class Adaptor: def __init__(self, adaptors, delim, adaptor, i7_index=None, i5_index=None): self.i7 = AdaptorPart(adaptors[adaptor]["i7"], adaptor, delim, i7_index) @@ -41,3 +47,23 @@ def get_mask(self, insert_Ns): return self.sequence.replace(self.delim, "N" * len(self.index)) else: return self.sequence + + +# Fetch all adaptors +def load_adaptors(raw=False): + p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml") + assert isinstance(p, os.PathLike) + + adaptors_raw = [] + with open(p) as f: + adaptors_raw = yaml.safe_load(f) + + if raw: + return adaptors_raw + adaptors = [] + for adaptor in adaptors_raw: + adaptors.append( + Adaptor(adaptors_raw, "-NNN-", adaptor, i7_index=None, i5_index=None) + ) + + return adaptors From fcf72380597fe0612850884999ba44c87e73fbe3 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 17 Jan 2024 08:41:45 +0000 Subject: [PATCH 09/34] Samplesheet uses load adaptors function --- anglerfish/demux/samplesheet.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index b31fc31..b62c53f 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -1,21 +1,15 @@ import csv import glob -import importlib.resources -import os import re from dataclasses import dataclass from itertools import combinations import Levenshtein as lev -import yaml -from anglerfish.demux.adaptor import Adaptor +from anglerfish.demux.adaptor import Adaptor, load_adaptors -p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml") -assert isinstance(p, os.PathLike) -with open(p) as stream: - adaptors = yaml.safe_load(stream) delim = "-NNN-" +adaptors = load_adaptors(raw=True) @dataclass From 149312df04c9121e119030d41b0e96e63677005b Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 17 Jan 2024 08:45:30 +0000 Subject: [PATCH 10/34] Explore cli outline --- anglerfish/explore/cli.py | 94 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 anglerfish/explore/cli.py diff --git a/anglerfish/explore/cli.py b/anglerfish/explore/cli.py new file mode 100644 index 0000000..caa5d83 --- /dev/null +++ b/anglerfish/explore/cli.py @@ -0,0 +1,94 @@ +import click + +from anglerfish.explore.explore import run_explore + + +@click.command() +@click.option("-f", "--fastq", required=True, help="Fastq file to align") +@click.option("-a", "--adaptors_file", required=True, help="YAML file with adaptors") +@click.option("-o", "--outdir", required=True, help="Output directory") +@click.option("-t", "--threads", default=1, type=int, help="Number of threads") +@click.option( + "-n", "--no_overwrite", is_flag=True, help="Do not overwrite existing alignments" +) +@click.option( + "-g", + "--good_hit_threshold", + default=0.9, + type=float, + help="Fraction of bases before and after index insert required to match perfectly for a hit to be considered a good hit. Default=0.9", +) +@click.option( + "-i", + "--insert_thres_low", + default=4, + type=int, + help="Lower threshold for insert length, with value included", +) +@click.option( + "-j", + "--insert_thres_high", + default=30, + type=int, + help="Upper threshold for insert length, with value included", +) +@click.option( + "-B", + "--minimap_b", + default=4, + type=int, + help="Minimap2 -B parameter, mismatch penalty", +) +@click.option( + "-m", + "--min_hits_per_adaptor", + default=50, + type=int, + help="Minimum number of good hits for an adaptor to be included in the analysis", +) +@click.option( + "-u", + "--umi_threshold", + default=11, + type=float, + help="Minimum number of bases in insert to perform entropy calculation", +) +@click.option( + "-k", + "--kmer_length", + default=2, + type=int, + help="Length of k-mers to use for entropy calculation", +) +def main( + fastq, + adaptors_file, + outdir, + threads, + no_overwrite, + good_hit_threshold, + insert_thres_low, + insert_thres_high, + minimap_b, + min_hits_per_adaptor, + umi_threshold, + kmer_length, +): + run_explore( + fastq, + adaptors_file, + outdir, + threads, + no_overwrite, + good_hit_threshold, + insert_thres_low, + insert_thres_high, + minimap_b, + min_hits_per_adaptor, + umi_threshold, + kmer_length, + ) + + +if __name__ == "__main__": + main() From c996de698c1da5dd1952e3edf27f83ec737e36f1 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 17 Jan 2024 21:17:52 +0000 Subject: [PATCH 11/34] Removed the unused adaptors_file argument for explore --- anglerfish/explore/cli.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/anglerfish/explore/cli.py b/anglerfish/explore/cli.py index caa5d83..1546a2f 100644 --- a/anglerfish/explore/cli.py +++ b/anglerfish/explore/cli.py @@ -5,9 +5,14 @@ @click.command() @click.option("-f", "--fastq", required=True, help="Fastq file to align") -@click.option("-a", "--adaptors_file", required=True, help="YAML file with adaptors") @click.option("-o", "--outdir", required=True, help="Output directory") -@click.option("-t", "--threads", default=1, type=int, help="Number of threads") +@click.option( + "-t", + "--threads", + default=1, + type=int, + help="Number of threads specified by minimap2", +) @click.option( "-n", "--no_overwrite", is_flag=True, help="Do not overwrite existing alignments" ) @@ -62,7 +67,6 @@ ) def main( fastq, - adaptors_file, outdir, threads, no_overwrite, @@ -76,7 +80,6 @@ def main( ): run_explore( fastq, - adaptors_file, outdir, threads, no_overwrite, From 8ab6ae8de0102836329f2a6e514234809c099f91 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 18 Jan 2024 08:46:36 +0100 Subject: [PATCH 12/34] Github actions fix, mamba environment name --- .github/workflows/anglerfish.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/anglerfish.yml b/.github/workflows/anglerfish.yml index 0fed1f1..0bef153 100644 --- a/.github/workflows/anglerfish.yml +++ b/.github/workflows/anglerfish.yml @@ -19,6 +19,7 @@ jobs: python=${{ matrix.python-version }} pip environment-file: environment.yml + environment-name: anglerfish-dev # Install Anglerfish - shell: bash -l {0} From c0e3f0ce2047567a666cf8baae77771703f19367 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 18 Jan 2024 08:13:35 +0000 Subject: [PATCH 13/34] Added log file for mapping and optional B parameter --- anglerfish/demux/demux.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index fe8b5e7..0a43f3e 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -24,7 +24,7 @@ def parse_cs(cs_string, index, max_distance): return nts, lev.distance(index.lower(), nts) -def run_minimap2(fastq_in, indexfile, output_paf, threads): +def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=4): """ Runs Minimap2 """ @@ -36,9 +36,10 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads): "10", "-w", "5", - "-B1", - "-A6", - "--dual=no", + "-A", + "6", + "-B", + str(minimap_b), "-c", "-t", str(threads), @@ -46,8 +47,10 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads): fastq_in, ] - with open(output_paf, "ab") as ofile: - subprocess.run(cmd, stdout=ofile, check=True) + run_log = f"{output_paf}.log" + with open(output_paf, "ab") as ofile, open(run_log, "ab") as log_file: + p1 = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=log_file) + subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True) def parse_paf_lines(paf, min_qual=10): From 3b670c964f992bfb8514aea4b6710026d16bbcaf Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 18 Jan 2024 10:26:08 +0000 Subject: [PATCH 14/34] Explore command can run minimap2 --- anglerfish/explore/explore.py | 109 ++++++++++++++-------------------- 1 file changed, 45 insertions(+), 64 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index eeaaca1..0373ca6 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -1,79 +1,60 @@ -import click +import logging +import os +import uuid +from anglerfish.demux.adaptor import load_adaptors +from anglerfish.demux.demux import run_minimap2 -@click.command() -@click.option("-f", "--fastq", required=True, help="Fastq file to align") -@click.option("-a", "--adaptors_file", required=True, help="YAML file with adaptors") -@click.option("-o", "--outdir", required=True, help="Output directory") -@click.option("-t", "--threads", default=1, type=int, help="Number of threads") -@click.option( - "-n", "--no_overwrite", is_flag=True, help="Do not overwrite existing alignments" -) -@click.option( - "-g", - "--good_hit_threshold", - default=0.9, - type=float, - help="Fraction of bases before and after index insert required to match perfectly for a hit to be considered a good hit. Default=0.9", -) -@click.option( - "-i", - "--insert_thres_low", - default=4, - type=int, - help="Lower threshold for insert length, with value included", -) -@click.option( - "-j", - "--insert_thres_high", - default=30, - type=int, - help="Upper threshold for insert length, with value included", -) -@click.option( - "-B", - "--minimap_B", - default=4, - type=int, - help="Minimap2 -B parameter, mismatch penalty", -) -@click.option( - "-m", - "--min_hits_per_adaptor", - default=50, - type=int, - help="Minimum number of good hits for an adaptor to be included in the analysis", -) -@click.option( - "-u", - "--umi_threshold", - default=11, - type=float, - help="Minimum number of bases in insert to perform entropy calculation", -) -@click.option( - "-k", - "--kmer_length", - default=2, - type=int, - help="Length of k-mers to use for entropy calculation", -) -def main( +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("explore") + + +def run_explore( fastq, - adaptors_file, outdir, threads, no_overwrite, good_hit_threshold, insert_thres_low, insert_thres_high, - minimap_B, + minimap_b, min_hits_per_adaptor, umi_threshold, kmer_length, ): - pass + # Setup a run directory + run_uuid = str(uuid.uuid4()) + try: + os.mkdir(outdir) + except FileExistsError: + log.info(f"Output directory {outdir} already exists") + if not no_overwrite: + log.error( + f"Output directory {outdir} already exists, please use --no_overwrite to continue" + ) + exit(1) + else: + pass + + log.info("Running anglerfish explore") + log.info(f"Run uuid {run_uuid}") + + adaptors = load_adaptors() + alignments = [] + + # Map all reads against all adaptors + for adaptor in adaptors: + adaptor_name = adaptor.name + # Align + aln_path = os.path.join(outdir, f"{adaptor_name}.paf") + alignments.append((adaptor, aln_path)) + if os.path.exists(aln_path) and no_overwrite: + log.info(f"Skipping {adaptor_name} as alignment already exists") + continue + adaptor_path = os.path.join(outdir, f"{adaptor_name}.fasta") + with open(adaptor_path, "w") as f: + f.write(adaptor.get_fastastring(insert_Ns=False)) -if __name__ == "__main__": - main() + log.info(f"Aligning {adaptor_name}") + run_minimap2(fastq, adaptor_path, aln_path, threads, minimap_b) From 28483de085d6a2e7cc4a4229757f26034fadea4c Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 18 Jan 2024 20:49:40 +0000 Subject: [PATCH 15/34] Added parameter complex index to the parse_paf_lines function --- anglerfish/demux/demux.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index 0a43f3e..f174443 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -53,10 +53,12 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=4): subprocess.run("sort", stdin=p1.stdout, stdout=ofile, check=True) -def parse_paf_lines(paf, min_qual=10): +def parse_paf_lines(paf, min_qual=10, complex_identifier=False): """ Read and parse one paf alignment lines. Returns a dict with the import values for later use + If complex_identifier is True (default False), the keys will be on the form + "{read}_{i5_or_i7}_{strand_str}" """ entries = {} with open(paf) as paf: @@ -65,17 +67,28 @@ def parse_paf_lines(paf, min_qual=10): try: # TODO: objectify this entry = { + "read": aln[0], "adapter": aln[5], "rlen": int(aln[1]), # read length "rstart": int(aln[2]), # start alignment on read "rend": int(aln[3]), # end alignment on read "strand": aln[4], + "cg": aln[-2], # cigar string "cs": aln[-1], # cs string "q": int(aln[11]), # Q score "iseq": None, "sample": None, } - read = aln[0] + read = entry["read"] + if complex_identifier: + i5_or_i7 = entry["adapter"].split("_")[-1] + if entry["strand"] == "+": + strand_str = "positive" + else: + strand_str = "negative" + ix = f"{read}_{i5_or_i7}_{strand_str}" + else: + ix = read except IndexError: log.debug(f"Could not find all paf columns: {read}") continue @@ -84,10 +97,10 @@ def parse_paf_lines(paf, min_qual=10): log.debug(f"Low quality alignment: {read}") continue - if read in entries.keys(): - entries[read].append(entry) + if ix in entries.keys(): + entries[ix].append(entry) else: - entries[read] = [entry] + entries[ix] = [entry] return entries From 5b7e9aba0b451ee76ee697e994b1e94cbbc900ca Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 18 Jan 2024 20:50:24 +0000 Subject: [PATCH 16/34] 4 threads by default for explore --- anglerfish/explore/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anglerfish/explore/cli.py b/anglerfish/explore/cli.py index 1546a2f..975f819 100644 --- a/anglerfish/explore/cli.py +++ b/anglerfish/explore/cli.py @@ -9,7 +9,7 @@ @click.option( "-t", "--threads", - default=1, + default=4, type=int, help="Number of threads specified by minimap2", ) From 078a667d510d3eeeb0a4a61d224456b52e873bf9 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 18 Jan 2024 20:56:41 +0000 Subject: [PATCH 17/34] Explore command working except entropy --- anglerfish/explore/explore.py | 126 +++++++++++++++++++++++++++++++++- requirements.txt | 1 + 2 files changed, 126 insertions(+), 1 deletion(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index 0373ca6..6faa395 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -2,8 +2,10 @@ import os import uuid +import pandas as pd + from anglerfish.demux.adaptor import load_adaptors -from anglerfish.demux.demux import run_minimap2 +from anglerfish.demux.demux import parse_paf_lines, run_minimap2 logging.basicConfig(level=logging.INFO) log = logging.getLogger("explore") @@ -58,3 +60,125 @@ def run_explore( log.info(f"Aligning {adaptor_name}") run_minimap2(fastq, adaptor_path, aln_path, threads, minimap_b) + + # Parse alignments + entries = {} + adaptors_included = [] + for adaptor, aln_path in alignments: + log.info(f"Parsing {adaptor.name}") + aln_dict_with_lists = parse_paf_lines(aln_path, complex_identifier=True) + + # Choose only the highest scoring alignment for each combination of read, adaptor end and strand + aln_dict = dict( + [(k, max(v, key=lambda x: x["q"])) for k, v in aln_dict_with_lists.items()] + ) + + df = pd.DataFrame.from_dict(aln_dict, orient="index") + nr_good_hits = {} + for adaptor_end_name, adaptor_end in zip( + ["i5", "i7"], [adaptor.i5, adaptor.i7] + ): + if adaptor_end.has_index(): + # Match Insert Match = mim + # The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert + # The cg string allows for mismatches within the matching strings but no insertions or deletions + # All cg string matches are also cs string matches (subset) but not vice versa + mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" # Match Insert Match = mim + mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$" + df_mim = df[df.cs.str.match(mim_re_cs)] + + # Extract the match lengths + match_col_df = df_mim.cg.str.extract(mim_re_cg).rename( + {0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1 + ) + match_col_df = match_col_df.astype( + { + "match_1_len": "int32", + "insert_len": "int32", + "match_2_len": "int32", + } + ) + + df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df + + # Alignment thresholds + before_thres = round( + adaptor_end.len_before_index() * good_hit_threshold + ) + after_thres = round(adaptor_end.len_after_index() * good_hit_threshold) + insert_thres_low = insert_thres_low + insert_thres_high = insert_thres_high + + requirements = ( + (df_mim["match_1_len"] >= (before_thres)) + & (df_mim["insert_len"] >= insert_thres_low) + & (df_mim["insert_len"] <= insert_thres_high) + & (df_mim["match_2_len"] >= (after_thres)) + ) + df_good_hits = df_mim[requirements] + + median_insert_length = df_good_hits["insert_len"].median() + insert_lengths = df_good_hits["insert_len"].value_counts() + else: + m_re_cs = r"^cs:Z::([1-9][0-9]*)$" + df_good_hits = df[df.cg.str.match(m_re_cs)] + match_col_df = df_good_hits.cg.str.extract(m_re_cs).rename( + {0: "match_1_len"}, axis=1 + ) + match_col_df = match_col_df.astype({"match_1_len": "int32"}) + + df_good_hits.loc[ + match_col_df.index, match_col_df.columns + ] = match_col_df + + thres = round( + (adaptor_end.len_before_index() + adaptor_end.len_after_index()) + * good_hit_threshold + ) + df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] + + median_insert_length = None + insert_lengths = None + + # Filter multiple hits per read and adaptor end + df_good_hits = df_good_hits.sort_values( + by=["read", "match_1_len"], ascending=False + ).drop_duplicates(subset=["read", "adapter"], keep="first") + + if adaptor.name not in entries.keys(): + entries[adaptor.name] = {} + entries[adaptor.name][adaptor_end_name] = df_good_hits + + nr_good_hits[adaptor_end_name] = len(df_good_hits) + log.info( + f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits." + ) + + if min(nr_good_hits.values()) >= min_hits_per_adaptor: + log.info(f"Adaptor {adaptor.name} is included in the analysis") + adaptors_included.append(adaptor) + else: + log.info(f"Adaptor {adaptor.name} is excluded from the analysis") + + # Print insert length info for adaptor types included in the analysis + for adaptor in adaptors_included: + for adaptor_end_name, adaptor_end in zip( + ["i5", "i7"], [adaptor.i5, adaptor.i7] + ): + df_good_hits = entries[adaptor.name][adaptor_end_name] + if adaptor_end.has_index(): + median_insert_length = df_good_hits["insert_len"].median() + if median_insert_length > umi_threshold: + # Calculate entropies here + pass + insert_lengths = df_good_hits["insert_len"].value_counts() + log.info( + f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}" + ) + log.info(insert_lengths[sorted(insert_lengths.index)]) + else: + median_insert_length = None + insert_lengths = None + log.info( + f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits (no insert length since no index)" + ) diff --git a/requirements.txt b/requirements.txt index a0f4dc2..db39464 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ biopython==1.83 click==8.1.7 levenshtein==0.23.0 numpy==1.26.3 +pandas==2.1.4 pyyaml==6.0.1 \ No newline at end of file From 76e499ab3eda08cea790e94358db849ba55fce18 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 18 Jan 2024 21:38:06 +0000 Subject: [PATCH 18/34] Added entropy calculation and printing to the explore command --- anglerfish/explore/entropy.py | 60 +++++++++++++++++++++++++++++++++++ anglerfish/explore/explore.py | 8 ++++- 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 anglerfish/explore/entropy.py diff --git a/anglerfish/explore/entropy.py b/anglerfish/explore/entropy.py new file mode 100644 index 0000000..cafdf9e --- /dev/null +++ b/anglerfish/explore/entropy.py @@ -0,0 +1,60 @@ +import itertools +from collections import deque + +import numpy as np + + +# Rolling window implementation +def _window(seq, n=3): + win = deque(maxlen=n) + for char in seq: + win.append(char) + if len(win) >= n: + yield tuple(win) + + +def _entropy(pk, qk): + """Taken from https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html + + Felt unecessary to add scipy as dependency just for this function. + """ + # TODO: Catch potential exceptions here for weird input values. + return -np.sum(pk * np.log(qk)) + + +# Each sequence gives matrix of position and k-mer occurence. +def _seq_to_count_matrix(dna_seq, kmer_positions, max=50, k=2): + """max is the length of the prefix that will be considered""" + if len(dna_seq) < max: + max = len(dna_seq) + + matrix = np.zeros((4**k, max - k + 1)) + for i, word in enumerate(_window(dna_seq[:max], n=k)): + matrix[kmer_positions[word], i] += 1 + return matrix + + +def _count_for_seqs(seqs, kmer_positions, insert_length, k=2): + seqs = [seq for seq in seqs if len(seq) == insert_length] + matrices = [_seq_to_count_matrix(seq, kmer_positions, max=50, k=k) for seq in seqs] + stacked = np.stack(matrices) + sum_a = np.sum(stacked, axis=0) + even_dist = np.ones(4**k) / (4**k) + entropies = [_entropy(a, even_dist) for a in sum_a.T] + + return entropies + + +def _extract_inserts_from_df(df): + mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" + return df.cs.str.extract(mim_re_cs, expand=False).str.upper().tolist() + + +def calculate_relative_entropy(df_good_hits, kmer_length, insert_length): + all_kmers = itertools.product("ACTG", repeat=kmer_length) + kmer_positions = dict( + (kmer, position) for kmer, position in zip(all_kmers, range(4**kmer_length)) + ) + seqs = _extract_inserts_from_df(df_good_hits) + entropies = _count_for_seqs(seqs, kmer_positions, insert_length, k=kmer_length) + return entropies diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index 6faa395..f62f2c6 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -6,6 +6,7 @@ from anglerfish.demux.adaptor import load_adaptors from anglerfish.demux.demux import parse_paf_lines, run_minimap2 +from anglerfish.explore.entropy import calculate_relative_entropy logging.basicConfig(level=logging.INFO) log = logging.getLogger("explore") @@ -170,7 +171,12 @@ def run_explore( median_insert_length = df_good_hits["insert_len"].median() if median_insert_length > umi_threshold: # Calculate entropies here - pass + entropies = calculate_relative_entropy( + df_good_hits, kmer_length, median_insert_length + ) + log.info( + f"{adaptor.name}:{adaptor_end_name} had entropy {entropies}" + ) insert_lengths = df_good_hits["insert_len"].value_counts() log.info( f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}" From cba68fc1724da29da9567216e93a9fe90b3d990d Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 18 Jan 2024 21:40:47 +0000 Subject: [PATCH 19/34] Change back to demux default for B in minimap2 --- anglerfish/demux/demux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index f174443..ad01123 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -24,7 +24,7 @@ def parse_cs(cs_string, index, max_distance): return nts, lev.distance(index.lower(), nts) -def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=4): +def run_minimap2(fastq_in, indexfile, output_paf, threads, minimap_b=1): """ Runs Minimap2 """ From 776b146aa3e03e844adc16a0e55f329c8052ee1a Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 22 Jan 2024 07:36:39 +0100 Subject: [PATCH 20/34] Fix bug in entropy calculation --- anglerfish/explore/entropy.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/anglerfish/explore/entropy.py b/anglerfish/explore/entropy.py index cafdf9e..403d9cd 100644 --- a/anglerfish/explore/entropy.py +++ b/anglerfish/explore/entropy.py @@ -3,7 +3,6 @@ import numpy as np - # Rolling window implementation def _window(seq, n=3): win = deque(maxlen=n) @@ -14,12 +13,13 @@ def _window(seq, n=3): def _entropy(pk, qk): - """Taken from https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html - - Felt unecessary to add scipy as dependency just for this function. """ - # TODO: Catch potential exceptions here for weird input values. - return -np.sum(pk * np.log(qk)) + Kullback-Leibler divergence (relative entropy) + Reference https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html + """ + if np.sum(pk) - 1 > 1e-5: + pk = pk / np.sum(pk) + return np.sum(pk * np.log(pk/qk)) # Each sequence gives matrix of position and k-mer occurence. @@ -41,7 +41,6 @@ def _count_for_seqs(seqs, kmer_positions, insert_length, k=2): sum_a = np.sum(stacked, axis=0) even_dist = np.ones(4**k) / (4**k) entropies = [_entropy(a, even_dist) for a in sum_a.T] - return entropies From 73533b570df060ed4e74a47078692caf0fe96871 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 23 Jan 2024 09:08:04 +0100 Subject: [PATCH 21/34] Ruff formatting for entropy file --- anglerfish/explore/entropy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/anglerfish/explore/entropy.py b/anglerfish/explore/entropy.py index 403d9cd..9ac733a 100644 --- a/anglerfish/explore/entropy.py +++ b/anglerfish/explore/entropy.py @@ -3,6 +3,7 @@ import numpy as np + # Rolling window implementation def _window(seq, n=3): win = deque(maxlen=n) @@ -14,12 +15,12 @@ def _window(seq, n=3): def _entropy(pk, qk): """ - Kullback-Leibler divergence (relative entropy) + Kullback-Leibler divergence (relative entropy) Reference https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html """ if np.sum(pk) - 1 > 1e-5: pk = pk / np.sum(pk) - return np.sum(pk * np.log(pk/qk)) + return np.sum(pk * np.log(pk / qk)) # Each sequence gives matrix of position and k-mer occurence. From 33c4003165c87b09acdf375d1e395e1218c50fd2 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 24 Jan 2024 16:33:29 +0000 Subject: [PATCH 22/34] Use single multi-stage dockerfile --- .devcontainer/dev.Dockerfile | 50 --------------------------------- .devcontainer/devcontainer.json | 10 +++++-- Dockerfile | 29 ++++++++++++------- 3 files changed, 26 insertions(+), 63 deletions(-) delete mode 100644 .devcontainer/dev.Dockerfile diff --git a/.devcontainer/dev.Dockerfile b/.devcontainer/dev.Dockerfile deleted file mode 100644 index 1302fe2..0000000 --- a/.devcontainer/dev.Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -FROM mambaorg/micromamba - -LABEL author="Remi-Andre Olsen" \ - description="Anglerfish development version" \ - maintainer="remi-andre.olsen@scilifelab.se" - -USER root - -# Useful tools for devcontainer -RUN apt-get update;\ - apt-get install -y git - -# Check for arm64 architecture to install minimap2 -ARG TARGETARCH -RUN if [ "$TARGETARCH" = "arm64" ]; then \ - # Install compliation tools for minimap2 - apt-get update;\ - apt-get install -y curl build-essential libz-dev;\ - fi - - -USER $MAMBA_USER -ENV MINIMAP_VERSION=2.26 - -RUN if [ "$TARGETARCH" = "arm64" ]; then \ - # Download minimap2 - curl -L https://github.com/lh3/minimap2/archive/refs/tags/v${MINIMAP_VERSION}.tar.gz | tar -zxvf - ;\ - fi - -RUN if [ "$TARGETARCH" = "arm64" ]; then \ - # Compile minimap2 - cd minimap2-${MINIMAP_VERSION};\ - make arm_neon=1 aarch64=1; \ - # Add to PATH - echo "export PATH=$(pwd):\$PATH" >> ~/.bashrc ;\ - fi - -# Add requirements files to the container -COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/ -COPY --chown=$MAMBA_USER:$MAMBA_USER requirements.txt /tmp/ - -RUN if [ "$TARGETARCH" = "arm64" ]; then \ - grep -v 'minimap2' /tmp/environment.yml > /tmp/environment.tmp.yml ;\ - else \ - cp /tmp/environment.yml /tmp/environment.tmp.yml ;\ - fi - -# Activate the environment -ARG MAMBA_DOCKERFILE_ACTIVATE=1 -RUN micromamba install -y -f /tmp/environment.tmp.yml && micromamba clean --all --yes \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 435dfd0..1c10fd3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -6,8 +6,11 @@ // Sets the run context to one level up instead of the .devcontainer folder. "context": "..", // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. - "dockerfile": "dev.Dockerfile", + "dockerfile": "../Dockerfile", + // Use devcontainer target + "target": "devcontainer", }, + "initializeCommand": "echo 'Workaround, see github https://github.com/microsoft/vscode-remote-release/issues/9302#issuecomment-1854476541'", "features": {}, "customizations": { "vscode": { @@ -15,13 +18,14 @@ "esbenp.prettier-vscode", "wholroyd.jinja", "ms-python.python", - "charliermarsh.ruff@v2024.2.0", + "charliermarsh.ruff@2024.2.0", + "ms-azuretools.vscode-docker", ], }, }, // Use 'forwardPorts' to make a list of ports inside the container available locally. // "forwardPorts": [], - "postCreateCommand": "pip3 install -e .", + // "postCreateCommand": "pip3 install -e .", // Configure tool-specific properties. // "customizations": {}, // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. diff --git a/Dockerfile b/Dockerfile index 73b291f..390bd25 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM mambaorg/micromamba +FROM mambaorg/micromamba as base LABEL author="Remi-Andre Olsen" \ description="Anglerfish development version" \ @@ -8,20 +8,13 @@ USER root # Check for arm64 architecture to install minimap2 ARG TARGETARCH +ENV MINIMAP_VERSION=2.26 RUN if [ "$TARGETARCH" = "arm64" ]; then \ # Install compliation tools for minimap2 apt-get update;\ apt-get install -y curl build-essential libz-dev;\ - fi - -ENV MINIMAP_VERSION=2.26 - -RUN if [ "$TARGETARCH" = "arm64" ]; then \ # Download minimap2 curl -L https://github.com/lh3/minimap2/archive/refs/tags/v${MINIMAP_VERSION}.tar.gz | tar -zxvf - ;\ - fi - -RUN if [ "$TARGETARCH" = "arm64" ]; then \ # Compile minimap2 cd minimap2-${MINIMAP_VERSION};\ make arm_neon=1 aarch64=1; \ @@ -32,6 +25,7 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \ COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml / COPY --chown=$MAMBA_USER:$MAMBA_USER requirements.txt / +# Remove minimap2 from environment.yml for arm64 RUN if [ "$TARGETARCH" = "arm64" ]; then \ grep -v 'minimap2' /environment.yml > /environment.tmp.yml ;\ else \ @@ -47,5 +41,20 @@ WORKDIR /usr/src/anglerfish ARG MAMBA_DOCKERFILE_ACTIVATE=1 RUN micromamba install -y -f /environment.tmp.yml && micromamba clean --all --yes +##### +# Devcontainer +##### +FROM base as devcontainer + +# Useful tools for devcontainer +RUN apt-get update;\ + apt-get install -y git vim + +RUN eval "$(micromamba shell hook --shell bash)" && python -m pip install -e .[dev] + +##### +# Main +##### +FROM base as main RUN eval "$(micromamba shell hook --shell bash)" && python -m pip install .[dev] -USER $MAMBA_USER +USER $MAMBA_USER \ No newline at end of file From 567bf915a8eeb3161f5f1133abf7df636e3fe56c Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 25 Jan 2024 15:51:28 +0100 Subject: [PATCH 23/34] Fix for sneaky bug inside try except --- anglerfish/demux/demux.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index ad01123..c19a5f8 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -176,7 +176,7 @@ def cluster_matches( fi7 = "" for _, adaptor, _ in sample_adaptor: try: - i5_seq = adaptor.i5_index + i5_seq = adaptor.i5.index if i5_reversed and i5_seq is not None: i5_seq = str(Seq(i5_seq).reverse_complement()) fi5, d1 = parse_cs(i5["cs"], i5_seq, max_distance) From bfc376185c5e7d17ce32804df664a5c1197761bb Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Thu, 25 Jan 2024 16:31:53 +0100 Subject: [PATCH 24/34] Test printing histogram to csv --- anglerfish/explore/explore.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index f62f2c6..b848002 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -181,6 +181,8 @@ def run_explore( log.info( f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}" ) + histogram_file = os.path.join(outdir, f"{adaptor_end.name}.hist.csv") + insert_lengths[sorted(insert_lengths.index)].to_csv(histogram_file) log.info(insert_lengths[sorted(insert_lengths.index)]) else: median_insert_length = None From 2319559af496b36176d8774046abe0136d8f56ab Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 26 Jan 2024 11:10:29 +0100 Subject: [PATCH 25/34] Avoid zeroes in observations for entropy calculation --- anglerfish/explore/entropy.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anglerfish/explore/entropy.py b/anglerfish/explore/entropy.py index 9ac733a..9a6bea9 100644 --- a/anglerfish/explore/entropy.py +++ b/anglerfish/explore/entropy.py @@ -18,6 +18,9 @@ def _entropy(pk, qk): Kullback-Leibler divergence (relative entropy) Reference https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html """ + # To avoid zeroes in pk, add a tiny fraction of observed values + pk = pk + 1e-2 + if np.sum(pk) - 1 > 1e-5: pk = pk / np.sum(pk) return np.sum(pk * np.log(pk / qk)) From daed1d9d14f5c3a1cb9ae91f43aff94294534d73 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 26 Jan 2024 12:37:33 +0000 Subject: [PATCH 26/34] Output to files instead of terminal --- anglerfish/explore/explore.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index b848002..c7bd6d2 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -174,8 +174,12 @@ def run_explore( entropies = calculate_relative_entropy( df_good_hits, kmer_length, median_insert_length ) + entropy_file = os.path.join( + outdir, f"{adaptor_end.name}.entropy.csv" + ) + pd.Series(entropies).to_csv(entropy_file) log.info( - f"{adaptor.name}:{adaptor_end_name} had entropy {entropies}" + f"{adaptor.name}:{adaptor_end_name}, relative entropy for k={kmer_length}, over the index saved to {entropy_file}" ) insert_lengths = df_good_hits["insert_len"].value_counts() log.info( @@ -183,7 +187,9 @@ def run_explore( ) histogram_file = os.path.join(outdir, f"{adaptor_end.name}.hist.csv") insert_lengths[sorted(insert_lengths.index)].to_csv(histogram_file) - log.info(insert_lengths[sorted(insert_lengths.index)]) + log.info( + f"{adaptor.name}:{adaptor_end_name} insert length histogram saved {histogram_file}" + ) else: median_insert_length = None insert_lengths = None From 67689bdf2f3269c58fd4fd916cf6ee722ff9cb62 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 26 Jan 2024 12:50:04 +0000 Subject: [PATCH 27/34] Hopefully more unique output file names --- anglerfish/explore/explore.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index c7bd6d2..cbba30a 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -175,7 +175,7 @@ def run_explore( df_good_hits, kmer_length, median_insert_length ) entropy_file = os.path.join( - outdir, f"{adaptor_end.name}.entropy.csv" + outdir, f"{adaptor.name}_{adaptor_end.name}.entropy.csv" ) pd.Series(entropies).to_csv(entropy_file) log.info( @@ -185,7 +185,9 @@ def run_explore( log.info( f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}" ) - histogram_file = os.path.join(outdir, f"{adaptor_end.name}.hist.csv") + histogram_file = os.path.join( + outdir, f"{adaptor.name}_{adaptor_end.name}.hist.csv" + ) insert_lengths[sorted(insert_lengths.index)].to_csv(histogram_file) log.info( f"{adaptor.name}:{adaptor_end_name} insert length histogram saved {histogram_file}" From a1321ba8313d864ef17e1c29291f89e46080f284 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 26 Jan 2024 14:23:59 +0100 Subject: [PATCH 28/34] Trying to fix wrong name of output files --- anglerfish/explore/explore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index cbba30a..0dc7428 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -175,7 +175,7 @@ def run_explore( df_good_hits, kmer_length, median_insert_length ) entropy_file = os.path.join( - outdir, f"{adaptor.name}_{adaptor_end.name}.entropy.csv" + outdir, f"{adaptor.name}_{adaptor_end_name}.entropy.csv" ) pd.Series(entropies).to_csv(entropy_file) log.info( @@ -186,7 +186,7 @@ def run_explore( f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}" ) histogram_file = os.path.join( - outdir, f"{adaptor.name}_{adaptor_end.name}.hist.csv" + outdir, f"{adaptor.name}_{adaptor_end_name}.hist.csv" ) insert_lengths[sorted(insert_lengths.index)].to_csv(histogram_file) log.info( From 316b981008e1abb00b4cbdec5a5aa20e7c3216b2 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 26 Jan 2024 13:37:06 +0000 Subject: [PATCH 29/34] Trying out float format for entropy --- anglerfish/explore/explore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index 0dc7428..de176b1 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -177,7 +177,7 @@ def run_explore( entropy_file = os.path.join( outdir, f"{adaptor.name}_{adaptor_end_name}.entropy.csv" ) - pd.Series(entropies).to_csv(entropy_file) + pd.Series(entropies).to_csv(entropy_file, float_format="%.2f") log.info( f"{adaptor.name}:{adaptor_end_name}, relative entropy for k={kmer_length}, over the index saved to {entropy_file}" ) From c9dda4639a467c7fb04aadb333e3b570b0002bad Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 26 Jan 2024 13:51:41 +0000 Subject: [PATCH 30/34] Updated CLI parameters and help messages --- anglerfish/explore/cli.py | 25 ++++++++++++++----------- anglerfish/explore/explore.py | 8 ++++---- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/anglerfish/explore/cli.py b/anglerfish/explore/cli.py index 975f819..1cceab5 100644 --- a/anglerfish/explore/cli.py +++ b/anglerfish/explore/cli.py @@ -11,65 +11,68 @@ "--threads", default=4, type=int, - help="Number of threads specified by minimap2", + help="Number of threads specified to minimap2", ) @click.option( - "-n", "--no_overwrite", is_flag=True, help="Do not overwrite existing alignments" + "-e", + "--use-existing", + is_flag=True, + help="Use existing alignments if found in the specified output directory.", ) @click.option( "-g", "--good_hit_threshold", default=0.9, type=float, - help="Fraction of bases before and after index insert required to match perfectly for a hit to be considered a good hit. Default=0.9", + help="Fraction of adaptor bases immediately before and immediately after index insert required to match perfectly for a hit to be considered a good hit (default=0.9).", ) @click.option( "-i", "--insert_thres_low", default=4, type=int, - help="Lower threshold for insert length, with value included", + help="Lower threshold for index(+UMI) insert length, with value included (deafult=4).", ) @click.option( "-j", "--insert_thres_high", default=30, type=int, - help="Upper threshold for insert length, with value included", + help="Upper threshold for index(+UMI) insert length, with value included (default=30).", ) @click.option( "-B", "--minimap_b", default=4, type=int, - help="Minimap2 -B parameter, mismatch penalty", + help="Minimap2 -B parameter, mismatch penalty (default=4).", ) @click.option( "-m", "--min_hits_per_adaptor", default=50, type=int, - help="Minimum number of good hits for an adaptor to be included in the analysis", + help="Minimum number of good hits for an adaptor to be included in the analysis (default=50).", ) @click.option( "-u", "--umi_threshold", default=11, type=float, - help="Minimum number of bases in insert to perform entropy calculation", + help="Minimum number of bases in insert to perform entropy calculation (default=11).", ) @click.option( "-k", "--kmer_length", default=2, type=int, - help="Length of k-mers to use for entropy calculation", + help="Length of k-mers to use for entropy calculation (default=2).", ) def main( fastq, outdir, threads, - no_overwrite, + use_existing, good_hit_threshold, insert_thres_low, insert_thres_high, @@ -82,7 +85,7 @@ def main( fastq, outdir, threads, - no_overwrite, + use_existing, good_hit_threshold, insert_thres_low, insert_thres_high, diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index de176b1..1fd9cf8 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -16,7 +16,7 @@ def run_explore( fastq, outdir, threads, - no_overwrite, + use_existing, good_hit_threshold, insert_thres_low, insert_thres_high, @@ -31,9 +31,9 @@ def run_explore( os.mkdir(outdir) except FileExistsError: log.info(f"Output directory {outdir} already exists") - if not no_overwrite: + if not use_existing: log.error( - f"Output directory {outdir} already exists, please use --no_overwrite to continue" + f"Output directory {outdir} already exists, please use --use_existing to continue" ) exit(1) else: @@ -52,7 +52,7 @@ def run_explore( # Align aln_path = os.path.join(outdir, f"{adaptor_name}.paf") alignments.append((adaptor, aln_path)) - if os.path.exists(aln_path) and no_overwrite: + if os.path.exists(aln_path) and use_existing: log.info(f"Skipping {adaptor_name} as alignment already exists") continue adaptor_path = os.path.join(outdir, f"{adaptor_name}.fasta") From 2bbd6854041f359386594712749975852877dcc9 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 26 Jan 2024 16:26:07 +0100 Subject: [PATCH 31/34] Changed back to named env in environment and specify base in dockerfile --- Dockerfile | 7 +++---- environment.yml | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 390bd25..53aad9f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,7 +39,7 @@ WORKDIR /usr/src/anglerfish # Activate the environment ARG MAMBA_DOCKERFILE_ACTIVATE=1 -RUN micromamba install -y -f /environment.tmp.yml && micromamba clean --all --yes +RUN micromamba install -y -n base -f /environment.tmp.yml && micromamba clean --all --yes ##### # Devcontainer @@ -49,12 +49,11 @@ FROM base as devcontainer # Useful tools for devcontainer RUN apt-get update;\ apt-get install -y git vim - RUN eval "$(micromamba shell hook --shell bash)" && python -m pip install -e .[dev] ##### # Main ##### FROM base as main -RUN eval "$(micromamba shell hook --shell bash)" && python -m pip install .[dev] -USER $MAMBA_USER \ No newline at end of file +USER $MAMBA_USER +RUN eval "$(micromamba shell hook --shell bash)" && python -m pip install .[dev] \ No newline at end of file diff --git a/environment.yml b/environment.yml index dde02fd..ec2d96d 100644 --- a/environment.yml +++ b/environment.yml @@ -1,5 +1,5 @@ # Development environment for anglerfish -name: base +name: anglerfish-dev channels: - conda-forge - bioconda From 62aef7a0e081aa7c62cb964cad1e57f26e9698e3 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 26 Jan 2024 16:34:05 +0100 Subject: [PATCH 32/34] Added instructions for Arm64 installation --- README.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/README.md b/README.md index 7de2e87..1f81352 100644 --- a/README.md +++ b/README.md @@ -33,12 +33,25 @@ conda install -c bioconda anglerfish pip install --upgrade --force-reinstall git+https://github.com/remiolsen/anglerfish.git ``` +### For Arm64 processors (e.g. Apple M2) + +Anglerfish depends on minimap2 which needs to be [compiled from source](https://github.com/lh3/minimap2?tab=readme-ov-file#installation) for Arm64 processors. +When minimap2 is compiled and available on $PATH, anglerfish can be installed with pip: + +```shell +pip install bio-anglerfish +``` + +The Dockerfile supplied here includes a + ## Source development 1. [Install miniconda](https://docs.conda.io/en/latest/miniconda.html). 2. Set up repo clone with editable install +For x86 processors (e.g. Intle/AMD): + ``` git clone https://github.com/remiolsen/anglerfish.git cd anglerfish @@ -49,6 +62,18 @@ conda activate anglerfish-dev pip install -e ".[dev]" ``` +For Arm64 processors (e.g. Apple M1/M2). First [compile and install minimap2 manually](https://github.com/lh3/minimap2?tab=readme-ov-file#installation), then: + +``` +git clone https://github.com/remiolsen/anglerfish.git +cd anglerfish +# Create a the anglerfish conda environment (but remove minimap2) +conda env create -f <(grep -v minimap2 environment.yml) +# Install anglerfish +conda activate anglerfish-dev +pip install -e ".[dev]" +``` + 3. (Optional) Install pre-commit to prevent committing code that will fail linting ``` From 23ef19f7bb2d82704bcb6d2f0f59edde4e933910 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Fri, 26 Jan 2024 16:43:27 +0100 Subject: [PATCH 33/34] Some note about the explore command in readme --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 1f81352..c5a18dc 100644 --- a/README.md +++ b/README.md @@ -171,6 +171,16 @@ In folder `anglerfish_????_??_??_?????/` - `anglerfish_stats.txt` Barcode statistics from anglerfish run - `anglerfish_stats.json` Machine readable anglerfish statistics +## Anglerfish Explore (Experimental) + +`anglerfish explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on what adapter types are present, which index lenghts are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running: + +```shell +python anglerfish/explore/cli.py +``` + +inside the anglerfish directory. + ## Credits The Anglerfish code was written by [@remiolsen](https://github.com/remiolsen) but it would not exist without the contributions of [@FranBonath](https://github.com/FranBonath), [@taborsak](https://github.com/taborsak), [@ssjunnebo](https://github.com/ssjunnebo) and Carl Rubin. From 5cb657efe0e6528c80584fe5efbfe562c00fe0d4 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 29 Jan 2024 14:12:59 +0100 Subject: [PATCH 34/34] Fixes according to review comments --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c5a18dc..be02c73 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ When minimap2 is compiled and available on $PATH, anglerfish can be installed wi pip install bio-anglerfish ``` -The Dockerfile supplied here includes a +Additionaly, if Docker is your cup of tea, the Dockerfile supplied in this repository should also work on both arm64 and x86 processors. ## Source development @@ -50,7 +50,7 @@ The Dockerfile supplied here includes a 2. Set up repo clone with editable install -For x86 processors (e.g. Intle/AMD): +For x86 processors (e.g. Intel/AMD): ``` git clone https://github.com/remiolsen/anglerfish.git