diff --git a/.github/workflows/anglerfish.yml b/.github/workflows/anglerfish.yml index 0bef153..41643c4 100644 --- a/.github/workflows/anglerfish.yml +++ b/.github/workflows/anglerfish.yml @@ -37,4 +37,10 @@ jobs: - shell: bash -l {0} name: Run anglerfish with test data run: | - anglerfish -s test/samples.csv + anglerfish run -s test/samples.csv + + # Run anglerfish explore + - shell: bash -l {0} + name: Run anglerfish explore + run: | + anglerfish explore -f test/BC18_P14351_1001.fastq.gz -o test/explore_output diff --git a/README.md b/README.md index 63bd962..e4ffa20 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,7 @@ P12345_101,truseq,CAGGACGT,/path/to/*.fastq.gz Then run: ``` -anglerfish -s /path/to/samples.csv +anglerfish run -s /path/to/samples.csv ``` ### Options @@ -173,10 +173,10 @@ In folder `anglerfish_????_??_??_?????/` ## Anglerfish Explore (Experimental) -`anglerfish-explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on what adapter types are present, which index lenghts are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running, e.g. for help text: +`anglerfish explore` is a command that aims to explore a sequencing pool without a given samplesheet and give hints on what adapter types are present, which index lenghts are used and whether there are any UMIs within the index sequence. The Anglerfish explore command is still under heavy development but can be triggered by running, e.g. for help text: ```shell -anglerfish-explore --help +anglerfish explore --help ``` ## Credits diff --git a/anglerfish/__main__.py b/anglerfish/__main__.py index 9310eaa..9816aeb 100644 --- a/anglerfish/__main__.py +++ b/anglerfish/__main__.py @@ -1,7 +1,7 @@ import multiprocessing -from .anglerfish import anglerfish +from .cli import app if __name__ == "__main__": multiprocessing.freeze_support() - anglerfish() + app() diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py index f7a56cf..492dad6 100755 --- a/anglerfish/anglerfish.py +++ b/anglerfish/anglerfish.py @@ -1,13 +1,12 @@ #!/usr/bin/env python -import argparse import glob import gzip import logging import multiprocessing import os +import sys import uuid from collections import Counter -from datetime import datetime as dt from itertools import groupby import numpy as np @@ -39,7 +38,18 @@ def run_demux(args): ss = SampleSheet(args.samplesheet, args.ont_barcodes) version = pkg_resources.get_distribution("bio-anglerfish").version report = Report(args.run_name, run_uuid, version) - + sys.stderr.write(""" + ___ + ( ) \ -..__ + _.|~”~~~”…_ + ^´ `>. +(+ (+ ) “<..<^( + `´ ``´ ___ ( + \__..~ __( _…_( + \ / + “--…_ _..~%´ + ```´´ +""") log.info(f" version {version}") log.info(f" arguments {vars(args)}") log.info(f" run uuid {run_uuid}") @@ -118,7 +128,7 @@ def run_demux(args): "i5": {"i7_reversed": False, "i5_reversed": True}, "i7+i5": {"i7_reversed": True, "i5_reversed": True}, } - if args.force_rc is not None: + if args.force_rc != "original": log.info( f" Force reverse complementing {args.force_rc} index for adaptor {adaptor_name}. Lenient mode is disabled" ) @@ -245,105 +255,3 @@ def run_demux(args): report.write_report(args.out_fastq) report.write_json(args.out_fastq) report.write_dataframe(args.out_fastq, ss) - - if args.skip_fastqc: - log.warning( - " As of version 0.4.1, built in support for FastQC + MultiQC is removed. The '-f' flag is redundant." - ) - - -def anglerfish(): - parser = argparse.ArgumentParser( - description="Tools to demux I7 and I5 barcodes when sequenced by single-molecules" - ) - parser.add_argument( - "--samplesheet", - "-s", - required=True, - help="CSV formatted list of samples and barcodes", - ) - parser.add_argument( - "--out_fastq", - "-o", - default=".", - help="Analysis output folder (default: Current dir)", - ) - parser.add_argument( - "--threads", - "-t", - default=4, - type=int, - help="Number of threads to use (default: 4)", - ) - parser.add_argument( - "--skip_demux", - "-c", - action="store_true", - help="Only do BC counting and not demuxing", - ) - parser.add_argument( - "--skip_fastqc", "-f", action="store_true", help=argparse.SUPPRESS - ) - parser.add_argument( - "--max-distance", - "-m", - type=int, - help="Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2", - ) - parser.add_argument( - "--max-unknowns", - "-u", - type=int, - help="Maximum number of unknown indices to show in the output (default: length of samplesheet + 10)", - ) - parser.add_argument( - "--run_name", - "-r", - default="anglerfish", - help="Name of the run (default: anglerfish)", - ) - parser.add_argument( - "--lenient", - "-l", - action="store_true", - help="Will try reverse complementing the I5 and/or I7 indices and choose the best match.", - ) - parser.add_argument( - "--lenient_factor", - "-x", - default=4.0, - type=float, - help="If lenient is set, this is the minimum factor of additional matches required to reverse complement the index (default: 4.0)", - ) - parser.add_argument( - "--force_rc", - "-p", - choices=["i7", "i5", "i7+i5"], - help="Force reverse complementing the I5 and/or I7 indices. This will disregard lenient mode.", - ) - parser.add_argument( - "--ont_barcodes", - "-n", - action="store_true", - help="Will assume the samplesheet refers to a single ONT run prepped with a barcoding kit. And will treat each barcode separately", - ) - parser.add_argument( - "--debug", "-d", action="store_true", help="Extra commandline output" - ) - parser.add_argument( - "--version", - "-v", - action="version", - help="Print version and quit", - version=f'anglerfish {pkg_resources.get_distribution("bio-anglerfish").version}', - ) - args = parser.parse_args() - utcnow = dt.utcnow() - runname = utcnow.strftime(f"{args.run_name}_%Y_%m_%d_%H%M%S") - - assert os.path.exists(args.out_fastq) - assert os.path.exists(args.samplesheet) - args.out_fastq = os.path.join(os.path.abspath(args.out_fastq), runname) - args.samplesheet = os.path.abspath(args.samplesheet) - args.run_name = runname - run_demux(args) diff --git a/anglerfish/cli.py b/anglerfish/cli.py new file mode 100644 index 0000000..ee1b68c --- /dev/null +++ b/anglerfish/cli.py @@ -0,0 +1,280 @@ +import argparse +import datetime as dt +import os +from enum import Enum +from typing import Optional + +import pkg_resources +import typer +from typing_extensions import Annotated + +from .anglerfish import run_demux +from .explore.explore import run_explore + +app = typer.Typer(pretty_exceptions_show_locals=False) + + +class IndexOrientations(str, Enum): + i7 = "i7" + i5 = "i5" + i7i5 = "i7+i5" + original = "original" + + +def version_callback(value: bool): + if value: + print(f'anglerfish {pkg_resources.get_distribution("bio-anglerfish").version}') + raise typer.Exit() + + +def deprecated_callback(value: bool): + if value: + raise typer.BadParameter( + "Please use the 'anglerfish run -s' command to run anglerfish with a samplesheet. Running only 'anglerfish -s' is not supported as of version 0.7.0" + ) + + +@app.callback() +def main( + version: Annotated[ + Optional[bool], + typer.Option( + "--version", + "-v", + help="Print version and quit", + is_eager=True, + callback=version_callback, + ), + ] = False, + samplesheet: Annotated[ + Optional[str], + typer.Option( + "--samplesheet", + "-s", + hidden=True, + is_eager=True, + callback=deprecated_callback, + ), + ] = "", +): + """ + Anglerfish is a tool designed to demultiplex Illumina libraries sequenced on Oxford Nanopore flowcells. + The primary purpose for this would be to do QC, i.e. to check pool balancing, assess contamination, library insert sizes and so on. + """ + if samplesheet: + raise typer.BadParameter( + "Please use the 'run' command to run anglerfish with a samplesheet. Running only 'anglerfish' is not supported as of version 0.7.0" + ) + + +@app.command() +def explore( + fastq: Annotated[str, typer.Option("--fastq", "-f", help="Fastq file to align")], + outdir: Annotated[str, typer.Option("--outdir", "-o", help="Output directory")], + threads: Annotated[ + int, + typer.Option( + "--threads", + "-t", + help="Number of threads specified to minimap2", + ), + ] = 4, + use_existing: Annotated[ + bool, + typer.Option( + "--use-existing", + "-e", + help="Use existing alignments if found in the specified output directory.", + ), + ] = False, + good_hit_threshold: Annotated[ + float, + typer.Option( + "--good_hit_threshold", + "-g", + help="Fraction of adaptor bases immediately before and immediately after index insert required to match perfectly for a hit to be considered a good hit", + ), + ] = 0.9, + insert_thres_low: Annotated[ + int, + typer.Option( + "--insert_thres_low", + "-i", + help="Lower threshold for index(+UMI) insert length, with value included.", + ), + ] = 4, + insert_thres_high: Annotated[ + int, + typer.Option( + "--insert_thres_high", + "-j", + help="Upper threshold for index(+UMI) insert length, with value included.", + ), + ] = 30, + minimap_b: Annotated[ + int, + typer.Option( + "--minimap_b", + "-B", + help="Minimap2 -B parameter, mismatch penalty.", + ), + ] = 4, + min_hits_per_adaptor: Annotated[ + int, + typer.Option( + "--min_hits_per_adaptor", + "-m", + help="Minimum number of good hits for an adaptor to be included in the analysis.", + ), + ] = 50, + umi_threshold: Annotated[ + float, + typer.Option( + "--umi_threshold", + "-u", + help="Minimum number of bases in insert to perform entropy calculation.", + ), + ] = 11, + kmer_length: Annotated[ + int, + typer.Option( + "--kmer_length", + "-k", + help="Kmer length for entropy calculation.", + ), + ] = 2, + version: Annotated[ + Optional[bool], + typer.Option( + "--version", + "-v", + help="Print version and quit", + is_eager=True, + callback=version_callback, + ), + ] = False, +): + """This is an advanced samplesheet-free version of anglerfish.""" + run_explore( + fastq, + outdir, + threads, + use_existing, + good_hit_threshold, + insert_thres_low, + insert_thres_high, + minimap_b, + min_hits_per_adaptor, + umi_threshold, + kmer_length, + ) + + +@app.command() +def run( + samplesheet: Annotated[ + str, + typer.Option( + "--samplesheet", "-s", help="CSV formatted list of samples and barcodes" + ), + ], + out_fastq: Annotated[ + str, typer.Option("--out_fastq", "-o", help="Analysis output folder") + ] = ".", + threads: Annotated[ + int, typer.Option("--threads", "-t", help="Number of threads to use") + ] = 4, + skip_demux: Annotated[ + bool, + typer.Option("--skip_demux", "-c", help="Only do BC counting and not demuxing"), + ] = False, + max_distance: Annotated[ + int, + typer.Option( + "--max-distance", + "-m", + help="Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2", + ), + ] = 2, + max_unknowns: Annotated[ + int, + typer.Option( + "--max-unknowns", + "-u", + help="Maximum number of unknown indices to show in the output. default is length of samplesheet + 10", + ), + ] = 0, + run_name: Annotated[ + str, typer.Option("--run_name", "-r", help="Run name") + ] = "anglerfish", + lenient: Annotated[ + bool, + typer.Option( + "--lenient", + "-l", + help="Will try reverse complementing the I5 and/or I7 indices and choose the best match.", + ), + ] = False, + lenient_factor: Annotated[ + float, + typer.Option( + "--lenient_factor", + "-x", + help="If lenient is set, this is the minimum factor of additional matches required to reverse complement the index", + ), + ] = 4.0, + force_rc: Annotated[ + IndexOrientations, + typer.Option( + "--force_rc", + "-p", + help="Force reverse complementing the I5 and/or I7 indices. If set to anything other than 'original' this will disregard lenient mode.", + ), + ] = IndexOrientations.original, + ont_barcodes: Annotated[ + bool, + typer.Option( + "--ont_barcodes", + "-n", + help="Will assume the samplesheet refers to a single ONT run prepped with a barcoding kit. And will treat each barcode separately", + ), + ] = False, + debug: Annotated[bool, typer.Option("--debug", "-d", help="Debug mode")] = False, + version: Annotated[ + Optional[bool], + typer.Option( + "--version", + "-v", + help="Print version and quit", + is_eager=True, + callback=version_callback, + ), + ] = False, +): + """Run anglerfish. This is the main command for anglerfish""" + args = argparse.Namespace( + samplesheet=samplesheet, + out_fastq=out_fastq, + threads=threads, + skip_demux=skip_demux, + max_distance=max_distance, + max_unknowns=max_unknowns, + run_name=run_name, + lenient=lenient, + lenient_factor=lenient_factor, + force_rc=force_rc.value, + ont_barcodes=ont_barcodes, + debug=debug, + version=version, + ) + utcnow = dt.datetime.now(dt.timezone.utc) + runname = utcnow.strftime(f"{args.run_name}_%Y_%m_%d_%H%M%S") + assert os.path.exists(args.out_fastq), f"Output folder '{args.out_fastq}' not found" + assert os.path.exists( + args.samplesheet + ), f"Samplesheet file '{args.samplesheet}' not found, please provide a valid path when using the --samplesheet option." + args.out_fastq = os.path.join(os.path.abspath(args.out_fastq), runname) + args.samplesheet = os.path.abspath(args.samplesheet) + args.run_name = runname + + run_demux(args) diff --git a/anglerfish/explore/cli.py b/anglerfish/explore/cli.py deleted file mode 100644 index 1cceab5..0000000 --- a/anglerfish/explore/cli.py +++ /dev/null @@ -1,100 +0,0 @@ -import click - -from anglerfish.explore.explore import run_explore - - -@click.command() -@click.option("-f", "--fastq", required=True, help="Fastq file to align") -@click.option("-o", "--outdir", required=True, help="Output directory") -@click.option( - "-t", - "--threads", - default=4, - type=int, - help="Number of threads specified to minimap2", -) -@click.option( - "-e", - "--use-existing", - is_flag=True, - help="Use existing alignments if found in the specified output directory.", -) -@click.option( - "-g", - "--good_hit_threshold", - default=0.9, - type=float, - help="Fraction of adaptor bases immediately before and immediately after index insert required to match perfectly for a hit to be considered a good hit (default=0.9).", -) -@click.option( - "-i", - "--insert_thres_low", - default=4, - type=int, - help="Lower threshold for index(+UMI) insert length, with value included (deafult=4).", -) -@click.option( - "-j", - "--insert_thres_high", - default=30, - type=int, - help="Upper threshold for index(+UMI) insert length, with value included (default=30).", -) -@click.option( - "-B", - "--minimap_b", - default=4, - type=int, - help="Minimap2 -B parameter, mismatch penalty (default=4).", -) -@click.option( - "-m", - "--min_hits_per_adaptor", - default=50, - type=int, - help="Minimum number of good hits for an adaptor to be included in the analysis (default=50).", -) -@click.option( - "-u", - "--umi_threshold", - default=11, - type=float, - help="Minimum number of bases in insert to perform entropy calculation (default=11).", -) -@click.option( - "-k", - "--kmer_length", - default=2, - type=int, - help="Length of k-mers to use for entropy calculation (default=2).", -) -def main( - fastq, - outdir, - threads, - use_existing, - good_hit_threshold, - insert_thres_low, - insert_thres_high, - minimap_b, - min_hits_per_adaptor, - umi_threshold, - kmer_length, -): - run_explore( - fastq, - outdir, - threads, - use_existing, - good_hit_threshold, - insert_thres_low, - insert_thres_high, - minimap_b, - min_hits_per_adaptor, - umi_threshold, - kmer_length, - ) - - -if __name__ == "__main__": - main() diff --git a/requirements.txt b/requirements.txt index db39464..3262cac 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ biopython==1.83 -click==8.1.7 levenshtein==0.23.0 numpy==1.26.3 pandas==2.1.4 -pyyaml==6.0.1 \ No newline at end of file +pyyaml==6.0.1 +typer==0.9.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 47d8660..8c43b7a 100644 --- a/setup.py +++ b/setup.py @@ -38,8 +38,7 @@ extras_require={"dev": ["ruff", "mypy", "editorconfig-checker"]}, entry_points={ "console_scripts": [ - "anglerfish=anglerfish.anglerfish:anglerfish", - "anglerfish-explore=anglerfish.explore.cli:main", + "anglerfish=anglerfish.cli:app", ], }, zip_safe=False,