From c2e28f93bd3616fb28e57bde9d08d48500e95172 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 13:43:44 +0100 Subject: [PATCH 01/15] add ci files --- .github/workflows/lint-code.yml | 135 ++++++++++++++++++++++++++++++++ .pre-commit-config.yaml | 19 +++++ .vscode/settings.json | 8 ++ pyproject.toml | 21 ++++- 4 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/lint-code.yml create mode 100644 .pre-commit-config.yaml create mode 100644 .vscode/settings.json diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml new file mode 100644 index 0000000..c73256e --- /dev/null +++ b/.github/workflows/lint-code.yml @@ -0,0 +1,135 @@ +name: lint-code +on: [push, pull_request] + +# Cancel if a newer run is started +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +jobs: + # Use ruff to check for code style violations + ruff-check: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff + - name: ruff --> Check for style violations + # Configured in pyproject.toml + run: ruff check . + + # Use ruff to check code formatting + ruff-format: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff + - name: ruff --> Check code formatting + run: ruff format --check . 
+ + # Use mypy for static type checking + mypy-check: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install mypy + # Start by installing type stubs + - name: mypy --> Install stubs + run: echo -e "y" | mypy --install-types **/*.py || exit 0 + - name: mypy --> Static type checking + # Configured in pyprojet.toml + run: mypy **/*.py + + # Use pipreqs to check for missing dependencies + pipreqs-check: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + + - name: Install pipreqs + run: pip install pipreqs + + - name: Install requirements + run: pip install -r requirements.txt + + - name: Run pipreqs + run: pipreqs --savepath pipreqs.txt + + - name: Compare requirements + run: | + # Extract and sort package names + awk '{print $1}' $1 | sort -u > "$1".compare + awk -F'==' '{print $1}' $2 | sort -u > "$2".compare + + # Compare package lists + if cmp -s "$1".compare "$2".compare + then + echo "Requirements are the same" + exit 0 + else + echo "Requirements are different" + exit 1 + fi + + # Use Prettier to check various file formats + prettier: + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - name: Setup node + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Install Prettier + run: npm install -g prettier + + - name: Run Prettier --check + run: prettier --check . 
+ + # Use editorconfig to check all remaining file formats + editorconfig: + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Setup node + uses: actions/setup-node@v4 + with: + node-version: "20" + + - name: Install editorconfig-checker + run: npm install -g editorconfig-checker + + - name: editorconfig --> Lint files + run: editorconfig-checker $(git ls-files | grep -v '.py\|.md\|.json\|.yml\|.yaml\|.html') diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..1c09ed2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +# .pre-commit-config.yaml +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.6 + hooks: + - id: ruff + - id: ruff-format + - repo: https://github.com/pre-commit/mirrors-mypy + rev: "v1.7.1" + hooks: + - id: mypy + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v4.0.0-alpha.8" + hooks: + - id: prettier + - repo: https://github.com/editorconfig-checker/editorconfig-checker.python + rev: "2.7.2" + hooks: + - id: editorconfig-checker diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..6e4306d --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,8 @@ +{ + "editor.formatOnSave": true, + "editor.defaultFormatter": "esbenp.prettier-vscode", + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff" + }, + "prettier.configPath": "./pyproject.toml" +} diff --git a/pyproject.toml b/pyproject.toml index 7fd26b9..b3bc5ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,22 @@ [build-system] requires = ["setuptools"] -build-backend = "setuptools.build_meta" \ No newline at end of file +build-backend = "setuptools.build_meta" + +[tool.ruff.lint] +select =[ + # Ruff default rules + # ------------------------------ + "E4", # pycodestyle Imports + "E7", # pycodestyle Statements + "E9", # pycodestyle Runtime + "F", # Pyflakes + + # Additional Comment + # 
------------------------------------------------------ + "I", # isort Best-practice sorting of imports + "UP", # pyupgrade Make sure syntax is up-to-date +] + +[tool.mypy] +ignore_missing_imports = true +follow_imports = 'skip' From 7634a3bd7b150f552f064e7089b4ab160b6f4564 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 13:46:11 +0100 Subject: [PATCH 02/15] non-invasive formatting --- .github/workflows/anglerfish.yml | 8 +- .github/workflows/pypi.yml | 2 +- README.md | 25 ++-- anglerfish/__main__.py | 4 +- anglerfish/anglerfish.py | 216 +++++++++++++++++++++++-------- anglerfish/demux/demux.py | 138 ++++++++++++-------- anglerfish/demux/report.py | 111 ++++++++++------ anglerfish/demux/samplesheet.py | 74 ++++++----- setup.py | 51 ++++---- 9 files changed, 405 insertions(+), 224 deletions(-) diff --git a/.github/workflows/anglerfish.yml b/.github/workflows/anglerfish.yml index 2077e95..0fed1f1 100644 --- a/.github/workflows/anglerfish.yml +++ b/.github/workflows/anglerfish.yml @@ -14,11 +14,11 @@ jobs: - uses: actions/checkout@v4 - uses: mamba-org/setup-micromamba@v1 with: - init-shell: bash - create-args: >- + init-shell: bash + create-args: >- python=${{ matrix.python-version }} pip - environment-file: environment.yml + environment-file: environment.yml # Install Anglerfish - shell: bash -l {0} @@ -29,7 +29,7 @@ jobs: # Run anglerfish --help - shell: bash -l {0} name: Test anglerfish - run: | + run: | anglerfish --help # Run anglerfish using test data diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index c1bd356..346e673 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -28,4 +28,4 @@ jobs: uses: pypa/gh-action-pypi-publish@master with: user: __token__ - password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/README.md b/README.md index 123c4b2..2a4867d 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ # Anglerfish + 
[![Anglerfish CI Status](https://github.com/remiolsen/anglerfish/workflows/Anglerfish/badge.svg)](https://github.com/remiolsen/anglerfish/actions) [![PyPI](https://img.shields.io/pypi/v/bio-anglerfish)](https://pypi.python.org/pypi/bio-anglerfish/) [![Conda (channel only)](https://img.shields.io/conda/vn/bioconda/anglerfish)](https://anaconda.org/bioconda/anglerfish) [![Docker Container available](https://img.shields.io/docker/automated/remiolsen/anglerfish.svg)](https://hub.docker.com/r/remiolsen/anglerfish/) - ## Introduction Anglerfish is a tool designed to demultiplex Illumina libraries sequenced on Oxford Nanopore @@ -17,18 +17,18 @@ For more information on how this can be used, please see this [poster](docs/AGBT ### Requirements -* Python3 (3.7) +- Python3 (3.7) Python modules: -* biopython v. 1.70 -* python-levenshtein v. 0.12.0 -* numpy v. 1.19.2 -* pyyaml v. 6.0 +- biopython v. 1.70 +- python-levenshtein v. 0.12.0 +- numpy v. 1.19.2 +- pyyaml v. 6.0 Software: -* minimap2 v. 2.20 +- minimap2 v. 2.20 ### From PyPi @@ -65,8 +65,8 @@ pip install --upgrade --force-reinstall git+https://github.com/remiolsen/anglerf Anglerfish requires two files to run. - * A basecalled FASTQ file from for instance Guppy (`/path/to/ONTreads.fastq.gz`) - * A samplesheet containing the sample names and indices expected to be found in the sequencing run. (`/path/to/samples.csv`) +- A basecalled FASTQ file from for instance Guppy (`/path/to/ONTreads.fastq.gz`) +- A samplesheet containing the sample names and indices expected to be found in the sequencing run. 
(`/path/to/samples.csv`) Example of a samplesheet file: @@ -135,10 +135,9 @@ P54321_101,truseq,ATTACTCG,/path/to/barcode02/*.fastq.gz In folder `anglerfish_????_??_??_?????/` -* `*.fastq.gz` Demultiplexed reads (if any) -* `anglerfish_stats.txt` Barcode statistics from anglerfish run -* `anglerfish_stats.json` Machine readable anglerfish statistics - +- `*.fastq.gz` Demultiplexed reads (if any) +- `anglerfish_stats.txt` Barcode statistics from anglerfish run +- `anglerfish_stats.json` Machine readable anglerfish statistics ## Credits diff --git a/anglerfish/__main__.py b/anglerfish/__main__.py index b48c1cd..c11e37d 100644 --- a/anglerfish/__main__.py +++ b/anglerfish/__main__.py @@ -1,4 +1,4 @@ from . import anglerfish -if __name__ == '__main__': - anglerfish() \ No newline at end of file +if __name__ == "__main__": + anglerfish() diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py index 436c8c6..551536f 100755 --- a/anglerfish/anglerfish.py +++ b/anglerfish/anglerfish.py @@ -9,16 +9,22 @@ from datetime import datetime as dt from itertools import groupby from collections import Counter -from .demux.demux import run_minimap2, parse_paf_lines, layout_matches, cluster_matches, write_demuxedfastq +from .demux.demux import ( + run_minimap2, + parse_paf_lines, + layout_matches, + cluster_matches, + write_demuxedfastq, +) from .demux.samplesheet import SampleSheet from .demux.report import Report, SampleStat, AlignmentStat import gzip + logging.basicConfig(level=logging.INFO) -log = logging.getLogger('anglerfish') +log = logging.getLogger("anglerfish") def run_demux(args): - run_uuid = str(uuid.uuid4()) os.mkdir(args.out_fastq) ss = SampleSheet(args.samplesheet, args.ont_barcodes) @@ -36,7 +42,9 @@ def run_demux(args): args.max_distance = 1 log.info(f"Using maximum edit distance of {args.max_distance}") if args.max_distance >= bc_dist: - log.error(f" Edit distance of barcodes in samplesheet are less than the minimum specified {args.max_distance}>={bc_dist}") 
+ log.error( + f" Edit distance of barcodes in samplesheet are less than the minimum specified {args.max_distance}>={bc_dist}" + ) exit() log.debug(f"Samplesheet bc_dist == {bc_dist}") @@ -45,22 +53,23 @@ def run_demux(args): adaptor_set = set(adaptors_t) adaptors_sorted = dict([(i, []) for i in adaptor_set]) for entry in ss: - adaptors_sorted[(entry.adaptor.name, entry.ont_barcode)].append((entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq))) + adaptors_sorted[(entry.adaptor.name, entry.ont_barcode)].append( + (entry.sample_name, entry.adaptor, os.path.abspath(entry.fastq)) + ) out_fastqs = [] for key, sample in adaptors_sorted.items(): - adaptor_name, ont_barcode = key fastq_path = sample[0][2] # If there are multiple ONT barcodes, we need to add the ONT barcode to the adaptor name adaptor_bc_name = adaptor_name if ont_barcode: - adaptor_bc_name = adaptor_name+"_"+ont_barcode + adaptor_bc_name = adaptor_name + "_" + ont_barcode fastq_files = glob.glob(fastq_path) # Align aln_path = os.path.join(args.out_fastq, f"{adaptor_bc_name}.paf") - adaptor_path = os.path.join(args.out_fastq,f"{adaptor_name}.fasta") + adaptor_path = os.path.join(args.out_fastq, f"{adaptor_name}.fasta") with open(adaptor_path, "w") as f: f.write(ss.get_fastastring(adaptor_name)) for fq in fastq_files: @@ -69,70 +78,101 @@ def run_demux(args): # Easy line count in input fastq files num_fq = 0 for fq in fastq_files: - with gzip.open(fq, 'rb') as f: + with gzip.open(fq, "rb") as f: for i in f: - num_fq += 1 - num_fq = int(num_fq / 4) + num_fq += 1 + num_fq = int(num_fq / 4) paf_entries = parse_paf_lines(aln_path) # Make stats log.info(f" Searching for adaptor hits in {adaptor_bc_name}") - fragments, singletons, concats, unknowns = layout_matches(adaptor_name+"_i5",adaptor_name+"_i7",paf_entries) + fragments, singletons, concats, unknowns = layout_matches( + adaptor_name + "_i5", adaptor_name + "_i7", paf_entries + ) stats = AlignmentStat(adaptor_bc_name) 
stats.compute_pafstats(num_fq, fragments, singletons, concats, unknowns) report.add_alignment_stat(stats) # Demux - no_matches = []; matches = [] - flipped_i7 = False; flipped_i5 = False + no_matches = [] + matches = [] + flipped_i7 = False + flipped_i5 = False flips = { "i7": {"i7_reversed": True, "i5_reversed": False}, "i5": {"i7_reversed": False, "i5_reversed": True}, - "i7+i5": {"i7_reversed": True, "i5_reversed": True} + "i7+i5": {"i7_reversed": True, "i5_reversed": True}, } if args.force_rc is not None: - log.info(f" Force reverse complementing {args.force_rc} index for adaptor {adaptor_name}. Lenient mode is disabled") - no_matches, matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance, **flips[args.force_rc]) + log.info( + f" Force reverse complementing {args.force_rc} index for adaptor {adaptor_name}. Lenient mode is disabled" + ) + no_matches, matches = cluster_matches( + adaptors_sorted[key], + fragments, + args.max_distance, + **flips[args.force_rc], + ) flipped_i7, flipped_i5 = flips[args.force_rc].values() - elif args.lenient: # Try reverse complementing the I5 and/or i7 indices and choose the best match - no_matches, matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance) + elif args.lenient: # Try reverse complementing the I5 and/or i7 indices and choose the best match + no_matches, matches = cluster_matches( + adaptors_sorted[key], fragments, args.max_distance + ) flipped = {} for flip, rev in flips.items(): - rc_no_matches, rc_matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance, **rev) + rc_no_matches, rc_matches = cluster_matches( + adaptors_sorted[key], fragments, args.max_distance, **rev + ) flipped[flip] = (rc_matches, rc_no_matches, len(rc_matches)) best_flip = max(zip(flipped.values(), flipped.keys()))[1] # There are no barcode flips with unambiguously more matches, so we abort - if sorted([i[2] for i in flipped.values()])[-1] == sorted([i[2] for i in 
flipped.values()])[-2]: - log.info(f"Could not find any barcode reverse complements with unambiguously more matches") + if ( + sorted([i[2] for i in flipped.values()])[-1] + == sorted([i[2] for i in flipped.values()])[-2] + ): + log.info( + f"Could not find any barcode reverse complements with unambiguously more matches" + ) elif flipped[best_flip][2] > len(matches) * args.lenient_factor: - log.info(f" Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least {args.lenient_factor} times more matches") + log.info( + f" Reverse complementing {best_flip} index for adaptor {adaptor_name} found at least {args.lenient_factor} times more matches" + ) matches, no_matches, _ = flipped[best_flip] flipped_i7, flipped_i5 = flips[best_flip].values() else: log.info(f" Using original index orientation for {adaptor_name}") else: - no_matches, matches = cluster_matches(adaptors_sorted[key], fragments, args.max_distance) - - for k, v in groupby(sorted(matches,key=lambda x: x[3]), key=lambda y: y[3]): + no_matches, matches = cluster_matches( + adaptors_sorted[key], fragments, args.max_distance + ) + for k, v in groupby(sorted(matches, key=lambda x: x[3]), key=lambda y: y[3]): # To avoid collisions in fastq filenames, we add the ONT barcode to the sample name fq_prefix = k if ont_barcode: - fq_prefix = ont_barcode+"-"+fq_prefix - fq_name = os.path.join(args.out_fastq, fq_prefix+".fastq.gz") + fq_prefix = ont_barcode + "-" + fq_prefix + fq_name = os.path.join(args.out_fastq, fq_prefix + ".fastq.gz") out_fastqs.append(fq_name) sample_dict = {i[0]: [i] for i in v} # Find read lengths rlens = np.array([]) - for l,w in sample_dict.items(): + for l, w in sample_dict.items(): for i in w: - rlens = np.append(rlens, i[2]-i[1]) - rmean = np.round(np.mean(rlens),2) - rstd = np.round(np.std(rlens),2) + rlens = np.append(rlens, i[2] - i[1]) + rmean = np.round(np.mean(rlens), 2) + rstd = np.round(np.std(rlens), 2) - sample_stat = SampleStat(k, len(sample_dict.keys()), rmean, 
rstd, flipped_i7, flipped_i5, ont_barcode) + sample_stat = SampleStat( + k, + len(sample_dict.keys()), + rmean, + rstd, + flipped_i7, + flipped_i5, + ont_barcode, + ) report.add_sample_stat(sample_stat) if not args.skip_demux: write_demuxedfastq(sample_dict, fastq_path, fq_name) @@ -141,11 +181,15 @@ def run_demux(args): nomatch_count = Counter([x[3] for x in no_matches]) if args.max_unknowns == None: args.max_unknowns = len([sample for sample in ss]) + 10 - report.add_unmatched_stat(nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name) + report.add_unmatched_stat( + nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name + ) # Check if there were samples in the samplesheet without adaptor alignments and add them to report for entry in ss: - if entry.sample_name not in [s.sample_name for s in [stat for stat in report.sample_stats]]: + if entry.sample_name not in [ + s.sample_name for s in [stat for stat in report.sample_stats] + ]: sample_stat = SampleStat(entry.sample_name, 0, 0, 0, False, ont_barcode) report.add_sample_stat(sample_stat) @@ -154,31 +198,99 @@ def run_demux(args): report.write_dataframe(args.out_fastq, ss) if args.skip_fastqc: - log.warning(" As of version 0.4.1, built in support for FastQC + MultiQC is removed. The '-f' flag is redundant.") + log.warning( + " As of version 0.4.1, built in support for FastQC + MultiQC is removed. The '-f' flag is redundant." 
+ ) + def anglerfish(): - parser = argparse.ArgumentParser(description='Tools to demux I7 and I5 barcodes when sequenced by single-molecules') - parser.add_argument('--samplesheet', '-s', required=True, help='CSV formatted list of samples and barcodes') - parser.add_argument('--out_fastq', '-o', default='.', help='Analysis output folder (default: Current dir)') - parser.add_argument('--threads', '-t', default=4, help='Number of threads to use (default: 4)') - parser.add_argument('--skip_demux', '-c', action='store_true', help='Only do BC counting and not demuxing') - parser.add_argument('--skip_fastqc', '-f', action='store_true', help=argparse.SUPPRESS) - parser.add_argument('--max-distance', '-m', type=int, help='Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2') - parser.add_argument('--max-unknowns', '-u', type=int, help='Maximum number of unknown indices to show in the output (default: length of samplesheet + 10)') - parser.add_argument('--run_name', '-r', default='anglerfish', help='Name of the run (default: anglerfish)') - parser.add_argument('--lenient', '-l', action='store_true', help='Will try reverse complementing the I5 and/or I7 indices and choose the best match.') - parser.add_argument('--lenient_factor', '-x', default=4.0, type=float, help='If lenient is set, this is the minimum factor of additional matches required to reverse complement the index (default: 4.0)') - parser.add_argument('--force_rc', '-p', choices=['i7', 'i5', 'i7+i5'], help='Force reverse complementing the I5 and/or I7 indices. This will disregard lenient mode.') - parser.add_argument('--ont_barcodes', '-n', action='store_true', help='Will assume the samplesheet refers to a single ONT run prepped with a barcoding kit. 
And will treat each barcode separately') - parser.add_argument('--debug', '-d', action='store_true', help='Extra commandline output') - parser.add_argument('--version', '-v', action='version', help='Print version and quit', version=f'anglerfish {pkg_resources.get_distribution("bio-anglerfish").version}') + parser = argparse.ArgumentParser( + description="Tools to demux I7 and I5 barcodes when sequenced by single-molecules" + ) + parser.add_argument( + "--samplesheet", + "-s", + required=True, + help="CSV formatted list of samples and barcodes", + ) + parser.add_argument( + "--out_fastq", + "-o", + default=".", + help="Analysis output folder (default: Current dir)", + ) + parser.add_argument( + "--threads", "-t", default=4, help="Number of threads to use (default: 4)" + ) + parser.add_argument( + "--skip_demux", + "-c", + action="store_true", + help="Only do BC counting and not demuxing", + ) + parser.add_argument( + "--skip_fastqc", "-f", action="store_true", help=argparse.SUPPRESS + ) + parser.add_argument( + "--max-distance", + "-m", + type=int, + help="Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2", + ) + parser.add_argument( + "--max-unknowns", + "-u", + type=int, + help="Maximum number of unknown indices to show in the output (default: length of samplesheet + 10)", + ) + parser.add_argument( + "--run_name", + "-r", + default="anglerfish", + help="Name of the run (default: anglerfish)", + ) + parser.add_argument( + "--lenient", + "-l", + action="store_true", + help="Will try reverse complementing the I5 and/or I7 indices and choose the best match.", + ) + parser.add_argument( + "--lenient_factor", + "-x", + default=4.0, + type=float, + help="If lenient is set, this is the minimum factor of additional matches required to reverse complement the index (default: 4.0)", + ) + parser.add_argument( + "--force_rc", + "-p", + choices=["i7", "i5", "i7+i5"], + help="Force reverse complementing the I5 and/or I7 indices. 
This will disregard lenient mode.", + ) + parser.add_argument( + "--ont_barcodes", + "-n", + action="store_true", + help="Will assume the samplesheet refers to a single ONT run prepped with a barcoding kit. And will treat each barcode separately", + ) + parser.add_argument( + "--debug", "-d", action="store_true", help="Extra commandline output" + ) + parser.add_argument( + "--version", + "-v", + action="version", + help="Print version and quit", + version=f'anglerfish {pkg_resources.get_distribution("bio-anglerfish").version}', + ) args = parser.parse_args() utcnow = dt.utcnow() runname = utcnow.strftime(f"{args.run_name}_%Y_%m_%d_%H%M%S") assert os.path.exists(args.out_fastq) assert os.path.exists(args.samplesheet) - args.out_fastq = os.path.join(os.path.abspath(args.out_fastq),runname) + args.out_fastq = os.path.join(os.path.abspath(args.out_fastq), runname) args.samplesheet = os.path.abspath(args.samplesheet) args.run_name = runname run_demux(args) diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index 7154287..b55bdac 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -6,8 +6,9 @@ import io from Bio.SeqIO.QualityIO import FastqGeneralIterator from Bio.Seq import Seq + logging.basicConfig(level=logging.INFO) -log = logging.getLogger('demux') +log = logging.getLogger("demux") def parse_cs(cs_string, index, max_distance): @@ -30,22 +31,24 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads): "minimap2", "--cs", "-m8", - "-k", "10", - "-w", "5", + "-k", + "10", + "-w", + "5", "-B1", "-A6", "--dual=no", "-c", - "-t", str(threads), + "-t", + str(threads), indexfile, - fastq_in + fastq_in, ] with open(output_paf, "ab") as ofile: proc = subprocess.run(cmd, stdout=ofile, check=True) - def parse_paf_lines(paf, min_qual=10): """ Read and parse one paf alignment lines. 
@@ -57,16 +60,17 @@ def parse_paf_lines(paf, min_qual=10): aln = paf_line.split() try: # TODO: objectify this - entry = {"adapter": aln[5], - "rlen": int(aln[1]), # read length - "rstart": int(aln[2]), # start alignment on read - "rend": int(aln[3]), # end alignment on read - "strand": aln[4], - "cs": aln[-1], # cs string - "q": int(aln[11]), # Q score - "iseq": None, - "sample": None - } + entry = { + "adapter": aln[5], + "rlen": int(aln[1]), # read length + "rstart": int(aln[2]), # start alignment on read + "rend": int(aln[3]), # end alignment on read + "strand": aln[4], + "cs": aln[-1], # cs string + "q": int(aln[11]), # Q score + "iseq": None, + "sample": None, + } read = aln[0] except IndexError: log.debug(f"Could not find all paf columns: {read}") @@ -94,14 +98,20 @@ def layout_matches(i5_name, i7_name, paf_entries): - unknowns. Any other reads """ - fragments = {}; singletons = {}; concats = {}; unknowns = {} + fragments = {} + singletons = {} + concats = {} + unknowns = {} for read, entry_list in paf_entries.items(): sorted_entries = [] - for k in range(len(entry_list)-1): - entry_i = entry_list[k]; entry_j = entry_list[k+1] - if entry_i['adapter'] != entry_j['adapter'] and \ - (entry_i['adapter'] == i5_name and entry_j['adapter'] == i7_name) or \ - (entry_j['adapter'] == i5_name and entry_i['adapter'] == i7_name): + for k in range(len(entry_list) - 1): + entry_i = entry_list[k] + entry_j = entry_list[k + 1] + if ( + entry_i["adapter"] != entry_j["adapter"] + and (entry_i["adapter"] == i5_name and entry_j["adapter"] == i7_name) + or (entry_j["adapter"] == i5_name and entry_i["adapter"] == i7_name) + ): if entry_i in sorted_entries: sorted_entries.append(entry_j) else: @@ -109,27 +119,35 @@ def layout_matches(i5_name, i7_name, paf_entries): if len(entry_list) == 1: singletons[read] = entry_list elif len(sorted_entries) == 2: - fragments[read] = sorted(sorted_entries,key=lambda l:l['rstart']) + fragments[read] = sorted(sorted_entries, key=lambda l: 
l["rstart"]) elif len(sorted_entries) > 2: - concats[read] = sorted(sorted_entries,key=lambda l:l['rstart']) + concats[read] = sorted(sorted_entries, key=lambda l: l["rstart"]) else: unknowns[read] = entry_list - #TODO: add minimum insert size + # TODO: add minimum insert size return (fragments, singletons, concats, unknowns) -def cluster_matches(sample_adaptor, matches, max_distance, i7_reversed=False, i5_reversed=False): - +def cluster_matches( + sample_adaptor, matches, max_distance, i7_reversed=False, i5_reversed=False +): # Only illumina fragments - matched = {}; matched_bed = []; unmatched_bed = [] + matched = {} + matched_bed = [] + unmatched_bed = [] for read, alignments in matches.items(): - i5 = False i7 = False - if alignments[0]['adapter'][-2:] == 'i5' and alignments[1]['adapter'][-2:] == 'i7': + if ( + alignments[0]["adapter"][-2:] == "i5" + and alignments[1]["adapter"][-2:] == "i7" + ): i5 = alignments[0] i7 = alignments[1] - elif alignments[1]['adapter'][-2:] == 'i5' and alignments[0]['adapter'][-2:] == 'i7': + elif ( + alignments[1]["adapter"][-2:] == "i5" + and alignments[0]["adapter"][-2:] == "i7" + ): i5 = alignments[1] i7 = alignments[0] else: @@ -137,72 +155,80 @@ def cluster_matches(sample_adaptor, matches, max_distance, i7_reversed=False, i5 continue dists = [] - fi5 = ""; fi7 = "" + fi5 = "" + fi7 = "" for _, adaptor, _ in sample_adaptor: try: i5_seq = adaptor.i5_index if i5_reversed and i5_seq is not None: i5_seq = str(Seq(i5_seq).reverse_complement()) - fi5, d1 = parse_cs(i5['cs'], i5_seq, max_distance) + fi5, d1 = parse_cs(i5["cs"], i5_seq, max_distance) except AttributeError: - d1 = 0 # presumably it's single index, so no i5 + d1 = 0 # presumably it's single index, so no i5 i7_seq = adaptor.i7_index if i7_reversed and i7_seq is not None: i7_seq = str(Seq(i7_seq).reverse_complement()) - fi7, d2 = parse_cs(i7['cs'], i7_seq, max_distance) - dists.append(d1+d2) + fi7, d2 = parse_cs(i7["cs"], i7_seq, max_distance) + dists.append(d1 + d2) 
index_min = min(range(len(dists)), key=dists.__getitem__) # Test if two samples in the sheet is equidistant to the i5/i7 - if len([i for i, j in enumerate(dists) if j==dists[index_min]]) > 1: + if len([i for i, j in enumerate(dists) if j == dists[index_min]]) > 1: log.debug(" Ambiguous alignment, skipping") continue - start_insert = min(i5['rend'],i7['rend']) - end_insert = max(i7['rstart'],i5['rstart']) + start_insert = min(i5["rend"], i7["rend"]) + end_insert = max(i7["rstart"], i5["rstart"]) if end_insert - start_insert < 10: log.debug(" Erroneous / overlapping adaptor matches") continue if dists[index_min] > max_distance: log.debug(f" No match {fi7}-{fi5}") # Find only full length i7(+i5) adaptor combos. Basically a list of "known unknowns" - if len(fi7) + len(fi5) == len(adaptor.i7_index or "") + len(adaptor.i5_index or ""): + if len(fi7) + len(fi5) == len(adaptor.i7_index or "") + len( + adaptor.i5_index or "" + ): fi75 = "+".join([i for i in [fi7, fi5] if not i == ""]) unmatched_bed.append([read, start_insert, end_insert, fi75, "999", "."]) continue matched[read] = alignments log.debug(f" Matched {read} to {adaptor.i7_index}-{adaptor.i5_index}") - matched_bed.append([read, start_insert, end_insert, sample_adaptor[index_min][0], "999", "."]) + matched_bed.append( + [read, start_insert, end_insert, sample_adaptor[index_min][0], "999", "."] + ) return unmatched_bed, matched_bed - def write_demuxedfastq(beds, fastq_in, fastq_out): """ - Take a set of coordinates in bed format [[seq1, start, end, ..][seq2, ..]] - from over a set of fastq entries in the input files and do extraction. - TODO: Can be optimized using pigz or rewritten using python threading + Take a set of coordinates in bed format [[seq1, start, end, ..][seq2, ..]] + from over a set of fastq entries in the input files and do extraction. 
+ TODO: Can be optimized using pigz or rewritten using python threading """ gz_buf = 131072 fq_files = glob.glob(fastq_in) for fq in fq_files: - with subprocess.Popen(["gzip", "-c", "-d", fq], - stdout=subprocess.PIPE, bufsize=gz_buf) as fzi: + with subprocess.Popen( + ["gzip", "-c", "-d", fq], stdout=subprocess.PIPE, bufsize=gz_buf + ) as fzi: fi = io.TextIOWrapper(fzi.stdout, write_through=True) - with open(fastq_out, 'ab') as ofile: - with subprocess.Popen(["gzip", "-c", "-f"], - stdin=subprocess.PIPE, stdout=ofile, bufsize=gz_buf, close_fds=False) as oz: - + with open(fastq_out, "ab") as ofile: + with subprocess.Popen( + ["gzip", "-c", "-f"], + stdin=subprocess.PIPE, + stdout=ofile, + bufsize=gz_buf, + close_fds=False, + ) as oz: for title, seq, qual in FastqGeneralIterator(fi): new_title = title.split() if new_title[0] not in beds.keys(): continue outfqs = "" for bed in beds[new_title[0]]: - - new_title[0] += "_"+bed[3] + new_title[0] += "_" + bed[3] outfqs += "@{}\n".format(" ".join(new_title)) - outfqs += "{}\n".format(seq[bed[1]:bed[2]]) + outfqs += "{}\n".format(seq[bed[1] : bed[2]]) outfqs += "+\n" - outfqs += "{}\n".format(qual[bed[1]:bed[2]]) - oz.stdin.write(outfqs.encode('utf-8')) + outfqs += "{}\n".format(qual[bed[1] : bed[2]]) + oz.stdin.write(outfqs.encode("utf-8")) diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py index 57d50b3..2ac3e89 100644 --- a/anglerfish/demux/report.py +++ b/anglerfish/demux/report.py @@ -3,8 +3,8 @@ from dataclasses import dataclass, asdict from typing import ClassVar -class Report(object): +class Report(object): unmatch_header = ["index", "num_reads", "ont_barcode"] def __init__(self, run_name, uuid, version): @@ -17,27 +17,33 @@ def __init__(self, run_name, uuid, version): def add_alignment_stat(self, aln_stat): self.aln_stats.append(aln_stat) + def add_sample_stat(self, sample_stat): self.sample_stats.append(sample_stat) + def add_unmatched_stat(self, unmatched_stat, ont_barcode, adaptor_name): 
self.unmatched_stats[(ont_barcode, adaptor_name)] = unmatched_stat def write_report(self, outdir): - with open(os.path.join(outdir,"anglerfish_stats.txt"), "w") as f: - f.write(f"Anglerfish v. {self.version} (run: {self.run_name}, {self.uuid})\n===================\n") + with open(os.path.join(outdir, "anglerfish_stats.txt"), "w") as f: + f.write( + f"Anglerfish v. {self.version} (run: {self.run_name}, {self.uuid})\n===================\n" + ) for astat in self.aln_stats: f.write(f"{astat.adaptor_name}:\n") - for i,j in astat.paf_stats.items(): + for i, j in astat.paf_stats.items(): f.write(f"{j[0]}\t{i} ({j[1]*100:.2f}%)\n") f.write("\n{}\n".format("\t".join(getattr(SampleStat, "header")))) for sample in self.sample_stats: - f.write(f"{sample.sample_name}\t{sample.num_reads}\t{sample.mean_read_len}\t{sample.std_read_len}\t{sample.i7_reversed}\t{sample.i5_reversed}\t{sample.ont_barcode}\n") - uhead = getattr(Report, 'unmatch_header') - f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab + f.write( + f"{sample.sample_name}\t{sample.num_reads}\t{sample.mean_read_len}\t{sample.std_read_len}\t{sample.i7_reversed}\t{sample.i5_reversed}\t{sample.ont_barcode}\n" + ) + uhead = getattr(Report, "unmatch_header") + f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab for key, unmatch in self.unmatched_stats.items(): for idx, mnum in unmatch: f.write("{}\t{}\t{}\n".format(idx, mnum, key[0])) - + def write_json(self, outdir): json_out = { "anglerfish_version": self.version, @@ -45,23 +51,35 @@ def write_json(self, outdir): "run_uuid": self.uuid, "paf_stats": [], "sample_stats": [], - "undetermined": [] + "undetermined": [], } for astat in self.aln_stats: json_out["paf_stats"].append(astat.paf_stats) for sample in self.sample_stats: - slist = [sample.sample_name, sample.num_reads, sample.mean_read_len, sample.std_read_len, sample.i7_reversed, sample.i5_reversed, sample.ont_barcode] - json_out["sample_stats"].append(dict(zip(getattr(SampleStat, "header"),slist))) + slist = [ + 
sample.sample_name, + sample.num_reads, + sample.mean_read_len, + sample.std_read_len, + sample.i7_reversed, + sample.i5_reversed, + sample.ont_barcode, + ] + json_out["sample_stats"].append( + dict(zip(getattr(SampleStat, "header"), slist)) + ) for key, unmatch in self.unmatched_stats.items(): for idx, mnum in unmatch: - json_out["undetermined"].append(dict(zip(getattr(Report, "unmatch_header"),[idx, mnum, key[0]]))) - with open(os.path.join(outdir,"anglerfish_stats.json"), "w") as f: - f.write(json.dumps(json_out,indent=2, sort_keys=True)) + json_out["undetermined"].append( + dict(zip(getattr(Report, "unmatch_header"), [idx, mnum, key[0]])) + ) + with open(os.path.join(outdir, "anglerfish_stats.json"), "w") as f: + f.write(json.dumps(json_out, indent=2, sort_keys=True)) - def write_dataframe(self,outdir,samplesheet): + def write_dataframe(self, outdir, samplesheet): """Write a dataframe of the stats to a csv file. - TODO: This needs be cleaned up and made more robust. Especially lock in / decouple from upstream the header names and order: - sample_name, num_reads, mean_read_len, std_read_len, i7_reversed, i5_reversed, ont_barcode, adaptor_name, i7_index, i5_index + TODO: This needs be cleaned up and made more robust. 
Especially lock in / decouple from upstream the header names and order: + sample_name, num_reads, mean_read_len, std_read_len, i7_reversed, i5_reversed, ont_barcode, adaptor_name, i7_index, i5_index """ out_list = [] for sample in self.sample_stats: @@ -85,7 +103,7 @@ def write_dataframe(self,outdir,samplesheet): un["i7_index"] = i7i5[0] un["i5_index"] = i7i5[1] out_list.append(un) - with open(os.path.join(outdir,"anglerfish_dataframe.csv"), "w") as f: + with open(os.path.join(outdir, "anglerfish_dataframe.csv"), "w") as f: out_header = out_list[0].keys() f.write(",".join(out_header)) f.write("\n") @@ -93,25 +111,39 @@ def write_dataframe(self,outdir,samplesheet): f.write(",".join([str(out[i]) for i in out_header])) f.write("\n") -class AlignmentStat(object): - def __init__(self, adaptor_name): - self.adaptor_name = adaptor_name - self.paf_stats = {} +class AlignmentStat(object): + def __init__(self, adaptor_name): + self.adaptor_name = adaptor_name + self.paf_stats = {} def compute_pafstats(self, num_fq, fragments, singletons, concats, unknowns): - total = len(fragments)+len(singletons)+len(concats)+len(unknowns) - self.paf_stats["input_reads"] = [num_fq , 1.0] - self.paf_stats["reads aligning to adaptor sequences"] = [total, total/float(num_fq)] - self.paf_stats["aligned reads matching both I7 and I5 adaptor"] = [len(fragments), len(fragments)/float(total)] - self.paf_stats["aligned reads matching only I7 or I5 adaptor"] = [len(singletons), len(singletons)/float(total)] - self.paf_stats["aligned reads matching multiple I7/I5 adaptor pairs"] = [len(concats), len(concats)/float(total)] - self.paf_stats["aligned reads with uncategorized alignments"] = [len(unknowns), len(unknowns)/float(total)] + total = len(fragments) + len(singletons) + len(concats) + len(unknowns) + self.paf_stats["input_reads"] = [num_fq, 1.0] + self.paf_stats["reads aligning to adaptor sequences"] = [ + total, + total / float(num_fq), + ] + self.paf_stats["aligned reads matching both I7 and I5 
adaptor"] = [ + len(fragments), + len(fragments) / float(total), + ] + self.paf_stats["aligned reads matching only I7 or I5 adaptor"] = [ + len(singletons), + len(singletons) / float(total), + ] + self.paf_stats["aligned reads matching multiple I7/I5 adaptor pairs"] = [ + len(concats), + len(concats) / float(total), + ] + self.paf_stats["aligned reads with uncategorized alignments"] = [ + len(unknowns), + len(unknowns) / float(total), + ] @dataclass class SampleStat: - sample_name: str num_reads: int mean_read_len: float @@ -119,13 +151,12 @@ class SampleStat: i7_reversed: bool i5_reversed: bool ont_barcode: str = None - header: ClassVar[list] = ["sample_name", - "#reads", # We specify this for historical reasons - "mean_read_len", - "std_read_len", - "i7_reversed", - "i5_reversed", - "ont_barcode"] - - - + header: ClassVar[list] = [ + "sample_name", + "#reads", # We specify this for historical reasons + "mean_read_len", + "std_read_len", + "i7_reversed", + "i5_reversed", + "ont_barcode", + ] diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index 6bb117d..12f6dae 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -13,9 +13,9 @@ adaptors = yaml.safe_load(stream) delim = "-NNN-" + @dataclass class SampleSheetEntry: - sample_name: str adaptor: object fastq: str @@ -23,9 +23,7 @@ class SampleSheetEntry: class Adaptor(object): - def __init__(self, adaptor, i7_index=None, i5_index=None): - self.i5 = adaptors[adaptor]["i5"] self.i7 = adaptors[adaptor]["i7"] self.i5_index = i5_index @@ -39,21 +37,19 @@ def __init__(self, adaptor, i7_index=None, i5_index=None): def get_i5_mask(self): if delim in self.i5: - return self.i5.replace(delim, "N"*len(self.i5_index)) + return self.i5.replace(delim, "N" * len(self.i5_index)) else: return self.i5 def get_i7_mask(self): if delim in self.i7: - return self.i7.replace(delim, "N"*len(self.i7_index)) + return self.i7.replace(delim, "N" * len(self.i7_index)) else: return 
self.i7 class SampleSheet(object): - def __init__(self, input_csv, ont_bc): - # Read samplesheet in format: # sample_name, adaptors, i7_index(-i5_index), fastq_path # If we are demuxing a run with ONT barcodes, we will have to assume fastq files are located in "barcode##" folders @@ -61,54 +57,70 @@ def __init__(self, input_csv, ont_bc): self.samplesheet = [] try: csvfile = open(input_csv, "r") - dialect = csv.Sniffer().sniff(csvfile.readline(), [',',';','\t']) + dialect = csv.Sniffer().sniff(csvfile.readline(), [",", ";", "\t"]) csvfile.seek(0) - data = csv.DictReader(csvfile, - fieldnames=['sample_name', 'adaptors', 'index', 'fastq_path'], dialect=dialect) + data = csv.DictReader( + csvfile, + fieldnames=["sample_name", "adaptors", "index", "fastq_path"], + dialect=dialect, + ) rn = 1 test_globs = {} for row in data: - if row['adaptors'] not in adaptors: - raise UserWarning(f"'{row['adaptors']}' not in the list of valid adaptors: {adaptors.keys()}") + if row["adaptors"] not in adaptors: + raise UserWarning( + f"'{row['adaptors']}' not in the list of valid adaptors: {adaptors.keys()}" + ) i7i5 = row["index"].split("-") - i7 = i7i5[0]; i5 = None + i7 = i7i5[0] + i5 = None if len(i7i5) > 1: i5 = i7i5[1] - sample_name = row['sample_name'] - test_globs[row['fastq_path']] = glob.glob(row['fastq_path']) + sample_name = row["sample_name"] + test_globs[row["fastq_path"]] = glob.glob(row["fastq_path"]) bc_re = re.compile("\/(barcode\d\d|unclassified)\/") ont_barcode = None if ont_bc: - ob = re.findall(bc_re, row['fastq_path']) - assert len(ob) > 0 and len(ob[0][-1]) > 0, "ONT barcode not found in fastq path. In ONT barcode mode (-n), fastq files must be located in barcode## folders" + ob = re.findall(bc_re, row["fastq_path"]) + assert ( + len(ob) > 0 and len(ob[0][-1]) > 0 + ), "ONT barcode not found in fastq path. 
In ONT barcode mode (-n), fastq files must be located in barcode## folders" ont_barcode = ob[0] - ss_entry = SampleSheetEntry(sample_name, Adaptor(row['adaptors'], i7, i5),row['fastq_path'], ont_barcode) + ss_entry = SampleSheetEntry( + sample_name, + Adaptor(row["adaptors"], i7, i5), + row["fastq_path"], + ont_barcode, + ) self.samplesheet.append(ss_entry) rn += 1 # Explanation: Don't mess around with the globs too much. Don't refer to the same file twice but using globs, # e.g, ./input.fastq and ./[i]nput.fastq - for a,b in combinations(test_globs.values(), 2): + for a, b in combinations(test_globs.values(), 2): if len(set(a) & set(b)) > 0: - raise UserWarning(f"Fastq paths are inconsistent. Please check samplesheet") + raise UserWarning( + f"Fastq paths are inconsistent. Please check samplesheet" + ) if not ont_bc and len(set([v[0] for v in test_globs.values()])) > 1: - raise UserWarning("""Found several different fastq files in samplesheet. Please carefully check any glob patterns. + raise UserWarning( + """Found several different fastq files in samplesheet. Please carefully check any glob patterns. If you are using ONT barcodes, please specify the --ont_barcodes flag. 
Or if you are trying to input several - sets of fastqs into anglerfish, please run anglerfish separately for each set.""") + sets of fastqs into anglerfish, please run anglerfish separately for each set.""" + ) except: raise finally: csvfile.close() - def minimum_bc_distance(self): - """ Compute the minimum edit distance between all barcodes in samplesheet, or within each ONT barcode group """ + """Compute the minimum edit distance between all barcodes in samplesheet, or within each ONT barcode group""" ss_by_bc = {} testset = {} @@ -122,18 +134,18 @@ def minimum_bc_distance(self): testset[ont_barcode] = [] for adaptor in adaptors: if adaptor.i5_index is not None: - testset[ont_barcode].append(adaptor.i5_index+adaptor.i7_index) + testset[ont_barcode].append(adaptor.i5_index + adaptor.i7_index) else: testset[ont_barcode].append(adaptor.i7_index) - fq_distances=[] + fq_distances = [] for ont_barcode, adaptors in testset.items(): distances = [] if len(adaptors) == 1: distances = [len(adaptors[0])] else: for a, b in [i for i in combinations(adaptors, 2)]: - dist = lev.distance(a,b) + dist = lev.distance(a, b) assert dist > 0, f"""There is one or more identical barcodes in the input samplesheet. First one found: {a}. 
If these exist in different ONT barcodes, please specify the --ont_barcodes flag.""" distances.append(dist) @@ -141,22 +153,22 @@ def minimum_bc_distance(self): return min(fq_distances) def get_fastastring(self, adaptor_name=None): - fastas = {} for entry in self.samplesheet: if entry.adaptor.name == adaptor_name or adaptor_name is None: - fastas[entry.adaptor.name+"_i7"] = entry.adaptor.get_i7_mask() - fastas[entry.adaptor.name+"_i5"] = entry.adaptor.get_i5_mask() + fastas[entry.adaptor.name + "_i7"] = entry.adaptor.get_i7_mask() + fastas[entry.adaptor.name + "_i5"] = entry.adaptor.get_i5_mask() assert len(fastas) > 0 outstr = "" for key, seq in fastas.items(): - outstr += ">{}\n{}\n".format(key,seq) + outstr += ">{}\n{}\n".format(key, seq) return outstr def __iter__(self): return iter(self.samplesheet) + def __next__(self): pass diff --git a/setup.py b/setup.py index 72b5591..2c8751a 100644 --- a/setup.py +++ b/setup.py @@ -12,29 +12,30 @@ """ from setuptools import setup, find_packages from pathlib import Path + this_directory = Path(__file__).parent long_description = (this_directory / "README.md").read_text() -version='0.6.0' +version = "0.6.0" setup( - name='bio-anglerfish', + name="bio-anglerfish", version=version, - description='Anglerfish, a tool to demultiplex Illumina libraries from ONT data', + description="Anglerfish, a tool to demultiplex Illumina libraries from ONT data", long_description=long_description, - long_description_content_type='text/markdown', - author='Remi-Andre Olsen', - author_email='remi-andre.olsen@scilifelab.se', - url='https://github.com/remiolsen/anglerfish', - license='MIT', + long_description_content_type="text/markdown", + author="Remi-Andre Olsen", + author_email="remi-andre.olsen@scilifelab.se", + url="https://github.com/remiolsen/anglerfish", + license="MIT", python_requires=">=3.7", - packages = find_packages(), - package_data = {"":["config/adaptors.yaml"]}, + packages=find_packages(), + package_data={"": 
["config/adaptors.yaml"]}, install_requires=[ - 'python-levenshtein==0.23.0', - 'biopython==1.79', - 'numpy==1.22.0', - 'pyyaml==6.0' + "python-levenshtein==0.23.0", + "biopython==1.79", + "numpy==1.22.0", + "pyyaml==6.0", ], entry_points={ "console_scripts": [ @@ -43,16 +44,16 @@ }, zip_safe=False, classifiers=[ - "Development Status :: 5 - Production/Stable", - "Environment :: Console", - "Intended Audience :: Developers", - "Intended Audience :: Healthcare Industry", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Operating System :: POSIX :: Linux", - "Programming Language :: Python", + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Healthcare Industry", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python", "Topic :: Scientific/Engineering", - "Topic :: Scientific/Engineering :: Medical Science Apps.", - "Topic :: Scientific/Engineering :: Bio-Informatics" - ] + "Topic :: Scientific/Engineering :: Medical Science Apps.", + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], ) From ab6f0acffccfc711357faea5b4c79101b3a63023 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 13:47:45 +0100 Subject: [PATCH 03/15] add file to supress blame and supress last commit --- .git-blame-ignore-revs | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..732882a --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# 240110, non-invasive, repo-wide formatting with ruff and prettier +7634a3bd7b150f552f064e7089b4ab160b6f4564 From 1d308fb21d4c4d07f1d6a97d0e85d9495db80557 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 13:49:30 +0100 Subject: [PATCH 04/15] ruff check autofix --- 
anglerfish/anglerfish.py | 22 ++++++++++++---------- anglerfish/demux/demux.py | 15 ++++++++------- anglerfish/demux/report.py | 10 +++++----- anglerfish/demux/samplesheet.py | 18 +++++++++--------- setup.py | 3 ++- 5 files changed, 36 insertions(+), 32 deletions(-) diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py index 551536f..fb92667 100755 --- a/anglerfish/anglerfish.py +++ b/anglerfish/anglerfish.py @@ -1,24 +1,26 @@ #!/usr/bin/env python import argparse -import logging import glob +import gzip +import logging import os -import pkg_resources -import numpy as np import uuid +from collections import Counter from datetime import datetime as dt from itertools import groupby -from collections import Counter + +import numpy as np +import pkg_resources + from .demux.demux import ( - run_minimap2, - parse_paf_lines, - layout_matches, cluster_matches, + layout_matches, + parse_paf_lines, + run_minimap2, write_demuxedfastq, ) +from .demux.report import AlignmentStat, Report, SampleStat from .demux.samplesheet import SampleSheet -from .demux.report import Report, SampleStat, AlignmentStat -import gzip logging.basicConfig(level=logging.INFO) log = logging.getLogger("anglerfish") @@ -132,7 +134,7 @@ def run_demux(args): == sorted([i[2] for i in flipped.values()])[-2] ): log.info( - f"Could not find any barcode reverse complements with unambiguously more matches" + "Could not find any barcode reverse complements with unambiguously more matches" ) elif flipped[best_flip][2] > len(matches) * args.lenient_factor: log.info( diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index b55bdac..840b8f0 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -1,11 +1,12 @@ import glob -import re +import io import logging -import Levenshtein as lev +import re import subprocess -import io -from Bio.SeqIO.QualityIO import FastqGeneralIterator + +import Levenshtein as lev from Bio.Seq import Seq +from Bio.SeqIO.QualityIO import 
FastqGeneralIterator logging.basicConfig(level=logging.INFO) log = logging.getLogger("demux") @@ -55,7 +56,7 @@ def parse_paf_lines(paf, min_qual=10): Returns a dict with the import values for later use """ entries = {} - with open(paf, "r") as paf: + with open(paf) as paf: for paf_line in paf: aln = paf_line.split() try: @@ -228,7 +229,7 @@ def write_demuxedfastq(beds, fastq_in, fastq_out): for bed in beds[new_title[0]]: new_title[0] += "_" + bed[3] outfqs += "@{}\n".format(" ".join(new_title)) - outfqs += "{}\n".format(seq[bed[1] : bed[2]]) + outfqs += f"{seq[bed[1] : bed[2]]}\n" outfqs += "+\n" - outfqs += "{}\n".format(qual[bed[1] : bed[2]]) + outfqs += f"{qual[bed[1] : bed[2]]}\n" oz.stdin.write(outfqs.encode("utf-8")) diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py index 2ac3e89..52ff8a7 100644 --- a/anglerfish/demux/report.py +++ b/anglerfish/demux/report.py @@ -1,10 +1,10 @@ -import os import json -from dataclasses import dataclass, asdict +import os +from dataclasses import asdict, dataclass from typing import ClassVar -class Report(object): +class Report: unmatch_header = ["index", "num_reads", "ont_barcode"] def __init__(self, run_name, uuid, version): @@ -42,7 +42,7 @@ def write_report(self, outdir): f.write(f"\n{chr(9).join(uhead)}\n") # chr(9) = tab for key, unmatch in self.unmatched_stats.items(): for idx, mnum in unmatch: - f.write("{}\t{}\t{}\n".format(idx, mnum, key[0])) + f.write(f"{idx}\t{mnum}\t{key[0]}\n") def write_json(self, outdir): json_out = { @@ -112,7 +112,7 @@ def write_dataframe(self, outdir, samplesheet): f.write("\n") -class AlignmentStat(object): +class AlignmentStat: def __init__(self, adaptor_name): self.adaptor_name = adaptor_name self.paf_stats = {} diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index 12f6dae..35713a8 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -1,15 +1,15 @@ import csv -import Levenshtein as lev import glob +import 
importlib.resources import re from dataclasses import dataclass from itertools import combinations -import yaml -import importlib.resources +import Levenshtein as lev +import yaml p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml") -with open(p, "r") as stream: +with open(p) as stream: adaptors = yaml.safe_load(stream) delim = "-NNN-" @@ -22,7 +22,7 @@ class SampleSheetEntry: ont_barcode: str -class Adaptor(object): +class Adaptor: def __init__(self, adaptor, i7_index=None, i5_index=None): self.i5 = adaptors[adaptor]["i5"] self.i7 = adaptors[adaptor]["i7"] @@ -48,7 +48,7 @@ def get_i7_mask(self): return self.i7 -class SampleSheet(object): +class SampleSheet: def __init__(self, input_csv, ont_bc): # Read samplesheet in format: # sample_name, adaptors, i7_index(-i5_index), fastq_path @@ -56,7 +56,7 @@ def __init__(self, input_csv, ont_bc): self.samplesheet = [] try: - csvfile = open(input_csv, "r") + csvfile = open(input_csv) dialect = csv.Sniffer().sniff(csvfile.readline(), [",", ";", "\t"]) csvfile.seek(0) data = csv.DictReader( @@ -104,7 +104,7 @@ def __init__(self, input_csv, ont_bc): for a, b in combinations(test_globs.values(), 2): if len(set(a) & set(b)) > 0: raise UserWarning( - f"Fastq paths are inconsistent. Please check samplesheet" + "Fastq paths are inconsistent. 
Please check samplesheet" ) if not ont_bc and len(set([v[0] for v in test_globs.values()])) > 1: @@ -163,7 +163,7 @@ def get_fastastring(self, adaptor_name=None): outstr = "" for key, seq in fastas.items(): - outstr += ">{}\n{}\n".format(key, seq) + outstr += f">{key}\n{seq}\n" return outstr diff --git a/setup.py b/setup.py index 2c8751a..6408fd5 100644 --- a/setup.py +++ b/setup.py @@ -10,9 +10,10 @@ conda install -c bioconda anglerfish """ -from setuptools import setup, find_packages from pathlib import Path +from setuptools import find_packages, setup + this_directory = Path(__file__).parent long_description = (this_directory / "README.md").read_text() From e7cd2a323ac25605ffa9ef9fe2eafbd2409cae0e Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 13:50:12 +0100 Subject: [PATCH 05/15] supress blame of last commit --- .git-blame-ignore-revs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 732882a..9556a97 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -1,2 +1,5 @@ # 240110, non-invasive, repo-wide formatting with ruff and prettier 7634a3bd7b150f552f064e7089b4ab160b6f4564 + +# 240110, safe ruff-fixes +1d308fb21d4c4d07f1d6a97d0e85d9495db80557 \ No newline at end of file From 6a4357b716921c2bcc2ad7628cb387741dde752d Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 13:59:16 +0100 Subject: [PATCH 06/15] Apply unsafe fixes, allow ambiguous variable names --- anglerfish/anglerfish.py | 6 +++--- anglerfish/demux/demux.py | 2 +- pyproject.toml | 4 ++++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/anglerfish/anglerfish.py b/anglerfish/anglerfish.py index fb92667..c79051f 100755 --- a/anglerfish/anglerfish.py +++ b/anglerfish/anglerfish.py @@ -37,7 +37,7 @@ def run_demux(args): log.info(f" arguments {vars(args)}") log.info(f" run uuid {run_uuid}") bc_dist = ss.minimum_bc_distance() - if args.max_distance == None: + if args.max_distance is None: if 
bc_dist > 1: args.max_distance = 2 else: @@ -75,7 +75,7 @@ def run_demux(args): with open(adaptor_path, "w") as f: f.write(ss.get_fastastring(adaptor_name)) for fq in fastq_files: - retcode = run_minimap2(fq, adaptor_path, aln_path, args.threads) + run_minimap2(fq, adaptor_path, aln_path, args.threads) # Easy line count in input fastq files num_fq = 0 @@ -181,7 +181,7 @@ def run_demux(args): # Top unmatched indexes nomatch_count = Counter([x[3] for x in no_matches]) - if args.max_unknowns == None: + if args.max_unknowns is None: args.max_unknowns = len([sample for sample in ss]) + 10 report.add_unmatched_stat( nomatch_count.most_common(args.max_unknowns), ont_barcode, adaptor_name diff --git a/anglerfish/demux/demux.py b/anglerfish/demux/demux.py index 840b8f0..4d2ecb9 100644 --- a/anglerfish/demux/demux.py +++ b/anglerfish/demux/demux.py @@ -47,7 +47,7 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads): ] with open(output_paf, "ab") as ofile: - proc = subprocess.run(cmd, stdout=ofile, check=True) + subprocess.run(cmd, stdout=ofile, check=True) def parse_paf_lines(paf, min_qual=10): diff --git a/pyproject.toml b/pyproject.toml index b3bc5ba..8ba7254 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,10 @@ select =[ "UP", # pyupgrade Make sure syntax is up-to-date ] +ignore = [ + "E741", # ambiguous variable name +] + [tool.mypy] ignore_missing_imports = true follow_imports = 'skip' From 4aafe91c49709a9f11634e0d38345bbf77913824 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 14:12:30 +0100 Subject: [PATCH 07/15] manual fixes + update gha config --- .github/workflows/lint-code.yml | 36 --------------------------------- anglerfish/__main__.py | 2 +- anglerfish/demux/report.py | 2 +- anglerfish/demux/samplesheet.py | 2 ++ 4 files changed, 4 insertions(+), 38 deletions(-) diff --git a/.github/workflows/lint-code.yml b/.github/workflows/lint-code.yml index c73256e..572ff68 100644 --- a/.github/workflows/lint-code.yml +++ 
b/.github/workflows/lint-code.yml @@ -63,42 +63,6 @@ jobs: # Configured in pyprojet.toml run: mypy **/*.py - # Use pipreqs to check for missing dependencies - pipreqs-check: - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - - name: Install pipreqs - run: pip install pipreqs - - - name: Install requirements - run: pip install -r requirements.txt - - - name: Run pipreqs - run: pipreqs --savepath pipreqs.txt - - - name: Compare requirements - run: | - # Extract and sort package names - awk '{print $1}' $1 | sort -u > "$1".compare - awk -F'==' '{print $1}' $2 | sort -u > "$2".compare - - # Compare package lists - if cmp -s "$1".compare "$2".compare - then - echo "Requirements are the same" - exit 0 - else - echo "Requirements are different" - exit 1 - fi - # Use Prettier to check various file formats prettier: runs-on: ubuntu-latest diff --git a/anglerfish/__main__.py b/anglerfish/__main__.py index c11e37d..72a9288 100644 --- a/anglerfish/__main__.py +++ b/anglerfish/__main__.py @@ -1,4 +1,4 @@ -from . 
import anglerfish +from .anglerfish import anglerfish if __name__ == "__main__": anglerfish() diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py index 52ff8a7..cb70cce 100644 --- a/anglerfish/demux/report.py +++ b/anglerfish/demux/report.py @@ -150,7 +150,7 @@ class SampleStat: std_read_len: float i7_reversed: bool i5_reversed: bool - ont_barcode: str = None + ont_barcode: str|None = None header: ClassVar[list] = [ "sample_name", "#reads", # We specify this for historical reasons diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index 35713a8..ed0c2d6 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -1,6 +1,7 @@ import csv import glob import importlib.resources +import os import re from dataclasses import dataclass from itertools import combinations @@ -9,6 +10,7 @@ import yaml p = importlib.resources.files("anglerfish.config").joinpath("adaptors.yaml") +assert isinstance(p, os.PathLike) with open(p) as stream: adaptors = yaml.safe_load(stream) delim = "-NNN-" From 6bd5cdc8907f9a8b9c723146c0c891e04573b307 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 14:14:16 +0100 Subject: [PATCH 08/15] update gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index b2df197..3aeabc2 100755 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ *~ *.egg-info .DS_Store +.benchmarks +.*_cache \ No newline at end of file From eed9f879a8398a0a5033be2afe96627c3e975a45 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 15:50:23 +0100 Subject: [PATCH 09/15] document dev requirements and tweak formatting --- anglerfish/demux/report.py | 2 +- requirements-dev-conda.txt | 2 ++ requirements-dev-pip.txt | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 requirements-dev-conda.txt create mode 100644 requirements-dev-pip.txt diff --git a/anglerfish/demux/report.py b/anglerfish/demux/report.py index cb70cce..03dba2c 
100644 --- a/anglerfish/demux/report.py +++ b/anglerfish/demux/report.py @@ -150,7 +150,7 @@ class SampleStat: std_read_len: float i7_reversed: bool i5_reversed: bool - ont_barcode: str|None = None + ont_barcode: str | None = None header: ClassVar[list] = [ "sample_name", "#reads", # We specify this for historical reasons diff --git a/requirements-dev-conda.txt b/requirements-dev-conda.txt new file mode 100644 index 0000000..1a04d6a --- /dev/null +++ b/requirements-dev-conda.txt @@ -0,0 +1,2 @@ +conda-forge::pre-commit +conda-forge::prettier diff --git a/requirements-dev-pip.txt b/requirements-dev-pip.txt new file mode 100644 index 0000000..faed137 --- /dev/null +++ b/requirements-dev-pip.txt @@ -0,0 +1,3 @@ +ruff +mypy +editorconfig-checker From 4606cf2c911d81ce517de4138514bc5dc2aeb794 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 16:06:46 +0100 Subject: [PATCH 10/15] supplement readme --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 2a4867d..2d1304a 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,13 @@ conda env create -f environment.yml pip install -e . 
``` +### Developer tools + +``` +conda install --file requirements-dev-conda.txt +pip install -r requirements-dev-pip.txt +``` + ### Development version ``` From 4837509902166076b1c7b258132e0bc0339f5c0e Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 16:07:31 +0100 Subject: [PATCH 11/15] add pre-commit cache dir to gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 3aeabc2..0771c71 100755 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,5 @@ *.egg-info .DS_Store .benchmarks -.*_cache \ No newline at end of file +.*_cache +node_modules From b950721cbf1ff82e7289d8a72789496435eb6bc5 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 16:25:38 +0100 Subject: [PATCH 12/15] Update readme, make .vscode settings optional for contributors by including it in .gitignore --- .gitignore | 1 + README.md | 32 +++++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 0771c71..86637f5 100755 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ .benchmarks .*_cache node_modules +.vscode \ No newline at end of file diff --git a/README.md b/README.md index 2d1304a..9093cb1 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,17 @@ pip install bio-anglerfish conda install -c bioconda anglerfish ``` -### Manually with Conda +### Install development version -First [install miniconda](https://docs.conda.io/en/latest/miniconda.html), then: +``` +pip install --upgrade --force-reinstall git+https://github.com/remiolsen/anglerfish.git +``` + +## Source development + +1. [Install miniconda](https://docs.conda.io/en/latest/miniconda.html). + +2. Set up repo clone with editable install ``` git clone https://github.com/remiolsen/anglerfish.git cd anglerfish # Create a the anglerfish conda environment conda env create -f environment.yml # Install anglerfish +conda activate anglerfish pip install -e .
``` -### Developer tools +3. Install developer tools ``` conda install --file requirements-dev-conda.txt pip install -r requirements-dev-pip.txt ``` -### Development version +4. (Optional) Install pre-commit to prevent committing code that will fail linting ``` -pip install --upgrade --force-reinstall git+https://github.com/remiolsen/anglerfish.git +pre-commit install +``` + +5. (Optional) Enable automatic formatting in VS Code by creating `.vscode/settings.json` with: + +``` +{ + "editor.formatOnSave": true, + "editor.defaultFormatter": "esbenp.prettier-vscode", + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff" + }, + "prettier.configPath": "./pyproject.toml" +} ``` ## Usage From 22d116d52c980288b653946d40c3ae8fc5e365d1 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 16:27:45 +0100 Subject: [PATCH 13/15] newline --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 86637f5..9c7393b 100755 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,4 @@ .benchmarks .*_cache node_modules -.vscode \ No newline at end of file +.vscode From 2547706d40c4b928322112b460a42613fb971b1f Mon Sep 17 00:00:00 2001 From: kedhammar Date: Wed, 10 Jan 2024 16:30:05 +0100 Subject: [PATCH 14/15] remove from index --- .vscode/settings.json | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 6e4306d..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "editor.formatOnSave": true, - "editor.defaultFormatter": "esbenp.prettier-vscode", - "[python]": { - "editor.defaultFormatter": "charliermarsh.ruff" - }, - "prettier.configPath": "./pyproject.toml" -} From 9115460372dc461346607a0ef12ce4f210cfe9e5 Mon Sep 17 00:00:00 2001 From: kedhammar Date: Thu, 11 Jan 2024 11:53:25 +0100 Subject: [PATCH 15/15] ruff --- setup.py | 24 ++++++++++++------------ 1 file changed, 12 
insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index d5a7814..c8636a1 100644 --- a/setup.py +++ b/setup.py @@ -17,26 +17,26 @@ this_directory = Path(__file__).parent long_description = (this_directory / "README.md").read_text() -version='0.6.1-dev' +version = "0.6.1-dev" setup( name="bio-anglerfish", version=version, description="Anglerfish, a tool to demultiplex Illumina libraries from ONT data", long_description=long_description, - long_description_content_type='text/markdown', - author='Remi-Andre Olsen', - author_email='remi-andre.olsen@scilifelab.se', - url='https://github.com/remiolsen/anglerfish', - license='MIT', + long_description_content_type="text/markdown", + author="Remi-Andre Olsen", + author_email="remi-andre.olsen@scilifelab.se", + url="https://github.com/remiolsen/anglerfish", + license="MIT", python_requires=">=3.10", - packages = find_packages(), - package_data = {"":["config/adaptors.yaml"]}, + packages=find_packages(), + package_data={"": ["config/adaptors.yaml"]}, install_requires=[ - 'python-levenshtein==0.23.0', - 'biopython==1.79', - 'numpy>=1.22.0', - 'pyyaml==6.0' + "python-levenshtein==0.23.0", + "biopython==1.79", + "numpy>=1.22.0", + "pyyaml==6.0", ], entry_points={ "console_scripts": [