Skip to content

feat: amplicon utils #2725

feat: amplicon utils

feat: amplicon utils #2725

Workflow file for this run

# Workflow "Tests": triggered by pushes to master and by pull requests whose
# base branch matches the `*` pattern. The jobs are listed under `jobs:`.
name: Tests
on:
  push:
    branches: [master]
  pull_request:
    branches: ["*"]
jobs:
# Cancel-previous-jobs: single-step job that hands the workflow file name
# "main.yml" and the repository token to an external action. It is skipped
# when the run's ref is the master branch.
Cancel-previous-jobs:
  if: github.ref != 'refs/heads/master'
  runs-on: ubuntu-latest
  steps:
    # NOTE(review): the `uses:` value below contains the placeholder text
    # "[email protected]" instead of a valid `owner/repo@ref`; it looks like the
    # reference was mangled in transit. Restore the real action name and
    # version before relying on this job.
    - uses: khan/[email protected]
      env:
        GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
      with:
        workflows: "main.yml"
# Formatting: checks out the repository and runs the super-linter action with
# the Markdown, Python Black and Snakemake snakefmt validators switched on.
Formatting:
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    # NOTE(review): the `uses:` value below contains the placeholder text
    # "[email protected]" instead of a valid `owner/repo@ref`; restore the real
    # action name and version.
    - name: Formatting
      uses: super-linter/[email protected]
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        DEFAULT_BRANCH: master
        VALIDATE_ALL_CODEBASE: false
        # CHANGELOG.md is excluded from linting.
        FILTER_REGEX_EXCLUDE: CHANGELOG.md
        VALIDATE_MARKDOWN: true
        VALIDATE_PYTHON_BLACK: true
        VALIDATE_SNAKEMAKE_SNAKEFMT: true
# Linting: checks out the repository and invokes the Snakemake action with
# `--lint` on workflow/Snakefile from the repository root. The `stagein`
# command installs peppy into the `snakemake` environment beforehand.
Linting:
  runs-on: ubuntu-latest
  env:
    GISAID_API_TOKEN: ${{ secrets.GISAID_API_TOKEN }}
  steps:
    - uses: actions/checkout@v4
    # NOTE(review): the `uses:` value below contains the placeholder text
    # "[email protected]" instead of a valid `owner/repo@ref`; restore the real
    # action name and version.
    - name: Lint workflow
      uses: snakemake/[email protected]
      with:
        snakefile: workflow/Snakefile
        directory: .
        args: "--lint"
        stagein: mamba install -n snakemake -c conda-forge peppy
# pre-commit action currently fails:
# https://github.com/IKIM-Essen/uncovar/actions/runs/4304753941/jobs/7506225198#step:4:115
# revisit when new pre-commit release >3.0.0 is out
# Pre-Commit:
# runs-on: ubuntu-latest
# if: github.ref != 'refs/heads/master'
# steps:
# - uses: actions/checkout@v4
# - uses: actions/setup-python@v5
# - uses: pre-commit/action@v3.0.0
# Technology-Tests: once Formatting and Linting have passed, runs the Snakemake
# target `all` -- both as a real run and as an `-np` dry run -- for every
# combination of sequencing technology and sequencing method in the matrix.
#
# NOTE(review): every `uses:` value in this job that reads "[email protected]"
# is a mangled placeholder, not a valid `owner/repo@ref`; restore the real
# action names and versions.
Technology-Tests:
  runs-on: ubuntu-latest
  env:
    GISAID_API_TOKEN: ${{ secrets.GISAID_API_TOKEN }}
  needs:
    - Formatting
    - Linting
    #- Pre-Commit
  strategy:
    matrix:
      rule: [all, all -np]
      # to disable the ont tests, swap in the commented-out list below
      technology: [all, illumina, ont, ion]
      # technology: [all, illumina, ion]
      seq_method: [shotgun, amplicon]
  steps:
    - uses: actions/checkout@v4
    - name: Free Disk Space (Ubuntu)
      uses: jlumbroso/[email protected]
      with:
        # this might remove tools that are actually needed,
        # if set to "true" but frees about 6 GB
        tool-cache: false
        # all of these default to true, but feel free to set to
        # "false" if necessary for your workflow
        android: true
        dotnet: true
        haskell: true
        large-packages: true
        docker-images: false
        swap-storage: true
    - uses: actions/setup-python@v5
      with:
        python-version: '3.11'
    # android - will release about 10 GB if you don't need Android
    # dotnet - will release about 20 GB if you don't need .NET
    - name: Free up some disk space
      run: |
        sudo rm -rf /usr/local/lib/android
        sudo rm -rf /usr/share/dotnet
    # The four "Prepare test data" steps below are mutually exclusive: each
    # one is selected by matrix.technology and writes its own samples.csv.
    # NOTE(review): no step with id `test-data` exists in this job and
    # `compare_assemblers` is not part of this job's rule matrix, so those two
    # clauses of the conditions do not appear to change the outcome here.
    - name: Prepare test data for all technologies
      if: steps.test-data.outputs.cache-hit != true && (startsWith(matrix.rule, 'all') && matrix.technology == 'all' || matrix.rule == 'compare_assemblers')
      run: |
        # AMPLICON fills the is_amplicon_data column: 0 for shotgun, 1 otherwise.
        if [[ "${{ matrix.seq_method }}" = "shotgun" ]] ; then export AMPLICON=0; else export AMPLICON=1; fi
        mkdir -p .tests/data
        # NOTE(review): both mates are fetched from B.1.1.7.reads.1.fastq.gz -- confirm this is intended.
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.1.fastq.gz
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.2.fastq.gz
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ERR5745913.fastq.gz > .tests/data/ion_reads.fastq.gz
        echo sample_name,fq1,fq2,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
        echo illumina-test,data/B117.1.fastq.gz,data/B117.2.fastq.gz,2022-01-01,$AMPLICON,illumina >> .tests/config/pep/samples.csv
        echo ion-test,data/ion_reads.fastq.gz,,2022-01-01,$AMPLICON,ion >> .tests/config/pep/samples.csv
    - name: Prepare test data for Illumina
      if: steps.test-data.outputs.cache-hit != true && (startsWith(matrix.rule, 'all') && matrix.technology == 'illumina' || matrix.rule == 'compare_assemblers')
      run: |
        if [[ "${{ matrix.seq_method }}" = "shotgun" ]] ; then export AMPLICON=0; else export AMPLICON=1; fi
        mkdir -p .tests/data
        # NOTE(review): both mates are fetched from B.1.1.7.reads.1.fastq.gz -- confirm this is intended.
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.1.fastq.gz
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.2.fastq.gz
        echo sample_name,fq1,fq2,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
        echo illumina-test,data/B117.1.fastq.gz,data/B117.2.fastq.gz,2022-01-01,$AMPLICON,illumina >> .tests/config/pep/samples.csv
    - name: Prepare test data for Oxford Nanopore
      if: steps.test-data.outputs.cache-hit != true && (startsWith(matrix.rule, 'all') && matrix.technology == 'ont' || matrix.rule == 'compare_assemblers')
      run: |
        if [[ "${{ matrix.seq_method }}" = "shotgun" ]] ; then export AMPLICON=0; else export AMPLICON=1; fi
        mkdir -p .tests/data
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ont_reads.fastq.gz > .tests/data/ont_reads.fastq.gz
        echo sample_name,fq1,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
        echo ont-test,data/ont_reads.fastq.gz,2022-01-01,$AMPLICON,ont >> .tests/config/pep/samples.csv
    - name: Prepare test data for Ion Torrent
      if: steps.test-data.outputs.cache-hit != true && (startsWith(matrix.rule, 'all') && matrix.technology == 'ion' || matrix.rule == 'compare_assemblers')
      run: |
        if [[ "${{ matrix.seq_method }}" = "shotgun" ]] ; then export AMPLICON=0; else export AMPLICON=1; fi
        mkdir -p .tests/data
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ERR5745913.fastq.gz > .tests/data/ion_reads.fastq.gz
        echo sample_name,fq1,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
        echo ion-test,data/ion_reads.fastq.gz,2022-01-01,$AMPLICON,ion >> .tests/config/pep/samples.csv
    # NOTE(review): no step with id `test-resources` exists in this job, so
    # this condition does not appear to ever skip the step.
    - name: Use smaller reference files for testing
      if: steps.test-resources.outputs.cache-hit != true
      run: |
        # mkdir -p .tests/resources/minikraken-8GB
        # curl -SL https://github.com/thomasbtf/small-kraken-db/raw/master/human_k2db.tar.gz | tar zxvf - -C .tests/resources/minikraken-8GB --strip 1
        mkdir -p .tests/resources/genomes
        curl -SL "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=BA000005.3&db=nuccore&report=fasta" | gzip -c > .tests/resources/genomes/human-genome.fna.gz
    # Writes the strain-genome list and fetches the two listed FASTA records
    # from NCBI; `sed '$ d'` drops the last line of each download.
    - name: Simulate GISAID download
      run: |
        mkdir -p .tests/results/benchmarking/tables
        echo -e "resources/genomes/B.1.1.7.fasta\nresources/genomes/B.1.351.fasta" > .tests/results/benchmarking/tables/strain-genomes.txt
        mkdir -p .tests/resources/genomes
        curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=MZ314997.1&rettype=fasta" | sed '$ d' > .tests/resources/genomes/B.1.1.7.fasta
        curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=MZ314998.1&rettype=fasta" | sed '$ d' > .tests/resources/genomes/B.1.351.fasta
    - name: Test rule ${{ matrix.rule }} on ${{ matrix.technology }} ${{ matrix.seq_method }} data
      uses: snakemake/[email protected]
      with:
        directory: .tests
        snakefile: workflow/Snakefile
        args: "-p --use-conda --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba ${{ matrix.rule }}"
    # The report is only built for the real run, not for the `all -np` leg.
    - name: Test report
      uses: snakemake/[email protected]
      if: startsWith(matrix.rule, 'all -np') != true
      with:
        directory: .tests
        snakefile: workflow/Snakefile
        args: "${{ matrix.rule }} --report report.zip"
    # The dry-run matrix value is `all -np`. The previous comparison against
    # 'all -npr' matched no matrix value, so the dry-run leg was never excluded
    # from the two upload steps.
    - name: Upload report
      uses: actions/upload-artifact@v4
      if: matrix.technology == 'all' && matrix.rule != 'all -np'
      with:
        name: report-rule-${{ matrix.rule }}-${{ matrix.technology }}-${{ matrix.seq_method }}
        path: .tests/results/patient-reports/2022-01-01.zip
    # The artifact name includes seq_method so that the shotgun and amplicon
    # legs do not upload under the same name (upload-artifact v4 refuses to
    # upload a second artifact with an existing name in the same run).
    - name: Upload logs
      uses: actions/upload-artifact@v4
      if: matrix.technology == 'all' && matrix.rule != 'all -np'
      with:
        name: log-rule-${{ matrix.rule }}-technology-${{ matrix.technology }}-${{ matrix.seq_method }}
        path: .tests/logs/
    - name: Change permissions for caching
      run: sudo chmod -R 755 .tests/.snakemake/conda
    - name: Print disk space
      run: sudo df -h
# Benchmarks-Tests: once Formatting and Linting have passed, runs one
# benchmark rule per matrix entry on a small Illumina test sample and then
# checks the result tables that the rule wrote under .tests/results/benchmarking.
#
# NOTE(review): every `uses:` value in this job that reads "[email protected]"
# is a mangled placeholder, not a valid `owner/repo@ref`; restore the real
# action names and versions.
Benchmarks-Tests:
  runs-on: ubuntu-latest
  env:
    GISAID_API_TOKEN: ${{ secrets.GISAID_API_TOKEN }}
  needs:
    - Formatting
    - Linting
    #- Pre-Commit
  strategy:
    matrix:
      rule:
        [
          benchmark_strain_calling,
          benchmark_assembly,
          benchmark_mixtures,
          benchmark_non_sars_cov_2,
          benchmark_reads,
          compare_assemblers,
        ]
        # generate_test_cases,
  steps:
    - uses: actions/checkout@v4
    - name: Free Disk Space (Ubuntu)
      uses: jlumbroso/[email protected]
      with:
        # this might remove tools that are actually needed,
        # if set to "true" but frees about 6 GB
        tool-cache: false
        # all of these default to true, but feel free to set to
        # "false" if necessary for your workflow
        android: true
        dotnet: true
        haskell: true
        large-packages: true
        docker-images: false
        swap-storage: true
    # - name: Cache conda dependencies
    #   uses: actions/cache@v2
    #   with:
    #     path: |
    #       .tests/.snakemake/conda
    #     key: benchmarks-${{ runner.os }}-${{ matrix.rule }}-${{ matrix.technology }}-${{ matrix.seq_method }}-${{ hashFiles('*.tests/.snakemake/conda/*.yaml') }}
    # TODO caches are currently completely misleading, as they lead to certain files becoming present on disk which might
    # then hide failures that would otherwise be seen.
    # - name: Get date
    #   id: get-date
    #   run: |
    #     echo "::set-output name=date::$(/bin/date -u "+%Y%m%d")"
    #   shell: bash
    # - name: Cache resources
    #   id: test-resources
    #   uses: actions/cache@v2
    #   with:
    #     path: |
    #       .tests/resources/minikraken-8GB
    #       .tests/resources/genomes/human-genome.fna.gz
    #     key: ${{ runner.os }}-test-resources-${{ steps.get-date.outputs.date }}-${{ hashFiles('**.tests/resources**taxo.k2d') }}
    #     restore-keys: |
    #       ${{ runner.os }}-test-resources-${{ steps.get-date.outputs.date }}-
    #       ${{ runner.os }}-test-resources-
    # - name: Cache results
    #   if: startsWith(matrix.rule, 'all')
    #   id: test-results
    #   uses: actions/cache@v2
    #   with:
    #     path: |
    #       .tests/results
    #     key: ${{ runner.os }}-results-${{ steps.get-date.outputs.date }}-${{ hashFiles('**results/2021-02-01/qc/multiqc.html') }}
    #     restore-keys: |
    #       ${{ runner.os }}-results-${{ steps.get-date.outputs.date }}-
    #       ${{ runner.os }}-results-
    # - name: Cache data
    #   if: startsWith(matrix.rule, 'all')
    #   id: test-data
    #   uses: actions/cache@v2
    #   with:
    #     path: |
    #       .tests/data
    #     key: ${{ runner.os }}-test-data-${{ steps.get-date.outputs.date }}-${{ hashFiles('**.tests/data/*.fastq.gz') }}
    #     restore-keys: |
    #       ${{ runner.os }}-test-data-${{ steps.get-date.outputs.date }}-
    #       ${{ runner.os }}-test-data-
    # - name: Cache benchmark data
    #   if: startsWith(matrix.rule, 'all') != true
    #   id: benchmark-data
    #   uses: actions/cache@v2
    #   with:
    #     path: |
    #       .tests/resources/benchmarking
    #     key: ${{ runner.os }}-benchmark-data-${{ steps.get-date.outputs.date }}-${{ hashFiles('**.tests/resources/benchmarking/**/reads.1.fastq.gz') }}
    #     restore-keys: |
    #       ${{ runner.os }}-benchmark-data-${{ steps.get-date.outputs.date }}-
    #       ${{ runner.os }}-benchmark-data-
    # - name: Cache test dependencies
    #   if: startsWith(matrix.rule, 'all')
    #   id: test-dependencies
    #   uses: actions/cache@v2
    #   with:
    #     path: |
    #       .tests/.snakemake/conda
    #     key: ${{ runner.os }}-sars-cov-test-dependencies-${{ steps.get-date.outputs.date }}-${{ hashFiles('*.tests/.snakemake/conda/*.yaml') }}
    #     restore-keys: |
    #       ${{ runner.os }}-sars-cov-test-dependencies-${{ steps.get-date.outputs.date }}-
    #       ${{ runner.os }}-sars-cov-test-dependencies-
    # - name: Cache benchmark dependencies
    #   if: startsWith(matrix.rule, 'all') != true
    #   id: benchmark-dependencies
    #   uses: actions/cache@v2
    #   with:
    #     path: |
    #       .tests/.snakemake/conda
    #     key: ${{ runner.os }}-sars-cov-benchmark-dependencies-${{ steps.get-date.outputs.date }}-${{ hashFiles('*.tests/.snakemake/conda/*.yaml') }}
    #     restore-keys: |
    #       ${{ runner.os }}-sars-cov-benchmark-dependencies-${{ steps.get-date.outputs.date }}-
    #       ${{ runner.os }}-sars-cov-benchmark-dependencies-
    # This variant only applies to the `generate_test_cases` rule, which is
    # currently commented out of the matrix above, so the step is skipped.
    - name: Prepare test data
      if: matrix.rule == 'generate_test_cases'
      run: |
        mkdir -p .tests/data
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.1.fastq.gz
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.2.fastq.gz
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/ont_reads.fastq.gz > .tests/data/ont_reads.fastq.gz
        echo sample_name,fq1,fq2,date,is_amplicon_data,technology,test_case > .tests/config/pep/samples.csv
        echo illumina-test,data/B117.1.fastq.gz,data/B117.2.fastq.gz,2022-01-01,1,illumina,case >> .tests/config/pep/samples.csv
        echo ont-test,data/ont_reads.fastq.gz,,2022-01-01,1,ont,case >> .tests/config/pep/samples.csv
    # Default variant: one Illumina sample with is_amplicon_data set to 0.
    - name: Prepare test data
      if: matrix.rule != 'generate_test_cases'
      run: |
        mkdir -p .tests/data
        # NOTE(review): both mates are fetched from B.1.1.7.reads.1.fastq.gz -- confirm this is intended.
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.1.fastq.gz
        curl -L https://github.com/thomasbtf/small-kraken-db/raw/master/B.1.1.7.reads.1.fastq.gz > .tests/data/B117.2.fastq.gz
        echo sample_name,fq1,fq2,date,is_amplicon_data,technology > .tests/config/pep/samples.csv
        echo illumina-test,data/B117.1.fastq.gz,data/B117.2.fastq.gz,2022-01-01,0,illumina >> .tests/config/pep/samples.csv
    # The `test-resources` step this condition refers to is commented out
    # above, so the condition does not appear to ever skip the step.
    - name: Use smaller reference files for testing
      if: steps.test-resources.outputs.cache-hit != true
      run: |
        # mkdir -p .tests/resources/minikraken-8GB
        # curl -SL https://github.com/thomasbtf/small-kraken-db/raw/master/human_k2db.tar.gz | tar zxvf - -C .tests/resources/minikraken-8GB --strip 1
        mkdir -p .tests/resources/genomes
        curl -SL "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=BA000005.3&db=nuccore&report=fasta" | gzip -c > .tests/resources/genomes/human-genome.fna.gz
    # Writes the strain-genome list and fetches the two listed FASTA records
    # from NCBI; `sed '$ d'` drops the last line of each download.
    - name: Simulate GISAID download
      run: |
        mkdir -p .tests/results/benchmarking/tables
        echo -e "resources/genomes/B.1.1.7.fasta\nresources/genomes/B.1.351.fasta" > .tests/results/benchmarking/tables/strain-genomes.txt
        mkdir -p .tests/resources/genomes
        curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=MZ314997.1&rettype=fasta" | sed '$ d' > .tests/resources/genomes/B.1.1.7.fasta
        curl "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=MZ314998.1&rettype=fasta" | sed '$ d' > .tests/resources/genomes/B.1.351.fasta
    - name: Test rule ${{ matrix.rule }}
      uses: snakemake/[email protected]
      with:
        directory: .tests
        snakefile: workflow/Snakefile
        args: "-p --use-conda --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba ${{ matrix.rule }}"
    - name: Test report
      uses: snakemake/[email protected]
      if: startsWith(matrix.rule, 'all -np') != true
      with:
        directory: .tests
        snakefile: workflow/Snakefile
        args: "${{ matrix.rule }} --report report.zip"
    # - name: Upload report
    #   uses: actions/upload-artifact@v4
    #   with:
    #     name: report-rule-${{ matrix.rule }}
    #     path: .tests/results/patient-reports/2022-01-01.zip
    - name: Upload logs
      uses: actions/upload-artifact@v4
      with:
        name: log-rule-${{ matrix.rule }}
        path: .tests/logs/
    # - name: Unit test
    #   args: "--generate-unit-tests"
    # - name: Test workflow (singularity)
    #   args: "--use-conda --use-singularity --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba"
    # - name: Test input changes
    #   args: "--use-conda --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba -R `snakemake --list-input-changes`"
    # - name: Test code changes
    #   args: "--use-conda --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba -R `snakemake --list-code-changes`"
    # - name: Test params changes
    #   args: "--use-conda --show-failed-logs --cores 2 --resources ncbi_api_requests=1 --conda-cleanup-pkgs cache --conda-frontend mamba -R `snakemake --list-params-changes`"
    - name: Check strain calling benchmark
      if: matrix.rule == 'benchmark_strain_calling'
      run: |
        cat .tests/results/benchmarking/strain-calling.csv
        # Ignore the first line of the table, then fail if any remaining line contains "mismatch".
        if tail -n+2 .tests/results/benchmarking/strain-calling.csv | grep mismatch > /dev/null
        then
          echo "Strain calling failed in some cases (see above)."
          exit 1
        else
          echo "Strain calling was successful in all cases."
        fi
    - name: Check pseudoassembly benchmark
      if: matrix.rule == 'benchmark_assembly'
      run: |
        cat .tests/results/benchmarking/assembly/pseudoassembly.csv
        # `[[ a < b ]]` compares strings lexicographically, so the threshold is
        # checked numerically with awk instead (a non-numeric value counts as 0).
        # NOTE(review): only the last line of the table is inspected -- confirm
        # that this line is the value the threshold is meant to apply to.
        last_value=$(tail -1 .tests/results/benchmarking/assembly/pseudoassembly.csv)
        if awk -v value="$last_value" 'BEGIN { exit !(value + 0 < 0.95) }'
        then
          echo "Pseudoassembly benchmarking failed. There is at least one assembly where the contigs do not cover 95% of the original sequence (see above)."
          exit 1
        else
          echo "Pseudoassembly was successful."
        fi
    - name: Check assembly benchmark
      if: matrix.rule == 'benchmark_assembly'
      run: |
        cat .tests/results/benchmarking/assembly/assembly.csv
        # Numeric comparison via awk; see the pseudoassembly check for the rationale.
        # NOTE(review): only the last line of the table is inspected -- confirm
        # that this line is the value the threshold is meant to apply to.
        last_value=$(tail -1 .tests/results/benchmarking/assembly/assembly.csv)
        if awk -v value="$last_value" 'BEGIN { exit !(value + 0 < 0.8) }'
        then
          echo "Assembly benchmarking failed. There is at least one assembly where the contigs do not cover 80% of the original sequence (see above)."
          exit 1
        else
          echo "Assembly was successful."
        fi
    - name: Print non-sars-cov-2 kallisto calls
      if: matrix.rule == 'benchmark_non_sars_cov_2'
      run: |
        cat .tests/results/benchmarking/tables/strain-calls/non-cov2-*.strains.kallisto.tsv
    - name: Test non-sars-cov-2 coronaviruses
      if: matrix.rule == 'benchmark_non_sars_cov_2'
      run: |
        cat .tests/results/benchmarking/non-sars-cov-2.csv
        # Fail as soon as any line of the table contains "is sars-cov-2".
        if grep -q 'is sars-cov-2' .tests/results/benchmarking/non-sars-cov-2.csv
        then
          echo "Workflow failed! A non-sars-cov-2 genome was identified as sars-cov-2 (see above)."
          exit 1
        else
          echo "Workflow successfully identified samples as non-sars-cov-2 in all cases."
        fi
    - name: Change permissions for caching
      run: sudo chmod -R 755 .tests/.snakemake/conda
    - name: Print disk space
      run: sudo df -h