From 9889ccbd930ac5ebf77f7661899ba0754ed5d840 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 9 Jan 2024 11:33:03 +0100 Subject: [PATCH] feat: bash script to download data (#42) --- .github/workflows/ci.yml | 15 + Makefile | 32 +++ README.md | 328 +-------------------- download-data.sh | 604 +++++++++++++++++++++++++++++++++++++++ environment.yml | 7 + 5 files changed, 660 insertions(+), 326 deletions(-) create mode 100644 Makefile create mode 100644 download-data.sh create mode 100644 environment.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6796ce3..b64b027 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,6 +20,21 @@ jobs: with: file_or_dir: | *.yml + .github/workflows/*.yml + + - name: Setup formatting environment + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: environment.yml + init-shell: bash + cache-environment: true + post-cleanup: none # breaks otherwise + + - name: Check formatting + run: | + make ci + shell: bash -el {0} + Up-Down: runs-on: ubuntu-latest diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..65c825f --- /dev/null +++ b/Makefile @@ -0,0 +1,32 @@ +.PHONY: default +default: help + +.PHONY: help +help: + @echo "Usage: make <target>" + @echo + @echo "Targets:" + @echo " ci Run all CI steps" + @echo " lint Run all linters" + @echo " lint-shellcheck Run shellcheck linter" + @echo " format Run all formatters" + @echo " format-shellcheck Run shellcheck formatter" + +.PHONY: ci +ci: lint + +.PHONY: lint +lint: lint-shellcheck + +SHELLCHECK_EXCLUDES := SC2012,SC2046,SC2086 + +.PHONY: lint-shellcheck +lint-shellcheck: + shellcheck -e $(SHELLCHECK_EXCLUDES) *.sh + +.PHONY: format +format: format-shellcheck + +.PHONY: format-shellcheck +format-shellcheck: + shellcheck -e $(SHELLCHECK_EXCLUDES) -f diff *.sh | git apply diff --git a/README.md b/README.md index 5513d12..2844830 100644 --- a/README.md +++ b/README.md @@ -73,335 +73,11 @@ cp
docker-compose.override.yml-dev docker-compose.override.yml Now you need to obtain the data to serve by the mehari, viguno, and annonars container. For this, we have prepared strongly reduced data sets (overall less than 2GB rather than hundreds of GB of data). -Obtain the annonars data: -```bash session -cat <<"EOF" >/tmp/tokens.txt -## -- full data ------------------------------------------------------------- -## uncomment the lines and use instead of reduced data for a full installation -# full/annonars/cadd-grch37-1.6+0.29.1 -# full/annonars/cadd-grch38-1.6+0.29.1 -# full/annonars/cons-grch37-20161007+0.29.1 -# full/annonars/cons-grch38-20190906+0.29.1 -# full/annonars/dbnsfp-grch37-4.4a+0.29.1 -# full/annonars/dbnsfp-grch38-4.4a+0.29.1 -# full/annonars/dbscsnv-grch37-1.1+0.29.1 -# full/annonars/dbscsnv-grch38-1.1+0.29.1 -# full/annonars/dbsnp-grch37-b151+0.29.1 -# full/annonars/dbsnp-grch38-b151+0.29.1 -full/annonars/functional-grch37-105.20201022+0.29.1 -full/annonars/functional-grch38-110+0.29.1 -full/annonars/genes-3.1+2.1.1+4.4+20230606+10.1+20231123+0.29.3 -# full/annonars/gnomad-exomes-grch37-2.1.1+0.29.1 -# full/annonars/gnomad-exomes-grch38-2.1.1+0.29.1 -# full/annonars/gnomad-genomes-grch37-2.1.1+0.29.1 -# full/annonars/gnomad-genomes-grch38-3.1.2+0.29.1 -full/annonars/gnomad-mtdna-grch37-3.1+0.29.1 -full/annonars/gnomad-mtdna-grch38-3.1+0.29.1 -full/annonars/gnomad-sv-exomes-grch37-0.3.1+0.29.1 -full/annonars/gnomad-sv-exomes-grch38-4.0+0.29.1 -full/annonars/gnomad-sv-genomes-grch37-2.1.1+0.29.1 -full/annonars/gnomad-sv-genomes-grch38-4.0+0.29.1 -full/annonars/helixmtdb-grch37-20200327+0.29.1 -full/annonars/helixmtdb-grch38-20200327+0.29.1 -full/annonars/regions-grch37-20231122+0.29.3 -full/annonars/regions-grch38-20231122+0.29.3 -# full/mehari/freqs-grch37-2.1.1+2.1.1+3.1+20200327+0.29.1 -# full/mehari/freqs-grch37-2.1.1+2.1.1+3.1+20200327+0.29.1 -full/mehari/genes-xlink-20231122 -full/tracks -full/worker -# full/viguno/hpo-20230606+0.1.6 -## -- 
reduced data for dev --------------------------------------------------- -reduced-dev/annonars -reduced-dev/mehari -reduced-dev/viguno -EOF -STATIC=reev-static -mkdir -p .dev/volumes/$STATIC/data/download -(set -x; for token in $(grep -v ^# /tmp/tokens.txt); do \ - src=$token; \ - dst=$(echo $token | perl -p -e 's|/\*||' | perl -p -e 's#^(full/|reduced-dev/)##'); \ - mkdir -p .dev/volumes/$STATIC/data/download/$dst; \ - s5cmd \ - --endpoint-url=https://ceph-s3-public.cubi.bihealth.org \ - --no-sign-request \ - sync \ - "s3://varfish-public/$src/*" \ - ".dev/volumes/$STATIC/data/download/$dst"; \ -done) -``` - -Setup symlink structure so the data is at the expected location. - -```bash session -## -## annonars -## - -STATIC=reev-static -mkdir -p .dev/volumes/$STATIC/data/annonars - -ln -sr .dev/volumes/$STATIC/data/download/annonars/genes-* \ - .dev/volumes/$STATIC/data/annonars/genes - -names="cadd dbsnp dbnsfp dbscsnv gnomad-mtdna gnomad-genomes gnomad-exomes helixmtdb cons"; \ -for genome in grch37 grch38; do \ - for name in $names; do \ - mkdir -p .dev/volumes/$STATIC/data/annonars/$genome; \ - test -e .dev/volumes/$STATIC/data/$genome/$name || \ - ln -sr \ - $(echo .dev/volumes/$STATIC/data/download/annonars/$name-$genome-* \ - | tr ' ' '\n' \ - | tail -n 1) \ - .dev/volumes/$STATIC/data/annonars/$genome/$name; \ - done; \ -done - -## -## mehari -## - -STATIC=reev-static -mkdir -p .dev/volumes/$STATIC/data/mehari/grch3{7,8} - -rm -f .dev/volumes/$STATIC/data/mehari/grch3?/freqs - -ln -sr .dev/volumes/$STATIC/data/download/mehari/freqs-grch37-* \ - .dev/volumes/$STATIC/data/mehari/grch37/freqs -ln -sr .dev/volumes/$STATIC/data/download/mehari/freqs-grch38-* \ - .dev/volumes/$STATIC/data/mehari/grch38/freqs - -## -## viguno -## - -STATIC=reev-static - -rm -f .dev/volumes/$STATIC/data/{hgnc_xlink.tsv,hpo} - -ln -sr .dev/volumes/$STATIC/data/download/mehari/genes-xlink-20231122/genes-xlink.tsv \ - .dev/volumes/$STATIC/data/hgnc_xlink.tsv -ln -sr 
.dev/volumes/$STATIC/data/download/viguno/hpo-20230606+0.1.6 \ - .dev/volumes/$STATIC/data/hpo - -## -## worker -## - -STATIC=reev-static -mkdir -p .dev/volumes/$STATIC/data/worker/{grch3{7,8}/strucvars/bgdbs,noref/genes} - -rm -f .dev/volumes/$STATIC/data/worker/grch3?/strucvars/bgdbs/{exac,g1k,gnomad,dbvar,dgv,dgv-gs}.bin - -ln -sr .dev/volumes/$STATIC/data/download/worker/bgdb-exac-grch37-*/bgdb-exac.bin \ - .dev/volumes/$STATIC/data/worker/grch37/strucvars/bgdbs/exac.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/bgdb-g1k-grch37-phase3v2+0.9.0/bgdb-g1k.bin \ - .dev/volumes/$STATIC/data/worker/grch37/strucvars/bgdbs/g1k.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/bgdb-gnomad-grch37-*/bgdb-gnomad.bin \ - .dev/volumes/$STATIC/data/worker/grch37/strucvars/bgdbs/gnomad.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/bgdb-dbvar-grch37-*/bgdb-dbvar.bin \ - .dev/volumes/$STATIC/data/worker/grch37/strucvars/bgdbs/dbvar.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/bgdb-dbvar-grch38-*/bgdb-dbvar.bin \ - .dev/volumes/$STATIC/data/worker/grch38/strucvars/bgdbs/dbvar.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/bgdb-dgv-grch37-*/bgdb-dgv.bin \ - .dev/volumes/$STATIC/data/worker/grch37/strucvars/bgdbs/dgv.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/bgdb-dgv-grch38-*/bgdb-dgv.bin \ - .dev/volumes/$STATIC/data/worker/grch38/strucvars/bgdbs/dgv.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/bgdb-dgv-gs-grch37-*/bgdb-dgv-gs.bin \ - .dev/volumes/$STATIC/data/worker/grch37/strucvars/bgdbs/dgv-gs.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/bgdb-dgv-gs-grch38-*/bgdb-dgv-gs.bin \ - .dev/volumes/$STATIC/data/worker/grch38/strucvars/bgdbs/dgv-gs.bin - -rm -f .dev/volumes/$STATIC/data/worker/grch3?/strucvars/clinvar.bin - -ln -sr .dev/volumes/$STATIC/data/download/worker/clinvar-strucvars-grch37-*/clinvar-strucvars.bin \ - .dev/volumes/$STATIC/data/worker/grch37/strucvars/clinvar.bin -ln -sr 
.dev/volumes/$STATIC/data/download/worker/clinvar-strucvars-grch38-*/clinvar-strucvars.bin \ - .dev/volumes/$STATIC/data/worker/grch38/strucvars/clinvar.bin - -rm -f .dev/volumes/$STATIC/data/worker/grch3?/strucvars/patho-mms.bin - -ln -sr .dev/volumes/$STATIC/data/download/worker/patho-mms-grch37-*/patho-mms.bed \ - .dev/volumes/$STATIC/data/worker/grch37/strucvars/patho-mms.bed -ln -sr .dev/volumes/$STATIC/data/download/worker/patho-mms-grch38-*/patho-mms.bed \ - .dev/volumes/$STATIC/data/worker/grch38/strucvars/patho-mms.bed - -mkdir -p .dev/volumes/$STATIC/data/worker/grch3{7,8}/tads -rm -f .dev/volumes/$STATIC/data/worker/grch3?/tads/hesc.bed - -ln -sr .dev/volumes/$STATIC/data/download/worker/tads-grch37-dixon2015/hesc.bed \ - .dev/volumes/$STATIC/data/worker/grch37/tads/hesc.bed -ln -sr .dev/volumes/$STATIC/data/download/worker/tads-grch38-dixon2015/hesc.bed \ - .dev/volumes/$STATIC/data/worker/grch38/tads/hesc.bed - -rm -f .dev/volumes/$STATIC/data/worker/noref/genes/{xlink.bin,acmg.tsv,mime2gene.tsv} - -ln -sr .dev/volumes/$STATIC/data/download/worker/genes-xlink-*/genes-xlink.bin \ - .dev/volumes/$STATIC/data/worker/noref/genes/xlink.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/acmg-sf-*/acmg_sf.tsv \ - .dev/volumes/$STATIC/data/worker/noref/genes/acmg.tsv -ln -sr .dev/volumes/$STATIC/data/download/worker/mim2gene-*/mim2gene.tsv \ - .dev/volumes/$STATIC/data/worker/noref/genes/mime2gene.tsv - -mkdir -p .dev/volumes/$STATIC/data/worker/grch3{7,8}/genes -rm -f .dev/volumes/$STATIC/data/worker/grch3?/genes/{ensembl_genes.bin,refseq_genes.bin} - -ln -sr .dev/volumes/$STATIC/data/download/worker/genes-regions-grch37-*/ensembl_genes.bin \ - .dev/volumes/$STATIC/data/worker/grch37/genes/ensembl_regions.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/genes-regions-grch38-*/ensembl_genes.bin \ - .dev/volumes/$STATIC/data/worker/grch38/genes/ensembl_regions.bin - -ln -sr 
.dev/volumes/$STATIC/data/download/worker/genes-regions-grch37-*/refseq_genes.bin \ - .dev/volumes/$STATIC/data/worker/grch37/genes/refseq_regions.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/genes-regions-grch38-*/refseq_genes.bin \ - .dev/volumes/$STATIC/data/worker/grch38/genes/refseq_regions.bin - -mkdir -p .dev/volumes/$STATIC/data/worker/grch3{7,8}/features -rm -f .dev/volumes/$STATIC/data/worker/grch3?/features/{masked_repeat.bin,masked_seqdup.bin} - -ln -sr .dev/volumes/$STATIC/data/download/worker/masked-repeat-grch37-*/masked-repeat.bin \ - .dev/volumes/$STATIC/data/worker/grch37/features/masked_repeat.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/masked-segdup-grch37-*/masked-segdup.bin \ - .dev/volumes/$STATIC/data/worker/grch37/features/masked_seqdup.bin - -ln -sr .dev/volumes/$STATIC/data/download/worker/masked-repeat-grch38-*/masked-repeat.bin \ - .dev/volumes/$STATIC/data/worker/grch38/features/masked_repeat.bin -ln -sr .dev/volumes/$STATIC/data/download/worker/masked-segdup-grch38-*/masked-segdup.bin \ - .dev/volumes/$STATIC/data/worker/grch38/features/masked_seqdup.bin - -## -## tracks -## - -STATIC=reev-static -mkdir -p .dev/volumes/$STATIC/data/nginx/grch3{7,8} -rm -f .dev/volumes/$STATIC/data/nginx/grch3?/* - -paths_37=$(find .dev/volumes/$STATIC/data/download/tracks/ -type f -name '*.bed' -or -name '*.bed.gz' | sort | grep grch37) -for path in $paths_37; do - if [[ -e ${path}.tbi ]]; then - ln -sr $path ${path}.tbi .dev/volumes/$STATIC/data/nginx/grch37 - else - ln -sr $path .dev/volumes/$STATIC/data/nginx/grch37 - fi -done - -paths_38=$(find .dev/volumes/$STATIC/data/download/tracks/ -type f -name '*.bed' -or -name '*.bed.gz' | sort | grep grch38) -for path in $paths_38; do - if [[ -e ${path}.tbi ]]; then - ln -sr $path ${path}.tbi .dev/volumes/$STATIC/data/nginx/grch38 - else - ln -sr $path .dev/volumes/$STATIC/data/nginx/grch38 - fi -done -``` - -To download Mehari transcripts - -```bash session -STATIC=reev-static -mkdir 
-p .dev/volumes/$STATIC/data/download/mehari-data-txs-grch3{7,8} - -for ext in .zst .zst.sha256 .zst.report .zst.report.sha256; do - wget -O .dev/volumes/$STATIC/data/download/mehari-data-txs-grch37/mehari-data-txs-grch37-0.4.4.bin$ext \ - https://github.com/bihealth/mehari-data-tx/releases/download/v0.4.4/mehari-data-txs-grch37-0.4.4.bin$ext - wget -O .dev/volumes/$STATIC/data/download/mehari-data-txs-grch38/mehari-data-txs-grch38-0.4.4.bin$ext \ - https://github.com/bihealth/mehari-data-tx/releases/download/v0.4.4/mehari-data-txs-grch38-0.4.4.bin$ext -done - -rm -f .dev/volumes/$STATIC/data/mehari/grch3?/txs.bin.zst -ln -sr .dev/volumes/$STATIC/data/download/mehari-data-txs-grch37/mehari-data-txs-grch37-0.4.4.bin.zst \ - .dev/volumes/$STATIC/data/mehari/grch37/txs.bin.zst -ln -sr .dev/volumes/$STATIC/data/download/mehari-data-txs-grch38/mehari-data-txs-grch38-0.4.4.bin.zst \ - .dev/volumes/$STATIC/data/mehari/grch38/txs.bin.zst -``` - -To obtain ClinVar, use the following. -Note that this will install the data from November 12, 2023 and you might want to [look here for the latest release](https://github.com/bihealth/annonars-data-clinvar/releases). 
- -```bash session -STATIC=reev-static - -wget -O /tmp/annonars-clinvar-minimal-grch37-20231112+0.29.3.tar.gz \ - https://github.com/bihealth/annonars-data-clinvar/releases/download/annonars-data-clinvar-20231112/annonars-clinvar-minimal-grch37-20231112+0.29.3.tar.gz -wget -O /tmp/annonars-clinvar-minimal-grch38-20231112+0.29.3.tar.gz \ - https://github.com/bihealth/annonars-data-clinvar/releases/download/annonars-data-clinvar-20231112/annonars-clinvar-minimal-grch38-20231112+0.29.3.tar.gz - -tar -C .dev/volumes/$STATIC/data/download/annonars/ \ - -xf /tmp/annonars-clinvar-minimal-grch37-20231112+0.29.3.tar.gz -tar -C .dev/volumes/$STATIC/data/download/annonars \ - -xf /tmp/annonars-clinvar-minimal-grch38-20231112+0.29.3.tar.gz - -rm -f .dev/volumes/$STATIC/data/annonars/grch37/clinvar -ln -sr .dev/volumes/$STATIC/data/download/annonars/annonars-clinvar-minimal-grch37-20231112+0.29.3 \ - .dev/volumes/$STATIC/data/annonars/grch37/clinvar -rm -f .dev/volumes/$STATIC/data/annonars/grch38/clinvar -ln -sr .dev/volumes/$STATIC/data/download/annonars/annonars-clinvar-minimal-grch38-20231112+0.29.3 \ - .dev/volumes/$STATIC/data/annonars/grch38/clinvar - -wget -O /tmp/annonars-clinvar-genes-20231112+0.29.3.tar.gz \ - https://github.com/bihealth/annonars-data-clinvar/releases/download/annonars-data-clinvar-20231112/annonars-clinvar-genes-20231112+0.29.3.tar.gz -tar -C .dev/volumes/$STATIC/data/download/annonars \ - -xf /tmp/annonars-clinvar-genes-20231112+0.29.3.tar.gz - -rm -f .dev/volumes/$STATIC/data/annonars/clinvar-genes -ln -sr .dev/volumes/$STATIC/data/download/annonars/annonars-clinvar-genes-20231112+0.29.3 \ - .dev/volumes/$STATIC/data/annonars/clinvar-genes - -wget -O /tmp/annonars-clinvar-sv-grch37-20231112+0.29.3.tar.gz \ - https://github.com/bihealth/annonars-data-clinvar/releases/download/annonars-data-clinvar-20231112/annonars-clinvar-sv-grch37-20231112+0.29.3.tar.gz -wget -O /tmp/annonars-clinvar-sv-grch38-20231112+0.29.3.tar.gz \ - 
https://github.com/bihealth/annonars-data-clinvar/releases/download/annonars-data-clinvar-20231112/annonars-clinvar-sv-grch38-20231112+0.29.3.tar.gz -tar -C .dev/volumes/$STATIC/data/download/annonars/ \ - -xf /tmp/annonars-clinvar-sv-grch37-20231112+0.29.3.tar.gz -tar -C .dev/volumes/$STATIC/data/download/annonars/ \ - -xf /tmp/annonars-clinvar-sv-grch38-20231112+0.29.3.tar.gz -rm -f .dev/volumes/$STATIC/data/annonars/grch3{7,8}/clinvar-sv -ln -sr .dev/volumes/$STATIC/data/download/annonars/annonars-clinvar-sv-grch37-20231112+0.29.3 \ - .dev/volumes/$STATIC/data/annonars/grch37/clinvar-sv -ln -sr .dev/volumes/$STATIC/data/download/annonars/annonars-clinvar-sv-grch38-20231112+0.29.3 \ - .dev/volumes/$STATIC/data/annonars/grch38/clinvar-sv -``` - -To obtain data for dotty - -```bash session -mkdir -p .dev/volumes/reev-static/data/download/dotty -pushd .dev/volumes/reev-static/data/download/dotty -wget \ - https://github.com/SACGF/cdot/releases/download/v0.2.21/cdot-0.2.21.ensembl.grch37.json.gz \ - https://github.com/SACGF/cdot/releases/download/v0.2.21/cdot-0.2.21.ensembl.grch38.json.gz \ - https://github.com/SACGF/cdot/releases/download/v0.2.21/cdot-0.2.21.refseq.grch37.json.gz \ - https://github.com/SACGF/cdot/releases/download/v0.2.21/cdot-0.2.21.refseq.grch38.json.gz -wget \ - https://github.com/bihealth/dotty/releases/download/v0.1.0/seqrepo.tar.gz-00 \ - https://github.com/bihealth/dotty/releases/download/v0.1.0/seqrepo.tar.gz-01 -cat seqrepo.tar.gz-?? | tar xzf - -popd - -mkdir -p .dev/volumes/reev-static/data/dotty -ln -sr .dev/volumes/reev-static/data/download/dotty/{*.json.gz,seqrepo} \ - .dev/volumes/reev-static/data/dotty -``` - -To obtain data for cada-prio +We provide a script that will set up the necessary directories, download the data, and create symlinks.
```bash session -mkdir -p .dev/volumes/reev-static/data/download/cada -pushd .dev/volumes/reev-static/data/download/cada -wget \ - https://github.com/bihealth/cada-prio-data/releases/download/cada-prio-data-20231112/cada-prio-model-20231112+0.6.1.tar.gz -tar -xzf cada-prio-model-20231112+0.6.1.tar.gz -popd - -mkdir -p .dev/volumes/reev-static/data/cada -rm -f .dev/volumes/reev-static/data/cada/model - -source_dir=".dev/volumes/reev-static/data/download/cada/cada-prio-model-20231112+0.6.1/model/" -for file in ${source_dir}*; do - ln -sr "$file" ".dev/volumes/reev-static/data/cada/" -done +bash download-data.sh ``` ### Setup Configuration diff --git a/download-data.sh b/download-data.sh new file mode 100644 index 0000000..5442b13 --- /dev/null +++ b/download-data.sh @@ -0,0 +1,604 @@ +#!/usr/bin/bash + +# Unofficial Bash Strict Mode +# +# cf. http://redsymbol.net/articles/unofficial-bash-strict-mode/ +set -euo pipefail +IFS=$'\n\t' + +# -- Configuration ------------------------------------------------------------ + +# QUIET=1 mode: suppress all output except errors. +export QUIET=${QUIET-0} +# VERBOSE=1 mode: print all commands before executing them. +export VERBOSE=${VERBOSE-0} +# DRY_RUN=1 mode: print the commands passed to run() instead of executing them. +export DRY_RUN=${DRY_RUN-0} +# Download options: reduced-dev (default), reduced-exomes, full. +export DOWNLOAD=${DOWNLOAD-reduced-dev} +# Directory for static data. +export STATIC_DIR=${STATIC_DIR-reev-static} +# Overall directory prefix. +export DIR_PREFIX=${DIR_PREFIX-.dev} +# Overall static data directory. +export DATA_DIR=${DATA_DIR-$DIR_PREFIX/volumes/$STATIC_DIR/data} +# S3 endpoint URL.
+export S3_ENDPOINT_URL=https://ceph-s3-public.cubi.bihealth.org + +# -- Versions ----------------------------------------------------------------- + +# annonars +export V_ANNONARS=${V_ANNONARS-0.33.0} +# viguno +export V_VIGUNO=${V_VIGUNO-0.2.0} +# VarFish Worker +export V_WORKER=${V_WORKER-0.10.2} + +# CADD +export V_CADD=${V_CADD-1.6} +# UCSC 100 vertebrate conservation GRCh37 +export V_UCSC_CONS_37=${V_UCSC_CONS_37-20161007} +# UCSC 100 vertebrate conservation GRCh38 +export V_UCSC_CONS_38=${V_UCSC_CONS_38-20190906} +# dbNSFP +export V_DBNSFP=${V_DBNSFP-4.5a} +# dbNSFP version with a trailing "a" or "c" removed +export V_DBNSFP_NO_SUFFIX=${V_DBNSFP%a} +export V_DBNSFP_NO_SUFFIX=${V_DBNSFP_NO_SUFFIX%c} +# dbSCSNV +export V_DBSCSNV=${V_DBSCSNV-1.1} +# dbSNP +export V_DBSNP=${V_DBSNP-b151} +# RefSeq functional GRCh37 +export V_REFSEQ_GRCH37=${V_REFSEQ_GRCH37-105.20201022} +# RefSeq functional GRCh38 +export V_REFSEQ_GRCH38=${V_REFSEQ_GRCH38-110} +# ACMG SF list +export V_ACMG_SF=${V_ACMG_SF-3.1} +# gnomAD constraints +export V_GNOMAD_CONSTRAINTS=${V_GNOMAD_CONSTRAINTS-4.0} +# HPO release +export V_HPO=${V_HPO-20230606} +# OrphaPackets release +export V_ORPHAPACKETS=${V_ORPHAPACKETS-10.1} +# VarFish DB Download Data +export V_VARFISHDB=${V_VARFISHDB-20240105} +# gnomAD exomes GRCh37 +export V_GNOMAD_EXOMES_GRCH37=${V_GNOMAD_EXOMES_GRCH37-2.1.1} +# gnomAD exomes GRCh38 +export V_GNOMAD_EXOMES_GRCH38=${V_GNOMAD_EXOMES_GRCH38-4.0} +# gnomAD genomes GRCh37 +export V_GNOMAD_GENOMES_GRCH37=${V_GNOMAD_GENOMES_GRCH37-2.1.1} +# gnomAD genomes GRCh38 +export V_GNOMAD_GENOMES_GRCH38=${V_GNOMAD_GENOMES_GRCH38-4.0} +# gnomAD mtDNA +export V_GNOMAD_MT=${V_GNOMAD_MT-3.1} +# gnomAD SVs exomes GRCh37 (== ExAC) +export V_GNOMAD_EXOMES_SVS_GRCH37=${V_GNOMAD_EXOMES_SVS_GRCH37-0.3.1} +# gnomAD SVs exomes GRCh38 +export V_GNOMAD_EXOMES_SVS_GRCH38=${V_GNOMAD_EXOMES_SVS_GRCH38-4.0} +# gnomAD SV genomes GRCh37 +export V_GNOMAD_GENOMES_SV_GRCH37=${V_GNOMAD_GENOMES_SV_GRCH37-2.1.1} +# gnomAD SV genomes GRCh38 +export
V_GNOMAD_GENOMES_SV_GRCH38=${V_GNOMAD_GENOMES_SV_GRCH38-4.0} +# HelixMtDB +export V_HELIXMTDB=${V_HELIXMTDB-20200327} +# ClinGen Regions +export V_CLINGEN_REGIONS=${V_CLINGEN_REGIONS-20240105} + +# Mehari Gene ID Xlink +export V_MEHARI_XLINK=${V_MEHARI_XLINK-20240105} +# Mehari Transcripts +export V_MEHARI_TXS=${V_MEHARI_TXS-0.4.4} + +# dbVar version +export V_DBVAR=${V_DBVAR-20231030} +# DGV version +export V_DGV=${V_DGV-20200225} +# 1000 Genomes Version +export V_G1K=${V_G1K-phase3v2} +# ClinVar Strucvars +export V_CLINVAR_STRUCVARS=${V_CLINVAR_STRUCVARS-20230625} +# Genes ENSEMBL GRCh37 +export V_GENES_ENSEMBL_GRCH37=${V_GENES_ENSEMBL_GRCH37-87} +# Genes ENSEMBL GRCh38 +export V_GENES_ENSEMBL_GRCH38=${V_GENES_ENSEMBL_GRCH38-109} +# Genes RefSeq GRCh37 +export V_GENES_REFSEQ_GRCH37=${V_GENES_REFSEQ_GRCH37-105.20201022} +# Genes RefSeq GRCh38 +export V_GENES_REFSEQ_GRCH38=${V_GENES_REFSEQ_GRCH38-GCF_000001405.40+RS_2023_03} +# UCSC rmsk GRCh37 +export V_UCSC_RMSK_37=${V_UCSC_RMSK_37-20200322} +# UCSC rmsk GRCh38 +export V_UCSC_RMSK_38=${V_UCSC_RMSK_38-20221018} +# UCSC genomicSuperDups GRCh37 +export V_UCSC_GENOMIC_SUPER_DUPS_37=${V_UCSC_GENOMIC_SUPER_DUPS_37-20111025} +# UCSC genomicSuperDups GRCh38 +export V_UCSC_GENOMIC_SUPER_DUPS_38=${V_UCSC_GENOMIC_SUPER_DUPS_38-20141019} +# UCSC altSeqLiftover GRCh37 +export V_UCSC_ALT_SEQ_LIFTOVER_37=${V_UCSC_ALT_SEQ_LIFTOVER_37-20200322} +# UCSC altSeqLiftover GRCh38 +export V_UCSC_ALT_SEQ_LIFTOVER_38=${V_UCSC_ALT_SEQ_LIFTOVER_38-20221103} +# Patho MMS +export V_PATHO_MMS=${V_PATHO_MMS-20220730} + +# annonars-data-clinvar clinvar +export V_ANNONARS_DATA_CLINVAR_CLINVAR=${V_ANNONARS_DATA_CLINVAR_CLINVAR-20231231} +# annonars-data-clinvar annonars +export V_ANNONARS_DATA_CLINVAR_ANNONARS=${V_ANNONARS_DATA_CLINVAR_ANNONARS-0.31.0} +# dotty-seqrepo +export V_DOTTY_SEQREPO=${V_DOTTY_SEQREPO-0.1.0} + +# dotty-cdot-version +export V_DOTTY_CDOT_VERSION=${V_DOTTY_CDOT_VERSION-0.2.21} + +# cada-prio model +export
V_CADA_PRIO_MODEL=${V_CADA_PRIO_MODEL-20231112}
+# cada-prio version
+export V_CADA_PRIO_VERSION=${V_CADA_PRIO_VERSION-0.6.1}
+
+# -- Verbose Mode -------------------------------------------------------------
+
+# Use "set -x" if verbose and unless quiet.
+if [[ "$QUIET" -eq 0 ]] && [[ "$VERBOSE" -ne 0 ]]; then
+    set -x
+fi
+
+# -- Logging ------------------------------------------------------------------
+
+# Log at INFO level to stderr unless QUIET is set; "A || B" keeps the status 0 when silenced so "set -e" does not abort the caller.
+log_info()
+{
+    [[ $QUIET -ne 0 ]] || echo "[$(set +x; date +%T.%3N)] INFO: $*" >&2
+}
+
+# Log at DBG level to stderr only when VERBOSE is set; returns 0 when silenced for the same "set -e" reason.
+log_debug()
+{
+    [[ $VERBOSE -eq 0 ]] || echo "[$(set +x; date +%T.%3N)] DBG: $*" >&2
+}
+
+# Log at ERROR level to stderr (never suppressed); "set +x" keeps the timestamp subshell out of the trace.
+log_error()
+{
+    echo "[$(set +x; date +%T.%3N)] ERROR: $*" >&2
+}
+
+# -- Helper Functions ---------------------------------------------------------
+
+# Print the bucket prefix to download the given directory from.
+#
+#   prefix_for DIRECTORY
+#
+# Prints $DOWNLOAD when DIRECTORY starts with a data set that has a reduced
+# variant and "full" otherwise, e.g.:
+#   prefix_for annonars/cadd-grch37-1.6+0.29.1
+prefix_for()
+{
+    local prefix
+    for prefix in annonars/cadd annonars/cons annonars/dbnsfp annonars/dbscsnv annonars/dbsnp \
+        annonars/gnomad-exomes annonars/gnomad-genomes mehari/freqs viguno/hpo; do
+        if [[ $1 == "$prefix"* ]]; then
+            # have reduced
+            echo "$DOWNLOAD"
+            return
+        fi
+    done
+    # no reduced available
+    echo full
+}
+
+# Depending on $DRY_RUN, either execute the given command (0) or print it to stdout (non-zero).
+run()
+{
+    if [[ $DRY_RUN -eq 0 ]]; then
+        "$@"
+    else
+        echo "$@"
+    fi
+}
+
+# -- Downloading --------------------------------------------------------------
+
+log_info "Download data ..."
+
+# First, write out folders that we want to download.
+cat <<EOF >/tmp/download-list.txt  # one bucket folder per line; every version comes from the V_* variable of that data set
+annonars/cadd-grch37-$V_CADD+$V_ANNONARS
+annonars/cadd-grch38-$V_CADD+$V_ANNONARS
+annonars/cons-grch37-$V_UCSC_CONS_37+$V_ANNONARS
+annonars/cons-grch38-$V_UCSC_CONS_38+$V_ANNONARS
+annonars/dbnsfp-grch37-$V_DBNSFP+$V_ANNONARS
+annonars/dbnsfp-grch38-$V_DBNSFP+$V_ANNONARS
+annonars/dbscsnv-grch37-$V_DBSCSNV+$V_ANNONARS
+annonars/dbscsnv-grch38-$V_DBSCSNV+$V_ANNONARS
+annonars/dbsnp-grch37-$V_DBSNP+$V_ANNONARS
+annonars/dbsnp-grch38-$V_DBSNP+$V_ANNONARS
+annonars/functional-grch37-$V_REFSEQ_GRCH37+$V_ANNONARS
+annonars/functional-grch38-$V_REFSEQ_GRCH38+$V_ANNONARS
+annonars/genes-$V_ACMG_SF+$V_GNOMAD_CONSTRAINTS+$V_DBNSFP_NO_SUFFIX+$V_HPO+$V_ORPHAPACKETS+$V_VARFISHDB+$V_ANNONARS
+annonars/gnomad-exomes-grch37-$V_GNOMAD_EXOMES_GRCH37+$V_ANNONARS
+annonars/gnomad-exomes-grch38-$V_GNOMAD_EXOMES_GRCH38+$V_ANNONARS
+annonars/gnomad-genomes-grch37-$V_GNOMAD_GENOMES_GRCH37+$V_ANNONARS
+annonars/gnomad-genomes-grch38-$V_GNOMAD_GENOMES_GRCH38+$V_ANNONARS
+annonars/gnomad-mtdna-grch37-$V_GNOMAD_MT+$V_ANNONARS
+annonars/gnomad-mtdna-grch38-$V_GNOMAD_MT+$V_ANNONARS
+annonars/gnomad-sv-exomes-grch37-$V_GNOMAD_EXOMES_SVS_GRCH37+$V_ANNONARS
+annonars/gnomad-sv-exomes-grch38-$V_GNOMAD_EXOMES_SVS_GRCH38+$V_ANNONARS
+annonars/gnomad-sv-genomes-grch37-$V_GNOMAD_GENOMES_SV_GRCH37+$V_ANNONARS
+annonars/gnomad-sv-genomes-grch38-$V_GNOMAD_GENOMES_SV_GRCH38+$V_ANNONARS
+annonars/helixmtdb-grch37-$V_HELIXMTDB+$V_ANNONARS
+annonars/helixmtdb-grch38-$V_HELIXMTDB+$V_ANNONARS
+annonars/regions-grch37-$V_CLINGEN_REGIONS+$V_ANNONARS
+annonars/regions-grch38-$V_CLINGEN_REGIONS+$V_ANNONARS
+mehari/freqs-grch37-$V_GNOMAD_EXOMES_GRCH37+$V_GNOMAD_GENOMES_GRCH37+$V_GNOMAD_MT+$V_HELIXMTDB+$V_ANNONARS
+mehari/freqs-grch38-$V_GNOMAD_EXOMES_GRCH38+$V_GNOMAD_GENOMES_GRCH38+$V_GNOMAD_MT+$V_HELIXMTDB+$V_ANNONARS
+mehari/genes-xlink-$V_MEHARI_XLINK
+tracks
+worker
+viguno/hpo-$V_HPO+$V_VIGUNO
+EOF
+# Create download directory.
+mkdir -p $DATA_DIR/download +# Download each entry from download list. Note that we support commenting +# out lines with a leading "#". +grep -v ^# /tmp/download-list.txt >/tmp/download-list.nocomment.txt +while read -r line; do + # Create the download directory. + run mkdir -p $DATA_DIR/download/$line + # Actually download the data. + log_info "s3://varfish-public/$(prefix_for $line)/$line/* -> $DATA_DIR/download/$line" + run s5cmd \ + --endpoint-url=$S3_ENDPOINT_URL \ + --no-sign-request \ + sync \ + "s3://varfish-public/$(prefix_for $line)/$line/*" \ + $DATA_DIR/download/$line \ + &> >(tee /tmp/download.stderr >&2) + grep ^ERROR /tmp/download.stderr >/dev/null && exit 1 +done /dev/null +wget -q -c \ + https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.ensembl.grch37.json.gz \ + https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.ensembl.grch38.json.gz \ + https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.refseq.grch37.json.gz \ + https://github.com/SACGF/cdot/releases/download/v$V_DOTTY_CDOT_VERSION/cdot-$V_DOTTY_CDOT_VERSION.refseq.grch38.json.gz +wget -q -c \ + https://github.com/bihealth/dotty/releases/download/v$V_DOTTY_SEQREPO/seqrepo.tar.gz-00 \ + https://github.com/bihealth/dotty/releases/download/v$V_DOTTY_SEQREPO/seqrepo.tar.gz-01 +cat seqrepo.tar.gz-?? 
| tar -xzf -
+popd >/dev/null
+# NOTE(review): the dotty paths below still spell out $DIR_PREFIX/volumes/$STATIC_DIR/data because the pushd that fills download/dotty is not visible here; confirm before switching them to $DATA_DIR.
+mkdir -p $DIR_PREFIX/volumes/$STATIC_DIR/data/dotty
+rm -f $DIR_PREFIX/volumes/$STATIC_DIR/data/dotty/{*.json.gz,seqrepo}
+ln -sr $DIR_PREFIX/volumes/$STATIC_DIR/data/download/dotty/{*.json.gz,seqrepo} \
+    $DIR_PREFIX/volumes/$STATIC_DIR/data/dotty
+
+log_info "- cada-prio"
+# Fetch (resuming partial downloads) and unpack the model below $DATA_DIR so that a DATA_DIR override is honoured.
+mkdir -p "$DATA_DIR/download/cada"
+pushd "$DATA_DIR/download/cada" >/dev/null
+wget -q -c \
+    https://github.com/bihealth/cada-prio-data/releases/download/cada-prio-data-$V_CADA_PRIO_MODEL/cada-prio-model-$V_CADA_PRIO_MODEL+$V_CADA_PRIO_VERSION.tar.gz
+tar -xzf cada-prio-model-$V_CADA_PRIO_MODEL+$V_CADA_PRIO_VERSION.tar.gz
+popd >/dev/null
+# Link every file of the unpacked model into $DATA_DIR/cada, removing a same-named entry first so re-runs succeed.
+mkdir -p "$DATA_DIR/cada"
+rm -f "$DATA_DIR/cada/model"
+
+source_dir="$DATA_DIR/download/cada/cada-prio-model-$V_CADA_PRIO_MODEL+$V_CADA_PRIO_VERSION/model"
+for file in "${source_dir}"/*; do
+    rm -f "$DATA_DIR/cada/$(basename "$file")"
+    ln -sr "$file" "$DATA_DIR/cada/"
+done
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..1a906c7
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,7 @@
+name: reev-docker-compose
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - shellcheck=0.9.0