Skip to content

Commit

Permalink
Reformat with snakefmt
Browse files Browse the repository at this point in the history
  • Loading branch information
lczech committed Jul 11, 2024
1 parent f66d941 commit 3af3dc6
Show file tree
Hide file tree
Showing 51 changed files with 1,598 additions and 1,285 deletions.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[tool.snakefmt]
line_length = 100
include = '\.smk$|^Snakefile|\.py$'
23 changes: 13 additions & 10 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
# Common
# =================================================================================================


# We first need to load the common functionality, which gives us access to the config file,
# generates the sample list, and prepares some other things for us that are needed below.
include: "rules/init.smk"


# =================================================================================================
# Default "All" Target Rule
# =================================================================================================


# The rule that is executed by default. We include the result files of different intermediate steps
# here as well, for example, the genotyped vcf file from the "calling.smk" step, so that a nice
# arrow shows up in the DAG that reminds us that this is an important intermediate file.
Expand All @@ -20,30 +23,29 @@ rule all:
"filtered/all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else [],
"annotated/snpeff.vcf.gz" if config["settings"]["snpeff"] else [],
"annotated/vep.vcf.gz" if config["settings"]["vep"] else [],

# Quality control
"qc/multiqc.html",

# Reference genome statistics
config["data"]["reference-genome"] + ".seqkit",

# Pileup
"mpileup/all-pileups.done" if len(config["settings"]["pileups"]) else [],

# HAFpipe
"hafpipe/all.csv" + (
".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else ""
) if config["settings"]["hafpipe"] else [],

"hafpipe/all.csv"
+ (".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else "")
if config["settings"]["hafpipe"]
else [],
# Stats. Some deactivated for now.
"tables/frequencies.tsv" if config["settings"]["frequency-table"] else []
"tables/frequencies.tsv" if config["settings"]["frequency-table"] else [],
# "plots/depths.svg",
# "plots/allele-freqs.svg",
# "tables/sample-sizes.tsv"


# The main `all` rule is local. It does not do anything anyway,
# except requesting the other rules to run.
localrules: all
localrules:
all,


# =================================================================================================
# Rule Modules
Expand All @@ -52,6 +54,7 @@ localrules: all
# These rules need to come after the `all` rule above, as Snakemake takes the first rule it finds
# as the implicit target when no other target is specified, and we want that to be the `all` rule.


include: "rules/prep.smk"
include: "rules/trimming.smk"
include: "rules/mapping.smk"
Expand Down
131 changes: 66 additions & 65 deletions workflow/rules/annotation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,35 @@
# SnpEff Setup
# =================================================================================================


# Get the path where we download the database to, so that we do not have to download it all the time.
# If not provided by the user, we use the directory where the reference genome is.
# Not used with a custom db, in which case we just use the provided dir directly.
def get_snpeff_db_download_path():
if config["params"]["snpeff"]["download-dir"]:
# Return path from config, and make sure that it has trailing slash
return os.path.join( config["params"]["snpeff"]["download-dir"], '' )
return os.path.join(config["params"]["snpeff"]["download-dir"], "")
else:
# Use the ref genome path, with trailing slash
return os.path.join(
os.path.dirname( config["data"]["reference-genome"] ), "snpeff-db"
) + "/"
return os.path.join(os.path.dirname(config["data"]["reference-genome"]), "snpeff-db") + "/"


# We separate download from usage, so that we can better see progress and errors,
# and to avoid unnecessary re-downloads.
rule snpeff_db:
output:
# wildcard {reference} may be anything listed in the first column of `snpeff databases`
directory( os.path.join( get_snpeff_db_download_path(), "{reference}" )),
touch( os.path.join( get_snpeff_db_download_path(), "{reference}.done" ))
directory(os.path.join(get_snpeff_db_download_path(), "{reference}")),
touch(os.path.join(get_snpeff_db_download_path(), "{reference}.done")),
log:
# Log file where the download is made to, so that this is independent of the run itself.
os.path.abspath(
os.path.join( get_snpeff_db_download_path(), "../snpeff-download-{reference}.log" )
)
os.path.join(get_snpeff_db_download_path(), "../snpeff-download-{reference}.log")
),
group:
"snpeff"
params:
reference="{reference}"
reference="{reference}",
conda:
# As always, the wrapper does not specify that python, pandas, and numpy are needed,
# and if those are misspecified on the cluster (as in our case), it just fails...
Expand All @@ -39,104 +39,97 @@ rule snpeff_db:
wrapper:
"v3.13.6/bio/snpeff/download"


# Rule is not submitted as a job to the cluster.
localrules: snpeff_db
localrules:
snpeff_db,


# Get the path to the snpeff database. Usually, this is the path to a database from the available
# ones of snpEff. If however a custom db path is given, we return this instead.
def get_snpeff_db_path():
if config["params"]["snpeff"]["custom-db-dir"]:
return config["params"]["snpeff"]["custom-db-dir"]
else:
return os.path.join( get_snpeff_db_download_path(), config["params"]["snpeff"]["name"] )
return os.path.join(get_snpeff_db_download_path(), config["params"]["snpeff"]["name"])


# =================================================================================================
# SnpEff
# =================================================================================================


rule snpeff:
input:
# (vcf, bcf, or vcf.gz)
# we use the filtered file if a filtering is done, or the unfiltered if not.
calls=(
# we use the filtered file if a filtering is done, or the unfiltered if not.
"filtered/all.vcf.gz"
if not config["settings"]["filter-variants"] == "none"
else "genotyped/all.vcf.gz"
),

# path to reference db downloaded with the snpeff download wrapper above
db=get_snpeff_db_path()
db=get_snpeff_db_path(),
output:
# annotated calls (vcf, bcf, or vcf.gz)
calls=report(
"annotated/snpeff.vcf.gz",
caption="../report/vcf.rst",
category="Calls"
),

calls=report("annotated/snpeff.vcf.gz", caption="../report/vcf.rst", category="Calls"),
# summary statistics (in HTML), optional
stats=report(
"annotated/snpeff.html",
category="Calls"
),

stats=report("annotated/snpeff.html", category="Calls"),
# summary statistics in CSV, optional
csvstats="annotated/snpeff.csv"
csvstats="annotated/snpeff.csv",
log:
"logs/snpeff.log"
"logs/snpeff.log",
group:
"snpeff"
params:
# For finding the chromosome names used by snpeff, add `-v` here
extra=config["params"]["snpeff"]["extra"]
extra=config["params"]["snpeff"]["extra"],
resources:
mem_mb=config["params"]["snpeff"]["mem"]
mem_mb=config["params"]["snpeff"]["mem"],
conda:
"../envs/snpeff.yaml"
wrapper:
"v3.13.6/bio/snpeff/annotate"


# =================================================================================================
# VEP Downloads
# =================================================================================================


# Get the paths where we store the data, so that we do not have to download it all the time.
# If not provided by the user, we use the directory where the reference genome is.
def get_vep_cache_dir():
if config["params"]["vep"]["cache-dir"]:
result = os.path.dirname( config["params"]["vep"]["cache-dir"] )
result = os.path.dirname(config["params"]["vep"]["cache-dir"])
else:
result = os.path.join(
os.path.dirname( config["data"]["reference-genome"] ), "vep-cache"
)
result = os.path.join(os.path.dirname(config["data"]["reference-genome"]), "vep-cache")
return result


# Same for plugins dir.
def get_vep_plugins_dir():
if config["params"]["vep"]["plugins-dir"]:
result = os.path.dirname( config["params"]["vep"]["plugins-dir"] )
result = os.path.dirname(config["params"]["vep"]["plugins-dir"])
else:
result = os.path.join(
os.path.dirname( config["data"]["reference-genome"] ), "vep-plugins"
)
result = os.path.join(os.path.dirname(config["data"]["reference-genome"]), "vep-plugins")
return result


rule vep_cache:
output:
directory( get_vep_cache_dir() ),
touch( get_vep_cache_dir().rstrip('/') + ".done" )
directory(get_vep_cache_dir()),
touch(get_vep_cache_dir().rstrip("/") + ".done"),
params:
species = config["params"]["vep"]["species"],
release = config["params"]["vep"]["release"],
build = config["params"]["vep"]["build"],
cacheurl = config["params"]["vep"]["cache-url"],
species=config["params"]["vep"]["species"],
release=config["params"]["vep"]["release"],
build=config["params"]["vep"]["build"],
cacheurl=config["params"]["vep"]["cache-url"],
# fastaurl = config["params"]["vep"]["fasta-url"],
# fasta-url: "ftp://ftp.ebi.ac.uk/ensemblgenomes/pub/plants/current/fasta"
log:
# Log file where the download is made to, so that this is independent of the run itself.
os.path.abspath(
os.path.join( get_vep_cache_dir(), "../vep-cache.log" )
)
os.path.abspath(os.path.join(get_vep_cache_dir(), "../vep-cache.log")),
conda:
# We use a conda environment on top of the wrapper, as the wrapper always causes
# issues with missing python modules and mismatching program versions and stuff...
Expand All @@ -147,21 +140,22 @@ rule vep_cache:
"../envs/vep.yaml"
script:
"../scripts/vep-cache.py"
# wrapper:
# "v3.13.6/bio/vep/cache"
# "0.74.0/bio/vep/cache"


# wrapper:
# "v3.13.6/bio/vep/cache"
# "0.74.0/bio/vep/cache"


rule vep_plugins:
output:
directory( get_vep_plugins_dir() ),
touch( get_vep_plugins_dir().rstrip('/') + ".done" )
directory(get_vep_plugins_dir()),
touch(get_vep_plugins_dir().rstrip("/") + ".done"),
params:
release = config["params"]["vep"]["release"],
release=config["params"]["vep"]["release"],
log:
# Log file where the download is made to, so that this is independent of the run itself.
os.path.abspath(
os.path.join( get_vep_plugins_dir(), "../vep-plugins.log" )
)
os.path.abspath(os.path.join(get_vep_plugins_dir(), "../vep-plugins.log")),
conda:
# Use our own env definition here, to ensure that we are working with the same vep
# versions across the different rules here. This is not the case in the original wrapper...
Expand All @@ -170,21 +164,28 @@ rule vep_plugins:
# We use our own script here, which solves a weird issue with the wrapper where
# the output directory was already created and hence the python mkdir failed...
"../scripts/vep-plugins.py"
# wrapper:
# "v3.13.6/bio/vep/plugins"
# "0.74.0/bio/vep/plugins"


# wrapper:
# "v3.13.6/bio/vep/plugins"
# "0.74.0/bio/vep/plugins"


# Rules are not submitted as a job to the cluster.
localrules: vep_cache, vep_plugins
localrules:
vep_cache,
vep_plugins,


# =================================================================================================
# VEP
# =================================================================================================


rule vep:
input:
# we use the filtered file if a filtering is done, or the unfiltered if not.
calls=(
# we use the filtered file if a filtering is done, or the unfiltered if not.
"filtered/all.vcf.gz"
if not config["settings"]["filter-variants"] == "none"
else "genotyped/all.vcf.gz"
Expand All @@ -197,12 +198,12 @@ rule vep:
caption="../report/vcf.rst",
category="Calls",
),
# The html file has to have a specific file name, so that MultiQC can find it,
# see https://multiqc.info/docs/#vep
# At the moment, this is however not yet working, because VEP was only added to
# MultiQC recently, see https://github.com/ewels/MultiQC/issues/1438
# Will need to update to MultiQC v1.11 at some point.
stats=report(
# The html file has to have a specific file name, so that MultiQC can find it,
# see https://multiqc.info/docs/#vep
# At the moment, this is however not yet working, because VEP was only added to
# MultiQC recently, see https://github.com/ewels/MultiQC/issues/1438
# Will need to update to MultiQC v1.11 at some point.
"annotated/vep_summary.html",
caption="../report/stats.rst",
category="Calls",
Expand Down
Loading

0 comments on commit 3af3dc6

Please sign in to comment.