Skip to content

Commit

Permalink
Reformat with snakefmt
Browse files Browse the repository at this point in the history
  • Loading branch information
lczech committed Jul 11, 2024
1 parent f66d941 commit 3af3dc6
Show file tree
Hide file tree
Showing 51 changed files with 1,598 additions and 1,285 deletions.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[tool.snakefmt]
line_length = 100
include = '\.smk$|^Snakefile|\.py$'
23 changes: 13 additions & 10 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
# Common
# =================================================================================================


# We first need to load the common functionality, which gives us access to the config file,
# generates the sample list, and prepares some other things for us that are needed below.
include: "rules/init.smk"


# =================================================================================================
# Default "All" Target Rule
# =================================================================================================


# The rule that is executed by default. We include the result files of different intermediate steps
# here as well, for example, the genotyped vcf file from the "calling.smk" step, so that a nice
# arrow shows up in the DAG that reminds us that this is an important intermediate file.
Expand All @@ -20,30 +23,29 @@ rule all:
"filtered/all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else [],
"annotated/snpeff.vcf.gz" if config["settings"]["snpeff"] else [],
"annotated/vep.vcf.gz" if config["settings"]["vep"] else [],

# Quality control
"qc/multiqc.html",

# Reference genome statistics
config["data"]["reference-genome"] + ".seqkit",

# Pileup
"mpileup/all-pileups.done" if len(config["settings"]["pileups"]) else [],

# HAFpipe
"hafpipe/all.csv" + (
".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else ""
) if config["settings"]["hafpipe"] else [],

"hafpipe/all.csv"
+ (".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else "")
if config["settings"]["hafpipe"]
else [],
# Stats. Some deactivated for now.
"tables/frequencies.tsv" if config["settings"]["frequency-table"] else []
"tables/frequencies.tsv" if config["settings"]["frequency-table"] else [],
# "plots/depths.svg",
# "plots/allele-freqs.svg",
# "tables/sample-sizes.tsv"


# The main `all` rule is local. It does not do anything anyway,
# except requesting the other rules to run.
localrules: all
localrules:
all,


# =================================================================================================
# Rule Modules
Expand All @@ -52,6 +54,7 @@ localrules: all
# These rules need to come after the `all` rule above, as Snakemake takes the first rule it finds
# as the implicit target when no other target is specified, and we want that to be the `all` rule.


include: "rules/prep.smk"
include: "rules/trimming.smk"
include: "rules/mapping.smk"
Expand Down
131 changes: 66 additions & 65 deletions workflow/rules/annotation.smk
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,35 @@
# SnpEff Setup
# =================================================================================================


# Get the path where we download the database to, so that we do not have to download it all the time.
# If not provided by the user, we use the directory where the reference genome is.
# Not used with a custom db, in which case we just use the provided dir directly.
def get_snpeff_db_download_path():
if config["params"]["snpeff"]["download-dir"]:
# Return path from config, and make sure that it has trailing slash
return os.path.join( config["params"]["snpeff"]["download-dir"], '' )
return os.path.join(config["params"]["snpeff"]["download-dir"], "")
else:
# Use the ref genome path, with trailing slash
return os.path.join(
os.path.dirname( config["data"]["reference-genome"] ), "snpeff-db"
) + "/"
return os.path.join(os.path.dirname(config["data"]["reference-genome"]), "snpeff-db") + "/"


# We separate download from usage, so that we can better see progress and errors,
# and to avoid unnecessary re-downloads.
rule snpeff_db:
output:
# wildcard {reference} may be anything listed in the first column of `snpeff databases`
directory( os.path.join( get_snpeff_db_download_path(), "{reference}" )),
touch( os.path.join( get_snpeff_db_download_path(), "{reference}.done" ))
directory(os.path.join(get_snpeff_db_download_path(), "{reference}")),
touch(os.path.join(get_snpeff_db_download_path(), "{reference}.done")),
log:
# Log file where the download is made to, so that this is independent of the run itself.
os.path.abspath(
os.path.join( get_snpeff_db_download_path(), "../snpeff-download-{reference}.log" )
)
os.path.join(get_snpeff_db_download_path(), "../snpeff-download-{reference}.log")
),
group:
"snpeff"
params:
reference="{reference}"
reference="{reference}",
conda:
# As always, the wrapper does not specify that python, pandas, and numpy are needed,
# and if those are misspecified on the cluster (as in our case), it just fails...
Expand All @@ -39,104 +39,97 @@ rule snpeff_db:
wrapper:
"v3.13.6/bio/snpeff/download"


# Rule is not submitted as a job to the cluster.
localrules: snpeff_db
localrules:
snpeff_db,


# Get the path to the snpeff database. Usually, this is the path to a database from the available
# ones of snpEff. If however a custom db path is given, we return this instead.
def get_snpeff_db_path():
if config["params"]["snpeff"]["custom-db-dir"]:
return config["params"]["snpeff"]["custom-db-dir"]
else:
return os.path.join( get_snpeff_db_download_path(), config["params"]["snpeff"]["name"] )
return os.path.join(get_snpeff_db_download_path(), config["params"]["snpeff"]["name"])


# =================================================================================================
# SnpEff
# =================================================================================================


rule snpeff:
input:
# (vcf, bcf, or vcf.gz)
# we use the filtered file if a filtering is done, or the unfiltered if not.
calls=(
# we use the filtered file if a filtering is done, or the unfiltered if not.
"filtered/all.vcf.gz"
if not config["settings"]["filter-variants"] == "none"
else "genotyped/all.vcf.gz"
),

# path to reference db downloaded with the snpeff download wrapper above
db=get_snpeff_db_path()
db=get_snpeff_db_path(),
output:
# annotated calls (vcf, bcf, or vcf.gz)
calls=report(
"annotated/snpeff.vcf.gz",
caption="../report/vcf.rst",
category="Calls"
),

calls=report("annotated/snpeff.vcf.gz", caption="../report/vcf.rst", category="Calls"),
# summary statistics (in HTML), optional
stats=report(
"annotated/snpeff.html",
category="Calls"
),

stats=report("annotated/snpeff.html", category="Calls"),
# summary statistics in CSV, optional
csvstats="annotated/snpeff.csv"
csvstats="annotated/snpeff.csv",
log:
"logs/snpeff.log"
"logs/snpeff.log",
group:
"snpeff"
params:
# For finding the chromosome names used by snpeff, add `-v` here
extra=config["params"]["snpeff"]["extra"]
extra=config["params"]["snpeff"]["extra"],
resources:
mem_mb=config["params"]["snpeff"]["mem"]
mem_mb=config["params"]["snpeff"]["mem"],
conda:
"../envs/snpeff.yaml"
wrapper:
"v3.13.6/bio/snpeff/annotate"


# =================================================================================================
# VEP Downloads
# =================================================================================================


# Get the paths where we store the data, so that we do not have to download it all the time.
# If not provided by the user, we use the directory where the reference genome is.
def get_vep_cache_dir():
if config["params"]["vep"]["cache-dir"]:
result = os.path.dirname( config["params"]["vep"]["cache-dir"] )
result = os.path.dirname(config["params"]["vep"]["cache-dir"])
else:
result = os.path.join(
os.path.dirname( config["data"]["reference-genome"] ), "vep-cache"
)
result = os.path.join(os.path.dirname(config["data"]["reference-genome"]), "vep-cache")
return result


# Same for plugins dir.
def get_vep_plugins_dir():
if config["params"]["vep"]["plugins-dir"]:
result = os.path.dirname( config["params"]["vep"]["plugins-dir"] )
result = os.path.dirname(config["params"]["vep"]["plugins-dir"])
else:
result = os.path.join(
os.path.dirname( config["data"]["reference-genome"] ), "vep-plugins"
)
result = os.path.join(os.path.dirname(config["data"]["reference-genome"]), "vep-plugins")
return result


rule vep_cache:
output:
directory( get_vep_cache_dir() ),
touch( get_vep_cache_dir().rstrip('/') + ".done" )
directory(get_vep_cache_dir()),
touch(get_vep_cache_dir().rstrip("/") + ".done"),
params:
species = config["params"]["vep"]["species"],
release = config["params"]["vep"]["release"],
build = config["params"]["vep"]["build"],
cacheurl = config["params"]["vep"]["cache-url"],
species=config["params"]["vep"]["species"],
release=config["params"]["vep"]["release"],
build=config["params"]["vep"]["build"],
cacheurl=config["params"]["vep"]["cache-url"],
# fastaurl = config["params"]["vep"]["fasta-url"],
# fasta-url: "ftp://ftp.ebi.ac.uk/ensemblgenomes/pub/plants/current/fasta"
log:
# Log file where the download is made to, so that this is independent of the run itself.
os.path.abspath(
os.path.join( get_vep_cache_dir(), "../vep-cache.log" )
)
os.path.abspath(os.path.join(get_vep_cache_dir(), "../vep-cache.log")),
conda:
# We use a conda environment on top of the wrapper, as the wrapper always causes
# issues with missing python modules and mismatching program versions and stuff...
Expand All @@ -147,21 +140,22 @@ rule vep_cache:
"../envs/vep.yaml"
script:
"../scripts/vep-cache.py"
# wrapper:
# "v3.13.6/bio/vep/cache"
# "0.74.0/bio/vep/cache"


# wrapper:
# "v3.13.6/bio/vep/cache"
# "0.74.0/bio/vep/cache"


rule vep_plugins:
output:
directory( get_vep_plugins_dir() ),
touch( get_vep_plugins_dir().rstrip('/') + ".done" )
directory(get_vep_plugins_dir()),
touch(get_vep_plugins_dir().rstrip("/") + ".done"),
params:
release = config["params"]["vep"]["release"],
release=config["params"]["vep"]["release"],
log:
# Log file where the download is made to, so that this is independent of the run itself.
os.path.abspath(
os.path.join( get_vep_plugins_dir(), "../vep-plugins.log" )
)
os.path.abspath(os.path.join(get_vep_plugins_dir(), "../vep-plugins.log")),
conda:
# Use our own env definition here, to ensure that we are working with the same vep
# versions across the different rules here. This is not the case in the original wrapper...
Expand All @@ -170,21 +164,28 @@ rule vep_plugins:
# We use our own script here, which solves a weird issue with the wrapper where
# the output directory was already created and hence the python mkdir failed...
"../scripts/vep-plugins.py"
# wrapper:
# "v3.13.6/bio/vep/plugins"
# "0.74.0/bio/vep/plugins"


# wrapper:
# "v3.13.6/bio/vep/plugins"
# "0.74.0/bio/vep/plugins"


# Rules are not submitted as a job to the cluster.
localrules: vep_cache, vep_plugins
localrules:
vep_cache,
vep_plugins,


# =================================================================================================
# VEP
# =================================================================================================


rule vep:
input:
# we use the filtered file if a filtering is done, or the unfiltered if not.
calls=(
# we use the filtered file if a filtering is done, or the unfiltered if not.
"filtered/all.vcf.gz"
if not config["settings"]["filter-variants"] == "none"
else "genotyped/all.vcf.gz"
Expand All @@ -197,12 +198,12 @@ rule vep:
caption="../report/vcf.rst",
category="Calls",
),
# The html file has to have a specific file name, so that MultiQC can find it,
# see https://multiqc.info/docs/#vep
# At the moment, this is however not yet working, because VEP was only added to
# MultiQC recently, see https://github.com/ewels/MultiQC/issues/1438
# Will need to update to MultiQC v1.11 at some point.
stats=report(
# The html file has to have a specific file name, so that MultiQC can find it,
# see https://multiqc.info/docs/#vep
# At the moment, this is however not yet working, because VEP was only added to
# MultiQC recently, see https://github.com/ewels/MultiQC/issues/1438
# Will need to update to MultiQC v1.11 at some point.
"annotated/vep_summary.html",
caption="../report/stats.rst",
category="Calls",
Expand Down
Loading

0 comments on commit 3af3dc6

Please sign in to comment.