From 3af3dc65f305e4ab53fb63de01bcf4a5cbcfaa9a Mon Sep 17 00:00:00 2001 From: Lucas Czech Date: Thu, 11 Jul 2024 19:00:28 +0200 Subject: [PATCH] Reformat with snakefmt --- pyproject.toml | 3 + workflow/Snakefile | 23 +- workflow/rules/annotation.smk | 131 ++++---- workflow/rules/calling-bcftools-combined.smk | 79 ++--- .../rules/calling-bcftools-individual.smk | 107 ++++--- workflow/rules/calling-bcftools.smk | 25 +- workflow/rules/calling-freebayes.smk | 82 ++--- workflow/rules/calling-haplotypecaller.smk | 109 +++---- workflow/rules/calling.smk | 127 ++++---- workflow/rules/damage.smk | 45 +-- workflow/rules/duplicates-dedup.smk | 22 +- workflow/rules/duplicates-picard.smk | 17 +- workflow/rules/filtering-bcftools-filter.smk | 10 +- .../filtering-gatk-variantfiltration.smk | 14 +- workflow/rules/filtering-gatk-vqsr.smk | 65 ++-- workflow/rules/filtering.smk | 48 +-- workflow/rules/frequency.smk | 293 ++++++++++-------- workflow/rules/init.smk | 171 +++++----- workflow/rules/mapping-bowtie2.smk | 52 ++-- workflow/rules/mapping-bwa-aln.smk | 80 ++--- workflow/rules/mapping-bwa-mem.smk | 47 +-- workflow/rules/mapping-bwa-mem2.smk | 27 +- workflow/rules/mapping-recalibrate.smk | 27 +- workflow/rules/mapping.smk | 129 +++++--- workflow/rules/pileup.smk | 77 ++--- workflow/rules/prep.smk | 139 +++++---- workflow/rules/qc-bam.smk | 120 +++---- workflow/rules/qc-fastq.smk | 97 +++--- workflow/rules/qc-vcf.smk | 26 +- workflow/rules/qc.smk | 27 +- workflow/rules/stats.smk | 33 +- workflow/rules/trimming-adapterremoval.smk | 71 +++-- workflow/rules/trimming-cutadapt.smk | 44 +-- workflow/rules/trimming-fastp.smk | 73 +++-- workflow/rules/trimming-seqprep.smk | 35 ++- workflow/rules/trimming-skewer.smk | 39 ++- workflow/rules/trimming-trimmomatic.smk | 53 ++-- workflow/rules/trimming.smk | 22 +- workflow/scripts/bwa-mem2-mem.py | 8 +- workflow/scripts/bwa-samxe.py | 18 +- workflow/scripts/fastqc.py | 24 +- workflow/scripts/freebayes.py | 10 +- workflow/scripts/frequency-table.py | 21 +- workflow/scripts/hafpipe-concat.py | 32 +- workflow/scripts/hafpipe-merge.py | 76 +++-- workflow/scripts/hafpipe.py | 39 +-- .../scripts/picard-collectmultiplemetrics.py | 5 +- workflow/scripts/plot-depths.py | 3 +- workflow/scripts/sample-sizes.py | 52 ++-- workflow/scripts/vep-cache.py | 4 +- workflow/scripts/vep-plugins.py | 2 +- 51 files changed, 1598 insertions(+), 1285 deletions(-) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1ab5f11 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[tool.snakefmt] +line_length = 100 +include = '\.smk$|^Snakefile|\.py$' diff --git a/workflow/Snakefile b/workflow/Snakefile index e1609e0..29c03f8 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -2,14 +2,17 @@ # Common # ================================================================================================= + # We first need to load the common functionality, which gives us access to the config file, # generates the sample list, and prepares some other things for us that are needed below. include: "rules/init.smk" + # ================================================================================================= # Default "All" Target Rule # ================================================================================================= + # The rule that is executed by default. 
We include the result files of different intermediate steps # here as well, for example, the genotyped vcf file from the "calling.smk" step, so that a nice # arrow shows up in the DAG that reminds us that this is an important intermediate file. @@ -20,30 +23,29 @@ rule all: "filtered/all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else [], "annotated/snpeff.vcf.gz" if config["settings"]["snpeff"] else [], "annotated/vep.vcf.gz" if config["settings"]["vep"] else [], - # Quality control "qc/multiqc.html", - # Reference genome statistics config["data"]["reference-genome"] + ".seqkit", - # Pileup "mpileup/all-pileups.done" if len(config["settings"]["pileups"]) else [], - # HAFpipe - "hafpipe/all.csv" + ( - ".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else "" - ) if config["settings"]["hafpipe"] else [], - + "hafpipe/all.csv" + + (".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else "") + if config["settings"]["hafpipe"] + else [], # Stats. Some deactivated for now. - "tables/frequencies.tsv" if config["settings"]["frequency-table"] else [] + "tables/frequencies.tsv" if config["settings"]["frequency-table"] else [], # "plots/depths.svg", # "plots/allele-freqs.svg", # "tables/sample-sizes.tsv" + # The main `all` rule is local. It does not do anything anyway, # except requesting the other rules to run. -localrules: all +localrules: + all, + # ================================================================================================= # Rule Modules @@ -52,6 +54,7 @@ localrules: all # These rules need to come after the `all` rule above, as Snakemake takes the first rule it finds # as the implicit target when no other target is specified, and we want that to be the `all` rule. + include: "rules/prep.smk" include: "rules/trimming.smk" include: "rules/mapping.smk" diff --git a/workflow/rules/annotation.smk b/workflow/rules/annotation.smk index 1a2c1ac..669079b 100644 --- a/workflow/rules/annotation.smk +++ b/workflow/rules/annotation.smk @@ -2,35 +2,35 @@ # SnpEff Setup # ================================================================================================= + # Get the path where we download the database to, so that we do not have to download it all the time. # If not provided by the user, we use the directory where the reference genome is. # Not used with a custom db, in which case we just use the provided dir directly. def get_snpeff_db_download_path(): if config["params"]["snpeff"]["download-dir"]: # Return path from config, and make sure that it has trailing slash - return os.path.join( config["params"]["snpeff"]["download-dir"], '' ) + return os.path.join(config["params"]["snpeff"]["download-dir"], "") else: # Use the ref genome path, with trailing slash - return os.path.join( - os.path.dirname( config["data"]["reference-genome"] ), "snpeff-db" - ) + "/" + return os.path.join(os.path.dirname(config["data"]["reference-genome"]), "snpeff-db") + "/" + # We separate download from usage, so that we can better see progress and errors, # and to avoid unneccessary re-downloads. 
rule snpeff_db: output: # wildcard {reference} may be anything listed in the first column of `snpeff databases` - directory( os.path.join( get_snpeff_db_download_path(), "{reference}" )), - touch( os.path.join( get_snpeff_db_download_path(), "{reference}.done" )) + directory(os.path.join(get_snpeff_db_download_path(), "{reference}")), + touch(os.path.join(get_snpeff_db_download_path(), "{reference}.done")), log: # Log file where the download is made to, so that this is independent of the run itself. os.path.abspath( - os.path.join( get_snpeff_db_download_path(), "../snpeff-download-{reference}.log" ) - ) + os.path.join(get_snpeff_db_download_path(), "../snpeff-download-{reference}.log") + ), group: "snpeff" params: - reference="{reference}" + reference="{reference}", conda: # As always, the wrapper does not specify that python, pandas, and numpy are needed, # and if those are misspecified on the cluster (as in our case), it just fails... @@ -39,8 +39,11 @@ rule snpeff_db: wrapper: "v3.13.6/bio/snpeff/download" + # Rule is not submitted as a job to the cluster. -localrules: snpeff_db +localrules: + snpeff_db, + # Get the path to the snpeff database. Usually, this is the path to a database from the available # ones of snpEff. If however a custom db path is given, we return this instead. @@ -48,95 +51,85 @@ def get_snpeff_db_path(): if config["params"]["snpeff"]["custom-db-dir"]: return config["params"]["snpeff"]["custom-db-dir"] else: - return os.path.join( get_snpeff_db_download_path(), config["params"]["snpeff"]["name"] ) + return os.path.join(get_snpeff_db_download_path(), config["params"]["snpeff"]["name"]) + # ================================================================================================= # SnpEff # ================================================================================================= + rule snpeff: input: # (vcf, bcf, or vcf.gz) + # we use the filtered file if a filtering is done, or the unfiltered if not. calls=( - # we use the filtered file if a filtering is done, or the unfiltered if not. "filtered/all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else "genotyped/all.vcf.gz" ), - # path to reference db downloaded with the snpeff download wrapper above - db=get_snpeff_db_path() + db=get_snpeff_db_path(), output: # annotated calls (vcf, bcf, or vcf.gz) - calls=report( - "annotated/snpeff.vcf.gz", - caption="../report/vcf.rst", - category="Calls" - ), - + calls=report("annotated/snpeff.vcf.gz", caption="../report/vcf.rst", category="Calls"), # summary statistics (in HTML), optional - stats=report( - "annotated/snpeff.html", - category="Calls" - ), - + stats=report("annotated/snpeff.html", category="Calls"), # summary statistics in CSV, optional - csvstats="annotated/snpeff.csv" + csvstats="annotated/snpeff.csv", log: - "logs/snpeff.log" + "logs/snpeff.log", group: "snpeff" params: # For finding the chromosome names used by snpeff, add `-v` here - extra=config["params"]["snpeff"]["extra"] + extra=config["params"]["snpeff"]["extra"], resources: - mem_mb=config["params"]["snpeff"]["mem"] + mem_mb=config["params"]["snpeff"]["mem"], conda: "../envs/snpeff.yaml" wrapper: "v3.13.6/bio/snpeff/annotate" + # ================================================================================================= # VEP Downloads # ================================================================================================= + # Get the paths where we store the data, so that we do not have to download it all the time. 
# If not provided by the user, we use the directory where the reference genome is. def get_vep_cache_dir(): if config["params"]["vep"]["cache-dir"]: - result = os.path.dirname( config["params"]["vep"]["cache-dir"] ) + result = os.path.dirname(config["params"]["vep"]["cache-dir"]) else: - result = os.path.join( - os.path.dirname( config["data"]["reference-genome"] ), "vep-cache" - ) + result = os.path.join(os.path.dirname(config["data"]["reference-genome"]), "vep-cache") return result + # Same for plugins dir. def get_vep_plugins_dir(): if config["params"]["vep"]["plugins-dir"]: - result = os.path.dirname( config["params"]["vep"]["plugins-dir"] ) + result = os.path.dirname(config["params"]["vep"]["plugins-dir"]) else: - result = os.path.join( - os.path.dirname( config["data"]["reference-genome"] ), "vep-plugins" - ) + result = os.path.join(os.path.dirname(config["data"]["reference-genome"]), "vep-plugins") return result + rule vep_cache: output: - directory( get_vep_cache_dir() ), - touch( get_vep_cache_dir().rstrip('/') + ".done" ) + directory(get_vep_cache_dir()), + touch(get_vep_cache_dir().rstrip("/") + ".done"), params: - species = config["params"]["vep"]["species"], - release = config["params"]["vep"]["release"], - build = config["params"]["vep"]["build"], - cacheurl = config["params"]["vep"]["cache-url"], + species=config["params"]["vep"]["species"], + release=config["params"]["vep"]["release"], + build=config["params"]["vep"]["build"], + cacheurl=config["params"]["vep"]["cache-url"], # fastaurl = config["params"]["vep"]["fasta-url"], # fasta-url: "ftp://ftp.ebi.ac.uk/ensemblgenomes/pub/plants/current/fasta" log: # Log file where the download is made to, so that this is independent of the run itself. - os.path.abspath( - os.path.join( get_vep_cache_dir(), "../vep-cache.log" ) - ) + os.path.abspath(os.path.join(get_vep_cache_dir(), "../vep-cache.log")), conda: # We use a conda environment on top of the wrapper, as the wrapper always causes # issues with missing python modules and mismatching program versions and stuff... @@ -147,21 +140,22 @@ rule vep_cache: "../envs/vep.yaml" script: "../scripts/vep-cache.py" - # wrapper: - # "v3.13.6/bio/vep/cache" - # "0.74.0/bio/vep/cache" + + +# wrapper: +# "v3.13.6/bio/vep/cache" +# "0.74.0/bio/vep/cache" + rule vep_plugins: output: - directory( get_vep_plugins_dir() ), - touch( get_vep_plugins_dir().rstrip('/') + ".done" ) + directory(get_vep_plugins_dir()), + touch(get_vep_plugins_dir().rstrip("/") + ".done"), params: - release = config["params"]["vep"]["release"], + release=config["params"]["vep"]["release"], log: # Log file where the download is made to, so that this is independent of the run itself. - os.path.abspath( - os.path.join( get_vep_plugins_dir(), "../vep-plugins.log" ) - ) + os.path.abspath(os.path.join(get_vep_plugins_dir(), "../vep-plugins.log")), conda: # Use our own env definition here, to ensure that we are working with the same vep # versions across the different rules here. This is not the case in the original wrapper... @@ -170,21 +164,28 @@ rule vep_plugins: # We use our own script here, which solves a weird issue with the wrapper where # the output directory was already created and hence the python mkdir failed... "../scripts/vep-plugins.py" - # wrapper: - # "v3.13.6/bio/vep/plugins" - # "0.74.0/bio/vep/plugins" + + +# wrapper: +# "v3.13.6/bio/vep/plugins" +# "0.74.0/bio/vep/plugins" + # Rules are not submitted as a job to the cluster. 
-localrules: vep_cache, vep_plugins +localrules: + vep_cache, + vep_plugins, + # ================================================================================================= # VEP # ================================================================================================= + rule vep: input: + # we use the filtered file if a filtering is done, or the unfiltered if not. calls=( - # we use the filtered file if a filtering is done, or the unfiltered if not. "filtered/all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else "genotyped/all.vcf.gz" @@ -197,12 +198,12 @@ rule vep: caption="../report/vcf.rst", category="Calls", ), + # The html file has to have a specific file name, so that MultiQC can find it, + # see https://multiqc.info/docs/#vep + # At the moment, this is however not yet working, because VEP was only added to + # MultiQC recently, see https://github.com/ewels/MultiQC/issues/1438 + # Will need to update to MultiQC v1.11 at some point. stats=report( - # The html file has to have a specific file name, so that MultiQC can find it, - # see https://multiqc.info/docs/#vep - # At the moment, this is however not yet working, because VEP was only added to - # MultiQC recently, see https://github.com/ewels/MultiQC/issues/1438 - # Will need to update to MultiQC v1.11 at some point. "annotated/vep_summary.html", caption="../report/stats.rst", category="Calls", diff --git a/workflow/rules/calling-bcftools-combined.smk b/workflow/rules/calling-bcftools-combined.smk index 8c0a16d..3c93515 100644 --- a/workflow/rules/calling-bcftools-combined.smk +++ b/workflow/rules/calling-bcftools-combined.smk @@ -2,30 +2,31 @@ # Variant Calling # ================================================================================================= + rule call_variants: input: # Need the ref genome, as well as its indices. ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), - # Get the bam and bai files for all files. If this is too slow, we need to split, # similar to what our GATK HaplotypeCaller rule does. samples=get_all_bams(), indices=get_all_bais(), - # If we use restricted regions, set them here. If not, empty, which will propagate to the # get_mpileup_params function as well. Same for small contig groups. # regions="called/{contig}.regions.bed" if config["settings"].get("restrict-regions") else [] - regions="called/{contig}.regions.bed" if ( - config["settings"].get("restrict-regions") - ) else ( - "contig-groups/{contig}.bed" if ( - config["settings"].get("contig-group-size") - ) else [] - ) + regions=( + "called/{contig}.regions.bed" + if (config["settings"].get("restrict-regions")) + else ( + "contig-groups/{contig}.bed" + if (config["settings"].get("contig-group-size")) + else [] + ) + ), output: vcf=( "called/{contig}.vcf.gz" @@ -33,21 +34,19 @@ rule call_variants: else temp("called/{contig}.vcf.gz") ), # vcf=protected("called/{contig}.vcf.gz") - done=touch("called/{contig}.done") + done=touch("called/{contig}.done"), params: # Optional parameters for bcftools mpileup (except -g, -f). mpileup=get_mpileup_params, - # Optional parameters for bcftools call (except -v, -o, -m). 
- call=config["params"]["bcftools"]["call"] + call=config["params"]["bcftools"]["call"], log: - "logs/bcftools/call-{contig}.log" + "logs/bcftools/call-{contig}.log", benchmark: "benchmarks/bcftools/call-{contig}.bench.log" conda: "../envs/bcftools.yaml" - threads: - config["params"]["bcftools"]["threads"] + threads: config["params"]["bcftools"]["threads"] shell: # We run an additional norm step after the calling, in order to clear any calls that # contain ambiguity chars. As always, no idea how they end up in the vcf in the first place. @@ -58,11 +57,12 @@ rule call_variants: "bcftools norm --check-ref ws --fasta-ref {input.ref} --output-type z --output {output.vcf} - " ") 2> {log}" - # Cannot use the wrapper, as we need `bcftools mpileup` instead of `samtools mpileup` (as used - # by the wrapper), in order to have the `--annotate` option, which we need so that our - # stats rules can work. Man... what a mess! Again! - # wrapper: - # "0.55.1/bio/bcftools/call" + +# Cannot use the wrapper, as we need `bcftools mpileup` instead of `samtools mpileup` (as used +# by the wrapper), in order to have the `--annotate` option, which we need so that our +# stats rules can work. Man... what a mess! Again! +# wrapper: +# "0.55.1/bio/bcftools/call" # Alternative to parallelization over contigs would be to parallelize over equally sized chunks of # the reference genome, see e.g., http://dmnfarrell.github.io/bioinformatics/mpileup-parallel @@ -76,35 +76,39 @@ rule call_variants: # but with different individuals)... But tools tend to name these things differently, so let's # call it merging here... + # Need an input function to work with the fai checkpoint def merge_variants_vcfs_input(wildcards): fai = checkpoints.samtools_faidx.get().output[0] - return expand("called/{contig}.vcf.gz", contig=get_contigs( fai )) + return expand("called/{contig}.vcf.gz", contig=get_contigs(fai)) + # Need index files for some of the downstream tools. # Rational for the fai: see merge_variants_vcfs_input() def merge_variants_tbis_input(wildcards): fai = checkpoints.samtools_faidx.get().output[0] - return expand("called/{contig}.vcf.gz.tbi", contig=get_contigs( fai )) + return expand("called/{contig}.vcf.gz.tbi", contig=get_contigs(fai)) + # bcftools does not automatically create vcf index files, so we need a rule for that... # ... but picard does! So, if we continue using the below picard mergevcfs tool, we don't need tabix # ...... buuuuut vcflib does not! So, we reactivate it again! rule called_vcf_index: input: - "called/{contig}.vcf.gz" + "called/{contig}.vcf.gz", output: - "called/{contig}.vcf.gz.tbi" + "called/{contig}.vcf.gz.tbi", params: # pass arguments to tabix (e.g. index a vcf) - "-p vcf" + "-p vcf", log: - "logs/tabix/called/{contig}.log" + "logs/tabix/called/{contig}.log", conda: "../envs/tabix.yaml" wrapper: "0.55.1/bio/tabix" + # Cannot use bcftools concat, as it does not except vcf/bcf files without any calls in them. # rule merge_variants: # input: @@ -143,6 +147,7 @@ rule called_vcf_index: # wrapper: # "0.51.3/bio/picard/mergevcfs" + # Let's try vcflib instead, hoping that it can handle both of the above use cases. # See above for comments on the input files. # This is not the safest option, as vcflib does little to no checks for file integrity, @@ -153,19 +158,23 @@ rule merge_variants: ref=get_fai, contig_groups=contigs_groups_input, vcfs=merge_variants_vcfs_input, - tbis=merge_variants_tbis_input + tbis=merge_variants_tbis_input, output: # With small contigs, we also need sorting, see below. 
# Unfortunately, we cannot pipe here, as Picard fails with that, so temp file it is... # If we do not use small contigs, we directly output the final file. - vcf = temp("genotyped/merged-all.vcf.gz") if ( - config["settings"].get("contig-group-size") - ) else "genotyped/all.vcf.gz", - done = touch("genotyped/merged-all.done") if ( - config["settings"].get("contig-group-size") - ) else touch("genotyped/all.done") + vcf=( + temp("genotyped/merged-all.vcf.gz") + if (config["settings"].get("contig-group-size")) + else "genotyped/all.vcf.gz" + ), + done=( + touch("genotyped/merged-all.done") + if (config["settings"].get("contig-group-size")) + else touch("genotyped/all.done") + ), log: - "logs/vcflib/merge-genotyped.log" + "logs/vcflib/merge-genotyped.log", benchmark: "benchmarks/vcflib/merge-genotyped.bench.log" conda: diff --git a/workflow/rules/calling-bcftools-individual.smk b/workflow/rules/calling-bcftools-individual.smk index 7ed9531..a4288fd 100644 --- a/workflow/rules/calling-bcftools-individual.smk +++ b/workflow/rules/calling-bcftools-individual.smk @@ -2,29 +2,30 @@ # Variant Calling # ================================================================================================= + rule call_variants: input: # Need the ref genome, as well as its indices. ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), - # Get the sample data. samples=get_sample_bams_wildcards, indices=get_sample_bais_wildcards, - # If we use restricted regions, set them here. If not, empty, which will propagate to the # get_mpileup_params function as well. Same for small contig groups. # regions="called/{contig}.regions.bed" if config["settings"].get("restrict-regions") else [] - regions="called/{contig}.regions.bed" if ( - config["settings"].get("restrict-regions") - ) else ( - "contig-groups/{contig}.bed" if ( - config["settings"].get("contig-group-size") - ) else [] - ) + regions=( + "called/{contig}.regions.bed" + if (config["settings"].get("restrict-regions")) + else ( + "contig-groups/{contig}.bed" + if (config["settings"].get("contig-group-size")) + else [] + ) + ), output: gvcf=( "called/{sample}.{contig}.g.vcf.gz" @@ -32,29 +33,29 @@ rule call_variants: else temp("called/{sample}.{contig}.g.vcf.gz") ), gtbi="called/{sample}.{contig}.g.vcf.gz.tbi", - done=touch("called/{sample}.{contig}.g.done") + done=touch("called/{sample}.{contig}.g.done"), params: # Optional parameters for bcftools mpileup (except -g, -f). mpileup=get_mpileup_params, - # Optional parameters for bcftools call (except -v, -o, -m). - call=config["params"]["bcftools"]["call"] + call=config["params"]["bcftools"]["call"], log: - "logs/bcftools/call-{sample}.{contig}.log" + "logs/bcftools/call-{sample}.{contig}.log", benchmark: "benchmarks/bcftools/call-{sample}.{contig}.bench.log" conda: "../envs/bcftools.yaml" - threads: - config["params"]["bcftools"]["threads"] + threads: config["params"]["bcftools"]["threads"] shell: # We need a normalization step in between, because somehow bcftools spits out data # in some non-normal format by default... 
see https://www.biostars.org/p/404061/#404245 + # See https://www.biostars.org/p/397616/ for details on the annotations + # " -a \"AD,ADF,ADR,DP,SP,INFO/AD,INFO/ADF,INFO/ADR,FORMAT/AD,FORMAT/DP\" --gvcf 0 | " "(" "bcftools mpileup " " {params.mpileup} --fasta-ref {input.ref} --output-type u {input.samples} " - # See https://www.biostars.org/p/397616/ for details on the annotations - # " -a \"AD,ADF,ADR,DP,SP,INFO/AD,INFO/ADF,INFO/ADR,FORMAT/AD,FORMAT/DP\" --gvcf 0 | " + + " -a 'INFO/AD' -a 'FORMAT/AD' -a 'FORMAT/DP' --gvcf 0 | " "bcftools call " " --threads {threads} {params.call} --gvcf 0 --output-type u | " @@ -64,6 +65,7 @@ rule call_variants: "bcftools index --tbi {output.gvcf}" ") &> {log}" + # Potential reference implementation: # https://gist.github.com/lindenb/f5311887f5a5df0abfaf487df4ae2858 @@ -71,33 +73,29 @@ rule call_variants: # Combining Contigs # ================================================================================================= + rule combine_contig: input: # Need the ref genome, as well as its indices. ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), - # Get the sample data, including indices, which are produced above. gvcfs=( - expand( - "called/{sample}.{{contig}}.g.vcf.gz", - sample=config["global"]["sample-names"] - ) + expand("called/{sample}.{{contig}}.g.vcf.gz", sample=config["global"]["sample-names"]) ), indices=expand( - "called/{sample}.{{contig}}.g.vcf.gz.tbi", - sample=config["global"]["sample-names"] - ) + "called/{sample}.{{contig}}.g.vcf.gz.tbi", sample=config["global"]["sample-names"] + ), output: gvcf="called/all.{contig}.g.vcf.gz", gtbi="called/all.{contig}.g.vcf.gz.tbi", gvcflist="called/all.{contig}.g.txt", - done=touch("called/all.{contig}.g.done") + done=touch("called/all.{contig}.g.done"), log: - "logs/bcftools/combine-contig-{contig}.log" + "logs/bcftools/combine-contig-{contig}.log", benchmark: "benchmarks/bcftools/combine-contig-{contig}.bench.log" conda: @@ -116,10 +114,12 @@ rule combine_contig: "bcftools index --tbi {output.gvcf}" ") &> {log}" + # ================================================================================================= # Combining All # ================================================================================================= + # We use the fai file of the reference genome to obtain a list of contigs. This is used as input in # the rule below, which then request all per-contig gvcfs. We need to provide this as an input # function for the rule input, so that the fai snakemake checkpoint gets executed before evaluating. @@ -127,7 +127,8 @@ rule combine_contig: # config file, this is the list of the group names. def combined_contig_gvcfs(wildcards): fai = checkpoints.samtools_faidx.get().output[0] - return expand("called/all.{contig}.g.vcf.gz", contig=get_contigs( fai )) + return expand("called/all.{contig}.g.vcf.gz", contig=get_contigs(fai)) + # We also need a comma-separated list of the contigs, so that bcftools can output # the concatenated entries in the correct order as given in the fai. @@ -139,9 +140,10 @@ def combined_contig_order(wildcards): with open(fai, "r") as f: for line in f: contig = line.split("\t")[0].strip() - contig_list.append( contig ) + contig_list.append(contig) return ",".join(contig_list) + rule combine_all: input: # fai is needed to calculate aggregation over contigs below. 
@@ -150,29 +152,37 @@ rule combine_all: # produced before we use it here to get its content. ref=get_fai, contig_groups=contigs_groups_input, - gvcfs=combined_contig_gvcfs + gvcfs=combined_contig_gvcfs, output: # vcf="genotyped/all.vcf.gz", # tbi="genotyped/all.vcf.gz.tbi", # lst="genotyped/all.txt", # done="genotyped/all.done" - vcf = temp("genotyped/merged-all.vcf.gz") if ( - config["settings"].get("contig-group-size") - ) else "genotyped/all.vcf.gz", - tbi = temp("genotyped/merged-all.vcf.gz.tbi") if ( - config["settings"].get("contig-group-size") - ) else "genotyped/all.vcf.gz.tbi", - lst = temp("genotyped/merged-all.txt") if ( - config["settings"].get("contig-group-size") - ) else "genotyped/all.txt", - done = touch("genotyped/merged-all.done") if ( - config["settings"].get("contig-group-size") - ) else touch("genotyped/all.done") + vcf=( + temp("genotyped/merged-all.vcf.gz") + if (config["settings"].get("contig-group-size")) + else "genotyped/all.vcf.gz" + ), + tbi=( + temp("genotyped/merged-all.vcf.gz.tbi") + if (config["settings"].get("contig-group-size")) + else "genotyped/all.vcf.gz.tbi" + ), + lst=( + temp("genotyped/merged-all.txt") + if (config["settings"].get("contig-group-size")) + else "genotyped/all.txt" + ), + done=( + touch("genotyped/merged-all.done") + if (config["settings"].get("contig-group-size")) + else touch("genotyped/all.done") + ), params: # Use a list of the chromosomes in the same order as the fai, for bcftools to sort the output. - regionorder=combined_contig_order + regionorder=combined_contig_order, log: - "logs/bcftools/combine-all.log" + "logs/bcftools/combine-all.log", benchmark: "benchmarks/bcftools/combine-all.bench.log" conda: @@ -181,14 +191,13 @@ rule combine_all: # Create a list of the input files. We use the order of gvfcs as provided by the fai file # by default, which works if we call per chromosome, so that bcftool has an easier job # with the data, but gives them out of order if we use contig groups insteads. - "echo {input.gvcfs} | sed 's/ /\\n/g' > {output.lst} ; " - # Hence, we further enforce the order that we want by specifing the chromosomes as regions, # see https://github.com/samtools/bcftools/issues/268#issuecomment-105772010, # by providing a list of regions in the order that we want. + "echo {input.gvcfs} | sed 's/ /\\n/g' > {output.lst} ; " "(" "bcftools concat " - " --file-list \"{output.lst}\" --regions \"{params.regionorder}\" --allow-overlaps " + ' --file-list "{output.lst}" --regions "{params.regionorder}" --allow-overlaps ' " --output-type z -o {output.vcf} ; " "bcftools index --tbi {output.vcf}" ") &> {log}" diff --git a/workflow/rules/calling-bcftools.smk b/workflow/rules/calling-bcftools.smk index ad8f063..d8cbbcf 100644 --- a/workflow/rules/calling-bcftools.smk +++ b/workflow/rules/calling-bcftools.smk @@ -10,6 +10,7 @@ if config["data"]["known-variants"]: "as it does not support to call based on known variants." 
) + def get_mpileup_params(wildcards, input): # Start with the user-specified params from the config file params = config["params"]["bcftools"]["mpileup"] @@ -28,20 +29,28 @@ def get_mpileup_params(wildcards, input): params += " --annotate FORMAT/AD,FORMAT/DP" return params + # ================================================================================================= # Variant Calling # ================================================================================================= # Switch to the chosen caller mode +bcftools_mode_good = False if config["params"]["bcftools"]["mode"] == "combined": + bcftools_mode_good = True + include: "calling-bcftools-combined.smk" elif config["params"]["bcftools"]["mode"] == "individual": + bcftools_mode_good = True + include: "calling-bcftools-individual.smk" -else: + +# Stupid workaround for https://github.com/snakemake/snakefmt/issues/239 +if not bcftools_mode_good: raise Exception("Unknown bcftools mode: " + config["params"]["bcftools"]["mode"]) # ================================================================================================= @@ -54,21 +63,21 @@ if config["settings"].get("contig-group-size"): rule sort_variants: input: - vcf = "genotyped/merged-all.vcf.gz", - refdict=genome_dict() + vcf="genotyped/merged-all.vcf.gz", + refdict=genome_dict(), output: - vcf = "genotyped/all.vcf.gz", - done = touch("genotyped/all.done") + vcf="genotyped/all.vcf.gz", + done=touch("genotyped/all.done"), params: # See duplicates-picard.smk for the reason whe need this on MacOS. - extra = ( + extra=( " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" if platform.system() == "Darwin" else "" ), - java_opts=config["params"]["picard"]["SortVcf-java-opts"] + java_opts=config["params"]["picard"]["SortVcf-java-opts"], log: - "logs/picard/sort-genotyped.log" + "logs/picard/sort-genotyped.log", benchmark: "benchmarks/picard/sort-genotyped.bench.log" conda: diff --git a/workflow/rules/calling-freebayes.smk b/workflow/rules/calling-freebayes.smk index f02757b..5d7ff8d 100644 --- a/workflow/rules/calling-freebayes.smk +++ b/workflow/rules/calling-freebayes.smk @@ -4,12 +4,24 @@ import platform # Variant Calling # ================================================================================================= + def know_variants_extra(): if config["data"]["known-variants"]: return " --haplotype-basis-alleles " + config["data"]["known-variants"] else: return "" + +# Need to calculate the threads to be used for freebayes here already, +# see https://github.com/snakemake/snakefmt/issues/240 +# Need to exclude threads that we need for compression: +freebayes_threads = max( + 1, + int(config["params"]["freebayes"]["threads"]) + - int(config["params"]["freebayes"]["compress-threads"]), +) + + rule call_variants: input: # Our custom script needs the genome, with index files, and its fai file. @@ -18,55 +30,45 @@ rule call_variants: ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), fai=get_fai, - # Get the bam and bai files for all files. If this is too slow, we need to split, # similar to what our GATK HaplotypeCaller rule does. # Without bai files, freebayes claims that it recomputes them, but actually crashes... samples=get_all_bams(), indices=get_all_bais(), - # If known variants are set, use this as input to ensure the file exists. 
# We use the file via the know_variants_extra() function call below, # but request it here as well to ensure that it and its index are present. known=config["data"]["known-variants"], knownidx=( - config["data"]["known-variants"] + ".tbi" - if config["data"]["known-variants"] - else [] + config["data"]["known-variants"] + ".tbi" if config["data"]["known-variants"] else [] ), - # If we restict the calling to some regions, use this file here. # regions="called/{contig}.regions.bed" if config["settings"].get("restrict-regions") else [] - regions="called/{contig}.regions.bed" if ( - config["settings"].get("restrict-regions") - ) else ( - "contig-groups/{contig}.bed" if ( - config["settings"].get("contig-group-size") - ) else [] - ) + regions=( + "called/{contig}.regions.bed" + if (config["settings"].get("restrict-regions")) + else ( + "contig-groups/{contig}.bed" + if (config["settings"].get("contig-group-size")) + else [] + ) + ), output: pipe("called/{contig}.vcf"), - touch("called/{contig}.vcf.done") + touch("called/{contig}.vcf.done"), log: - "logs/freebayes/{contig}.log" + "logs/freebayes/{contig}.log", benchmark: "benchmarks/freebayes/{contig}.bench.log" params: # Optional extra parameters. extra=config["params"]["freebayes"]["extra"] + know_variants_extra(), - # Reference genome chunk size for parallelization (default: 100000) - chunksize=config["params"]["freebayes"]["chunksize"] - threads: - # Need to exclude threads that we need for compression - max( - 1, - int(config["params"]["freebayes"]["threads"]) - - int(config["params"]["freebayes"]["compress-threads"]) - ) + chunksize=config["params"]["freebayes"]["chunksize"], + threads: freebayes_threads group: "call_variants" # wrapper: @@ -77,11 +79,12 @@ rule call_variants: # We use our own version of the wrapper here, which improves cluster throughput. "../scripts/freebayes.py" + # Picard does not understand the bcf files that freebayes produces, so we have to take # an unfortunate detour via vcf, and compress on-the-fly using a piped rule from above. rule compress_vcf: input: - "called/{contig}.vcf" + "called/{contig}.vcf", output: ( "called/{contig}.vcf.gz" @@ -89,11 +92,10 @@ rule compress_vcf: else temp("called/{contig}.vcf.gz") ), # protected("called/{contig}.vcf.gz") - touch("called/{contig}.vcf.gz.done") + touch("called/{contig}.vcf.gz.done"), log: - "logs/compress_vcf/{contig}.log" - threads: - config["params"]["freebayes"]["compress-threads"] + "logs/compress_vcf/{contig}.log", + threads: config["params"]["freebayes"]["compress-threads"] group: "call_variants" conda: @@ -103,14 +105,17 @@ rule compress_vcf: # which then already exists once bgzip gets active. "bgzip --force --threads {threads} {input[0]} > {output[0]} 2> {log}" + # ================================================================================================= # Combining Calls # ================================================================================================= + # Need an input function to work with the fai checkpoint def merge_variants_vcfs_input(wildcards): fai = checkpoints.samtools_faidx.get().output[0] - return expand("called/{contig}.vcf.gz", contig=get_contigs( fai )) + return expand("called/{contig}.vcf.gz", contig=get_contigs(fai)) + rule merge_variants: input: @@ -120,22 +125,19 @@ rule merge_variants: # produced before we use it here to get its content. ref=get_fai, contig_groups=contigs_groups_input, - # The wrapper expects input to be called `vcfs`, but we can use `vcf.gz` as well. 
# vcfs=lambda w: expand("called/{contig}.vcf.gz", contig=get_contigs()) - vcfs=merge_variants_vcfs_input + vcfs=merge_variants_vcfs_input, output: vcf="genotyped/all.vcf.gz", - done=touch("genotyped/all.done") + done=touch("genotyped/all.done"), params: # See duplicates-picard.smk for the reason whe need this on MacOS. - extra = ( - " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" - if platform.system() == "Darwin" - else "" - ) + extra=( + " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" if platform.system() == "Darwin" else "" + ), log: - "logs/picard/merge-genotyped.log" + "logs/picard/merge-genotyped.log", benchmark: "benchmarks/picard/merge-genotyped.bench.log" conda: diff --git a/workflow/rules/calling-haplotypecaller.smk b/workflow/rules/calling-haplotypecaller.smk index 8610ce4..e198e42 100644 --- a/workflow/rules/calling-haplotypecaller.smk +++ b/workflow/rules/calling-haplotypecaller.smk @@ -4,47 +4,50 @@ import platform # Variant Calling # ================================================================================================= + # Combine all params to call gatk. We may want to set regions, we set that bit of multithreading # that gatk is capable of (not much, but the best we can do without spark...), and we add # all additional params from the config file. def get_gatk_call_variants_params(wildcards, input): return ( - get_gatk_regions_param(regions=input.regions, default="--intervals '{}'".format(wildcards.contig)) - + " --native-pair-hmm-threads " + str(config["params"]["gatk"]["HaplotypeCaller-threads"]) + " " + get_gatk_regions_param( + regions=input.regions, default="--intervals '{}'".format(wildcards.contig) + ) + + " --native-pair-hmm-threads " + + str(config["params"]["gatk"]["HaplotypeCaller-threads"]) + + " " + config["params"]["gatk"]["HaplotypeCaller-extra"] ) + rule call_variants: input: # Get the sample data. bam=get_sample_bams_wildcards, bai=get_sample_bais_wildcards, - # Get the reference genome, as well as its indices. ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), refdict=genome_dict(), - # If known variants are set in the config, use then, and require the index file as well. known=config["data"]["known-variants"], knownidx=( - config["data"]["known-variants"] + ".tbi" - if config["data"]["known-variants"] - else [] + config["data"]["known-variants"] + ".tbi" if config["data"]["known-variants"] else [] ), - # Further settings for region constraint filter. 
# regions="called/{contig}.regions.bed" if config["settings"].get("restrict-regions") else [] - regions="called/{contig}.regions.bed" if ( - config["settings"].get("restrict-regions") - ) else ( - "contig-groups/{contig}.bed" if ( - config["settings"].get("contig-group-size") - ) else [] - ) + regions=( + "called/{contig}.regions.bed" + if (config["settings"].get("restrict-regions")) + else ( + "contig-groups/{contig}.bed" + if (config["settings"].get("contig-group-size")) + else [] + ) + ), output: gvcf=( "called/{sample}.{contig}.g.vcf.gz" @@ -57,23 +60,22 @@ rule call_variants: if config["settings"]["keep-intermediate"]["calling"] else temp("called/{sample}.{contig}.g.vcf.gz.tbi") ), - done=touch("called/{sample}.{contig}.g.done") + done=touch("called/{sample}.{contig}.g.done"), log: - "logs/gatk/haplotypecaller/{sample}.{contig}.log" + "logs/gatk/haplotypecaller/{sample}.{contig}.log", benchmark: "benchmarks/gatk/haplotypecaller/{sample}.{contig}.bench.log" - threads: - # Need to set threads here so that snakemake can plan the job scheduling properly - config["params"]["gatk"]["HaplotypeCaller-threads"] + # Need to set threads here so that snakemake can plan the job scheduling properly + threads: config["params"]["gatk"]["HaplotypeCaller-threads"] # resources: - # Increase time limit in factors of 24h, if the job fails due to time limit. - # time = lambda wildcards, input, threads, attempt: int(1440 * int(attempt)) + # Increase time limit in factors of 24h, if the job fails due to time limit. + # time = lambda wildcards, input, threads, attempt: int(1440 * int(attempt)) params: # The function here is where the contig variable is propagated to haplotypecaller. # Took me a while to figure this one out... # Contigs are used as long as no restrict-regions are given in the config file. extra=get_gatk_call_variants_params, - java_opts=config["params"]["gatk"]["HaplotypeCaller-java-opts"] + java_opts=config["params"]["gatk"]["HaplotypeCaller-java-opts"], group: "call_variants" conda: @@ -82,6 +84,7 @@ rule call_variants: wrapper: "0.51.3/bio/gatk/haplotypecaller" + # Deactivated the below, as this was causing trouble. Got the warning # Warning: the following output files of rule vcf_index_gatk were not present when the DAG was created: # {'called/S3.chloroplast.g.vcf.gz.tbi'} @@ -113,6 +116,7 @@ rule call_variants: # Combining Calls # ================================================================================================= + rule combine_calls: input: # Get the reference genome and its indices. Not sure if the indices are needed @@ -120,35 +124,31 @@ rule combine_calls: ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), refdict=genome_dict(), - # Get the sample data, including indices. 
- gvcfs=expand( - "called/{sample}.{{contig}}.g.vcf.gz", - sample=config["global"]["sample-names"] - ), + gvcfs=expand("called/{sample}.{{contig}}.g.vcf.gz", sample=config["global"]["sample-names"]), indices=expand( - "called/{sample}.{{contig}}.g.vcf.gz.tbi", - sample=config["global"]["sample-names"] - ) + "called/{sample}.{{contig}}.g.vcf.gz.tbi", sample=config["global"]["sample-names"] + ), output: gvcf=( "called/all.{contig}.g.vcf.gz" if config["settings"]["keep-intermediate"]["calling"] else temp("called/all.{contig}.g.vcf.gz") ), - done=touch("called/all.{contig}.g.done") + done=touch("called/all.{contig}.g.done"), params: - extra=config["params"]["gatk"]["CombineGVCFs-extra"] + ( + extra=config["params"]["gatk"]["CombineGVCFs-extra"] + + ( " --dbsnp " + config["data"]["known-variants"] + " " if config["data"]["known-variants"] else "" ), - java_opts=config["params"]["gatk"]["CombineGVCFs-java-opts"] + java_opts=config["params"]["gatk"]["CombineGVCFs-java-opts"], log: - "logs/gatk/combine-gvcfs/{contig}.log" + "logs/gatk/combine-gvcfs/{contig}.log", benchmark: "benchmarks/gatk/combine-gvcfs/{contig}.bench.log" # group: @@ -158,6 +158,7 @@ rule combine_calls: wrapper: "0.51.3/bio/gatk/combinegvcfs" + rule genotype_variants: input: # Get the reference genome and its indices. Not sure if the indices are needed @@ -165,27 +166,27 @@ rule genotype_variants: ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), refdict=genome_dict(), - - gvcf="called/all.{contig}.g.vcf.gz" + gvcf="called/all.{contig}.g.vcf.gz", output: vcf=( "genotyped/all.{contig}.vcf.gz" if config["settings"]["keep-intermediate"]["calling"] else temp("genotyped/all.{contig}.vcf.gz") ), - done=touch("genotyped/all.{contig}.done") + done=touch("genotyped/all.{contig}.done"), params: - extra=config["params"]["gatk"]["GenotypeGVCFs-extra"] + ( + extra=config["params"]["gatk"]["GenotypeGVCFs-extra"] + + ( " --dbsnp " + config["data"]["known-variants"] + " " if config["data"]["known-variants"] else "" ), - java_opts=config["params"]["gatk"]["GenotypeGVCFs-java-opts"] + java_opts=config["params"]["gatk"]["GenotypeGVCFs-java-opts"], log: - "logs/gatk/genotype-gvcfs/{contig}.log" + "logs/gatk/genotype-gvcfs/{contig}.log", benchmark: "benchmarks/gatk/genotype-gvcfs/{contig}.bench.log" # group: @@ -195,14 +196,17 @@ rule genotype_variants: wrapper: "0.51.3/bio/gatk/genotypegvcfs" + # ================================================================================================= # Merging Variants # ================================================================================================= + # Need an input function to work with the fai checkpoint def merge_variants_vcfs_input(wildcards): fai = checkpoints.samtools_faidx.get().output[0] - return expand("genotyped/all.{contig}.vcf.gz", contig=get_contigs( fai )) + return expand("genotyped/all.{contig}.vcf.gz", contig=get_contigs(fai)) + rule merge_variants: input: @@ -212,21 +216,18 @@ rule merge_variants: # produced before we use it here to get its content. 
ref=get_fai, contig_groups=contigs_groups_input, - # vcfs=lambda w: expand("genotyped/all.{contig}.vcf.gz", contig=get_contigs()) - vcfs=merge_variants_vcfs_input + vcfs=merge_variants_vcfs_input, output: vcf="genotyped/all.vcf.gz", - done=touch("genotyped/all.done") + done=touch("genotyped/all.done"), params: # See duplicates-picard.smk for the reason whe need this on MacOS. - extra = ( - " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" - if platform.system() == "Darwin" - else "" - ) + extra=( + " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" if platform.system() == "Darwin" else "" + ), log: - "logs/picard/merge-genotyped.log" + "logs/picard/merge-genotyped.log", benchmark: "benchmarks/picard/merge-genotyped.bench.log" conda: diff --git a/workflow/rules/calling.smk b/workflow/rules/calling.smk index ed2aa48..95a243d 100644 --- a/workflow/rules/calling.smk +++ b/workflow/rules/calling.smk @@ -4,11 +4,13 @@ import json # Get Fai # ================================================================================================= + def get_fai(wildcards): # Stop at the snakemake checkpoint first to ensure that the fai file is available. return checkpoints.samtools_faidx.get().output[0] # return config["data"]["reference-genome"] + ".fai" + # ================================================================================================= # Grouping of (Small) Contigs # ================================================================================================= @@ -18,10 +20,10 @@ if config["settings"].get("contig-group-size", 0) > 0: # Simple greedy solver for the bin packing problem, here used to make bins of roughly equal # size for the contigs. We expect the input values to be tuples or pairs, where the index [1] # element is the weight that we use for packing (index[0] here is used for the contig name itself). - def solve_bin_packing( values, max_bin_size ): + def solve_bin_packing(values, max_bin_size): # First, sort by length, large ones first, decreasing. # This helps to get closer to an optimal solution. - values.sort(key = lambda x: x[1], reverse=True) + values.sort(key=lambda x: x[1], reverse=True) # Fill the bins as needed, using first-fit on the sorted list, and keeping track of # how much we already put in each of them. We can have at most as many bins as elements, @@ -31,9 +33,9 @@ if config["settings"].get("contig-group-size", 0) > 0: for cont in values: # Find the first bin where the contig fits in. j = 0 - while( j < len(bins) ): - if( sums[j] + cont[1] <= max_bin_size ): - bins[j].append( cont ) + while j < len(bins): + if sums[j] + cont[1] <= max_bin_size: + bins[j].append(cont) sums[j] += cont[1] break j += 1 @@ -43,7 +45,7 @@ if config["settings"].get("contig-group-size", 0) > 0: # if some chromosomes are fully assembled and are hence not in scaffolds of small size. if j == len(bins): bins.append([]) - bins[j].append( cont ) + bins[j].append(cont) sums[j] += cont[1] # Log output @@ -56,13 +58,13 @@ if config["settings"].get("contig-group-size", 0) > 0: checkpoint contig_groups: input: - fai = get_fai + fai=get_fai, output: - "contig-groups/contigs.json" + "contig-groups/contigs.json", log: - "logs/contig-groups/contigs.log" + "logs/contig-groups/contigs.log", params: - contig_group_size = config["settings"].get("contig-group-size", 0) + contig_group_size=config["settings"].get("contig-group-size", 0), run: # Read fai to get all contigs and their sizes. 
contig_list = [] @@ -71,12 +73,12 @@ if config["settings"].get("contig-group-size", 0) > 0: contig, length_str = line.split("\t")[:2] contig = contig.strip() length = int(length_str.strip()) - contig_list.append(( contig, length )) + contig_list.append((contig, length)) - # Solve the bin packing for the contigs, to get a close to optimal solution - # for putting them in groups. Large contigs (e.g., whole chromosomes) that are larger - # than the bin size will simply get their own (overflowing...) bin. - contig_bins = solve_bin_packing( contig_list, params.contig_group_size ) + # Solve the bin packing for the contigs, to get a close to optimal solution + # for putting them in groups. Large contigs (e.g., whole chromosomes) that are larger + # than the bin size will simply get their own (overflowing...) bin. + contig_bins = solve_bin_packing(contig_list, params.contig_group_size) # Now turn the contig bins into groups for the result of this function. # We store our resulting list of contigs containing all contigs, @@ -87,12 +89,15 @@ if config["settings"].get("contig-group-size", 0) > 0: groupname = "contig-group-" + str(len(contigs)) contigs[groupname] = bin - # We need to store the result in a file, so that the rule that creates the per-contig - # files can access it. - json.dump( contigs, open( output[0], 'w' )) + # We need to store the result in a file, so that the rule that creates the per-contig + # files can access it. + json.dump(contigs, open(output[0], "w")) - # Rule is not submitted as a job to the cluster. - localrules: contig_groups + # Rule is not submitted as a job to the cluster. + + + localrules: + contig_groups, # Due to a new bug in Snakemake after our update to 8.15.2, we now need the following # function to be called as input whenever the above checkpoint is needed. @@ -100,32 +105,35 @@ if config["settings"].get("contig-group-size", 0) > 0: def contigs_groups_input(wildcards): return checkpoints.contig_groups.get().output[0] - # Make the contig-group list files that contain the names of the contigs/scaffolds - # that have been bin-packed above. + # Make the contig-group list files that contain the names of the contigs/scaffolds + # that have been bin-packed above. rule contigs_group_list: input: - contigs="contig-groups/contigs.json" + contigs="contig-groups/contigs.json", output: - "contig-groups/{contig}.bed" + "contig-groups/{contig}.bed", log: - "logs/contig-groups/{contig}.log" + "logs/contig-groups/{contig}.log", run: # Get the contigs file that we created above. - contigs = json.load( open( input.contigs )) + contigs = json.load(open(input.contigs)) # Same for the group name itself: This rule is only executed # for group names that we actually have made. if wildcards.contig not in contigs: - raise Exception( "Internal error: contig " + wildcards.contig + " not found." ) + raise Exception("Internal error: contig " + wildcards.contig + " not found.") - # Write the output list file, using the contig names and lengths from the group. - # In bed files, the first three columns are the chrom name, start (incluse, zero-based), - # and end (exclusive). Hence, we can simply use 0 and length as start and end here. + # Write the output list file, using the contig names and lengths from the group. + # In bed files, the first three columns are the chrom name, start (incluse, zero-based), + # and end (exclusive). Hence, we can simply use 0 and length as start and end here. 
with open(output[0], "w") as f: - f.writelines( f"{c[0]}\t0\t{c[1]}\n" for c in contigs[wildcards.contig] ) + f.writelines(f"{c[0]}\t0\t{c[1]}\n" for c in contigs[wildcards.contig]) + + # Rule is not submitted as a job to the cluster. - # Rule is not submitted as a job to the cluster. - localrules: contigs_group_list + + localrules: + contigs_group_list, # Conflicts of interest: if config["settings"].get("restrict-regions"): @@ -136,21 +144,26 @@ if config["settings"].get("contig-group-size", 0) > 0: "https://github.com/lczech/grenepipe/issues and we will see what we can do." ) - # We now extended the rules for bcftools and freebayes to also work with small contig groups. - # The following check is no longer needed - just kept here for reference. - # if config["settings"]["calling-tool"] != "haplotypecaller": - # raise Exception( - # "Can only use setting contig-group-size with calling-tool haplotypecaller " - # "at the moment, as we have not implemented this for other calling tools yet. " - # "If you need this combination of settings, please submit an issue to " - # "https://github.com/lczech/grenepipe/issues and we will see what we can do." - # ) + # We now extended the rules for bcftools and freebayes to also work with small contig groups. + # The following check is no longer needed - just kept here for reference. + # if config["settings"]["calling-tool"] != "haplotypecaller": + # raise Exception( + # "Can only use setting contig-group-size with calling-tool haplotypecaller " + # "at the moment, as we have not implemented this for other calling tools yet. " + # "If you need this combination of settings, please submit an issue to " + # "https://github.com/lczech/grenepipe/issues and we will see what we can do." + # ) + + +# Dummy definition of the above rule for when we are not using contig groups. +# We cannot do this as an else branch of the above, due to an issue in the +# formatting of snakemfmt: https://github.com/snakemake/snakefmt/issues/115#issuecomment-986860293 +if config["settings"].get("contig-group-size", 0) == 0: -else: - # Dummy definition of the above rule for when we are not using contig groups. def contigs_groups_input(wildcards): return [] + # ================================================================================================= # Get Contigs # ================================================================================================= @@ -159,17 +172,17 @@ else: if "contigs" in config["global"]: raise Exception("Config key 'global:contigs' already defined. Someone messed with our setup.") + # Get the list of chromosome names that are present in the fai file. # This is just the first column of the file. -def get_chromosomes( fai ): - return list( - pd.read_csv( fai, sep='\t', header=None, usecols=[0], dtype=str ).squeeze() - ) +def get_chromosomes(fai): + return list(pd.read_csv(fai, sep="\t", header=None, usecols=[0], dtype=str).squeeze()) + # Contigs in reference genome. This function gives the list of contig names that we want to # use for calling, that is, either all contigs of the reference genome, or, if contig grouping # is activated, the names for the contig groups that are sent as one job combining multiple contigs. -def get_contigs( fai ): +def get_contigs(fai): # This function might be called multiple times in different invocations of rules. # To avoid re-computing the contigs, we cache them in the global config dict. 
global config @@ -187,16 +200,17 @@ def get_contigs( fai ): # pickling issue occurs downstream when snakemake tries to pickle the config for usage # in other rules... contig_group_file = checkpoints.contig_groups.get().output[0] - contigs = json.load( open( contig_group_file )) + contigs = json.load(open(contig_group_file)) config["global"]["contigs"] = list(contigs.keys()) return config["global"]["contigs"] # Without contig groups, just read the fai and return its first column, # which contains the ref sequence names (our contigs). Store it in the global variable # first to not have to do the reading each time. - config["global"]["contigs"] = get_chromosomes( fai ) + config["global"]["contigs"] = get_chromosomes(fai) return config["global"]["contigs"] + # ================================================================================================= # Restrict Regions # ================================================================================================= @@ -204,20 +218,23 @@ def get_contigs( fai ): # Intersect the restict regions file with a given contig (chromosome), so that we can use the # resulting bed file for parallelization over contigs. if "restrict-regions" in config["settings"]: + rule compose_regions: input: - config["settings"].get("restrict-regions") + config["settings"].get("restrict-regions"), output: - "called/{contig}.regions.bed" + "called/{contig}.regions.bed", log: - "logs/bedextract/{contig}.regions.log" + "logs/bedextract/{contig}.regions.log", conda: "../envs/bedops.yaml" shell: "bedextract {wildcards.contig} {input} > {output}" # Rule is not submitted as a job to the cluster. - localrules: compose_regions + localrules: + compose_regions, + # ================================================================================================= # Variant Calling diff --git a/workflow/rules/damage.smk b/workflow/rules/damage.smk index 24689a4..c4f53a7 100644 --- a/workflow/rules/damage.smk +++ b/workflow/rules/damage.smk @@ -4,28 +4,28 @@ import platform # mapDamage # ================================================================================================= + rule mapdamage: input: # Get either the normal mapping output, or, if additional filtering via `samtools view` # is set in the config settings: filter-mapped-reads, use the filtered output instead. get_mapped_reads, # "mapped/{sample}-{unit}.sorted.bam" - # Get the reference genome and its indices. Not sure if the indices are needed # for this particular rule, but doesn't hurt to include them as an input anyway. ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), output: - "mapdamage/{sample}/Runtime_log.txt" + "mapdamage/{sample}/Runtime_log.txt", params: index=config["data"]["reference-genome"], extra=config["params"]["mapdamage"]["extra"], - outdir="mapdamage/{sample}" + outdir="mapdamage/{sample}", log: - "logs/mapdamage/{sample}.log" + "logs/mapdamage/{sample}.log", conda: # We have two different env yaml files, depending on the platform. 
# This is because on Linux, a particular lib might be missing that is needed for mapdamage, @@ -38,56 +38,59 @@ rule mapdamage: shell: "mapDamage -i {input[0]} -r {params.index} -d {params.outdir} {params.extra} > {log} 2>&1" + rule mapdamage_collect: input: - expand( - "mapdamage/{sample}/Runtime_log.txt", - sample=config["global"]["sample-names"] - ) + expand("mapdamage/{sample}/Runtime_log.txt", sample=config["global"]["sample-names"]), output: - touch("mapdamage/mapdamage.done") + touch("mapdamage/mapdamage.done"), + + +localrules: + mapdamage_collect, -localrules: mapdamage_collect # ================================================================================================= # DamageProfiler # ================================================================================================= + rule damageprofiler: input: # Get either the normal mapping output, or, if additional filtering via `samtools view` # is set in the config settings: filter-mapped-reads, use the filtered output instead. get_mapped_reads, # "mapped/{sample}-{unit}.sorted.bam" - # Get the reference genome and its indices. Not sure if the indices are needed # for this particular rule, but doesn't hurt to include them as an input anyway. ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), output: - "damageprofiler/{sample}/DamageProfiler.log" + "damageprofiler/{sample}/DamageProfiler.log", params: index=config["data"]["reference-genome"], extra=config["params"]["damageprofiler"]["extra"], - outdir="damageprofiler/{sample}" + outdir="damageprofiler/{sample}", log: - "logs/damageprofiler/{sample}.log" + "logs/damageprofiler/{sample}.log", conda: "../envs/damageprofiler.yaml" shell: "damageprofiler -i {input[0]} -r {params.index} -o {params.outdir} {params.extra} > {log} 2>&1" # "java -jar DamageProfiler-0.5.0.jar " + rule damageprofiler_collect: input: expand( - "damageprofiler/{sample}/DamageProfiler.log", - sample=config["global"]["sample-names"] - ) + "damageprofiler/{sample}/DamageProfiler.log", sample=config["global"]["sample-names"] + ), output: - touch("damageprofiler/damageprofiler.done") + touch("damageprofiler/damageprofiler.done"), + -localrules: damageprofiler_collect +localrules: + damageprofiler_collect, diff --git a/workflow/rules/duplicates-dedup.smk b/workflow/rules/duplicates-dedup.smk index 0ca977a..227def6 100644 --- a/workflow/rules/duplicates-dedup.smk +++ b/workflow/rules/duplicates-dedup.smk @@ -8,6 +8,7 @@ if len(config["params"]["samtools"]["temp-dir"]) > 0: # Used in `mapping.smk` + # The dedup documentation says that -o is an output file, but the program then complains # that it needs a directory instead... and then litters the output directory with all kinds # of files that we do not want. So, let's use a shadow directory, and let's pray that the file naming @@ -26,18 +27,18 @@ rule mark_duplicates: # is set in the config settings: filter-mapped-reads, use the filtered output instead, # or the clipped ones, if those were requested. # We always use the merged units per sample here. 
- get_mapped_reads + get_mapped_reads, output: bam=temp("dedup/{sample}_rmdup.bam"), metrics="dedup/{sample}.dedup.json", - done=touch("dedup/{sample}.dedup.done") + done=touch("dedup/{sample}.dedup.done"), log: - "logs/dedup/{sample}.log" + "logs/dedup/{sample}.log", benchmark: "benchmarks/dedup/{sample}.bench.log" params: extra=config["params"]["dedup"]["extra"], - out_dir="dedup" + out_dir="dedup", conda: "../envs/dedup.yaml" group: @@ -53,23 +54,24 @@ rule mark_duplicates: " dedup/{wildcards.sample}." + get_mapped_read_infix() + ".dedup.json" " dedup/{wildcards.sample}.dedup.json" + rule sort_reads_dedup: input: - "dedup/{sample}_rmdup.bam" + "dedup/{sample}_rmdup.bam", output: ( "dedup/{sample}.bam" if config["settings"]["keep-intermediate"]["mapping"] else temp("dedup/{sample}.bam") ), - touch("dedup/{sample}.done") + touch("dedup/{sample}.done"), params: extra=config["params"]["samtools"]["sort"], - tmp_dir=config["params"]["samtools"]["temp-dir"] - threads: # Samtools takes additional threads through its option -@ - 1 # This value - 1 will be sent to -@. Weird flex, but okay. + tmp_dir=config["params"]["samtools"]["temp-dir"], + # Samtools takes additional threads through its option -@ + threads: 1 # This value - 1 will be sent to -@. Weird flex, but okay. log: - "logs/samtools/sort/{sample}-dedup.log" + "logs/samtools/sort/{sample}-dedup.log", group: "mapping_extra" conda: diff --git a/workflow/rules/duplicates-picard.smk b/workflow/rules/duplicates-picard.smk index d7ded39..d8204f9 100644 --- a/workflow/rules/duplicates-picard.smk +++ b/workflow/rules/duplicates-picard.smk @@ -6,13 +6,14 @@ import platform # Used in `mapping.smk` + rule mark_duplicates: input: # Get either the normal mapping output, or, if additional filtering via `samtools view` # is set in the config settings: filter-mapped-reads, use the filtered output instead, # or the clipped ones, if those were requested. # We always use the merged units per sample here. - get_mapped_reads + get_mapped_reads, output: bam=( "dedup/{sample}.bam" @@ -20,9 +21,9 @@ rule mark_duplicates: else temp("dedup/{sample}.bam") ), metrics="qc/dedup/{sample}.metrics.txt", - done=touch("dedup/{sample}.done") + done=touch("dedup/{sample}.done"), log: - "logs/picard/dedup/{sample}.log" + "logs/picard/dedup/{sample}.log", benchmark: "benchmarks/picard/dedup/{sample}.bench.log" params: @@ -31,12 +32,10 @@ rule mark_duplicates: # and some system libraries, leading the JRE to exit with fatal error SIGSEGV caused by # libgkl_compression, see https://github.com/broadinstitute/picard/issues/1329. # Hence, on MacOS, we add the two settings recommended by the github issue. 
- config["params"]["picard"]["MarkDuplicates-java-opts"] + " " + - config["params"]["picard"]["MarkDuplicates"] + ( - " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" - if platform.system() == "Darwin" - else "" - ) + config["params"]["picard"]["MarkDuplicates-java-opts"] + + " " + + config["params"]["picard"]["MarkDuplicates"] + + (" USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" if platform.system() == "Darwin" else ""), group: "mapping_extra" conda: diff --git a/workflow/rules/filtering-bcftools-filter.smk b/workflow/rules/filtering-bcftools-filter.smk index 9b9a602..90ab09d 100644 --- a/workflow/rules/filtering-bcftools-filter.smk +++ b/workflow/rules/filtering-bcftools-filter.smk @@ -2,26 +2,28 @@ # bcftools filter # ================================================================================================= + # Need a function, in order to use wildcards within params dicts... def get_filter(wildcards): return config["params"]["bcftools-filter"][wildcards.vartype] + rule bcftools_filter_calls: input: - vcf="filtered/all.{vartype}.selected.vcf.gz" + vcf="filtered/all.{vartype}.selected.vcf.gz", output: vcf=( "filtered/all.{vartype}.filtered.vcf.gz" if config["settings"]["keep-intermediate"]["filtering"] else temp("filtered/all.{vartype}.filtered.vcf.gz") ), - done=touch("filtered/all.{vartype}.filtered.done") + done=touch("filtered/all.{vartype}.filtered.done"), params: tool=config["params"]["bcftools-filter"]["tool"], filters=get_filter, - extra=config["params"]["bcftools-filter"]["extra"] + extra=config["params"]["bcftools-filter"]["extra"], log: - "logs/bcftools/filter-{vartype}.log" + "logs/bcftools/filter-{vartype}.log", benchmark: "benchmarks/bcftools/filter-{vartype}.bench.log" group: diff --git a/workflow/rules/filtering-gatk-variantfiltration.smk b/workflow/rules/filtering-gatk-variantfiltration.smk index fe00a6f..f31d885 100644 --- a/workflow/rules/filtering-gatk-variantfiltration.smk +++ b/workflow/rules/filtering-gatk-variantfiltration.smk @@ -2,32 +2,34 @@ # Hard Filtering # ================================================================================================= + # Need a function, in order to use wildcards within params dicts... 
def get_filter(wildcards): return { - wildcards.vartype + "-hard-filter": - config["params"]["gatk-variantfiltration"][wildcards.vartype] + wildcards.vartype + + "-hard-filter": config["params"]["gatk-variantfiltration"][wildcards.vartype] } + # Simple hard filter, used by default rule gatk_hard_filter_calls: input: ref=config["data"]["reference-genome"], vcf="filtered/all.{vartype}.selected.vcf.gz", - refdict=genome_dict() + refdict=genome_dict(), output: vcf=( "filtered/all.{vartype}.filtered.vcf.gz" if config["settings"]["keep-intermediate"]["filtering"] else temp("filtered/all.{vartype}.filtered.vcf.gz") ), - done=touch("filtered/all.{vartype}.filtered.done") + done=touch("filtered/all.{vartype}.filtered.done"), params: filters=get_filter, extra=config["params"]["gatk-variantfiltration"]["extra"], - java_opts=config["params"]["gatk-variantfiltration"]["java-opts"] + java_opts=config["params"]["gatk-variantfiltration"]["java-opts"], log: - "logs/gatk/variantfiltration/{vartype}.log" + "logs/gatk/variantfiltration/{vartype}.log", benchmark: "benchmarks/gatk/variantfiltration/{vartype}.bench.log" group: diff --git a/workflow/rules/filtering-gatk-vqsr.smk b/workflow/rules/filtering-gatk-vqsr.smk index 10a4738..0f09b75 100644 --- a/workflow/rules/filtering-gatk-vqsr.smk +++ b/workflow/rules/filtering-gatk-vqsr.smk @@ -4,6 +4,7 @@ import json # VQSR Recalibration and Filtering # ================================================================================================= + # We want our config file to be a bit flexible, and allow a dict or a list of dicts to be provided, # so here we need to unpack those. Python wants this to be complicated, so we use a detour via json, # see https://stackoverflow.com/a/27373027/4184258 @@ -20,59 +21,65 @@ def ordered_dict_to_dict(value): # print(value) return value + # We need to turn our config file into named inputs for the resources, # as this is the format that the below wrapper expects. def get_variant_recalibrator_resource_files(): - rf = ordered_dict_to_dict( config["params"]["gatk-vqsr"]["resource-files"] ) + rf = ordered_dict_to_dict(config["params"]["gatk-vqsr"]["resource-files"]) if any(x in rf.keys() for x in ["ref", "vcf", "tbi"]): raise Exception("Cannot use resource named 'ref', 'vcf', or 'tbi' with our GATK VQSR.") return rf + # We also add the tbi files as dependencies, so that their existence is checked. def get_variant_recalibrator_resource_file_tbis(): rf = get_variant_recalibrator_resource_files() tbi = [] for k in rf.keys(): - tbi.append( rf[k] + ".tbi" ) + tbi.append(rf[k] + ".tbi") return tbi + def get_variant_recalibrator_resources(): - res = ordered_dict_to_dict( config["params"]["gatk-vqsr"]["resources"] ) + res = ordered_dict_to_dict(config["params"]["gatk-vqsr"]["resources"]) for rn, rv in res.items(): - if isinstance( rv, list ): + if isinstance(rv, list): nv = {} for erv in rv: - for k,v in erv.items(): - nv[ k ] = v + for k, v in erv.items(): + nv[k] = v res[rn] = nv return res + def get_variant_recalibrator_extra(wildcards): # Need to do this in a function, as wildcard replacement otherwise does not work within # config file dict retrieval. We additionally set the Rscript file, so that we get some plots. vartype = wildcards.vartype return ( - config["params"]["gatk-vqsr"]["extra-variantrecalibrator-" + vartype] + - " --rscript-file filtered/all." + vartype + ".vqsr-recal.plots.R" + config["params"]["gatk-vqsr"]["extra-variantrecalibrator-" + vartype] + + " --rscript-file filtered/all." 
+ + vartype + + ".vqsr-recal.plots.R" ) + def get_apply_vqsr_extra(wildcards): # Same as above, need a function here for wildcard replacement return config["params"]["gatk-vqsr"]["extra-applyvqsr-" + wildcards.vartype] + # Use the GATK VQSR machine learning based recalibration of quality scores instead of hard filtering. # This is a two step process: first, we estimate parameters, second (below) we filter the data. rule gatk_variant_recalibrator: input: + **get_variant_recalibrator_resource_files(), ref=config["data"]["reference-genome"], vcf="filtered/all.{vartype}.selected.vcf.gz", refdict=genome_dict(), - # Resources have to be given as named input files, we unpack the dict to get them. # We also request the tbi index files, so that users get a nice error message if missing. - **get_variant_recalibrator_resource_files(), tbi=get_variant_recalibrator_resource_file_tbis(), - output: # Ouput file needs to be called vcf for the wrapper, but is in fact a fake vcf # that actually contains the recal information. @@ -81,35 +88,32 @@ rule gatk_variant_recalibrator: # We also might produce a plot PDF about the trances - but only for the SNPs, # not for the INDELs, so we do not specify it here for simplicity... # The avid user will find it in the `filtered` directory either way. - done=touch("filtered/all.{vartype}.vqsr-recal.done") + done=touch("filtered/all.{vartype}.vqsr-recal.done"), params: # set mode, must be either SNP, INDEL or BOTH mode="{vartype}", - # Resource parameter definition. Key must match named input files from above. # Snakemake/Python makes this a bit difficult with the config file... - resources = get_variant_recalibrator_resources(), - + resources=get_variant_recalibrator_resources(), # which fields to use with -an (see VariantRecalibrator docs) annotation=config["params"]["gatk-vqsr"]["annotation"], - # Extras extra=get_variant_recalibrator_extra, java_opts=config["params"]["gatk-vqsr"]["java-variantrecalibrator"], log: - "logs/gatk/variantrecalibrator/{vartype}.log" + "logs/gatk/variantrecalibrator/{vartype}.log", benchmark: "benchmarks/gatk/variantrecalibrator/{vartype}.bench.log" # Group deactivated, so that it can run in parallel for SNP and INDEL # group: # "filtering" - resources: - # optional specification of memory usage of the JVM that snakemake will respect with global - # resource restrictions - # (https://snakemake.readthedocs.io/en/latest/snakefiles/rules.html#resources) - # and which can be used to request RAM during cluster job submission as `{resources.mem_mb}`: - # https://snakemake.readthedocs.io/en/latest/executing/cluster.html#job-properties - # mem_mb=1024 + # resources: + # optional specification of memory usage of the JVM that snakemake will respect with global + # resource restrictions + # (https://snakemake.readthedocs.io/en/latest/snakefiles/rules.html#resources) + # and which can be used to request RAM during cluster job submission as `{resources.mem_mb}`: + # https://snakemake.readthedocs.io/en/latest/executing/cluster.html#job-properties + # mem_mb=1024 conda: # We overwrite the original yaml, as it does not contain the R specifications, # which we want in order to also plot the trances file for SNPs... because why not. 
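The json detour used for the VQSR configuration above can be exercised in isolation: json.loads(json.dumps(x)) strips the OrderedDict types that the YAML config loading produces, and the per-resource list of single-entry dicts is then folded into one dict per resource, as in get_variant_recalibrator_resources(). A small sketch with a made-up resource entry follows; the resource name and attributes are illustrative only.

import json
from collections import OrderedDict

def to_plain_dict(value):
    # Round-trip through json to turn nested OrderedDicts into plain dicts/lists.
    return json.loads(json.dumps(value))

# Hypothetical excerpt mimicking a "params: gatk-vqsr: resources" config entry.
resources = OrderedDict(
    hapmap=[OrderedDict(known=False), OrderedDict(training=True), OrderedDict(prior=15.0)]
)

res = to_plain_dict(resources)
for name, entries in res.items():
    if isinstance(entries, list):
        merged = {}
        for entry in entries:
            merged.update(entry)
        res[name] = merged

print(res)  # {'hapmap': {'known': False, 'training': True, 'prior': 15.0}}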
@@ -117,31 +121,32 @@ rule gatk_variant_recalibrator: wrapper: "0.85.0/bio/gatk/variantrecalibrator" + rule gatk_apply_vqsr: input: ref=config["data"]["reference-genome"], refdict=genome_dict(), vcf="filtered/all.{vartype}.selected.vcf.gz", recal="filtered/all.{vartype}.vqsr-recal.vcf.gz", - tranches="filtered/all.{vartype}.vqsr-recal.tranches" + tranches="filtered/all.{vartype}.vqsr-recal.tranches", output: vcf=( "filtered/all.{vartype}.recalibrated.vcf.gz" if config["settings"]["keep-intermediate"]["filtering"] else temp("filtered/all.{vartype}.recalibrated.vcf.gz") ), - done=touch("filtered/all.{vartype}.recalibrated.done") + done=touch("filtered/all.{vartype}.recalibrated.done"), log: - "logs/gatk/applyvqsr/{vartype}.log" + "logs/gatk/applyvqsr/{vartype}.log", benchmark: "benchmarks/gatk/applyvqsr/{vartype}.bench.log" params: # set mode, must be either SNP, INDEL or BOTH mode="{vartype}", extra=get_apply_vqsr_extra, - java_opts=config["params"]["gatk-vqsr"]["java-applyvqsr"] - resources: - # mem_mb=50 + java_opts=config["params"]["gatk-vqsr"]["java-applyvqsr"], + # resources: + # mem_mb=50 conda: # We overwrite the original yaml, as this wrapper here (version 0.85.0) and the one above # for the variantrecalibrator (also 0.85.0) use different GATK versions originally... diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk index 93e86c1..8cf5f68 100644 --- a/workflow/rules/filtering.smk +++ b/workflow/rules/filtering.smk @@ -5,12 +5,12 @@ import platform # Variant Selection Helper # ================================================================================================= + rule select_calls: input: ref=config["data"]["reference-genome"], vcf="genotyped/all.vcf.gz", refdict=genome_dict(), - # bcftools does not automatically create vcf index files, so we need to specifically request them... # ... but the picard merge tool that we use right now does create tbi files, so all good atm. 
# tbi="genotyped/all.vcf.gz.tbi" if config["settings"]["calling-tool"] == "bcftools" else [] @@ -20,11 +20,11 @@ rule select_calls: if config["settings"]["keep-intermediate"]["filtering"] else temp("filtered/all.{vartype}.selected.vcf.gz") ), - done=touch("filtered/all.{vartype}.selected.done") + done=touch("filtered/all.{vartype}.selected.done"), params: - extra="--select-type-to-include {vartype}" + extra="--select-type-to-include {vartype}", log: - "logs/gatk/selectvariants/{vartype}.log" + "logs/gatk/selectvariants/{vartype}.log", benchmark: "benchmarks/gatk/selectvariants/{vartype}.bench.log" group: @@ -34,38 +34,50 @@ rule select_calls: wrapper: "0.27.1/bio/gatk/selectvariants" + # ================================================================================================= # Filtering # ================================================================================================= # Switch to the chosen filtering tool +filter_variants_good = False if config["settings"]["filter-variants"] == "gatk-variantfiltration": # Use `gatk-variantfiltration` + filter_variants_good = True + include: "filtering-gatk-variantfiltration.smk" elif config["settings"]["filter-variants"] == "gatk-vqsr": # Use `gatk-vqsr` + filter_variants_good = True + include: "filtering-gatk-vqsr.smk" elif config["settings"]["filter-variants"] == "bcftools-filter": # Use `bcftools-filter` + filter_variants_good = True + include: "filtering-bcftools-filter.smk" -elif config["settings"]["filter-variants"] == "none": + +if config["settings"]["filter-variants"] == "none": # Nothing to include - pass + filter_variants_good = True -else: +# Slightly awkward setup of the conditions here, due to a bug in snakefmt, +# see https://github.com/snakemake/snakefmt/issues/239#issuecomment-2223276169 +if not filter_variants_good: raise Exception("Unknown filter-variants: " + config["settings"]["filter-variants"]) # ================================================================================================= # Merge Filtered Variants # ================================================================================================= + rule merge_calls: input: # We use different naming for the VQSR intermediate files, @@ -74,23 +86,23 @@ rule merge_calls: vcf=expand( "filtered/all.{vartype}.{filtertype}.vcf.gz", vartype=["SNP", "INDEL"], - filtertype="recalibrated" - if config["settings"]["filter-variants"] == "gatk-vqsr" - else "filtered" - ) + filtertype=( + "recalibrated" + if config["settings"]["filter-variants"] == "gatk-vqsr" + else "filtered" + ), + ), output: vcf="filtered/all.vcf.gz", # vcf=protected("filtered/all.vcf.gz") - done=touch("filtered/all.done") + done=touch("filtered/all.done"), params: # See duplicates-picard.smk for the reason whe need this on MacOS. - extra = ( - " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" - if platform.system() == "Darwin" - else "" - ) + extra=( + " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" if platform.system() == "Darwin" else "" + ), log: - "logs/picard/merge-filtered.log" + "logs/picard/merge-filtered.log", benchmark: "benchmarks/picard/merge-filtered.bench.log" conda: diff --git a/workflow/rules/frequency.smk b/workflow/rules/frequency.smk index ac5112d..cca9738 100644 --- a/workflow/rules/frequency.smk +++ b/workflow/rules/frequency.smk @@ -8,6 +8,7 @@ import gzip # We use a rule for the setup in order to ensure that this is only called once, # even if there are multiple instances of the downstream roles. + def get_packages_dir(): # Hard coded paths, within our structure. 
If we ever want to change the paths where these # dependencies are stored, they need to be adjusted in scripts/hafpipe.py as well. @@ -15,25 +16,29 @@ def get_packages_dir(): # https://snakemake.readthedocs.io/en/stable/project_info/faq.html#how-does-snakemake-interpret-relative-paths # However, neither of them is working... So we use a different way of getting the path # to our Snakemake base directory: https://stackoverflow.com/a/73202976/4184258 - return os.path.join( snakemake.workflow.basedir, "../packages" ) + return os.path.join(snakemake.workflow.basedir, "../packages") + def get_hafpipe_bins(): - harp_bin = os.path.join( get_packages_dir(), "harp/bin/harp" ) - hafpipe_bin = os.path.join( get_packages_dir(), "hafpipe/HAFpipe_wrapper.sh" ) - return [ harp_bin, hafpipe_bin ] + harp_bin = os.path.join(get_packages_dir(), "harp/bin/harp") + hafpipe_bin = os.path.join(get_packages_dir(), "hafpipe/HAFpipe_wrapper.sh") + return [harp_bin, hafpipe_bin] + rule hafpipe_setup: output: - get_hafpipe_bins() + get_hafpipe_bins(), params: - packages_path = get_packages_dir() + packages_path=get_packages_dir(), log: - "logs/hafpipe/setup.log" + "logs/hafpipe/setup.log", shell: "{params.packages_path}/setup-hafpipe.sh >> {log} 2>&1" + localrules: - hafpipe_setup + hafpipe_setup, + # ================================================================================================= # Input File Check @@ -47,18 +52,20 @@ localrules: # We keep the functions around here for reference, as they might be useful for someone using # the original HAF-pipe, but we have deactivated their usage below for now. + # Open a file, potentially gzipped, inspired by https://stackoverflow.com/a/16816627 def gzip_opener(filename): - f = open(filename,'rb') - if( f.read(2) == b'\x1f\x8b' ): + f = open(filename, "rb") + if f.read(2) == b"\x1f\x8b": f.seek(0) return gzip.GzipFile(fileobj=f) else: f.seek(0) return f + # Check that the VCF contains PASS rather than . for the FILTER column. -def check_hafpipe_founder_vcf( vcf ): +def check_hafpipe_founder_vcf(vcf): with gzip_opener(vcf) as file: idx = -1 dot = 0 @@ -68,17 +75,17 @@ def check_hafpipe_founder_vcf( vcf ): line = str(line) # Skip first header lines - if line.startswith( "##" ): + if line.startswith("##"): continue # Only check first 1k lines. Hope that's enough. if cnt > 1000: break - spl = line.split( "\t" ) + spl = line.split("\t") # Look at the main header line and find the column of the FILTER - if line.startswith( "#" ): + if line.startswith("#"): if "FILTER" in spl: - idx = spl.index( "FILTER" ) + idx = spl.index("FILTER") else: raise Exception("Invalid HAF-pipe founder VCF file without FILTER column") continue @@ -101,6 +108,7 @@ def check_hafpipe_founder_vcf( vcf ): "See also https://github.com/petrov-lab/HAFpipe-line/issues/6" ) + # The below is deactivated, as we switched to our HAF-pipe fork, which fixes that problem already. # We run this check every time... Bit wasteful, and we could make it a rule that is only # exectuted once, but that would give the error message in a log file, instead of the main @@ -112,44 +120,50 @@ def check_hafpipe_founder_vcf( vcf ): # HAFpipe Task 1: Make SNP Table # ================================================================================================= + # We get the list of chromosomes from the config that the user wants HAFpipe to run for, # and cross check them with the actual ref genome fai file, to avoid irritation. 
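The gzip_opener helper above decides between plain and gzip reading by peeking at the two-byte gzip magic number (0x1f 0x8b) and rewinding the stream. A self-contained round-trip of that idea, using a throw-away file name and a single VCF header line, is sketched below.

import gzip

def gzip_opener(filename):
    # Open transparently: peek at the first two bytes, then rewind.
    f = open(filename, "rb")
    if f.read(2) == b"\x1f\x8b":
        f.seek(0)
        return gzip.GzipFile(fileobj=f)
    f.seek(0)
    return f

# Round-trip check with a throw-away compressed file.
with gzip.open("example.vcf.gz", "wb") as out:
    out.write(b"##fileformat=VCFv4.2\n")
with gzip_opener("example.vcf.gz") as fh:
    print(fh.readline())  # b'##fileformat=VCFv4.2\n'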
# We use wildcard {chrom} here on purpose, instead of {contig} that we use for the rest of grenepipe, # in order to (a) make it clear that these need to be actual sequences from the reference genome, # and not our contig groups for example, and to (b) avoid accidents when matching the wildcards. -def get_hafpipe_chromosomes( fai ): - ref_chrs = get_chromosomes( fai ) - haf_chrs = list( config["params"]["hafpipe"]["chromosomes"] ) +def get_hafpipe_chromosomes(fai): + ref_chrs = get_chromosomes(fai) + haf_chrs = list(config["params"]["hafpipe"]["chromosomes"]) if len(haf_chrs) == 0: haf_chrs = ref_chrs - haf_chrs = [ str(v) for v in haf_chrs ] + haf_chrs = [str(v) for v in haf_chrs] for chr in haf_chrs: if not chr in ref_chrs: raise Exception( - "Chromosome '" + chr + "' specified via the config `params: hafpipe: chromosomes` " + - "list for running HAFpipe is not part of the reference genome." + "Chromosome '" + + chr + + "' specified via the config `params: hafpipe: chromosomes` " + + "list for running HAFpipe is not part of the reference genome." ) return haf_chrs + # Same function as above, but wrapped to be used in a rule... Snakemake can be complicated... def get_hafpipe_chromosomes_list(wildcards): fai = checkpoints.samtools_faidx.get().output[0] - return get_hafpipe_chromosomes( fai ) + return get_hafpipe_chromosomes(fai) + # We allow users to specify a directory for the snp table, to avoid recomputation of Task 1. def get_hafpipe_snp_table_dir(): cfg_dir = config["params"]["hafpipe"]["snp-table-dir"] if cfg_dir: - return cfg_dir.rstrip('/') + return cfg_dir.rstrip("/") return "hafpipe/snp-tables" + rule hafpipe_snp_table: input: vcf=config["params"]["hafpipe"]["founder-vcf"], - bins=get_hafpipe_bins() + bins=get_hafpipe_bins(), output: - snptable = get_hafpipe_snp_table_dir() + "/{chrom}.csv", - alleleCts = get_hafpipe_snp_table_dir() + "/{chrom}.csv.alleleCts", + snptable=get_hafpipe_snp_table_dir() + "/{chrom}.csv", + alleleCts=get_hafpipe_snp_table_dir() + "/{chrom}.csv.alleleCts", # We request both the numeric table as produced by the `numeric_SNPtable.R` script, # as well as its compressed binary form. The R script needs an _insane_ amount of memory # for larger founder VCF files, but might fail silently when going out of memory. @@ -158,41 +172,43 @@ rule hafpipe_snp_table: # So here, by requiring both files, we at least get notified if the R script failed. # We further raise an exception in our script when we detect this issue, # in order to better inform the user about the situation and how to fix this. - numeric = get_hafpipe_snp_table_dir() + "/{chrom}.csv.numeric", - numericbgz = get_hafpipe_snp_table_dir() + "/{chrom}.csv.numeric.bgz", - done=touch( get_hafpipe_snp_table_dir() + "/{chrom}.done" ) + numeric=get_hafpipe_snp_table_dir() + "/{chrom}.csv.numeric", + numericbgz=get_hafpipe_snp_table_dir() + "/{chrom}.csv.numeric.bgz", + done=touch(get_hafpipe_snp_table_dir() + "/{chrom}.done"), params: tasks="1", chrom="{chrom}", - extra=config["params"]["hafpipe"]["snp-table-extra"] + extra=config["params"]["hafpipe"]["snp-table-extra"], # We give this rule a bit higher priority, as it might run for a while. # On clusters, after this rule is started, remaining capacity can then be filled # with per-sample jobs, which usually run faster. priority: 5 log: - "logs/hafpipe/snp-table/{chrom}.log" + "logs/hafpipe/snp-table/{chrom}.log", conda: "../envs/hafpipe.yaml" script: "../scripts/hafpipe.py" + # Get the list of snp table files. 
def get_all_hafpipe_raw_snp_tables(wildcards): # We use the fai file of the ref genome to cross-check the list of chromosomes in the config. # We use a checkpoint to create the fai file from our ref genome, which gives us the chrom names. # Snakemake then needs an input function to work with the fai checkpoint here. fai = checkpoints.samtools_faidx.get().output[0] - return expand( - get_hafpipe_snp_table_dir() + "/{chrom}.csv", - chrom=get_hafpipe_chromosomes( fai ) - ) + return expand(get_hafpipe_snp_table_dir() + "/{chrom}.csv", chrom=get_hafpipe_chromosomes(fai)) + # Rule that requests all HAFpipe SNP table files, so that users can impute them themselves. rule all_hafpipe_snp_tables: input: - get_all_hafpipe_raw_snp_tables + get_all_hafpipe_raw_snp_tables, + + +localrules: + all_hafpipe_snp_tables, -localrules: all_hafpipe_snp_tables # ================================================================================================= # HAFpipe Task 2: Impute SNP Table @@ -213,35 +229,35 @@ if impmethod in ["simpute", "npute"]: # Deactivated this warning now, as we switchted to our own HAF-pipe fork, which fixes this. # if impmethod == "npute": - # No comment... - # logger.warning( - # "Using HAF-pipe with SNP table imputation method 'npute' is likely going to fail: " - # "We are using Python >= 3.7 in grenepipe, whereas npute requires Pyhon 2.*, " - # "and there is unfortunately no easy way to fix this. If you require npute, " - # "and get error messages here, please submit an issue to " - # "https://github.com/moiexpositoalonsolab/grenepipe/issues, " - # "so that we know about this and can try to find a solution.\n" - # "Alternatively, you can use the custom impmethod capability of grenepipe " - # "to run npute yourself, by providing your own script that runs it.\n" - # ) + # No comment... + # logger.warning( + # "Using HAF-pipe with SNP table imputation method 'npute' is likely going to fail: " + # "We are using Python >= 3.7 in grenepipe, whereas npute requires Pyhon 2.*, " + # "and there is unfortunately no easy way to fix this. If you require npute, " + # "and get error messages here, please submit an issue to " + # "https://github.com/moiexpositoalonsolab/grenepipe/issues, " + # "so that we know about this and can try to find a solution.\n" + # "Alternatively, you can use the custom impmethod capability of grenepipe " + # "to run npute yourself, by providing your own script that runs it.\n" + # ) # Call the HAFpipe script with one of the two existing methods. rule hafpipe_impute_snp_table: input: snptable=get_hafpipe_snp_table_dir() + "/{chrom}.csv", - bins=get_hafpipe_bins() + bins=get_hafpipe_bins(), output: # Unnamed output, as this is implicit in HAFpipe Task 2 get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod, - touch(get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".done") + touch(get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".done"), params: tasks="2", impmethod=impmethod, - extra=config["params"]["hafpipe"]["impute-extra"] + extra=config["params"]["hafpipe"]["impute-extra"], # Same logic as above, let's accelarate the workflow. priority: 5 log: - "logs/hafpipe/impute-" + impmethod + "/{chrom}.log" + "logs/hafpipe/impute-" + impmethod + "/{chrom}.log", conda: "../envs/hafpipe.yaml" script: @@ -250,9 +266,8 @@ if impmethod in ["simpute", "npute"]: elif impmethod != "": # Validity check of the custom script. 
- if ( - not os.path.exists( config["params"]["hafpipe"]["impute-script"] ) or - not os.access(config["params"]["hafpipe"]["impute-script"], os.X_OK) + if not os.path.exists(config["params"]["hafpipe"]["impute-script"]) or not os.access( + config["params"]["hafpipe"]["impute-script"], os.X_OK ): raise Exception( "User provided impute-script for HAFpipe does not exist or is not executable" @@ -261,15 +276,15 @@ elif impmethod != "": # Call the user provided script. No need for any of the HAFpipe parameters here. rule hafpipe_impute_snp_table: input: - snptable=get_hafpipe_snp_table_dir() + "/{chrom}.csv" + snptable=get_hafpipe_snp_table_dir() + "/{chrom}.csv", output: # Unnamed output, as this is implicit in the user script get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod, - touch(get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".done") + touch(get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".done"), # Same logic as above, let's accelarate the workflow. priority: 5 log: - "logs/hafpipe/impute-" + impmethod + "/{chrom}.log" + "logs/hafpipe/impute-" + impmethod + "/{chrom}.log", conda: # We use the custom conda env if the user provided it, # or just re-use the hafpipe env, for simplicity. @@ -281,6 +296,7 @@ elif impmethod != "": shell: config["params"]["hafpipe"]["impute-script"] + " {input.snptable}" + # There is an issue with the imputed SNP table being indexed multiple times in parallel environments # such as clusters, see https://github.com/petrov-lab/HAFpipe-line/issues/5 # We here circumvent this when using imputation, by running the indexing ourselves... @@ -299,16 +315,16 @@ if impmethod == "": # just create our indicator trigger file. rule hafpipe_snp_table_indices: input: - snptable = get_hafpipe_snp_table_dir() + "/{chrom}.csv", - alleleCts = get_hafpipe_snp_table_dir() + "/{chrom}.csv.alleleCts", - numeric = get_hafpipe_snp_table_dir() + "/{chrom}.csv.numeric.bgz" + snptable=get_hafpipe_snp_table_dir() + "/{chrom}.csv", + alleleCts=get_hafpipe_snp_table_dir() + "/{chrom}.csv.alleleCts", + numeric=get_hafpipe_snp_table_dir() + "/{chrom}.csv.numeric.bgz", output: - flag = get_hafpipe_snp_table_dir() + "/{chrom}.csv.flag" + flag=get_hafpipe_snp_table_dir() + "/{chrom}.csv.flag", shell: "touch {output.flag}" localrules: - hafpipe_snp_table_indices + hafpipe_snp_table_indices, else: @@ -316,14 +332,14 @@ else: # and mark them as output, to have snakemake validate that they were in fact created. rule hafpipe_snp_table_indices: input: - snptable = get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod, - bins = get_hafpipe_bins() # require that HAF-pipe scripts are there + snptable=get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod, + bins=get_hafpipe_bins(), # require that HAF-pipe scripts are there output: - alleleCts = get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".alleleCts", - numeric = get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".numeric.bgz", - flag = get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".flag" + alleleCts=get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".alleleCts", + numeric=get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".numeric.bgz", + flag=get_hafpipe_snp_table_dir() + "/{chrom}.csv." + impmethod + ".flag", params: - hp_scripts = get_packages_dir() + "/hafpipe/scripts" + hp_scripts=get_packages_dir() + "/hafpipe/scripts", # The below code tries to switch where the scripts are found for the old and our # new improved version of HAF-pipe... 
it however does not work if HAF-pipe has not # been downloaded yet by the hafpipe_setup rule. We have fixed this now to always @@ -336,7 +352,7 @@ else: # Same logic as above, let's accelarate the workflow. priority: 5 log: - "logs/hafpipe/impute-" + impmethod + "/{chrom}-indices.log" + "logs/hafpipe/impute-" + impmethod + "/{chrom}-indices.log", conda: "../envs/hafpipe.yaml" shell: @@ -346,15 +362,16 @@ else: # if the other two files already exist (because snakemake has a check for this, to # avoid that files just "appear" without us intending to have created them). "if [ ! -e {input.snptable}{impmethod}.alleleCts ]; then " - " echo \"counting alleles in {input.snptable}\" >> {log} 2>&1 ; " + ' echo "counting alleles in {input.snptable}" >> {log} 2>&1 ; ' " {params.hp_scripts}/count_SNPtable.sh {input.snptable} >> {log} 2>&1 ; " "fi ; " "if [ ! -e {input.snptable}{impmethod}.numeric.bgz ]; then " - " echo \"preparing {input.snptable} for allele frequency calculation\" >> {log} 2>&1 ; " + ' echo "preparing {input.snptable} for allele frequency calculation" >> {log} 2>&1 ; ' " {params.hp_scripts}/prepare_SNPtable_for_HAFcalc.sh {input.snptable} >> {log} 2>&1 ; " "fi ; " "touch {output.flag}" + # Helper to get the SNP table for a given chromosome. According to the `impmethod` config setting, # this is either the raw table from Task 1 above, or the imputed table from Task 2, with either one # of the established methods of HAFpipe, or a custom method/script provided by the user. @@ -365,27 +382,30 @@ def get_hafpipe_snp_table(wildcards): else: return base + "." + config["params"]["hafpipe"]["impmethod"] + # We need another helper to process wild cards, requesting the dummy `flag` indicator file # that ensures that the snp table index files (alleleCt and numeric) are created above. def get_hafpipe_snp_table_flag(wildcards): return get_hafpipe_snp_table(wildcards) + ".flag" + # ================================================================================================= # HAFpipe Tasks 3 & 4: Infer haplotype frequencies & Calculate allele frequencies # ================================================================================================= + # We use the merged bam files of all bams per sample. # Unfortunately, HAF-pipe names its output files after its input files, # so here we create symlinks to the merged bam files, to keep the naming consistent... rule hafpipe_sample_bams: input: - bam=get_sample_bams_wildcards, # provided in mapping.smk - bai=get_sample_bais_wildcards # provided in mapping.smk + bam=get_sample_bams_wildcards, # provided in mapping.smk + bai=get_sample_bais_wildcards, # provided in mapping.smk output: bam="hafpipe/bam/{sample}.bam", - bai="hafpipe/bam/{sample}.bam.bai" + bai="hafpipe/bam/{sample}.bam.bai", log: - "logs/samtools/hafpipe/bam-{sample}.log" + "logs/samtools/hafpipe/bam-{sample}.log", shell: # This is a bit... hacky... but dealing with absolute paths seems even worse # (and would need hacks as well to get to work on Linux and MacOS with the same commands). @@ -394,25 +414,31 @@ rule hafpipe_sample_bams: "ln -s ../../{input.bam} {output.bam} ; " "ln -s ../../{input.bai} {output.bai} ; " + # Here, we have a special case where we need to fix the rule order, instead of relying # on file names to distinguish between rules. That is because on the one hand, we want to keep the # generic bai rule, so that we do not have to write one for every case where a bai file is needed. 
# On the other hand, this rule here also "produces" bai files, but by linking them, so it does # something different, meaning that we need both rules, and hence need to resolve like this: ruleorder: hafpipe_sample_bams > bam_index -localrules: hafpipe_sample_bams + + +localrules: + hafpipe_sample_bams, + # We run the two steps separately, so that if Task 4 fails, # Task 3 does not have to be run again, hence saving time. + rule hafpipe_haplotype_frequencies: input: - bamfile="hafpipe/bam/{sample}.bam", # provided above - baifile="hafpipe/bam/{sample}.bam.bai", # provided via bam_index rule in mapping.smk - snptable=get_hafpipe_snp_table, # provided above + bamfile="hafpipe/bam/{sample}.bam", # provided above + baifile="hafpipe/bam/{sample}.bam.bai", # provided via bam_index rule in mapping.smk + snptable=get_hafpipe_snp_table, # provided above flag=get_hafpipe_snp_table_flag, refseq=config["data"]["reference-genome"], - bins=get_hafpipe_bins() + bins=get_hafpipe_bins(), output: # We currently just specify the output file names here as HAFpipe produces them. # Might want to refactor in the future to be able to provide our own names, @@ -422,26 +448,27 @@ rule hafpipe_haplotype_frequencies: if config["params"]["hafpipe"].get("keep-intermediates", True) else temp("hafpipe/frequencies/{sample}.bam.{chrom}.freqs") ), - done=touch("hafpipe/frequencies/{sample}.bam.{chrom}.freqs.done") + done=touch("hafpipe/frequencies/{sample}.bam.{chrom}.freqs.done"), params: tasks="3", outdir="hafpipe/frequencies", - extra=config["params"]["hafpipe"]["haplotype-frequencies-extra"] + extra=config["params"]["hafpipe"]["haplotype-frequencies-extra"], log: - "logs/hafpipe/frequencies/haplotype-{sample}.{chrom}.log" + "logs/hafpipe/frequencies/haplotype-{sample}.{chrom}.log", conda: "../envs/hafpipe.yaml" script: "../scripts/hafpipe.py" + rule hafpipe_allele_frequencies: input: - bamfile="hafpipe/bam/{sample}.bam", # provided above - baifile="hafpipe/bam/{sample}.bam.bai", # provided via bam_index rule in mapping.smk - snptable=get_hafpipe_snp_table, # provided above + bamfile="hafpipe/bam/{sample}.bam", # provided above + baifile="hafpipe/bam/{sample}.bam.bai", # provided via bam_index rule in mapping.smk + snptable=get_hafpipe_snp_table, # provided above flag=get_hafpipe_snp_table_flag, - freqs="hafpipe/frequencies/{sample}.bam.{chrom}.freqs", # from Task 3 above - bins=get_hafpipe_bins() + freqs="hafpipe/frequencies/{sample}.bam.{chrom}.freqs", # from Task 3 above + bins=get_hafpipe_bins(), output: # Same as above: just expect the file name as produced by HAFpipe. afSite=( @@ -449,22 +476,24 @@ rule hafpipe_allele_frequencies: if config["params"]["hafpipe"].get("keep-intermediates", True) else temp("hafpipe/frequencies/{sample}.bam.{chrom}.afSite") ), - done=touch("hafpipe/frequencies/{sample}.bam.{chrom}.afSite.done") + done=touch("hafpipe/frequencies/{sample}.bam.{chrom}.afSite.done"), params: tasks="4", outdir="hafpipe/frequencies", - extra=config["params"]["hafpipe"]["allele-frequencies-extra"] + extra=config["params"]["hafpipe"]["allele-frequencies-extra"], log: - "logs/hafpipe/frequencies/allele-{sample}.{chrom}.log" + "logs/hafpipe/frequencies/allele-{sample}.{chrom}.log", conda: "../envs/hafpipe.yaml" script: "../scripts/hafpipe.py" + # ================================================================================================= # HAFpipe Concat Sample # ================================================================================================= + # Get the afSite file list for a given sample. 
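The relative ln -s trick used for the HAF-pipe bam links above hardcodes the ../../ prefix because the links live two directories deep. A hypothetical Python alternative, which is not what the rule uses, would compute the relative target with os.path.relpath and thereby avoid the hardcoded prefix; the example paths in the trailing comments are made up.

import os

def link_relative(src, dst):
    # Create dst as a symlink to src, expressed relative to dst's directory.
    if os.path.dirname(dst):
        os.makedirs(os.path.dirname(dst), exist_ok=True)
    target = os.path.relpath(src, start=os.path.dirname(dst))
    if os.path.lexists(dst):
        os.remove(dst)
    os.symlink(target, dst)

# Example with hypothetical paths mirroring the rule above:
# link_relative("dedup/S1.bam", "hafpipe/bam/S1.bam")       # -> ../../dedup/S1.bam
# link_relative("dedup/S1.bam.bai", "hafpipe/bam/S1.bam.bai")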
As this is the result of task 4, # task 3 will also be executed, but we keep it simple here and only request the final files. def collect_sample_hafpipe_allele_frequencies(wildcards): @@ -475,50 +504,53 @@ def collect_sample_hafpipe_allele_frequencies(wildcards): return expand( "hafpipe/frequencies/{sample}.bam.{chrom}.afSite", sample=wildcards.sample, - chrom=get_hafpipe_chromosomes( fai ) + chrom=get_hafpipe_chromosomes(fai), ) + # Concat all afSite files for a given sample. # The script assumes the exact naming scheme that we use above, so it is not terribly portable... rule hafpipe_concat_sample_allele_frequencies: input: - collect_sample_hafpipe_allele_frequencies + collect_sample_hafpipe_allele_frequencies, output: # This is the file name produced by the script. For now we do not allow to change this. - table="hafpipe/samples/{sample}.csv" + ( - ".gz" if config["params"]["hafpipe"].get("compress-sample-tables", False) else "" - ), - done=touch("hafpipe/samples/{sample}.done") + table="hafpipe/samples/{sample}.csv" + + (".gz" if config["params"]["hafpipe"].get("compress-sample-tables", False) else ""), + done=touch("hafpipe/samples/{sample}.done"), params: # The rule needs access to the list of chromosomes, and to the sample. sample="{sample}", chroms=get_hafpipe_chromosomes_list, - # We might want to compress the output per sample. - compress=config["params"]["hafpipe"].get("compress-sample-tables", False) + compress=config["params"]["hafpipe"].get("compress-sample-tables", False), log: - "logs/hafpipe/samples/{sample}.log" + "logs/hafpipe/samples/{sample}.log", script: "../scripts/hafpipe-concat.py" + # Simply request all the above sample files. rule hafpipe_collect_concat_samples: input: tables=expand( - "hafpipe/samples/{sample}.csv" + ( - ".gz" if config["params"]["hafpipe"].get("compress-sample-tables", False) else "" - ), - sample=config["global"]["sample-names"] - ) + "hafpipe/samples/{sample}.csv" + + (".gz" if config["params"]["hafpipe"].get("compress-sample-tables", False) else ""), + sample=config["global"]["sample-names"], + ), output: - done=touch("hafpipe/samples.done") + done=touch("hafpipe/samples.done"), + + +localrules: + hafpipe_collect_concat_samples, -localrules: hafpipe_collect_concat_samples # ================================================================================================= # HAFpipe Merge All # ================================================================================================= + # Get the afSite file list. As this is the result of task 4, task 3 will also be executed, # but we keep it simple here and only request the final files. def collect_all_hafpipe_allele_frequencies(wildcards): @@ -529,9 +561,10 @@ def collect_all_hafpipe_allele_frequencies(wildcards): return expand( "hafpipe/frequencies/{sample}.bam.{chrom}.afSite", sample=config["global"]["sample-names"], - chrom=get_hafpipe_chromosomes( fai ) + chrom=get_hafpipe_chromosomes(fai), ) + # Merge all afSite files produced above, for all samples and all chromsomes. # The script assumes the exact naming scheme that we use above, so it is not terribly portable... rule hafpipe_merge_allele_frequencies: @@ -543,51 +576,52 @@ rule hafpipe_merge_allele_frequencies: # This is unfortunately necessary, as HAFpipe afSite files do not contain any information # on their origins (samples and chromosomes) other than their file names, so we have to # work with that... See the script for details. 
- collect_all_hafpipe_allele_frequencies + collect_all_hafpipe_allele_frequencies, output: # This is the file name produced by the script. For now we do not allow to change this. - table="hafpipe/all.csv" + ( - ".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else "" - ), - done=touch("hafpipe/all.done") + table="hafpipe/all.csv" + + (".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else ""), + done=touch("hafpipe/all.done"), params: # We are potentially dealing with tons of files, and cannot open all of them at the same # time, due to OS limitations, check `ulimit -n` for example. When this param is set to 0, # we try to use that upper limit (minus some tolerance). However, if that fails, this value # can be manually set to a value below the limit, to make it work, e.g., 500 concurrent_files=0, - # The rule needs access to lists of samples and chromosomes, # and we give it the base path of the afSite files as well, for a little bit of flexibility. samples=config["global"]["sample-names"], chroms=get_hafpipe_chromosomes_list, - # We provide the paths to the input directory here. The output will be written to the # parent directory of that (so, to "hafpipe"). # Ugly, but we are dealing with HAFpipe uglines here, and that seems to be easiest for now. base_path="hafpipe/frequencies", - # We might want to compress the final output. - compress=config["params"]["hafpipe"].get("compress-merged-table", False) + compress=config["params"]["hafpipe"].get("compress-merged-table", False), log: - "logs/hafpipe/merge-all.log" + "logs/hafpipe/merge-all.log", script: "../scripts/hafpipe-merge.py" + # ================================================================================================= # HAFpipe All # ================================================================================================= + # Simply request all the above afSite files. # We always request to run this, so that HAF-pipe is run even when the two make table config # options are not set. This is so that users who just want HAF-pipe out as-is are able to get that. rule hafpipe_collect_allele_frequencies: input: - collect_all_hafpipe_allele_frequencies + collect_all_hafpipe_allele_frequencies, output: - done=touch("hafpipe/afSite.done") + done=touch("hafpipe/afSite.done"), + + +localrules: + hafpipe_collect_allele_frequencies, -localrules: hafpipe_collect_allele_frequencies # Simple rule that requests all hafpipe af files, so that they get computed. 
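The concurrent_files=0 default described above stands for "probe the OS limit on open files and stay somewhat below it". Independent of the actual scripts/hafpipe-merge.py implementation, the idea can be sketched on POSIX systems as follows; the chunking loop in the trailing comment and its names are hypothetical.

import resource  # POSIX-only module for querying process limits

def max_concurrent_files(configured=0, tolerance=64):
    # configured > 0: trust the user-provided value from the config.
    if configured > 0:
        return configured
    # Otherwise query the soft limit on open file descriptors (the `ulimit -n` value)
    # and keep a safety margin for stdin/stdout, logs, and libraries.
    soft, _hard = resource.getrlimit(resource.RLIMIT_NOFILE)
    return max(1, soft - tolerance)

# Example: process a long list of afSite files in chunks of at most this size.
# files = [...]; step = max_concurrent_files()
# for i in range(0, len(files), step):
#     process_chunk(files[i : i + step])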
# This requests the completely merged table, and/or the per-sample concatenated tables, @@ -595,9 +629,14 @@ localrules: hafpipe_collect_allele_frequencies rule all_hafpipe: input: "hafpipe/afSite.done", - "hafpipe/all.csv" + ( - ".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else "" - ) if config["params"]["hafpipe"].get("make-merged-table", False) else [], - "hafpipe/samples.done" if config["params"]["hafpipe"].get("make-sample-tables", False) else [] + "hafpipe/all.csv" + + (".gz" if config["params"]["hafpipe"].get("compress-merged-table", False) else "") + if config["params"]["hafpipe"].get("make-merged-table", False) + else [], + "hafpipe/samples.done" + if config["params"]["hafpipe"].get("make-sample-tables", False) + else [], + -localrules: all_hafpipe +localrules: + all_hafpipe, diff --git a/workflow/rules/init.smk b/workflow/rules/init.smk index f5e125a..4779a58 100644 --- a/workflow/rules/init.smk +++ b/workflow/rules/init.smk @@ -18,15 +18,18 @@ basedir = workflow.basedir # We want to report the grenepipe version for the user, for reproducibility. # The following line is automatically replaced by the deploy scripts. Do not change manually. -grenepipe_version = "0.12.2" #GRENEPIPE_VERSION# +grenepipe_version = "0.12.2" # GRENEPIPE_VERSION# + # Add a description of the workflow to the final report report: os.path.join(workflow.basedir, "reports/workflow.rst") + # Load the config. If --directory was provided, this is also loaded from there. # This is useful to have runs that have different settings, but generally re-use the main setup. configfile: "config.yaml" + # Legacy fixes to avoid disrupting user workflows that already have a config file from a previous # version of grenepipe. We change the config keys here to our new scheme. # We renamed "samples" to "samples-table": @@ -43,9 +46,9 @@ snakemake.utils.validate(config, schema="../schemas/config.schema.yaml") # We need to do this after the scheme validation, as we are introducing changes here # that are not according to the scheme. if "known-variants" not in config["data"] or not config["data"]["known-variants"]: - config["data"]["known-variants"]=[] + config["data"]["known-variants"] = [] if "restrict-regions" not in config["settings"] or not config["settings"]["restrict-regions"]: - config["settings"]["restrict-regions"]=[] + config["settings"]["restrict-regions"] = [] # We need to clean up the file name for the reference genome. # The prep rule decompress_genome provides the unzipped genome as needed. @@ -59,11 +62,13 @@ if config["data"]["reference-genome"].endswith(".gz"): # So here, we check this, in order to provide some better error messages for users, # instead of having them run into cryptic log messages "File is not a supported reference file type". # Add `".fas", ".fas.gz"` later if we upgrade GATK. -fastaexts = ( ".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna" ) -if not config["data"]["reference-genome"].endswith( fastaexts ): +fastaexts = (".fasta", ".fasta.gz", ".fa", ".fa.gz", ".fna") +if not config["data"]["reference-genome"].endswith(fastaexts): raise Exception( - "Reference genome file path does not end in " + str(fastaexts) + ", which unfortunately " + - "is needed by GATK. Please rename the file and change the path in the config.yaml" + "Reference genome file path does not end in " + + str(fastaexts) + + ", which unfortunately " + + "is needed by GATK. 
Please rename the file and change the path in the config.yaml" ) # ================================================================================================= @@ -82,21 +87,23 @@ else: # Read samples and units table, and enforce to use strings in the index config["global"]["samples"] = pd.read_csv( - config["data"]["samples-table"], sep='\t', dtype=str).set_index(["sample", "unit"], drop=False -) + config["data"]["samples-table"], sep="\t", dtype=str +).set_index(["sample", "unit"], drop=False) config["global"]["samples"].index = config["global"]["samples"].index.set_levels( [i.astype(str) for i in config["global"]["samples"].index.levels] ) -snakemake.utils.validate( config["global"]["samples"], schema="../schemas/samples.schema.yaml" ) +snakemake.utils.validate(config["global"]["samples"], schema="../schemas/samples.schema.yaml") + # Helper function to get a list of all units of a given sample name. -def get_sample_units( sample ): +def get_sample_units(sample): res = list() for unit in config["global"]["samples"].loc[sample].unit: if unit not in res: res.append(unit) return res + # Get a list of all samples names, in the same order as the input sample table. # Samples with multiple units appear only once, at the first position in the table. # We cannot use a simple approach here, as this messes up the sample @@ -109,14 +116,14 @@ for index, row in config["global"]["samples"].iterrows(): config["global"]["sample-names"].append(s) # Unordered list of all unit names that appear in all the samples. -config["global"]["unit-names"] = list(set( - config["global"]["samples"].index.get_level_values("unit") -)) +config["global"]["unit-names"] = list(set(config["global"]["samples"].index.get_level_values("unit"))) + # Wildcard constraints: only allow sample names from the spreadsheet to be used wildcard_constraints: sample="|".join(config["global"]["sample-names"]), - unit="|".join(config["global"]["unit-names"]) + unit="|".join(config["global"]["unit-names"]), + # ================================================================================================= # Pipeline User Output @@ -126,33 +133,34 @@ wildcard_constraints: indent = 24 # Get a nicely formatted username and hostname -username = pwd.getpwuid( os.getuid() )[ 0 ] +username = pwd.getpwuid(os.getuid())[0] hostname = socket.gethostname() hostname = hostname + ("; " + platform.node() if platform.node() != socket.gethostname() else "") # Get some info on the platform and OS -pltfrm = platform.platform() + "\n" + ( " " * indent ) + platform.version() +pltfrm = platform.platform() + "\n" + (" " * indent) + platform.version() try: # Not available in all versions, so we need to catch this ld = platform.linux_distribution() if len(ld): - pltfrm += "\n" + ( " " * indent ) + ld + pltfrm += "\n" + (" " * indent) + ld del ld except: pass try: # Mac OS version comes back as a nested tuple?! # Need to merge the tuples... - def merge_tuple(x, bases = (tuple, list)): + def merge_tuple(x, bases=(tuple, list)): for e in x: if type(e) in bases: for e in merge_tuple(e, bases): yield e else: yield e - mv = " ".join( merge_tuple( platform.mac_ver() )) + + mv = " ".join(merge_tuple(platform.mac_ver())) if not mv.isspace(): - pltfrm += "\n" + ( " " * indent ) + mv + pltfrm += "\n" + (" " * indent) + mv del mv, merge_tuple except: pass @@ -160,10 +168,10 @@ except: # Get the git commit hash of grenepipe, if available. 
try: process = subprocess.Popen( - ['git', 'rev-parse', '--short', 'HEAD'], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ["git", "rev-parse", "--short", "HEAD"], stdout=subprocess.PIPE, stderr=subprocess.PIPE ) out, err = process.communicate() - out = out.decode('ascii') + out = out.decode("ascii") grenepipe_git_hash = out.strip() if grenepipe_git_hash: grenepipe_version += "-" + grenepipe_git_hash @@ -173,10 +181,10 @@ except: # Get the conda version, if available. try: - process = subprocess.Popen(['conda', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + process = subprocess.Popen(["conda", "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = process.communicate() - out = out.decode('ascii') - conda_ver = out[out.startswith("conda") and len("conda"):].strip() + out = out.decode("ascii") + conda_ver = out[out.startswith("conda") and len("conda") :].strip() del process, out, err if not conda_ver: conda_ver = "n/a" @@ -187,9 +195,9 @@ except: # Who knows what that means. I'm sick of conda. Just reporting the version here, # and have someone else deal with it. try: - process = subprocess.Popen(['mamba', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + process = subprocess.Popen(["mamba", "--version"], stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = process.communicate() - out = out.decode('ascii') + out = out.decode("ascii") mamba_ver = re.findall("mamba *(.*) *", out)[0] conda_ver_mamba = re.findall("conda *(.*) *", out)[0] del process, out, err @@ -204,26 +212,26 @@ if conda_ver_mamba and conda_ver_mamba != conda_ver: # Get the conda env name, if available. # See https://stackoverflow.com/a/42660674/4184258 -conda_env = os.environ['CONDA_DEFAULT_ENV'] + " (" + os.environ["CONDA_PREFIX"] + ")" +conda_env = os.environ["CONDA_DEFAULT_ENV"] + " (" + os.environ["CONDA_PREFIX"] + ")" if conda_env == " ()": conda_env = "n/a" # Get nicely wrapped command line cmdline = sys.argv[0] -for i in range( 1, len(sys.argv)): +for i in range(1, len(sys.argv)): if sys.argv[i].startswith("--"): - cmdline += "\n" + ( " " * indent ) + sys.argv[i] + cmdline += "\n" + (" " * indent) + sys.argv[i] else: cmdline += " " + sys.argv[i] # Get abs paths of all config files cfgfiles = [] for cfg in workflow.configfiles: - cfgfiles.append( os.path.abspath(cfg) ) + cfgfiles.append(os.path.abspath(cfg)) cfgfiles = "\n ".join(cfgfiles) # Get a nice output of the number of samples and units -unitcnt=len(config["global"]["samples"].index.get_level_values("unit")) +unitcnt = len(config["global"]["samples"].index.get_level_values("unit")) if unitcnt == len(config["global"]["sample-names"]): smpcnt = str(len(config["global"]["sample-names"])) else: @@ -244,7 +252,7 @@ logger.info(" Host: " + hostname) logger.info(" User: " + username) logger.info(" Conda: " + str(conda_ver)) logger.info(" Mamba: " + str(mamba_ver)) -logger.info(" Python: " + str(sys.version.split(' ')[0])) +logger.info(" Python: " + str(sys.version.split(" ")[0])) logger.info(" Snakemake: " + str(snakemake.__version__)) logger.info(" Grenepipe: " + str(grenepipe_version)) logger.info(" Conda env: " + str(conda_env)) @@ -273,10 +281,11 @@ del unitcnt, smpcnt if config["data"].get("samples-count", 0) > 0: if config["data"]["samples-count"] != len(config["global"]["sample-names"]): raise Exception( - "Inconsistent number of samples found in the sample table (" + - str(len(config["global"]["sample-names"])) + - ") and the samples count consistency check provided in the config file (" + - 
str(config["data"]["samples-count"]) + ")." + "Inconsistent number of samples found in the sample table (" + + str(len(config["global"]["sample-names"])) + + ") and the samples count consistency check provided in the config file (" + + str(config["data"]["samples-count"]) + + ")." ) # Check the three file paths provided in the config file directly. @@ -296,21 +305,24 @@ if config["data"]["known-variants"] and not os.path.isabs(config["data"]["known- "We recommend using absolute paths for all files.\n" ) + # Helper to check that a string contains no invalid chars for file names. # This is just the file, not its path! Slashes are considered invalid by this function. def valid_filename(fn): # Only accept alnum, underscore, dash, and dot. - return fn.replace('_', '').replace('-', '').replace('.', '').isalnum() and fn.isascii() + return fn.replace("_", "").replace("-", "").replace(".", "").isalnum() and fn.isascii() # Bit more loose: allow all except Windows forbidden chars. # return not( True in [c in fn for c in "<>:\"/\\|?*"]) + # Helper to check if a file path contains weird characters. # We just want to warn about this for input fastq files, but still try to continue. def valid_filepath(fn): # Only accept alnum, underscore, dash, dot, and slashes. - clean = fn.replace('_', '').replace('-', '').replace('.', '').replace('/', '').replace('\\', '') + clean = fn.replace("_", "").replace("-", "").replace(".", "").replace("/", "").replace("\\", "") return clean.isalnum() and clean.isascii() + # List that contains tuples for all samples with their units. # In other words, a list of tuples of the sample and unit column of the sample table, # in the same order. @@ -318,48 +330,53 @@ config["global"]["sample-units"] = list() problematic_filenames = 0 relative_filenames = 0 for index, row in config["global"]["samples"].iterrows(): - if ( row["sample"], row["unit"] ) in config["global"]["sample-units"]: + if (row["sample"], row["unit"]) in config["global"]["sample-units"]: raise Exception( - "Multiple rows with identical sample name and unit found in samples table: " + - str(row["sample"]) + " " + str(row["unit"]) + "Multiple rows with identical sample name and unit found in samples table: " + + str(row["sample"]) + + " " + + str(row["unit"]) ) - config["global"]["sample-units"].append(( row["sample"], row["unit"] )) + config["global"]["sample-units"].append((row["sample"], row["unit"])) # Do a check that the sample and unit names are valid file names. # They are used for file names, and would cause weird errors if they contain bad chars. - if not valid_filename( row["sample"] ) or not valid_filename( row["unit"] ): + if not valid_filename(row["sample"]) or not valid_filename(row["unit"]): raise Exception( - "Invalid sample name or unit name found in samples table that contains characters " + - "which cannot be used as sample/unit names for naming output files: " + - str(row["sample"]) + " " + str(row["unit"]) + - "; for maximum robustness, we only allow alpha-numerical, dots, dashes, and underscores. " + - "Use for example the script `tools/copy-samples.py --mode link [...] --clean` " + - "to create a new samples table and symlinks to the existing fastq files to solve this." + "Invalid sample name or unit name found in samples table that contains characters " + + "which cannot be used as sample/unit names for naming output files: " + + str(row["sample"]) + + " " + + str(row["unit"]) + + "; for maximum robustness, we only allow alpha-numerical, dots, dashes, and underscores. 
" + + "Use for example the script `tools/copy-samples.py --mode link [...] --clean` " + + "to create a new samples table and symlinks to the existing fastq files to solve this." ) # Do a check of the fastq file names. - if not os.path.isfile(row["fq1"]) or ( - not pd.isnull(row["fq2"]) and not os.path.isfile(row["fq2"]) - ): + if not os.path.isfile(row["fq1"]) or (not pd.isnull(row["fq2"]) and not os.path.isfile(row["fq2"])): raise Exception( - "Input fastq files listed in the input files table " + config["data"]["samples-table"] + - " not found: " + str(row["fq1"]) + "; " + str(row["fq2"]) + "Input fastq files listed in the input files table " + + config["data"]["samples-table"] + + " not found: " + + str(row["fq1"]) + + "; " + + str(row["fq2"]) ) - if not valid_filepath(row["fq1"]) or ( - not pd.isnull(row["fq2"]) and not valid_filepath(row["fq2"]) - ): + if not valid_filepath(row["fq1"]) or (not pd.isnull(row["fq2"]) and not valid_filepath(row["fq2"])): problematic_filenames += 1 - if not os.path.isabs(row["fq1"]) or ( - not pd.isnull(row["fq2"]) and not os.path.isabs(row["fq2"]) - ): + if not os.path.isabs(row["fq1"]) or (not pd.isnull(row["fq2"]) and not os.path.isabs(row["fq2"])): relative_filenames += 1 # Warning about input names and files. if problematic_filenames > 0: logger.warning( - str(problematic_filenames) + " of the " + str(len(config["global"]["sample-names"])) + - " input samples listed in the input files table " + config["data"]["samples-table"] + - " contain problematic characters. We generally advise to only use alpha-numeric " + str(problematic_filenames) + + " of the " + + str(len(config["global"]["sample-names"])) + + " input samples listed in the input files table " + + config["data"]["samples-table"] + + " contain problematic characters. We generally advise to only use alpha-numeric " "characters, dots, dashes, and underscores. " "Use for example the script `tools/copy-samples.py --mode link [...] --clean` " "to create a new samples table and symlinks to the existing fastq files to solve this. " @@ -367,9 +384,12 @@ if problematic_filenames > 0: ) if relative_filenames > 0: logger.warning( - str(relative_filenames) + " of the " + str(len(config["global"]["sample-names"])) + - " input samples listed in the input files table " + config["data"]["samples-table"] + - " use relative file paths. We generally advise to only use absolute paths. " + str(relative_filenames) + + " of the " + + str(len(config["global"]["sample-names"])) + + " input samples listed in the input files table " + + config["data"]["samples-table"] + + " use relative file paths. We generally advise to only use absolute paths. " "Use for example the script `tools/generate-table.py` " "to create a samples table with absolute paths. " "We will try to continue running with these files, but it might lead to errors.\n" @@ -377,14 +397,16 @@ if relative_filenames > 0: del problematic_filenames, relative_filenames + # Check if a given string can be converted to a number, https://stackoverflow.com/q/354038/4184258 def is_number(s): try: - float(s) # for int, long, float + float(s) # for int, long, float except ValueError: return False return True + # We check if any sample names are only numbers, and warn about this. Bioinformatics is messy... 
numeric_sample_names = 0 for sn in config["global"]["sample-names"]: @@ -393,11 +415,14 @@ for sn in config["global"]["sample-names"]: if numeric_sample_names > 0: logger.warning( - str(numeric_sample_names) + " of the " + str(len(config["global"]["sample-names"])) + - " input sample names listed in the input files table " + config["data"]["samples-table"] + - " are just numbers, or can be converted to numbers. We generally advise to avoid this, " + str(numeric_sample_names) + + " of the " + + str(len(config["global"]["sample-names"])) + + " input sample names listed in the input files table " + + config["data"]["samples-table"] + + " are just numbers, or can be converted to numbers. We generally advise to avoid this, " "as it might confuse downstream processing such as Excel and R. The same applies for names " - "that can be converted to dates (\"Dec21\"), but we do not check this here. " + 'that can be converted to dates ("Dec21"), but we do not check this here. ' "We will now continue running with these files, as we can work with this here, " "but recommend to change the sample names.\n" ) diff --git a/workflow/rules/mapping-bowtie2.smk b/workflow/rules/mapping-bowtie2.smk index 7829144..920eed9 100644 --- a/workflow/rules/mapping-bowtie2.smk +++ b/workflow/rules/mapping-bowtie2.smk @@ -2,6 +2,7 @@ # Genome Indexing # ================================================================================================= + # Bowtie needs its own index of the reference genome. Of course it does. rule bowtie2_index: input: @@ -9,27 +10,30 @@ rule bowtie2_index: ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), output: expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2" ] - ) + ext=["1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"], + ), params: # Bowtie expects the prefix, and creates the above output files automatically. # So, let's do some snakemake magic to make this work. # Let's hope that they do not change the naming of their ouput files in the future. - basename=config["data"]["reference-genome"] + basename=config["data"]["reference-genome"], conda: "../envs/bowtie2.yaml" log: - os.path.dirname(config["data"]["reference-genome"]) + "/logs/bowtie2_index.log" + os.path.dirname(config["data"]["reference-genome"]) + "/logs/bowtie2_index.log", shell: "bowtie2-build {input[0]} {params.basename} > {log} 2>&1" + # Rule is not submitted as a job to the cluster. 
-localrules: bowtie2_index +localrules: + bowtie2_index, + # ================================================================================================= # Read Mapping @@ -39,7 +43,8 @@ localrules: bowtie2_index if len(config["params"]["samtools"]["temp-dir"]) > 0: os.makedirs(config["params"]["samtools"]["temp-dir"], exist_ok=True) -def get_bowtie2_extra( wildcards ): + +def get_bowtie2_extra(wildcards): extra = r"--rg-id \"" + wildcards.sample + r"\"" rg_tags = get_read_group_tags(wildcards) for tag in rg_tags: @@ -47,28 +52,28 @@ def get_bowtie2_extra( wildcards ): extra += " " + config["params"]["bowtie2"]["extra"] return extra + rule map_reads: input: sample=get_trimmed_reads, indices=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2" ] - ) + ext=["1.bt2", "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2", "rev.2.bt2"], + ), output: pipe("mapped/{sample}-{unit}.bam"), - touch("mapped/{sample}-{unit}.done") + touch("mapped/{sample}-{unit}.done"), params: # Prefix of reference genome index (built with bowtie2-build above) index=config["data"]["reference-genome"], - extra=get_bowtie2_extra - threads: - # Use at least two threads - config["params"]["bowtie2"]["threads"] + extra=get_bowtie2_extra, + # Use at least two threads + threads: config["params"]["bowtie2"]["threads"] # resources: - # Increase time limit in factors of 2h, if the job fails due to time limit. - # time = lambda wildcards, input, threads, attempt: int(120 * int(attempt)) + # Increase time limit in factors of 2h, if the job fails due to time limit. + # time = lambda wildcards, input, threads, attempt: int(120 * int(attempt)) log: - "logs/bowtie2/{sample}-{unit}.log" + "logs/bowtie2/{sample}-{unit}.log", benchmark: "benchmarks/bowtie2/{sample}-{unit}.bench.log" group: @@ -80,26 +85,27 @@ rule map_reads: wrapper: "0.74.0/bio/bowtie2/align" + # The bowtie wrapper above is different from the bwa mem wrapper, and does not sort. # Why is everything so inconsistent? Anyway, that means we have to do the sorting step here. # At least, we can pipe the files from above to here, so this should not slow us down. rule sort_reads: input: - "mapped/{sample}-{unit}.bam" + "mapped/{sample}-{unit}.bam", output: ( "mapped/{sample}-{unit}.sorted.bam" if config["settings"]["keep-intermediate"]["mapping"] else temp("mapped/{sample}-{unit}.sorted.bam") ), - touch("mapped/{sample}-{unit}.sorted.done") + touch("mapped/{sample}-{unit}.sorted.done"), params: extra=config["params"]["samtools"]["sort"], - tmp_dir=config["params"]["samtools"]["temp-dir"] - threads: # Samtools takes additional threads through its option -@ - 1 # This value - 1 will be sent to -@. Weird flex, but okay. + tmp_dir=config["params"]["samtools"]["temp-dir"], + # Samtools takes additional threads through its option -@ + threads: 1 # This value - 1 will be sent to -@. Weird flex, but okay. log: - "logs/samtools/sort/{sample}-{unit}.log" + "logs/samtools/sort/{sample}-{unit}.log", group: "mapping" conda: diff --git a/workflow/rules/mapping-bwa-aln.smk b/workflow/rules/mapping-bwa-aln.smk index e3b0b42..4fd7d15 100644 --- a/workflow/rules/mapping-bwa-aln.smk +++ b/workflow/rules/mapping-bwa-aln.smk @@ -4,6 +4,7 @@ import platform # Read Mapping # ================================================================================================= + # We need a special rule that gives us exactly one fastq file. 
Our get_trimmed_reads() returns # a list of either one or two fastq files, depending on whether it is a single or paired end # sample. bwa aln cannot work with this, so we need to call it independently, and hence need @@ -12,8 +13,8 @@ def get_single_trimmed_read(wildcards): # Get the list that comes from the trimming tool. list = get_trimmed_reads(wildcards) - if len(list) not in [ 1, 2 ]: - raise Exception( "Invalid trimming tool result that is neither se nor pe" ) + if len(list) not in [1, 2]: + raise Exception("Invalid trimming tool result that is neither se nor pe") # Return one value of the two, with a bit of internal error checking. # Probably more error checking than necessary, but better safe than sorry. @@ -21,10 +22,11 @@ def get_single_trimmed_read(wildcards): return list[0] elif wildcards.pair == "2": if len(list) != 2: - raise Exception( "Invalid pair number 2 requested for single end sample" ) + raise Exception("Invalid pair number 2 requested for single end sample") return list[1] else: - raise Exception( "Invalid pair number that is not in [1,2] requested" ) + raise Exception("Invalid pair number that is not in [1,2] requested") + rule map_reads: input: @@ -32,39 +34,39 @@ rule map_reads: ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] - ) + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], + ), output: ( "sai/{sample}-{unit}-{pair}.sai" if config["settings"]["keep-intermediate"]["mapping"] else temp("sai/{sample}-{unit}-{pair}.sai") ), - touch("sai/{sample}-{unit}-{pair}.done") + touch("sai/{sample}-{unit}-{pair}.done"), # Absolutely no idea why the following does not work. Snakemake complains that the pipe # is not used at all, which is simply wrong, as it is clearly used by bwa_sai_to_bam, - # and also why would this rul here be called if its output file was not requested at all?! + # and also why would this rule here be called if its output file was not requested at all?! # Snakemake... no idea what you are doing there again. # pipe( "sai/{sample}-{unit}-{pair}.sai" ) params: index=config["data"]["reference-genome"], - extra=config["params"]["bwaaln"]["extra"] + extra=config["params"]["bwaaln"]["extra"], group: "mapping" log: - "logs/bwa-aln/{sample}-{unit}-{pair}.log" + "logs/bwa-aln/{sample}-{unit}-{pair}.log", benchmark: "benchmarks/bwa-aln/{sample}-{unit}-{pair}.bench.log" - threads: - config["params"]["bwaaln"]["threads"] + threads: config["params"]["bwaaln"]["threads"] # resources: - # Increase time limit in factors of 2h, if the job fails due to time limit. - # time = lambda wildcards, input, threads, attempt: int(120 * int(attempt)) + # Increase time limit in factors of 2h, if the job fails due to time limit. + # time = lambda wildcards, input, threads, attempt: int(120 * int(attempt)) conda: "../envs/bwa.yaml" wrapper: "0.74.0/bio/bwa/aln" + # ================================================================================================= # Convert to bam # ================================================================================================= @@ -78,62 +80,65 @@ if len(config["params"]["samtools"]["temp-dir"]) > 0: # Apparently, it uses the input fastq files to determine if we have se or pe data. Smart. # Still, we ned a bit of trickery to get it to work. + # Adapted from get_trimmed_reads, but replace fastq by our sai files produced in the rule above. def get_sai(wildcards): if is_single_end(**wildcards): # Single end sample. 
- return [ "sai/{sample}-{unit}-1.sai".format(**wildcards) ] + return ["sai/{sample}-{unit}-1.sai".format(**wildcards)] elif config["settings"]["merge-paired-end-reads"]: # Merged paired-end samples. # Here, we rely on the fact that get_trimmed_reads() only returns a single sample for # merged paired-end samples, which is then the only one in the list returned from that # function. So then, we pretend that this is the "pair == 1" sample, so that the bwa aln # rule maps that file. - return [ "sai/{sample}-{unit}-1.sai".format(**wildcards) ] + return ["sai/{sample}-{unit}-1.sai".format(**wildcards)] else: # Paired-end sample. return expand( "sai/{sample}-{unit}-{pair}.sai", - sample=wildcards.sample, unit=wildcards.unit, pair=[1, 2] + sample=wildcards.sample, + unit=wildcards.unit, + pair=[1, 2], ) -def get_bwa_aln_extra( wildcards ): + +def get_bwa_aln_extra(wildcards): # We need the read group tags, including `ID` and `SM`, as downstream tools use these. # Contrary to bwa mem, this is here specified with lowercase -r, instead of uppercase -R. # As if bioinformatics tools were ever consistent... - rg_tags = "\\t".join( get_read_group_tags(wildcards) ) + rg_tags = "\\t".join(get_read_group_tags(wildcards)) extra = "-r '@RG\\t" + rg_tags + "' " + config["params"]["bwaaln"]["extra-sam"] return extra + rule bwa_sai_to_bam: input: fastq=get_trimmed_reads, sai=get_sai, ref=config["data"]["reference-genome"], - # Somehow, the wrapper expects the index extensions to be given, # instead of the underlying fasta file... Well, so let's do that. # We provide the fasta above as well; it's not used, # but might be important as a rule dependency so that it is present. idx=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] - ) + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], + ), output: - pipe( "mapped/{sample}-{unit}.sorted-unclean.bam" ), - touch( "mapped/{sample}-{unit}.sorted-unclean.done" ) + pipe("mapped/{sample}-{unit}.sorted-unclean.bam"), + touch("mapped/{sample}-{unit}.sorted-unclean.done"), params: extra=get_bwa_aln_extra, - # Sort as we need it. sort="samtools", sort_order="coordinate", sort_extra=config["params"]["samtools"]["sort"], - tmp_dir=config["params"]["samtools"]["temp-dir"] + tmp_dir=config["params"]["samtools"]["temp-dir"], group: "mapping" log: - "logs/bwa-sam/{sample}-{unit}.log" + "logs/bwa-sam/{sample}-{unit}.log", benchmark: "benchmarks/bwa-sam/{sample}-{unit}.bench.log" conda: @@ -144,8 +149,11 @@ rule bwa_sai_to_bam: # We use our own version of the wrapper here, as that wrapper misses temp dirs for # samtools sort, causing all kinds of trouble... "../scripts/bwa-samxe.py" - # wrapper: - # "0.74.0/bio/bwa/samxe" + + +# wrapper: +# "0.74.0/bio/bwa/samxe" + # Apparently, yet another bioinformatics tool fail is at play here. The bam files written above # lead to a SAM validation error: ERROR::INVALID_MAPPING_QUALITY, MAPQ should be 0 for unmapped read @@ -154,25 +162,23 @@ rule bwa_sai_to_bam: # can work with those files again. rule bwa_bam_clean: input: - "mapped/{sample}-{unit}.sorted-unclean.bam" + "mapped/{sample}-{unit}.sorted-unclean.bam", output: ( "mapped/{sample}-{unit}.sorted.bam" if config["settings"]["keep-intermediate"]["mapping"] else temp("mapped/{sample}-{unit}.sorted.bam") ), - touch("mapped/{sample}-{unit}.sorted.done") + touch("mapped/{sample}-{unit}.sorted.done"), params: # See duplicates-picard.smk for the reason whe need this on MacOS. 
- extra = ( - " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" - if platform.system() == "Darwin" - else "" - ) + extra=( + " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" if platform.system() == "Darwin" else "" + ), group: "mapping" log: - "logs/picard/cleansam/{sample}-{unit}.log" + "logs/picard/cleansam/{sample}-{unit}.log", conda: "../envs/picard.yaml" shell: diff --git a/workflow/rules/mapping-bwa-mem.smk b/workflow/rules/mapping-bwa-mem.smk index b562a1f..92717a6 100644 --- a/workflow/rules/mapping-bwa-mem.smk +++ b/workflow/rules/mapping-bwa-mem.smk @@ -6,49 +6,49 @@ if len(config["params"]["samtools"]["temp-dir"]) > 0: os.makedirs(config["params"]["samtools"]["temp-dir"], exist_ok=True) -def get_bwa_mem_extra( wildcards ): - rg_tags = "\\t".join( get_read_group_tags(wildcards) ) + +def get_bwa_mem_extra(wildcards): + rg_tags = "\\t".join(get_read_group_tags(wildcards)) extra = "-R '@RG\\t" + rg_tags + "' " + config["params"]["bwamem"]["extra"] return extra + rule map_reads: input: reads=get_trimmed_reads, ref=config["data"]["reference-genome"], idx=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] - ) + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], + ), output: ( "mapped/{sample}-{unit}.sorted.bam" if config["settings"]["keep-intermediate"]["mapping"] else temp("mapped/{sample}-{unit}.sorted.bam") ), - touch("mapped/{sample}-{unit}.sorted.done") + touch("mapped/{sample}-{unit}.sorted.done"), params: index=config["data"]["reference-genome"], extra=get_bwa_mem_extra, - # Sort as we need it. Samtools provided via the two params for new and old wrapper versions. sort="samtools", sorting="samtools", sort_order="coordinate", sort_extra=config["params"]["samtools"]["sort"], - tmp_dir=config["params"]["samtools"]["temp-dir"] + tmp_dir=config["params"]["samtools"]["temp-dir"], group: "mapping" log: - "logs/bwa-mem/{sample}-{unit}.log" + "logs/bwa-mem/{sample}-{unit}.log", benchmark: "benchmarks/bwa-mem/{sample}-{unit}.bench.log" - threads: - config["params"]["bwamem"]["threads"] + threads: config["params"]["bwamem"]["threads"] conda: "../envs/bwa.yaml" # resources: - # Increase time limit in factors of 2h, if the job fails due to time limit. - # time = lambda wildcards, input, threads, attempt: int(120 * int(attempt)) + # Increase time limit in factors of 2h, if the job fails due to time limit. + # time = lambda wildcards, input, threads, attempt: int(120 * int(attempt)) # This wrapper version uses a proper tmp dir, so that the below shadow rule is not needed. # It caused trouble when running large cluster jobs with high number of parallel jobs, @@ -57,14 +57,15 @@ rule map_reads: wrapper: "0.80.0/bio/bwa/mem" - # samtools sort creates tmp files that are not cleaned up when the cluster job runs out - # of time, but which cause samtools to immediately terminate if called again, meaning - # that we cannot run it again with more time. We hence use a full shadow directory. - # We experimented with all kinds of other solutions, such as setting the `-T` option of - # `samtools sort` via the `sort_extra` param, and creating and deleting tmp directories for that, - # but that fails due to issues with snakemake handling directories instead of files... - # The snakemake shadow rule here gets the job done. 
- # shadow: - # "full" - # wrapper: - # "0.51.3/bio/bwa/mem" + +# samtools sort creates tmp files that are not cleaned up when the cluster job runs out +# of time, but which cause samtools to immediately terminate if called again, meaning +# that we cannot run it again with more time. We hence use a full shadow directory. +# We experimented with all kinds of other solutions, such as setting the `-T` option of +# `samtools sort` via the `sort_extra` param, and creating and deleting tmp directories for that, +# but that fails due to issues with snakemake handling directories instead of files... +# The snakemake shadow rule here gets the job done. +# shadow: +# "full" +# wrapper: +# "0.51.3/bio/bwa/mem" diff --git a/workflow/rules/mapping-bwa-mem2.smk b/workflow/rules/mapping-bwa-mem2.smk index 370653c..85e1262 100644 --- a/workflow/rules/mapping-bwa-mem2.smk +++ b/workflow/rules/mapping-bwa-mem2.smk @@ -6,47 +6,46 @@ if len(config["params"]["samtools"]["temp-dir"]) > 0: os.makedirs(config["params"]["samtools"]["temp-dir"], exist_ok=True) -def get_bwa_mem2_extra( wildcards ): - rg_tags = "\\t".join( get_read_group_tags(wildcards) ) + +def get_bwa_mem2_extra(wildcards): + rg_tags = "\\t".join(get_read_group_tags(wildcards)) extra = "-R '@RG\\t" + rg_tags + "' " + config["params"]["bwamem2"]["extra"] return extra + rule map_reads: input: reads=get_trimmed_reads, ref=config["data"]["reference-genome"], - # Somehow, the wrapper expects the index extensions to be given, # instead of the underlying fasta file... Well, so let's do that. # We provide the fasta above as well; it's not used, # but might be important as a rule dependency so that it is present. idx=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "0123", "amb", "ann", "bwt.2bit.64", "pac" ] - ) + ext=["0123", "amb", "ann", "bwt.2bit.64", "pac"], + ), output: ( "mapped/{sample}-{unit}.sorted.bam" if config["settings"]["keep-intermediate"]["mapping"] else temp("mapped/{sample}-{unit}.sorted.bam") ), - touch("mapped/{sample}-{unit}.sorted.done") + touch("mapped/{sample}-{unit}.sorted.done"), params: extra=get_bwa_mem2_extra, - # Sort as we need it. sort="samtools", sort_order="coordinate", sort_extra=config["params"]["samtools"]["sort"], - tmp_dir=config["params"]["samtools"]["temp-dir"] + tmp_dir=config["params"]["samtools"]["temp-dir"], group: "mapping" log: - "logs/bwa-mem2/{sample}-{unit}.log" + "logs/bwa-mem2/{sample}-{unit}.log", benchmark: "benchmarks/bwa-mem2/{sample}-{unit}.bench.log" - threads: - config["params"]["bwamem2"]["threads"] + threads: config["params"]["bwamem2"]["threads"] conda: # As always, we need our own env here that overwrites the python/pandas/numpy stack # to make sure that we do not run into a version conflict. @@ -55,5 +54,7 @@ rule map_reads: # We use our own version of the wrapper here, as that wrapper misses temp dirs for # samtools sort, causing all kinds of trouble... "../scripts/bwa-mem2-mem.py" - # wrapper: - # "0.78.0/bio/bwa-mem2/mem" + + +# wrapper: +# "0.78.0/bio/bwa-mem2/mem" diff --git a/workflow/rules/mapping-recalibrate.smk b/workflow/rules/mapping-recalibrate.smk index 1c548d8..06752db 100644 --- a/workflow/rules/mapping-recalibrate.smk +++ b/workflow/rules/mapping-recalibrate.smk @@ -12,23 +12,24 @@ if config["settings"]["recalibrate-base-qualities"]: "Setting recalibrate-base-qualities can only be activated if a known-variants " "file is also provided, as GATK BaseRecalibrator needs this." 
) - if 'platform' not in config["global"]["samples"] and not config["params"]["gatk"]["platform"]: + if "platform" not in config["global"]["samples"] and not config["params"]["gatk"]["platform"]: raise Exception( "Setting recalibrate-base-qualities can only be activated if either a `platform` column " - "is provided in the input samples table (" + config["global"]["samples"] + - ") or if a platform for all samples is given in the `params: gatk: platform` setting " + "is provided in the input samples table (" + + config["global"]["samples"] + + ") or if a platform for all samples is given in the `params: gatk: platform` setting " "in the config file." ) - if 'platform' in config["global"]["samples"] and config["params"]["gatk"]["platform"]: + if "platform" in config["global"]["samples"] and config["params"]["gatk"]["platform"]: logger.warning( "The samples table contains a column `platform` for each sample, " "but the `params: gatk: platform` setting is also provided. " "We will use the table, as this is more specific." ) if config["params"]["gatk"]["platform"]: - platforms = set( config["params"]["gatk"]["platform"] ) - if 'platform' in config["global"]["samples"]: - platforms = set( config["global"]["samples"]["platform"] ) + platforms = set(config["params"]["gatk"]["platform"]) + if "platform" in config["global"]["samples"]: + platforms = set(config["global"]["samples"]["platform"]) for p in ["CAPILLARY", "LS454", "ILLUMINA", "SOLID", "HELICOS", "IONTORRENT", "ONT", "PACBIO"]: if p in platforms: platforms.remove(p) @@ -39,6 +40,7 @@ if config["settings"]["recalibrate-base-qualities"]: "GATK BaseRecalibrator, and hence might lead to errors: " + str(platforms) ) + def get_recal_input(bai=False): # case 1: no duplicate removal f = "mapped/{sample}.merged.bam" @@ -66,6 +68,7 @@ def get_recal_input(bai=False): else: return f + rule recalibrate_base_qualities: input: bam=get_recal_input(), @@ -73,22 +76,22 @@ rule recalibrate_base_qualities: ref=config["data"]["reference-genome"], refidcs=expand( config["data"]["reference-genome"] + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ext=["amb", "ann", "bwt", "pac", "sa", "fai"], ), refdict=genome_dict(), - known=config["data"]["known-variants"] + known=config["data"]["known-variants"], output: bam=( "recal/{sample}.bam" if config["settings"]["keep-intermediate"]["mapping"] else temp("recal/{sample}.bam") ), - done=touch("recal/{sample}.done") + done=touch("recal/{sample}.done"), # bam=protected("recal/{sample}.bam") params: - extra=get_gatk_regions_param() + " " + config["params"]["gatk"]["BaseRecalibrator"] + extra=get_gatk_regions_param() + " " + config["params"]["gatk"]["BaseRecalibrator"], log: - "logs/gatk/bqsr/{sample}.log" + "logs/gatk/bqsr/{sample}.log", benchmark: "benchmarks/gatk/bqsr/{sample}.bench.log" group: diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index eace6c8..25314d2 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -2,21 +2,26 @@ # Read Group and Helper Functions # ================================================================================================= + # We here get the read group tags list that is used per sample for the reads in the bam file. # This differs a bit depending on whether the `platform` field is properly set in the samples table. # We do not construct a full string here, as mapping tools take this information in different ways. 
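# For illustration (hypothetical sample "S1", unit "U1", platform ILLUMINA): the function below
# returns ["ID:S1-U1", "SM:S1", "PL:ILLUMINA"]; bwa mem receives this joined as
# -R '@RG\tID:S1-U1\tSM:S1\tPL:ILLUMINA', while bwa aln takes the same tags via lowercase -r.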
-def get_read_group_tags( wildcards ): +def get_read_group_tags(wildcards): # We need the @RG read group tags, including `ID` and `SM`, as downstream tools use these. # Potentially, we also set the PL platform. Also add the config extra settings. - res = [ "ID:" + wildcards.sample + "-" + wildcards.unit, "SM:" + wildcards.sample ] + res = ["ID:" + wildcards.sample + "-" + wildcards.unit, "SM:" + wildcards.sample] # TODO Add LD? LB? field as well for the unit?! http://www.htslib.org/workflow/ # Add platform information, if available, giving precedence to the table over the config. pl = "" - if 'platform' in config["params"]["gatk"] and config["params"]["gatk"]["platform"]: + if "platform" in config["params"]["gatk"] and config["params"]["gatk"]["platform"]: pl = config["params"]["gatk"]["platform"] - if 'platform' in config["global"]["samples"]: - s = config["global"]["samples"].loc[(wildcards.sample, wildcards.unit), ["platform"]].dropna() + if "platform" in config["global"]["samples"]: + s = ( + config["global"]["samples"] + .loc[(wildcards.sample, wildcards.unit), ["platform"]] + .dropna() + ) # We catch the case that the platform column is present, but empty for a given row. # This would lead to an error, see https://stackoverflow.com/q/610883/4184258 try: @@ -24,7 +29,7 @@ def get_read_group_tags( wildcards ): except: pl = "" if pl: - res.append( "PL:" + pl ) + res.append("PL:" + pl) return res # For reference: https://github.com/snakemake-workflows/dna-seq-gatk-variant-calling/blob/cffa77a9634801152971448bd41d0687cf765723/workflow/rules/common.smk#L60 @@ -33,6 +38,7 @@ def get_read_group_tags( wildcards ): # platform=units.loc[(wildcards.sample, wildcards.unit), "platform"], # ) + # Helper function needed by some of the GATK tools, here and in the calling. def get_gatk_regions_param(regions=config["settings"].get("restrict-regions"), default=""): if regions: @@ -44,45 +50,57 @@ def get_gatk_regions_param(regions=config["settings"].get("restrict-regions"), d return params return default + # ================================================================================================= # Read Mapping # ================================================================================================= # Switch to the chosen mapper +mapping_tool_good = False if config["settings"]["mapping-tool"] == "bwaaln": # Use `bwa aln` include: "mapping-bwa-aln.smk" + mapping_tool_good = True + elif config["settings"]["mapping-tool"] == "bwamem": # Use `bwa mem` include: "mapping-bwa-mem.smk" + mapping_tool_good = True + elif config["settings"]["mapping-tool"] == "bwamem2": # Use `bwa mem2` include: "mapping-bwa-mem2.smk" + mapping_tool_good = True + elif config["settings"]["mapping-tool"] == "bowtie2": # Use `bowtie2` include: "mapping-bowtie2.smk" -else: + mapping_tool_good = True + + +# Weird syntax instead of just using an else branch here, due to +# https://github.com/snakemake/snakefmt/issues/239#issuecomment-2223276169 +if not mapping_tool_good: raise Exception("Unknown mapping-tool: " + config["settings"]["mapping-tool"]) # ================================================================================================= # Merge per-sample bam files # ================================================================================================= + # We need a helper function to expand based on wildcards. # The file names used here is what all the above mappers are expected to produce. 
def get_sorted_sample_bams(wildcards): - return expand( - "mapped/{{sample}}-{unit}.sorted.bam", - unit=get_sample_units(wildcards.sample) - ) + return expand("mapped/{{sample}}-{unit}.sorted.bam", unit=get_sample_units(wildcards.sample)) + # This is where all units are merged together. # We changed this behaviour. grenepipe v0.11.1 and before did _all_ the steps in this file @@ -92,60 +110,60 @@ def get_sorted_sample_bams(wildcards): # instead of potentially accidentally considering the different units as individual samples. rule merge_sample_unit_bams: input: - get_sorted_sample_bams + get_sorted_sample_bams, output: "mapped/{sample}.merged.bam", - touch("mapped/{sample}.merged.done") + touch("mapped/{sample}.merged.done"), params: - extra=config["params"]["samtools"]["merge"] - threads: - config["params"]["samtools"]["merge-threads"] + extra=config["params"]["samtools"]["merge"], + threads: config["params"]["samtools"]["merge-threads"] log: - "logs/samtools/merge/merge-{sample}.log" + "logs/samtools/merge/merge-{sample}.log", wrapper: "v3.13.6/bio/samtools/merge" + # ================================================================================================= # Filtering and Clipping Mapped Reads # ================================================================================================= + rule filter_mapped_reads: input: - "mapped/{sample}.merged.bam" + "mapped/{sample}.merged.bam", output: ( "mapped/{sample}.filtered.bam" if config["settings"]["keep-intermediate"]["mapping"] else temp("mapped/{sample}.filtered.bam") ), - touch("mapped/{sample}.filtered.done") + touch("mapped/{sample}.filtered.done"), params: - extra=config["params"]["samtools"]["view"] + " -b" + extra=config["params"]["samtools"]["view"] + " -b", conda: # Need our own env again, because of conflicting numpy and pandas version... "../envs/samtools.yaml" wrapper: "0.85.0/bio/samtools/view" + rule clip_read_overlaps: input: # Either get the mapped reads directly, or if we do filtering before, take that instead. - "mapped/{sample}.filtered.bam" if ( - config["settings"]["filter-mapped-reads"] - ) else ( - "mapped/{sample}.merged.bam" - ) + "mapped/{sample}.filtered.bam" + if (config["settings"]["filter-mapped-reads"]) + else ("mapped/{sample}.merged.bam"), output: ( "mapped/{sample}.clipped.bam" if config["settings"]["keep-intermediate"]["mapping"] else temp("mapped/{sample}.clipped.bam") ), - touch("mapped/{sample}.clipped.done") + touch("mapped/{sample}.clipped.done"), params: - extra=config["params"]["bamutil"]["extra"] + extra=config["params"]["bamutil"]["extra"], log: - "logs/bamutil/{sample}.log" + "logs/bamutil/{sample}.log", benchmark: "benchmarks/bamutil/{sample}.bench.log" conda: @@ -153,6 +171,7 @@ rule clip_read_overlaps: shell: "bam clipOverlap --in {input[0]} --out {output[0]} {params.extra} &> {log}" + def get_mapped_reads(wildcards): """Get mapped reads of given sample (with merged units), either unfiltered, filtered, and/or clipped.""" @@ -171,6 +190,7 @@ def get_mapped_reads(wildcards): return result + # We need to get a bit dirty here, as dedup names output files based on input file names, # so that depending on what kind of input (mapped, filtered, clipped) we give it, # we get differently named output files, which does not work well with snakemake. 
@@ -186,11 +206,13 @@ def get_mapped_read_infix(): result = "clipped" return result + # ================================================================================================= # Mark Duplicates # ================================================================================================= # Switch to the chosen duplicate marker tool +duplicates_tool_good = False if config["settings"]["duplicates-tool"] == "picard": # Bioinformatics tools are messed up... @@ -210,12 +232,18 @@ if config["settings"]["duplicates-tool"] == "picard": # Use `picard` include: "duplicates-picard.smk" + duplicates_tool_good = True + elif config["settings"]["duplicates-tool"] == "dedup": # Use `dedup` include: "duplicates-dedup.smk" -else: + duplicates_tool_good = True + + +# Again, fix for https://github.com/snakemake/snakefmt/issues/239#issuecomment-2223276169 +if not duplicates_tool_good: raise Exception("Unknown duplicates-tool: " + config["settings"]["duplicates-tool"]) # ================================================================================================= @@ -223,26 +251,30 @@ else: # ================================================================================================= if config["settings"]["recalibrate-base-qualities"]: + include: "mapping-recalibrate.smk" + # ================================================================================================= # Indexing # ================================================================================================= + # Generic rule for all bai files. # Bit weird as it produces log files with nested paths, but that's okay for now. rule bam_index: input: - "{prefix}.bam" + "{prefix}.bam", output: - "{prefix}.bam.bai" + "{prefix}.bam.bai", log: - "logs/samtools/index/{prefix}.log" + "logs/samtools/index/{prefix}.log", group: "mapping_extra" wrapper: "0.51.3/bio/samtools/index" + # ================================================================================================= # Final Mapping Result # ================================================================================================= @@ -254,6 +286,7 @@ rule bam_index: # As each of these optional steps itself also takes care of using previous optional steps # (see above), what we get here is the file name of the last step that we want to apply. + def get_mapping_result(bai=False): # case 1: no duplicate removal f = "mapped/{sample}.merged.bam" @@ -280,59 +313,63 @@ def get_mapping_result(bai=False): return f + # Return the bam file for a given sample. # Get all aligned reads of given sample, with all its units merged. # The function automatically gets which of the mapping results to use, depending on the config # setting (whether to remove duplicates, and whether to recalibrate the base qualities), # by using the get_mapping_result function, that gives the respective files depending on the config. def get_sample_bams(sample): - return expand( - get_mapping_result(), - sample=sample - ) + return expand(get_mapping_result(), sample=sample) + # Return the bai file(s) for a given sample def get_sample_bais(sample): - return expand( - get_mapping_result(True), - sample=sample - ) + return expand(get_mapping_result(True), sample=sample) + # Return the bam file(s) for a sample, given a wildcard param from a rule. def get_sample_bams_wildcards(wildcards): - return get_sample_bams( wildcards.sample ) + return get_sample_bams(wildcards.sample) + # Return the bai file(s) for a sample, given a wildcard param from a rule. 
def get_sample_bais_wildcards(wildcards): - return get_sample_bais( wildcards.sample ) + return get_sample_bais(wildcards.sample) + # Return the bam file(s) for all samples def get_all_bams(): # Make a list of all bams in the order as the samples list. res = list() for smp in config["global"]["sample-names"]: - res.append( get_mapping_result().format( sample=smp )) + res.append(get_mapping_result().format(sample=smp)) return res + # Return the bai file(s) for all samples def get_all_bais(): res = list() for smp in config["global"]["sample-names"]: - res.append( get_mapping_result(True).format( sample=smp )) + res.append(get_mapping_result(True).format(sample=smp)) return res + # ================================================================================================= # All bams, but not SNP calling # ================================================================================================= + # This alternative target rule executes all steps up to th mapping, and yields the final bam # files that would otherwise be used for variant calling in the downstream process. # That is, depending on the config, these are the sorted+merged, filtered, remove duplicates, or # recalibrated base qualities bam files. rule all_bams: input: - get_all_bams() + get_all_bams(), + # The `all_bams` rule is local. It does not do anything anyway, # except requesting the other rules to run. -localrules: all_bams +localrules: + all_bams, diff --git a/workflow/rules/pileup.smk b/workflow/rules/pileup.smk index df4cc87..0115e46 100644 --- a/workflow/rules/pileup.smk +++ b/workflow/rules/pileup.smk @@ -2,24 +2,25 @@ # Merge bams # ================================================================================================= + rule mpileup_merge_all: input: - get_all_bams() + get_all_bams(), output: pipe("mpileup/all.merged.bam"), - touch("mpileup/all.merged.done") + touch("mpileup/all.merged.done"), params: # Need file overwrite flag, see above. - extra=config["params"]["samtools"]["merge"] + " -f" - threads: - # Samtools takes additional threads through its option -@ - # This value - 1 will be sent to -@ - config["params"]["samtools"]["merge-threads"] + extra=config["params"]["samtools"]["merge"] + " -f", + # Samtools takes additional threads through its option -@ + # This value - 1 will be sent to -@ + threads: config["params"]["samtools"]["merge-threads"] log: - "logs/samtools/mpileup/merge-all.log" + "logs/samtools/mpileup/merge-all.log", wrapper: "v3.13.6/bio/samtools/merge" + # ================================================================================================= # Sample Names # ================================================================================================= @@ -32,15 +33,19 @@ rule mpileup_merge_all: # Attention: This order needs to be the same as what the below pilup rules use, # otherwise we miss the point of writing these lists! + rule mpileup_all_sample_names: output: - "mpileup/all-sample-names.txt" + "mpileup/all-sample-names.txt", run: with open(output[0], "w") as out: for sample in config["global"]["sample-names"]: out.write(sample + "\n") -localrules: mpileup_all_sample_names + +localrules: + mpileup_all_sample_names, + # ================================================================================================= # Pileup @@ -51,71 +56,73 @@ localrules: mpileup_all_sample_names # We always use the merged units per sample here. 
This is a change after grenepipe v0.11.1, # as it did not seem like treating each unit of each sample as an individual file made much sense. + rule mpileup_individual_sample: input: - bam=get_sample_bams_wildcards, # provided in mapping.smk - reference_genome=config["data"]["reference-genome"] + bam=get_sample_bams_wildcards, # provided in mapping.smk + reference_genome=config["data"]["reference-genome"], output: - "mpileup/{sample}.mpileup.gz" + "mpileup/{sample}.mpileup.gz", params: - extra=config["params"]["samtools"]["pileup"] + extra=config["params"]["samtools"]["pileup"], log: - "logs/samtools/mpileup/samples-{sample}.log" + "logs/samtools/mpileup/samples-{sample}.log", wrapper: "v3.13.6/bio/samtools/mpileup" + rule mpileup_all_samples: input: bam=get_all_bams(), reference_genome=config["data"]["reference-genome"], - list="mpileup/all-sample-names.txt" + list="mpileup/all-sample-names.txt", output: - "mpileup/all-samples.mpileup.gz" + "mpileup/all-samples.mpileup.gz", params: - extra=config["params"]["samtools"]["pileup"] + extra=config["params"]["samtools"]["pileup"], log: - "logs/samtools/mpileup/all-merged-units.log" + "logs/samtools/mpileup/all-merged-units.log", wrapper: "v3.13.6/bio/samtools/mpileup" + rule mpileup_all_merged_samples: input: bam="mpileup/all.merged.bam", - reference_genome=config["data"]["reference-genome"] + reference_genome=config["data"]["reference-genome"], output: - "mpileup/all-merged-samples.mpileup.gz" + "mpileup/all-merged-samples.mpileup.gz", params: - extra=config["params"]["samtools"]["pileup"] + extra=config["params"]["samtools"]["pileup"], log: - "logs/samtools/mpileup/all-merged-samples.log" + "logs/samtools/mpileup/all-merged-samples.log", wrapper: "v3.13.6/bio/samtools/mpileup" + # ================================================================================================= # All pileups, but not SNP calling # ================================================================================================= + # This alternative target rule executes all steps up to th mapping and mpileup creation. # This is the same as the above all_bams rule, but additionally also requests the pileups. rule all_pileups: input: - expand( - "mpileup/{sample}.mpileup.gz", - sample=config["global"]["sample-names"] - ) if "individual-samples" in config["settings"]["pileups"] else [], - ( - "mpileup/all-samples.mpileup.gz" - if "all-samples" in config["settings"]["pileups"] - else [] - ), + expand("mpileup/{sample}.mpileup.gz", sample=config["global"]["sample-names"]) + if "individual-samples" in config["settings"]["pileups"] + else [], + ("mpileup/all-samples.mpileup.gz" if "all-samples" in config["settings"]["pileups"] else []), ( "mpileup/all-merged-samples.mpileup.gz" if "all-merged-samples" in config["settings"]["pileups"] else [] - ) + ), output: - touch("mpileup/all-pileups.done") + touch("mpileup/all-pileups.done"), + # The `all_pileups` rule is local. It does not do anything anyway, # except requesting the other rules to run. 
-localrules: all_pileups +localrules: + all_pileups, diff --git a/workflow/rules/prep.smk b/workflow/rules/prep.smk index eeb4f10..cab72a9 100644 --- a/workflow/rules/prep.smk +++ b/workflow/rules/prep.smk @@ -2,24 +2,26 @@ # Reference Genome # ================================================================================================= + # Helper function to get the name of the genome dictorary file as expected by GATK def genome_dict(): return os.path.splitext(config["data"]["reference-genome"])[0] + ".dict" + # We define some local variables for simplicity, and delete them later # in order to not spam the global scope by accident. # Get file names from config file. The reference genome file has already been stripped of the # `.gz` extension if present in common. -genome=config["data"]["reference-genome"] +genome = config["data"]["reference-genome"] # We need to remove absolute paths here, otherwise the log files will contain broken paths. -genomename = os.path.basename( config["data"]["reference-genome"] ) -genomedir = os.path.dirname( config["data"]["reference-genome"] ) +genomename = os.path.basename(config["data"]["reference-genome"]) +genomedir = os.path.dirname(config["data"]["reference-genome"]) # We write the log file to where the known variants file is, so that this is independent # of the particular run, making the log files easier to find for users. -genome_logdir = os.path.join( genomedir, "logs" ) +genome_logdir = os.path.join(genomedir, "logs") # In all rules below, we use hard coded file names (no wildcards), as snakemake cannot handle # absolute file paths properly and gives us no reasonable way to use lambdas in the `log` part @@ -27,32 +29,34 @@ genome_logdir = os.path.join( genomedir, "logs" ) # of our input genome. We do not want that - and as this whole prep script here only serves # one purpose (prepare one genome for a given config file), we just hard code for simplicity. + # Write fai indices for the fasta reference genome file. # This is a checkpoint, as downstream rules that parallelize over chromosomes/contigs of the # reference genome will need to read this file in order to get the list of contigs. # See get_fai() in calling.smk for the usage of this checkpoint. checkpoint samtools_faidx: input: - genome + genome, output: - genome + ".fai" + genome + ".fai", log: - os.path.join( genome_logdir, genomename + ".samtools_faidx.log" ) + os.path.join(genome_logdir, genomename + ".samtools_faidx.log"), params: - "" # optional params string + "", # optional params string wrapper: "0.51.3/bio/samtools/faidx" + # Uncompress the reference genome if it is gz, without deleting the original. # Also, we provide our test data in gz-compressed form, in order to keep data in the git repo low. # Hence, we have to decompress first. rule decompress_genome: input: - genome + ".gz" + genome + ".gz", output: - genome + genome, log: - os.path.join( genome_logdir, genomename + ".decompress.log" ) + os.path.join(genome_logdir, genomename + ".decompress.log"), shell: # Cannot use gunzip here, as CentOS does not support the --keep option... # "gunzip --keep {input}" @@ -61,27 +65,30 @@ rule decompress_genome: # So back to gunzip, but with a different arg to keep the original file... "gunzip -c {input} > {output}" + localrules: - decompress_genome + decompress_genome, + # Write indices for a given fasta reference genome file. 
rule bwa_index: input: - genome + genome, output: genome + ".amb", genome + ".ann", genome + ".bwt", genome + ".pac", - genome + ".sa" + genome + ".sa", log: - os.path.join( genome_logdir, genomename + ".bwa_index.log" ) + os.path.join(genome_logdir, genomename + ".bwa_index.log"), params: prefix=genome, - algorithm="bwtsw" + algorithm="bwtsw", wrapper: "0.51.3/bio/bwa/index" + # By default, we use the above bwa index mapping, but here we also have a rule for bwa mem2 indexing, # as this needs some special indices. However, it unfortunately does not produce all indices # of the original bwa index (because... bioinformatics...), so we have to work around this, @@ -96,19 +103,18 @@ if config["settings"]["mapping-tool"] == "bwamem2": # are created there... wow, hacky hacky. rule bwa_mem2_index: input: - genome + genome, output: genome + ".0123", - genome + ".bwt.2bit.64" - + genome + ".bwt.2bit.64", # Files that are also created by bwa mem2 index: amb, ann, pac. # We checked with our test data, and they are identical to the ones produced by # bwa index above, so we can just delete them. params: - tempdir = genomedir + "/bwa-mem2-index-temp/", - basename = genomedir + "/bwa-mem2-index-temp/" + genomename + tempdir=genomedir + "/bwa-mem2-index-temp/", + basename=genomedir + "/bwa-mem2-index-temp/" + genomename, log: - os.path.join( genome_logdir, genomename + ".bwa-mem2_index.log" ) + os.path.join(genome_logdir, genomename + ".bwa-mem2_index.log"), conda: "../envs/bwa-mem2.yaml" shell: @@ -121,24 +127,23 @@ if config["settings"]["mapping-tool"] == "bwamem2": "mv {params.basename}.bwt.2bit.64 {output[1]} ; " "rm -r {params.tempdir}" + # Write a dictionary file for the genome. # The input file extension is replaced by `dict`, instead of adding to it, so we have to trick # around with the output file name here. rule sequence_dictionary: input: - genome + genome, output: - genome_dict() + genome_dict(), params: # See duplicates-picard.smk for the reason whe need this on MacOS. - extra = ( - " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" - if platform.system() == "Darwin" - else "" - ) + extra=( + " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" if platform.system() == "Darwin" else "" + ), # base= lambda wc: os.path.splitext(genome)[0], log: - os.path.join( genome_logdir, genomename + ".sequence_dictionary.log" ) + os.path.join(genome_logdir, genomename + ".sequence_dictionary.log"), conda: "../envs/picard.yaml" # We used GATK here for a while, but for whatever reason, in our GitHub Actions tests, @@ -148,35 +153,39 @@ rule sequence_dictionary: # Trying a (limited) loop now, until it works... 
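    # In short, the loop below makes up to ten attempts: each one removes any partial output,
    # reruns picard CreateSequenceDictionary, and stops once the resulting .dict file has more
    # than one line, i.e. actually contains sequence entries.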
shell: "for ITERATION in `seq 1 10` ; do " - " echo -e \"\\nAttempt $ITERATION\\n\" >> {log} 2>&1 ; " + ' echo -e "\\nAttempt $ITERATION\\n" >> {log} 2>&1 ; ' " rm -f {output} ; " " picard CreateSequenceDictionary " " REFERENCE={input} OUTPUT={output} {params.extra} >> {log} 2>&1 ; " " LENGTH=`cat {output} | wc -l` ; " - " if [ \"$LENGTH\" -gt 1 ]; then " - " echo -e \"\\n{output} has $LENGTH lines \" >> {log} 2>&1 ; " + ' if [ "$LENGTH" -gt 1 ]; then ' + ' echo -e "\\n{output} has $LENGTH lines " >> {log} 2>&1 ; ' " break ; " " fi ; " "done" - # shell: - # "picard CreateSequenceDictionary REFERENCE={input} OUTPUT={output} {params.extra} > {log} 2>&1" - # "gatk CreateSequenceDictionary -R {input} -O {output} --VERBOSITY DEBUG > {log} 2>&1" + + +# shell: +# "picard CreateSequenceDictionary REFERENCE={input} OUTPUT={output} {params.extra} > {log} 2>&1" +# "gatk CreateSequenceDictionary -R {input} -O {output} --VERBOSITY DEBUG > {log} 2>&1" + # Get some statistics about the reference genome rule reference_seqkit: input: - genome + genome, output: - genome + ".seqkit" + genome + ".seqkit", params: - extra = config["params"]["seqkit"]["extra"] + extra=config["params"]["seqkit"]["extra"], log: - os.path.join( genome_logdir, genomename + ".seqkit.log" ) + os.path.join(genome_logdir, genomename + ".seqkit.log"), conda: "../envs/seqkit.yaml" shell: "seqkit stats {input} {params.extra} > {output} 2> {log}" + # ================================================================================================= # Known Variants # ================================================================================================= @@ -189,12 +198,12 @@ rule reference_seqkit: # here, because those rules will never be invoked in that case, and so, snakemake does not seem to # fail then. Still, we make it even more fail safe by setting it to a dummy string then that # will just lead to rules that are never executed (in the case of no known variants file). -variants=config["data"]["known-variants"] +variants = config["data"]["known-variants"] has_known_variants = True if isinstance(variants, list) or not variants: if len(variants) > 0: - raise Exception("Known variants has to be either a file path or an empty list." ) - variants="dummyfile" + raise Exception("Known variants has to be either a file path or an empty list.") + variants = "dummyfile" has_known_variants = False else: # Somehow, some tool (was it GATK?) requires known variants to be in vcf.gz format, @@ -209,49 +218,54 @@ else: variants = os.path.splitext(variants)[0] else: raise Exception( - "Invalid known variants file type: '" + variants + - "'. Needs to be either .vcf or .vcf.gz for some of the tools to work." + "Invalid known variants file type: '" + + variants + + "'. Needs to be either .vcf or .vcf.gz for some of the tools to work." ) # We write the log file to where the known variants file is, so that this is independent # of the particular run, making the log files easier to find for users. -variant_logdir = os.path.join( os.path.dirname(variants), "logs" ) +variant_logdir = os.path.join(os.path.dirname(variants), "logs") + # Compress the known variants vcf file using gzip, as this seems needed for GATK. 
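# As noted above, GATK seems to want these as a compressed .vcf.gz with a tabix index next to it;
# the two rules below produce exactly that pair, e.g. (hypothetical file name)
# known.vcf -> known.vcf.gz plus known.vcf.gz.tbi.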
rule variants_vcf_compress: input: - variants + variants, output: - variants + ".gz" + variants + ".gz", group: "known_variants" log: - os.path.join( variant_logdir, os.path.basename(variants) + ".vcf_compress.log" ) + os.path.join(variant_logdir, os.path.basename(variants) + ".vcf_compress.log"), wrapper: "0.27.1/bio/vcf/compress" + # Write an index file for the known variants file. rule variants_vcf_index: input: - variants + ".gz" + variants + ".gz", output: - variants + ".gz.tbi" + variants + ".gz.tbi", params: # pass arguments to tabix (e.g. index a vcf) - "-p vcf" + "-p vcf", group: "known_variants" log: - os.path.join( variant_logdir, os.path.basename(variants) + ".vcf_index.log" ) + os.path.join(variant_logdir, os.path.basename(variants) + ".vcf_index.log"), conda: "../envs/tabix.yaml" wrapper: "0.55.1/bio/tabix" + # ================================================================================================= # All prep rule # ================================================================================================= + # This alternative target rule executes all prep rules, so that we can get all indices of the # reference etc. This is useful in settings where we have an unprepared reference, but want # to run the pipeline multiple times already. If we did not prep the genome first, each pipeline @@ -259,19 +273,20 @@ rule variants_vcf_index: rule all_prep: input: ref=genome, - ref_idcs=expand( - genome + ".{ext}", - ext=[ "amb", "ann", "bwt", "pac", "sa", "fai" ] + ref_idcs=expand(genome + ".{ext}", ext=["amb", "ann", "bwt", "pac", "sa", "fai"]), + ref_idcs2=( + expand(genome + ".{ext}", ext=["0123", "bwt.2bit.64"]) + if config["settings"]["mapping-tool"] == "bwamem2" + else [] ), - ref_idcs2=expand( - genome + ".{ext}", - ext=[ "0123", "bwt.2bit.64" ] - ) if config["settings"]["mapping-tool"] == "bwamem2" else [], ref_dict=genome_dict(), ref_stat=genome + ".seqkit", - known_vars=variants + ".gz.tbi" if has_known_variants else [] + known_vars=variants + ".gz.tbi" if has_known_variants else [], + + +localrules: + all_prep, -localrules: all_prep # Clean up the variables that we used above del genome diff --git a/workflow/rules/qc-bam.smk b/workflow/rules/qc-bam.smk index d8a04bd..36dcff8 100644 --- a/workflow/rules/qc-bam.smk +++ b/workflow/rules/qc-bam.smk @@ -1,11 +1,12 @@ import platform + # The bam QC rules can either be run on the samples right after merging their units, # or after the further downstream processing steps (filtering, clipping, dedup, recal etc) # have been run. Here, we offer a simple function to switch between these inputs, # in order to avoid duplicating thise code below. 
# It expects the tool and key from the config.yaml -def bam_qc_input( tool, key ): +def bam_qc_input(tool, key): if key not in config["params"][tool]: raise Exception("config key " + key + " not found for tool " + tool) if config["params"][tool][key] == "processed": @@ -17,17 +18,19 @@ def bam_qc_input( tool, key ): "Unknown setting for " + tool + " " + key + ": " + config["params"][tool][key] ) + # ================================================================================================= # samtools stats and flagstat # ================================================================================================= + rule samtools_stats: input: - bam_qc_input("samtools", "stats-bams") + bam_qc_input("samtools", "stats-bams"), output: - "qc/samtools-stats/{sample}.txt" + "qc/samtools-stats/{sample}.txt", log: - "logs/samtools-stats/{sample}.log" + "logs/samtools-stats/{sample}.log", benchmark: "benchmarks/samtools-stats/{sample}.bench.log" group: @@ -37,24 +40,25 @@ rule samtools_stats: wrapper: "0.27.1/bio/samtools/stats" + rule samtools_stats_collect: input: - expand( - "qc/samtools-stats/{sample}.txt", - sample=config["global"]["sample-names"] - ) + expand("qc/samtools-stats/{sample}.txt", sample=config["global"]["sample-names"]), output: - touch("qc/samtools-stats/samtools-stats.done") + touch("qc/samtools-stats/samtools-stats.done"), + + +localrules: + samtools_stats_collect, -localrules: samtools_stats_collect rule samtools_flagstat: input: - bam_qc_input("samtools", "flagstat-bams") + bam_qc_input("samtools", "flagstat-bams"), output: - "qc/samtools-flagstat/{sample}.txt" + "qc/samtools-flagstat/{sample}.txt", log: - "logs/samtools-flagstat/{sample}.log" + "logs/samtools-flagstat/{sample}.log", benchmark: "benchmarks/samtools-flagstat/{sample}.bench.log" group: @@ -64,25 +68,27 @@ rule samtools_flagstat: wrapper: "0.64.0/bio/samtools/flagstat" + rule samtools_flagstat_collect: input: - expand( - "qc/samtools-flagstat/{sample}.txt", - sample=config["global"]["sample-names"] - ) + expand("qc/samtools-flagstat/{sample}.txt", sample=config["global"]["sample-names"]), output: - touch("qc/samtools-flagstat/samtools-flagstat.done") + touch("qc/samtools-flagstat/samtools-flagstat.done"), + + +localrules: + samtools_flagstat_collect, -localrules: samtools_flagstat_collect # ================================================================================================= # qualimap # ================================================================================================= + # Rule for running qualimap for each bam file of a sample, with its units merged. rule qualimap_sample: input: - bam_qc_input("qualimap", "bams") + bam_qc_input("qualimap", "bams"), output: # The output of this rule (for now) is a directory, so we have no way of telling whether # the rule succeeded by that. We hence add the `done` file here as well, which (according @@ -90,8 +96,7 @@ rule qualimap_sample: # We had instances in the past where cluster jobs of this rule failed, and left a # half-finished directory behind; let's hope that this avoids that. 
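        # Note that a touch()-ed marker is only created once the rule body has finished
        # successfully, so an interrupted cluster job leaves the marker missing and Snakemake
        # re-runs the rule, even if a partially written output directory is still present.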
outdir=directory("qc/qualimap/{sample}"), - done=touch("qc/qualimap/{sample}.done") - + done=touch("qc/qualimap/{sample}.done"), # Alternatively, specify all individual files, which gives more control, # but also more spammed output # "qc/qualimap/{sample}/genome_results.txt", @@ -101,11 +106,10 @@ rule qualimap_sample: # "qc/qualimap/{sample}/raw_data_qualimapReport/mapped_reads_gc-content_distribution.txt", params: extra=config["params"]["qualimap"]["extra"], - outdir="qc/qualimap/{sample}" - threads: - config["params"]["qualimap"]["threads"] + outdir="qc/qualimap/{sample}", + threads: config["params"]["qualimap"]["threads"] log: - "logs/qualimap/{sample}_qualimap.log" + "logs/qualimap/{sample}_qualimap.log", group: "qualimap" conda: @@ -115,13 +119,10 @@ rule qualimap_sample: "-outdir {params.outdir} -outformat HTML " "{params.extra} > {log} 2>&1" + rule qualimap_collect: input: - expand( - "qc/qualimap/{sample}.done", - sample=config["global"]["sample-names"] - ) - + expand("qc/qualimap/{sample}.done", sample=config["global"]["sample-names"]), # Old, explicitly request each output file. Also still uses units... # expand( # "qc/qualimap/{u.sample}-{u.unit}/genome_results.txt", @@ -144,14 +145,18 @@ rule qualimap_collect: # u=config["global"]["samples"].itertuples() # ), output: - touch("qc/qualimap/qualimap.done") + touch("qc/qualimap/qualimap.done"), + + +localrules: + qualimap_collect, -localrules: qualimap_collect # ================================================================================================= # Picard CollectMultipleMetrics # ================================================================================================= + # The snakemake wrapper for picard/collectmultiplemetrics uses output file extensions # to select the different tools for the metrics. # Usable extensions (and which tools they implicitly call) are listed here: @@ -182,46 +187,52 @@ def picard_collectmultiplemetrics_exts(): # res.append(".rna_metrics") return res + rule picard_collectmultiplemetrics: input: bam=bam_qc_input("picard", "CollectMultipleMetrics-bams"), - ref=config["data"]["reference-genome"] + ref=config["data"]["reference-genome"], output: - expand( "qc/picard/{{sample}}{ext}", ext=picard_collectmultiplemetrics_exts()) + expand("qc/picard/{{sample}}{ext}", ext=picard_collectmultiplemetrics_exts()), log: - "logs/picard/multiple_metrics/{sample}.log" + "logs/picard/multiple_metrics/{sample}.log", params: - config["params"]["picard"]["CollectMultipleMetrics-java-opts"] + " " + - config["params"]["picard"]["CollectMultipleMetrics-extra"] + ( - " USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" - if platform.system() == "Darwin" - else "" - ) + config["params"]["picard"]["CollectMultipleMetrics-java-opts"] + + " " + + config["params"]["picard"]["CollectMultipleMetrics-extra"] + + (" USE_JDK_DEFLATER=true USE_JDK_INFLATER=true" if platform.system() == "Darwin" else ""), conda: "../envs/picard.yaml" script: # We use our own version of the wrapper here, which fixes issues with missing files in cases # where Picard does not have enough data for a specific metric to run. 
"../scripts/picard-collectmultiplemetrics.py" - # wrapper: - # "0.72.0/bio/picard/collectmultiplemetrics" + + +# wrapper: +# "0.72.0/bio/picard/collectmultiplemetrics" + rule picard_collectmultiplemetrics_collect: input: expand( "qc/picard/{sample}{ext}", sample=config["global"]["sample-names"], - ext=picard_collectmultiplemetrics_exts() - ) + ext=picard_collectmultiplemetrics_exts(), + ), output: - touch("qc/picard/collectmultiplemetrics.done") + touch("qc/picard/collectmultiplemetrics.done"), + + +localrules: + picard_collectmultiplemetrics_collect, -localrules: picard_collectmultiplemetrics_collect # ================================================================================================= # Deduplication # ================================================================================================= + # Different dedup tools produce different summary files. See above for details. # This function simply returns these file names as strings, without replacing the wildcards. # Then, when the function is called below, these are expanded. @@ -234,6 +245,7 @@ def get_dedup_report(): else: raise Exception("Unknown duplicates-tool: " + config["settings"]["duplicates-tool"]) + def get_dedup_done(): # Switch to the chosen duplicate marker tool if config["settings"]["duplicates-tool"] == "picard": @@ -243,13 +255,13 @@ def get_dedup_done(): else: raise Exception("Unknown duplicates-tool: " + config["settings"]["duplicates-tool"]) + rule dedup_reports_collect: input: - expand( - get_dedup_report(), - sample=config["global"]["sample-names"] - ) + expand(get_dedup_report(), sample=config["global"]["sample-names"]), output: - touch( get_dedup_done() ) + touch(get_dedup_done()), + -localrules: dedup_reports_collect +localrules: + dedup_reports_collect, diff --git a/workflow/rules/qc-fastq.smk b/workflow/rules/qc-fastq.smk index 310f47a..f283490 100644 --- a/workflow/rules/qc-fastq.smk +++ b/workflow/rules/qc-fastq.smk @@ -12,17 +12,17 @@ # function resolves them into the input file paths. assert "fastqc" not in config["global"] -config["global"]["fastqc"] = pd.DataFrame( columns = [ "sample", "unit", "id", "file" ]) -def add_fastqc_file( sample, unit, id, file ): - config["global"]["fastqc"] = pd.concat([ - config["global"]["fastqc"], - pd.DataFrame({ - "sample": [sample], - "unit": [unit], - "id": [id], - "file": [file] - }) - ], ignore_index=True) +config["global"]["fastqc"] = pd.DataFrame(columns=["sample", "unit", "id", "file"]) + + +def add_fastqc_file(sample, unit, id, file): + config["global"]["fastqc"] = pd.concat( + [ + config["global"]["fastqc"], + pd.DataFrame({"sample": [sample], "unit": [unit], "id": [id], "file": [file]}), + ], + ignore_index=True, + ) # Deprecated way of using append instead of concat. We use concat now to avoid warnings # with newer pandas versions, but keep the below for reference. @@ -33,12 +33,13 @@ def add_fastqc_file( sample, unit, id, file ): # "file": file # }, ignore_index=True) + if config["params"]["fastqc"]["input"] == "samples": # Simple case: raw fastq files from the samples. for smp in config["global"]["samples"].itertuples(): - add_fastqc_file( smp.sample, smp.unit, "R1", smp.fq1 ) + add_fastqc_file(smp.sample, smp.unit, "R1", smp.fq1) if isinstance(smp.fq2, str): - add_fastqc_file( smp.sample, smp.unit, "R2", smp.fq2 ) + add_fastqc_file(smp.sample, smp.unit, "R2", smp.fq2) elif config["params"]["fastqc"]["input"] == "trimmed": # Trimmed files, which can come in more varieties. 
for smp in config["global"]["samples"].itertuples(): @@ -51,34 +52,40 @@ elif config["params"]["fastqc"]["input"] == "trimmed": # Now let's see if we have merged them or not, and add to our result accordingly. if config["settings"]["merge-paired-end-reads"]: assert len(trimmed) == 1 - add_fastqc_file( smp.sample, smp.unit, "trimmed-merged", trimmed[0] ) + add_fastqc_file(smp.sample, smp.unit, "trimmed-merged", trimmed[0]) else: # If not merged, it's either single end or paired end. assert len(trimmed) == 1 or len(trimmed) == 2 - add_fastqc_file( smp.sample, smp.unit, "trimmed-R1", trimmed[0] ) + add_fastqc_file(smp.sample, smp.unit, "trimmed-R1", trimmed[0]) if len(trimmed) == 2: - add_fastqc_file( smp.sample, smp.unit, "trimmed-R2", trimmed[1] ) + add_fastqc_file(smp.sample, smp.unit, "trimmed-R2", trimmed[1]) else: raise Exception("Unknown fastqc input setting: " + config["params"]["fastqc"]["input"]) + def get_fastqc_input(wildcards): - return config["global"]["fastqc"].loc[ - (config["global"]["fastqc"]['sample'] == wildcards.sample) & - (config["global"]["fastqc"]['unit'] == wildcards.unit ) & - (config["global"]["fastqc"]['id'] == wildcards.id ), - ["file"] - ].file + return ( + config["global"]["fastqc"] + .loc[ + (config["global"]["fastqc"]["sample"] == wildcards.sample) + & (config["global"]["fastqc"]["unit"] == wildcards.unit) + & (config["global"]["fastqc"]["id"] == wildcards.id), + ["file"], + ] + .file + ) + rule fastqc: input: - get_fastqc_input + get_fastqc_input, output: html="qc/fastqc/{sample}-{unit}-{id}_fastqc.html", - zip="qc/fastqc/{sample}-{unit}-{id}_fastqc.zip" + zip="qc/fastqc/{sample}-{unit}-{id}_fastqc.zip", params: - config["params"]["fastqc"]["extra"] + config["params"]["fastqc"]["extra"], log: - "logs/fastqc/{sample}-{unit}-{id}.log" + "logs/fastqc/{sample}-{unit}-{id}.log", benchmark: "benchmarks/fastqc/{sample}-{unit}-{id}.bench.log" group: @@ -88,43 +95,50 @@ rule fastqc: script: # We use our own version of the wrapper here, as that wrapper is just badly implemented. "../scripts/fastqc.py" - # wrapper: - # "0.27.1/bio/fastqc" + + +# wrapper: +# "0.27.1/bio/fastqc" + rule fastqc_collect: input: expand( "qc/fastqc/{u.sample}-{u.unit}-{u.id}_fastqc.html", - u=config["global"]["fastqc"].itertuples() + u=config["global"]["fastqc"].itertuples(), ), expand( "qc/fastqc/{u.sample}-{u.unit}-{u.id}_fastqc.zip", - u=config["global"]["fastqc"].itertuples() - ) + u=config["global"]["fastqc"].itertuples(), + ), output: - touch("qc/fastqc/fastqc.done") + touch("qc/fastqc/fastqc.done"), + + +localrules: + fastqc_collect, -localrules: fastqc_collect # ================================================================================================= # Trimming # ================================================================================================= + # Different trimming tools produce different summary files. We here expand ourselves, # because we need to retrieve the correct file type for each of them (single or paired end), # which cannot easily be done with the simple snakemake expand function. def get_trimming_reports(): - result=[] + result = [] for smp in config["global"]["samples"].itertuples(): # The get_trimming_report() function is part of each trimming tool rule file. # We here hence call the respective correct function for each tool. 
- result.append( get_trimming_report( smp.sample, smp.unit )) + result.append(get_trimming_report(smp.sample, smp.unit)) # Now append the file for the sample to the result list # if config["settings"]["trimming-tool"] == "adapterremoval": - # result.append( "trimmed/" + smp.sample + "-" + smp.unit + "-" + suffix + ".settings" ) + # result.append( "trimmed/" + smp.sample + "-" + smp.unit + "-" + suffix + ".settings" ) # elif config["settings"]["trimming-tool"] == "cutadapt": - # result.append( "trimmed/" + smp.sample + "-" + smp.unit + ".qc-" + suffix + ".txt" ) + # result.append( "trimmed/" + smp.sample + "-" + smp.unit + ".qc-" + suffix + ".txt" ) # elif config["settings"]["trimming-tool"] == "fastp": # result.append( "trimmed/" + smp.sample + "-" + smp.unit + "-" + suffix + "-fastp.json" ) # elif config["settings"]["trimming-tool"] == "skewer": @@ -135,10 +149,13 @@ def get_trimming_reports(): # raise Exception("Unknown trimming-tool: " + config["settings"]["trimming-tool"]) return result + rule trimming_reports_collect: input: - get_trimming_reports() + get_trimming_reports(), output: - touch("trimmed/trimming-reports.done") + touch("trimmed/trimming-reports.done"), + -localrules: trimming_reports_collect +localrules: + trimming_reports_collect, diff --git a/workflow/rules/qc-vcf.smk b/workflow/rules/qc-vcf.smk index 259b771..a9791de 100644 --- a/workflow/rules/qc-vcf.smk +++ b/workflow/rules/qc-vcf.smk @@ -2,20 +2,21 @@ # bcftools stats # ================================================================================================= + rule bcftools_stats: input: + # we use the filtered file if a filtering is done, or the unfiltered if not. calls=( - # we use the filtered file if a filtering is done, or the unfiltered if not. "filtered/all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else "genotyped/all.vcf.gz" - ) + ), output: - "qc/bcftools-stats/stats.vchk" + "qc/bcftools-stats/stats.vchk", log: - "logs/bcftools-stats/bcftools.stats.log" + "logs/bcftools-stats/bcftools.stats.log", params: - config["params"]["bcftools"]["stats"] + config["params"]["bcftools"]["stats"], conda: "../envs/bcftools.yaml" group: @@ -23,17 +24,18 @@ rule bcftools_stats: wrapper: "v1.7.0/bio/bcftools/stats" + rule bcftools_stats_plot: input: - "qc/bcftools-stats/stats.vchk" + "qc/bcftools-stats/stats.vchk", output: # We only request the PDF here, but other files are generated as well. - "qc/bcftools-stats/summary.pdf" + "qc/bcftools-stats/summary.pdf", log: - "logs/bcftools-stats/bcftools.stats.plot.log" + "logs/bcftools-stats/bcftools.stats.plot.log", params: outdir="qc/bcftools-stats", - extra=config["params"]["bcftools"]["stats-plot"] + extra=config["params"]["bcftools"]["stats-plot"], conda: "../envs/bcftools.yaml" group: @@ -50,9 +52,9 @@ rule bcftools_stats_plot: # So instead we specify the path to the file directly. So weird. 
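        # The shell logic below is deliberately forgiving: plot-vcfstats may fail (|| true),
        # and only if no summary.pdf was produced afterwards do we compile the generated
        # summary.tex ourselves, using tectonic as a fallback LaTeX engine.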
"( plot-vcfstats --prefix {params.outdir} {params.extra} {input} &> {log} || true ) ; " "if [ -f {output} ]; then " - " echo \"Success with pdflatex\" >> {log} ; " + ' echo "Success with pdflatex" >> {log} ; ' "else" - " echo \"Failed with pdflatex\" >> {log} ; " - " echo \"Trying tectonic instead\" >> {log} ; " + ' echo "Failed with pdflatex" >> {log} ; ' + ' echo "Trying tectonic instead" >> {log} ; ' " tectonic {params.outdir}/summary.tex >> {log} 2>&1 ; " "fi" diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index 072f033..043e388 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -3,10 +3,12 @@ include: "qc-fastq.smk" include: "qc-bam.smk" include: "qc-vcf.smk" + # ================================================================================================= # MultiQC # ================================================================================================= + # Unfortunately, in some cluster environments, multiqc does not work due to char encoding issues, see # https://github.com/ewels/MultiQC/issues/484 ... If you run into this issue, try running it locally. # @@ -20,44 +22,41 @@ rule multiqc: # Fastq QC tools "qc/fastqc/fastqc.done", "trimmed/trimming-reports.done", - # Mapping QC tools "qc/samtools-stats/samtools-stats.done", "qc/samtools-flagstat/samtools-flagstat.done", "qc/qualimap/qualimap.done", "qc/picard/collectmultiplemetrics.done", get_dedup_done() if config["settings"]["remove-duplicates"] else [], - # VCF QC tools, if requested # We only need the stats.vchk file of bcftools stats, but request the pdf here as well, # so that the bctools internal plots get generated by the bcftools_stats_plot rule as well. "qc/bcftools-stats/stats.vchk" if config["settings"]["bcftools-stats"] else [], "qc/bcftools-stats/summary.pdf" if config["settings"]["bcftools-stats"] else [], - # Annotations, if requested "annotated/snpeff.csv" if config["settings"]["snpeff"] else [], "annotated/vep_summary.html" if config["settings"]["vep"] else [], - # Damage profiling, if requested "mapdamage/mapdamage.done" if config["settings"]["mapdamage"] else [], - "damageprofiler/damageprofiler.done" if config["settings"]["damageprofiler"] else [] - + "damageprofiler/damageprofiler.done" if config["settings"]["damageprofiler"] else [], output: report("qc/multiqc.html", caption="../report/multiqc.rst", category="Quality control"), "qc/multiqc.zip", params: config["params"]["multiqc"]["extra"], log: - "logs/multiqc.log" + "logs/multiqc.log", conda: # We use a conda environment on top of the wrapper, as the wrapper always causes # issues with missing python modules and mismatching program versions and stuff... "../envs/multiqc.yaml" wrapper: "v3.13.6/bio/multiqc" - # script: - # # We use our own version of the wrapper here, to troubleshoot dependecy issues... - # "../scripts/multiqc.py" + + +# script: +# # We use our own version of the wrapper here, to troubleshoot dependecy issues... +# "../scripts/multiqc.py" # Rule is not submitted as a job to the cluster. # Edit: It is now, as it turns out to be quite the process for large datasets... @@ -67,6 +66,7 @@ rule multiqc: # All QC, but not SNP calling # ================================================================================================= + # This alternative target rule executes all quality control (QC) steps of read trimming and mapping, # but does not call SNPs, and does not call snpeff. The result is mainly the MultiQC report (without # the snpeff part however), as well as the fastqc reports. 
@@ -74,10 +74,11 @@ rule all_qc: input: # Quality control "qc/multiqc.html", - # Reference genome statistics, provided in the prep steps - config["data"]["reference-genome"] + ".seqkit" + config["data"]["reference-genome"] + ".seqkit", + # The `all_qc` rule is local. It does not do anything anyway, # except requesting the other rules to run. -localrules: all_qc +localrules: + all_qc, diff --git a/workflow/rules/stats.smk b/workflow/rules/stats.smk index 1b5bc7c..648e809 100644 --- a/workflow/rules/stats.smk +++ b/workflow/rules/stats.smk @@ -2,13 +2,14 @@ # Calls Table # ================================================================================================= + rule vcf_to_tsv: input: - "annotated/snpeff.vcf.gz" + "annotated/snpeff.vcf.gz", output: - report("tables/calls.tsv.gz", caption="../report/calls.rst", category="Calls") + report("tables/calls.tsv.gz", caption="../report/calls.rst", category="Calls"), log: - "logs/vcf_to_tsv.log" + "logs/vcf_to_tsv.log", conda: "../envs/rbt.yaml" group: @@ -18,6 +19,7 @@ rule vcf_to_tsv: "rbt vcf-to-txt -g --fmt DP AD --info ANN | " "gzip > {output}" + # TODO this rule fails for freebayes for some weird reason, see # https://github.com/rust-bio/rust-bio-tools/issues/52#issuecomment-626289782 # fix it, once that github issue has an answer @@ -26,14 +28,15 @@ rule vcf_to_tsv: # Depth Stats Plot # ================================================================================================= + rule plot_stats: input: - "tables/calls.tsv.gz" + "tables/calls.tsv.gz", output: depths=report("plots/depths.svg", caption="../report/depths.rst", category="Plots"), - freqs=report("plots/allele-freqs.svg", caption="../report/freqs.rst", category="Plots") + freqs=report("plots/allele-freqs.svg", caption="../report/freqs.rst", category="Plots"), log: - "logs/plot-depths.log" + "logs/plot-depths.log", conda: "../envs/stats.yaml" group: @@ -41,36 +44,40 @@ rule plot_stats: script: "../scripts/plot-depths.py" + # ================================================================================================= # Sequences per Sample # ================================================================================================= + rule seqs_per_sample: output: - "tables/sample-sizes.tsv" + "tables/sample-sizes.tsv", params: - samples = config["data"]["samples-table"] + samples=config["data"]["samples-table"], script: "../scripts/sample-sizes.py" + # ================================================================================================= # Allele Frequency Table # ================================================================================================= + rule frequency_table: input: + # we use the filtered file if a filtering is done, or the unfiltered if not. calls=( - # we use the filtered file if a filtering is done, or the unfiltered if not. 
"filtered/all.vcf.gz" if not config["settings"]["filter-variants"] == "none" else "genotyped/all.vcf.gz" - ) + ), output: - "tables/frequencies.tsv" + "tables/frequencies.tsv", params: - fields=config["settings"]["frequency-table-fields"] + fields=config["settings"]["frequency-table-fields"], log: - "logs/frequency-table.log" + "logs/frequency-table.log", conda: "../envs/frequency-table.yaml" script: diff --git a/workflow/rules/trimming-adapterremoval.smk b/workflow/rules/trimming-adapterremoval.smk index 218f712..3cd87bd 100644 --- a/workflow/rules/trimming-adapterremoval.smk +++ b/workflow/rules/trimming-adapterremoval.smk @@ -2,9 +2,10 @@ # Trimming # ================================================================================================= + rule trim_reads_se: input: - unpack(get_fastq) + unpack(get_fastq), output: r1=( "trimmed/{sample}-{unit}.fastq.gz" @@ -12,15 +13,14 @@ rule trim_reads_se: else temp("trimmed/{sample}-{unit}.fastq.gz") ), settings="trimmed/{sample}-{unit}.se.settings", - done=touch("trimmed/{sample}-{unit}.se.done") + done=touch("trimmed/{sample}-{unit}.se.done"), params: extra="--gzip", params=config["params"]["adapterremoval"]["se"], - basename="trimmed/{sample}-{unit}" - threads: - config["params"]["adapterremoval"]["threads"] + basename="trimmed/{sample}-{unit}", + threads: config["params"]["adapterremoval"]["threads"] log: - "logs/adapterremoval/{sample}-{unit}.log" + "logs/adapterremoval/{sample}-{unit}.log", benchmark: "benchmarks/adapterremoval/{sample}-{unit}.bench.log" conda: @@ -29,9 +29,10 @@ rule trim_reads_se: "AdapterRemoval --file1 {input.r1} {params.extra} {params.params} --threads {threads} " "--basename {params.basename} --output1 {output.r1} --settings {output.settings} > {log} 2>&1" + rule trim_reads_pe: input: - unpack(get_fastq) + unpack(get_fastq), output: r1=( "trimmed/{sample}-{unit}.pair1.fastq.gz" @@ -44,15 +45,14 @@ rule trim_reads_pe: else temp("trimmed/{sample}-{unit}.pair2.fastq.gz") ), settings="trimmed/{sample}-{unit}.pe.settings", - done=touch("trimmed/{sample}-{unit}.pe.done") + done=touch("trimmed/{sample}-{unit}.pe.done"), params: extra="--gzip", params=config["params"]["adapterremoval"]["pe"], - basename="trimmed/{sample}-{unit}" - threads: - config["params"]["adapterremoval"]["threads"] + basename="trimmed/{sample}-{unit}", + threads: config["params"]["adapterremoval"]["threads"] log: - "logs/adapterremoval/{sample}-{unit}.log" + "logs/adapterremoval/{sample}-{unit}.log", benchmark: "benchmarks/adapterremoval/{sample}-{unit}.bench.log" conda: @@ -62,53 +62,53 @@ rule trim_reads_pe: "--threads {threads} --basename {params.basename} --output1 {output.r1} --output2 {output.r2} " "--settings {output.settings} > {log} 2>&1" + rule trim_reads_pe_merged: input: - unpack(get_fastq) + unpack(get_fastq), output: # We mostly use the default output file names of AdapterRemoval here, # except for the settings file, which we have to rename so that its name is distinct from # the settings files of the other two rules above. 
- pair1_truncated = ( + pair1_truncated=( "trimmed/{sample}-{unit}.pair1.truncated.gz" if config["settings"]["keep-intermediate"]["trimming"] else temp("trimmed/{sample}-{unit}.pair1.truncated.gz") ), - pair2_truncated = ( + pair2_truncated=( "trimmed/{sample}-{unit}.pair2.truncated.gz" if config["settings"]["keep-intermediate"]["trimming"] else temp("trimmed/{sample}-{unit}.pair2.truncated.gz") ), - singleton_truncated = ( + singleton_truncated=( "trimmed/{sample}-{unit}.singleton.truncated.gz" if config["settings"]["keep-intermediate"]["trimming"] else temp("trimmed/{sample}-{unit}.singleton.truncated.gz") ), - collapsed = ( + collapsed=( "trimmed/{sample}-{unit}.collapsed.gz" if config["settings"]["keep-intermediate"]["trimming"] else temp("trimmed/{sample}-{unit}.collapsed.gz") ), - collapsed_truncated = ( + collapsed_truncated=( "trimmed/{sample}-{unit}.collapsed.truncated.gz" if config["settings"]["keep-intermediate"]["trimming"] else temp("trimmed/{sample}-{unit}.collapsed.truncated.gz") ), - discarded = ( + discarded=( "trimmed/{sample}-{unit}.discarded.gz" if config["settings"]["keep-intermediate"]["trimming"] else temp("trimmed/{sample}-{unit}.discarded.gz") ), - settings = "trimmed/{sample}-{unit}.pe-merged.settings", - done = touch("trimmed/{sample}-{unit}.pe-merged.done") + settings="trimmed/{sample}-{unit}.pe-merged.settings", + done=touch("trimmed/{sample}-{unit}.pe-merged.done"), params: extra="--gzip --collapse", params=config["params"]["adapterremoval"]["pe"], - basename="trimmed/{sample}-{unit}" - threads: - config["params"]["adapterremoval"]["threads"] + basename="trimmed/{sample}-{unit}", + threads: config["params"]["adapterremoval"]["threads"] log: - "logs/adapterremoval/{sample}-{unit}.log" + "logs/adapterremoval/{sample}-{unit}.log", benchmark: "benchmarks/adapterremoval/{sample}-{unit}.bench.log" conda: @@ -117,29 +117,36 @@ rule trim_reads_pe_merged: "AdapterRemoval --file1 {input.r1} --file2 {input.r2} {params.extra} {params.params} " "--threads {threads} --basename {params.basename} --settings {output.settings} > {log} 2>&1" + # ================================================================================================= # Trimming Results # ================================================================================================= + def get_trimmed_reads(wildcards): """Get trimmed reads of given sample-unit.""" if is_single_end(wildcards.sample, wildcards.unit): # single end sample - return [ "trimmed/{sample}-{unit}.fastq.gz".format( - sample=wildcards.sample, unit=wildcards.unit - )] + return [ + "trimmed/{sample}-{unit}.fastq.gz".format(sample=wildcards.sample, unit=wildcards.unit) + ] elif config["settings"]["merge-paired-end-reads"]: # merged paired-end samples - return [ "trimmed/{sample}-{unit}.collapsed.gz".format( - sample=wildcards.sample, unit=wildcards.unit - )] + return [ + "trimmed/{sample}-{unit}.collapsed.gz".format( + sample=wildcards.sample, unit=wildcards.unit + ) + ] else: # paired-end sample return expand( "trimmed/{sample}-{unit}.pair{pair}.fastq.gz", - pair=[1, 2], sample=wildcards.sample, unit=wildcards.unit + pair=[1, 2], + sample=wildcards.sample, + unit=wildcards.unit, ) + def get_trimming_report(sample, unit): """Get the report needed for MultiQC.""" if is_single_end(sample, unit): diff --git a/workflow/rules/trimming-cutadapt.smk b/workflow/rules/trimming-cutadapt.smk index 383808b..733a1f0 100644 --- a/workflow/rules/trimming-cutadapt.smk +++ b/workflow/rules/trimming-cutadapt.smk @@ -2,9 +2,10 @@ # Trimming # 
================================================================================================= + rule trim_reads_se: input: - unpack(get_fastq) + unpack(get_fastq), output: fastq=( "trimmed/{sample}-{unit}.fastq.gz" @@ -12,14 +13,13 @@ rule trim_reads_se: else temp("trimmed/{sample}-{unit}.fastq.gz") ), qc="trimmed/{sample}-{unit}.qc-se.txt", - done=touch("trimmed/{sample}-{unit}.se.done") + done=touch("trimmed/{sample}-{unit}.se.done"), params: - adapters = config["params"]["cutadapt"]["se"]["adapters"], - extra = config["params"]["cutadapt"]["se"]["extra"] - threads: - config["params"]["cutadapt"]["threads"] + adapters=config["params"]["cutadapt"]["se"]["adapters"], + extra=config["params"]["cutadapt"]["se"]["extra"], + threads: config["params"]["cutadapt"]["threads"] log: - "logs/cutadapt/{sample}-{unit}.log" + "logs/cutadapt/{sample}-{unit}.log", benchmark: "benchmarks/cutadapt/{sample}-{unit}.bench.log" conda: @@ -28,9 +28,10 @@ rule trim_reads_se: wrapper: "0.74.0/bio/cutadapt/se" + rule trim_reads_pe: input: - unpack(get_fastq) + unpack(get_fastq), output: fastq1=( "trimmed/{sample}-{unit}.1.fastq.gz" @@ -43,14 +44,13 @@ rule trim_reads_pe: else temp("trimmed/{sample}-{unit}.2.fastq.gz") ), qc="trimmed/{sample}-{unit}.qc-pe.txt", - done=touch("trimmed/{sample}-{unit}.pe.done") + done=touch("trimmed/{sample}-{unit}.pe.done"), params: - adapters = config["params"]["cutadapt"]["pe"]["adapters"], - extra = config["params"]["cutadapt"]["pe"]["extra"] - threads: - config["params"]["cutadapt"]["threads"] + adapters=config["params"]["cutadapt"]["pe"]["adapters"], + extra=config["params"]["cutadapt"]["pe"]["extra"], + threads: config["params"]["cutadapt"]["threads"] log: - "logs/cutadapt/{sample}-{unit}.log" + "logs/cutadapt/{sample}-{unit}.log", benchmark: "benchmarks/cutadapt/{sample}-{unit}.bench.log" conda: @@ -59,17 +59,19 @@ rule trim_reads_pe: wrapper: "0.74.0/bio/cutadapt/pe" + # ================================================================================================= # Trimming Results # ================================================================================================= + def get_trimmed_reads(wildcards): """Get trimmed reads of given sample-unit.""" if is_single_end(wildcards.sample, wildcards.unit): # single end sample - return [ "trimmed/{sample}-{unit}.fastq.gz".format( - sample=wildcards.sample, unit=wildcards.unit - )] + return [ + "trimmed/{sample}-{unit}.fastq.gz".format(sample=wildcards.sample, unit=wildcards.unit) + ] elif config["settings"]["merge-paired-end-reads"]: # merged paired-end samples raise Exception( @@ -77,10 +79,14 @@ def get_trimmed_reads(wildcards): ) else: # paired-end sample - return expand("trimmed/{sample}-{unit}.{pair}.fastq.gz", pair=[1, 2], - sample=wildcards.sample, unit=wildcards.unit + return expand( + "trimmed/{sample}-{unit}.{pair}.fastq.gz", + pair=[1, 2], + sample=wildcards.sample, + unit=wildcards.unit, ) + def get_trimming_report(sample, unit): """Get the report needed for MultiQC.""" if is_single_end(sample, unit): diff --git a/workflow/rules/trimming-fastp.smk b/workflow/rules/trimming-fastp.smk index 0537d14..43589fd 100644 --- a/workflow/rules/trimming-fastp.smk +++ b/workflow/rules/trimming-fastp.smk @@ -2,15 +2,17 @@ # Trimming # ================================================================================================= + # The fastp wrapper is different from the other trimming wrappers in the the snakemake wrapper # respository, because consistency is just not their strength... 
So, we have to provide an extra # function to unpack the fastq file names into a list for us. def unpack_fastp_files(wildcards): return list(get_fastq(wildcards).values()) + rule trim_reads_se: input: - sample=unpack_fastp_files + sample=unpack_fastp_files, output: trimmed=( "trimmed/{sample}-{unit}.fastq.gz" @@ -19,21 +21,21 @@ rule trim_reads_se: ), html="trimmed/{sample}-{unit}-se-fastp.html", json="trimmed/{sample}-{unit}-se-fastp.json", - done=touch("trimmed/{sample}-{unit}-se.done") + done=touch("trimmed/{sample}-{unit}-se.done"), log: - "logs/fastp/{sample}-{unit}.log" + "logs/fastp/{sample}-{unit}.log", benchmark: "benchmarks/fastp/{sample}-{unit}.bench.log" params: - extra=config["params"]["fastp"]["se"] - threads: - config["params"]["fastp"]["threads"] + extra=config["params"]["fastp"]["se"], + threads: config["params"]["fastp"]["threads"] wrapper: "0.64.0/bio/fastp" + rule trim_reads_pe: input: - sample=unpack_fastp_files + sample=unpack_fastp_files, output: trimmed=( ["trimmed/{sample}-{unit}.1.fastq.gz", "trimmed/{sample}-{unit}.2.fastq.gz"] @@ -42,21 +44,21 @@ rule trim_reads_pe: ), html="trimmed/{sample}-{unit}-pe-fastp.html", json="trimmed/{sample}-{unit}-pe-fastp.json", - done=touch("trimmed/{sample}-{unit}-pe.done") + done=touch("trimmed/{sample}-{unit}-pe.done"), log: - "logs/fastp/{sample}-{unit}.log" + "logs/fastp/{sample}-{unit}.log", benchmark: "benchmarks/fastp/{sample}-{unit}.bench.log" params: - extra=config["params"]["fastp"]["pe"] - threads: - config["params"]["fastp"]["threads"] + extra=config["params"]["fastp"]["pe"], + threads: config["params"]["fastp"]["threads"] wrapper: "0.64.0/bio/fastp" + rule trim_reads_pe_merged: input: - sample=unpack_fastp_files + sample=unpack_fastp_files, output: # Need to leave "trimmed" empty here, so that the wrapper works properly with merged, # so we use "merged" instead, and use it as an extra param. 
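        # The merged reads themselves are written to trimmed/{sample}-{unit}-merged.fastq.gz
        # via the --merged_out flag passed through params.extra below, which is exactly the
        # file that get_trimmed_reads() returns when merge-paired-end-reads is enabled.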
@@ -67,45 +69,52 @@ rule trim_reads_pe_merged: ), html="trimmed/{sample}-{unit}-pe-merged-fastp.html", json="trimmed/{sample}-{unit}-pe-merged-fastp.json", - done=touch("trimmed/{sample}-{unit}-pe-merged.done") + done=touch("trimmed/{sample}-{unit}-pe-merged.done"), log: - "logs/fastp/{sample}-{unit}.log" + "logs/fastp/{sample}-{unit}.log", benchmark: "benchmarks/fastp/{sample}-{unit}.bench.log" params: - extra=config["params"]["fastp"]["pe"] + - " --merge --merged_out trimmed/{sample}-{unit}-merged.fastq.gz" + - " --out1 trimmed/{sample}-{unit}-unmerged.pass-1.fastq.gz" + - " --out2 trimmed/{sample}-{unit}-unmerged.pass-2.fastq.gz" + - " --unpaired1 trimmed/{sample}-{unit}-unmerged.unpaired-1.fastq.gz" + - " --unpaired2 trimmed/{sample}-{unit}-unmerged.unpaired-2.fastq.gz" - threads: - config["params"]["fastp"]["threads"] + extra=config["params"]["fastp"]["pe"] + + " --merge --merged_out trimmed/{sample}-{unit}-merged.fastq.gz" + + " --out1 trimmed/{sample}-{unit}-unmerged.pass-1.fastq.gz" + + " --out2 trimmed/{sample}-{unit}-unmerged.pass-2.fastq.gz" + + " --unpaired1 trimmed/{sample}-{unit}-unmerged.unpaired-1.fastq.gz" + + " --unpaired2 trimmed/{sample}-{unit}-unmerged.unpaired-2.fastq.gz", + threads: config["params"]["fastp"]["threads"] wrapper: - "0.64.0/bio/fastp" # this runs fastp 0.20.0 + "0.64.0/bio/fastp" # this runs fastp 0.20.0 + # ================================================================================================= # Trimming Results # ================================================================================================= + def get_trimmed_reads(wildcards): """Get trimmed reads of given sample-unit.""" if is_single_end(wildcards.sample, wildcards.unit): # single end sample - return [ "trimmed/{sample}-{unit}.fastq.gz".format( - sample=wildcards.sample, unit=wildcards.unit - )] + return [ + "trimmed/{sample}-{unit}.fastq.gz".format(sample=wildcards.sample, unit=wildcards.unit) + ] elif config["settings"]["merge-paired-end-reads"]: # merged paired-end samples - return [ "trimmed/{sample}-{unit}-merged.fastq.gz".format( - sample=wildcards.sample, unit=wildcards.unit - )] + return [ + "trimmed/{sample}-{unit}-merged.fastq.gz".format( + sample=wildcards.sample, unit=wildcards.unit + ) + ] else: # paired-end sample - return expand("trimmed/{sample}-{unit}.{pair}.fastq.gz", - pair=[1, 2], sample=wildcards.sample, unit=wildcards.unit + return expand( + "trimmed/{sample}-{unit}.{pair}.fastq.gz", + pair=[1, 2], + sample=wildcards.sample, + unit=wildcards.unit, ) + def get_trimming_report(sample, unit): """Get the report needed for MultiQC.""" if is_single_end(sample, unit): diff --git a/workflow/rules/trimming-seqprep.smk b/workflow/rules/trimming-seqprep.smk index 2f03d8f..65441c8 100644 --- a/workflow/rules/trimming-seqprep.smk +++ b/workflow/rules/trimming-seqprep.smk @@ -9,9 +9,10 @@ if platform.system() == "Darwin": "Trimming tool seqprep is not available for MacOS. Please use a different trimming tool." 
) + rule trim_reads_pe: input: - unpack(get_fastq) + unpack(get_fastq), output: merged=( "trimmed/{sample}-{unit}-merged.fastq.gz" @@ -38,11 +39,11 @@ rule trim_reads_pe: if config["settings"]["keep-intermediate"]["trimming"] else temp("trimmed/{sample}-{unit}.2.discarded.fastq.gz") ), - done=touch("trimmed/{sample}-{unit}.done") + done=touch("trimmed/{sample}-{unit}.done"), params: - extra = config["params"]["seqprep"]["extra"] + extra=config["params"]["seqprep"]["extra"], log: - "logs/seqprep/{sample}-{unit}.log" + "logs/seqprep/{sample}-{unit}.log", benchmark: "benchmarks/seqprep/{sample}-{unit}.bench.log" conda: @@ -55,36 +56,40 @@ rule trim_reads_pe: "-s {output.merged} " "{params.extra} > {log} 2>&1" + # ================================================================================================= # Trimming Results # ================================================================================================= + def get_trimmed_reads(wildcards): """Get trimmed reads of given sample-unit.""" if is_single_end(wildcards.sample, wildcards.unit): # single end sample - raise Exception( - "Trimming tool 'seqprep' cannot be used for single-end reads" - ) + raise Exception("Trimming tool 'seqprep' cannot be used for single-end reads") elif config["settings"]["merge-paired-end-reads"]: # merged paired-end samples - return [ "trimmed/{sample}-{unit}-merged.fastq.gz".format( - sample=wildcards.sample, unit=wildcards.unit - )] + return [ + "trimmed/{sample}-{unit}-merged.fastq.gz".format( + sample=wildcards.sample, unit=wildcards.unit + ) + ] else: # paired-end sample - return expand("trimmed/{sample}-{unit}.{pair}.fastq.gz", - pair=[1, 2], sample=wildcards.sample, unit=wildcards.unit + return expand( + "trimmed/{sample}-{unit}.{pair}.fastq.gz", + pair=[1, 2], + sample=wildcards.sample, + unit=wildcards.unit, ) + # MultiQC does not support SeqPrep. Empty output. 
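# Returning an empty list here means that no report file is handed to the
# trimming_reports_collect rule, so a seqprep run simply has no trimming section in MultiQC.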
def get_trimming_report(sample, unit): """Get the report needed for MultiQC.""" if is_single_end(sample, unit): # single end sample - raise Exception( - "Trimming tool 'seqprep' cannot be used for single-end reads" - ) + raise Exception("Trimming tool 'seqprep' cannot be used for single-end reads") elif config["settings"]["merge-paired-end-reads"]: # merged paired-end samples return [] diff --git a/workflow/rules/trimming-skewer.smk b/workflow/rules/trimming-skewer.smk index 2ec4f1d..ab03268 100644 --- a/workflow/rules/trimming-skewer.smk +++ b/workflow/rules/trimming-skewer.smk @@ -2,9 +2,10 @@ # Trimming # ================================================================================================= + rule trim_reads_se: input: - unpack(get_fastq) + unpack(get_fastq), output: # Output of the trimmed files, as well as the log file for multiqc ( @@ -13,15 +14,14 @@ rule trim_reads_se: else temp("trimmed/{sample}-{unit}-se-trimmed.fastq.gz") ), "trimmed/{sample}-{unit}-se-trimmed.log", - touch("trimmed/{sample}-{unit}-se.done") + touch("trimmed/{sample}-{unit}-se.done"), params: extra="--format sanger --compress", params=config["params"]["skewer"]["se"], - outpref="trimmed/{sample}-{unit}-se" - threads: - config["params"]["skewer"]["threads"] + outpref="trimmed/{sample}-{unit}-se", + threads: config["params"]["skewer"]["threads"] log: - "logs/skewer/{sample}-{unit}.log" + "logs/skewer/{sample}-{unit}.log", benchmark: "benchmarks/skewer/{sample}-{unit}.bench.log" conda: @@ -30,9 +30,10 @@ rule trim_reads_se: "skewer {params.extra} {params.params} --threads {threads} --output {params.outpref} " "{input.r1} > {log} 2>&1" + rule trim_reads_pe: input: - unpack(get_fastq) + unpack(get_fastq), output: # Output of the trimmed files, as well as the log file for multiqc r1=( @@ -46,15 +47,14 @@ rule trim_reads_pe: else temp("trimmed/{sample}-{unit}-pe-trimmed-pair2.fastq.gz") ), log="trimmed/{sample}-{unit}-pe-trimmed.log", - done=touch("trimmed/{sample}-{unit}-pe.done") + done=touch("trimmed/{sample}-{unit}-pe.done"), params: extra="--format sanger --compress", params=config["params"]["skewer"]["pe"], - outpref="trimmed/{sample}-{unit}-pe" - threads: - config["params"]["skewer"]["threads"] + outpref="trimmed/{sample}-{unit}-pe", + threads: config["params"]["skewer"]["threads"] log: - "logs/skewer/{sample}-{unit}.log" + "logs/skewer/{sample}-{unit}.log", benchmark: "benchmarks/skewer/{sample}-{unit}.bench.log" conda: @@ -63,17 +63,21 @@ rule trim_reads_pe: "skewer {params.extra} {params.params} --threads {threads} --output {params.outpref} " "{input.r1} {input.r2} > {log} 2>&1" + # ================================================================================================= # Trimming Results # ================================================================================================= + def get_trimmed_reads(wildcards): """Get trimmed reads of given sample-unit.""" if is_single_end(wildcards.sample, wildcards.unit): # single end sample - return [ "trimmed/{sample}-{unit}-se-trimmed.fastq.gz".format( - sample=wildcards.sample, unit=wildcards.unit - )] + return [ + "trimmed/{sample}-{unit}-se-trimmed.fastq.gz".format( + sample=wildcards.sample, unit=wildcards.unit + ) + ] elif config["settings"]["merge-paired-end-reads"]: # merged paired-end samples raise Exception( @@ -83,9 +87,12 @@ def get_trimmed_reads(wildcards): # paired-end sample return expand( "trimmed/{sample}-{unit}-pe-trimmed-pair{pair}.fastq.gz", - pair=[1, 2], sample=wildcards.sample, unit=wildcards.unit + pair=[1, 2], + 
sample=wildcards.sample, + unit=wildcards.unit, ) + def get_trimming_report(sample, unit): """Get the report needed for MultiQC.""" if is_single_end(sample, unit): diff --git a/workflow/rules/trimming-trimmomatic.smk b/workflow/rules/trimming-trimmomatic.smk index 7d64ac7..d4df64f 100644 --- a/workflow/rules/trimming-trimmomatic.smk +++ b/workflow/rules/trimming-trimmomatic.smk @@ -2,9 +2,10 @@ # Trimming # ================================================================================================= + rule trim_reads_se: input: - unpack(get_fastq) + unpack(get_fastq), output: ( "trimmed/{sample}-{unit}.fastq.gz" @@ -12,15 +13,14 @@ rule trim_reads_se: else temp("trimmed/{sample}-{unit}.fastq.gz") ), # trimlog="trimmed/{sample}-{unit}-se.trimlog.log" - touch("trimmed/{sample}-{unit}-se.done") + touch("trimmed/{sample}-{unit}-se.done"), params: # extra=lambda w, output: "-trimlog {}".format(output.trimlog), - extra = config["params"]["trimmomatic"]["se"]["extra"], - trimmer = config["params"]["trimmomatic"]["se"]["trimmer"] - threads: - config["params"]["trimmomatic"]["threads"] + extra=config["params"]["trimmomatic"]["se"]["extra"], + trimmer=config["params"]["trimmomatic"]["se"]["trimmer"], + threads: config["params"]["trimmomatic"]["threads"] log: - "logs/trimmomatic/{sample}-{unit}.log" + "logs/trimmomatic/{sample}-{unit}.log", benchmark: "benchmarks/trimmomatic/{sample}-{unit}.bench.log" conda: @@ -29,9 +29,10 @@ rule trim_reads_se: wrapper: "v3.13.6/bio/trimmomatic/se" + rule trim_reads_pe: input: - unpack(get_fastq) + unpack(get_fastq), output: r1=( "trimmed/{sample}-{unit}.1.fastq.gz" @@ -54,15 +55,14 @@ rule trim_reads_pe: else temp("trimmed/{sample}-{unit}.2.unpaired.fastq.gz") ), # trimlog="trimmed/{sample}-{unit}-pe.trimlog.log" - done=touch("trimmed/{sample}-{unit}-pe.done") + done=touch("trimmed/{sample}-{unit}-pe.done"), params: # extra=lambda w, output: "-trimlog {}".format(output.trimlog), - extra = config["params"]["trimmomatic"]["se"]["extra"], - trimmer = config["params"]["trimmomatic"]["pe"]["trimmer"] - threads: - config["params"]["trimmomatic"]["threads"] + extra=config["params"]["trimmomatic"]["se"]["extra"], + trimmer=config["params"]["trimmomatic"]["pe"]["trimmer"], + threads: config["params"]["trimmomatic"]["threads"] log: - "logs/trimmomatic/{sample}-{unit}.log" + "logs/trimmomatic/{sample}-{unit}.log", benchmark: "benchmarks/trimmomatic/{sample}-{unit}.bench.log" conda: @@ -71,17 +71,19 @@ rule trim_reads_pe: wrapper: "v3.13.6/bio/trimmomatic/pe" + # ================================================================================================= # Trimming Results # ================================================================================================= + def get_trimmed_reads(wildcards): """Get trimmed reads of given sample-unit.""" if is_single_end(wildcards.sample, wildcards.unit): # single end sample - return [ "trimmed/{sample}-{unit}.fastq.gz".format( - sample=wildcards.sample, unit=wildcards.unit - )] + return [ + "trimmed/{sample}-{unit}.fastq.gz".format(sample=wildcards.sample, unit=wildcards.unit) + ] elif config["settings"]["merge-paired-end-reads"]: # merged paired-end samples raise Exception( @@ -89,23 +91,30 @@ def get_trimmed_reads(wildcards): ) else: # paired-end sample - return expand("trimmed/{sample}-{unit}.{pair}.fastq.gz", - pair=[1, 2], sample=wildcards.sample, unit=wildcards.unit + return expand( + "trimmed/{sample}-{unit}.{pair}.fastq.gz", + pair=[1, 2], + sample=wildcards.sample, + unit=wildcards.unit, ) + # MultiQC expects 
the normal stdout log files from trimmomatic, but as we use a wrapper for the latter, # we cannot also declare the log files as output files, because snakemake... # Instead, we copy them afterwards. This is dirty, but that's how the snake rolls... rule trimmomatic_multiqc_log: input: # Take the trimming result as dummy input, so that this rule here is always executed afterwards - get_trimmed_reads + get_trimmed_reads, output: - "trimmed/{sample}-{unit}.trimlog.log" + "trimmed/{sample}-{unit}.trimlog.log", shell: "cp logs/trimmomatic/{wildcards.sample}-{wildcards.unit}.log {output}" -localrules: trimmomatic_multiqc_log + +localrules: + trimmomatic_multiqc_log, + def get_trimming_report(sample, unit): """Get the report needed for MultiQC.""" diff --git a/workflow/rules/trimming.smk b/workflow/rules/trimming.smk index 97ffe4b..c671420 100644 --- a/workflow/rules/trimming.smk +++ b/workflow/rules/trimming.smk @@ -2,6 +2,7 @@ # Common File Access Functions # ================================================================================================= + # Get the fastq files for a sample, either single or paired end, as a dictionary. def get_fastq(wildcards): """Get fastq files of given sample-unit.""" @@ -12,47 +13,64 @@ def get_fastq(wildcards): else: return {"r1": fastqs.fq1} + # Determine whether a sample is single or paired end. # We use args to be able to call this function from functions that contain more wildcards # than just sample and unit, such as the bwa aln rules. -def is_single_end( sample, unit, **kargs ): +def is_single_end(sample, unit, **kargs): """Return True if sample-unit is single end.""" return pd.isnull(config["global"]["samples"].loc[(sample, unit), "fq2"]) + # ================================================================================================= # Trimming # ================================================================================================= # Switch to the chosen mapper +trimming_tool_good = False if config["settings"]["trimming-tool"] == "adapterremoval": # Use `adapterremoval` include: "trimming-adapterremoval.smk" + trimming_tool_good = True + elif config["settings"]["trimming-tool"] == "cutadapt": # Use `cutadapt` include: "trimming-cutadapt.smk" + trimming_tool_good = True + elif config["settings"]["trimming-tool"] == "fastp": # Use `fastp` include: "trimming-fastp.smk" + trimming_tool_good = True + elif config["settings"]["trimming-tool"] == "seqprep": # Use `seqprep` include: "trimming-seqprep.smk" + trimming_tool_good = True + elif config["settings"]["trimming-tool"] == "skewer": # Use `skewer` include: "trimming-skewer.smk" + trimming_tool_good = True + elif config["settings"]["trimming-tool"] == "trimmomatic": # Use `trimmomatic` include: "trimming-trimmomatic.smk" -else: + trimming_tool_good = True + + +# Another ugly workaround for https://github.com/snakemake/snakefmt/issues/239 +if not trimming_tool_good: raise Exception("Unknown trimming-tool: " + config["settings"]["trimming-tool"]) diff --git a/workflow/scripts/bwa-mem2-mem.py b/workflow/scripts/bwa-mem2-mem.py index 94f2e71..b01f3b6 100644 --- a/workflow/scripts/bwa-mem2-mem.py +++ b/workflow/scripts/bwa-mem2-mem.py @@ -19,7 +19,9 @@ __copyright__ = ( "Copyright 2020, Christopher Schröder, Johannes Köster and Julian de Ruiter, 2022, Lucas Czech" ) -__email__ = "christopher.schroeder@tu-dortmund.de koester@jimmy.harvard.edu, julianderuiter@gmail.com" +__email__ = ( + "christopher.schroeder@tu-dortmund.de koester@jimmy.harvard.edu, julianderuiter@gmail.com" +) __license__ = 
"MIT" from os import path @@ -116,6 +118,6 @@ # When using samtools, we create a temp dir. if sort == "samtools": with tempfile.TemporaryDirectory() as tmp: - shell( shell_cmd ) + shell(shell_cmd) else: - shell( shell_cmd ) + shell(shell_cmd) diff --git a/workflow/scripts/bwa-samxe.py b/workflow/scripts/bwa-samxe.py index 37a2150..275a9c1 100644 --- a/workflow/scripts/bwa-samxe.py +++ b/workflow/scripts/bwa-samxe.py @@ -32,15 +32,9 @@ # Check inputs fastq = ( - snakemake.input.fastq - if isinstance(snakemake.input.fastq, list) - else [snakemake.input.fastq] -) -sai = ( - snakemake.input.sai - if isinstance(snakemake.input.sai, list) - else [snakemake.input.sai] + snakemake.input.fastq if isinstance(snakemake.input.fastq, list) else [snakemake.input.fastq] ) +sai = snakemake.input.sai if isinstance(snakemake.input.sai, list) else [snakemake.input.sai] if len(fastq) == 1 and len(sai) == 1: alg = "samse" elif len(fastq) == 2 and len(sai) == 2: @@ -87,9 +81,7 @@ if sort == "none": # Simply convert to output format using samtools view. - pipe_cmd = ( - "samtools view -h --output-fmt " + out_ext + " -o {snakemake.output[0]} -" - ) + pipe_cmd = "samtools view -h --output-fmt " + out_ext + " -o {snakemake.output[0]} -" elif sort == "samtools": @@ -138,6 +130,6 @@ # When using samtools, we create a temp dir. if sort == "samtools": with tempfile.TemporaryDirectory() as tmp: - shell( shell_cmd ) + shell(shell_cmd) else: - shell( shell_cmd ) + shell(shell_cmd) diff --git a/workflow/scripts/fastqc.py b/workflow/scripts/fastqc.py index 67a69ab..dd650c6 100644 --- a/workflow/scripts/fastqc.py +++ b/workflow/scripts/fastqc.py @@ -20,6 +20,7 @@ log = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True) + def basename_without_ext(file_path): """Returns basename of file path, without the file extension.""" @@ -36,16 +37,17 @@ def basename_without_ext(file_path): # Proper solution that follows the actual fastqc code. 
# Proposed for the snakemake wrapper that this script is based on: # https://github.com/snakemake/snakemake-wrappers/issues/288 - base = re.sub('\\.gz$', '', base) - base = re.sub('\\.bz2$', '', base) - base = re.sub('\\.txt$', '', base) - base = re.sub('\\.fastq$', '', base) - base = re.sub('\\.fq$', '', base) - base = re.sub('\\.sam$', '', base) - base = re.sub('\\.bam$', '', base) + base = re.sub("\\.gz$", "", base) + base = re.sub("\\.bz2$", "", base) + base = re.sub("\\.txt$", "", base) + base = re.sub("\\.fastq$", "", base) + base = re.sub("\\.fq$", "", base) + base = re.sub("\\.sam$", "", base) + base = re.sub("\\.bam$", "", base) return base + # ================================================================================================= # Main Call # ================================================================================================= @@ -59,10 +61,10 @@ def basename_without_ext(file_path): # More verbose output than the wrapper, for debugging shell( # "echo \"{tempdir:q}\" ; " - "echo \"Input: {snakemake.input[0]:q}\" {log} ; " - "echo \"Output zip: {snakemake.output.zip:q}\" {log} ; " - "echo \"Output html: {snakemake.output.html:q}\" {log} ; " - "echo -e \"\n--\n\" {log} ; " + 'echo "Input: {snakemake.input[0]:q}" {log} ; ' + 'echo "Output zip: {snakemake.output.zip:q}" {log} ; ' + 'echo "Output html: {snakemake.output.html:q}" {log} ; ' + 'echo -e "\n--\n" {log} ; ' "fastqc {snakemake.params} -t {snakemake.threads} " " --outdir {tempdir:q} {snakemake.input[0]:q} {log} ;" # "ls {tempdir:q}" diff --git a/workflow/scripts/freebayes.py b/workflow/scripts/freebayes.py index de7df5a..a35d9a4 100644 --- a/workflow/scripts/freebayes.py +++ b/workflow/scripts/freebayes.py @@ -83,7 +83,7 @@ chrom_name = fields[0] chrom_length = int(fields[1]) if chrom_name == contig: - regions = "<(echo \"" + chrom_name + ":0-" + str(chrom_length) + "\")" + regions = '<(echo "' + chrom_name + ":0-" + str(chrom_length) + '")' # If we are here, we must have found the contig in the fai file, # otherwise that name would not have appeared in the "{contig}" wildcard of our snakemake rule - @@ -108,12 +108,12 @@ # If there are no regions yet, we have the case that a small contig group was provided. # In this case, we just parse that file and turn its bed format into the freebayes # regions format. 
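    # That is, each BED line "chrom<TAB>start<TAB>end" is rewritten into the "chrom:start-end"
    # region string that freebayes expects; the two sed calls below perform that rewrite.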
- regions = ( - "<(cat {snakemake.input.regions} | sed 's/\\t/:/' | sed 's/\\t/-/')" - ).format(snakemake=snakemake) + regions = ("<(cat {snakemake.input.regions} | sed 's/\\t/:/' | sed 's/\\t/-/')").format( + snakemake=snakemake + ) if snakemake.threads == 1: - freebayes = "freebayes --region <("+ regions + ")" + freebayes = "freebayes --region <(" + regions + ")" else: # Ideally, we'd be using bamtools coverage and coverage_to_regions.py here, # as suggsted in the freebayes-parallel script, but this runs a long time and had some errors diff --git a/workflow/scripts/frequency-table.py b/workflow/scripts/frequency-table.py index c1b1195..b38fd8f 100644 --- a/workflow/scripts/frequency-table.py +++ b/workflow/scripts/frequency-table.py @@ -3,6 +3,7 @@ # ================================================================================================= import vcfpy + # ================================================================================================= # Process Data # ================================================================================================= @@ -33,11 +34,11 @@ sample_fields = [] for field in fields: for sample in reader.header.samples.names: - sample_fields.append( field + "." + sample ) + sample_fields.append(field + "." + sample) # Print header -header = ['CHROM', 'POS', 'REF', 'ALT'] + sample_fields -tableout.write('\t'.join(header) + '\n') +header = ["CHROM", "POS", "REF", "ALT"] + sample_fields +tableout.write("\t".join(header) + "\n") # Process input vcf line by line for record in reader: @@ -48,7 +49,7 @@ # Prepare fixed part line = [record.CHROM, record.POS, record.REF] line += [alt.value for alt in record.ALT] - tableout.write('\t'.join(map(str, line))) + tableout.write("\t".join(map(str, line))) # Prepare lists for results per sample covs = [] @@ -59,7 +60,7 @@ # Iterate samples and fill lists # line += [call.data.get('AD') or '0,0' for call in record.calls] for call in record.calls: - ad = call.data.get('AD') + ad = call.data.get("AD") if ad and len(ad) == 2: cov = ad[0] + ad[1] covs.append(str(cov)) @@ -79,12 +80,12 @@ # Write the lists per sample, in the order of the fields. if "COV" in fields: - tableout.write('\t' + '\t'.join( covs )) + tableout.write("\t" + "\t".join(covs)) if "FREQ" in fields: - tableout.write('\t' + '\t'.join( freqs )) + tableout.write("\t" + "\t".join(freqs)) if "REF_CNT" in fields: - tableout.write('\t' + '\t'.join( ref_cnts )) + tableout.write("\t" + "\t".join(ref_cnts)) if "ALT_CNT" in fields: - tableout.write('\t' + '\t'.join( alt_cnts )) - tableout.write('\n') + tableout.write("\t" + "\t".join(alt_cnts)) + tableout.write("\n") tableout.close() diff --git a/workflow/scripts/hafpipe-concat.py b/workflow/scripts/hafpipe-concat.py index bfb1555..5a23d4b 100644 --- a/workflow/scripts/hafpipe-concat.py +++ b/workflow/scripts/hafpipe-concat.py @@ -26,17 +26,19 @@ # User output. shell( - "echo \"Started `date`\" {log} ; " - "echo \"Sample: {snakemake.params.sample}\" {log} ; " - "echo \"Chromosomes: {snakemake.params.chroms}\" {log} ; " + 'echo "Started `date`" {log} ; ' + 'echo "Sample: {snakemake.params.sample}" {log} ; ' + 'echo "Chromosomes: {snakemake.params.chroms}" {log} ; ' ) + # Helper function to get the file path for a HAFpipe afSite file. -def get_afsite_path( sample, chrom ): +def get_afsite_path(sample, chrom): if chrom not in snakemake.params.chroms: - raise Exception( "Invalid chrom " + chrom ) + raise Exception("Invalid chrom " + chrom) return "hafpipe/frequencies/" + sample + ".bam." 
+ chrom + ".afSite" + # ------------------------------------------------------------------------- # Concat chromosomes # ------------------------------------------------------------------------- @@ -44,26 +46,20 @@ def get_afsite_path( sample, chrom ): # If we want to compress the table, just add a pipe here. Luckily, appending gzip files to each # other is a valid thing to do with the format, so we can just build the file chrom by chrom. if snakemake.params.get("compress", False): - gzip=" | gzip " + gzip = " | gzip " else: - gzip="" + gzip = "" # Make a header line with the sample name, and write it to the output. sample_file = snakemake.output.table -header="chrom,pos," + snakemake.params.sample + ".af" -shell( - "echo {header} {gzip} > {sample_file} ; " -) +header = "chrom,pos," + snakemake.params.sample + ".af" +shell("echo {header} {gzip} > {sample_file} ; ") # Go through all chromosomes in the specified order, and concat them to the output file, # while prepending the chromosome as a first column to the file, and leaving out its header. for chrom in snakemake.params.chroms: - sn = get_afsite_path( snakemake.params.sample, chrom ) - shell( - "cat {sn} | tail -n +2 | sed 's/^/{chrom},/' {gzip} >> {sample_file} ; " - ) + sn = get_afsite_path(snakemake.params.sample, chrom) + shell("cat {sn} | tail -n +2 | sed 's/^/{chrom},/' {gzip} >> {sample_file} ; ") # User output. -shell( - "echo \"Finished `date`\" {log} ; " -) +shell('echo "Finished `date`" {log} ; ') diff --git a/workflow/scripts/hafpipe-merge.py b/workflow/scripts/hafpipe-merge.py index 6d01b84..72f43d5 100644 --- a/workflow/scripts/hafpipe-merge.py +++ b/workflow/scripts/hafpipe-merge.py @@ -33,7 +33,7 @@ if not snakemake.log: log = "" else: - log = " >> {0} 2>&1".format( os.path.abspath(str(snakemake.log)) ) + log = " >> {0} 2>&1".format(os.path.abspath(str(snakemake.log))) # ------------------------------------------------------------------------- # File access @@ -41,16 +41,18 @@ # We actually change to the directory of the files, to keep it simple, # and to potentially avoid issues with too long paths when doing our hundreds of concats. -os.chdir( snakemake.params.get("base_path", "hafpipe/frequencies") ) +os.chdir(snakemake.params.get("base_path", "hafpipe/frequencies")) + # Helper function to get the file path for a HAFpipe afSite file. -def get_afsite_path( sample, chrom ): +def get_afsite_path(sample, chrom): if sample not in snakemake.params.samples: - raise Exception( "Invalid sample " + sample ) + raise Exception("Invalid sample " + sample) if chrom not in snakemake.params.chroms: - raise Exception( "Invalid chrom " + chrom ) + raise Exception("Invalid chrom " + chrom) return sample + ".merged.bam." + chrom + ".afSite" + # ------------------------------------------------------------------------- # Set up and checks # ------------------------------------------------------------------------- @@ -71,16 +73,16 @@ def get_afsite_path( sample, chrom ): # Some stupid assertions, just in case, so that we can rely on this downstream. # Cannot really happen (I think), as the rules would never be executed, but better safe than sorry. 
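# Illustration only (not applied by this patch): the per-chromosome shell loop in
# hafpipe-concat.py above ("cat ... | tail -n +2 | sed 's/^/{chrom},/'") drops the afSite
# header line and prefixes each remaining "pos,af" row with the chromosome name, yielding
# "chrom,pos,af" rows. A rough Python equivalent, with hypothetical names:
def prepend_chrom_without_header(afsite_path, chrom, out_handle):
    with open(afsite_path) as f:
        next(f, None)  # skip the afSite header line
        for line in f:
            out_handle.write(chrom + "," + line)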
if len(snakemake.params.samples) < 1: - raise Exception( "No samples provided for HAFpipe merging" ) + raise Exception("No samples provided for HAFpipe merging") if len(snakemake.params.chroms) < 1: - raise Exception( "No chromosomes provided for HAFpipe merging" ) + raise Exception("No chromosomes provided for HAFpipe merging") # We are combining all files into larger tables first, and then combine these tables again. # But we do not do that recursively, so we might hit a limit if there are more of these combined # tables than we can process in their merging step... # For a typical system with a `ulimit -n` of 1024, this will happen at ~1mio samples. # batch_cnt = int(( len(snakemake.params.samples) - 1 ) / max_files) + 1 -batch_cnt = int(math.ceil( float(len(snakemake.params.samples)) / float(max_files) )) +batch_cnt = int(math.ceil(float(len(snakemake.params.samples)) / float(max_files))) if batch_cnt >= max_files: raise Exception( "Congratulations! You are using incredibly many files (or a very small batch size)! " @@ -100,26 +102,27 @@ def get_afsite_path( sample, chrom ): for chrom in snakemake.params.chroms: lc = 0 for sample in snakemake.params.samples: - fn=get_afsite_path(sample, chrom) + fn = get_afsite_path(sample, chrom) # We use the full speed of unix here, instead of the slow python way... - lines=int(subprocess.check_output("/usr/bin/wc -l " + fn, shell=True).split()[0]) + lines = int(subprocess.check_output("/usr/bin/wc -l " + fn, shell=True).split()[0]) if lc == 0: lc = lines if lc != lines: raise Exception( - "Cannot merge HAFpipe afSite files for chromosome " + chrom + - ", as the per-sample files have different number of positions." + "Cannot merge HAFpipe afSite files for chromosome " + + chrom + + ", as the per-sample files have different number of positions." ) # Some log output shell( - "echo -e \"In `pwd` \\n\" {log} ; " - "echo -e \"Started `date` \\n\" {log} ; " - "echo \"Merging HAFpipe afSite files\" {log} ; " - "echo \"Samples: {snakemake.params.samples}\" {log} ; " - "echo \"Chromosomes: {snakemake.params.chroms}\" {log} ; " - "echo -e \"Batches: {batch_cnt} \\n\" {log} ; " + 'echo -e "In `pwd` \\n" {log} ; ' + 'echo -e "Started `date` \\n" {log} ; ' + 'echo "Merging HAFpipe afSite files" {log} ; ' + 'echo "Samples: {snakemake.params.samples}" {log} ; ' + 'echo "Chromosomes: {snakemake.params.chroms}" {log} ; ' + 'echo -e "Batches: {batch_cnt} \\n" {log} ; ' ) # print(snakemake.params.samples) # print(snakemake.params.chroms) @@ -144,28 +147,23 @@ def get_afsite_path( sample, chrom ): for chrom in snakemake.params.chroms: # Make a deep copy of the samples list, so that we can delete the ones that we have processed # on this chromosome from it... Python and its shallow references... - samples=list(snakemake.params.samples) + samples = list(snakemake.params.samples) # Get the first column of the first file, without the header line. # This will be the positions column that we use for all, assuming that they all have # the same positions (see above). 
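# Worked example, for illustration only (not applied by this patch): the single-level
# batching above caps the number of mergeable samples at roughly max_files * max_files,
# because the final merge must itself open one file per batch. Assuming max_files is
# derived from a typical `ulimit -n` of 1024 minus some headroom, that is on the order of
# 1000 * 1000 = 1e6, which is where the "~1mio samples" estimate above comes from.
import math

max_files_example = 1000  # hypothetical value for illustration
n_samples_example = 250_000
batch_cnt_example = math.ceil(n_samples_example / max_files_example)  # 250 batches
assert batch_cnt_example < max_files_example  # fails once batch_cnt reaches max_files (~1e6 samples)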
- sample_0 = get_afsite_path( samples[0], chrom ) + sample_0 = get_afsite_path(samples[0], chrom) pos_file = "../all-" + chrom + ".pos" - shell( "cut -d\",\" -f1 {sample_0} | tail -n +2 > {pos_file}" ) - lines=int(subprocess.check_output("/usr/bin/wc -l " + pos_file, shell=True).split()[0]) + shell('cut -d"," -f1 {sample_0} | tail -n +2 > {pos_file}') + lines = int(subprocess.check_output("/usr/bin/wc -l " + pos_file, shell=True).split()[0]) # We also need to build a file of the same length that over and over repeats the chromosome, # so that we can paste it in front of the chromosome table. chr_file = "../all-" + chrom + ".chrom" - shell( - "touch {chr_file} ; " - "for l in `seq 1 {lines}` ; do echo {chrom} >> {chr_file}; done" - ) + shell("touch {chr_file} ; " "for l in `seq 1 {lines}` ; do echo {chrom} >> {chr_file}; done") # Some nice user output - shell( - "echo \"Processing chromosome {chrom} with {lines} positions\" {log} ; " - ) + shell('echo "Processing chromosome {chrom} with {lines} positions" {log} ; ') # We process in batches of our max_files size, so that we never have (much) more files open # at the same time that than. The output of course as well, but that should work. @@ -179,15 +177,15 @@ def get_afsite_path( sample, chrom ): # We stream each file, using pipe redirections, while at the same time filtering: # For each sample, get the frequency column without the head line. # We here build a string with all these instructions, and then exectute it. - sample_paste_files="" + sample_paste_files = "" for sample in batch_smps: - sn = get_afsite_path( sample, chrom ) - sample_paste_files += " <( cut -d\",\" -f2 " + sn + " | tail -n +2 )" + sn = get_afsite_path(sample, chrom) + sample_paste_files += ' <( cut -d"," -f2 ' + sn + " | tail -n +2 )" # Get just the af column without header of each of the files in the batch. batch_file = "../all-" + chrom + "-" + str(batch_num) + ".af" shell( - "echo \" - Batch {batch_num} with {batch_smps_cnt} samples\" {log} ; " + 'echo " - Batch {batch_num} with {batch_smps_cnt} samples" {log} ; ' "paste -d, {sample_paste_files} > {batch_file} ; " ) batch_num += 1 @@ -196,7 +194,7 @@ def get_afsite_path( sample, chrom ): # Now we have the batch files, and need to merge them, including the pos column. chrom_file = "../all-" + chrom + ".af" shell( - "echo \" - Merging {batch_num} batch(es)\" {log} ; " + 'echo " - Merging {batch_num} batch(es)" {log} ; ' "paste -d, {chr_file} {pos_file} {batch_files} > {chrom_file} ; " "rm {chr_file} {pos_file} {batch_files} ; " ) @@ -213,21 +211,21 @@ def get_afsite_path( sample, chrom ): # If we want to compress the table, just add a pipe here. Luckily, appending gzip files to each # other is a valid thing to do with the format, so we can just build the file chrom by chrom. if snakemake.params.get("compress", False): - gzip=" | gzip " + gzip = " | gzip " else: - gzip="" + gzip = "" # Make a header line with all sample names. -header="chrom,pos" +header = "chrom,pos" for sample in snakemake.params.samples: header += "," + sample + ".af" # Write the table by pasting all chromsome tables. 
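# Illustration only (not applied by this patch): the "just add a pipe" trick above relies
# on a property of the gzip format: several gzip members concatenated back to back form a
# valid gzip file that decompresses to the concatenation of the parts. Minimal sketch with
# hypothetical content and file name (gzip module aliased to avoid the `gzip` string
# variable used in this script):
import gzip as gz

def append_gzip_member(path, text):
    # Each call appends one independent gzip member to the file.
    with open(path, "ab") as f:
        f.write(gz.compress(text.encode()))

# append_gzip_member("all.csv.gz", "chrom,pos,S1.af\n")  # header, written once
# append_gzip_member("all.csv.gz", "1,42,0.5\n")         # one chromosome chunk
# with gz.open("all.csv.gz", "rt") as f:
#     assert f.read() == "chrom,pos,S1.af\n1,42,0.5\n"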
all_file = "../all.csv" + (".gz" if gzip else "") shell( - "echo \"Merging final table\" {log} ; " + 'echo "Merging final table" {log} ; ' "echo {header} {gzip} > {all_file} ; " "cat {chrom_files} {gzip} >> {all_file} ; " "rm {chrom_files} ; " - "echo -e \"\\nFinished `date`\" {log} ; " + 'echo -e "\\nFinished `date`" {log} ; ' ) diff --git a/workflow/scripts/hafpipe.py b/workflow/scripts/hafpipe.py index b30c2b9..bd3914b 100644 --- a/workflow/scripts/hafpipe.py +++ b/workflow/scripts/hafpipe.py @@ -28,19 +28,17 @@ # For all these steps, we get the directory where this python script file is in, # and then work our way up from there through our grenepipe directory. script_path = os.path.dirname(os.path.realpath(__file__)) -packages_path = os.path.join( script_path, "../../packages" ) +packages_path = os.path.join(script_path, "../../packages") -harp_path = os.path.join( packages_path, "harp/bin/" ) -harp_bin = os.path.join( packages_path, "harp/bin/harp" ) -hafpipe_bin = os.path.join( packages_path, "hafpipe/HAFpipe_wrapper.sh" ) +harp_path = os.path.join(packages_path, "harp/bin/") +harp_bin = os.path.join(packages_path, "harp/bin/harp") +hafpipe_bin = os.path.join(packages_path, "hafpipe/HAFpipe_wrapper.sh") -if not os.path.exists( harp_bin ) or not os.path.exists( hafpipe_bin ): +if not os.path.exists(harp_bin) or not os.path.exists(hafpipe_bin): print("Cannot find harp or HAFpipe in the grenepipe directory. Downloading them now.") - shell( - "{packages_path}/setup-hafpipe.sh {log}" - ) + shell("{packages_path}/setup-hafpipe.sh {log}") -if not os.path.exists( harp_bin ) or not os.path.exists( hafpipe_bin ): +if not os.path.exists(harp_bin) or not os.path.exists(hafpipe_bin): print( "Still cannot find harp or HAFpipe in the grenepipe directory. " "Please see grenepipe/packages/README.md to download them manually." @@ -75,13 +73,13 @@ # need to match them in our rule. Might want to change later, and rename the files to our liking. # Add potential inputs -hafpipe_inputs = [ "vcf", "bamfile", "refseq" ] +hafpipe_inputs = ["vcf", "bamfile", "refseq"] for arg in hafpipe_inputs: if snakemake.input.get(arg, ""): args += " --" + arg + " {snakemake.input." + arg + ":q}" # Add potential params, without the `:q`, as those are not files. -hafpipe_params = [ "chrom", "impmethod", "outdir" ] +hafpipe_params = ["chrom", "impmethod", "outdir"] for arg in hafpipe_params: if snakemake.params.get(arg, ""): args += " --" + arg + " {snakemake.params." + arg + "}" @@ -90,7 +88,7 @@ # Should not happen with our rules, as they should only call this script for the two valid methods. impmethod = snakemake.params.get("impmethod", "") if impmethod != "" and impmethod not in ["simpute", "npute"]: - raise Exception( "Cannot use impmethod '" + impmethod + "' with HAFpipe directly" ) + raise Exception("Cannot use impmethod '" + impmethod + "' with HAFpipe directly") # Arguments of HAFpipe that exist, but that we do not need to use above: # --logfile (directly provided below in the shell command) @@ -108,8 +106,7 @@ # and we finally also forward any extra params that the user might have provided. shell( "export PATH={harp_path}:$PATH ; " - "{hafpipe_bin} " + args + \ - " --logfile {snakemake.log:q} " + "{hafpipe_bin} " + args + " --logfile {snakemake.log:q} " " {snakemake.params.extra} {log}" ) @@ -118,10 +115,8 @@ # and manually rename to the expected file for that particular case, so that our rule finds it. # We fixed that in our fork of HAF-pipe, but keep this check around here for completeness. 
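# Rough sketch, for illustration only (not applied by this patch): `args` above is assembled
# with "{snakemake.input.vcf:q}"-style placeholders left unexpanded; they are substituted
# (and shell-quoted via ":q") only when the full command string is handed to snakemake's
# shell(), which formats it against the variables visible in the calling script. The core
# idea with plain str.format() and a stub object (":q" is snakemake-specific and omitted):
from types import SimpleNamespace

snakemake_stub = SimpleNamespace(input=SimpleNamespace(vcf="founders.vcf.gz"))
args_stub = " --vcf {snakemake.input.vcf}"
print(("hafpipe" + args_stub).format(snakemake=snakemake_stub))  # hafpipe --vcf founders.vcf.gz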
if snakemake.params.get("tasks") == "2" and impmethod == "simpute": - if os.path.exists( snakemake.input.snptable + ".imputed" ): - shell( - "mv {snakemake.input.snptable:q}.imputed {snakemake.input.snptable:q}.simpute" - ) + if os.path.exists(snakemake.input.snptable + ".imputed"): + shell("mv {snakemake.input.snptable:q}.imputed {snakemake.input.snptable:q}.simpute") # The `numeric_SNPtable.R` script is being run from within Task 1 and produces the `.numeric` file # of the SNP table. It can require an _insane_ amount of memory for larger founder VCF files, @@ -135,12 +130,8 @@ # We check that both files exist and that the bgzipped one is not so small that it's likely # just a zipped empty file. Reading bgzip in python is tricky, so we don't do that as of now... numeric = snakemake.output.get("snptable") + ".numeric" - numbgz = numeric + ".bgz" - if ( - not os.path.exists( numeric ) or - not os.path.exists( numbgz ) or - os.path.getsize( numbgz ) < 100 - ): + numbgz = numeric + ".bgz" + if not os.path.exists(numeric) or not os.path.exists(numbgz) or os.path.getsize(numbgz) < 100: raise Exception( "The HAF-pipe Task 1 step to convert the SNP table file to a numeric format failed. " "Please check all log files for errors." diff --git a/workflow/scripts/picard-collectmultiplemetrics.py b/workflow/scripts/picard-collectmultiplemetrics.py index f2dd7cd..552872b 100644 --- a/workflow/scripts/picard-collectmultiplemetrics.py +++ b/workflow/scripts/picard-collectmultiplemetrics.py @@ -86,10 +86,13 @@ # Python does not yet have a touch function. import os + + def touch(fname, times=None): - with open(fname, 'a'): + with open(fname, "a"): os.utime(fname, times) + # Here is our addition: Go thorugh all files again, and simply touch the any non-existing ones. # We only touch non-existing ones, to not mess with existing time stamps. for file in snakemake.output: diff --git a/workflow/scripts/plot-depths.py b/workflow/scripts/plot-depths.py index c3e8843..4c2c01d 100755 --- a/workflow/scripts/plot-depths.py +++ b/workflow/scripts/plot-depths.py @@ -3,6 +3,7 @@ # ================================================================================================= import matplotlib + matplotlib.use("agg") import matplotlib.pyplot as plt @@ -14,7 +15,7 @@ # Process Data # ================================================================================================= -calls = pd.read_csv(snakemake.input[0], sep='\t', header=[0, 1]) +calls = pd.read_csv(snakemake.input[0], sep="\t", header=[0, 1]) samples = [name for name in calls.columns.levels[0] if name != "VARIANT"] sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False) sample_info = sample_info.rename({"level_1": "sample"}, axis=1) diff --git a/workflow/scripts/sample-sizes.py b/workflow/scripts/sample-sizes.py index 4352b95..cf6f5d1 100755 --- a/workflow/scripts/sample-sizes.py +++ b/workflow/scripts/sample-sizes.py @@ -1,6 +1,7 @@ #!/usr/bin/python3 import pandas as pd + # from tabulate import tabulate import os, sys import gzip @@ -13,52 +14,59 @@ # print ("Summarizing", smpfile, "\n") # Read samples and units table -samples = pd.read_csv(smpfile, sep='\t', dtype=str) +samples = pd.read_csv(smpfile, sep="\t", dtype=str) # samples = pd.read_csv(smpfile, sep='\t', dtype=str).set_index(["sample", "unit"], drop=False) # samples.index = samples.index.set_levels([i.astype(str) for i in samples.index.levels]) # enforce str in index # Change to the dir of the sample file, so that we can find sample files relative to it. 
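# Illustration only (not applied by this patch): count_fastq_gzip_lines() below divides the
# raw line count by 4 because FASTQ stores each read as exactly four lines (header, sequence,
# "+" separator, quality string). A hypothetical variant that counts records by their header
# lines instead, and thus also tolerates a trailing blank line:
import gzip

def count_fastq_gzip_records(fastqfile):
    count = 0
    with gzip.open(fastqfile, "rt") as f:
        for i, line in enumerate(f):
            if i % 4 == 0 and line.startswith("@"):
                count += 1
    return count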
-olddir=os.getcwd() +olddir = os.getcwd() os.chdir(os.path.dirname(smpfile) if os.path.dirname(smpfile) else ".") # Add the file sizes as samples["fq1_size"] = [ - (os.path.getsize(str(f)) - if os.path.isfile(str(f)) - else ( - os.path.getsize("../" + str(f)) if os.path.isfile("../" + str(f)) else 0 - )) for f in samples["fq1"] + ( + os.path.getsize(str(f)) + if os.path.isfile(str(f)) + else (os.path.getsize("../" + str(f)) if os.path.isfile("../" + str(f)) else 0) + ) + for f in samples["fq1"] ] samples["fq2_size"] = [ - (os.path.getsize(str(f)) - if os.path.isfile(str(f)) - else ( - os.path.getsize("../" + str(f)) if os.path.isfile("../" + str(f)) else 0 - )) for f in samples["fq2"] + ( + os.path.getsize(str(f)) + if os.path.isfile(str(f)) + else (os.path.getsize("../" + str(f)) if os.path.isfile("../" + str(f)) else 0) + ) + for f in samples["fq2"] ] + def count_fastq_gzip_lines(fastqfile): count = 0 - with gzip.open(fastqfile,'rt') as f: + with gzip.open(fastqfile, "rt") as f: for line in f: count += 1 return int(count / 4) + samples["fq1_seqs"] = [ - count_fastq_gzip_lines(str(f)) - if os.path.isfile(str(f)) - else count_fastq_gzip_lines("../" + str(f)) + ( + count_fastq_gzip_lines(str(f)) + if os.path.isfile(str(f)) + else count_fastq_gzip_lines("../" + str(f)) + ) for f in samples["fq1"] ] samples["fq2_seqs"] = [ - count_fastq_gzip_lines(str(f)) - if os.path.isfile(str(f)) - else ( - count_fastq_gzip_lines("../" + str(f)) if os.path.isfile("../" + str(f)) else 0 - ) for f in samples["fq2"] + ( + count_fastq_gzip_lines(str(f)) + if os.path.isfile(str(f)) + else (count_fastq_gzip_lines("../" + str(f)) if os.path.isfile("../" + str(f)) else 0) + ) + for f in samples["fq2"] ] os.chdir(olddir) -samples.to_csv(snakemake.output[0], index = False) +samples.to_csv(snakemake.output[0], index=False) # print (samples) # print( tabulate( samples )) diff --git a/workflow/scripts/vep-cache.py b/workflow/scripts/vep-cache.py index 356d313..d6c8822 100644 --- a/workflow/scripts/vep-cache.py +++ b/workflow/scripts/vep-cache.py @@ -27,10 +27,10 @@ # Extra optional cache and fasta url cacheurl = snakemake.params.get("cacheurl", "") if cacheurl: - cacheurl = "--CACHEURL \"{}\"".format(cacheurl) + cacheurl = '--CACHEURL "{}"'.format(cacheurl) fastaurl = snakemake.params.get("fastaurl", "") if fastaurl: - fastaurl = "--FASTAURL \"{}\"".format(fastaurl) + fastaurl = '--FASTAURL "{}"'.format(fastaurl) log = snakemake.log_fmt_shell(stdout=True, stderr=True) diff --git a/workflow/scripts/vep-plugins.py b/workflow/scripts/vep-plugins.py index 7de6de2..04eafd3 100644 --- a/workflow/scripts/vep-plugins.py +++ b/workflow/scripts/vep-plugins.py @@ -24,7 +24,7 @@ sys.stderr = open(snakemake.log[0], "w") outdir = Path(snakemake.output[0]) -outdir.mkdir(exist_ok = True) +outdir.mkdir(exist_ok=True) with NamedTemporaryFile() as tmp: urlretrieve(