From 84fe8254bfe86a276a4f35b3884c1cfca184a17a Mon Sep 17 00:00:00 2001 From: Austin Gillen <4809242+agillen@users.noreply.github.com> Date: Thu, 5 Oct 2023 10:47:32 -0600 Subject: [PATCH] incorporate ultima rules, fix naming, remove bodhi- and 10X-specific refs --- .gitignore | 6 ++ chemistry_to_json.py | 2 + rules/count.snake | 153 +++++++++++++++++++++++++++------ rules/cutadapt_star.snake | 111 +++++++++++++++++++----- rules/r1only.snake | 175 -------------------------------------- 5 files changed, 228 insertions(+), 219 deletions(-) create mode 100644 .gitignore delete mode 100644 rules/r1only.snake diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bdcc658 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.Rproj* +*.Rproj +.RData +.Rhistory +.Rproj.user +*.DS_Store diff --git a/chemistry_to_json.py b/chemistry_to_json.py index d47329b..6206c2a 100644 --- a/chemistry_to_json.py +++ b/chemistry_to_json.py @@ -3,6 +3,8 @@ import pandas chromiumV3 = {"length" : "58", "bc_cut" : "", "R1" : '[WHITELIST_V3,"--soloUMIlen 12 --clip5pNbases 58 0 --soloCBstart 1 --soloCBlen 16 --soloUMIstart 17"]', "R2" : '[WHITELIST_V3,"--soloUMIlen 12"]'} +chromiumV3UG = {"length": "58", "bc_cut": "", "R1": '[WHITELIST_V3,"--soloUMIlen 9 --clip5pNbases 58 --soloCBstart 23 --soloCBlen 16 --soloUMIstart 39"]', "R2": '[WHITELIST_V3,"--soloUMIlen 9"]'} + chromiumV2 = {"length" : "56", "bc_cut" : "", "R1" : '[WHITELIST_V2,"--soloUMIlen 10 --clip5pNbases 56 0 --soloCBstart 1 --soloCBlen 16 --soloUMIstart 17"]', "R2" : '[WHITELIST_V2,"--soloUMIlen 10"]'} dropseq = {"length" : "50", "bc_cut" : "", "R1" : '["None --soloUMIlen 8 --clip5pNbases 50 0 --soloCBstart 1 --soloCBlen 12 --soloUMIstart 13"]', "R2" : '["None --soloUMIlen 8 --soloCBstart 1 --soloCBlen 12 --soloUMIstart 13"]'} diff --git a/rules/count.snake b/rules/count.snake index af1a7ef..772d6f8 100644 --- a/rules/count.snake +++ b/rules/count.snake @@ -1,18 +1,58 @@ """ Extract correction length target""" -def _get_chem_length_R1(wildcards): +def _get_chem_length_paired(wildcards): chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0] args = chemistry[chem_version]["length"] return args """ rules for scraps counting """ -rule assign_sites_read2: +rule assign_sites_R1: + input: + "{results}/temp/{sample}_R1.bam" + output: + bam = temp("{results}/temp/{sample}_R1_assigned.bam") + params: + job_name = "assign_sites_R1", + memory = "select[mem>8] rusage[mem=8]", # LSF format; change as needed + out = "{results}/temp/{sample}_R1_assigned", + out_bam = "{results}/temp/{sample}_R1.bam.featureCounts.bam" + log: + "{results}/logs/assign_sites_{sample}_R1.txt" + threads: + 12 + shell: + """ + if echo {POLYA_SITES} | tr '[:upper:]' '[:lower:]' | grep -q -e "\.saf$" -e "\.saf.gz$"; then + polyaformat='SAF'; else + polyaformat='GTF'; + fi + + featureCounts \ + -s 2 -Q 10 -O \ + --read2pos 5 \ + -o {params.out} \ + -F $polyaformat \ + -a {POLYA_SITES} \ + -R BAM \ + -T {threads} \ + {input} \ + 2> {log} + + samtools sort \ + {params.out_bam} \ + -o {output.bam} + + samtools index {output.bam} + rm -rf {params.out_bam} + """ + +rule assign_sites_R2: input: "{results}/temp/{sample}_R2.bam" output: bam = temp("{results}/temp/{sample}_R2_assigned.bam") params: - job_name = "assign_sites_read2", + job_name = "assign_sites_R2", memory = "select[mem>8] rusage[mem=8]", # LSF format; change as needed out = "{results}/temp/{sample}_R2_assigned", out_bam = "{results}/temp/{sample}_R2.bam.featureCounts.bam" @@ -47,18 +87,18 @@ rule assign_sites_read2: rm -rf {params.out_bam} """ -rule assign_sites_read1: +rule assign_sites_paired: input: - "{results}/temp/{sample}_R1.bam" + "{results}/temp/{sample}_paired.bam" output: - bam = temp("{results}/temp/{sample}_R1_assigned.bam") + bam = temp("{results}/temp/{sample}_paired_assigned.bam") params: - job_name = "assign_sites_read1", + job_name = "assign_sites_paired", memory = "select[mem>8] rusage[mem=8]", # LSF format; change as needed - out = "{results}/temp/{sample}_R1_assigned", - out_bam = "{results}/temp/{sample}_R1.bam.featureCounts.bam" + out = "{results}/temp/{sample}_paired_assigned", + out_bam = "{results}/temp/{sample}_paired.bam.featureCounts.bam" log: - "{results}/logs/assign_sites_{sample}_R1.txt" + "{results}/logs/assign_sites_{sample}_paired.txt" threads: 12 shell: @@ -87,7 +127,40 @@ rule assign_sites_read1: rm -rf {params.out_bam} """ -rule filterR2: +rule filter_R1: + input: + "{results}/{sample}/{sample}_R1_Aligned.sortedByCoord.out.bam" + output: + temp("{results}/temp/{sample}_R1.bam") + params: + job_name = "filter", + length = _get_chem_length_R1, + temp = "{results}/temp/{sample}_R1_filter.bam", + temp2 = "{results}/temp/{sample}_R1_filter2.bam", + final = "{results}/counts/{sample}_R1_counts.tsv.gz", + memory = "select[mem>8] rusage[mem=8]" # LSF format; change as needed + log: + "{results}/logs/filter_{sample}_R1.txt" + threads: + 4 + shell: + """ + samtools view -h {input} | grep -v 'CB:Z:-\|UB:Z:-' | samtools view -b - > {params.temp} + set +e + python3 inst/scripts/filter_bam_correct.py -i {params.temp} -o {params.temp2} -l {params.length} -s -c 20 + exitcode=$? + if [ $exitcode -eq 1 ] + then + touch {params.final} + fi + samtools sort {params.temp2} -o {output} + # mv {params.temp} {output} + samtools index {output} + rm -rf {params.temp} + rm -rf {params.temp2} + """ + +rule filter_R2: input: "{results}/{sample}/{sample}_R2_Aligned.sortedByCoord.out.bam" output: @@ -111,20 +184,20 @@ rule filterR2: rm -rf {params.tempbai} """ -rule filterR1: +rule filter_paired: input: - "{results}/{sample}/{sample}_R1_Aligned.sortedByCoord.out.bam" + "{results}/{sample}/{sample}_paired_Aligned.sortedByCoord.out.bam" output: - temp("{results}/temp/{sample}_R1.bam") + temp("{results}/temp/{sample}_paired.bam") params: job_name = "filter", - length = _get_chem_length_R1, - temp = "{results}/temp/{sample}_R1_filter.bam", - temp2 = "{results}/temp/{sample}_R1_filter2.bam", - final = "{results}/counts/{sample}_R1_counts.tsv.gz", + length = _get_chem_length_paired, + temp = "{results}/temp/{sample}_paired_filter.bam", + temp2 = "{results}/temp/{sample}_paired_filter2.bam", + final = "{results}/counts/{sample}_paired_counts.tsv.gz", memory = "select[mem>8] rusage[mem=8]" # LSF format; change as needed log: - "{results}/logs/filter_{sample}_R1.txt" + "{results}/logs/filter_{sample}_paired.txt" threads: 4 shell: @@ -171,7 +244,7 @@ rule count: > {log} """ -rule bedR1: +rule bed_R1: input: "{results}/temp/{sample}_R1.bam" output: @@ -197,13 +270,13 @@ rule bedR1: -I {input} \ -S {params.dedup} \ > {log} - # 0x82 for R2 - samtools view -f 0x42 {params.dedup} -b > {params.r} - bedtools genomecov -ibam {params.r} -5 -dz | awk 'BEGIN {{ OFS = "\t" }} {{ print $1, $2 - 1, $2, $3 }}' - | gzip > {output} + samtools view -F 4 {params.dedup} -b > {params.r} + bedtools genomecov -ibam {params.r} -5 -dz | awk 'BEGIN {{ OFS = "\t" }} {{ print $1, $2 - 1, $2, $3 }}' - | g +zip > {output} rm -rf {params.r} """ -rule bedR2: +rule bed_R2: input: "{results}/temp/{sample}_R2.bam" output: @@ -230,3 +303,35 @@ rule bedR2: bedtools genomecov -ibam {params.dedup} -3 -dz | awk 'BEGIN {{ OFS = "\t" }} {{ print $1, $2 - 1, $2, $3 }}' - | gzip > {output} """ + +rule bed_paired: + input: + "{results}/temp/{sample}_paired.bam" + output: + "{results}/bed/{sample}_paired.bed.gz" + params: + job_name = "bed", + dedup = "{results}/temp/{sample}_paired_dedup.bam", + r = "{results}/temp/{sample}_paired_r.bam", + memory = "select[mem>48] rusage[mem=48]" # LSF format; change as needed + log: + "{results}/logs/bed_{sample}_paired.txt" + threads: + 4 + shell: + """ + umi_tools dedup \ + --extract-umi-method=tag \ + --umi-tag=UB \ + --per-cell \ + --cell-tag=CB \ + --paired \ + --method=unique \ + -I {input} \ + -S {params.dedup} \ + > {log} + # 0x82 for R2 + samtools view -f 0x42 {params.dedup} -b > {params.r} + bedtools genomecov -ibam {params.r} -5 -dz | awk 'BEGIN {{ OFS = "\t" }} {{ print $1, $2 - 1, $2, $3 }}' - | gzip > {output} + rm -rf {params.r} + """ \ No newline at end of file diff --git a/rules/cutadapt_star.snake b/rules/cutadapt_star.snake index bdd7729..8cf7761 100644 --- a/rules/cutadapt_star.snake +++ b/rules/cutadapt_star.snake @@ -23,13 +23,13 @@ def _get_bc_cut(wildcards): args = chemistry[chem_version]["bc_cut"] return args -""" Extract per-capture 10X chemistry versions from gex libs (R2)""" +""" Extract per-capture chemistry from gex libs (R2)""" def _get_chem_version_R2(wildcards): chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0] args = eval(chemistry[chem_version]["R2"]) return args -""" Extract per-capture 10X chemistry versions from gex libs (paired)""" +""" Extract per-capture chemistry from gex libs (paired)""" def _get_chem_version_R1(wildcards): chem_version = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample].chemistry.unique()[0] args = eval(chemistry[chem_version]["R1"]) @@ -40,21 +40,47 @@ def _get_extra_args(wildcards): captures_filtered = SAMPLES_DF[SAMPLES_DF.capture == wildcards.sample] return list(captures_filtered.extra_args.unique()) -""" This rule is now only necessary for paired-end libraries """ +""" This rule trims R1-only libraries """ +rule cutadapt_R1: + input: + _get_fq_paths + output: + R1 = temp("{results}/cutadapt/{sample}_R1_trimmed.R1.fastq.gz") + params: + job_name = "cutadapt", + bc_cut = _get_bc_cut, + info = "{results}/cutadapt/temp_{sample}_R1.info", + temp = "{results}/cutadapt/temp_{sample}_R1_trimmed.R1.fastq.gz", + memory = "select[mem>32] rusage[mem=32]" + log: "{results}/logs/{sample}_cutadapt.out" + threads: 24 + shell: + """ + cutadapt -j 24 \ + -a TSO_R1=CCCATGTACTCTGCGTTGATACCACTGCTT \ + -a TruSeq_R1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \ + -n 4 \ + -m 75 \ + -l 300 \ + --nextseq-trim=20 \ + -o {output.R1} \ + <(zcat {input}) + """ + +""" This rule trims paired-end libraries """ rule cutadapt_paired: input: - #expand("{data}/{sample}_R1.fastq.gz", data = DATA, sample = R1_SAMPLES) _get_fq_paths output: - R1 = temp("{results}/cutadapt/{sample}_trimmed.R1.fastq.gz"), - R2 = temp("{results}/cutadapt/{sample}_trimmed.R2.fastq.gz") + R1 = temp("{results}/cutadapt/{sample}_paired_trimmed.R1.fastq.gz"), + R2 = temp("{results}/cutadapt/{sample}_paired_trimmed.R2.fastq.gz") params: job_name = "cutadapt", bc_cut = _get_bc_cut, - info = "{results}/cutadapt/temp_{sample}.info", - temp = "{results}/cutadapt/temp_{sample}_trimmed.R1.fastq.gz", - temp2 = "{results}/cutadapt/temp_{sample}_trimmed.R2.fastq.gz", - memory = "select[mem>32 && hname!=compute08] rusage[mem=32]" + info = "{results}/cutadapt/temp_{sample}_paired.info", + temp = "{results}/cutadapt/temp_{sample}_paired_trimmed.R1.fastq.gz", + temp2 = "{results}/cutadapt/temp_{sample}_paired_trimmed.R2.fastq.gz", + memory = "select[mem>32] rusage[mem=32]" log: "{results}/logs/{sample}_cutadapt.out" threads: 24 shell: @@ -95,20 +121,20 @@ rule cutadapt_paired: fi """ -""" This rule processes chromium V2/V3 paired-end libraries """ +""" This rule aligns R1-only libraries """ rule starsolo_R1: input: - R1 = "{results}/cutadapt/{sample}_trimmed.R1.fastq.gz", - R2 = "{results}/cutadapt/{sample}_trimmed.R2.fastq.gz" + R1 = "{results}/cutadapt/{sample}_R1_trimmed.R1.fastq.gz" output: "{results}/{sample}/{sample}_R1_Aligned.sortedByCoord.out.bam" params: chemistry = _get_chem_version_R1, extra_args = _get_extra_args, - out_dir = "{results}/{sample}/{sample}_R1_", + out_dir = "{results}/{sample}/{sample}_R1_", job_name = "star_R1", - memory = - "select[mem>48 && hname!=compute02 && hname!=compute08 && hname!=compute19 && hname!=compute20] rusage[mem=48]" + memory = + "select[mem>48] rusage[mem=4 +8]" log: "{results}/logs/{sample}_star_R1.out" resources: total_impact = 5 @@ -135,12 +161,12 @@ rule starsolo_R1: --outSAMtype BAM SortedByCoordinate \ --limitBAMsortRAM 48000000000 \ --outSAMattributes NH HI nM AS CR UR CB UB GX GN sS sQ sM \ - --readFilesIn {input.R1} {input.R2} - + --readFilesIn {input.R1} + rm -rf {params.out_dir}Solo.out """ -""" This rule processes chromium V2/V3 libraries (R2 only) """ +""" This rule aligns R2-only libraries """ rule starsolo_R2: input: R1 = _get_fq_paths @@ -151,7 +177,7 @@ rule starsolo_R2: extra_args = _get_extra_args, out_dir = "{results}/{sample}/{sample}_R2_", job_name = "star_R2", - memory = "select[mem>48 && hname!=compute02 && hname!=compute08 && hname!=compute19 && hname!=compute20] rusage[mem=48]" + memory = "select[mem>48] rusage[mem=48]" log: "{results}/logs/{sample}_star_R2.out" resources: total_impact = 5 @@ -179,3 +205,48 @@ rule starsolo_R2: rm -rf {params.out_dir}Solo.out """ + +""" This rule aligns paired-end libraries """ +rule starsolo_paired: + input: + R1 = "{results}/cutadapt/{sample}_trimmed.R1.fastq.gz", + R2 = "{results}/cutadapt/{sample}_trimmed.R2.fastq.gz" + output: + "{results}/{sample}/{sample}_paired_Aligned.sortedByCoord.out.bam" + params: + chemistry = _get_chem_version_R1, + extra_args = _get_extra_args, + out_dir = "{results}/{sample}/{sample}_paired_", + job_name = "star_paired", + memory = + "select[mem>48] rusage[mem=48]" + log: "{results}/logs/{sample}_star_paired.out" + resources: + total_impact = 5 + threads: 14 + shell: + """ + {STAR} --soloType CB_UMI_Simple \ + --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \ + --soloUMIfiltering MultiGeneUMI \ + --soloUMIdedup 1MM_Directional \ + --readFilesCommand gunzip -c \ + --runThreadN 12 \ + --soloCBwhitelist {params.chemistry} \ + --soloBarcodeMate 1 \ + --outFilterMultimapNmax 1 \ + --outFilterMismatchNmax 999 \ + --outFilterMismatchNoverReadLmax 0.2 \ + --genomeDir {STAR_INDEX} \ + --soloFeatures Gene \ + --alignMatesGapMax 100000 \ + --alignIntronMax 100000 \ + --soloCellFilter None {params.extra_args} \ + --outFileNamePrefix {params.out_dir} \ + --outSAMtype BAM SortedByCoordinate \ + --limitBAMsortRAM 48000000000 \ + --outSAMattributes NH HI nM AS CR UR CB UB GX GN sS sQ sM \ + --readFilesIn {input.R1} {input.R2} + + rm -rf {params.out_dir}Solo.out + """ diff --git a/rules/r1only.snake b/rules/r1only.snake deleted file mode 100644 index a0acbee..0000000 --- a/rules/r1only.snake +++ /dev/null @@ -1,175 +0,0 @@ -rule cutadapt_R1only: - input: - #expand("{data}/{sample}_R1.fastq.gz", data = DATA, sample = R1_SAMPLES) - _get_fq_paths - output: - R1 = temp("{results}/cutadapt/{sample}_trimmed.R1.fastq.gz") - params: - job_name = "cutadapt", - bc_cut = _get_bc_cut, - info = "{results}/cutadapt/temp_{sample}.info", - temp = "{results}/cutadapt/temp_{sample}_trimmed.R1.fastq.gz", - memory = "select[mem>32 && hname!=compute08] rusage[mem=32]" - log: "{results}/logs/{sample}_cutadapt.out" - threads: 24 - shell: - """ - cutadapt -j 24 \ - -a TSO_R1=CCCATGTACTCTGCGTTGATACCACTGCTT \ - -a TruSeq_R1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \ - -n 4 \ - -m 75 \ - -l 300 \ - --nextseq-trim=20 \ - -o {output.R1} \ - <(zcat {input}) - """ - -rule starsolo_R1only: - input: - R1 = "{results}/cutadapt/{sample}_trimmed.R1.fastq.gz" - output: - "{results}/{sample}/{sample}_R1_Aligned.sortedByCoord.out.bam" - params: - chemistry = _get_chem_version_R1, - extra_args = _get_extra_args, - out_dir = "{results}/{sample}/{sample}_R1_", - job_name = "star_R1", - memory = - "select[mem>48 && hname!=compute02 && hname!=compute08 && hname!=compute19 && hname!=compute20] rusage[mem=4 -8]" - log: "{results}/logs/{sample}_star_R1.out" - resources: - total_impact = 5 - threads: 14 - shell: - """ - {STAR} --soloType CB_UMI_Simple \ - --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \ - --soloUMIfiltering MultiGeneUMI \ - --soloUMIdedup 1MM_Directional \ - --readFilesCommand gunzip -c \ - --runThreadN 12 \ - --soloCBwhitelist {params.chemistry} \ - --soloBarcodeMate 1 \ - --outFilterMultimapNmax 1 \ - --outFilterMismatchNmax 999 \ - --outFilterMismatchNoverReadLmax 0.2 \ - --genomeDir {STAR_INDEX} \ - --soloFeatures Gene \ - --alignMatesGapMax 100000 \ - --alignIntronMax 100000 \ - --soloCellFilter None {params.extra_args} \ - --outFileNamePrefix {params.out_dir} \ - --outSAMtype BAM SortedByCoordinate \ - --limitBAMsortRAM 48000000000 \ - --outSAMattributes NH HI nM AS CR UR CB UB GX GN sS sQ sM \ - --readFilesIn {input.R1} - - rm -rf {params.out_dir}Solo.out - """ - -rule filterR1only: - input: - "{results}/{sample}/{sample}_R1_Aligned.sortedByCoord.out.bam" - output: - temp("{results}/temp/{sample}_R1.bam") - params: - job_name = "filter", - length = _get_chem_length_R1, - temp = "{results}/temp/{sample}_R1_filter.bam", - temp2 = "{results}/temp/{sample}_R1_filter2.bam", - final = "{results}/counts/{sample}_R1_counts.tsv.gz", - memory = "select[mem>8] rusage[mem=8]" # LSF format; change as needed - log: - "{results}/logs/filter_{sample}_R1.txt" - threads: - 4 - shell: - """ - samtools view -h {input} | grep -v 'CB:Z:-\|UB:Z:-' | samtools view -b - > {params.temp} - set +e - python3 inst/scripts/filter_bam_correct.py -i {params.temp} -o {params.temp2} -l {params.length} -s -c 20 - exitcode=$? - if [ $exitcode -eq 1 ] - then - touch {params.final} - fi - samtools sort {params.temp2} -o {output} - # mv {params.temp} {output} - samtools index {output} - rm -rf {params.temp} - rm -rf {params.temp2} - """ - -rule assign_sites_read1only: - input: - "{results}/temp/{sample}_R1.bam" - output: - bam = temp("{results}/temp/{sample}_R1_assigned.bam") - params: - job_name = "assign_sites_read1", - memory = "select[mem>8] rusage[mem=8]", # LSF format; change as needed - out = "{results}/temp/{sample}_R1_assigned", - out_bam = "{results}/temp/{sample}_R1.bam.featureCounts.bam" - log: - "{results}/logs/assign_sites_{sample}_R1.txt" - threads: - 12 - shell: - """ - if echo {POLYA_SITES} | tr '[:upper:]' '[:lower:]' | grep -q -e "\.saf$" -e "\.saf.gz$"; then - polyaformat='SAF'; else - polyaformat='GTF'; - fi - - featureCounts \ - -s 2 -Q 10 -O \ - --read2pos 5 \ - -o {params.out} \ - -F $polyaformat \ - -a {POLYA_SITES} \ - -R BAM \ - -T {threads} \ - {input} \ - 2> {log} - - samtools sort \ - {params.out_bam} \ - -o {output.bam} - - samtools index {output.bam} - rm -rf {params.out_bam} - """ - -rule bedR1only: - input: - "{results}/temp/{sample}_R1.bam" - output: - "{results}/bed/{sample}_R1.bed.gz" - params: - job_name = "bed", - dedup = "{results}/temp/{sample}_R1_dedup.bam", - r = "{results}/temp/{sample}_R1_r.bam", - memory = "select[mem>48] rusage[mem=48]" # LSF format; change as needed - log: - "{results}/logs/bed_{sample}_R1.txt" - threads: - 4 - shell: - """ - umi_tools dedup \ - --extract-umi-method=tag \ - --umi-tag=UB \ - --per-cell \ - --cell-tag=CB \ - --paired \ - --method=unique \ - -I {input} \ - -S {params.dedup} \ - > {log} - samtools view -F 4 {params.dedup} -b > {params.r} - bedtools genomecov -ibam {params.r} -5 -dz | awk 'BEGIN {{ OFS = "\t" }} {{ print $1, $2 - 1, $2, $3 }}' - | g -zip > {output} - rm -rf {params.r} - """