From ec192bdc5562c4b46901e5647cbc752c9c1b42ad Mon Sep 17 00:00:00 2001 From: raysinensis Date: Thu, 21 Sep 2023 14:27:26 -0400 Subject: [PATCH] r1only for ug --- inst/scripts/filter_bam_correct.py | 24 ++-- rules/r1only.snake | 175 +++++++++++++++++++++++++++++ 2 files changed, 190 insertions(+), 9 deletions(-) create mode 100644 rules/r1only.snake diff --git a/inst/scripts/filter_bam_correct.py b/inst/scripts/filter_bam_correct.py index 3498f27..d8bbe73 100644 --- a/inst/scripts/filter_bam_correct.py +++ b/inst/scripts/filter_bam_correct.py @@ -11,7 +11,7 @@ suitable for cellranger and starsolo output """ -def correct_bam_read1(bam, outbam, target_len, filter_cut): +def correct_bam_read1(bam, outbam, target_len, filter_cut, single_end): samfile = pysam.AlignmentFile(bam, "rb") outfile = pysam.AlignmentFile(outbam, "wb", template = samfile) @@ -28,14 +28,16 @@ def correct_bam_read1(bam, outbam, target_len, filter_cut): if read.is_unmapped: # not mapped, toss continue - - if not read.is_proper_pair: + + if not single_end: + if not read.is_proper_pair: # not properly paired, toss - continue + continue - if not read.is_read1: + if not single_end: + if not read.is_read1: # not read1, toss - continue + continue j += 1 @@ -115,16 +117,20 @@ def main(): help ="""maximum nts off of target_len to keep""", default = 10, required = False) - + parser.add_argument('-s', + '--single', + action="store_true", + required = False) args=parser.parse_args() in_bam = args.inbam out_bam = args.outbam target_len = int(args.targetlen) filter_cut = float(args.filtercut) - + single_end = args.single + print(single_end) print("settings- target_len: ", target_len, "; filter_cut: ", filter_cut) - k,i,j = correct_bam_read1(in_bam, out_bam, target_len = target_len, filter_cut = filter_cut) + k,i,j = correct_bam_read1(in_bam, out_bam, target_len = target_len, filter_cut = filter_cut, single_end = single_end) print("fraction of reads kept: ", i/j) print("fraction of reads corrected: ", k/i) diff --git a/rules/r1only.snake b/rules/r1only.snake new file mode 100644 index 0000000..a0acbee --- /dev/null +++ b/rules/r1only.snake @@ -0,0 +1,175 @@ +rule cutadapt_R1only: + input: + #expand("{data}/{sample}_R1.fastq.gz", data = DATA, sample = R1_SAMPLES) + _get_fq_paths + output: + R1 = temp("{results}/cutadapt/{sample}_trimmed.R1.fastq.gz") + params: + job_name = "cutadapt", + bc_cut = _get_bc_cut, + info = "{results}/cutadapt/temp_{sample}.info", + temp = "{results}/cutadapt/temp_{sample}_trimmed.R1.fastq.gz", + memory = "select[mem>32 && hname!=compute08] rusage[mem=32]" + log: "{results}/logs/{sample}_cutadapt.out" + threads: 24 + shell: + """ + cutadapt -j 24 \ + -a TSO_R1=CCCATGTACTCTGCGTTGATACCACTGCTT \ + -a TruSeq_R1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \ + -n 4 \ + -m 75 \ + -l 300 \ + --nextseq-trim=20 \ + -o {output.R1} \ + <(zcat {input}) + """ + +rule starsolo_R1only: + input: + R1 = "{results}/cutadapt/{sample}_trimmed.R1.fastq.gz" + output: + "{results}/{sample}/{sample}_R1_Aligned.sortedByCoord.out.bam" + params: + chemistry = _get_chem_version_R1, + extra_args = _get_extra_args, + out_dir = "{results}/{sample}/{sample}_R1_", + job_name = "star_R1", + memory = + "select[mem>48 && hname!=compute02 && hname!=compute08 && hname!=compute19 && hname!=compute20] rusage[mem=4 +8]" + log: "{results}/logs/{sample}_star_R1.out" + resources: + total_impact = 5 + threads: 14 + shell: + """ + {STAR} --soloType CB_UMI_Simple \ + --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \ + --soloUMIfiltering MultiGeneUMI \ + --soloUMIdedup 1MM_Directional \ + --readFilesCommand gunzip -c \ + --runThreadN 12 \ + --soloCBwhitelist {params.chemistry} \ + --soloBarcodeMate 1 \ + --outFilterMultimapNmax 1 \ + --outFilterMismatchNmax 999 \ + --outFilterMismatchNoverReadLmax 0.2 \ + --genomeDir {STAR_INDEX} \ + --soloFeatures Gene \ + --alignMatesGapMax 100000 \ + --alignIntronMax 100000 \ + --soloCellFilter None {params.extra_args} \ + --outFileNamePrefix {params.out_dir} \ + --outSAMtype BAM SortedByCoordinate \ + --limitBAMsortRAM 48000000000 \ + --outSAMattributes NH HI nM AS CR UR CB UB GX GN sS sQ sM \ + --readFilesIn {input.R1} + + rm -rf {params.out_dir}Solo.out + """ + +rule filterR1only: + input: + "{results}/{sample}/{sample}_R1_Aligned.sortedByCoord.out.bam" + output: + temp("{results}/temp/{sample}_R1.bam") + params: + job_name = "filter", + length = _get_chem_length_R1, + temp = "{results}/temp/{sample}_R1_filter.bam", + temp2 = "{results}/temp/{sample}_R1_filter2.bam", + final = "{results}/counts/{sample}_R1_counts.tsv.gz", + memory = "select[mem>8] rusage[mem=8]" # LSF format; change as needed + log: + "{results}/logs/filter_{sample}_R1.txt" + threads: + 4 + shell: + """ + samtools view -h {input} | grep -v 'CB:Z:-\|UB:Z:-' | samtools view -b - > {params.temp} + set +e + python3 inst/scripts/filter_bam_correct.py -i {params.temp} -o {params.temp2} -l {params.length} -s -c 20 + exitcode=$? + if [ $exitcode -eq 1 ] + then + touch {params.final} + fi + samtools sort {params.temp2} -o {output} + # mv {params.temp} {output} + samtools index {output} + rm -rf {params.temp} + rm -rf {params.temp2} + """ + +rule assign_sites_read1only: + input: + "{results}/temp/{sample}_R1.bam" + output: + bam = temp("{results}/temp/{sample}_R1_assigned.bam") + params: + job_name = "assign_sites_read1", + memory = "select[mem>8] rusage[mem=8]", # LSF format; change as needed + out = "{results}/temp/{sample}_R1_assigned", + out_bam = "{results}/temp/{sample}_R1.bam.featureCounts.bam" + log: + "{results}/logs/assign_sites_{sample}_R1.txt" + threads: + 12 + shell: + """ + if echo {POLYA_SITES} | tr '[:upper:]' '[:lower:]' | grep -q -e "\.saf$" -e "\.saf.gz$"; then + polyaformat='SAF'; else + polyaformat='GTF'; + fi + + featureCounts \ + -s 2 -Q 10 -O \ + --read2pos 5 \ + -o {params.out} \ + -F $polyaformat \ + -a {POLYA_SITES} \ + -R BAM \ + -T {threads} \ + {input} \ + 2> {log} + + samtools sort \ + {params.out_bam} \ + -o {output.bam} + + samtools index {output.bam} + rm -rf {params.out_bam} + """ + +rule bedR1only: + input: + "{results}/temp/{sample}_R1.bam" + output: + "{results}/bed/{sample}_R1.bed.gz" + params: + job_name = "bed", + dedup = "{results}/temp/{sample}_R1_dedup.bam", + r = "{results}/temp/{sample}_R1_r.bam", + memory = "select[mem>48] rusage[mem=48]" # LSF format; change as needed + log: + "{results}/logs/bed_{sample}_R1.txt" + threads: + 4 + shell: + """ + umi_tools dedup \ + --extract-umi-method=tag \ + --umi-tag=UB \ + --per-cell \ + --cell-tag=CB \ + --paired \ + --method=unique \ + -I {input} \ + -S {params.dedup} \ + > {log} + samtools view -F 4 {params.dedup} -b > {params.r} + bedtools genomecov -ibam {params.r} -5 -dz | awk 'BEGIN {{ OFS = "\t" }} {{ print $1, $2 - 1, $2, $3 }}' - | g +zip > {output} + rm -rf {params.r} + """