Skip to content

Commit

Permalink
r1only for ug
Browse files Browse the repository at this point in the history
  • Loading branch information
raysinensis committed Sep 21, 2023
1 parent d15e231 commit ec192bd
Show file tree
Hide file tree
Showing 2 changed files with 190 additions and 9 deletions.
24 changes: 15 additions & 9 deletions inst/scripts/filter_bam_correct.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
suitable for cellranger and starsolo output
"""

def correct_bam_read1(bam, outbam, target_len, filter_cut):
def correct_bam_read1(bam, outbam, target_len, filter_cut, single_end):

samfile = pysam.AlignmentFile(bam, "rb")
outfile = pysam.AlignmentFile(outbam, "wb", template = samfile)
Expand All @@ -28,14 +28,16 @@ def correct_bam_read1(bam, outbam, target_len, filter_cut):
if read.is_unmapped:
# not mapped, toss
continue

if not read.is_proper_pair:

if not single_end:
if not read.is_proper_pair:
# not properly paired, toss
continue
continue

if not read.is_read1:
if not single_end:
if not read.is_read1:
# not read1, toss
continue
continue

j += 1

Expand Down Expand Up @@ -115,16 +117,20 @@ def main():
help ="""maximum nts off of target_len to keep""",
default = 10,
required = False)

parser.add_argument('-s',
'--single',
action="store_true",
required = False)
args=parser.parse_args()

in_bam = args.inbam
out_bam = args.outbam
target_len = int(args.targetlen)
filter_cut = float(args.filtercut)

single_end = args.single
print(single_end)
print("settings- target_len: ", target_len, "; filter_cut: ", filter_cut)
k,i,j = correct_bam_read1(in_bam, out_bam, target_len = target_len, filter_cut = filter_cut)
k,i,j = correct_bam_read1(in_bam, out_bam, target_len = target_len, filter_cut = filter_cut, single_end = single_end)
print("fraction of reads kept: ", i/j)
print("fraction of reads corrected: ", k/i)

Expand Down
175 changes: 175 additions & 0 deletions rules/r1only.snake
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
rule cutadapt_R1only:
input:
#expand("{data}/{sample}_R1.fastq.gz", data = DATA, sample = R1_SAMPLES)
_get_fq_paths
output:
R1 = temp("{results}/cutadapt/{sample}_trimmed.R1.fastq.gz")
params:
job_name = "cutadapt",
bc_cut = _get_bc_cut,
info = "{results}/cutadapt/temp_{sample}.info",
temp = "{results}/cutadapt/temp_{sample}_trimmed.R1.fastq.gz",
memory = "select[mem>32 && hname!=compute08] rusage[mem=32]"
log: "{results}/logs/{sample}_cutadapt.out"
threads: 24
shell:
"""
cutadapt -j 24 \
-a TSO_R1=CCCATGTACTCTGCGTTGATACCACTGCTT \
-a TruSeq_R1=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA \
-n 4 \
-m 75 \
-l 300 \
--nextseq-trim=20 \
-o {output.R1} \
<(zcat {input})
"""

rule starsolo_R1only:
input:
R1 = "{results}/cutadapt/{sample}_trimmed.R1.fastq.gz"
output:
"{results}/{sample}/{sample}_R1_Aligned.sortedByCoord.out.bam"
params:
chemistry = _get_chem_version_R1,
extra_args = _get_extra_args,
out_dir = "{results}/{sample}/{sample}_R1_",
job_name = "star_R1",
memory =
"select[mem>48 && hname!=compute02 && hname!=compute08 && hname!=compute19 && hname!=compute20] rusage[mem=4
8]"
log: "{results}/logs/{sample}_star_R1.out"
resources:
total_impact = 5
threads: 14
shell:
"""
{STAR} --soloType CB_UMI_Simple \
--soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts \
--soloUMIfiltering MultiGeneUMI \
--soloUMIdedup 1MM_Directional \
--readFilesCommand gunzip -c \
--runThreadN 12 \
--soloCBwhitelist {params.chemistry} \
--soloBarcodeMate 1 \
--outFilterMultimapNmax 1 \
--outFilterMismatchNmax 999 \
--outFilterMismatchNoverReadLmax 0.2 \
--genomeDir {STAR_INDEX} \
--soloFeatures Gene \
--alignMatesGapMax 100000 \
--alignIntronMax 100000 \
--soloCellFilter None {params.extra_args} \
--outFileNamePrefix {params.out_dir} \
--outSAMtype BAM SortedByCoordinate \
--limitBAMsortRAM 48000000000 \
--outSAMattributes NH HI nM AS CR UR CB UB GX GN sS sQ sM \
--readFilesIn {input.R1}

rm -rf {params.out_dir}Solo.out
"""

rule filterR1only:
input:
"{results}/{sample}/{sample}_R1_Aligned.sortedByCoord.out.bam"
output:
temp("{results}/temp/{sample}_R1.bam")
params:
job_name = "filter",
length = _get_chem_length_R1,
temp = "{results}/temp/{sample}_R1_filter.bam",
temp2 = "{results}/temp/{sample}_R1_filter2.bam",
final = "{results}/counts/{sample}_R1_counts.tsv.gz",
memory = "select[mem>8] rusage[mem=8]" # LSF format; change as needed
log:
"{results}/logs/filter_{sample}_R1.txt"
threads:
4
shell:
"""
samtools view -h {input} | grep -v 'CB:Z:-\|UB:Z:-' | samtools view -b - > {params.temp}
set +e
python3 inst/scripts/filter_bam_correct.py -i {params.temp} -o {params.temp2} -l {params.length} -s -c 20
exitcode=$?
if [ $exitcode -eq 1 ]
then
touch {params.final}
fi
samtools sort {params.temp2} -o {output}
# mv {params.temp} {output}
samtools index {output}
rm -rf {params.temp}
rm -rf {params.temp2}
"""

rule assign_sites_read1only:
input:
"{results}/temp/{sample}_R1.bam"
output:
bam = temp("{results}/temp/{sample}_R1_assigned.bam")
params:
job_name = "assign_sites_read1",
memory = "select[mem>8] rusage[mem=8]", # LSF format; change as needed
out = "{results}/temp/{sample}_R1_assigned",
out_bam = "{results}/temp/{sample}_R1.bam.featureCounts.bam"
log:
"{results}/logs/assign_sites_{sample}_R1.txt"
threads:
12
shell:
"""
if echo {POLYA_SITES} | tr '[:upper:]' '[:lower:]' | grep -q -e "\.saf$" -e "\.saf.gz$"; then
polyaformat='SAF'; else
polyaformat='GTF';
fi

featureCounts \
-s 2 -Q 10 -O \
--read2pos 5 \
-o {params.out} \
-F $polyaformat \
-a {POLYA_SITES} \
-R BAM \
-T {threads} \
{input} \
2> {log}

samtools sort \
{params.out_bam} \
-o {output.bam}

samtools index {output.bam}
rm -rf {params.out_bam}
"""

rule bedR1only:
input:
"{results}/temp/{sample}_R1.bam"
output:
"{results}/bed/{sample}_R1.bed.gz"
params:
job_name = "bed",
dedup = "{results}/temp/{sample}_R1_dedup.bam",
r = "{results}/temp/{sample}_R1_r.bam",
memory = "select[mem>48] rusage[mem=48]" # LSF format; change as needed
log:
"{results}/logs/bed_{sample}_R1.txt"
threads:
4
shell:
"""
umi_tools dedup \
--extract-umi-method=tag \
--umi-tag=UB \
--per-cell \
--cell-tag=CB \
--paired \
--method=unique \
-I {input} \
-S {params.dedup} \
> {log}
samtools view -F 4 {params.dedup} -b > {params.r}
bedtools genomecov -ibam {params.r} -5 -dz | awk 'BEGIN {{ OFS = "\t" }} {{ print $1, $2 - 1, $2, $3 }}' - | g
zip > {output}
rm -rf {params.r}
"""

0 comments on commit ec192bd

Please sign in to comment.