From e59203fadeb11648320650c2fbd81e6d41342268 Mon Sep 17 00:00:00 2001 From: karleg <32402869+karleg@users.noreply.github.com> Date: Thu, 14 Dec 2023 16:28:06 -0500 Subject: [PATCH] pipeline for RNA-Seq data --- scripts/pipeline/Snakefile | 129 ++++++++++++++++++++++++++++++ scripts/pipeline/config.yaml | 36 +++++++++ scripts/pipeline/run_hba_deals.R | 96 ++++++++++++++++++++++ scripts/pipeline/run_pipeline.sh | 45 +++++++++++ scripts/pipeline/run_snakemake.sh | 4 + 5 files changed, 310 insertions(+) create mode 100644 scripts/pipeline/Snakefile create mode 100644 scripts/pipeline/config.yaml create mode 100644 scripts/pipeline/run_hba_deals.R create mode 100644 scripts/pipeline/run_pipeline.sh create mode 100644 scripts/pipeline/run_snakemake.sh diff --git a/scripts/pipeline/Snakefile b/scripts/pipeline/Snakefile new file mode 100644 index 0000000..15e35bf --- /dev/null +++ b/scripts/pipeline/Snakefile @@ -0,0 +1,129 @@ +import pandas as pd +import os +import tempfile +from snakemake.shell import shell +from pathlib import Path +import numpy as np +import re + +configfile: "/projects/chesler-lab/jcpg/snakemake/config.yaml" + +sample_table = pd.read_table(config['samples_table'], sep='\t') +sample_table = sample_table.drop_duplicates(subset='srr', keep='first', inplace=False) +sample_table = sample_table.dropna() +sample_table.set_index('srr',inplace=True) +srp=sample_table.iloc[1,1] +cohort=sample_table.iloc[1,0] +dir=config["fastq_dir"] +samples_pe=sample_table[sample_table['isPaired']=='PAIRED'].index.values +biosamples=sample_table.BioSample.unique() + +rule all: + input: + expand("{dir}/rsem/pe/{samples_pe}/",samples_pe=samples_pe,dir=config["fastq_dir"]) + + +rule get_fastq_pe: + output: + down=expand("{dir}/{samples_pe}_{num}.fastq",dir=config["fastq_dir"],num=[1,2],samples_pe=samples_pe) + params: + outdir=config["fastq_dir"], + samples=expand("{samples}",samples=samples_pe) + threads: 32 + run: + for s in samples_pe: + try: + shell("prefetch --max-size 500G "+s) + shell("fasterq-dump -t tmp/ --split-files --threads {threads} --outdir {params.outdir}/ "+s) + # shell("rm -r "+s) + except: + print('retrying '+s) + +rule fastp_pe: + input: + i=expand("{dir}/{{samples_pe}}_1.fastq",dir=config["fastq_dir"]), + I=expand("{dir}/{{samples_pe}}_2.fastq",dir=config["fastq_dir"]) + output: + o=expand("{dir}/trimmed/{{samples_pe}}_1.fastq",dir=config["fastq_dir"]), + O=expand("{dir}/trimmed/{{samples_pe}}_2.fastq",dir=config["fastq_dir"]) + log: + expand("logs/fastp/{{samples_pe}}.log") + params: + # list of trimmers (see manual) + # optional parameters + threads: + 32 + run: + shell("fastp -i {input.i} -I {input.I} -o {output.o} -O {output.O}") + # shell("rm {input.i}") + # shell("rm {input.I}") + + + +rule prepare_rsem_reference: + input: + gtf = expand("{gtf}",gtf=config["gtf"]), + genome = expand("{genome}",genome=config["genome"]) + output: + directory(expand("{rsem_ref_path}",genome=config["genome"],rsem_ref_path=config["rsem_ref_path"])) + log: + log=expand("logs/rsem/rsem_reference_{genome}.log",genome=config["genome"]) + threads: + 32 + shell: + "rsem-prepare-reference --gtf {input.gtf} --num-threads {threads} --star " + "{input.genome} " + "{output}rsem" + + +rule rsem_calculate_expression_pe: + input: + fq1 = expand("{dir}/trimmed/{samples_pe}_1.fastq",dir=config["fastq_dir"],samples_pe=samples_pe), + fq2 = expand("{dir}/trimmed/{samples_pe}_2.fastq",dir=config["fastq_dir"],samples_pe=samples_pe), + ref = expand("{rsem_ref_path}/",rsem_ref_path=config["rsem_ref_path"]) + output: + samp = expand("{dir}/rsem/pe/{samples_pe}/",samples_pe=samples_pe,dir=config["fastq_dir"]) + log: + log=expand("logs/rsem/rsem_expression_{samples_pe}.log",samples_pe=samples_pe) + threads: + 32 + run: + shell('mkdir -p '+dir+'/rsem/pe/') + for x in input.fq1: + shell('touch '+dir+'/rsem/pe/'+ os.path.basename(Path(re.sub('_1','',str(x))).with_suffix(''))) + for val in sample_table.BioSample.unique(): + if os.path.exists(dir+'/rsem/pe/'+val+'/rsem.isoforms.results'): + continue + fastq_files=sample_table.index.values[sample_table['BioSample']==val] + fastq_string1='' + fastq_string2='' + wait_next=False + shell('echo next > '+dir+'/rsem/pe/'+ os.path.basename(Path(re.sub('_1','',str(input.fq1))).with_suffix(''))) + for file in fastq_files: + if os.path.exists(dir+'/trimmed/'+file+'_1.fastq'): + fastq_string1=fastq_string1+dir+'/trimmed/'+file+'_1.fastq ' + fastq_string2=fastq_string2+dir+'/trimmed/'+file+'_2.fastq ' + else: + wait_next=True + break + if wait_next: + break + shell('cat '+fastq_string1+' > '+dir+'/trimmed/'+val+'_1.fastq') + shell('cat '+fastq_string2+' > '+dir+'/trimmed/'+val+'_2.fastq') + shell('mkdir '+dir+'/rsem/pe/'+val) + shell("rsem-calculate-expression --star --paired-end --no-bam-output --num-threads {threads} "+dir+"/trimmed/"+val+"_1.fastq "+dir+"/trimmed/"+val+"_2.fastq"+ " {input.ref}/rsem "+dir+'/rsem/pe/'+val+"/rsem") + +rule run_hbadeals: + threads: 32 + input: + samp = expand("{dir}/rsem/pe/{samples_pe}/",dir=config["fastq_dir"],samples_pe=samples_pe) + output: + '/projects/chesler-lab/jcpg/snakemake/'+str(srp)+'_'+str(cohort)+'.txt' + params: + cohort=str(cohort), + fastq_dir=config["fastq_dir"] + run: + shell("Rscript /projects/chesler-lab/jcpg/snakemake/run_hba_deals.R /projects/chesler-lab/jcpg/snakemake/case_control_c{params.cohort}.tsv {params.cohort} {params.fastq_dir}/rsem/pe/") + shell("rm -f "+dir+"/trimmed/*.fastq") + + diff --git a/scripts/pipeline/config.yaml b/scripts/pipeline/config.yaml new file mode 100644 index 0000000..61906fc --- /dev/null +++ b/scripts/pipeline/config.yaml @@ -0,0 +1,36 @@ +fastq_dir: + ./ + +rsem_ref_path: + /projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/human_genome/ + +#rsem_ref_path: + #/projects/chesler-lab/jcpg/snakemake/genomes/mouse/rsem_ref + +#genome: +# /projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/GRCm39.genome.fa + +#gtf: + #/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/gencode.vM26.chr_patch_hapl_scaff.annotation.gtf + +#rsem_ref_path: + #/fastscratch/genomes/C57BL/rsem_ref + + +genome: + /projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/GRCh38_r91.all.fa + +gtf: + /projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/Homo_sapiens.GRCh38.109.gtf + +#genome: + #/projects/chesler-lab/jcpg/snakemake/genomes/mouse/GRCm39.genome.fa + +#genome: +# /projects/compsci/omics_share/mouse/GRCm38/genome/sequence/imputed/C57BL_10J.with_unplac_unloc.fa + +#gtf: + #/projects/chesler-lab/jcpg/snakemake/genomes/mouse/gencode.vM30.annotation.gtf + +#gtf: + #/projects/compsci/omics_share/mouse/GRCm38/transcriptome/annotation/imputed/C57BL_10J.with_unplac_unloc.gtf diff --git a/scripts/pipeline/run_hba_deals.R b/scripts/pipeline/run_hba_deals.R new file mode 100644 index 0000000..23fe560 --- /dev/null +++ b/scripts/pipeline/run_hba_deals.R @@ -0,0 +1,96 @@ +library(hbadeals) +library(edgeR) + +#args[1] - name and path of samples table +#args[2] - cohort number (which cases should be compared against which controls) +#args[3] - input directory where the RSEM files are + +args=commandArgs(trailingOnly = TRUE) + +sample.table=read.table(args[1],header=TRUE,sep='\t') + +#Keep paired end samples: +#sample.table=sample.table[sample.table$isPaired==args[4],] + +#Keep only the control and samples in this SRP that belong to the given cohort +sample.table=sample.table[sample.table$status=='control' | sample.table$cohort==as.integer(args[2]),] + + +countsData=NULL +labels=c() + +for (srr in unique(sample.table$BioSample[sample.table$status=='control'])) +{ + + if (is.null(countsData)) + { + countsData=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE)[c(2,1)] + } + next.file=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE) + countsData=cbind(countsData,next.file$expected_count) + + labels=c(labels,1) +} + +for (srr in unique(sample.table$BioSample[sample.table$status!='control'])) +{ + next.file=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE) + countsData=cbind(countsData,next.file$expected_count) + + labels=c(labels,2) +} + + +#labels=labels[colSums(countsData[,-c(1,2)])>=10^6] + +#countsData=countsData[,c(1,2,2+which(colSums(countsData[,-c(1,2)])>=10^6))] + +if (sum(colSums(countsData[,-c(1,2)])<10^6)>=1) + print("Warning: Sum of sample counts less than 10^6") + +tmp.countsData=countsData + +if (ncol(countsData)-2<=9){ + countsData=countsData[rowSums(countsData[,-c(1,2)]>0)>=ncol(countsData)-2,] +}else{ + countsData=countsData[rowSums(countsData[,2+which(labels==1)]>0)>=0.9*sum(labels==1),] + countsData=countsData[rowSums(countsData[,2+which(labels==2)]>0)>=0.9*sum(labels==2),] +} + + +num.iso=unlist(lapply(countsData$gene_id,function(x){sum(countsData$gene_id %in% x)})) + +countsData=countsData[num.iso>1,] + + +if (length(unique(countsData[,1]))<100) +{ + countsData=tmp.countsData + thresh=quantile(colMeans(countsData[,-c(1,2)]>0),0.5) + labels=labels[colMeans(countsData[,-c(1,2)]>0)>thresh] + if (sum(labels==1)<3 || sum(labels==2)<3) {print('Not enough expressed isoforms');quit('no')} + countsData=countsData[,c(1,2,2+which(colMeans(countsData[,-c(1,2)]>0)>thresh))] + if (ncol(countsData)-2<=9){ + countsData=countsData[rowSums(countsData[,-c(1,2)]>0)>=ncol(countsData)-2,] + }else{ + countsData=countsData[rowSums(countsData[,2+which(labels==1)]>0)>=0.9*sum(labels==1),] + countsData=countsData[rowSums(countsData[,2+which(labels==2)]>0)>=0.9*sum(labels==2),] + } + num.iso=unlist(lapply(countsData$gene_id,function(x){sum(countsData$gene_id %in% x)})) + countsData=countsData[num.iso>1,] + if (length(unique(countsData[,1]))<100) {print('Not enough expressed isoforms');quit('no')} +} + +rm(tmp.countsData) + +res=hbadeals(countsData = countsData,labels = labels,n.cores = 45,isoform.level=TRUE,mcmc.iter=400000,mcmc.warmup=10000,mtc=TRUE,lib.size=colSums(countsData[,-c(1,2)])*calcNormFactors(as.matrix(countsData[,-c(1,2)]),method='TMM'),hierarchy='auto') + + +write.table(res,paste0('/projects/chesler-lab/jcpg/snakemake/',as.character(sample.table$srp)[sample.table$cohort==as.integer(args[2])][1] + ,'_',args[2],'.txt'), + sep='\t',quote = F,col.names = T,row.names = F) + + + + + diff --git a/scripts/pipeline/run_pipeline.sh b/scripts/pipeline/run_pipeline.sh new file mode 100644 index 0000000..b245f0f --- /dev/null +++ b/scripts/pipeline/run_pipeline.sh @@ -0,0 +1,45 @@ +#!/bin/bash +#SBATCH --time=72:00:00 +#SBATCH --job-name="pps" +#SBATCH -o ppr-%j.out +#SBATCH -e ppr-%j.err +#SBATCH --mail-user=guy.karlebach@jax.org +#SBATCH --mail-type=BEGIN,END,FAIL +#SBATCH -N 1 +#SBATCH -n 33 +#SBATCH --mem-per-cpu=16G +#SBATCH --array=21 + +mkdir /flashscratch/fastq_$SLURM_ARRAY_TASK_ID + +cd /flashscratch/fastq_$SLURM_ARRAY_TASK_ID + +module load singularity + +cp $SLURM_SUBMIT_DIR/Snakefile . + +singularity exec /projects/chesler-lab/jcpg/snakemake/sing.sif bash $SLURM_SUBMIT_DIR/run_snakemake.sh $SLURM_SUBMIT_DIR/case_control_c${SLURM_ARRAY_TASK_ID}.tsv /flashscratch/fastq_$SLURM_ARRAY_TASK_ID + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/pipeline/run_snakemake.sh b/scripts/pipeline/run_snakemake.sh new file mode 100644 index 0000000..db999dc --- /dev/null +++ b/scripts/pipeline/run_snakemake.sh @@ -0,0 +1,4 @@ +#!/bin/sh +source activate rsem-pipeline + +snakemake --resources mem_mb=16000 --until run_hbadeals --cores 32 --config samples_table=$1 fastq_dir=$2