pipeline for RNA-Seq data

TheJacksonLaboratory · Dec 14, 2023 · e59203f · e59203f
1 parent 3c50aa5
commit e59203f
Show file tree

Hide file tree

Showing 5 changed files with 310 additions and 0 deletions.
diff --git a/scripts/pipeline/Snakefile b/scripts/pipeline/Snakefile
@@ -0,0 +1,129 @@
+import pandas as pd
+import os
+import tempfile
+from snakemake.shell import shell
+from pathlib import Path
+import numpy as np
+import re
+
+configfile: "/projects/chesler-lab/jcpg/snakemake/config.yaml"
+
+sample_table    = pd.read_table(config['samples_table'], sep='\t')
+sample_table    = sample_table.drop_duplicates(subset='srr', keep='first', inplace=False)
+sample_table    = sample_table.dropna()
+sample_table.set_index('srr',inplace=True)
+srp=sample_table.iloc[1,1]
+cohort=sample_table.iloc[1,0]
+dir=config["fastq_dir"]
+samples_pe=sample_table[sample_table['isPaired']=='PAIRED'].index.values
+biosamples=sample_table.BioSample.unique()
+
+rule all:
+    input:
+         expand("{dir}/rsem/pe/{samples_pe}/",samples_pe=samples_pe,dir=config["fastq_dir"])
+
+
+rule get_fastq_pe:
+    output:
+        down=expand("{dir}/{samples_pe}_{num}.fastq",dir=config["fastq_dir"],num=[1,2],samples_pe=samples_pe)
+    params:
+        outdir=config["fastq_dir"],
+        samples=expand("{samples}",samples=samples_pe)
+    threads: 32
+    run:
+        for s in samples_pe:
+            try:
+              shell("prefetch  --max-size 500G  "+s)
+              shell("fasterq-dump -t tmp/  --split-files --threads {threads} --outdir {params.outdir}/ "+s)
+            #  shell("rm -r "+s)
+            except:
+              print('retrying '+s)
+
+rule fastp_pe:
+    input:
+        i=expand("{dir}/{{samples_pe}}_1.fastq",dir=config["fastq_dir"]),
+        I=expand("{dir}/{{samples_pe}}_2.fastq",dir=config["fastq_dir"])
+    output:
+        o=expand("{dir}/trimmed/{{samples_pe}}_1.fastq",dir=config["fastq_dir"]),
+        O=expand("{dir}/trimmed/{{samples_pe}}_2.fastq",dir=config["fastq_dir"])
+    log:
+        expand("logs/fastp/{{samples_pe}}.log")
+    params:
+        # list of trimmers (see manual)
+        # optional parameters
+    threads:
+        32
+    run:
+        shell("fastp -i {input.i} -I {input.I} -o {output.o} -O {output.O}")
+       # shell("rm {input.i}")
+       # shell("rm {input.I}")
+
+
+
+rule prepare_rsem_reference:
+        input:
+           gtf = expand("{gtf}",gtf=config["gtf"]),
+           genome = expand("{genome}",genome=config["genome"]) 
+        output:
+           directory(expand("{rsem_ref_path}",genome=config["genome"],rsem_ref_path=config["rsem_ref_path"]))
+        log:
+           log=expand("logs/rsem/rsem_reference_{genome}.log",genome=config["genome"])
+        threads:
+           32
+        shell:
+           "rsem-prepare-reference  --gtf {input.gtf} --num-threads {threads} --star "
+    	   "{input.genome} "
+  	   "{output}rsem"
+
+
+rule rsem_calculate_expression_pe:
+        input:
+           fq1 = expand("{dir}/trimmed/{samples_pe}_1.fastq",dir=config["fastq_dir"],samples_pe=samples_pe),
+           fq2 = expand("{dir}/trimmed/{samples_pe}_2.fastq",dir=config["fastq_dir"],samples_pe=samples_pe),
+           ref = expand("{rsem_ref_path}/",rsem_ref_path=config["rsem_ref_path"]) 
+        output:
+           samp = expand("{dir}/rsem/pe/{samples_pe}/",samples_pe=samples_pe,dir=config["fastq_dir"])
+        log:
+           log=expand("logs/rsem/rsem_expression_{samples_pe}.log",samples_pe=samples_pe)
+        threads:
+           32
+        run:
+           shell('mkdir -p '+dir+'/rsem/pe/')
+           for x in input.fq1:
+             shell('touch '+dir+'/rsem/pe/'+ os.path.basename(Path(re.sub('_1','',str(x))).with_suffix('')))
+           for val in sample_table.BioSample.unique():
+               if os.path.exists(dir+'/rsem/pe/'+val+'/rsem.isoforms.results'):
+                    continue
+               fastq_files=sample_table.index.values[sample_table['BioSample']==val]
+               fastq_string1=''
+               fastq_string2=''
+               wait_next=False
+               shell('echo next > '+dir+'/rsem/pe/'+ os.path.basename(Path(re.sub('_1','',str(input.fq1))).with_suffix('')))
+               for file in fastq_files:
+                    if os.path.exists(dir+'/trimmed/'+file+'_1.fastq'):
+                    	fastq_string1=fastq_string1+dir+'/trimmed/'+file+'_1.fastq '
+                    	fastq_string2=fastq_string2+dir+'/trimmed/'+file+'_2.fastq '
+                    else:
+                        wait_next=True
+                        break
+               if wait_next:
+                        break
+               shell('cat '+fastq_string1+' > '+dir+'/trimmed/'+val+'_1.fastq')
+               shell('cat '+fastq_string2+' > '+dir+'/trimmed/'+val+'_2.fastq')
+               shell('mkdir '+dir+'/rsem/pe/'+val)
+               shell("rsem-calculate-expression --star --paired-end --no-bam-output --num-threads {threads} "+dir+"/trimmed/"+val+"_1.fastq "+dir+"/trimmed/"+val+"_2.fastq"+ " {input.ref}/rsem "+dir+'/rsem/pe/'+val+"/rsem")
+
+rule run_hbadeals:
+        threads: 32
+        input:
+          samp = expand("{dir}/rsem/pe/{samples_pe}/",dir=config["fastq_dir"],samples_pe=samples_pe)
+        output:
+          '/projects/chesler-lab/jcpg/snakemake/'+str(srp)+'_'+str(cohort)+'.txt'
+        params:
+          cohort=str(cohort),
+          fastq_dir=config["fastq_dir"]
+        run:
+          shell("Rscript  /projects/chesler-lab/jcpg/snakemake/run_hba_deals.R  /projects/chesler-lab/jcpg/snakemake/case_control_c{params.cohort}.tsv {params.cohort} {params.fastq_dir}/rsem/pe/")
+          shell("rm -f "+dir+"/trimmed/*.fastq")
+
+
diff --git a/scripts/pipeline/config.yaml b/scripts/pipeline/config.yaml
@@ -0,0 +1,36 @@
+fastq_dir:
+ ./
+
+rsem_ref_path:
+ /projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/human_genome/
+
+#rsem_ref_path:
+ #/projects/chesler-lab/jcpg/snakemake/genomes/mouse/rsem_ref
+
+#genome:
+# /projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/GRCm39.genome.fa
+
+#gtf:
+ #/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/gencode.vM26.chr_patch_hapl_scaff.annotation.gtf
+
+#rsem_ref_path:
+ #/fastscratch/genomes/C57BL/rsem_ref
+
+
+genome:
+ /projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/GRCh38_r91.all.fa 
+
+gtf:
+ /projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/Homo_sapiens.GRCh38.109.gtf
+
+#genome:
+ #/projects/chesler-lab/jcpg/snakemake/genomes/mouse/GRCm39.genome.fa
+
+#genome:
+# /projects/compsci/omics_share/mouse/GRCm38/genome/sequence/imputed/C57BL_10J.with_unplac_unloc.fa
+
+#gtf:
+ #/projects/chesler-lab/jcpg/snakemake/genomes/mouse/gencode.vM30.annotation.gtf
+
+#gtf:
+ #/projects/compsci/omics_share/mouse/GRCm38/transcriptome/annotation/imputed/C57BL_10J.with_unplac_unloc.gtf
diff --git a/scripts/pipeline/run_hba_deals.R b/scripts/pipeline/run_hba_deals.R
@@ -0,0 +1,96 @@
+library(hbadeals)
+library(edgeR)
+
+#args[1] - name and path of samples table
+#args[2] - cohort number (which cases should be compared against which controls)
+#args[3] - input directory where the RSEM files are
+
+args=commandArgs(trailingOnly = TRUE)
+
+sample.table=read.table(args[1],header=TRUE,sep='\t')
+
+#Keep paired end samples:
+#sample.table=sample.table[sample.table$isPaired==args[4],]
+
+#Keep only the control and samples in this SRP that belong to the given cohort
+sample.table=sample.table[sample.table$status=='control' | sample.table$cohort==as.integer(args[2]),]
+
+
+countsData=NULL
+labels=c()
+
+for (srr in unique(sample.table$BioSample[sample.table$status=='control']))
+{
+
+  if (is.null(countsData))
+  {
+    countsData=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE)[c(2,1)]
+  }
+   next.file=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE)
+   countsData=cbind(countsData,next.file$expected_count)
+
+   labels=c(labels,1)  
+}
+
+for (srr in unique(sample.table$BioSample[sample.table$status!='control']))
+{
+   next.file=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE)
+   countsData=cbind(countsData,next.file$expected_count)
+
+   labels=c(labels,2)
+}
+
+
+#labels=labels[colSums(countsData[,-c(1,2)])>=10^6]
+
+#countsData=countsData[,c(1,2,2+which(colSums(countsData[,-c(1,2)])>=10^6))]
+
+if (sum(colSums(countsData[,-c(1,2)])<10^6)>=1)
+	print("Warning: Sum of sample counts less than 10^6")
+
+tmp.countsData=countsData
+
+if (ncol(countsData)-2<=9){
+  countsData=countsData[rowSums(countsData[,-c(1,2)]>0)>=ncol(countsData)-2,]
+}else{
+  countsData=countsData[rowSums(countsData[,2+which(labels==1)]>0)>=0.9*sum(labels==1),]
+  countsData=countsData[rowSums(countsData[,2+which(labels==2)]>0)>=0.9*sum(labels==2),]
+}
+
+
+num.iso=unlist(lapply(countsData$gene_id,function(x){sum(countsData$gene_id %in% x)}))
+
+countsData=countsData[num.iso>1,]
+
+
+if (length(unique(countsData[,1]))<100)
+{
+  countsData=tmp.countsData
+  thresh=quantile(colMeans(countsData[,-c(1,2)]>0),0.5)
+  labels=labels[colMeans(countsData[,-c(1,2)]>0)>thresh]
+  if (sum(labels==1)<3 || sum(labels==2)<3)  {print('Not enough expressed isoforms');quit('no')}
+  countsData=countsData[,c(1,2,2+which(colMeans(countsData[,-c(1,2)]>0)>thresh))]
+  if (ncol(countsData)-2<=9){
+ 	 countsData=countsData[rowSums(countsData[,-c(1,2)]>0)>=ncol(countsData)-2,]
+  }else{
+  	 countsData=countsData[rowSums(countsData[,2+which(labels==1)]>0)>=0.9*sum(labels==1),]
+  	 countsData=countsData[rowSums(countsData[,2+which(labels==2)]>0)>=0.9*sum(labels==2),]
+  }
+  num.iso=unlist(lapply(countsData$gene_id,function(x){sum(countsData$gene_id %in% x)}))
+  countsData=countsData[num.iso>1,]
+  if (length(unique(countsData[,1]))<100) {print('Not enough expressed isoforms');quit('no')}  
+}
+
+rm(tmp.countsData)
+
+res=hbadeals(countsData = countsData,labels = labels,n.cores = 45,isoform.level=TRUE,mcmc.iter=400000,mcmc.warmup=10000,mtc=TRUE,lib.size=colSums(countsData[,-c(1,2)])*calcNormFactors(as.matrix(countsData[,-c(1,2)]),method='TMM'),hierarchy='auto')
+
+
+write.table(res,paste0('/projects/chesler-lab/jcpg/snakemake/',as.character(sample.table$srp)[sample.table$cohort==as.integer(args[2])][1]
+  ,'_',args[2],'.txt'),
+            sep='\t',quote = F,col.names = T,row.names = F)
+
+
+
+
+
diff --git a/scripts/pipeline/run_pipeline.sh b/scripts/pipeline/run_pipeline.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+#SBATCH --time=72:00:00
+#SBATCH --job-name="pps"
+#SBATCH -o ppr-%j.out
+#SBATCH -e ppr-%j.err 
+#SBATCH [email protected]
+#SBATCH --mail-type=BEGIN,END,FAIL
+#SBATCH -N 1
+#SBATCH -n 33
+#SBATCH --mem-per-cpu=16G
+#SBATCH --array=21
+
+mkdir /flashscratch/fastq_$SLURM_ARRAY_TASK_ID
+
+cd /flashscratch/fastq_$SLURM_ARRAY_TASK_ID
+
+module load singularity
+
+cp $SLURM_SUBMIT_DIR/Snakefile .
+
+singularity exec /projects/chesler-lab/jcpg/snakemake/sing.sif bash $SLURM_SUBMIT_DIR/run_snakemake.sh $SLURM_SUBMIT_DIR/case_control_c${SLURM_ARRAY_TASK_ID}.tsv /flashscratch/fastq_$SLURM_ARRAY_TASK_ID
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/scripts/pipeline/run_snakemake.sh b/scripts/pipeline/run_snakemake.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+source activate rsem-pipeline
+
+snakemake   --resources mem_mb=16000 --until run_hbadeals --cores 32  --config samples_table=$1 fastq_dir=$2