Skip to content

Commit

Permalink
pipeline for RNA-Seq data
Browse files Browse the repository at this point in the history
  • Loading branch information
karleg authored Dec 14, 2023
1 parent 3c50aa5 commit e59203f
Show file tree
Hide file tree
Showing 5 changed files with 310 additions and 0 deletions.
129 changes: 129 additions & 0 deletions scripts/pipeline/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import pandas as pd
import os
import tempfile
from snakemake.shell import shell
from pathlib import Path
import numpy as np
import re

configfile: "/projects/chesler-lab/jcpg/snakemake/config.yaml"

sample_table = pd.read_table(config['samples_table'], sep='\t')
sample_table = sample_table.drop_duplicates(subset='srr', keep='first', inplace=False)
sample_table = sample_table.dropna()
sample_table.set_index('srr',inplace=True)
srp=sample_table.iloc[1,1]
cohort=sample_table.iloc[1,0]
dir=config["fastq_dir"]
samples_pe=sample_table[sample_table['isPaired']=='PAIRED'].index.values
biosamples=sample_table.BioSample.unique()

rule all:
input:
expand("{dir}/rsem/pe/{samples_pe}/",samples_pe=samples_pe,dir=config["fastq_dir"])


rule get_fastq_pe:
output:
down=expand("{dir}/{samples_pe}_{num}.fastq",dir=config["fastq_dir"],num=[1,2],samples_pe=samples_pe)
params:
outdir=config["fastq_dir"],
samples=expand("{samples}",samples=samples_pe)
threads: 32
run:
for s in samples_pe:
try:
shell("prefetch --max-size 500G "+s)
shell("fasterq-dump -t tmp/ --split-files --threads {threads} --outdir {params.outdir}/ "+s)
# shell("rm -r "+s)
except:
print('retrying '+s)

rule fastp_pe:
input:
i=expand("{dir}/{{samples_pe}}_1.fastq",dir=config["fastq_dir"]),
I=expand("{dir}/{{samples_pe}}_2.fastq",dir=config["fastq_dir"])
output:
o=expand("{dir}/trimmed/{{samples_pe}}_1.fastq",dir=config["fastq_dir"]),
O=expand("{dir}/trimmed/{{samples_pe}}_2.fastq",dir=config["fastq_dir"])
log:
expand("logs/fastp/{{samples_pe}}.log")
params:
# list of trimmers (see manual)
# optional parameters
threads:
32
run:
shell("fastp -i {input.i} -I {input.I} -o {output.o} -O {output.O}")
# shell("rm {input.i}")
# shell("rm {input.I}")



rule prepare_rsem_reference:
input:
gtf = expand("{gtf}",gtf=config["gtf"]),
genome = expand("{genome}",genome=config["genome"])
output:
directory(expand("{rsem_ref_path}",genome=config["genome"],rsem_ref_path=config["rsem_ref_path"]))
log:
log=expand("logs/rsem/rsem_reference_{genome}.log",genome=config["genome"])
threads:
32
shell:
"rsem-prepare-reference --gtf {input.gtf} --num-threads {threads} --star "
"{input.genome} "
"{output}rsem"


rule rsem_calculate_expression_pe:
input:
fq1 = expand("{dir}/trimmed/{samples_pe}_1.fastq",dir=config["fastq_dir"],samples_pe=samples_pe),
fq2 = expand("{dir}/trimmed/{samples_pe}_2.fastq",dir=config["fastq_dir"],samples_pe=samples_pe),
ref = expand("{rsem_ref_path}/",rsem_ref_path=config["rsem_ref_path"])
output:
samp = expand("{dir}/rsem/pe/{samples_pe}/",samples_pe=samples_pe,dir=config["fastq_dir"])
log:
log=expand("logs/rsem/rsem_expression_{samples_pe}.log",samples_pe=samples_pe)
threads:
32
run:
shell('mkdir -p '+dir+'/rsem/pe/')
for x in input.fq1:
shell('touch '+dir+'/rsem/pe/'+ os.path.basename(Path(re.sub('_1','',str(x))).with_suffix('')))
for val in sample_table.BioSample.unique():
if os.path.exists(dir+'/rsem/pe/'+val+'/rsem.isoforms.results'):
continue
fastq_files=sample_table.index.values[sample_table['BioSample']==val]
fastq_string1=''
fastq_string2=''
wait_next=False
shell('echo next > '+dir+'/rsem/pe/'+ os.path.basename(Path(re.sub('_1','',str(input.fq1))).with_suffix('')))
for file in fastq_files:
if os.path.exists(dir+'/trimmed/'+file+'_1.fastq'):
fastq_string1=fastq_string1+dir+'/trimmed/'+file+'_1.fastq '
fastq_string2=fastq_string2+dir+'/trimmed/'+file+'_2.fastq '
else:
wait_next=True
break
if wait_next:
break
shell('cat '+fastq_string1+' > '+dir+'/trimmed/'+val+'_1.fastq')
shell('cat '+fastq_string2+' > '+dir+'/trimmed/'+val+'_2.fastq')
shell('mkdir '+dir+'/rsem/pe/'+val)
shell("rsem-calculate-expression --star --paired-end --no-bam-output --num-threads {threads} "+dir+"/trimmed/"+val+"_1.fastq "+dir+"/trimmed/"+val+"_2.fastq"+ " {input.ref}/rsem "+dir+'/rsem/pe/'+val+"/rsem")

rule run_hbadeals:
threads: 32
input:
samp = expand("{dir}/rsem/pe/{samples_pe}/",dir=config["fastq_dir"],samples_pe=samples_pe)
output:
'/projects/chesler-lab/jcpg/snakemake/'+str(srp)+'_'+str(cohort)+'.txt'
params:
cohort=str(cohort),
fastq_dir=config["fastq_dir"]
run:
shell("Rscript /projects/chesler-lab/jcpg/snakemake/run_hba_deals.R /projects/chesler-lab/jcpg/snakemake/case_control_c{params.cohort}.tsv {params.cohort} {params.fastq_dir}/rsem/pe/")
shell("rm -f "+dir+"/trimmed/*.fastq")


36 changes: 36 additions & 0 deletions scripts/pipeline/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
fastq_dir:
./

rsem_ref_path:
/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/human_genome/

#rsem_ref_path:
#/projects/chesler-lab/jcpg/snakemake/genomes/mouse/rsem_ref

#genome:
# /projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/GRCm39.genome.fa

#gtf:
#/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/gencode.vM26.chr_patch_hapl_scaff.annotation.gtf

#rsem_ref_path:
#/fastscratch/genomes/C57BL/rsem_ref


genome:
/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/GRCh38_r91.all.fa

gtf:
/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/Homo_sapiens.GRCh38.109.gtf

#genome:
#/projects/chesler-lab/jcpg/snakemake/genomes/mouse/GRCm39.genome.fa

#genome:
# /projects/compsci/omics_share/mouse/GRCm38/genome/sequence/imputed/C57BL_10J.with_unplac_unloc.fa

#gtf:
#/projects/chesler-lab/jcpg/snakemake/genomes/mouse/gencode.vM30.annotation.gtf

#gtf:
#/projects/compsci/omics_share/mouse/GRCm38/transcriptome/annotation/imputed/C57BL_10J.with_unplac_unloc.gtf
96 changes: 96 additions & 0 deletions scripts/pipeline/run_hba_deals.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
library(hbadeals)
library(edgeR)

#args[1] - name and path of samples table
#args[2] - cohort number (which cases should be compared against which controls)
#args[3] - input directory where the RSEM files are

args=commandArgs(trailingOnly = TRUE)

sample.table=read.table(args[1],header=TRUE,sep='\t')

#Keep paired end samples:
#sample.table=sample.table[sample.table$isPaired==args[4],]

#Keep only the control and samples in this SRP that belong to the given cohort
sample.table=sample.table[sample.table$status=='control' | sample.table$cohort==as.integer(args[2]),]


countsData=NULL
labels=c()

for (srr in unique(sample.table$BioSample[sample.table$status=='control']))
{

if (is.null(countsData))
{
countsData=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE)[c(2,1)]
}
next.file=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE)
countsData=cbind(countsData,next.file$expected_count)

labels=c(labels,1)
}

for (srr in unique(sample.table$BioSample[sample.table$status!='control']))
{
next.file=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE)
countsData=cbind(countsData,next.file$expected_count)

labels=c(labels,2)
}


#labels=labels[colSums(countsData[,-c(1,2)])>=10^6]

#countsData=countsData[,c(1,2,2+which(colSums(countsData[,-c(1,2)])>=10^6))]

if (sum(colSums(countsData[,-c(1,2)])<10^6)>=1)
print("Warning: Sum of sample counts less than 10^6")

tmp.countsData=countsData

if (ncol(countsData)-2<=9){
countsData=countsData[rowSums(countsData[,-c(1,2)]>0)>=ncol(countsData)-2,]
}else{
countsData=countsData[rowSums(countsData[,2+which(labels==1)]>0)>=0.9*sum(labels==1),]
countsData=countsData[rowSums(countsData[,2+which(labels==2)]>0)>=0.9*sum(labels==2),]
}


num.iso=unlist(lapply(countsData$gene_id,function(x){sum(countsData$gene_id %in% x)}))

countsData=countsData[num.iso>1,]


if (length(unique(countsData[,1]))<100)
{
countsData=tmp.countsData
thresh=quantile(colMeans(countsData[,-c(1,2)]>0),0.5)
labels=labels[colMeans(countsData[,-c(1,2)]>0)>thresh]
if (sum(labels==1)<3 || sum(labels==2)<3) {print('Not enough expressed isoforms');quit('no')}
countsData=countsData[,c(1,2,2+which(colMeans(countsData[,-c(1,2)]>0)>thresh))]
if (ncol(countsData)-2<=9){
countsData=countsData[rowSums(countsData[,-c(1,2)]>0)>=ncol(countsData)-2,]
}else{
countsData=countsData[rowSums(countsData[,2+which(labels==1)]>0)>=0.9*sum(labels==1),]
countsData=countsData[rowSums(countsData[,2+which(labels==2)]>0)>=0.9*sum(labels==2),]
}
num.iso=unlist(lapply(countsData$gene_id,function(x){sum(countsData$gene_id %in% x)}))
countsData=countsData[num.iso>1,]
if (length(unique(countsData[,1]))<100) {print('Not enough expressed isoforms');quit('no')}
}

rm(tmp.countsData)

res=hbadeals(countsData = countsData,labels = labels,n.cores = 45,isoform.level=TRUE,mcmc.iter=400000,mcmc.warmup=10000,mtc=TRUE,lib.size=colSums(countsData[,-c(1,2)])*calcNormFactors(as.matrix(countsData[,-c(1,2)]),method='TMM'),hierarchy='auto')


write.table(res,paste0('/projects/chesler-lab/jcpg/snakemake/',as.character(sample.table$srp)[sample.table$cohort==as.integer(args[2])][1]
,'_',args[2],'.txt'),
sep='\t',quote = F,col.names = T,row.names = F)





45 changes: 45 additions & 0 deletions scripts/pipeline/run_pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
#SBATCH --time=72:00:00
#SBATCH --job-name="pps"
#SBATCH -o ppr-%j.out
#SBATCH -e ppr-%j.err
#SBATCH [email protected]
#SBATCH --mail-type=BEGIN,END,FAIL
#SBATCH -N 1
#SBATCH -n 33
#SBATCH --mem-per-cpu=16G
#SBATCH --array=21

mkdir /flashscratch/fastq_$SLURM_ARRAY_TASK_ID

cd /flashscratch/fastq_$SLURM_ARRAY_TASK_ID

module load singularity

cp $SLURM_SUBMIT_DIR/Snakefile .

singularity exec /projects/chesler-lab/jcpg/snakemake/sing.sif bash $SLURM_SUBMIT_DIR/run_snakemake.sh $SLURM_SUBMIT_DIR/case_control_c${SLURM_ARRAY_TASK_ID}.tsv /flashscratch/fastq_$SLURM_ARRAY_TASK_ID
























4 changes: 4 additions & 0 deletions scripts/pipeline/run_snakemake.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
source activate rsem-pipeline

snakemake --resources mem_mb=16000 --until run_hbadeals --cores 32 --config samples_table=$1 fastq_dir=$2

0 comments on commit e59203f

Please sign in to comment.