-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
310 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
import pandas as pd | ||
import os | ||
import tempfile | ||
from snakemake.shell import shell | ||
from pathlib import Path | ||
import numpy as np | ||
import re | ||
|
||
configfile: "/projects/chesler-lab/jcpg/snakemake/config.yaml" | ||
|
||
sample_table = pd.read_table(config['samples_table'], sep='\t') | ||
sample_table = sample_table.drop_duplicates(subset='srr', keep='first', inplace=False) | ||
sample_table = sample_table.dropna() | ||
sample_table.set_index('srr',inplace=True) | ||
srp=sample_table.iloc[1,1] | ||
cohort=sample_table.iloc[1,0] | ||
dir=config["fastq_dir"] | ||
samples_pe=sample_table[sample_table['isPaired']=='PAIRED'].index.values | ||
biosamples=sample_table.BioSample.unique() | ||
|
||
rule all: | ||
input: | ||
expand("{dir}/rsem/pe/{samples_pe}/",samples_pe=samples_pe,dir=config["fastq_dir"]) | ||
|
||
|
||
rule get_fastq_pe: | ||
output: | ||
down=expand("{dir}/{samples_pe}_{num}.fastq",dir=config["fastq_dir"],num=[1,2],samples_pe=samples_pe) | ||
params: | ||
outdir=config["fastq_dir"], | ||
samples=expand("{samples}",samples=samples_pe) | ||
threads: 32 | ||
run: | ||
for s in samples_pe: | ||
try: | ||
shell("prefetch --max-size 500G "+s) | ||
shell("fasterq-dump -t tmp/ --split-files --threads {threads} --outdir {params.outdir}/ "+s) | ||
# shell("rm -r "+s) | ||
except: | ||
print('retrying '+s) | ||
|
||
rule fastp_pe: | ||
input: | ||
i=expand("{dir}/{{samples_pe}}_1.fastq",dir=config["fastq_dir"]), | ||
I=expand("{dir}/{{samples_pe}}_2.fastq",dir=config["fastq_dir"]) | ||
output: | ||
o=expand("{dir}/trimmed/{{samples_pe}}_1.fastq",dir=config["fastq_dir"]), | ||
O=expand("{dir}/trimmed/{{samples_pe}}_2.fastq",dir=config["fastq_dir"]) | ||
log: | ||
expand("logs/fastp/{{samples_pe}}.log") | ||
params: | ||
# list of trimmers (see manual) | ||
# optional parameters | ||
threads: | ||
32 | ||
run: | ||
shell("fastp -i {input.i} -I {input.I} -o {output.o} -O {output.O}") | ||
# shell("rm {input.i}") | ||
# shell("rm {input.I}") | ||
|
||
|
||
|
||
rule prepare_rsem_reference: | ||
input: | ||
gtf = expand("{gtf}",gtf=config["gtf"]), | ||
genome = expand("{genome}",genome=config["genome"]) | ||
output: | ||
directory(expand("{rsem_ref_path}",genome=config["genome"],rsem_ref_path=config["rsem_ref_path"])) | ||
log: | ||
log=expand("logs/rsem/rsem_reference_{genome}.log",genome=config["genome"]) | ||
threads: | ||
32 | ||
shell: | ||
"rsem-prepare-reference --gtf {input.gtf} --num-threads {threads} --star " | ||
"{input.genome} " | ||
"{output}rsem" | ||
|
||
|
||
rule rsem_calculate_expression_pe: | ||
input: | ||
fq1 = expand("{dir}/trimmed/{samples_pe}_1.fastq",dir=config["fastq_dir"],samples_pe=samples_pe), | ||
fq2 = expand("{dir}/trimmed/{samples_pe}_2.fastq",dir=config["fastq_dir"],samples_pe=samples_pe), | ||
ref = expand("{rsem_ref_path}/",rsem_ref_path=config["rsem_ref_path"]) | ||
output: | ||
samp = expand("{dir}/rsem/pe/{samples_pe}/",samples_pe=samples_pe,dir=config["fastq_dir"]) | ||
log: | ||
log=expand("logs/rsem/rsem_expression_{samples_pe}.log",samples_pe=samples_pe) | ||
threads: | ||
32 | ||
run: | ||
shell('mkdir -p '+dir+'/rsem/pe/') | ||
for x in input.fq1: | ||
shell('touch '+dir+'/rsem/pe/'+ os.path.basename(Path(re.sub('_1','',str(x))).with_suffix(''))) | ||
for val in sample_table.BioSample.unique(): | ||
if os.path.exists(dir+'/rsem/pe/'+val+'/rsem.isoforms.results'): | ||
continue | ||
fastq_files=sample_table.index.values[sample_table['BioSample']==val] | ||
fastq_string1='' | ||
fastq_string2='' | ||
wait_next=False | ||
shell('echo next > '+dir+'/rsem/pe/'+ os.path.basename(Path(re.sub('_1','',str(input.fq1))).with_suffix(''))) | ||
for file in fastq_files: | ||
if os.path.exists(dir+'/trimmed/'+file+'_1.fastq'): | ||
fastq_string1=fastq_string1+dir+'/trimmed/'+file+'_1.fastq ' | ||
fastq_string2=fastq_string2+dir+'/trimmed/'+file+'_2.fastq ' | ||
else: | ||
wait_next=True | ||
break | ||
if wait_next: | ||
break | ||
shell('cat '+fastq_string1+' > '+dir+'/trimmed/'+val+'_1.fastq') | ||
shell('cat '+fastq_string2+' > '+dir+'/trimmed/'+val+'_2.fastq') | ||
shell('mkdir '+dir+'/rsem/pe/'+val) | ||
shell("rsem-calculate-expression --star --paired-end --no-bam-output --num-threads {threads} "+dir+"/trimmed/"+val+"_1.fastq "+dir+"/trimmed/"+val+"_2.fastq"+ " {input.ref}/rsem "+dir+'/rsem/pe/'+val+"/rsem") | ||
|
||
rule run_hbadeals: | ||
threads: 32 | ||
input: | ||
samp = expand("{dir}/rsem/pe/{samples_pe}/",dir=config["fastq_dir"],samples_pe=samples_pe) | ||
output: | ||
'/projects/chesler-lab/jcpg/snakemake/'+str(srp)+'_'+str(cohort)+'.txt' | ||
params: | ||
cohort=str(cohort), | ||
fastq_dir=config["fastq_dir"] | ||
run: | ||
shell("Rscript /projects/chesler-lab/jcpg/snakemake/run_hba_deals.R /projects/chesler-lab/jcpg/snakemake/case_control_c{params.cohort}.tsv {params.cohort} {params.fastq_dir}/rsem/pe/") | ||
shell("rm -f "+dir+"/trimmed/*.fastq") | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
fastq_dir: | ||
./ | ||
|
||
rsem_ref_path: | ||
/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/human_genome/ | ||
|
||
#rsem_ref_path: | ||
#/projects/chesler-lab/jcpg/snakemake/genomes/mouse/rsem_ref | ||
|
||
#genome: | ||
# /projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/GRCm39.genome.fa | ||
|
||
#gtf: | ||
#/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/gencode.vM26.chr_patch_hapl_scaff.annotation.gtf | ||
|
||
#rsem_ref_path: | ||
#/fastscratch/genomes/C57BL/rsem_ref | ||
|
||
|
||
genome: | ||
/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/GRCh38_r91.all.fa | ||
|
||
gtf: | ||
/projects/robinson-lab/USERS/karleg/projects/lps/sra/star_files/Homo_sapiens.GRCh38.109.gtf | ||
|
||
#genome: | ||
#/projects/chesler-lab/jcpg/snakemake/genomes/mouse/GRCm39.genome.fa | ||
|
||
#genome: | ||
# /projects/compsci/omics_share/mouse/GRCm38/genome/sequence/imputed/C57BL_10J.with_unplac_unloc.fa | ||
|
||
#gtf: | ||
#/projects/chesler-lab/jcpg/snakemake/genomes/mouse/gencode.vM30.annotation.gtf | ||
|
||
#gtf: | ||
#/projects/compsci/omics_share/mouse/GRCm38/transcriptome/annotation/imputed/C57BL_10J.with_unplac_unloc.gtf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
library(hbadeals) | ||
library(edgeR) | ||
|
||
#args[1] - name and path of samples table | ||
#args[2] - cohort number (which cases should be compared against which controls) | ||
#args[3] - input directory where the RSEM files are | ||
|
||
args=commandArgs(trailingOnly = TRUE) | ||
|
||
sample.table=read.table(args[1],header=TRUE,sep='\t') | ||
|
||
#Keep paired end samples: | ||
#sample.table=sample.table[sample.table$isPaired==args[4],] | ||
|
||
#Keep only the control and samples in this SRP that belong to the given cohort | ||
sample.table=sample.table[sample.table$status=='control' | sample.table$cohort==as.integer(args[2]),] | ||
|
||
|
||
countsData=NULL | ||
labels=c() | ||
|
||
for (srr in unique(sample.table$BioSample[sample.table$status=='control'])) | ||
{ | ||
|
||
if (is.null(countsData)) | ||
{ | ||
countsData=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE)[c(2,1)] | ||
} | ||
next.file=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE) | ||
countsData=cbind(countsData,next.file$expected_count) | ||
|
||
labels=c(labels,1) | ||
} | ||
|
||
for (srr in unique(sample.table$BioSample[sample.table$status!='control'])) | ||
{ | ||
next.file=read.table(paste0(args[3],'/',srr,'/rsem.isoforms.results'),header=TRUE) | ||
countsData=cbind(countsData,next.file$expected_count) | ||
|
||
labels=c(labels,2) | ||
} | ||
|
||
|
||
#labels=labels[colSums(countsData[,-c(1,2)])>=10^6] | ||
|
||
#countsData=countsData[,c(1,2,2+which(colSums(countsData[,-c(1,2)])>=10^6))] | ||
|
||
if (sum(colSums(countsData[,-c(1,2)])<10^6)>=1) | ||
print("Warning: Sum of sample counts less than 10^6") | ||
|
||
tmp.countsData=countsData | ||
|
||
if (ncol(countsData)-2<=9){ | ||
countsData=countsData[rowSums(countsData[,-c(1,2)]>0)>=ncol(countsData)-2,] | ||
}else{ | ||
countsData=countsData[rowSums(countsData[,2+which(labels==1)]>0)>=0.9*sum(labels==1),] | ||
countsData=countsData[rowSums(countsData[,2+which(labels==2)]>0)>=0.9*sum(labels==2),] | ||
} | ||
|
||
|
||
num.iso=unlist(lapply(countsData$gene_id,function(x){sum(countsData$gene_id %in% x)})) | ||
|
||
countsData=countsData[num.iso>1,] | ||
|
||
|
||
if (length(unique(countsData[,1]))<100) | ||
{ | ||
countsData=tmp.countsData | ||
thresh=quantile(colMeans(countsData[,-c(1,2)]>0),0.5) | ||
labels=labels[colMeans(countsData[,-c(1,2)]>0)>thresh] | ||
if (sum(labels==1)<3 || sum(labels==2)<3) {print('Not enough expressed isoforms');quit('no')} | ||
countsData=countsData[,c(1,2,2+which(colMeans(countsData[,-c(1,2)]>0)>thresh))] | ||
if (ncol(countsData)-2<=9){ | ||
countsData=countsData[rowSums(countsData[,-c(1,2)]>0)>=ncol(countsData)-2,] | ||
}else{ | ||
countsData=countsData[rowSums(countsData[,2+which(labels==1)]>0)>=0.9*sum(labels==1),] | ||
countsData=countsData[rowSums(countsData[,2+which(labels==2)]>0)>=0.9*sum(labels==2),] | ||
} | ||
num.iso=unlist(lapply(countsData$gene_id,function(x){sum(countsData$gene_id %in% x)})) | ||
countsData=countsData[num.iso>1,] | ||
if (length(unique(countsData[,1]))<100) {print('Not enough expressed isoforms');quit('no')} | ||
} | ||
|
||
rm(tmp.countsData) | ||
|
||
res=hbadeals(countsData = countsData,labels = labels,n.cores = 45,isoform.level=TRUE,mcmc.iter=400000,mcmc.warmup=10000,mtc=TRUE,lib.size=colSums(countsData[,-c(1,2)])*calcNormFactors(as.matrix(countsData[,-c(1,2)]),method='TMM'),hierarchy='auto') | ||
|
||
|
||
write.table(res,paste0('/projects/chesler-lab/jcpg/snakemake/',as.character(sample.table$srp)[sample.table$cohort==as.integer(args[2])][1] | ||
,'_',args[2],'.txt'), | ||
sep='\t',quote = F,col.names = T,row.names = F) | ||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/bin/bash | ||
#SBATCH --time=72:00:00 | ||
#SBATCH --job-name="pps" | ||
#SBATCH -o ppr-%j.out | ||
#SBATCH -e ppr-%j.err | ||
#SBATCH [email protected] | ||
#SBATCH --mail-type=BEGIN,END,FAIL | ||
#SBATCH -N 1 | ||
#SBATCH -n 33 | ||
#SBATCH --mem-per-cpu=16G | ||
#SBATCH --array=21 | ||
|
||
mkdir /flashscratch/fastq_$SLURM_ARRAY_TASK_ID | ||
|
||
cd /flashscratch/fastq_$SLURM_ARRAY_TASK_ID | ||
|
||
module load singularity | ||
|
||
cp $SLURM_SUBMIT_DIR/Snakefile . | ||
|
||
singularity exec /projects/chesler-lab/jcpg/snakemake/sing.sif bash $SLURM_SUBMIT_DIR/run_snakemake.sh $SLURM_SUBMIT_DIR/case_control_c${SLURM_ARRAY_TASK_ID}.tsv /flashscratch/fastq_$SLURM_ARRAY_TASK_ID | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/bin/sh | ||
source activate rsem-pipeline | ||
|
||
snakemake --resources mem_mb=16000 --until run_hbadeals --cores 32 --config samples_table=$1 fastq_dir=$2 |