Skip to content

Commit

Permalink
Merge branch 'release/0.1.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
filosi committed Feb 19, 2024
2 parents e20cc2a + b15a6d3 commit 7400767
Show file tree
Hide file tree
Showing 10 changed files with 401 additions and 185 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ sbatch snakemake --configfile config/config.yaml --profile ~/snake_prof/slurm
--nolock
```

# Output

The pipeline produce a summary `tsv` file with the leading variant for each credible set found in the analysis.
The summary contain a subset of the original summary statistic.


# References

> {#ref-susier} Wang, G., Sarkar, A., Carbonetto, P. & Stephens, M. (2020). A simple new approach to variable selection in regression, with application to genetic fine mapping. Journal of the Royal Statistical Society, Series B 82, 1273–1300. [https://doi.org/10.1111/rssb.12388](https://doi.org/10.1111/rssb.12388)
20 changes: 19 additions & 1 deletion config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ genodata:
# ------------------
# See https://www.cog-genomics.org/plink/1.9/filter#indiv
# for details on the sample file
sample_file: "<path_to_sample_file>/sample.samplist"
sample_file: "/scratch/mfilosi/test_gwas_regenie/metaboGWAS/sample.samplist"

# Phenotype file used in the GWAS
# -------------------------------
# tab separated.
# First two colums should be FID and IID
pheno_file: "/scratch/mfilosi/metaboGWAS/input_files/phenotype_filt_winsorized_scaled.txt"
run_list: "/scratch/mfilosi/test_gwas_regenie/finemap_pipeline/pheno_to_run.csv"

# Clumping
# --------
Expand All @@ -44,3 +45,20 @@ sumstat:
pvalcol: "LOG10P"
log: True
pthr: 1.7e-11

# SusieR parameters
# -----------------
susieR:
# The following parameter will enable the use of correlation matrix based
# on LD as specified in [https://stephenslab.github.io/susieR/articles/finemapping_summary_statistics.html](https://stephenslab.github.io/susieR/articles/finemapping_summary_statistics.html)
# If set to False (default), it will use the genotypes coded with additive model
# together with the phenotype to evaluate the RSS model.
use_ld: False
# When using this pipeline on CHRIS samples, the IDs
# have leading zeros, and will have a total length of 10 characters.
# Thus within the `scripts/finemapping.R` will do a conversion
# with for zero padding of the IDs to match the ones in the genotypes.
# Set this value to `False` for remove 0 padding to 10 character.
chris_id: True
min_abs_corr: 0.1
iter: 1000
98 changes: 67 additions & 31 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,20 @@ import os
import json
import glob
from os.path import join as pjoin

# Define configuration file
configfile: "config/config.yaml"
import pandas as pd

# TODO: Need to validate the inputs....
# TODO: divide rules into different file to have the main file cleaner

# Define wildcards for chromosomes
wildcard_constraints:
chrom="\d+"
chrom=r"\d+"


#----------------------------------------
# Read configuration file
#----------------------------------------
configfile: "config/config.yaml"

# Load genetic data information
genotype = config["genodata"]["name"]
Expand All @@ -29,58 +34,75 @@ outdir = config["output_dir"]
if not os.path.exists(outdir):
os.makedirs(outdir, exist_ok=True)

# Include script to prepare input
# TODO: generalize prepare_input.py script
include: "scripts/prepare_input.py"

# Check if the input is available
try:
run_list = pd.read_csv(config["run_list"], header=0, sep="\t")
except KeyError:
run_list = prepare_inputs(pjoin(resdir, "tophits"), pval_thr=pthr)
except FileNotFoundError:
run_list = prepare_inputs(pjoin(resdir, "tophits"), pval_thr=pthr)


#----------------------------------------
# Custom funtion to define input for the rules
#----------------------------------------
def get_pfile_from_chrom(wildcards):
pfiles = gd['plinkfiles']
ff = [p for p in pfiles if p.find(f'chr{wildcards.chrom}.') > 0]
nfiles = gd['nfiles']
if nfiles == 1:
ff = pfiles
else:
ff = [p for p in pfiles if p.find(f'chr{wildcards.chrom}.') > 0]
return ff[0]

# Function to get the phenotypes
def get_phenos():
return run_list['pheno'].unique().tolist()

def get_pheno_to_run():
fnames = glob.glob(pjoin(resdir, "*.regenie.gz.tbi"))
phenos = [os.path.basename(f) for f in fnames]
phenos = [p.replace(".regenie.gz.tbi", "") for p in phenos]
return phenos


#----------------------------------------
# Start creating the rules for the workflow
#----------------------------------------
# Target rule
rule all:
input:
pjoin(outdir, "all_phenos_summary.cs")

checkpoint sumstat_2_plink:
# Divide sumstat by chromosome
rule sumstat_2_plink:
message:
"Separate summary stat by chromosome based on best hits"
input:
sumstat = pjoin(resdir, "{pheno}.regenie.gz")
output:
directory(pjoin(outdir, "{pheno}/tmp"))
temp(pjoin(outdir, "{pheno}/tmp/chr{chrom}_sumstat.csv"))
resources:
mem_mb=12000
params:
pval_thr = pthr,
pval_col = pvalcol
conda:
"plink-pandas"
script:
"scripts/separate_gwas.py"

def get_significant(wildcards):
"""
Exctract the chromosome available for each summary stat
"""
ckoutput = checkpoints.sumstat_2_plink.get(**wildcards).output[0]
clfiles = expand(pjoin(outdir, "{pheno}/cs/cs_chr{chrom}.cssmstat"),
pheno=wildcards.pheno,
chrom=glob_wildcards(pjoin(ckoutput, "chr{chrom}_sumstat.csv")).chrom
)
return clfiles


rule cut_pheno:
message:
"Extract the phenotype from the original GWAS"
input:
phenofile = config["pheno_file"]
output:
pjoin(outdir, "{pheno}/tmp/phenotype.csv")
temp(pjoin(outdir, "{pheno}/tmp/phenotype.csv"))
conda:
"plink-pandas"
script:
Expand All @@ -107,13 +129,20 @@ rule clumping:
conda:
"plink2"
shell:
"""plink2 --bfile {params.infile} --clump {input.smstat} \
--clump-log10 'input-only' --clump-field {pvalcol} \
--clump-log10-p1 {params.clump_logp1} --clump-log10-p2 {params.clump_logp2} \
--clump-r2 {params.clump_r2} --clump-kb {params.clump_kb} \
--clump-snp-field ID --out {params.ofile} \
--memory {resources.mem_mb} \
--keep {params.sampfile}
"""
if [ -s {input} ]
then
plink2 --bfile {params.infile} --clump {input.smstat} \
--clump-log10 'input-only' --clump-field {pvalcol} \
--clump-log10-p1 {params.clump_logp1} --clump-log10-p2 {params.clump_logp2} \
--clump-r2 {params.clump_r2} --clump-kb {params.clump_kb} \
--clump-snp-field ID --out {params.ofile} \
--memory {resources.mem_mb} \
--keep {params.sampfile}
else
touch {output[0]}
touch {output[1]}
fi
"""

rule enlarge_and_merge:
Expand Down Expand Up @@ -141,17 +170,24 @@ rule run_susieR:
output:
cs_smstat = pjoin(outdir, "{pheno}/cs/cs_chr{chrom}.cssmstat"),
cs_report = pjoin(outdir, "{pheno}/cs/cs_chr{chrom}.csreport"),
cs_rds = pjoin(outdir, "{pheno}/cs/cs_chr{chrom}_fit.rds")
cs_rds = pjoin(outdir, "{pheno}/cs/cs_chr{chrom}_fit.rds")#,
# cs_log = pjoin(outdir, "{pheno}/cs/cs_chr{chrom}.log")
params:
use_ld = config["susieR"]["use_ld"],
chris_id = config["susieR"]["chris_id"],
iter = config["susieR"]["iter"],
min_abs_corr = config["susieR"]["min_abs_corr"]
resources:
mem_mb=16000
mem_mb = 16000
conda:
"susier"
script:
"scripts/finemapping.R"

rule collect_by_pheno:
input:
get_significant
expand(pjoin(outdir, "{{pheno}}/cs/cs_chr{chrom}.cssmstat"),
chrom=lookup(query="pheno == '{pheno}'", within=run_list, cols="chrom"))
output:
pjoin(outdir, "{pheno}/summary.cs")
resources:
Expand All @@ -163,7 +199,7 @@ rule collect_by_pheno:

rule collect_all:
input:
expand(pjoin(outdir, "{pheno}/summary.cs"), pheno=get_pheno_to_run()),
expand(pjoin(outdir, "{pheno}/summary.cs"), pheno=get_phenos()),
output:
pjoin(outdir, "all_phenos_summary.cs")
resources:
Expand Down
1 change: 1 addition & 0 deletions workflow/envs/plink-pandas.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: plink-pandas
dependencies:
- bioconda::plink=1.90*
- bioconda::tabix=1.11
- python=3.11.*
- numpy
- scipy
Expand Down
1 change: 1 addition & 0 deletions workflow/envs/susier.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies:
- r-rcpp==1.0.*
- r-rcppeigen==0.3.3.*
- r-markdown==1.10
- r-rlog==0.1.0
- numpy==1.24.3
- scipy==1.9.3
- pandas==1.5.3
Expand Down
5 changes: 4 additions & 1 deletion workflow/scripts/aggregate_by_pheno.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ def main(summary_files, outfile, bypheno=True):
try:
# Read input file
print(f"Computing phenotype: {pheno}")
sdf = pd.read_csv(ff, sep="\t")
try:
sdf = pd.read_csv(ff, sep="\t")
except pd.errors.EmptyDataError:
sdf = pd.DataFrame()

# Handling an empty data.frame
if sdf.shape[0] > 0:
Expand Down
Loading

0 comments on commit 7400767

Please sign in to comment.