Merge branch 'release/0.1.0'

EuracBiomedicalResearch · Feb 19, 2024 · 7400767 · 7400767
2 parents e20cc2a + b15a6d3
commit 7400767
Show file tree

Hide file tree

Showing 10 changed files with 401 additions and 185 deletions.
diff --git a/README.md b/README.md
@@ -86,6 +86,12 @@ sbatch snakemake --configfile config/config.yaml --profile ~/snake_prof/slurm
   --nolock
 ```
 
+# Output
+
+The pipeline produce a summary `tsv` file with the leading variant for each credible set found in the analysis.
+The summary contain a subset of the original summary statistic.
+
+
 # References
 
 > {#ref-susier} Wang, G., Sarkar, A., Carbonetto, P. & Stephens, M. (2020). A simple new approach to variable selection in regression, with application to genetic fine mapping. Journal of the Royal Statistical Society, Series B 82, 1273–1300. [https://doi.org/10.1111/rssb.12388](https://doi.org/10.1111/rssb.12388)
diff --git a/config/config.yaml b/config/config.yaml
@@ -11,13 +11,14 @@ genodata:
 # ------------------
 # See https://www.cog-genomics.org/plink/1.9/filter#indiv
 # for details on the sample file
-sample_file: "<path_to_sample_file>/sample.samplist"
+sample_file: "/scratch/mfilosi/test_gwas_regenie/metaboGWAS/sample.samplist"
 
 # Phenotype file used in the GWAS
 # -------------------------------
 # tab separated.
 # First two colums should be FID and IID
 pheno_file: "/scratch/mfilosi/metaboGWAS/input_files/phenotype_filt_winsorized_scaled.txt"
+run_list: "/scratch/mfilosi/test_gwas_regenie/finemap_pipeline/pheno_to_run.csv" 
 
 # Clumping
 # --------
@@ -44,3 +45,20 @@ sumstat:
   pvalcol: "LOG10P"
   log: True
   pthr: 1.7e-11
+
+# SusieR parameters
+# -----------------
+susieR:
+  # The following parameter will enable the use of correlation matrix based
+  # on LD as specified in [https://stephenslab.github.io/susieR/articles/finemapping_summary_statistics.html](https://stephenslab.github.io/susieR/articles/finemapping_summary_statistics.html)
+  # If set to False (default), it will use the genotypes coded with additive model
+  # together with the phenotype to evaluate the RSS model.
+  use_ld: False
+  # When using this pipeline on CHRIS samples, the IDs
+  # have leading zeros, and will have a total length of 10 characters.
+  # Thus within the `scripts/finemapping.R` will do a conversion
+  # with for zero padding of the IDs to match the ones in the genotypes.
+  # Set this value to `False` for remove 0 padding to 10 character.
+  chris_id: True
+  min_abs_corr: 0.1
+  iter: 1000
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -2,15 +2,20 @@ import os
 import json
 import glob
 from os.path import join as pjoin
-
-# Define configuration file
-configfile: "config/config.yaml"
+import pandas as pd
 
 # TODO: Need to validate the inputs....
+# TODO: divide rules into different file to have the main file cleaner 
 
 # Define wildcards for chromosomes 
 wildcard_constraints:
-      chrom="\d+"
+      chrom=r"\d+"
+
+
+#----------------------------------------
+# Read configuration file
+#----------------------------------------
+configfile: "config/config.yaml"
 
 # Load genetic data information
 genotype = config["genodata"]["name"]
@@ -29,58 +34,75 @@ outdir = config["output_dir"]
 if not os.path.exists(outdir):
   os.makedirs(outdir, exist_ok=True)
 
+# Include script to prepare input
+# TODO: generalize prepare_input.py script
+include: "scripts/prepare_input.py"
+
+# Check if the input is available
+try:
+  run_list = pd.read_csv(config["run_list"], header=0, sep="\t")
+except KeyError:
+  run_list = prepare_inputs(pjoin(resdir, "tophits"), pval_thr=pthr)
+except FileNotFoundError:
+  run_list = prepare_inputs(pjoin(resdir, "tophits"), pval_thr=pthr)
+
+
+#----------------------------------------
+# Custom funtion to define input for the rules
+#----------------------------------------
 def get_pfile_from_chrom(wildcards):
   pfiles = gd['plinkfiles']
-  ff = [p for p in pfiles if p.find(f'chr{wildcards.chrom}.') > 0]
+  nfiles = gd['nfiles']
+  if nfiles == 1:
+    ff = pfiles
+  else:
+    ff = [p for p in pfiles if p.find(f'chr{wildcards.chrom}.') > 0]
   return ff[0]
 
 # Function to get the phenotypes
+def get_phenos():
+  return run_list['pheno'].unique().tolist()
+
 def get_pheno_to_run():
   fnames = glob.glob(pjoin(resdir, "*.regenie.gz.tbi"))
   phenos = [os.path.basename(f) for f in fnames]
   phenos = [p.replace(".regenie.gz.tbi", "") for p in phenos]
   return phenos
-
 
+#----------------------------------------
+# Start creating the rules for the workflow
+#----------------------------------------
 # Target rule
 rule all:
   input:
     pjoin(outdir, "all_phenos_summary.cs")
 
-checkpoint sumstat_2_plink:
+# Divide sumstat by chromosome
+rule sumstat_2_plink:
   message:
     "Separate summary stat by chromosome based on best hits"
   input:
     sumstat = pjoin(resdir, "{pheno}.regenie.gz")
   output:
-    directory(pjoin(outdir, "{pheno}/tmp"))
+    temp(pjoin(outdir, "{pheno}/tmp/chr{chrom}_sumstat.csv"))
   resources:
     mem_mb=12000
   params:
     pval_thr = pthr,
     pval_col = pvalcol
+  conda:
+    "plink-pandas"
   script:
     "scripts/separate_gwas.py"
 
-def get_significant(wildcards):
-  """
-  Exctract the chromosome available for each summary stat
-  """
-  ckoutput = checkpoints.sumstat_2_plink.get(**wildcards).output[0]
-  clfiles = expand(pjoin(outdir, "{pheno}/cs/cs_chr{chrom}.cssmstat"),
-                   pheno=wildcards.pheno,
-                   chrom=glob_wildcards(pjoin(ckoutput, "chr{chrom}_sumstat.csv")).chrom
-                   )
-  return clfiles
-
 
 rule cut_pheno:
   message:
     "Extract the phenotype from the original GWAS"
   input:
     phenofile = config["pheno_file"]
   output:
-    pjoin(outdir, "{pheno}/tmp/phenotype.csv")
+    temp(pjoin(outdir, "{pheno}/tmp/phenotype.csv"))
   conda:
     "plink-pandas"
   script:
@@ -107,13 +129,20 @@ rule clumping:
   conda:
     "plink2"
   shell:
-    """plink2 --bfile {params.infile} --clump {input.smstat} \
-    --clump-log10 'input-only' --clump-field {pvalcol} \
-    --clump-log10-p1 {params.clump_logp1} --clump-log10-p2 {params.clump_logp2} \
-    --clump-r2 {params.clump_r2} --clump-kb {params.clump_kb} \
-    --clump-snp-field ID  --out {params.ofile} \
-    --memory {resources.mem_mb} \
-    --keep {params.sampfile}
+    """
+    if [ -s {input} ]
+    then
+      plink2 --bfile {params.infile} --clump {input.smstat} \
+      --clump-log10 'input-only' --clump-field {pvalcol} \
+      --clump-log10-p1 {params.clump_logp1} --clump-log10-p2 {params.clump_logp2} \
+      --clump-r2 {params.clump_r2} --clump-kb {params.clump_kb} \
+      --clump-snp-field ID  --out {params.ofile} \
+      --memory {resources.mem_mb} \
+      --keep {params.sampfile}
+    else
+      touch {output[0]}
+      touch {output[1]}
+    fi
     """
 
 rule enlarge_and_merge:
@@ -141,17 +170,24 @@ rule run_susieR:
   output:
     cs_smstat = pjoin(outdir, "{pheno}/cs/cs_chr{chrom}.cssmstat"),
     cs_report = pjoin(outdir, "{pheno}/cs/cs_chr{chrom}.csreport"),
-    cs_rds = pjoin(outdir, "{pheno}/cs/cs_chr{chrom}_fit.rds")
+    cs_rds = pjoin(outdir, "{pheno}/cs/cs_chr{chrom}_fit.rds")#,
+    # cs_log = pjoin(outdir, "{pheno}/cs/cs_chr{chrom}.log")
+  params:
+    use_ld = config["susieR"]["use_ld"],
+    chris_id = config["susieR"]["chris_id"],
+    iter = config["susieR"]["iter"],
+    min_abs_corr = config["susieR"]["min_abs_corr"]
   resources:
-    mem_mb=16000
+    mem_mb = 16000
   conda:
     "susier"
   script:
     "scripts/finemapping.R"
 
 rule collect_by_pheno:
   input:
-    get_significant
+    expand(pjoin(outdir, "{{pheno}}/cs/cs_chr{chrom}.cssmstat"), 
+           chrom=lookup(query="pheno == '{pheno}'", within=run_list, cols="chrom"))
   output:
     pjoin(outdir, "{pheno}/summary.cs")
   resources:
@@ -163,7 +199,7 @@ rule collect_by_pheno:
 
 rule collect_all:
   input:
-    expand(pjoin(outdir, "{pheno}/summary.cs"), pheno=get_pheno_to_run()),
+    expand(pjoin(outdir, "{pheno}/summary.cs"), pheno=get_phenos()),
   output:
     pjoin(outdir, "all_phenos_summary.cs")
   resources:

diff --git a/workflow/envs/plink-pandas.yml b/workflow/envs/plink-pandas.yml
@@ -1,6 +1,7 @@
 name: plink-pandas
 dependencies:
   - bioconda::plink=1.90*
+  - bioconda::tabix=1.11
   - python=3.11.*
   - numpy
   - scipy

diff --git a/workflow/envs/susier.yml b/workflow/envs/susier.yml
@@ -13,6 +13,7 @@ dependencies:
   - r-rcpp==1.0.*
   - r-rcppeigen==0.3.3.*
   - r-markdown==1.10
+  - r-rlog==0.1.0
   - numpy==1.24.3
   - scipy==1.9.3
   - pandas==1.5.3

diff --git a/workflow/scripts/aggregate_by_pheno.py b/workflow/scripts/aggregate_by_pheno.py
@@ -21,7 +21,10 @@ def main(summary_files, outfile, bypheno=True):
         try:
             # Read input file
             print(f"Computing phenotype: {pheno}")
-            sdf = pd.read_csv(ff, sep="\t")
+            try:
+                sdf = pd.read_csv(ff, sep="\t")
+            except pd.errors.EmptyDataError:
+                sdf = pd.DataFrame()
 
             # Handling an empty data.frame
             if sdf.shape[0] > 0: