-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
116 lines (104 loc) · 5.29 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
configfile: "config.yaml"
workdir: config['workdir']
#Inputs
#Path to input bed files.
BED_IN_DIR = config['bed_in_dir']
#Outputs
#The output of make_annot and ld_scores will be saved to /ldsc_out
# This includes the .annot files, as well as all .ld* files
#The output of partition_h2 will be saved to /h2_out
# This is both the .results and the .log files
#The compiled output tsv file with baseline adjusted coefficients and p values will be saved /h2_compiled
#LDSC Config
#Directories where ldsc scripts and required external data are saved
LDSC_SCRIPTS_DIR = config['ldsc_scripts_dir']
LDSC_DATA_DIR = config['ldsc_data_dir']
SUMSTATS_DIR = config['sumstats_dir']
#Set chromosome numbers- used for iterating over chromosomes
CHROM_NUMBERS = range(1,23)
#Read in names of all of the input bedfiles. These will be used for iterating
import os
PEAK_GROUPS = os.listdir(BED_IN_DIR)
#remove file extension and just keep name
for i in range(0, len(PEAK_GROUPS)):
PEAK_GROUPS[i] = os.path.splitext(PEAK_GROUPS[i])[0]
#PEAK_GROUPS = PEAK_GROUPS[0:1] #for testing on subset
#Read in names of all of the gwas studies
GWAS_STUDIES = os.listdir(SUMSTATS_DIR)
GWAS_STUDIES = list(filter(lambda x:'sumstats' in x, GWAS_STUDIES))
#remove file extension and just keep name
for i in range(0, len(GWAS_STUDIES)):
GWAS_STUDIES[i] = os.path.splitext(GWAS_STUDIES[i])[0]
#GWAS_STUDIES = GWAS_STUDIES[0:2] #for testing on subset
#Define targets of entire workflow
rule all:
input:
# The commented out portions are the intermediate outputs from each rule. They will still be generated despite being commented out.
#expand("ldsc_out/{peak_group}.{chrom_number}.annot.gz", peak_group=PEAK_GROUPS, chrom_number=CHROM_NUMBERS)
#expand("ldsc_out/{peak_group}.{chrom_number}.l2.ldscore.gz", peak_group=PEAK_GROUPS, chrom_number=CHROM_NUMBERS)
#expand("h2_out/{peak_group}.{gwas_study}.results", peak_group=PEAK_GROUPS, gwas_study=GWAS_STUDIES)
"h2_compiled_results/results_dat.tsv"
###Make annotations
#These need to be output to the same directory as the ld scores or else partition_h2 won't work
rule make_annot:
input:
bed="{bed_in_dir}/{{peak_group}}.bed".format(bed_in_dir = BED_IN_DIR),
bim="{ldsc_data_dir}/1000G_EUR_Phase3_plink/1000G.EUR.QC.{{chrom_number}}.bim".format(ldsc_data_dir = LDSC_DATA_DIR)
output:
annot="ldsc_out/{peak_group}.{chrom_number}.annot.gz"
conda:
"{ldsc_scripts_dir}/environment.yml".format(ldsc_scripts_dir = LDSC_SCRIPTS_DIR)
shell:
"python {ldsc_scripts_dir}/make_annot.py"
" --bed-file {{input.bed}}"
" --bimfile {{input.bim}}"
" --annot-file {{output}}".format(ldsc_scripts_dir = LDSC_SCRIPTS_DIR)
###Compute LD scores with annot file
##have to do some weird things with the --bfile and --out because ldsc.py automatically adds file extensions
# It is very important to include the print_snps flag with the proper print_snps.txt. This print_snps.txt is#
# present in the 1000G_EUR_Phase3_baseline ftp. If this isn't used, you will get an error saying that the ldscores
# files don't have the same snp columns as the baseline
rule ld_scores:
input:
annot="ldsc_out/{peak_group}.{chrom_number}.annot.gz",
snps="{ldsc_data_dir}/1000G_EUR_Phase3_baseline/print_snps.txt".format(ldsc_data_dir = LDSC_DATA_DIR)
output:
"ldsc_out/{peak_group}.{chrom_number}.l2.ldscore.gz"
conda:
"{ldsc_scripts_dir}/environment.yml".format(ldsc_scripts_dir = LDSC_SCRIPTS_DIR)
shell:
"python {ldsc_scripts_dir}/ldsc.py"
" --l2"
" --bfile {ldsc_data_dir}/1000G_EUR_Phase3_plink/1000G.EUR.QC.{{wildcards.chrom_number}}"
" --ld-wind-cm 1"
" --annot {{input.annot}}"
" --thin-annot"
" --out ldsc_out/{{wildcards.peak_group}}.{{wildcards.chrom_number}}"
" --print-snps {{input.snps}}".format(ldsc_data_dir = LDSC_DATA_DIR, ldsc_scripts_dir = LDSC_SCRIPTS_DIR)
###Perform partitioning of heritability
#annot need to be output to the same directory as the ld scores or else partition_h2 won't work
rule partition_h2:
input:
ss="{sumstats_dir}/{{gwas_study}}.sumstats".format(sumstats_dir = SUMSTATS_DIR),
ref_ld_chr=expand("ldsc_out/{{peak_group}}.{chrom_number}.l2.ldscore.gz", chrom_number=CHROM_NUMBERS)
output:
"h2_out/{peak_group}.{gwas_study}.results"
conda:
"{ldsc_scripts_dir}/environment.yml".format(ldsc_scripts_dir = LDSC_SCRIPTS_DIR)
shell:
"python {ldsc_scripts_dir}/ldsc.py"
" --h2 {{input.ss}}"
" --ref-ld-chr ldsc_out/{{wildcards.peak_group}}.,{ldsc_data_dir}/1000G_EUR_Phase3_baseline/baseline."
" --overlap-annot"
" --frqfile-chr {ldsc_data_dir}/1000G_Phase3_frq/1000G.EUR.QC."
" --out h2_out/{{wildcards.peak_group}}.{{wildcards.gwas_study}}"
" --w-ld-chr {ldsc_data_dir}/weights_hm3_no_hla/weights."
" --print-coefficients".format(ldsc_data_dir = LDSC_DATA_DIR, ldsc_scripts_dir = LDSC_SCRIPTS_DIR)
# Takes all of the results from partition_h2 and puts them into single file
rule h2_collect_results:
input:
expand("h2_out/{peak_group}.{gwas_study}.results", peak_group=PEAK_GROUPS, gwas_study=GWAS_STUDIES)
output:
"h2_compiled_results/results_dat.tsv"
script:
"h2_collect_results.py"