updating variants used for step 1

MRCIEU · Oct 24, 2024 · a0afda1 · a0afda1
1 parent 402f087
commit a0afda1
Show file tree

Hide file tree

Showing 7 changed files with 99 additions and 13 deletions.
diff --git a/00-extract-pruned-variants.sh b/00-extract-pruned-variants.sh
@@ -13,22 +13,12 @@ mkdir -p ${results_dir}/00
 exec &> >(tee ${results_dir}/00/logfile)
 
 echo "Get list of pruned SNPs"
-if test -f "resources/genotypes/${major_ancestry}_pruned_variants.txt.gz"; then
+if test -f "resources/genotypes/hm3_prune_th_${build}.bed.gz"; then
     echo "Found prune file"
     prunefile="${genotype_processed_dir}/scratch/indep.prune.in"
-    gunzip -c resources/genotypes/${major_ancestry}_pruned_variants.txt.gz > ${prunefile}
+    gunzip -c resources/genotypes/hm3_prune_th_${build}.bed.gz > ${prunefile}
 fi
 
-# Get list of tophits
-
-> ${genotype_processed_dir}/scratch/tophitsnps.txt
-for f in resources/genotypes/tophits/*.txt
-do
-    awk '{ print $1 }' $f >> ${genotype_processed_dir}/scratch/tophitsnps.txt
-done
-cat ${genotype_processed_dir}/scratch/tophitsnps.txt | sort | uniq >> ${prunefile}
-cut -d "_" -f 1 ${prunefile} | tr ":" " " | awk '{ print $1, $2, $2, $1":"$2 }' > ${prunefile}_range
-
 
 # Get list of bgen files
 echo "Checking genotype input list..."
@@ -79,7 +69,7 @@ do
     ./bin/plink2 \
         --bgen ${bgen} ref-first \
         --sample ${sample} \
-        --extract ${prunefile}_range \
+        --extract ${prunefile} \
         --make-bed \
         --out ${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen) \
         --max-alleles 2 \

diff --git a/config-template.env b/config-template.env
@@ -13,6 +13,9 @@ genotype_processed_dir="/EDIT/THIS/PATH"
 results_dir="/EDIT/THIS/PATH"
 env_family_data="true"
 
+# Provide either hg19 or hg38
+build="hg19"
+
 # Add any study-specific covariate variables that must be included here.
 # We expect this to correspond to a column name in $phenotype_input_dir/pheno_covariates.txt
 # sex and yob are default that should be included

diff --git a/resources/genotypes/EUR_pruned_variants.txt.gz b/resources/genotypes/EUR_pruned_variants.txt.gz
diff --git a/resources/genotypes/hm3_prune_th_hg19.bed.gz b/resources/genotypes/hm3_prune_th_hg19.bed.gz
diff --git a/resources/genotypes/hm3_prune_th_hg38.bed.gz b/resources/genotypes/hm3_prune_th_hg38.bed.gz
diff --git a/test/prune_list/run.sh b/test/prune_list/run.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+
+# Get 1kg in plink format
+wget http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz
+tar xzvf data_maf0.01_rs_ref.tgz
+
+# get hm3 list
+wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2
+bzip2 -d hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2
+
+awk '{ print $2 }' hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map > hapmap3_r1_b36_fwd_consensus.qc.poly.recode.snp
+
+# prune hm3 using 1kg data
+plink2 --bfile data_maf0.01_rs_ref --extract hapmap3_r1_b36_fwd_consensus.qc.poly.recode.snp --indep-pairwise 3000 50 0.8 --out indep
+wc -l indep.prune.in
+plink2 --bfile data_maf0.01_rs_ref --extract indep.prune.in --out data_maf0.01_rs_ref_pruned --make-just-bim
+
+
+## Prune hm3 from chromosome X
+
+# Get 1kg vcf
+wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz
+wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel
+
+# Create psam
+sed 1d integrated_call_samples_v3.20130502.ALL.panel | awk '{ print $1, $1, "0", "0", $4, "0" }' | sed 's/female/2/g' | sed 's/male/1/g'  > 1kg_chrX.psam
+head 1kg_chrX.psam
+
+# check same number of samples
+wc -l 1kg_chrX.psam
+zcat ALL.chrX.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz | grep -m 1 "#CHROM" | tr "\t" "\n" | grep -E [0-9] > vcf_idlist
+wc -l vcf_idlist
+
+# Get hm3 positions etc from chromosome X
+grep -wf hapmap3_r1_b36_fwd_consensus.qc.poly.recode.snp /local-scratch/data/ukb/genetic/variants/arrays/imputed/released/2018-09-18/data/snp-stats/data.chrX.snp-stats | awk '{ print $3, $4, $4, $2 }' > hm3_x.bed
+
+# Extract from vcf and prune
+plink2 --vcf ALL.chrX.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz --psam 1kg_chrX.psam --extract range hm3_x.bed --set-all-var-ids @:#:[b37]\$r,\$a --new-id-max-allele-len 161 --max-alleles 2 --indep-pairwise 3000 50 0.8 --out indep_x --split-par hg19
+
+# Make bed
+cut -d ":" -f 2 indep_x.prune.in > hm3_x_prune_pos
+grep -wf hm3_x_prune_pos hm3_x.bed | awk '{ print "chr"$1, $2, $3, $4 }' > hm3_x_prune.bed
+
+
+# Combine with autosomal prune list
+awk '{ print "chr"$1, $4, $4, $2 }' data_maf0.01_rs_ref_pruned.bim > hm3_auto_prune_b37.bed
+
+cat hm3_auto_prune_b37.bed hm3_x_prune.bed > hm3_prune_b37.bed
+
+# Add tophits
+> tophitsnps.txt
+for f in ../../resources/genotypes/tophits/*.txt
+do
+    awk '{ print $1 }' $f | cut -d "_" -f 1 ${prunefile} | tr ":" " " | awk '{ print "chr"$1, $2, $2, $1":"$2 }' >> tophitsnps.txt
+done
+cat tophitsnps.txt hm3_prune_b37.bed | sort | uniq > hm3_prune_th_b37.bed
+wc -l hm3_prune_th_b37.bed
+cut -d " " -f 1 hm3_prune_th_b37.bed | sort | uniq -c
+
+# Liftover to hg38
+wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver
+wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz
+
+gunzip hg19ToHg38.over.chain.gz
+
+./liftOver hm3_prune_th_b37.bed hg19ToHg38.over.chain hm3_prune_th_b38.bed unmapped
+
+wc -l unmapped
+
+sed 's/chr//g' hm3_prune_th_b37.bed > hm3_prune_th_b37_nochr.bed
+sed 's/chr//g' hm3_prune_th_b38.bed > hm3_prune_th_b38_nochr.bed
+
+gzip -c hm3_prune_th_b37_nochr.bed > ../../resources/genotypes/hm3_prune_th_hg19.bed.gz
+gzip -c hm3_prune_th_b38_nochr.bed > ../../resources/genotypes/hm3_prune_th_hg38.bed.gz
+
+head hm3_prune_th_b38_nochr.bed
+head hm3_prune_th_b37_nochr.bed
+
+
+
+# wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-common_all.vcf.gz
+# wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-common_all.vcf.gz.md5
+# wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-common_all.vcf.gz.tbi
+
+
diff --git a/utils/check.sh b/utils/check.sh
@@ -54,6 +54,13 @@ else
 fi
 
 
+# check build = hg19 or hg38
+if [[ $build != "hg19" && $build != "hg38" ]]; then
+    echo "Error: build specified in config.env must be 'hg19' or 'hg38'"
+    exit 1
+fi
+
+
 echo "Checking genotype input list..."
 nchr=$(cat ${genotype_input_list} | grep -c '^')
 # Check nchr = 22 or 23