diff --git a/00-extract-pruned-variants.sh b/00-extract-pruned-variants.sh index b4158a4..cb26b29 100755 --- a/00-extract-pruned-variants.sh +++ b/00-extract-pruned-variants.sh @@ -13,22 +13,12 @@ mkdir -p ${results_dir}/00 exec &> >(tee ${results_dir}/00/logfile) echo "Get list of pruned SNPs" -if test -f "resources/genotypes/${major_ancestry}_pruned_variants.txt.gz"; then +if test -f "resources/genotypes/hm3_prune_th_${build}.bed.gz"; then echo "Found prune file" prunefile="${genotype_processed_dir}/scratch/indep.prune.in" - gunzip -c resources/genotypes/${major_ancestry}_pruned_variants.txt.gz > ${prunefile} + gunzip -c resources/genotypes/hm3_prune_th_${build}.bed.gz > ${prunefile} fi -# Get list of tophits - -> ${genotype_processed_dir}/scratch/tophitsnps.txt -for f in resources/genotypes/tophits/*.txt -do - awk '{ print $1 }' $f >> ${genotype_processed_dir}/scratch/tophitsnps.txt -done -cat ${genotype_processed_dir}/scratch/tophitsnps.txt | sort | uniq >> ${prunefile} -cut -d "_" -f 1 ${prunefile} | tr ":" " " | awk '{ print $1, $2, $2, $1":"$2 }' > ${prunefile}_range - # Get list of bgen files echo "Checking genotype input list..." @@ -79,7 +69,7 @@ do ./bin/plink2 \ --bgen ${bgen} ref-first \ --sample ${sample} \ - --extract ${prunefile}_range \ + --extract ${prunefile} \ --make-bed \ --out ${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen) \ --max-alleles 2 \ diff --git a/config-template.env b/config-template.env index 6ec3e2d..8c0a812 100644 --- a/config-template.env +++ b/config-template.env @@ -13,6 +13,9 @@ genotype_processed_dir="/EDIT/THIS/PATH" results_dir="/EDIT/THIS/PATH" env_family_data="true" +# Provide either hg19 or hg38 +build="hg19" + # Add any study-specific covariate variables that must be included here. # We expect this to correspond to a column name in $phenotype_input_dir/pheno_covariates.txt # sex and yob are default that should be included diff --git a/resources/genotypes/EUR_pruned_variants.txt.gz b/resources/genotypes/EUR_pruned_variants.txt.gz deleted file mode 100644 index fb2dda2..0000000 Binary files a/resources/genotypes/EUR_pruned_variants.txt.gz and /dev/null differ diff --git a/resources/genotypes/hm3_prune_th_hg19.bed.gz b/resources/genotypes/hm3_prune_th_hg19.bed.gz new file mode 100644 index 0000000..b7af27a Binary files /dev/null and b/resources/genotypes/hm3_prune_th_hg19.bed.gz differ diff --git a/resources/genotypes/hm3_prune_th_hg38.bed.gz b/resources/genotypes/hm3_prune_th_hg38.bed.gz new file mode 100644 index 0000000..22db0be Binary files /dev/null and b/resources/genotypes/hm3_prune_th_hg38.bed.gz differ diff --git a/test/prune_list/run.sh b/test/prune_list/run.sh new file mode 100644 index 0000000..6ab6311 --- /dev/null +++ b/test/prune_list/run.sh @@ -0,0 +1,86 @@ +#!/bin/bash + + +# Get 1kg in plink format +wget http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz +tar xzvf data_maf0.01_rs_ref.tgz + +# get hm3 list +wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2 +bzip2 -d hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2 + +awk '{ print $2 }' hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map > hapmap3_r1_b36_fwd_consensus.qc.poly.recode.snp + +# prune hm3 using 1kg data +plink2 --bfile data_maf0.01_rs_ref --extract hapmap3_r1_b36_fwd_consensus.qc.poly.recode.snp --indep-pairwise 3000 50 0.8 --out indep +wc -l indep.prune.in +plink2 --bfile data_maf0.01_rs_ref --extract indep.prune.in --out data_maf0.01_rs_ref_pruned --make-just-bim + + +## Prune hm3 from chromosome X + +# Get 1kg vcf +wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz +wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel + +# Create psam +sed 1d integrated_call_samples_v3.20130502.ALL.panel | awk '{ print $1, $1, "0", "0", $4, "0" }' | sed 's/female/2/g' | sed 's/male/1/g' > 1kg_chrX.psam +head 1kg_chrX.psam + +# check same number of samples +wc -l 1kg_chrX.psam +zcat ALL.chrX.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz | grep -m 1 "#CHROM" | tr "\t" "\n" | grep -E [0-9] > vcf_idlist +wc -l vcf_idlist + +# Get hm3 positions etc from chromosome X +grep -wf hapmap3_r1_b36_fwd_consensus.qc.poly.recode.snp /local-scratch/data/ukb/genetic/variants/arrays/imputed/released/2018-09-18/data/snp-stats/data.chrX.snp-stats | awk '{ print $3, $4, $4, $2 }' > hm3_x.bed + +# Extract from vcf and prune +plink2 --vcf ALL.chrX.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz --psam 1kg_chrX.psam --extract range hm3_x.bed --set-all-var-ids @:#:[b37]\$r,\$a --new-id-max-allele-len 161 --max-alleles 2 --indep-pairwise 3000 50 0.8 --out indep_x --split-par hg19 + +# Make bed +cut -d ":" -f 2 indep_x.prune.in > hm3_x_prune_pos +grep -wf hm3_x_prune_pos hm3_x.bed | awk '{ print "chr"$1, $2, $3, $4 }' > hm3_x_prune.bed + + +# Combine with autosomal prune list +awk '{ print "chr"$1, $4, $4, $2 }' data_maf0.01_rs_ref_pruned.bim > hm3_auto_prune_b37.bed + +cat hm3_auto_prune_b37.bed hm3_x_prune.bed > hm3_prune_b37.bed + +# Add tophits +> tophitsnps.txt +for f in ../../resources/genotypes/tophits/*.txt +do + awk '{ print $1 }' $f | cut -d "_" -f 1 ${prunefile} | tr ":" " " | awk '{ print "chr"$1, $2, $2, $1":"$2 }' >> tophitsnps.txt +done +cat tophitsnps.txt hm3_prune_b37.bed | sort | uniq > hm3_prune_th_b37.bed +wc -l hm3_prune_th_b37.bed +cut -d " " -f 1 hm3_prune_th_b37.bed | sort | uniq -c + +# Liftover to hg38 +wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver +wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz + +gunzip hg19ToHg38.over.chain.gz + +./liftOver hm3_prune_th_b37.bed hg19ToHg38.over.chain hm3_prune_th_b38.bed unmapped + +wc -l unmapped + +sed 's/chr//g' hm3_prune_th_b37.bed > hm3_prune_th_b37_nochr.bed +sed 's/chr//g' hm3_prune_th_b38.bed > hm3_prune_th_b38_nochr.bed + +gzip -c hm3_prune_th_b37_nochr.bed > ../../resources/genotypes/hm3_prune_th_hg19.bed.gz +gzip -c hm3_prune_th_b38_nochr.bed > ../../resources/genotypes/hm3_prune_th_hg38.bed.gz + +head hm3_prune_th_b38_nochr.bed +head hm3_prune_th_b37_nochr.bed + + + +# wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-common_all.vcf.gz +# wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-common_all.vcf.gz.md5 +# wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-common_all.vcf.gz.tbi + + diff --git a/utils/check.sh b/utils/check.sh index 2910cad..aeec519 100755 --- a/utils/check.sh +++ b/utils/check.sh @@ -54,6 +54,13 @@ else fi +# check build = hg19 or hg38 +if [[ $build != "hg19" && $build != "hg38" ]]; then + echo "Error: build specified in config.env must be 'hg19' or 'hg38'" + exit 1 +fi + + echo "Checking genotype input list..." nchr=$(cat ${genotype_input_list} | grep -c '^') # Check nchr = 22 or 23