Skip to content

Commit

Permalink
updating variants used for step 1
Browse files Browse the repository at this point in the history
  • Loading branch information
explodecomputer committed Oct 24, 2024
1 parent 402f087 commit a0afda1
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 13 deletions.
16 changes: 3 additions & 13 deletions 00-extract-pruned-variants.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,22 +13,12 @@ mkdir -p ${results_dir}/00
exec &> >(tee ${results_dir}/00/logfile)

echo "Get list of pruned SNPs"
if test -f "resources/genotypes/${major_ancestry}_pruned_variants.txt.gz"; then
if test -f "resources/genotypes/hm3_prune_th_${build}.bed.gz"; then
echo "Found prune file"
prunefile="${genotype_processed_dir}/scratch/indep.prune.in"
gunzip -c resources/genotypes/${major_ancestry}_pruned_variants.txt.gz > ${prunefile}
gunzip -c resources/genotypes/hm3_prune_th_${build}.bed.gz > ${prunefile}
fi

# Get list of tophits

> ${genotype_processed_dir}/scratch/tophitsnps.txt
for f in resources/genotypes/tophits/*.txt
do
awk '{ print $1 }' $f >> ${genotype_processed_dir}/scratch/tophitsnps.txt
done
cat ${genotype_processed_dir}/scratch/tophitsnps.txt | sort | uniq >> ${prunefile}
cut -d "_" -f 1 ${prunefile} | tr ":" " " | awk '{ print $1, $2, $2, $1":"$2 }' > ${prunefile}_range


# Get list of bgen files
echo "Checking genotype input list..."
Expand Down Expand Up @@ -79,7 +69,7 @@ do
./bin/plink2 \
--bgen ${bgen} ref-first \
--sample ${sample} \
--extract ${prunefile}_range \
--extract ${prunefile} \
--make-bed \
--out ${genotype_processed_dir}/bgen_extract/$(basename ${bgen} .bgen) \
--max-alleles 2 \
Expand Down
3 changes: 3 additions & 0 deletions config-template.env
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ genotype_processed_dir="/EDIT/THIS/PATH"
results_dir="/EDIT/THIS/PATH"
env_family_data="true"

# Provide either hg19 or hg38
build="hg19"

# Add any study-specific covariate variables that must be included here.
# We expect this to correspond to a column name in $phenotype_input_dir/pheno_covariates.txt
# sex and yob are default that should be included
Expand Down
Binary file removed resources/genotypes/EUR_pruned_variants.txt.gz
Binary file not shown.
Binary file added resources/genotypes/hm3_prune_th_hg19.bed.gz
Binary file not shown.
Binary file added resources/genotypes/hm3_prune_th_hg38.bed.gz
Binary file not shown.
86 changes: 86 additions & 0 deletions test/prune_list/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash


# Get 1kg in plink format
wget http://fileserve.mrcieu.ac.uk/ld/data_maf0.01_rs_ref.tgz
tar xzvf data_maf0.01_rs_ref.tgz

# get hm3 list
wget https://www.broadinstitute.org/files/shared/mpg/hapmap3/hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2
bzip2 -d hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map.bz2

awk '{ print $2 }' hapmap3_r1_b36_fwd_consensus.qc.poly.recode.map > hapmap3_r1_b36_fwd_consensus.qc.poly.recode.snp

# prune hm3 using 1kg data
plink2 --bfile data_maf0.01_rs_ref --extract hapmap3_r1_b36_fwd_consensus.qc.poly.recode.snp --indep-pairwise 3000 50 0.8 --out indep
wc -l indep.prune.in
plink2 --bfile data_maf0.01_rs_ref --extract indep.prune.in --out data_maf0.01_rs_ref_pruned --make-just-bim


## Prune hm3 from chromosome X

# Get 1kg vcf
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.chrX.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/integrated_call_samples_v3.20130502.ALL.panel

# Create psam
sed 1d integrated_call_samples_v3.20130502.ALL.panel | awk '{ print $1, $1, "0", "0", $4, "0" }' | sed 's/female/2/g' | sed 's/male/1/g' > 1kg_chrX.psam
head 1kg_chrX.psam

# check same number of samples
wc -l 1kg_chrX.psam
zcat ALL.chrX.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz | grep -m 1 "#CHROM" | tr "\t" "\n" | grep -E [0-9] > vcf_idlist
wc -l vcf_idlist

# Get hm3 positions etc from chromosome X
grep -wf hapmap3_r1_b36_fwd_consensus.qc.poly.recode.snp /local-scratch/data/ukb/genetic/variants/arrays/imputed/released/2018-09-18/data/snp-stats/data.chrX.snp-stats | awk '{ print $3, $4, $4, $2 }' > hm3_x.bed

# Extract from vcf and prune
plink2 --vcf ALL.chrX.phase3_shapeit2_mvncall_integrated_v1c.20130502.genotypes.vcf.gz --psam 1kg_chrX.psam --extract range hm3_x.bed --set-all-var-ids @:#:[b37]\$r,\$a --new-id-max-allele-len 161 --max-alleles 2 --indep-pairwise 3000 50 0.8 --out indep_x --split-par hg19

# Make bed
cut -d ":" -f 2 indep_x.prune.in > hm3_x_prune_pos
grep -wf hm3_x_prune_pos hm3_x.bed | awk '{ print "chr"$1, $2, $3, $4 }' > hm3_x_prune.bed


# Combine with autosomal prune list
awk '{ print "chr"$1, $4, $4, $2 }' data_maf0.01_rs_ref_pruned.bim > hm3_auto_prune_b37.bed

cat hm3_auto_prune_b37.bed hm3_x_prune.bed > hm3_prune_b37.bed

# Add tophits
> tophitsnps.txt
for f in ../../resources/genotypes/tophits/*.txt
do
awk '{ print $1 }' $f | cut -d "_" -f 1 ${prunefile} | tr ":" " " | awk '{ print "chr"$1, $2, $2, $1":"$2 }' >> tophitsnps.txt
done
cat tophitsnps.txt hm3_prune_b37.bed | sort | uniq > hm3_prune_th_b37.bed
wc -l hm3_prune_th_b37.bed
cut -d " " -f 1 hm3_prune_th_b37.bed | sort | uniq -c

# Liftover to hg38
wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOver
wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz

gunzip hg19ToHg38.over.chain.gz

./liftOver hm3_prune_th_b37.bed hg19ToHg38.over.chain hm3_prune_th_b38.bed unmapped

wc -l unmapped

sed 's/chr//g' hm3_prune_th_b37.bed > hm3_prune_th_b37_nochr.bed
sed 's/chr//g' hm3_prune_th_b38.bed > hm3_prune_th_b38_nochr.bed

gzip -c hm3_prune_th_b37_nochr.bed > ../../resources/genotypes/hm3_prune_th_hg19.bed.gz
gzip -c hm3_prune_th_b38_nochr.bed > ../../resources/genotypes/hm3_prune_th_hg38.bed.gz

head hm3_prune_th_b38_nochr.bed
head hm3_prune_th_b37_nochr.bed



# wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-common_all.vcf.gz
# wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-common_all.vcf.gz.md5
# wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-common_all.vcf.gz.tbi


7 changes: 7 additions & 0 deletions utils/check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ else
fi


# check build = hg19 or hg38
if [[ $build != "hg19" && $build != "hg38" ]]; then
echo "Error: build specified in config.env must be 'hg19' or 'hg38'"
exit 1
fi


echo "Checking genotype input list..."
nchr=$(cat ${genotype_input_list} | grep -c '^')
# Check nchr = 22 or 23
Expand Down

0 comments on commit a0afda1

Please sign in to comment.