-
Notifications
You must be signed in to change notification settings - Fork 5
/
CohortDataQC_final.sh
72 lines (51 loc) · 3.52 KB
/
CohortDataQC_final.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
## !/bin/sh
## A script for processing cohort data files for use in LAI with 1000 genomes reference individuals
## Written by Elizabeth Atkinson. 1/23/18
# sub-scripts for processing steps courtesy of Meng Lin and Chris Gignoux
## Required parameters:
## 1. Binary plink files for the cohort data in question. Will just have to put in the bed stem (not including file extension)
## 2. plink installed and on the path
## 3. R installed and on the path
## 4. python installed and on the path
## Result: A new set of binary plink files where all allele rsIDs are renamed to dbsnp144, sites are oriented to 1000 genomes, with non-matching sites, indels, duplicates, sex chromosomes, and triallelic sites removed. Output plink files will be the input DATA name suffixed with QC.
## Usage is CohortDataQC.sh <data-stem> <dbSNP-bedfile> <ref-panel-legend, i.e. from 1kG>
## Unpack the parameters into labelled variables
DATA=$1
DBSNP=$2
LEG=$3
#keep only the autosomes in the data file
plink --bfile $DATA --chr 1-22 --make-bed --out $DATA.auto
## Find and get rid of duplicate loci in the bim file
#then keep the good SNPs in the plink file
cut -f2,4 $DATA.auto.bim| uniq -f1 > $DATA.NonDupSNPs
cut -f2,4 $DATA.auto.bim| uniq -D -f1 > $DATA.DuplicateSNPs
cat $DATA.DuplicateSNPs | uniq -f1 > $DATA.FirstDup
cat $DATA.NonDupSNPs $DATA.FirstDup > $DATA.SNPstoKeep
#set up environment to run plink again
plink --bfile $DATA.auto --extract $DATA.SNPstoKeep --make-bed --out $DATA.auto.nodup
## Update SNP IDs to dbsnp 144
python update_rsID_bim_ega.py --bim $DATA.auto.nodup.bim --bed $DBSNP --format T --codechr F --out $DATA.auto.nodup.dbsnp.bim
#copy the other files over to this name
cp $DATA.auto.nodup.bed $DATA.auto.nodup.dbsnp.bed
cp $DATA.auto.nodup.fam $DATA.auto.nodup.dbsnp.fam
#orient to 1000G
python match_against_1000g_v2.py --bim $DATA.auto.nodup.dbsnp.bim --legend $LEG --out $DATA.1kg
# This script has three outputs: ##modified to be suffixed to allow for full paths to be input
# 1) [outfile].Indel.txt: a bim file of indels
# 2) [outfile].NonMatching.txt: a bim file containing loci not found in 1000 genome, or has different coding alleles than 1000 genome (tri-allelic, for example). They should be removed.
# 3) [outfile].FlipStrand.txt: a bim file containing loci to flip.
#combine the lists of indels and triallelic/non-matching sites into one list of bad SNPs to remove
cat $DATA.1kg.Indel.txt $DATA.1kg.NonMatching.txt > $DATA.1kg.badsites.txt
## Flip strands for flipped sites and remove non-matching loci using plink.
plink --bfile $DATA.auto.nodup.dbsnp --exclude $DATA.1kg.badsites.txt --make-bed --out $DATA.auto.nodup.dbsnp.1ksites
plink --bfile $DATA.auto.nodup.dbsnp.1ksites --flip $DATA.1kg.FlipStrand.txt --make-bed --out $DATA.auto.nodup.dbsnp.1ksites.flip
##export a warning flag if there are too many mismatched sites compared to 1000 genomes
wc -l $DATA.1kg.NonMatching.txt > nonCount
wc -l $DATA.1kg.FlipStrand.txt > flipcount
wc -l $DATA.auto.nodup.dbsnp.bim > totalsites
paste nonCount flipcount totalsites > SiteCounts
awk '{if ($1/$5 > 0.01) print "WARNING: "$1/$5*100"% of sites are problematic when compared to 1000G. This could be indicative of a different reference build or other date file incompatibility." }' SiteCounts > Warnings.out
## Find and remove A/T C/G loci
python find_cg_at_snps.py $DATA.auto.nodup.dbsnp.1ksites.flip.bim > $DATA.ATCGsites
plink --bfile $DATA.auto.nodup.dbsnp.1ksites.flip --exclude $DATA.ATCGsites --make-bed --out $DATA.QCed
#cohort data is now formatted to merge properly with the 1000G reference panel