-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter-vcf.py
105 lines (97 loc) · 4.06 KB
/
filter-vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/python3
import sys
import argparse
parser = argparse.ArgumentParser(prog='hwe.py', description=__doc__)
parser.add_argument('min_an', metavar='AN', help="minimum number of paths a variant must be covered.")
parser.add_argument('min_an_male', metavar='AN_MALE', help="minimum number of paths a variant must be covered when males are homozygous.")
parser.add_argument('min_an_female', metavar='AN_FEMALE', help="minimum number of paths a variant must be covered on chrY.")
parser.add_argument('--chromosomes', metavar='CHROMOSOMES', nargs='+', default=[], help='Only select these chromosomes.')
args = parser.parse_args()
min_an = int(args.min_an)
min_an_male = int(args.min_an_male)
min_an_female = int(args.min_an_female)
total_variants = 0
total_alleles = 0
#total_lv = 0
#total_lv_alleles = 0
total_ac = 0
total_ac_alleles = 0
total_an = 0
total_an_alleles = 0
total_chrom = 0
total_chrom_alleles = 0
total_ns = 0
total_ns_alleles = 0
total_missing_alt = 0
total_missing_alt_alleles = 0
total_filtered = 0
total_filtered_alleles = 0
samples = []
for line in sys.stdin:
fields = line.split()
if line.startswith('##'):
print(line[:-1])
continue
if line.startswith('#'):
# rename chm13 to CHM13
fixed = ['CHM13' if i == 'chm13' else i for i in fields]
print('\t'.join(fixed).strip())
# keep samples
samples = fields[9:]
continue
chromosome = fields[0]#.split('.')[-1]
an_threshold = min_an
if 'X' in chromosome:
an_threshold = min_an_male
elif 'Y' in chromosome:
an_threshold = min_an_female
select_chromosome = (chromosome in args.chromosomes) or (args.chromosomes == [])
total_variants += 1
info_fields = {a.split('=')[0] : a.split('=')[1].strip() for a in fields[7].split(';') if '=' in a}
nr_alt_alleles = len(fields[4].split(','))
total_alleles += nr_alt_alleles
assert 'AN' in info_fields
assert 'AC' in info_fields
ac = sum([int(i) for i in info_fields['AC'].split(',')])
an = int(info_fields['AN'])
no_n = not 'N' in fields[3] and not 'N' in fields[4]
no_missing_alt = all(c in 'CAGTcagt,' for c in fields[4])
# for diploid samples (all except CHM13), replace "." genotype by ".|."
for i in range(len(fields[9:])):
if (fields[i+9] == '.') and not (samples[i] == 'chm13'):
fields[i+9] = '.|.'
if no_n:
total_ns += 1
total_ns_alleles += nr_alt_alleles
if ac > 0:
total_ac += 1
total_ac_alleles += nr_alt_alleles
if an >= an_threshold:
total_an += 1
total_an_alleles += nr_alt_alleles
if select_chromosome:
total_chrom += 1
total_chrom_alleles += nr_alt_alleles
if no_missing_alt:
total_missing_alt += 1
total_missing_alt_alleles += nr_alt_alleles
if (an >= an_threshold) and select_chromosome and no_n and no_missing_alt and (ac > 0):
fields[0] = chromosome
outline = '\t'.join(fields).strip()
total_filtered += 1
total_filtered_alleles += nr_alt_alleles
print(outline)
sys.stderr.write('total number of variants: ' + str(total_variants) + '\n')
sys.stderr.write('total number of alleles: ' + str(total_alleles) + '\n')
sys.stderr.write('number of variants AN>=' + str(min_an) + ': ' + str(total_an) + '\n')
sys.stderr.write('number of alleles AN>=' + str(min_an) + ': ' + str(total_an_alleles) + '\n')
sys.stderr.write('number of variants AC>0: ' + str(total_ac) + '\n')
sys.stderr.write('number of alleles AC>0: ' + str(total_ac_alleles) + '\n')
sys.stderr.write('number of variants on selected chromosomes: ' + str(total_chrom) + '\n')
sys.stderr.write('number of alleles on selected chromosomes: ' + str(total_chrom_alleles) + '\n')
sys.stderr.write('number of variants not containing Ns: ' + str(total_ns) + '\n')
sys.stderr.write('number of alleles not containing Ns: ' + str(total_ns_alleles) + '\n')
sys.stderr.write('number of variants not containing missing ALTs: ' + str(total_missing_alt) + '\n')
sys.stderr.write('number of alleles not containing missing ALTs: ' + str(total_missing_alt_alleles) + '\n')
sys.stderr.write('total number of variants in output (AN>=' + str(min_an) + ', on chromosomes): ' + str(total_filtered) + '\n')
sys.stderr.write('total number of alleles in output: ' + str(total_filtered_alleles) + '\n')