-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakeDiploidSimpleFromVCF.py
executable file
·50 lines (42 loc) · 1.67 KB
/
MakeDiploidSimpleFromVCF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import sys
from collections import Counter
##########
# Pos args
# 1) path to VCF file(not gzipped)
filepath = sys.argv[1]
# 2) output vcf file
outpath = sys.argv[2]
##########
with open(filepath) as f, open(outpath, 'w') as w:
for line in f:
# skip comment lines
if line.startswith('#'):
continue
# skip not biallelic SNPs
elif len(line.split('\t')[4].split(',')) > 1:
continue
else:
spltline = line.rstrip().split('\t')
spltline[8] = 'GT'
# heterozyg = False
for i, sample in enumerate(spltline[9:]):
gt = sample.split(':')
if '/' in gt[0]:
haplotype = gt[0].split('/')
elif '|' in gt[0]:
haplotype = gt[0].split('|')
else:
print('Unexpected separator of alleles', gt[0])
# Записываю всех гетерозигот, в режими unphased у LDhat не должно быть проблем
if haplotype[0] != haplotype[1]:
# print('Warning! There are heterozygotes in the data!')
# heterozyg = True
# break
spltline[i + 9] = '/'.join(haplotype)
else:
spltline[i + 9] = '/'.join(haplotype) # оставляем только один
# Пропускаем строки с менее чем 4 данными:
treshold = len(spltline[9:]) - 4
counts = Counter(spltline[9:])
if counts['./.'] <= treshold:
w.write('\t'.join(spltline) + '\n')