forked from kircherlab/CADD-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_scored.py
executable file
·71 lines (60 loc) · 2.06 KB
/
extract_scored.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python
# -*- coding: ASCII -*-
import sys, os
import pysam
from optparse import OptionParser
parser = OptionParser()
parser.add_option("-p","--path", dest="path", help="Path to scored variants.")
parser.add_option("-i","--input", dest="input", help="Read variants from vcf file (default stdin)",default=None)
parser.add_option("--found_out", dest="found_out", help="Write found variants to file (default: stdout)",default=None)
parser.add_option("--header", dest="header", help="Write full header to output (default none)",default=False, action="store_true")
(options, args) = parser.parse_args()
if options.input:
stdin = open(options.input,'r')
else:
stdin = sys.stdin
if options.found_out:
found_out = open(options.found_out,'w')
else:
found_out = sys.stdout
fpos,fref,falt = 1,2,3
if os.path.exists(options.path) and os.path.exists(options.path+".tbi"):
filename = options.path
sys.stderr.write("Opening %s...\n"%(filename))
regionTabix = pysam.Tabixfile(filename,'r')
header = list(regionTabix.header)
for line in header:
if options.header:
found_out.write(line+"\n")
try:
fref = line.split('\t').index('Ref')
falt = line.split('\t').index('Alt')
except ValueError:
pass
else:
raise IOError("No valid file with pre-scored variants.\n")
for line in stdin:
line = line.rstrip('\n\r')
if line.startswith('#'):
sys.stdout.write(line + '\n')
continue
try:
fields = line.split('\t')
found = False
chrom = fields[0]
pos = int(fields[1])
lref,allele = fields[-2],fields[-1]
for regionHit in regionTabix.fetch(chrom, pos-1, pos):
vfields = regionHit.rstrip().split('\t')
if (vfields[fref] == lref) and (vfields[falt] == allele) and (vfields[fpos] == fields[1]):
found_out.write(regionHit+"\n")
found = True
if not found:
sys.stdout.write(line + '\n')
except ValueError:
sys.stderr.write('Encountered uncovered chromosome\n')
sys.stdout.write(line + '\n')
if options.input:
stdin.close()
if options.found_out:
found_out.close()