-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathVCF_Sanger_to_Psuedomolecule.py
123 lines (114 loc) · 4.81 KB
/
VCF_Sanger_to_Psuedomolecule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
# Creates a really crude VCF file given a FASTA alignment
# Assumes that the reference sequence is the first sequence in the alignment
# To take arguments
import sys
# To handle ranges and getting elements
import itertools
import operator
import re
# To read FASTA
from Bio import SeqIO
alignment = sys.argv[1]
sequences = list(SeqIO.parse(alignment, 'fasta'))
# The reference is the first sequence in the alignment
reference = sequences[0]
# And the name of the 'chromosome' is the contig name
string_name = reference.name
chrom_name = re.split(':|-', string_name)[0]
start_pos = int(re.split(':|-', string_name)[1]) + 2 # Add 2 because to convert from 0-based BED to 1-based VCF
#sys.exit(str(start_pos))
# The VCF header
vcf_header="""##fileformat=VCFv4.1
##INFO=<ID=INDEL,Number=0,Type=Flag,Description="Variant is an INDEL">
#CHROM POS ID REF ALT QUAL FILTER INFO
"""
# We then strip out just the sequence, leaving names etc. behind
# Convert to a list, so we can save column information, too
raw_seqs = [list(x.seq) for x in sequences]
# Transpose the matrix, to iterate over columns instead of rows
columns = zip(*raw_seqs)
# Now, we actually find indels and SNP positions
indel_positions = []
snp_positions = []
# For each column
for index, base in enumerate(columns):
# Convert to a set, which gives us how many states in each column
tmp = set(base)
# Then we throw out Ns
tmp.discard('N')
# this is a little ugly, but it works
if '-' in tmp:
indel_positions.append(index)
# Now, we discard the gaps, and count up SNPs
tmp.discard('-')
# If there is more than one base, after throwing out gaps and Ns,
# then we have a SNP
if len(tmp) > 1:
snp_positions.append(index)
# Now, we have a list of positions that are covered by SNPs and indels
# We go through the indels and group them by consecutive ranges
# This identifies runs of consecutive integers, and puts them into separate lists
# for group_key, group in (0, pos1) (1, pos2) (2, pos3) ...
# group by the value of (pos - index)
# if they are the same, then the numbers are consecutive
indels = []
for key, group in itertools.groupby(enumerate(indel_positions), lambda (i,x):i-x):
# save the second value, which is the actual group
indels.append(map(operator.itemgetter(1), group))
# Now that we have a list of actual indels, we iterate over that, and save
# the states that aren't gaps. We also save the reference state
indelfilename = alignment.replace('.fasta', '_INDELs.vcf')
handle = open(indelfilename, 'w')
handle.write(vcf_header)
for i in indels:
# First, we have to get the base before the indel
# to save it in the VCF
# Since this isn't in the indel positions, it should never be a gap
before_indel = i[0]-1
before_indel_base = reference[before_indel]
# Then, we get the reference and alternate states
ref_allele = []
alt_allele = []
for position in i:
ref_allele.append(reference[position])
# Get the alignment column, throw out Ns
states = set(columns[position])
states.discard('N')
# Then get the one that isn't the reference base
if reference[position] == '-':
alt_base = [s for s in states if s != reference[position]][0]
alt_allele.append(alt_base)
else:
alt_allele.append('-')
# Now we have the reference allels and the alternate alleles
# We take out the gaps and find out which is 'longer'
ref_allele = [x for x in ref_allele if x != '-']
alt_allele = [x for x in alt_allele if x != '-']
# If the reference is longer, then we fill it with Ns, and truncate alt
if len(ref_allele) > len(alt_allele):
reference_state = before_indel_base + 'N'*len(i)
alternate_state = before_indel_base
else:
# Otherwise, truncate ref, and fill the alt with N
reference_state = before_indel_base
alternate_state = before_indel_base + 'N'*len(i)
entry = '\t'.join([chrom_name, str(before_indel + start_pos), '.', reference_state, alternate_state, '.', '.', 'INDEL'])
handle.write(entry + '\n')
handle.close()
# Now, we make the SNP VCF files
snpfilename = alignment.replace('.fasta', '_SNPs.vcf')
handle = open(snpfilename, 'w')
handle.write(vcf_header)
for position in snp_positions:
states = set(columns[position])
states.discard('N')
states.discard('-')
reference_allele = reference[position]
alternate_allele = [x for x in states if x != reference_allele][0]
# if it is involved in an indel, then we don't want it
if reference_allele == '-' or alternate_allele == '-':
continue
entry = '\t'.join([chrom_name, str(position + start_pos), '.', reference_allele, alternate_allele, '.', '.', '.'])
handle.write(entry + '\n')
handle.close()