-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathAugmentHaps.py
93 lines (87 loc) · 2.82 KB
/
AugmentHaps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#AugmentHaps.py
# Import necessary modules
import numpy as np
from argparse import ArgumentParser
from Modules.VCF_parser import *
from Modules.General import *
from Modules.IO import *
# Inputs:
parser = ArgumentParser()
parser.add_argument(
'-i','--inputHaps',
action="store",
dest="knownHaps",
help = "A VCF-formatted file containing the known haplotypes encoded \
in the GT field. GT must be present in the FORMAT field, and \
ploidy must be 1. ",
required=True
)
parser.add_argument(
"-n", "--newHaps",
action="store",
dest="inNewHaps",
help="A file containting the decimal identifiers for any new haplotypes \
to be added, one per line. ",
required=True
)
parser.add_argument(
'-o','--outFile',
action="store",
dest="outFile",
required=True,
help="A name for the output file. "
)
o = parser.parse_args()
#class o:
# pass
## the initial haplotypes file
#o.knownHaps = "Lascal_DeNovoAlign_d600q20_Haps_Extended.vcf"
## A file containing those haplotypes to be added, one per line
#o.inNewHaps = "NewHapsToAdd.txt"
## A name for the output file
#o.outFile = "Rerun2/WithAddedHaps.vcf"
# Load haplotypes
KnownHaps, KnownNames = toNP_array(o.knownHaps, "GT")
# Invert haplotypes so that ref allele is 1
#KnownHaps = invertArray(KnownHaps)
# Find unique haplotypes
inHapArray = ExtendHaps(KnownHaps)
# Convert to np array
# Read in new haplotypes, new haplotype names
inNewHaps = open(o.inNewHaps, "rb")
newHapNames = []
NewDecHaps = []
NewHaps = []
for iterLine in inNewHaps:
newHapNames.append("NH_%s" % iterLine.strip())
NewDecHaps.append(int(iterLine.strip()))
NewHaps = [DecHapToNPHap(NewDecHaps[x])
for x in xrange(len(NewDecHaps))]
# Append new haplotypes to old haplotypes
fullHaps = [np.copy(inHapArray[:,x]) for x in xrange(len(KnownNames))]
fullHaps.extend([invertArray(NewHaps[x].transpose()) for x in xrange(len(NewHaps))])
# Combine names fields
KnownNames.extend(newHapNames)
# Write out haplotypes
tmpVCF = vcfReader(o.knownHaps)
output3 = vcfWriter(
o.outFile,
source="AugmentHaploypes.py",
commandLine="",
baseHead=tmpVCF.headInfo,
FormatBlock=tmpVCF.headInfo["FORMAT"])
output3.writeHeader(KnownNames)
output3.setFormat("GT")
output3.importLinesInfo(
tmpVCF.getData("chrom", lineTarget="a"),
tmpVCF.getData("pos", lineTarget="a"),
tmpVCF.getData("ref", lineTarget="a"),
tmpVCF.getData("alt", lineTarget="a"),
tmpVCF.getData("qual", lineTarget="a")
)
for hapIter in xrange(len(fullHaps)):
# Add predicted SNP frequencies to VCF output
output3.importSampleValues(list(fullHaps[hapIter]), KnownNames[hapIter])
output3.writeSamples()
# Close output files
output3.close()