-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKinasePseudoKinase.py
executable file
·81 lines (63 loc) · 2.73 KB
/
KinasePseudoKinase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import sys
from collections import defaultdict, Counter
path = '/home/gene/Orthonectida/Projects/WTK/data/Filtred_Fa/V_Corymbosum/tandem_all_MatchDataset_DomainName_V_corymbosum_RLK-Pelle_DLSV.SeqCharReplace.aln.fa'
outpath = '/home/gene/Orthonectida/Projects/WTK/data/Filtred_Fa/V_Corymbosum/tandem_all_MatchDataset_DomainName_V_corymbosum_RLK-Pelle_DLSV.SeqCharReplace_DifferenceFromMostOccured.tsv'
classification = {"R": 'Charge+',
"H": 'Charge+',
"K": 'Charge+',
"D": 'Charge-',
"E": 'Charge-',
"S": 'Uncharge',
"T": 'Uncharge',
"N": 'Uncharge',
"Q": 'Uncharge',
"C": 'SpecialCases',
"U": 'SpecialCases',
"G": 'SpecialCases',
"P": 'SpecialCases',
'A': 'Hydrophobic',
'I': 'Hydrophobic',
'L': 'Hydrophobic',
'M': 'Hydrophobic',
'F': 'Hydrophobic',
'W': 'Hydrophobic',
'Y': 'Hydrophobic',
'V': 'Hydrophobic',
'-': 'None'}
with open(path) as f:
seqposdict = defaultdict(list)
seqnamedict = defaultdict(str)
seqnames = []
seq = ''
for line in f:
if line.startswith('>'):
if seq:
for pos, letter in enumerate(seq):
seqposdict[pos + 1].append(letter)
seqnamedict[seqname] = seq
seqname = line.strip('>').split()[0]
seqnames.append(seqname)
seq = ''
continue
if line.strip():
seq += line.strip()
# add seq info of last sample
for pos, letter in enumerate(seq):
seqposdict[pos + 1].append(letter)
reference = defaultdict(list)
[reference[i].append(sorted(Counter(seqposdict[i]).items(), key=lambda x: x[1], reverse = True)[0][0]) for i in seqposdict]
refseq = ''.join([i[1][0] for i in sorted(reference.items(), key=lambda x: x[0])])
seqpenaltydict = defaultdict(int)
for name, seq in seqnamedict.items():
seqpenaltydict[name] = 0
# simple implementation == or != reference metric
for aa, refaa in zip(seq, refseq):
if aa != refaa:
# Penalize on 0.25 if sinonymous change and for 1 in other cases
if classification[aa] == classification[refaa]:
seqpenaltydict[name] += 0.25
else:
seqpenaltydict[name] += 1
res_sorted = sorted(seqpenaltydict.items(), key = lambda x: x[1])
with open(outpath, 'w') as w:
[w.write(i[0] + '\t' + str(i[1]) + '\n') for i in res_sorted]