-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplitSEQATOMs.py
executable file
·74 lines (66 loc) · 2.12 KB
/
splitSEQATOMs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Split SEQATOMs files by SEQRES and SEQATOM
File name: splitSEQATOMs.py
Author: Nicolas Palopoli
Date created: 2016/01/22
Date last modified: 2016/01/22
Python Version: 2.7
'''
import sys
from collections import OrderedDict
import csv
from Bio import SeqIO
# Read input files
try:
infasta = open(sys.argv[1])
pdb = sys.argv[2]
chain = sys.argv[3]
except IndexError:
print("Input file(s) not specified. Format: ./splitSEQATOMs.py <in.fasta> <pdb> <chain>")
exit()
except IOError:
print("Input file(s) not found. Format: ./splitSEQATOMs.py <in.fasta> <pdb> <chain>")
exit()
def readFasta(infasta,pdb,chain):
'''Store fasta sequences from file.'''
seqs = OrderedDict()
readFirstSeq = False
for line in infasta:
if line.strip() or line not in ['\n', '\r\n']: # avoid empty or only whitespace lines
line=line.rstrip() # discard newline at the end (if any)
if line[0]=='>': # or line.startswith('>'); distinguish header
if readFirstSeq: # exit if more than 2 sequences
break
readFirstSeq = True
words = line.split()
name = pdb + ':' + chain
seqs['res']=''
else : # sequence, not header, possibly multi-line
seqs['res'] = seqs['res'] + line
seqs['res'] = list(seqs['res'])
return seqs
def splitSeq(seq):
'''Split fasta sequence in SEQRES and SEQATOM'''
seqlen = len(seq['res'])
# seq['SEQRES'] = seq['res'].upper()
seq['SEQRES'] = map(lambda x:x.upper(),seq['res'])
# seq['SEQATOM'] = seq['res'].upper()
seq['SEQATOM'] = map(lambda x:x.upper(),seq['res'])
for pos in range(0,seqlen):
if seq['res'][pos].islower():
seq['SEQATOM'][pos] = '-'
return seq
# Read input file
seq = readFasta(infasta,pdb,chain)
infasta.close()
seq = splitSeq(seq)
header = '|'.join([pdb,chain])
outname = 'SEQATOM_split_' + pdb + '_' + chain + '.fasta'
outfile=open(outname, 'w+')
print >>outfile, '{}{}|SEQRES'.format('>',''.join(header))
print >>outfile, ''.join(seq['SEQRES'])
print >>outfile, '{}{}|SEQATOM'.format('>',''.join(header))
print >>outfile, ''.join(seq['SEQATOM'])
outfile.close()