-
Notifications
You must be signed in to change notification settings - Fork 1
/
old_fasta_py.txt
95 lines (72 loc) · 2.17 KB
/
old_fasta_py.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def read_fasta_header(f):
    '''
    Read the next FASTA header line from f.

    The file must be positioned at the start of a record or at end of file.

    Returns None at end of file. Otherwise returns a dict whose 'header'
    key holds the full header line with the leading '>' and surrounding
    whitespace removed. The header text is not parsed any further.

    Raises AssertionError if the line read does not start with '>'.
    '''
    line = f.readline()

    # an empty string from readline() means end of file
    if line == '':
        return None

    # Raise explicitly instead of using an assert statement: asserts are
    # removed when Python runs with -O, which would let malformed input
    # through unchecked. AssertionError is kept so that existing callers
    # continue to see the same exception type and message.
    if not line.startswith('>'):
        raise AssertionError('expecting fasta header: %s' % line)

    # header is the complete line, minus the '>' and surrounding whitespace
    return {'header': line[1:].strip()}
def next_seqline(f):
    '''
    Generator yielding the sequence lines of the current record, one
    at a time, as returned by read_seqline(f).

    Iteration stops as soon as read_seqline returns None.
    '''
    # iter(callable, sentinel) keeps calling read_seqline(f) and stops
    # at the first result that compares equal to None
    for seqline in iter(lambda: read_seqline(f), None):
        yield seqline
def read_seqline(f):
    '''
    Return the next non-blank sequence line from f, cleaned up.

    Blank lines are skipped. Leading/trailing whitespace is stripped and
    any remaining spaces or tabs inside the line are removed.

    Returns None at end of file or when the next line is a fasta header
    (starts with '>'); in both cases the file position is restored to
    where it was before that line was read, so the header can be read
    again by the caller.
    '''
    while True:
        # remember where this line starts so we can rewind to it
        mark = f.tell()
        raw = f.readline()

        if raw != '' and not raw.startswith('>'):
            # a line that is non-empty after strip() begins with a
            # non-whitespace character, so dropping spaces and tabs
            # afterwards cannot empty it
            cleaned = raw.strip().replace(' ', '').replace('\t', '')
            if cleaned:
                return cleaned
            # blank line: go round again
            continue

        # end of file or start of the next record: un-read the line
        f.seek(mark)
        return None
def read_fasta(f, storeseq=True):
    '''
    Read the next fasta record from f.

    The file must be positioned at the start of a record or at end of
    file.

    Returns None at end of file. Otherwise returns the dict produced by
    read_fasta_header with two more keys added:
      'seq' - the concatenated sequence lines, or None if storeseq is
              false
      'len' - the total length of the sequence lines
    '''
    record = read_fasta_header(f)

    # nothing left to read
    if record is None:
        return None

    if not storeseq:
        # only measure the sequence; do not keep it in memory
        record['seq'] = None
        record['len'] = sum(len(chunk) for chunk in next_seqline(f))
        return record

    # gather every sequence line of this record into one string
    sequence = ''.join(next_seqline(f))
    record['seq'] = sequence
    record['len'] = len(sequence)
    return record