swarm.py (forked from frederic-mahe/swarm)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Find clusters of nearly identical sequences in giant barcoding
projects.
"""
from __future__ import print_function
__author__ = "Frédéric Mahé <mahe@rhrk.uni-kl.fr>"
__date__ = "2014/11/10"
__version__ = "$Revision: 8.0"
import os
import sys
from Bio import SeqIO
from operator import itemgetter
from optparse import OptionParser
#*****************************************************************************#
#                                                                             #
#                                  Functions                                  #
#                                                                             #
#*****************************************************************************#
def option_parse():
    """Parse arguments from command line."""
    desc = """Find clusters of nearly identical amplicons in giant
    amplicon-based environmental or clinical projects. Amplicons are
    expected to be sorted by decreasing abundance."""
    parser = OptionParser(usage="usage: %prog --input_file filename",
                          description=desc,
                          version="%prog version 1.0")
    parser.add_option("-i", "--input_file",
                      metavar="<FILENAME>",
                      action="store",
                      dest="input_file",
                      help="set <FILENAME> as input fasta file.")
    parser.add_option("-b", "--boundary",
                      action="store",
                      type="int",
                      dest="boundary",
                      default=2,
                      help="Frontier between abundant/rare amplicons.")
    (options, args) = parser.parse_args()
    return options.input_file, options.boundary

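# Example invocation (the input file name is hypothetical):
#   python swarm.py --input_file amplicons.fasta --boundary 2
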
def parse_input_file(input_file):
    """Build a list of amplicons."""
    input_format = "fasta"
    amplicons = dict()
    order = list()
    with open(input_file, "rb") as input_file:
        for record in SeqIO.parse(input_file, input_format):
            seq = str(record.seq).lower()  # Convert all sequences to lowercase
            # Store 0) amplicon_id, 1) amplicon abundance, 2) amplicon
            # status, 3) swarm mass, 4) swarm seed id
            amplicons[seq] = [record.id,
                              int(record.id.split(";size=")[1]),
                              True,
                              0,
                              ""]
            order.append(seq)
    return amplicons, order

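# Illustrative input (record names are hypothetical): parse_input_file()
# expects a fasta file sorted by decreasing abundance, where each header
# carries a ";size=" annotation that is read as the amplicon abundance, e.g.
#   >amplicon_1;size=530
#   acgtacgtacgt
#   >amplicon_2;size=98
#   acgaacgtacgt
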
def output_swarms(input_file, order, amplicons, swarms, d):
    """Write swarms to a file."""
    # Create new file name
    extension = "_" + str(d) + ".swarms_2_prototype"
    output_file = os.path.splitext(os.path.abspath(input_file))[0] + extension
    with open(output_file, "w") as output_file:
        for seed in order:
            seed_id = amplicons[seed][0]
            if seed_id in swarms:
                print(" ".join(swarms[seed_id]), file=output_file)
    return

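# For example (hypothetical file name), output_swarms() writes the d = 1
# clusters of an input "samples.fasta" to "samples_1.swarms_2_prototype",
# one swarm per line (space-separated amplicon ids, seed first).
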
def produce_microvariants(seq):
    """Produce all possible micro-variants, without repetition.

    The original sequence must be made exclusively of lowercase
    nucleotides (acgt). Micro-variants have one difference
    (substitution, insertion, or deletion) with the original sequence.
    """
    nucleotides = ("a", "c", "g", "t")
    seq = list(seq)
    length = len(seq)
    microvariants = list()
    # Insertions (insert once, then shift)
    tmp = [""] + seq[:]
    for i in xrange(0, length, 1):
        for nuc in nucleotides:
            if tmp[i+1] != nuc:
                tmp[i] = nuc
                microvariants.append("".join(tmp))
        tmp[i] = tmp[i+1]
    # Insertions at the last position
    for nuc in nucleotides:
        tmp = seq[:] + [nuc]
        microvariants.append("".join(tmp))
    # Mutations and deletions
    for i in xrange(0, length, 1):
        tmp = seq[:]
        initial = tmp[i]
        for nuc in nucleotides:
            if initial != nuc:  # Avoid useless mutations
                tmp[i] = nuc
                microvariants.append("".join(tmp))
        tmp[i] = initial  # Restore the initial sequence
        # Deletions
        try:
            if tmp[i] != tmp[i+1]:  # Delete only at the end of homopolymers
                del tmp[i]
                microvariants.append("".join(tmp))
        except IndexError:  # Deletion at the last position
            del tmp[i]
            microvariants.append("".join(tmp))
    return microvariants

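# Illustrative example (not part of the original script): for the two-letter
# sequence "ac", produce_microvariants() returns 18 distinct strings at one
# difference: 10 insertions ("cac", "gac", "tac", "aac", "agc", "atc", "aca",
# "acc", "acg", "act"), 6 substitutions ("cc", "gc", "tc", "aa", "ag", "at")
# and 2 deletions ("c", "a").
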
def main():
    """Load and parse input fasta file and clusterize it."""
    # Parse command line options.
    input_file, boundary = option_parse()
    # Build a dict of amplicons and a list to preserve input order
    # (assuming decreasing abundance)
    amplicons, order = parse_input_file(input_file)
    # Store each swarm created in a dictionary: swarms[seed] = list(amplicons)
    swarms = dict()

    # Phase 1: d = 1 clustering ----------------------------------------------

    # Start swarming
    for seed in order:
        seed_id, seed_abundance, seed_status = amplicons[seed][0:3]
        if not seed_status:  # Skip amplicons already swarmed
            continue
        # Seed id and status
        swarm = [seed_id]
        amplicons[seed][2] = False  # could be seed_status = False
        amplicons[seed][4] = seed_id  # point to itself
        # Create micro-variants
        microvariants = produce_microvariants(seed)
        # Which of these microvariants are in our dataset?
        # (no need to check abundance here)
        hits = [(microvariant, amplicons[microvariant][1])
                for microvariant in microvariants
                if microvariant in amplicons
                and amplicons[microvariant][2]]
        # Isolated seed? Close the swarm
        if not hits:
            swarms[swarm[0]] = swarm
            # Add mass information
            amplicons[seed][3] = seed_abundance
            continue
        # Sort by decreasing abundance
        hits.sort(key=itemgetter(1, 0), reverse=True)
        # Add them to the swarm and update their status
        swarm.extend([amplicons[hit[0]][0] for hit in hits])
        for hit in hits:
            hit_seq = hit[0]
            amplicons[hit_seq][2] = False
            amplicons[hit_seq][4] = seed_id  # point to swarm seed
        # Work on subseeds (also save a list of hits along the way)
        all_hits = [(seed, seed_abundance)]
        all_subseeds = [hits]
        all_hits.extend(hits)
        for subseeds in all_subseeds:
            nextseeds = list()
            for subseed in subseeds:
                subseed_seq, subseed_abundance = subseed[0], subseed[1]
                # Update subseed status
                amplicons[subseed_seq][2] = False
                # Produce all microvariants of the subseed
                microvariants = produce_microvariants(subseed_seq)
                # Which of these microvariants are in our dataset?
                # (discard hits with a higher abundance value)
                hits = [(microvariant, amplicons[microvariant][1])
                        for microvariant in microvariants
                        if microvariant in amplicons
                        and amplicons[microvariant][2]
                        and amplicons[microvariant][1] <= subseed_abundance]
                if not hits:  # Subseed has no sons
                    continue
                # Sort by decreasing abundance
                hits.sort(key=itemgetter(1, 0), reverse=True)
                # Hits in nextseeds are not globally sorted at that point
                nextseeds.extend(hits)
                # Add hits to the swarm and update their status
                swarm.extend([amplicons[hit[0]][0] for hit in hits])
                all_hits.extend(hits)
                for hit in hits:
                    hit_seq = hit[0]
                    amplicons[hit_seq][2] = False
                    amplicons[hit_seq][4] = seed_id  # point to swarm seed
            if not nextseeds:  # No new subseeds, end of the all_subseeds list
                swarms[swarm[0]] = swarm  # Memorize the swarm
                mass = sum([hit[1] for hit in all_hits])
                for hit in all_hits:  # Set swarm mass value for each amplicon
                    amplicons[hit[0]][3] = mass
                break
            # Sort nextseeds globally (each layer of microvariants was only
            # locally sorted by decreasing abundance)
            nextseeds.sort(key=itemgetter(1, 0), reverse=True)
            all_subseeds.append(nextseeds)

    # Output swarms (d = 1)
    output_swarms(input_file, order, amplicons, swarms, 1)

    # Phase 2: d = 2 clustering ----------------------------------------------

    # The process seems to detect the same candidates many times. I
    # need to mark the sequences already seen to get a more precise
    # evaluation of the dynamics of recruitment. After the first phase,
    # all amplicons are marked as False. I can re-use that data
    # structure to my advantage.

    # Number of OTUs
    n_OTUs = len(swarms)
    for seed in order:
        captured_OTUs = 0
        seed_id, seed_abundance, seed_status, swarm_mass, main_seed = amplicons[seed]
        # Do not deal with rare amplicons
        if seed_abundance == boundary:
            break
        # Create micro-variants
        microvariants = produce_microvariants(seed)
        # Which of these microvariants are not in our dataset?
        fails = [microvariant
                 for microvariant in microvariants
                 if microvariant not in amplicons]
        for fail in fails:
            microvariants_2d = produce_microvariants(fail)
            # Which of these second-generation microvariants are in our
            # dataset? OTUs of size 2 or less are dust (boundary).
            hits = [(microvariant, amplicons[microvariant])
                    for microvariant in microvariants_2d
                    if microvariant in amplicons
                    and amplicons[microvariant][3] <= boundary
                    and amplicons[microvariant][2] is False]
            # Add hits to the swarm to which the seed belongs
            for hit in hits:
                amplicons[hit[0]][2] = True
                hit_id, hit_pointer = hit[1][0], hit[1][4]
                if hit_pointer in swarms:
                    swarms[main_seed] += swarms[hit_pointer]
                    del swarms[hit_pointer]
                    captured_OTUs += 1
        # Basic stats
        n_OTUs -= captured_OTUs
        print(main_seed, seed_id, seed_abundance,
              len(seed), len(microvariants),
              len(fails), captured_OTUs, n_OTUs,
              file=sys.stderr)

    # Output swarms (d = 2)
    output_swarms(input_file, order, amplicons, swarms, 2)
    return

#*****************************************************************************#
#                                                                             #
#                                    Body                                     #
#                                                                             #
#*****************************************************************************#
if __name__ == '__main__':
    main()
    sys.exit(0)

## TODO
# The algorithm could be simplified if, instead of adding mass values
# to the amplicons, I added them to the swarms dict. It would cost a
# lookup in the amplicons dict first to get the swarm seed, then a
# lookup in the swarms dict to get the mass of the swarm.
#
# During the second step, should the mass values be updated? Not sure
# it is useful.
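#
# A minimal sketch of that simplification (hypothetical layout, not used by
# this script): swarms[seed_id] could hold a [mass, members] pair, and the
# mass of the swarm an amplicon belongs to would then be read with the two
# lookups described above:
#
#   swarm_seed = amplicons[seq][4]       # amplicon -> swarm seed id
#   swarm_mass = swarms[swarm_seed][0]   # swarm seed id -> swarm mass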