-
Notifications
You must be signed in to change notification settings - Fork 108
/
Copy pathpathway_pipeline.py
executable file
·283 lines (235 loc) · 14.1 KB
/
pathway_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
#!/usr/bin/env python
import argparse
from importlib.metadata import version
from picrust2.pathway_pipeline import pathway_pipeline
from picrust2.util import (make_output_dir, check_files_exist,
TemporaryDirectory)
from os import path
from picrust2.default_oldIMG import default_regroup_map, default_pathway_map
default_regroup_map_old, default_pathway_map_old = default_regroup_map, default_pathway_map
from picrust2.default import default_regroup_map, default_pathway_map
parser = argparse.ArgumentParser(
description="Script to infer the presence and abundances of pathways "
"based on gene family abundances in a sample. By default, this script "
"expects a table of EC number abundances (as output by PICRUSt2). "
"However, alternative reaction to pathways mapping files can also be "
"specified. By default, EC numbers are first regrouped to MetaCyc "
"reactions, which are then linked to MetaCyc pathways through the default "
"database.\n\n\n"
"Stratified output will only be output if a stratified metagenome is "
"input (or if --per_sequence_contrib is set). "
"Please note that by default stratified abundances are based on "
"how much predicted genomes (e.g. sequences) contribute to the "
"community-wide abundance, not the abundance of the pathway based on the "
"predicted genes in that genome alone. In other words, a predicted genome "
"might be contributing greatly to the community-wide pathway abundance "
"simply because one required gene for that pathway is at extremely high "
"abundance in that genome even though no other required genes for that "
"pathway are present. In contrast, the --per_sequence_contrib option "
"should be used to get the predicted abundance and coverage of each "
"pathway based on the predicted gene families within each genome. Note "
"that using the --per_sequence_contrib option can greatly increase "
"runtime.",
epilog='''
Usage examples:
Default mapping of predicted EC number abundances to MetaCyc pathways:
pathway_pipeline.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -o pathways_out --processes 1
Mapping predicted KO abundances to legacy KEGG pathways (with stratified output that represents contributions to community-wide abundances):
pathway_pipeline.py -i KO_metagenome_out/pred_metagenome_strat.tsv.gz -o KEGG_pathways_out --no_regroup --map picrust2/picrust2/default_files/pathway_mapfiles/KEGG_pathways_to_KO.tsv
Map EC numbers to MetaCyc pathways and get stratified output corresponding to contribution of predicted gene family abundances within each predicted genome:
pathway_pipeline.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -o pathways_out_per_seq --per_sequence_contrib --per_sequence_abun EC_metagenome_out/seqtab_norm.tsv.gz --per_sequence_function EC_predicted.tsv.gz''',\
formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-i', '--input', metavar='IN_TABLE', required=True,
type=str,
help='Input TSV table of gene family abundances (either '
'the unstratified or stratified output of '
'metagenome_pipeline.py).')
parser.add_argument('-o', '--out_dir', metavar='DIRECTORY', required=True,
type=str, help='Output folder for pathway abundance output.')
parser.add_argument('-db', '--database',type=str, default = 'RV',
help='Database that is being used for the pathway calculations. Set this to oldIMG if you want to run this script with the old IMG database.')
parser.add_argument('-m', '--map', metavar='MAP', type=str,
default=default_pathway_map,
help='Mapping of pathways to reactions. The default '
'mapfile maps MetaCyc reactions to prokaryotic '
'MetaCyc pathways (default: %(default)s).')
parser.add_argument('--skip_minpath', default=False, action="store_true",
help='Do not run MinPath to identify which pathways are '
'present as a first pass (on by default).')
parser.add_argument('--no_gap_fill', default=False, action="store_true",
help='Do not perform gap filling before predicting '
'pathway abundances (Gap filling is on otherwise by '
'default.')
parser.add_argument('--intermediate', metavar='DIR', type=str, default=None,
help='Output folder for intermediate files (will be '
'deleted otherwise).')
parser.add_argument('-p', '--processes', default=1, type=int,
help='Number of processes to run in parallel '
'(default: %(default)d).')
parser.add_argument('--no_regroup', default=False, action="store_true",
help='Do not regroup input gene families to reactions '
'as specified in the regrouping mapfile.')
parser.add_argument('--coverage', default=False, action="store_true",
help='Calculate pathway coverages as well as abundances, '
'which are experimental and only useful for '
'advanced users.')
parser.add_argument('-r', '--regroup_map', metavar='ID_MAP',
default=default_regroup_map, type=str,
help='Mapfile of ids to regroup gene families to before '
'running MinPath. The default mapfile is for '
'regrouping EC numbers to MetaCyc reactions '
'(default: %(default)s).')
parser.add_argument('--per_sequence_contrib', default=False,
action="store_true",
help='Flag to specify that MinPath is run on the genes '
'contributed by each sequence (i.e. a predicted '
'genome) individually. Note this will greatly increase '
'the runtime. The output will be the predicted pathway '
'abundance contributed by each individual sequence. This '
'is in contrast to the default stratified output, which '
'is the contribution to the community-wide pathway '
'abundances. Experimental pathway coverage stratified by contributing '
'sequence will also be output when --coverage is set. '
'Options --per_sequence_contrib and '
'--per_sequence_function need to be set when this option '
'is used (default: %(default)s).')
parser.add_argument('--per_sequence_abun', metavar='PATH',
default=None, type=str,
help='Path to table of sequence abundances across samples '
'normalized by marker copy number (typically the normalized '
'sequence abundance table output at the metagenome '
'pipeline step). This input is required when the '
'--per_sequence_contrib option is set. (default: '
'%(default)s).')
parser.add_argument('--per_sequence_function', metavar='PATH',
default=None, type=str,
help='Path to table of function abundances per sequence, '
'which was outputted at the hidden-state prediction '
'step. This input is required when the '
'--per_sequence_contrib option is set. Note that '
'this file should be the same input table as used '
'for the metagenome pipeline step (default: '
'%(default)s).')
parser.add_argument('--wide_table', default=False,
action="store_true",
help='Flag to specify that wide-format stratified table '
'should be output rather than metagenome '
'contribution table. This is the deprecated method '
'of generating stratified tables since it is '
'extremely memory intensive (default: %(default)s).')
parser.add_argument('--verbose', default=False, action='store_true',
help='If specified, print out wrapped commands and other '
'details to screen.')
parser.add_argument('-v', '--version', default=False, action='version',
version="PICRUSt2 " + version('PICRUSt2'))
def main():
args = parser.parse_args()
# Check that input files exist.
check_files_exist([args.input, args.map])
gap_fill_opt = not args.no_gap_fill
run_minpath_opt = not args.skip_minpath
if args.database != 'RV':
if args.database == 'oldIMG':
if args.verbose:
print('-db is set to old so the old pathway files are being used.')
args.regroup_map, args.map = default_regroup_map_old, default_pathway_map_old
else:
if args.verbose:
print('Unknown option set for -db. Ignoring it.')
# If intermediate output directory set then create and output there.
# Otherwise make a temporary directory for the intermediate files.
if args.intermediate:
make_output_dir(args.intermediate)
unstrat_abun, \
unstrat_cov, \
strat_abun, \
strat_cov, \
path_abun_by_seq, \
path_cov_by_seq, \
unstrat_abun_per_seq = pathway_pipeline(
inputfile=args.input,
mapfile=args.map,
regroup_mapfile=args.regroup_map,
proc=args.processes,
out_dir=args.intermediate,
run_minpath=run_minpath_opt,
coverage=args.coverage,
gap_fill_on=gap_fill_opt,
no_regroup=args.no_regroup,
per_sequence_contrib=args.per_sequence_contrib,
per_sequence_abun=args.per_sequence_abun,
per_sequence_function=args.per_sequence_function,
wide_table=args.wide_table,
verbose=args.verbose)
else:
with TemporaryDirectory() as temp_dir:
unstrat_abun, \
unstrat_cov, \
strat_abun, \
strat_cov, \
path_abun_by_seq, \
path_cov_by_seq, \
unstrat_abun_per_seq = pathway_pipeline(
inputfile=args.input,
mapfile=args.map,
regroup_mapfile=args.regroup_map,
proc=args.processes,
out_dir=temp_dir,
run_minpath=run_minpath_opt,
coverage=args.coverage,
gap_fill_on=gap_fill_opt,
no_regroup=args.no_regroup,
per_sequence_contrib=args.per_sequence_contrib,
per_sequence_abun=args.per_sequence_abun,
per_sequence_function=args.per_sequence_function,
wide_table=args.wide_table,
verbose=args.verbose)
make_output_dir(args.out_dir)
# Write output files. The unstratified abundance table will always be
# written, but the other files will only be written if applicable.
unstrat_abun_outfile = path.join(args.out_dir, "path_abun_unstrat.tsv.gz")
unstrat_abun.to_csv(path_or_buf=unstrat_abun_outfile, sep="\t",
index_label="pathway", compression="gzip")
if args.coverage:
unstrat_cov_outfile = path.join(args.out_dir,
"path_cov_unstrat.tsv.gz")
unstrat_cov.to_csv(path_or_buf=unstrat_cov_outfile, sep="\t",
index_label="pathway", compression="gzip")
if strat_abun is not None:
if args.wide_table:
strat_abun_outfile = path.join(args.out_dir,
"path_abun_strat.tsv.gz")
else:
strat_abun_outfile = path.join(args.out_dir,
"path_abun_contrib.tsv.gz")
strat_abun.to_csv(path_or_buf=strat_abun_outfile, sep="\t",
index=False, compression="gzip")
if args.coverage and strat_cov is not None:
if args.wide_table:
strat_cov_outfile = path.join(args.out_dir,
"path_cov_strat.tsv.gz")
else:
strat_cov_outfile = path.join(args.out_dir,
"path_cov_contrib.tsv.gz")
strat_cov.to_csv(path_or_buf=strat_cov_outfile, sep="\t",
index=False, compression="gzip")
if path_abun_by_seq is not None:
genome_path_abun_outfile = path.join(args.out_dir,
"path_abun_predictions.tsv.gz")
path_abun_by_seq.to_csv(path_or_buf=genome_path_abun_outfile, sep="\t",
index=True, compression="gzip",
index_label="sequence")
if args.coverage and path_cov_by_seq is not None:
genome_path_cov_outfile = path.join(args.out_dir,
"path_cov_predictions.tsv.gz")
path_cov_by_seq.to_csv(path_or_buf=genome_path_cov_outfile, sep="\t",
index=True, compression="gzip",
index_label="sequence")
if unstrat_abun_per_seq is not None:
unstrat_abun_per_seq_outfile = path.join(args.out_dir,
"path_abun_unstrat_per_seq.tsv.gz")
unstrat_abun_per_seq.to_csv(path_or_buf=unstrat_abun_per_seq_outfile,
sep="\t", index_label="pathway",
compression="gzip")
if __name__ == "__main__":
main()