-
Notifications
You must be signed in to change notification settings - Fork 108
/
Copy pathhsp.py
executable file
·178 lines (147 loc) · 8.96 KB
/
hsp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
#!/usr/bin/env python
import argparse
from importlib.metadata import version
from picrust2.wrap_hsp import castor_hsp_workflow
from picrust2.util import make_output_dir_for_file, check_files_exist
from picrust2.default_oldIMG import default_tables
from picrust2.default import default_tables_bac, default_tables_arc
HSP_METHODS = ['mp', 'emp_prob', 'pic', 'scp', 'subtree_average']
TRAIT_OPTIONS = ['16S', 'BIGG', 'CAZY', 'EC', 'GENE_NAMES', 'GO', 'KO', 'PFAM', 'COG', 'TIGRFAM', 'PHENO']
TRAIT_OPTIONS_NEW = ['16S', 'BIGG', 'CAZY', 'EC', 'GENE_NAMES', 'GO', 'KO', 'PFAM']
TRAIT_OPTIONS_OLD = ['16S', 'COG', 'EC', 'KO', 'PFAM', 'TIGRFAM', 'PHENO']
parser = argparse.ArgumentParser(
description="This script performs hidden state prediction on tips in "
"the input tree with unknown trait values. Typically this "
"script is used to predict the copy number of gene families "
"present in the predicted genome for each amplicon sequence "
"variant, given a tree and a set of known trait values. "
"This script outputs a table of trait predictions.",
epilog='''
Usage example:
hsp.py -n -t out.tre -i 16S -o 16S_predicted_and_nsti.tsv.gz --processes 1
''', formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('-t', '--tree', metavar='PATH', required=True, type=str,
help='The full reference tree in newick format containing '
'both study sequences (i.e. ASVs or OTUs) and '
'reference sequences.')
parser.add_argument('-o', '--output', metavar='PATH', type=str, required=True,
help='Output table with predicted abundances per study '
'sequence in input tree. If the extension \".gz\" '
'is added the table will automatically be gzipped.')
parser.add_argument('-i', '--in_trait', type=str.upper, choices=TRAIT_OPTIONS,
help='Specifies which default trait table should be '
'used. Use the --observed_trait_table option '
'to input a non-default trait table.')
parser.add_argument('-db', '--database', type=str, default = 'RV',
help='Database that is being used for the pathway calculations. Set this to oldIMG if you want to run this script with the old IMG database.')
parser.add_argument('-r', '--reference', type=str,
default='bac',
help='Which set of reference files to use '
'(default: %(default)s). Included options are '
'bac/bacteria and arc/archaea')
parser.add_argument('--observed_trait_table', metavar='PATH', type=str,
help='The input trait table describing directly '
'observed traits (e.g. sequenced genomes) in '
'tab-delimited format. Necessary if you want to '
'use a custom table.')
parser.add_argument('-e', '--edge_exponent', default=0.5, type=float,
help='Setting for maximum parisomony hidden-state '
'prediction. Specifies weighting transition costs '
'by the inverse length of edge lengths. If 0, then '
'edge lengths do not influence predictions. Must be '
'a non-negative real-valued number (default: '
'%(default)f).')
parser.add_argument('--chunk_size', default=500, type=int,
help='Number of functions to run at a time on one '
'processor. Note that you should consider how many '
'processes you have specified before changing this '
'option. E.g. if you specify the chunk_size to be '
'the total number of functions, 1 processor will '
'be used even if you specified more so the job will '
'be substantially slower (default: %(default)d).')
parser.add_argument('-m', '--hsp_method', default='mp',
choices=HSP_METHODS,
help='HSP method to use.' +
'"mp": predict discrete traits using max parsimony. '
'"emp_prob": predict discrete traits based on empirical '
'state probabilities across tips. "subtree_average": '
'predict continuous traits using subtree averaging. '
'"pic": predict continuous traits with phylogentic '
'independent contrast. "scp": reconstruct continuous '
'traits using squared-change parsimony (default: '
'%(default)s).')
parser.add_argument('-n', '--calculate_NSTI', default=False,
action='store_true',
help='Calculate NSTI and add to output file.')
parser.add_argument('--check', default=False, action='store_true',
help='Check input trait table before HSP.')
parser.add_argument('-p', '--processes', default=1, type=int,
help='Number of processes to run in parallel (default: '
'%(default)d).')
parser.add_argument('--seed', default=100, type=int,
help='Seed to make output reproducible, which is '
'necessary for the emp_prob method '
'(default: %(default)d).')
parser.add_argument('--verbose', default=False, action='store_true',
help='If specified, print out wrapped commands and other '
'details to screen.')
parser.add_argument('-v', '--version', default=False, action='version',
version="PICRUSt2 " + version('PICRUSt2'))
def main():
args = parser.parse_args()
# Determine which input trait table was specified. If neither a default
# or custom table was specified then throw an error.
if args.in_trait and args.observed_trait_table:
raise RuntimeError(
"Only one of the arguments --in_trait and --observed_trait_table "
"can be specified, but currently both are set.")
elif args.in_trait:
if args.database != 'RV':
if args.database == 'oldIMG':
if args.in_trait not in TRAIT_OPTIONS_OLD:
RuntimeError(
"You've chosen a train option that's not available for the oldIMG database. Available options are: "+','.join(TRAIT_OPTIONS_OLD))
trait_table = default_tables[args.in_trait]
else:
raise RuntimeError(
"Unknown option set for -db/--database. Valid options are oldIMG or RV. See the help documentation on the wiki for further information.")
else:
if args.in_trait not in TRAIT_OPTIONS_NEW:
RuntimeError(
"You've chosen a train option that's not available for the new RV database. Available options are: "+','.join(TRAIT_OPTIONS_NEW))
if args.reference in ['bac', 'bacteria']:
trait_table = default_tables_bac[args.in_trait]
elif args.reference in ['arc', 'archaea']:
trait_table = default_tables_arc[args.in_trait]
else:
raise RuntimeError(
"--in_trait has been set but an unknown reference has been given. "
"Included options for -r are bacteria or archaea.")
elif args.observed_trait_table:
trait_table = args.observed_trait_table
else:
raise RuntimeError(
"A default input trait table needs to be specified with the "
"--in_trait option, or alternatively a custom table can be "
"specified with the --observed_trait_table option")
# Check that input filenames exist.
check_files_exist([args.tree, trait_table])
# No longer support outputting CIs with this script.
ci_setting = False
hsp_table, ci_table = castor_hsp_workflow(tree_path=args.tree,
trait_table_path=trait_table,
hsp_method=args.hsp_method,
edge_exponent=args.edge_exponent,
chunk_size=args.chunk_size,
calc_nsti=args.calculate_NSTI,
calc_ci=ci_setting,
check_input=args.check,
num_proc=args.processes,
ran_seed=args.seed,
verbose=args.verbose)
# Output the table to file.
make_output_dir_for_file(args.output)
hsp_table.to_csv(path_or_buf=args.output, index_label="sequence",
sep="\t", compression="infer")
if __name__ == "__main__":
main()