-
Notifications
You must be signed in to change notification settings - Fork 5
/
orthograph.conf
291 lines (247 loc) · 12.5 KB
/
orthograph.conf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#--------------------------------------------------
# # Orthograph config file
#--------------------------------------------------
#
# Comments start with a hash sign (#) and are ignored by the parser.
# This way you can comment out lines you don't need, but would like to keep
# in your config file because they might be important later.
#
# Settings in UPPER-CASE must be changed to your local environment. Settings
# in lower-case are sane defaults.
#
# Remember to save this file as 'orthograph.conf'.
#--------------------------------------------------
# # Important stuff first. These settings are mandatory.
#--------------------------------------------------
#
# # Database backend selection
#
# Database backend. Defaults to 'sqlite', which means to use the file-based
# SQLite database system. Set this to 'mysql' and adjust the MySQL settings
# below if you want to use MySQL.
database-backend = sqlite
#
# # SQLite settings
#
# Path to the database file for SQLite. Required if you use SQLite. This is a
# plain file that can be located anywhere. It will be created automatically. By
# convention, SQLite database files carry the extension '.sqlite'.
# NOTE(review): the value below is a relative path; presumably it is resolved
# against the directory Orthograph is started from -- confirm.
sqlite-database = test_data/test_set.sqlite
# Path to the sqlite executable. Normally this is located at /usr/bin/sqlite3
# (the default). Required if you use SQLite, i.e., if 'database-backend' is set
# to 'sqlite'. Change this if your setup differs.
# NOTE(review): the value below is a bare program name, not a path; presumably
# it is looked up on your PATH -- confirm.
sqlite-program = sqlite3
#
# # Input, ortholog set, and output settings
#
# The path to your transcript data file. Must be in Fasta format. It is always
# best to use absolute paths. You may only supply one file, not a directory of
# files.
# NOTE(review): the value below is a relative path, contrary to the advice
# above; replace it with an absolute path for anything but a local test run.
input-file = test_data/Orussus_abietinus_transcriptome_1KITE_subset.fa
# Species name. Make sure to pick a unique name because otherwise results get
# mixed up. You can, however, supply an existing name if you want to add
# sequences to an existing data set (this has been neither tested nor
# documented yet, though).
species-name = Orussus_abietinus
# The name of the ortholog set (not the path to a file) you created earlier.
ortholog-set = test_set
# This is where your results are placed. The directory will be created if it
# doesn't exist. You should provide the path to a directory here; otherwise
# your output will be placed in your current working directory.
output-directory = test_data/output/Orussus_abietinus
# Reference taxa. A comma-separated list of taxon names that are present in
# your ortholog set. If you don't specify any reference taxa (i.e., leave this
# setting commented out), all taxa in your set are used as reference taxa.
# Note that you must use the taxon names exactly as they are in the database.
# If unsure, you can use the list provided from `orthograph-manager --list-taxa`
# as a starting point.
#reference-taxa = COMMA SEPARATED, LIST OF, TAXA IN, YOUR ORTHOLOG, SET
#--------------------------------------------------
# Options. These settings are optional, but may be required if the defaults
# are not suitable for your setup.
#--------------------------------------------------
#
# # Paths to the programs. It's best to set absolute paths.
# NOTE(review): every value in this group is a bare program name rather than an
# absolute path; presumably these are looked up on your PATH -- confirm.
#
# Default alignment program; used to create the ortholog set. Must accept a
# fasta input file as input and produce fasta-formatted output on STDOUT.
# Note: The --anysymbol option makes MAFFT accept any character in a sequence,
# INCLUDING '*' for stop codons and 'U' for Selenocysteine. If you are not OK
# with this, you may remove the --anysymbol option from the command, but then it
# is your responsibility to make sure that your ortholog set sequences do not
# contain any nonstandard symbols that may make MAFFT choke on your set.
# NOTE(review): the command configured below does NOT include --anysymbol, so
# the caveat about nonstandard symbols applies as-is. If you want MAFFT to
# accept them, append the option to the value (presumably
# 'mafft-linsi --anysymbol') -- confirm.
#
# Standard amino acid symbols are: ACDEFGHIKLMNPQRSTVWY and X for ambiguity.
alignment-program = mafft-linsi
# HMMbuild is used to build the profile HMMs. Part of the HMMER3 package.
hmmbuild-program = hmmbuild
# makeblastdb is used to build the BLAST database. Part of NCBI BLAST+.
makeblastdb-program = makeblastdb
# Fastatranslate is part of the Exonerate package and translates the transcript
# sequences into all six reading frames.
translate-program = fastatranslate
# HMMsearch is used for the HMM search, the first step of the reciprocal
# search. Also part of the HMMER3 package.
hmmsearch-program = hmmsearch
# BLAST is used for the second step, the reciprocal search. Should be blastp
# from the NCBI BLAST+ package.
blast-program = blastp
# Exonerate. Used to find ORFs.
exonerate-program = exonerate
#
# # MySQL connection settings. Only required if you use MySQL and have
# 'database-backend' set to 'mysql'.
#
#mysql-username = USERNAME
#mysql-password = PASSWORD
#mysql-database = DATABASE
# The database server. Change this if the database does not run on the same
# computer as the analysis. Ask your administrator if you don't know what to
# write here.
# Defaults to 127.0.0.1 (localhost).
#mysql-server = 127.0.0.1
# Prefix for your Orthograph database tables. Useful if you are running
# multiple instances of Orthograph on the same database but don't want the data
# to be mixed up. Defaults to 'orthograph'.
#db-prefix = orthograph
#
# # Settings that affect the HMM and BLAST searches.
#
# Score threshold. A higher alignment score means a better match.
# This affects both orthograph-analyzer and orthograph-reporter.
# Default value is 10.
#hmmsearch-score-threshold = 10
#blast-score-threshold = 10
# You can also set an e-value threshold. The HMMsearch e-value threshold
# affects the specificity of the HMM search, the first step in the reciprocal
# algorithm. It defines how distantly related candidate orthologs may be when
# searching through the transcriptome file. The BLAST e-value threshold affects
# the second step, the reciprocal search. Basically, this defines the
# false-positive probability (lower e-value = lower probability).
# This affects both orthograph-analyzer and orthograph-reporter.
# Default value is 1e-05 (0.00001).
#hmmsearch-evalue-threshold = 1e-05
#blast-evalue-threshold = 1e-05
# Maximum number of HMMsearch hits to consider. This setting is useful to limit
# the number of reciprocal searches for very large numbers of HMMsearch hits.
# However, it also limits the scope of your HMM searches, which you normally
# don't want to do. Unless you have an idea of how many reciprocal searches are
# required to effectively verify or reject a candidate ortholog, don't change
# this setting.
# This affects only orthograph-analyzer.
# Defaults to 100.
#max-blast-searches = 100
# Maximum number of BLAST hits to save. This affects only orthograph-analyzer.
# Defaults to 100.
#blast-max-hits = 100
# Number of CPU threads to use. For optimum CPU efficiency, set this to the
# number of threads that your CPU(s) can run. Ask your system administrator if
# you don't know.
# This affects only orthograph-analyzer.
# Defaults to 1 (single-threaded: slow but safe)
#num-threads = 1
#
# # Other options
#
# Minimum transcript length. A transcript must have at least this many amino
# acids in order to be accepted for further processing. This is a way to avoid
# very small fragments, commonly due to domain walking, but you can also set it
# to a higher value if you want longer transcripts.
# This affects only orthograph-reporter.
# Defaults to 30 (a common domain length).
#minimum-transcript-length = 30
# When concatenating transcripts, Orthograph by default does not fill the gap
# between them with X (amino acid sequences) or N (nucleotide sequences).
# Enable this option if you want the gap to be filled.
# This affects only orthograph-reporter.
# Defaults to 0 (off), uncomment this to turn it on.
#fill-with-x = 1
# If you do not want Orthograph to do any concatenation at all, but are only
# interested in the very best reciprocal hit, set this to 1. Only a single
# transcript will be assigned to each COG.
# This affects only orthograph-reporter.
# Defaults to 0 (off), uncomment this to turn it on.
#brh-only = 1
# Frameshift error correction using Exonerate. Exonerate is used to
# infer ORFs and obtain a nucleotide sequence that corresponds 100% to
# your predicted orthologous transcripts, but this may lead to lost
# transcripts where no ORF could be obtained.
# This affects only orthograph-reporter.
# Using Exonerate is the default behaviour; uncomment this line to turn it off.
#frameshift-correction = 0
# ORF extension. Try to extend the ORF while retaining the orthologous region.
# This makes Exonerate run an additional time with the full transcript
# sequence, and the resulting ORF must contain the original ORF.
# This affects only orthograph-reporter.
# Defaults to 0 (off), uncomment this to turn it on.
#extend-orf = 1
# Minimum ORF overlap. If extend-orf is enabled, Orthograph tries to extend
# the ORF beyond the BRH HMM alignment region. The resulting ORF must overlap
# the original HMM alignment region by at least this proportion. Note that the
# value must be specified as a fraction of 1, not as a percentage.
# This affects only orthograph-reporter.
# 0.5 (50%) is the default; lower values make this less conservative.
#orf-overlap-minimum = 0.5
# If you are only interested in a few genes from your ortholog set, you can
# provide Orthograph with a list of ortholog IDs in a file. The file must
# contain one ID per line, and there must be no empty lines in the file.
# This affects both orthograph-analyzer and orthograph-reporter.
#cog-list-file = /PATH/TO/FILE
# Strict search. Normally it is enough for a match to occur if one of the
# reference taxa is hit in the reciprocal search. In strict mode ALL reference
# taxa must be hit to verify an ortholog assignment. This is much more
# conservative.
# This affects only orthograph-reporter.
#strict-search = 1
# Clear pre-existing data of the same species from the database prior to the
# analysis. Recommended if you plan to run the same analysis multiple times,
# but doesn't hurt otherwise.
# This affects only orthograph-analyzer.
# This behaviour is the default; uncomment this line if you want to turn it off.
#clear-database = 0
# Delete old result files. This means the HMMsearch and BLAST report files found
# in the output directory. If you plan to run the same analysis multiple times
# with the same HMMsearch and BLAST settings, then don't have them deleted. This
# will speed up the process significantly, since the search programs don't have
# to be run again.
# This affects both orthograph-analyzer and orthograph-reporter.
# Uncomment this line if you want the files deleted.
#clear-files = 1
# Selenocysteine (U) may occur in some protein sequences. However, some
# alignment programs do not accept this nonstandard amino acid symbol. You can
# tell Orthograph to substitute all 'U' in the sequences with a different
# character. The default is not to substitute.
# This affects only orthograph-analyzer.
#substitute-u-with = X
# Header separator. This will be used to separate header fields in the output files.
# May be an arbitrary string. Defaults to '|'. To use a whitespace character,
# enclose it in quotes, e.g., ' '.
# This affects only orthograph-reporter.
#header-separator = |
# Verbose output. More information about the HMMsearch and BLAST hits. Normally
# you don't want to see this. If you are really interested in what Orthograph is
# thinking during the analysis, uncomment this. Verbose and quiet are mutually
# exclusive.
# This affects both orthograph-analyzer and orthograph-reporter.
#verbose = 1
# Quiet output. Uncomment this if you don't want to be bothered during the
# analysis. After starting, Orthograph will keep still until the analysis is
# complete (or unexpected things happen). Verbose and quiet are mutually
# exclusive.
# This affects both orthograph-analyzer and orthograph-reporter.
#quiet = 1
#
# # More paths
#
# Ortholog sets directory. Useful if you would like to keep your ortholog sets
# (that is, the BLAST database, the HMMs and the alignment files for each
# ortholog gene) in a separate place. Defaults to 'sets' (in the current
# directory).
# This affects both orthograph-analyzer and orthograph-reporter.
#sets-dir = /PATH/TO/SETS/DIR
# Path to log file. If set, all messages will also be written to this file. If
# this is not set, messages are written to a log file in your output directory
# (see the setting 'output-directory' above). The log file for
# orthograph-analyzer is called orthograph-analyzer-TIMESTAMP.log, and the log
# file for orthograph-reporter is called orthograph-reporter-TIMESTAMP.log. The
# TIMESTAMP is generated in YYYY-MM-DD_HH:MM format.
# This affects both orthograph-analyzer and orthograph-reporter.
# Note that if you do set a path here, both programs write to the same file,
# meaning that orthograph-reporter will overwrite the log file of
# orthograph-analyzer. It is much more convenient to leave this unset.
#logfile = /PATH/TO/LOGFILE