# orthograph.conf

#--------------------------------------------------
# # Orthograph config file
#-------------------------------------------------- 
#
# Comments start with a hash sign (#) and are ignored by the parser.
# This way you can comment out lines you don't need, but would like to keep
# in your config file because they might be important later.
#
# Settings in UPPER-CASE must be changed to your local environment. Settings
# in lower-case are sane defaults.
#
# Remember to save this file as 'orthograph.conf'.

#--------------------------------------------------
# # Important stuff first. These settings are mandatory.
#-------------------------------------------------- 

#
# # Database backend selection
#
# Database backend. Defaults to 'sqlite', which means to use the file-based
# SQLite database system. Set this to 'mysql' and adjust the MySQL settings
# below if you want to use MySQL.
database-backend   = sqlite

#
# # SQLite settings
#
# Path to the database file for SQLite. Required if you use SQLite. This is a
# plain file that can be located anywhere. It will be created automatically. By
# convention, SQLite database files carry the extension '.sqlite'.
sqlite-database    = test_data/test_set.sqlite

# Path to the sqlite executable. Normally this is located at /usr/bin/sqlite3
# (the default). Required if you use SQLite and have 'database-backend' set to
# 'sqlite'. Change this if your setup differs. 
sqlite-program     = sqlite3

# The path to your transcript data file. Must be in Fasta format. It is always
# best to use absolute paths. You may only supply one file, not a directory of
# files.
input-file         = test_data/Orussus_abietinus_transcriptome_1KITE_subset.fa

# Species name. Make sure to pick a unique name because otherwise results get
# mixed up. You can, however, supply an existing name if you want to add
# sequences to an existing data set (this has been neither tested nor
# documented yet, though).
species-name       = Orussus_abietinus

# The name of the ortholog set (not the path to a file) you created earlier.
ortholog-set       = test_set

# This is where your results are placed. Will be created if it doesn't exist.
# You should provide the path to a directory here or your output will be placed
# in your current working directory.
output-directory   = test_data/output/Orussus_abietinus

# Reference taxa. A comma-separated list of taxon names that are present in
# your ortholog set. If you don't specify any reference taxa (i.e., leave this
# setting commented out), all taxa in your set are used as reference taxa.
# Note that you must use the taxon names exactly as they are in the database.
# If unsure, you can use the list provided from `orthograph-manager --list-taxa`
# as a starting point.
#reference-taxa     = COMMA SEPARATED, LIST OF, TAXA IN, YOUR ORTHOLOG, SET


#--------------------------------------------------
# Options. These settings are optional, but you may need to change them if the
# defaults don't suit your setup.
#-------------------------------------------------- 

#
# # Paths to the programs. It's best to set absolute paths.
#
# Default alignment program; used to create the ortholog set. Must accept a
# fasta file as input and produce fasta-formatted output on STDOUT.
# Note: The --anysymbol option makes MAFFT accept any character in a sequence,
# INCLUDING '*' for stop codons and 'U' for Selenocysteine. If you are not OK
# with this, you may remove the --anysymbol option from the command, but then it
# is your responsibility to make sure that your ortholog set sequences do not
# contain any nonstandard symbols that may make MAFFT choke on your set.
#
# Standard amino acid symbols are: ACDEFGHIKLMNPQRSTVWY and X for ambiguity.
alignment-program    = mafft-linsi

# HMMbuild is used to build the profile HMMs. Part of the HMMER3 package.
hmmbuild-program     = hmmbuild

# makeblastdb is used to build the BLAST database. Part of NCBI BLAST+
makeblastdb-program  = makeblastdb

# Fastatranslate is part of the Exonerate package and translates the transcript
# sequences into all six reading frames. 
translate-program    = fastatranslate

# HMMsearch, of course. Also part of the HMMER3 package.
hmmsearch-program    = hmmsearch

# BLAST. Should be blastp from the NCBI BLAST+ package.
blast-program        = blastp

# Exonerate. Used to find ORFs.
exonerate-program    = exonerate

#
# # MySQL connection settings. Only required if you use MySQL and have
# 'database-backend' set to 'mysql'.
#
#mysql-username     = USERNAME
#mysql-password     = PASSWORD
#mysql-database     = DATABASE

# The database server. Change this if the database does not run on the same
# computer as the analysis. Ask your administrator if you don't know what to
# write here.
# Defaults to 127.0.0.1 (localhost).
#mysql-server             = 127.0.0.1

# Prefix for your Orthograph database tables. Useful if you are running
# multiple instances of Orthograph on the same database but don't want the data
# to be mixed up. Defaults to 'orthograph'.
#db-prefix = orthograph

#
# # Settings that affect the HMM and BLAST searches.
#
# Score threshold. A higher alignment score means a better match. 
# This affects both orthograph-analyzer and orthograph-reporter. 
# Default value is 10.
#hmmsearch-score-threshold = 10
#blast-score-threshold     = 10

# You can also set an e-value threshold. The HMMsearch e-value threshold
# affects the specificity of the HMM search, the first step in the reciprocal
# algorithm. It defines how distantly related candidate orthologs may be when
# searching through the transcriptome file. The BLAST e-value threshold affects
# the second step, the reciprocal search. Basically, this defines the
# false-positive probability (lower e-value = lower probability).
# This affects both orthograph-analyzer and orthograph-reporter.
# Default value is 1e-05 (0.00001).
#hmmsearch-evalue-threshold = 1e-05
#blast-evalue-threshold     = 1e-05

# Maximum number of HMMsearch hits to consider. This setting is useful to limit
# the number of reciprocal searches for very large numbers of HMMsearch hits.
# However, it also limits the scope of your HMM searches, which you normally
# don't want to do. Unless you have an idea of how many reciprocal searches are
# required to effectively verify or reject a candidate ortholog, don't change
# this setting.
# This affects only orthograph-analyzer.
# Defaults to 100.
#max-blast-searches         = 100

# Maximum number of BLAST hits to save. This affects only orthograph-analyzer.
# Defaults to 100.
#blast-max-hits             = 100

# Number of CPU threads to use. For optimum CPU efficiency, set this to the
# number of threads that your CPU(s) can run. Ask your system administrator if
# you don't know.
# This affects only orthograph-analyzer.
# Defaults to 1 (single-threaded: slow but safe)
#num-threads                = 1

# 
# # Other options
#

# Minimum transcript length. A transcript must have at least this many amino
# acids in order to be accepted for further processing. This is a way to avoid
# very small fragments, commonly due to domain walking, but you can also set it
# to a higher value if you want longer transcripts.
# This affects only orthograph-reporter.
# Defaults to 30 (a common domain length).
#minimum-transcript-length = 30

# By default, when concatenating transcripts, Orthograph does not fill the gap
# with X (amino acid sequences) or N (nucleotide sequences).
# Enable this option if you want the gap to be filled.
# This affects only orthograph-reporter.
# Defaults to 0 (off), uncomment this to turn it on.
#fill-with-x = 1

# If you do not want Orthograph to do any concatenation at all, but are only
# interested in the very best reciprocal hit, set this to 1. Only a single
# transcript will be assigned to each COG. 
# This affects only orthograph-reporter.
# Defaults to 0 (off), uncomment this to turn it on.
#brh-only = 1

# Frameshift error correction using Exonerate. Exonerate is used to
# infer ORFs and obtain a 100% corresponding nucleotide sequence for
# your predicted orthologous transcripts, but this may lead to lost
# transcripts where no ORF could be obtained.
# This affects only orthograph-reporter.
# Using Exonerate is the default behaviour; uncomment this line to turn it off.
#frameshift-correction = 0

# ORF extension. Try to extend the ORF while retaining the orthologous region.
# This makes Exonerate run an additional time with the full transcript
# sequence, and the resulting ORF must contain the original ORF.
# This affects only orthograph-reporter.
# Defaults to 0 (off), uncomment this to turn it on.
#extend-orf = 1

# Minimum ORF overlap. If extend-orf is enabled, Orthograph tries to extend
# the ORF beyond the BRH HMM alignment region. The resulting ORF must overlap
# the original HMM alignment region by this percentage. Note that the value
# must be specified as a fraction of 1.
# This affects only orthograph-reporter.
# 0.5 (50%) is the default; lower values make this less conservative.
#orf-overlap-minimum = 0.5

# If you are only interested in a few genes from your ortholog set, you can
# provide Orthograph with a list of ortholog IDs in a file. The file must
# contain exactly one ID per line, and there must be no empty lines in the file.
# This affects both orthograph-analyzer and orthograph-reporter.
#cog-list-file = /PATH/TO/FILE

# Strict search. Normally it is enough for a match to occur if one of the
# reference taxa is hit in the reciprocal search. In strict mode ALL reference
# taxa must be hit to verify an ortholog assignment. This is much more
# conservative.
# This affects only orthograph-reporter.
#strict-search = 1

# Clear pre-existing data of the same species from the database prior to the
# analysis. Recommended if you plan to run the same analysis multiple times,
# but doesn't hurt otherwise. 
# This affects only orthograph-analyzer.
# This behaviour is the default, uncomment this line if you want to turn it off.
#clear-database  = 0

# Delete old result files. This means the HMMsearch and BLAST report files found
# in the output directory. If you plan to run the same analysis multiple times
# with the same HMMsearch and BLAST settings, then don't have them deleted. This
# will speed up the process significantly, since the search programs don't have
# to be run again.
# This affects both orthograph-analyzer and orthograph-reporter.
# Uncomment this line if you want the files deleted.
#clear-files = 1

# Selenocysteine (U) may occur in some protein sequences. However, some
# alignment programs do not accept this nonstandard amino acid symbol. You can
# tell Orthograph to substitute all 'U' in the sequences with a different
# character. The default is not to substitute.
# This affects only orthograph-analyzer.
#substitute-u-with = X

# Header separator. This will be used to separate header fields in the output files.
# May be an arbitrary string. Defaults to '|'. To use a whitespace character,
# enclose it in quotes, e.g., ' '.
# This affects only orthograph-reporter.
#header-separator = |

# Verbose output. More information about the HMMsearch and BLAST hits. Normally
# you don't want to see this. If you are really interested in what Orthograph is
# thinking during the analysis, uncomment this. Verbose and quiet are mutually
# exclusive.
# This affects both orthograph-analyzer and orthograph-reporter.
#verbose = 1

# Quiet output. Uncomment this if you don't want to be bothered during the
# analysis. After starting, Orthograph will keep still until the analysis is
# complete (or unexpected things happen). Verbose and quiet are mutually
# exclusive.
# This affects both orthograph-analyzer and orthograph-reporter.
#quiet = 1


#
# # More paths
# 
# Ortholog sets directory. Useful if you would like to keep your ortholog sets
# (that is, the BLAST database, the HMMs and the alignment files for each
# ortholog gene) in a separate place. Defaults to 'sets' (in the current
# directory).
# This affects both orthograph-analyzer and orthograph-reporter.
#sets-dir = /PATH/TO/SETS/DIR

# Path to log file. If set, all messages will also be written to this file. If
# this is not set, messages are written to a log file in your output directory
# (see the setting 'output-directory' above). The log file for
# orthograph-analyzer is called orthograph-analyzer-TIMESTAMP.log, and the log
# file for orthograph-reporter is called orthograph-reporter-TIMESTAMP.log. The
# TIMESTAMP is generated in YYYY-MM-DD_HH:MM format.
# This affects both orthograph-analyzer and orthograph-reporter.
# Note that if you do set a path here, both programs write to the same file,
# meaning that orthograph-reporter will overwrite the log file of
# orthograph-analyzer. It is much more convenient to leave this unset.
#logfile = /PATH/TO/LOGFILE