Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support blast and diamond blastp for ortholog identification #643

Merged
merged 15 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions jcvi/apps/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,13 @@ def run_formatdb(infile=None, outfile=None, dbtype="nucl"):
sh(cmd)


@depends
def run_diamond_makedb(infile=None, outfile=None):
cmd = "diamond makedb"
cmd += " --in {0} --db {1} -p 5".format(infile, infile)
sh(cmd)


@depends
def run_blat(
infile=None,
Expand Down Expand Up @@ -586,5 +593,126 @@ def last(args, dbtype=None):
return lastfile


def blast_main(args, dbtype=None):
"""
%prog database.fasta query.fasta

Run blastp/blastn by calling BLAST+ blastp/blastn depends on dbtype.
"""
p = OptionParser(blast_main.__doc__)
p.add_option(
"--dbtype",
default="nucl",
choices=("nucl", "prot"),
help="Molecule type of subject database",
)
p.add_option("--path", help="Specify BLAST path for blastn or blastp")

p.set_cpus()
p.set_outdir()
p.set_params()

opts, args = p.parse_args(args)

if len(args) != 2:
sys.exit(not p.print_help())

subject, query = args
path = opts.path
cpus = opts.cpus
if not dbtype:
dbtype = opts.dbtype

getpath = lambda x: op.join(path, x) if path else x
cmd = "blastn" if dbtype == "nucl" else "blastp"
lastdb_bin = getpath("makeblastdb")
lastal_bin = getpath(cmd)
for bin in (lastdb_bin, lastal_bin):
if not which(bin):
logging.fatal("`%s` not found on PATH. Have you installed BLAST?", bin)
sys.exit(1)

db_suffix = '.nin' if dbtype == "nucl" else ".pin"

run_formatdb(
infile=subject,
outfile=subject + db_suffix,
dbtype=dbtype)

blastfile = get_outfile(subject, query, suffix="last", outdir=opts.outdir)
# Make several attempts to run LASTAL
try:
sh(
cmd + f" -num_threads {cpus} -query {query} -db {subject} -out {blastfile}" +
" -outfmt 6 -max_target_seqs 1000 -evalue 1e-5",
check=False,
redirect_error=STDOUT,
)
except CalledProcessError as e: # multi-threading disabled
message = f"{cmd} failed with message:"
message += "\n{0}".format(e.output.decode())
logging.error(message)
logging.fatal("Failed to run `blast`. Aborted.")
cleanup(blastfile)
sys.exit(1)
return blastfile


def diamond_blastp_main(args, dbtype="prot"):
"""
%prog database.fasta query.fasta

Run diamond blastp for protein alignment.
"""
p = OptionParser(diamond_blastp_main.__doc__)

p.add_option("--path", help="Specify diamond path for diamond blastp")

p.set_cpus()
p.set_outdir()
p.set_params()

opts, args = p.parse_args(args)

if len(args) != 2:
sys.exit(not p.print_help())

subject, query = args
path = opts.path
cpus = opts.cpus
if not dbtype:
dbtype = opts.dbtype

getpath = lambda x: op.join(path, x) if path else x
cmd = "diamond blastp"
diamond_bin = getpath("diamond")
for bin in (diamond_bin,):
if not which(bin):
logging.fatal("`%s` not found on PATH. Have you installed Diamond?", bin)
sys.exit(1)

run_diamond_makedb(
infile=subject,
outfile=subject + '.dmnd',)

blastfile = get_outfile(subject, query, suffix="last", outdir=opts.outdir)
# Make several attempts to run LASTAL
try:
sh(
cmd + f" --threads {cpus} --query {query} --db {subject} --out {blastfile}" +
" --ultra-sensitive --max-target-seqs 1000 --evalue 1e-5 --outfmt 6",
check=False,
redirect_error=STDOUT,
)
except CalledProcessError as e: # multi-threading disabled
message = f"{cmd} failed with message:"
message += "\n{0}".format(e.output.decode())
logging.error(message)
logging.fatal("Failed to run `diamond blastp`. Aborted.")
cleanup(blastfile)
sys.exit(1)
return blastfile


if __name__ == "__main__":
main()
21 changes: 20 additions & 1 deletion jcvi/compara/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,6 +609,7 @@ def ortholog(args):
match (RBH).
"""
from jcvi.apps.align import last as last_main
from jcvi.apps.align import diamond_blastp_main, blast_main
from jcvi.compara.blastfilter import main as blastfilter_main
from jcvi.compara.quota import main as quota_main
from jcvi.compara.synteny import scan, mcscan, liftover
Expand All @@ -621,6 +622,7 @@ def ortholog(args):
choices=("nucl", "prot"),
help="Molecule type of subject database",
)

p.add_option(
"--full",
default=False,
Expand Down Expand Up @@ -673,6 +675,13 @@ def ortholog(args):
help="Ignore this pair of ortholog identification instead of throwing an error when performing many pairs of cataloging."
)

p.add_option(
"--align_soft",
default="last",
choices=("last", "blast", "diamond_blastp"),
help="Sequence alignment software. Default <last> for both <nucl> and <prot>. Users could also use <blast> for both <nucl> and <prot>, or <diamond_blastp> for <prot>.",
)

opts, args = p.parse_args(args)

if len(args) != 2:
Expand All @@ -690,14 +699,24 @@ def ortholog(args):
dist = "--dist={0}".format(opts.dist)
minsize_flag = "--min_size={}".format(opts.n)
cpus_flag = "--cpus={}".format(opts.cpus)
align_soft = opts.align_soft

aprefix = op.basename(a)
bprefix = op.basename(b)
pprefix = ".".join((aprefix, bprefix))
qprefix = ".".join((bprefix, aprefix))
last = pprefix + ".last"
if need_update((afasta, bfasta), last, warn=True):
last_main([bfasta, afasta, cpus_flag], dbtype)
if align_soft == "blast":
blast_main(
[bfasta, afasta, cpus_flag], dbtype
)
elif dbtype == "prot" and align_soft == "diamond_blastp":
diamond_blastp_main(
[bfasta, afasta, cpus_flag], dbtype
)
else:
last_main([bfasta, afasta, cpus_flag], dbtype)

self_remove = opts.self_remove
if a == b:
Expand Down
Loading