Skip to content

Commit

Permalink
Replace hmmer with py hmmer (#219)
Browse files Browse the repository at this point in the history
* replace hmmer with pyhmmer
* Add correct version of pyhmmer
* Undo --force parameter in test-genomes.nf
* Correct PyHMMER version
* Refactor hmmscan hit loops
  • Loading branch information
jhahnfeld authored Aug 11, 2023
1 parent dfccd4a commit 3415e89
Show file tree
Hide file tree
Showing 6 changed files with 54 additions and 98 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ Bakta requires the following 3rd party software tools which must be installed an
- INFERNAL (1.1.4) <https://dx.doi.org/10.1093%2Fbioinformatics%2Fbtt509> <http://eddylab.org/infernal>
- PILER-CR (1.06) <https://doi.org/10.1186/1471-2105-8-18> <http://www.drive5.com/pilercr>
- Pyrodigal (2.0.2) <https://doi.org/10.21105/joss.04296> <https://github.com/althonos/pyrodigal>
- Hmmer (3.3.2) <https://doi.org/10.1093/nar/gkt263> <http://hmmer.org>
- PyHMMER (0.8.1) <https://doi.org/10.21105/joss.04296> <https://github.com/althonos/pyhmmer>
- Diamond (2.0.14) <https://doi.org/10.1038/nmeth.3176> <https://github.com/bbuchfink/diamond>
- Blast+ (2.12.0) <https://www.ncbi.nlm.nih.gov/pubmed/2231712> <https://blast.ncbi.nlm.nih.gov>
- AMRFinderPlus (3.10.23) <https://github.com/ncbi/amr>
Expand Down Expand Up @@ -738,6 +738,7 @@ Bakta is *standing on the shoulder of giants* taking advantage of many great sof
- Pyrodigal <https://doi.org/10.21105/joss.04296>
- Diamond <https://doi.org/10.1038/s41592-021-01101-x>
- BLAST+ <https://doi.org/10.1186/1471-2105-10-421>
- PyHMMER <https://doi.org/10.21105/joss.04296>
- HMMER <https://doi.org/10.1371/journal.pcbi.1002195>
- AMRFinderPlus <https://doi.org/10.1038/s41598-021-91456-0>
- DeepSig <https://doi.org/10.1093/bioinformatics/btx818>
Expand Down
80 changes: 27 additions & 53 deletions bakta/features/cds.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from pathlib import Path

import pyrodigal
import pyhmmer

from Bio.Seq import Seq
from Bio.SeqUtils.ProtParam import ProteinAnalysis
Expand Down Expand Up @@ -166,75 +167,48 @@ def create_cdss(genes, contig):

def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]:
"""Detect Pfam-A entries"""
fasta_path = cfg.tmp_path.joinpath('hypotheticals.faa')
orf.write_internal_faa(cdss, fasta_path)
output_path = cfg.tmp_path.joinpath('cds.hypotheticals.pfam.tsv')
cmd = [
'hmmsearch',
'--noali',
'--cut_ga', # use gathering cutoff
'--domtblout', str(output_path),
'--cpu', str(cfg.threads if cfg.threads <= 4 else 4),
str(cfg.db_path.joinpath('pfam')),
str(fasta_path)
alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino()
proteins: list[pyhmmer.easel.DigitalSequence] = [
pyhmmer.easel.TextSequence(sequence=cds['aa'],
name=bytes(orf.get_orf_key(cds),
'UTF-8')).digitize(alphabet) for cds in cdss
]
log.debug('cmd=%s', cmd)
proc = sp.run(
cmd,
cwd=str(cfg.tmp_path),
env=cfg.env,
stdout=sp.PIPE,
stderr=sp.PIPE,
universal_newlines=True
)
if(proc.returncode != 0):
log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr)
log.warning('pfam detection failed! hmmsearch-error-code=%d', proc.returncode)
raise Exception(f'hmmsearch error! error code: {proc.returncode}')

pfam_hits = []
cds_pfams_hits = {}
cds_pfams_hits = []
orf_by_aa_digest = orf.get_orf_dictionary(cdss)
with output_path.open() as fh:
for line in fh:
if(line[0] != '#'):
cols = bc.RE_MULTIWHITESPACE.split(line.strip())
aa_identifier = cols[0]
cds = orf_by_aa_digest[aa_identifier]

domain_length = int(cols[5])
domain_start = int(cols[15])
domain_stop = int(cols[16])
domain_cov = (domain_stop - domain_start + 1) / domain_length
aa_start = int(cols[19])
aa_stop = int(cols[20])
aa_cov = (aa_stop - aa_start + 1) / len(cds['aa'])
with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('pfam')) as hmm:
for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads):
for hit in top_hits:
cds = orf_by_aa_digest[hit.name.decode()]

domain_cov = (hit.best_domain.alignment.hmm_to - hit.best_domain.alignment.hmm_from + 1) / len(hit.best_domain.alignment.hmm_sequence)
aa_cov = (hit.best_domain.alignment.target_to - hit.best_domain.alignment.target_from + 1) / len(cds['aa'])

pfam = OrderedDict()
pfam['id'] = cols[4]
pfam['name'] = cols[3]
pfam['length'] = domain_length
pfam['name'] = hit.best_domain.alignment.hmm_name.decode()
pfam['id'] = hit.best_domain.alignment.hmm_accession.decode()
pfam['length'] = len(hit.best_domain.alignment.hmm_sequence)
pfam['aa_cov'] = aa_cov
pfam['hmm_cov'] = domain_cov
pfam['evalue'] = float(cols[6])
pfam['score'] = float(cols[7])
pfam['start'] = aa_start
pfam['stop'] = aa_stop
pfam['evalue'] = hit.evalue
pfam['score'] = hit.score
pfam['start'] = hit.best_domain.alignment.target_from
pfam['stop'] = hit.best_domain.alignment.target_to

if('pfams' not in cds):
cds['pfams'] = []
cds.setdefault('pfams', [])
cds['pfams'].append(pfam)
if('db_xrefs' not in cds):
cds['db_xrefs'] = []
cds.setdefault('db_xrefs', [])
cds['db_xrefs'].append(f"PFAM:{pfam['id']}")
pfam_hits.append(cds)
cds_pfams_hits[aa_identifier] = cds
cds_pfams_hits.append(cds)
log.info(
'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s',
cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name']
cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'],
pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name']
)
log.info('predicted-pfams=%i, CDS-w/-pfams=%i', len(pfam_hits), len(cds_pfams_hits))
return cds_pfams_hits.values()
return cds_pfams_hits


def analyze_proteins(cdss: Sequence[dict]):
Expand Down
56 changes: 20 additions & 36 deletions bakta/features/orf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,65 +5,49 @@
from pathlib import Path
from typing import Dict, Sequence

import pyhmmer

import bakta.config as cfg
import bakta.constants as bc


log = logging.getLogger('ORF')


def detect_spurious(orfs: Sequence[dict], orf_aa_path: Path):
def detect_spurious(orfs: Sequence[dict]):
"""Detect spurious ORFs with AntiFam"""
output_path = cfg.tmp_path.joinpath('cds.spurious.hmm.tsv')
cmd = [
'hmmsearch',
'--noali',
'--cut_ga', # use gathering cutoff
'--tblout', str(output_path),
'--cpu', str(cfg.threads if cfg.threads <= 4 else 4),
str(cfg.db_path.joinpath('antifam')),
str(orf_aa_path)
alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino()
proteins: list[pyhmmer.easel.DigitalSequence] = [
pyhmmer.easel.TextSequence(sequence=orf['aa'],
name=bytes(get_orf_key(orf),
'UTF-8')).digitize(alphabet) for orf in orfs
]
log.debug('cmd=%s', cmd)
proc = sp.run(
cmd,
cwd=str(cfg.tmp_path),
env=cfg.env,
stdout=sp.PIPE,
stderr=sp.PIPE,
universal_newlines=True
)
if(proc.returncode != 0):
log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr)
log.warning('spurious ORF detection failed! hmmsearch-error-code=%d', proc.returncode)
raise Exception(f'hmmsearch error! error code: {proc.returncode}')

discarded_orfs = []
orf_by_aa_digest = get_orf_dictionary(orfs)
with output_path.open() as fh:
for line in fh:
if(line[0] != '#'):
(aa_identifier, _, subject_name, subject_id, evalue, bitscore, _) = line.strip().split(maxsplit=6)
orf = orf_by_aa_digest[aa_identifier]
evalue = float(evalue)
bitscore = float(bitscore)
if(evalue > 1E-5):
with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('antifam')) as hmm:
for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads):
for hit in top_hits:
orf = orf_by_aa_digest[hit.name.decode()]
if hit.evalue > 1E-5:
log.debug(
'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f',
orf['contig'], orf['start'], orf['stop'], orf['strand'], subject_name, evalue, bitscore
orf['contig'], orf['start'], orf['stop'], orf['strand'],
hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
)
else:
discard = OrderedDict()
discard['type'] = bc.DISCARD_TYPE_SPURIOUS
discard['description'] = f'(partial) homology to spurious sequence HMM (AntiFam:{subject_id})'
discard['score'] = bitscore
discard['evalue'] = evalue
discard['description'] = f'(partial) homology to spurious sequence HMM ' \
f'(AntiFam:{hit.best_domain.alignment.hmm_accession.decode()})'
discard['score'] = hit.score
discard['evalue'] = hit.evalue

orf['discarded'] = discard
discarded_orfs.append(orf)
log.info(
'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f',
orf['contig'], orf['start'], orf['stop'], orf['strand'], subject_name, evalue, bitscore
orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
)
log.info('discarded=%i', len(discarded_orfs))
return discarded_orfs
Expand Down
8 changes: 2 additions & 6 deletions bakta/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,7 @@ def main():

if(len(cdss) > 0):
log.debug('detect spurious CDS')
cds_aa_path = cfg.tmp_path.joinpath('cds.spurious.faa')
orf.write_internal_faa(cdss, cds_aa_path)
discarded_cdss = orf.detect_spurious(cdss, cds_aa_path)
discarded_cdss = orf.detect_spurious(cdss)
print(f'\tdiscarded spurious: {len(discarded_cdss)}')
cdss = [cds for cds in cdss if 'discarded' not in cds]

Expand Down Expand Up @@ -339,9 +337,7 @@ def main():

if(len(sorfs) > 0):
log.debug('detect spurious sORF')
sorf_aa_path = cfg.tmp_path.joinpath('sorf.spurious.faa')
orf.write_internal_faa(sorfs, sorf_aa_path)
discarded_sorfs = orf.detect_spurious(sorfs, sorf_aa_path)
discarded_sorfs = orf.detect_spurious(sorfs)
print(f'\tdiscarded spurious: {len(discarded_sorfs)}')
sorfs = [sorf for sorf in sorfs if 'discarded' not in sorf]

Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ dependencies:
- aragorn>=1.2.41
- infernal>=1.1.4
- piler-cr
- hmmer>=3.3.2
- pyhmmer>=0.8.1
- diamond>=2.0.14
- blast>=2.12.0
- ncbi-amrfinderplus>=3.11.2
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
'requests >= 2.25.1',
'alive-progress >= 3.0.1',
'PyYAML >= 6.0',
'pyrodigal >= 2.1.0'
'pyrodigal >= 2.1.0',
'pyhmmer >= 0.8.1'
],
entry_points={
'console_scripts': [
Expand Down

0 comments on commit 3415e89

Please sign in to comment.