Replace hmmer with py hmmer (#219)

* replace hmmer with pyhmmer * Add correct version of pyhmmer * Undo --force parameter in test-genomes.nf * Correct PyHMMER version * Refactor hmmscan hit loops
oschwengers · Aug 11, 2023 · 3415e89 · 3415e89
1 parent dfccd4a
commit 3415e89
Show file tree

Hide file tree

Showing 6 changed files with 54 additions and 98 deletions.
diff --git a/README.md b/README.md
@@ -112,7 +112,7 @@ Bakta requires the following 3rd party software tools which must be installed an
 - INFERNAL (1.1.4) <https://dx.doi.org/10.1093%2Fbioinformatics%2Fbtt509> <http://eddylab.org/infernal>
 - PILER-CR (1.06) <https://doi.org/10.1186/1471-2105-8-18> <http://www.drive5.com/pilercr>
 - Pyrodigal (2.0.2) <https://doi.org/10.21105/joss.04296> <https://github.com/althonos/pyrodigal>
-- Hmmer (3.3.2) <https://doi.org/10.1093/nar/gkt263> <http://hmmer.org>
+- PyHMMER (0.8.1) <https://doi.org/10.21105/joss.04296> <https://github.com/althonos/pyhmmer>
 - Diamond (2.0.14) <https://doi.org/10.1038/nmeth.3176> <https://github.com/bbuchfink/diamond>
 - Blast+ (2.12.0) <https://www.ncbi.nlm.nih.gov/pubmed/2231712> <https://blast.ncbi.nlm.nih.gov>
 - AMRFinderPlus (3.10.23) <https://github.com/ncbi/amr>
@@ -738,6 +738,7 @@ Bakta is *standing on the shoulder of giants* taking advantage of many great sof
 - Pyrodigal <https://doi.org/10.21105/joss.04296>
 - Diamond <https://doi.org/10.1038/s41592-021-01101-x>
 - BLAST+ <https://doi.org/10.1186/1471-2105-10-421>
+- PyHMMER <https://doi.org/10.21105/joss.04296>
 - HMMER <https://doi.org/10.1371/journal.pcbi.1002195>
 - AMRFinderPlus <https://doi.org/10.1038/s41598-021-91456-0>
 - DeepSig <https://doi.org/10.1093/bioinformatics/btx818>

diff --git a/bakta/features/cds.py b/bakta/features/cds.py
@@ -10,6 +10,7 @@
 from pathlib import Path
 
 import pyrodigal
+import pyhmmer
 
 from Bio.Seq import Seq
 from Bio.SeqUtils.ProtParam import ProteinAnalysis
@@ -166,75 +167,48 @@ def create_cdss(genes, contig):
 
 def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]:
     """Detect Pfam-A entries"""
-    fasta_path = cfg.tmp_path.joinpath('hypotheticals.faa')
-    orf.write_internal_faa(cdss, fasta_path)
-    output_path = cfg.tmp_path.joinpath('cds.hypotheticals.pfam.tsv')
-    cmd = [
-        'hmmsearch',
-        '--noali',
-        '--cut_ga',  # use gathering cutoff
-        '--domtblout', str(output_path),
-        '--cpu', str(cfg.threads if cfg.threads <= 4 else 4),
-        str(cfg.db_path.joinpath('pfam')),
-        str(fasta_path)
+    alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino()
+    proteins: list[pyhmmer.easel.DigitalSequence] = [
+        pyhmmer.easel.TextSequence(sequence=cds['aa'],
+                                   name=bytes(orf.get_orf_key(cds),
+                                              'UTF-8')).digitize(alphabet) for cds in cdss
     ]
-    log.debug('cmd=%s', cmd)
-    proc = sp.run(
-        cmd,
-        cwd=str(cfg.tmp_path),
-        env=cfg.env,
-        stdout=sp.PIPE,
-        stderr=sp.PIPE,
-        universal_newlines=True
-    )
-    if(proc.returncode != 0):
-        log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr)
-        log.warning('pfam detection failed! hmmsearch-error-code=%d', proc.returncode)
-        raise Exception(f'hmmsearch error! error code: {proc.returncode}')
 
     pfam_hits = []
-    cds_pfams_hits = {}
+    cds_pfams_hits = []
     orf_by_aa_digest = orf.get_orf_dictionary(cdss)
-    with output_path.open() as fh:
-        for line in fh:
-            if(line[0] != '#'):
-                cols = bc.RE_MULTIWHITESPACE.split(line.strip())
-                aa_identifier = cols[0]
-                cds = orf_by_aa_digest[aa_identifier]
-
-                domain_length = int(cols[5])
-                domain_start = int(cols[15])
-                domain_stop = int(cols[16])
-                domain_cov = (domain_stop - domain_start + 1) / domain_length
-                aa_start = int(cols[19])
-                aa_stop = int(cols[20])
-                aa_cov = (aa_stop - aa_start + 1) / len(cds['aa'])
+    with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('pfam')) as hmm:
+        for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads):
+            for hit in top_hits:
+                cds = orf_by_aa_digest[hit.name.decode()]
+
+                domain_cov = (hit.best_domain.alignment.hmm_to - hit.best_domain.alignment.hmm_from + 1) / len(hit.best_domain.alignment.hmm_sequence)
+                aa_cov = (hit.best_domain.alignment.target_to - hit.best_domain.alignment.target_from + 1) / len(cds['aa'])
 
                 pfam = OrderedDict()
-                pfam['id'] = cols[4]
-                pfam['name'] = cols[3]
-                pfam['length'] = domain_length
+                pfam['name'] = hit.best_domain.alignment.hmm_name.decode()
+                pfam['id'] = hit.best_domain.alignment.hmm_accession.decode()
+                pfam['length'] = len(hit.best_domain.alignment.hmm_sequence)
                 pfam['aa_cov'] = aa_cov
                 pfam['hmm_cov'] = domain_cov
-                pfam['evalue'] = float(cols[6])
-                pfam['score'] = float(cols[7])
-                pfam['start'] = aa_start
-                pfam['stop'] = aa_stop
+                pfam['evalue'] = hit.evalue
+                pfam['score'] = hit.score
+                pfam['start'] = hit.best_domain.alignment.target_from
+                pfam['stop'] = hit.best_domain.alignment.target_to
 
-                if('pfams' not in cds):
-                    cds['pfams'] = []
+                cds.setdefault('pfams', [])
                 cds['pfams'].append(pfam)
-                if('db_xrefs' not in cds):
-                    cds['db_xrefs'] = []
+                cds.setdefault('db_xrefs', [])
                 cds['db_xrefs'].append(f"PFAM:{pfam['id']}")
                 pfam_hits.append(cds)
-                cds_pfams_hits[aa_identifier] = cds
+                cds_pfams_hits.append(cds)
                 log.info(
                     'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name']
+                    cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'],
+                    pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name']
                 )
     log.info('predicted-pfams=%i, CDS-w/-pfams=%i', len(pfam_hits), len(cds_pfams_hits))
-    return cds_pfams_hits.values()
+    return cds_pfams_hits
 
 
 def analyze_proteins(cdss: Sequence[dict]):

diff --git a/bakta/features/orf.py b/bakta/features/orf.py
@@ -5,65 +5,49 @@
 from pathlib import Path
 from typing import Dict, Sequence
 
+import pyhmmer
+
 import bakta.config as cfg
 import bakta.constants as bc
 
 
 log = logging.getLogger('ORF')
 
 
-def detect_spurious(orfs: Sequence[dict], orf_aa_path: Path):
+def detect_spurious(orfs: Sequence[dict]):
     """Detect spurious ORFs with AntiFam"""
-    output_path = cfg.tmp_path.joinpath('cds.spurious.hmm.tsv')
-    cmd = [
-        'hmmsearch',
-        '--noali',
-        '--cut_ga',  # use gathering cutoff
-        '--tblout', str(output_path),
-        '--cpu', str(cfg.threads if cfg.threads <= 4 else 4),
-        str(cfg.db_path.joinpath('antifam')),
-        str(orf_aa_path)
+    alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino()
+    proteins: list[pyhmmer.easel.DigitalSequence] = [
+        pyhmmer.easel.TextSequence(sequence=orf['aa'],
+                                   name=bytes(get_orf_key(orf),
+                                              'UTF-8')).digitize(alphabet) for orf in orfs
     ]
-    log.debug('cmd=%s', cmd)
-    proc = sp.run(
-        cmd,
-        cwd=str(cfg.tmp_path),
-        env=cfg.env,
-        stdout=sp.PIPE,
-        stderr=sp.PIPE,
-        universal_newlines=True
-    )
-    if(proc.returncode != 0):
-        log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr)
-        log.warning('spurious ORF detection failed! hmmsearch-error-code=%d', proc.returncode)
-        raise Exception(f'hmmsearch error! error code: {proc.returncode}')
 
     discarded_orfs = []
     orf_by_aa_digest = get_orf_dictionary(orfs)
-    with output_path.open() as fh:
-        for line in fh:
-            if(line[0] != '#'):
-                (aa_identifier, _, subject_name, subject_id, evalue, bitscore, _) = line.strip().split(maxsplit=6)
-                orf = orf_by_aa_digest[aa_identifier]
-                evalue = float(evalue)
-                bitscore = float(bitscore)
-                if(evalue > 1E-5):
+    with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('antifam')) as hmm:
+        for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads):
+            for hit in top_hits:
+                orf = orf_by_aa_digest[hit.name.decode()]
+                if hit.evalue > 1E-5:
                     log.debug(
                         'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f',
-                        orf['contig'], orf['start'], orf['stop'], orf['strand'], subject_name, evalue, bitscore
+                        orf['contig'], orf['start'], orf['stop'], orf['strand'],
+                        hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
                     )
                 else:
                     discard = OrderedDict()
                     discard['type'] = bc.DISCARD_TYPE_SPURIOUS
-                    discard['description'] = f'(partial) homology to spurious sequence HMM (AntiFam:{subject_id})'
-                    discard['score'] = bitscore
-                    discard['evalue'] = evalue
+                    discard['description'] = f'(partial) homology to spurious sequence HMM ' \
+                                             f'(AntiFam:{hit.best_domain.alignment.hmm_accession.decode()})'
+                    discard['score'] = hit.score
+                    discard['evalue'] = hit.evalue
 
                     orf['discarded'] = discard
                     discarded_orfs.append(orf)
                     log.info(
                         'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f',
-                        orf['contig'], orf['start'], orf['stop'], orf['strand'], subject_name, evalue, bitscore
+                        orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
                     )
     log.info('discarded=%i', len(discarded_orfs))
     return discarded_orfs

diff --git a/bakta/main.py b/bakta/main.py
@@ -230,9 +230,7 @@ def main():
 
         if(len(cdss) > 0):
             log.debug('detect spurious CDS')
-            cds_aa_path = cfg.tmp_path.joinpath('cds.spurious.faa')
-            orf.write_internal_faa(cdss, cds_aa_path)
-            discarded_cdss = orf.detect_spurious(cdss, cds_aa_path)
+            discarded_cdss = orf.detect_spurious(cdss)
             print(f'\tdiscarded spurious: {len(discarded_cdss)}')
             cdss = [cds for cds in cdss if 'discarded' not in cds]
 
@@ -339,9 +337,7 @@ def main():
 
         if(len(sorfs) > 0):
             log.debug('detect spurious sORF')
-            sorf_aa_path = cfg.tmp_path.joinpath('sorf.spurious.faa')
-            orf.write_internal_faa(sorfs, sorf_aa_path)
-            discarded_sorfs = orf.detect_spurious(sorfs, sorf_aa_path)
+            discarded_sorfs = orf.detect_spurious(sorfs)
             print(f'\tdiscarded spurious: {len(discarded_sorfs)}')
             sorfs = [sorf for sorf in sorfs if 'discarded' not in sorf]
 

diff --git a/environment.yml b/environment.yml
@@ -14,7 +14,7 @@ dependencies:
   - aragorn>=1.2.41
   - infernal>=1.1.4
   - piler-cr
-  - hmmer>=3.3.2
+  - pyhmmer>=0.8.1
   - diamond>=2.0.14
   - blast>=2.12.0
   - ncbi-amrfinderplus>=3.11.2

diff --git a/setup.py b/setup.py
@@ -31,7 +31,8 @@
         'requests >= 2.25.1',
         'alive-progress >= 3.0.1',
         'PyYAML >= 6.0',
-        'pyrodigal >= 2.1.0'
+        'pyrodigal >= 2.1.0',
+        'pyhmmer >= 0.8.1'
     ],
     entry_points={
         'console_scripts': [