diff --git a/README.md b/README.md index ef288974..21d9c29a 100644 --- a/README.md +++ b/README.md @@ -112,7 +112,7 @@ Bakta requires the following 3rd party software tools which must be installed an - INFERNAL (1.1.4) - PILER-CR (1.06) - Pyrodigal (2.0.2) -- Hmmer (3.3.2) +- PyHMMER (0.8.1) - Diamond (2.0.14) - Blast+ (2.12.0) - AMRFinderPlus (3.10.23) @@ -738,6 +738,7 @@ Bakta is *standing on the shoulder of giants* taking advantage of many great sof - Pyrodigal - Diamond - BLAST+ +- PyHMMER - HMMER - AMRFinderPlus - DeepSig diff --git a/bakta/features/cds.py b/bakta/features/cds.py index 562e6fff..70b57ef2 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -10,6 +10,7 @@ from pathlib import Path import pyrodigal +import pyhmmer from Bio.Seq import Seq from Bio.SeqUtils.ProtParam import ProteinAnalysis @@ -166,75 +167,48 @@ def create_cdss(genes, contig): def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]: """Detect Pfam-A entries""" - fasta_path = cfg.tmp_path.joinpath('hypotheticals.faa') - orf.write_internal_faa(cdss, fasta_path) - output_path = cfg.tmp_path.joinpath('cds.hypotheticals.pfam.tsv') - cmd = [ - 'hmmsearch', - '--noali', - '--cut_ga', # use gathering cutoff - '--domtblout', str(output_path), - '--cpu', str(cfg.threads if cfg.threads <= 4 else 4), - str(cfg.db_path.joinpath('pfam')), - str(fasta_path) + alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino() + proteins: list[pyhmmer.easel.DigitalSequence] = [ + pyhmmer.easel.TextSequence(sequence=cds['aa'], + name=bytes(orf.get_orf_key(cds), + 'UTF-8')).digitize(alphabet) for cds in cdss ] - log.debug('cmd=%s', cmd) - proc = sp.run( - cmd, - cwd=str(cfg.tmp_path), - env=cfg.env, - stdout=sp.PIPE, - stderr=sp.PIPE, - universal_newlines=True - ) - if(proc.returncode != 0): - log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr) - log.warning('pfam detection failed! hmmsearch-error-code=%d', proc.returncode) - raise Exception(f'hmmsearch error! error code: {proc.returncode}') pfam_hits = [] - cds_pfams_hits = {} + cds_pfams_hits = [] orf_by_aa_digest = orf.get_orf_dictionary(cdss) - with output_path.open() as fh: - for line in fh: - if(line[0] != '#'): - cols = bc.RE_MULTIWHITESPACE.split(line.strip()) - aa_identifier = cols[0] - cds = orf_by_aa_digest[aa_identifier] - - domain_length = int(cols[5]) - domain_start = int(cols[15]) - domain_stop = int(cols[16]) - domain_cov = (domain_stop - domain_start + 1) / domain_length - aa_start = int(cols[19]) - aa_stop = int(cols[20]) - aa_cov = (aa_stop - aa_start + 1) / len(cds['aa']) + with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('pfam')) as hmm: + for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads): + for hit in top_hits: + cds = orf_by_aa_digest[hit.name.decode()] + + domain_cov = (hit.best_domain.alignment.hmm_to - hit.best_domain.alignment.hmm_from + 1) / len(hit.best_domain.alignment.hmm_sequence) + aa_cov = (hit.best_domain.alignment.target_to - hit.best_domain.alignment.target_from + 1) / len(cds['aa']) pfam = OrderedDict() - pfam['id'] = cols[4] - pfam['name'] = cols[3] - pfam['length'] = domain_length + pfam['name'] = hit.best_domain.alignment.hmm_name.decode() + pfam['id'] = hit.best_domain.alignment.hmm_accession.decode() + pfam['length'] = len(hit.best_domain.alignment.hmm_sequence) pfam['aa_cov'] = aa_cov pfam['hmm_cov'] = domain_cov - pfam['evalue'] = float(cols[6]) - pfam['score'] = float(cols[7]) - pfam['start'] = aa_start - pfam['stop'] = aa_stop + pfam['evalue'] = hit.evalue + pfam['score'] = hit.score + pfam['start'] = hit.best_domain.alignment.target_from + pfam['stop'] = hit.best_domain.alignment.target_to - if('pfams' not in cds): - cds['pfams'] = [] + cds.setdefault('pfams', []) cds['pfams'].append(pfam) - if('db_xrefs' not in cds): - cds['db_xrefs'] = [] + cds.setdefault('db_xrefs', []) cds['db_xrefs'].append(f"PFAM:{pfam['id']}") pfam_hits.append(cds) - cds_pfams_hits[aa_identifier] = cds + cds_pfams_hits.append(cds) log.info( 'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name'] + cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], + pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name'] ) log.info('predicted-pfams=%i, CDS-w/-pfams=%i', len(pfam_hits), len(cds_pfams_hits)) - return cds_pfams_hits.values() + return cds_pfams_hits def analyze_proteins(cdss: Sequence[dict]): diff --git a/bakta/features/orf.py b/bakta/features/orf.py index 5dcaa171..9d2b7bc8 100644 --- a/bakta/features/orf.py +++ b/bakta/features/orf.py @@ -5,6 +5,8 @@ from pathlib import Path from typing import Dict, Sequence +import pyhmmer + import bakta.config as cfg import bakta.constants as bc @@ -12,58 +14,40 @@ log = logging.getLogger('ORF') -def detect_spurious(orfs: Sequence[dict], orf_aa_path: Path): +def detect_spurious(orfs: Sequence[dict]): """Detect spurious ORFs with AntiFam""" - output_path = cfg.tmp_path.joinpath('cds.spurious.hmm.tsv') - cmd = [ - 'hmmsearch', - '--noali', - '--cut_ga', # use gathering cutoff - '--tblout', str(output_path), - '--cpu', str(cfg.threads if cfg.threads <= 4 else 4), - str(cfg.db_path.joinpath('antifam')), - str(orf_aa_path) + alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino() + proteins: list[pyhmmer.easel.DigitalSequence] = [ + pyhmmer.easel.TextSequence(sequence=orf['aa'], + name=bytes(get_orf_key(orf), + 'UTF-8')).digitize(alphabet) for orf in orfs ] - log.debug('cmd=%s', cmd) - proc = sp.run( - cmd, - cwd=str(cfg.tmp_path), - env=cfg.env, - stdout=sp.PIPE, - stderr=sp.PIPE, - universal_newlines=True - ) - if(proc.returncode != 0): - log.debug('stdout=\'%s\', stderr=\'%s\'', proc.stdout, proc.stderr) - log.warning('spurious ORF detection failed! hmmsearch-error-code=%d', proc.returncode) - raise Exception(f'hmmsearch error! error code: {proc.returncode}') discarded_orfs = [] orf_by_aa_digest = get_orf_dictionary(orfs) - with output_path.open() as fh: - for line in fh: - if(line[0] != '#'): - (aa_identifier, _, subject_name, subject_id, evalue, bitscore, _) = line.strip().split(maxsplit=6) - orf = orf_by_aa_digest[aa_identifier] - evalue = float(evalue) - bitscore = float(bitscore) - if(evalue > 1E-5): + with pyhmmer.plan7.HMMFile(cfg.db_path.joinpath('antifam')) as hmm: + for top_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='gathering', cpus=cfg.threads): + for hit in top_hits: + orf = orf_by_aa_digest[hit.name.decode()] + if hit.evalue > 1E-5: log.debug( 'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f', - orf['contig'], orf['start'], orf['stop'], orf['strand'], subject_name, evalue, bitscore + orf['contig'], orf['start'], orf['stop'], orf['strand'], + hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score ) else: discard = OrderedDict() discard['type'] = bc.DISCARD_TYPE_SPURIOUS - discard['description'] = f'(partial) homology to spurious sequence HMM (AntiFam:{subject_id})' - discard['score'] = bitscore - discard['evalue'] = evalue + discard['description'] = f'(partial) homology to spurious sequence HMM ' \ + f'(AntiFam:{hit.best_domain.alignment.hmm_accession.decode()})' + discard['score'] = hit.score + discard['evalue'] = hit.evalue orf['discarded'] = discard discarded_orfs.append(orf) log.info( 'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f', - orf['contig'], orf['start'], orf['stop'], orf['strand'], subject_name, evalue, bitscore + orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score ) log.info('discarded=%i', len(discarded_orfs)) return discarded_orfs diff --git a/bakta/main.py b/bakta/main.py index b473e59c..89ad2fe4 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -230,9 +230,7 @@ def main(): if(len(cdss) > 0): log.debug('detect spurious CDS') - cds_aa_path = cfg.tmp_path.joinpath('cds.spurious.faa') - orf.write_internal_faa(cdss, cds_aa_path) - discarded_cdss = orf.detect_spurious(cdss, cds_aa_path) + discarded_cdss = orf.detect_spurious(cdss) print(f'\tdiscarded spurious: {len(discarded_cdss)}') cdss = [cds for cds in cdss if 'discarded' not in cds] @@ -339,9 +337,7 @@ def main(): if(len(sorfs) > 0): log.debug('detect spurious sORF') - sorf_aa_path = cfg.tmp_path.joinpath('sorf.spurious.faa') - orf.write_internal_faa(sorfs, sorf_aa_path) - discarded_sorfs = orf.detect_spurious(sorfs, sorf_aa_path) + discarded_sorfs = orf.detect_spurious(sorfs) print(f'\tdiscarded spurious: {len(discarded_sorfs)}') sorfs = [sorf for sorf in sorfs if 'discarded' not in sorf] diff --git a/environment.yml b/environment.yml index d2989895..7630dcc2 100644 --- a/environment.yml +++ b/environment.yml @@ -14,7 +14,7 @@ dependencies: - aragorn>=1.2.41 - infernal>=1.1.4 - piler-cr - - hmmer>=3.3.2 + - pyhmmer>=0.8.1 - diamond>=2.0.14 - blast>=2.12.0 - ncbi-amrfinderplus>=3.11.2 diff --git a/setup.py b/setup.py index 913b7732..5960714d 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,8 @@ 'requests >= 2.25.1', 'alive-progress >= 3.0.1', 'PyYAML >= 6.0', - 'pyrodigal >= 2.1.0' + 'pyrodigal >= 2.1.0', + 'pyhmmer >= 0.8.1' ], entry_points={ 'console_scripts': [