implement HMM search

oschwengers · Sep 24, 2024 · a2d9ae0 · a2d9ae0
1 parent 4f40f46
commit a2d9ae0
Show file tree

Hide file tree

Showing 2 changed files with 105 additions and 4 deletions.
diff --git a/bakta/expert/protein_hmms.py b/bakta/expert/protein_hmms.py
@@ -0,0 +1,95 @@
+import logging
+
+from typing import Sequence
+
+import pyhmmer
+
+import bakta.config as cfg
+import bakta.constants as bc
+import bakta.features.orf as orf
+
+
+log = logging.getLogger('EXPERT_AA_HMM')
+
+
+def search(cdss: Sequence[dict], user_hmms):
+    """Detect Pfam-A entries"""
+    cds_found = set()
+    orf_by_aa_digest = orf.get_orf_dictionary(cdss)
+    alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino()
+    proteins: list[pyhmmer.easel.DigitalSequence] = [ pyhmmer.easel.TextSequence(sequence=cds['aa'], name=bytes(orf.get_orf_key(cds), 'UTF-8')).digitize(alphabet) for cds in cdss ]
+    with pyhmmer.plan7.HMMFile(user_hmms) as hmm:
+        for hmm_query_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='trusted', cpus=cfg.threads):
+            print(f'TopHits: {hmm_query_hits}')
+            hmm_name = hmm_query_hits.query_name.decode()
+            print(f'query_name: {hmm_name}')
+            hmm_id = hmm_query_hits.query_accession.decode()
+            print(f'query_accession: {hmm_id}')
+            hmm_length = hmm_query_hits.query_length
+            print(f'query_length {hmm_length}')
+            # print(f'description: {hmm_query_hits.description.decode()}')
+
+            for hmm_query_hit in hmm_query_hits.reported:
+                print(f'Hit: {hmm_query_hit}')
+                # print(f'\taccession: {hmm_query_hit.accession.decode()}')
+                # print(f'\tdescription: {hmm_query_hit.description.decode()}')
+                print(f'\tname: {hmm_query_hit.name.decode()}')
+                print(f'\tevalue: {hmm_query_hit.evalue}')
+                print(f'\tlength: {hmm_query_hit.length}')
+                print(f'\tscore: {hmm_query_hit.score}')
+
+                aa_identifier = hmm_query_hit.name.decode()
+                cds = orf_by_aa_digest[aa_identifier]
+                if hmm_query_hit.evalue > bc.MIN_HMM_EVALUE:
+                    log.debug(
+                        'discard low evalue: contig=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f',
+                        cds['contig'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score
+                    )
+                else:
+                    hit_domain_lengths_sum = sum([len(dom.alignment.hmm_sequence) for dom in hmm_query_hit.domains.included])
+                    print(f'hit domain length sum: {hit_domain_lengths_sum}')
+                    hmm_cov = hit_domain_lengths_sum / hmm_length
+                    print(f'domain cov: {hmm_cov}')
+                    aa_cov = hit_domain_lengths_sum / hmm_length
+                    aa_cov_best_domain = (hmm_query_hit.best_domain.alignment.target_to - hmm_query_hit.best_domain.alignment.target_from + 1) / len(cds['aa'])
+                    assert aa_cov == aa_cov_best_domain, f'ERROR: aa_cov ({aa_cov}) != aa_cov_best_domain ({aa_cov_best_domain})'
+                    print(f'aa cov: {aa_cov}')
+
+                    hit = {
+                        'type': 'user_hmms',
+                        'source': 'UserHMMs',
+                        'rank': 99,
+                        'id': hmm_id,
+                        'length': hit_domain_lengths_sum,
+                        'aa_cov': aa_cov,
+                        'hmm_cov': hmm_cov,
+                        'evalue': hmm_query_hit.evalue,
+                        'score': hmm_query_hit.score,
+                        'start': hmm_query_hit.best_domain.alignment.target_from,
+                        'stop': hmm_query_hit.best_domain.alignment.target_to,
+                        'db_xrefs': [f'UserHMM:{hmm_id}'],
+                        'gene': None,
+                        'product': hmm_name  # None
+                    }
+
+                    # ToDo: implement user-provided gene, product, db_xrefs via HMM's description field provided as <gene>~~~<product>~~~<db_xrefs>, waiting for https://github.com/althonos/pyhmmer/issues/76
+                    # hmm_description = hmm_query_hits.description.decode()
+                    # cols = hmm_description.split('~~~')
+                    # if(len(cols) == 1):
+                    #     hit['product'] = hmm_description
+                    # else:
+                    #     (gene, product, dbxrefs) = cols
+                    #     hit['gene'] = gene
+                    #     hit['product'] = product
+                    #     hit['db_xrefs'].extend(set(dbxrefs.split(',')))
+
+                    cds.setdefault('expert', [])
+                    cds['expert'].append(hit)
+                    log.debug(
+                        'hit: source=UserHMMs, rank=99, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
+                        cds['contig'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hit['gene'], hit['product'], hit['evalue'], hit['score']
+                    )
+                    cds_found.add(aa_identifier)
+
+    log.info('found=%i', len(cds_found))
+    return cds_found
diff --git a/bakta/main.py b/bakta/main.py
@@ -15,6 +15,7 @@
 import bakta.io.insdc as insdc
 import bakta.expert.amrfinder as exp_amr
 import bakta.expert.protein_sequences as exp_aa_seq
+import bakta.expert.protein_hmms as exp_aa_hmms
 import bakta.features.annotation as anno
 import bakta.features.t_rna as t_rna
 import bakta.features.tm_rna as tm_rna
@@ -249,8 +250,8 @@ def main():
         if(len(cdss) > 0):
             log.debug('lookup CDS UPS/IPS')
             cdss_ups, cdss_not_found = ups.lookup(cdss)
-            cdss_ips, sorf_pscs = ips.lookup(cdss_ups)
-            cdss_not_found.extend(sorf_pscs)
+            cdss_ips, cdss_not_found_tmp = ips.lookup(cdss_ups)
+            cdss_not_found.extend(cdss_not_found_tmp)
             print(f'\tdetected IPSs: {len(cdss_ips)}')
 
             if(len(cdss_not_found) > 0):
@@ -286,6 +287,11 @@ def main():
                 user_aa_found = exp_aa_seq.search(cdss, cds_aa_path, 'user_proteins', user_aa_path)
                 print(f'\t\tuser protein sequences: {len(user_aa_found)}')
 
+            if(cfg.user_hmms):
+                log.debug('conduct expert system: user HMM')
+                user_hmm_found = exp_aa_hmms.search(cdss, cfg.user_hmms)
+                print(f'\t\tuser HMM sequences: {len(user_hmm_found)}')
+
             if(cfg.gram != bc.GRAM_UNKNOWN):
                 sig_peptides_found = sig_peptides.search(cdss, cds_aa_path)
                 print(f'\tsignal peptides: {len(sig_peptides_found)}')
@@ -357,8 +363,8 @@ def main():
         if(len(sorfs_not_found) > 0):
             if(cfg.db_info['type'] == 'full'):
                 log.debug('search sORF PSC')
-                sorf_pscs, sorfs_not_found = s_orf.search_pscs(sorfs_not_found)
-                sorf_pscs_psccs.extend(sorf_pscs)
+                cdss_not_found_tmp, sorfs_not_found = s_orf.search_pscs(sorfs_not_found)
+                sorf_pscs_psccs.extend(cdss_not_found_tmp)
                 print(f'\tfound PSCs: {len(sorf_pscs_psccs)}')
             else:
                 log.debug('search sORF PSCC')