Skip to content

Commit

Permalink
implement HMM search
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Sep 24, 2024
1 parent 4f40f46 commit a2d9ae0
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 4 deletions.
95 changes: 95 additions & 0 deletions bakta/expert/protein_hmms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import logging

from typing import Sequence

import pyhmmer

import bakta.config as cfg
import bakta.constants as bc
import bakta.features.orf as orf


log = logging.getLogger('EXPERT_AA_HMM')


def search(cdss: Sequence[dict], user_hmms):
"""Detect Pfam-A entries"""
cds_found = set()
orf_by_aa_digest = orf.get_orf_dictionary(cdss)
alphabet: pyhmmer.easel.Alphabet = pyhmmer.easel.Alphabet.amino()
proteins: list[pyhmmer.easel.DigitalSequence] = [ pyhmmer.easel.TextSequence(sequence=cds['aa'], name=bytes(orf.get_orf_key(cds), 'UTF-8')).digitize(alphabet) for cds in cdss ]
with pyhmmer.plan7.HMMFile(user_hmms) as hmm:
for hmm_query_hits in pyhmmer.hmmsearch(hmm, proteins, bit_cutoffs='trusted', cpus=cfg.threads):
print(f'TopHits: {hmm_query_hits}')
hmm_name = hmm_query_hits.query_name.decode()
print(f'query_name: {hmm_name}')
hmm_id = hmm_query_hits.query_accession.decode()
print(f'query_accession: {hmm_id}')
hmm_length = hmm_query_hits.query_length
print(f'query_length {hmm_length}')
# print(f'description: {hmm_query_hits.description.decode()}')

for hmm_query_hit in hmm_query_hits.reported:
print(f'Hit: {hmm_query_hit}')
# print(f'\taccession: {hmm_query_hit.accession.decode()}')
# print(f'\tdescription: {hmm_query_hit.description.decode()}')
print(f'\tname: {hmm_query_hit.name.decode()}')
print(f'\tevalue: {hmm_query_hit.evalue}')
print(f'\tlength: {hmm_query_hit.length}')
print(f'\tscore: {hmm_query_hit.score}')

aa_identifier = hmm_query_hit.name.decode()
cds = orf_by_aa_digest[aa_identifier]
if hmm_query_hit.evalue > bc.MIN_HMM_EVALUE:
log.debug(
'discard low evalue: contig=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f',
cds['contig'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score
)
else:
hit_domain_lengths_sum = sum([len(dom.alignment.hmm_sequence) for dom in hmm_query_hit.domains.included])
print(f'hit domain length sum: {hit_domain_lengths_sum}')
hmm_cov = hit_domain_lengths_sum / hmm_length
print(f'domain cov: {hmm_cov}')
aa_cov = hit_domain_lengths_sum / hmm_length
aa_cov_best_domain = (hmm_query_hit.best_domain.alignment.target_to - hmm_query_hit.best_domain.alignment.target_from + 1) / len(cds['aa'])
assert aa_cov == aa_cov_best_domain, f'ERROR: aa_cov ({aa_cov}) != aa_cov_best_domain ({aa_cov_best_domain})'
print(f'aa cov: {aa_cov}')

hit = {
'type': 'user_hmms',
'source': 'UserHMMs',
'rank': 99,
'id': hmm_id,
'length': hit_domain_lengths_sum,
'aa_cov': aa_cov,
'hmm_cov': hmm_cov,
'evalue': hmm_query_hit.evalue,
'score': hmm_query_hit.score,
'start': hmm_query_hit.best_domain.alignment.target_from,
'stop': hmm_query_hit.best_domain.alignment.target_to,
'db_xrefs': [f'UserHMM:{hmm_id}'],
'gene': None,
'product': hmm_name # None
}

# ToDo: implement user-provided gene, product, db_xrefs via HMM's description field provided as <gene>~~~<product>~~~<db_xrefs>, waiting for https://github.com/althonos/pyhmmer/issues/76
# hmm_description = hmm_query_hits.description.decode()
# cols = hmm_description.split('~~~')
# if(len(cols) == 1):
# hit['product'] = hmm_description
# else:
# (gene, product, dbxrefs) = cols
# hit['gene'] = gene
# hit['product'] = product
# hit['db_xrefs'].extend(set(dbxrefs.split(',')))

cds.setdefault('expert', [])
cds['expert'].append(hit)
log.debug(
'hit: source=UserHMMs, rank=99, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
cds['contig'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hit['gene'], hit['product'], hit['evalue'], hit['score']
)
cds_found.add(aa_identifier)

log.info('found=%i', len(cds_found))
return cds_found
14 changes: 10 additions & 4 deletions bakta/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import bakta.io.insdc as insdc
import bakta.expert.amrfinder as exp_amr
import bakta.expert.protein_sequences as exp_aa_seq
import bakta.expert.protein_hmms as exp_aa_hmms
import bakta.features.annotation as anno
import bakta.features.t_rna as t_rna
import bakta.features.tm_rna as tm_rna
Expand Down Expand Up @@ -249,8 +250,8 @@ def main():
if(len(cdss) > 0):
log.debug('lookup CDS UPS/IPS')
cdss_ups, cdss_not_found = ups.lookup(cdss)
cdss_ips, sorf_pscs = ips.lookup(cdss_ups)
cdss_not_found.extend(sorf_pscs)
cdss_ips, cdss_not_found_tmp = ips.lookup(cdss_ups)
cdss_not_found.extend(cdss_not_found_tmp)
print(f'\tdetected IPSs: {len(cdss_ips)}')

if(len(cdss_not_found) > 0):
Expand Down Expand Up @@ -286,6 +287,11 @@ def main():
user_aa_found = exp_aa_seq.search(cdss, cds_aa_path, 'user_proteins', user_aa_path)
print(f'\t\tuser protein sequences: {len(user_aa_found)}')

if(cfg.user_hmms):
log.debug('conduct expert system: user HMM')
user_hmm_found = exp_aa_hmms.search(cdss, cfg.user_hmms)
print(f'\t\tuser HMM sequences: {len(user_hmm_found)}')

if(cfg.gram != bc.GRAM_UNKNOWN):
sig_peptides_found = sig_peptides.search(cdss, cds_aa_path)
print(f'\tsignal peptides: {len(sig_peptides_found)}')
Expand Down Expand Up @@ -357,8 +363,8 @@ def main():
if(len(sorfs_not_found) > 0):
if(cfg.db_info['type'] == 'full'):
log.debug('search sORF PSC')
sorf_pscs, sorfs_not_found = s_orf.search_pscs(sorfs_not_found)
sorf_pscs_psccs.extend(sorf_pscs)
cdss_not_found_tmp, sorfs_not_found = s_orf.search_pscs(sorfs_not_found)
sorf_pscs_psccs.extend(cdss_not_found_tmp)
print(f'\tfound PSCs: {len(sorf_pscs_psccs)}')
else:
log.debug('search sORF PSCC')
Expand Down

0 comments on commit a2d9ae0

Please sign in to comment.