Skip to content

Commit

Permalink
add EC & GO terms and update to new NCBIFams file format
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Jan 17, 2024
1 parent d1da8ae commit 4973fb2
Showing 1 changed file with 74 additions and 9 deletions.
83 changes: 74 additions & 9 deletions db-scripts/annotate-ncbi-fams.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,42 @@
with hmms_path.open() as fh, alive_bar() as bar:
for line in fh:
(
ncbi_accession, source_identifier, label, sequence_cutoff, domain_cutoff, hmm_length, family_type, for_structural_annotation, for_naming, for_AMRFinder, product_name, gene_symbol, ec_numbers, go_terms, pmids, taxonomic_range, taxonomic_range_name, taxonomic_rank_name, n_refseq_protein_hits, source, name_orig
ncbi_accession,
source_identifier,
label,
sequence_cutoff,
domain_cutoff,
hmm_length,
family_type,
for_structural_annotation,
for_naming,
for_AMRFinder,
product_name,
gene_symbol,
ec_numbers,
go_terms,
pmids,
taxonomic_range,
taxonomic_range_name,
taxonomic_rank_name,
n_refseq_protein_hits,
source,
name_orig,
_1,
_2
) = line.split('\t')
if(family_type in family_type_ranks.keys() and for_naming == 'Y'): # only accept exception/equivalog HMMs eligible for naming proteins
hmm = {
'acc': ncbi_accession,
'gene': gene_symbol,
'type': family_type,
'product': product_name.strip()
}
if(gene_symbol != ''):
hmm['gene'] = gene_symbol
if(ec_numbers != ''):
hmm['ecs'] = ec_numbers.split(',')
if(go_terms != ''):
hmm['gos'] = [term.split(':')[1] for term in go_terms.split(',')]
if(ncbi_accession != '-'):
hmms[ncbi_accession] = hmm
bar()
Expand Down Expand Up @@ -86,6 +113,9 @@
print('\n')

psc_annotated = 0
psc_with_gene = 0
psc_with_ec = 0
psc_with_go = 0
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, alive_bar(total=len(hit_per_psc)) as bar:
conn.execute('PRAGMA page_size = 4096;')
conn.execute('PRAGMA cache_size = 100000;')
Expand All @@ -95,16 +125,51 @@
conn.execute('PRAGMA journal_mode = OFF')
conn.execute('PRAGMA threads = 2;')
conn.commit()
conn.row_factory = sqlite3.Row
for psc_id, hit in hit_per_psc.items():
hmm = hmms.get(hit['hmm_id'], None)
if(hmm is not None):
if(hmm['gene'] == ''):
conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (hmm['product'], psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', hmm['product'], psc_id)
else:
conn.execute('UPDATE psc SET gene=?, product=? WHERE uniref90_id=?', (hmm['gene'], hmm['product'], psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET gene=%s, product=%s WHERE uniref90_id=%s', hmm['gene'], hmm['product'], psc_id)
conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (hmm['product'], psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', hmm['product'], psc_id)
if('gene' in hmm):
conn.execute('UPDATE psc SET gene=? WHERE uniref90_id=?', (hmm['gene'], psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET gene=%s WHERE uniref90_id=%s', hmm['gene'], psc_id)
psc_with_gene += 1
if('ecs' in hmm):
rec_psc = conn.execute('SELECT ec_ids FROM psc WHERE uniref90_id=?', (psc_id,)).fetchone()
if(rec_psc is None):
continue
if(rec_psc['ec_ids'] is not None):
ecs = set(rec_psc['ec_ids'].split(','))
for ec in hmm['ecs']:
ecs.add(ec)
else:
ecs = hmm['ecs']
ec_list = ','.join(sorted(ecs))
conn.execute('UPDATE psc SET ec_ids=? WHERE uniref90_id=?', (ec_list, psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET ec_ids=%s WHERE uniref90_id=%s', ec_list, psc_id)
psc_with_ec += 1
if('gos' in hmm):
rec_psc = conn.execute('SELECT go_ids FROM psc WHERE uniref90_id=?', (psc_id,)).fetchone()
if(rec_psc is None):
continue
if(rec_psc['go_ids'] is not None):
gos = set(rec_psc['go_ids'].split(','))
for go in hmm['gos']:
gos.add(go)
else:
gos = hmm['gos']
go_list = ','.join(sorted(gos))
conn.execute('UPDATE psc SET go_ids=? WHERE uniref90_id=?', (go_list, psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET go_ids=%s WHERE uniref90_id=%s', go_list, psc_id)
psc_with_go += 1
psc_annotated += 1
bar()
print(f'PSCs with annotated gene / product: {psc_annotated}')
print(f'PSCs with annotated product: {psc_annotated}')
log_psc.debug('summary: PSC annotated=%d', psc_annotated)
print(f'PSCs with annotated gene: {psc_with_gene}')
log_psc.debug('summary: PSC gene annotated=%d', psc_with_gene)
print(f'PSCs with annotated EC: {psc_with_ec}')
log_psc.debug('summary: PSC EC annotated=%d', psc_with_ec)
print(f'PSCs with annotated GO: {psc_with_go}')
log_psc.debug('summary: PSC GO annotated=%d', psc_with_go)

0 comments on commit 4973fb2

Please sign in to comment.