Skip to content

Commit

Permalink
Update db build scripts (#270)
Browse files Browse the repository at this point in the history
* update MOB-suite to v3.1.8
* update UniParc file download
* bump Diamond to v2.1.8
* bump HMMER to v3.4
* URL updates and minor fixes
* add EC & GO terms and update to new NCBIFams file format
  • Loading branch information
oschwengers authored Jan 18, 2024
1 parent f13d491 commit e7f00df
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 20 deletions.
83 changes: 74 additions & 9 deletions db-scripts/annotate-ncbi-fams.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,42 @@
with hmms_path.open() as fh, alive_bar() as bar:
for line in fh:
(
ncbi_accession, source_identifier, label, sequence_cutoff, domain_cutoff, hmm_length, family_type, for_structural_annotation, for_naming, for_AMRFinder, product_name, gene_symbol, ec_numbers, go_terms, pmids, taxonomic_range, taxonomic_range_name, taxonomic_rank_name, n_refseq_protein_hits, source, name_orig
ncbi_accession,
source_identifier,
label,
sequence_cutoff,
domain_cutoff,
hmm_length,
family_type,
for_structural_annotation,
for_naming,
for_AMRFinder,
product_name,
gene_symbol,
ec_numbers,
go_terms,
pmids,
taxonomic_range,
taxonomic_range_name,
taxonomic_rank_name,
n_refseq_protein_hits,
source,
name_orig,
_1,
_2
) = line.split('\t')
if(family_type in family_type_ranks.keys() and for_naming == 'Y'): # only accept exception/equivalog HMMs eligible for naming proteins
hmm = {
'acc': ncbi_accession,
'gene': gene_symbol,
'type': family_type,
'product': product_name.strip()
}
if(gene_symbol != ''):
hmm['gene'] = gene_symbol
if(ec_numbers != ''):
hmm['ecs'] = ec_numbers.split(',')
if(go_terms != ''):
hmm['gos'] = [term.split(':')[1] for term in go_terms.split(',')]
if(ncbi_accession != '-'):
hmms[ncbi_accession] = hmm
bar()
Expand Down Expand Up @@ -86,6 +113,9 @@
print('\n')

psc_annotated = 0
psc_with_gene = 0
psc_with_ec = 0
psc_with_go = 0
with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, alive_bar(total=len(hit_per_psc)) as bar:
conn.execute('PRAGMA page_size = 4096;')
conn.execute('PRAGMA cache_size = 100000;')
Expand All @@ -95,16 +125,51 @@
conn.execute('PRAGMA journal_mode = OFF')
conn.execute('PRAGMA threads = 2;')
conn.commit()
conn.row_factory = sqlite3.Row
for psc_id, hit in hit_per_psc.items():
hmm = hmms.get(hit['hmm_id'], None)
if(hmm is not None):
if(hmm['gene'] == ''):
conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (hmm['product'], psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', hmm['product'], psc_id)
else:
conn.execute('UPDATE psc SET gene=?, product=? WHERE uniref90_id=?', (hmm['gene'], hmm['product'], psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET gene=%s, product=%s WHERE uniref90_id=%s', hmm['gene'], hmm['product'], psc_id)
conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (hmm['product'], psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', hmm['product'], psc_id)
if('gene' in hmm):
conn.execute('UPDATE psc SET gene=? WHERE uniref90_id=?', (hmm['gene'], psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET gene=%s WHERE uniref90_id=%s', hmm['gene'], psc_id)
psc_with_gene += 1
if('ecs' in hmm):
rec_psc = conn.execute('SELECT ec_ids FROM psc WHERE uniref90_id=?', (psc_id,)).fetchone()
if(rec_psc is None):
continue
if(rec_psc['ec_ids'] is not None):
ecs = set(rec_psc['ec_ids'].split(','))
for ec in hmm['ecs']:
ecs.add(ec)
else:
ecs = hmm['ecs']
ec_list = ','.join(sorted(ecs))
conn.execute('UPDATE psc SET ec_ids=? WHERE uniref90_id=?', (ec_list, psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET ec_ids=%s WHERE uniref90_id=%s', ec_list, psc_id)
psc_with_ec += 1
if('gos' in hmm):
rec_psc = conn.execute('SELECT go_ids FROM psc WHERE uniref90_id=?', (psc_id,)).fetchone()
if(rec_psc is None):
continue
if(rec_psc['go_ids'] is not None):
gos = set(rec_psc['go_ids'].split(','))
for go in hmm['gos']:
gos.add(go)
else:
gos = hmm['gos']
go_list = ','.join(sorted(gos))
conn.execute('UPDATE psc SET go_ids=? WHERE uniref90_id=?', (go_list, psc_id)) # annotate PSC
log_psc.info('UPDATE psc SET go_ids=%s WHERE uniref90_id=%s', go_list, psc_id)
psc_with_go += 1
psc_annotated += 1
bar()
print(f'PSCs with annotated gene / product: {psc_annotated}')
print(f'PSCs with annotated product: {psc_annotated}')
log_psc.debug('summary: PSC annotated=%d', psc_annotated)
print(f'PSCs with annotated gene: {psc_with_gene}')
log_psc.debug('summary: PSC gene annotated=%d', psc_with_gene)
print(f'PSCs with annotated EC: {psc_with_ec}')
log_psc.debug('summary: PSC EC annotated=%d', psc_with_ec)
print(f'PSCs with annotated GO: {psc_with_go}')
log_psc.debug('summary: PSC GO annotated=%d', psc_with_go)
18 changes: 11 additions & 7 deletions db-scripts/buid-db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ rm -r antifam antifam-dir/

# download & extract oriT sequences
printf "\n5/20: download and extract oriT sequences from Mob-suite ...\n"
wget https://zenodo.org/record/3786915/files/data.tar.gz
wget https://zenodo.org/records/10304948/files/data.tar.gz
tar -xvzf data.tar.gz
mv data/orit.fas ./orit.fna
rm -r data/ data.tar.gz
Expand Down Expand Up @@ -109,9 +109,13 @@ python3 ${BAKTA_DB_SCRIPTS}/init-db.py --db bakta.db
############################################################################
printf "\n8/20: download UniProt UniRef50 ...\n"
wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref50/uniref50.xml.gz
wget https://ftp.expasy.org/databases/uniprot/current_release/uniparc/uniparc_active.fasta.gz
for i in {1..200}; do
wget https://ftp.expasy.org/databases/uniprot/current_release/uniparc/fasta/active/uniparc_active_p${i}.fasta.gz
pigz -dc uniparc_active_p${i}.fasta.gz >> uniparc_active.fasta
rm uniparc_active_p${i}.fasta.gz
done
printf "\n8/20: read UniRef90 entries and build Protein Sequence Cluster sequence and information databases:\n"
python3 ${BAKTA_DB_SCRIPTS}/init-pscc.py --taxonomy nodes.dmp --uniref50 uniref50.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --pscc pscc.faa --sorf pscc_sorf.faa
python3 ${BAKTA_DB_SCRIPTS}/init-pscc.py --taxonomy nodes.dmp --uniref50 uniref50.xml.gz --uniparc uniparc_active.fasta --db bakta.db --pscc pscc.faa --pscc_sorf pscc_sorf.faa
printf "\n8/20: build PSCC Diamond db ...\n"
diamond makedb --in pscc.faa --db pscc
diamond makedb --in pscc_sorf.faa --db sorf
Expand All @@ -133,7 +137,7 @@ rm uniref50.xml.gz
printf "\n9/20: download UniProt UniRef90 ...\n"
wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref90/uniref90.xml.gz
printf "\n9/20: read UniRef90 entries and build Protein Sequence Cluster sequence and information databases:\n"
python3 ${BAKTA_DB_SCRIPTS}/init-psc.py --taxonomy nodes.dmp --uniref90 uniref90.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --psc psc.faa --sorf sorf.faa
python3 ${BAKTA_DB_SCRIPTS}/init-psc.py --taxonomy nodes.dmp --uniref90 uniref90.xml.gz --uniparc uniparc_active.fasta --db bakta.db --psc psc.faa --psc_sorf sorf.faa
printf "\n9/20: build PSC Diamond db ...\n"
diamond makedb --in psc.faa --db psc
diamond makedb --in sorf.faa --db sorf
Expand All @@ -148,7 +152,7 @@ rm uniref90.xml.gz
printf "\n10/20: download UniProt UniRef100 ...\n"
wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref100/uniref100.xml.gz
printf "\n10/20: read, filter and store UniRef100 entries ...:\n"
python3 ${BAKTA_DB_SCRIPTS}/init-ups-ips.py --taxonomy nodes.dmp --uniref100 uniref100.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --ips ips.faa
python3 ${BAKTA_DB_SCRIPTS}/init-ups-ips.py --taxonomy nodes.dmp --uniref100 uniref100.xml.gz --uniparc uniparc_active.fasta --db bakta.db --ips ips.faa
rm uniref100.xml.gz uniparc_active.fasta.gz


Expand Down Expand Up @@ -224,7 +228,7 @@ rm refseq-bacteria-nrp.trimmed.faa PCLA_proteins.txt PCLA_clusters.txt
# - annotate IPSs if IPS have no PSC UniRef90 identifier (seq -> hash -> UPS -> IPS)
############################################################################
printf "\n14/20: download UniProt/SwissProt ...\n"
wget https://ftp.expasy.org/databases/swiss-prot/release/uniprot_sprot.xml.gz
wget https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
printf "\n14/20: annotate IPSs and PSCs ...\n"
python3 ${BAKTA_DB_SCRIPTS}/annotate-swissprot.py --taxonomy nodes.dmp --xml uniprot_sprot.xml.gz --db bakta.db
rm uniprot_sprot.xml.gz
Expand All @@ -241,7 +245,7 @@ wget https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv
grep -v "(Provisional)" hmm_PGAP.tsv > hmms.non-prov.tsv
grep exception hmms.non-prov.tsv > hmms.ncbi.selected.tsv
grep equivalog hmms.non-prov.tsv >> hmms.ncbi.selected.tsv
cut -f1 hmms.ncbi.selected.tsv > hmms.ids.txt
sort hmms.ncbi.selected.tsv | uniq | cut -f1 > hmms.ids.txt
hmmfetch -f -o ncbifams hmm_PGAP.LIB hmms.ids.txt
hmmpress ncbifams
printf "\n15/20: annotate PSCs...\n"
Expand Down
2 changes: 1 addition & 1 deletion db-scripts/diamond.nf
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ process diamond {
maxRetries 3
cpus 8
memory '32 GB'
conda 'diamond=2.0.14'
conda 'diamond=2.1.8'

input:
file('input.faa') from chAAs
Expand Down
2 changes: 1 addition & 1 deletion db-scripts/hmmsearch.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ process hmmsearch {
maxRetries 3
cpus 1
memory { 1.GB * task.attempt }
conda 'hmmer=3.3.2'
conda 'hmmer=3.4'

input:
path('input.faa') from chAAs
Expand Down
4 changes: 2 additions & 2 deletions db-scripts/init-pscc.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,8 @@ def is_taxon_child(child, LCA, taxonomy):
log_pscc.debug('written UniParc seed sequences: %i', uniparc_total_seqs)

print(f'PSCC normal seqs: {pscc_seqs}')
log_pscc.debug('summary: # PSC normal=%i', pscc_seqs)
log_pscc.debug('summary: # PSCC normal=%i', pscc_seqs)
print(f'PSCC sORF seqs: {pscc_sorf_seqs}')
log_pscc.debug('summary: # PSC sORFs=%i', pscc_sorf_seqs)
log_pscc.debug('summary: # PSCC sORFs=%i', pscc_sorf_seqs)

print("\nsuccessfully initialized PSCC table!")

0 comments on commit e7f00df

Please sign in to comment.