Skip to content

Commit

Permalink
improve IS transposase annotation #10
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Feb 7, 2023
1 parent 0bd0c94 commit fab2993
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 28 deletions.
44 changes: 21 additions & 23 deletions db-scripts/annotate-is.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,18 +51,17 @@
length = int(length)
qcov = length / int(qlen)
scov = length / int(slen)
if(qcov >= 0.9 and scov >= 0.95 and float(pident) >= 98. and float(evalue) < 1e-6):
descriptions = stitle.split(' ', 1)[1][3:-3].split('~~~')
if(descriptions[1].lower() == 'transposase'):
descriptions = descriptions[0].split('_')
is_name = descriptions[0]
is_group = descriptions[1]
is_family = descriptions[2]
product = f"{is_family} family {is_name} transposase"
gene = f'tnp-{is_name}'
conn.execute('UPDATE ips SET gene=?, product=? WHERE uniref100_id=?', (gene, product, uniref100_id))
log_ips.info('UPDATE ips SET gene=%s, product=%s WHERE uniref100_id=%s', gene, product, uniref100_id)
ips_updated += 1
if(qcov >= 0.90 and scov >= 0.90 and float(pident) >= 95. and float(evalue) < 1e-6):
(is_name, is_group, is_family, is_orf) = sseqid[0].split('_')
gene = 'tnp'
product = f"{is_family} family {is_name} transposase"
if(is_orf == 'ORFA'):
product = f"{product} ORF A"
elif(is_orf == 'ORFB'):
product = f"{product} ORF B"
conn.execute('UPDATE ips SET gene=?, product=? WHERE uniref100_id=?', (gene, product, uniref100_id))
log_ips.info('UPDATE ips SET gene=%s, product=%s WHERE uniref100_id=%s', gene, product, uniref100_id)
ips_updated += 1
ips_processed += 1
if((ips_processed % 10000) == 0):
conn.commit()
Expand All @@ -81,17 +80,16 @@
qcov = length / int(qlen)
scov = length / int(slen)
if(qcov >= 0.80 and scov >= 0.80 and float(pident) >= 90. and float(evalue) < 1e-6):
descriptions = stitle.split(' ', 1)[1][3:-3].split('~~~')
if(descriptions[1].lower() == 'transposase'):
descriptions = descriptions[0].split('_')
is_name = descriptions[0]
is_group = descriptions[1]
is_family = descriptions[2]
product = f"{is_family} family transposase"
gene = f'tnp-{is_family}'
conn.execute('UPDATE psc SET gene=?, product=? WHERE uniref90_id=?', (gene, product, uniref90_id))
log_psc.info('UPDATE psc SET gene=%s, product=%s WHERE uniref90_id=%s', gene, product, uniref90_id)
psc_updated += 1
(is_name, is_group, is_family, is_orf) = sseqid[0].split('_')
gene = 'tnp'
product = f"{is_family} family transposase"
if(is_orf == 'ORFA'):
product = f"{product} ORF A"
elif(is_orf == 'ORFB'):
product = f"{product} ORF B"
conn.execute('UPDATE psc SET gene=?, product=? WHERE uniref90_id=?', (gene, product, uniref90_id))
log_psc.info('UPDATE psc SET gene=%s, product=%s WHERE uniref90_id=%s', gene, product, uniref90_id)
psc_updated += 1
psc_processed += 1
if((psc_processed % 10000) == 0):
conn.commit()
Expand Down
11 changes: 6 additions & 5 deletions db-scripts/buid-db.sh
Original file line number Diff line number Diff line change
Expand Up @@ -238,14 +238,15 @@ rm ReferenceGeneCatalog.txt
############################################################################
# Integrate ISfinder db
# - download IS protein sequences from GitHub (oschwengers/ISfinder-sequences)
# - annotate IPSs with IS info
# - extract IS transposase sequences and mark ORF A/B transposases
# - annotate IPSs/PCSs with IS info
############################################################################
printf "\n16/18: download ISfinder protein sequences ...\n"
printf "\n16/18: download & extract ISfinder protein sequences ...\n"
wget https://github.com/oschwengers/ISfinder-sequences/raw/2e9162bd5e3448c86ec1549a55315e498bef72fc/IS.faa
printf "\n16/18: annotate IPSs ...\n"
grep -A 1 ~~~Transposase~~~ IS.faa | tr -d - | tr -s "\n" > is.transposase.faa
python3 ${BAKTA_DB_SCRIPTS}/extract-is.py --input IS.faa --output is.transposase.faa
printf "\n16/18: annotate IPSs/PCSs ...\n"
diamond makedb --in is.transposase.faa --db is
nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in ips.faa --db is.dmnd --block 1000000 --id 98 --qcov 99 --scov 99 --out diamond.is.ips.tsv
nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in ips.faa --db is.dmnd --block 1000000 --id 95 --qcov 90 --scov 90 --out diamond.is.ips.tsv
nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in psc.faa --db is.dmnd --block 1000000 --id 90 --qcov 80 --scov 80 --out diamond.is.psc.tsv
python3 ${BAKTA_DB_SCRIPTS}/annotate-is.py --db bakta.db --ips-alignments diamond.is.ips.tsv --psc-alignments diamond.is.psc.tsv
rm IS.faa is.transposase.faa is.dmnd diamond.is.ips.tsv diamond.is.psc.tsv
Expand Down
53 changes: 53 additions & 0 deletions db-scripts/extract-is.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import argparse
import logging

from pathlib import Path

from Bio import SeqIO

parser = argparse.ArgumentParser(
description='Extract IS transposases from ISFinder.'
)
parser.add_argument('--input', action='store', help='Path to ISFinder file.')
parser.add_argument('--output', action='store', help='Path to transposase file.')
args = parser.parse_args()

input_path = Path(args.input).resolve()
output_path = Path(args.output).resolve()

ises = {}
transposases = 0
with input_path.open() as fh_in:
for record in SeqIO.parse(fh_in, 'fasta'):
id = record.id
seq = str(record.seq).upper()
descriptions = record.description.split(' ', 1)[1][3:-3].split('~~~')
if(descriptions[1].lower() == 'transposase' and len(seq) > 0):
(is_name, is_group, is_family, is_orf) = descriptions[0].split('_', 3)
if(id in ises):
ise = ises[id]
orfs = ise['orfs']
orfs.append((is_orf, seq))
else:
ises[id] = {
'name': is_name,
'group': is_group,
'family': is_family,
'orfs': [(is_orf, seq)]
}

with output_path.open('w') as fh_out:
for id, ise in ises.items():
orfs = ise['orfs']
if(len(orfs) == 3 and orfs[0][1][:5] == orfs[2][1][:5]): # check if ORF 3 starts with ORF 1 -> fusion ORF -> annotate ORF1/2 as transposase orfA/orfB
fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORFA\n{orfs[0][1]}\n")
fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORFB\n{orfs[1][1]}\n")
fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORF\n{orfs[2][1]}\n")
transposases += 3
else:
for idx, transposase in enumerate(orfs):
(orf, seq) = transposase
fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORF{idx+1}\n{seq}\n")
transposases += 1

print(f'\tstored transposase sequences: {transposases}')

0 comments on commit fab2993

Please sign in to comment.