diff --git a/db-scripts/annotate-is.py b/db-scripts/annotate-is.py index 0401c64a..2e25f7c6 100644 --- a/db-scripts/annotate-is.py +++ b/db-scripts/annotate-is.py @@ -51,18 +51,17 @@ length = int(length) qcov = length / int(qlen) scov = length / int(slen) - if(qcov >= 0.9 and scov >= 0.95 and float(pident) >= 98. and float(evalue) < 1e-6): - descriptions = stitle.split(' ', 1)[1][3:-3].split('~~~') - if(descriptions[1].lower() == 'transposase'): - descriptions = descriptions[0].split('_') - is_name = descriptions[0] - is_group = descriptions[1] - is_family = descriptions[2] - product = f"{is_family} family {is_name} transposase" - gene = f'tnp-{is_name}' - conn.execute('UPDATE ips SET gene=?, product=? WHERE uniref100_id=?', (gene, product, uniref100_id)) - log_ips.info('UPDATE ips SET gene=%s, product=%s WHERE uniref100_id=%s', gene, product, uniref100_id) - ips_updated += 1 + if(qcov >= 0.90 and scov >= 0.90 and float(pident) >= 95. and float(evalue) < 1e-6): + (is_name, is_group, is_family, is_orf) = sseqid[0].split('_') + gene = 'tnp' + product = f"{is_family} family {is_name} transposase" + if(is_orf == 'ORFA'): + product = f"{product} ORF A" + elif(is_orf == 'ORFB'): + product = f"{product} ORF B" + conn.execute('UPDATE ips SET gene=?, product=? WHERE uniref100_id=?', (gene, product, uniref100_id)) + log_ips.info('UPDATE ips SET gene=%s, product=%s WHERE uniref100_id=%s', gene, product, uniref100_id) + ips_updated += 1 ips_processed += 1 if((ips_processed % 10000) == 0): conn.commit() @@ -81,17 +80,16 @@ qcov = length / int(qlen) scov = length / int(slen) if(qcov >= 0.80 and scov >= 0.80 and float(pident) >= 90. and float(evalue) < 1e-6): - descriptions = stitle.split(' ', 1)[1][3:-3].split('~~~') - if(descriptions[1].lower() == 'transposase'): - descriptions = descriptions[0].split('_') - is_name = descriptions[0] - is_group = descriptions[1] - is_family = descriptions[2] - product = f"{is_family} family transposase" - gene = f'tnp-{is_family}' - conn.execute('UPDATE psc SET gene=?, product=? WHERE uniref90_id=?', (gene, product, uniref90_id)) - log_psc.info('UPDATE psc SET gene=%s, product=%s WHERE uniref90_id=%s', gene, product, uniref90_id) - psc_updated += 1 + (is_name, is_group, is_family, is_orf) = sseqid[0].split('_') + gene = 'tnp' + product = f"{is_family} family transposase" + if(is_orf == 'ORFA'): + product = f"{product} ORF A" + elif(is_orf == 'ORFB'): + product = f"{product} ORF B" + conn.execute('UPDATE psc SET gene=?, product=? WHERE uniref90_id=?', (gene, product, uniref90_id)) + log_psc.info('UPDATE psc SET gene=%s, product=%s WHERE uniref90_id=%s', gene, product, uniref90_id) + psc_updated += 1 psc_processed += 1 if((psc_processed % 10000) == 0): conn.commit() diff --git a/db-scripts/buid-db.sh b/db-scripts/buid-db.sh index e6801280..4ac0ba28 100644 --- a/db-scripts/buid-db.sh +++ b/db-scripts/buid-db.sh @@ -238,14 +238,15 @@ rm ReferenceGeneCatalog.txt ############################################################################ # Integrate ISfinder db # - download IS protein sequences from GitHub (oschwengers/ISfinder-sequences) -# - annotate IPSs with IS info +# - extract IS transposase sequences and mark ORF A/B transposases +# - annotate IPSs/PCSs with IS info ############################################################################ -printf "\n16/18: download ISfinder protein sequences ...\n" +printf "\n16/18: download & extract ISfinder protein sequences ...\n" wget https://github.com/oschwengers/ISfinder-sequences/raw/2e9162bd5e3448c86ec1549a55315e498bef72fc/IS.faa -printf "\n16/18: annotate IPSs ...\n" -grep -A 1 ~~~Transposase~~~ IS.faa | tr -d - | tr -s "\n" > is.transposase.faa +python3 ${BAKTA_DB_SCRIPTS}/extract-is.py --input IS.faa --output is.transposase.faa +printf "\n16/18: annotate IPSs/PCSs ...\n" diamond makedb --in is.transposase.faa --db is -nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in ips.faa --db is.dmnd --block 1000000 --id 98 --qcov 99 --scov 99 --out diamond.is.ips.tsv +nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in ips.faa --db is.dmnd --block 1000000 --id 95 --qcov 90 --scov 90 --out diamond.is.ips.tsv nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in psc.faa --db is.dmnd --block 1000000 --id 90 --qcov 80 --scov 80 --out diamond.is.psc.tsv python3 ${BAKTA_DB_SCRIPTS}/annotate-is.py --db bakta.db --ips-alignments diamond.is.ips.tsv --psc-alignments diamond.is.psc.tsv rm IS.faa is.transposase.faa is.dmnd diamond.is.ips.tsv diamond.is.psc.tsv diff --git a/db-scripts/extract-is.py b/db-scripts/extract-is.py new file mode 100644 index 00000000..982026e3 --- /dev/null +++ b/db-scripts/extract-is.py @@ -0,0 +1,53 @@ +import argparse +import logging + +from pathlib import Path + +from Bio import SeqIO + +parser = argparse.ArgumentParser( + description='Extract IS transposases from ISFinder.' +) +parser.add_argument('--input', action='store', help='Path to ISFinder file.') +parser.add_argument('--output', action='store', help='Path to transposase file.') +args = parser.parse_args() + +input_path = Path(args.input).resolve() +output_path = Path(args.output).resolve() + +ises = {} +transposases = 0 +with input_path.open() as fh_in: + for record in SeqIO.parse(fh_in, 'fasta'): + id = record.id + seq = str(record.seq).upper() + descriptions = record.description.split(' ', 1)[1][3:-3].split('~~~') + if(descriptions[1].lower() == 'transposase' and len(seq) > 0): + (is_name, is_group, is_family, is_orf) = descriptions[0].split('_', 3) + if(id in ises): + ise = ises[id] + orfs = ise['orfs'] + orfs.append((is_orf, seq)) + else: + ises[id] = { + 'name': is_name, + 'group': is_group, + 'family': is_family, + 'orfs': [(is_orf, seq)] + } + +with output_path.open('w') as fh_out: + for id, ise in ises.items(): + orfs = ise['orfs'] + if(len(orfs) == 3 and orfs[0][1][:5] == orfs[2][1][:5]): # check if ORF 3 starts with ORF 1 -> fusion ORF -> annotate ORF1/2 as transposase orfA/orfB + fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORFA\n{orfs[0][1]}\n") + fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORFB\n{orfs[1][1]}\n") + fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORF\n{orfs[2][1]}\n") + transposases += 3 + else: + for idx, transposase in enumerate(orfs): + (orf, seq) = transposase + fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORF{idx+1}\n{seq}\n") + transposases += 1 + +print(f'\tstored transposase sequences: {transposases}')