From fab2993d2d5817e4f5cf5676c71df91cc4883126 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 7 Feb 2023 11:31:40 +0100
Subject: [PATCH] improve IS transposase annotation #10

---
 db-scripts/annotate-is.py | 44 ++++++++++++++++----------------
 db-scripts/buid-db.sh     | 11 ++++----
 db-scripts/extract-is.py  | 53 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 28 deletions(-)
 create mode 100644 db-scripts/extract-is.py

diff --git a/db-scripts/annotate-is.py b/db-scripts/annotate-is.py
index 0401c64a..2e25f7c6 100644
--- a/db-scripts/annotate-is.py
+++ b/db-scripts/annotate-is.py
@@ -51,18 +51,17 @@
             length = int(length)
             qcov = length / int(qlen)
             scov = length / int(slen)
-            if(qcov >= 0.9 and scov >= 0.95 and float(pident) >= 98. and float(evalue) < 1e-6):
-                descriptions = stitle.split(' ', 1)[1][3:-3].split('~~~')
-                if(descriptions[1].lower() == 'transposase'):
-                    descriptions = descriptions[0].split('_')
-                    is_name = descriptions[0]
-                    is_group = descriptions[1]
-                    is_family = descriptions[2]
-                    product = f"{is_family} family {is_name} transposase"
-                    gene = f'tnp-{is_name}'
-                    conn.execute('UPDATE ips SET gene=?, product=? WHERE uniref100_id=?', (gene, product, uniref100_id))
-                    log_ips.info('UPDATE ips SET gene=%s, product=%s WHERE uniref100_id=%s', gene, product, uniref100_id)
-                    ips_updated += 1
+            if(qcov >= 0.90 and scov >= 0.90 and float(pident) >= 95. and float(evalue) < 1e-6):
+                (is_name, is_group, is_family, is_orf) = sseqid[0].split('_')
+                gene = 'tnp'
+                product = f"{is_family} family {is_name} transposase"
+                if(is_orf == 'ORFA'):
+                    product = f"{product} ORF A"
+                elif(is_orf == 'ORFB'):
+                    product = f"{product} ORF B"
+                conn.execute('UPDATE ips SET gene=?, product=? WHERE uniref100_id=?', (gene, product, uniref100_id))
+                log_ips.info('UPDATE ips SET gene=%s, product=%s WHERE uniref100_id=%s', gene, product, uniref100_id)
+                ips_updated += 1
             ips_processed += 1
             if((ips_processed % 10000) == 0):
                 conn.commit()
@@ -81,17 +80,16 @@
             qcov = length / int(qlen)
             scov = length / int(slen)
             if(qcov >= 0.80 and scov >= 0.80 and float(pident) >= 90. and float(evalue) < 1e-6):
-                descriptions = stitle.split(' ', 1)[1][3:-3].split('~~~')
-                if(descriptions[1].lower() == 'transposase'):
-                    descriptions = descriptions[0].split('_')
-                    is_name = descriptions[0]
-                    is_group = descriptions[1]
-                    is_family = descriptions[2]
-                    product = f"{is_family} family transposase"
-                    gene = f'tnp-{is_family}'
-                    conn.execute('UPDATE psc SET gene=?, product=? WHERE uniref90_id=?', (gene, product, uniref90_id))
-                    log_psc.info('UPDATE psc SET gene=%s, product=%s WHERE uniref90_id=%s', gene, product, uniref90_id)
-                    psc_updated += 1
+                (is_name, is_group, is_family, is_orf) = sseqid[0].split('_')
+                gene = 'tnp'
+                product = f"{is_family} family transposase"
+                if(is_orf == 'ORFA'):
+                    product = f"{product} ORF A"
+                elif(is_orf == 'ORFB'):
+                    product = f"{product} ORF B"
+                conn.execute('UPDATE psc SET gene=?, product=? WHERE uniref90_id=?', (gene, product, uniref90_id))
+                log_psc.info('UPDATE psc SET gene=%s, product=%s WHERE uniref90_id=%s', gene, product, uniref90_id)
+                psc_updated += 1
             psc_processed += 1
             if((psc_processed % 10000) == 0):
                 conn.commit()
diff --git a/db-scripts/buid-db.sh b/db-scripts/buid-db.sh
index e6801280..4ac0ba28 100644
--- a/db-scripts/buid-db.sh
+++ b/db-scripts/buid-db.sh
@@ -238,14 +238,15 @@ rm ReferenceGeneCatalog.txt
 ############################################################################
 # Integrate ISfinder db
 # - download IS protein sequences from GitHub (oschwengers/ISfinder-sequences)
-# - annotate IPSs with IS info
+# - extract IS transposase sequences and mark ORF A/B transposases
+# - annotate IPSs/PCSs with IS info
 ############################################################################
-printf "\n16/18: download ISfinder protein sequences ...\n"
+printf "\n16/18: download & extract ISfinder protein sequences ...\n"
 wget https://github.com/oschwengers/ISfinder-sequences/raw/2e9162bd5e3448c86ec1549a55315e498bef72fc/IS.faa
-printf "\n16/18: annotate IPSs ...\n"
-grep -A 1 ~~~Transposase~~~ IS.faa | tr -d - | tr -s "\n" > is.transposase.faa
+python3 ${BAKTA_DB_SCRIPTS}/extract-is.py --input IS.faa --output is.transposase.faa
+printf "\n16/18: annotate IPSs/PCSs ...\n"
 diamond makedb --in is.transposase.faa --db is
-nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in ips.faa --db is.dmnd --block 1000000 --id 98 --qcov 99 --scov 99 --out diamond.is.ips.tsv
+nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in ips.faa --db is.dmnd --block 1000000 --id 95 --qcov 90 --scov 90 --out diamond.is.ips.tsv
 nextflow run ${BAKTA_DB_SCRIPTS}/diamond.nf --in psc.faa --db is.dmnd --block 1000000 --id 90 --qcov 80 --scov 80 --out diamond.is.psc.tsv
 python3 ${BAKTA_DB_SCRIPTS}/annotate-is.py --db bakta.db --ips-alignments diamond.is.ips.tsv --psc-alignments diamond.is.psc.tsv
 rm IS.faa is.transposase.faa is.dmnd diamond.is.ips.tsv diamond.is.psc.tsv
diff --git a/db-scripts/extract-is.py b/db-scripts/extract-is.py
new file mode 100644
index 00000000..982026e3
--- /dev/null
+++ b/db-scripts/extract-is.py
@@ -0,0 +1,53 @@
+import argparse
+import logging
+
+from pathlib import Path
+
+from Bio import SeqIO
+
+parser = argparse.ArgumentParser(
+    description='Extract IS transposases from ISFinder.'
+)
+parser.add_argument('--input', action='store', help='Path to ISFinder file.')
+parser.add_argument('--output', action='store', help='Path to transposase file.')
+args = parser.parse_args()
+
+input_path = Path(args.input).resolve()
+output_path = Path(args.output).resolve()
+
+ises = {}
+transposases = 0
+with input_path.open() as fh_in:
+    for record in SeqIO.parse(fh_in, 'fasta'):
+        id = record.id
+        seq = str(record.seq).upper()
+        descriptions = record.description.split(' ', 1)[1][3:-3].split('~~~')
+        if(descriptions[1].lower() == 'transposase'  and  len(seq) > 0):
+            (is_name, is_group, is_family, is_orf) = descriptions[0].split('_', 3)
+            if(id in ises):
+                ise = ises[id]
+                orfs = ise['orfs']
+                orfs.append((is_orf, seq))
+            else:
+                ises[id] = {
+                    'name': is_name,
+                    'group': is_group,
+                    'family': is_family,
+                    'orfs': [(is_orf, seq)]
+                }
+
+with output_path.open('w') as fh_out:
+    for id, ise in ises.items():
+        orfs = ise['orfs']
+        if(len(orfs) == 3 and orfs[0][1][:5] == orfs[2][1][:5]):  # check if ORF 3 starts with ORF 1 -> fusion ORF -> annotate ORF1/2 as transposase orfA/orfB
+            fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORFA\n{orfs[0][1]}\n")
+            fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORFB\n{orfs[1][1]}\n")
+            fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORF\n{orfs[2][1]}\n")
+            transposases += 3
+        else:
+            for idx, transposase in enumerate(orfs):
+                (orf, seq) = transposase
+                fh_out.write(f">{ise['name']}_{ise['group']}_{ise['family']}_ORF{idx+1}\n{seq}\n")
+                transposases += 1
+
+print(f'\tstored transposase sequences: {transposases}')