From 6a8e3a7c430c0b53769747d04161f3c511f3c956 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Wed, 25 Sep 2024 12:52:16 +0200
Subject: [PATCH 01/10] rename io functions

---
 bakta/io/gff.py           |  4 ++--
 bakta/io/insdc.py         |  2 +-
 bakta/io/tsv.py           | 12 ++++++------
 bakta/main.py             | 15 ++++++++-------
 bakta/plot.py             |  6 +++---
 bakta/proteins.py         |  4 ++--
 scripts/extract-region.py |  4 ++--
 7 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/bakta/io/gff.py b/bakta/io/gff.py
index 35fcef08..086ebfde 100644
--- a/bakta/io/gff.py
+++ b/bakta/io/gff.py
@@ -14,9 +14,9 @@
 log = logging.getLogger('GFF')
 
 
-def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Path):
+def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Path):
     """Export features in GFF3 format."""
-    log.info('write GFF3: path=%s', gff3_path)
+    log.info('write features: path=%s', gff3_path)
 
     with gff3_path.open('wt') as fh:
         fh.write('##gff-version 3\n')  # GFF version
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
index ba8d630e..5a1de13a 100644
--- a/bakta/io/insdc.py
+++ b/bakta/io/insdc.py
@@ -18,7 +18,7 @@
 log = logging.getLogger('INSDC')
 
 
-def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
+def write_features(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
     log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path)
 
     contig_list = []
diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py
index 8a3eb25a..cb492533 100644
--- a/bakta/io/tsv.py
+++ b/bakta/io/tsv.py
@@ -12,9 +12,9 @@
 log = logging.getLogger('TSV')
 
 
-def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
+def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
     """Export features in TSV format."""
-    log.info('write tsv: path=%s', tsv_path)
+    log.info('write feature tsv: path=%s', tsv_path)
 
     with tsv_path.open('wt') as fh:
         fh.write('# Annotated with Bakta\n')
@@ -63,9 +63,9 @@ def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_
     return
 
 
-def write_features(features: Sequence[dict], header_columns: Sequence[str], mapping: LambdaType, tsv_path: Path):
-    """Export features in TSV format."""
-    log.info('write tsv: path=%s', tsv_path)
+def write_protein_features(features: Sequence[dict], header_columns: Sequence[str], mapping: LambdaType, tsv_path: Path):
+    """Export protein features in TSV format."""
+    log.info('write protein feature tsv: path=%s', tsv_path)
 
     with tsv_path.open('wt') as fh:
         fh.write(f'#Annotated with Bakta (v{bakta.__version__}): https://github.com/oschwengers/bakta\n')
@@ -79,7 +79,7 @@ def write_features(features: Sequence[dict], header_columns: Sequence[str], mapp
     return
 
 
-def write_hypotheticals_tsv(hypotheticals: Sequence[dict], tsv_path: Path):
+def write_hypotheticals(hypotheticals: Sequence[dict], tsv_path: Path):
     """Export hypothetical information in TSV format."""
     log.info('write hypothetical tsv: path=%s', tsv_path)
 
diff --git a/bakta/main.py b/bakta/main.py
index 1765cb20..e9c107c3 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -92,9 +92,10 @@ def main():
         if(cfg.skip_gap): print(f'\tskip gap: {cfg.skip_gap}')
         if(cfg.skip_ori): print(f'\tskip oriC/V/T: {cfg.skip_ori}')
         if(cfg.skip_plot): print(f'\tskip plot: {cfg.skip_plot}')
+        print()
     
     if(cfg.debug):
-        print(f"\nBakta runs in DEBUG mode! Temporary data will not be destroyed at: {cfg.tmp_path}")
+        print(f"Bakta runs in DEBUG mode! Temporary data will not be destroyed at: {cfg.tmp_path}\n")
     else:
         atexit.register(bu.cleanup, log, cfg.tmp_path)  # register cleanup exit hook
 
@@ -104,7 +105,7 @@ def main():
     # - apply contig length filter
     # - rename contigs
     ############################################################################
-    print('\nparse genome sequences...')
+    print('parse genome sequences...')
     try:
         contigs = fasta.import_contigs(cfg.genome_path)
         log.info('imported sequences=%i', len(contigs))
@@ -535,16 +536,16 @@ def main():
     print(f'\nexport annotation results to: {cfg.output_path}')
     print('\thuman readable TSV...')
     tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv')
-    tsv.write_tsv(genome['contigs'], features_by_contig, tsv_path)
+    tsv.write_features(genome['contigs'], features_by_contig, tsv_path)
 
     print('\tGFF3...')
     gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3')
-    gff.write_gff3(genome, features_by_contig, gff3_path)
+    gff.write_features(genome, features_by_contig, gff3_path)
 
     print('\tINSDC GenBank & EMBL...')
     genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff')
     embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl')
-    insdc.write_insdc(genome, features, genbank_path, embl_path)
+    insdc.write_features(genome, features, genbank_path, embl_path)
 
     print('\tgenome sequences...')
     fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna')
@@ -562,13 +563,13 @@ def main():
         print('\tskip generation of circular genome plot...')
     else:
         print('\tcircular genome plot...')
-        plot.write_plot(features, contigs, cfg.output_path)
+        plot.write(features, contigs, cfg.output_path)
 
     if(cfg.skip_cds is False):
         hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]
         print('\thypothetical TSV...')
         tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv')
-        tsv.write_hypotheticals_tsv(hypotheticals, tsv_path)
+        tsv.write_hypotheticals(hypotheticals, tsv_path)
 
         print('\ttranslated hypothetical CDS sequences...')
         faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.faa')
diff --git a/bakta/plot.py b/bakta/plot.py
index 7ec5def9..675b52aa 100644
--- a/bakta/plot.py
+++ b/bakta/plot.py
@@ -180,7 +180,7 @@ def main():
 
     if args.sequences == 'all':  # write whole genome plot
         print(f'draw circular genome plot (type={plot_type}) containing all sequences...')
-        write_plot(features, contigs, output_path, colors, plot_type=plot_type)
+        write(features, contigs, output_path, colors, plot_type=plot_type)
     else:  # write genome plot containing provided sequences only
         plot_contigs = []
         sequence_identifiers = []
@@ -198,10 +198,10 @@ def main():
             plot_name_suffix = '_'.join(sequence_identifiers)
             plot_contig_ids = [c['id'] for c in plot_contigs]
             features = [feat for feat in features if feat['contig'] in plot_contig_ids]
-            write_plot(features, plot_contigs, output_path, colors, plot_name_suffix=plot_name_suffix, plot_type=plot_type)
+            write(features, plot_contigs, output_path, colors, plot_name_suffix=plot_name_suffix, plot_type=plot_type)
 
 
-def write_plot(features, contigs, output_path, colors=COLORS, plot_name_suffix=None, plot_type=bc.PLOT_FEATURES):
+def write(features, contigs, output_path, colors=COLORS, plot_name_suffix=None, plot_type=bc.PLOT_FEATURES):
     # config paths
     circos_path = cfg.tmp_path.joinpath(f'circos')
     circos_path.mkdir(parents=True, exist_ok=True)
diff --git a/bakta/proteins.py b/bakta/proteins.py
index eb827129..a3b51158 100644
--- a/bakta/proteins.py
+++ b/bakta/proteins.py
@@ -154,7 +154,7 @@ def main():
     annotations_path = output_path.joinpath(f'{cfg.prefix}.tsv')
     header_columns = ['ID', 'Length', 'Gene', 'Product', 'EC', 'GO', 'COG', 'RefSeq', 'UniParc', 'UniRef']
     print(f'\tfull annotations (TSV): {annotations_path}')
-    tsv.write_features(aas, header_columns, map_aa_columns, annotations_path)
+    tsv.write_protein_features(aas, header_columns, map_aa_columns, annotations_path)
     full_annotations_path = output_path.joinpath(f'{cfg.prefix}.json')
     print(f'\tfull annotations (JSON): {full_annotations_path}')
     json.write_json(None, aas, full_annotations_path)
@@ -162,7 +162,7 @@ def main():
     header_columns = ['ID', 'Length', 'Mol Weight [kDa]', 'Iso El. Point', 'Pfam hits']
     hypotheticals = hypotheticals = [aa for aa in aas if 'hypothetical' in aa]
     print(f'\tinformation on hypotheticals (TSV): {hypotheticals_path}')
-    tsv.write_features(hypotheticals, header_columns, map_hypothetical_columns, hypotheticals_path)
+    tsv.write_protein_features(hypotheticals, header_columns, map_hypothetical_columns, hypotheticals_path)
     aa_output_path = output_path.joinpath(f'{cfg.prefix}.faa')
     print(f'\tannotated sequences (Fasta): {aa_output_path}')
     fasta.write_faa(aas, aa_output_path)
diff --git a/scripts/extract-region.py b/scripts/extract-region.py
index 303b07f5..b357b137 100755
--- a/scripts/extract-region.py
+++ b/scripts/extract-region.py
@@ -76,11 +76,11 @@
 print('Write selected features...')
 output_path = Path(args.output).resolve()
 gff3_path = output_path.joinpath(f'{prefix}.gff3')
-gff.write_gff3(genome, features_by_contig, gff3_path)
+gff.write_features(genome, features_by_contig, gff3_path)
 print('\t...INSDC GenBank & EMBL')
 genbank_path = output_path.joinpath(f'{prefix}.gbff')
 embl_path = output_path.joinpath(f'{prefix}.embl')
-insdc.write_insdc(genome, features_selected, genbank_path, embl_path)
+insdc.write_features(genome, features_selected, genbank_path, embl_path)
 print('\t...feature nucleotide sequences')
 ffn_path = output_path.joinpath(f'{prefix}.ffn')
 fasta.write_ffn(features_selected, ffn_path)

From 1bf5d6fc3ae8cdb735bea25da53eeaa9eb0f0ad7 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Wed, 25 Sep 2024 15:04:48 +0200
Subject: [PATCH 02/10] fix merged pscc dicts

---
 bakta/pscc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bakta/pscc.py b/bakta/pscc.py
index ee1be42d..dca2d9ef 100644
--- a/bakta/pscc.py
+++ b/bakta/pscc.py
@@ -120,7 +120,7 @@ def lookup(features: Sequence[dict], pseudo: bool = False):
                 if(pseudo):
                     feature[bc.PSEUDOGENE]['pscc'] = pscc
                 else:
-                    feature['pscc'] = pscc
+                    feature['pscc'] = {**feature['pscc'], **pscc}  # merge dicts to store alignment info for later PSC/PSCC annotations
                 no_pscc_lookups += 1
                 log.debug(
                     'lookup: contig=%s, start=%i, stop=%i, strand=%s, UniRef50=%s, product=%s',

From f267ba206cae295b309603ed913bbed4d2b68ccd Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Wed, 25 Sep 2024 15:47:20 +0200
Subject: [PATCH 03/10] store and log bitscores and evalues

---
 bakta/features/s_orf.py | 10 +++++++---
 bakta/psc.py            |  8 ++++++--
 bakta/pscc.py           |  8 ++++++--
 3 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py
index 04e52fdd..f0bc3005 100644
--- a/bakta/features/s_orf.py
+++ b/bakta/features/s_orf.py
@@ -353,16 +353,20 @@ def search(sorfs: Sequence[dict], cluster_type: str):
             sorf = sorf_by_aa_digest[sorf_hash]
             query_cov = int(alignment_length) / len(sorf['aa'])
             identity = float(identity) / 100
+            bitscore = float(bitscore)
+            evalue = float(evalue)
             if(query_cov >= bc.MIN_SORF_COVERAGE and identity >= bc.MIN_SORF_IDENTITY):
                 result = {
                     'query_cov': query_cov,
-                    'identity': identity
+                    'identity': identity,
+                    'score': bitscore,
+                    'evalue': evalue
                 }
                 result[psc.DB_PSC_COL_UNIREF90 if cluster_type == 'full' else pscc.DB_PSCC_COL_UNIREF50] = cluster_id
                 sorf['psc' if cluster_type == 'full' else 'pscc'] = result
                 log.info(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, UniRef90=%s',
-                    sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, identity, cluster_id
+                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%s, UniRef90=%s',
+                    sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, identity, bitscore, evalue, cluster_id
                 )
 
     sorfs_found = []
diff --git a/bakta/psc.py b/bakta/psc.py
index 27d80b80..f16b270d 100644
--- a/bakta/psc.py
+++ b/bakta/psc.py
@@ -72,16 +72,20 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
             identity = float(identity) / 100
+            bitscore = float(bitscore)
+            evalue = float(evalue)
             if(query_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):
                 cds['psc'] = {
                     DB_PSC_COL_UNIREF90: cluster_id,
                     'query_cov': query_cov,
                     'identity': identity,
+                    'score': bitscore,
+                    'evalue': evalue,
                     'valid': identity >= bc.MIN_PSC_IDENTITY
                 }
                 log.debug(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, UniRef90=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, identity, cluster_id
+                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%s, UniRef90=%s',
+                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, identity, bitscore, evalue, cluster_id
                 )
 
     pscs_found = []
diff --git a/bakta/pscc.py b/bakta/pscc.py
index dca2d9ef..1bb1e193 100644
--- a/bakta/pscc.py
+++ b/bakta/pscc.py
@@ -65,16 +65,20 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
             identity = float(identity) / 100
+            bitscore = float(bitscore)
+            evalue = float(evalue)
             if(query_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):
                 cds['pscc'] = {
                     DB_PSCC_COL_UNIREF50: cluster_id,
                     'query_cov': query_cov,
                     'identity': identity,
+                    'score': bitscore,
+                    'evalue': evalue,
                     'valid': identity >= bc.MIN_PSC_IDENTITY
                 }
                 log.debug(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, UniRef90=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, identity, cluster_id
+                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%s, UniRef50=%s',
+                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, identity, bitscore, evalue, cluster_id
                 )
 
     psccs_found = []

From b111f4485a60dd2da49a9e8571f45afd33d590b9 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 27 Sep 2024 14:54:54 +0200
Subject: [PATCH 04/10] harmonize diamond calls, hit inference info and logging

---
 bakta/expert/protein_sequences.py | 14 +++++++-------
 bakta/features/cds.py             | 29 +++++++++++++++++++----------
 bakta/features/ori.py             |  2 +-
 bakta/features/s_orf.py           | 14 +++++++-------
 bakta/psc.py                      | 16 ++++++++--------
 bakta/pscc.py                     | 17 ++++++++---------
 6 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/bakta/expert/protein_sequences.py b/bakta/expert/protein_sequences.py
index fee7fa2b..982c7011 100644
--- a/bakta/expert/protein_sequences.py
+++ b/bakta/expert/protein_sequences.py
@@ -26,7 +26,7 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
         '--db', str(db_path),
         '--out', str(diamond_output_path),
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6', 'qseqid', 'sseqid', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'stitle',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),  # use tmp folder
         '--block-size', '4',  # increase block size for faster executions
@@ -51,13 +51,13 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
     cds_by_hexdigest = orf.get_orf_dictionary(cdss)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (aa_identifier, model_id, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
+            (aa_identifier, model_id, query_length, model_length, alignment_length, identity, evalue, bitscore, model_title) = line.strip().split('\t')
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
             model_cov = int(alignment_length) / int(model_length)
             identity = float(identity) / 100
-            evalue = float(evalue)
             bitscore = float(bitscore)
+            evalue = float(evalue)
             (source, rank, min_identity, min_query_cov, min_model_cov, gene, product, dbxrefs) = model_title.split(' ', 1)[1].split('~~~')
             rank = int(rank)
             min_identity = float(min_identity) / 100
@@ -72,10 +72,10 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
                     'gene': gene if gene != '' else None,
                     'product': product,
                     'query_cov': query_cov,
-                    'model_cov': model_cov,
+                    'subject_cov': model_cov,
                     'identity': identity,
+                    'score': bitscore,
                     'evalue': evalue,
-                    'bitscore': bitscore,
                     'db_xrefs': [] if dbxrefs == '' else dbxrefs.split(',')
                 }
                 if(expert_system == 'user_proteins'):
@@ -83,8 +83,8 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
                 cds.setdefault('expert', [])
                 cds['expert'].append(hit)
                 log.debug(
-                    'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, identity=%0.3f, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
-                    source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, gene, product, evalue, bitscore
+                    'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s',
+                    source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product
                 )
                 cds_found.add(aa_identifier)
 
diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index 19e555b1..fb4f91cf 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -528,7 +528,7 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
         '--query-cover', str(int(bc.MIN_PSEUDOGENE_QUERY_COVERAGE * 100)),      # '80'
         '--subject-cover', str(int(bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE * 100)),  # '40'
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6', 'qseqid', 'sseqid', 'pident', 'length', 'qstart', 'qend', 'sstart', 'send', 'full_sseq',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore', 'qstart', 'qend', 'sstart', 'send', 'full_sseq',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),
         '--block-size', '3',  # slightly increase block size for faster executions
@@ -552,18 +552,21 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
     cds_by_hexdigest = orf.get_orf_dictionary(hypotheticals)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (aa_identifier, cluster_id, identity, alignment_length, query_start, query_end, subject_start, subject_end, subject_sequence) = line.rstrip('\n').split('\t')
+            (aa_identifier, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore, query_start, query_end, subject_start, subject_end, subject_sequence) = line.rstrip('\n').split('\t')
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
-            subject_cov = int(alignment_length) / len(subject_sequence)
+            subject_cov = int(alignment_length) / int(subject_length)
             identity = float(identity) / 100
-            if query_cov >= bc.MIN_PSEUDOGENE_QUERY_COVERAGE and identity >= bc.MIN_PSEUDOGENE_IDENTITY \
-                    and bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE <= subject_cov < bc.MIN_PSC_COVERAGE:
+            bitscore = float(bitscore)
+            evalue = float(evalue)
+            if(query_cov >= bc.MIN_PSEUDOGENE_QUERY_COVERAGE and bc.MIN_PSEUDOGENE_SUBJECT_COVERAGE <= subject_cov < bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSEUDOGENE_IDENTITY):
                 cds['pseudo-inference'] = {
                     DB_PSC_COL_UNIREF90: cluster_id,
-                    'query-cov': query_cov,
-                    'subject-cov': subject_cov,
+                    'query_cov': query_cov,
+                    'subject_cov': subject_cov,
                     'identity': identity,
+                    'score': bitscore,
+                    'evalue': evalue,
                     'gene_start': int(query_start),
                     'gene_end': int(query_end),
                     'reference_start': int(subject_start),
@@ -572,8 +575,8 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
                 }
                 pseudo_candidates.append(cds)
                 log.debug(
-                    'pseudogene-candidate: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, UniRef90=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, cluster_id
+                    'pseudogene-candidate: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s',
+                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
     log.info('found: pseudogene-candidates=%i', len(pseudo_candidates))
     return pseudo_candidates
@@ -662,6 +665,9 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
                     query_alignment_start = int(hit.find('Hit_hsps/Hsp/Hsp_query-from').text)
                     query_alignment_stop = int(hit.find('Hit_hsps/Hsp/Hsp_query-to').text)
                     alignment_length = int(hit.find('Hit_hsps/Hsp/Hsp_align-len').text)
+                    identity = float(hit.find('Hit_hsps/Hsp/Hsp_identity').text) / alignment_length
+                    bitscore = float(hit.find('Hit_hsps/Hsp/Hsp_bit-score').text)
+                    evalue = float(hit.find('Hit_hsps/Hsp/Hsp_evalue').text)
 
                     if alignment_length == len(cds['aa']):  # skip non-extended genes (full match)
                         log.debug(
@@ -686,7 +692,10 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
                             'stop': positions['stop'],
                             'observations': clean_observations(observations),
                             'inference': cds['pseudo-inference'],
-                            'paralog': is_paralog(uniref90_by_hexdigest, aa_identifier, cluster_id)
+                            'paralog': is_paralog(uniref90_by_hexdigest, aa_identifier, cluster_id),
+                            'identity': identity,
+                            'score': bitscore,
+                            'evalue': evalue
                         }
 
                         effects = []
diff --git a/bakta/features/ori.py b/bakta/features/ori.py
index 735a68b2..82281612 100644
--- a/bakta/features/ori.py
+++ b/bakta/features/ori.py
@@ -72,7 +72,7 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di
                 if(len(contig_hits) == 1):
                     hits[hit['contig']] = contig_hits
                 log.debug(
-                    'raw hit: type=%s, contig=%s, start=%i, stop=%i, strand=%s, coverage=%f, identity=%f',
+                    'raw hit: type=%s, contig=%s, start=%i, stop=%i, strand=%s, coverage=%0.3f, identity=%0.3f',
                     ori_type, hit['contig'], hit['contig_start'], hit['contig_stop'], hit['strand'], hit['coverage'], hit['identity']
                 )
 
diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py
index f0bc3005..74124c32 100644
--- a/bakta/features/s_orf.py
+++ b/bakta/features/s_orf.py
@@ -324,7 +324,7 @@ def search(sorfs: Sequence[dict], cluster_type: str):
         '--query-cover', str(int(bc.MIN_SORF_COVERAGE * 100)),  # '90'
         '--subject-cover', str(int(bc.MIN_SORF_COVERAGE * 100)),  # '90'
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),  # use tmp folder
         '--block-size', '3',  # slightly increase block size for faster executions
@@ -347,17 +347,17 @@ def search(sorfs: Sequence[dict], cluster_type: str):
     sorf_by_aa_digest = orf.get_orf_dictionary(sorfs)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (sorf_hash, cluster_id, identity, alignment_length, align_mismatches,
-                align_gaps, query_start, query_end, subject_start, subject_end,
-                evalue, bitscore) = line.split('\t')
+            (sorf_hash, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore) = line.split('\t')
             sorf = sorf_by_aa_digest[sorf_hash]
             query_cov = int(alignment_length) / len(sorf['aa'])
+            subject_cov = int(alignment_length) / int(subject_length)
             identity = float(identity) / 100
             bitscore = float(bitscore)
             evalue = float(evalue)
-            if(query_cov >= bc.MIN_SORF_COVERAGE and identity >= bc.MIN_SORF_IDENTITY):
+            if(query_cov >= bc.MIN_SORF_COVERAGE and subject_cov >= bc.MIN_SORF_COVERAGE and identity >= bc.MIN_SORF_IDENTITY):
                 result = {
                     'query_cov': query_cov,
+                    'subject_cov': subject_cov,
                     'identity': identity,
                     'score': bitscore,
                     'evalue': evalue
@@ -365,8 +365,8 @@ def search(sorfs: Sequence[dict], cluster_type: str):
                 result[psc.DB_PSC_COL_UNIREF90 if cluster_type == 'full' else pscc.DB_PSCC_COL_UNIREF50] = cluster_id
                 sorf['psc' if cluster_type == 'full' else 'pscc'] = result
                 log.info(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%s, UniRef90=%s',
-                    sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, identity, bitscore, evalue, cluster_id
+                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef=%s',
+                    sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
 
     sorfs_found = []
diff --git a/bakta/psc.py b/bakta/psc.py
index f16b270d..24cb774b 100644
--- a/bakta/psc.py
+++ b/bakta/psc.py
@@ -43,7 +43,7 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen
         '--query-cover', str(int(bc.MIN_PSC_COVERAGE * 100)),  # '80'
         '--subject-cover', str(int(bc.MIN_PSC_COVERAGE * 100)),  # '80'
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),  # use tmp folder
         '--block-size', '3',  # slightly increase block size for faster executions
@@ -66,26 +66,26 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen
     cds_by_hexdigest = orf.get_orf_dictionary(cdss)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (aa_identifier, cluster_id, identity, alignment_length, align_mismatches,
-                align_gaps, query_start, query_end, subject_start, subject_end,
-                evalue, bitscore) = line.split('\t')
+            (aa_identifier, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore) = line.split('\t')
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
+            subject_cov = int(alignment_length) / int(subject_length)
             identity = float(identity) / 100
             bitscore = float(bitscore)
             evalue = float(evalue)
-            if(query_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):
+            if(query_cov >= bc.MIN_PSC_COVERAGE and subject_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):  # use PSCC min id (50%) to allow for pseudogene seeds
                 cds['psc'] = {
                     DB_PSC_COL_UNIREF90: cluster_id,
                     'query_cov': query_cov,
+                    'subject_cov': subject_cov,
                     'identity': identity,
                     'score': bitscore,
                     'evalue': evalue,
-                    'valid': identity >= bc.MIN_PSC_IDENTITY
+                    'valid': identity >= bc.MIN_PSC_IDENTITY  # whether a valid PSC hit (id > 90%)
                 }
                 log.debug(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%s, UniRef90=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, identity, bitscore, evalue, cluster_id
+                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s',
+                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
 
     pscs_found = []
diff --git a/bakta/pscc.py b/bakta/pscc.py
index 1bb1e193..c01144d7 100644
--- a/bakta/pscc.py
+++ b/bakta/pscc.py
@@ -36,7 +36,7 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen
         '--query-cover', str(int(bc.MIN_PSC_COVERAGE * 100)),  # '80'
         '--subject-cover', str(int(bc.MIN_PSC_COVERAGE * 100)),  # '80'
         '--max-target-seqs', '1',  # single best output
-        '--outfmt', '6',
+        '--outfmt', '6', 'qseqid', 'sseqid', 'qlen', 'slen', 'length', 'pident', 'evalue', 'bitscore',
         '--threads', str(cfg.threads),
         '--tmpdir', str(cfg.tmp_path),  # use tmp folder
         '--block-size', '3',  # slightly increase block size for faster executions
@@ -59,26 +59,25 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen
     cds_by_hexdigest = orf.get_orf_dictionary(cdss)
     with diamond_output_path.open() as fh:
         for line in fh:
-            (aa_identifier, cluster_id, identity, alignment_length, align_mismatches,
-                align_gaps, query_start, query_end, subject_start, subject_end,
-                evalue, bitscore) = line.split('\t')
+            (aa_identifier, cluster_id, query_length, subject_length, alignment_length, identity, evalue, bitscore) = line.split('\t')
             cds = cds_by_hexdigest[aa_identifier]
             query_cov = int(alignment_length) / len(cds['aa'])
+            subject_cov = int(alignment_length) / int(subject_length)
             identity = float(identity) / 100
             bitscore = float(bitscore)
             evalue = float(evalue)
-            if(query_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):
+            if(query_cov >= bc.MIN_PSC_COVERAGE and subject_cov >= bc.MIN_PSC_COVERAGE and identity >= bc.MIN_PSCC_IDENTITY):
                 cds['pscc'] = {
                     DB_PSCC_COL_UNIREF50: cluster_id,
                     'query_cov': query_cov,
+                    'subject_cov': subject_cov,
                     'identity': identity,
                     'score': bitscore,
-                    'evalue': evalue,
-                    'valid': identity >= bc.MIN_PSC_IDENTITY
+                    'evalue': evalue
                 }
                 log.debug(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%s, UniRef50=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, identity, bitscore, evalue, cluster_id
+                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef50=%s',
+                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
 
     psccs_found = []

From f799ee44d455fe20e04178ce6bb9741955e19ca9 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 27 Sep 2024 14:55:25 +0200
Subject: [PATCH 05/10] add inference TSV output

---
 bakta/io/tsv.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 bakta/main.py   |  4 ++++
 2 files changed, 46 insertions(+)

diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py
index cb492533..c77ac9cb 100644
--- a/bakta/io/tsv.py
+++ b/bakta/io/tsv.py
@@ -63,6 +63,48 @@ def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict],
     return
 
 
+def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
+    """Export feature inference statistics in TSV format."""
+    log.info('write tsv: path=%s', tsv_path)
+
+    with tsv_path.open('wt') as fh:
+        fh.write('# Annotated with Bakta\n')
+        fh.write(f'# Software: v{bakta.__version__}\n')
+        fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
+        fh.write(f'# DOI: {bc.BAKTA_DOI}\n')
+        fh.write(f'# URL: {bc.BAKTA_URL}\n')
+        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tscore\tevalue\tquery-cov\tsubject-cov\tid')
+
+        for contig in contigs:
+            for feat in features_by_contig[contig['id']]:
+                if(feat['type'] in [
+                            bc.FEATURE_T_RNA,
+                            bc.FEATURE_R_RNA,
+                            bc.FEATURE_NC_RNA,
+                            bc.FEATURE_NC_RNA_REGION,
+                            bc.FEATURE_CDS,
+                            bc.FEATURE_SORF
+                        ]
+                    ):
+                    fh.write('\t'.join(
+                        [
+                            feat['contig'],
+                            feat['type'],
+                            str(feat['start']),
+                            str(feat['stop']),
+                            feat['strand'],
+                            feat['locus'],
+                            f"{feat['score']:0.1f}",
+                            f"{feat['evalue']:1.1e}" if 'evalue' in feat else '-',
+                            f"{feat['query_cov']:0.3f}" if 'query_cov' in feat else '-',
+                            f"{feat['subject_cov']:0.3f}" if 'subject_cov' in feat else '-',
+                            f"{feat['identity']:0.3f}" if 'identity' in feat else '-'
+                        ])
+                    )
+                    fh.write('\n')
+    return
+
+
 def write_protein_features(features: Sequence[dict], header_columns: Sequence[str], mapping: LambdaType, tsv_path: Path):
     """Export protein features in TSV format."""
     log.info('write protein feature tsv: path=%s', tsv_path)
diff --git a/bakta/main.py b/bakta/main.py
index e9c107c3..01f299b3 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -559,6 +559,10 @@ def main():
     faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.faa')
     fasta.write_faa(features, faa_path)
 
+    print('\tfeature inferences...')
+    tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv')
+    tsv.write_feature_inferences(genome['contigs'], features_by_contig, tsv_path)
+
     if(cfg.skip_plot  or  cfg.meta):
         print('\tskip generation of circular genome plot...')
     else:

From c37f8e7c7fe548cfe4a0a0571c695b42f1d89e49 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 27 Sep 2024 17:14:58 +0200
Subject: [PATCH 06/10] fix merged PSC/PSCC dicts

---
 bakta/pscc.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bakta/pscc.py b/bakta/pscc.py
index c01144d7..b4b34906 100644
--- a/bakta/pscc.py
+++ b/bakta/pscc.py
@@ -123,7 +123,10 @@ def lookup(features: Sequence[dict], pseudo: bool = False):
                 if(pseudo):
                     feature[bc.PSEUDOGENE]['pscc'] = pscc
                 else:
-                    feature['pscc'] = {**feature['pscc'], **pscc}  # merge dicts to store alignment info for later PSC/PSCC annotations
+                    if('pscc' in feature):
+                        feature['pscc'] = {**feature['pscc'], **pscc}  # merge dicts, add PSCC annotation info to PSCC alignment info
+                    else:
+                        feature['pscc'] = pscc  # add PSCC annotation info
                 no_pscc_lookups += 1
                 log.debug(
                     'lookup: contig=%s, start=%i, stop=%i, strand=%s, UniRef50=%s, product=%s',

From b9fbe3a48477915d09ec30332662d457a5d36d62 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 27 Sep 2024 17:15:18 +0200
Subject: [PATCH 07/10] add inference TSV file

---
 bakta/io/tsv.py | 50 +++++++++++++++++++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py
index c77ac9cb..67522392 100644
--- a/bakta/io/tsv.py
+++ b/bakta/io/tsv.py
@@ -73,19 +73,24 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s
         fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
         fh.write(f'# DOI: {bc.BAKTA_DOI}\n')
         fh.write(f'# URL: {bc.BAKTA_URL}\n')
-        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tscore\tevalue\tquery-cov\tsubject-cov\tid')
+        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tscore\tevalue\tquery-cov\tsubject-cov\tid\n')
 
         for contig in contigs:
             for feat in features_by_contig[contig['id']]:
-                if(feat['type'] in [
-                            bc.FEATURE_T_RNA,
-                            bc.FEATURE_R_RNA,
-                            bc.FEATURE_NC_RNA,
-                            bc.FEATURE_NC_RNA_REGION,
-                            bc.FEATURE_CDS,
-                            bc.FEATURE_SORF
-                        ]
-                    ):
+                if(feat['type'] in [bc.FEATURE_CDS, bc.FEATURE_SORF]):
+                    score, evalue, query_cov, subject_cov, identity = None, None, None, None, None
+                    if('ups' in feat or 'ips' in feat):
+                        query_cov = 1
+                        subject_cov = 1
+                        identity = 1
+                        evalue = 0
+                    elif('psc' in feat or 'pscc' in feat):
+                        psc_type = 'psc' if 'psc' in feat else 'pscc'
+                        query_cov = feat[psc_type]['query_cov']
+                        subject_cov = feat[psc_type]['subject_cov']
+                        identity = feat[psc_type]['identity']
+                        score = feat[psc_type]['score']
+                        evalue = feat[psc_type]['evalue']
                     fh.write('\t'.join(
                         [
                             feat['contig'],
@@ -94,11 +99,28 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s
                             str(feat['stop']),
                             feat['strand'],
                             feat['locus'],
+                            f"{score:0.1f}" if score != None else '-',
+                            ('0.0' if evalue == 0 else f"{evalue:1.1e}") if evalue != None else '-',
+                            ('1.0' if query_cov == 1 else f"{query_cov:0.3f}") if query_cov != None else '-',
+                            ('1.0' if subject_cov == 1 else f"{subject_cov:0.3f}") if subject_cov != None else '-',
+                            ('1.0' if identity == 1 else f"{identity:0.3f}") if identity != None else '-'
+                        ])
+                    )
+                    fh.write('\n')
+                elif(feat['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION]):
+                    fh.write('\t'.join(
+                        [
+                            feat['contig'],
+                            feat['type'],
+                            str(feat['start']),
+                            str(feat['stop']),
+                            feat['strand'],
+                            feat['locus'] if 'locus' in feat else '-',
                             f"{feat['score']:0.1f}",
-                            f"{feat['evalue']:1.1e}" if 'evalue' in feat else '-',
-                            f"{feat['query_cov']:0.3f}" if 'query_cov' in feat else '-',
-                            f"{feat['subject_cov']:0.3f}" if 'subject_cov' in feat else '-',
-                            f"{feat['identity']:0.3f}" if 'identity' in feat else '-'
+                            ('0.0' if feat['evalue'] == 0 else f"{feat['evalue']:1.1e}") if 'evalue' in feat else '-',
+                            ('1.0' if feat['query_cov'] == 1 else f"{feat['query_cov']:0.3f}") if 'query_cov' in feat else '-',
+                            ('1.0' if feat['subject_cov'] == 1 else f"{feat['subject_cov']:0.3f}") if 'subject_cov' in feat else '-',
+                            ('1.0' if feat['identity'] == 1 else f"{feat['identity']:0.3f}") if 'identity' in feat else '-'
                         ])
                     )
                     fh.write('\n')

From 6c9c2e63ea75d0f7c7aed21be9d7ef5b5eee54ff Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Wed, 9 Oct 2024 15:32:18 +0200
Subject: [PATCH 08/10] add inference accession

---
 bakta/io/tsv.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py
index 67522392..4d90b1aa 100644
--- a/bakta/io/tsv.py
+++ b/bakta/io/tsv.py
@@ -7,6 +7,10 @@
 import bakta
 import bakta.config as cfg
 import bakta.constants as bc
+import bakta.ips as bips
+import bakta.ups as bups
+import bakta.psc as bpsc
+import bakta.pscc as bpscc
 
 
 log = logging.getLogger('TSV')
@@ -73,17 +77,18 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s
         fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
         fh.write(f'# DOI: {bc.BAKTA_DOI}\n')
         fh.write(f'# URL: {bc.BAKTA_URL}\n')
-        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tscore\tevalue\tquery-cov\tsubject-cov\tid\n')
+        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tscore\tevalue\tquery-cov\tsubject-cov\tid\taccession\n')
 
         for contig in contigs:
             for feat in features_by_contig[contig['id']]:
                 if(feat['type'] in [bc.FEATURE_CDS, bc.FEATURE_SORF]):
-                    score, evalue, query_cov, subject_cov, identity = None, None, None, None, None
+                    score, evalue, query_cov, subject_cov, identity, accession = None, None, None, None, None, '-'
                     if('ups' in feat or 'ips' in feat):
                         query_cov = 1
                         subject_cov = 1
                         identity = 1
                         evalue = 0
+                        accession = f"{bc.DB_XREF_UNIREF}:{feat['ips'][bips.DB_IPS_COL_UNIREF100]}" if 'ips' in feat else f"{bc.DB_XREF_UNIPARC}:{feat['ups'][bups.DB_UPS_COL_UNIPARC]}"
                     elif('psc' in feat or 'pscc' in feat):
                         psc_type = 'psc' if 'psc' in feat else 'pscc'
                         query_cov = feat[psc_type]['query_cov']
@@ -91,6 +96,7 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s
                         identity = feat[psc_type]['identity']
                         score = feat[psc_type]['score']
                         evalue = feat[psc_type]['evalue']
+                        accession = f"{bc.DB_XREF_UNIREF}:{feat['psc'][bpsc.DB_PSC_COL_UNIREF90]}" if 'psc' in feat else f"{bc.DB_XREF_UNIREF}:{feat['pscc'][bpscc.DB_PSCC_COL_UNIREF50]}"
                     fh.write('\t'.join(
                         [
                             feat['contig'],
@@ -103,11 +109,13 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s
                             ('0.0' if evalue == 0 else f"{evalue:1.1e}") if evalue != None else '-',
                             ('1.0' if query_cov == 1 else f"{query_cov:0.3f}") if query_cov != None else '-',
                             ('1.0' if subject_cov == 1 else f"{subject_cov:0.3f}") if subject_cov != None else '-',
-                            ('1.0' if identity == 1 else f"{identity:0.3f}") if identity != None else '-'
+                            ('1.0' if identity == 1 else f"{identity:0.3f}") if identity != None else '-',
+                            accession
                         ])
                     )
                     fh.write('\n')
                 elif(feat['type'] in [bc.FEATURE_T_RNA, bc.FEATURE_R_RNA, bc.FEATURE_NC_RNA, bc.FEATURE_NC_RNA_REGION]):
+                    accession = '-' if feat['type'] == bc.FEATURE_T_RNA else [xref for xref in feat['db_xrefs'] if bc.DB_XREF_RFAM in xref][0]
                     fh.write('\t'.join(
                         [
                             feat['contig'],
@@ -120,7 +128,8 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s
                             ('0.0' if feat['evalue'] == 0 else f"{feat['evalue']:1.1e}") if 'evalue' in feat else '-',
                             ('1.0' if feat['query_cov'] == 1 else f"{feat['query_cov']:0.3f}") if 'query_cov' in feat else '-',
                             ('1.0' if feat['subject_cov'] == 1 else f"{feat['subject_cov']:0.3f}") if 'subject_cov' in feat else '-',
-                            ('1.0' if feat['identity'] == 1 else f"{feat['identity']:0.3f}") if 'identity' in feat else '-'
+                            ('1.0' if feat['identity'] == 1 else f"{feat['identity']:0.3f}") if 'identity' in feat else '-',
+                            accession
                         ])
                     )
                     fh.write('\n')

From 078dd769a2f71ce06cf7d59fb7cfdfdc71fa92b1 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Wed, 9 Oct 2024 15:33:45 +0200
Subject: [PATCH 09/10] amend column names

---
 bakta/io/tsv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py
index 4d90b1aa..cb16ac9a 100644
--- a/bakta/io/tsv.py
+++ b/bakta/io/tsv.py
@@ -77,7 +77,7 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s
         fh.write(f"# Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
         fh.write(f'# DOI: {bc.BAKTA_DOI}\n')
         fh.write(f'# URL: {bc.BAKTA_URL}\n')
-        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tscore\tevalue\tquery-cov\tsubject-cov\tid\taccession\n')
+        fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tScore\tEvalue\tQuery Cov\tSubject Cov\tId\tAccession\n')
 
         for contig in contigs:
             for feat in features_by_contig[contig['id']]:

From 2c68052a8748778601823ae48ef013baf7372868 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Wed, 9 Oct 2024 15:35:29 +0200
Subject: [PATCH 10/10] add new output file to readme

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 0f4bc82e..b591631f 100644
--- a/README.md
+++ b/README.md
@@ -285,6 +285,7 @@ Annotation results are provided in standard bioinformatics file formats:
 - `<prefix>.fna`: replicon/contig DNA sequences as FASTA
 - `<prefix>.ffn`: feature nucleotide sequences as FASTA
 - `<prefix>.faa`: CDS/sORF amino acid sequences as FASTA
+- `<prefix>.inference.tsv`: inference metrics (score, evalue, coverage, identity) for annotated accessions as TSV
 - `<prefix>.hypotheticals.tsv`: further information on hypothetical protein CDS as simple human readble tab separated values
 - `<prefix>.hypotheticals.faa`: hypothetical protein CDS amino acid sequences as FASTA
 - `<prefix>.json`: all (internal) annotation & sequence information as JSON