From 61d11b2e4eb5287e4697b0920349b7731e2b05f7 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 15 Oct 2024 12:49:59 +0200
Subject: [PATCH 1/8] rename contig to more general sequence

---
 README.md                         |   4 +-
 bakta/__init__.py                 |   2 +-
 bakta/config.py                   |  26 +--
 bakta/constants.py                |   2 +-
 bakta/expert/amrfinder.py         |   4 +-
 bakta/expert/protein_hmms.py      |   8 +-
 bakta/expert/protein_sequences.py |   4 +-
 bakta/features/annotation.py      | 154 ++++++++---------
 bakta/features/cds.py             | 270 +++++++++++++++---------------
 bakta/features/crispr.py          |  28 ++--
 bakta/features/gaps.py            |  12 +-
 bakta/features/nc_rna.py          |  20 +--
 bakta/features/nc_rna_region.py   |  20 +--
 bakta/features/orf.py             |  10 +-
 bakta/features/ori.py             |  46 ++---
 bakta/features/r_rna.py           |  24 +--
 bakta/features/s_orf.py           |  86 +++++-----
 bakta/features/signal_peptides.py |   4 +-
 bakta/features/t_rna.py           |  20 +--
 bakta/features/tm_rna.py          |  18 +-
 bakta/io.py                       | 196 ++++++++++++++++++++++
 bakta/io/fasta.py                 |  60 +++----
 bakta/io/gff.py                   |  68 ++++----
 bakta/io/insdc.py                 |  70 ++++----
 bakta/io/json.py                  |   4 +-
 bakta/io/tsv.py                   |  26 +--
 bakta/ips.py                      |   4 +-
 bakta/main.py                     | 104 +++++-------
 bakta/plot.py                     | 106 ++++++------
 bakta/proteins.py                 |  12 +-
 bakta/psc.py                      |   8 +-
 bakta/pscc.py                     |   8 +-
 bakta/so.py                       |   2 +-
 bakta/ups.py                      |   4 +-
 bakta/utils.py                    | 214 +++++++++++------------
 scripts/extract-region.py         |  14 +-
 test/test_edge_features.py        |  10 +-
 test/test_nt_sequences.py         |   2 +-
 test/test_pseudo.py               |  26 +--
 test/test_regions.py              |   4 +-
 test/test_sORF.py                 |   6 +-
 41 files changed, 946 insertions(+), 764 deletions(-)
 create mode 100644 bakta/io.py

diff --git a/README.md b/README.md
index 383aa302..81e865d2 100644
--- a/README.md
+++ b/README.md
@@ -382,7 +382,7 @@ positional arguments:
 Input / Output:
   --db DB, -d DB        Database path (default = <bakta_path>/db). Can also be provided as BAKTA_DB environment variable.
   --min-contig-length MIN_CONTIG_LENGTH, -m MIN_CONTIG_LENGTH
-                        Minimum contig size (default = 1; 200 in compliant mode)
+                        Minimum contig/sequence size (default = 1; 200 in compliant mode)
   --prefix PREFIX, -p PREFIX
                         Prefix for output files
   --output OUTPUT, -o OUTPUT
@@ -409,7 +409,7 @@ Annotation:
                         Locus tag increment: 1/5/10 (default = 1)
 
   --keep-contig-headers
-                        Keep original contig headers
+                        Keep original contig/sequence headers
   --compliant           Force Genbank/ENA/DDJB compliance
   --replicons REPLICONS, -r REPLICONS
                         Replicon information table (tsv/csv)
diff --git a/bakta/__init__.py b/bakta/__init__.py
index 819c77ed..61a4358d 100644
--- a/bakta/__init__.py
+++ b/bakta/__init__.py
@@ -1,2 +1,2 @@
-__version__ = '1.9.4'
+__version__ = '1.10.0-beta'
 __db_schema_version__ = 5
diff --git a/bakta/config.py b/bakta/config.py
index bc418b8d..0c3c1695 100644
--- a/bakta/config.py
+++ b/bakta/config.py
@@ -30,7 +30,7 @@
 db_info = None
 tmp_path = None
 genome_path = None
-min_contig_length = None
+min_sequence_length = None
 prefix = None
 output_path = None
 force = None
@@ -46,7 +46,7 @@
 complete = None
 prodigal_tf = None
 translation_table = None
-keep_contig_headers = None
+keep_sequence_headers = None
 locus = None
 locus_tag = None
 locus_tag_increment = None
@@ -92,7 +92,7 @@ def setup(args):
         verbose = True
 
     # input / output path configurations
-    global db_path, db_info, tmp_path, genome_path, min_contig_length, prefix, output_path, force
+    global db_path, db_info, tmp_path, genome_path, min_sequence_length, prefix, output_path, force
     db_path = check_db_path(args)
     tmp_path = check_tmp_path(args)
 
@@ -108,11 +108,11 @@ def setup(args):
     log.info('genome-path=%s', genome_path)
 
     # input / output configurations
-    min_contig_length = args.min_contig_length
-    if(min_contig_length <= 0):
-        log.error("wrong argument for 'min-contig-length' parameter! min_contig_length=%s", min_contig_length)
-        sys.exit(f"ERROR: wrong argument ({min_contig_length}) for 'min- contig-length' parameter! Value must be larger than 0")
-    log.info('min_contig_length=%s', min_contig_length)
+    min_sequence_length = args.min_contig_length
+    if(min_sequence_length <= 0):
+        log.error("wrong argument for 'min-contig-length' parameter! min_contig_length=%s", min_sequence_length)
+        sys.exit(f"ERROR: wrong argument ({min_sequence_length}) for 'min- contig-length' parameter! Value must be larger than 0")
+    log.info('min_contig_length=%s', min_sequence_length)
     log.info('prefix=%s', prefix)  # set in main.py before global logger config
     log.info('output-path=%s', output_path)
     force = args.force
@@ -163,7 +163,7 @@ def setup(args):
         taxon = None
 
     # annotation configurations
-    global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
+    global complete, prodigal_tf, translation_table, keep_sequence_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
     complete = args.complete
     log.info('complete=%s', complete)
     prodigal_tf = args.prodigal_tf
@@ -186,8 +186,8 @@ def setup(args):
     compliant = args.compliant
     log.info('compliant=%s', compliant)
     if(compliant):
-        min_contig_length = 200
-        log.info('compliant mode! min_contig_length=%s', min_contig_length)
+        min_sequence_length = 200
+        log.info('compliant mode! min_contig_length=%s', min_sequence_length)
     meta = args.meta
     log.info('meta=%s', meta)
     locus = args.locus
@@ -221,8 +221,8 @@ def setup(args):
     log.info('locus-tag=%s', locus_tag)
     locus_tag_increment = args.locus_tag_increment
     log.info('locus-tag-increment=%s', locus_tag_increment)
-    keep_contig_headers = args.keep_contig_headers
-    log.info('keep_contig_headers=%s', keep_contig_headers)
+    keep_sequence_headers = args.keep_contig_headers
+    log.info('keep_contig_headers=%s', keep_sequence_headers)
     replicons = args.replicons
     if(replicons is not None):
         try:
diff --git a/bakta/constants.py b/bakta/constants.py
index b9e7e83f..8cdfb88e 100644
--- a/bakta/constants.py
+++ b/bakta/constants.py
@@ -191,7 +191,7 @@
 ############################################################################
 REPLICON_CHROMOSOME = 'chromosome'
 REPLICON_PLASMID = 'plasmid'
-REPLICON_CONTIG = 'contig'
+REPLICON_CONTIG = 'sequence'
 REPLICON_LENGTH_THRESHOLD_PLASMID = 112_000  # Nasuia deltocephalinicola -> DOI: 10.1093/gbe/evt118
 REPLICON_LENGTH_THRESHOLD_CHROMOSOME = 2_800_000  # max plasmid length (except 1 outlier-> https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/
 TOPOLOGY_CIRCULAR = 'circular'
diff --git a/bakta/expert/amrfinder.py b/bakta/expert/amrfinder.py
index 943fdc3c..a716a2ff 100644
--- a/bakta/expert/amrfinder.py
+++ b/bakta/expert/amrfinder.py
@@ -80,8 +80,8 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path):
                 cds.setdefault('expert', [])
                 cds['expert'].append(hit)
                 log.debug(
-                    'hit: gene=%s, product=%s, method=%s, target-cov=%0.3f, identity=%0.3f, contig=%s, start=%i, stop=%i, strand=%s',
-                    gene, product, method, model_cov, identity, cds['contig'], cds['start'], cds['stop'], cds['strand']
+                    'hit: gene=%s, product=%s, method=%s, target-cov=%0.3f, identity=%0.3f, seq=%s, start=%i, stop=%i, strand=%s',
+                    gene, product, method, model_cov, identity, cds['sequence'], cds['start'], cds['stop'], cds['strand']
                 )
                 cds_found.add(aa_identifier)
 
diff --git a/bakta/expert/protein_hmms.py b/bakta/expert/protein_hmms.py
index 6661f3f8..31585405 100644
--- a/bakta/expert/protein_hmms.py
+++ b/bakta/expert/protein_hmms.py
@@ -30,8 +30,8 @@ def search(cdss: Sequence[dict], user_hmms_path):
                 cds = orf_by_aa_digest[aa_identifier]
                 if hmm_query_hit.evalue > bc.MIN_HMM_EVALUE:
                     log.debug(
-                        'discard low evalue: contig=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f',
-                        cds['contig'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score
+                        'discard low evalue: seq=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f',
+                        cds['sequence'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score
                     )
                 else:
                     hit_domain_lengths_sum = sum([len(dom.alignment.hmm_sequence) for dom in hmm_query_hit.domains.included])
@@ -64,8 +64,8 @@ def search(cdss: Sequence[dict], user_hmms_path):
                     cds.setdefault('expert', [])
                     cds['expert'].append(hit)
                     log.debug(
-                        'hit: source=UserHMMs, rank=99, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, hmm-id=%s, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
-                        cds['contig'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hmm_id, hit['gene'], hit['product'], hit['evalue'], hit['score']
+                        'hit: source=UserHMMs, rank=99, seq=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, hmm-id=%s, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
+                        cds['sequence'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hmm_id, hit['gene'], hit['product'], hit['evalue'], hit['score']
                     )
                     cds_found.add(aa_identifier)
 
diff --git a/bakta/expert/protein_sequences.py b/bakta/expert/protein_sequences.py
index 7a4b5de6..ee1b53e9 100644
--- a/bakta/expert/protein_sequences.py
+++ b/bakta/expert/protein_sequences.py
@@ -83,8 +83,8 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
                 cds.setdefault('expert', [])
                 cds['expert'].append(hit)
                 log.debug(
-                    'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s',
-                    source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product
+                    'hit: source=%s, rank=%i, seq=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s',
+                    source, rank, cds['sequence'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product
                 )
                 cds_found.add(aa_identifier)
 
diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py
index 7358aaf9..3cfb03ab 100644
--- a/bakta/features/annotation.py
+++ b/bakta/features/annotation.py
@@ -151,45 +151,45 @@ def detect_feature_overlaps(genome: dict):
     CDS < tmRNA, tRNA, rRNA, CRISPR
     sORF < mRNA, tRNA, rRNA, CRISPR, CDS (in-frame & entirely overlapping), sORF (shorter, weaker annotations)
     """
-    contig_t_rnas = {k['id']: [] for k in genome['contigs']}
+    sequence_t_rnas = {k['id']: [] for k in genome['sequences']}
     for t_rna in genome['features'].get(bc.FEATURE_T_RNA, []):
-        t_rnas = contig_t_rnas[t_rna['contig']]
+        t_rnas = sequence_t_rnas[t_rna['sequence']]
         t_rnas.append(t_rna)
-    contig_tm_rnas = {k['id']: [] for k in genome['contigs']}
+    sequence_tm_rnas = {k['id']: [] for k in genome['sequences']}
     for tm_rna in genome['features'].get(bc.FEATURE_TM_RNA, []):
-        tm_rnas = contig_tm_rnas[tm_rna['contig']]
+        tm_rnas = sequence_tm_rnas[tm_rna['sequence']]
         tm_rnas.append(tm_rna)
-    contig_r_rnas = {k['id']: [] for k in genome['contigs']}
+    sequence_r_rnas = {k['id']: [] for k in genome['sequences']}
     for r_rna in genome['features'].get(bc.FEATURE_R_RNA, []):
-        r_rnas = contig_r_rnas[r_rna['contig']]
+        r_rnas = sequence_r_rnas[r_rna['sequence']]
         r_rnas.append(r_rna)
-    contig_ncrna_regions = {k['id']: [] for k in genome['contigs']}
+    sequence_ncrna_regions = {k['id']: [] for k in genome['sequences']}
     for ncRNA_region in genome['features'].get(bc.FEATURE_NC_RNA_REGION, []):
-        ncRNA_regions = contig_ncrna_regions[ncRNA_region['contig']]
+        ncRNA_regions = sequence_ncrna_regions[ncRNA_region['sequence']]
         ncRNA_regions.append(ncRNA_region)
-    contig_crispr_arrays = {k['id']: [] for k in genome['contigs']}
+    sequence_crispr_arrays = {k['id']: [] for k in genome['sequences']}
     for crispr_array in genome['features'].get(bc.FEATURE_CRISPR, []):
-        crispr_arrays = contig_crispr_arrays[crispr_array['contig']]
+        crispr_arrays = sequence_crispr_arrays[crispr_array['sequence']]
         crispr_arrays.append(crispr_array)
-    contig_cdss = {k['id']: [] for k in genome['contigs']}
-    contig_cdss_user_provided = {k['id']: [] for k in genome['contigs']}
+    sequence_cdss = {k['id']: [] for k in genome['sequences']}
+    sequence_cdss_user_provided = {k['id']: [] for k in genome['sequences']}
     for cds in genome['features'].get(bc.FEATURE_CDS, []):
         if(cds.get('source', None) == bc.CDS_SOURCE_USER):
-            cdss = contig_cdss_user_provided[cds['contig']]
+            cdss = sequence_cdss_user_provided[cds['sequence']]
         else:
-            cdss = contig_cdss[cds['contig']]
+            cdss = sequence_cdss[cds['sequence']]
         cdss.append(cds)
-    contig_sorfs = {k['id']: [] for k in genome['contigs']}
+    sequence_sorfs = {k['id']: [] for k in genome['sequences']}
     for sorf in genome['features'].get(bc.FEATURE_SORF, []):
-        sorfs = contig_sorfs[sorf['contig']]
+        sorfs = sequence_sorfs[sorf['sequence']]
         sorfs.append(sorf)
 
-    for contig in genome['contigs']:  # find feature overlaps contig-wise to increase the performance
-        log.debug('filter features on contig: %s', contig['id'])
+    for seq in genome['sequences']:  # find feature overlaps sequence-wise to increase the performance
+        log.debug('filter features on seq: %s', seq['id'])
 
         # mark tRNAs overlapping with tmRNAs
-        for tRNA in contig_t_rnas[contig['id']]:
-            for tmRNA in contig_tm_rnas[contig['id']]:
+        for tRNA in sequence_t_rnas[seq['id']]:
+            for tmRNA in sequence_tm_rnas[seq['id']]:
                 if(tRNA['stop'] < tmRNA['start'] or tRNA['start'] > tmRNA['stop']):
                     continue
                 else:  # overlap -> remove tRNA
@@ -200,13 +200,13 @@ def detect_feature_overlaps(genome: dict):
                         'description': f"{bc.FEATURE_TM_RNA} overlap with ({tmRNA['product']}) at {overlap}"
                     }
                     log.info(
-                        "overlap: tRNA (%s) [%i, %i] overlapping with tmRNA (%s) [%i, %i] at %s on contig=%s",
-                        tRNA['product'], tRNA['start'], tRNA['stop'], tmRNA['product'], tmRNA['start'], tmRNA['stop'], overlap, tRNA['contig']
+                        "overlap: tRNA (%s) [%i, %i] overlapping with tmRNA (%s) [%i, %i] at %s on seq=%s",
+                        tRNA['product'], tRNA['start'], tRNA['stop'], tmRNA['product'], tmRNA['start'], tmRNA['stop'], overlap, tRNA['sequence']
                     )
 
         # mark ncRNA-regions overlapping with ncRNA-regions
-        for ncRNA_region in contig_ncrna_regions[contig['id']]:
-            for ncRNA_region_overlap in contig_ncrna_regions[contig['id']]:
+        for ncRNA_region in sequence_ncrna_regions[seq['id']]:
+            for ncRNA_region_overlap in sequence_ncrna_regions[seq['id']]:
                 if(ncRNA_region['stop'] < ncRNA_region_overlap['start'] or ncRNA_region['start'] > ncRNA_region_overlap['stop']):
                     continue
                 if(ncRNA_region['db_xrefs'][0] == ncRNA_region_overlap['db_xrefs'][0]):
@@ -220,14 +220,14 @@ def detect_feature_overlaps(genome: dict):
                             'description': f"{bc.FEATURE_NC_RNA_REGION} overlap with ({ncRNA_region_overlap['product']}) at {overlap}"
                         }
                         log.info(
-                            "overlap: ncRNA-region (%s) [%i, %i] overlapping with ncRNA-region (%s) [%i, %i] at %s on contig=%s, lower bitscore (%f/%f)",
-                            ncRNA_region['product'], ncRNA_region['start'], ncRNA_region['stop'], ncRNA_region_overlap['product'], ncRNA_region_overlap['start'], ncRNA_region_overlap['stop'], overlap, ncRNA_region['contig'], ncRNA_region['score'], ncRNA_region_overlap['score']
+                            "overlap: ncRNA-region (%s) [%i, %i] overlapping with ncRNA-region (%s) [%i, %i] at %s on seq=%s, lower bitscore (%f/%f)",
+                            ncRNA_region['product'], ncRNA_region['start'], ncRNA_region['stop'], ncRNA_region_overlap['product'], ncRNA_region_overlap['start'], ncRNA_region_overlap['stop'], overlap, ncRNA_region['sequence'], ncRNA_region['score'], ncRNA_region_overlap['score']
                         )
 
         # mark de novo-predicted CDS overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs and user-provided CDS
-        for cds in contig_cdss[contig['id']]:
+        for cds in sequence_cdss[seq['id']]:
             # tmRNA overlaps
-            for tmRNA in contig_tm_rnas[contig['id']]:
+            for tmRNA in sequence_tm_rnas[seq['id']]:
                 if(cds['stop'] < tmRNA['start'] or cds['start'] > tmRNA['stop']):
                     continue
                 else:  # overlap -> remove cds
@@ -238,11 +238,11 @@ def detect_feature_overlaps(genome: dict):
                         'description': f"{bc.FEATURE_TM_RNA} overlap with ({tmRNA['product']}) at {overlap}"
                     }
                     log.info(
-                        "overlap: CDS (%s/%s) [%i, %i] overlapping tmRNA (%s) [%i, %i], %s, contig=%s",
-                        cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], tmRNA['gene'], tmRNA['start'], tmRNA['stop'], overlap, cds['contig']
+                        "overlap: CDS (%s/%s) [%i, %i] overlapping tmRNA (%s) [%i, %i], %s, seq=%s",
+                        cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], tmRNA['gene'], tmRNA['start'], tmRNA['stop'], overlap, cds['sequence']
                     )
             # tRNA overlaps
-            for tRNA in contig_t_rnas[contig['id']]:
+            for tRNA in sequence_t_rnas[seq['id']]:
                 if(cds['stop'] < tRNA['start'] or cds['start'] > tRNA['stop']):
                     continue
                 else:  # overlap -> remove cds
@@ -253,11 +253,11 @@ def detect_feature_overlaps(genome: dict):
                         'description': f"{bc.FEATURE_T_RNA} overlap with ({tRNA['product']}) at {overlap}"
                     }
                     log.info(
-                        "overlap: CDS (%s/%s) [%i, %i] overlapping tRNA (%s) [%i, %i], %s, contig=%s",
-                        cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], tRNA['gene'], tRNA['start'], tRNA['stop'], overlap, cds['contig']
+                        "overlap: CDS (%s/%s) [%i, %i] overlapping tRNA (%s) [%i, %i], %s, seq=%s",
+                        cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], tRNA['gene'], tRNA['start'], tRNA['stop'], overlap, cds['sequence']
                     )
             # rRNA overlaps
-            for rRNA in contig_r_rnas[contig['id']]:
+            for rRNA in sequence_r_rnas[seq['id']]:
                 if(cds['stop'] < rRNA['start'] or cds['start'] > rRNA['stop']):
                     continue
                 else:  # overlap -> remove cds
@@ -268,11 +268,11 @@ def detect_feature_overlaps(genome: dict):
                         'description': f"{bc.FEATURE_R_RNA} overlap with ({rRNA['product']}) at {overlap}"
                     }
                     log.info(
-                        "overlap: CDS (%s/%s) [%i, %i] overlapping rRNA (%s) [%i, %i], %s, contig=%s",
-                        cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], rRNA['gene'], rRNA['start'], rRNA['stop'], overlap, cds['contig']
+                        "overlap: CDS (%s/%s) [%i, %i] overlapping rRNA (%s) [%i, %i], %s, seq=%s",
+                        cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], rRNA['gene'], rRNA['start'], rRNA['stop'], overlap, cds['sequence']
                     )
             # CRISPR overlaps
-            for crispr in contig_crispr_arrays[contig['id']]:
+            for crispr in sequence_crispr_arrays[seq['id']]:
                 if(cds['stop'] < crispr['start'] or cds['start'] > crispr['stop']):
                     continue
                 else:  # overlap -> remove cds
@@ -283,11 +283,11 @@ def detect_feature_overlaps(genome: dict):
                         'description': f'overlaps {bc.FEATURE_CRISPR} at {overlap}'
                     }
                     log.info(
-                        "overlap: CDS (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, contig=%s",
-                        cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], crispr['start'], crispr['stop'], overlap, cds['contig']
+                        "overlap: CDS (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, seq=%s",
+                        cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], crispr['start'], crispr['stop'], overlap, cds['sequence']
                     )
             # user-provided CDS overlaps
-            for cds_user_provided in contig_cdss_user_provided[contig['id']]:
+            for cds_user_provided in sequence_cdss_user_provided[seq['id']]:
                 overlap = 0
                 if(not cds_user_provided.get('edge', False)  and  not cds.get('edge', False)):  # both CDS not edge features
                     if(cds['stop'] < cds_user_provided['start'] or cds['start'] > cds_user_provided['stop']):
@@ -309,7 +309,7 @@ def detect_feature_overlaps(genome: dict):
                     else:
                         continue
                 elif(cds_user_provided.get('edge', False)  and  cds.get('edge', False)):  # both CDS edge features
-                    overlap = (contig['length'] - max(cds['start'], cds_user_provided['start']) + 1) + min(cds['stop'], cds_user_provided['stop'])
+                    overlap = (seq['length'] - max(cds['start'], cds_user_provided['start']) + 1) + min(cds['stop'], cds_user_provided['stop'])
                 if(overlap > bc.CDS_MAX_OVERLAPS):
                     overlap = f"[{max(cds['start'], cds_user_provided['start'])},{min(cds['stop'], cds_user_provided['stop'])}]"
                     cds['discarded'] = {
@@ -318,14 +318,14 @@ def detect_feature_overlaps(genome: dict):
                         'description': f'overlaps user-provided {bc.FEATURE_CDS} at {overlap}'
                     }
                     log.info(
-                        "overlap: de-novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s",
-                        cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['contig']
+                        "overlap: de-novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, seq=%s",
+                        cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['sequence']
                     )
 
         # remove sORF overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs, inframe CDSs, shorter inframe sORFs
-        for sorf in contig_sorfs[contig['id']]:
+        for sorf in sequence_sorfs[seq['id']]:
             # tmRNA overlaps
-            for tmRNA in contig_tm_rnas[contig['id']]:
+            for tmRNA in sequence_tm_rnas[seq['id']]:
                 if(sorf['stop'] < tmRNA['start'] or sorf['start'] > tmRNA['stop']):
                     continue
                 else:  # overlap -> remove sorf
@@ -336,11 +336,11 @@ def detect_feature_overlaps(genome: dict):
                         'description': f"{bc.FEATURE_TM_RNA} overlap with ({tmRNA['product']}) at {overlap}"
                     }
                     log.info(
-                        "overlap: sORF (%s/%s) [%i, %i] overlapping tmRNA (%s) [%i, %i], %s, contig=%s",
-                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], tmRNA['gene'], tmRNA['start'], tmRNA['stop'], overlap, sorf['contig']
+                        "overlap: sORF (%s/%s) [%i, %i] overlapping tmRNA (%s) [%i, %i], %s, seq=%s",
+                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], tmRNA['gene'], tmRNA['start'], tmRNA['stop'], overlap, sorf['sequence']
                     )
             # tRNA overlaps
-            for tRNA in contig_t_rnas[contig['id']]:
+            for tRNA in sequence_t_rnas[seq['id']]:
                 if(sorf['stop'] < tRNA['start'] or sorf['start'] > tRNA['stop']):
                     continue
                 else:  # overlap -> remove sorf
@@ -351,11 +351,11 @@ def detect_feature_overlaps(genome: dict):
                         'description': f"{bc.FEATURE_T_RNA} overlap with ({tRNA['product']}) at {overlap}"
                     }
                     log.info(
-                        "overlap: sORF (%s/%s) [%i, %i] overlapping tRNA (%s) [%i, %i], %s, contig=%s",
-                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], tRNA['gene'], tRNA['start'], tRNA['stop'], overlap, sorf['contig']
+                        "overlap: sORF (%s/%s) [%i, %i] overlapping tRNA (%s) [%i, %i], %s, seq=%s",
+                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], tRNA['gene'], tRNA['start'], tRNA['stop'], overlap, sorf['sequence']
                     )
             # rRNA overlaps
-            for rRNA in contig_r_rnas[contig['id']]:
+            for rRNA in sequence_r_rnas[seq['id']]:
                 if(sorf['stop'] < rRNA['start'] or sorf['start'] > rRNA['stop']):
                     continue
                 else:  # overlap -> remove sorf
@@ -366,11 +366,11 @@ def detect_feature_overlaps(genome: dict):
                         'description': f"{bc.FEATURE_R_RNA} overlap with ({rRNA['product']}) at {overlap}"
                     }
                     log.info(
-                        "overlap: sORF (%s/%s) [%i, %i] overlapping rRNA (%s) [%i, %i], %s, contig=%s",
-                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], rRNA['gene'], rRNA['start'], rRNA['stop'], overlap, sorf['contig']
+                        "overlap: sORF (%s/%s) [%i, %i] overlapping rRNA (%s) [%i, %i], %s, seq=%s",
+                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], rRNA['gene'], rRNA['start'], rRNA['stop'], overlap, sorf['sequence']
                     )
             # CRISPR overlaps
-            for crispr in contig_crispr_arrays[contig['id']]:
+            for crispr in sequence_crispr_arrays[seq['id']]:
                 if(sorf['stop'] < crispr['start'] or sorf['start'] > crispr['stop']):
                     continue
                 else:  # overlap -> remove sorf
@@ -381,11 +381,11 @@ def detect_feature_overlaps(genome: dict):
                         'description': f'overlaps {bc.FEATURE_CRISPR} at {overlap}'
                     }
                     log.info(
-                        "overlap: sORF (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, contig=%s",
-                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], crispr['start'], crispr['stop'], overlap, sorf['contig']
+                        "overlap: sORF (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, seq=%s",
+                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], crispr['start'], crispr['stop'], overlap, sorf['sequence']
                     )
             # user-provided CDS overlaps
-            for cds_user_provided in contig_cdss_user_provided[contig['id']]:
+            for cds_user_provided in sequence_cdss_user_provided[seq['id']]:
                 if(sorf['stop'] < cds_user_provided['start'] or sorf['start'] > cds_user_provided['stop']):
                     continue
                 else:  # overlap -> remove sorf
@@ -396,12 +396,12 @@ def detect_feature_overlaps(genome: dict):
                         'description': f'overlaps {bc.FEATURE_CDS} at {overlap}'
                     }
                     log.info(
-                        "overlap: sORF (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s",
-                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, sorf['contig']
+                        "overlap: sORF (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, seq=%s",
+                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, sorf['sequence']
                     )
 
             # sORF overlaps
-            for overlap_sorf in contig_sorfs[contig['id']]:
+            for overlap_sorf in sequence_sorfs[seq['id']]:
                 if(sorf['stop'] < overlap_sorf['start'] or sorf['start'] > overlap_sorf['stop']):
                     continue  # no overlap
                 elif(sorf['start'] == overlap_sorf['start'] and sorf['stop'] == overlap_sorf['stop']):
@@ -418,8 +418,8 @@ def detect_feature_overlaps(genome: dict):
                             'description': f"overlaps {bc.FEATURE_SORF} ({overlap_sorf.get('gene', '-')}/{overlap_sorf.get('product', '-')}) at {overlap} with lower score ({score_sorf}/{score_overlap_sorf})"
                         }
                         log.info(
-                            "overlap: sORF (%s/%s) [%i, %i] overlapping sORF (%s/%s) [%i, %i], %s, contig=%s, lower annotation score (%i/%i)",
-                            sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], overlap_sorf.get('gene', '-'), overlap_sorf.get('product', '-'), overlap_sorf['start'], overlap_sorf['stop'], overlap, sorf['contig'], score_sorf, score_overlap_sorf
+                            "overlap: sORF (%s/%s) [%i, %i] overlapping sORF (%s/%s) [%i, %i], %s, seq=%s, lower annotation score (%i/%i)",
+                            sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], overlap_sorf.get('gene', '-'), overlap_sorf.get('product', '-'), overlap_sorf['start'], overlap_sorf['stop'], overlap, sorf['sequence'], score_sorf, score_overlap_sorf
                         )
                     elif(score_sorf == score_overlap_sorf and len(sorf['aa']) < len(overlap_sorf['aa'])):  # equal annotation score but shorter sequence -> potential fragment or too short ORF prediction
                         overlap = f"[{max(sorf['start'], overlap_sorf['start'])},{min(sorf['stop'], overlap_sorf['stop'])}]"
@@ -429,8 +429,8 @@ def detect_feature_overlaps(genome: dict):
                             'description': f"overlaps {bc.FEATURE_SORF} ({overlap_sorf.get('gene', '-')}/{overlap_sorf.get('product', '-')}) at {overlap} with equal score ({score_sorf}) but lower length ({len(sorf['aa'])}/{len(overlap_sorf['aa'])})"
                         }
                         log.info(
-                            "overlap: sORF (%s/%s) [%i, %i] overlapping sORF (%s/%s) [%i, %i], %s, contig=%s, equal annotation score (%i), lower length (%i/%i)",
-                            sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], overlap_sorf.get('gene', '-'), overlap_sorf.get('product', '-'), overlap_sorf['start'], overlap_sorf['stop'], overlap, sorf['contig'], score_sorf, len(sorf['aa']), len(overlap_sorf['aa'])
+                            "overlap: sORF (%s/%s) [%i, %i] overlapping sORF (%s/%s) [%i, %i], %s, seq=%s, equal annotation score (%i), lower length (%i/%i)",
+                            sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], overlap_sorf.get('gene', '-'), overlap_sorf.get('product', '-'), overlap_sorf['start'], overlap_sorf['stop'], overlap, sorf['sequence'], score_sorf, len(sorf['aa']), len(overlap_sorf['aa'])
                         )
 
 
@@ -451,8 +451,8 @@ def calc_cds_annotation_score(cds: dict) -> int:
         score += 1
         score += calc_annotation_score(psc)
     log.debug(
-        'cds score: contig=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i',
-        cds['contig'], cds['start'], cds['stop'], cds.get('gene', '-'), cds.get('product', '-'), score
+        'cds score: seq=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i',
+        cds['sequence'], cds['start'], cds['stop'], cds.get('gene', '-'), cds.get('product', '-'), score
     )
     return score
 
@@ -620,7 +620,7 @@ def revise_cds_product(product: str):
 
     old_product = product
     if(
-        RE_PROTEIN_CONTIG.search(product) or  # protein containing 'contig'
+        RE_PROTEIN_CONTIG.search(product) or  # protein containing 'contig'
         RE_PROTEIN_NODE.search(product) or  # potential contig name (SPAdes)
         RE_PROTEIN_POTENTIAL_CONTIG_NAME.search(product) or  # potential contig name (SPAdes)
         RE_PROTEIN_NO_LETTERS.fullmatch(product)  # no letters -> set to Hypothetical
@@ -633,8 +633,8 @@ def revise_cds_product(product: str):
 
 def mark_as_hypothetical(feature: dict):
     log.info(
-        'marked as hypothetical: contig=%s, start=%i, stop=%i, strand=%s',
-        feature['contig'], feature['start'], feature['stop'], feature['strand']
+        'marked as hypothetical: seq=%s, start=%i, stop=%i, strand=%s',
+        feature['sequence'], feature['start'], feature['stop'], feature['strand']
     )
     feature['hypothetical'] = True
     feature['gene'] = None
@@ -660,8 +660,8 @@ def get_adjacent_genes(feature: dict, features: Sequence[dict], neighbors=3):
             upstream_genes.extend(downstream_genes)
             for gene in upstream_genes:
                 log.debug(
-                    'extracted neighbor genes: contig=%s, start=%i, stop=%i, gene=%s, product=%s',
-                    gene['contig'], gene['start'], gene['stop'], gene.get('gene', '-'), gene.get('product', '-')
+                    'extracted neighbor genes: seq=%s, start=%i, stop=%i, gene=%s, product=%s',
+                    gene['sequence'], gene['start'], gene['stop'], gene.get('gene', '-'), gene.get('product', '-')
                 )
             return upstream_genes
     return []
@@ -680,14 +680,14 @@ def select_gene_symbols(features: Sequence[dict]):
                     if(gene_symbol != old_gene_symbol):
                         feat['gene'] = gene_symbol
                         log.info(
-                            'gene product symbol selection: contig=%s, start=%i, stop=%i, new-gene=%s, old-gene=%s, genes=%s, product=%s',
-                            feat['contig'], feat['start'], feat['stop'], gene_symbol, old_gene_symbol, ','.join(feat['genes']), feat.get('product', '-')
+                            'gene product symbol selection: seq=%s, start=%i, stop=%i, new-gene=%s, old-gene=%s, genes=%s, product=%s',
+                            feat['sequence'], feat['start'], feat['stop'], gene_symbol, old_gene_symbol, ','.join(feat['genes']), feat.get('product', '-')
                         )
                         improved_genes.append(feat)
         else:  # multiple gene symbols of varying prefixes are available, e.g. acrS, envR
             log.debug(
-                'select gene symbol: contig=%s, start=%i, stop=%i, gene=%s, genes=%s, product=%s',
-                feat['contig'], feat['start'], feat['stop'], feat.get('gene', '-'), ','.join(feat['genes']), feat.get('product', '-')
+                'select gene symbol: seq=%s, start=%i, stop=%i, gene=%s, genes=%s, product=%s',
+                feat['sequence'], feat['start'], feat['stop'], feat.get('gene', '-'), ','.join(feat['genes']), feat.get('product', '-')
             )
             adjacent_genes = get_adjacent_genes(feat, features, neighbors=3)
             adjacent_gene_symbol_lists = [gene.get('genes', []) for gene in adjacent_genes]
@@ -711,8 +711,8 @@ def select_gene_symbols(features: Sequence[dict]):
             if(selected_gene_symbol != old_gene_symbol):
                 feat['gene'] = selected_gene_symbol
                 log.info(
-                    'gene neighborhood symbol selection: contig=%s, start=%i, stop=%i, new-gene=%s, old-gene=%s, genes=%s, product=%s',
-                    feat['contig'], feat['start'], feat['stop'], selected_gene_symbol, old_gene_symbol, ','.join(feat['genes']), feat.get('product', '-')
+                    'gene neighborhood symbol selection: seq=%s, start=%i, stop=%i, new-gene=%s, old-gene=%s, genes=%s, product=%s',
+                    feat['sequence'], feat['start'], feat['stop'], selected_gene_symbol, old_gene_symbol, ','.join(feat['genes']), feat.get('product', '-')
                 )
                 improved_genes.append(feat)
     return improved_genes
\ No newline at end of file
diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index ec728b98..177dda58 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -42,7 +42,7 @@ def predict(genome: dict):
         if(not prodigal_metamode):
             log.info('create prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed)
             gene_finder = pyrodigal.GeneFinder(meta=prodigal_metamode, closed=closed)
-            seqs = [c['sequence'] for c in genome['contigs']]
+            seqs = [seq['sequence'] for seq in genome['sequences']]
             trainings_info = gene_finder.train(*seqs, translation_table=cfg.translation_table)
         else:
             log.info('skip creation of prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed)
@@ -58,43 +58,43 @@ def predict(genome: dict):
 
     cdss = []
     # predict genes on linear sequences
-    linear_contigs = [c for c in genome['contigs'] if c['topology'] == bc.TOPOLOGY_LINEAR]
-    if(len(linear_contigs) > 0):
+    linear_sequences = [seq for seq in genome['sequences'] if seq['topology'] == bc.TOPOLOGY_LINEAR]
+    if(len(linear_sequences) > 0):
         if prodigal_metamode:
             gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=True, mask=True)
         else:
             gene_finder = pyrodigal.GeneFinder(trainings_info, meta=False, closed=True, mask=True)
-        sequences = [contig['sequence'] for contig in linear_contigs]
+        sequences = [seq['sequence'] for seq in linear_sequences]
         with cf.ThreadPoolExecutor(max_workers=cfg.threads) as tpe:
-            for contig, genes in zip(linear_contigs, tpe.map(gene_finder.find_genes, sequences)):
-                cdss_per_sequence = create_cdss(genes, contig)
+            for seq, genes in zip(linear_sequences, tpe.map(gene_finder.find_genes, sequences)):
+                cdss_per_sequence = create_cdss(genes, seq)
                 cdss.extend(cdss_per_sequence)
 
     # predict genes on circular replicons (chromosomes/plasmids)
-    circular_contigs = [c for c in genome['contigs'] if c['topology'] == bc.TOPOLOGY_CIRCULAR]
-    if(len(circular_contigs) > 0):
+    circular_sequences = [seq for seq in genome['sequences'] if seq['topology'] == bc.TOPOLOGY_CIRCULAR]
+    if(len(circular_sequences) > 0):
         if prodigal_metamode:
             gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=False, mask=True)
         else:
             gene_finder = pyrodigal.GeneFinder(trainings_info, meta=False, closed=False, mask=True)
-        sequences = [contig['sequence'] for contig in circular_contigs]
+        sequences = [seq['sequence'] for seq in circular_sequences]
         with cf.ThreadPoolExecutor(max_workers=cfg.threads) as tpe:
-            for contig, genes in zip(circular_contigs, tpe.map(gene_finder.find_genes, sequences)):
-                cdss_per_sequence = create_cdss(genes, contig)
+            for seq, genes in zip(circular_sequences, tpe.map(gene_finder.find_genes, sequences)):
+                cdss_per_sequence = create_cdss(genes, seq)
                 cdss.extend(cdss_per_sequence)
 
     log.info('predicted=%i', len(cdss))
     return cdss
 
 
-def create_cds(contig: dict, start: int, stop: int, strand: str, edge:bool, nt: str, aa: str):
+def create_cds(sequence: dict, start: int, stop: int, strand: str, edge:bool, nt: str, aa: str):
     cds = OrderedDict()
     cds['type'] = bc.FEATURE_CDS
-    cds['contig'] = contig['id']
+    cds['sequence'] = sequence['id']
     cds['start'] = start
     cds['stop'] = stop
     cds['strand'] = strand
-    cds['frame'] = (start - 1) % 3 + 1 if strand == bc.STRAND_FORWARD else (contig['length'] - stop) % 3 + 1
+    cds['frame'] = (start - 1) % 3 + 1 if strand == bc.STRAND_FORWARD else (sequence['length'] - stop) % 3 + 1
     cds['gene'] = None
     cds['product'] = None
     cds['db_xrefs'] = [so.SO_CDS.id]
@@ -106,12 +106,12 @@ def create_cds(contig: dict, start: int, stop: int, strand: str, edge:bool, nt:
     return cds
 
 
-def create_cdss(genes, contig):
+def create_cdss(genes, sequence):
     partial_cdss_per_sequence = []
     cdss_per_sequence = []
     for gene in genes:
         strand = bc.STRAND_FORWARD if gene.strand == 1 else bc.STRAND_REVERSE
-        cds = create_cds(contig, gene.begin, gene.end, strand, False, '', '')
+        cds = create_cds(sequence, gene.begin, gene.end, strand, False, '', '')
         cds['start_type'] = gene.start_type
         cds['rbs_motif'] = gene.rbs_motif
         if gene.partial_begin:
@@ -135,18 +135,18 @@ def create_cdss(genes, contig):
         cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
         
         log.info(
-            'contig=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s',
-            cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds.get('truncated', 'no'), cds['start_type'], cds['rbs_motif']
+            'seq=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s',
+            cds['sequence'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds.get('truncated', 'no'), cds['start_type'], cds['rbs_motif']
         )
-    if(contig['topology'] == bc.TOPOLOGY_CIRCULAR and len(partial_cdss_per_sequence) >= 2):
-        first_partial_cds = partial_cdss_per_sequence[0]  # first partial CDS per contig
-        last_partial_cds = partial_cdss_per_sequence[-1]  # last partial CDS per contig
+    if(sequence['topology'] == bc.TOPOLOGY_CIRCULAR and len(partial_cdss_per_sequence) >= 2):
+        first_partial_cds = partial_cdss_per_sequence[0]  # first partial CDS per sequence
+        last_partial_cds = partial_cdss_per_sequence[-1]  # last partial CDS per sequence
         # check if partial CDSs are on same strand and have opposite truncated edges
-        # and first starts at 1 and last ends at contig end (length)
+        # and first starts at 1 and last ends at sequence end (length)
         if(first_partial_cds['strand'] == last_partial_cds['strand']
             and first_partial_cds['truncated'] != last_partial_cds['truncated']
             and first_partial_cds['start'] == 1
-            and last_partial_cds['stop'] == contig['length']):
+            and last_partial_cds['stop'] == sequence['length']):
             cds = last_partial_cds
             cds['stop'] = first_partial_cds['stop']
             if(last_partial_cds['truncated'] == bc.FEATURE_END_3_PRIME):
@@ -162,22 +162,22 @@ def create_cdss(genes, contig):
             cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
             cdss_per_sequence.append(cds)
             log.info(
-                'edge CDS: contig=%s, start=%i, stop=%i, strand=%s, frame=%s, start-type=%s, RBS-motif=%s, aa-hexdigest=%s, aa=[%s..%s]',
-                cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds['start_type'], cds['rbs_motif'], cds['aa_hexdigest'], aa[:10], aa[-10:]
+                'edge CDS: seq=%s, start=%i, stop=%i, strand=%s, frame=%s, start-type=%s, RBS-motif=%s, aa-hexdigest=%s, aa=[%s..%s]',
+                cds['sequence'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds['start_type'], cds['rbs_motif'], cds['aa_hexdigest'], aa[:10], aa[-10:]
             )
             partial_cdss_per_sequence = partial_cdss_per_sequence[1:-1]  # remove first/last partial CDS
     for partial_cds in partial_cdss_per_sequence:
         cdss_per_sequence.append(partial_cds)
         log.info(
-            'truncated CDS: contig=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s, aa-hexdigest=%s, aa=[%s..%s]',
-            partial_cds['contig'], partial_cds['start'], partial_cds['stop'], partial_cds['strand'], partial_cds['frame'], partial_cds['truncated'], partial_cds['start_type'], partial_cds['rbs_motif'], partial_cds['aa_hexdigest'], partial_cds['aa'][:10], partial_cds['aa'][-10:]
+            'truncated CDS: seq=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s, aa-hexdigest=%s, aa=[%s..%s]',
+            partial_cds['sequence'], partial_cds['start'], partial_cds['stop'], partial_cds['strand'], partial_cds['frame'], partial_cds['truncated'], partial_cds['start_type'], partial_cds['rbs_motif'], partial_cds['aa_hexdigest'], partial_cds['aa'][:10], partial_cds['aa'][-10:]
         )
     for cds in cdss_per_sequence:  # extract nt sequences
-        nt = bu.extract_feature_sequence(cds, contig)
+        nt = bu.extract_feature_sequence(cds, sequence)
         cds['nt'] = nt
         log.info(
-            'contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s]',
-            cds['contig'], cds['start'], cds['stop'], cds['strand'], nt[:10], nt[-10:]
+            'seq=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s]',
+            cds['sequence'], cds['start'], cds['stop'], cds['strand'], nt[:10], nt[-10:]
         )
     return cdss_per_sequence
 
@@ -189,7 +189,7 @@ def import_user_cdss(genome: dict, import_path: Path):
     Parameters
     ----------
     genome : dict
-        Genome dictionary holding sequence information (contigs)
+        Genome dictionary holding sequence information
     import_path : Path
         Path to GFF3 or Genbank file with regions or features.
 
@@ -199,10 +199,10 @@ def import_user_cdss(genome: dict, import_path: Path):
         a list of CDS features - without functional annotations.
     """
     user_cdss = []
-    if(cfg.keep_contig_headers):
-        contigs_by_id = {c['id']: c for c in genome['contigs']}  # use ID as it's not altered -> no 'orig_id' field
+    if(cfg.keep_sequence_headers):
+        sequences_by_id = {seq['id']: seq for seq in genome['sequences']}  # use ID as it's not altered -> no 'orig_id' field
     else:
-        contigs_by_id = {c['orig_id']: c for c in genome['contigs']}  # use 'orig_id' instead of autogenerated new 'id'
+        sequences_by_id = {seq['orig_id']: seq for seq in genome['sequences']}  # use 'orig_id' instead of autogenerated new 'id'
     file_suffix = import_path.suffix.lower()
     if(file_suffix in ['.gff', '.gff3']):  # parse GFF3 format
         try:
@@ -215,45 +215,45 @@ def import_user_cdss(genome: dict, import_path: Path):
                     elif(skip_lines  or  line[0] == '#'):
                         continue
                     else:
-                        contig_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t')
+                        sequence_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t')
                         if(feature_type.lower() == 'cds'):
                             attributes = attributes.lower().split(';')
-                            contig = contigs_by_id.get(contig_id, None)
-                            if(contig is None):
-                                log.error('user-provided CDS: No contig found for id=%s', contig_id)
-                                raise Exception(f'user-provided CDS: No contig found for id={contig_id}')
+                            seq = sequences_by_id.get(sequence_id, None)
+                            if(seq is None):
+                                log.error('user-provided CDS: No seq found for id=%s', sequence_id)
+                                raise Exception(f'user-provided CDS: No sequence found for id={sequence_id}')
                             edge = False
                             start = int(start)
                             stop = int(stop)
-                            if(stop > contig['length']):  # check for features spanning sequence edges
-                                stop = stop - contig['length']
+                            if(stop > seq['length']):  # check for features spanning sequence edges
+                                stop = stop - seq['length']
                                 edge = True
                                 
-                            user_cds = create_cds(contig, start, stop, strand, edge, '', '')
+                            user_cds = create_cds(seq, start, stop, strand, edge, '', '')
                             user_cds['source'] = bc.CDS_SOURCE_USER
                             if('pseudo=' in attributes  or  bc.INSDC_FEATURE_PSEUDOGENE in attributes):  # skip pseudo genes
                                 log.debug(
-                                    'skip user-provided CDS: reason=pseudogene contig=%s, start=%i, stop=%i, strand=%s',
-                                    user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand']
+                                    'skip user-provided CDS: reason=pseudogene seq=%s, start=%i, stop=%i, strand=%s',
+                                    user_cds['sequence'], user_cds['start'], user_cds['stop'], user_cds['strand']
                                 )
                                 continue
                             try:
-                                nt = bu.extract_feature_sequence(user_cds, contig)
+                                nt = bu.extract_feature_sequence(user_cds, seq)
                                 user_cds['nt'] = nt
                             except:
-                                log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop'])
-                                raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}")
+                                log.error('user-provided CDS out of range! seq=%s, start=%i, stop=%i', user_cds['sequence'], user_cds['start'], user_cds['stop'])
+                                raise ValueError(f"User-provided CDS out of range! sequence={user_cds['sequence']}, start={user_cds['start']}, stop={user_cds['stop']}")
                             try:
                                 aa = str(Seq(nt).translate(table=cfg.translation_table, cds=True))
                                 user_cds['aa'] = aa
                                 user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
                             except:
-                                log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt)
-                                raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}")
+                                log.error('user-provided CDS could not be translated into a valid amino acid sequence! seq=%s, start=%i, stop=%i, cds=%s', user_cds['sequence'], user_cds['start'], user_cds['stop'], nt)
+                                raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! sequence={user_cds['sequence']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}")
                             
                             log.info(
-                                'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]',
-                                user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:]
+                                'user-provided CDS: seq=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]',
+                                user_cds['sequence'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:]
                             )
                             user_cdss.append(user_cds)
         except Exception as e:
@@ -265,10 +265,10 @@ def import_user_cdss(genome: dict, import_path: Path):
                 for record in SeqIO.parse(fh_in, 'genbank'):
                     for feature in record.features:
                         if(feature.type.lower() == 'cds'):
-                            contig = contigs_by_id.get(record.id, None)
-                            if(contig is None):
-                                log.error('user-provided CDS: No contig found for id=%s', record.id)
-                                raise Exception(f'user-provided CDS: No contig found for id={record.id}')
+                            seq = sequences_by_id.get(record.id, None)
+                            if(seq is None):
+                                log.error('user-provided CDS: No seq found for id=%s', record.id)
+                                raise Exception(f'user-provided CDS: No sequence found for id={record.id}')
                             if(feature.location.strand is None):  # weird mixed-stranded compound locations
                                 strand = bc.STRAND_UNKNOWN
                             else:
@@ -278,20 +278,20 @@ def import_user_cdss(genome: dict, import_path: Path):
                             edge = False
                             if('<' in str(feature.location.start)  or  '>' in str(feature.location.end)):
                                 log.debug(
-                                    'skip user-provided CDS: reason=partial, contig=%s, start=%s, stop=%s, strand=%s',
-                                    contig['id'], feature.location.start, feature.location.end, strand
+                                    'skip user-provided CDS: reason=partial, seq=%s, start=%s, stop=%s, strand=%s',
+                                    seq['id'], feature.location.start, feature.location.end, strand
                                 )
                                 continue
                             elif(bc.INSDC_FEATURE_PSEUDO in feature.qualifiers  or  bc.INSDC_FEATURE_PSEUDOGENE in feature.qualifiers):
                                 log.debug(
-                                    'skip user-provided CDS: reason=pseudo, contig=%s, start=%i, stop=%i, strand=%s',
-                                    contig['id'], feature.location.start, feature.location.end, strand
+                                    'skip user-provided CDS: reason=pseudo, seq=%s, start=%i, stop=%i, strand=%s',
+                                    seq['id'], feature.location.start, feature.location.end, strand
                                 )
                                 continue
                             elif('ribosomal_slippage' in feature.qualifiers):
                                 log.debug(
-                                    'skip user-provided CDS: reason=ribosomal slippage, contig=%s, start=%i, stop=%i, strand=%s',
-                                    contig['id'], feature.location.start, feature.location.end, strand
+                                    'skip user-provided CDS: reason=ribosomal slippage, seq=%s, start=%i, stop=%i, strand=%s',
+                                    seq['id'], feature.location.start, feature.location.end, strand
                                 )
                                 continue
                             elif(isinstance(feature.location, SeqFeature.CompoundLocation)  and  len(feature.location.parts) == 2):
@@ -307,25 +307,25 @@ def import_user_cdss(genome: dict, import_path: Path):
                                         start = edge_right.start + 1
                                         end = edge_left.end
 
-                            user_cds = create_cds(contig, start, end, strand, edge, '', '')
+                            user_cds = create_cds(seq, start, end, strand, edge, '', '')
                             user_cds['source'] = bc.CDS_SOURCE_USER
                             try:
-                                nt = bu.extract_feature_sequence(user_cds, contig)
+                                nt = bu.extract_feature_sequence(user_cds, seq)
                                 user_cds['nt'] = nt
                             except:
-                                log.error('user-provided CDS: CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop'])
-                                raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}")
+                                log.error('user-provided CDS: CDS out of range! seq=%s, start=%i, stop=%i', user_cds['sequence'], user_cds['start'], user_cds['stop'])
+                                raise ValueError(f"User-provided CDS out of range! sequence={user_cds['sequence']}, start={user_cds['start']}, stop={user_cds['stop']}")
                             try:
                                 aa = str(Seq(nt).translate(table=cfg.translation_table, cds=True))
                                 user_cds['aa'] = aa
                                 user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
                             except:
-                                log.error('user-provided CDS: CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt)
-                                raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}")
+                                log.error('user-provided CDS: CDS could not be translated into a valid amino acid sequence! seq=%s, start=%i, stop=%i, cds=%s', user_cds['sequence'], user_cds['start'], user_cds['stop'], nt)
+                                raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! sequence={user_cds['sequence']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}")
                             
                             log.info(
-                                'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]',
-                                user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:]
+                                'user-provided CDS: seq=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]',
+                                user_cds['sequence'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:]
                             )
                             user_cdss.append(user_cds)
         except Exception as e:
@@ -370,8 +370,8 @@ def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]:
                 pfam_hits.append(cds)
                 cds_with_pfams_hits[aa_identifier] = cds
                 log.info(
-                    'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'],
+                    'pfam detected: seq=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s',
+                    cds['sequence'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'],
                     pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name']
                 )
     log.info('predicted-pfams=%i, CDS-w/-pfams=%i', len(pfam_hits), len(cds_with_pfams_hits))
@@ -386,16 +386,16 @@ def analyze_proteins(cdss: Sequence[dict]):
             seq_stats['molecular_weight'] = seq.molecular_weight()
         except:
             log.warning(
-                'could not calc molecular weight! contig=%s, start=%i, stop=%i, strand=%s, frame=%s',
-                cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame']
+                'could not calc molecular weight! seq=%s, start=%i, stop=%i, strand=%s, frame=%s',
+                cds['sequence'], cds['start'], cds['stop'], cds['strand'], cds['frame']
             )
             seq_stats['molecular_weight'] = None
         try:
             seq_stats['isoelectric_point'] = seq.isoelectric_point()
         except:
             log.warning(
-                'could not calc isoelectric point! contig=%s, start=%i, stop=%i, strand=%s, frame=%s',
-                cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame']
+                'could not calc isoelectric point! seq=%s, start=%i, stop=%i, strand=%s, frame=%s',
+                cds['sequence'], cds['start'], cds['stop'], cds['strand'], cds['frame']
             )
             seq_stats['isoelectric_point'] = None
         cds['seq_stats'] = seq_stats
@@ -409,19 +409,19 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]):
     if(bc.FEATURE_NC_RNA_REGION not in genome['features']):  # check if ncRNA regions have been detected, otherwise skip analysis and return
         return no_revised
 
-    contigs = {c['id']: c for c in genome['contigs']}
+    sequences = {seq['id']: seq for seq in genome['sequences']}
     # detect splitted orphan ORFs of selenocystein proteins that are subject to stop codon recoding.
-    cdss_per_contigs = {k['id']: [] for k in genome['contigs']}  # get CDS per contig
+    cdss_per_sequences = {k['id']: [] for k in genome['sequences']}  # get CDS per sequence
     for cds in cdss:
-        cdss_per_contig = cdss_per_contigs[cds['contig']]
+        cdss_per_sequence = cdss_per_sequences[cds['sequence']]
         if('truncated' not in cds):  # exclude truncated CDS for now
-            cdss_per_contig.append(cds)
-    cds_pairs_per_contig = {k['id']: [] for k in genome['contigs']}  # extract inframe primate CDS neighbouring pairs
-    for id, cdss_per_contig in cdss_per_contigs.items():
-        cdss_per_contig = sorted(cdss_per_contig, key=lambda k: k['start'])
-        for i in range(1, len(cdss_per_contig)):
-            cds_a = cdss_per_contig[i-1]
-            cds_b = cdss_per_contig[i]
+            cdss_per_sequence.append(cds)
+    cds_pairs_per_sequence = {k['id']: [] for k in genome['sequences']}  # extract inframe primate CDS neighbouring pairs
+    for id, cdss_per_sequence in cdss_per_sequences.items():
+        cdss_per_sequence = sorted(cdss_per_sequence, key=lambda k: k['start'])
+        for i in range(1, len(cdss_per_sequence)):
+            cds_a = cdss_per_sequence[i-1]
+            cds_b = cdss_per_sequence[i]
             strand = cds_a['strand']
             upstream_stop_codon = cds_a['nt'][-3:] if strand == bc.STRAND_FORWARD else cds_b['nt'][-3:]
             if(
@@ -429,27 +429,27 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]):
                 cds_a['frame'] == cds_b['frame'] and  # up- and downstream ORFs on the same frame
                 upstream_stop_codon == 'TGA' and  # tRNAScan-SE 2.0 only predicts tRNA-Sec with UCA anticodons, therefore we can only detect TGA stop codons
                 (cds_b['start'] - cds_a['stop']) < 100):  # up- and downstream ORFs in close proximity
-                cds_pairs = cds_pairs_per_contig[cds_a['contig']]
+                cds_pairs = cds_pairs_per_sequence[cds_a['sequence']]
                 cds_pairs.append((cds_a, cds_b))
 
     recoding_regions = [ncrna_region for ncrna_region in genome['features'][bc.FEATURE_NC_RNA_REGION] if ncrna_region['class'] == so.SO_CIS_REG_RECODING_STIMULATION_REGION]  #  Selenocysteine insertion sequences
     for recoding_region in recoding_regions:
         if('selenocysteine' in recoding_region.get('product', '').lower()):
-            cds_pairs = cds_pairs_per_contig[recoding_region['contig']]
+            cds_pairs = cds_pairs_per_sequence[recoding_region['sequence']]
             for cds_a, cds_b in cds_pairs:  # find CDS pair around recoding region
                 strand = cds_a['strand']
                 if(
                     strand == recoding_region['strand'] and  # everything is on the same strand
                     cds_a['start'] < recoding_region['start'] and recoding_region['stop'] < cds_b['stop']):  # recoding region lies between up- and downstream ORFs
                     log.debug(
-                        'selenocysteine recoding ncRNA/CDS pair detected: contig=%s, strand=%s, CDS-A=[%i...%i] (%s..%s), recoding-ie=[%i..%i], CDS-B=[%i...%i] (%s..%s)',
-                        recoding_region['contig'], recoding_region['strand'], cds_a['start'], cds_a['stop'], cds_a['nt'][:10], cds_a['nt'][-10:], recoding_region['start'], recoding_region['stop'], cds_b['start'], cds_b['stop'], cds_b['nt'][:10], cds_b['nt'][-10:]
+                        'selenocysteine recoding ncRNA/CDS pair detected: seq=%s, strand=%s, CDS-A=[%i...%i] (%s..%s), recoding-ie=[%i..%i], CDS-B=[%i...%i] (%s..%s)',
+                        recoding_region['sequence'], recoding_region['strand'], cds_a['start'], cds_a['stop'], cds_a['nt'][:10], cds_a['nt'][-10:], recoding_region['start'], recoding_region['stop'], cds_b['start'], cds_b['stop'], cds_b['nt'][:10], cds_b['nt'][-10:]
                     )
                     seleno_cds = copy.deepcopy(cds_a)
                     seleno_cds['stop'] = cds_b['stop']
                     seleno_cds['rbs_motif'] = cds_a['rbs_motif'] if strand == bc.STRAND_FORWARD else cds_b['rbs_motif']
-                    contig = contigs[seleno_cds['contig']]
-                    nt = bu.extract_feature_sequence(seleno_cds, contig)
+                    seq = sequences[seleno_cds['sequence']]
+                    nt = bu.extract_feature_sequence(seleno_cds, seq)
                     seleno_cds['nt'] = nt
                     aa = str(Seq(nt).translate(table=cfg.translation_table, stop_symbol='*', to_stop=False, cds=False))
                     if(
@@ -470,8 +470,8 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]):
                         }                    
                         cdss.append(seleno_cds)
                         log.info(
-                            'selenocysteine CDS detected: contig=%s, start=%i, stop=%i, strand=%s, frame=%i, exception=[%i..%i], nt=[%s..%s], aa=[%s..%s], aa-hexdigest=%s',
-                            seleno_cds['contig'], seleno_cds['start'], seleno_cds['stop'], seleno_cds['strand'], seleno_cds['frame'], seleno_cds['exception']['start'], seleno_cds['exception']['stop'], nt[:10], nt[-10:], aa[:10], aa[-10:], seleno_cds['aa_hexdigest']
+                            'selenocysteine CDS detected: seq=%s, start=%i, stop=%i, strand=%s, frame=%i, exception=[%i..%i], nt=[%s..%s], aa=[%s..%s], aa-hexdigest=%s',
+                            seleno_cds['sequence'], seleno_cds['start'], seleno_cds['stop'], seleno_cds['strand'], seleno_cds['frame'], seleno_cds['exception']['start'], seleno_cds['exception']['stop'], nt[:10], nt[-10:], aa[:10], aa[-10:], seleno_cds['aa_hexdigest']
                         )
                         discard = {  # mark CDS a/b as discarded
                             'type': bc.DISCARD_TYPE_RECODING,
@@ -482,8 +482,8 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]):
                         no_revised += 1
                     else:
                         log.warning(
-                            'spurious selenocysteine CDS detected: contig=%s, start=%i, stop=%i, strand=%s, frame=%i, nt=[%s], aa=[%s]',
-                            seleno_cds['contig'], seleno_cds['start'], seleno_cds['stop'], seleno_cds['strand'], seleno_cds['frame'], nt, aa
+                            'spurious selenocysteine CDS detected: seq=%s, start=%i, stop=%i, strand=%s, frame=%i, nt=[%s], aa=[%s]',
+                            seleno_cds['sequence'], seleno_cds['start'], seleno_cds['stop'], seleno_cds['strand'], seleno_cds['frame'], nt, aa
                         )
     return no_revised
 
@@ -494,13 +494,13 @@ def revise_special_cases_annotated(genome: dict, cdss: Sequence[dict]):
     which often appear on re-annotated genomes.
     """
     
-    contigs = {c['id']: c for c in genome['contigs']}
+    sequences = {seq['id']: seq for seq in genome['sequences']}
     # look for supposedly truncated dnaA genes on rotated chromosome starts: start=1, strand=+
     dnaA = None
     for cds in cdss:
-        contig = contigs[cds['contig']]
+        seq = sequences[cds['sequence']]
         if(
-            contig['complete'] and
+            seq['complete'] and
             cds['start'] == 1 and 
             cds['strand'] == bc.STRAND_FORWARD and 
             cds['start_type'] == 'Edge' and 
@@ -512,16 +512,16 @@ def revise_special_cases_annotated(genome: dict, cdss: Sequence[dict]):
         dnaA.pop('truncated')
         gene = dnaA.get('gene', '-')
         log.info(
-            'revise supposedly truncated dnaA gene on rotated chromosome start: contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s], aa=[%s..%s]',
-            dnaA['contig'], dnaA['start'], dnaA['stop'], dnaA['strand'], gene, dnaA['product'], dnaA['nt'][:10], dnaA['nt'][-10:], dnaA['aa'][:10], dnaA['aa'][-10:]
+            'revise supposedly truncated dnaA gene on rotated chromosome start: seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s], aa=[%s..%s]',
+            dnaA['sequence'], dnaA['start'], dnaA['stop'], dnaA['strand'], gene, dnaA['product'], dnaA['nt'][:10], dnaA['nt'][-10:], dnaA['aa'][:10], dnaA['aa'][-10:]
         )
     
     # look for supposedly truncated repA genes on rotated plasmid starts: start=1, strand=+
     repAs = []
     for cds in cdss:
-        contig = contigs[cds['contig']]
+        seq = sequences[cds['sequence']]
         if(
-            contig['complete'] and
+            seq['complete'] and
             cds['start'] == 1 and 
             cds['strand'] == bc.STRAND_FORWARD and 
             cds['start_type'] == 'Edge' and 
@@ -533,8 +533,8 @@ def revise_special_cases_annotated(genome: dict, cdss: Sequence[dict]):
             repA.pop('truncated')
             gene = repA.get('gene', '-')
             log.info(
-                'revise supposedly truncated repA gene on rotated plasmid start: contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s], aa=[%s..%s]',
-                repA['contig'], repA['start'], repA['stop'], repA['strand'], gene, repA['product'], repA['nt'][:10], repA['nt'][-10:], repA['aa'][:10], repA['aa'][-10:]
+                'revise supposedly truncated repA gene on rotated plasmid start: seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s], aa=[%s..%s]',
+                repA['sequence'], repA['start'], repA['stop'], repA['strand'], gene, repA['product'], repA['nt'][:10], repA['nt'][-10:], repA['aa'][:10], repA['aa'][-10:]
             )
 
 
@@ -604,8 +604,8 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
                 }
                 pseudo_candidates.append(cds)
                 log.debug(
-                    'pseudogene-candidate: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
+                    'pseudogene-candidate: seq=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s',
+                    cds['sequence'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
     log.info('found: pseudogene-candidates=%i', len(pseudo_candidates))
     return pseudo_candidates
@@ -627,13 +627,13 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
             fh.write(f">{cluster_id}\n{faa_seq}\n")
 
     # Get extended cds sequences
-    contigs = {c['id']: c for c in genome['contigs']}
+    sequences = {seq['id']: seq for seq in genome['sequences']}
     candidates_extended_positions = {}
     with candidates_elongated_sequences_path.open(mode='w') as fh:
         for cds in candidates:
-            contig = contigs[cds['contig']]
-            cds_elongated = get_elongated_cds(cds, contig)
-            seq = bu.extract_feature_sequence(cds_elongated, contig)
+            seq = sequences[cds['sequence']]
+            cds_elongated = get_elongated_cds(cds, seq)
+            seq = bu.extract_feature_sequence(cds_elongated, seq)
             orf_key = orf.get_orf_key(cds)
             fh.write(f">{orf_key}\n{seq}\n")
             candidates_extended_positions[orf_key] = cds_elongated
@@ -700,8 +700,8 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
 
                     if alignment_length == len(cds['aa']):  # skip non-extended genes (full match)
                         log.debug(
-                            'no pseudogene (full match): contig=%s, start=%i, stop=%i, strand=%s',
-                            cds['contig'], cds['start'], cds['stop'], cds['strand']
+                            'no pseudogene (full match): seq=%s, start=%i, stop=%i, strand=%s',
+                            cds['sequence'], cds['start'], cds['stop'], cds['strand']
                         )
                         continue
 
@@ -763,8 +763,8 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
                         cds.pop('hypothetical')
                         pseudogenes.append(cds)
                         log.info(
-                            'pseudogene: contig=%s, start=%i, stop=%i, strand=%s, insertions=%s, deletions=%s, mutations=%s, effect=%s',
-                            cds['contig'], cds['start'], cds['stop'], cds['strand'], observations.get(bc.PSEUDOGENE_CAUSE_INSERTION, []), observations.get(bc.PSEUDOGENE_CAUSE_DELETION, []), observations.get(bc.PSEUDOGENE_CAUSE_MUTATION, []), effects
+                            'pseudogene: seq=%s, start=%i, stop=%i, strand=%s, insertions=%s, deletions=%s, mutations=%s, effect=%s',
+                            cds['sequence'], cds['start'], cds['stop'], cds['strand'], observations.get(bc.PSEUDOGENE_CAUSE_INSERTION, []), observations.get(bc.PSEUDOGENE_CAUSE_DELETION, []), observations.get(bc.PSEUDOGENE_CAUSE_MUTATION, []), effects
                         )
 
                     elif observations[bc.PSEUDOGENE_EXCEPTION_SELENOCYSTEINE] or observations[bc.PSEUDOGENE_EXCEPTION_PYROLYSINE]:
@@ -777,7 +777,7 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
     return pseudogenes
 
 
-def get_elongated_cds(cds: dict, contig: dict, offset: int = bc.PSEUDOGENE_OFFSET) -> Dict[str, Union[int, str, bool]]:
+def get_elongated_cds(cds: dict, sequence: dict, offset: int = bc.PSEUDOGENE_OFFSET) -> Dict[str, Union[int, str, bool]]:
     """
     Elongate the given CDS sequence with the offset in upstream and downstream direction, if possible.
     """
@@ -790,9 +790,9 @@ def get_elongated_cds(cds: dict, contig: dict, offset: int = bc.PSEUDOGENE_OFFSE
         'elongation_downstream': offset
     }
 
-    contig_length = len(contig['sequence'])
-    if contig['topology'] == 'circular' and elongated_cds['start'] - offset < 0:
-        elongated_cds['start'] = contig_length + elongated_cds['start'] - offset
+    sequence_length = len(sequence['sequence'])
+    if sequence['topology'] == 'circular' and elongated_cds['start'] - offset < 0:
+        elongated_cds['start'] = sequence_length + elongated_cds['start'] - offset
         elongated_cds['edge'] = True
     elif elongated_cds['start'] - offset < 0:
         elongated_cds['start'] = 1
@@ -800,12 +800,12 @@ def get_elongated_cds(cds: dict, contig: dict, offset: int = bc.PSEUDOGENE_OFFSE
     else:
         elongated_cds['start'] = elongated_cds['start'] - offset
 
-    if contig['topology'] == 'circular' and elongated_cds['stop'] + offset > contig_length:
-        elongated_cds['stop'] = elongated_cds['stop'] + offset - contig_length
+    if sequence['topology'] == 'circular' and elongated_cds['stop'] + offset > sequence_length:
+        elongated_cds['stop'] = elongated_cds['stop'] + offset - sequence_length
         elongated_cds['edge'] = True
-    elif elongated_cds['stop'] + offset > contig_length:
-        elongated_cds['stop'] = contig_length
-        elongated_cds['elongation_downstream'] = contig_length - cds['stop']
+    elif elongated_cds['stop'] + offset > sequence_length:
+        elongated_cds['stop'] = sequence_length
+        elongated_cds['elongation_downstream'] = sequence_length - cds['stop']
     else:
         elongated_cds['stop'] = elongated_cds['stop'] + offset
 
@@ -889,8 +889,8 @@ def compare_alignments(observations: dict, alignment: str, ref_alignment: str, c
                 observations[bc.PSEUDOGENE_EFFECT_START].add(genome_position)
                 observations['directions'].add(bc.FEATURE_END_3_PRIME)
                 log.info(
-                    'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, original start=%i',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['start'] + genome_position
+                    'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, original start=%i',
+                    cds['sequence'], cds['start'], cds['stop'], cds['strand'], cds['start'] + genome_position
                 )
             else:  # RBS was predicted (protein iso-form) -> skip
                 pass
@@ -911,8 +911,8 @@ def compare_alignments(observations: dict, alignment: str, ref_alignment: str, c
             observations[bc.PSEUDOGENE_CAUSE_INSERTION].add(genome_position)
             observations['directions'].add(get_direction(alignment_position, edge))
             log.info(
-                'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, cause=insertion, position=%i',
-                cds['contig'], cds['start'], cds['stop'], cds['strand'], genome_position
+                'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, cause=insertion, position=%i',
+                cds['sequence'], cds['start'], cds['stop'], cds['strand'], genome_position
             )
             alignment_position += 1
         elif char == '/':  # deletion
@@ -921,23 +921,23 @@ def compare_alignments(observations: dict, alignment: str, ref_alignment: str, c
             observations[bc.PSEUDOGENE_CAUSE_DELETION].add(genome_position)
             observations['directions'].add(get_direction(alignment_position, edge))
             log.info(
-                'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, cause=deletion, position=%i',
-                cds['contig'], cds['start'], cds['stop'], cds['strand'], genome_position
+                'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, cause=deletion, position=%i',
+                cds['sequence'], cds['start'], cds['stop'], cds['strand'], genome_position
             )
         elif char == '*':  # stop codon, selenocysteine, pyrolysine
             if ref_char == 'U':  # selenocysteine
                 genome_position = get_abs_position(cds, start, alignment_position, edge)
                 observations[bc.PSEUDOGENE_EXCEPTION_SELENOCYSTEINE].add(genome_position)
                 log.info(
-                    'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, exception=selenocysteine, position=%i',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], genome_position
+                    'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, exception=selenocysteine, position=%i',
+                    cds['sequence'], cds['start'], cds['stop'], cds['strand'], genome_position
                 )
             elif ref_char == 'O':  # pyrolysine
                 genome_position = get_abs_position(cds, start, alignment_position, edge)
                 observations[bc.PSEUDOGENE_EXCEPTION_PYROLYSINE].add(genome_position)
                 log.info(
-                    'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, exception=pyrolysin, position=%i',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], genome_position
+                    'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, exception=pyrolysin, position=%i',
+                    cds['sequence'], cds['start'], cds['stop'], cds['strand'], genome_position
                 )
             else:  # stop codon
                 mutation = ''
@@ -948,8 +948,8 @@ def compare_alignments(observations: dict, alignment: str, ref_alignment: str, c
                 observations[bc.PSEUDOGENE_EFFECT_STOP].add(genome_position)
                 observations['directions'].add(get_direction(alignment_position, edge))
                 log.info(
-                    'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, effect=stop%s, position=%i',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], mutation, genome_position
+                    'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, effect=stop%s, position=%i',
+                    cds['sequence'], cds['start'], cds['stop'], cds['strand'], mutation, genome_position
                 )
             alignment_position += 3
         else:
diff --git a/bakta/features/crispr.py b/bakta/features/crispr.py
index b15818f2..02b41c20 100644
--- a/bakta/features/crispr.py
+++ b/bakta/features/crispr.py
@@ -17,13 +17,13 @@
 log = logging.getLogger('CRISPR')
 
 
-def predict_crispr(genome: dict, contigs_path: Path):
+def predict_crispr(genome: dict, sequences_path: Path):
     """Predict CRISPR arrays with PILER-CR."""
 
     output_path = cfg.tmp_path.joinpath('crispr.txt')
     cmd = [
         'pilercr',
-        '-in', str(contigs_path),
+        '-in', str(sequences_path),
         '-out', str(output_path),
         '-noinfo',  # omit help in output
         '-quiet'  # silent mode
@@ -44,10 +44,10 @@ def predict_crispr(genome: dict, contigs_path: Path):
 
     # parse crispr arrays
     crispr_arrays = {}
-    contigs = {c['id']: c for c in genome['contigs']}
+    sequences = {seq['id']: seq for seq in genome['sequences']}
     with output_path.open() as fh:
         output_section = None
-        contig_id = None
+        sequence_id = None
         array_id = None
         skip_lines = True
         crispr_array = None
@@ -77,8 +77,8 @@ def predict_crispr(genome: dict, contigs_path: Path):
                         crispr_array['spacers'] = []
                         crispr_arrays[array_id] = crispr_array
                     elif(line[0] == '>'):
-                        contig_id = line[1:]
-                        crispr_array['contig'] = contig_id
+                        sequence_id = line[1:]
+                        crispr_array['sequence'] = sequence_id
                     elif(line[0] != '='):
                         m = RE_CRISPR.fullmatch(line)
                         if(m is not None):
@@ -102,20 +102,20 @@ def predict_crispr(genome: dict, contigs_path: Path):
                                 crispr_spacer['stop'] = position + repeat_length + spacer_length - 1 - gap_count
                                 crispr_spacer['sequence'] = spacer_seq
                                 crispr_array['spacers'].append(crispr_spacer)
-                                spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, contigs[contig_id])
+                                spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, sequences[sequence_id])
                                 log.debug('spacer: array-id=%s, start=%i, stop=%i, genome-seq=%s, spacer-seq=%s', array_id, crispr_spacer['start'], crispr_spacer['stop'], spacer_genome_seq, spacer_seq)
                                 assert spacer_seq == spacer_genome_seq  # assure PILER-CR provided sequence equals sequence extracted from genome
                 elif(output_section == 'POSITION'):
                     if(line[0] == '>'):
-                        contig_id = line[1:]
+                        sequence_id = line[1:]
                     elif(line[0] != 'A' and line[0] != '='):
                         cols = line.split()
                         if(len(cols) == 8):
-                            (array_id, contig, position, length, copies, repeat_length, spacer_length, repeat_consensus) = cols
+                            (array_id, sequence, position, length, copies, repeat_length, spacer_length, repeat_consensus) = cols
                         else:
-                            (array_id, contig, position, length, copies, repeat_length, spacer_length, distance, repeat_consensus) = cols
+                            (array_id, sequence, position, length, copies, repeat_length, spacer_length, distance, repeat_consensus) = cols
                         crispr_array = crispr_arrays[array_id]
-                        positions = [c['start'] for c in crispr_array['spacers']] + [c['stop'] for c in crispr_array['spacers']] + [c['start'] for c in crispr_array['repeats']] + [c['stop'] for c in crispr_array['repeats']]
+                        positions = [seq['start'] for seq in crispr_array['spacers']] + [seq['stop'] for seq in crispr_array['spacers']] + [seq['start'] for seq in crispr_array['repeats']] + [seq['stop'] for seq in crispr_array['repeats']]
                         crispr_array['start'] = min(positions)
                         crispr_array['stop'] = max(positions)
                         crispr_array['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}'
@@ -125,11 +125,11 @@ def predict_crispr(genome: dict, contigs_path: Path):
                         crispr_array['repeat_consensus'] = repeat_consensus
                         crispr_array['db_xrefs'] = [so.SO_CRISPR.id]
 
-                        nt = bu.extract_feature_sequence(crispr_array, contigs[contig_id])  # extract nt sequences
+                        nt = bu.extract_feature_sequence(crispr_array, sequences[sequence_id])  # extract nt sequences
                         crispr_array['nt'] = nt
                         log.info(
-                            'contig=%s, start=%i, stop=%i, spacer-length=%i, repeat-length=%i, # repeats=%i, repeat-consensus=%s, nt=[%s..%s]',
-                            crispr_array['contig'], crispr_array['start'], crispr_array['stop'], crispr_array['spacer_length'], crispr_array['repeat_length'], len(crispr_array['repeats']), crispr_array['repeat_consensus'], nt[:10], nt[-10:]
+                            'seq=%s, start=%i, stop=%i, spacer-length=%i, repeat-length=%i, # repeats=%i, repeat-consensus=%s, nt=[%s..%s]',
+                            crispr_array['sequence'], crispr_array['start'], crispr_array['stop'], crispr_array['spacer_length'], crispr_array['repeat_length'], len(crispr_array['repeats']), crispr_array['repeat_consensus'], nt[:10], nt[-10:]
                         )
     crispr_arrays = crispr_arrays.values()                        
     log.info('predicted=%i', len(crispr_arrays))
diff --git a/bakta/features/gaps.py b/bakta/features/gaps.py
index fb6342ff..5e052de6 100644
--- a/bakta/features/gaps.py
+++ b/bakta/features/gaps.py
@@ -13,14 +13,14 @@
 
 def detect_assembly_gaps(genome: dict) -> Sequence[dict]:
     gaps = []
-    for contig in genome['contigs']:
-        m = RE_ASSEMBLY_GAP.search(contig['sequence'])
+    for seq in genome['sequences']:
+        m = RE_ASSEMBLY_GAP.search(seq['sequence'])
         while m:
             start, end = m.span()
 
             gap = OrderedDict()
             gap['type'] = bc.FEATURE_GAP
-            gap['contig'] = contig['id']
+            gap['sequence'] = seq['id']
             gap['start'] = start + 1
             gap['stop'] = end
             gap['strand'] = bc.STRAND_NA
@@ -28,8 +28,8 @@ def detect_assembly_gaps(genome: dict) -> Sequence[dict]:
 
             gaps.append(gap)
             log.info(
-                'contig=%s, start=%i, stop=%i, length=%s',
-                gap['contig'], gap['start'], gap['stop'], gap['length']
+                'seq=%s, start=%i, stop=%i, length=%s',
+                gap['sequence'], gap['start'], gap['stop'], gap['length']
             )
-            m = RE_ASSEMBLY_GAP.search(contig['sequence'], end + 1)
+            m = RE_ASSEMBLY_GAP.search(seq['sequence'], end + 1)
     return gaps
diff --git a/bakta/features/nc_rna.py b/bakta/features/nc_rna.py
index 207efa00..70050e09 100644
--- a/bakta/features/nc_rna.py
+++ b/bakta/features/nc_rna.py
@@ -17,7 +17,7 @@
 log = logging.getLogger('NC_RNA')
 
 
-def predict_nc_rnas(genome: dict, contigs_path: Path):
+def predict_nc_rnas(genome: dict, sequences_path: Path):
     """Search for non-coding RNA genes."""
 
     output_path = cfg.tmp_path.joinpath('ncrna-genes.tsv')
@@ -35,7 +35,7 @@ def predict_nc_rnas(genome: dict, contigs_path: Path):
         cmd.append('-Z')
         cmd.append(str(2 * genome['size'] // 1000000))
     cmd.append(str(cfg.db_path.joinpath('ncRNA-genes')))
-    cmd.append(str(contigs_path))
+    cmd.append(str(sequences_path))
     log.debug('cmd=%s', cmd)
     proc = sp.run(
         cmd,
@@ -61,12 +61,12 @@ def predict_nc_rnas(genome: dict, contigs_path: Path):
                 rfam2go[rfam] = [go]
 
     ncrnas = []
-    contigs = {c['id']: c for c in genome['contigs']}
+    sequences = {seq['id']: seq for seq in genome['sequences']}
     with output_path.open() as fh:
         for line in fh:
             if(line[0] != '#'):
                 (
-                    subject, accession, contig_id, contig_acc, mdl, mdl_from, mdl_to,
+                    subject, accession, sequence_id, sequence_acc, mdl, mdl_from, mdl_to,
                     start, stop, strand, trunc, passed, gc, bias, score, evalue,
                     inc, description
                 ) = bc.RE_MULTIWHITESPACE.split(line.strip(), maxsplit=17)
@@ -86,8 +86,8 @@ def predict_nc_rnas(genome: dict, contigs_path: Path):
 
                 if(evalue > HIT_EVALUE):
                     log.debug(
-                        'discard low E value: contig=%s, start=%i, stop=%i, strand=%s, gene=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e',
-                        contig_id, start, stop, strand, subject, length, truncated, score, evalue
+                        'discard low E value: seq=%s, start=%i, stop=%i, strand=%s, gene=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e',
+                        sequence_id, start, stop, strand, subject, length, truncated, score, evalue
                     )
                 else:
                     rfam_id = f'{bc.DB_XREF_RFAM}:{accession}'
@@ -98,7 +98,7 @@ def predict_nc_rnas(genome: dict, contigs_path: Path):
                     ncrna = OrderedDict()
                     ncrna['type'] = bc.FEATURE_NC_RNA
                     ncrna['class'] = determine_class(description)
-                    ncrna['contig'] = contig_id
+                    ncrna['sequence'] = sequence_id
                     ncrna['start'] = start
                     ncrna['stop'] = stop
                     ncrna['strand'] = bc.STRAND_FORWARD if strand == '+' else bc.STRAND_REVERSE
@@ -122,13 +122,13 @@ def predict_nc_rnas(genome: dict, contigs_path: Path):
                     ncrna['evalue'] = evalue
                     ncrna['db_xrefs'] = db_xrefs
 
-                    nt = bu.extract_feature_sequence(ncrna, contigs[contig_id])  # extract nt sequences
+                    nt = bu.extract_feature_sequence(ncrna, sequences[sequence_id])  # extract nt sequences
                     ncrna['nt'] = nt
 
                     ncrnas.append(ncrna)
                     log.info(
-                        'contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e, nt=[%s..%s]',
-                        ncrna['contig'], ncrna['start'], ncrna['stop'], ncrna['strand'], ncrna['gene'], ncrna['product'], length, truncated, ncrna['score'], ncrna['evalue'], nt[:10], nt[-10:]
+                        'seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e, nt=[%s..%s]',
+                        ncrna['sequence'], ncrna['start'], ncrna['stop'], ncrna['strand'], ncrna['gene'], ncrna['product'], length, truncated, ncrna['score'], ncrna['evalue'], nt[:10], nt[-10:]
                     )
     log.info('predicted=%i', len(ncrnas))
     return ncrnas
diff --git a/bakta/features/nc_rna_region.py b/bakta/features/nc_rna_region.py
index cdeaf797..b5e3500e 100644
--- a/bakta/features/nc_rna_region.py
+++ b/bakta/features/nc_rna_region.py
@@ -16,7 +16,7 @@
 log = logging.getLogger('NC_RNA_REGION')
 
 
-def predict_nc_rna_regions(genome: dict, contigs_path: Path):
+def predict_nc_rna_regions(genome: dict, sequences_path: Path):
     """Search for non-coding RNA regions."""
 
     output_path = cfg.tmp_path.joinpath('ncrna-regions.tsv')
@@ -34,7 +34,7 @@ def predict_nc_rna_regions(genome: dict, contigs_path: Path):
         cmd.append('-Z')
         cmd.append(str(2 * genome['size'] // 1000000))
     cmd.append(str(cfg.db_path.joinpath('ncRNA-regions')))
-    cmd.append(str(contigs_path))
+    cmd.append(str(sequences_path))
     log.debug('cmd=%s', cmd)
     proc = sp.run(
         cmd,
@@ -60,11 +60,11 @@ def predict_nc_rna_regions(genome: dict, contigs_path: Path):
                 rfam2go[rfam] = [go]
 
     ncrnas = []
-    contigs = {c['id']: c for c in genome['contigs']}
+    sequences = {seq['id']: seq for seq in genome['sequences']}
     with output_path.open() as fh:
         for line in fh:
             if(line[0] != '#'):
-                (subject, accession, contig_id, contig_acc, mdl, mdl_from, mdl_to,
+                (subject, accession, sequence_id, sequence_acc, mdl, mdl_from, mdl_to,
                     start, stop, strand, trunc, passed, gc, bias, score, evalue,
                     inc, description) = bc.RE_MULTIWHITESPACE.split(line.strip(), maxsplit=17)
 
@@ -83,8 +83,8 @@ def predict_nc_rna_regions(genome: dict, contigs_path: Path):
 
                 if(evalue > HIT_EVALUE):
                     log.debug(
-                        'discard low E value: contig=%s, start=%i, stop=%i, strand=%s, gene=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e',
-                        contig_id, start, stop, strand, subject, length, truncated, score, evalue
+                        'discard low E value: seq=%s, start=%i, stop=%i, strand=%s, gene=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e',
+                        sequence_id, start, stop, strand, subject, length, truncated, score, evalue
                     )
                 else:
                     rfam_id = f'{bc.DB_XREF_RFAM}:{accession}'
@@ -95,7 +95,7 @@ def predict_nc_rna_regions(genome: dict, contigs_path: Path):
                     ncrna_region = OrderedDict()
                     ncrna_region['type'] = bc.FEATURE_NC_RNA_REGION
                     ncrna_region['class'] = determine_class(description)
-                    ncrna_region['contig'] = contig_id
+                    ncrna_region['sequence'] = sequence_id
                     ncrna_region['start'] = start
                     ncrna_region['stop'] = stop
                     ncrna_region['strand'] = bc.STRAND_FORWARD if strand == '+' else bc.STRAND_REVERSE
@@ -114,13 +114,13 @@ def predict_nc_rna_regions(genome: dict, contigs_path: Path):
                     ncrna_region['evalue'] = evalue
                     ncrna_region['db_xrefs'] = db_xrefs
 
-                    nt = bu.extract_feature_sequence(ncrna_region, contigs[contig_id])  # extract nt sequences
+                    nt = bu.extract_feature_sequence(ncrna_region, sequences[sequence_id])  # extract nt sequences
                     ncrna_region['nt'] = nt
 
                     ncrnas.append(ncrna_region)
                     log.info(
-                        'contig=%s, start=%i, stop=%i, strand=%s, label=%s, product=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e',
-                        ncrna_region['contig'], ncrna_region['start'], ncrna_region['stop'], ncrna_region['strand'], ncrna_region['label'], ncrna_region['product'], length, truncated, ncrna_region['score'], ncrna_region['evalue']
+                        'seq=%s, start=%i, stop=%i, strand=%s, label=%s, product=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e',
+                        ncrna_region['sequence'], ncrna_region['start'], ncrna_region['stop'], ncrna_region['strand'], ncrna_region['label'], ncrna_region['product'], length, truncated, ncrna_region['score'], ncrna_region['evalue']
                     )
     log.info('predicted=%i', len(ncrnas))
     return ncrnas
diff --git a/bakta/features/orf.py b/bakta/features/orf.py
index a58c72a6..61cf2790 100644
--- a/bakta/features/orf.py
+++ b/bakta/features/orf.py
@@ -25,8 +25,8 @@ def detect_spurious(orfs: Sequence[dict]):
                 orf = orf_by_aa_digest[hit.name.decode()]
                 if hit.evalue > bc.MIN_HMM_EVALUE:
                     log.debug(
-                        'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f',
-                        orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
+                        'discard low spurious E value: seq=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f',
+                        orf['sequence'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
                     )
                 else:
                     discard = OrderedDict()
@@ -38,8 +38,8 @@ def detect_spurious(orfs: Sequence[dict]):
                     orf['discarded'] = discard
                     discarded_orfs.append(orf)
                     log.info(
-                        'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f',
-                        orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
+                        'discard spurious: seq=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f',
+                        orf['sequence'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score
                     )
     log.info('discarded=%i', len(discarded_orfs))
     return discarded_orfs
@@ -47,7 +47,7 @@ def detect_spurious(orfs: Sequence[dict]):
 
 def get_orf_key(orf: dict) -> str:
     """Generate a standardized and unique ORF-like feature key for internal store/analyze/parse/retrieval cycles."""
-    return f"{orf['aa_hexdigest']}-{orf['contig']}-{orf['start']}-{orf['stop']}-{orf['strand']}-{orf.get('source', 'internal')}"
+    return f"{orf['aa_hexdigest']}-{orf['sequence']}-{orf['start']}-{orf['stop']}-{orf['strand']}-{orf.get('source', 'internal')}"
 
 
 def get_orf_dictionary(orfs: Sequence[dict]) -> Dict[str, dict]:
diff --git a/bakta/features/ori.py b/bakta/features/ori.py
index 82281612..bc7c6e59 100644
--- a/bakta/features/ori.py
+++ b/bakta/features/ori.py
@@ -18,7 +18,7 @@
 log = logging.getLogger('ORI')
 
 
-def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[dict]:
+def predict_oris(genome: dict, sequences_path: Path, ori_type: str) -> Sequence[dict]:
     """Search for oriT/C sequences."""
 
     database = 'oric.fna' if ori_type == bc.FEATURE_ORIC else 'orit.fna'
@@ -26,7 +26,7 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di
     cmd = [
         'blastn',
         '-query', str(cfg.db_path.joinpath(database)),
-        '-subject', str(contigs_path),
+        '-subject', str(sequences_path),
         '-culling_limit', '1',
         '-evalue', HIT_EVALUE,
         '-num_threads', str(cfg.threads),
@@ -57,33 +57,33 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di
                 'ori_start': int(cols[1]),
                 'ori_end': int(cols[2]),
                 'ori_length': int(cols[3]),
-                'contig': cols[4],
-                'contig_start': int(cols[5]),
-                'contig_stop': int(cols[6]),
+                'sequence': cols[4],
+                'sequence_start': int(cols[5]),
+                'sequence_stop': int(cols[6]),
                 'strand': bc.STRAND_FORWARD if cols[9] == 'plus' else bc.STRAND_REVERSE,
                 'coverage': int(cols[7]) / int(cols[3]),
                 'identity': int(cols[8]) / int(cols[7])
             }
             if(hit['strand'] == bc.STRAND_REVERSE):
-                hit['contig_start'], hit['contig_stop'] = hit['contig_stop'], hit['contig_start']
+                hit['sequence_start'], hit['sequence_stop'] = hit['sequence_stop'], hit['sequence_start']
             if(hit['coverage'] >= HIT_COVERAGE and hit['identity'] >= HIT_IDENTITY):
-                contig_hits = hits.get(hit['contig'], [])
-                contig_hits.append(hit)
-                if(len(contig_hits) == 1):
-                    hits[hit['contig']] = contig_hits
+                sequence_hits = hits.get(hit['sequence'], [])
+                sequence_hits.append(hit)
+                if(len(sequence_hits) == 1):
+                    hits[hit['sequence']] = sequence_hits
                 log.debug(
-                    'raw hit: type=%s, contig=%s, start=%i, stop=%i, strand=%s, coverage=%0.3f, identity=%0.3f',
-                    ori_type, hit['contig'], hit['contig_start'], hit['contig_stop'], hit['strand'], hit['coverage'], hit['identity']
+                    'raw hit: type=%s, seq=%s, start=%i, stop=%i, strand=%s, coverage=%0.3f, identity=%0.3f',
+                    ori_type, hit['sequence'], hit['sequence_start'], hit['sequence_stop'], hit['strand'], hit['coverage'], hit['identity']
                 )
 
     # combine overlapping hits (simple 1D array peak detection)
     oris = []
-    for contig in genome['contigs']:
-        contig_hits = hits.get(contig['id'], None)
-        if(contig_hits):
-            region_hits = [0] * (contig['length'] + 1)  # init with extra leading slot (start at 1)
-            for hit in contig_hits:
-                for i in range(hit['contig_start'], hit['contig_stop'] + 1):
+    for seq in genome['sequences']:
+        sequence_hits = hits.get(seq['id'], None)
+        if(sequence_hits):
+            region_hits = [0] * (seq['length'] + 1)  # init with extra leading slot (start at 1)
+            for hit in sequence_hits:
+                for i in range(hit['sequence_start'], hit['sequence_stop'] + 1):
                     region_hits[i] += 1
             start = -1
             stop = -1
@@ -91,11 +91,11 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di
                 if(hit_count == 0):
                     if(start != -1):  # new stop
                         stop = i - 1
-                        if(ori_type == bc.FEATURE_ORIC and contig['type'] == bc.REPLICON_PLASMID):
+                        if(ori_type == bc.FEATURE_ORIC and seq['type'] == bc.REPLICON_PLASMID):
                             ori_type = bc.FEATURE_ORIV
                         ori = OrderedDict()
                         ori['type'] = ori_type
-                        ori['contig'] = contig['id']
+                        ori['sequence'] = seq['id']
                         ori['start'] = start
                         ori['stop'] = stop
                         ori['strand'] = bc.STRAND_UNKNOWN
@@ -109,12 +109,12 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di
                         else:
                             ori['product'] = 'origin of replication'
 
-                        nt = bu.extract_feature_sequence(ori, contig)  # extract nt sequences
+                        nt = bu.extract_feature_sequence(ori, seq)  # extract nt sequences
                         ori['nt'] = nt
 
                         log.info(
-                            'type=%s, contig=%s, start=%i, stop=%i, nt=[%s..%s]',
-                            ori_type, ori['contig'], ori['start'], ori['stop'], nt[:10], nt[-10:]
+                            'type=%s, seq=%s, start=%i, stop=%i, nt=[%s..%s]',
+                            ori_type, ori['sequence'], ori['start'], ori['stop'], nt[:10], nt[-10:]
                         )
                         start = -1
                         stop = -1
diff --git a/bakta/features/r_rna.py b/bakta/features/r_rna.py
index 4c23d72a..640ebfeb 100644
--- a/bakta/features/r_rna.py
+++ b/bakta/features/r_rna.py
@@ -17,7 +17,7 @@
 log = logging.getLogger('R_RNA')
 
 
-def predict_r_rnas(genome: dict, contigs_path: Path):
+def predict_r_rnas(genome: dict, sequences_path: Path):
     """Search for ribosomal RNA sequences."""
 
     output_path = cfg.tmp_path.joinpath('rrna.tsv')
@@ -35,7 +35,7 @@ def predict_r_rnas(genome: dict, contigs_path: Path):
         cmd.append('-Z')
         cmd.append(str(2 * genome['size'] // 1000000))
     cmd.append(str(cfg.db_path.joinpath('rRNA')))
-    cmd.append(str(contigs_path))
+    cmd.append(str(sequences_path))
     log.debug('cmd=%s', cmd)
     proc = sp.run(
         cmd,
@@ -51,12 +51,12 @@ def predict_r_rnas(genome: dict, contigs_path: Path):
         raise Exception(f'cmscan error! error code: {proc.returncode}')
 
     rrnas = []
-    contigs = {c['id']: c for c in genome['contigs']}
+    sequences = {seq['id']: seq for seq in genome['sequences']}
     with output_path.open() as fh:
         for line in fh:
             if(line[0] != '#'):
                 (
-                    subject, accession, contig_id, contig_acc, mdl, mdl_from, mdl_to,
+                    subject, accession, sequence_id, sequence_acc, mdl, mdl_from, mdl_to,
                     start, stop, strand, trunc, passed, gc, bias, score, evalue,
                     inc, description
                 ) = bc.RE_MULTIWHITESPACE.split(line.strip(), maxsplit=17)
@@ -89,8 +89,8 @@ def predict_r_rnas(genome: dict, contigs_path: Path):
                     consensus_length = 2925
                 else:
                     log.warning(
-                        'unknown rRNA detected! accession=%s, contig=%s, start=%i, stop=%i, strand=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e',
-                        accession, contig_id, start, stop, strand, length, truncated, score, evalue
+                        'unknown rRNA detected! accession=%s, seq=%s, start=%i, stop=%i, strand=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e',
+                        accession, sequence_id, start, stop, strand, length, truncated, score, evalue
                     )
                     continue
 
@@ -100,13 +100,13 @@ def predict_r_rnas(genome: dict, contigs_path: Path):
 
                 if(coverage < HIT_COVERAGE):
                     log.debug(
-                        'discard low coverage: contig=%s, rRNA=%s, start=%i, stop=%i, strand=%s, length=%i, coverage=%0.3f, truncated=%s, score=%1.1f, evalue=%1.1e',
-                        contig_id, rrna_tag, start, stop, strand, length, coverage, truncated, score, evalue
+                        'discard low coverage: seq=%s, rRNA=%s, start=%i, stop=%i, strand=%s, length=%i, coverage=%0.3f, truncated=%s, score=%1.1f, evalue=%1.1e',
+                        sequence_id, rrna_tag, start, stop, strand, length, coverage, truncated, score, evalue
                     )
                 else:
                     rrna = OrderedDict()
                     rrna['type'] = bc.FEATURE_R_RNA
-                    rrna['contig'] = contig_id
+                    rrna['sequence'] = sequence_id
                     rrna['start'] = start
                     rrna['stop'] = stop
                     rrna['strand'] = bc.STRAND_FORWARD if strand == '+' else bc.STRAND_REVERSE
@@ -126,13 +126,13 @@ def predict_r_rnas(genome: dict, contigs_path: Path):
                     rrna['evalue'] = evalue
                     rrna['db_xrefs'] = db_xrefs
 
-                    nt = bu.extract_feature_sequence(rrna, contigs[contig_id])  # extract nt sequences
+                    nt = bu.extract_feature_sequence(rrna, sequences[sequence_id])  # extract nt sequences
                     rrna['nt'] = nt
 
                     rrnas.append(rrna)
                     log.info(
-                        'contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, length=%i, coverage=%0.3f, truncated=%s, score=%1.1f, evalue=%1.1e, nt=[%s..%s]',
-                        rrna['contig'], rrna['start'], rrna['stop'], rrna['strand'], rrna['gene'], rrna['product'], length, coverage, truncated, score, evalue, nt[:10], nt[-10:]
+                        'seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, length=%i, coverage=%0.3f, truncated=%s, score=%1.1f, evalue=%1.1e, nt=[%s..%s]',
+                        rrna['sequence'], rrna['start'], rrna['stop'], rrna['strand'], rrna['gene'], rrna['product'], length, coverage, truncated, score, evalue, nt[:10], nt[-10:]
                     )
 
     log.info('predicted=%i', len(rrnas))
diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py
index 74124c32..500d52ef 100644
--- a/bakta/features/s_orf.py
+++ b/bakta/features/s_orf.py
@@ -23,11 +23,11 @@
 def extract(genome: dict):
     """Predict open reading frames in mem via BioPython."""
     orfs = []
-    for contig in genome['contigs']:
-        dna_seq = Seq(contig['sequence'])
-        for strand, seq in [(bc.STRAND_FORWARD, dna_seq), (bc.STRAND_REVERSE, dna_seq.reverse_complement())]:  # strands +/-
+    for seq in genome['sequences']:
+        nt_seq = Seq(seq['sequence'])
+        for strand, strand_nt_seq in [(bc.STRAND_FORWARD, nt_seq), (bc.STRAND_REVERSE, nt_seq.reverse_complement())]:  # strands +/-
             for frame in range(3):  # frames 1/2/3 -> 0, 1, 2
-                seq_frame = seq[frame:]
+                seq_frame = strand_nt_seq[frame:]
 
                 # remove non-triplet tail nucleotides
                 residue = len(seq_frame) % 3
@@ -46,12 +46,12 @@ def extract(genome: dict):
                             dna_start = aa_start * 3 + frame + 1  # +1: 0 based idx to 1 based
                             dna_stop = aa_end * 3 + 2 + frame + 1
                         else:
-                            dna_start = len(seq) - frame - (aa_end + 1) * 3 + 1
-                            dna_stop = len(seq) - frame - aa_start * 3
+                            dna_start = len(strand_nt_seq) - frame - (aa_end + 1) * 3 + 1
+                            dna_stop = len(strand_nt_seq) - frame - aa_start * 3
                         
                         sorf = OrderedDict()
                         sorf['type'] = bc.FEATURE_SORF
-                        sorf['contig'] = contig['id']
+                        sorf['sequence'] = seq['id']
                         sorf['start'] = dna_start
                         sorf['stop'] = dna_stop
                         sorf['strand'] = strand
@@ -63,13 +63,13 @@ def extract(genome: dict):
                         sorf['aa_digest'] = aa_digest
                         sorf['aa_hexdigest'] = aa_hexdigest
                         
-                        nt = bu.extract_feature_sequence(sorf, contig)  # extract nt sequences
+                        nt = bu.extract_feature_sequence(sorf, seq)  # extract nt sequences
                         sorf['nt'] = nt
 
                         orfs.append(sorf)
                         log.debug(
-                            'contig=%s, start=%i, stop=%i, strand=%s, frame=%i, aa-length=%i, aa=%s, nt=[%s..%s]',
-                            contig['id'], sorf['start'], sorf['stop'], strand, frame, len(aa), aa, nt[:10], nt[-10:]
+                            'seq=%s, start=%i, stop=%i, strand=%s, frame=%i, aa-length=%i, aa=%s, nt=[%s..%s]',
+                            seq['id'], sorf['start'], sorf['stop'], strand, frame, len(aa), aa, nt[:10], nt[-10:]
                         )
                     aa_start = aa_seq.find('M', aa_start + 1)
                     if(aa_start > aa_end):
@@ -89,68 +89,68 @@ def get_feature_stop(feature: dict) -> int:
 
 def overlap_filter(genome: dict, orfs_raw: Sequence[dict]):
     """Filter in-mem ORFs by overlapping CDSs."""
-    t_rnas_per_contig = {k['id']: [] for k in genome['contigs']}
+    t_rnas_per_sequence = {seq['id']: [] for seq in genome['sequences']}
     for t_rna in genome['features'].get(bc.FEATURE_T_RNA, []):
-        t_rnas = t_rnas_per_contig[t_rna['contig']]
+        t_rnas = t_rnas_per_sequence[t_rna['sequence']]
         t_rnas.append(t_rna)
     for tm_rna in genome['features'].get(bc.FEATURE_TM_RNA, []):
-        t_rnas = t_rnas_per_contig[tm_rna['contig']]
+        t_rnas = t_rnas_per_sequence[tm_rna['sequence']]
         t_rnas.append(tm_rna)
 
-    r_rna_per_contig = {k['id']: [] for k in genome['contigs']}
+    r_rna_per_sequence = {seq['id']: [] for seq in genome['sequences']}
     for r_rna in genome['features'].get(bc.FEATURE_R_RNA, []):
-        r_rnas = r_rna_per_contig[r_rna['contig']]
+        r_rnas = r_rna_per_sequence[r_rna['sequence']]
         r_rnas.append(r_rna)
 
-    # nc_rnas_per_contig = {k['id']: [] for k in genome['contigs']}
+    # nc_rnas_per_sequence = {k['id']: [] for k in genome['sequences']}
     # for nc_rna in genome['features'].get(bc.FEATURE_NC_RNA, []):
-    #     nc_rnas = nc_rnas_per_contig[nc_rna['contig']]
+    #     nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']]
     #     nc_rnas.append(nc_rna)
     # for nc_rna in genome['features'].get(bc.FEATURE_NC_RNA_REGION, []):
-    #     nc_rnas = nc_rnas_per_contig[nc_rna['contig']]
+    #     nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']]
     #     nc_rnas.append(nc_rna)
 
-    crispr_arrays_per_contig = {k['id']: [] for k in genome['contigs']}
+    crispr_arrays_per_sequence = {seq['id']: [] for seq in genome['sequences']}
     for crispr_array in genome['features'].get(bc.FEATURE_CRISPR, []):
-        crispr_arrays = crispr_arrays_per_contig[crispr_array['contig']]
+        crispr_arrays = crispr_arrays_per_sequence[crispr_array['sequence']]
         crispr_arrays.append(crispr_array)
 
-    cdss_per_contig = {k['id']: [] for k in genome['contigs']}
+    cdss_per_sequence = {k['id']: [] for k in genome['sequences']}
     for cds in genome['features'].get(bc.FEATURE_CDS, []):
-        cdss = cdss_per_contig[cds['contig']]
+        cdss = cdss_per_sequence[cds['sequence']]
         cdss.append(cds)
 
-    sorfs_per_contig = {k['id']: [] for k in genome['contigs']}
+    sorfs_per_sequence = {seq['id']: [] for seq in genome['sequences']}
     for sorf in orfs_raw:
-        orfs = sorfs_per_contig[sorf['contig']]
+        orfs = sorfs_per_sequence[sorf['sequence']]
         orfs.append(sorf)
 
     discarded_sorf_keys = set()
     with cf.ProcessPoolExecutor(max_workers=cfg.threads) as tpe:
         futures = []
-        for contig in genome['contigs']:
-            contig_sorfs = sorfs_per_contig[contig['id']]
-            log.debug('filter: contig=%s, # sORFs=%i', contig['id'], len(contig_sorfs))
-            if(len(contig_sorfs) < 100):  # execute sORF filter task
-                sorf_keys = filter_sorf(contig_sorfs, cdss_per_contig[contig['id']], r_rna_per_contig[contig['id']], t_rnas_per_contig[contig['id']], crispr_arrays_per_contig[contig['id']])
+        for seq in genome['sequences']:
+            sequence_sorfs = sorfs_per_sequence[seq['id']]
+            log.debug('filter: seq=%s, # sORFs=%i', seq['id'], len(sequence_sorfs))
+            if(len(sequence_sorfs) < 100):  # execute sORF filter task
+                sorf_keys = filter_sorf(sequence_sorfs, cdss_per_sequence[seq['id']], r_rna_per_sequence[seq['id']], t_rnas_per_sequence[seq['id']], crispr_arrays_per_sequence[seq['id']])
                 for sorf_key in [sk for sk in sorf_keys if sk is not None]:
                     discarded_sorf_keys.add(sorf_key)
-            elif(len(contig_sorfs) < 1000):  # submit sORF filter task to thread pool
-                futures.append(tpe.submit(filter_sorf, contig_sorfs, cdss_per_contig[contig['id']], r_rna_per_contig[contig['id']], t_rnas_per_contig[contig['id']], crispr_arrays_per_contig[contig['id']]))
+            elif(len(sequence_sorfs) < 1000):  # submit sORF filter task to thread pool
+                futures.append(tpe.submit(filter_sorf, sequence_sorfs, cdss_per_sequence[seq['id']], r_rna_per_sequence[seq['id']], t_rnas_per_sequence[seq['id']], crispr_arrays_per_sequence[seq['id']]))
             else:  # submit sORF chunk filter tasks to thread pool
-                chunk_size = math.ceil(len(contig_sorfs) / cfg.threads) if (len(contig_sorfs) >= cfg.threads * 1000) else 1000
+                chunk_size = math.ceil(len(sequence_sorfs) / cfg.threads) if (len(sequence_sorfs) >= cfg.threads * 1000) else 1000
                 log.debug('filter: chunk-size=%i', chunk_size)
-                for i in range(0, len(contig_sorfs), chunk_size):
-                    sorf_chunk = contig_sorfs[i:i + chunk_size]
+                for i in range(0, len(sequence_sorfs), chunk_size):
+                    sorf_chunk = sequence_sorfs[i:i + chunk_size]
                     log.debug('filter chunk: i=%i, chunk-elements=%i', i, len(sorf_chunk))
-                    futures.append(tpe.submit(filter_sorf, sorf_chunk, cdss_per_contig[contig['id']], r_rna_per_contig[contig['id']], t_rnas_per_contig[contig['id']], crispr_arrays_per_contig[contig['id']]))
+                    futures.append(tpe.submit(filter_sorf, sorf_chunk, cdss_per_sequence[seq['id']], r_rna_per_sequence[seq['id']], t_rnas_per_sequence[seq['id']], crispr_arrays_per_sequence[seq['id']]))
         for f in futures:
             for sorf_key in [sk for sk in f.result() if sk is not None]:
                 discarded_sorf_keys.add(sorf_key)
 
     valid_sorfs = []
     discarded_sorfs = []
-    for sorfs in sorfs_per_contig.values():
+    for sorfs in sorfs_per_sequence.values():
         for sorf in sorfs:
             key = orf.get_orf_key(sorf)
             if(key in discarded_sorf_keys):
@@ -162,12 +162,12 @@ def overlap_filter(genome: dict, orfs_raw: Sequence[dict]):
     return valid_sorfs, discarded_sorfs
 
 
-def filter_sorf(sorf_chunk: Sequence[dict], contig_cdss: Sequence[dict], contig_r_rnas: Sequence[dict], contig_t_rnas: Sequence[dict], contig_crispr_arrays: Sequence[dict]):
+def filter_sorf(sorf_chunk: Sequence[dict], sequence_cdss: Sequence[dict], sequence_r_rnas: Sequence[dict], sequence_t_rnas: Sequence[dict], sequence_crispr_arrays: Sequence[dict]):
     discarded_sorf_keys = []
     for sorf in sorf_chunk:
         break_flag = False
         # filter CDS overlapping ORFs
-        for cds in contig_cdss:
+        for cds in sequence_cdss:
             # log.debug('filter short ORFs by CDS: %s%i[%i->%i]', cds['strand'], cds['frame'], cds['start'], cds['stop'])
             if(sorf['strand'] == cds['strand']):
                 if(sorf['frame'] == cds['frame']):
@@ -218,7 +218,7 @@ def filter_sorf(sorf_chunk: Sequence[dict], contig_cdss: Sequence[dict], contig_
             continue
 
         # filter rRNA overlapping ORFs
-        for r_rna in contig_r_rnas:
+        for r_rna in sequence_r_rnas:
             # log.debug('filter short ORFs by rRNA: %s[%i->%i]', r_rna['strand'], r_rna['start'], r_rna['stop'])
             # fast/simple overlap detection for rRNAs
             if(sorf['stop'] < r_rna['start'] or sorf['start'] > r_rna['stop']):
@@ -232,7 +232,7 @@ def filter_sorf(sorf_chunk: Sequence[dict], contig_cdss: Sequence[dict], contig_
 
         # filter tRNA overlapping ORFs
         # log.debug('filter short ORFs by tRNA: %s[%i->%i]', t_rna['strand'], t_rna['start'], t_rna['stop'])
-        for t_rna in contig_t_rnas:
+        for t_rna in sequence_t_rnas:
             # fast/simple overlap detection for tRNAs
             if(sorf['stop'] < t_rna['start'] or sorf['start'] > t_rna['stop']):
                 continue
@@ -245,7 +245,7 @@ def filter_sorf(sorf_chunk: Sequence[dict], contig_cdss: Sequence[dict], contig_
 
         # filter CRISPR array overlapping ORFs
         # log.debug('filter short ORFs by CRISPR: [%i->%i]', crispr['start'], crispr['stop'])
-        for crispr in contig_crispr_arrays:
+        for crispr in sequence_crispr_arrays:
             # fast/simple overlap detection for CRISPR
             if(sorf['stop'] < crispr['start'] or sorf['start'] > crispr['stop']):
                 continue
@@ -365,8 +365,8 @@ def search(sorfs: Sequence[dict], cluster_type: str):
                 result[psc.DB_PSC_COL_UNIREF90 if cluster_type == 'full' else pscc.DB_PSCC_COL_UNIREF50] = cluster_id
                 sorf['psc' if cluster_type == 'full' else 'pscc'] = result
                 log.info(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef=%s',
-                    sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
+                    'homology: seq=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef=%s',
+                    sorf['sequence'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
 
     sorfs_found = []
diff --git a/bakta/features/signal_peptides.py b/bakta/features/signal_peptides.py
index f1197f2a..4657f730 100644
--- a/bakta/features/signal_peptides.py
+++ b/bakta/features/signal_peptides.py
@@ -62,8 +62,8 @@ def search(orfs: Sequence[dict], orf_aa_path: Path):
                         orf[bc.FEATURE_SIGNAL_PEPTIDE] = {}
                     orf[bc.FEATURE_SIGNAL_PEPTIDE] = sig_pep
                     log.debug(
-                        'hit: contig=%s, nt-start=%i, nt-stop=%i, aa-start=%i, aa-stop=%i, score=%0.2f',
-                        orf['contig'], start_nt, stop_nt, start_aa, stop_aa, score
+                        'hit: seq=%s, nt-start=%i, nt-stop=%i, aa-start=%i, aa-stop=%i, score=%0.2f',
+                        orf['sequence'], start_nt, stop_nt, start_aa, stop_aa, score
                     )
                     sig_peps.append(sig_pep)
                 else:
diff --git a/bakta/features/t_rna.py b/bakta/features/t_rna.py
index efa4a39b..901d3d9d 100644
--- a/bakta/features/t_rna.py
+++ b/bakta/features/t_rna.py
@@ -42,7 +42,7 @@
 }
 
 
-def predict_t_rnas(genome: dict, contigs_path: Path):
+def predict_t_rnas(genome: dict, sequences_path: Path):
     """Search for tRNA sequences."""
 
     txt_output_path = cfg.tmp_path.joinpath('trna.tsv')
@@ -53,7 +53,7 @@ def predict_t_rnas(genome: dict, contigs_path: Path):
         '--output', str(txt_output_path),
         '--fasta', str(fasta_output_path),
         '--thread', str(cfg.threads),
-        str(contigs_path)
+        str(sequences_path)
     ]
     log.debug('cmd=%s', cmd)
     proc = sp.run(
@@ -70,20 +70,20 @@ def predict_t_rnas(genome: dict, contigs_path: Path):
         raise Exception(f'tRNAscan-SE error! error code: {proc.returncode}')
 
     trnas = {}
-    contigs = {c['id']: c for c in genome['contigs']}
+    sequences = {seq['id']: seq for seq in genome['sequences']}
     with txt_output_path.open() as fh:
         for line in fh.readlines()[3:]:  # skip first 3 lines
-            (contig_id, trna_id, start, stop, trna_type, anti_codon, intron_begin, bounds_end, score, note) = line.split('\t')
+            (sequence_id, trna_id, start, stop, trna_type, anti_codon, intron_begin, bounds_end, score, note) = line.split('\t')
 
             start, stop, strand = int(start), int(stop), bc.STRAND_FORWARD
             if(start > stop):  # reverse
                 start, stop = stop, start
                 strand = bc.STRAND_REVERSE
-            contig_id = contig_id.strip()  # bugfix for extra single whitespace in tRNAscan-SE output
+            sequence_id = sequence_id.strip()  # bugfix for extra single whitespace in tRNAscan-SE output
 
             trna = OrderedDict()
             trna['type'] = bc.FEATURE_T_RNA
-            trna['contig'] = contig_id
+            trna['sequence'] = sequence_id
             trna['start'] = start
             trna['stop'] = stop
             trna['strand'] = strand
@@ -101,7 +101,7 @@ def predict_t_rnas(genome: dict, contigs_path: Path):
 
             trna['score'] = float(score)
 
-            nt = bu.extract_feature_sequence(trna, contigs[contig_id])  # extract nt sequences
+            nt = bu.extract_feature_sequence(trna, sequences[sequence_id])  # extract nt sequences
             trna['nt'] = nt
 
             trna['db_xrefs'] = []
@@ -109,11 +109,11 @@ def predict_t_rnas(genome: dict, contigs_path: Path):
             if(so_term):
                 trna['db_xrefs'].append(so_term.id)
 
-            key = f'{contig_id}.trna{trna_id}'
+            key = f'{sequence_id}.trna{trna_id}'
             trnas[key] = trna
             log.info(
-                'contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, score=%1.1f, nt=[%s..%s]',
-                trna['contig'], trna['start'], trna['stop'], trna['strand'], trna.get('gene', ''), trna['product'], trna['score'], nt[:10], nt[-10:]
+                'seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, score=%1.1f, nt=[%s..%s]',
+                trna['sequence'], trna['start'], trna['stop'], trna['strand'], trna.get('gene', ''), trna['product'], trna['score'], nt[:10], nt[-10:]
             )
 
     with fasta_output_path.open() as fh:
diff --git a/bakta/features/tm_rna.py b/bakta/features/tm_rna.py
index 17abd1aa..26d0bc6c 100644
--- a/bakta/features/tm_rna.py
+++ b/bakta/features/tm_rna.py
@@ -13,7 +13,7 @@
 log = logging.getLogger('TM_RNA')
 
 
-def predict_tm_rnas(genome: dict, contigs_path: Path):
+def predict_tm_rnas(genome: dict, sequences_path: Path):
     """Search for tmRNA sequences."""
 
     txt_output_path = cfg.tmp_path.joinpath('tmrna.tsv')
@@ -23,7 +23,7 @@ def predict_tm_rnas(genome: dict, contigs_path: Path):
         f'-gc{cfg.translation_table}',
         '-w',  # batch mode
         '-o', str(txt_output_path),
-        str(contigs_path)
+        str(sequences_path)
     ]
     if(cfg.complete):
         cmd.append('-c')  # complete circular sequence(s)
@@ -45,14 +45,14 @@ def predict_tm_rnas(genome: dict, contigs_path: Path):
         raise Exception(f'aragorn error! error code: {proc.returncode}')
 
     tmrnas = []
-    contigs = {c['id']: c for c in genome['contigs']}
+    sequences = {seq['id']: seq for seq in genome['sequences']}
     with txt_output_path.open() as fh:
-        contig_id = None
+        sequence_id = None
         for line in fh:
             line = line.strip()
             cols = line.split()
             if(line[0] == '>'):
-                contig_id = cols[0][1:]
+                sequence_id = cols[0][1:]
             elif(len(cols) == 5):
                 (nr, type, location, tag_location, tag_aa) = line.split()
                 strand = bc.STRAND_FORWARD
@@ -66,7 +66,7 @@ def predict_tm_rnas(genome: dict, contigs_path: Path):
                 if(start > 0 and stop > 0):  # prevent edge tmRNA on linear sequences
                     tmrna = OrderedDict()
                     tmrna['type'] = bc.FEATURE_TM_RNA
-                    tmrna['contig'] = contig_id
+                    tmrna['sequence'] = sequence_id
                     tmrna['start'] = start
                     tmrna['stop'] = stop
                     tmrna['strand'] = strand
@@ -75,7 +75,7 @@ def predict_tm_rnas(genome: dict, contigs_path: Path):
                     tmrna['tag_aa'] = tag_aa.replace('*', '')
                     tmrna['db_xrefs'] = [so.SO_TMRNA.id]
 
-                    nt = bu.extract_feature_sequence(tmrna, contigs[contig_id])  # extract nt sequences
+                    nt = bu.extract_feature_sequence(tmrna, sequences[sequence_id])  # extract nt sequences
                     tmrna['nt'] = nt
 
                     if(start > stop):
@@ -83,8 +83,8 @@ def predict_tm_rnas(genome: dict, contigs_path: Path):
 
                     tmrnas.append(tmrna)
                     log.info(
-                        'contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s]',
-                        tmrna['contig'], tmrna['start'], tmrna['stop'], tmrna['strand'], tmrna['gene'], tmrna['product'], nt[:10], nt[-10:]
+                        'seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s]',
+                        tmrna['sequence'], tmrna['start'], tmrna['stop'], tmrna['strand'], tmrna['gene'], tmrna['product'], nt[:10], nt[-10:]
                     )
     log.info('predicted=%i', len(tmrnas))
     return tmrnas
diff --git a/bakta/io.py b/bakta/io.py
new file mode 100644
index 00000000..64c009e8
--- /dev/null
+++ b/bakta/io.py
@@ -0,0 +1,196 @@
+import atexit
+import logging
+import os
+import sys
+
+from pathlib import Path
+
+import bakta
+import bakta.constants as bc
+import bakta.config as cfg
+import bakta.utils as bu
+import bakta.io.fasta as fasta
+import bakta.io.json as json
+import bakta.io.tsv as tsv
+import bakta.io.gff as gff
+import bakta.io.insdc as insdc
+import bakta.plot as plot
+
+
+log = logging.getLogger('IO')
+
+
+def main():
+    """Load a Bakta JSON annotation file and re-export it in all supported output formats."""
+    parser = bu.init_parser(sub_command='_proteins')  # NOTE(review): sub-command name looks copied from the proteins entry point - confirm
+    parser.add_argument('input', metavar='<input>', help='Bakta annotations in JSON format')
+    
+    arg_group_io = parser.add_argument_group('Input / Output')
+    arg_group_io.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
+    arg_group_io.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files')
+    arg_group_io.add_argument('--force', '-f', action='store_true', help='Force overwriting existing output folder')
+    
+    arg_group_general = parser.add_argument_group('General')
+    arg_group_general.add_argument('--help', '-h', action='help', help='Show this help message and exit')
+    arg_group_general.add_argument('--verbose', '-v', action='store_true', help='Print verbose information')
+    arg_group_general.add_argument('--debug', action='store_true', help='Run Bakta in debug mode. Temp data will not be removed.')
+    arg_group_general.add_argument('--version', '-V', action='version', version=f'%(prog)s {bakta.__version__}')
+    args = parser.parse_args()
+
+    ############################################################################
+    # Setup logging
+    ############################################################################
+    cfg.prefix = args.prefix if args.prefix else Path(args.input).stem
+    output_path = cfg.check_output_path(args.output, args.force)
+    cfg.force = args.force
+    log.info('force=%s', args.force)
+    
+    bu.setup_logger(output_path, cfg.prefix, args)
+    log.info('prefix=%s', cfg.prefix)
+    log.info('output=%s', output_path)
+
+    ############################################################################
+    # Checks and configurations
+    # - validate the JSON annotation input path
+    # - set up the temp directory and the debug / verbose flags
+    # - register the temp directory cleanup hook (skipped in debug mode)
+    ############################################################################
+    try:
+        if args.input == '':
+            raise ValueError('File path argument must be non-empty')
+        annotation_path = Path(args.input).resolve()
+        cfg.check_readability('annotation', annotation_path)
+        cfg.check_content_size('annotation', annotation_path)
+    except:
+        log.error('provided annotation file not valid! path=%s', args.input)
+        sys.exit(f'ERROR: annotation file ({args.input}) not valid!')
+    log.info('input-path=%s', annotation_path)
+    
+    cfg.check_tmp_path(args)
+    cfg.debug = args.debug
+    log.info('debug=%s', cfg.debug)
+    cfg.verbose = True if cfg.debug else args.verbose
+    log.info('verbose=%s', cfg.verbose)
+    cfg.user_proteins = cfg.check_user_proteins(args)  # NOTE(review): no proteins option is registered on this parser - confirm the helper tolerates that
+    
+    if(cfg.verbose):
+        print(f'Bakta v{bakta.__version__}')
+        print('Options and arguments:')
+        print(f'\tinput: {annotation_path}')
+        print(f'\toutput: {cfg.output_path}')
+        print(f'\tprefix: {cfg.prefix}')
+        if(cfg.force): print(f'\tforce: {cfg.force}')
+    
+    if(cfg.debug):
+        print(f"\nBakta runs in DEBUG mode! Temporary data will not be destroyed at: {cfg.tmp_path}")
+    else:
+        atexit.register(bu.cleanup, log, cfg.tmp_path)  # register cleanup exit hook
+    
+    ############################################################################
+    # Import annotations from JSON
+    ############################################################################
+    print('Parse genome annotations...')
+    with annotation_path.open('r') as fh:
+        annotation = json.load(fh)
+    features = annotation['features']
+    sequences = annotation['sequences']
+    genome = {
+        'features': features,
+        'sequences': sequences,  # key must be plural: every consumer below reads genome['sequences']
+        'size': sum(seq['length'] for seq in sequences),  # total length, read below for the summary file
+        'taxon': annotation['genome']
+    }
+    features_by_sequence = {k['id']: [] for k in genome['sequences']}
+    for feature in genome['features']:
+        features_by_sequence[feature['sequence']].append(feature)  # a KeyError here names a feature whose sequence id is unknown
+
+    ############################################################################
+    # Write output files
+    # - TSV, GFF3, GenBank / EMBL and Fasta exports
+    # - optional circular genome plot
+    # - hypothetical CDS table and protein sequences
+    # - plain-text genome and annotation summary
+    ############################################################################
+    print(f'\nExport annotation results to: {cfg.output_path}')
+    print('\thuman readable TSV...')
+    tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv')
+    tsv.write_features(genome['sequences'], features_by_sequence, tsv_path)
+
+    print('\tGFF3...')
+    gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3')
+    gff.write_features(genome, features_by_sequence, gff3_path)
+
+    print('\tINSDC GenBank & EMBL...')
+    genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff')
+    embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl')
+    insdc.write_features(genome, features, genbank_path, embl_path)
+
+    print('\tgenome sequences...')
+    fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna')
+    fasta.export_sequences(genome['sequences'], fna_path, description=True, wrap=True)
+
+    print('\tfeature nucleotide sequences...')
+    ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn')
+    fasta.write_ffn(features, ffn_path)
+
+    print('\ttranslated CDS sequences...')
+    faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.faa')
+    fasta.write_faa(features, faa_path)
+
+    print('\tfeature inferences...')
+    tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv')
+    tsv.write_feature_inferences(genome['sequences'], features_by_sequence, tsv_path)
+
+    if(cfg.skip_plot  or  cfg.meta):
+        print('\tskip generation of circular genome plot...')
+    else:
+        print('\tcircular genome plot...')
+        plot.write(features, genome['sequences'], cfg.output_path)
+
+    if(cfg.skip_cds is False):
+        hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]
+        print('\thypothetical TSV...')
+        tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv')
+        tsv.write_hypotheticals(hypotheticals, tsv_path)
+
+        print('\ttranslated hypothetical CDS sequences...')
+        faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.faa')
+        fasta.write_faa(hypotheticals, faa_path)
+
+    print('\tGenome and annotation summary...')
+    summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt')
+    with summary_path.open('w') as fh_out:
+        genome_stats = bu.calc_genome_stats(genome, features)
+        fh_out.write('Sequence(s):\n')
+        fh_out.write(f"Length: {genome['size']:}\n")
+        fh_out.write(f"Count: {len(genome['sequences'])}\n")
+        fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n")
+        fh_out.write(f"N50: {genome_stats['n50']:}\n")
+        fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n")
+        fh_out.write(f"coding density: {100 * genome_stats['coding_ratio']:.1f}\n")
+        fh_out.write('\nAnnotation:\n')
+        fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n")
+        fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n")
+        fh_out.write(f"rRNAs: {len([f for f in features if f['type'] == bc.FEATURE_R_RNA])}\n")
+        fh_out.write(f"ncRNAs: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA])}\n")
+        fh_out.write(f"ncRNA regions: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA_REGION])}\n")
+        fh_out.write(f"CRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}\n")
+        cdss = [f for f in features if f['type'] == bc.FEATURE_CDS]
+        fh_out.write(f"CDSs: {len(cdss)}\n")
+        fh_out.write(f"pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}\n")
+        fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n")
+        fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n")
+        fh_out.write(f"sORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}\n")
+        fh_out.write(f"gaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}\n")
+        fh_out.write(f"oriCs: {len([f for f in features if f['type'] == bc.FEATURE_ORIC])}\n")
+        fh_out.write(f"oriVs: {len([f for f in features if f['type'] == bc.FEATURE_ORIV])}\n")
+        fh_out.write(f"oriTs: {len([f for f in features if f['type'] == bc.FEATURE_ORIT])}\n")
+        fh_out.write('\nBakta:\n')
+        fh_out.write(f'Software: v{bakta.__version__}\n')
+        fh_out.write(f"Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
+        fh_out.write('DOI: 10.1099/mgen.0.000685\n')
+        fh_out.write('URL: github.com/oschwengers/bakta\n')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/bakta/io/fasta.py b/bakta/io/fasta.py
index 240acbf8..076eae4d 100644
--- a/bakta/io/fasta.py
+++ b/bakta/io/fasta.py
@@ -18,60 +18,60 @@
 FASTA_LINE_WRAPPING = 60
 
 
-def import_contigs(contigs_path: Path, is_genomic: bool=True, is_dna: bool=True) -> Sequence[dict]:
-    """Import raw contigs."""
-    contigs = []
-    with xopen(str(contigs_path), threads=0) as fh:
+def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=True) -> Sequence[dict]:
+    """Import raw sequences from a Fasta file, dropping alignment dashes and validating the alphabet."""
+    sequences = []
+    with xopen(str(sequences_path), threads=0) as fh:
         for record in SeqIO.parse(fh, 'fasta'):
-            seq = str(record.seq).upper()
-            if('-' in seq):
-                dash_count = seq.count('-')
-                seq = seq.replace('-', '')
+            raw_sequence = str(record.seq).upper()
+            if('-' in raw_sequence):
+                dash_count = raw_sequence.count('-')
+                raw_sequence = raw_sequence.replace('-', '')
                 log.info('import: Discarded alignment gaps (dashes): id=%s, occurences=%i', record.id, dash_count)
             if(is_dna):
-                if(FASTA_DNA_SEQUENCE_PATTERN.fullmatch(seq) is None):
+                if(FASTA_DNA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                     log.error('import: Fasta sequence contains invalid DNA characters! id=%s', record.id)
                     raise ValueError(f'Fasta sequence contains invalid DNA characters! id={record.id}')
             else:
-                if(seq[-1] == '*'):  # remove trailing stop asterik
-                    seq = seq[:-1]
-                    log.debug('import: Removed trailing asterik! id=%s, seq=%s', record.id, seq)
-                if(FASTA_AA_SEQUENCE_PATTERN.fullmatch(seq) is None):
-                    log.error('import: Fasta sequence contains invalid AA characters! id=%s, seq=%s', record.id, seq)
+                if(raw_sequence[-1] == '*'):  # remove trailing stop asterisk
+                    raw_sequence = raw_sequence[:-1]
+                    log.debug('import: Removed trailing asterik! id=%s, seq=%s', record.id, raw_sequence)
+                if(FASTA_AA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
+                    log.error('import: Fasta sequence contains invalid AA characters! id=%s, seq=%s', record.id, raw_sequence)
                     raise ValueError(f'Fasta sequence contains invalid AA characters! id={record.id}')
 
-            contig = {
+            sequence = {
                 'id': record.id,
                 'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else '',
-                'sequence': seq,
-                'length': len(seq)
+                'sequence': raw_sequence,
+                'length': len(raw_sequence)
             }
             if(is_genomic):
-                contig['complete'] = False
-                contig['type'] = bc.REPLICON_CONTIG
-                contig['topology'] = bc.TOPOLOGY_LINEAR
+                sequence['complete'] = False
+                sequence['type'] = bc.REPLICON_CONTIG
+                sequence['topology'] = bc.TOPOLOGY_LINEAR
             log.info(
                 'imported: id=%s, length=%i, description=%s, genomic=%s, dna=%s',
-                contig['id'], contig['length'], contig['description'], is_genomic, is_dna
+                sequence['id'], sequence['length'], sequence['description'], is_genomic, is_dna
             )
-            contigs.append(contig)
-    return contigs
+            sequences.append(sequence)
+    return sequences
 
 
-def export_contigs(contigs: Sequence[dict], fasta_path: Path, description: bool=False, wrap: bool=False):
-    """Write contigs to Fasta file."""
+def export_sequences(sequences: Sequence[dict], fasta_path: Path, description: bool=False, wrap: bool=False):
+    """Write sequences to a Fasta file; headers optionally carry the description and sequences are optionally wrapped."""
     log.info('write genome sequences: path=%s, description=%s, wrap=%s', fasta_path, description, wrap)
 
     with fasta_path.open('wt') as fh:
-        for contig in contigs:
+        for seq in sequences:
             if(description):
-                fh.write(f">{contig['id']} {contig['description']}\n")
+                fh.write(f">{seq['id']} {seq['description']}\n")
             else:
-                fh.write(f">{contig['id']}\n")
+                fh.write(f">{seq['id']}\n")
             if(wrap):
-                fh.write(wrap_sequence(contig['sequence']))
+                fh.write(wrap_sequence(seq['sequence']))
             else:
-                fh.write(contig['sequence'])
+                fh.write(seq['sequence'])
                 fh.write('\n')
 
 
diff --git a/bakta/io/gff.py b/bakta/io/gff.py
index 3f615a54..9d7cd355 100644
--- a/bakta/io/gff.py
+++ b/bakta/io/gff.py
@@ -14,7 +14,7 @@
 log = logging.getLogger('GFF')
 
 
-def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Path):
+def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_path: Path):
     """Export features in GFF3 format."""
     log.info('write features: path=%s', gff3_path)
 
@@ -31,24 +31,24 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
         fh.write(f'# DOI: {bc.BAKTA_DOI}\n')
         fh.write(f'# URL: {bc.BAKTA_URL}\n')
 
-        for contig in genome['contigs']:  # write features
-            fh.write(f"##sequence-region {contig['id']} 1 {contig['length']}\n")  # sequence region
+        for seq in genome['sequences']:  # write features
+            fh.write(f"##sequence-region {seq['id']} 1 {seq['length']}\n")  # sequence region
 
             # write landmark region
             annotations = {
-                'ID': contig['id'],
-                'Name': contig['id']
+                'ID': seq['id'],
+                'Name': seq['id']
             }
-            if(contig['topology'] == bc.TOPOLOGY_CIRCULAR):
+            if(seq['topology'] == bc.TOPOLOGY_CIRCULAR):
                 annotations['Is_circular'] = 'true'
             annotations = encode_annotations(annotations)
-            fh.write(f"{contig['id']}\tBakta\tregion\t1\t{str(contig['length'])}\t.\t+\t.\t{annotations}\n")
+            fh.write(f"{seq['id']}\tBakta\tregion\t1\t{str(seq['length'])}\t.\t+\t.\t{annotations}\n")
 
-            for feat in features_by_contig[contig['id']]:
+            for feat in features_by_sequence[seq['id']]:
                 start = feat['start']
                 stop = feat['stop']
                 if('edge' in feat):
-                    stop += contig['length']
+                    stop += seq['length']
 
                 if(feat['type'] == bc.FEATURE_T_RNA):
                     annotations = {
@@ -82,9 +82,9 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         if(bc.PSEUDOGENE in feat):
                             gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['contig']}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{feat['sequence']}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['contig']}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_TM_RNA):
                     annotations = {
                         'ID': feat['locus'],
@@ -110,9 +110,9 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         if('truncated' in feat):
                             gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['contig']}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{feat['sequence']}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['contig']}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_R_RNA):
                     annotations = {
                         'ID': feat['locus'],
@@ -138,9 +138,9 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         if('truncated' in feat):
                             gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{feat['sequence']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['contig']}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_NC_RNA):
                     annotations = {
                         'ID': feat['locus'],
@@ -167,9 +167,9 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         if('truncated' in feat):
                             gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{feat['sequence']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['contig']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_NC_RNA_REGION):
                     annotations = {
                         'ID': feat['id'],
@@ -185,7 +185,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                         annotations[bc.INSDC_FEATURE_REGULATORY_CLASS] = insdc.select_regulatory_class(feat)
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['contig']}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_CRISPR):
                     annotations = {
                         'ID': feat['id'],
@@ -201,7 +201,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         annotations[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct'
                         annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus']
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                     if(not cfg.compliant):
                         i = 0
                         while i < len(feat['spacers']):
@@ -211,7 +211,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                                 'Parent': feat['id']
                             }
                             annotations = encode_annotations(annotations)
-                            fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
+                            fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                             spacer = feat['spacers'][i]
                             annotations = {
                                 'ID': f"{feat['id']}_spacer_{i+1}",
@@ -219,13 +219,13 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                                 'sequence': spacer['sequence']
                             }
                             annotations = encode_annotations(annotations)
-                            fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
+                            fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
                             i += 1
                         if(len(feat['repeats']) - 1 == i):
                             repeat = feat['repeats'][i]
                             annotations = { 'ID': f"{feat['id']}_repeat_{i+1}" }
                             annotations = encode_annotations(annotations)
-                            fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
+                            fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_CDS):
                     annotations = {
                         'ID': feat['locus'],
@@ -258,7 +258,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         if(bc.PSEUDOGENE in feat):
                             gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['contig']}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{feat['sequence']}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     if('exception' in feat):
                         ex = feat['exception']
                         pos = f"{ex['start']}..{ex['stop']}"
@@ -270,7 +270,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         if('Notes' not in annotations):
                             annotations['Note'] = notes
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['contig']}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                     if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                         write_signal_peptide(fh, feat)
                 elif(feat['type'] == bc.FEATURE_SORF):
@@ -298,9 +298,9 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         if(feat.get('gene', None)):
                             gene_annotations['gene'] = feat['gene']
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['contig']}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{feat['sequence']}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['contig']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                     if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                         write_signal_peptide(fh, feat)
                 elif(feat['type'] == bc.FEATURE_GAP):
@@ -310,7 +310,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         'product': f"gap ({feat['length']} bp)"
                     }
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['contig']}\tBakta\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\tBakta\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_ORIC):
                     annotations = {
                         'ID': feat['id'],
@@ -323,7 +323,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         annotations['inference'] = 'similar to DNA sequence'
                     annotations = encode_annotations(annotations)
                     feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
-                    fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_ORIV):
                     annotations = {
                         'ID': feat['id'],
@@ -336,7 +336,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         annotations['inference'] = 'similar to DNA sequence'
                     annotations = encode_annotations(annotations)
                     feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name
-                    fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] == bc.FEATURE_ORIT):
                     annotations = {
                         'ID': feat['id'],
@@ -349,13 +349,13 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path:
                         annotations['inference'] = 'similar to DNA sequence'
                     annotations = encode_annotations(annotations)
                     feat_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER if cfg.compliant else so.SO_ORIT.name
-                    fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
 
         if(not cfg.compliant):
             fh.write('##FASTA\n')
-            for contig in genome['contigs']:  # write sequences
-                fh.write(f">{contig['id']}\n")
-                fh.write(fasta.wrap_sequence(contig['sequence']))
+            for seq in genome['sequences']:  # write sequences
+                fh.write(f">{seq['id']}\n")
+                fh.write(fasta.wrap_sequence(seq['sequence']))
     return
 
 
@@ -393,4 +393,4 @@ def write_signal_peptide(fh, feat: dict):
         'Parent': feat['locus']
     }
     annotations = encode_annotations(annotations)
-    fh.write(f"{feat['contig']}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n")
+    fh.write(f"{feat['sequence']}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n")
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
index da56e3ef..d1d380e0 100644
--- a/bakta/io/insdc.py
+++ b/bakta/io/insdc.py
@@ -21,9 +21,9 @@
 def write_features(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
     log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path)
 
-    contig_list = []
-    for contig in genome['contigs']:
-        contig_features = [feat for feat in features if feat['contig'] == contig['id']]
+    sequence_list = []
+    for seq in genome['sequences']:
+        sequence_features = [feat for feat in features if feat['sequence'] == seq['id']]
         comment = (
             'Annotated with Bakta',
             f"Software: v{bakta.__version__}\n",
@@ -33,24 +33,24 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path:
             '\n',
             '##Genome Annotation Summary:##\n',
             f"{'Annotation Date':<30} :: {datetime.now().strftime('%m/%d/%Y, %H:%M:%S')}\n",
-            f"{'CDSs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_CDS or feat['type'] == bc.FEATURE_SORF]):5,}\n",
-            f"{'tRNAs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_T_RNA]):5,}\n",
-            f"{'tmRNAs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_TM_RNA]):5,}\n",
-            f"{'rRNAs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_R_RNA]):5,}\n",
-            f"{'ncRNAs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_NC_RNA]):5,}\n",
-            f"{'regulatory ncRNAs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_NC_RNA_REGION]):5,}\n",
-            f"{'CRISPR Arrays':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_CRISPR]):5,}",
-            f"{'oriCs/oriVs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_ORIC or feat['type'] == bc.FEATURE_ORIV]):5,}",
-            f"{'oriTs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_ORIT]):5,}",
-            f"{'gaps':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_GAP]):5,}",
-            f"{'pseudogenes':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_CDS and bc.PSEUDOGENE in feat]):5,}\n"
+            f"{'CDSs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_CDS or feat['type'] == bc.FEATURE_SORF]):5,}\n",
+            f"{'tRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_T_RNA]):5,}\n",
+            f"{'tmRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_TM_RNA]):5,}\n",
+            f"{'rRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_R_RNA]):5,}\n",
+            f"{'ncRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_NC_RNA]):5,}\n",
+            f"{'regulatory ncRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_NC_RNA_REGION]):5,}\n",
+            f"{'CRISPR Arrays':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_CRISPR]):5,}",
+            f"{'oriCs/oriVs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_ORIC or feat['type'] == bc.FEATURE_ORIV]):5,}",
+            f"{'oriTs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_ORIT]):5,}",
+            f"{'gaps':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_GAP]):5,}",
+            f"{'pseudogenes':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_CDS and bc.PSEUDOGENE in feat]):5,}\n"
         )
-        contig_annotations = {
+        sequence_annotations = {
             'molecule_type': 'DNA',
             'source': genome['taxon'],
             'date': date.today().strftime('%d-%b-%Y').upper(),
-            'topology': contig['topology'],
-            'data_file_division': 'HGT' if contig['type'] == bc.REPLICON_CONTIG else 'BCT',
+            'topology': seq['topology'],
+            'data_file_division': 'HGT' if seq['type'] == bc.REPLICON_CONTIG else 'BCT',
             # 'accession': '*',  # hold back until EMBL output bug is fixed in BioPython (https://github.com/biopython/biopython/pull/3572)
             'comment': comment
             # TODO: taxonomy
@@ -62,32 +62,32 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path:
 
         description = ''
         if(genome['taxon']):
-            contig_annotations['organism'] = genome['taxon']
+            sequence_annotations['organism'] = genome['taxon']
             source_qualifiers['organism'] = genome['taxon']
             description = genome['taxon']
         if(genome['strain']):
             source_qualifiers['strain'] = genome['strain']
 
-        if(contig['type'] == bc.REPLICON_PLASMID):
-            source_qualifiers['plasmid'] = contig['name'] if contig.get('name', None) else 'unnamed'
-            description = f"{description} plasmid {contig.get('name', 'unnamed')}"
-            description += ', complete sequence' if contig['complete'] else ', whole genome shotgun sequence'
-        elif(contig['type'] == bc.REPLICON_CHROMOSOME):
-            if contig.get('name', None):
-                source_qualifiers['chromosome'] = contig['name']
-            description = f'{description} chromosome, complete genome' if contig['complete'] else f"{description} chromosome {contig['id']}, whole genome shotgun sequence"
+        if(seq['type'] == bc.REPLICON_PLASMID):
+            source_qualifiers['plasmid'] = seq['name'] if seq.get('name', None) else 'unnamed'
+            description = f"{description} plasmid {seq.get('name', 'unnamed')}"
+            description += ', complete sequence' if seq['complete'] else ', whole genome shotgun sequence'
+        elif(seq['type'] == bc.REPLICON_CHROMOSOME):
+            if seq.get('name', None):
+                source_qualifiers['chromosome'] = seq['name']
+            description = f'{description} chromosome, complete genome' if seq['complete'] else f"{description} chromosome {seq['id']}, whole genome shotgun sequence"
         else:
-            description += f" {contig['id']}, whole genome shotgun sequence"
+            description += f" {seq['id']}, whole genome shotgun sequence"
 
         if(len(description) > 0 and description[0] == ' '):  # discard potential leading whitespace
             description = description[1:]
 
-        contig_rec = SeqIO.SeqRecord(id=contig['id'], name=contig['id'], description=description, annotations=contig_annotations, seq=Seq(contig['sequence']))
+        sequence_record = SeqIO.SeqRecord(id=seq['id'], name=seq['id'], description=description, annotations=sequence_annotations, seq=Seq(seq['sequence']))
 
-        source = SeqFeature(FeatureLocation(0, contig['length'], strand=+1), type='source', qualifiers=source_qualifiers)
+        source = SeqFeature(FeatureLocation(0, seq['length'], strand=+1), type='source', qualifiers=source_qualifiers)
         seq_feature_list = [source]
 
-        for feature in contig_features:
+        for feature in sequence_features:
             insdc_feature_type = None
             qualifiers = {
                 'note': []
@@ -226,7 +226,7 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path:
             start = feature['start'] - 1
             stop = feature['stop']
             if('edge' in feature):
-                fl_1 = FeatureLocation(start, contig['length'], strand=strand)
+                fl_1 = FeatureLocation(start, seq['length'], strand=strand)
                 fl_2 = FeatureLocation(0, stop, strand=strand)
                 if(feature['strand'] == bc.STRAND_REVERSE):
                     feature_location = CompoundLocation([fl_2, fl_1])
@@ -273,16 +273,16 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path:
             seq_feature_list.append(feat_seqfeat)
             for acc_feature in accompanying_features:  # add accompanying features, e.g. signal peptides
                 seq_feature_list.append(acc_feature)
-        contig_rec.features = seq_feature_list
-        contig_list.append(contig_rec)
+        sequence_record.features = seq_feature_list
+        sequence_list.append(sequence_record)
 
     with genbank_output_path.open('wt', encoding='utf-8') as fh:
         log.info('write GenBank: path=%s', genbank_output_path)
-        SeqIO.write(contig_list, fh, format='genbank')
+        SeqIO.write(sequence_list, fh, format='genbank')
 
     with embl_output_path.open('wt', encoding='utf-8') as fh:
         log.info('write EMBL: path=%s', embl_output_path)
-        SeqIO.write(contig_list, fh, format='embl')
+        SeqIO.write(sequence_list, fh, format='embl')
 
 
 def select_ncrna_class(feature: dict) -> str:
diff --git a/bakta/io/json.py b/bakta/io/json.py
index c27c60a9..819a1813 100644
--- a/bakta/io/json.py
+++ b/bakta/io/json.py
@@ -46,7 +46,7 @@ def write_json(genome: dict, features: Sequence[dict], json_path: Path):
         output['genome'] = ordered_genome
 
         stats = OrderedDict()
-        stats['no_sequences'] = len(genome['contigs'])
+        stats['no_sequences'] = len(genome['sequences'])
         stats['size'] = genome['size']
         stats['gc'] = genome['gc']
         stats['n_ratio'] = genome['n_ratio']
@@ -56,7 +56,7 @@ def write_json(genome: dict, features: Sequence[dict], json_path: Path):
 
     output['features'] = features
     if genome is not None:
-        output['sequences'] = genome['contigs']
+        output['sequences'] = genome['sequences']
 
     run = OrderedDict()
     run['start'] = cfg.run_start.strftime('%Y-%m-%d %H:%M:%S')
diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py
index 4b5d09e5..0f61ee49 100644
--- a/bakta/io/tsv.py
+++ b/bakta/io/tsv.py
@@ -16,7 +16,7 @@
 log = logging.getLogger('TSV')
 
 
-def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
+def write_features(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path):
     """Export features in TSV format."""
     log.info('write feature tsv: path=%s', tsv_path)
 
@@ -28,8 +28,8 @@ def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict],
         fh.write(f'# URL: {bc.BAKTA_URL}\n')
         fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tGene\tProduct\tDbXrefs\n')
 
-        for contig in contigs:
-            for feat in features_by_contig[contig['id']]:
+        for seq in sequences:
+            for feat in features_by_sequence[seq['id']]:
                 feat_type = feat['type']
                 if(feat_type == bc.FEATURE_GAP):
                     feat_type = bc.INSDC_FEATURE_ASSEMBLY_GAP if feat['length'] >= 100 else bc.INSDC_FEATURE_GAP
@@ -46,7 +46,7 @@ def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict],
                     product = f"(partial) {product}"
                 fh.write('\t'.join(
                     [
-                        feat['contig'],
+                        feat['sequence'],
                         feat_type,
                         str(feat['start']),
                         str(feat['stop']),
@@ -62,20 +62,20 @@ def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict],
                     i = 0
                     while i < len(feat['spacers']):
                         repeat = feat['repeats'][i]
-                        fh.write('\t'.join([feat['contig'], bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
+                        fh.write('\t'.join([feat['sequence'], bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
                         fh.write('\n')
                         spacer = feat['spacers'][i]
-                        fh.write('\t'.join([feat['contig'], bc.FEATURE_CRISPR_SPACER, str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", '']))
+                        fh.write('\t'.join([feat['sequence'], bc.FEATURE_CRISPR_SPACER, str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", '']))
                         fh.write('\n')
                         i += 1
                     if(len(feat['repeats']) - 1 == i):
                         repeat = feat['repeats'][i]
-                        fh.write('\t'.join([feat['contig'], bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
+                        fh.write('\t'.join([feat['sequence'], bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
                         fh.write('\n')
     return
 
 
-def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path):
+def write_feature_inferences(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path):
     """Export feature inference statistics in TSV format."""
     log.info('write tsv: path=%s', tsv_path)
 
@@ -87,8 +87,8 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s
         fh.write(f'# URL: {bc.BAKTA_URL}\n')
         fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tScore\tEvalue\tQuery Cov\tSubject Cov\tId\tAccession\n')
 
-        for contig in contigs:
-            for feat in features_by_contig[contig['id']]:
+        for seq in sequences:
+            for feat in features_by_sequence[seq['id']]:
                 if(feat['type'] in [bc.FEATURE_CDS, bc.FEATURE_SORF]):
                     score, evalue, query_cov, subject_cov, identity, accession = None, None, None, None, None, '-'
                     if('ups' in feat or 'ips' in feat):
@@ -107,7 +107,7 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s
                         accession = f"{bc.DB_XREF_UNIREF}:{feat['psc'][bpsc.DB_PSC_COL_UNIREF90]}" if 'psc' in feat else f"{bc.DB_XREF_UNIREF}:{feat['pscc'][bpscc.DB_PSCC_COL_UNIREF50]}"
                     fh.write('\t'.join(
                         [
-                            feat['contig'],
+                            feat['sequence'],
                             feat['type'],
                             str(feat['start']),
                             str(feat['stop']),
@@ -126,7 +126,7 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s
                     accession = '-' if feat['type'] == bc.FEATURE_T_RNA else [xref for xref in feat['db_xrefs'] if bc.DB_XREF_RFAM in xref][0]
                     fh.write('\t'.join(
                         [
-                            feat['contig'],
+                            feat['sequence'],
                             feat['type'],
                             str(feat['start']),
                             str(feat['stop']),
@@ -173,5 +173,5 @@ def write_hypotheticals(hypotheticals: Sequence[dict], tsv_path: Path):
             seq_stats = hypo['seq_stats']
             mol_weight = f"{(seq_stats['molecular_weight']/1000):.1f}" if seq_stats['molecular_weight'] else 'NA'
             iso_point = f"{seq_stats['isoelectric_point']:.1f}" if seq_stats['isoelectric_point'] else 'NA'
-            fh.write(f"{hypo['contig']}\t{hypo['start']}\t{hypo['stop']}\t{hypo['strand']}\t{hypo.get('locus', '')}\t{mol_weight}\t{iso_point}\t{', '.join(sorted(pfams))}\t{', '.join(sorted(hypo.get('db_xrefs', [])))}\n")
+            fh.write(f"{hypo['sequence']}\t{hypo['start']}\t{hypo['stop']}\t{hypo['strand']}\t{hypo.get('locus', '')}\t{mol_weight}\t{iso_point}\t{', '.join(sorted(pfams))}\t{', '.join(sorted(hypo.get('db_xrefs', [])))}\n")
     return
diff --git a/bakta/ips.py b/bakta/ips.py
index 1cebc6db..f93a6829 100644
--- a/bakta/ips.py
+++ b/bakta/ips.py
@@ -50,8 +50,8 @@ def lookup(features: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict]]:
                 feature['ips'] = ips
                 features_found.append(feature)
                 log.debug(
-                    'lookup: contig=%s, start=%i, stop=%i, aa-length=%i, strand=%s, gene=%s, UniRef100=%s, UniRef90=%s',
-                    feature['contig'], feature['start'], feature['stop'], len(feature['aa']), feature['strand'], ips.get(DB_IPS_COL_GENE, ''), ips.get(DB_IPS_COL_UNIREF100, ''), ips.get(DB_IPS_COL_UNIREF90, '')
+                    'lookup: seq=%s, start=%i, stop=%i, aa-length=%i, strand=%s, gene=%s, UniRef100=%s, UniRef90=%s',
+                    feature['sequence'], feature['start'], feature['stop'], len(feature['aa']), feature['strand'], ips.get(DB_IPS_COL_GENE, ''), ips.get(DB_IPS_COL_UNIREF100, ''), ips.get(DB_IPS_COL_UNIREF90, '')
                 )
             else:
                 features_not_found.append(feature)
diff --git a/bakta/main.py b/bakta/main.py
index 42e7205a..523e36f2 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -79,7 +79,7 @@ def main():
         if(cfg.force): print(f'\tforce: {cfg.force}')
         print(f'\ttmp directory: {cfg.tmp_path}')
         if(cfg.compliant): print(f'\tINSDC compliant: {cfg.compliant}')
-        if(cfg.keep_contig_headers): print(f'\tkeep contig headers: {cfg.keep_contig_headers}')
+        if(cfg.keep_sequence_headers): print(f'\tkeep sequence headers: {cfg.keep_sequence_headers}')
         print(f'\tprefix: {cfg.prefix}')
         print(f'\tthreads: {cfg.threads}')
         if(cfg.debug): print(f'\tdebug: {cfg.debug}')
@@ -104,35 +104,35 @@ def main():
 
     ############################################################################
     # Import genome
-    # - parse contigs in Fasta file
-    # - apply contig length filter
-    # - rename contigs
+    # - parse sequences in Fasta file
+    # - apply sequence length filter
+    # - rename sequences
     ############################################################################
     print('Parse genome sequences...')
     try:
-        contigs = fasta.import_contigs(cfg.genome_path)
-        log.info('imported sequences=%i', len(contigs))
-        print(f'\timported: {len(contigs)}')
+        sequences = fasta.import_sequences(cfg.genome_path)
+        log.info('imported sequences=%i', len(sequences))
+        print(f'\timported: {len(sequences)}')
     except:
         log.error('wrong genome file format!', exc_info=True)
         sys.exit('ERROR: wrong genome file format!')
     replicons = bu.parse_replicon_table(cfg.replicons) if cfg.replicons else None
-    contigs, complete_genome = bu.qc_contigs(contigs, replicons)
-    print(f'\tfiltered & revised: {len(contigs)}')
-    no_chromosomes = len([c for c in contigs if c['type'] == bc.REPLICON_CHROMOSOME])
+    sequences, complete_genome = bu.qc_sequences(sequences, replicons)
+    print(f'\tfiltered & revised: {len(sequences)}')
+    no_chromosomes = len([seq for seq in sequences if seq['type'] == bc.REPLICON_CHROMOSOME])
     if(no_chromosomes > 0):
         print(f"\tchromosomes: {no_chromosomes}")
-    no_plasmids = len([c for c in contigs if c['type'] == bc.REPLICON_PLASMID])
+    no_plasmids = len([seq for seq in sequences if seq['type'] == bc.REPLICON_PLASMID])
     if(no_plasmids > 0):
         print(f"\tplasmids: {no_plasmids}")
-    no_contigs = len([c for c in contigs if c['type'] == bc.REPLICON_CONTIG])
+    no_contigs = len([seq for seq in sequences if seq['type'] == bc.REPLICON_CONTIG])
     if(no_contigs > 0):
         print(f"\tcontigs: {no_contigs}")
-    if(len(contigs) == 0):
-        log.warning('no valid contigs!')
-        sys.exit('Error: input file contains no valid contigs.')
-    contigs_path = cfg.tmp_path.joinpath('contigs.fna')
-    fasta.export_contigs(contigs, contigs_path)
+    if(len(sequences) == 0):
+        log.warning('no valid sequences!')
+        sys.exit('Error: input file contains no valid sequences.')
+    sequences_path = cfg.tmp_path.joinpath('sequences.fna')
+    fasta.export_sequences(sequences, sequences_path)
     genome = {
         'genus': cfg.genus,
         'species': cfg.species,
@@ -140,10 +140,10 @@ def main():
         'taxon': cfg.taxon,
         'gram': cfg.gram,
         'translation_table': cfg.translation_table,
-        'size': sum([c['length'] for c in contigs]),
+        'size': sum([seq['length'] for seq in sequences]),
         'complete': cfg.complete or complete_genome,
         'features': {},
-        'contigs': contigs
+        'sequences': sequences
     }
     if(cfg.plasmid):
         genome['plasmid'] = cfg.plasmid
@@ -157,7 +157,7 @@ def main():
     else:
         print('predict tRNAs...')
         log.debug('start tRNA prediction')
-        genome['features'][bc.FEATURE_T_RNA] = t_rna.predict_t_rnas(genome, contigs_path)
+        genome['features'][bc.FEATURE_T_RNA] = t_rna.predict_t_rnas(genome, sequences_path)
         print(f"\tfound: {len(genome['features'][bc.FEATURE_T_RNA])}")
 
     ############################################################################
@@ -168,7 +168,7 @@ def main():
     else:
         print('predict tmRNAs...')
         log.debug('start tmRNA prediction')
-        genome['features'][bc.FEATURE_TM_RNA] = tm_rna.predict_tm_rnas(genome, contigs_path)
+        genome['features'][bc.FEATURE_TM_RNA] = tm_rna.predict_tm_rnas(genome, sequences_path)
         print(f"\tfound: {len(genome['features'][bc.FEATURE_TM_RNA])}")
 
     ############################################################################
@@ -179,7 +179,7 @@ def main():
     else:
         print('predict rRNAs...')
         log.debug('start rRNA prediction')
-        genome['features'][bc.FEATURE_R_RNA] = r_rna.predict_r_rnas(genome, contigs_path)
+        genome['features'][bc.FEATURE_R_RNA] = r_rna.predict_r_rnas(genome, sequences_path)
         print(f"\tfound: {len(genome['features'][bc.FEATURE_R_RNA])}")
 
     ############################################################################
@@ -190,7 +190,7 @@ def main():
     else:
         print('predict ncRNAs...')
         log.debug('start ncRNA prediction')
-        genome['features'][bc.FEATURE_NC_RNA] = nc_rna.predict_nc_rnas(genome, contigs_path)
+        genome['features'][bc.FEATURE_NC_RNA] = nc_rna.predict_nc_rnas(genome, sequences_path)
         print(f"\tfound: {len(genome['features'][bc.FEATURE_NC_RNA])}")
 
     ############################################################################
@@ -201,7 +201,7 @@ def main():
     else:
         print('predict ncRNA regions...')
         log.debug('start ncRNA region prediction')
-        genome['features'][bc.FEATURE_NC_RNA_REGION] = nc_rna_region.predict_nc_rna_regions(genome, contigs_path)
+        genome['features'][bc.FEATURE_NC_RNA_REGION] = nc_rna_region.predict_nc_rna_regions(genome, sequences_path)
         print(f"\tfound: {len(genome['features'][bc.FEATURE_NC_RNA_REGION])}")
 
     ############################################################################
@@ -212,7 +212,7 @@ def main():
     else:
         print('predict CRISPR arrays...')
         log.debug('start CRISPR prediction')
-        genome['features'][bc.FEATURE_CRISPR] = crispr.predict_crispr(genome, contigs_path)
+        genome['features'][bc.FEATURE_CRISPR] = crispr.predict_crispr(genome, sequences_path)
         print(f"\tfound: {len(genome['features'][bc.FEATURE_CRISPR])}")
 
     ############################################################################
@@ -403,7 +403,7 @@ def main():
             sorf_aa_path = cfg.tmp_path.joinpath('sorfs.faa')
             with sorf_aa_path.open(mode='wt') as fh:
                 for sorf in sorfs_filtered:
-                    fh.write(f">{sorf['aa_hexdigest']}-{sorf['contig']}-{sorf['start']}\n{sorf['aa']}\n")
+                    fh.write(f">{sorf['aa_hexdigest']}-{sorf['sequence']}-{sorf['start']}\n{sorf['aa']}\n")
             sig_peptides_found = sig_peptides.search(sorfs_filtered, sorf_aa_path)
             print(f"\tsignal peptides: {len(sig_peptides_found)}")
 
@@ -429,13 +429,13 @@ def main():
     else:
         print('detect oriCs/oriVs...')
         log.debug('detect oriC/V')
-        oriCs = ori.predict_oris(genome, contigs_path, bc.FEATURE_ORIC)
+        oriCs = ori.predict_oris(genome, sequences_path, bc.FEATURE_ORIC)
         genome['features'][bc.FEATURE_ORIC] = oriCs
         print(f'\tfound: {len(oriCs)}')
 
         print('detect oriTs...')
         log.debug('detect oriT')
-        oriTs = ori.predict_oris(genome, contigs_path, bc.FEATURE_ORIT)
+        oriTs = ori.predict_oris(genome, sequences_path, bc.FEATURE_ORIT)
         genome['features'][bc.FEATURE_ORIT] = oriTs
         print(f'\tfound: {len(oriTs)}')
 
@@ -456,40 +456,26 @@ def main():
     ############################################################################
     print('select features and create locus tags...')
     log.debug('start feature selection and creation of locus tags')
-    features_by_contig = {k['id']: [] for k in genome['contigs']}
+    features_by_sequence = {k['id']: [] for k in genome['sequences']}
     feature_id = 1
-    feature_id_prefix = bu.create_locus_tag_prefix(contigs, length=10)
-    for feature_type in [
-            bc.FEATURE_T_RNA,
-            bc.FEATURE_TM_RNA,
-            bc.FEATURE_R_RNA,
-            bc.FEATURE_NC_RNA,
-            bc.FEATURE_NC_RNA_REGION,
-            bc.FEATURE_CRISPR,
-            bc.FEATURE_CDS,
-            bc.FEATURE_SORF,
-            bc.FEATURE_GAP,
-            bc.FEATURE_ORIC,
-            bc.FEATURE_ORIV,
-            bc.FEATURE_ORIT
-        ]:
-        feature_list = genome['features'].get(feature_type, [])
+    feature_id_prefix = bu.create_locus_tag_prefix(sequences, length=10)
+    for feature_list in genome['features'].values():
         for feature in feature_list:
             if('discarded' not in feature):
                 feature['id'] = f'{feature_id_prefix}_{feature_id}'
                 feature_id += 1
-                contig_features = features_by_contig.get(feature['contig'])
-                contig_features.append(feature)
+                seq_features = features_by_sequence.get(feature['sequence'])
+                seq_features.append(feature)
     features = []
-    for contig in genome['contigs']:
-        contig_features = features_by_contig[contig['id']]
-        contig_features.sort(key=lambda k: k['start'])
-        features.extend(contig_features)
+    for seq in genome['sequences']:
+        seq_features = features_by_sequence[seq['id']]
+        seq_features.sort(key=lambda k: k['start'])
+        features.extend(seq_features)
     log.info('selected features=%i', len(features))
     print(f'\tselected: {len(features)}')
 
     # use user provided locus tag if not None/non-empty or generate a sequence based locus prefix
-    locus_tag_prefix = cfg.locus_tag if cfg.locus_tag else bu.create_locus_tag_prefix(contigs)
+    locus_tag_prefix = cfg.locus_tag if cfg.locus_tag else bu.create_locus_tag_prefix(sequences)
     log.info('locus tag prefix=%s', locus_tag_prefix)
     locus_tag_nr = cfg.locus_tag_increment
     for feature in features:
@@ -514,7 +500,7 @@ def main():
     print('\nGenome statistics:')
     genome_stats = bu.calc_genome_stats(genome, features)
     print(f"\tGenome size: {genome['size']:,} bp")
-    print(f"\tContigs/replicons: {len(genome['contigs'])}")
+    print(f"\tContigs/replicons: {len(genome['sequences'])}")
     print(f"\tGC: {100 * genome_stats['gc']:.1f} %")
     print(f"\tN50: {genome_stats['n50']:,}")
     print(f"\tN ratio: {100 * genome_stats['n_ratio']:.1f} %")
@@ -547,11 +533,11 @@ def main():
     print(f'\nExport annotation results to: {cfg.output_path}')
     print('\thuman readable TSV...')
     tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv')
-    tsv.write_features(genome['contigs'], features_by_contig, tsv_path)
+    tsv.write_features(genome['sequences'], features_by_sequence, tsv_path)
 
     print('\tGFF3...')
     gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3')
-    gff.write_features(genome, features_by_contig, gff3_path)
+    gff.write_features(genome, features_by_sequence, gff3_path)
 
     print('\tINSDC GenBank & EMBL...')
     genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff')
@@ -560,7 +546,7 @@ def main():
 
     print('\tgenome sequences...')
     fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna')
-    fasta.export_contigs(genome['contigs'], fna_path, description=True, wrap=True)
+    fasta.export_sequences(genome['sequences'], fna_path, description=True, wrap=True)
 
     print('\tfeature nucleotide sequences...')
     ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn')
@@ -572,13 +558,13 @@ def main():
 
     print('\tfeature inferences...')
     tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv')
-    tsv.write_feature_inferences(genome['contigs'], features_by_contig, tsv_path)
+    tsv.write_feature_inferences(genome['sequences'], features_by_sequence, tsv_path)
 
     if(cfg.skip_plot  or  cfg.meta):
         print('\tskip generation of circular genome plot...')
     else:
         print('\tcircular genome plot...')
-        plot.write(features, contigs, cfg.output_path)
+        plot.write(features, sequences, cfg.output_path)
 
     if(cfg.skip_cds is False):
         hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]
@@ -608,7 +594,7 @@ def main():
     with summary_path.open('w') as fh_out:
         fh_out.write('Sequence(s):\n')
         fh_out.write(f"Length: {genome['size']:}\n")
-        fh_out.write(f"Count: {len(genome['contigs'])}\n")
+        fh_out.write(f"Count: {len(genome['sequences'])}\n")
         fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n")
         fh_out.write(f"N50: {genome_stats['n50']:}\n")
         fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n")
diff --git a/bakta/plot.py b/bakta/plot.py
index e6510981..37c2a3a2 100644
--- a/bakta/plot.py
+++ b/bakta/plot.py
@@ -171,7 +171,7 @@ def main():
     with annotation_path.open('r') as fh:
         annotation = json.load(fh)
     features = annotation['features']
-    contigs = annotation['sequences']
+    sequences = annotation['sequences']
 
     # load colors if specified
     colors = COLORS
@@ -182,40 +182,40 @@ def main():
     print('Draw plots...')
     if args.sequences == 'all':  # write whole genome plot
         print(f'\tdraw circular genome plot (type={plot_type}) containing all sequences...')
-        write(features, contigs, output_path, colors, plot_type=plot_type)
+        write(features, sequences, output_path, colors, plot_type=plot_type)
     else:  # write genome plot containing provided sequences only
-        plot_contigs = []
+        plot_sequences = []
         sequence_identifiers = []
         for selected_sequence in args.sequences.split(','):
-            for i, contig in enumerate(contigs):
+            for i, seq in enumerate(sequences):
                 sequence_no = str(i + 1)
                 if selected_sequence == sequence_no:
-                    plot_contigs.append(contig)
+                    plot_sequences.append(seq)
                     sequence_identifiers.append(sequence_no)
-                elif selected_sequence.lower() == contig['id'].lower():
-                    plot_contigs.append(contig)
-                    sequence_identifiers.append(contig['id'])
-        if len(plot_contigs) > 0:
+                elif selected_sequence.lower() == seq['id'].lower():
+                    plot_sequences.append(seq)
+                    sequence_identifiers.append(seq['id'])
+        if len(plot_sequences) > 0:
             print(f'\tdraw circular genome plot (type={plot_type}) containing sequences: {sequence_identifiers}...')
             plot_name_suffix = '_'.join(sequence_identifiers)
-            plot_contig_ids = [c['id'] for c in plot_contigs]
-            features = [feat for feat in features if feat['contig'] in plot_contig_ids]
-            write(features, plot_contigs, output_path, colors, plot_name_suffix=plot_name_suffix, plot_type=plot_type)
+            plot_sequence_ids = [seq['id'] for seq in plot_sequences]
+            features = [feat for feat in features if feat['sequence'] in plot_sequence_ids]
+            write(features, plot_sequences, output_path, colors, plot_name_suffix=plot_name_suffix, plot_type=plot_type)
 
 
-def write(features, contigs, output_path, colors=COLORS, plot_name_suffix=None, plot_type=bc.PLOT_FEATURES):
+def write(features, sequences, output_path, colors=COLORS, plot_name_suffix=None, plot_type=bc.PLOT_FEATURES):
     # config paths
     circos_path = cfg.tmp_path.joinpath(f'circos')
     circos_path.mkdir(parents=True, exist_ok=True)
 
     # fix edge features because Circos cannot handle them correctly
     non_edge_features = [feat for feat in features if not feat.get('edge', False)]
-    contigs_by_id = {c['id']: c for c in contigs}
+    sequences_by_id = {seq['id']: seq for seq in sequences}
     for feat in [feat for feat in features if feat.get('edge', False)]:
-        contig = contigs_by_id[feat['contig']]
-        log.info('split edge feature: contig=%s, start=%i, stop=%i, strand=%s, edge=%s', contig['id'], feat['start'], feat['stop'], feat['strand'], feat['edge'])
+        seq = sequences_by_id[feat['sequence']]
+        log.info('split edge feature: seq=%s, start=%i, stop=%i, strand=%s, edge=%s', seq['id'], feat['start'], feat['stop'], feat['strand'], feat['edge'])
         feat_1 = feat.copy()
-        feat_1['stop'] = contig['length']
+        feat_1['stop'] = seq['length']
         feat_1['edge'] = False
         non_edge_features.append(feat_1)
         feat_2 = feat.copy()
@@ -226,19 +226,19 @@ def write(features, contigs, output_path, colors=COLORS, plot_name_suffix=None,
 
     # write feature files
     if plot_type == bc.PLOT_COG:
-        feature_paths = write_features_type_cog(features, contigs, circos_path, colors)
+        feature_paths = write_features_type_cog(features, sequences, circos_path, colors)
     else:
-        feature_paths = write_features_type_feature(features, contigs, circos_path, colors)
+        feature_paths = write_features_type_feature(features, sequences, circos_path, colors)
 
     # write gc content and gc skew files
     tracks_path = circos_path.joinpath('tracks.conf')
     
-    gc_content_path, max_gc, gc_skew_path, max_gc_skew = write_gc_content_skew(contigs, circos_path, colors)
+    gc_content_path, max_gc, gc_skew_path, max_gc_skew = write_gc_content_skew(sequences, circos_path, colors)
     write_tracks(tracks_path, feature_paths, gc_content_path, max_gc, gc_skew_path, max_gc_skew)
 
     # write main config
     file_name = cfg.prefix if plot_name_suffix is None else f'{cfg.prefix}_{plot_name_suffix}'
-    main_conf_path = write_main_config(circos_path, output_path, tracks_path, contigs, file_name, colors)
+    main_conf_path = write_main_config(circos_path, output_path, tracks_path, sequences, file_name, colors)
     
     # execute Circos
     log.info('write circular genome plot: file-name=%s, output-dir=%s', file_name, output_path)
@@ -262,19 +262,19 @@ def write(features, contigs, output_path, colors=COLORS, plot_name_suffix=None,
         raise Exception(f'circos error! error code: {proc.returncode}')
 
 
-def write_features_type_feature(features, contigs, circos_path, colors):
+def write_features_type_feature(features, sequences, circos_path, colors):
     features_plus = []
     features_minus = []
-    contig_ids = set([c['id'] for c in contigs])
+    sequence_ids = set([seq['id'] for seq in sequences])
     for feat in features:
-        if feat['contig'] not in contig_ids:
+        if feat['sequence'] not in sequence_ids:
             continue
-        contig, start, stop, type = feat['contig'], feat['start'], feat['stop'], feat['type']
+        seq, start, stop, type = feat['sequence'], feat['start'], feat['stop'], feat['type']
         color = colors['features'].get(type, colors['features']['misc'])
         if feat['strand'] == bc.STRAND_FORWARD:
-            features_plus.append(f"{contig} {start} {stop} {bc.STRAND_FORWARD} color={hex_to_rgb(color)}")
+            features_plus.append(f"{seq} {start} {stop} {bc.STRAND_FORWARD} color={hex_to_rgb(color)}")
         else:
-            features_minus.append(f"{contig} {start} {stop} {bc.STRAND_REVERSE} color={hex_to_rgb(color)}")
+            features_minus.append(f"{seq} {start} {stop} {bc.STRAND_REVERSE} color={hex_to_rgb(color)}")
     features_plus_path = circos_path.joinpath('features-plus.txt')
     with features_plus_path.open('w') as fh:
         fh.write('\n'.join(features_plus))
@@ -286,15 +286,15 @@ def write_features_type_feature(features, contigs, circos_path, colors):
     return [features_plus_path, features_minus_path]
 
 
-def write_features_type_cog(features, contigs, circos_path, colors):
+def write_features_type_cog(features, sequences, circos_path, colors):
     features_plus = []
     features_minus = []
     features_extra = []
-    contig_ids = set([c['id'] for c in contigs])
+    sequence_ids = set([seq['id'] for seq in sequences])
     for feat in features:
-        if feat['contig'] not in contig_ids:
+        if feat['sequence'] not in sequence_ids:
             continue
-        contig, start, stop = feat['contig'], feat['start'], feat['stop']
+        seq, start, stop = feat['sequence'], feat['start'], feat['stop']
         if feat['type'] == bc.FEATURE_CDS:
             color = colors['features'][bc.FEATURE_CDS]
             psc = feat.get('psc', None)
@@ -305,11 +305,11 @@ def write_features_type_cog(features, contigs, circos_path, colors):
                         cog = cog[:1]
                     color = colors['cog-classes'].get(cog.upper(), colors['cog-classes']['S'])
             if feat['strand'] == bc.STRAND_FORWARD:
-                features_plus.append(f"{contig} {start} {stop} {feat['strand']} color={hex_to_rgb(color)}")
+                features_plus.append(f"{seq} {start} {stop} {feat['strand']} color={hex_to_rgb(color)}")
             else:
-                features_minus.append(f"{contig} {start} {stop} {feat['strand']} color={hex_to_rgb(color)}")
+                features_minus.append(f"{seq} {start} {stop} {feat['strand']} color={hex_to_rgb(color)}")
         else:
-            features_extra.append(f"{contig} {start} {stop} {feat['strand']} color={hex_to_rgb(colors['features']['misc'])}")
+            features_extra.append(f"{seq} {start} {stop} {feat['strand']} color={hex_to_rgb(colors['features']['misc'])}")
     features_plus_path = circos_path.joinpath('features-plus.txt')
     with features_plus_path.open('w') as fh:
         fh.write('\n'.join(features_plus))
@@ -325,8 +325,8 @@ def write_features_type_cog(features, contigs, circos_path, colors):
     return [features_plus_path, features_minus_path, features_extra_path]
 
 
-def write_gc_content_skew(contigs, circos_path, colors):
-    sequence_length = sum([c['length'] for c in contigs])
+def write_gc_content_skew(sequences, circos_path, colors):
+    sequence_length = sum([seq['length'] for seq in sequences])
     step_size = int(sequence_length / 3600)  # 10 * 360°
     if step_size < 3:
         step_size = 3
@@ -338,33 +338,33 @@ def write_gc_content_skew(contigs, circos_path, colors):
     max_gc = 0
     max_gc_skew = 0
     if float(bp.__version__) >= 1.80:
-        gc_mean = SeqUtils.gc_fraction(''.join([c['sequence'] for c in contigs]))
+        gc_mean = SeqUtils.gc_fraction(''.join([seq['sequence'] for seq in sequences]))
     else:
-        gc_mean = SeqUtils.GC(''.join([c['sequence'] for c in contigs])) / 100
-    for contig in contigs:
-        seq = contig['sequence']
-        for w in range(0, len(seq), step_size):
+        gc_mean = SeqUtils.GC(''.join([seq['sequence'] for seq in sequences])) / 100
+    for seq in sequences:
+        nt = seq['sequence']
+        for w in range(0, len(nt), step_size):
             start = w - window_size
             if start < 0:
-                start += len(seq)
+                start += len(nt)
             stop = w + window_size
-            if stop > len(seq):
-                stop -= len(seq)
-            subseq = seq[start:stop] if start < stop else seq[start:] + seq[:stop]
+            if stop > len(nt):
+                stop -= len(nt)
+            nt_subseq = nt[start:stop] if start < stop else nt[start:] + nt[:stop]
             if float(bp.__version__) >= 1.80:
-                gc_value = gc_mean - SeqUtils.gc_fraction(subseq)
+                gc_value = gc_mean - SeqUtils.gc_fraction(nt_subseq)
             else:
-                gc_value = gc_mean - (SeqUtils.GC(subseq) / 100)
+                gc_value = gc_mean - (SeqUtils.GC(nt_subseq) / 100)
             if max_gc < abs(gc_value):
                 max_gc = abs(gc_value)
             gc_color = colors['gc-positive'] if gc_value >= 0 else colors['gc-negative']
-            gc_contents.append(f"{contig['id']} {w} {w} {gc_value} fill_color={hex_to_rgb(gc_color)}")
-            g, c = subseq.count('G'), subseq.count('C')
+            gc_contents.append(f"{seq['id']} {w} {w} {gc_value} fill_color={hex_to_rgb(gc_color)}")
+            g, c = nt_subseq.count('G'), nt_subseq.count('C')
             gc_skew = gc_skew = (g - c) / float(g + c) if (g + c) > 0 else 0.0
             if max_gc_skew < abs(gc_skew):
                 max_gc_skew = abs(gc_skew)
             gc_skew_color = colors['gc-skew-positive'] if gc_skew >= 0 else colors['gc-skew-negative']
-            gc_skews.append(f"{contig['id']} {w} {w} {gc_skew} fill_color={hex_to_rgb(gc_skew_color)}")
+            gc_skews.append(f"{seq['id']} {w} {w} {gc_skew} fill_color={hex_to_rgb(gc_skew_color)}")
 
     log.debug('write gc config: seq-length=%i, step-size=%i, window-size=%i, max-gc=%i, max-gc-skew=%i', sequence_length, step_size, window_size, max_gc, max_gc_skew)
     gc_content_path = circos_path.joinpath('gc_content.txt')
@@ -430,9 +430,9 @@ def hex_to_rgb(hex_string):
     return ','.join(rgb)
 
 
-def write_main_config(circos_path, output_path, tracks_path, contigs, file_name, colors):
+def write_main_config(circos_path, output_path, tracks_path, sequences, file_name, colors):
     karyotype_path = circos_path.joinpath('karyotype.txt')
-    sequence_length = sum([c['length'] for c in contigs])
+    sequence_length = sum([seq['length'] for seq in sequences])
     chromosomes_units = round(sequence_length/(10**(len(str(sequence_length)) - 1)))*(10**(len(str(sequence_length)) - 1))
 
     if sequence_length > 10_000:
@@ -508,7 +508,7 @@ def write_main_config(circos_path, output_path, tracks_path, contigs, file_name,
 
     # write karyotype file
     karyotypes = []
-    for i, c in enumerate(contigs):
+    for i, c in enumerate(sequences):
         karyotypes.append(f"chr - {c['id']} {i + 1} 0 {c['length']} {hex_to_rgb(colors['backbone'])}")
     with karyotype_path.open('w') as fh:
         fh.write('\n'.join(karyotypes))
diff --git a/bakta/proteins.py b/bakta/proteins.py
index e22a9aa4..a7835611 100644
--- a/bakta/proteins.py
+++ b/bakta/proteins.py
@@ -128,7 +128,7 @@ def main():
     ############################################################################
     try:
         print('Parse protein sequences...')
-        aas = fasta.import_contigs(aa_path, False, False)
+        aas = fasta.import_sequences(aa_path, False, False)
         log.info('imported sequences=%i', len(aas))
         print(f'\timported: {len(aas)}')
     except:
@@ -139,7 +139,7 @@ def main():
         aa['type'] = bc.FEATURE_CDS
         aa['aa'] = aa['sequence']
         aa['locus'] = aa['id']
-        aa['contig'] = '-'
+        aa['sequence'] = '-'
         aa['start'] = mock_start
         aa['stop'] = -1
         aa['strand'] = bc.STRAND_UNKNOWN
@@ -166,11 +166,11 @@ def main():
     tsv.write_protein_features(aas, header_columns, map_aa_columns, annotations_path)
     inference_path = output_path.joinpath(f'{cfg.prefix}.inference.tsv')
     print(f'\tfeature inferences (TSV): {inference_path}')
-    mock_contigs = [{'id': '-'}]
-    features_by_contig = {'-': aas}
-    tsv.write_feature_inferences(mock_contigs, features_by_contig, inference_path)
+    mock_sequences = [{'id': '-'}]
+    features_by_sequence = {'-': aas}
+    tsv.write_feature_inferences(mock_sequences, features_by_sequence, inference_path)
     for aa in aas:  # cleanup mock attributes
-        aa.pop('contig', None)
+        aa.pop('sequence', None)
         aa.pop('start', None)
         aa.pop('stop', None)
         aa.pop('strand', None)
diff --git a/bakta/psc.py b/bakta/psc.py
index 24cb774b..0cb7398c 100644
--- a/bakta/psc.py
+++ b/bakta/psc.py
@@ -84,8 +84,8 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen
                     'valid': identity >= bc.MIN_PSC_IDENTITY  # whether a valid PSC hit (id > 90%)
                 }
                 log.debug(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
+                    'homology: seq=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s',
+                    cds['sequence'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
 
     pscs_found = []
@@ -142,8 +142,8 @@ def lookup(features: Sequence[dict], pseudo: bool = False):
                         feature['psc'] = psc
                 no_psc_lookups += 1
                 log.debug(
-                    'lookup: contig=%s, start=%i, stop=%i, strand=%s, UniRef90=%s, EC=%s, gene=%s, product=%s',
-                    feature['contig'], feature['start'], feature['stop'], feature['strand'], psc.get(DB_PSC_COL_UNIREF90, ''), psc.get(DB_PSC_COL_EC, ''), psc.get(DB_PSC_COL_GENE, ''), psc.get(DB_PSC_COL_PRODUCT, '')
+                    'lookup: seq=%s, start=%i, stop=%i, strand=%s, UniRef90=%s, EC=%s, gene=%s, product=%s',
+                    feature['sequence'], feature['start'], feature['stop'], feature['strand'], psc.get(DB_PSC_COL_UNIREF90, ''), psc.get(DB_PSC_COL_EC, ''), psc.get(DB_PSC_COL_GENE, ''), psc.get(DB_PSC_COL_PRODUCT, '')
                 )
             else:
                 log.debug('lookup: ID not found! uniref90_id=%s', uniref90_id)
diff --git a/bakta/pscc.py b/bakta/pscc.py
index b4b34906..372c4cbf 100644
--- a/bakta/pscc.py
+++ b/bakta/pscc.py
@@ -76,8 +76,8 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen
                     'evalue': evalue
                 }
                 log.debug(
-                    'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef50=%s',
-                    cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
+                    'homology: seq=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef50=%s',
+                    cds['sequence'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id
                 )
 
     psccs_found = []
@@ -129,8 +129,8 @@ def lookup(features: Sequence[dict], pseudo: bool = False):
                         feature['pscc'] = pscc  # add PSCC annotation info
                 no_pscc_lookups += 1
                 log.debug(
-                    'lookup: contig=%s, start=%i, stop=%i, strand=%s, UniRef50=%s, product=%s',
-                    feature['contig'], feature['start'], feature['stop'], feature['strand'], pscc.get(DB_PSCC_COL_UNIREF50, ''), pscc.get(DB_PSCC_COL_PRODUCT, '')
+                    'lookup: seq=%s, start=%i, stop=%i, strand=%s, UniRef50=%s, product=%s',
+                    feature['sequence'], feature['start'], feature['stop'], feature['strand'], pscc.get(DB_PSCC_COL_UNIREF50, ''), pscc.get(DB_PSCC_COL_PRODUCT, '')
                 )
             else:
                 log.debug('lookup: ID not found! uniref50_id=%s', uniref50_id)
diff --git a/bakta/so.py b/bakta/so.py
index 7c9cb344..809bc2ff 100644
--- a/bakta/so.py
+++ b/bakta/so.py
@@ -7,7 +7,7 @@
 SO_REPLICON = SO('replicon', 'SO:0001235')
 SO_REPLICON_CHROMOSOME = SO('chromosome', 'SO:0000340')
 SO_REPLICON_PLASMID = SO('plasmid', 'SO:0000155')
-SO_CONTIG = SO('contig', 'SO:0000149')
+SO_CONTIG = SO('contig', 'SO:0000149')  # keep the official Sequence Ontology term name: SO:0000149 is 'contig', it must not follow the internal contig->sequence rename
 
 SO_OPERON = SO('operon', 'SO:0000178')
 SO_PROMOTER = SO('promoter', 'SO:0000167')
diff --git a/bakta/ups.py b/bakta/ups.py
index 2d048c8e..e9f48bba 100644
--- a/bakta/ups.py
+++ b/bakta/ups.py
@@ -45,8 +45,8 @@ def lookup(features: Sequence[dict]):
                 feature['ups'] = ups
                 features_found.append(feature)
                 log.debug(
-                    'lookup: contig=%s, start=%i, stop=%i, aa-length=%i, strand=%s, UniParc=%s, UniRef100=%s, NCBI NRP=%s',
-                    feature['contig'], feature['start'], feature['stop'], len(feature['aa']), feature['strand'], ups.get(DB_UPS_COL_UNIPARC, ''), ups.get(DB_UPS_COL_UNIREF100, ''), ups.get(DB_UPS_COL_REFSEQ_NRP, '')
+                    'lookup: seq=%s, start=%i, stop=%i, aa-length=%i, strand=%s, UniParc=%s, UniRef100=%s, NCBI NRP=%s',
+                    feature['sequence'], feature['start'], feature['stop'], len(feature['aa']), feature['strand'], ups.get(DB_UPS_COL_UNIPARC, ''), ups.get(DB_UPS_COL_UNIREF100, ''), ups.get(DB_UPS_COL_REFSEQ_NRP, '')
                 )
             else:
                 features_not_found.append(feature)
diff --git a/bakta/utils.py b/bakta/utils.py
index 5b17aee9..04d02704 100644
--- a/bakta/utils.py
+++ b/bakta/utils.py
@@ -68,7 +68,7 @@ def parse_arguments():
 
     arg_group_io = parser.add_argument_group('Input / Output')
     arg_group_io.add_argument('--db', '-d', action='store', default=None, help='Database path (default = <bakta_path>/db). Can also be provided as BAKTA_DB environment variable.')
-    arg_group_io.add_argument('--min-contig-length', '-m', action='store', type=int, default=1, dest='min_contig_length', help='Minimum contig size (default = 1; 200 in compliant mode)')
+    arg_group_io.add_argument('--min-contig-length', '-m', action='store', type=int, default=1, dest='min_contig_length', help='Minimum contig/sequence size (default = 1; 200 in compliant mode)')
     arg_group_io.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files')
     arg_group_io.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
     arg_group_io.add_argument('--force', '-f', action='store_true', help='Force overwriting existing output folder (except for current working directory)')
@@ -87,7 +87,7 @@ def parse_arguments():
     arg_group_annotation.add_argument('--locus', action='store', default=None, help="Locus prefix (default = 'contig')")
     arg_group_annotation.add_argument('--locus-tag', action='store', default=None, dest='locus_tag', help='Locus tag prefix (default = autogenerated)')
     arg_group_annotation.add_argument('--locus-tag-increment', action='store', type=int, default=1, choices=[1, 5, 10], dest='locus_tag_increment', help='Locus tag increment: 1/5/10 (default = 1)')
-    arg_group_annotation.add_argument('--keep-contig-headers', action='store_true', dest='keep_contig_headers', help='Keep original contig headers')
+    arg_group_annotation.add_argument('--keep-contig-headers', action='store_true', dest='keep_contig_headers', help='Keep original contig/sequence headers')
     arg_group_annotation.add_argument('--compliant', action='store_true', help='Force Genbank/ENA/DDJB compliance')
     arg_group_annotation.add_argument('--replicons', '-r', action='store', default=None, dest='replicons', help='Replicon information table (tsv/csv)')
     arg_group_annotation.add_argument('--regions', action='store', default=None, help='Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations).')
@@ -261,11 +261,11 @@ def test_dependencies():
         test_dependency(DEPENDENCY_CIRCOS)
 
 
-def create_locus_tag_prefix(contigs: Sequence[dict], length: int=6) -> str:
+def create_locus_tag_prefix(sequences: Sequence[dict], length: int=6) -> str:
     """Create either genus/species or sequence MD5 hex based locus tag prefix."""
     hash = hashlib.md5()
-    for contig in contigs:
-        hash.update(str.encode(contig['sequence']))
+    for seq in sequences:
+        hash.update(str.encode(seq['sequence']))
     hexdigest = hash.hexdigest().upper()
     locus_prefix_chars = []
     i = 0
@@ -300,10 +300,10 @@ def calc_genome_stats(genome: dict, features: Sequence[dict]):
     # N50
     gc_sum = 0
     n_sum = 0
-    for contig in genome['contigs']:
-        seq = contig['sequence']
-        gc_sum += seq.count('G') + seq.count('C')
-        n_sum += seq.count('N')
+    for seq in genome['sequences']:
+        nt = seq['sequence']
+        gc_sum += nt.count('G') + nt.count('C')
+        n_sum += nt.count('N')
     gc_ratio = gc_sum / (genome_size - n_sum)
     genome['gc'] = gc_ratio
     log.info('GC=%0.3f', gc_ratio)
@@ -313,21 +313,21 @@ def calc_genome_stats(genome: dict, features: Sequence[dict]):
     log.info('N=%0.3f', n_ratio)
 
     n50 = 0
-    contig_length_sum = 0
-    for contig in sorted(genome['contigs'], key=lambda x: x['length'], reverse=True):
-        contig_length = len(contig['sequence'])
-        contig_length_sum += contig_length
-        if(contig_length_sum >= genome_size / 2):
-            n50 = contig_length
+    sequence_length_sum = 0
+    for seq in sorted(genome['sequences'], key=lambda x: x['length'], reverse=True):
+        nt_length = len(seq['sequence'])
+        sequence_length_sum += nt_length
+        if(sequence_length_sum >= genome_size / 2):
+            n50 = nt_length
             break
     genome['n50'] = n50
     log.info('N50=%i', n50)
 
-    contigs_by_id = {c['id']: c for c in genome['contigs']}
+    sequence_by_id = {seq['id']: seq for seq in genome['sequences']}
     coding_nts = 0
     for feat in features:
         if(feat.get('edge', False)):
-            sequence_length = contigs_by_id[feat['contig']]['length']
+            sequence_length = sequence_by_id[feat['sequence']]['length']
             coding_nts += feat['stop'] + (sequence_length - feat['start'] + 1)  # feature coding nucleotides
         else:
             coding_nts += feat['stop'] - feat['start'] + 1  # feature coding nucleotides
@@ -389,120 +389,120 @@ def parse_replicon_table(replicon_table_path: Path) -> Dict[str, dict]:
     return replicons
 
 
-def qc_contigs(contigs: Sequence[dict], replicons: Dict[str, dict]) -> Tuple[Sequence[dict], bool]:
-    valid_contigs = []
-    contig_counter = 1
-    contig_prefix = cfg.locus if cfg.locus else 'contig'
+def qc_sequences(sequences: Sequence[dict], replicons: Dict[str, dict]) -> Tuple[Sequence[dict], bool]:
+    valid_sequences = []
+    sequence_counter = 1
+    sequence_prefix = cfg.locus if cfg.locus else 'contig'
     complete_genome = True
     plasmid_number = 1
-    contig_ids = set()
-    for contig in contigs:
-        if(contig['length'] >= cfg.min_contig_length):
-            contig_id_generated = f'{contig_prefix}_{contig_counter}'
-            contig['simple_id'] = contig_id_generated
-            contig_counter += 1
-
-            contig_description = contig['description'].lower()
+    sequence_ids = set()
+    for seq in sequences:
+        if(seq['length'] >= cfg.min_sequence_length):
+            sequence_id_generated = f'{sequence_prefix}_{sequence_counter}'
+            seq['simple_id'] = sequence_id_generated
+            sequence_counter += 1
+
+            sequence_description = seq['description'].lower()
             if(cfg.complete):
-                contig['complete'] = True
-                contig['topology'] = bc.TOPOLOGY_CIRCULAR
-            elif('circular=true' in contig_description):  # detection of Unicycler circularized sequences
-                contig['complete'] = True
-                contig['topology'] = bc.TOPOLOGY_CIRCULAR
-                log.debug('qc: detected Unicycler circular topology via description: id=%s, description=%s', contig['id'], contig['description'])
-            elif('complete' in contig_description and 'complete=false' not in contig_description):  # detection of public/described sequences
-                contig['complete'] = True
-                contig['topology'] = bc.TOPOLOGY_CIRCULAR
-                log.debug('qc: detected complete replicon via description: id=%s, description=%s', contig['id'], contig['description'])
+                seq['complete'] = True
+                seq['topology'] = bc.TOPOLOGY_CIRCULAR
+            elif('circular=true' in sequence_description):  # detection of Unicycler circularized sequences
+                seq['complete'] = True
+                seq['topology'] = bc.TOPOLOGY_CIRCULAR
+                log.debug('qc: detected Unicycler circular topology via description: id=%s, description=%s', seq['id'], seq['description'])
+            elif('complete' in sequence_description and 'complete=false' not in sequence_description):  # detection of public/described sequences
+                seq['complete'] = True
+                seq['topology'] = bc.TOPOLOGY_CIRCULAR
+                log.debug('qc: detected complete replicon via description: id=%s, description=%s', seq['id'], seq['description'])
             
-            if('chromosome' in contig_description):
-                contig['type'] = bc.REPLICON_CHROMOSOME
-                log.debug('qc: detected chromosome replicon type via description: id=%s, description=%s', contig['id'], contig['description'])
-            elif('plasmid' in contig_description):
-                contig['type'] = bc.REPLICON_PLASMID
-                log.debug('qc: detected plasmid replicon type via description: id=%s, description=%s', contig['id'], contig['description'])
-
-            contig_desc = []
-            if(cfg.keep_contig_headers):
-                if(contig['id'] in contig_ids):
-                    log.error('Fasta import: duplicated contig id! contig-id=%s', contig['id'])
-                    sys.exit(f"ERROR: Detected duplicated contig id! Contig ID ({contig['id']}) occures multiple times!")
+            if('chromosome' in sequence_description):
+                seq['type'] = bc.REPLICON_CHROMOSOME
+                log.debug('qc: detected chromosome replicon type via description: id=%s, description=%s', seq['id'], seq['description'])
+            elif('plasmid' in sequence_description):
+                seq['type'] = bc.REPLICON_PLASMID
+                log.debug('qc: detected plasmid replicon type via description: id=%s, description=%s', seq['id'], seq['description'])
+
+            sequence_desc = []
+            if(cfg.keep_sequence_headers):
+                if(seq['id'] in sequence_ids):
+                    log.error('Fasta import: duplicated seq id! seq-id=%s', seq['id'])
+                    sys.exit(f"ERROR: Detected duplicated sequence id! Sequence ID ({seq['id']}) occures multiple times!")
                 else:
-                    contig_ids.add(contig['id'])
+                    sequence_ids.add(seq['id'])
             else:
-                contig['orig_id'] = contig['id']
-                contig['id'] = contig_id_generated
-                contig['orig_description'] = contig['description']
+                seq['orig_id'] = seq['id']
+                seq['id'] = sequence_id_generated
+                seq['orig_description'] = seq['description']
                 if(cfg.genus is not None or cfg.species is not None):
                     organism = ' '.join([t for t in [cfg.genus, cfg.species] if t is not None])
-                    contig_desc.append(f"[organism={organism}]")
+                    sequence_desc.append(f"[organism={organism}]")
                 if(cfg.strain):
-                    contig_desc.append(f'[strain={cfg.strain}]')
-                contig_desc.append(f'[gcode={cfg.translation_table}]')
-
-            if(contig['complete'] and contig['topology'] == bc.TOPOLOGY_CIRCULAR):  # detection of chromosomes/plasmids via sequence length thresholds
-                if(contig['length'] >= bc.REPLICON_LENGTH_THRESHOLD_CHROMOSOME):
-                    contig['type'] = bc.REPLICON_CHROMOSOME
-                    log.debug('qc: detected replicon type via length: id=%s, type=%s, length=%i, description=%s', contig['id'], contig['type'], contig['length'], contig['description'])
-                elif(contig['length'] < bc.REPLICON_LENGTH_THRESHOLD_PLASMID):
-                    contig['type'] = bc.REPLICON_PLASMID
-                    log.debug('qc: detected replicon type via length: id=%s, type=%s, length=%i, description=%s', contig['id'], contig['type'], contig['length'], contig['description'])
-            valid_contigs.append(contig)
-
-            if(len(contigs) == 1 and cfg.plasmid is not None):  # use plasmid mode
-                contig['type'] = bc.REPLICON_PLASMID
-                contig['topology'] = bc.TOPOLOGY_CIRCULAR
-                contig['name'] = cfg.plasmid
+                    sequence_desc.append(f'[strain={cfg.strain}]')
+                sequence_desc.append(f'[gcode={cfg.translation_table}]')
+
+            if(seq['complete'] and seq['topology'] == bc.TOPOLOGY_CIRCULAR):  # detection of chromosomes/plasmids via sequence length thresholds
+                if(seq['length'] >= bc.REPLICON_LENGTH_THRESHOLD_CHROMOSOME):
+                    seq['type'] = bc.REPLICON_CHROMOSOME
+                    log.debug('qc: detected replicon type via length: id=%s, type=%s, length=%i, description=%s', seq['id'], seq['type'], seq['length'], seq['description'])
+                elif(seq['length'] < bc.REPLICON_LENGTH_THRESHOLD_PLASMID):
+                    seq['type'] = bc.REPLICON_PLASMID
+                    log.debug('qc: detected replicon type via length: id=%s, type=%s, length=%i, description=%s', seq['id'], seq['type'], seq['length'], seq['description'])
+            valid_sequences.append(seq)
+
+            if(len(sequences) == 1 and cfg.plasmid is not None):  # use plasmid mode
+                seq['type'] = bc.REPLICON_PLASMID
+                seq['topology'] = bc.TOPOLOGY_CIRCULAR
+                seq['name'] = cfg.plasmid
             elif(replicons):  # use user provided replicon table
-                contig_id = contig['orig_id'] if 'orig_id' in contig else contig['id']
-                replicon = replicons.get(contig_id, None)
+                sequence_id = seq['orig_id'] if 'orig_id' in seq else seq['id']
+                replicon = replicons.get(sequence_id, None)
                 if(replicon):
-                    contig['type'] = replicon['replicon_type']
-                    contig['topology'] = replicon['topology']
-                    contig['complete'] = replicon['replicon_type'] != bc.REPLICON_CONTIG
+                    seq['type'] = replicon['replicon_type']
+                    seq['topology'] = replicon['topology']
+                    seq['complete'] = replicon['replicon_type'] != bc.REPLICON_CONTIG
                     if(replicon['name']):
-                        contig['name'] = replicon['name']
-                    if(not cfg.keep_contig_headers):
-                        contig['id'] = replicon['new_locus_id'] if replicon['new_locus_id'] else contig['simple_id']
-                    contig.pop('simple_id')
+                        seq['name'] = replicon['name']
+                    if(not cfg.keep_sequence_headers):
+                        seq['id'] = replicon['new_locus_id'] if replicon['new_locus_id'] else seq['simple_id']
+                    seq.pop('simple_id')
             
-            if(not cfg.keep_contig_headers):
-                if(contig['complete']):
-                    contig_desc.append('[completeness=complete]')
-                if(contig['topology'] != bc.REPLICON_CONTIG):
-                    contig_desc.append(f"[topology={contig['topology']}]")
-                if(contig['type'] == bc.REPLICON_CHROMOSOME):
-                    contig_desc.append('[location=chromosome]')
-                elif(contig['type'] == bc.REPLICON_PLASMID):
-                    if(not contig.get('name', None)):
-                        contig['name'] = f'unnamed{plasmid_number}'
+            if(not cfg.keep_sequence_headers):
+                if(seq['complete']):
+                    sequence_desc.append('[completeness=complete]')
+                if(seq['topology'] != bc.REPLICON_CONTIG):
+                    sequence_desc.append(f"[topology={seq['topology']}]")
+                if(seq['type'] == bc.REPLICON_CHROMOSOME):
+                    sequence_desc.append('[location=chromosome]')
+                elif(seq['type'] == bc.REPLICON_PLASMID):
+                    if(not seq.get('name', None)):
+                        seq['name'] = f'unnamed{plasmid_number}'
                         plasmid_number += 1
-                    contig_desc.append(f"[plasmid-name={contig['name']}]")
-                contig['description'] = ' '.join(list(dict.fromkeys(contig_desc)))  # remove duplicates remaining order
+                    sequence_desc.append(f"[plasmid-name={seq['name']}]")
+                seq['description'] = ' '.join(list(dict.fromkeys(sequence_desc)))  # remove duplicates while retaining order
 
-            if(contig['type'] == bc.REPLICON_CONTIG):
+            if(seq['type'] == bc.REPLICON_CONTIG):
                 complete_genome = False
 
             if(cfg.compliant):  # check INSDC compliance
-                if(len(contig['id']) > 25):  # max 25 characters
-                    log.error('INSDC compliance: contig id larger than 25! contig-id=%s', contig['id'])
-                    sys.exit(f"ERROR: INSDC compliance failed! Contig ID ({contig['id']}) larger than 25 characers!")
-                if(bc.RE_INSDC_ID.fullmatch(contig['id']) is None):  # invalid characters
-                    log.error('INSDC compliance: contig id contains invalid characters! contig-id=%s', contig['id'])
-                    sys.exit(f"ERROR: INSDC compliance failed! Contig ID ({contig['id']}) contains invalid characters!")
+                if(len(seq['id']) > 25):  # max 25 characters
+                    log.error('INSDC compliance: seq id larger than 25! seq-id=%s', seq['id'])
+                    sys.exit(f"ERROR: INSDC compliance failed! Sequence ID ({seq['id']}) larger than 25 characers!")
+                if(bc.RE_INSDC_ID.fullmatch(seq['id']) is None):  # invalid characters
+                    log.error('INSDC compliance: seq id contains invalid characters! seq-id=%s', seq['id'])
+                    sys.exit(f"ERROR: INSDC compliance failed! Sequence ID ({seq['id']}) contains invalid characters!")
 
             log.info(
                 "qc: revised sequence: id=%s, orig-id=%s, type=%s, complete=%s, topology=%s, name=%s, description='%s', orig-description='%s'",
-                contig['id'], contig.get('orig_id', ''), contig['type'], contig['complete'], contig['topology'], contig.get('name', ''), contig['description'], contig.get('orig_description', '')
+                seq['id'], seq.get('orig_id', ''), seq['type'], seq['complete'], seq['topology'], seq.get('name', ''), seq['description'], seq.get('orig_description', '')
             )
-    return valid_contigs, complete_genome
+    return valid_sequences, complete_genome
 
 
-def extract_feature_sequence(feature: dict, contig: dict) -> str:
+def extract_feature_sequence(feature: dict, sequence: dict) -> str:
     if(feature.get('edge', False)):
-        seq = contig['sequence'][feature['start']-1:] + contig['sequence'][:feature['stop']]
+        nt = sequence['sequence'][feature['start']-1:] + sequence['sequence'][:feature['stop']]
     else:
-        seq = contig['sequence'][feature['start']-1:feature['stop']]
+        nt = sequence['sequence'][feature['start']-1:feature['stop']]
     if(feature['strand'] == bc.STRAND_REVERSE):
-        seq = str(Seq(seq).reverse_complement())
-    return seq
+        nt = str(Seq(nt).reverse_complement())
+    return nt
diff --git a/scripts/extract-region.py b/scripts/extract-region.py
index b357b137..23da1605 100755
--- a/scripts/extract-region.py
+++ b/scripts/extract-region.py
@@ -43,9 +43,9 @@
 with genome_path.open() as fh:
     genome = json.load(fh)
 
-contig_id = args.sequence
-if(contig_id is None):  # take first sequence as default
-    contig_id = genome['sequences'][0]['id']
+sequence_id = args.sequence
+if(sequence_id is None):  # take first sequence as default
+    sequence_id = genome['sequences'][0]['id']
 
 prefix = args.prefix
 if(prefix is None):  # use input file prefix as default
@@ -55,14 +55,14 @@
 print('Extract features within selected region...')
 features_selected = []
 for feat in genome['features']:
-    if(feat['contig'] == contig_id):
+    if(feat['sequence'] == sequence_id):
         if(feat['start'] >= args.min  and  feat['stop'] <= args.max):
             features_selected.append(feat)
-features_by_contig = {contig_id: features_selected}  # needed for GFF3 export
+features_by_sequence = {sequence_id: features_selected}  # needed for GFF3 export
 print(f'\t...selected features: {len(features_selected)}')
 
 genome['features'] = features_selected
-genome['contigs'] = [sequence for sequence in genome['sequences'] if sequence['id'] == contig_id]
+genome['sequences'] = [sequence for sequence in genome['sequences'] if sequence['id'] == sequence_id]
 genome['genus'] = genome['genome']['genus']
 genome['species'] = genome['genome']['species']
 genome['strain'] = genome['genome']['strain']
@@ -76,7 +76,7 @@
 print('Write selected features...')
 output_path = Path(args.output).resolve()
 gff3_path = output_path.joinpath(f'{prefix}.gff3')
-gff.write_features(genome, features_by_contig, gff3_path)
+gff.write_features(genome, features_by_sequence, gff3_path)
 print('\t...INSDC GenBank & EMBL')
 genbank_path = output_path.joinpath(f'{prefix}.gbff')
 embl_path = output_path.joinpath(f'{prefix}.embl')
diff --git a/test/test_edge_features.py b/test/test_edge_features.py
index 38b381ce..35b164f3 100644
--- a/test/test_edge_features.py
+++ b/test/test_edge_features.py
@@ -7,7 +7,7 @@
 
 
 def test_bakta_edge_features(tmpdir):
-    # test edge lable on mock CDS contig
+    # test edge label on mock CDS sequence
     proc = run(
         [
             'bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--prefix', 'test',
@@ -26,9 +26,9 @@ def test_bakta_edge_features(tmpdir):
         results = json.load(fh)
 
     for feat in results['features']:
-        if(feat['contig'] != 'dummy'):
-            if('forward' in feat['contig']):
+        if(feat['sequence'] != 'dummy'):
+            if('forward' in feat['sequence']):
                 assert feat['strand'] == bc.STRAND_FORWARD
-            elif('reverse' in feat['contig']):
+            elif('reverse' in feat['sequence']):
                 assert feat['strand'] == bc.STRAND_REVERSE
-            assert feat.get('edge', False) == ('edge' in feat['contig'])
+            assert feat.get('edge', False) == ('edge' in feat['sequence'])
diff --git a/test/test_nt_sequences.py b/test/test_nt_sequences.py
index 93e2bd2f..534243b7 100644
--- a/test/test_nt_sequences.py
+++ b/test/test_nt_sequences.py
@@ -27,7 +27,7 @@ def test_bakta_cds_nt_sequence(tmpdir):
         results = json.load(fh)
 
     for feat in results['features']:
-        if(feat['contig'] != 'dummy'):
+        if(feat['sequence'] != 'dummy'):
             assert feat['nt'] == CDS
 
 
diff --git a/test/test_pseudo.py b/test/test_pseudo.py
index a117882f..c35199e1 100644
--- a/test/test_pseudo.py
+++ b/test/test_pseudo.py
@@ -12,7 +12,7 @@
             'MKEGQFVGY-FKMKEQRKIPLTHIMIIGAFIFAFLQVVLLASLVHAVNVNNEIQEGLFQSGRIMVESLQHILSVQTGIN',
             #            *
             {
-                'contig': 'foo',
+                'sequence': 'foo',
                 'start': 37,
                 'stop': 100,
                 'strand': bc.STRAND_FORWARD,
@@ -36,7 +36,7 @@
             'MTQRPWSKLQREIYDLLTPTINLQIHCTRYPMRSQNGGSTDLPRYWITLDKNVIWDYPKDFIAGNGGVRNFHGETCWYPYLTDICSISDLLREYIDTPKAELLTKQFTSDKWGLVNILRAADRRIGMRRLDQLRRKTHNIAAL\\KIIA\\AVANNYMPGVASYAG',
             'MTQRPWSKLQREIYDLLTPTINLQIHCTRYPMRSQNGGSTDLPRYWITLDKDVIWDYPKDFMAGNGGVRNFHGETCWYPYLTDICSISDLLREYIDTPKAELLTKQFTSDKWGLVNILRAADRRIGMRRLDQLRRKTHNIAAL-KIIA-PVANDYMPGVDSYAG',
             {
-                'contig': 'foo',
+                'sequence': 'foo',
                 'start': 1,
                 'stop': 100,
                 'strand': bc.STRAND_FORWARD,
@@ -59,7 +59,7 @@
             'MSLYIKLILSIVREISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNDAS*GSWYNF',
             'MPLYIKLILSIVRRISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNNASQGSWYNF',
 {
-                'contig': 'foo',
+                'sequence': 'foo',
                 'start': 1,
                 'stop': 100,
                 'strand': bc.STRAND_FORWARD,
@@ -82,7 +82,7 @@
             'MSLYIKLILSIVREISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNDAS*GSWYNF',
             'MPLYIKLILSIVRRISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNNASUGSWYNF',
             {
-                'contig': 'foo',
+                'sequence': 'foo',
                 'start': 1,
                 'stop': 100,
                 'strand': bc.STRAND_FORWARD,
@@ -105,7 +105,7 @@
             'MSLYIKLILSIVREISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNDAS*GSWYNF',
             'MPLYIKLILSIVRRISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNNASOGSWYNF',
             {
-                'contig': 'foo',
+                'sequence': 'foo',
                 'start': 1,
                 'stop': 100,
                 'strand': bc.STRAND_FORWARD,
@@ -128,7 +128,7 @@
             'MLSIQSNRDWLSMSIFSDYSSSSEMHNNLTIDYYLALSSTKGSGITNIISIILQQAQDYDVAKIT',
             'MLSIQSNRDWLSASIFSDYSSSSEMHNNLTIDYYLALSSTKGSGITNIISIILQQAQDYDVAKIT',
             {
-                'contig': 'foo',
+                'sequence': 'foo',
                 'start': 40,
                 'stop': 100,
                 'strand': bc.STRAND_FORWARD,
@@ -153,7 +153,7 @@
             'MKEGQFVGY-FKMKEQRKIPLTHIMIIGAFIFAFLQVVLLASLVHAVNVNNEIQEGLFQSGRIMVESLQHILSVQTGIN',
             #            *
             {
-                'contig': 'foo',
+                'sequence': 'foo',
                 'start': 10,
                 'stop': 200,
                 'strand': bc.STRAND_REVERSE,
@@ -177,7 +177,7 @@
             'MTQRPWSKLQREIYDLLTPTINLQIHCTRYPMRSQNGGSTDLPRYWITLDKNVIWDYPKDFIAGNGGVRNFHGETCWYPYLTDICSISDLLREYIDTPKAELLTKQFTSDKWGLVNILRAADRRIGMRRLDQLRRKTHNIAAL\\KIIA\\AVANNYMPGVASYAG',
             'MTQRPWSKLQREIYDLLTPTINLQIHCTRYPMRSQNGGSTDLPRYWITLDKDVIWDYPKDFMAGNGGVRNFHGETCWYPYLTDICSISDLLREYIDTPKAELLTKQFTSDKWGLVNILRAADRRIGMRRLDQLRRKTHNIAAL-KIIA-PVANDYMPGVDSYAG',
             {
-                'contig': 'foo',
+                'sequence': 'foo',
                 'start': 1,
                 'stop': 500,
                 'strand': bc.STRAND_REVERSE,
@@ -200,7 +200,7 @@
             'MSLYIKLILSIVREISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNDAS*GSWYNF',
             'MPLYIKLILSIVRRISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNNASQGSWYNF',
 {
-                'contig': 'foo',
+                'sequence': 'foo',
                 'start': 100,
                 'stop': 500,
                 'strand': bc.STRAND_REVERSE,
@@ -223,7 +223,7 @@
             'MLSIQSNRDWLSMSIFSDYSSSSEMHNNLTIDYYLALSSTKGSGITNIISIILQQAQDYDVAKIT',
             'MLSIQSNRDWLSASIFSDYSSSSEMHNNLTIDYYLALSSTKGSGITNIISIILQQAQDYDVAKIT',
             {
-                'contig': 'foo',
+                'sequence': 'foo',
                 'start': 40,
                 'stop': 100,
                 'strand': bc.STRAND_REVERSE,
@@ -262,7 +262,7 @@ def test_compare_alignments(alignment, ref_alignment, cds, coordinates, expected
     assert observations == expected_result
 
 
-@pytest.mark.parametrize('cds, contig, expected_result', [
+@pytest.mark.parametrize('cds, sequence, expected_result', [
         (
             {
               'start': 310,  # linear fits cutoff
@@ -327,5 +327,5 @@ def test_compare_alignments(alignment, ref_alignment, cds, coordinates, expected
         )
     ]
 )
-def test_get_elongated_cds(cds, contig, expected_result):
-    assert feat_cds.get_elongated_cds(cds, contig, offset=300) == expected_result
+def test_get_elongated_cds(cds, sequence, expected_result):
+    assert feat_cds.get_elongated_cds(cds, sequence, offset=300) == expected_result
diff --git a/test/test_regions.py b/test/test_regions.py
index 3c7ad373..d02bdb20 100644
--- a/test/test_regions.py
+++ b/test/test_regions.py
@@ -29,8 +29,8 @@ def test_wrong_seq_id_failiing(tmpdir):
 @pytest.mark.parametrize(
     'keep_contig_headers',
     [
-        ([]),  # autogenerate contig ids
-        (['--keep-contig-headers'])  # keep contig headers
+        ([]),  # autogenerate sequence ids
+        (['--keep-contig-headers'])  # keep sequence headers
     ]
 )
 def test_regions_plasmid(regions, keep_contig_headers, tmpdir):
diff --git a/test/test_sORF.py b/test/test_sORF.py
index 72214a6a..56c21c9a 100644
--- a/test/test_sORF.py
+++ b/test/test_sORF.py
@@ -22,13 +22,13 @@
 }
 
 GENOME_1 = {
-    'contigs': [CONTIG_1]
+    'sequences': [CONTIG_1]
 }
 GENOME_2 = {
-    'contigs': [CONTIG_2]
+    'sequences': [CONTIG_2]
 }
 GENOME_3 = {
-    'contigs': [CONTIG_3]
+    'sequences': [CONTIG_3]
 }
 
 

From ec091963b1b744d820196f6ca7944a759651c0d0 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 15 Oct 2024 13:20:28 +0200
Subject: [PATCH 2/8] rename seq 'sequence' attribute to 'nt'

---
 bakta/features/cds.py   |  8 ++++----
 bakta/features/gaps.py  |  4 ++--
 bakta/features/s_orf.py |  2 +-
 bakta/io/fasta.py       |  6 +++---
 bakta/io/gff.py         |  2 +-
 bakta/io/insdc.py       |  2 +-
 bakta/plot.py           |  6 +++---
 bakta/utils.py          | 10 +++++-----
 8 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index 177dda58..0475ff67 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -42,7 +42,7 @@ def predict(genome: dict):
         if(not prodigal_metamode):
             log.info('create prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed)
             gene_finder = pyrodigal.GeneFinder(meta=prodigal_metamode, closed=closed)
-            seqs = [seq['sequence'] for seq in genome['sequences']]
+            seqs = [seq['nt'] for seq in genome['sequences']]
             trainings_info = gene_finder.train(*seqs, translation_table=cfg.translation_table)
         else:
             log.info('skip creation of prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed)
@@ -64,7 +64,7 @@ def predict(genome: dict):
             gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=True, mask=True)
         else:
             gene_finder = pyrodigal.GeneFinder(trainings_info, meta=False, closed=True, mask=True)
-        sequences = [seq['sequence'] for seq in linear_sequences]
+        sequences = [seq['nt'] for seq in linear_sequences]
         with cf.ThreadPoolExecutor(max_workers=cfg.threads) as tpe:
             for seq, genes in zip(linear_sequences, tpe.map(gene_finder.find_genes, sequences)):
                 cdss_per_sequence = create_cdss(genes, seq)
@@ -77,7 +77,7 @@ def predict(genome: dict):
             gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=False, mask=True)
         else:
             gene_finder = pyrodigal.GeneFinder(trainings_info, meta=False, closed=False, mask=True)
-        sequences = [seq['sequence'] for seq in circular_sequences]
+        sequences = [seq['nt'] for seq in circular_sequences]
         with cf.ThreadPoolExecutor(max_workers=cfg.threads) as tpe:
             for seq, genes in zip(circular_sequences, tpe.map(gene_finder.find_genes, sequences)):
                 cdss_per_sequence = create_cdss(genes, seq)
@@ -790,7 +790,7 @@ def get_elongated_cds(cds: dict, sequence: dict, offset: int = bc.PSEUDOGENE_OFF
         'elongation_downstream': offset
     }
 
-    sequence_length = len(sequence['sequence'])
+    sequence_length = len(sequence['nt'])
     if sequence['topology'] == 'circular' and elongated_cds['start'] - offset < 0:
         elongated_cds['start'] = sequence_length + elongated_cds['start'] - offset
         elongated_cds['edge'] = True
diff --git a/bakta/features/gaps.py b/bakta/features/gaps.py
index 5e052de6..0e2610ba 100644
--- a/bakta/features/gaps.py
+++ b/bakta/features/gaps.py
@@ -14,7 +14,7 @@
 def detect_assembly_gaps(genome: dict) -> Sequence[dict]:
     gaps = []
     for seq in genome['sequences']:
-        m = RE_ASSEMBLY_GAP.search(seq['sequence'])
+        m = RE_ASSEMBLY_GAP.search(seq['nt'])
         while m:
             start, end = m.span()
 
@@ -31,5 +31,5 @@ def detect_assembly_gaps(genome: dict) -> Sequence[dict]:
                 'seq=%s, start=%i, stop=%i, length=%s',
                 gap['sequence'], gap['start'], gap['stop'], gap['length']
             )
-            m = RE_ASSEMBLY_GAP.search(seq['sequence'], end + 1)
+            m = RE_ASSEMBLY_GAP.search(seq['nt'], end + 1)
     return gaps
diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py
index 500d52ef..1ae202d4 100644
--- a/bakta/features/s_orf.py
+++ b/bakta/features/s_orf.py
@@ -24,7 +24,7 @@ def extract(genome: dict):
     """Predict open reading frames in mem via BioPython."""
     orfs = []
     for seq in genome['sequences']:
-        nt_seq = Seq(seq['sequence'])
+        nt_seq = Seq(seq['nt'])
         for strand, strand_nt_seq in [(bc.STRAND_FORWARD, nt_seq), (bc.STRAND_REVERSE, nt_seq.reverse_complement())]:  # strands +/-
             for frame in range(3):  # frames 1/2/3 -> 0, 1, 2
                 seq_frame = strand_nt_seq[frame:]
diff --git a/bakta/io/fasta.py b/bakta/io/fasta.py
index 076eae4d..7a824ef3 100644
--- a/bakta/io/fasta.py
+++ b/bakta/io/fasta.py
@@ -43,7 +43,7 @@ def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=T
             sequence = {
                 'id': record.id,
                 'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else '',
-                'sequence': raw_sequence,
+                'nt': raw_sequence,
                 'length': len(raw_sequence)
             }
             if(is_genomic):
@@ -69,9 +69,9 @@ def export_sequences(sequences: Sequence[dict], fasta_path: Path, description: b
             else:
                 fh.write(f">{seq['id']}\n")
             if(wrap):
-                fh.write(wrap_sequence(seq['sequence']))
+                fh.write(wrap_sequence(seq['nt']))
             else:
-                fh.write(seq['sequence'])
+                fh.write(seq['nt'])
                 fh.write('\n')
 
 
diff --git a/bakta/io/gff.py b/bakta/io/gff.py
index 9d7cd355..741e1989 100644
--- a/bakta/io/gff.py
+++ b/bakta/io/gff.py
@@ -355,7 +355,7 @@ def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_pat
             fh.write('##FASTA\n')
             for seq in genome['sequences']:  # write sequences
                 fh.write(f">{seq['id']}\n")
-                fh.write(fasta.wrap_sequence(seq['sequence']))
+                fh.write(fasta.wrap_sequence(seq['nt']))
     return
 
 
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
index d1d380e0..aed9f16d 100644
--- a/bakta/io/insdc.py
+++ b/bakta/io/insdc.py
@@ -82,7 +82,7 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path:
         if(len(description) > 0 and description[0] == ' '):  # discard potential leading whitespace
             description = description[1:]
 
-        sequence_record = SeqIO.SeqRecord(id=seq['id'], name=seq['id'], description=description, annotations=sequence_annotations, seq=Seq(seq['sequence']))
+        sequence_record = SeqIO.SeqRecord(id=seq['id'], name=seq['id'], description=description, annotations=sequence_annotations, seq=Seq(seq['nt']))
 
         source = SeqFeature(FeatureLocation(0, seq['length'], strand=+1), type='source', qualifiers=source_qualifiers)
         seq_feature_list = [source]
diff --git a/bakta/plot.py b/bakta/plot.py
index 37c2a3a2..6e1a4e24 100644
--- a/bakta/plot.py
+++ b/bakta/plot.py
@@ -338,11 +338,11 @@ def write_gc_content_skew(sequences, circos_path, colors):
     max_gc = 0
     max_gc_skew = 0
     if float(bp.__version__) >= 1.80:
-        gc_mean = SeqUtils.gc_fraction(''.join([seq['sequence'] for seq in sequences]))
+        gc_mean = SeqUtils.gc_fraction(''.join([seq['nt'] for seq in sequences]))
     else:
-        gc_mean = SeqUtils.GC(''.join([seq['sequence'] for seq in sequences])) / 100
+        gc_mean = SeqUtils.GC(''.join([seq['nt'] for seq in sequences])) / 100
     for seq in sequences:
-        nt = seq['sequence']
+        nt = seq['nt']
         for w in range(0, len(nt), step_size):
             start = w - window_size
             if start < 0:
diff --git a/bakta/utils.py b/bakta/utils.py
index 04d02704..c6f97cd7 100644
--- a/bakta/utils.py
+++ b/bakta/utils.py
@@ -265,7 +265,7 @@ def create_locus_tag_prefix(sequences: Sequence[dict], length: int=6) -> str:
     """Create either genus/species or sequence MD5 hex based locus tag prefix."""
     hash = hashlib.md5()
     for seq in sequences:
-        hash.update(str.encode(seq['sequence']))
+        hash.update(str.encode(seq['nt']))
     hexdigest = hash.hexdigest().upper()
     locus_prefix_chars = []
     i = 0
@@ -301,7 +301,7 @@ def calc_genome_stats(genome: dict, features: Sequence[dict]):
     gc_sum = 0
     n_sum = 0
     for seq in genome['sequences']:
-        nt = seq['sequence']
+        nt = seq['nt']
         gc_sum += nt.count('G') + nt.count('C')
         n_sum += nt.count('N')
     gc_ratio = gc_sum / (genome_size - n_sum)
@@ -315,7 +315,7 @@ def calc_genome_stats(genome: dict, features: Sequence[dict]):
     n50 = 0
     sequence_length_sum = 0
     for seq in sorted(genome['sequences'], key=lambda x: x['length'], reverse=True):
-        nt_length = len(seq['sequence'])
+        nt_length = len(seq['nt'])
         sequence_length_sum += nt_length
         if(sequence_length_sum >= genome_size / 2):
             n50 = nt_length
@@ -500,9 +500,9 @@ def qc_sequences(sequences: Sequence[dict], replicons: Dict[str, dict]) -> Tuple
 
 def extract_feature_sequence(feature: dict, sequence: dict) -> str:
     if(feature.get('edge', False)):
-        nt = sequence['sequence'][feature['start']-1:] + sequence['sequence'][:feature['stop']]
+        nt = sequence['nt'][feature['start']-1:] + sequence['nt'][:feature['stop']]
     else:
-        nt = sequence['sequence'][feature['start']-1:feature['stop']]
+        nt = sequence['nt'][feature['start']-1:feature['stop']]
     if(feature['strand'] == bc.STRAND_REVERSE):
         nt = str(Seq(nt).reverse_complement())
     return nt

From 88850bc6ea37bb49ac765417de077c5992a583d8 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 15 Oct 2024 15:24:29 +0200
Subject: [PATCH 3/8] rename genome data structure to data

---
 bakta/features/annotation.py        | 34 +++++------
 bakta/features/cds.py               | 42 +++++++-------
 bakta/features/crispr.py            |  4 +-
 bakta/features/gaps.py              |  4 +-
 bakta/features/nc_rna.py            |  8 +--
 bakta/features/nc_rna_region.py     |  8 +--
 bakta/features/ori.py               |  4 +-
 bakta/features/r_rna.py             |  8 +--
 bakta/features/s_orf.py             | 34 +++++------
 bakta/features/t_rna.py             |  4 +-
 bakta/features/tm_rna.py            |  4 +-
 bakta/io.py                         | 24 ++++----
 bakta/io/gff.py                     | 10 ++--
 bakta/io/insdc.py                   | 18 +++---
 bakta/io/json.py                    | 36 ++++++------
 bakta/main.py                       | 90 ++++++++++++++---------------
 bakta/utils.py                      | 18 +++---
 scripts/collect-annotation-stats.py | 52 ++++++++---------
 scripts/extract-region.py           | 28 ++++-----
 19 files changed, 215 insertions(+), 215 deletions(-)

diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py
index 3cfb03ab..63165b47 100644
--- a/bakta/features/annotation.py
+++ b/bakta/features/annotation.py
@@ -145,46 +145,46 @@ def combine_annotation(feature: dict):
     feature['db_xrefs'] = sorted(list(db_xrefs))
 
 
-def detect_feature_overlaps(genome: dict):
+def detect_feature_overlaps(data: dict):
     """Apply feature type specific hierarchical feature overlap filters.
     tRNA < tmRNA
     CDS < tmRNA, tRNA, rRNA, CRISPR
     sORF < mRNA, tRNA, rRNA, CRISPR, CDS (in-frame & entirely overlapping), sORF (shorter, weaker annotations)
     """
-    sequence_t_rnas = {k['id']: [] for k in genome['sequences']}
-    for t_rna in genome['features'].get(bc.FEATURE_T_RNA, []):
+    sequence_t_rnas = {k['id']: [] for k in data['sequences']}
+    for t_rna in data['features'].get(bc.FEATURE_T_RNA, []):
         t_rnas = sequence_t_rnas[t_rna['sequence']]
         t_rnas.append(t_rna)
-    sequence_tm_rnas = {k['id']: [] for k in genome['sequences']}
-    for tm_rna in genome['features'].get(bc.FEATURE_TM_RNA, []):
+    sequence_tm_rnas = {k['id']: [] for k in data['sequences']}
+    for tm_rna in data['features'].get(bc.FEATURE_TM_RNA, []):
         tm_rnas = sequence_tm_rnas[tm_rna['sequence']]
         tm_rnas.append(tm_rna)
-    sequence_r_rnas = {k['id']: [] for k in genome['sequences']}
-    for r_rna in genome['features'].get(bc.FEATURE_R_RNA, []):
+    sequence_r_rnas = {k['id']: [] for k in data['sequences']}
+    for r_rna in data['features'].get(bc.FEATURE_R_RNA, []):
         r_rnas = sequence_r_rnas[r_rna['sequence']]
         r_rnas.append(r_rna)
-    sequence_ncrna_regions = {k['id']: [] for k in genome['sequences']}
-    for ncRNA_region in genome['features'].get(bc.FEATURE_NC_RNA_REGION, []):
+    sequence_ncrna_regions = {k['id']: [] for k in data['sequences']}
+    for ncRNA_region in data['features'].get(bc.FEATURE_NC_RNA_REGION, []):
         ncRNA_regions = sequence_ncrna_regions[ncRNA_region['sequence']]
         ncRNA_regions.append(ncRNA_region)
-    sequence_crispr_arrays = {k['id']: [] for k in genome['sequences']}
-    for crispr_array in genome['features'].get(bc.FEATURE_CRISPR, []):
+    sequence_crispr_arrays = {k['id']: [] for k in data['sequences']}
+    for crispr_array in data['features'].get(bc.FEATURE_CRISPR, []):
         crispr_arrays = sequence_crispr_arrays[crispr_array['sequence']]
         crispr_arrays.append(crispr_array)
-    sequence_cdss = {k['id']: [] for k in genome['sequences']}
-    sequence_cdss_user_provided = {k['id']: [] for k in genome['sequences']}
-    for cds in genome['features'].get(bc.FEATURE_CDS, []):
+    sequence_cdss = {k['id']: [] for k in data['sequences']}
+    sequence_cdss_user_provided = {k['id']: [] for k in data['sequences']}
+    for cds in data['features'].get(bc.FEATURE_CDS, []):
         if(cds.get('source', None) == bc.CDS_SOURCE_USER):
             cdss = sequence_cdss_user_provided[cds['sequence']]
         else:
             cdss = sequence_cdss[cds['sequence']]
         cdss.append(cds)
-    sequence_sorfs = {k['id']: [] for k in genome['sequences']}
-    for sorf in genome['features'].get(bc.FEATURE_SORF, []):
+    sequence_sorfs = {k['id']: [] for k in data['sequences']}
+    for sorf in data['features'].get(bc.FEATURE_SORF, []):
         sorfs = sequence_sorfs[sorf['sequence']]
         sorfs.append(sorf)
 
-    for seq in genome['sequences']:  # find feature overlaps sequence-wise to increase the performance
+    for seq in data['sequences']:  # find feature overlaps sequence-wise to increase the performance
         log.debug('filter features on seq: %s', seq['id'])
 
         # mark tRNAs overlapping with tmRNAs
diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index 0475ff67..781f95f4 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -30,19 +30,19 @@
 log = logging.getLogger('CDS')
 
 
-def predict(genome: dict):
+def predict(data: dict):
     """Predict open reading frames with Pyrodigal."""
     # create Pyrodigal trainining file if not provided by the user
     prodigal_tf_path = cfg.prodigal_tf
     trainings_info = None
-    prodigal_metamode = cfg.meta  or  genome['size'] < pyrodigal.MIN_SINGLE_GENOME  # 20_000 bp
+    prodigal_metamode = cfg.meta  or  data['size'] < pyrodigal.MIN_SINGLE_GENOME  # 20_000 bp
     log.debug('prodigal mode: meta=%s', prodigal_metamode)
     if(prodigal_tf_path is None):
-        closed = not genome['complete']
+        closed = not data['complete']
         if(not prodigal_metamode):
             log.info('create prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed)
             gene_finder = pyrodigal.GeneFinder(meta=prodigal_metamode, closed=closed)
-            seqs = [seq['nt'] for seq in genome['sequences']]
+            seqs = [seq['nt'] for seq in data['sequences']]
             trainings_info = gene_finder.train(*seqs, translation_table=cfg.translation_table)
         else:
             log.info('skip creation of prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed)
@@ -58,7 +58,7 @@ def predict(genome: dict):
 
     cdss = []
     # predict genes on linear sequences
-    linear_sequences = [seq for seq in genome['sequences'] if seq['topology'] == bc.TOPOLOGY_LINEAR]
+    linear_sequences = [seq for seq in data['sequences'] if seq['topology'] == bc.TOPOLOGY_LINEAR]
     if(len(linear_sequences) > 0):
         if prodigal_metamode:
             gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=True, mask=True)
@@ -71,7 +71,7 @@ def predict(genome: dict):
                 cdss.extend(cdss_per_sequence)
 
     # predict genes on circular replicons (chromosomes/plasmids)
-    circular_sequences = [seq for seq in genome['sequences'] if seq['topology'] == bc.TOPOLOGY_CIRCULAR]
+    circular_sequences = [seq for seq in data['sequences'] if seq['topology'] == bc.TOPOLOGY_CIRCULAR]
     if(len(circular_sequences) > 0):
         if prodigal_metamode:
             gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=False, mask=True)
@@ -182,14 +182,14 @@ def create_cdss(genes, sequence):
     return cdss_per_sequence
 
 
-def import_user_cdss(genome: dict, import_path: Path):
+def import_user_cdss(data: dict, import_path: Path):
     """Import user-provided CDS regions.
     Only CDS region information are imported skipping any existing functional annotations.
     
     Parameters
     ----------
-    genome : dict
-        Genome dictionary holding sequence information
+    data : dict
+        data dictionary holding sequence information
     import_path : Path
         Path to GFF3 or Genbank file with regions or features.
 
@@ -200,9 +200,9 @@ def import_user_cdss(genome: dict, import_path: Path):
     """
     user_cdss = []
     if(cfg.keep_sequence_headers):
-        sequences_by_id = {seq['id']: seq for seq in genome['sequences']}  # use ID as it's not altered -> no 'orig_id' field
+        sequences_by_id = {seq['id']: seq for seq in data['sequences']}  # use ID as it's not altered -> no 'orig_id' field
     else:
-        sequences_by_id = {seq['orig_id']: seq for seq in genome['sequences']}  # use 'orig_id' instead of autogenerated new 'id'
+        sequences_by_id = {seq['orig_id']: seq for seq in data['sequences']}  # use 'orig_id' instead of autogenerated new 'id'
     file_suffix = import_path.suffix.lower()
     if(file_suffix in ['.gff', '.gff3']):  # parse GFF3 format
         try:
@@ -401,22 +401,22 @@ def analyze_proteins(cdss: Sequence[dict]):
         cds['seq_stats'] = seq_stats
 
 
-def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]):
+def revise_translational_exceptions(data: dict, cdss: Sequence[dict]):
     """
     Revise translational exceptions as for istance selenocystein proteins.
     """
     no_revised = 0
-    if(bc.FEATURE_NC_RNA_REGION not in genome['features']):  # check if ncRNA regions have been detected, otherwise skip analysis and return
+    if(bc.FEATURE_NC_RNA_REGION not in data['features']):  # check if ncRNA regions have been detected, otherwise skip analysis and return
         return no_revised
 
-    sequences = {seq['id']: seq for seq in genome['sequences']}
+    sequences = {seq['id']: seq for seq in data['sequences']}
     # detect splitted orphan ORFs of selenocystein proteins that are subject to stop codon recoding.
-    cdss_per_sequences = {k['id']: [] for k in genome['sequences']}  # get CDS per sequence
+    cdss_per_sequences = {k['id']: [] for k in data['sequences']}  # get CDS per sequence
     for cds in cdss:
         cdss_per_sequence = cdss_per_sequences[cds['sequence']]
         if('truncated' not in cds):  # exclude truncated CDS for now
             cdss_per_sequence.append(cds)
-    cds_pairs_per_sequence = {k['id']: [] for k in genome['sequences']}  # extract inframe primate CDS neighbouring pairs
+    cds_pairs_per_sequence = {k['id']: [] for k in data['sequences']}  # extract inframe primate CDS neighbouring pairs
     for id, cdss_per_sequence in cdss_per_sequences.items():
         cdss_per_sequence = sorted(cdss_per_sequence, key=lambda k: k['start'])
         for i in range(1, len(cdss_per_sequence)):
@@ -432,7 +432,7 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]):
                 cds_pairs = cds_pairs_per_sequence[cds_a['sequence']]
                 cds_pairs.append((cds_a, cds_b))
 
-    recoding_regions = [ncrna_region for ncrna_region in genome['features'][bc.FEATURE_NC_RNA_REGION] if ncrna_region['class'] == so.SO_CIS_REG_RECODING_STIMULATION_REGION]  #  Selenocysteine insertion sequences
+    recoding_regions = [ncrna_region for ncrna_region in data['features'][bc.FEATURE_NC_RNA_REGION] if ncrna_region['class'] == so.SO_CIS_REG_RECODING_STIMULATION_REGION]  #  Selenocysteine insertion sequences
     for recoding_region in recoding_regions:
         if('selenocysteine' in recoding_region.get('product', '').lower()):
             cds_pairs = cds_pairs_per_sequence[recoding_region['sequence']]
@@ -488,13 +488,13 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]):
     return no_revised
 
 
-def revise_special_cases_annotated(genome: dict, cdss: Sequence[dict]):
+def revise_special_cases_annotated(data: dict, cdss: Sequence[dict]):
     """
     Revise rare but known special cases as for istance supposedly truncated dnaA genes on rotated chromosome starts
     which often appear on re-annotated genomes.
     """
     
-    sequences = {seq['id']: seq for seq in genome['sequences']}
+    sequences = {seq['id']: seq for seq in data['sequences']}
     # look for supposedly truncated dnaA genes on rotated chromosome starts: start=1, strand=+
     dnaA = None
     for cds in cdss:
@@ -611,7 +611,7 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]:
     return pseudo_candidates
 
 
-def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome: dict) -> Sequence[dict]:
+def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], data: dict) -> Sequence[dict]:
     """
     Conduct a BLASTX search of 5'/3'-extended sequences of pseudogene candidates against matching PSCs.
     Search for and determine possible pseudogenization causes in the resulting alignments.
@@ -627,7 +627,7 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome:
             fh.write(f">{cluster_id}\n{faa_seq}\n")
 
     # Get extended cds sequences
-    sequences = {seq['id']: seq for seq in genome['sequences']}
+    sequences = {seq['id']: seq for seq in data['sequences']}
     candidates_extended_positions = {}
     with candidates_elongated_sequences_path.open(mode='w') as fh:
         for cds in candidates:
diff --git a/bakta/features/crispr.py b/bakta/features/crispr.py
index 02b41c20..5214a1d3 100644
--- a/bakta/features/crispr.py
+++ b/bakta/features/crispr.py
@@ -17,7 +17,7 @@
 log = logging.getLogger('CRISPR')
 
 
-def predict_crispr(genome: dict, sequences_path: Path):
+def predict_crispr(data: dict, sequences_path: Path):
     """Predict CRISPR arrays with PILER-CR."""
 
     output_path = cfg.tmp_path.joinpath('crispr.txt')
@@ -44,7 +44,7 @@ def predict_crispr(genome: dict, sequences_path: Path):
 
     # parse crispr arrays
     crispr_arrays = {}
-    sequences = {seq['id']: seq for seq in genome['sequences']}
+    sequences = {seq['id']: seq for seq in data['sequences']}
     with output_path.open() as fh:
         output_section = None
         sequence_id = None
diff --git a/bakta/features/gaps.py b/bakta/features/gaps.py
index 0e2610ba..699a01f7 100644
--- a/bakta/features/gaps.py
+++ b/bakta/features/gaps.py
@@ -11,9 +11,9 @@
 RE_ASSEMBLY_GAP = re.compile(r'N{1,}', flags=0)
 
 
-def detect_assembly_gaps(genome: dict) -> Sequence[dict]:
+def detect_assembly_gaps(data: dict) -> Sequence[dict]:
     gaps = []
-    for seq in genome['sequences']:
+    for seq in data['sequences']:
         m = RE_ASSEMBLY_GAP.search(seq['nt'])
         while m:
             start, end = m.span()
diff --git a/bakta/features/nc_rna.py b/bakta/features/nc_rna.py
index 70050e09..4c669674 100644
--- a/bakta/features/nc_rna.py
+++ b/bakta/features/nc_rna.py
@@ -17,7 +17,7 @@
 log = logging.getLogger('NC_RNA')
 
 
-def predict_nc_rnas(genome: dict, sequences_path: Path):
+def predict_nc_rnas(data: dict, sequences_path: Path):
     """Search for non-coding RNA genes."""
 
     output_path = cfg.tmp_path.joinpath('ncrna-genes.tsv')
@@ -31,9 +31,9 @@ def predict_nc_rnas(genome: dict, sequences_path: Path):
         '--cpu', str(cfg.threads),
         '--tblout', str(output_path)
     ]
-    if(genome['size'] >= 1000000):
+    if(data['size'] >= 1000000):
         cmd.append('-Z')
-        cmd.append(str(2 * genome['size'] // 1000000))
+        cmd.append(str(2 * data['size'] // 1000000))
     cmd.append(str(cfg.db_path.joinpath('ncRNA-genes')))
     cmd.append(str(sequences_path))
     log.debug('cmd=%s', cmd)
@@ -61,7 +61,7 @@ def predict_nc_rnas(genome: dict, sequences_path: Path):
                 rfam2go[rfam] = [go]
 
     ncrnas = []
-    sequences = {seq['id']: seq for seq in genome['sequences']}
+    sequences = {seq['id']: seq for seq in data['sequences']}
     with output_path.open() as fh:
         for line in fh:
             if(line[0] != '#'):
diff --git a/bakta/features/nc_rna_region.py b/bakta/features/nc_rna_region.py
index b5e3500e..ec8ce92f 100644
--- a/bakta/features/nc_rna_region.py
+++ b/bakta/features/nc_rna_region.py
@@ -16,7 +16,7 @@
 log = logging.getLogger('NC_RNA_REGION')
 
 
-def predict_nc_rna_regions(genome: dict, sequences_path: Path):
+def predict_nc_rna_regions(data: dict, sequences_path: Path):
     """Search for non-coding RNA regions."""
 
     output_path = cfg.tmp_path.joinpath('ncrna-regions.tsv')
@@ -30,9 +30,9 @@ def predict_nc_rna_regions(genome: dict, sequences_path: Path):
         '--cpu', str(cfg.threads),
         '--tblout', str(output_path)
     ]
-    if(genome['size'] >= 1000000):
+    if(data['size'] >= 1000000):
         cmd.append('-Z')
-        cmd.append(str(2 * genome['size'] // 1000000))
+        cmd.append(str(2 * data['size'] // 1000000))
     cmd.append(str(cfg.db_path.joinpath('ncRNA-regions')))
     cmd.append(str(sequences_path))
     log.debug('cmd=%s', cmd)
@@ -60,7 +60,7 @@ def predict_nc_rna_regions(genome: dict, sequences_path: Path):
                 rfam2go[rfam] = [go]
 
     ncrnas = []
-    sequences = {seq['id']: seq for seq in genome['sequences']}
+    sequences = {seq['id']: seq for seq in data['sequences']}
     with output_path.open() as fh:
         for line in fh:
             if(line[0] != '#'):
diff --git a/bakta/features/ori.py b/bakta/features/ori.py
index bc7c6e59..04ebb35f 100644
--- a/bakta/features/ori.py
+++ b/bakta/features/ori.py
@@ -18,7 +18,7 @@
 log = logging.getLogger('ORI')
 
 
-def predict_oris(genome: dict, sequences_path: Path, ori_type: str) -> Sequence[dict]:
+def predict_oris(data: dict, sequences_path: Path, ori_type: str) -> Sequence[dict]:
     """Search for oriT/C sequences."""
 
     database = 'oric.fna' if ori_type == bc.FEATURE_ORIC else 'orit.fna'
@@ -78,7 +78,7 @@ def predict_oris(genome: dict, sequences_path: Path, ori_type: str) -> Sequence[
 
     # combine overlapping hits (simple 1D array peak detection)
     oris = []
-    for seq in genome['sequences']:
+    for seq in data['sequences']:
         sequence_hits = hits.get(seq['id'], None)
         if(sequence_hits):
             region_hits = [0] * (seq['length'] + 1)  # init with extra leading slot (start at 1)
diff --git a/bakta/features/r_rna.py b/bakta/features/r_rna.py
index 640ebfeb..7847921a 100644
--- a/bakta/features/r_rna.py
+++ b/bakta/features/r_rna.py
@@ -17,7 +17,7 @@
 log = logging.getLogger('R_RNA')
 
 
-def predict_r_rnas(genome: dict, sequences_path: Path):
+def predict_r_rnas(data: dict, sequences_path: Path):
     """Search for ribosomal RNA sequences."""
 
     output_path = cfg.tmp_path.joinpath('rrna.tsv')
@@ -31,9 +31,9 @@ def predict_r_rnas(genome: dict, sequences_path: Path):
         '--cpu', str(cfg.threads),
         '--tblout', str(output_path)
     ]
-    if(genome['size'] >= 1000000):
+    if(data['size'] >= 1000000):
         cmd.append('-Z')
-        cmd.append(str(2 * genome['size'] // 1000000))
+        cmd.append(str(2 * data['size'] // 1000000))
     cmd.append(str(cfg.db_path.joinpath('rRNA')))
     cmd.append(str(sequences_path))
     log.debug('cmd=%s', cmd)
@@ -51,7 +51,7 @@ def predict_r_rnas(genome: dict, sequences_path: Path):
         raise Exception(f'cmscan error! error code: {proc.returncode}')
 
     rrnas = []
-    sequences = {seq['id']: seq for seq in genome['sequences']}
+    sequences = {seq['id']: seq for seq in data['sequences']}
     with output_path.open() as fh:
         for line in fh:
             if(line[0] != '#'):
diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py
index 1ae202d4..cc7ddb0a 100644
--- a/bakta/features/s_orf.py
+++ b/bakta/features/s_orf.py
@@ -20,10 +20,10 @@
 log = logging.getLogger('S_ORF')
 
 
-def extract(genome: dict):
+def extract(data: dict):
     """Predict open reading frames in mem via BioPython."""
     orfs = []
-    for seq in genome['sequences']:
+    for seq in data['sequences']:
         nt_seq = Seq(seq['nt'])
         for strand, strand_nt_seq in [(bc.STRAND_FORWARD, nt_seq), (bc.STRAND_REVERSE, nt_seq.reverse_complement())]:  # strands +/-
             for frame in range(3):  # frames 1/2/3 -> 0, 1, 2
@@ -87,40 +87,40 @@ def get_feature_stop(feature: dict) -> int:
     return feature['stop'] if feature['strand'] == bc.STRAND_FORWARD else feature['start']
 
 
-def overlap_filter(genome: dict, orfs_raw: Sequence[dict]):
+def overlap_filter(data: dict, orfs_raw: Sequence[dict]):
     """Filter in-mem ORFs by overlapping CDSs."""
-    t_rnas_per_sequence = {seq['id']: [] for seq in genome['sequences']}
-    for t_rna in genome['features'].get(bc.FEATURE_T_RNA, []):
+    t_rnas_per_sequence = {seq['id']: [] for seq in data['sequences']}
+    for t_rna in data['features'].get(bc.FEATURE_T_RNA, []):
         t_rnas = t_rnas_per_sequence[t_rna['sequence']]
         t_rnas.append(t_rna)
-    for tm_rna in genome['features'].get(bc.FEATURE_TM_RNA, []):
+    for tm_rna in data['features'].get(bc.FEATURE_TM_RNA, []):
         t_rnas = t_rnas_per_sequence[tm_rna['sequence']]
         t_rnas.append(tm_rna)
 
-    r_rna_per_sequence = {seq['id']: [] for seq in genome['sequences']}
-    for r_rna in genome['features'].get(bc.FEATURE_R_RNA, []):
+    r_rna_per_sequence = {seq['id']: [] for seq in data['sequences']}
+    for r_rna in data['features'].get(bc.FEATURE_R_RNA, []):
         r_rnas = r_rna_per_sequence[r_rna['sequence']]
         r_rnas.append(r_rna)
 
-    # nc_rnas_per_sequence = {k['id']: [] for k in genome['sequences']}
-    # for nc_rna in genome['features'].get(bc.FEATURE_NC_RNA, []):
+    # nc_rnas_per_sequence = {k['id']: [] for k in data['sequences']}
+    # for nc_rna in data['features'].get(bc.FEATURE_NC_RNA, []):
     #     nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']]
     #     nc_rnas.append(nc_rna)
-    # for nc_rna in genome['features'].get(bc.FEATURE_NC_RNA_REGION, []):
+    # for nc_rna in data['features'].get(bc.FEATURE_NC_RNA_REGION, []):
     #     nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']]
     #     nc_rnas.append(nc_rna)
 
-    crispr_arrays_per_sequence = {seq['id']: [] for seq in genome['sequences']}
-    for crispr_array in genome['features'].get(bc.FEATURE_CRISPR, []):
+    crispr_arrays_per_sequence = {seq['id']: [] for seq in data['sequences']}
+    for crispr_array in data['features'].get(bc.FEATURE_CRISPR, []):
         crispr_arrays = crispr_arrays_per_sequence[crispr_array['sequence']]
         crispr_arrays.append(crispr_array)
 
-    cdss_per_sequence = {k['id']: [] for k in genome['sequences']}
-    for cds in genome['features'].get(bc.FEATURE_CDS, []):
+    cdss_per_sequence = {k['id']: [] for k in data['sequences']}
+    for cds in data['features'].get(bc.FEATURE_CDS, []):
         cdss = cdss_per_sequence[cds['sequence']]
         cdss.append(cds)
 
-    sorfs_per_sequence = {seq['id']: [] for seq in genome['sequences']}
+    sorfs_per_sequence = {seq['id']: [] for seq in data['sequences']}
     for sorf in orfs_raw:
         orfs = sorfs_per_sequence[sorf['sequence']]
         orfs.append(sorf)
@@ -128,7 +128,7 @@ def overlap_filter(genome: dict, orfs_raw: Sequence[dict]):
     discarded_sorf_keys = set()
     with cf.ProcessPoolExecutor(max_workers=cfg.threads) as tpe:
         futures = []
-        for seq in genome['sequences']:
+        for seq in data['sequences']:
             sequence_sorfs = sorfs_per_sequence[seq['id']]
             log.debug('filter: seq=%s, # sORFs=%i', seq['id'], len(sequence_sorfs))
             if(len(sequence_sorfs) < 100):  # execute sORF filter task
diff --git a/bakta/features/t_rna.py b/bakta/features/t_rna.py
index 901d3d9d..083fe28d 100644
--- a/bakta/features/t_rna.py
+++ b/bakta/features/t_rna.py
@@ -42,7 +42,7 @@
 }
 
 
-def predict_t_rnas(genome: dict, sequences_path: Path):
+def predict_t_rnas(data: dict, sequences_path: Path):
     """Search for tRNA sequences."""
 
     txt_output_path = cfg.tmp_path.joinpath('trna.tsv')
@@ -70,7 +70,7 @@ def predict_t_rnas(genome: dict, sequences_path: Path):
         raise Exception(f'tRNAscan-SE error! error code: {proc.returncode}')
 
     trnas = {}
-    sequences = {seq['id']: seq for seq in genome['sequences']}
+    sequences = {seq['id']: seq for seq in data['sequences']}
     with txt_output_path.open() as fh:
         for line in fh.readlines()[3:]:  # skip first 3 lines
             (sequence_id, trna_id, start, stop, trna_type, anti_codon, intron_begin, bounds_end, score, note) = line.split('\t')
diff --git a/bakta/features/tm_rna.py b/bakta/features/tm_rna.py
index 26d0bc6c..052ac9e8 100644
--- a/bakta/features/tm_rna.py
+++ b/bakta/features/tm_rna.py
@@ -13,7 +13,7 @@
 log = logging.getLogger('TM_RNA')
 
 
-def predict_tm_rnas(genome: dict, sequences_path: Path):
+def predict_tm_rnas(data: dict, sequences_path: Path):
     """Search for tmRNA sequences."""
 
     txt_output_path = cfg.tmp_path.joinpath('tmrna.tsv')
@@ -45,7 +45,7 @@ def predict_tm_rnas(genome: dict, sequences_path: Path):
         raise Exception(f'aragorn error! error code: {proc.returncode}')
 
     tmrnas = []
-    sequences = {seq['id']: seq for seq in genome['sequences']}
+    sequences = {seq['id']: seq for seq in data['sequences']}
     with txt_output_path.open() as fh:
         sequence_id = None
         for line in fh:
diff --git a/bakta/io.py b/bakta/io.py
index 64c009e8..5f62efac 100644
--- a/bakta/io.py
+++ b/bakta/io.py
@@ -94,13 +94,13 @@ def main():
         annotation = json.load(fh)
     features = annotation['features']
     sequences = annotation['sequences']
-    genome = {
+    data = {
         'features': features,
         'sequence': sequences,
         'taxon': annotation['genome']
     }
-    features_by_sequence = {k['id']: [] for k in genome['sequences']}
-    for feature in genome['features']:
+    features_by_sequence = {k['id']: [] for k in data['sequences']}
+    for feature in data['features']:
         sequence_features = features_by_sequence.get(feature['sequence'])
         sequence_features.append(feature)
 
@@ -114,20 +114,20 @@ def main():
     print(f'\nExport annotation results to: {cfg.output_path}')
     print('\thuman readable TSV...')
     tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv')
-    tsv.write_features(genome['sequences'], features_by_sequence, tsv_path)
+    tsv.write_features(data['sequences'], features_by_sequence, tsv_path)
 
     print('\tGFF3...')
     gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3')
-    gff.write_features(genome, features_by_sequence, gff3_path)
+    gff.write_features(data, features_by_sequence, gff3_path)
 
     print('\tINSDC GenBank & EMBL...')
     genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff')
     embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl')
-    insdc.write_features(genome, features, genbank_path, embl_path)
+    insdc.write_features(data, features, genbank_path, embl_path)
 
     print('\tgenome sequences...')
     fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna')
-    fasta.export_sequences(genome['sequences'], fna_path, description=True, wrap=True)
+    fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True)
 
     print('\tfeature nucleotide sequences...')
     ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn')
@@ -139,13 +139,13 @@ def main():
 
     print('\tfeature inferences...')
     tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv')
-    tsv.write_feature_inferences(genome['sequences'], features_by_sequence, tsv_path)
+    tsv.write_feature_inferences(data['sequences'], features_by_sequence, tsv_path)
 
     if(cfg.skip_plot  or  cfg.meta):
         print('\tskip generation of circular genome plot...')
     else:
         print('\tcircular genome plot...')
-        plot.write(features, genome['sequences'], cfg.output_path)
+        plot.write(features, data['sequences'], cfg.output_path)
 
     if(cfg.skip_cds is False):
         hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]
@@ -160,10 +160,10 @@ def main():
     print('\tGenome and annotation summary...')
     summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt')
     with summary_path.open('w') as fh_out:
-        genome_stats = bu.calc_genome_stats(genome, features)
+        genome_stats = bu.calc_genome_stats(data, features)
         fh_out.write('Sequence(s):\n')
-        fh_out.write(f"Length: {genome['size']:}\n")
-        fh_out.write(f"Count: {len(genome['sequences'])}\n")
+        fh_out.write(f"Length: {data['size']:}\n")
+        fh_out.write(f"Count: {len(data['sequences'])}\n")
         fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n")
         fh_out.write(f"N50: {genome_stats['n50']:}\n")
         fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n")
diff --git a/bakta/io/gff.py b/bakta/io/gff.py
index 741e1989..09fc2ca9 100644
--- a/bakta/io/gff.py
+++ b/bakta/io/gff.py
@@ -14,7 +14,7 @@
 log = logging.getLogger('GFF')
 
 
-def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_path: Path):
+def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path: Path):
     """Export features in GFF3 format."""
     log.info('write features: path=%s', gff3_path)
 
@@ -22,8 +22,8 @@ def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_pat
         fh.write('##gff-version 3\n')  # GFF version
         fh.write('##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo\n')  # SO feature version
 
-        if(genome['taxon']):  # write organism info
-            fh.write(f"# organism {genome['taxon']}\n")
+        if(data['taxon']):  # write organism info
+            fh.write(f"# organism {data['taxon']}\n")
 
         fh.write('# Annotated with Bakta\n')
         fh.write(f'# Software: v{bakta.__version__}\n')
@@ -31,7 +31,7 @@ def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_pat
         fh.write(f'# DOI: {bc.BAKTA_DOI}\n')
         fh.write(f'# URL: {bc.BAKTA_URL}\n')
 
-        for seq in genome['sequences']:  # write features
+        for seq in data['sequences']:  # write features
             fh.write(f"##sequence-region {seq['id']} 1 {seq['length']}\n")  # sequence region
 
             # write landmark region
@@ -353,7 +353,7 @@ def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_pat
 
         if(not cfg.compliant):
             fh.write('##FASTA\n')
-            for seq in genome['sequences']:  # write sequences
+            for seq in data['sequences']:  # write sequences
                 fh.write(f">{seq['id']}\n")
                 fh.write(fasta.wrap_sequence(seq['nt']))
     return
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
index aed9f16d..f49730e3 100644
--- a/bakta/io/insdc.py
+++ b/bakta/io/insdc.py
@@ -18,11 +18,11 @@
 log = logging.getLogger('INSDC')
 
 
-def write_features(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
+def write_features(data: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path):
     log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path)
 
     sequence_list = []
-    for seq in genome['sequences']:
+    for seq in data['sequences']:
         sequence_features = [feat for feat in features if feat['sequence'] == seq['id']]
         comment = (
             'Annotated with Bakta',
@@ -47,7 +47,7 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path:
         )
         sequence_annotations = {
             'molecule_type': 'DNA',
-            'source': genome['taxon'],
+            'source': data['taxon'],
             'date': date.today().strftime('%d-%b-%Y').upper(),
             'topology': seq['topology'],
             'data_file_division': 'HGT' if seq['type'] == bc.REPLICON_CONTIG else 'BCT',
@@ -61,12 +61,12 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path:
         }
 
         description = ''
-        if(genome['taxon']):
-            sequence_annotations['organism'] = genome['taxon']
-            source_qualifiers['organism'] = genome['taxon']
-            description = genome['taxon']
-        if(genome['strain']):
-            source_qualifiers['strain'] = genome['strain']
+        if(data['taxon']):
+            sequence_annotations['organism'] = data['taxon']
+            source_qualifiers['organism'] = data['taxon']
+            description = data['taxon']
+        if(data['strain']):
+            source_qualifiers['strain'] = data['strain']
 
         if(seq['type'] == bc.REPLICON_PLASMID):
             source_qualifiers['plasmid'] = seq['name'] if seq.get('name', None) else 'unnamed'
diff --git a/bakta/io/json.py b/bakta/io/json.py
index 819a1813..1d5e8256 100644
--- a/bakta/io/json.py
+++ b/bakta/io/json.py
@@ -13,7 +13,7 @@
 log = logging.getLogger('JSON')
 
 
-def write_json(genome: dict, features: Sequence[dict], json_path: Path):
+def write_json(data: dict, features: Sequence[dict], json_path: Path):
     log.info('write JSON: path=%s', json_path)
 
     # clean feature attributes
@@ -33,30 +33,30 @@ def write_json(genome: dict, features: Sequence[dict], json_path: Path):
 
     # replace features type dict by sorted feature list
     output = OrderedDict()
-    if genome is not None:
+    if data is not None:
         ordered_genome = OrderedDict()
-        ordered_genome['genus'] = genome['genus']
-        ordered_genome['species'] = genome['species']
-        ordered_genome['strain'] = genome['strain']
-        if('plasmid' in genome):
-            ordered_genome['plasmid'] = genome['plasmid']
-        ordered_genome['complete'] = genome['complete']
-        ordered_genome['gram'] = genome['gram']
-        ordered_genome['translation_table'] = genome['translation_table']
+        ordered_genome['genus'] = data['genus']
+        ordered_genome['species'] = data['species']
+        ordered_genome['strain'] = data['strain']
+        if('plasmid' in data):
+            ordered_genome['plasmid'] = data['plasmid']
+        ordered_genome['complete'] = data['complete']
+        ordered_genome['gram'] = data['gram']
+        ordered_genome['translation_table'] = data['translation_table']
         output['genome'] = ordered_genome
 
         stats = OrderedDict()
-        stats['no_sequences'] = len(genome['sequences'])
-        stats['size'] = genome['size']
-        stats['gc'] = genome['gc']
-        stats['n_ratio'] = genome['n_ratio']
-        stats['n50'] = genome['n50']
-        stats['coding_ratio'] = genome['coding_ratio']
+        stats['no_sequences'] = len(data['sequences'])
+        stats['size'] = data['size']
+        stats['gc'] = data['gc']
+        stats['n_ratio'] = data['n_ratio']
+        stats['n50'] = data['n50']
+        stats['coding_ratio'] = data['coding_ratio']
         output['stats'] = stats
 
     output['features'] = features
-    if genome is not None:
-        output['sequences'] = genome['sequences']
+    if data is not None:
+        output['sequences'] = data['sequences']
 
     run = OrderedDict()
     run['start'] = cfg.run_start.strftime('%Y-%m-%d %H:%M:%S')
diff --git a/bakta/main.py b/bakta/main.py
index 523e36f2..d54df130 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -133,7 +133,7 @@ def main():
         sys.exit('Error: input file contains no valid sequences.')
     sequences_path = cfg.tmp_path.joinpath('sequences.fna')
     fasta.export_sequences(sequences, sequences_path)
-    genome = {
+    data = {
         'genus': cfg.genus,
         'species': cfg.species,
         'strain': cfg.strain,
@@ -146,7 +146,7 @@ def main():
         'sequences': sequences
     }
     if(cfg.plasmid):
-        genome['plasmid'] = cfg.plasmid
+        data['plasmid'] = cfg.plasmid
     print('\nStart annotation...')
 
     ############################################################################
@@ -157,8 +157,8 @@ def main():
     else:
         print('predict tRNAs...')
         log.debug('start tRNA prediction')
-        genome['features'][bc.FEATURE_T_RNA] = t_rna.predict_t_rnas(genome, sequences_path)
-        print(f"\tfound: {len(genome['features'][bc.FEATURE_T_RNA])}")
+        data['features'][bc.FEATURE_T_RNA] = t_rna.predict_t_rnas(data, sequences_path)
+        print(f"\tfound: {len(data['features'][bc.FEATURE_T_RNA])}")
 
     ############################################################################
     # tmRNA prediction
@@ -168,8 +168,8 @@ def main():
     else:
         print('predict tmRNAs...')
         log.debug('start tmRNA prediction')
-        genome['features'][bc.FEATURE_TM_RNA] = tm_rna.predict_tm_rnas(genome, sequences_path)
-        print(f"\tfound: {len(genome['features'][bc.FEATURE_TM_RNA])}")
+        data['features'][bc.FEATURE_TM_RNA] = tm_rna.predict_tm_rnas(data, sequences_path)
+        print(f"\tfound: {len(data['features'][bc.FEATURE_TM_RNA])}")
 
     ############################################################################
     # rRNA prediction
@@ -179,8 +179,8 @@ def main():
     else:
         print('predict rRNAs...')
         log.debug('start rRNA prediction')
-        genome['features'][bc.FEATURE_R_RNA] = r_rna.predict_r_rnas(genome, sequences_path)
-        print(f"\tfound: {len(genome['features'][bc.FEATURE_R_RNA])}")
+        data['features'][bc.FEATURE_R_RNA] = r_rna.predict_r_rnas(data, sequences_path)
+        print(f"\tfound: {len(data['features'][bc.FEATURE_R_RNA])}")
 
     ############################################################################
     # ncRNA gene prediction
@@ -190,8 +190,8 @@ def main():
     else:
         print('predict ncRNAs...')
         log.debug('start ncRNA prediction')
-        genome['features'][bc.FEATURE_NC_RNA] = nc_rna.predict_nc_rnas(genome, sequences_path)
-        print(f"\tfound: {len(genome['features'][bc.FEATURE_NC_RNA])}")
+        data['features'][bc.FEATURE_NC_RNA] = nc_rna.predict_nc_rnas(data, sequences_path)
+        print(f"\tfound: {len(data['features'][bc.FEATURE_NC_RNA])}")
 
     ############################################################################
     # ncRNA region prediction
@@ -201,8 +201,8 @@ def main():
     else:
         print('predict ncRNA regions...')
         log.debug('start ncRNA region prediction')
-        genome['features'][bc.FEATURE_NC_RNA_REGION] = nc_rna_region.predict_nc_rna_regions(genome, sequences_path)
-        print(f"\tfound: {len(genome['features'][bc.FEATURE_NC_RNA_REGION])}")
+        data['features'][bc.FEATURE_NC_RNA_REGION] = nc_rna_region.predict_nc_rna_regions(data, sequences_path)
+        print(f"\tfound: {len(data['features'][bc.FEATURE_NC_RNA_REGION])}")
 
     ############################################################################
     # CRISPR prediction
@@ -212,8 +212,8 @@ def main():
     else:
         print('predict CRISPR arrays...')
         log.debug('start CRISPR prediction')
-        genome['features'][bc.FEATURE_CRISPR] = crispr.predict_crispr(genome, sequences_path)
-        print(f"\tfound: {len(genome['features'][bc.FEATURE_CRISPR])}")
+        data['features'][bc.FEATURE_CRISPR] = crispr.predict_crispr(data, sequences_path)
+        print(f"\tfound: {len(data['features'][bc.FEATURE_CRISPR])}")
 
     ############################################################################
     # CDS prediction
@@ -230,7 +230,7 @@ def main():
     else:
         print('predict & annotate CDSs...')
         log.debug('predict CDS')
-        cdss = feat_cds.predict(genome)
+        cdss = feat_cds.predict(data)
         print(f"\tpredicted: {len(cdss)} ")
 
         if(len(cdss) > 0):
@@ -241,13 +241,13 @@ def main():
         
         if(len(cdss) > 0):
             log.debug('revise translational exceptions')
-            no_revised = feat_cds.revise_translational_exceptions(genome, cdss)
+            no_revised = feat_cds.revise_translational_exceptions(data, cdss)
             print(f'\trevised translational exceptions: {no_revised}')
             cdss = [cds for cds in cdss if 'discarded' not in cds]
         
         if(cfg.regions):
             log.debug('import user-provided CDS regions')
-            imported_cdss = feat_cds.import_user_cdss(genome, cfg.regions)
+            imported_cdss = feat_cds.import_user_cdss(data, cfg.regions)
             print(f'\timported CDS regions: {len(imported_cdss)}')
             cdss.extend(imported_cdss)
 
@@ -316,7 +316,7 @@ def main():
                     log.debug('search pseudogene candidates')
                     pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals)
                     print(f'\t\tcandidates: {len(pseudo_candidates)}')
-                    pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else []
+                    pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, data) if len(pseudo_candidates) > 0 else []
                     psc.lookup(pseudogenes, pseudo=True)
                     pscc.lookup(pseudogenes, pseudo=True)
                     for pseudogene in pseudogenes:
@@ -334,9 +334,9 @@ def main():
                 print('\t\tcalculated proteins statistics')
             
             print('\trevise special cases...')
-            feat_cds.revise_special_cases_annotated(genome, cdss)
+            feat_cds.revise_special_cases_annotated(data, cdss)
 
-        genome['features'][bc.FEATURE_CDS] = cdss
+        data['features'][bc.FEATURE_CDS] = cdss
 
     ############################################################################
     # sORF prediction
@@ -351,11 +351,11 @@ def main():
     else:
         print('detect & annotate sORF...')
         log.debug('extract sORF')
-        sorfs = s_orf.extract(genome)
+        sorfs = s_orf.extract(data)
         print(f'\tdetected: {len(sorfs)}')
 
         log.debug('apply sORF overlap filter')
-        sorfs, discarded_sorfs = s_orf.overlap_filter(genome, sorfs)
+        sorfs, discarded_sorfs = s_orf.overlap_filter(data, sorfs)
         print(f'\tdiscarded due to overlaps: {len(discarded_sorfs)}')
 
         if(len(sorfs) > 0):
@@ -396,7 +396,7 @@ def main():
         log.debug('combine sORF annotations')
         for feat in sorfs_filtered:
             anno.combine_annotation(feat)  # combine IPS and PSC annotations
-        genome['features'][bc.FEATURE_SORF] = sorfs_filtered
+        data['features'][bc.FEATURE_SORF] = sorfs_filtered
         print(f'\tfiltered sORFs: {len(sorfs_filtered)}')
         
         if(cfg.gram != bc.GRAM_UNKNOWN  and  len(sorfs_filtered) > 0):
@@ -417,8 +417,8 @@ def main():
     else:
         print('detect gaps...')
         log.debug('detect gaps')
-        assembly_gaps = gaps.detect_assembly_gaps(genome)
-        genome['features'][bc.FEATURE_GAP] = assembly_gaps
+        assembly_gaps = gaps.detect_assembly_gaps(data)
+        data['features'][bc.FEATURE_GAP] = assembly_gaps
         print(f'\tfound: {len(assembly_gaps)}')
 
     ############################################################################
@@ -429,14 +429,14 @@ def main():
     else:
         print('detect oriCs/oriVs...')
         log.debug('detect oriC/V')
-        oriCs = ori.predict_oris(genome, sequences_path, bc.FEATURE_ORIC)
-        genome['features'][bc.FEATURE_ORIC] = oriCs
+        oriCs = ori.predict_oris(data, sequences_path, bc.FEATURE_ORIC)
+        data['features'][bc.FEATURE_ORIC] = oriCs
         print(f'\tfound: {len(oriCs)}')
 
         print('detect oriTs...')
         log.debug('detect oriT')
-        oriTs = ori.predict_oris(genome, sequences_path, bc.FEATURE_ORIT)
-        genome['features'][bc.FEATURE_ORIT] = oriTs
+        oriTs = ori.predict_oris(data, sequences_path, bc.FEATURE_ORIT)
+        data['features'][bc.FEATURE_ORIT] = oriTs
         print(f'\tfound: {len(oriTs)}')
 
     ############################################################################
@@ -446,7 +446,7 @@ def main():
         print('skip feature overlap filters...')
     else:
         print('apply feature overlap filters...')
-        anno.detect_feature_overlaps(genome)
+        anno.detect_feature_overlaps(data)
 
     ############################################################################
     # Create annotations
@@ -456,10 +456,10 @@ def main():
     ############################################################################
     print('select features and create locus tags...')
     log.debug('start feature selection and creation of locus tags')
-    features_by_sequence = {k['id']: [] for k in genome['sequences']}
+    features_by_sequence = {k['id']: [] for k in data['sequences']}
     feature_id = 1
     feature_id_prefix = bu.create_locus_tag_prefix(sequences, length=10)
-    for feature_list in genome['features'].values():
+    for feature_list in data['features'].values():
         for feature in feature_list:
             if('discarded' not in feature):
                 feature['id'] = f'{feature_id_prefix}_{feature_id}'
@@ -467,7 +467,7 @@ def main():
                 seq_features = features_by_sequence.get(feature['sequence'])
                 seq_features.append(feature)
     features = []
-    for seq in genome['sequences']:
+    for seq in data['sequences']:
         seq_features = features_by_sequence[seq['id']]
         seq_features.sort(key=lambda k: k['start'])
         features.extend(seq_features)
@@ -498,9 +498,9 @@ def main():
     # - annotation stats
     ############################################################################
     print('\nGenome statistics:')
-    genome_stats = bu.calc_genome_stats(genome, features)
-    print(f"\tGenome size: {genome['size']:,} bp")
-    print(f"\tContigs/replicons: {len(genome['sequences'])}")
+    genome_stats = bu.calc_genome_stats(data, features)
+    print(f"\tGenome size: {data['size']:,} bp")
+    print(f"\tContigs/replicons: {len(data['sequences'])}")
     print(f"\tGC: {100 * genome_stats['gc']:.1f} %")
     print(f"\tN50: {genome_stats['n50']:,}")
     print(f"\tN ratio: {100 * genome_stats['n_ratio']:.1f} %")
@@ -533,20 +533,20 @@ def main():
     print(f'\nExport annotation results to: {cfg.output_path}')
     print('\thuman readable TSV...')
     tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv')
-    tsv.write_features(genome['sequences'], features_by_sequence, tsv_path)
+    tsv.write_features(data['sequences'], features_by_sequence, tsv_path)
 
     print('\tGFF3...')
     gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3')
-    gff.write_features(genome, features_by_sequence, gff3_path)
+    gff.write_features(data, features_by_sequence, gff3_path)
 
     print('\tINSDC GenBank & EMBL...')
     genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff')
     embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl')
-    insdc.write_features(genome, features, genbank_path, embl_path)
+    insdc.write_features(data, features, genbank_path, embl_path)
 
     print('\tgenome sequences...')
     fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna')
-    fasta.export_sequences(genome['sequences'], fna_path, description=True, wrap=True)
+    fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True)
 
     print('\tfeature nucleotide sequences...')
     ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn')
@@ -558,7 +558,7 @@ def main():
 
     print('\tfeature inferences...')
     tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv')
-    tsv.write_feature_inferences(genome['sequences'], features_by_sequence, tsv_path)
+    tsv.write_feature_inferences(data['sequences'], features_by_sequence, tsv_path)
 
     if(cfg.skip_plot  or  cfg.meta):
         print('\tskip generation of circular genome plot...')
@@ -579,7 +579,7 @@ def main():
     # measure runtime at the latest possible
     cfg.run_end = datetime.now()
     run_duration = (cfg.run_end - cfg.run_start).total_seconds()
-    genome['run'] = {
+    data['run'] = {
         'start': cfg.run_start.strftime('%Y-%m-%d %H:%M:%S'),
         'end': cfg.run_end.strftime('%Y-%m-%d %H:%M:%S'),
         'duration': f'{(run_duration / 60):.2f} min'
@@ -587,14 +587,14 @@ def main():
 
     print('\tmachine readable JSON...')
     json_path = cfg.output_path.joinpath(f'{cfg.prefix}.json')
-    json.write_json(genome, features, json_path)
+    json.write_json(data, features, json_path)
 
     print('\tGenome and annotation summary...')
     summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt')
     with summary_path.open('w') as fh_out:
         fh_out.write('Sequence(s):\n')
-        fh_out.write(f"Length: {genome['size']:}\n")
-        fh_out.write(f"Count: {len(genome['sequences'])}\n")
+        fh_out.write(f"Length: {data['size']:}\n")
+        fh_out.write(f"Count: {len(data['sequences'])}\n")
         fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n")
         fh_out.write(f"N50: {genome_stats['n50']:}\n")
         fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n")
diff --git a/bakta/utils.py b/bakta/utils.py
index c6f97cd7..eb7863e1 100644
--- a/bakta/utils.py
+++ b/bakta/utils.py
@@ -293,37 +293,37 @@ def has_annotation(feature: dict, attribute: str) -> bool:
         return False
 
 
-def calc_genome_stats(genome: dict, features: Sequence[dict]):
-    genome_size = genome['size']
+def calc_genome_stats(data: dict, features: Sequence[dict]):
+    genome_size = data['size']
     log.info('genome-size=%i', genome_size)
 
     # N50
     gc_sum = 0
     n_sum = 0
-    for seq in genome['sequences']:
+    for seq in data['sequences']:
         nt = seq['nt']
         gc_sum += nt.count('G') + nt.count('C')
         n_sum += nt.count('N')
     gc_ratio = gc_sum / (genome_size - n_sum)
-    genome['gc'] = gc_ratio
+    data['gc'] = gc_ratio
     log.info('GC=%0.3f', gc_ratio)
 
     n_ratio = n_sum / genome_size
-    genome['n_ratio'] = n_ratio
+    data['n_ratio'] = n_ratio
     log.info('N=%0.3f', n_ratio)
 
     n50 = 0
     sequence_length_sum = 0
-    for seq in sorted(genome['sequences'], key=lambda x: x['length'], reverse=True):
+    for seq in sorted(data['sequences'], key=lambda x: x['length'], reverse=True):
         nt_length = len(seq['nt'])
         sequence_length_sum += nt_length
         if(sequence_length_sum >= genome_size / 2):
             n50 = nt_length
             break
-    genome['n50'] = n50
+    data['n50'] = n50
     log.info('N50=%i', n50)
 
-    sequence_by_id = {seq['id']: seq for seq in genome['sequences']}
+    sequence_by_id = {seq['id']: seq for seq in data['sequences']}
     coding_nts = 0
     for feat in features:
         if(feat.get('edge', False)):
@@ -332,7 +332,7 @@ def calc_genome_stats(genome: dict, features: Sequence[dict]):
         else:
             coding_nts += feat['stop'] - feat['start'] + 1  # feature coding nucleotides
     coding_ratio = coding_nts / (genome_size - n_sum)
-    genome['coding_ratio'] = coding_ratio
+    data['coding_ratio'] = coding_ratio
     log.info('coding-ratio=%0.3f', coding_ratio)
 
     return {
diff --git a/scripts/collect-annotation-stats.py b/scripts/collect-annotation-stats.py
index 7a772a31..4786a709 100755
--- a/scripts/collect-annotation-stats.py
+++ b/scripts/collect-annotation-stats.py
@@ -66,36 +66,36 @@
         )
     )
     fh_out.write('\n')
-    for genome in args.genomes:
-        genome_path = Path(genome).resolve()
+    for genome_file in args.genomes:
+        genome_path = Path(genome_file).resolve()
         try:
             with genome_path.open() as fh_in:
-                genome = json.load(fh_in)
+                data = json.load(fh_in)
             stats = [
                 genome_path.stem,
-                f"{' '.join([t for t in [genome['genome'].get('genus', None), genome['genome'].get('species', None), genome['genome'].get('strain', None)] if t is not None])}",
-                'y' if genome['genome']['complete'] else 'n',
-                f"{genome['genome']['translation_table']}",
-                f"{genome['stats']['no_sequences']}",
-                f"{genome['stats']['size']}",
-                f"{100 * genome['stats']['gc']:.1f}",
-                f"{100 * genome['stats']['n_ratio']:.1f}",
-                f"{genome['stats']['n50']}",
-                f"{100 * genome['stats']['coding_ratio']:.1f}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_T_RNA])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_TM_RNA])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_R_RNA])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA_REGION])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CRISPR])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'hypothetical' in f])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'pseudogene' in f])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_SORF])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_GAP])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIC])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIV])}",
-                f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIT])}",
+                f"{' '.join([t for t in [data['genome'].get('genus', None), data['genome'].get('species', None), data['genome'].get('strain', None)] if t is not None])}",
+                'y' if data['genome']['complete'] else 'n',
+                f"{data['genome']['translation_table']}",
+                f"{data['stats']['no_sequences']}",
+                f"{data['stats']['size']}",
+                f"{100 * data['stats']['gc']:.1f}",
+                f"{100 * data['stats']['n_ratio']:.1f}",
+                f"{data['stats']['n50']}",
+                f"{100 * data['stats']['coding_ratio']:.1f}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_T_RNA])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_TM_RNA])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_R_RNA])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_NC_RNA])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_NC_RNA_REGION])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CRISPR])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS and 'hypothetical' in f])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS and 'pseudogene' in f])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_SORF])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_GAP])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIC])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIV])}",
+                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIT])}",
             ]
             output_line = '\t'.join(stats)
             print(output_line)
diff --git a/scripts/extract-region.py b/scripts/extract-region.py
index 23da1605..144a1bc3 100755
--- a/scripts/extract-region.py
+++ b/scripts/extract-region.py
@@ -41,11 +41,11 @@
 print('Load annotated genome...')
 genome_path = Path(args.genome).resolve()
 with genome_path.open() as fh:
-    genome = json.load(fh)
+    data = json.load(fh)
 
 sequence_id = args.sequence
 if(sequence_id is None):  # take first sequence as default
-    sequence_id = genome['sequences'][0]['id']
+    sequence_id = data['sequences'][0]['id']
 
 prefix = args.prefix
 if(prefix is None):  # use input file prefix as default
@@ -54,33 +54,33 @@
 
 print('Extract features within selected region...')
 features_selected = []
-for feat in genome['features']:
+for feat in data['features']:
     if(feat['sequence'] == sequence_id):
         if(feat['start'] >= args.min  and  feat['stop'] <= args.max):
             features_selected.append(feat)
 features_by_sequence = {sequence_id: features_selected}  # needed for GFF3 export
 print(f'\t...selected features: {len(features_selected)}')
 
-genome['features'] = features_selected
-genome['sequences'] = [sequence for sequence in genome['sequences'] if sequence['id'] == sequence_id]
-genome['genus'] = genome['genome']['genus']
-genome['species'] = genome['genome']['species']
-genome['strain'] = genome['genome']['strain']
-genome['taxon'] = f"{genome['genome']['genus']} {genome['genome']['species']} {genome['genome']['strain']}"
+data['features'] = features_selected
+data['sequences'] = [sequence for sequence in data['sequences'] if sequence['id'] == sequence_id]
+data['genus'] = data['genome']['genus']
+data['species'] = data['genome']['species']
+data['strain'] = data['genome']['strain']
+data['taxon'] = f"{data['genome']['genus']} {data['genome']['species']} {data['genome']['strain']}"
 cfg.db_info = {
-    'major': genome['version']['db']['version'].split('.')[0],
-    'minor': genome['version']['db']['version'].split('.')[1],
-    'type': genome['version']['db']['type']
+    'major': data['version']['db']['version'].split('.')[0],
+    'minor': data['version']['db']['version'].split('.')[1],
+    'type': data['version']['db']['type']
 }
 
 print('Write selected features...')
 output_path = Path(args.output).resolve()
 gff3_path = output_path.joinpath(f'{prefix}.gff3')
-gff.write_features(genome, features_by_sequence, gff3_path)
+gff.write_features(data, features_by_sequence, gff3_path)
 print('\t...INSDC GenBank & EMBL')
 genbank_path = output_path.joinpath(f'{prefix}.gbff')
 embl_path = output_path.joinpath(f'{prefix}.embl')
-insdc.write_features(genome, features_selected, genbank_path, embl_path)
+insdc.write_features(data, features_selected, genbank_path, embl_path)
 print('\t...feature nucleotide sequences')
 ffn_path = output_path.joinpath(f'{prefix}.ffn')
 fasta.write_ffn(features_selected, ffn_path)

From 8c3b2fbd57ea1853efce9a4ec5e6cbb0502161d9 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 15 Oct 2024 16:34:22 +0200
Subject: [PATCH 4/8] reorganize data structure

---
 bakta/features/annotation.py    |  18 +++---
 bakta/features/cds.py           |   8 +--
 bakta/features/nc_rna.py        |   4 +-
 bakta/features/nc_rna_region.py |   4 +-
 bakta/features/r_rna.py         |   4 +-
 bakta/features/s_orf.py         |  14 ++---
 bakta/io.py                     |  11 ++--
 bakta/io/fasta.py               |  15 ++---
 bakta/io/gff.py                 |   4 +-
 bakta/io/insdc.py               |  14 ++---
 bakta/io/json.py                |  36 +-----------
 bakta/main.py                   | 101 +++++++++++++++++---------------
 bakta/proteins.py               |   3 +-
 bakta/utils.py                  |  21 +++----
 test/test_pseudo.py             |   6 +-
 test/test_sORF.py               |   6 +-
 16 files changed, 119 insertions(+), 150 deletions(-)

diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py
index 63165b47..7b771a32 100644
--- a/bakta/features/annotation.py
+++ b/bakta/features/annotation.py
@@ -152,35 +152,35 @@ def detect_feature_overlaps(data: dict):
     sORF < mRNA, tRNA, rRNA, CRISPR, CDS (in-frame & entirely overlapping), sORF (shorter, weaker annotations)
     """
     sequence_t_rnas = {k['id']: [] for k in data['sequences']}
-    for t_rna in data['features'].get(bc.FEATURE_T_RNA, []):
-        t_rnas = sequence_t_rnas[t_rna['sequence']]
-        t_rnas.append(t_rna)
+    for trna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_T_RNA]:
+        t_rnas = sequence_t_rnas[trna['sequence']]
+        t_rnas.append(trna)
     sequence_tm_rnas = {k['id']: [] for k in data['sequences']}
-    for tm_rna in data['features'].get(bc.FEATURE_TM_RNA, []):
+    for tm_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_TM_RNA]:
         tm_rnas = sequence_tm_rnas[tm_rna['sequence']]
         tm_rnas.append(tm_rna)
     sequence_r_rnas = {k['id']: [] for k in data['sequences']}
-    for r_rna in data['features'].get(bc.FEATURE_R_RNA, []):
+    for r_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_R_RNA]:
         r_rnas = sequence_r_rnas[r_rna['sequence']]
         r_rnas.append(r_rna)
     sequence_ncrna_regions = {k['id']: [] for k in data['sequences']}
-    for ncRNA_region in data['features'].get(bc.FEATURE_NC_RNA_REGION, []):
+    for ncRNA_region in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA_REGION]:
         ncRNA_regions = sequence_ncrna_regions[ncRNA_region['sequence']]
         ncRNA_regions.append(ncRNA_region)
     sequence_crispr_arrays = {k['id']: [] for k in data['sequences']}
-    for crispr_array in data['features'].get(bc.FEATURE_CRISPR, []):
+    for crispr_array in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CRISPR]:
         crispr_arrays = sequence_crispr_arrays[crispr_array['sequence']]
         crispr_arrays.append(crispr_array)
     sequence_cdss = {k['id']: [] for k in data['sequences']}
     sequence_cdss_user_provided = {k['id']: [] for k in data['sequences']}
-    for cds in data['features'].get(bc.FEATURE_CDS, []):
+    for cds in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS]:
         if(cds.get('source', None) == bc.CDS_SOURCE_USER):
             cdss = sequence_cdss_user_provided[cds['sequence']]
         else:
             cdss = sequence_cdss[cds['sequence']]
         cdss.append(cds)
     sequence_sorfs = {k['id']: [] for k in data['sequences']}
-    for sorf in data['features'].get(bc.FEATURE_SORF, []):
+    for sorf in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_SORF]:
         sorfs = sequence_sorfs[sorf['sequence']]
         sorfs.append(sorf)
 
diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index 781f95f4..4215b7d6 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -35,10 +35,10 @@ def predict(data: dict):
     # create Pyrodigal trainining file if not provided by the user
     prodigal_tf_path = cfg.prodigal_tf
     trainings_info = None
-    prodigal_metamode = cfg.meta  or  data['size'] < pyrodigal.MIN_SINGLE_GENOME  # 20_000 bp
+    prodigal_metamode = cfg.meta  or  data['stats']['size'] < pyrodigal.MIN_SINGLE_GENOME  # 20_000 bp
     log.debug('prodigal mode: meta=%s', prodigal_metamode)
     if(prodigal_tf_path is None):
-        closed = not data['complete']
+        closed = not data['genome']['complete']
         if(not prodigal_metamode):
             log.info('create prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed)
             gene_finder = pyrodigal.GeneFinder(meta=prodigal_metamode, closed=closed)
@@ -406,7 +406,7 @@ def revise_translational_exceptions(data: dict, cdss: Sequence[dict]):
     Revise translational exceptions as for istance selenocystein proteins.
     """
     no_revised = 0
-    if(bc.FEATURE_NC_RNA_REGION not in data['features']):  # check if ncRNA regions have been detected, otherwise skip analysis and return
+    if(len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA_REGION]) == 0):  # check if ncRNA regions have been detected, otherwise skip analysis and return
         return no_revised
 
     sequences = {seq['id']: seq for seq in data['sequences']}
@@ -432,7 +432,7 @@ def revise_translational_exceptions(data: dict, cdss: Sequence[dict]):
                 cds_pairs = cds_pairs_per_sequence[cds_a['sequence']]
                 cds_pairs.append((cds_a, cds_b))
 
-    recoding_regions = [ncrna_region for ncrna_region in data['features'][bc.FEATURE_NC_RNA_REGION] if ncrna_region['class'] == so.SO_CIS_REG_RECODING_STIMULATION_REGION]  #  Selenocysteine insertion sequences
+    recoding_regions = [ncrna_region for ncrna_region in data['features'] if ncrna_region['type'] == bc.FEATURE_NC_RNA_REGION  and  ncrna_region['class'] == so.SO_CIS_REG_RECODING_STIMULATION_REGION]  #  Selenocysteine insertion sequences
     for recoding_region in recoding_regions:
         if('selenocysteine' in recoding_region.get('product', '').lower()):
             cds_pairs = cds_pairs_per_sequence[recoding_region['sequence']]
diff --git a/bakta/features/nc_rna.py b/bakta/features/nc_rna.py
index 4c669674..16b94404 100644
--- a/bakta/features/nc_rna.py
+++ b/bakta/features/nc_rna.py
@@ -31,9 +31,9 @@ def predict_nc_rnas(data: dict, sequences_path: Path):
         '--cpu', str(cfg.threads),
         '--tblout', str(output_path)
     ]
-    if(data['size'] >= 1000000):
+    if(data['stats']['size'] >= 1000000):
         cmd.append('-Z')
-        cmd.append(str(2 * data['size'] // 1000000))
+        cmd.append(str(2 * data['stats']['size'] // 1000000))
     cmd.append(str(cfg.db_path.joinpath('ncRNA-genes')))
     cmd.append(str(sequences_path))
     log.debug('cmd=%s', cmd)
diff --git a/bakta/features/nc_rna_region.py b/bakta/features/nc_rna_region.py
index ec8ce92f..3dfe88b9 100644
--- a/bakta/features/nc_rna_region.py
+++ b/bakta/features/nc_rna_region.py
@@ -30,9 +30,9 @@ def predict_nc_rna_regions(data: dict, sequences_path: Path):
         '--cpu', str(cfg.threads),
         '--tblout', str(output_path)
     ]
-    if(data['size'] >= 1000000):
+    if(data['stats']['size'] >= 1000000):
         cmd.append('-Z')
-        cmd.append(str(2 * data['size'] // 1000000))
+        cmd.append(str(2 * data['stats']['size'] // 1000000))
     cmd.append(str(cfg.db_path.joinpath('ncRNA-regions')))
     cmd.append(str(sequences_path))
     log.debug('cmd=%s', cmd)
diff --git a/bakta/features/r_rna.py b/bakta/features/r_rna.py
index 7847921a..b5f5cb92 100644
--- a/bakta/features/r_rna.py
+++ b/bakta/features/r_rna.py
@@ -31,9 +31,9 @@ def predict_r_rnas(data: dict, sequences_path: Path):
         '--cpu', str(cfg.threads),
         '--tblout', str(output_path)
     ]
-    if(data['size'] >= 1000000):
+    if(data['stats']['size'] >= 1000000):
         cmd.append('-Z')
-        cmd.append(str(2 * data['size'] // 1000000))
+        cmd.append(str(2 * data['stats']['size'] // 1000000))
     cmd.append(str(cfg.db_path.joinpath('rRNA')))
     cmd.append(str(sequences_path))
     log.debug('cmd=%s', cmd)
diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py
index cc7ddb0a..cb93f294 100644
--- a/bakta/features/s_orf.py
+++ b/bakta/features/s_orf.py
@@ -90,33 +90,33 @@ def get_feature_stop(feature: dict) -> int:
 def overlap_filter(data: dict, orfs_raw: Sequence[dict]):
     """Filter in-mem ORFs by overlapping CDSs."""
     t_rnas_per_sequence = {seq['id']: [] for seq in data['sequences']}
-    for t_rna in data['features'].get(bc.FEATURE_T_RNA, []):
+    for t_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_T_RNA]:
         t_rnas = t_rnas_per_sequence[t_rna['sequence']]
         t_rnas.append(t_rna)
-    for tm_rna in data['features'].get(bc.FEATURE_TM_RNA, []):
+    for tm_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_TM_RNA]:
         t_rnas = t_rnas_per_sequence[tm_rna['sequence']]
         t_rnas.append(tm_rna)
 
     r_rna_per_sequence = {seq['id']: [] for seq in data['sequences']}
-    for r_rna in data['features'].get(bc.FEATURE_R_RNA, []):
+    for r_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_R_RNA]:
         r_rnas = r_rna_per_sequence[r_rna['sequence']]
         r_rnas.append(r_rna)
 
     # nc_rnas_per_sequence = {k['id']: [] for k in data['sequences']}
-    # for nc_rna in data['features'].get(bc.FEATURE_NC_RNA, []):
+    # for nc_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA]:
     #     nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']]
     #     nc_rnas.append(nc_rna)
-    # for nc_rna in data['features'].get(bc.FEATURE_NC_RNA_REGION, []):
+    # for nc_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA_REGION]:
     #     nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']]
     #     nc_rnas.append(nc_rna)
 
     crispr_arrays_per_sequence = {seq['id']: [] for seq in data['sequences']}
-    for crispr_array in data['features'].get(bc.FEATURE_CRISPR, []):
+    for crispr_array in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CRISPR]:
         crispr_arrays = crispr_arrays_per_sequence[crispr_array['sequence']]
         crispr_arrays.append(crispr_array)
 
     cdss_per_sequence = {k['id']: [] for k in data['sequences']}
-    for cds in data['features'].get(bc.FEATURE_CDS, []):
+    for cds in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS]:
         cdss = cdss_per_sequence[cds['sequence']]
         cdss.append(cds)
 
diff --git a/bakta/io.py b/bakta/io.py
index 5f62efac..8bb2bf08 100644
--- a/bakta/io.py
+++ b/bakta/io.py
@@ -160,14 +160,13 @@ def main():
     print('\tGenome and annotation summary...')
     summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt')
     with summary_path.open('w') as fh_out:
-        genome_stats = bu.calc_genome_stats(data, features)
         fh_out.write('Sequence(s):\n')
-        fh_out.write(f"Length: {data['size']:}\n")
+        fh_out.write(f"Length: {data['stats']['size']:}\n")
         fh_out.write(f"Count: {len(data['sequences'])}\n")
-        fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n")
-        fh_out.write(f"N50: {genome_stats['n50']:}\n")
-        fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n")
-        fh_out.write(f"coding density: {100 * genome_stats['coding_ratio']:.1f}\n")
+        fh_out.write(f"GC: {100 * data['stats']['gc']:.1f}\n")
+        fh_out.write(f"N50: {data['stats']['n50']:}\n")
+        fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n")
+        fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n")
         fh_out.write('\nAnnotation:\n')
         fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n")
         fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n")
diff --git a/bakta/io/fasta.py b/bakta/io/fasta.py
index 7a824ef3..9c3c10cb 100644
--- a/bakta/io/fasta.py
+++ b/bakta/io/fasta.py
@@ -23,6 +23,11 @@ def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=T
     sequences = []
     with xopen(str(sequences_path), threads=0) as fh:
         for record in SeqIO.parse(fh, 'fasta'):
+            sequence = {
+                'id': record.id,
+                'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else ''
+            }
+            
             raw_sequence = str(record.seq).upper()
             if('-' in raw_sequence):
                 dash_count = raw_sequence.count('-')
@@ -32,6 +37,7 @@ def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=T
                 if(FASTA_DNA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                     log.error('import: Fasta sequence contains invalid DNA characters! id=%s', record.id)
                     raise ValueError(f'Fasta sequence contains invalid DNA characters! id={record.id}')
+                sequence['nt'] = raw_sequence
             else:
                 if(raw_sequence[-1] == '*'):  # remove trailing stop asterik
                     raw_sequence = raw_sequence[:-1]
@@ -39,13 +45,8 @@ def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=T
                 if(FASTA_AA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None):
                     log.error('import: Fasta sequence contains invalid AA characters! id=%s, seq=%s', record.id, raw_sequence)
                     raise ValueError(f'Fasta sequence contains invalid AA characters! id={record.id}')
-
-            sequence = {
-                'id': record.id,
-                'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else '',
-                'nt': raw_sequence,
-                'length': len(raw_sequence)
-            }
+                sequence['aa'] = raw_sequence
+            sequence['length'] = len(raw_sequence)
             if(is_genomic):
                 sequence['complete'] = False
                 sequence['type'] = bc.REPLICON_CONTIG
diff --git a/bakta/io/gff.py b/bakta/io/gff.py
index 09fc2ca9..9acca552 100644
--- a/bakta/io/gff.py
+++ b/bakta/io/gff.py
@@ -22,8 +22,8 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path:
         fh.write('##gff-version 3\n')  # GFF version
         fh.write('##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo\n')  # SO feature version
 
-        if(data['taxon']):  # write organism info
-            fh.write(f"# organism {data['taxon']}\n")
+        if(data['genome']['taxon']):  # write organism info
+            fh.write(f"# organism {data['genome']['taxon']}\n")
 
         fh.write('# Annotated with Bakta\n')
         fh.write(f'# Software: v{bakta.__version__}\n')
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
index f49730e3..698c8ff5 100644
--- a/bakta/io/insdc.py
+++ b/bakta/io/insdc.py
@@ -47,7 +47,7 @@ def write_features(data: dict, features: Sequence[dict], genbank_output_path: Pa
         )
         sequence_annotations = {
             'molecule_type': 'DNA',
-            'source': data['taxon'],
+            'source': data['genome']['taxon'],
             'date': date.today().strftime('%d-%b-%Y').upper(),
             'topology': seq['topology'],
             'data_file_division': 'HGT' if seq['type'] == bc.REPLICON_CONTIG else 'BCT',
@@ -61,12 +61,12 @@ def write_features(data: dict, features: Sequence[dict], genbank_output_path: Pa
         }
 
         description = ''
-        if(data['taxon']):
-            sequence_annotations['organism'] = data['taxon']
-            source_qualifiers['organism'] = data['taxon']
-            description = data['taxon']
-        if(data['strain']):
-            source_qualifiers['strain'] = data['strain']
+        if(data['genome']['taxon']):
+            sequence_annotations['organism'] = data['genome']['taxon']
+            source_qualifiers['organism'] = data['genome']['taxon']
+            description = data['genome']['taxon']
+        if(data['genome']['strain']):
+            source_qualifiers['strain'] = data['genome']['strain']
 
         if(seq['type'] == bc.REPLICON_PLASMID):
             source_qualifiers['plasmid'] = seq['name'] if seq.get('name', None) else 'unnamed'
diff --git a/bakta/io/json.py b/bakta/io/json.py
index 1d5e8256..a10cdc84 100644
--- a/bakta/io/json.py
+++ b/bakta/io/json.py
@@ -31,45 +31,13 @@ def write_json(data: dict, features: Sequence[dict], json_path: Path):
             if(psc):
                 psc.pop('db_xrefs')
 
-    # replace features type dict by sorted feature list
-    output = OrderedDict()
-    if data is not None:
-        ordered_genome = OrderedDict()
-        ordered_genome['genus'] = data['genus']
-        ordered_genome['species'] = data['species']
-        ordered_genome['strain'] = data['strain']
-        if('plasmid' in data):
-            ordered_genome['plasmid'] = data['plasmid']
-        ordered_genome['complete'] = data['complete']
-        ordered_genome['gram'] = data['gram']
-        ordered_genome['translation_table'] = data['translation_table']
-        output['genome'] = ordered_genome
-
-        stats = OrderedDict()
-        stats['no_sequences'] = len(data['sequences'])
-        stats['size'] = data['size']
-        stats['gc'] = data['gc']
-        stats['n_ratio'] = data['n_ratio']
-        stats['n50'] = data['n50']
-        stats['coding_ratio'] = data['coding_ratio']
-        output['stats'] = stats
-
-    output['features'] = features
-    if data is not None:
-        output['sequences'] = data['sequences']
-
-    run = OrderedDict()
-    run['start'] = cfg.run_start.strftime('%Y-%m-%d %H:%M:%S')
-    run['end'] = cfg.run_end.strftime('%Y-%m-%d %H:%M:%S')
-    output['run'] = run
-
     version = OrderedDict()
     version['bakta'] = bakta.__version__
     version['db'] = {
         'version': f"{cfg.db_info['major']}.{cfg.db_info['minor']}",
         'type': cfg.db_info['type']
     }
-    output['version'] = version
+    data['version'] = version
 
     with json_path.open('wt') as fh:
-        json.dump(output, fh, indent=4)
+        json.dump(data, fh, indent=4)
diff --git a/bakta/main.py b/bakta/main.py
index d54df130..22ac72d0 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -134,19 +134,23 @@ def main():
     sequences_path = cfg.tmp_path.joinpath('sequences.fna')
     fasta.export_sequences(sequences, sequences_path)
     data = {
-        'genus': cfg.genus,
-        'species': cfg.species,
-        'strain': cfg.strain,
-        'taxon': cfg.taxon,
-        'gram': cfg.gram,
-        'translation_table': cfg.translation_table,
-        'size': sum([seq['length'] for seq in sequences]),
-        'complete': cfg.complete or complete_genome,
-        'features': {},
+        'genome': {
+            'genus': cfg.genus,
+            'species': cfg.species,
+            'strain': cfg.strain,
+            'taxon': cfg.taxon,
+            'complete': cfg.complete or complete_genome,
+            'gram': cfg.gram,
+            'translation_table': cfg.translation_table
+        },
+        'stats': {
+            'size': sum([seq['length'] for seq in sequences])
+        },
+        'features': [],
         'sequences': sequences
     }
     if(cfg.plasmid):
-        data['plasmid'] = cfg.plasmid
+        data['genome']['plasmid'] = cfg.plasmid
     print('\nStart annotation...')
 
     ############################################################################
@@ -157,8 +161,9 @@ def main():
     else:
         print('predict tRNAs...')
         log.debug('start tRNA prediction')
-        data['features'][bc.FEATURE_T_RNA] = t_rna.predict_t_rnas(data, sequences_path)
-        print(f"\tfound: {len(data['features'][bc.FEATURE_T_RNA])}")
+        trnas = t_rna.predict_t_rnas(data, sequences_path)
+        data['features'].extend(trnas)
+        print(f"\tfound: {len(trnas)}")
 
     ############################################################################
     # tmRNA prediction
@@ -168,8 +173,9 @@ def main():
     else:
         print('predict tmRNAs...')
         log.debug('start tmRNA prediction')
-        data['features'][bc.FEATURE_TM_RNA] = tm_rna.predict_tm_rnas(data, sequences_path)
-        print(f"\tfound: {len(data['features'][bc.FEATURE_TM_RNA])}")
+        tmrnas = tm_rna.predict_tm_rnas(data, sequences_path)
+        data['features'].extend(tmrnas)
+        print(f"\tfound: {len(tmrnas)}")
 
     ############################################################################
     # rRNA prediction
@@ -179,8 +185,9 @@ def main():
     else:
         print('predict rRNAs...')
         log.debug('start rRNA prediction')
-        data['features'][bc.FEATURE_R_RNA] = r_rna.predict_r_rnas(data, sequences_path)
-        print(f"\tfound: {len(data['features'][bc.FEATURE_R_RNA])}")
+        rrnas = r_rna.predict_r_rnas(data, sequences_path)
+        data['features'].extend(rrnas)
+        print(f"\tfound: {len(rrnas)}")
 
     ############################################################################
     # ncRNA gene prediction
@@ -190,8 +197,9 @@ def main():
     else:
         print('predict ncRNAs...')
         log.debug('start ncRNA prediction')
-        data['features'][bc.FEATURE_NC_RNA] = nc_rna.predict_nc_rnas(data, sequences_path)
-        print(f"\tfound: {len(data['features'][bc.FEATURE_NC_RNA])}")
+        ncrnas = nc_rna.predict_nc_rnas(data, sequences_path)
+        data['features'].extend(ncrnas)
+        print(f"\tfound: {len(ncrnas)}")
 
     ############################################################################
     # ncRNA region prediction
@@ -201,8 +209,9 @@ def main():
     else:
         print('predict ncRNA regions...')
         log.debug('start ncRNA region prediction')
-        data['features'][bc.FEATURE_NC_RNA_REGION] = nc_rna_region.predict_nc_rna_regions(data, sequences_path)
-        print(f"\tfound: {len(data['features'][bc.FEATURE_NC_RNA_REGION])}")
+        ncrna_regions = nc_rna_region.predict_nc_rna_regions(data, sequences_path)
+        data['features'].extend(ncrna_regions)
+        print(f"\tfound: {len(ncrna_regions)}")
 
     ############################################################################
     # CRISPR prediction
@@ -212,8 +221,9 @@ def main():
     else:
         print('predict CRISPR arrays...')
         log.debug('start CRISPR prediction')
-        data['features'][bc.FEATURE_CRISPR] = crispr.predict_crispr(data, sequences_path)
-        print(f"\tfound: {len(data['features'][bc.FEATURE_CRISPR])}")
+        crisprs = crispr.predict_crispr(data, sequences_path)
+        data['features'].extend(crisprs)
+        print(f"\tfound: {len(crisprs)}")
 
     ############################################################################
     # CDS prediction
@@ -336,7 +346,7 @@ def main():
             print('\trevise special cases...')
             feat_cds.revise_special_cases_annotated(data, cdss)
 
-        data['features'][bc.FEATURE_CDS] = cdss
+        data['features'].extend(cdss)
 
     ############################################################################
     # sORF prediction
@@ -396,7 +406,7 @@ def main():
         log.debug('combine sORF annotations')
         for feat in sorfs_filtered:
             anno.combine_annotation(feat)  # combine IPS and PSC annotations
-        data['features'][bc.FEATURE_SORF] = sorfs_filtered
+        data['features'].extend(sorfs_filtered)
         print(f'\tfiltered sORFs: {len(sorfs_filtered)}')
         
         if(cfg.gram != bc.GRAM_UNKNOWN  and  len(sorfs_filtered) > 0):
@@ -418,7 +428,7 @@ def main():
         print('detect gaps...')
         log.debug('detect gaps')
         assembly_gaps = gaps.detect_assembly_gaps(data)
-        data['features'][bc.FEATURE_GAP] = assembly_gaps
+        data['features'].extend(assembly_gaps)
         print(f'\tfound: {len(assembly_gaps)}')
 
     ############################################################################
@@ -430,13 +440,13 @@ def main():
         print('detect oriCs/oriVs...')
         log.debug('detect oriC/V')
         oriCs = ori.predict_oris(data, sequences_path, bc.FEATURE_ORIC)
-        data['features'][bc.FEATURE_ORIC] = oriCs
+        data['features'].extend(oriCs)
         print(f'\tfound: {len(oriCs)}')
 
         print('detect oriTs...')
         log.debug('detect oriT')
         oriTs = ori.predict_oris(data, sequences_path, bc.FEATURE_ORIT)
-        data['features'][bc.FEATURE_ORIT] = oriTs
+        data['features'].extend(oriTs)
         print(f'\tfound: {len(oriTs)}')
 
     ############################################################################
@@ -459,18 +469,18 @@ def main():
     features_by_sequence = {k['id']: [] for k in data['sequences']}
     feature_id = 1
     feature_id_prefix = bu.create_locus_tag_prefix(sequences, length=10)
-    for feature_list in data['features'].values():
-        for feature in feature_list:
-            if('discarded' not in feature):
-                feature['id'] = f'{feature_id_prefix}_{feature_id}'
-                feature_id += 1
-                seq_features = features_by_sequence.get(feature['sequence'])
-                seq_features.append(feature)
+    for feature in data['features']:
+        if('discarded' not in feature):
+            feature['id'] = f'{feature_id_prefix}_{feature_id}'
+            feature_id += 1
+            seq_features = features_by_sequence.get(feature['sequence'])
+            seq_features.append(feature)
     features = []
     for seq in data['sequences']:
         seq_features = features_by_sequence[seq['id']]
         seq_features.sort(key=lambda k: k['start'])
         features.extend(seq_features)
+    data['features'] = features  # overwrite feature list by final sorted feature list
     log.info('selected features=%i', len(features))
     print(f'\tselected: {len(features)}')
 
@@ -497,15 +507,14 @@ def main():
     # - genome stats
     # - annotation stats
     ############################################################################
+    bu.calc_genome_stats(data)
     print('\nGenome statistics:')
-    genome_stats = bu.calc_genome_stats(data, features)
-    print(f"\tGenome size: {data['size']:,} bp")
+    print(f"\tGenome size: {data['stats']['size']:,} bp")
     print(f"\tContigs/replicons: {len(data['sequences'])}")
-    print(f"\tGC: {100 * genome_stats['gc']:.1f} %")
-    print(f"\tN50: {genome_stats['n50']:,}")
-    print(f"\tN ratio: {100 * genome_stats['n_ratio']:.1f} %")
-    print(f"\tcoding density: {100 * genome_stats['coding_ratio']:.1f} %")
-
+    print(f"\tGC: {100 * data['stats']['gc']:.1f} %")
+    print(f"\tN50: {data['stats']['n50']:,}")
+    print(f"\tN ratio: {100 * data['stats']['n_ratio']:.1f} %")
+    print(f"\tcoding density: {100 * data['stats']['coding_ratio']:.1f} %")
     print('\nannotation summary:')
     print(f"\ttRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}")
     print(f"\ttmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}")
@@ -593,12 +602,12 @@ def main():
     summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt')
     with summary_path.open('w') as fh_out:
         fh_out.write('Sequence(s):\n')
-        fh_out.write(f"Length: {data['size']:}\n")
+        fh_out.write(f"Length: {data['stats']['size']:}\n")
         fh_out.write(f"Count: {len(data['sequences'])}\n")
-        fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n")
-        fh_out.write(f"N50: {genome_stats['n50']:}\n")
-        fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n")
-        fh_out.write(f"coding density: {100 * genome_stats['coding_ratio']:.1f}\n")
+        fh_out.write(f"GC: {100 * data['stats']['gc']:.1f}\n")
+        fh_out.write(f"N50: {data['stats']['n50']:}\n")
+        fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n")
+        fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n")
         fh_out.write('\nAnnotation:\n')
         fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n")
         fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n")
diff --git a/bakta/proteins.py b/bakta/proteins.py
index a7835611..d7f7d169 100644
--- a/bakta/proteins.py
+++ b/bakta/proteins.py
@@ -137,7 +137,6 @@ def main():
     mock_start = 1
     for aa in aas:  # rename and mock feature attributes to reuse existing functions
         aa['type'] = bc.FEATURE_CDS
-        aa['aa'] = aa['sequence']
         aa['locus'] = aa['id']
         aa['sequence'] = '-'
         aa['start'] = mock_start
@@ -177,7 +176,7 @@ def main():
         aa.pop('frame', None)
     full_annotations_path = output_path.joinpath(f'{cfg.prefix}.json')
     print(f'\tfull annotations (JSON): {full_annotations_path}')
-    json.write_json(None, aas, full_annotations_path)
+    json.write_json({'features': aas}, aas, full_annotations_path)
     hypotheticals_path = output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv')
     header_columns = ['ID', 'Length', 'Mol Weight [kDa]', 'Iso El. Point', 'Pfam hits']
     hypotheticals = hypotheticals = [aa for aa in aas if 'hypothetical' in aa]
diff --git a/bakta/utils.py b/bakta/utils.py
index eb7863e1..e405030e 100644
--- a/bakta/utils.py
+++ b/bakta/utils.py
@@ -293,8 +293,8 @@ def has_annotation(feature: dict, attribute: str) -> bool:
         return False
 
 
-def calc_genome_stats(data: dict, features: Sequence[dict]):
-    genome_size = data['size']
+def calc_genome_stats(data: dict):
+    genome_size = data['stats']['size']
     log.info('genome-size=%i', genome_size)
 
     # N50
@@ -305,11 +305,11 @@ def calc_genome_stats(data: dict, features: Sequence[dict]):
         gc_sum += nt.count('G') + nt.count('C')
         n_sum += nt.count('N')
     gc_ratio = gc_sum / (genome_size - n_sum)
-    data['gc'] = gc_ratio
+    data['stats']['gc'] = gc_ratio
     log.info('GC=%0.3f', gc_ratio)
 
     n_ratio = n_sum / genome_size
-    data['n_ratio'] = n_ratio
+    data['stats']['n_ratio'] = n_ratio
     log.info('N=%0.3f', n_ratio)
 
     n50 = 0
@@ -320,28 +320,21 @@ def calc_genome_stats(data: dict, features: Sequence[dict]):
         if(sequence_length_sum >= genome_size / 2):
             n50 = nt_length
             break
-    data['n50'] = n50
+    data['stats']['n50'] = n50
     log.info('N50=%i', n50)
 
     sequence_by_id = {seq['id']: seq for seq in data['sequences']}
     coding_nts = 0
-    for feat in features:
+    for feat in data['features']:
         if(feat.get('edge', False)):
             sequence_length = sequence_by_id[feat['sequence']]['length']
             coding_nts += feat['stop'] + (sequence_length - feat['start'] + 1)  # feature coding nucleotides
         else:
             coding_nts += feat['stop'] - feat['start'] + 1  # feature coding nucleotides
     coding_ratio = coding_nts / (genome_size - n_sum)
-    data['coding_ratio'] = coding_ratio
+    data['stats']['coding_ratio'] = coding_ratio
     log.info('coding-ratio=%0.3f', coding_ratio)
 
-    return {
-        'gc': gc_ratio,
-        'n_ratio': n_ratio,
-        'n50': n50,
-        'coding_ratio': coding_ratio
-    }
-
 
 def parse_replicon_table(replicon_table_path: Path) -> Dict[str, dict]:
     replicons = {}
diff --git a/test/test_pseudo.py b/test/test_pseudo.py
index c35199e1..e9a25b3b 100644
--- a/test/test_pseudo.py
+++ b/test/test_pseudo.py
@@ -271,7 +271,7 @@ def test_compare_alignments(alignment, ref_alignment, cds, coordinates, expected
               'edge': False
             },
             {
-              'sequence': 'ACGT' * 200,
+              'nt': 'ACGT' * 200,
               'topology': 'linear'
             },
             {
@@ -291,7 +291,7 @@ def test_compare_alignments(alignment, ref_alignment, cds, coordinates, expected
               'edge': False
             },
             {
-              'sequence': 'ACGT' * 50,  # 200nt
+              'nt': 'ACGT' * 50,  # 200nt
               'topology': 'linear'
             },
             {
@@ -313,7 +313,7 @@ def test_compare_alignments(alignment, ref_alignment, cds, coordinates, expected
               'elongation_downstream': 300
             },
             {
-              'sequence': 'ACGT' * 100,  # 400nt
+              'nt': 'ACGT' * 100,  # 400nt
               'topology': 'circular'
             },
             {
diff --git a/test/test_sORF.py b/test/test_sORF.py
index 56c21c9a..4db8004a 100644
--- a/test/test_sORF.py
+++ b/test/test_sORF.py
@@ -8,17 +8,17 @@
 CONTIG_1 = {
     'id': 1,
     'description': 'no sORFs',
-    'sequence': 'GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG'
+    'nt': 'GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG'
 }
 CONTIG_2 = {
     'id': 2,
     'description': 'out of limits',
-    'sequence': 'ATGAAAAAATAGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG'
+    'nt': 'ATGAAAAAATAGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG'
 }
 CONTIG_3 = {
     'id': 3,
     'description': 'two sORFs',
-    'sequence': 'ATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG'
+    'nt': 'ATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG'
 }
 
 GENOME_1 = {

From 93af02415ad416cac1e1037fbf3d861334f1ce9b Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 15 Oct 2024 17:34:52 +0200
Subject: [PATCH 5/8] fix io

---
 bakta/io.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/bakta/io.py b/bakta/io.py
index 8bb2bf08..1002befb 100644
--- a/bakta/io.py
+++ b/bakta/io.py
@@ -91,15 +91,9 @@ def main():
     ############################################################################
     print('Parse genome annotations...')
     with annotation_path.open('r') as fh:
-        annotation = json.load(fh)
-    features = annotation['features']
-    sequences = annotation['sequences']
-    data = {
-        'features': features,
-        'sequence': sequences,
-        'taxon': annotation['genome']
-    }
-    features_by_sequence = {k['id']: [] for k in data['sequences']}
+        data = json.load(fh)
+    features = data['features']
+    features_by_sequence = {seq['id']: [] for seq in data['sequences']}
     for feature in data['features']:
         sequence_features = features_by_sequence.get(feature['sequence'])
         sequence_features.append(feature)

From 1f11b1cd86d643a9a70642bff44157c692997b12 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 15 Oct 2024 17:37:42 +0200
Subject: [PATCH 6/8] refactor seq var name in list comprehensions

---
 bakta/features/annotation.py | 16 ++++++++--------
 bakta/features/cds.py        |  4 ++--
 bakta/features/s_orf.py      |  4 ++--
 bakta/main.py                |  2 +-
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py
index 7b771a32..2c139162 100644
--- a/bakta/features/annotation.py
+++ b/bakta/features/annotation.py
@@ -151,35 +151,35 @@ def detect_feature_overlaps(data: dict):
     CDS < tmRNA, tRNA, rRNA, CRISPR
     sORF < mRNA, tRNA, rRNA, CRISPR, CDS (in-frame & entirely overlapping), sORF (shorter, weaker annotations)
     """
-    sequence_t_rnas = {k['id']: [] for k in data['sequences']}
+    sequence_t_rnas = {seq['id']: [] for seq in data['sequences']}
     for trna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_T_RNA]:
         t_rnas = sequence_t_rnas[trna['sequence']]
         t_rnas.append(trna)
-    sequence_tm_rnas = {k['id']: [] for k in data['sequences']}
+    sequence_tm_rnas = {seq['id']: [] for seq in data['sequences']}
     for tm_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_TM_RNA]:
         tm_rnas = sequence_tm_rnas[tm_rna['sequence']]
         tm_rnas.append(tm_rna)
-    sequence_r_rnas = {k['id']: [] for k in data['sequences']}
+    sequence_r_rnas = {seq['id']: [] for seq in data['sequences']}
     for r_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_R_RNA]:
         r_rnas = sequence_r_rnas[r_rna['sequence']]
         r_rnas.append(r_rna)
-    sequence_ncrna_regions = {k['id']: [] for k in data['sequences']}
+    sequence_ncrna_regions = {seq['id']: [] for seq in data['sequences']}
     for ncRNA_region in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA_REGION]:
         ncRNA_regions = sequence_ncrna_regions[ncRNA_region['sequence']]
         ncRNA_regions.append(ncRNA_region)
-    sequence_crispr_arrays = {k['id']: [] for k in data['sequences']}
+    sequence_crispr_arrays = {seq['id']: [] for seq in data['sequences']}
     for crispr_array in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CRISPR]:
         crispr_arrays = sequence_crispr_arrays[crispr_array['sequence']]
         crispr_arrays.append(crispr_array)
-    sequence_cdss = {k['id']: [] for k in data['sequences']}
-    sequence_cdss_user_provided = {k['id']: [] for k in data['sequences']}
+    sequence_cdss = {seq['id']: [] for seq in data['sequences']}
+    sequence_cdss_user_provided = {seq['id']: [] for seq in data['sequences']}
     for cds in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS]:
         if(cds.get('source', None) == bc.CDS_SOURCE_USER):
             cdss = sequence_cdss_user_provided[cds['sequence']]
         else:
             cdss = sequence_cdss[cds['sequence']]
         cdss.append(cds)
-    sequence_sorfs = {k['id']: [] for k in data['sequences']}
+    sequence_sorfs = {seq['id']: [] for seq in data['sequences']}
     for sorf in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_SORF]:
         sorfs = sequence_sorfs[sorf['sequence']]
         sorfs.append(sorf)
diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index 4215b7d6..8e5b0e99 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -411,12 +411,12 @@ def revise_translational_exceptions(data: dict, cdss: Sequence[dict]):
 
     sequences = {seq['id']: seq for seq in data['sequences']}
     # detect splitted orphan ORFs of selenocystein proteins that are subject to stop codon recoding.
-    cdss_per_sequences = {k['id']: [] for k in data['sequences']}  # get CDS per sequence
+    cdss_per_sequences = {seq['id']: [] for seq in data['sequences']}  # get CDS per sequence
     for cds in cdss:
         cdss_per_sequence = cdss_per_sequences[cds['sequence']]
         if('truncated' not in cds):  # exclude truncated CDS for now
             cdss_per_sequence.append(cds)
-    cds_pairs_per_sequence = {k['id']: [] for k in data['sequences']}  # extract inframe primate CDS neighbouring pairs
+    cds_pairs_per_sequence = {seq['id']: [] for seq in data['sequences']}  # extract inframe primate CDS neighbouring pairs
     for id, cdss_per_sequence in cdss_per_sequences.items():
         cdss_per_sequence = sorted(cdss_per_sequence, key=lambda k: k['start'])
         for i in range(1, len(cdss_per_sequence)):
diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py
index cb93f294..d8d8a5ad 100644
--- a/bakta/features/s_orf.py
+++ b/bakta/features/s_orf.py
@@ -102,7 +102,7 @@ def overlap_filter(data: dict, orfs_raw: Sequence[dict]):
         r_rnas = r_rna_per_sequence[r_rna['sequence']]
         r_rnas.append(r_rna)
 
-    # nc_rnas_per_sequence = {k['id']: [] for k in data['sequences']}
+    # nc_rnas_per_sequence = {seq['id']: [] for seq in data['sequences']}
     # for nc_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA]:
     #     nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']]
     #     nc_rnas.append(nc_rna)
@@ -115,7 +115,7 @@ def overlap_filter(data: dict, orfs_raw: Sequence[dict]):
         crispr_arrays = crispr_arrays_per_sequence[crispr_array['sequence']]
         crispr_arrays.append(crispr_array)
 
-    cdss_per_sequence = {k['id']: [] for k in data['sequences']}
+    cdss_per_sequence = {seq['id']: [] for seq in data['sequences']}
     for cds in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS]:
         cdss = cdss_per_sequence[cds['sequence']]
         cdss.append(cds)
diff --git a/bakta/main.py b/bakta/main.py
index 22ac72d0..664eb3aa 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -466,7 +466,7 @@ def main():
     ############################################################################
     print('select features and create locus tags...')
     log.debug('start feature selection and creation of locus tags')
-    features_by_sequence = {k['id']: [] for k in data['sequences']}
+    features_by_sequence = {seq['id']: [] for seq in data['sequences']}
     feature_id = 1
     feature_id_prefix = bu.create_locus_tag_prefix(sequences, length=10)
     for feature in data['features']:

From befe6eec861d141b64bfd82ba9682c4666fe3f4a Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 15 Oct 2024 17:49:37 +0200
Subject: [PATCH 7/8] remove io.py

---
 bakta/io.py | 189 ----------------------------------------------------
 1 file changed, 189 deletions(-)
 delete mode 100644 bakta/io.py

diff --git a/bakta/io.py b/bakta/io.py
deleted file mode 100644
index 1002befb..00000000
--- a/bakta/io.py
+++ /dev/null
@@ -1,189 +0,0 @@
-import atexit
-import logging
-import os
-import sys
-
-from pathlib import Path
-
-import bakta
-import bakta.constants as bc
-import bakta.config as cfg
-import bakta.utils as bu
-import bakta.io.fasta as fasta
-import bakta.io.json as json
-import bakta.io.tsv as tsv
-import bakta.io.gff as gff
-import bakta.io.insdc as insdc
-import bakta.plot as plot
-
-
-log = logging.getLogger('IO')
-
-
-def main():
-    # parse options and arguments
-    parser = bu.init_parser(sub_command='_proteins')
-    parser.add_argument('input', metavar='<input>', help='Bakta annotations in JSON format')
-    
-    arg_group_io = parser.add_argument_group('Input / Output')
-    arg_group_io.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
-    arg_group_io.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files')
-    arg_group_io.add_argument('--force', '-f', action='store_true', help='Force overwriting existing output folder')
-    
-    arg_group_general = parser.add_argument_group('General')
-    arg_group_general.add_argument('--help', '-h', action='help', help='Show this help message and exit')
-    arg_group_general.add_argument('--verbose', '-v', action='store_true', help='Print verbose information')
-    arg_group_general.add_argument('--debug', action='store_true', help='Run Bakta in debug mode. Temp data will not be removed.')
-    arg_group_general.add_argument('--version', '-V', action='version', version=f'%(prog)s {bakta.__version__}')
-    args = parser.parse_args()
-
-    ############################################################################
-    # Setup logging
-    ############################################################################
-    cfg.prefix = args.prefix if args.prefix else Path(args.input).stem
-    output_path = cfg.check_output_path(args.output, args.force)
-    cfg.force = args.force
-    log.info('force=%s', args.force)
-    
-    bu.setup_logger(output_path, cfg.prefix, args)
-    log.info('prefix=%s', cfg.prefix)
-    log.info('output=%s', output_path)
-
-    ############################################################################
-    # Checks and configurations
-    # - check parameters and setup global configuration
-    # - test database
-    # - test binary dependencies
-    ############################################################################
-    try:
-        if args.input == '':
-            raise ValueError('File path argument must be non-empty')
-        annotation_path = Path(args.input).resolve()
-        cfg.check_readability('annotation', annotation_path)
-        cfg.check_content_size('annotation', annotation_path)
-    except:
-        log.error('provided annotation file not valid! path=%s', args.input)
-        sys.exit(f'ERROR: annotation file ({args.input}) not valid!')
-    log.info('input-path=%s', annotation_path)
-    
-    cfg.check_tmp_path(args)
-    cfg.debug = args.debug
-    log.info('debug=%s', cfg.debug)
-    cfg.verbose = True if cfg.debug else args.verbose
-    log.info('verbose=%s', cfg.verbose)
-    cfg.user_proteins = cfg.check_user_proteins(args)
-    
-    if(cfg.verbose):
-        print(f'Bakta v{bakta.__version__}')
-        print('Options and arguments:')
-        print(f'\tinput: {annotation_path}')
-        print(f'\toutput: {cfg.output_path}')
-        print(f'\tprefix: {cfg.prefix}')
-        if(cfg.force): print(f'\tforce: {cfg.force}')
-    
-    if(cfg.debug):
-        print(f"\nBakta runs in DEBUG mode! Temporary data will not be destroyed at: {cfg.tmp_path}")
-    else:
-        atexit.register(bu.cleanup, log, cfg.tmp_path)  # register cleanup exit hook
-    
-    ############################################################################
-    # Import annotations from JSON
-    ############################################################################
-    print('Parse genome annotations...')
-    with annotation_path.open('r') as fh:
-        data = json.load(fh)
-    features = data['features']
-    features_by_sequence = {seq['id']: [] for seq in data['sequences']}
-    for feature in data['features']:
-        sequence_features = features_by_sequence.get(feature['sequence'])
-        sequence_features.append(feature)
-
-    ############################################################################
-    # Write output files
-    # - write optional output files in GFF3/GenBank/EMBL formats
-    # - measure runtime
-    # - write comprehensive annotation results as JSON
-    # - remove temp directory
-    ############################################################################
-    print(f'\nExport annotation results to: {cfg.output_path}')
-    print('\thuman readable TSV...')
-    tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv')
-    tsv.write_features(data['sequences'], features_by_sequence, tsv_path)
-
-    print('\tGFF3...')
-    gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3')
-    gff.write_features(data, features_by_sequence, gff3_path)
-
-    print('\tINSDC GenBank & EMBL...')
-    genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff')
-    embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl')
-    insdc.write_features(data, features, genbank_path, embl_path)
-
-    print('\tgenome sequences...')
-    fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna')
-    fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True)
-
-    print('\tfeature nucleotide sequences...')
-    ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn')
-    fasta.write_ffn(features, ffn_path)
-
-    print('\ttranslated CDS sequences...')
-    faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.faa')
-    fasta.write_faa(features, faa_path)
-
-    print('\tfeature inferences...')
-    tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv')
-    tsv.write_feature_inferences(data['sequences'], features_by_sequence, tsv_path)
-
-    if(cfg.skip_plot  or  cfg.meta):
-        print('\tskip generation of circular genome plot...')
-    else:
-        print('\tcircular genome plot...')
-        plot.write(features, data['sequences'], cfg.output_path)
-
-    if(cfg.skip_cds is False):
-        hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]
-        print('\thypothetical TSV...')
-        tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv')
-        tsv.write_hypotheticals(hypotheticals, tsv_path)
-
-        print('\ttranslated hypothetical CDS sequences...')
-        faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.faa')
-        fasta.write_faa(hypotheticals, faa_path)
-
-    print('\tGenome and annotation summary...')
-    summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt')
-    with summary_path.open('w') as fh_out:
-        fh_out.write('Sequence(s):\n')
-        fh_out.write(f"Length: {data['stats']['size']:}\n")
-        fh_out.write(f"Count: {len(data['sequences'])}\n")
-        fh_out.write(f"GC: {100 * data['stats']['gc']:.1f}\n")
-        fh_out.write(f"N50: {data['stats']['n50']:}\n")
-        fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n")
-        fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n")
-        fh_out.write('\nAnnotation:\n')
-        fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n")
-        fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n")
-        fh_out.write(f"rRNAs: {len([f for f in features if f['type'] == bc.FEATURE_R_RNA])}\n")
-        fh_out.write(f"ncRNAs: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA])}\n")
-        fh_out.write(f"ncRNA regions: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA_REGION])}\n")
-        fh_out.write(f"CRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}\n")
-        cdss = [f for f in features if f['type'] == bc.FEATURE_CDS]
-        fh_out.write(f"CDSs: {len(cdss)}\n")
-        fh_out.write(f"pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}\n")
-        fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n")
-        fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n")
-        fh_out.write(f"sORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}\n")
-        fh_out.write(f"gaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}\n")
-        fh_out.write(f"oriCs: {len([f for f in features if f['type'] == bc.FEATURE_ORIC])}\n")
-        fh_out.write(f"oriVs: {len([f for f in features if f['type'] == bc.FEATURE_ORIV])}\n")
-        fh_out.write(f"oriTs: {len([f for f in features if f['type'] == bc.FEATURE_ORIT])}\n")
-        fh_out.write('\nBakta:\n')
-        fh_out.write(f'Software: v{bakta.__version__}\n')
-        fh_out.write(f"Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
-        fh_out.write('DOI: 10.1099/mgen.0.000685\n')
-        fh_out.write('URL: github.com/oschwengers/bakta\n')
-
-
-if __name__ == '__main__':
-    main()

From fe9eece8d93bea1f9bc918066735edfd54405e00 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Tue, 15 Oct 2024 18:15:04 +0200
Subject: [PATCH 8/8] refactor var names in list comprehensions

---
 bakta/main.py                       | 44 ++++++++++++++---------------
 bakta/proteins.py                   | 14 ++++-----
 scripts/collect-annotation-stats.py | 28 +++++++++---------
 test/test_bakta.py                  |  4 +--
 4 files changed, 45 insertions(+), 45 deletions(-)

diff --git a/bakta/main.py b/bakta/main.py
index 664eb3aa..881711be 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -516,21 +516,21 @@ def main():
     print(f"\tN ratio: {100 * data['stats']['n_ratio']:.1f} %")
     print(f"\tcoding density: {100 * data['stats']['coding_ratio']:.1f} %")
     print('\nannotation summary:')
-    print(f"\ttRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}")
-    print(f"\ttmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}")
-    print(f"\trRNAs: {len([f for f in features if f['type'] == bc.FEATURE_R_RNA])}")
-    print(f"\tncRNAs: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA])}")
-    print(f"\tncRNA regions: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA_REGION])}")
-    print(f"\tCRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}")
-    cdss = [f for f in features if f['type'] == bc.FEATURE_CDS]
+    print(f"\ttRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_T_RNA])}")
+    print(f"\ttmRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_TM_RNA])}")
+    print(f"\trRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_R_RNA])}")
+    print(f"\tncRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_NC_RNA])}")
+    print(f"\tncRNA regions: {len([feat for feat in features if feat['type'] == bc.FEATURE_NC_RNA_REGION])}")
+    print(f"\tCRISPR arrays: {len([feat for feat in features if feat['type'] == bc.FEATURE_CRISPR])}")
+    cdss = [feat for feat in features if feat['type'] == bc.FEATURE_CDS]
     print(f"\tCDSs: {len(cdss)}")
     print(f"\t\thypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}")
     print(f"\t\tpseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}")
     print(f"\t\tsignal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}")
-    print(f"\tsORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}")
-    print(f"\tgaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}")
-    print(f"\toriCs/oriVs: {len([f for f in features if (f['type'] == bc.FEATURE_ORIC or f['type'] == bc.FEATURE_ORIV)])}")
-    print(f"\toriTs: {len([f for f in features if f['type'] == bc.FEATURE_ORIT])}")
+    print(f"\tsORFs: {len([feat for feat in features if feat['type'] == bc.FEATURE_SORF])}")
+    print(f"\tgaps: {len([feat for feat in features if feat['type'] == bc.FEATURE_GAP])}")
+    print(f"\toriCs/oriVs: {len([feat for feat in features if (feat['type'] == bc.FEATURE_ORIC or feat['type'] == bc.FEATURE_ORIV)])}")
+    print(f"\toriTs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIT])}")
 
     ############################################################################
     # Write output files
@@ -609,21 +609,21 @@ def main():
         fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n")
         fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n")
         fh_out.write('\nAnnotation:\n')
-        fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n")
-        fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n")
-        fh_out.write(f"rRNAs: {len([f for f in features if f['type'] == bc.FEATURE_R_RNA])}\n")
-        fh_out.write(f"ncRNAs: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA])}\n")
-        fh_out.write(f"ncRNA regions: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA_REGION])}\n")
-        fh_out.write(f"CRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}\n")
+        fh_out.write(f"tRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_T_RNA])}\n")
+        fh_out.write(f"tmRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_TM_RNA])}\n")
+        fh_out.write(f"rRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_R_RNA])}\n")
+        fh_out.write(f"ncRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_NC_RNA])}\n")
+        fh_out.write(f"ncRNA regions: {len([feat for feat in features if feat['type'] == bc.FEATURE_NC_RNA_REGION])}\n")
+        fh_out.write(f"CRISPR arrays: {len([feat for feat in features if feat['type'] == bc.FEATURE_CRISPR])}\n")
         fh_out.write(f"CDSs: {len(cdss)}\n")
         fh_out.write(f"pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}\n")
         fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n")
         fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n")
-        fh_out.write(f"sORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}\n")
-        fh_out.write(f"gaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}\n")
-        fh_out.write(f"oriCs: {len([f for f in features if f['type'] == bc.FEATURE_ORIC])}\n")
-        fh_out.write(f"oriVs: {len([f for f in features if f['type'] == bc.FEATURE_ORIV])}\n")
-        fh_out.write(f"oriTs: {len([f for f in features if f['type'] == bc.FEATURE_ORIT])}\n")
+        fh_out.write(f"sORFs: {len([feat for feat in features if feat['type'] == bc.FEATURE_SORF])}\n")
+        fh_out.write(f"gaps: {len([feat for feat in features if feat['type'] == bc.FEATURE_GAP])}\n")
+        fh_out.write(f"oriCs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIC])}\n")
+        fh_out.write(f"oriVs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIV])}\n")
+        fh_out.write(f"oriTs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIT])}\n")
         fh_out.write('\nBakta:\n')
         fh_out.write(f'Software: v{bakta.__version__}\n')
         fh_out.write(f"Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
diff --git a/bakta/proteins.py b/bakta/proteins.py
index d7f7d169..f0485d25 100644
--- a/bakta/proteins.py
+++ b/bakta/proteins.py
@@ -199,12 +199,12 @@ def map_aa_columns(feat: dict) -> Sequence[str]:
         str(feat['length']),
         gene,
         feat['product'],
-        ','.join([k.replace('EC:', '') for k in feat['db_xrefs'] if 'EC:' in k]),
-        ','.join([k for k in feat['db_xrefs'] if 'GO:' in k]),
-        ','.join([k.replace('COG:', '') for k in feat['db_xrefs'] if 'COG:' in k]),
-        ','.join([k.replace('RefSeq:', '') for k in feat['db_xrefs'] if 'RefSeq:' in k]),
-        ','.join([k.replace('UniParc:', '') for k in feat['db_xrefs'] if 'UniParc:' in k]),
-        ','.join([k.replace('UniRef:', '') for k in feat['db_xrefs'] if 'UniRef' in k])
+        ','.join([dbxref.replace('EC:', '') for dbxref in feat['db_xrefs'] if 'EC:' in dbxref]),
+        ','.join([dbxref for dbxref in feat['db_xrefs'] if 'GO:' in dbxref]),
+        ','.join([dbxref.replace('COG:', '') for dbxref in feat['db_xrefs'] if 'COG:' in dbxref]),
+        ','.join([dbxref.replace('RefSeq:', '') for dbxref in feat['db_xrefs'] if 'RefSeq:' in dbxref]),
+        ','.join([dbxref.replace('UniParc:', '') for dbxref in feat['db_xrefs'] if 'UniParc:' in dbxref]),
+        ','.join([dbxref.replace('UniRef:', '') for dbxref in feat['db_xrefs'] if 'UniRef' in dbxref])
     ]
 
 
@@ -214,7 +214,7 @@ def map_hypothetical_columns(feat: dict) -> Sequence[str]:
         str(feat['length']),
         f"{(feat['seq_stats']['molecular_weight']/1000):.1f}" if feat['seq_stats']['molecular_weight'] else 'NA'
         f"{feat['seq_stats']['isoelectric_point']:.1f}" if feat['seq_stats']['isoelectric_point'] else 'NA'
-        ','.join([k.replace('PFAM:', '') for k in feat['db_xrefs'] if 'PFAM:' in k])
+        ','.join([dbxref.replace('PFAM:', '') for dbxref in feat['db_xrefs'] if 'PFAM:' in dbxref])
     ]
 
 
diff --git a/scripts/collect-annotation-stats.py b/scripts/collect-annotation-stats.py
index 4786a709..58902e31 100755
--- a/scripts/collect-annotation-stats.py
+++ b/scripts/collect-annotation-stats.py
@@ -82,20 +82,20 @@
                 f"{100 * data['stats']['n_ratio']:.1f}",
                 f"{data['stats']['n50']}",
                 f"{100 * data['stats']['coding_ratio']:.1f}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_T_RNA])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_TM_RNA])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_R_RNA])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_NC_RNA])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_NC_RNA_REGION])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CRISPR])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS and 'hypothetical' in f])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS and 'pseudogene' in f])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_SORF])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_GAP])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIC])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIV])}",
-                f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIT])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_T_RNA])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_TM_RNA])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_R_RNA])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA_REGION])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_CRISPR])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS and 'pseudogene' in feat])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_SORF])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_GAP])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_ORIC])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_ORIV])}",
+                f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_ORIT])}",
             ]
             output_line = '\t'.join(stats)
             print(output_line)
diff --git a/test/test_bakta.py b/test/test_bakta.py
index d4c4f261..9ca62cba 100644
--- a/test/test_bakta.py
+++ b/test/test_bakta.py
@@ -81,7 +81,7 @@ def test_bakta_plasmid(tmpdir):
         bc.FEATURE_ORIT: 0
     }
     for type, count in feature_counts_expected.items():
-        assert len([f for f in features if f['type'] == type]) == count
+        assert len([feat for feat in features if feat['type'] == type]) == count
 
 
 @pytest.mark.parametrize(
@@ -142,5 +142,5 @@ def test_bakta_genome(db, tmpdir):
         bc.FEATURE_ORIT: 0
     }
     for type, count in feature_counts_expected.items():
-        assert len([f for f in features if f['type'] == type]) == count
+        assert len([feat for feat in features if feat['type'] == type]) == count