From ec48cf641f1cbdf590c9bfa03e4e84724b7c99aa Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Thu, 16 Nov 2023 16:50:25 +0100
Subject: [PATCH 01/17] introduce user-provided regions

---
 bakta/config.py | 16 +++++++++++++++-
 bakta/main.py   |  6 ++++++
 bakta/utils.py  |  1 +
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/bakta/config.py b/bakta/config.py
index 8c6866e7..0bf01990 100644
--- a/bakta/config.py
+++ b/bakta/config.py
@@ -55,6 +55,7 @@
 compliant = None
 user_proteins = None
 meta = None
+regions = None
 
 # workflow configuration
 skip_trna = None
@@ -160,7 +161,7 @@ def setup(args):
         taxon = None
 
     # annotation configurations
-    global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, gram, replicons, compliant, user_proteins, meta
+    global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, gram, replicons, compliant, user_proteins, meta, regions
     complete = args.complete
     log.info('complete=%s', complete)
     prodigal_tf = args.prodigal_tf
@@ -232,6 +233,19 @@ def setup(args):
             sys.exit(f'ERROR: replicon table file ({replicons}) not valid!')
     log.info('replicon-table=%s', replicons)
     user_proteins = check_user_proteins(args)
+    regions = args.regions
+    if(regions is not None):
+        try:
+            if(regions == ''):
+                raise ValueError('File path argument must be non-empty')
+            regions_path = Path(args.regions).resolve()
+            check_readability('regions', regions_path)
+            check_content_size('regions', regions_path)
+            regions = regions_path
+        except:
+            log.error('provided regions file not valid! path=%s', regions)
+            sys.exit(f'ERROR: regions file ({regions}) not valid!')
+    log.info('regions=%s', regions)
     
 
     # workflow configurations
diff --git a/bakta/main.py b/bakta/main.py
index f0948348..b14cd0ec 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -239,6 +239,12 @@ def main():
             no_revised = feat_cds.revise_translational_exceptions(genome, cdss)
             print(f'\trevised translational exceptions: {no_revised}')
             cdss = [cds for cds in cdss if 'discarded' not in cds]
+        
+        if(cfg.regions):
+            log.debug('import user-provided CDS regions')
+            imported_cdss = feat_cds.import_user_cdss(genome, cfg.regions)
+            print(f'\timported CDS regions: {len(imported_cdss)}')
+            # ToDo: exclude overlapping CDS
 
         if(len(cdss) > 0):
             log.debug('lookup CDS UPS/IPS')
diff --git a/bakta/utils.py b/bakta/utils.py
index dc224bb2..240f62ef 100644
--- a/bakta/utils.py
+++ b/bakta/utils.py
@@ -91,6 +91,7 @@ def parse_arguments():
     arg_group_annotation.add_argument('--compliant', action='store_true', help='Force Genbank/ENA/DDJB compliance')
     arg_group_annotation.add_argument('--proteins', action='store', default=None, dest='proteins', help='Fasta file of trusted protein sequences for CDS annotation')
     arg_group_annotation.add_argument('--meta', action='store_true', help='Run in metagenome mode. This only affects CDS prediction.')
+    arg_group_annotation.add_argument('--regions', action='store', default=None, help='Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations).')
 
     arg_group_workflow = parser.add_argument_group('Workflow')
     arg_group_workflow.add_argument('--skip-trna', action='store_true', dest='skip_trna', help='Skip tRNA detection & annotation')

From 3072247df720449ad9ea5e817229c9e47caa0145 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Thu, 16 Nov 2023 16:51:16 +0100
Subject: [PATCH 02/17] implement creation & overlap filters

---
 bakta/constants.py           |  4 +-
 bakta/features/annotation.py | 41 ++++++++++++++++++--
 bakta/features/cds.py        | 72 ++++++++++++++++++++++++++++++++++++
 bakta/main.py                |  1 -
 4 files changed, 113 insertions(+), 5 deletions(-)

diff --git a/bakta/constants.py b/bakta/constants.py
index 7e3d5598..0a612b6a 100644
--- a/bakta/constants.py
+++ b/bakta/constants.py
@@ -19,7 +19,7 @@
 
 
 ############################################################################
-# Protein identification settings
+# CDS settings
 ############################################################################
 MIN_PSCC_IDENTITY = 0.5  # min protein identity for PSC detection
 MIN_PSC_COVERAGE = 0.8  # min protein coverage for PSC detection
@@ -27,6 +27,8 @@
 MIN_SORF_COVERAGE = 0.9  # min sORF coverage for PSC detection
 MIN_SORF_IDENTITY = 0.9  # min sORF identity for PSC detection
 HYPOTHETICAL_PROTEIN = 'hypothetical protein'  # hypothetical protein product description
+CDS_MAX_OVERLAPS = 30  # max overlap [bp] allowed for user-provided/de novo-predicted CDS overlaps
+CDS_SOURCE_USER = 'user-provided'
 
 
 ############################################################################
diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py
index 0f9412ee..fd99761a 100644
--- a/bakta/features/annotation.py
+++ b/bakta/features/annotation.py
@@ -169,8 +169,12 @@ def detect_feature_overlaps(genome: dict):
         crispr_arrays = contig_crispr_arrays[crispr_array['contig']]
         crispr_arrays.append(crispr_array)
     contig_cdss = {k['id']: [] for k in genome['contigs']}
+    contig_cdss_user_provided = {k['id']: [] for k in genome['contigs']}
     for cds in genome['features'].get(bc.FEATURE_CDS, []):
-        cdss = contig_cdss[cds['contig']]
+        if(cds.get('source', None) == bc.CDS_SOURCE_USER):
+            cdss = contig_cdss_user_provided[cds['contig']]
+        else:
+            cdss = contig_cdss[cds['contig']]
         cdss.append(cds)
     contig_sorfs = {k['id']: [] for k in genome['contigs']}
     for sorf in genome['features'].get(bc.FEATURE_SORF, []):
@@ -217,7 +221,7 @@ def detect_feature_overlaps(genome: dict):
                             ncRNA_region['product'], ncRNA_region['start'], ncRNA_region['stop'], ncRNA_region_overlap['product'], ncRNA_region_overlap['start'], ncRNA_region_overlap['stop'], overlap, ncRNA_region['contig'], ncRNA_region['score'], ncRNA_region_overlap['score']
                         )
 
-        # mark CDS overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs
+        # mark de novo-predicted CDS overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs and user-provided CDS
         for cds in contig_cdss[contig['id']]:
             # tmRNA overlaps
             for tmRNA in contig_tm_rnas[contig['id']]:
@@ -279,6 +283,23 @@ def detect_feature_overlaps(genome: dict):
                         "overlap: CDS (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, contig=%s",
                         cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], crispr['start'], crispr['stop'], overlap, cds['contig']
                     )
+            # user-provided CDS overlaps
+            for cds_user_provided in contig_cdss_user_provided[contig['id']]:
+                if(cds['stop'] < cds_user_provided['start'] or cds['start'] > cds_user_provided['stop']):
+                    continue
+                else:  # overlap -> remove cds
+                    overlap = min(cds['stop'], cds_user_provided['stop']) - max(cds['start'], cds_user_provided['start']) + 1
+                    if(overlap > bc.CDS_MAX_OVERLAPS):
+                        overlap = f"[{max(cds['start'], cds_user_provided['start'])},{min(cds['stop'], cds_user_provided['stop'])}]"
+                        cds['discarded'] = {
+                            'type': bc.DISCARD_TYPE_OVERLAP,
+                            'feature_type': bc.FEATURE_CDS,
+                            'description': f'overlaps {bc.FEATURE_CDS} at {overlap}'
+                        }
+                        log.info(
+                            "overlap: de novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s",
+                            cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['contig']
+                        )
 
         # remove sORF overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs, inframe CDSs, shorter inframe sORFs
         for sorf in contig_sorfs[contig['id']]:
@@ -342,7 +363,21 @@ def detect_feature_overlaps(genome: dict):
                         "overlap: sORF (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, contig=%s",
                         sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], crispr['start'], crispr['stop'], overlap, sorf['contig']
                     )
-            # CDS overlaps (skipped as most overlaps are filtered in sORF detection)
+            # user-provided CDS overlaps
+            for cds_user_provided in contig_cdss_user_provided[contig['id']]:
+                if(sorf['stop'] < cds_user_provided['start'] or sorf['start'] > cds_user_provided['stop']):
+                    continue
+                else:  # overlap -> remove sorf
+                    overlap = f"[{max(sorf['start'], cds_user_provided['start'])},{min(sorf['stop'], cds_user_provided['stop'])}]"
+                    sorf['discarded'] = {
+                        'type': bc.DISCARD_TYPE_OVERLAP,
+                        'feature_type': bc.FEATURE_CDS,
+                        'description': f'overlaps {bc.FEATURE_CDS} at {overlap}'
+                    }
+                    log.info(
+                        "overlap: sORF (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s",
+                        sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, sorf['contig']
+                    )
 
             # sORF overlaps
             for overlap_sorf in contig_sorfs[contig['id']]:
diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index b18a2959..b318f593 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -7,12 +7,15 @@
 
 from collections import OrderedDict
 from typing import Dict, Sequence, Set, Tuple, Union
+from pathlib import Path
 
 import pyrodigal
 import pyhmmer
 
+from Bio import SeqIO
 from Bio.Seq import Seq
 from Bio.SeqUtils.ProtParam import ProteinAnalysis
+from xopen import xopen
 
 import bakta.config as cfg
 import bakta.constants as bc
@@ -164,6 +167,75 @@ def create_cdss(genes, contig):
     return cdss_per_sequence
 
 
+def import_user_cdss(genome: dict, import_path: Path):
+    """Imports user-provided CDS regions or features.
+    Regions are mere coordinates subject to subsequent internal annotation. Features comprise regions and existing functional annotation.
+    Both regions and annotations will be imported. Functional annotations are stored in a 'user' space and supersede internal function annotation information during the final annotation process.
+
+    Parameters
+    ----------
+    import_path : Path
+        Path to GFF3 or Genbank file with regions or features.
+
+    Returns
+    -------
+    list
+        a list of CDS features - with or without functional annotation.
+    """
+    user_cdss = []
+    # ToDO: implement GFF3 parsing
+    # try:
+    #     with xopen(str(import_path), threads=0) as fh_in:
+    #         for record in SeqIO.parse(fh_in, 'fasta'):
+    #             user_proteins.append(parse_user_protein_sequences_fasta(record))
+    # except Exception as e:
+    #     log.error('provided user proteins file Fasta format not valid!', exc_info=True)
+    #     sys.exit(f'ERROR: User proteins file Fasta format not valid!')
+
+    contigs_by_original_id = {c['orig_id']: c for c in genome['contigs']}
+    try:
+        with xopen(str(import_path), threads=0) as fh_in:
+            for record in SeqIO.parse(fh_in, 'genbank'):
+                for feature in record.features:
+                    if(feature.type.lower() == 'cds'  and  'pseudo' not in feature.qualifiers and  bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers):
+                        contig = contigs_by_original_id[record.id]
+                        user_cds = OrderedDict()
+                        user_cds['type'] = bc.FEATURE_CDS
+                        user_cds['source'] = bc.CDS_SOURCE_USER
+                        user_cds['contig'] = contig['id']
+                        user_cds['start'] = feature.location.start
+                        user_cds['stop'] = feature.location.end
+                        user_cds['strand'] = bc.STRAND_FORWARD if feature.location == +1 else bc.STRAND_REVERSE
+                        user_cds['gene'] = None
+                        user_cds['product'] = None
+                        user_cds['db_xrefs'] = [so.SO_CDS.id]
+                        user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1
+                        try:
+                            nt = bu.extract_feature_sequence(user_cds, contig)
+                            user_cds['nt'] = nt
+                        except:
+                            log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop'])
+                            raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}")
+                        dna_seq = Seq(nt)
+                        try:
+                            aa = str(dna_seq.translate(table=cfg.translation_table, cds=True))
+                        except:
+                            log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt)
+                            raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}")
+                        user_cds['aa'] = aa
+                        user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
+                        log.info(
+                            'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]',
+                            user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:]
+                        )
+                        user_cdss.append(user_cds)
+    except Exception as e:
+        log.error('user-provided regions/features file GenBank format not valid!', exc_info=True)
+        sys.exit(f'ERROR: User-provided regions/features file GenBank format not valid!')
+    
+    return user_cdss
+
+
 def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]:
     """Detect Pfam-A entries"""
     pfam_hits = []
diff --git a/bakta/main.py b/bakta/main.py
index b14cd0ec..2499822e 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -244,7 +244,6 @@ def main():
             log.debug('import user-provided CDS regions')
             imported_cdss = feat_cds.import_user_cdss(genome, cfg.regions)
             print(f'\timported CDS regions: {len(imported_cdss)}')
-            # ToDo: exclude overlapping CDS
 
         if(len(cdss) > 0):
             log.debug('lookup CDS UPS/IPS')

From f3687adf85602551fbeee233bfe5b6dd3864623e Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Thu, 16 Nov 2023 16:51:53 +0100
Subject: [PATCH 03/17] amend INSDC/GFF3 inferences

---
 bakta/io/gff.py   | 9 +++++++--
 bakta/io/insdc.py | 8 +++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/bakta/io/gff.py b/bakta/io/gff.py
index 25349eb7..3a0ab552 100644
--- a/bakta/io/gff.py
+++ b/bakta/io/gff.py
@@ -230,7 +230,12 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         if(feat.get('gene', None)):
                             gene_annotations['gene'] = feat['gene']
                         annotations['Parent'] = gene_id
-                        annotations['inference'] = 'ab initio prediction:Prodigal:2.6'
+                        if(feat.get('source', None) == bc.CDS_SOURCE_USER):
+                            annotations['inference'] = 'EXISTENCE:non-experimental evidence, no additional details recorded'
+                            source = '?'
+                        else:
+                            annotations['inference'] = 'ab initio prediction:Prodigal:2.6'
+                            source = 'Prodigal'
                         annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                         annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note')
                         if(feat.get('pseudo', False)):
@@ -251,7 +256,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         if('Notes' not in annotations):
                             annotations['Note'] = notes
                     annotations = encode_annotations(annotations)
-                    fh.write(f"{feat['contig']}\tProdigal\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
+                    fh.write(f"{feat['contig']}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n")
                     if(bc.FEATURE_SIGNAL_PEPTIDE in feat):
                         write_signal_peptide(fh, feat)
                 elif(feat['type'] == bc.FEATURE_SORF):
diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py
index 3b287afd..ba8d630e 100644
--- a/bakta/io/insdc.py
+++ b/bakta/io/insdc.py
@@ -131,7 +131,13 @@ def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Pat
                 qualifiers['transl_table'] = cfg.translation_table
                 insdc_feature_type = bc.INSDC_FEATURE_CDS
                 inference = []
-                inference.append('ab initio prediction:Prodigal:2.6' if feature['type'] == bc.FEATURE_CDS else f"ab initio prediction:Bakta:{'.'.join(bakta.__version__.split('.')[0:2])}")
+                if(feature['type'] == bc.FEATURE_CDS):
+                    if(feature.get('source', None) == bc.CDS_SOURCE_USER):
+                        inference.append('EXISTENCE:non-experimental evidence, no additional details recorded')
+                    else:
+                        inference.append('ab initio prediction:Prodigal:2.6')
+                else:
+                    inference.append(f"ab initio prediction:Bakta:{'.'.join(bakta.__version__.split('.')[0:2])}")
                 if('ncbi_nrp_id' in feature.get('ups', {})):
                     nrp_id = feature['ups']['ncbi_nrp_id']
                     inference.append(f'similar to AA sequence:{bc.DB_XREF_REFSEQ_NRP}:{nrp_id}')

From 60214ea4c5b3918b0b4ca613e3da022f0ad4c522 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Thu, 16 Nov 2023 17:31:57 +0100
Subject: [PATCH 04/17] fix source value error in GFF3 output

---
 bakta/io/gff.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/bakta/io/gff.py b/bakta/io/gff.py
index 3a0ab552..f0368b84 100644
--- a/bakta/io/gff.py
+++ b/bakta/io/gff.py
@@ -221,6 +221,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         annotations['pseudo'] = True
                     if(feat.get('gene', None)):  # add gene annotation if available
                         annotations['gene'] = feat['gene']
+                    source = '?' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'Prodigal'
                     if(cfg.compliant):
                         gene_id = f"{feat['locus']}_gene"
                         gene_annotations = {
@@ -232,10 +233,8 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         annotations['Parent'] = gene_id
                         if(feat.get('source', None) == bc.CDS_SOURCE_USER):
                             annotations['inference'] = 'EXISTENCE:non-experimental evidence, no additional details recorded'
-                            source = '?'
                         else:
                             annotations['inference'] = 'ab initio prediction:Prodigal:2.6'
-                            source = 'Prodigal'
                         annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs'])  # remove INSDC invalid DbXrefs
                         annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note')
                         if(feat.get('pseudo', False)):
@@ -244,7 +243,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         if(ec_number is not None):
                             annotations['ec_number'] = ec_number
                         gene_annotations = encode_annotations(gene_annotations)
-                        fh.write(f"{feat['contig']}\tProdigal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
+                        fh.write(f"{feat['contig']}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n")
                     if('exception' in feat):
                         ex = feat['exception']
                         pos = f"{ex['start']}..{ex['stop']}"

From 47d325cdc52ecd75d5c32d708b9efc737e3b7dc8 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Thu, 16 Nov 2023 18:40:02 +0100
Subject: [PATCH 05/17] actually add user-provided cdss in main

---
 bakta/main.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bakta/main.py b/bakta/main.py
index 2499822e..3cfb3a4d 100755
--- a/bakta/main.py
+++ b/bakta/main.py
@@ -244,6 +244,7 @@ def main():
             log.debug('import user-provided CDS regions')
             imported_cdss = feat_cds.import_user_cdss(genome, cfg.regions)
             print(f'\timported CDS regions: {len(imported_cdss)}')
+            cdss.extend(imported_cdss)
 
         if(len(cdss) > 0):
             log.debug('lookup CDS UPS/IPS')

From b20c5ad4a359cf2b6c03d5c2b3a5e2f743ea0e7a Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Thu, 16 Nov 2023 18:41:22 +0100
Subject: [PATCH 06/17] fix CDS start & strand

---
 bakta/features/cds.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index b318f593..37485e00 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -203,9 +203,9 @@ def import_user_cdss(genome: dict, import_path: Path):
                         user_cds['type'] = bc.FEATURE_CDS
                         user_cds['source'] = bc.CDS_SOURCE_USER
                         user_cds['contig'] = contig['id']
-                        user_cds['start'] = feature.location.start
+                        user_cds['start'] = feature.location.start + 1
                         user_cds['stop'] = feature.location.end
-                        user_cds['strand'] = bc.STRAND_FORWARD if feature.location == +1 else bc.STRAND_REVERSE
+                        user_cds['strand'] = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE
                         user_cds['gene'] = None
                         user_cds['product'] = None
                         user_cds['db_xrefs'] = [so.SO_CDS.id]

From 01a38e52925f82ab426fc85dbaca4cfb2aca739f Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Thu, 16 Nov 2023 18:41:42 +0100
Subject: [PATCH 07/17] add tests

---
 test/data/NC_002127.1-region.gbff | 79 +++++++++++++++++++++++++++++++
 test/data/NC_002127.1-region.gff3 | 68 ++++++++++++++++++++++++++
 test/test_args.py                 | 20 ++++++++
 test/test_regions.py              | 38 +++++++++++++++
 4 files changed, 205 insertions(+)
 create mode 100644 test/data/NC_002127.1-region.gbff
 create mode 100644 test/data/NC_002127.1-region.gff3
 create mode 100644 test/test_regions.py

diff --git a/test/data/NC_002127.1-region.gbff b/test/data/NC_002127.1-region.gbff
new file mode 100644
index 00000000..1cc0907d
--- /dev/null
+++ b/test/data/NC_002127.1-region.gbff
@@ -0,0 +1,79 @@
+LOCUS       NC_002127.1             3306 bp    DNA     circular BCT 16-NOV-2023
+DEFINITION  plasmid unnamed1, complete sequence.
+KEYWORDS    .
+SOURCE      None
+  ORGANISM  .
+            .
+FEATURES             Location/Qualifiers
+     source          1..3306
+                     /mol_type="genomic DNA"
+                     /plasmid="unnamed1"
+     gene            32..736
+                     /locus_tag="DOGAIA_00005"
+     CDS             32..736
+                     /locus_tag="DOGAIA_00005"
+                     /translation="MVACSSASSASSFSSSVRLWLFMNPAMLSAVCCCL
+                     FIFLFSPFCLSSASCDYIAHHFSTVLPPVFCRRTFQSDNTVTAKKQQCFVGNSNLQTGQ
+                     DVQLLYRAYMHAITITLVRASPRHPMSDTEPCFMTKRSGSNTRRRAISRPVRLTAEEDQ
+                     EIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVMRLGALQKKLFIDGKRVGDRE
+                     YAEVLIAITEYHRALLSRLMAD"
+                     /codon_start=1
+                     /transl_table=11
+ORIGIN
+        1 ttcttctgcg agttcgtgca gcttctcaca catggtggcc tgctcgtcag catcgagtgc
+       61 gtccagtttt tcgagcagcg tcaggctctg gctttttatg aatcccgcca tgttgagtgc
+      121 agtttgctgc tgcttgttca tctttctgtt ttctccgttc tgtctgtcat ctgcgtcgtg
+      181 tgattatatc gcgcaccact tttcgaccgt cttaccgccg gtattctgcc gacggacatt
+      241 tcagtcagac aacactgtca ctgccaaaaa acagcagtgc tttgttggta attcgaactt
+      301 gcagacagga caggatgtgc aattgttata ccgcgcatac atgcacgcta ttacaattac
+      361 cctggtcagg gcttcgcccc gacaccccat gtcagatacg gagccatgtt ttatgacaaa
+      421 acgaagtgga agtaatacgc gcaggcgggc tatcagtcgc cctgttcgtc tgacggcaga
+      481 agaagaccag gaaatcagaa aaagggctgc tgaatgcggc aagaccgttt ctggtttttt
+      541 acgggcggca gctctcggta agaaagttaa ctcactgact gatgaccggg tgctgaaaga
+      601 agttatgcga ctgggggcgt tgcagaaaaa actctttatc gacggcaagc gtgtcgggga
+      661 cagagagtat gcggaggtgc tgatcgctat tacggagtat caccgtgccc tgttatccag
+      721 gcttatggca gattagcttc ccggagagaa actgtcgaaa acagacggta tgaacgccgt
+      781 aagcccccaa accgatcgcc attcactttc atgcatagct atgcagtgag ctgaaagcga
+      841 tcctgacgca tttttccggt ttaccccggg gaaaacatct ctttttgcgg tgtctgcgtc
+      901 agaatcgcgt tcagcgcgtt ttggcggtgc gcgtaatgag acgttatggt aaatgtcttc
+      961 tggcttgata ttatattgga atgccttttt tcaaagcaaa tgatgtggct ttggatagaa
+     1021 ggtttacgtt gatcttatca aagttttttt taaagaacga agccgagagc tcagataaat
+     1081 cattatattc atcagttttc gtaactttgt ttaatgtgta acttgaaaac ttctcgccat
+     1141 taaatgacgt atagacgtaa cgatcttttt ttccaccgtt aggaattatt aaatcaaaaa
+     1201 aaacatcacc cttgcttttc tttttcttca agtcggattc gatttttgag aaaaattcgc
+     1261 tcgggctata aatatcagta gcatagacaa taaataaagt tttatcttta ttttttattg
+     1321 cttctatttg atatttttta tcttttttca taatttcaac ctagctacgc caccatctat
+     1381 taattggcaa acggtatcga tgattgcgat tgattctaat ttgttaattg tcttcgtgtc
+     1441 agctattcct ggtttcatat gaaacaaacc atgcctgttc tcatgccagt aagtgtagca
+     1501 ttcacacaaa acttccgcta tttcaccatt tatagtttct tggtgtattt ctctgattat
+     1561 atatttgggt ttgttttcag tgaagtattc gccaaggttc tttgatgatg atggattgca
+     1621 aacatcatta agtatttgat atataaagcc ttctatggct cttaatgcag agaagcagta
+     1681 tgttgagtaa tcttccattt cgacatctat ttttttcatt attagcgagc atgatagctg
+     1741 ttttttgata tcttcatgga ttttatcgat gctttttggt agtttgctat gcaactcgga
+     1801 ctcaatagtt tcttttttta tgtcaacatt aaattcttta tttttttgtt cgacaatctc
+     1861 tttcatgttt agtattgagc acatgaaatc gttaatcaaa ctcgcgattt gaaggtattt
+     1921 tccttggaat tgaatagagc cgcgcttgta aatttttgcc ctgaccctgt caccattgct
+     1981 ggtggtcata atatattggt gtttacaatt aggatcgtta ttattatctt ctgttattgt
+     2041 tatcccctct tcagaaagaa attcaaatag atttgccctg tcatcatcac tgaattttgg
+     2101 aatggtgtat tcaaagttct ttgtgtctga atacaaacag ttttctttta taatcaaggc
+     2161 gatttcatca aagtaagtgt tattttgccc agacgctctt ccgatagtgg tatttccgcc
+     2221 tgatggcatt agttttatta agaagtctat tcctttatat gtgccagata tgtgagtttc
+     2281 tctttcgttt tttacattag aggaatagtt tgtgacgcca ttctgcgtca gagcagactc
+     2341 aatcttgtca atattgatat ttagtgcttt aaacgggttc tgtgccattg ggtcaatccg
+     2401 ttgttttttt tgaatatgta cagatcttgt ttttttgtca acggaatagc tgttcgttga
+     2461 cttgatagac cgattgattc atcatctcat aaataaagaa aaaccaccgc taccaacggt
+     2521 ggttttctca aggttcgctg agctaccaac tctttgaacc aaggtaactg gcttggagga
+     2581 gcgcagtcac caaaatctgt tctttcagtt tagccttaac aggtgcataa cttcaagaca
+     2641 aactcctcta aatcagttac caatggctgc tgccagtggc gataagtcgt gtcttaccgg
+     2701 gttggactca agacgatagt taccggataa ggcgcagcgg tcgggctgaa cggggggttc
+     2761 gtgcacacag cccagcttgg agcgaacgac ctacaccgaa ctgagatacc aacagcgtga
+     2821 gctatgagaa agcgccacgc ttcccgaagg gagaaaggcg gacaggtatc cggtaagtgg
+     2881 cagggtcgga acaggagagc gcacgaggga gcttccgggg ggaaacgcct ggtatcttta
+     2941 tagtcctgtc gggtttcgcc acctctggct tgagcgtcga tttttgtgat gctcgtcagg
+     3001 ggggcggagc ctatggaaaa acgcctgcgg tgctggcttc ttccggtgct ttgctttttg
+     3061 ctcacatgtt ctttccggct ttatcccctg attctgtgga taaccgtatt accgcctttg
+     3121 agtgagctga caccgctcgc cgcagtcgaa cgaccgagcg tagcgagtca gtgagcgagg
+     3181 aagcggaaga gcgccttatg tgacattttc tccttacgct ctgttgtgcc gttcggcatc
+     3241 ctgccctgag cgttatatct ctgtgctatt ttctacttca aagcgtgtct gtatgctgtt
+     3301 ctggag
+//
diff --git a/test/data/NC_002127.1-region.gff3 b/test/data/NC_002127.1-region.gff3
new file mode 100644
index 00000000..7ad6e5c8
--- /dev/null
+++ b/test/data/NC_002127.1-region.gff3
@@ -0,0 +1,68 @@
+##gff-version 3
+##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo
+# Annotated with Bakta
+# Software: v1.8.2
+# Database: v5.0, light
+# DOI: 10.1099/mgen.0.000685
+# URL: github.com/oschwengers/bakta
+##sequence-region NC_002127.1 1 3306
+NC_002127.1	Bakta	region	1	3306	.	+	.	ID=NC_002127.1;Name=NC_002127.1;Is_circular=true
+NC_002127.1	?	CDS	32	736	.	+	0	ID=DOGAIA_00005
+##FASTA
+>NC_002127.1
+TTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGTGGCCTGCTCGTCAGCATCGAGTGC
+GTCCAGTTTTTCGAGCAGCGTCAGGCTCTGGCTTTTTATGAATCCCGCCATGTTGAGTGC
+AGTTTGCTGCTGCTTGTTCATCTTTCTGTTTTCTCCGTTCTGTCTGTCATCTGCGTCGTG
+TGATTATATCGCGCACCACTTTTCGACCGTCTTACCGCCGGTATTCTGCCGACGGACATT
+TCAGTCAGACAACACTGTCACTGCCAAAAAACAGCAGTGCTTTGTTGGTAATTCGAACTT
+GCAGACAGGACAGGATGTGCAATTGTTATACCGCGCATACATGCACGCTATTACAATTAC
+CCTGGTCAGGGCTTCGCCCCGACACCCCATGTCAGATACGGAGCCATGTTTTATGACAAA
+ACGAAGTGGAAGTAATACGCGCAGGCGGGCTATCAGTCGCCCTGTTCGTCTGACGGCAGA
+AGAAGACCAGGAAATCAGAAAAAGGGCTGCTGAATGCGGCAAGACCGTTTCTGGTTTTTT
+ACGGGCGGCAGCTCTCGGTAAGAAAGTTAACTCACTGACTGATGACCGGGTGCTGAAAGA
+AGTTATGCGACTGGGGGCGTTGCAGAAAAAACTCTTTATCGACGGCAAGCGTGTCGGGGA
+CAGAGAGTATGCGGAGGTGCTGATCGCTATTACGGAGTATCACCGTGCCCTGTTATCCAG
+GCTTATGGCAGATTAGCTTCCCGGAGAGAAACTGTCGAAAACAGACGGTATGAACGCCGT
+AAGCCCCCAAACCGATCGCCATTCACTTTCATGCATAGCTATGCAGTGAGCTGAAAGCGA
+TCCTGACGCATTTTTCCGGTTTACCCCGGGGAAAACATCTCTTTTTGCGGTGTCTGCGTC
+AGAATCGCGTTCAGCGCGTTTTGGCGGTGCGCGTAATGAGACGTTATGGTAAATGTCTTC
+TGGCTTGATATTATATTGGAATGCCTTTTTTCAAAGCAAATGATGTGGCTTTGGATAGAA
+GGTTTACGTTGATCTTATCAAAGTTTTTTTTAAAGAACGAAGCCGAGAGCTCAGATAAAT
+CATTATATTCATCAGTTTTCGTAACTTTGTTTAATGTGTAACTTGAAAACTTCTCGCCAT
+TAAATGACGTATAGACGTAACGATCTTTTTTTCCACCGTTAGGAATTATTAAATCAAAAA
+AAACATCACCCTTGCTTTTCTTTTTCTTCAAGTCGGATTCGATTTTTGAGAAAAATTCGC
+TCGGGCTATAAATATCAGTAGCATAGACAATAAATAAAGTTTTATCTTTATTTTTTATTG
+CTTCTATTTGATATTTTTTATCTTTTTTCATAATTTCAACCTAGCTACGCCACCATCTAT
+TAATTGGCAAACGGTATCGATGATTGCGATTGATTCTAATTTGTTAATTGTCTTCGTGTC
+AGCTATTCCTGGTTTCATATGAAACAAACCATGCCTGTTCTCATGCCAGTAAGTGTAGCA
+TTCACACAAAACTTCCGCTATTTCACCATTTATAGTTTCTTGGTGTATTTCTCTGATTAT
+ATATTTGGGTTTGTTTTCAGTGAAGTATTCGCCAAGGTTCTTTGATGATGATGGATTGCA
+AACATCATTAAGTATTTGATATATAAAGCCTTCTATGGCTCTTAATGCAGAGAAGCAGTA
+TGTTGAGTAATCTTCCATTTCGACATCTATTTTTTTCATTATTAGCGAGCATGATAGCTG
+TTTTTTGATATCTTCATGGATTTTATCGATGCTTTTTGGTAGTTTGCTATGCAACTCGGA
+CTCAATAGTTTCTTTTTTTATGTCAACATTAAATTCTTTATTTTTTTGTTCGACAATCTC
+TTTCATGTTTAGTATTGAGCACATGAAATCGTTAATCAAACTCGCGATTTGAAGGTATTT
+TCCTTGGAATTGAATAGAGCCGCGCTTGTAAATTTTTGCCCTGACCCTGTCACCATTGCT
+GGTGGTCATAATATATTGGTGTTTACAATTAGGATCGTTATTATTATCTTCTGTTATTGT
+TATCCCCTCTTCAGAAAGAAATTCAAATAGATTTGCCCTGTCATCATCACTGAATTTTGG
+AATGGTGTATTCAAAGTTCTTTGTGTCTGAATACAAACAGTTTTCTTTTATAATCAAGGC
+GATTTCATCAAAGTAAGTGTTATTTTGCCCAGACGCTCTTCCGATAGTGGTATTTCCGCC
+TGATGGCATTAGTTTTATTAAGAAGTCTATTCCTTTATATGTGCCAGATATGTGAGTTTC
+TCTTTCGTTTTTTACATTAGAGGAATAGTTTGTGACGCCATTCTGCGTCAGAGCAGACTC
+AATCTTGTCAATATTGATATTTAGTGCTTTAAACGGGTTCTGTGCCATTGGGTCAATCCG
+TTGTTTTTTTTGAATATGTACAGATCTTGTTTTTTTGTCAACGGAATAGCTGTTCGTTGA
+CTTGATAGACCGATTGATTCATCATCTCATAAATAAAGAAAAACCACCGCTACCAACGGT
+GGTTTTCTCAAGGTTCGCTGAGCTACCAACTCTTTGAACCAAGGTAACTGGCTTGGAGGA
+GCGCAGTCACCAAAATCTGTTCTTTCAGTTTAGCCTTAACAGGTGCATAACTTCAAGACA
+AACTCCTCTAAATCAGTTACCAATGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGG
+GTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTC
+GTGCACACAGCCCAGCTTGGAGCGAACGACCTACACCGAACTGAGATACCAACAGCGTGA
+GCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGTGG
+CAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCGGGGGGAAACGCCTGGTATCTTTA
+TAGTCCTGTCGGGTTTCGCCACCTCTGGCTTGAGCGTCGATTTTTGTGATGCTCGTCAGG
+GGGGCGGAGCCTATGGAAAAACGCCTGCGGTGCTGGCTTCTTCCGGTGCTTTGCTTTTTG
+CTCACATGTTCTTTCCGGCTTTATCCCCTGATTCTGTGGATAACCGTATTACCGCCTTTG
+AGTGAGCTGACACCGCTCGCCGCAGTCGAACGACCGAGCGTAGCGAGTCAGTGAGCGAGG
+AAGCGGAAGAGCGCCTTATGTGACATTTTCTCCTTACGCTCTGTTGTGCCGTTCGGCATC
+CTGCCCTGAGCGTTATATCTCTGTGCTATTTTCTACTTCAAAGCGTGTCTGTATGCTGTT
+CTGGAG
diff --git a/test/test_args.py b/test/test_args.py
index 6c09ed92..e2845f67 100644
--- a/test/test_args.py
+++ b/test/test_args.py
@@ -241,6 +241,26 @@ def test_replicons_ok(parameters, tmpdir):
         assert Path.exists(tmpdir_path.joinpath(file))
 
 
+@pytest.mark.parametrize(
+    'parameters',
+    [
+        (['--regions']),  # not provided
+        (['--regions', '']),  # empty
+        (['--regions', 'foo']),  # not existing
+        (['--regions', 'test/data/empty'])  # empty file
+    ]
+)
+def test_regions_failiing(parameters, tmpdir):
+    # test regions file arguments
+    proc = run(
+        ['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--skip-plot'] +
+        parameters +
+        SKIP_PARAMETERS +
+        ['test/data/NC_002127.1.fna']
+    )
+    assert proc.returncode != 0
+
+
 @pytest.mark.parametrize(
     'parameters',
     [
diff --git a/test/test_regions.py b/test/test_regions.py
new file mode 100644
index 00000000..b893935c
--- /dev/null
+++ b/test/test_regions.py
@@ -0,0 +1,38 @@
+import json
+import subprocess as sp
+
+from pathlib import Path
+from subprocess import run
+
+import pytest
+
+import bakta.constants as bc
+
+from .conftest import FILES, SKIP_PARAMETERS
+
+
+def test_bakta_plasmid(tmpdir):
+    # full test on plasmid
+    proc = run(
+        ['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--force', '--prefix', 'test', '--regions', 'test/data/NC_002127.1-region.gbff'] +
+        ['test/data/NC_002127.1.fna.gz']
+    )
+    assert proc.returncode == 0
+
+    tmpdir_path = Path(tmpdir)
+    for file in FILES:
+        output_path = tmpdir_path.joinpath(file)
+        assert Path.exists(output_path)
+        assert output_path.stat().st_size > 0
+
+    results_path = tmpdir_path.joinpath('test.json')
+    results = None
+    with results_path.open() as fh:
+        results = json.load(fh)
+    assert results is not None
+    cdss = [feat for feat in results['features'] if feat['type'] == 'cds']
+    assert len(cdss) == 3
+    for cds in cdss:
+        if(cds['stop'] == 736):
+            assert cds['start'] != 2  # de novo-predicted CDS start
+            assert cds['start'] == 32  # user-provided CDS start

From 1bfb8dbbbe2f82bbc8243a258abcef49c2618cb5 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Nov 2023 09:59:43 +0100
Subject: [PATCH 08/17] implement GFF3 region support

---
 bakta/features/cds.py | 138 +++++++++++++++++++++++++++---------------
 1 file changed, 90 insertions(+), 48 deletions(-)

diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index 37485e00..b3dc32d1 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -183,55 +183,97 @@ def import_user_cdss(genome: dict, import_path: Path):
         a list of CDS features - with or without functional annotation.
     """
     user_cdss = []
-    # ToDO: implement GFF3 parsing
-    # try:
-    #     with xopen(str(import_path), threads=0) as fh_in:
-    #         for record in SeqIO.parse(fh_in, 'fasta'):
-    #             user_proteins.append(parse_user_protein_sequences_fasta(record))
-    # except Exception as e:
-    #     log.error('provided user proteins file Fasta format not valid!', exc_info=True)
-    #     sys.exit(f'ERROR: User proteins file Fasta format not valid!')
-
     contigs_by_original_id = {c['orig_id']: c for c in genome['contigs']}
-    try:
-        with xopen(str(import_path), threads=0) as fh_in:
-            for record in SeqIO.parse(fh_in, 'genbank'):
-                for feature in record.features:
-                    if(feature.type.lower() == 'cds'  and  'pseudo' not in feature.qualifiers and  bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers):
-                        contig = contigs_by_original_id[record.id]
-                        user_cds = OrderedDict()
-                        user_cds['type'] = bc.FEATURE_CDS
-                        user_cds['source'] = bc.CDS_SOURCE_USER
-                        user_cds['contig'] = contig['id']
-                        user_cds['start'] = feature.location.start + 1
-                        user_cds['stop'] = feature.location.end
-                        user_cds['strand'] = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE
-                        user_cds['gene'] = None
-                        user_cds['product'] = None
-                        user_cds['db_xrefs'] = [so.SO_CDS.id]
-                        user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1
-                        try:
-                            nt = bu.extract_feature_sequence(user_cds, contig)
-                            user_cds['nt'] = nt
-                        except:
-                            log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop'])
-                            raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}")
-                        dna_seq = Seq(nt)
-                        try:
-                            aa = str(dna_seq.translate(table=cfg.translation_table, cds=True))
-                        except:
-                            log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt)
-                            raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}")
-                        user_cds['aa'] = aa
-                        user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
-                        log.info(
-                            'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]',
-                            user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:]
-                        )
-                        user_cdss.append(user_cds)
-    except Exception as e:
-        log.error('user-provided regions/features file GenBank format not valid!', exc_info=True)
-        sys.exit(f'ERROR: User-provided regions/features file GenBank format not valid!')
+    file_suffix = import_path.suffix.lower()
+    if(file_suffix in ['.gff', '.gff3']):
+        try:
+            with xopen(str(import_path), threads=0) as fh_in:
+                skip_lines = False
+                for line in  fh_in:
+                    line = line.strip()
+                    if(line == '##FASTA'):
+                        skip_lines = True
+                    elif(skip_lines  or  line[0] == '#'):
+                        continue
+                    else:
+                        contig_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t')
+                        if(feature_type.lower() == 'cds'):
+                            contig = contigs_by_original_id[contig_id]
+                            user_cds = OrderedDict()
+                            user_cds['type'] = bc.FEATURE_CDS
+                            user_cds['source'] = bc.CDS_SOURCE_USER
+                            user_cds['contig'] = contig['id']
+                            user_cds['start'] = int(start)
+                            user_cds['stop'] = int(stop)
+                            user_cds['strand'] = strand
+                            user_cds['gene'] = None
+                            user_cds['product'] = None
+                            user_cds['db_xrefs'] = [so.SO_CDS.id]
+                            user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1
+                            try:
+                                nt = bu.extract_feature_sequence(user_cds, contig)
+                                user_cds['nt'] = nt
+                            except:
+                                log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop'])
+                                raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}")
+                            try:
+                                dna_seq = Seq(nt)
+                                aa = str(dna_seq.translate(table=cfg.translation_table, cds=True))
+                            except:
+                                log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt)
+                                raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}")
+                            user_cds['aa'] = aa
+                            user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
+                            log.info(
+                                'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]',
+                                user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:]
+                            )
+                            user_cdss.append(user_cds)
+        except Exception as e:
+            log.error('user-provided regions/features file GFF3 format not valid!', exc_info=True)
+            sys.exit(f'ERROR: User-provided regions/features file GFF3 format not valid!')
+    elif(file_suffix in ['.gb', '.genbank', '.gbk', '.gbff']):
+        try:
+            with xopen(str(import_path), threads=0) as fh_in:
+                for record in SeqIO.parse(fh_in, 'genbank'):
+                    for feature in record.features:
+                        if(feature.type.lower() == 'cds'  and  'pseudo' not in feature.qualifiers and  bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers):
+                            contig = contigs_by_original_id[record.id]
+                            user_cds = OrderedDict()
+                            user_cds['type'] = bc.FEATURE_CDS
+                            user_cds['source'] = bc.CDS_SOURCE_USER
+                            user_cds['contig'] = contig['id']
+                            user_cds['start'] = feature.location.start + 1
+                            user_cds['stop'] = feature.location.end
+                            user_cds['strand'] = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE
+                            user_cds['gene'] = None
+                            user_cds['product'] = None
+                            user_cds['db_xrefs'] = [so.SO_CDS.id]
+                            user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1
+                            try:
+                                nt = bu.extract_feature_sequence(user_cds, contig)
+                                user_cds['nt'] = nt
+                            except:
+                                log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop'])
+                                raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}")
+                            dna_seq = Seq(nt)
+                            try:
+                                aa = str(dna_seq.translate(table=cfg.translation_table, cds=True))
+                            except:
+                                log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt)
+                                raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}")
+                            user_cds['aa'] = aa
+                            user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
+                            log.info(
+                                'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]',
+                                user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:]
+                            )
+                            user_cdss.append(user_cds)
+        except Exception as e:
+            log.error('user-provided regions/features file GenBank format not valid!', exc_info=True)
+            sys.exit(f'ERROR: User-provided regions/features file GenBank format not valid!')
+    else:
+        log.warn('user-provided regions/features file suffix not detected! suffix=%s, path=%s', file_suffix, str(import_path))
     
     return user_cdss
 

From f998fc2747f2c78f3ec68f1cb51aef64da3541dd Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Nov 2023 09:59:54 +0100
Subject: [PATCH 09/17] add GFF3 tests

---
 test/test_regions.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/test/test_regions.py b/test/test_regions.py
index b893935c..dbc1e1c1 100644
--- a/test/test_regions.py
+++ b/test/test_regions.py
@@ -6,15 +6,20 @@
 
 import pytest
 
-import bakta.constants as bc
-
 from .conftest import FILES, SKIP_PARAMETERS
 
 
-def test_bakta_plasmid(tmpdir):
+@pytest.mark.parametrize(
+    'regions',
+    [
+        ('NC_002127.1-region.gff3'),  # GFF3
+        ('NC_002127.1-region.gbff')  # Genbank
+    ]
+)
+def test_bakta_plasmid(regions, tmpdir):
     # full test on plasmid
     proc = run(
-        ['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--force', '--prefix', 'test', '--regions', 'test/data/NC_002127.1-region.gbff'] +
+        ['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--force', '--prefix', 'test', '--regions', f'test/data/{regions}'] +
         ['test/data/NC_002127.1.fna.gz']
     )
     assert proc.returncode == 0

From 1182dd01c0115b01620453cc13fb8a993024fa06 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Nov 2023 10:43:36 +0100
Subject: [PATCH 10/17] refactor code in cds module

---
 bakta/features/cds.py | 88 +++++++++++++++++++------------------------
 1 file changed, 39 insertions(+), 49 deletions(-)

diff --git a/bakta/features/cds.py b/bakta/features/cds.py
index b3dc32d1..3dc08d97 100644
--- a/bakta/features/cds.py
+++ b/bakta/features/cds.py
@@ -86,26 +86,31 @@ def predict(genome: dict):
     return cdss
 
 
+def create_cds(contig: dict, start: int, stop: int, strand: str, nt: str, aa: str):
+    cds = OrderedDict()
+    cds['type'] = bc.FEATURE_CDS
+    cds['contig'] = contig['id']
+    cds['start'] = start
+    cds['stop'] = stop
+    cds['strand'] = strand
+    cds['frame'] = (start - 1) % 3 + 1 if strand == bc.STRAND_FORWARD else (contig['length'] - stop) % 3 + 1
+    cds['gene'] = None
+    cds['product'] = None
+    cds['db_xrefs'] = [so.SO_CDS.id]
+    cds['nt'] = nt
+    cds['aa'] = aa
+    cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
+    return cds
+
+
 def create_cdss(genes, contig):
     partial_cdss_per_sequence = []
     cdss_per_sequence = []
     for gene in genes:
-        cds = OrderedDict()
-        cds['type'] = bc.FEATURE_CDS
-        cds['contig'] = contig['id']
-        cds['start'] = gene.begin
-        cds['stop'] = gene.end
-        cds['strand'] = bc.STRAND_FORWARD if gene.strand == 1 else bc.STRAND_REVERSE
-        cds['gene'] = None
-        cds['product'] = None
+        strand = bc.STRAND_FORWARD if gene.strand == 1 else bc.STRAND_REVERSE
+        cds = create_cds(contig, gene.begin, gene.end, strand, '', '')
         cds['start_type'] = gene.start_type
         cds['rbs_motif'] = gene.rbs_motif
-        cds['db_xrefs'] = [so.SO_CDS.id]
-        if(cds['strand'] == bc.STRAND_FORWARD):
-            cds['frame'] = (cds['start'] - 1) % 3 + 1
-        else:
-            cds['frame'] = (contig['length'] - cds['stop']) % 3 + 1
-        
         if gene.partial_begin:
             cds['truncated'] = bc.FEATURE_END_5_PRIME if cds['strand'] == bc.STRAND_FORWARD else bc.FEATURE_END_3_PRIME
             partial_cdss_per_sequence.append(cds)
@@ -119,6 +124,7 @@ def create_cdss(genes, contig):
             aa = aa[:-1]  # discard trailing asterisk
         cds['aa'] = aa
         cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
+        
         log.info(
             'contig=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s',
             cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds.get('truncated', 'no'), cds['start_type'], cds['rbs_motif']
@@ -168,24 +174,25 @@ def create_cdss(genes, contig):
 
 
 def import_user_cdss(genome: dict, import_path: Path):
-    """Imports user-provided CDS regions or features.
-    Regions are mere coordinates subject to subsequent internal annotation. Features comprise regions and existing functional annotation.
-    Both regions and annotations will be imported. Functional annotations are stored in a 'user' space and supersede internal function annotation information during the final annotation process.
-
+    """Import user-provided CDS regions.
+    Only CDS region information are imported skipping any existing functional annotations.
+    
     Parameters
     ----------
+    genome : dict
+        Genome dictionary holding sequence information (contigs)
     import_path : Path
         Path to GFF3 or Genbank file with regions or features.
 
     Returns
     -------
     list
-        a list of CDS features - with or without functional annotation.
+        a list of CDS features - without functional annotations.
     """
     user_cdss = []
     contigs_by_original_id = {c['orig_id']: c for c in genome['contigs']}
     file_suffix = import_path.suffix.lower()
-    if(file_suffix in ['.gff', '.gff3']):
+    if(file_suffix in ['.gff', '.gff3']):  # parse GFF3 format
         try:
             with xopen(str(import_path), threads=0) as fh_in:
                 skip_lines = False
@@ -199,17 +206,8 @@ def import_user_cdss(genome: dict, import_path: Path):
                         contig_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t')
                         if(feature_type.lower() == 'cds'):
                             contig = contigs_by_original_id[contig_id]
-                            user_cds = OrderedDict()
-                            user_cds['type'] = bc.FEATURE_CDS
+                            user_cds = create_cds(contig, int(start), int(stop), strand, '', '')
                             user_cds['source'] = bc.CDS_SOURCE_USER
-                            user_cds['contig'] = contig['id']
-                            user_cds['start'] = int(start)
-                            user_cds['stop'] = int(stop)
-                            user_cds['strand'] = strand
-                            user_cds['gene'] = None
-                            user_cds['product'] = None
-                            user_cds['db_xrefs'] = [so.SO_CDS.id]
-                            user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1
                             try:
                                 nt = bu.extract_feature_sequence(user_cds, contig)
                                 user_cds['nt'] = nt
@@ -217,13 +215,13 @@ def import_user_cdss(genome: dict, import_path: Path):
                                 log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop'])
                                 raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}")
                             try:
-                                dna_seq = Seq(nt)
-                                aa = str(dna_seq.translate(table=cfg.translation_table, cds=True))
+                                aa = str(Seq(nt).translate(table=cfg.translation_table, cds=True))
+                                user_cds['aa'] = aa
+                                user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
                             except:
                                 log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt)
                                 raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}")
-                            user_cds['aa'] = aa
-                            user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
+                            
                             log.info(
                                 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]',
                                 user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:]
@@ -232,38 +230,30 @@ def import_user_cdss(genome: dict, import_path: Path):
         except Exception as e:
             log.error('user-provided regions/features file GFF3 format not valid!', exc_info=True)
             sys.exit(f'ERROR: User-provided regions/features file GFF3 format not valid!')
-    elif(file_suffix in ['.gb', '.genbank', '.gbk', '.gbff']):
+    elif(file_suffix in ['.gb', '.genbank', '.gbk', '.gbff']):  # parse GenBank format
         try:
             with xopen(str(import_path), threads=0) as fh_in:
                 for record in SeqIO.parse(fh_in, 'genbank'):
                     for feature in record.features:
                         if(feature.type.lower() == 'cds'  and  'pseudo' not in feature.qualifiers and  bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers):
                             contig = contigs_by_original_id[record.id]
-                            user_cds = OrderedDict()
-                            user_cds['type'] = bc.FEATURE_CDS
+                            strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE
+                            user_cds = create_cds(contig, feature.location.start + 1, feature.location.end, strand, '', '')
                             user_cds['source'] = bc.CDS_SOURCE_USER
-                            user_cds['contig'] = contig['id']
-                            user_cds['start'] = feature.location.start + 1
-                            user_cds['stop'] = feature.location.end
-                            user_cds['strand'] = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE
-                            user_cds['gene'] = None
-                            user_cds['product'] = None
-                            user_cds['db_xrefs'] = [so.SO_CDS.id]
-                            user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1
                             try:
                                 nt = bu.extract_feature_sequence(user_cds, contig)
                                 user_cds['nt'] = nt
                             except:
                                 log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop'])
                                 raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}")
-                            dna_seq = Seq(nt)
                             try:
-                                aa = str(dna_seq.translate(table=cfg.translation_table, cds=True))
+                                aa = str(Seq(nt).translate(table=cfg.translation_table, cds=True))
+                                user_cds['aa'] = aa
+                                user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
                             except:
                                 log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt)
                                 raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}")
-                            user_cds['aa'] = aa
-                            user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa)
+                            
                             log.info(
                                 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]',
                                 user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:]

From 424c8ce7fd5715e040e137d8586b053fedf9bfae Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Nov 2023 10:44:21 +0100
Subject: [PATCH 11/17] bump version to v1.9.0-beta

---
 bakta/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bakta/__init__.py b/bakta/__init__.py
index 169201b5..e92ada64 100644
--- a/bakta/__init__.py
+++ b/bakta/__init__.py
@@ -1,2 +1,2 @@
-__version__ = '1.8.2'
+__version__ = '1.9.0-beta'
 __db_schema_version__ = 5

From 3b07ff2a15d0048b1ac2164478d7e8d38711da91 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Nov 2023 10:57:28 +0100
Subject: [PATCH 12/17] update readme

---
 README.md | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 458a3f21..8e97cf40 100644
--- a/README.md
+++ b/README.md
@@ -226,9 +226,13 @@ NODE_3 | p2 | `p`  |  `c` | `pXYZ2`
 NODE_4 | special-contig-name-xyz |  `-` | -
 NODE_5 | `` |  `-` | -
 
-#### User provided protein sequences
+#### User-provided regions
 
-Bakta accepts user provided trusted protein sequences via `--proteins` in either GenBank (CDS features) or Fasta format. Using the Fasta format, each reference sequence can be provided in a short or long format:
+Bakta accepts pre-annotated, user-provided feature regions via `--regions` in either GFF3 or GenBank format. These features supersede all de novo-predicted regions, but are equally subject to the internal functional annotation process. Currently, only `CDS` are supported. If you would like to provide custom functional annotations, you can provide these via `--proteins` which is described in the following section.
+
+#### User-provided protein sequences
+
+Bakta accepts user-provided trusted protein sequences via `--proteins` in either GenBank (CDS features) or Fasta format which are used in the functional annotation process. Using the Fasta format, each reference sequence can be provided in a short or long format:
 
 ```bash
 # short:
@@ -324,7 +328,7 @@ Exemplary annotation result files for several genomes (mostly ESKAPE species) ar
 usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT]
              [--genus GENUS] [--species SPECIES] [--strain STRAIN] [--plasmid PLASMID]
              [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4}] [--gram {+,-,?}] [--locus LOCUS]
-             [--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--proteins PROTEINS] [--meta]
+             [--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--proteins PROTEINS] [--meta] [--regions REGIONS]
              [--skip-trna] [--skip-tmrna] [--skip-rrna] [--skip-ncrna] [--skip-ncrna-region]
              [--skip-crispr] [--skip-cds] [--skip-pseudo] [--skip-sorf] [--skip-gap] [--skip-ori] [--skip-plot]
              [--help] [--verbose] [--debug] [--threads THREADS] [--tmp-dir TMP_DIR] [--version]
@@ -368,6 +372,7 @@ Annotation:
   --compliant           Force Genbank/ENA/DDJB compliance
   --proteins PROTEINS   Fasta file of trusted protein sequences for CDS annotation
   --meta                Run in metagenome mode. This only affects CDS prediction.
+  --regions REGIONS     Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations).
 
 Workflow:
   --skip-trna           Skip tRNA detection & annotation
@@ -422,7 +427,7 @@ ncRNA (cis-regulatory) region types:
 
 ### Coding sequences
 
-The structural prediction is conducted via Pyrodigal and complemented by a custom detection of sORF < 30 aa.
+The structural prediction is conducted via Pyrodigal and complemented by a custom detection of sORF < 30 aa. In addition, superseding regions of pre-predicted CDS can be provided via `--regions`.
 
 To rapidly identify known protein sequences with exact sequence matches and to conduct a comprehensive annotations, Bakta utilizes a compact read-only SQLite database comprising protein sequence digests and pre-assigned annotations for millions of known protein sequences and clusters.
 
@@ -435,22 +440,23 @@ Conceptual terms:
 
 **CDS**:
 
-1. Prediction via Pyrodigal respecting sequences' completeness (distinct prediction for complete replicons and uncompleted contigs)
+1. De novo-prediction via Pyrodigal respecting sequences' completeness (distinct prediction for complete replicons and uncompleted contigs)
 2. Discard spurious CDS via AntiFam
 3. Detect translational exceptions (selenocysteines)
-4. Detection of UPSs via MD5 digests and lookup of related IPS and PCS
-5. Sequence alignments of remainder via Diamond vs. PSC (query/subject coverage=0.8, identity=0.5)
-6. Assignment to UniRef90 or UniRef50 clusters if alignment hits achieve identities larger than 0.9 or 0.5, respectively
-7. Execution of expert systems:
+4. Import of superseding user-provided CDS regions (optional)
+5. Detection of UPSs via MD5 digests and lookup of related IPS and PCS
+6. Sequence alignments of remainder via Diamond vs. PSC (query/subject coverage=0.8, identity=0.5)
+7. Assignment to UniRef90 or UniRef50 clusters if alignment hits achieve identities larger than 0.9 or 0.5, respectively
+8. Execution of expert systems:
    - AMR: AMRFinderPlus
    - Expert proteins: NCBI BlastRules, VFDB
    - User proteins (optionally via `--proteins <Fasta/GenBank>`)
-8. Prediction of signal peptides (optionally via `--gram <+/->`)
-9. Detection of pseudogenes:
+9. Prediction of signal peptides (optionally via `--gram <+/->`)
+10. Detection of pseudogenes:
    1. Search for reference PCSs using `hypothetical` CDS as seed sequences
    2. Translated alignment (blastx) of reference PCSs against up-/downstream-elongated CDS regions
    3. Analysis of translated alignments and detection of pseudogenization causes & effects
-10. Combination of IPS, PSC, PSCC and expert system information favouring more specific annotations and avoiding redundancy
+11. Combination of IPS, PSC, PSCC and expert system information favouring more specific annotations and avoiding redundancy
 
 CDS without IPS or PSC hits as well as those without gene symbols or product descriptions different from `hypothetical` will be marked as `hypothetical`.
 

From 02138919bd4f6d2cc74184f2e4c8801d4afbf71f Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Nov 2023 12:14:35 +0100
Subject: [PATCH 13/17] polish test cases

---
 test/data/NC_002127.1-region.gbff | 15 +++++++++++++--
 test/data/NC_002127.1-region.gff3 |  3 ++-
 test/test_regions.py              |  5 ++++-
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/test/data/NC_002127.1-region.gbff b/test/data/NC_002127.1-region.gbff
index 1cc0907d..651ea786 100644
--- a/test/data/NC_002127.1-region.gbff
+++ b/test/data/NC_002127.1-region.gbff
@@ -8,10 +8,9 @@ FEATURES             Location/Qualifiers
      source          1..3306
                      /mol_type="genomic DNA"
                      /plasmid="unnamed1"
-     gene            32..736
-                     /locus_tag="DOGAIA_00005"
      CDS             32..736
                      /locus_tag="DOGAIA_00005"
+                     /product="alternative downstream start codon test case 1"
                      /translation="MVACSSASSASSFSSSVRLWLFMNPAMLSAVCCCL
                      FIFLFSPFCLSSASCDYIAHHFSTVLPPVFCRRTFQSDNTVTAKKQQCFVGNSNLQTGQ
                      DVQLLYRAYMHAITITLVRASPRHPMSDTEPCFMTKRSGSNTRRRAISRPVRLTAEEDQ
@@ -19,6 +18,18 @@ FEATURES             Location/Qualifiers
                      YAEVLIAITEYHRALLSRLMAD"
                      /codon_start=1
                      /transl_table=11
+     CDS             complement(1348..2388)
+                     /locus_tag="DOGAIA_00015"
+                     /product="alternative downstream start codon test case 2"
+                     /translation="MPSGGNTTIGRASGQNNTYFDEIALIIKENCLYSDTKNFEYTIPK
+                     FSDDDRANLFEFLSEEGITITEDNNNDPNCKHQYIMTTSNGDRVRAKIYKRGSIQFQGK
+                     YLQIASLINDFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
+                     QLSCSLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNLGEYFTENKPK
+                     YIIREIHQETINGEIAEVLCECYTYWHENRHGLFHMKPGIADTKTINKLESIAIIDTVC
+                     QLIDGGVARLKL"
+                     /codon_start=1
+                     /transl_table=11
+
 ORIGIN
         1 ttcttctgcg agttcgtgca gcttctcaca catggtggcc tgctcgtcag catcgagtgc
        61 gtccagtttt tcgagcagcg tcaggctctg gctttttatg aatcccgcca tgttgagtgc
diff --git a/test/data/NC_002127.1-region.gff3 b/test/data/NC_002127.1-region.gff3
index 7ad6e5c8..a33c1602 100644
--- a/test/data/NC_002127.1-region.gff3
+++ b/test/data/NC_002127.1-region.gff3
@@ -7,7 +7,8 @@
 # URL: github.com/oschwengers/bakta
 ##sequence-region NC_002127.1 1 3306
 NC_002127.1	Bakta	region	1	3306	.	+	.	ID=NC_002127.1;Name=NC_002127.1;Is_circular=true
-NC_002127.1	?	CDS	32	736	.	+	0	ID=DOGAIA_00005
+NC_002127.1	?	CDS	32	736	.	+	0	ID=DOGAIA_00005;NAME=alternative downstream start codon test case 1
+NC_002127.1	?	CDS	1348	2229	.	-	0	ID=DOGAIA_00015;NAME=alternative downstream start codon test case 2
 ##FASTA
 >NC_002127.1
 TTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGTGGCCTGCTCGTCAGCATCGAGTGC
diff --git a/test/test_regions.py b/test/test_regions.py
index dbc1e1c1..d12c6319 100644
--- a/test/test_regions.py
+++ b/test/test_regions.py
@@ -38,6 +38,9 @@ def test_bakta_plasmid(regions, tmpdir):
     cdss = [feat for feat in results['features'] if feat['type'] == 'cds']
     assert len(cdss) == 3
     for cds in cdss:
-        if(cds['stop'] == 736):
+        if(cds['strand'] == '+'  and  cds['stop'] == 736):  # test case 1: alternative downstream start codon from 2 to 32 on + strand
             assert cds['start'] != 2  # de novo-predicted CDS start
             assert cds['start'] == 32  # user-provided CDS start
+        elif(cds['strand'] == '-'  and  cds['start'] == 1348):  # test case 2: alternative downstream start codon from 2388 to 2229 on - strand
+            assert cds['stop'] != 2388  # de novo-predicted CDS start
+            assert cds['stop'] == 2229  # user-provided CDS start

From b09843e92293a1ea38d4b9d82d03e97d49406df0 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Nov 2023 12:55:01 +0100
Subject: [PATCH 14/17] add overlapping CDS test case

---
 test/data/NC_002127.1-region.gbff | 18 ++++++------------
 test/data/NC_002127.1-region.gff3 |  1 +
 test/test_regions.py              |  6 ++++--
 3 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/test/data/NC_002127.1-region.gbff b/test/data/NC_002127.1-region.gbff
index 651ea786..ea30d1a6 100644
--- a/test/data/NC_002127.1-region.gbff
+++ b/test/data/NC_002127.1-region.gbff
@@ -11,22 +11,16 @@ FEATURES             Location/Qualifiers
      CDS             32..736
                      /locus_tag="DOGAIA_00005"
                      /product="alternative downstream start codon test case 1"
-                     /translation="MVACSSASSASSFSSSVRLWLFMNPAMLSAVCCCL
-                     FIFLFSPFCLSSASCDYIAHHFSTVLPPVFCRRTFQSDNTVTAKKQQCFVGNSNLQTGQ
-                     DVQLLYRAYMHAITITLVRASPRHPMSDTEPCFMTKRSGSNTRRRAISRPVRLTAEEDQ
-                     EIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVMRLGALQKKLFIDGKRVGDRE
-                     YAEVLIAITEYHRALLSRLMAD"
                      /codon_start=1
                      /transl_table=11
-     CDS             complement(1348..2388)
+     CDS             981..1121
+                     /locus_tag="DOGAIA_00006"
+                     /product="additional overlapping CDS test case 3"
+                     /codon_start=1
+                     /transl_table=11
+     CDS             complement(1348..2229)
                      /locus_tag="DOGAIA_00015"
                      /product="alternative downstream start codon test case 2"
-                     /translation="MPSGGNTTIGRASGQNNTYFDEIALIIKENCLYSDTKNFEYTIPK
-                     FSDDDRANLFEFLSEEGITITEDNNNDPNCKHQYIMTTSNGDRVRAKIYKRGSIQFQGK
-                     YLQIASLINDFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
-                     QLSCSLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNLGEYFTENKPK
-                     YIIREIHQETINGEIAEVLCECYTYWHENRHGLFHMKPGIADTKTINKLESIAIIDTVC
-                     QLIDGGVARLKL"
                      /codon_start=1
                      /transl_table=11
 
diff --git a/test/data/NC_002127.1-region.gff3 b/test/data/NC_002127.1-region.gff3
index a33c1602..3772ade6 100644
--- a/test/data/NC_002127.1-region.gff3
+++ b/test/data/NC_002127.1-region.gff3
@@ -8,6 +8,7 @@
 ##sequence-region NC_002127.1 1 3306
 NC_002127.1	Bakta	region	1	3306	.	+	.	ID=NC_002127.1;Name=NC_002127.1;Is_circular=true
 NC_002127.1	?	CDS	32	736	.	+	0	ID=DOGAIA_00005;NAME=alternative downstream start codon test case 1
+NC_002127.1	?	CDS	981	1121	.	+	0	ID=DOGAIA_00006;NAME=additional overlapping CDS test case 3
 NC_002127.1	?	CDS	1348	2229	.	-	0	ID=DOGAIA_00015;NAME=alternative downstream start codon test case 2
 ##FASTA
 >NC_002127.1
diff --git a/test/test_regions.py b/test/test_regions.py
index d12c6319..6b74bcd2 100644
--- a/test/test_regions.py
+++ b/test/test_regions.py
@@ -38,9 +38,11 @@ def test_bakta_plasmid(regions, tmpdir):
     cdss = [feat for feat in results['features'] if feat['type'] == 'cds']
     assert len(cdss) == 3
     for cds in cdss:
-        if(cds['strand'] == '+'  and  cds['stop'] == 736):  # test case 1: alternative downstream start codon from 2 to 32 on + strand
+        if(cds['strand'] == '+'  and  cds['stop'] == 736):  # test case 1: alternative downstream start codon +[32,736] superseding de novo-predicted CDS +[2,736]
             assert cds['start'] != 2  # de novo-predicted CDS start
             assert cds['start'] == 32  # user-provided CDS start
-        elif(cds['strand'] == '-'  and  cds['start'] == 1348):  # test case 2: alternative downstream start codon from 2388 to 2229 on - strand
+        elif(cds['strand'] == '-'  and  cds['start'] == 1348):  # test case 2: alternative downstream start codon -[1348,2229] superseding de novo-predicted CDS -[1348,2388]
             assert cds['stop'] != 2388  # de novo-predicted CDS start
             assert cds['stop'] == 2229  # user-provided CDS start
+        elif(cds['strand'] == '+'  and  cds['start'] == 981):  # test case 3: user-provided CDS +[981,1121] overlapping de novo-predicted CDS -[971,1351]
+            assert cds['stop'] == 1121  # user-provided overlapping CDS -> de novo-predicted was filtered-out

From d805afd1d1e50ae9d7a3aab8d5b59aa94b29a37c Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Nov 2023 12:59:33 +0100
Subject: [PATCH 15/17] distinguish de novo from user-provided ORF aa seqs

---
 bakta/features/orf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bakta/features/orf.py b/bakta/features/orf.py
index 13f1e5f7..f36a22cb 100644
--- a/bakta/features/orf.py
+++ b/bakta/features/orf.py
@@ -48,7 +48,7 @@ def detect_spurious(orfs: Sequence[dict]):
 
 def get_orf_key(orf: dict) -> str:
     """Generate a standardized and unique ORF-like feature key for internal store/analyze/parse/retrieval cycles."""
-    return f"{orf['aa_hexdigest']}-{orf['contig']}-{orf['start']}-{orf['stop']}-{orf['strand']}"
+    return f"{orf['aa_hexdigest']}-{orf['contig']}-{orf['start']}-{orf['stop']}-{orf['strand']}-{orf.get('source', 'internal')}"
 
 
 def get_orf_dictionary(orfs: Sequence[dict]) -> Dict[str, dict]:

From a632ced9e5e540613e7059a75a5bf28d2d460a77 Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Nov 2023 13:08:00 +0100
Subject: [PATCH 16/17] refactor tests for better readbility

---
 test/test_regions.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/test/test_regions.py b/test/test_regions.py
index 6b74bcd2..b457c80a 100644
--- a/test/test_regions.py
+++ b/test/test_regions.py
@@ -37,12 +37,21 @@ def test_bakta_plasmid(regions, tmpdir):
     assert results is not None
     cdss = [feat for feat in results['features'] if feat['type'] == 'cds']
     assert len(cdss) == 3
-    for cds in cdss:
-        if(cds['strand'] == '+'  and  cds['stop'] == 736):  # test case 1: alternative downstream start codon +[32,736] superseding de novo-predicted CDS +[2,736]
-            assert cds['start'] != 2  # de novo-predicted CDS start
-            assert cds['start'] == 32  # user-provided CDS start
-        elif(cds['strand'] == '-'  and  cds['start'] == 1348):  # test case 2: alternative downstream start codon -[1348,2229] superseding de novo-predicted CDS -[1348,2388]
-            assert cds['stop'] != 2388  # de novo-predicted CDS start
-            assert cds['stop'] == 2229  # user-provided CDS start
-        elif(cds['strand'] == '+'  and  cds['start'] == 981):  # test case 3: user-provided CDS +[981,1121] overlapping de novo-predicted CDS -[971,1351]
-            assert cds['stop'] == 1121  # user-provided overlapping CDS -> de novo-predicted was filtered-out
+
+    # test case 1: alternative downstream start codon +[32,736] superseding de novo-predicted CDS +[2,736]
+    provided_cds_1 = [cds for cds in cdss if cds['start'] == 32 and cds['stop'] == 736 and cds['strand'] == '+']
+    assert len(provided_cds_1) == 1
+    denovo_cds_1 = [cds for cds in cdss if cds['start'] == 2 and cds['stop'] == 736 and cds['strand'] == '+']
+    assert len(denovo_cds_1) == 0  # de novo-predicted was filtered-out
+
+    # test case 2: alternative downstream start codon -[1348,2229] superseding de novo-predicted CDS -[1348,2388]
+    provided_cds_2 = [cds for cds in cdss if cds['start'] == 1348 and cds['stop'] == 2229 and cds['strand'] == '-']
+    assert len(provided_cds_2) == 1
+    denovo_cds_2 = [cds for cds in cdss if cds['start'] == 1348 and cds['stop'] == 2388 and cds['strand'] == '-']
+    assert len(denovo_cds_2) == 0  # de novo-predicted was filtered-out
+
+    # test case 3: user-provided CDS +[981,1121] overlapping de novo-predicted CDS -[971,1351]
+    provided_cds_3 = [cds for cds in cdss if cds['start'] == 981 and cds['stop'] == 1121 and cds['strand'] == '+']
+    assert len(provided_cds_3) == 1
+    denovo_cds_3 = [cds for cds in cdss if cds['start'] == 971 and cds['stop'] == 1351 and cds['strand'] == '-']
+    assert len(denovo_cds_3) == 0  # de novo-predicted was filtered-out

From b25e3dbaae3c3d3b66707a5f153b56d2bc69544d Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Fri, 17 Nov 2023 16:14:56 +0100
Subject: [PATCH 17/17] amend readme [skip ci]

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8e97cf40..b01c94b7 100644
--- a/README.md
+++ b/README.md
@@ -228,7 +228,7 @@ NODE_5 | `` |  `-` | -
 
 #### User-provided regions
 
-Bakta accepts pre-annotated, user-provided feature regions via `--regions` in either GFF3 or GenBank format. These features supersede all de novo-predicted regions, but are equally subject to the internal functional annotation process. Currently, only `CDS` are supported. If you would like to provide custom functional annotations, you can provide these via `--proteins` which is described in the following section.
+Bakta accepts pre-annotated, user-provided feature regions via `--regions` in either GFF3 or GenBank format. These regions supersede all de novo-predicted regions, but are equally subject to the internal functional annotation process. Currently, only `CDS` are supported. A maximum overlap with de novo-predicted CDS of 30 bp is allowed. If you would like to provide custom functional annotations, you can provide these via `--proteins` which is described in the following section.
 
 #### User-provided protein sequences