From ec48cf641f1cbdf590c9bfa03e4e84724b7c99aa Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Thu, 16 Nov 2023 16:50:25 +0100 Subject: [PATCH 01/17] introduce user-provided regions --- bakta/config.py | 16 +++++++++++++++- bakta/main.py | 6 ++++++ bakta/utils.py | 1 + 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/bakta/config.py b/bakta/config.py index 8c6866e7..0bf01990 100644 --- a/bakta/config.py +++ b/bakta/config.py @@ -55,6 +55,7 @@ compliant = None user_proteins = None meta = None +regions = None # workflow configuration skip_trna = None @@ -160,7 +161,7 @@ def setup(args): taxon = None # annotation configurations - global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, gram, replicons, compliant, user_proteins, meta + global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, gram, replicons, compliant, user_proteins, meta, regions complete = args.complete log.info('complete=%s', complete) prodigal_tf = args.prodigal_tf @@ -232,6 +233,19 @@ def setup(args): sys.exit(f'ERROR: replicon table file ({replicons}) not valid!') log.info('replicon-table=%s', replicons) user_proteins = check_user_proteins(args) + regions = args.regions + if(regions is not None): + try: + if(regions == ''): + raise ValueError('File path argument must be non-empty') + regions_path = Path(args.regions).resolve() + check_readability('regions', regions_path) + check_content_size('regions', regions_path) + regions = regions_path + except: + log.error('provided regions file not valid! path=%s', regions) + sys.exit(f'ERROR: regions file ({regions}) not valid!') + log.info('regions=%s', regions) # workflow configurations diff --git a/bakta/main.py b/bakta/main.py index f0948348..b14cd0ec 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -239,6 +239,12 @@ def main(): no_revised = feat_cds.revise_translational_exceptions(genome, cdss) print(f'\trevised translational exceptions: {no_revised}') cdss = [cds for cds in cdss if 'discarded' not in cds] + + if(cfg.regions): + log.debug('import user-provided CDS regions') + imported_cdss = feat_cds.import_user_cdss(genome, cfg.regions) + print(f'\timported CDS regions: {len(imported_cdss)}') + # ToDo: exclude overlapping CDS if(len(cdss) > 0): log.debug('lookup CDS UPS/IPS') diff --git a/bakta/utils.py b/bakta/utils.py index dc224bb2..240f62ef 100644 --- a/bakta/utils.py +++ b/bakta/utils.py @@ -91,6 +91,7 @@ def parse_arguments(): arg_group_annotation.add_argument('--compliant', action='store_true', help='Force Genbank/ENA/DDJB compliance') arg_group_annotation.add_argument('--proteins', action='store', default=None, dest='proteins', help='Fasta file of trusted protein sequences for CDS annotation') arg_group_annotation.add_argument('--meta', action='store_true', help='Run in metagenome mode. This only affects CDS prediction.') + arg_group_annotation.add_argument('--regions', action='store', default=None, help='Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations).') arg_group_workflow = parser.add_argument_group('Workflow') arg_group_workflow.add_argument('--skip-trna', action='store_true', dest='skip_trna', help='Skip tRNA detection & annotation') From 3072247df720449ad9ea5e817229c9e47caa0145 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Thu, 16 Nov 2023 16:51:16 +0100 Subject: [PATCH 02/17] implement creation & overlap filters --- bakta/constants.py | 4 +- bakta/features/annotation.py | 41 ++++++++++++++++++-- bakta/features/cds.py | 72 ++++++++++++++++++++++++++++++++++++ bakta/main.py | 1 - 4 files changed, 113 insertions(+), 5 deletions(-) diff --git a/bakta/constants.py b/bakta/constants.py index 7e3d5598..0a612b6a 100644 --- a/bakta/constants.py +++ b/bakta/constants.py @@ -19,7 +19,7 @@ ############################################################################ -# Protein identification settings +# CDS settings ############################################################################ MIN_PSCC_IDENTITY = 0.5 # min protein identity for PSC detection MIN_PSC_COVERAGE = 0.8 # min protein coverage for PSC detection @@ -27,6 +27,8 @@ MIN_SORF_COVERAGE = 0.9 # min sORF coverage for PSC detection MIN_SORF_IDENTITY = 0.9 # min sORF identity for PSC detection HYPOTHETICAL_PROTEIN = 'hypothetical protein' # hypothetical protein product description +CDS_MAX_OVERLAPS = 30 # max overlap [bp] allowed for user-provided/de novo-predicted CDS overlaps +CDS_SOURCE_USER = 'user-provided' ############################################################################ diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py index 0f9412ee..fd99761a 100644 --- a/bakta/features/annotation.py +++ b/bakta/features/annotation.py @@ -169,8 +169,12 @@ def detect_feature_overlaps(genome: dict): crispr_arrays = contig_crispr_arrays[crispr_array['contig']] crispr_arrays.append(crispr_array) contig_cdss = {k['id']: [] for k in genome['contigs']} + contig_cdss_user_provided = {k['id']: [] for k in genome['contigs']} for cds in genome['features'].get(bc.FEATURE_CDS, []): - cdss = contig_cdss[cds['contig']] + if(cds.get('source', None) == bc.CDS_SOURCE_USER): + cdss = contig_cdss_user_provided[cds['contig']] + else: + cdss = contig_cdss[cds['contig']] cdss.append(cds) contig_sorfs = {k['id']: [] for k in genome['contigs']} for sorf in genome['features'].get(bc.FEATURE_SORF, []): @@ -217,7 +221,7 @@ def detect_feature_overlaps(genome: dict): ncRNA_region['product'], ncRNA_region['start'], ncRNA_region['stop'], ncRNA_region_overlap['product'], ncRNA_region_overlap['start'], ncRNA_region_overlap['stop'], overlap, ncRNA_region['contig'], ncRNA_region['score'], ncRNA_region_overlap['score'] ) - # mark CDS overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs + # mark de novo-predicted CDS overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs and user-provided CDS for cds in contig_cdss[contig['id']]: # tmRNA overlaps for tmRNA in contig_tm_rnas[contig['id']]: @@ -279,6 +283,23 @@ def detect_feature_overlaps(genome: dict): "overlap: CDS (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, contig=%s", cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], crispr['start'], crispr['stop'], overlap, cds['contig'] ) + # user-provided CDS overlaps + for cds_user_provided in contig_cdss_user_provided[contig['id']]: + if(cds['stop'] < cds_user_provided['start'] or cds['start'] > cds_user_provided['stop']): + continue + else: # overlap -> remove cds + overlap = min(cds['stop'], cds_user_provided['stop']) - max(cds['start'], cds_user_provided['start']) + 1 + if(overlap > bc.CDS_MAX_OVERLAPS): + overlap = f"[{max(cds['start'], cds_user_provided['start'])},{min(cds['stop'], cds_user_provided['stop'])}]" + cds['discarded'] = { + 'type': bc.DISCARD_TYPE_OVERLAP, + 'feature_type': bc.FEATURE_CDS, + 'description': f'overlaps {bc.FEATURE_CDS} at {overlap}' + } + log.info( + "overlap: de novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s", + cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['contig'] + ) # remove sORF overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs, inframe CDSs, shorter inframe sORFs for sorf in contig_sorfs[contig['id']]: @@ -342,7 +363,21 @@ def detect_feature_overlaps(genome: dict): "overlap: sORF (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, contig=%s", sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], crispr['start'], crispr['stop'], overlap, sorf['contig'] ) - # CDS overlaps (skipped as most overlaps are filtered in sORF detection) + # user-provided CDS overlaps + for cds_user_provided in contig_cdss_user_provided[contig['id']]: + if(sorf['stop'] < cds_user_provided['start'] or sorf['start'] > cds_user_provided['stop']): + continue + else: # overlap -> remove sorf + overlap = f"[{max(sorf['start'], cds_user_provided['start'])},{min(sorf['stop'], cds_user_provided['stop'])}]" + sorf['discarded'] = { + 'type': bc.DISCARD_TYPE_OVERLAP, + 'feature_type': bc.FEATURE_CDS, + 'description': f'overlaps {bc.FEATURE_CDS} at {overlap}' + } + log.info( + "overlap: sORF (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s", + sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, sorf['contig'] + ) # sORF overlaps for overlap_sorf in contig_sorfs[contig['id']]: diff --git a/bakta/features/cds.py b/bakta/features/cds.py index b18a2959..b318f593 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -7,12 +7,15 @@ from collections import OrderedDict from typing import Dict, Sequence, Set, Tuple, Union +from pathlib import Path import pyrodigal import pyhmmer +from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqUtils.ProtParam import ProteinAnalysis +from xopen import xopen import bakta.config as cfg import bakta.constants as bc @@ -164,6 +167,75 @@ def create_cdss(genes, contig): return cdss_per_sequence +def import_user_cdss(genome: dict, import_path: Path): + """Imports user-provided CDS regions or features. + Regions are mere coordinates subject to subsequent internal annotation. Features comprise regions and existing functional annotation. + Both regions and annotations will be imported. Functional annotations are stored in a 'user' space and supersede internal function annotation information during the final annotation process. + + Parameters + ---------- + import_path : Path + Path to GFF3 or Genbank file with regions or features. + + Returns + ------- + list + a list of CDS features - with or without functional annotation. + """ + user_cdss = [] + # ToDO: implement GFF3 parsing + # try: + # with xopen(str(import_path), threads=0) as fh_in: + # for record in SeqIO.parse(fh_in, 'fasta'): + # user_proteins.append(parse_user_protein_sequences_fasta(record)) + # except Exception as e: + # log.error('provided user proteins file Fasta format not valid!', exc_info=True) + # sys.exit(f'ERROR: User proteins file Fasta format not valid!') + + contigs_by_original_id = {c['orig_id']: c for c in genome['contigs']} + try: + with xopen(str(import_path), threads=0) as fh_in: + for record in SeqIO.parse(fh_in, 'genbank'): + for feature in record.features: + if(feature.type.lower() == 'cds' and 'pseudo' not in feature.qualifiers and bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers): + contig = contigs_by_original_id[record.id] + user_cds = OrderedDict() + user_cds['type'] = bc.FEATURE_CDS + user_cds['source'] = bc.CDS_SOURCE_USER + user_cds['contig'] = contig['id'] + user_cds['start'] = feature.location.start + user_cds['stop'] = feature.location.end + user_cds['strand'] = bc.STRAND_FORWARD if feature.location == +1 else bc.STRAND_REVERSE + user_cds['gene'] = None + user_cds['product'] = None + user_cds['db_xrefs'] = [so.SO_CDS.id] + user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1 + try: + nt = bu.extract_feature_sequence(user_cds, contig) + user_cds['nt'] = nt + except: + log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop']) + raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}") + dna_seq = Seq(nt) + try: + aa = str(dna_seq.translate(table=cfg.translation_table, cds=True)) + except: + log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt) + raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") + user_cds['aa'] = aa + user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + log.info( + 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', + user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] + ) + user_cdss.append(user_cds) + except Exception as e: + log.error('user-provided regions/features file GenBank format not valid!', exc_info=True) + sys.exit(f'ERROR: User-provided regions/features file GenBank format not valid!') + + return user_cdss + + def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]: """Detect Pfam-A entries""" pfam_hits = [] diff --git a/bakta/main.py b/bakta/main.py index b14cd0ec..2499822e 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -244,7 +244,6 @@ def main(): log.debug('import user-provided CDS regions') imported_cdss = feat_cds.import_user_cdss(genome, cfg.regions) print(f'\timported CDS regions: {len(imported_cdss)}') - # ToDo: exclude overlapping CDS if(len(cdss) > 0): log.debug('lookup CDS UPS/IPS') From f3687adf85602551fbeee233bfe5b6dd3864623e Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Thu, 16 Nov 2023 16:51:53 +0100 Subject: [PATCH 03/17] amend INSDC/GFF3 inferences --- bakta/io/gff.py | 9 +++++++-- bakta/io/insdc.py | 8 +++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/bakta/io/gff.py b/bakta/io/gff.py index 25349eb7..3a0ab552 100644 --- a/bakta/io/gff.py +++ b/bakta/io/gff.py @@ -230,7 +230,12 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat if(feat.get('gene', None)): gene_annotations['gene'] = feat['gene'] annotations['Parent'] = gene_id - annotations['inference'] = 'ab initio prediction:Prodigal:2.6' + if(feat.get('source', None) == bc.CDS_SOURCE_USER): + annotations['inference'] = 'EXISTENCE:non-experimental evidence, no additional details recorded' + source = '?' + else: + annotations['inference'] = 'ab initio prediction:Prodigal:2.6' + source = 'Prodigal' annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs']) # remove INSDC invalid DbXrefs annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note') if(feat.get('pseudo', False)): @@ -251,7 +256,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat if('Notes' not in annotations): annotations['Note'] = notes annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tProdigal\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n") + fh.write(f"{feat['contig']}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n") if(bc.FEATURE_SIGNAL_PEPTIDE in feat): write_signal_peptide(fh, feat) elif(feat['type'] == bc.FEATURE_SORF): diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py index 3b287afd..ba8d630e 100644 --- a/bakta/io/insdc.py +++ b/bakta/io/insdc.py @@ -131,7 +131,13 @@ def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Pat qualifiers['transl_table'] = cfg.translation_table insdc_feature_type = bc.INSDC_FEATURE_CDS inference = [] - inference.append('ab initio prediction:Prodigal:2.6' if feature['type'] == bc.FEATURE_CDS else f"ab initio prediction:Bakta:{'.'.join(bakta.__version__.split('.')[0:2])}") + if(feature['type'] == bc.FEATURE_CDS): + if(feature.get('source', None) == bc.CDS_SOURCE_USER): + inference.append('EXISTENCE:non-experimental evidence, no additional details recorded') + else: + inference.append('ab initio prediction:Prodigal:2.6') + else: + inference.append(f"ab initio prediction:Bakta:{'.'.join(bakta.__version__.split('.')[0:2])}") if('ncbi_nrp_id' in feature.get('ups', {})): nrp_id = feature['ups']['ncbi_nrp_id'] inference.append(f'similar to AA sequence:{bc.DB_XREF_REFSEQ_NRP}:{nrp_id}') From 60214ea4c5b3918b0b4ca613e3da022f0ad4c522 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Thu, 16 Nov 2023 17:31:57 +0100 Subject: [PATCH 04/17] fix source value error in GFF3 output --- bakta/io/gff.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bakta/io/gff.py b/bakta/io/gff.py index 3a0ab552..f0368b84 100644 --- a/bakta/io/gff.py +++ b/bakta/io/gff.py @@ -221,6 +221,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat annotations['pseudo'] = True if(feat.get('gene', None)): # add gene annotation if available annotations['gene'] = feat['gene'] + source = '?' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'Prodigal' if(cfg.compliant): gene_id = f"{feat['locus']}_gene" gene_annotations = { @@ -232,10 +233,8 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat annotations['Parent'] = gene_id if(feat.get('source', None) == bc.CDS_SOURCE_USER): annotations['inference'] = 'EXISTENCE:non-experimental evidence, no additional details recorded' - source = '?' else: annotations['inference'] = 'ab initio prediction:Prodigal:2.6' - source = 'Prodigal' annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs']) # remove INSDC invalid DbXrefs annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note') if(feat.get('pseudo', False)): @@ -244,7 +243,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat if(ec_number is not None): annotations['ec_number'] = ec_number gene_annotations = encode_annotations(gene_annotations) - fh.write(f"{feat['contig']}\tProdigal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") + fh.write(f"{feat['contig']}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") if('exception' in feat): ex = feat['exception'] pos = f"{ex['start']}..{ex['stop']}" From 47d325cdc52ecd75d5c32d708b9efc737e3b7dc8 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Thu, 16 Nov 2023 18:40:02 +0100 Subject: [PATCH 05/17] actually add user-provided cdss in main --- bakta/main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bakta/main.py b/bakta/main.py index 2499822e..3cfb3a4d 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -244,6 +244,7 @@ def main(): log.debug('import user-provided CDS regions') imported_cdss = feat_cds.import_user_cdss(genome, cfg.regions) print(f'\timported CDS regions: {len(imported_cdss)}') + cdss.extend(imported_cdss) if(len(cdss) > 0): log.debug('lookup CDS UPS/IPS') From b20c5ad4a359cf2b6c03d5c2b3a5e2f743ea0e7a Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Thu, 16 Nov 2023 18:41:22 +0100 Subject: [PATCH 06/17] fix CDS start & strand --- bakta/features/cds.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bakta/features/cds.py b/bakta/features/cds.py index b318f593..37485e00 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -203,9 +203,9 @@ def import_user_cdss(genome: dict, import_path: Path): user_cds['type'] = bc.FEATURE_CDS user_cds['source'] = bc.CDS_SOURCE_USER user_cds['contig'] = contig['id'] - user_cds['start'] = feature.location.start + user_cds['start'] = feature.location.start + 1 user_cds['stop'] = feature.location.end - user_cds['strand'] = bc.STRAND_FORWARD if feature.location == +1 else bc.STRAND_REVERSE + user_cds['strand'] = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE user_cds['gene'] = None user_cds['product'] = None user_cds['db_xrefs'] = [so.SO_CDS.id] From 01a38e52925f82ab426fc85dbaca4cfb2aca739f Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Thu, 16 Nov 2023 18:41:42 +0100 Subject: [PATCH 07/17] add tests --- test/data/NC_002127.1-region.gbff | 79 +++++++++++++++++++++++++++++++ test/data/NC_002127.1-region.gff3 | 68 ++++++++++++++++++++++++++ test/test_args.py | 20 ++++++++ test/test_regions.py | 38 +++++++++++++++ 4 files changed, 205 insertions(+) create mode 100644 test/data/NC_002127.1-region.gbff create mode 100644 test/data/NC_002127.1-region.gff3 create mode 100644 test/test_regions.py diff --git a/test/data/NC_002127.1-region.gbff b/test/data/NC_002127.1-region.gbff new file mode 100644 index 00000000..1cc0907d --- /dev/null +++ b/test/data/NC_002127.1-region.gbff @@ -0,0 +1,79 @@ +LOCUS NC_002127.1 3306 bp DNA circular BCT 16-NOV-2023 +DEFINITION plasmid unnamed1, complete sequence. +KEYWORDS . +SOURCE None + ORGANISM . + . +FEATURES Location/Qualifiers + source 1..3306 + /mol_type="genomic DNA" + /plasmid="unnamed1" + gene 32..736 + /locus_tag="DOGAIA_00005" + CDS 32..736 + /locus_tag="DOGAIA_00005" + /translation="MVACSSASSASSFSSSVRLWLFMNPAMLSAVCCCL + FIFLFSPFCLSSASCDYIAHHFSTVLPPVFCRRTFQSDNTVTAKKQQCFVGNSNLQTGQ + DVQLLYRAYMHAITITLVRASPRHPMSDTEPCFMTKRSGSNTRRRAISRPVRLTAEEDQ + EIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVMRLGALQKKLFIDGKRVGDRE + YAEVLIAITEYHRALLSRLMAD" + /codon_start=1 + /transl_table=11 +ORIGIN + 1 ttcttctgcg agttcgtgca gcttctcaca catggtggcc tgctcgtcag catcgagtgc + 61 gtccagtttt tcgagcagcg tcaggctctg gctttttatg aatcccgcca tgttgagtgc + 121 agtttgctgc tgcttgttca tctttctgtt ttctccgttc tgtctgtcat ctgcgtcgtg + 181 tgattatatc gcgcaccact tttcgaccgt cttaccgccg gtattctgcc gacggacatt + 241 tcagtcagac aacactgtca ctgccaaaaa acagcagtgc tttgttggta attcgaactt + 301 gcagacagga caggatgtgc aattgttata ccgcgcatac atgcacgcta ttacaattac + 361 cctggtcagg gcttcgcccc gacaccccat gtcagatacg gagccatgtt ttatgacaaa + 421 acgaagtgga agtaatacgc gcaggcgggc tatcagtcgc cctgttcgtc tgacggcaga + 481 agaagaccag gaaatcagaa aaagggctgc tgaatgcggc aagaccgttt ctggtttttt + 541 acgggcggca gctctcggta agaaagttaa ctcactgact gatgaccggg tgctgaaaga + 601 agttatgcga ctgggggcgt tgcagaaaaa actctttatc gacggcaagc gtgtcgggga + 661 cagagagtat gcggaggtgc tgatcgctat tacggagtat caccgtgccc tgttatccag + 721 gcttatggca gattagcttc ccggagagaa actgtcgaaa acagacggta tgaacgccgt + 781 aagcccccaa accgatcgcc attcactttc atgcatagct atgcagtgag ctgaaagcga + 841 tcctgacgca tttttccggt ttaccccggg gaaaacatct ctttttgcgg tgtctgcgtc + 901 agaatcgcgt tcagcgcgtt ttggcggtgc gcgtaatgag acgttatggt aaatgtcttc + 961 tggcttgata ttatattgga atgccttttt tcaaagcaaa tgatgtggct ttggatagaa + 1021 ggtttacgtt gatcttatca aagttttttt taaagaacga agccgagagc tcagataaat + 1081 cattatattc atcagttttc gtaactttgt ttaatgtgta acttgaaaac ttctcgccat + 1141 taaatgacgt atagacgtaa cgatcttttt ttccaccgtt aggaattatt aaatcaaaaa + 1201 aaacatcacc cttgcttttc tttttcttca agtcggattc gatttttgag aaaaattcgc + 1261 tcgggctata aatatcagta gcatagacaa taaataaagt tttatcttta ttttttattg + 1321 cttctatttg atatttttta tcttttttca taatttcaac ctagctacgc caccatctat + 1381 taattggcaa acggtatcga tgattgcgat tgattctaat ttgttaattg tcttcgtgtc + 1441 agctattcct ggtttcatat gaaacaaacc atgcctgttc tcatgccagt aagtgtagca + 1501 ttcacacaaa acttccgcta tttcaccatt tatagtttct tggtgtattt ctctgattat + 1561 atatttgggt ttgttttcag tgaagtattc gccaaggttc tttgatgatg atggattgca + 1621 aacatcatta agtatttgat atataaagcc ttctatggct cttaatgcag agaagcagta + 1681 tgttgagtaa tcttccattt cgacatctat ttttttcatt attagcgagc atgatagctg + 1741 ttttttgata tcttcatgga ttttatcgat gctttttggt agtttgctat gcaactcgga + 1801 ctcaatagtt tcttttttta tgtcaacatt aaattcttta tttttttgtt cgacaatctc + 1861 tttcatgttt agtattgagc acatgaaatc gttaatcaaa ctcgcgattt gaaggtattt + 1921 tccttggaat tgaatagagc cgcgcttgta aatttttgcc ctgaccctgt caccattgct + 1981 ggtggtcata atatattggt gtttacaatt aggatcgtta ttattatctt ctgttattgt + 2041 tatcccctct tcagaaagaa attcaaatag atttgccctg tcatcatcac tgaattttgg + 2101 aatggtgtat tcaaagttct ttgtgtctga atacaaacag ttttctttta taatcaaggc + 2161 gatttcatca aagtaagtgt tattttgccc agacgctctt ccgatagtgg tatttccgcc + 2221 tgatggcatt agttttatta agaagtctat tcctttatat gtgccagata tgtgagtttc + 2281 tctttcgttt tttacattag aggaatagtt tgtgacgcca ttctgcgtca gagcagactc + 2341 aatcttgtca atattgatat ttagtgcttt aaacgggttc tgtgccattg ggtcaatccg + 2401 ttgttttttt tgaatatgta cagatcttgt ttttttgtca acggaatagc tgttcgttga + 2461 cttgatagac cgattgattc atcatctcat aaataaagaa aaaccaccgc taccaacggt + 2521 ggttttctca aggttcgctg agctaccaac tctttgaacc aaggtaactg gcttggagga + 2581 gcgcagtcac caaaatctgt tctttcagtt tagccttaac aggtgcataa cttcaagaca + 2641 aactcctcta aatcagttac caatggctgc tgccagtggc gataagtcgt gtcttaccgg + 2701 gttggactca agacgatagt taccggataa ggcgcagcgg tcgggctgaa cggggggttc + 2761 gtgcacacag cccagcttgg agcgaacgac ctacaccgaa ctgagatacc aacagcgtga + 2821 gctatgagaa agcgccacgc ttcccgaagg gagaaaggcg gacaggtatc cggtaagtgg + 2881 cagggtcgga acaggagagc gcacgaggga gcttccgggg ggaaacgcct ggtatcttta + 2941 tagtcctgtc gggtttcgcc acctctggct tgagcgtcga tttttgtgat gctcgtcagg + 3001 ggggcggagc ctatggaaaa acgcctgcgg tgctggcttc ttccggtgct ttgctttttg + 3061 ctcacatgtt ctttccggct ttatcccctg attctgtgga taaccgtatt accgcctttg + 3121 agtgagctga caccgctcgc cgcagtcgaa cgaccgagcg tagcgagtca gtgagcgagg + 3181 aagcggaaga gcgccttatg tgacattttc tccttacgct ctgttgtgcc gttcggcatc + 3241 ctgccctgag cgttatatct ctgtgctatt ttctacttca aagcgtgtct gtatgctgtt + 3301 ctggag +// diff --git a/test/data/NC_002127.1-region.gff3 b/test/data/NC_002127.1-region.gff3 new file mode 100644 index 00000000..7ad6e5c8 --- /dev/null +++ b/test/data/NC_002127.1-region.gff3 @@ -0,0 +1,68 @@ +##gff-version 3 +##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo +# Annotated with Bakta +# Software: v1.8.2 +# Database: v5.0, light +# DOI: 10.1099/mgen.0.000685 +# URL: github.com/oschwengers/bakta +##sequence-region NC_002127.1 1 3306 +NC_002127.1 Bakta region 1 3306 . + . ID=NC_002127.1;Name=NC_002127.1;Is_circular=true +NC_002127.1 ? CDS 32 736 . + 0 ID=DOGAIA_00005 +##FASTA +>NC_002127.1 +TTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGTGGCCTGCTCGTCAGCATCGAGTGC +GTCCAGTTTTTCGAGCAGCGTCAGGCTCTGGCTTTTTATGAATCCCGCCATGTTGAGTGC +AGTTTGCTGCTGCTTGTTCATCTTTCTGTTTTCTCCGTTCTGTCTGTCATCTGCGTCGTG +TGATTATATCGCGCACCACTTTTCGACCGTCTTACCGCCGGTATTCTGCCGACGGACATT +TCAGTCAGACAACACTGTCACTGCCAAAAAACAGCAGTGCTTTGTTGGTAATTCGAACTT +GCAGACAGGACAGGATGTGCAATTGTTATACCGCGCATACATGCACGCTATTACAATTAC +CCTGGTCAGGGCTTCGCCCCGACACCCCATGTCAGATACGGAGCCATGTTTTATGACAAA +ACGAAGTGGAAGTAATACGCGCAGGCGGGCTATCAGTCGCCCTGTTCGTCTGACGGCAGA +AGAAGACCAGGAAATCAGAAAAAGGGCTGCTGAATGCGGCAAGACCGTTTCTGGTTTTTT +ACGGGCGGCAGCTCTCGGTAAGAAAGTTAACTCACTGACTGATGACCGGGTGCTGAAAGA +AGTTATGCGACTGGGGGCGTTGCAGAAAAAACTCTTTATCGACGGCAAGCGTGTCGGGGA +CAGAGAGTATGCGGAGGTGCTGATCGCTATTACGGAGTATCACCGTGCCCTGTTATCCAG +GCTTATGGCAGATTAGCTTCCCGGAGAGAAACTGTCGAAAACAGACGGTATGAACGCCGT +AAGCCCCCAAACCGATCGCCATTCACTTTCATGCATAGCTATGCAGTGAGCTGAAAGCGA +TCCTGACGCATTTTTCCGGTTTACCCCGGGGAAAACATCTCTTTTTGCGGTGTCTGCGTC +AGAATCGCGTTCAGCGCGTTTTGGCGGTGCGCGTAATGAGACGTTATGGTAAATGTCTTC +TGGCTTGATATTATATTGGAATGCCTTTTTTCAAAGCAAATGATGTGGCTTTGGATAGAA +GGTTTACGTTGATCTTATCAAAGTTTTTTTTAAAGAACGAAGCCGAGAGCTCAGATAAAT +CATTATATTCATCAGTTTTCGTAACTTTGTTTAATGTGTAACTTGAAAACTTCTCGCCAT +TAAATGACGTATAGACGTAACGATCTTTTTTTCCACCGTTAGGAATTATTAAATCAAAAA +AAACATCACCCTTGCTTTTCTTTTTCTTCAAGTCGGATTCGATTTTTGAGAAAAATTCGC +TCGGGCTATAAATATCAGTAGCATAGACAATAAATAAAGTTTTATCTTTATTTTTTATTG +CTTCTATTTGATATTTTTTATCTTTTTTCATAATTTCAACCTAGCTACGCCACCATCTAT +TAATTGGCAAACGGTATCGATGATTGCGATTGATTCTAATTTGTTAATTGTCTTCGTGTC +AGCTATTCCTGGTTTCATATGAAACAAACCATGCCTGTTCTCATGCCAGTAAGTGTAGCA +TTCACACAAAACTTCCGCTATTTCACCATTTATAGTTTCTTGGTGTATTTCTCTGATTAT +ATATTTGGGTTTGTTTTCAGTGAAGTATTCGCCAAGGTTCTTTGATGATGATGGATTGCA +AACATCATTAAGTATTTGATATATAAAGCCTTCTATGGCTCTTAATGCAGAGAAGCAGTA +TGTTGAGTAATCTTCCATTTCGACATCTATTTTTTTCATTATTAGCGAGCATGATAGCTG +TTTTTTGATATCTTCATGGATTTTATCGATGCTTTTTGGTAGTTTGCTATGCAACTCGGA +CTCAATAGTTTCTTTTTTTATGTCAACATTAAATTCTTTATTTTTTTGTTCGACAATCTC +TTTCATGTTTAGTATTGAGCACATGAAATCGTTAATCAAACTCGCGATTTGAAGGTATTT +TCCTTGGAATTGAATAGAGCCGCGCTTGTAAATTTTTGCCCTGACCCTGTCACCATTGCT +GGTGGTCATAATATATTGGTGTTTACAATTAGGATCGTTATTATTATCTTCTGTTATTGT +TATCCCCTCTTCAGAAAGAAATTCAAATAGATTTGCCCTGTCATCATCACTGAATTTTGG +AATGGTGTATTCAAAGTTCTTTGTGTCTGAATACAAACAGTTTTCTTTTATAATCAAGGC +GATTTCATCAAAGTAAGTGTTATTTTGCCCAGACGCTCTTCCGATAGTGGTATTTCCGCC +TGATGGCATTAGTTTTATTAAGAAGTCTATTCCTTTATATGTGCCAGATATGTGAGTTTC +TCTTTCGTTTTTTACATTAGAGGAATAGTTTGTGACGCCATTCTGCGTCAGAGCAGACTC +AATCTTGTCAATATTGATATTTAGTGCTTTAAACGGGTTCTGTGCCATTGGGTCAATCCG +TTGTTTTTTTTGAATATGTACAGATCTTGTTTTTTTGTCAACGGAATAGCTGTTCGTTGA +CTTGATAGACCGATTGATTCATCATCTCATAAATAAAGAAAAACCACCGCTACCAACGGT +GGTTTTCTCAAGGTTCGCTGAGCTACCAACTCTTTGAACCAAGGTAACTGGCTTGGAGGA +GCGCAGTCACCAAAATCTGTTCTTTCAGTTTAGCCTTAACAGGTGCATAACTTCAAGACA +AACTCCTCTAAATCAGTTACCAATGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGG +GTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTC +GTGCACACAGCCCAGCTTGGAGCGAACGACCTACACCGAACTGAGATACCAACAGCGTGA +GCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGTGG +CAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCGGGGGGAAACGCCTGGTATCTTTA +TAGTCCTGTCGGGTTTCGCCACCTCTGGCTTGAGCGTCGATTTTTGTGATGCTCGTCAGG +GGGGCGGAGCCTATGGAAAAACGCCTGCGGTGCTGGCTTCTTCCGGTGCTTTGCTTTTTG +CTCACATGTTCTTTCCGGCTTTATCCCCTGATTCTGTGGATAACCGTATTACCGCCTTTG +AGTGAGCTGACACCGCTCGCCGCAGTCGAACGACCGAGCGTAGCGAGTCAGTGAGCGAGG +AAGCGGAAGAGCGCCTTATGTGACATTTTCTCCTTACGCTCTGTTGTGCCGTTCGGCATC +CTGCCCTGAGCGTTATATCTCTGTGCTATTTTCTACTTCAAAGCGTGTCTGTATGCTGTT +CTGGAG diff --git a/test/test_args.py b/test/test_args.py index 6c09ed92..e2845f67 100644 --- a/test/test_args.py +++ b/test/test_args.py @@ -241,6 +241,26 @@ def test_replicons_ok(parameters, tmpdir): assert Path.exists(tmpdir_path.joinpath(file)) +@pytest.mark.parametrize( + 'parameters', + [ + (['--regions']), # not provided + (['--regions', '']), # empty + (['--regions', 'foo']), # not existing + (['--regions', 'test/data/empty']) # empty file + ] +) +def test_regions_failiing(parameters, tmpdir): + # test regions file arguments + proc = run( + ['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--skip-plot'] + + parameters + + SKIP_PARAMETERS + + ['test/data/NC_002127.1.fna'] + ) + assert proc.returncode != 0 + + @pytest.mark.parametrize( 'parameters', [ diff --git a/test/test_regions.py b/test/test_regions.py new file mode 100644 index 00000000..b893935c --- /dev/null +++ b/test/test_regions.py @@ -0,0 +1,38 @@ +import json +import subprocess as sp + +from pathlib import Path +from subprocess import run + +import pytest + +import bakta.constants as bc + +from .conftest import FILES, SKIP_PARAMETERS + + +def test_bakta_plasmid(tmpdir): + # full test on plasmid + proc = run( + ['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--force', '--prefix', 'test', '--regions', 'test/data/NC_002127.1-region.gbff'] + + ['test/data/NC_002127.1.fna.gz'] + ) + assert proc.returncode == 0 + + tmpdir_path = Path(tmpdir) + for file in FILES: + output_path = tmpdir_path.joinpath(file) + assert Path.exists(output_path) + assert output_path.stat().st_size > 0 + + results_path = tmpdir_path.joinpath('test.json') + results = None + with results_path.open() as fh: + results = json.load(fh) + assert results is not None + cdss = [feat for feat in results['features'] if feat['type'] == 'cds'] + assert len(cdss) == 3 + for cds in cdss: + if(cds['stop'] == 736): + assert cds['start'] != 2 # de novo-predicted CDS start + assert cds['start'] == 32 # user-provided CDS start From 1bfb8dbbbe2f82bbc8243a258abcef49c2618cb5 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 17 Nov 2023 09:59:43 +0100 Subject: [PATCH 08/17] implement GFF3 region support --- bakta/features/cds.py | 138 +++++++++++++++++++++++++++--------------- 1 file changed, 90 insertions(+), 48 deletions(-) diff --git a/bakta/features/cds.py b/bakta/features/cds.py index 37485e00..b3dc32d1 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -183,55 +183,97 @@ def import_user_cdss(genome: dict, import_path: Path): a list of CDS features - with or without functional annotation. """ user_cdss = [] - # ToDO: implement GFF3 parsing - # try: - # with xopen(str(import_path), threads=0) as fh_in: - # for record in SeqIO.parse(fh_in, 'fasta'): - # user_proteins.append(parse_user_protein_sequences_fasta(record)) - # except Exception as e: - # log.error('provided user proteins file Fasta format not valid!', exc_info=True) - # sys.exit(f'ERROR: User proteins file Fasta format not valid!') - contigs_by_original_id = {c['orig_id']: c for c in genome['contigs']} - try: - with xopen(str(import_path), threads=0) as fh_in: - for record in SeqIO.parse(fh_in, 'genbank'): - for feature in record.features: - if(feature.type.lower() == 'cds' and 'pseudo' not in feature.qualifiers and bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers): - contig = contigs_by_original_id[record.id] - user_cds = OrderedDict() - user_cds['type'] = bc.FEATURE_CDS - user_cds['source'] = bc.CDS_SOURCE_USER - user_cds['contig'] = contig['id'] - user_cds['start'] = feature.location.start + 1 - user_cds['stop'] = feature.location.end - user_cds['strand'] = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE - user_cds['gene'] = None - user_cds['product'] = None - user_cds['db_xrefs'] = [so.SO_CDS.id] - user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1 - try: - nt = bu.extract_feature_sequence(user_cds, contig) - user_cds['nt'] = nt - except: - log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop']) - raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}") - dna_seq = Seq(nt) - try: - aa = str(dna_seq.translate(table=cfg.translation_table, cds=True)) - except: - log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt) - raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") - user_cds['aa'] = aa - user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) - log.info( - 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', - user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] - ) - user_cdss.append(user_cds) - except Exception as e: - log.error('user-provided regions/features file GenBank format not valid!', exc_info=True) - sys.exit(f'ERROR: User-provided regions/features file GenBank format not valid!') + file_suffix = import_path.suffix.lower() + if(file_suffix in ['.gff', '.gff3']): + try: + with xopen(str(import_path), threads=0) as fh_in: + skip_lines = False + for line in fh_in: + line = line.strip() + if(line == '##FASTA'): + skip_lines = True + elif(skip_lines or line[0] == '#'): + continue + else: + contig_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t') + if(feature_type.lower() == 'cds'): + contig = contigs_by_original_id[contig_id] + user_cds = OrderedDict() + user_cds['type'] = bc.FEATURE_CDS + user_cds['source'] = bc.CDS_SOURCE_USER + user_cds['contig'] = contig['id'] + user_cds['start'] = int(start) + user_cds['stop'] = int(stop) + user_cds['strand'] = strand + user_cds['gene'] = None + user_cds['product'] = None + user_cds['db_xrefs'] = [so.SO_CDS.id] + user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1 + try: + nt = bu.extract_feature_sequence(user_cds, contig) + user_cds['nt'] = nt + except: + log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop']) + raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}") + try: + dna_seq = Seq(nt) + aa = str(dna_seq.translate(table=cfg.translation_table, cds=True)) + except: + log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt) + raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") + user_cds['aa'] = aa + user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + log.info( + 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', + user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] + ) + user_cdss.append(user_cds) + except Exception as e: + log.error('user-provided regions/features file GFF3 format not valid!', exc_info=True) + sys.exit(f'ERROR: User-provided regions/features file GFF3 format not valid!') + elif(file_suffix in ['.gb', '.genbank', '.gbk', '.gbff']): + try: + with xopen(str(import_path), threads=0) as fh_in: + for record in SeqIO.parse(fh_in, 'genbank'): + for feature in record.features: + if(feature.type.lower() == 'cds' and 'pseudo' not in feature.qualifiers and bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers): + contig = contigs_by_original_id[record.id] + user_cds = OrderedDict() + user_cds['type'] = bc.FEATURE_CDS + user_cds['source'] = bc.CDS_SOURCE_USER + user_cds['contig'] = contig['id'] + user_cds['start'] = feature.location.start + 1 + user_cds['stop'] = feature.location.end + user_cds['strand'] = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE + user_cds['gene'] = None + user_cds['product'] = None + user_cds['db_xrefs'] = [so.SO_CDS.id] + user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1 + try: + nt = bu.extract_feature_sequence(user_cds, contig) + user_cds['nt'] = nt + except: + log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop']) + raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}") + dna_seq = Seq(nt) + try: + aa = str(dna_seq.translate(table=cfg.translation_table, cds=True)) + except: + log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt) + raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") + user_cds['aa'] = aa + user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + log.info( + 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', + user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] + ) + user_cdss.append(user_cds) + except Exception as e: + log.error('user-provided regions/features file GenBank format not valid!', exc_info=True) + sys.exit(f'ERROR: User-provided regions/features file GenBank format not valid!') + else: + log.warn('user-provided regions/features file suffix not detected! suffix=%s, path=%s', file_suffix, str(import_path)) return user_cdss From f998fc2747f2c78f3ec68f1cb51aef64da3541dd Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 17 Nov 2023 09:59:54 +0100 Subject: [PATCH 09/17] add GFF3 tests --- test/test_regions.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/test/test_regions.py b/test/test_regions.py index b893935c..dbc1e1c1 100644 --- a/test/test_regions.py +++ b/test/test_regions.py @@ -6,15 +6,20 @@ import pytest -import bakta.constants as bc - from .conftest import FILES, SKIP_PARAMETERS -def test_bakta_plasmid(tmpdir): +@pytest.mark.parametrize( + 'regions', + [ + ('NC_002127.1-region.gff3'), # GFF3 + ('NC_002127.1-region.gbff') # Genbank + ] +) +def test_bakta_plasmid(regions, tmpdir): # full test on plasmid proc = run( - ['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--force', '--prefix', 'test', '--regions', 'test/data/NC_002127.1-region.gbff'] + + ['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--force', '--prefix', 'test', '--regions', f'test/data/{regions}'] + ['test/data/NC_002127.1.fna.gz'] ) assert proc.returncode == 0 From 1182dd01c0115b01620453cc13fb8a993024fa06 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 17 Nov 2023 10:43:36 +0100 Subject: [PATCH 10/17] refactor code in cds module --- bakta/features/cds.py | 88 +++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/bakta/features/cds.py b/bakta/features/cds.py index b3dc32d1..3dc08d97 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -86,26 +86,31 @@ def predict(genome: dict): return cdss +def create_cds(contig: dict, start: int, stop: int, strand: str, nt: str, aa: str): + cds = OrderedDict() + cds['type'] = bc.FEATURE_CDS + cds['contig'] = contig['id'] + cds['start'] = start + cds['stop'] = stop + cds['strand'] = strand + cds['frame'] = (start - 1) % 3 + 1 if strand == bc.STRAND_FORWARD else (contig['length'] - stop) % 3 + 1 + cds['gene'] = None + cds['product'] = None + cds['db_xrefs'] = [so.SO_CDS.id] + cds['nt'] = nt + cds['aa'] = aa + cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + return cds + + def create_cdss(genes, contig): partial_cdss_per_sequence = [] cdss_per_sequence = [] for gene in genes: - cds = OrderedDict() - cds['type'] = bc.FEATURE_CDS - cds['contig'] = contig['id'] - cds['start'] = gene.begin - cds['stop'] = gene.end - cds['strand'] = bc.STRAND_FORWARD if gene.strand == 1 else bc.STRAND_REVERSE - cds['gene'] = None - cds['product'] = None + strand = bc.STRAND_FORWARD if gene.strand == 1 else bc.STRAND_REVERSE + cds = create_cds(contig, gene.begin, gene.end, strand, '', '') cds['start_type'] = gene.start_type cds['rbs_motif'] = gene.rbs_motif - cds['db_xrefs'] = [so.SO_CDS.id] - if(cds['strand'] == bc.STRAND_FORWARD): - cds['frame'] = (cds['start'] - 1) % 3 + 1 - else: - cds['frame'] = (contig['length'] - cds['stop']) % 3 + 1 - if gene.partial_begin: cds['truncated'] = bc.FEATURE_END_5_PRIME if cds['strand'] == bc.STRAND_FORWARD else bc.FEATURE_END_3_PRIME partial_cdss_per_sequence.append(cds) @@ -119,6 +124,7 @@ def create_cdss(genes, contig): aa = aa[:-1] # discard trailing asterisk cds['aa'] = aa cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + log.info( 'contig=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s', cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds.get('truncated', 'no'), cds['start_type'], cds['rbs_motif'] @@ -168,24 +174,25 @@ def create_cdss(genes, contig): def import_user_cdss(genome: dict, import_path: Path): - """Imports user-provided CDS regions or features. - Regions are mere coordinates subject to subsequent internal annotation. Features comprise regions and existing functional annotation. - Both regions and annotations will be imported. Functional annotations are stored in a 'user' space and supersede internal function annotation information during the final annotation process. - + """Import user-provided CDS regions. + Only CDS region information are imported skipping any existing functional annotations. + Parameters ---------- + genome : dict + Genome dictionary holding sequence information (contigs) import_path : Path Path to GFF3 or Genbank file with regions or features. Returns ------- list - a list of CDS features - with or without functional annotation. + a list of CDS features - without functional annotations. """ user_cdss = [] contigs_by_original_id = {c['orig_id']: c for c in genome['contigs']} file_suffix = import_path.suffix.lower() - if(file_suffix in ['.gff', '.gff3']): + if(file_suffix in ['.gff', '.gff3']): # parse GFF3 format try: with xopen(str(import_path), threads=0) as fh_in: skip_lines = False @@ -199,17 +206,8 @@ def import_user_cdss(genome: dict, import_path: Path): contig_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t') if(feature_type.lower() == 'cds'): contig = contigs_by_original_id[contig_id] - user_cds = OrderedDict() - user_cds['type'] = bc.FEATURE_CDS + user_cds = create_cds(contig, int(start), int(stop), strand, '', '') user_cds['source'] = bc.CDS_SOURCE_USER - user_cds['contig'] = contig['id'] - user_cds['start'] = int(start) - user_cds['stop'] = int(stop) - user_cds['strand'] = strand - user_cds['gene'] = None - user_cds['product'] = None - user_cds['db_xrefs'] = [so.SO_CDS.id] - user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1 try: nt = bu.extract_feature_sequence(user_cds, contig) user_cds['nt'] = nt @@ -217,13 +215,13 @@ def import_user_cdss(genome: dict, import_path: Path): log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop']) raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}") try: - dna_seq = Seq(nt) - aa = str(dna_seq.translate(table=cfg.translation_table, cds=True)) + aa = str(Seq(nt).translate(table=cfg.translation_table, cds=True)) + user_cds['aa'] = aa + user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) except: log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt) raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") - user_cds['aa'] = aa - user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + log.info( 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] @@ -232,38 +230,30 @@ def import_user_cdss(genome: dict, import_path: Path): except Exception as e: log.error('user-provided regions/features file GFF3 format not valid!', exc_info=True) sys.exit(f'ERROR: User-provided regions/features file GFF3 format not valid!') - elif(file_suffix in ['.gb', '.genbank', '.gbk', '.gbff']): + elif(file_suffix in ['.gb', '.genbank', '.gbk', '.gbff']): # parse GenBank format try: with xopen(str(import_path), threads=0) as fh_in: for record in SeqIO.parse(fh_in, 'genbank'): for feature in record.features: if(feature.type.lower() == 'cds' and 'pseudo' not in feature.qualifiers and bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers): contig = contigs_by_original_id[record.id] - user_cds = OrderedDict() - user_cds['type'] = bc.FEATURE_CDS + strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE + user_cds = create_cds(contig, feature.location.start + 1, feature.location.end, strand, '', '') user_cds['source'] = bc.CDS_SOURCE_USER - user_cds['contig'] = contig['id'] - user_cds['start'] = feature.location.start + 1 - user_cds['stop'] = feature.location.end - user_cds['strand'] = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE - user_cds['gene'] = None - user_cds['product'] = None - user_cds['db_xrefs'] = [so.SO_CDS.id] - user_cds['frame'] = (user_cds['start'] - 1) % 3 + 1 if user_cds['strand'] == bc.STRAND_FORWARD else (contig['length'] - user_cds['stop']) % 3 + 1 try: nt = bu.extract_feature_sequence(user_cds, contig) user_cds['nt'] = nt except: log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop']) raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}") - dna_seq = Seq(nt) try: - aa = str(dna_seq.translate(table=cfg.translation_table, cds=True)) + aa = str(Seq(nt).translate(table=cfg.translation_table, cds=True)) + user_cds['aa'] = aa + user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) except: log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt) raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") - user_cds['aa'] = aa - user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + log.info( 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] From 424c8ce7fd5715e040e137d8586b053fedf9bfae Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 17 Nov 2023 10:44:21 +0100 Subject: [PATCH 11/17] bump version to v1.9.0-beta --- bakta/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bakta/__init__.py b/bakta/__init__.py index 169201b5..e92ada64 100644 --- a/bakta/__init__.py +++ b/bakta/__init__.py @@ -1,2 +1,2 @@ -__version__ = '1.8.2' +__version__ = '1.9.0-beta' __db_schema_version__ = 5 From 3b07ff2a15d0048b1ac2164478d7e8d38711da91 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 17 Nov 2023 10:57:28 +0100 Subject: [PATCH 12/17] update readme --- README.md | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 458a3f21..8e97cf40 100644 --- a/README.md +++ b/README.md @@ -226,9 +226,13 @@ NODE_3 | p2 | `p` | `c` | `pXYZ2` NODE_4 | special-contig-name-xyz | `-` | - NODE_5 | `` | `-` | - -#### User provided protein sequences +#### User-provided regions -Bakta accepts user provided trusted protein sequences via `--proteins` in either GenBank (CDS features) or Fasta format. Using the Fasta format, each reference sequence can be provided in a short or long format: +Bakta accepts pre-annotated, user-provided feature regions via `--regions` in either GFF3 or GenBank format. These features supersede all de novo-predicted regions, but are equally subject to the internal functional annotation process. Currently, only `CDS` are supported. If you would like to provide custom functional annotations, you can provide these via `--proteins` which is described in the following section. + +#### User-provided protein sequences + +Bakta accepts user-provided trusted protein sequences via `--proteins` in either GenBank (CDS features) or Fasta format which are used in the functional annotation process. Using the Fasta format, each reference sequence can be provided in a short or long format: ```bash # short: @@ -324,7 +328,7 @@ Exemplary annotation result files for several genomes (mostly ESKAPE species) ar usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT] [--genus GENUS] [--species SPECIES] [--strain STRAIN] [--plasmid PLASMID] [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4}] [--gram {+,-,?}] [--locus LOCUS] - [--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--proteins PROTEINS] [--meta] + [--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--proteins PROTEINS] [--meta] [--regions REGIONS] [--skip-trna] [--skip-tmrna] [--skip-rrna] [--skip-ncrna] [--skip-ncrna-region] [--skip-crispr] [--skip-cds] [--skip-pseudo] [--skip-sorf] [--skip-gap] [--skip-ori] [--skip-plot] [--help] [--verbose] [--debug] [--threads THREADS] [--tmp-dir TMP_DIR] [--version] @@ -368,6 +372,7 @@ Annotation: --compliant Force Genbank/ENA/DDJB compliance --proteins PROTEINS Fasta file of trusted protein sequences for CDS annotation --meta Run in metagenome mode. This only affects CDS prediction. + --regions REGIONS Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations). Workflow: --skip-trna Skip tRNA detection & annotation @@ -422,7 +427,7 @@ ncRNA (cis-regulatory) region types: ### Coding sequences -The structural prediction is conducted via Pyrodigal and complemented by a custom detection of sORF < 30 aa. +The structural prediction is conducted via Pyrodigal and complemented by a custom detection of sORF < 30 aa. In addition, superseding regions of pre-predicted CDS can be provided via `--regions`. To rapidly identify known protein sequences with exact sequence matches and to conduct a comprehensive annotations, Bakta utilizes a compact read-only SQLite database comprising protein sequence digests and pre-assigned annotations for millions of known protein sequences and clusters. @@ -435,22 +440,23 @@ Conceptual terms: **CDS**: -1. Prediction via Pyrodigal respecting sequences' completeness (distinct prediction for complete replicons and uncompleted contigs) +1. De novo-prediction via Pyrodigal respecting sequences' completeness (distinct prediction for complete replicons and uncompleted contigs) 2. Discard spurious CDS via AntiFam 3. Detect translational exceptions (selenocysteines) -4. Detection of UPSs via MD5 digests and lookup of related IPS and PCS -5. Sequence alignments of remainder via Diamond vs. PSC (query/subject coverage=0.8, identity=0.5) -6. Assignment to UniRef90 or UniRef50 clusters if alignment hits achieve identities larger than 0.9 or 0.5, respectively -7. Execution of expert systems: +4. Import of superseding user-provided CDS regions (optional) +5. Detection of UPSs via MD5 digests and lookup of related IPS and PCS +6. Sequence alignments of remainder via Diamond vs. PSC (query/subject coverage=0.8, identity=0.5) +7. Assignment to UniRef90 or UniRef50 clusters if alignment hits achieve identities larger than 0.9 or 0.5, respectively +8. Execution of expert systems: - AMR: AMRFinderPlus - Expert proteins: NCBI BlastRules, VFDB - User proteins (optionally via `--proteins `) -8. Prediction of signal peptides (optionally via `--gram <+/->`) -9. Detection of pseudogenes: +9. Prediction of signal peptides (optionally via `--gram <+/->`) +10. Detection of pseudogenes: 1. Search for reference PCSs using `hypothetical` CDS as seed sequences 2. Translated alignment (blastx) of reference PCSs against up-/downstream-elongated CDS regions 3. Analysis of translated alignments and detection of pseudogenization causes & effects -10. Combination of IPS, PSC, PSCC and expert system information favouring more specific annotations and avoiding redundancy +11. Combination of IPS, PSC, PSCC and expert system information favouring more specific annotations and avoiding redundancy CDS without IPS or PSC hits as well as those without gene symbols or product descriptions different from `hypothetical` will be marked as `hypothetical`. From 02138919bd4f6d2cc74184f2e4c8801d4afbf71f Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 17 Nov 2023 12:14:35 +0100 Subject: [PATCH 13/17] polish test cases --- test/data/NC_002127.1-region.gbff | 15 +++++++++++++-- test/data/NC_002127.1-region.gff3 | 3 ++- test/test_regions.py | 5 ++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/test/data/NC_002127.1-region.gbff b/test/data/NC_002127.1-region.gbff index 1cc0907d..651ea786 100644 --- a/test/data/NC_002127.1-region.gbff +++ b/test/data/NC_002127.1-region.gbff @@ -8,10 +8,9 @@ FEATURES Location/Qualifiers source 1..3306 /mol_type="genomic DNA" /plasmid="unnamed1" - gene 32..736 - /locus_tag="DOGAIA_00005" CDS 32..736 /locus_tag="DOGAIA_00005" + /product="alternative downstream start codon test case 1" /translation="MVACSSASSASSFSSSVRLWLFMNPAMLSAVCCCL FIFLFSPFCLSSASCDYIAHHFSTVLPPVFCRRTFQSDNTVTAKKQQCFVGNSNLQTGQ DVQLLYRAYMHAITITLVRASPRHPMSDTEPCFMTKRSGSNTRRRAISRPVRLTAEEDQ @@ -19,6 +18,18 @@ FEATURES Location/Qualifiers YAEVLIAITEYHRALLSRLMAD" /codon_start=1 /transl_table=11 + CDS complement(1348..2388) + /locus_tag="DOGAIA_00015" + /product="alternative downstream start codon test case 2" + /translation="MPSGGNTTIGRASGQNNTYFDEIALIIKENCLYSDTKNFEYTIPK + FSDDDRANLFEFLSEEGITITEDNNNDPNCKHQYIMTTSNGDRVRAKIYKRGSIQFQGK + YLQIASLINDFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK + QLSCSLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNLGEYFTENKPK + YIIREIHQETINGEIAEVLCECYTYWHENRHGLFHMKPGIADTKTINKLESIAIIDTVC + QLIDGGVARLKL" + /codon_start=1 + /transl_table=11 + ORIGIN 1 ttcttctgcg agttcgtgca gcttctcaca catggtggcc tgctcgtcag catcgagtgc 61 gtccagtttt tcgagcagcg tcaggctctg gctttttatg aatcccgcca tgttgagtgc diff --git a/test/data/NC_002127.1-region.gff3 b/test/data/NC_002127.1-region.gff3 index 7ad6e5c8..a33c1602 100644 --- a/test/data/NC_002127.1-region.gff3 +++ b/test/data/NC_002127.1-region.gff3 @@ -7,7 +7,8 @@ # URL: github.com/oschwengers/bakta ##sequence-region NC_002127.1 1 3306 NC_002127.1 Bakta region 1 3306 . + . ID=NC_002127.1;Name=NC_002127.1;Is_circular=true -NC_002127.1 ? CDS 32 736 . + 0 ID=DOGAIA_00005 +NC_002127.1 ? CDS 32 736 . + 0 ID=DOGAIA_00005;NAME=alternative downstream start codon test case 1 +NC_002127.1 ? CDS 1348 2229 . - 0 ID=DOGAIA_00015;NAME=alternative downstream start codon test case 2 ##FASTA >NC_002127.1 TTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGTGGCCTGCTCGTCAGCATCGAGTGC diff --git a/test/test_regions.py b/test/test_regions.py index dbc1e1c1..d12c6319 100644 --- a/test/test_regions.py +++ b/test/test_regions.py @@ -38,6 +38,9 @@ def test_bakta_plasmid(regions, tmpdir): cdss = [feat for feat in results['features'] if feat['type'] == 'cds'] assert len(cdss) == 3 for cds in cdss: - if(cds['stop'] == 736): + if(cds['strand'] == '+' and cds['stop'] == 736): # test case 1: alternative downstream start codon from 2 to 32 on + strand assert cds['start'] != 2 # de novo-predicted CDS start assert cds['start'] == 32 # user-provided CDS start + elif(cds['strand'] == '-' and cds['start'] == 1348): # test case 2: alternative downstream start codon from 2388 to 2229 on - strand + assert cds['stop'] != 2388 # de novo-predicted CDS start + assert cds['stop'] == 2229 # user-provided CDS start From b09843e92293a1ea38d4b9d82d03e97d49406df0 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 17 Nov 2023 12:55:01 +0100 Subject: [PATCH 14/17] add overlapping CDS test case --- test/data/NC_002127.1-region.gbff | 18 ++++++------------ test/data/NC_002127.1-region.gff3 | 1 + test/test_regions.py | 6 ++++-- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/test/data/NC_002127.1-region.gbff b/test/data/NC_002127.1-region.gbff index 651ea786..ea30d1a6 100644 --- a/test/data/NC_002127.1-region.gbff +++ b/test/data/NC_002127.1-region.gbff @@ -11,22 +11,16 @@ FEATURES Location/Qualifiers CDS 32..736 /locus_tag="DOGAIA_00005" /product="alternative downstream start codon test case 1" - /translation="MVACSSASSASSFSSSVRLWLFMNPAMLSAVCCCL - FIFLFSPFCLSSASCDYIAHHFSTVLPPVFCRRTFQSDNTVTAKKQQCFVGNSNLQTGQ - DVQLLYRAYMHAITITLVRASPRHPMSDTEPCFMTKRSGSNTRRRAISRPVRLTAEEDQ - EIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVMRLGALQKKLFIDGKRVGDRE - YAEVLIAITEYHRALLSRLMAD" /codon_start=1 /transl_table=11 - CDS complement(1348..2388) + CDS 981..1121 + /locus_tag="DOGAIA_00006" + /product="additional overlapping CDS test case 3" + /codon_start=1 + /transl_table=11 + CDS complement(1348..2229) /locus_tag="DOGAIA_00015" /product="alternative downstream start codon test case 2" - /translation="MPSGGNTTIGRASGQNNTYFDEIALIIKENCLYSDTKNFEYTIPK - FSDDDRANLFEFLSEEGITITEDNNNDPNCKHQYIMTTSNGDRVRAKIYKRGSIQFQGK - YLQIASLINDFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK - QLSCSLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNLGEYFTENKPK - YIIREIHQETINGEIAEVLCECYTYWHENRHGLFHMKPGIADTKTINKLESIAIIDTVC - QLIDGGVARLKL" /codon_start=1 /transl_table=11 diff --git a/test/data/NC_002127.1-region.gff3 b/test/data/NC_002127.1-region.gff3 index a33c1602..3772ade6 100644 --- a/test/data/NC_002127.1-region.gff3 +++ b/test/data/NC_002127.1-region.gff3 @@ -8,6 +8,7 @@ ##sequence-region NC_002127.1 1 3306 NC_002127.1 Bakta region 1 3306 . + . ID=NC_002127.1;Name=NC_002127.1;Is_circular=true NC_002127.1 ? CDS 32 736 . + 0 ID=DOGAIA_00005;NAME=alternative downstream start codon test case 1 +NC_002127.1 ? CDS 981 1121 . + 0 ID=DOGAIA_00006;NAME=additional overlapping CDS test case 3 NC_002127.1 ? CDS 1348 2229 . - 0 ID=DOGAIA_00015;NAME=alternative downstream start codon test case 2 ##FASTA >NC_002127.1 diff --git a/test/test_regions.py b/test/test_regions.py index d12c6319..6b74bcd2 100644 --- a/test/test_regions.py +++ b/test/test_regions.py @@ -38,9 +38,11 @@ def test_bakta_plasmid(regions, tmpdir): cdss = [feat for feat in results['features'] if feat['type'] == 'cds'] assert len(cdss) == 3 for cds in cdss: - if(cds['strand'] == '+' and cds['stop'] == 736): # test case 1: alternative downstream start codon from 2 to 32 on + strand + if(cds['strand'] == '+' and cds['stop'] == 736): # test case 1: alternative downstream start codon +[32,736] superseding de novo-predicted CDS +[2,736] assert cds['start'] != 2 # de novo-predicted CDS start assert cds['start'] == 32 # user-provided CDS start - elif(cds['strand'] == '-' and cds['start'] == 1348): # test case 2: alternative downstream start codon from 2388 to 2229 on - strand + elif(cds['strand'] == '-' and cds['start'] == 1348): # test case 2: alternative downstream start codon -[1348,2229] superseding de novo-predicted CDS -[1348,2388] assert cds['stop'] != 2388 # de novo-predicted CDS start assert cds['stop'] == 2229 # user-provided CDS start + elif(cds['strand'] == '+' and cds['start'] == 981): # test case 3: user-provided CDS +[981,1121] overlapping de novo-predicted CDS -[971,1351] + assert cds['stop'] == 1121 # user-provided overlapping CDS -> de novo-predicted was filtered-out From d805afd1d1e50ae9d7a3aab8d5b59aa94b29a37c Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 17 Nov 2023 12:59:33 +0100 Subject: [PATCH 15/17] distinguish de novo from user-provided ORF aa seqs --- bakta/features/orf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bakta/features/orf.py b/bakta/features/orf.py index 13f1e5f7..f36a22cb 100644 --- a/bakta/features/orf.py +++ b/bakta/features/orf.py @@ -48,7 +48,7 @@ def detect_spurious(orfs: Sequence[dict]): def get_orf_key(orf: dict) -> str: """Generate a standardized and unique ORF-like feature key for internal store/analyze/parse/retrieval cycles.""" - return f"{orf['aa_hexdigest']}-{orf['contig']}-{orf['start']}-{orf['stop']}-{orf['strand']}" + return f"{orf['aa_hexdigest']}-{orf['contig']}-{orf['start']}-{orf['stop']}-{orf['strand']}-{orf.get('source', 'internal')}" def get_orf_dictionary(orfs: Sequence[dict]) -> Dict[str, dict]: From a632ced9e5e540613e7059a75a5bf28d2d460a77 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 17 Nov 2023 13:08:00 +0100 Subject: [PATCH 16/17] refactor tests for better readbility --- test/test_regions.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/test/test_regions.py b/test/test_regions.py index 6b74bcd2..b457c80a 100644 --- a/test/test_regions.py +++ b/test/test_regions.py @@ -37,12 +37,21 @@ def test_bakta_plasmid(regions, tmpdir): assert results is not None cdss = [feat for feat in results['features'] if feat['type'] == 'cds'] assert len(cdss) == 3 - for cds in cdss: - if(cds['strand'] == '+' and cds['stop'] == 736): # test case 1: alternative downstream start codon +[32,736] superseding de novo-predicted CDS +[2,736] - assert cds['start'] != 2 # de novo-predicted CDS start - assert cds['start'] == 32 # user-provided CDS start - elif(cds['strand'] == '-' and cds['start'] == 1348): # test case 2: alternative downstream start codon -[1348,2229] superseding de novo-predicted CDS -[1348,2388] - assert cds['stop'] != 2388 # de novo-predicted CDS start - assert cds['stop'] == 2229 # user-provided CDS start - elif(cds['strand'] == '+' and cds['start'] == 981): # test case 3: user-provided CDS +[981,1121] overlapping de novo-predicted CDS -[971,1351] - assert cds['stop'] == 1121 # user-provided overlapping CDS -> de novo-predicted was filtered-out + + # test case 1: alternative downstream start codon +[32,736] superseding de novo-predicted CDS +[2,736] + provided_cds_1 = [cds for cds in cdss if cds['start'] == 32 and cds['stop'] == 736 and cds['strand'] == '+'] + assert len(provided_cds_1) == 1 + denovo_cds_1 = [cds for cds in cdss if cds['start'] == 2 and cds['stop'] == 736 and cds['strand'] == '+'] + assert len(denovo_cds_1) == 0 # de novo-predicted was filtered-out + + # test case 2: alternative downstream start codon -[1348,2229] superseding de novo-predicted CDS -[1348,2388] + provided_cds_2 = [cds for cds in cdss if cds['start'] == 1348 and cds['stop'] == 2229 and cds['strand'] == '-'] + assert len(provided_cds_2) == 1 + denovo_cds_2 = [cds for cds in cdss if cds['start'] == 1348 and cds['stop'] == 2388 and cds['strand'] == '-'] + assert len(denovo_cds_2) == 0 # de novo-predicted was filtered-out + + # test case 3: user-provided CDS +[981,1121] overlapping de novo-predicted CDS -[971,1351] + provided_cds_3 = [cds for cds in cdss if cds['start'] == 981 and cds['stop'] == 1121 and cds['strand'] == '+'] + assert len(provided_cds_3) == 1 + denovo_cds_3 = [cds for cds in cdss if cds['start'] == 971 and cds['stop'] == 1351 and cds['strand'] == '-'] + assert len(denovo_cds_3) == 0 # de novo-predicted was filtered-out From b25e3dbaae3c3d3b66707a5f153b56d2bc69544d Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Fri, 17 Nov 2023 16:14:56 +0100 Subject: [PATCH 17/17] amend readme [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8e97cf40..b01c94b7 100644 --- a/README.md +++ b/README.md @@ -228,7 +228,7 @@ NODE_5 | `` | `-` | - #### User-provided regions -Bakta accepts pre-annotated, user-provided feature regions via `--regions` in either GFF3 or GenBank format. These features supersede all de novo-predicted regions, but are equally subject to the internal functional annotation process. Currently, only `CDS` are supported. If you would like to provide custom functional annotations, you can provide these via `--proteins` which is described in the following section. +Bakta accepts pre-annotated, user-provided feature regions via `--regions` in either GFF3 or GenBank format. These regions supersede all de novo-predicted regions, but are equally subject to the internal functional annotation process. Currently, only `CDS` are supported. A maximum overlap with de novo-predicted CDS of 30 bp is allowed. If you would like to provide custom functional annotations, you can provide these via `--proteins` which is described in the following section. #### User-provided protein sequences