diff --git a/README.md b/README.md index 458a3f21..b01c94b7 100644 --- a/README.md +++ b/README.md @@ -226,9 +226,13 @@ NODE_3 | p2 | `p` | `c` | `pXYZ2` NODE_4 | special-contig-name-xyz | `-` | - NODE_5 | `` | `-` | - -#### User provided protein sequences +#### User-provided regions -Bakta accepts user provided trusted protein sequences via `--proteins` in either GenBank (CDS features) or Fasta format. Using the Fasta format, each reference sequence can be provided in a short or long format: +Bakta accepts pre-annotated, user-provided feature regions via `--regions` in either GFF3 or GenBank format. These regions supersede all de novo-predicted regions, but are equally subject to the internal functional annotation process. Currently, only `CDS` are supported. A maximum overlap with de novo-predicted CDS of 30 bp is allowed. If you would like to provide custom functional annotations, you can provide these via `--proteins` which is described in the following section. + +#### User-provided protein sequences + +Bakta accepts user-provided trusted protein sequences via `--proteins` in either GenBank (CDS features) or Fasta format which are used in the functional annotation process. Using the Fasta format, each reference sequence can be provided in a short or long format: ```bash # short: @@ -324,7 +328,7 @@ Exemplary annotation result files for several genomes (mostly ESKAPE species) ar usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] [--prefix PREFIX] [--output OUTPUT] [--genus GENUS] [--species SPECIES] [--strain STRAIN] [--plasmid PLASMID] [--complete] [--prodigal-tf PRODIGAL_TF] [--translation-table {11,4}] [--gram {+,-,?}] [--locus LOCUS] - [--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--proteins PROTEINS] [--meta] + [--locus-tag LOCUS_TAG] [--keep-contig-headers] [--replicons REPLICONS] [--compliant] [--proteins PROTEINS] [--meta] [--regions REGIONS] [--skip-trna] [--skip-tmrna] [--skip-rrna] [--skip-ncrna] [--skip-ncrna-region] [--skip-crispr] [--skip-cds] [--skip-pseudo] [--skip-sorf] [--skip-gap] [--skip-ori] [--skip-plot] [--help] [--verbose] [--debug] [--threads THREADS] [--tmp-dir TMP_DIR] [--version] @@ -368,6 +372,7 @@ Annotation: --compliant Force Genbank/ENA/DDJB compliance --proteins PROTEINS Fasta file of trusted protein sequences for CDS annotation --meta Run in metagenome mode. This only affects CDS prediction. + --regions REGIONS Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations). Workflow: --skip-trna Skip tRNA detection & annotation @@ -422,7 +427,7 @@ ncRNA (cis-regulatory) region types: ### Coding sequences -The structural prediction is conducted via Pyrodigal and complemented by a custom detection of sORF < 30 aa. +The structural prediction is conducted via Pyrodigal and complemented by a custom detection of sORF < 30 aa. In addition, superseding regions of pre-predicted CDS can be provided via `--regions`. To rapidly identify known protein sequences with exact sequence matches and to conduct a comprehensive annotations, Bakta utilizes a compact read-only SQLite database comprising protein sequence digests and pre-assigned annotations for millions of known protein sequences and clusters. @@ -435,22 +440,23 @@ Conceptual terms: **CDS**: -1. Prediction via Pyrodigal respecting sequences' completeness (distinct prediction for complete replicons and uncompleted contigs) +1. De novo-prediction via Pyrodigal respecting sequences' completeness (distinct prediction for complete replicons and uncompleted contigs) 2. Discard spurious CDS via AntiFam 3. Detect translational exceptions (selenocysteines) -4. Detection of UPSs via MD5 digests and lookup of related IPS and PCS -5. Sequence alignments of remainder via Diamond vs. PSC (query/subject coverage=0.8, identity=0.5) -6. Assignment to UniRef90 or UniRef50 clusters if alignment hits achieve identities larger than 0.9 or 0.5, respectively -7. Execution of expert systems: +4. Import of superseding user-provided CDS regions (optional) +5. Detection of UPSs via MD5 digests and lookup of related IPS and PCS +6. Sequence alignments of remainder via Diamond vs. PSC (query/subject coverage=0.8, identity=0.5) +7. Assignment to UniRef90 or UniRef50 clusters if alignment hits achieve identities larger than 0.9 or 0.5, respectively +8. Execution of expert systems: - AMR: AMRFinderPlus - Expert proteins: NCBI BlastRules, VFDB - User proteins (optionally via `--proteins `) -8. Prediction of signal peptides (optionally via `--gram <+/->`) -9. Detection of pseudogenes: +9. Prediction of signal peptides (optionally via `--gram <+/->`) +10. Detection of pseudogenes: 1. Search for reference PCSs using `hypothetical` CDS as seed sequences 2. Translated alignment (blastx) of reference PCSs against up-/downstream-elongated CDS regions 3. Analysis of translated alignments and detection of pseudogenization causes & effects -10. Combination of IPS, PSC, PSCC and expert system information favouring more specific annotations and avoiding redundancy +11. Combination of IPS, PSC, PSCC and expert system information favouring more specific annotations and avoiding redundancy CDS without IPS or PSC hits as well as those without gene symbols or product descriptions different from `hypothetical` will be marked as `hypothetical`. diff --git a/bakta/__init__.py b/bakta/__init__.py index 169201b5..e92ada64 100644 --- a/bakta/__init__.py +++ b/bakta/__init__.py @@ -1,2 +1,2 @@ -__version__ = '1.8.2' +__version__ = '1.9.0-beta' __db_schema_version__ = 5 diff --git a/bakta/config.py b/bakta/config.py index 8c6866e7..0bf01990 100644 --- a/bakta/config.py +++ b/bakta/config.py @@ -55,6 +55,7 @@ compliant = None user_proteins = None meta = None +regions = None # workflow configuration skip_trna = None @@ -160,7 +161,7 @@ def setup(args): taxon = None # annotation configurations - global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, gram, replicons, compliant, user_proteins, meta + global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, gram, replicons, compliant, user_proteins, meta, regions complete = args.complete log.info('complete=%s', complete) prodigal_tf = args.prodigal_tf @@ -232,6 +233,19 @@ def setup(args): sys.exit(f'ERROR: replicon table file ({replicons}) not valid!') log.info('replicon-table=%s', replicons) user_proteins = check_user_proteins(args) + regions = args.regions + if(regions is not None): + try: + if(regions == ''): + raise ValueError('File path argument must be non-empty') + regions_path = Path(args.regions).resolve() + check_readability('regions', regions_path) + check_content_size('regions', regions_path) + regions = regions_path + except: + log.error('provided regions file not valid! path=%s', regions) + sys.exit(f'ERROR: regions file ({regions}) not valid!') + log.info('regions=%s', regions) # workflow configurations diff --git a/bakta/constants.py b/bakta/constants.py index 7e3d5598..0a612b6a 100644 --- a/bakta/constants.py +++ b/bakta/constants.py @@ -19,7 +19,7 @@ ############################################################################ -# Protein identification settings +# CDS settings ############################################################################ MIN_PSCC_IDENTITY = 0.5 # min protein identity for PSC detection MIN_PSC_COVERAGE = 0.8 # min protein coverage for PSC detection @@ -27,6 +27,8 @@ MIN_SORF_COVERAGE = 0.9 # min sORF coverage for PSC detection MIN_SORF_IDENTITY = 0.9 # min sORF identity for PSC detection HYPOTHETICAL_PROTEIN = 'hypothetical protein' # hypothetical protein product description +CDS_MAX_OVERLAPS = 30 # max overlap [bp] allowed for user-provided/de novo-predicted CDS overlaps +CDS_SOURCE_USER = 'user-provided' ############################################################################ diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py index 0f9412ee..fd99761a 100644 --- a/bakta/features/annotation.py +++ b/bakta/features/annotation.py @@ -169,8 +169,12 @@ def detect_feature_overlaps(genome: dict): crispr_arrays = contig_crispr_arrays[crispr_array['contig']] crispr_arrays.append(crispr_array) contig_cdss = {k['id']: [] for k in genome['contigs']} + contig_cdss_user_provided = {k['id']: [] for k in genome['contigs']} for cds in genome['features'].get(bc.FEATURE_CDS, []): - cdss = contig_cdss[cds['contig']] + if(cds.get('source', None) == bc.CDS_SOURCE_USER): + cdss = contig_cdss_user_provided[cds['contig']] + else: + cdss = contig_cdss[cds['contig']] cdss.append(cds) contig_sorfs = {k['id']: [] for k in genome['contigs']} for sorf in genome['features'].get(bc.FEATURE_SORF, []): @@ -217,7 +221,7 @@ def detect_feature_overlaps(genome: dict): ncRNA_region['product'], ncRNA_region['start'], ncRNA_region['stop'], ncRNA_region_overlap['product'], ncRNA_region_overlap['start'], ncRNA_region_overlap['stop'], overlap, ncRNA_region['contig'], ncRNA_region['score'], ncRNA_region_overlap['score'] ) - # mark CDS overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs + # mark de novo-predicted CDS overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs and user-provided CDS for cds in contig_cdss[contig['id']]: # tmRNA overlaps for tmRNA in contig_tm_rnas[contig['id']]: @@ -279,6 +283,23 @@ def detect_feature_overlaps(genome: dict): "overlap: CDS (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, contig=%s", cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], crispr['start'], crispr['stop'], overlap, cds['contig'] ) + # user-provided CDS overlaps + for cds_user_provided in contig_cdss_user_provided[contig['id']]: + if(cds['stop'] < cds_user_provided['start'] or cds['start'] > cds_user_provided['stop']): + continue + else: # overlap -> remove cds + overlap = min(cds['stop'], cds_user_provided['stop']) - max(cds['start'], cds_user_provided['start']) + 1 + if(overlap > bc.CDS_MAX_OVERLAPS): + overlap = f"[{max(cds['start'], cds_user_provided['start'])},{min(cds['stop'], cds_user_provided['stop'])}]" + cds['discarded'] = { + 'type': bc.DISCARD_TYPE_OVERLAP, + 'feature_type': bc.FEATURE_CDS, + 'description': f'overlaps {bc.FEATURE_CDS} at {overlap}' + } + log.info( + "overlap: de novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s", + cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['contig'] + ) # remove sORF overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs, inframe CDSs, shorter inframe sORFs for sorf in contig_sorfs[contig['id']]: @@ -342,7 +363,21 @@ def detect_feature_overlaps(genome: dict): "overlap: sORF (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, contig=%s", sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], crispr['start'], crispr['stop'], overlap, sorf['contig'] ) - # CDS overlaps (skipped as most overlaps are filtered in sORF detection) + # user-provided CDS overlaps + for cds_user_provided in contig_cdss_user_provided[contig['id']]: + if(sorf['stop'] < cds_user_provided['start'] or sorf['start'] > cds_user_provided['stop']): + continue + else: # overlap -> remove sorf + overlap = f"[{max(sorf['start'], cds_user_provided['start'])},{min(sorf['stop'], cds_user_provided['stop'])}]" + sorf['discarded'] = { + 'type': bc.DISCARD_TYPE_OVERLAP, + 'feature_type': bc.FEATURE_CDS, + 'description': f'overlaps {bc.FEATURE_CDS} at {overlap}' + } + log.info( + "overlap: sORF (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s", + sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, sorf['contig'] + ) # sORF overlaps for overlap_sorf in contig_sorfs[contig['id']]: diff --git a/bakta/features/cds.py b/bakta/features/cds.py index b18a2959..3dc08d97 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -7,12 +7,15 @@ from collections import OrderedDict from typing import Dict, Sequence, Set, Tuple, Union +from pathlib import Path import pyrodigal import pyhmmer +from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqUtils.ProtParam import ProteinAnalysis +from xopen import xopen import bakta.config as cfg import bakta.constants as bc @@ -83,26 +86,31 @@ def predict(genome: dict): return cdss +def create_cds(contig: dict, start: int, stop: int, strand: str, nt: str, aa: str): + cds = OrderedDict() + cds['type'] = bc.FEATURE_CDS + cds['contig'] = contig['id'] + cds['start'] = start + cds['stop'] = stop + cds['strand'] = strand + cds['frame'] = (start - 1) % 3 + 1 if strand == bc.STRAND_FORWARD else (contig['length'] - stop) % 3 + 1 + cds['gene'] = None + cds['product'] = None + cds['db_xrefs'] = [so.SO_CDS.id] + cds['nt'] = nt + cds['aa'] = aa + cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + return cds + + def create_cdss(genes, contig): partial_cdss_per_sequence = [] cdss_per_sequence = [] for gene in genes: - cds = OrderedDict() - cds['type'] = bc.FEATURE_CDS - cds['contig'] = contig['id'] - cds['start'] = gene.begin - cds['stop'] = gene.end - cds['strand'] = bc.STRAND_FORWARD if gene.strand == 1 else bc.STRAND_REVERSE - cds['gene'] = None - cds['product'] = None + strand = bc.STRAND_FORWARD if gene.strand == 1 else bc.STRAND_REVERSE + cds = create_cds(contig, gene.begin, gene.end, strand, '', '') cds['start_type'] = gene.start_type cds['rbs_motif'] = gene.rbs_motif - cds['db_xrefs'] = [so.SO_CDS.id] - if(cds['strand'] == bc.STRAND_FORWARD): - cds['frame'] = (cds['start'] - 1) % 3 + 1 - else: - cds['frame'] = (contig['length'] - cds['stop']) % 3 + 1 - if gene.partial_begin: cds['truncated'] = bc.FEATURE_END_5_PRIME if cds['strand'] == bc.STRAND_FORWARD else bc.FEATURE_END_3_PRIME partial_cdss_per_sequence.append(cds) @@ -116,6 +124,7 @@ def create_cdss(genes, contig): aa = aa[:-1] # discard trailing asterisk cds['aa'] = aa cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + log.info( 'contig=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s', cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds.get('truncated', 'no'), cds['start_type'], cds['rbs_motif'] @@ -164,6 +173,101 @@ def create_cdss(genes, contig): return cdss_per_sequence +def import_user_cdss(genome: dict, import_path: Path): + """Import user-provided CDS regions. + Only CDS region information are imported skipping any existing functional annotations. + + Parameters + ---------- + genome : dict + Genome dictionary holding sequence information (contigs) + import_path : Path + Path to GFF3 or Genbank file with regions or features. + + Returns + ------- + list + a list of CDS features - without functional annotations. + """ + user_cdss = [] + contigs_by_original_id = {c['orig_id']: c for c in genome['contigs']} + file_suffix = import_path.suffix.lower() + if(file_suffix in ['.gff', '.gff3']): # parse GFF3 format + try: + with xopen(str(import_path), threads=0) as fh_in: + skip_lines = False + for line in fh_in: + line = line.strip() + if(line == '##FASTA'): + skip_lines = True + elif(skip_lines or line[0] == '#'): + continue + else: + contig_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t') + if(feature_type.lower() == 'cds'): + contig = contigs_by_original_id[contig_id] + user_cds = create_cds(contig, int(start), int(stop), strand, '', '') + user_cds['source'] = bc.CDS_SOURCE_USER + try: + nt = bu.extract_feature_sequence(user_cds, contig) + user_cds['nt'] = nt + except: + log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop']) + raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}") + try: + aa = str(Seq(nt).translate(table=cfg.translation_table, cds=True)) + user_cds['aa'] = aa + user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + except: + log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt) + raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") + + log.info( + 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', + user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] + ) + user_cdss.append(user_cds) + except Exception as e: + log.error('user-provided regions/features file GFF3 format not valid!', exc_info=True) + sys.exit(f'ERROR: User-provided regions/features file GFF3 format not valid!') + elif(file_suffix in ['.gb', '.genbank', '.gbk', '.gbff']): # parse GenBank format + try: + with xopen(str(import_path), threads=0) as fh_in: + for record in SeqIO.parse(fh_in, 'genbank'): + for feature in record.features: + if(feature.type.lower() == 'cds' and 'pseudo' not in feature.qualifiers and bc.INSDC_FEATURE_PSEUDOGENE not in feature.qualifiers): + contig = contigs_by_original_id[record.id] + strand = bc.STRAND_FORWARD if feature.location.strand == +1 else bc.STRAND_REVERSE + user_cds = create_cds(contig, feature.location.start + 1, feature.location.end, strand, '', '') + user_cds['source'] = bc.CDS_SOURCE_USER + try: + nt = bu.extract_feature_sequence(user_cds, contig) + user_cds['nt'] = nt + except: + log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop']) + raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}") + try: + aa = str(Seq(nt).translate(table=cfg.translation_table, cds=True)) + user_cds['aa'] = aa + user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) + except: + log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt) + raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") + + log.info( + 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', + user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] + ) + user_cdss.append(user_cds) + except Exception as e: + log.error('user-provided regions/features file GenBank format not valid!', exc_info=True) + sys.exit(f'ERROR: User-provided regions/features file GenBank format not valid!') + else: + log.warn('user-provided regions/features file suffix not detected! suffix=%s, path=%s', file_suffix, str(import_path)) + + return user_cdss + + def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]: """Detect Pfam-A entries""" pfam_hits = [] diff --git a/bakta/features/orf.py b/bakta/features/orf.py index 13f1e5f7..f36a22cb 100644 --- a/bakta/features/orf.py +++ b/bakta/features/orf.py @@ -48,7 +48,7 @@ def detect_spurious(orfs: Sequence[dict]): def get_orf_key(orf: dict) -> str: """Generate a standardized and unique ORF-like feature key for internal store/analyze/parse/retrieval cycles.""" - return f"{orf['aa_hexdigest']}-{orf['contig']}-{orf['start']}-{orf['stop']}-{orf['strand']}" + return f"{orf['aa_hexdigest']}-{orf['contig']}-{orf['start']}-{orf['stop']}-{orf['strand']}-{orf.get('source', 'internal')}" def get_orf_dictionary(orfs: Sequence[dict]) -> Dict[str, dict]: diff --git a/bakta/io/gff.py b/bakta/io/gff.py index 25349eb7..f0368b84 100644 --- a/bakta/io/gff.py +++ b/bakta/io/gff.py @@ -221,6 +221,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat annotations['pseudo'] = True if(feat.get('gene', None)): # add gene annotation if available annotations['gene'] = feat['gene'] + source = '?' if feat.get('source', None) == bc.CDS_SOURCE_USER else 'Prodigal' if(cfg.compliant): gene_id = f"{feat['locus']}_gene" gene_annotations = { @@ -230,7 +231,10 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat if(feat.get('gene', None)): gene_annotations['gene'] = feat['gene'] annotations['Parent'] = gene_id - annotations['inference'] = 'ab initio prediction:Prodigal:2.6' + if(feat.get('source', None) == bc.CDS_SOURCE_USER): + annotations['inference'] = 'EXISTENCE:non-experimental evidence, no additional details recorded' + else: + annotations['inference'] = 'ab initio prediction:Prodigal:2.6' annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs']) # remove INSDC invalid DbXrefs annotations['Note'], ec_number = insdc.extract_ec_from_notes_insdc(annotations, 'Note') if(feat.get('pseudo', False)): @@ -239,7 +243,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat if(ec_number is not None): annotations['ec_number'] = ec_number gene_annotations = encode_annotations(gene_annotations) - fh.write(f"{feat['contig']}\tProdigal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") + fh.write(f"{feat['contig']}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") if('exception' in feat): ex = feat['exception'] pos = f"{ex['start']}..{ex['stop']}" @@ -251,7 +255,7 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat if('Notes' not in annotations): annotations['Note'] = notes annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tProdigal\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n") + fh.write(f"{feat['contig']}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n") if(bc.FEATURE_SIGNAL_PEPTIDE in feat): write_signal_peptide(fh, feat) elif(feat['type'] == bc.FEATURE_SORF): diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py index 3b287afd..ba8d630e 100644 --- a/bakta/io/insdc.py +++ b/bakta/io/insdc.py @@ -131,7 +131,13 @@ def write_insdc(genome: dict, features: Sequence[dict], genbank_output_path: Pat qualifiers['transl_table'] = cfg.translation_table insdc_feature_type = bc.INSDC_FEATURE_CDS inference = [] - inference.append('ab initio prediction:Prodigal:2.6' if feature['type'] == bc.FEATURE_CDS else f"ab initio prediction:Bakta:{'.'.join(bakta.__version__.split('.')[0:2])}") + if(feature['type'] == bc.FEATURE_CDS): + if(feature.get('source', None) == bc.CDS_SOURCE_USER): + inference.append('EXISTENCE:non-experimental evidence, no additional details recorded') + else: + inference.append('ab initio prediction:Prodigal:2.6') + else: + inference.append(f"ab initio prediction:Bakta:{'.'.join(bakta.__version__.split('.')[0:2])}") if('ncbi_nrp_id' in feature.get('ups', {})): nrp_id = feature['ups']['ncbi_nrp_id'] inference.append(f'similar to AA sequence:{bc.DB_XREF_REFSEQ_NRP}:{nrp_id}') diff --git a/bakta/main.py b/bakta/main.py index f0948348..3cfb3a4d 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -239,6 +239,12 @@ def main(): no_revised = feat_cds.revise_translational_exceptions(genome, cdss) print(f'\trevised translational exceptions: {no_revised}') cdss = [cds for cds in cdss if 'discarded' not in cds] + + if(cfg.regions): + log.debug('import user-provided CDS regions') + imported_cdss = feat_cds.import_user_cdss(genome, cfg.regions) + print(f'\timported CDS regions: {len(imported_cdss)}') + cdss.extend(imported_cdss) if(len(cdss) > 0): log.debug('lookup CDS UPS/IPS') diff --git a/bakta/utils.py b/bakta/utils.py index dc224bb2..240f62ef 100644 --- a/bakta/utils.py +++ b/bakta/utils.py @@ -91,6 +91,7 @@ def parse_arguments(): arg_group_annotation.add_argument('--compliant', action='store_true', help='Force Genbank/ENA/DDJB compliance') arg_group_annotation.add_argument('--proteins', action='store', default=None, dest='proteins', help='Fasta file of trusted protein sequences for CDS annotation') arg_group_annotation.add_argument('--meta', action='store_true', help='Run in metagenome mode. This only affects CDS prediction.') + arg_group_annotation.add_argument('--regions', action='store', default=None, help='Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations).') arg_group_workflow = parser.add_argument_group('Workflow') arg_group_workflow.add_argument('--skip-trna', action='store_true', dest='skip_trna', help='Skip tRNA detection & annotation') diff --git a/test/data/NC_002127.1-region.gbff b/test/data/NC_002127.1-region.gbff new file mode 100644 index 00000000..ea30d1a6 --- /dev/null +++ b/test/data/NC_002127.1-region.gbff @@ -0,0 +1,84 @@ +LOCUS NC_002127.1 3306 bp DNA circular BCT 16-NOV-2023 +DEFINITION plasmid unnamed1, complete sequence. +KEYWORDS . +SOURCE None + ORGANISM . + . +FEATURES Location/Qualifiers + source 1..3306 + /mol_type="genomic DNA" + /plasmid="unnamed1" + CDS 32..736 + /locus_tag="DOGAIA_00005" + /product="alternative downstream start codon test case 1" + /codon_start=1 + /transl_table=11 + CDS 981..1121 + /locus_tag="DOGAIA_00006" + /product="additional overlapping CDS test case 3" + /codon_start=1 + /transl_table=11 + CDS complement(1348..2229) + /locus_tag="DOGAIA_00015" + /product="alternative downstream start codon test case 2" + /codon_start=1 + /transl_table=11 + +ORIGIN + 1 ttcttctgcg agttcgtgca gcttctcaca catggtggcc tgctcgtcag catcgagtgc + 61 gtccagtttt tcgagcagcg tcaggctctg gctttttatg aatcccgcca tgttgagtgc + 121 agtttgctgc tgcttgttca tctttctgtt ttctccgttc tgtctgtcat ctgcgtcgtg + 181 tgattatatc gcgcaccact tttcgaccgt cttaccgccg gtattctgcc gacggacatt + 241 tcagtcagac aacactgtca ctgccaaaaa acagcagtgc tttgttggta attcgaactt + 301 gcagacagga caggatgtgc aattgttata ccgcgcatac atgcacgcta ttacaattac + 361 cctggtcagg gcttcgcccc gacaccccat gtcagatacg gagccatgtt ttatgacaaa + 421 acgaagtgga agtaatacgc gcaggcgggc tatcagtcgc cctgttcgtc tgacggcaga + 481 agaagaccag gaaatcagaa aaagggctgc tgaatgcggc aagaccgttt ctggtttttt + 541 acgggcggca gctctcggta agaaagttaa ctcactgact gatgaccggg tgctgaaaga + 601 agttatgcga ctgggggcgt tgcagaaaaa actctttatc gacggcaagc gtgtcgggga + 661 cagagagtat gcggaggtgc tgatcgctat tacggagtat caccgtgccc tgttatccag + 721 gcttatggca gattagcttc ccggagagaa actgtcgaaa acagacggta tgaacgccgt + 781 aagcccccaa accgatcgcc attcactttc atgcatagct atgcagtgag ctgaaagcga + 841 tcctgacgca tttttccggt ttaccccggg gaaaacatct ctttttgcgg tgtctgcgtc + 901 agaatcgcgt tcagcgcgtt ttggcggtgc gcgtaatgag acgttatggt aaatgtcttc + 961 tggcttgata ttatattgga atgccttttt tcaaagcaaa tgatgtggct ttggatagaa + 1021 ggtttacgtt gatcttatca aagttttttt taaagaacga agccgagagc tcagataaat + 1081 cattatattc atcagttttc gtaactttgt ttaatgtgta acttgaaaac ttctcgccat + 1141 taaatgacgt atagacgtaa cgatcttttt ttccaccgtt aggaattatt aaatcaaaaa + 1201 aaacatcacc cttgcttttc tttttcttca agtcggattc gatttttgag aaaaattcgc + 1261 tcgggctata aatatcagta gcatagacaa taaataaagt tttatcttta ttttttattg + 1321 cttctatttg atatttttta tcttttttca taatttcaac ctagctacgc caccatctat + 1381 taattggcaa acggtatcga tgattgcgat tgattctaat ttgttaattg tcttcgtgtc + 1441 agctattcct ggtttcatat gaaacaaacc atgcctgttc tcatgccagt aagtgtagca + 1501 ttcacacaaa acttccgcta tttcaccatt tatagtttct tggtgtattt ctctgattat + 1561 atatttgggt ttgttttcag tgaagtattc gccaaggttc tttgatgatg atggattgca + 1621 aacatcatta agtatttgat atataaagcc ttctatggct cttaatgcag agaagcagta + 1681 tgttgagtaa tcttccattt cgacatctat ttttttcatt attagcgagc atgatagctg + 1741 ttttttgata tcttcatgga ttttatcgat gctttttggt agtttgctat gcaactcgga + 1801 ctcaatagtt tcttttttta tgtcaacatt aaattcttta tttttttgtt cgacaatctc + 1861 tttcatgttt agtattgagc acatgaaatc gttaatcaaa ctcgcgattt gaaggtattt + 1921 tccttggaat tgaatagagc cgcgcttgta aatttttgcc ctgaccctgt caccattgct + 1981 ggtggtcata atatattggt gtttacaatt aggatcgtta ttattatctt ctgttattgt + 2041 tatcccctct tcagaaagaa attcaaatag atttgccctg tcatcatcac tgaattttgg + 2101 aatggtgtat tcaaagttct ttgtgtctga atacaaacag ttttctttta taatcaaggc + 2161 gatttcatca aagtaagtgt tattttgccc agacgctctt ccgatagtgg tatttccgcc + 2221 tgatggcatt agttttatta agaagtctat tcctttatat gtgccagata tgtgagtttc + 2281 tctttcgttt tttacattag aggaatagtt tgtgacgcca ttctgcgtca gagcagactc + 2341 aatcttgtca atattgatat ttagtgcttt aaacgggttc tgtgccattg ggtcaatccg + 2401 ttgttttttt tgaatatgta cagatcttgt ttttttgtca acggaatagc tgttcgttga + 2461 cttgatagac cgattgattc atcatctcat aaataaagaa aaaccaccgc taccaacggt + 2521 ggttttctca aggttcgctg agctaccaac tctttgaacc aaggtaactg gcttggagga + 2581 gcgcagtcac caaaatctgt tctttcagtt tagccttaac aggtgcataa cttcaagaca + 2641 aactcctcta aatcagttac caatggctgc tgccagtggc gataagtcgt gtcttaccgg + 2701 gttggactca agacgatagt taccggataa ggcgcagcgg tcgggctgaa cggggggttc + 2761 gtgcacacag cccagcttgg agcgaacgac ctacaccgaa ctgagatacc aacagcgtga + 2821 gctatgagaa agcgccacgc ttcccgaagg gagaaaggcg gacaggtatc cggtaagtgg + 2881 cagggtcgga acaggagagc gcacgaggga gcttccgggg ggaaacgcct ggtatcttta + 2941 tagtcctgtc gggtttcgcc acctctggct tgagcgtcga tttttgtgat gctcgtcagg + 3001 ggggcggagc ctatggaaaa acgcctgcgg tgctggcttc ttccggtgct ttgctttttg + 3061 ctcacatgtt ctttccggct ttatcccctg attctgtgga taaccgtatt accgcctttg + 3121 agtgagctga caccgctcgc cgcagtcgaa cgaccgagcg tagcgagtca gtgagcgagg + 3181 aagcggaaga gcgccttatg tgacattttc tccttacgct ctgttgtgcc gttcggcatc + 3241 ctgccctgag cgttatatct ctgtgctatt ttctacttca aagcgtgtct gtatgctgtt + 3301 ctggag +// diff --git a/test/data/NC_002127.1-region.gff3 b/test/data/NC_002127.1-region.gff3 new file mode 100644 index 00000000..3772ade6 --- /dev/null +++ b/test/data/NC_002127.1-region.gff3 @@ -0,0 +1,70 @@ +##gff-version 3 +##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo +# Annotated with Bakta +# Software: v1.8.2 +# Database: v5.0, light +# DOI: 10.1099/mgen.0.000685 +# URL: github.com/oschwengers/bakta +##sequence-region NC_002127.1 1 3306 +NC_002127.1 Bakta region 1 3306 . + . ID=NC_002127.1;Name=NC_002127.1;Is_circular=true +NC_002127.1 ? CDS 32 736 . + 0 ID=DOGAIA_00005;NAME=alternative downstream start codon test case 1 +NC_002127.1 ? CDS 981 1121 . + 0 ID=DOGAIA_00006;NAME=additional overlapping CDS test case 3 +NC_002127.1 ? CDS 1348 2229 . - 0 ID=DOGAIA_00015;NAME=alternative downstream start codon test case 2 +##FASTA +>NC_002127.1 +TTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGTGGCCTGCTCGTCAGCATCGAGTGC +GTCCAGTTTTTCGAGCAGCGTCAGGCTCTGGCTTTTTATGAATCCCGCCATGTTGAGTGC +AGTTTGCTGCTGCTTGTTCATCTTTCTGTTTTCTCCGTTCTGTCTGTCATCTGCGTCGTG +TGATTATATCGCGCACCACTTTTCGACCGTCTTACCGCCGGTATTCTGCCGACGGACATT +TCAGTCAGACAACACTGTCACTGCCAAAAAACAGCAGTGCTTTGTTGGTAATTCGAACTT +GCAGACAGGACAGGATGTGCAATTGTTATACCGCGCATACATGCACGCTATTACAATTAC +CCTGGTCAGGGCTTCGCCCCGACACCCCATGTCAGATACGGAGCCATGTTTTATGACAAA +ACGAAGTGGAAGTAATACGCGCAGGCGGGCTATCAGTCGCCCTGTTCGTCTGACGGCAGA +AGAAGACCAGGAAATCAGAAAAAGGGCTGCTGAATGCGGCAAGACCGTTTCTGGTTTTTT +ACGGGCGGCAGCTCTCGGTAAGAAAGTTAACTCACTGACTGATGACCGGGTGCTGAAAGA +AGTTATGCGACTGGGGGCGTTGCAGAAAAAACTCTTTATCGACGGCAAGCGTGTCGGGGA +CAGAGAGTATGCGGAGGTGCTGATCGCTATTACGGAGTATCACCGTGCCCTGTTATCCAG +GCTTATGGCAGATTAGCTTCCCGGAGAGAAACTGTCGAAAACAGACGGTATGAACGCCGT +AAGCCCCCAAACCGATCGCCATTCACTTTCATGCATAGCTATGCAGTGAGCTGAAAGCGA +TCCTGACGCATTTTTCCGGTTTACCCCGGGGAAAACATCTCTTTTTGCGGTGTCTGCGTC +AGAATCGCGTTCAGCGCGTTTTGGCGGTGCGCGTAATGAGACGTTATGGTAAATGTCTTC +TGGCTTGATATTATATTGGAATGCCTTTTTTCAAAGCAAATGATGTGGCTTTGGATAGAA +GGTTTACGTTGATCTTATCAAAGTTTTTTTTAAAGAACGAAGCCGAGAGCTCAGATAAAT +CATTATATTCATCAGTTTTCGTAACTTTGTTTAATGTGTAACTTGAAAACTTCTCGCCAT +TAAATGACGTATAGACGTAACGATCTTTTTTTCCACCGTTAGGAATTATTAAATCAAAAA +AAACATCACCCTTGCTTTTCTTTTTCTTCAAGTCGGATTCGATTTTTGAGAAAAATTCGC +TCGGGCTATAAATATCAGTAGCATAGACAATAAATAAAGTTTTATCTTTATTTTTTATTG +CTTCTATTTGATATTTTTTATCTTTTTTCATAATTTCAACCTAGCTACGCCACCATCTAT +TAATTGGCAAACGGTATCGATGATTGCGATTGATTCTAATTTGTTAATTGTCTTCGTGTC +AGCTATTCCTGGTTTCATATGAAACAAACCATGCCTGTTCTCATGCCAGTAAGTGTAGCA +TTCACACAAAACTTCCGCTATTTCACCATTTATAGTTTCTTGGTGTATTTCTCTGATTAT +ATATTTGGGTTTGTTTTCAGTGAAGTATTCGCCAAGGTTCTTTGATGATGATGGATTGCA +AACATCATTAAGTATTTGATATATAAAGCCTTCTATGGCTCTTAATGCAGAGAAGCAGTA +TGTTGAGTAATCTTCCATTTCGACATCTATTTTTTTCATTATTAGCGAGCATGATAGCTG +TTTTTTGATATCTTCATGGATTTTATCGATGCTTTTTGGTAGTTTGCTATGCAACTCGGA +CTCAATAGTTTCTTTTTTTATGTCAACATTAAATTCTTTATTTTTTTGTTCGACAATCTC +TTTCATGTTTAGTATTGAGCACATGAAATCGTTAATCAAACTCGCGATTTGAAGGTATTT +TCCTTGGAATTGAATAGAGCCGCGCTTGTAAATTTTTGCCCTGACCCTGTCACCATTGCT +GGTGGTCATAATATATTGGTGTTTACAATTAGGATCGTTATTATTATCTTCTGTTATTGT +TATCCCCTCTTCAGAAAGAAATTCAAATAGATTTGCCCTGTCATCATCACTGAATTTTGG +AATGGTGTATTCAAAGTTCTTTGTGTCTGAATACAAACAGTTTTCTTTTATAATCAAGGC +GATTTCATCAAAGTAAGTGTTATTTTGCCCAGACGCTCTTCCGATAGTGGTATTTCCGCC +TGATGGCATTAGTTTTATTAAGAAGTCTATTCCTTTATATGTGCCAGATATGTGAGTTTC +TCTTTCGTTTTTTACATTAGAGGAATAGTTTGTGACGCCATTCTGCGTCAGAGCAGACTC +AATCTTGTCAATATTGATATTTAGTGCTTTAAACGGGTTCTGTGCCATTGGGTCAATCCG +TTGTTTTTTTTGAATATGTACAGATCTTGTTTTTTTGTCAACGGAATAGCTGTTCGTTGA +CTTGATAGACCGATTGATTCATCATCTCATAAATAAAGAAAAACCACCGCTACCAACGGT +GGTTTTCTCAAGGTTCGCTGAGCTACCAACTCTTTGAACCAAGGTAACTGGCTTGGAGGA +GCGCAGTCACCAAAATCTGTTCTTTCAGTTTAGCCTTAACAGGTGCATAACTTCAAGACA +AACTCCTCTAAATCAGTTACCAATGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGG +GTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTC +GTGCACACAGCCCAGCTTGGAGCGAACGACCTACACCGAACTGAGATACCAACAGCGTGA +GCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGTGG +CAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCGGGGGGAAACGCCTGGTATCTTTA +TAGTCCTGTCGGGTTTCGCCACCTCTGGCTTGAGCGTCGATTTTTGTGATGCTCGTCAGG +GGGGCGGAGCCTATGGAAAAACGCCTGCGGTGCTGGCTTCTTCCGGTGCTTTGCTTTTTG +CTCACATGTTCTTTCCGGCTTTATCCCCTGATTCTGTGGATAACCGTATTACCGCCTTTG +AGTGAGCTGACACCGCTCGCCGCAGTCGAACGACCGAGCGTAGCGAGTCAGTGAGCGAGG +AAGCGGAAGAGCGCCTTATGTGACATTTTCTCCTTACGCTCTGTTGTGCCGTTCGGCATC +CTGCCCTGAGCGTTATATCTCTGTGCTATTTTCTACTTCAAAGCGTGTCTGTATGCTGTT +CTGGAG diff --git a/test/test_args.py b/test/test_args.py index 6c09ed92..e2845f67 100644 --- a/test/test_args.py +++ b/test/test_args.py @@ -241,6 +241,26 @@ def test_replicons_ok(parameters, tmpdir): assert Path.exists(tmpdir_path.joinpath(file)) +@pytest.mark.parametrize( + 'parameters', + [ + (['--regions']), # not provided + (['--regions', '']), # empty + (['--regions', 'foo']), # not existing + (['--regions', 'test/data/empty']) # empty file + ] +) +def test_regions_failiing(parameters, tmpdir): + # test regions file arguments + proc = run( + ['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--skip-plot'] + + parameters + + SKIP_PARAMETERS + + ['test/data/NC_002127.1.fna'] + ) + assert proc.returncode != 0 + + @pytest.mark.parametrize( 'parameters', [ diff --git a/test/test_regions.py b/test/test_regions.py new file mode 100644 index 00000000..b457c80a --- /dev/null +++ b/test/test_regions.py @@ -0,0 +1,57 @@ +import json +import subprocess as sp + +from pathlib import Path +from subprocess import run + +import pytest + +from .conftest import FILES, SKIP_PARAMETERS + + +@pytest.mark.parametrize( + 'regions', + [ + ('NC_002127.1-region.gff3'), # GFF3 + ('NC_002127.1-region.gbff') # Genbank + ] +) +def test_bakta_plasmid(regions, tmpdir): + # full test on plasmid + proc = run( + ['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--force', '--prefix', 'test', '--regions', f'test/data/{regions}'] + + ['test/data/NC_002127.1.fna.gz'] + ) + assert proc.returncode == 0 + + tmpdir_path = Path(tmpdir) + for file in FILES: + output_path = tmpdir_path.joinpath(file) + assert Path.exists(output_path) + assert output_path.stat().st_size > 0 + + results_path = tmpdir_path.joinpath('test.json') + results = None + with results_path.open() as fh: + results = json.load(fh) + assert results is not None + cdss = [feat for feat in results['features'] if feat['type'] == 'cds'] + assert len(cdss) == 3 + + # test case 1: alternative downstream start codon +[32,736] superseding de novo-predicted CDS +[2,736] + provided_cds_1 = [cds for cds in cdss if cds['start'] == 32 and cds['stop'] == 736 and cds['strand'] == '+'] + assert len(provided_cds_1) == 1 + denovo_cds_1 = [cds for cds in cdss if cds['start'] == 2 and cds['stop'] == 736 and cds['strand'] == '+'] + assert len(denovo_cds_1) == 0 # de novo-predicted was filtered-out + + # test case 2: alternative downstream start codon -[1348,2229] superseding de novo-predicted CDS -[1348,2388] + provided_cds_2 = [cds for cds in cdss if cds['start'] == 1348 and cds['stop'] == 2229 and cds['strand'] == '-'] + assert len(provided_cds_2) == 1 + denovo_cds_2 = [cds for cds in cdss if cds['start'] == 1348 and cds['stop'] == 2388 and cds['strand'] == '-'] + assert len(denovo_cds_2) == 0 # de novo-predicted was filtered-out + + # test case 3: user-provided CDS +[981,1121] overlapping de novo-predicted CDS -[971,1351] + provided_cds_3 = [cds for cds in cdss if cds['start'] == 981 and cds['stop'] == 1121 and cds['strand'] == '+'] + assert len(provided_cds_3) == 1 + denovo_cds_3 = [cds for cds in cdss if cds['start'] == 971 and cds['stop'] == 1351 and cds['strand'] == '-'] + assert len(denovo_cds_3) == 0 # de novo-predicted was filtered-out