From 61d11b2e4eb5287e4697b0920349b7731e2b05f7 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 15 Oct 2024 12:49:59 +0200 Subject: [PATCH 1/8] rename contig to more general sequence --- README.md | 4 +- bakta/__init__.py | 2 +- bakta/config.py | 26 +-- bakta/constants.py | 2 +- bakta/expert/amrfinder.py | 4 +- bakta/expert/protein_hmms.py | 8 +- bakta/expert/protein_sequences.py | 4 +- bakta/features/annotation.py | 154 ++++++++--------- bakta/features/cds.py | 270 +++++++++++++++--------------- bakta/features/crispr.py | 28 ++-- bakta/features/gaps.py | 12 +- bakta/features/nc_rna.py | 20 +-- bakta/features/nc_rna_region.py | 20 +-- bakta/features/orf.py | 10 +- bakta/features/ori.py | 46 ++--- bakta/features/r_rna.py | 24 +-- bakta/features/s_orf.py | 86 +++++----- bakta/features/signal_peptides.py | 4 +- bakta/features/t_rna.py | 20 +-- bakta/features/tm_rna.py | 18 +- bakta/io.py | 196 ++++++++++++++++++++++ bakta/io/fasta.py | 60 +++---- bakta/io/gff.py | 68 ++++---- bakta/io/insdc.py | 70 ++++---- bakta/io/json.py | 4 +- bakta/io/tsv.py | 26 +-- bakta/ips.py | 4 +- bakta/main.py | 104 +++++------- bakta/plot.py | 106 ++++++------ bakta/proteins.py | 12 +- bakta/psc.py | 8 +- bakta/pscc.py | 8 +- bakta/so.py | 2 +- bakta/ups.py | 4 +- bakta/utils.py | 214 +++++++++++------------ scripts/extract-region.py | 14 +- test/test_edge_features.py | 10 +- test/test_nt_sequences.py | 2 +- test/test_pseudo.py | 26 +-- test/test_regions.py | 4 +- test/test_sORF.py | 6 +- 41 files changed, 946 insertions(+), 764 deletions(-) create mode 100644 bakta/io.py diff --git a/README.md b/README.md index 383aa302..81e865d2 100644 --- a/README.md +++ b/README.md @@ -382,7 +382,7 @@ positional arguments: Input / Output: --db DB, -d DB Database path (default = /db). Can also be provided as BAKTA_DB environment variable. --min-contig-length MIN_CONTIG_LENGTH, -m MIN_CONTIG_LENGTH - Minimum contig size (default = 1; 200 in compliant mode) + Minimum contig/sequence size (default = 1; 200 in compliant mode) --prefix PREFIX, -p PREFIX Prefix for output files --output OUTPUT, -o OUTPUT @@ -409,7 +409,7 @@ Annotation: Locus tag increment: 1/5/10 (default = 1) --keep-contig-headers - Keep original contig headers + Keep original contig/sequence headers --compliant Force Genbank/ENA/DDJB compliance --replicons REPLICONS, -r REPLICONS Replicon information table (tsv/csv) diff --git a/bakta/__init__.py b/bakta/__init__.py index 819c77ed..61a4358d 100644 --- a/bakta/__init__.py +++ b/bakta/__init__.py @@ -1,2 +1,2 @@ -__version__ = '1.9.4' +__version__ = '1.10.0-beta' __db_schema_version__ = 5 diff --git a/bakta/config.py b/bakta/config.py index bc418b8d..0c3c1695 100644 --- a/bakta/config.py +++ b/bakta/config.py @@ -30,7 +30,7 @@ db_info = None tmp_path = None genome_path = None -min_contig_length = None +min_sequence_length = None prefix = None output_path = None force = None @@ -46,7 +46,7 @@ complete = None prodigal_tf = None translation_table = None -keep_contig_headers = None +keep_sequence_headers = None locus = None locus_tag = None locus_tag_increment = None @@ -92,7 +92,7 @@ def setup(args): verbose = True # input / output path configurations - global db_path, db_info, tmp_path, genome_path, min_contig_length, prefix, output_path, force + global db_path, db_info, tmp_path, genome_path, min_sequence_length, prefix, output_path, force db_path = check_db_path(args) tmp_path = check_tmp_path(args) @@ -108,11 +108,11 @@ def setup(args): log.info('genome-path=%s', genome_path) # input / output configurations - min_contig_length = args.min_contig_length - if(min_contig_length <= 0): - log.error("wrong argument for 'min-contig-length' parameter! min_contig_length=%s", min_contig_length) - sys.exit(f"ERROR: wrong argument ({min_contig_length}) for 'min- contig-length' parameter! Value must be larger than 0") - log.info('min_contig_length=%s', min_contig_length) + min_sequence_length = args.min_contig_length + if(min_sequence_length <= 0): + log.error("wrong argument for 'min-contig-length' parameter! min_contig_length=%s", min_sequence_length) + sys.exit(f"ERROR: wrong argument ({min_sequence_length}) for 'min- contig-length' parameter! Value must be larger than 0") + log.info('min_contig_length=%s', min_sequence_length) log.info('prefix=%s', prefix) # set in main.py before global logger config log.info('output-path=%s', output_path) force = args.force @@ -163,7 +163,7 @@ def setup(args): taxon = None # annotation configurations - global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions + global complete, prodigal_tf, translation_table, keep_sequence_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions complete = args.complete log.info('complete=%s', complete) prodigal_tf = args.prodigal_tf @@ -186,8 +186,8 @@ def setup(args): compliant = args.compliant log.info('compliant=%s', compliant) if(compliant): - min_contig_length = 200 - log.info('compliant mode! min_contig_length=%s', min_contig_length) + min_sequence_length = 200 + log.info('compliant mode! min_contig_length=%s', min_sequence_length) meta = args.meta log.info('meta=%s', meta) locus = args.locus @@ -221,8 +221,8 @@ def setup(args): log.info('locus-tag=%s', locus_tag) locus_tag_increment = args.locus_tag_increment log.info('locus-tag-increment=%s', locus_tag_increment) - keep_contig_headers = args.keep_contig_headers - log.info('keep_contig_headers=%s', keep_contig_headers) + keep_sequence_headers = args.keep_contig_headers + log.info('keep_contig_headers=%s', keep_sequence_headers) replicons = args.replicons if(replicons is not None): try: diff --git a/bakta/constants.py b/bakta/constants.py index b9e7e83f..8cdfb88e 100644 --- a/bakta/constants.py +++ b/bakta/constants.py @@ -191,7 +191,7 @@ ############################################################################ REPLICON_CHROMOSOME = 'chromosome' REPLICON_PLASMID = 'plasmid' -REPLICON_CONTIG = 'contig' +REPLICON_CONTIG = 'sequence' REPLICON_LENGTH_THRESHOLD_PLASMID = 112_000 # Nasuia deltocephalinicola -> DOI: 10.1093/gbe/evt118 REPLICON_LENGTH_THRESHOLD_CHROMOSOME = 2_800_000 # max plasmid length (except 1 outlier-> https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/ TOPOLOGY_CIRCULAR = 'circular' diff --git a/bakta/expert/amrfinder.py b/bakta/expert/amrfinder.py index 943fdc3c..a716a2ff 100644 --- a/bakta/expert/amrfinder.py +++ b/bakta/expert/amrfinder.py @@ -80,8 +80,8 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path): cds.setdefault('expert', []) cds['expert'].append(hit) log.debug( - 'hit: gene=%s, product=%s, method=%s, target-cov=%0.3f, identity=%0.3f, contig=%s, start=%i, stop=%i, strand=%s', - gene, product, method, model_cov, identity, cds['contig'], cds['start'], cds['stop'], cds['strand'] + 'hit: gene=%s, product=%s, method=%s, target-cov=%0.3f, identity=%0.3f, seq=%s, start=%i, stop=%i, strand=%s', + gene, product, method, model_cov, identity, cds['sequence'], cds['start'], cds['stop'], cds['strand'] ) cds_found.add(aa_identifier) diff --git a/bakta/expert/protein_hmms.py b/bakta/expert/protein_hmms.py index 6661f3f8..31585405 100644 --- a/bakta/expert/protein_hmms.py +++ b/bakta/expert/protein_hmms.py @@ -30,8 +30,8 @@ def search(cdss: Sequence[dict], user_hmms_path): cds = orf_by_aa_digest[aa_identifier] if hmm_query_hit.evalue > bc.MIN_HMM_EVALUE: log.debug( - 'discard low evalue: contig=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f', - cds['contig'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score + 'discard low evalue: seq=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score ) else: hit_domain_lengths_sum = sum([len(dom.alignment.hmm_sequence) for dom in hmm_query_hit.domains.included]) @@ -64,8 +64,8 @@ def search(cdss: Sequence[dict], user_hmms_path): cds.setdefault('expert', []) cds['expert'].append(hit) log.debug( - 'hit: source=UserHMMs, rank=99, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, hmm-id=%s, gene=%s, product=%s, evalue=%1.1e, bitscore=%f', - cds['contig'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hmm_id, hit['gene'], hit['product'], hit['evalue'], hit['score'] + 'hit: source=UserHMMs, rank=99, seq=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, hmm-id=%s, gene=%s, product=%s, evalue=%1.1e, bitscore=%f', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hmm_id, hit['gene'], hit['product'], hit['evalue'], hit['score'] ) cds_found.add(aa_identifier) diff --git a/bakta/expert/protein_sequences.py b/bakta/expert/protein_sequences.py index 7a4b5de6..ee1b53e9 100644 --- a/bakta/expert/protein_sequences.py +++ b/bakta/expert/protein_sequences.py @@ -83,8 +83,8 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa cds.setdefault('expert', []) cds['expert'].append(hit) log.debug( - 'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s', - source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product + 'hit: source=%s, rank=%i, seq=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s', + source, rank, cds['sequence'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product ) cds_found.add(aa_identifier) diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py index 7358aaf9..3cfb03ab 100644 --- a/bakta/features/annotation.py +++ b/bakta/features/annotation.py @@ -151,45 +151,45 @@ def detect_feature_overlaps(genome: dict): CDS < tmRNA, tRNA, rRNA, CRISPR sORF < mRNA, tRNA, rRNA, CRISPR, CDS (in-frame & entirely overlapping), sORF (shorter, weaker annotations) """ - contig_t_rnas = {k['id']: [] for k in genome['contigs']} + sequence_t_rnas = {k['id']: [] for k in genome['sequences']} for t_rna in genome['features'].get(bc.FEATURE_T_RNA, []): - t_rnas = contig_t_rnas[t_rna['contig']] + t_rnas = sequence_t_rnas[t_rna['sequence']] t_rnas.append(t_rna) - contig_tm_rnas = {k['id']: [] for k in genome['contigs']} + sequence_tm_rnas = {k['id']: [] for k in genome['sequences']} for tm_rna in genome['features'].get(bc.FEATURE_TM_RNA, []): - tm_rnas = contig_tm_rnas[tm_rna['contig']] + tm_rnas = sequence_tm_rnas[tm_rna['sequence']] tm_rnas.append(tm_rna) - contig_r_rnas = {k['id']: [] for k in genome['contigs']} + sequence_r_rnas = {k['id']: [] for k in genome['sequences']} for r_rna in genome['features'].get(bc.FEATURE_R_RNA, []): - r_rnas = contig_r_rnas[r_rna['contig']] + r_rnas = sequence_r_rnas[r_rna['sequence']] r_rnas.append(r_rna) - contig_ncrna_regions = {k['id']: [] for k in genome['contigs']} + sequence_ncrna_regions = {k['id']: [] for k in genome['sequences']} for ncRNA_region in genome['features'].get(bc.FEATURE_NC_RNA_REGION, []): - ncRNA_regions = contig_ncrna_regions[ncRNA_region['contig']] + ncRNA_regions = sequence_ncrna_regions[ncRNA_region['sequence']] ncRNA_regions.append(ncRNA_region) - contig_crispr_arrays = {k['id']: [] for k in genome['contigs']} + sequence_crispr_arrays = {k['id']: [] for k in genome['sequences']} for crispr_array in genome['features'].get(bc.FEATURE_CRISPR, []): - crispr_arrays = contig_crispr_arrays[crispr_array['contig']] + crispr_arrays = sequence_crispr_arrays[crispr_array['sequence']] crispr_arrays.append(crispr_array) - contig_cdss = {k['id']: [] for k in genome['contigs']} - contig_cdss_user_provided = {k['id']: [] for k in genome['contigs']} + sequence_cdss = {k['id']: [] for k in genome['sequences']} + sequence_cdss_user_provided = {k['id']: [] for k in genome['sequences']} for cds in genome['features'].get(bc.FEATURE_CDS, []): if(cds.get('source', None) == bc.CDS_SOURCE_USER): - cdss = contig_cdss_user_provided[cds['contig']] + cdss = sequence_cdss_user_provided[cds['sequence']] else: - cdss = contig_cdss[cds['contig']] + cdss = sequence_cdss[cds['sequence']] cdss.append(cds) - contig_sorfs = {k['id']: [] for k in genome['contigs']} + sequence_sorfs = {k['id']: [] for k in genome['sequences']} for sorf in genome['features'].get(bc.FEATURE_SORF, []): - sorfs = contig_sorfs[sorf['contig']] + sorfs = sequence_sorfs[sorf['sequence']] sorfs.append(sorf) - for contig in genome['contigs']: # find feature overlaps contig-wise to increase the performance - log.debug('filter features on contig: %s', contig['id']) + for seq in genome['sequences']: # find feature overlaps sequence-wise to increase the performance + log.debug('filter features on seq: %s', seq['id']) # mark tRNAs overlapping with tmRNAs - for tRNA in contig_t_rnas[contig['id']]: - for tmRNA in contig_tm_rnas[contig['id']]: + for tRNA in sequence_t_rnas[seq['id']]: + for tmRNA in sequence_tm_rnas[seq['id']]: if(tRNA['stop'] < tmRNA['start'] or tRNA['start'] > tmRNA['stop']): continue else: # overlap -> remove tRNA @@ -200,13 +200,13 @@ def detect_feature_overlaps(genome: dict): 'description': f"{bc.FEATURE_TM_RNA} overlap with ({tmRNA['product']}) at {overlap}" } log.info( - "overlap: tRNA (%s) [%i, %i] overlapping with tmRNA (%s) [%i, %i] at %s on contig=%s", - tRNA['product'], tRNA['start'], tRNA['stop'], tmRNA['product'], tmRNA['start'], tmRNA['stop'], overlap, tRNA['contig'] + "overlap: tRNA (%s) [%i, %i] overlapping with tmRNA (%s) [%i, %i] at %s on seq=%s", + tRNA['product'], tRNA['start'], tRNA['stop'], tmRNA['product'], tmRNA['start'], tmRNA['stop'], overlap, tRNA['sequence'] ) # mark ncRNA-regions overlapping with ncRNA-regions - for ncRNA_region in contig_ncrna_regions[contig['id']]: - for ncRNA_region_overlap in contig_ncrna_regions[contig['id']]: + for ncRNA_region in sequence_ncrna_regions[seq['id']]: + for ncRNA_region_overlap in sequence_ncrna_regions[seq['id']]: if(ncRNA_region['stop'] < ncRNA_region_overlap['start'] or ncRNA_region['start'] > ncRNA_region_overlap['stop']): continue if(ncRNA_region['db_xrefs'][0] == ncRNA_region_overlap['db_xrefs'][0]): @@ -220,14 +220,14 @@ def detect_feature_overlaps(genome: dict): 'description': f"{bc.FEATURE_NC_RNA_REGION} overlap with ({ncRNA_region_overlap['product']}) at {overlap}" } log.info( - "overlap: ncRNA-region (%s) [%i, %i] overlapping with ncRNA-region (%s) [%i, %i] at %s on contig=%s, lower bitscore (%f/%f)", - ncRNA_region['product'], ncRNA_region['start'], ncRNA_region['stop'], ncRNA_region_overlap['product'], ncRNA_region_overlap['start'], ncRNA_region_overlap['stop'], overlap, ncRNA_region['contig'], ncRNA_region['score'], ncRNA_region_overlap['score'] + "overlap: ncRNA-region (%s) [%i, %i] overlapping with ncRNA-region (%s) [%i, %i] at %s on seq=%s, lower bitscore (%f/%f)", + ncRNA_region['product'], ncRNA_region['start'], ncRNA_region['stop'], ncRNA_region_overlap['product'], ncRNA_region_overlap['start'], ncRNA_region_overlap['stop'], overlap, ncRNA_region['sequence'], ncRNA_region['score'], ncRNA_region_overlap['score'] ) # mark de novo-predicted CDS overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs and user-provided CDS - for cds in contig_cdss[contig['id']]: + for cds in sequence_cdss[seq['id']]: # tmRNA overlaps - for tmRNA in contig_tm_rnas[contig['id']]: + for tmRNA in sequence_tm_rnas[seq['id']]: if(cds['stop'] < tmRNA['start'] or cds['start'] > tmRNA['stop']): continue else: # overlap -> remove cds @@ -238,11 +238,11 @@ def detect_feature_overlaps(genome: dict): 'description': f"{bc.FEATURE_TM_RNA} overlap with ({tmRNA['product']}) at {overlap}" } log.info( - "overlap: CDS (%s/%s) [%i, %i] overlapping tmRNA (%s) [%i, %i], %s, contig=%s", - cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], tmRNA['gene'], tmRNA['start'], tmRNA['stop'], overlap, cds['contig'] + "overlap: CDS (%s/%s) [%i, %i] overlapping tmRNA (%s) [%i, %i], %s, seq=%s", + cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], tmRNA['gene'], tmRNA['start'], tmRNA['stop'], overlap, cds['sequence'] ) # tRNA overlaps - for tRNA in contig_t_rnas[contig['id']]: + for tRNA in sequence_t_rnas[seq['id']]: if(cds['stop'] < tRNA['start'] or cds['start'] > tRNA['stop']): continue else: # overlap -> remove cds @@ -253,11 +253,11 @@ def detect_feature_overlaps(genome: dict): 'description': f"{bc.FEATURE_T_RNA} overlap with ({tRNA['product']}) at {overlap}" } log.info( - "overlap: CDS (%s/%s) [%i, %i] overlapping tRNA (%s) [%i, %i], %s, contig=%s", - cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], tRNA['gene'], tRNA['start'], tRNA['stop'], overlap, cds['contig'] + "overlap: CDS (%s/%s) [%i, %i] overlapping tRNA (%s) [%i, %i], %s, seq=%s", + cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], tRNA['gene'], tRNA['start'], tRNA['stop'], overlap, cds['sequence'] ) # rRNA overlaps - for rRNA in contig_r_rnas[contig['id']]: + for rRNA in sequence_r_rnas[seq['id']]: if(cds['stop'] < rRNA['start'] or cds['start'] > rRNA['stop']): continue else: # overlap -> remove cds @@ -268,11 +268,11 @@ def detect_feature_overlaps(genome: dict): 'description': f"{bc.FEATURE_R_RNA} overlap with ({rRNA['product']}) at {overlap}" } log.info( - "overlap: CDS (%s/%s) [%i, %i] overlapping rRNA (%s) [%i, %i], %s, contig=%s", - cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], rRNA['gene'], rRNA['start'], rRNA['stop'], overlap, cds['contig'] + "overlap: CDS (%s/%s) [%i, %i] overlapping rRNA (%s) [%i, %i], %s, seq=%s", + cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], rRNA['gene'], rRNA['start'], rRNA['stop'], overlap, cds['sequence'] ) # CRISPR overlaps - for crispr in contig_crispr_arrays[contig['id']]: + for crispr in sequence_crispr_arrays[seq['id']]: if(cds['stop'] < crispr['start'] or cds['start'] > crispr['stop']): continue else: # overlap -> remove cds @@ -283,11 +283,11 @@ def detect_feature_overlaps(genome: dict): 'description': f'overlaps {bc.FEATURE_CRISPR} at {overlap}' } log.info( - "overlap: CDS (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, contig=%s", - cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], crispr['start'], crispr['stop'], overlap, cds['contig'] + "overlap: CDS (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, seq=%s", + cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], crispr['start'], crispr['stop'], overlap, cds['sequence'] ) # user-provided CDS overlaps - for cds_user_provided in contig_cdss_user_provided[contig['id']]: + for cds_user_provided in sequence_cdss_user_provided[seq['id']]: overlap = 0 if(not cds_user_provided.get('edge', False) and not cds.get('edge', False)): # both CDS not edge features if(cds['stop'] < cds_user_provided['start'] or cds['start'] > cds_user_provided['stop']): @@ -309,7 +309,7 @@ def detect_feature_overlaps(genome: dict): else: continue elif(cds_user_provided.get('edge', False) and cds.get('edge', False)): # both CDS edge features - overlap = (contig['length'] - max(cds['start'], cds_user_provided['start']) + 1) + min(cds['stop'], cds_user_provided['stop']) + overlap = (seq['length'] - max(cds['start'], cds_user_provided['start']) + 1) + min(cds['stop'], cds_user_provided['stop']) if(overlap > bc.CDS_MAX_OVERLAPS): overlap = f"[{max(cds['start'], cds_user_provided['start'])},{min(cds['stop'], cds_user_provided['stop'])}]" cds['discarded'] = { @@ -318,14 +318,14 @@ def detect_feature_overlaps(genome: dict): 'description': f'overlaps user-provided {bc.FEATURE_CDS} at {overlap}' } log.info( - "overlap: de-novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s", - cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['contig'] + "overlap: de-novo CDS (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, seq=%s", + cds.get('gene', '-'), cds.get('product', '-'), cds['start'], cds['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, cds['sequence'] ) # remove sORF overlapping with tRNAs, tmRNAs, rRNAs, CRISPRs, inframe CDSs, shorter inframe sORFs - for sorf in contig_sorfs[contig['id']]: + for sorf in sequence_sorfs[seq['id']]: # tmRNA overlaps - for tmRNA in contig_tm_rnas[contig['id']]: + for tmRNA in sequence_tm_rnas[seq['id']]: if(sorf['stop'] < tmRNA['start'] or sorf['start'] > tmRNA['stop']): continue else: # overlap -> remove sorf @@ -336,11 +336,11 @@ def detect_feature_overlaps(genome: dict): 'description': f"{bc.FEATURE_TM_RNA} overlap with ({tmRNA['product']}) at {overlap}" } log.info( - "overlap: sORF (%s/%s) [%i, %i] overlapping tmRNA (%s) [%i, %i], %s, contig=%s", - sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], tmRNA['gene'], tmRNA['start'], tmRNA['stop'], overlap, sorf['contig'] + "overlap: sORF (%s/%s) [%i, %i] overlapping tmRNA (%s) [%i, %i], %s, seq=%s", + sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], tmRNA['gene'], tmRNA['start'], tmRNA['stop'], overlap, sorf['sequence'] ) # tRNA overlaps - for tRNA in contig_t_rnas[contig['id']]: + for tRNA in sequence_t_rnas[seq['id']]: if(sorf['stop'] < tRNA['start'] or sorf['start'] > tRNA['stop']): continue else: # overlap -> remove sorf @@ -351,11 +351,11 @@ def detect_feature_overlaps(genome: dict): 'description': f"{bc.FEATURE_T_RNA} overlap with ({tRNA['product']}) at {overlap}" } log.info( - "overlap: sORF (%s/%s) [%i, %i] overlapping tRNA (%s) [%i, %i], %s, contig=%s", - sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], tRNA['gene'], tRNA['start'], tRNA['stop'], overlap, sorf['contig'] + "overlap: sORF (%s/%s) [%i, %i] overlapping tRNA (%s) [%i, %i], %s, seq=%s", + sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], tRNA['gene'], tRNA['start'], tRNA['stop'], overlap, sorf['sequence'] ) # rRNA overlaps - for rRNA in contig_r_rnas[contig['id']]: + for rRNA in sequence_r_rnas[seq['id']]: if(sorf['stop'] < rRNA['start'] or sorf['start'] > rRNA['stop']): continue else: # overlap -> remove sorf @@ -366,11 +366,11 @@ def detect_feature_overlaps(genome: dict): 'description': f"{bc.FEATURE_R_RNA} overlap with ({rRNA['product']}) at {overlap}" } log.info( - "overlap: sORF (%s/%s) [%i, %i] overlapping rRNA (%s) [%i, %i], %s, contig=%s", - sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], rRNA['gene'], rRNA['start'], rRNA['stop'], overlap, sorf['contig'] + "overlap: sORF (%s/%s) [%i, %i] overlapping rRNA (%s) [%i, %i], %s, seq=%s", + sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], rRNA['gene'], rRNA['start'], rRNA['stop'], overlap, sorf['sequence'] ) # CRISPR overlaps - for crispr in contig_crispr_arrays[contig['id']]: + for crispr in sequence_crispr_arrays[seq['id']]: if(sorf['stop'] < crispr['start'] or sorf['start'] > crispr['stop']): continue else: # overlap -> remove sorf @@ -381,11 +381,11 @@ def detect_feature_overlaps(genome: dict): 'description': f'overlaps {bc.FEATURE_CRISPR} at {overlap}' } log.info( - "overlap: sORF (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, contig=%s", - sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], crispr['start'], crispr['stop'], overlap, sorf['contig'] + "overlap: sORF (%s/%s) [%i, %i] overlapping CRISPR [%i, %i], %s, seq=%s", + sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], crispr['start'], crispr['stop'], overlap, sorf['sequence'] ) # user-provided CDS overlaps - for cds_user_provided in contig_cdss_user_provided[contig['id']]: + for cds_user_provided in sequence_cdss_user_provided[seq['id']]: if(sorf['stop'] < cds_user_provided['start'] or sorf['start'] > cds_user_provided['stop']): continue else: # overlap -> remove sorf @@ -396,12 +396,12 @@ def detect_feature_overlaps(genome: dict): 'description': f'overlaps {bc.FEATURE_CDS} at {overlap}' } log.info( - "overlap: sORF (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, contig=%s", - sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, sorf['contig'] + "overlap: sORF (%s/%s) [%i, %i] overlapping user-provided CDS [%i, %i], %s, seq=%s", + sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], cds_user_provided['start'], cds_user_provided['stop'], overlap, sorf['sequence'] ) # sORF overlaps - for overlap_sorf in contig_sorfs[contig['id']]: + for overlap_sorf in sequence_sorfs[seq['id']]: if(sorf['stop'] < overlap_sorf['start'] or sorf['start'] > overlap_sorf['stop']): continue # no overlap elif(sorf['start'] == overlap_sorf['start'] and sorf['stop'] == overlap_sorf['stop']): @@ -418,8 +418,8 @@ def detect_feature_overlaps(genome: dict): 'description': f"overlaps {bc.FEATURE_SORF} ({overlap_sorf.get('gene', '-')}/{overlap_sorf.get('product', '-')}) at {overlap} with lower score ({score_sorf}/{score_overlap_sorf})" } log.info( - "overlap: sORF (%s/%s) [%i, %i] overlapping sORF (%s/%s) [%i, %i], %s, contig=%s, lower annotation score (%i/%i)", - sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], overlap_sorf.get('gene', '-'), overlap_sorf.get('product', '-'), overlap_sorf['start'], overlap_sorf['stop'], overlap, sorf['contig'], score_sorf, score_overlap_sorf + "overlap: sORF (%s/%s) [%i, %i] overlapping sORF (%s/%s) [%i, %i], %s, seq=%s, lower annotation score (%i/%i)", + sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], overlap_sorf.get('gene', '-'), overlap_sorf.get('product', '-'), overlap_sorf['start'], overlap_sorf['stop'], overlap, sorf['sequence'], score_sorf, score_overlap_sorf ) elif(score_sorf == score_overlap_sorf and len(sorf['aa']) < len(overlap_sorf['aa'])): # equal annotation score but shorter sequence -> potential fragment or too short ORF prediction overlap = f"[{max(sorf['start'], overlap_sorf['start'])},{min(sorf['stop'], overlap_sorf['stop'])}]" @@ -429,8 +429,8 @@ def detect_feature_overlaps(genome: dict): 'description': f"overlaps {bc.FEATURE_SORF} ({overlap_sorf.get('gene', '-')}/{overlap_sorf.get('product', '-')}) at {overlap} with equal score ({score_sorf}) but lower length ({len(sorf['aa'])}/{len(overlap_sorf['aa'])})" } log.info( - "overlap: sORF (%s/%s) [%i, %i] overlapping sORF (%s/%s) [%i, %i], %s, contig=%s, equal annotation score (%i), lower length (%i/%i)", - sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], overlap_sorf.get('gene', '-'), overlap_sorf.get('product', '-'), overlap_sorf['start'], overlap_sorf['stop'], overlap, sorf['contig'], score_sorf, len(sorf['aa']), len(overlap_sorf['aa']) + "overlap: sORF (%s/%s) [%i, %i] overlapping sORF (%s/%s) [%i, %i], %s, seq=%s, equal annotation score (%i), lower length (%i/%i)", + sorf.get('gene', '-'), sorf.get('product', '-'), sorf['start'], sorf['stop'], overlap_sorf.get('gene', '-'), overlap_sorf.get('product', '-'), overlap_sorf['start'], overlap_sorf['stop'], overlap, sorf['sequence'], score_sorf, len(sorf['aa']), len(overlap_sorf['aa']) ) @@ -451,8 +451,8 @@ def calc_cds_annotation_score(cds: dict) -> int: score += 1 score += calc_annotation_score(psc) log.debug( - 'cds score: contig=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i', - cds['contig'], cds['start'], cds['stop'], cds.get('gene', '-'), cds.get('product', '-'), score + 'cds score: seq=%s, start=%i, stop=%i, gene=%s, product=%s, score=%i', + cds['sequence'], cds['start'], cds['stop'], cds.get('gene', '-'), cds.get('product', '-'), score ) return score @@ -620,7 +620,7 @@ def revise_cds_product(product: str): old_product = product if( - RE_PROTEIN_CONTIG.search(product) or # protein containing 'contig' + RE_PROTEIN_CONTIG.search(product) or # protein containing 'sequence' RE_PROTEIN_NODE.search(product) or # potential contig name (SPAdes) RE_PROTEIN_POTENTIAL_CONTIG_NAME.search(product) or # potential contig name (SPAdes) RE_PROTEIN_NO_LETTERS.fullmatch(product) # no letters -> set to Hypothetical @@ -633,8 +633,8 @@ def revise_cds_product(product: str): def mark_as_hypothetical(feature: dict): log.info( - 'marked as hypothetical: contig=%s, start=%i, stop=%i, strand=%s', - feature['contig'], feature['start'], feature['stop'], feature['strand'] + 'marked as hypothetical: seq=%s, start=%i, stop=%i, strand=%s', + feature['sequence'], feature['start'], feature['stop'], feature['strand'] ) feature['hypothetical'] = True feature['gene'] = None @@ -660,8 +660,8 @@ def get_adjacent_genes(feature: dict, features: Sequence[dict], neighbors=3): upstream_genes.extend(downstream_genes) for gene in upstream_genes: log.debug( - 'extracted neighbor genes: contig=%s, start=%i, stop=%i, gene=%s, product=%s', - gene['contig'], gene['start'], gene['stop'], gene.get('gene', '-'), gene.get('product', '-') + 'extracted neighbor genes: seq=%s, start=%i, stop=%i, gene=%s, product=%s', + gene['sequence'], gene['start'], gene['stop'], gene.get('gene', '-'), gene.get('product', '-') ) return upstream_genes return [] @@ -680,14 +680,14 @@ def select_gene_symbols(features: Sequence[dict]): if(gene_symbol != old_gene_symbol): feat['gene'] = gene_symbol log.info( - 'gene product symbol selection: contig=%s, start=%i, stop=%i, new-gene=%s, old-gene=%s, genes=%s, product=%s', - feat['contig'], feat['start'], feat['stop'], gene_symbol, old_gene_symbol, ','.join(feat['genes']), feat.get('product', '-') + 'gene product symbol selection: seq=%s, start=%i, stop=%i, new-gene=%s, old-gene=%s, genes=%s, product=%s', + feat['sequence'], feat['start'], feat['stop'], gene_symbol, old_gene_symbol, ','.join(feat['genes']), feat.get('product', '-') ) improved_genes.append(feat) else: # multiple gene symbols of varying prefixes are available, e.g. acrS, envR log.debug( - 'select gene symbol: contig=%s, start=%i, stop=%i, gene=%s, genes=%s, product=%s', - feat['contig'], feat['start'], feat['stop'], feat.get('gene', '-'), ','.join(feat['genes']), feat.get('product', '-') + 'select gene symbol: seq=%s, start=%i, stop=%i, gene=%s, genes=%s, product=%s', + feat['sequence'], feat['start'], feat['stop'], feat.get('gene', '-'), ','.join(feat['genes']), feat.get('product', '-') ) adjacent_genes = get_adjacent_genes(feat, features, neighbors=3) adjacent_gene_symbol_lists = [gene.get('genes', []) for gene in adjacent_genes] @@ -711,8 +711,8 @@ def select_gene_symbols(features: Sequence[dict]): if(selected_gene_symbol != old_gene_symbol): feat['gene'] = selected_gene_symbol log.info( - 'gene neighborhood symbol selection: contig=%s, start=%i, stop=%i, new-gene=%s, old-gene=%s, genes=%s, product=%s', - feat['contig'], feat['start'], feat['stop'], selected_gene_symbol, old_gene_symbol, ','.join(feat['genes']), feat.get('product', '-') + 'gene neighborhood symbol selection: seq=%s, start=%i, stop=%i, new-gene=%s, old-gene=%s, genes=%s, product=%s', + feat['sequence'], feat['start'], feat['stop'], selected_gene_symbol, old_gene_symbol, ','.join(feat['genes']), feat.get('product', '-') ) improved_genes.append(feat) return improved_genes \ No newline at end of file diff --git a/bakta/features/cds.py b/bakta/features/cds.py index ec728b98..177dda58 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -42,7 +42,7 @@ def predict(genome: dict): if(not prodigal_metamode): log.info('create prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed) gene_finder = pyrodigal.GeneFinder(meta=prodigal_metamode, closed=closed) - seqs = [c['sequence'] for c in genome['contigs']] + seqs = [seq['sequence'] for seq in genome['sequences']] trainings_info = gene_finder.train(*seqs, translation_table=cfg.translation_table) else: log.info('skip creation of prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed) @@ -58,43 +58,43 @@ def predict(genome: dict): cdss = [] # predict genes on linear sequences - linear_contigs = [c for c in genome['contigs'] if c['topology'] == bc.TOPOLOGY_LINEAR] - if(len(linear_contigs) > 0): + linear_sequences = [seq for seq in genome['sequences'] if seq['topology'] == bc.TOPOLOGY_LINEAR] + if(len(linear_sequences) > 0): if prodigal_metamode: gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=True, mask=True) else: gene_finder = pyrodigal.GeneFinder(trainings_info, meta=False, closed=True, mask=True) - sequences = [contig['sequence'] for contig in linear_contigs] + sequences = [seq['sequence'] for seq in linear_sequences] with cf.ThreadPoolExecutor(max_workers=cfg.threads) as tpe: - for contig, genes in zip(linear_contigs, tpe.map(gene_finder.find_genes, sequences)): - cdss_per_sequence = create_cdss(genes, contig) + for seq, genes in zip(linear_sequences, tpe.map(gene_finder.find_genes, sequences)): + cdss_per_sequence = create_cdss(genes, seq) cdss.extend(cdss_per_sequence) # predict genes on circular replicons (chromosomes/plasmids) - circular_contigs = [c for c in genome['contigs'] if c['topology'] == bc.TOPOLOGY_CIRCULAR] - if(len(circular_contigs) > 0): + circular_sequences = [seq for seq in genome['sequences'] if seq['topology'] == bc.TOPOLOGY_CIRCULAR] + if(len(circular_sequences) > 0): if prodigal_metamode: gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=False, mask=True) else: gene_finder = pyrodigal.GeneFinder(trainings_info, meta=False, closed=False, mask=True) - sequences = [contig['sequence'] for contig in circular_contigs] + sequences = [seq['sequence'] for seq in circular_sequences] with cf.ThreadPoolExecutor(max_workers=cfg.threads) as tpe: - for contig, genes in zip(circular_contigs, tpe.map(gene_finder.find_genes, sequences)): - cdss_per_sequence = create_cdss(genes, contig) + for seq, genes in zip(circular_sequences, tpe.map(gene_finder.find_genes, sequences)): + cdss_per_sequence = create_cdss(genes, seq) cdss.extend(cdss_per_sequence) log.info('predicted=%i', len(cdss)) return cdss -def create_cds(contig: dict, start: int, stop: int, strand: str, edge:bool, nt: str, aa: str): +def create_cds(sequence: dict, start: int, stop: int, strand: str, edge:bool, nt: str, aa: str): cds = OrderedDict() cds['type'] = bc.FEATURE_CDS - cds['contig'] = contig['id'] + cds['sequence'] = sequence['id'] cds['start'] = start cds['stop'] = stop cds['strand'] = strand - cds['frame'] = (start - 1) % 3 + 1 if strand == bc.STRAND_FORWARD else (contig['length'] - stop) % 3 + 1 + cds['frame'] = (start - 1) % 3 + 1 if strand == bc.STRAND_FORWARD else (sequence['length'] - stop) % 3 + 1 cds['gene'] = None cds['product'] = None cds['db_xrefs'] = [so.SO_CDS.id] @@ -106,12 +106,12 @@ def create_cds(contig: dict, start: int, stop: int, strand: str, edge:bool, nt: return cds -def create_cdss(genes, contig): +def create_cdss(genes, sequence): partial_cdss_per_sequence = [] cdss_per_sequence = [] for gene in genes: strand = bc.STRAND_FORWARD if gene.strand == 1 else bc.STRAND_REVERSE - cds = create_cds(contig, gene.begin, gene.end, strand, False, '', '') + cds = create_cds(sequence, gene.begin, gene.end, strand, False, '', '') cds['start_type'] = gene.start_type cds['rbs_motif'] = gene.rbs_motif if gene.partial_begin: @@ -135,18 +135,18 @@ def create_cdss(genes, contig): cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa) log.info( - 'contig=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds.get('truncated', 'no'), cds['start_type'], cds['rbs_motif'] + 'seq=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds.get('truncated', 'no'), cds['start_type'], cds['rbs_motif'] ) - if(contig['topology'] == bc.TOPOLOGY_CIRCULAR and len(partial_cdss_per_sequence) >= 2): - first_partial_cds = partial_cdss_per_sequence[0] # first partial CDS per contig - last_partial_cds = partial_cdss_per_sequence[-1] # last partial CDS per contig + if(sequence['topology'] == bc.TOPOLOGY_CIRCULAR and len(partial_cdss_per_sequence) >= 2): + first_partial_cds = partial_cdss_per_sequence[0] # first partial CDS per sequence + last_partial_cds = partial_cdss_per_sequence[-1] # last partial CDS per sequence # check if partial CDSs are on same strand and have opposite truncated edges - # and first starts at 1 and last ends at contig end (length) + # and first starts at 1 and last ends at sequence end (length) if(first_partial_cds['strand'] == last_partial_cds['strand'] and first_partial_cds['truncated'] != last_partial_cds['truncated'] and first_partial_cds['start'] == 1 - and last_partial_cds['stop'] == contig['length']): + and last_partial_cds['stop'] == sequence['length']): cds = last_partial_cds cds['stop'] = first_partial_cds['stop'] if(last_partial_cds['truncated'] == bc.FEATURE_END_3_PRIME): @@ -162,22 +162,22 @@ def create_cdss(genes, contig): cds['aa_digest'], cds['aa_hexdigest'] = bu.calc_aa_hash(aa) cdss_per_sequence.append(cds) log.info( - 'edge CDS: contig=%s, start=%i, stop=%i, strand=%s, frame=%s, start-type=%s, RBS-motif=%s, aa-hexdigest=%s, aa=[%s..%s]', - cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds['start_type'], cds['rbs_motif'], cds['aa_hexdigest'], aa[:10], aa[-10:] + 'edge CDS: seq=%s, start=%i, stop=%i, strand=%s, frame=%s, start-type=%s, RBS-motif=%s, aa-hexdigest=%s, aa=[%s..%s]', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], cds['frame'], cds['start_type'], cds['rbs_motif'], cds['aa_hexdigest'], aa[:10], aa[-10:] ) partial_cdss_per_sequence = partial_cdss_per_sequence[1:-1] # remove first/last partial CDS for partial_cds in partial_cdss_per_sequence: cdss_per_sequence.append(partial_cds) log.info( - 'truncated CDS: contig=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s, aa-hexdigest=%s, aa=[%s..%s]', - partial_cds['contig'], partial_cds['start'], partial_cds['stop'], partial_cds['strand'], partial_cds['frame'], partial_cds['truncated'], partial_cds['start_type'], partial_cds['rbs_motif'], partial_cds['aa_hexdigest'], partial_cds['aa'][:10], partial_cds['aa'][-10:] + 'truncated CDS: seq=%s, start=%i, stop=%i, strand=%s, frame=%s, truncated=%s, start-type=%s, RBS-motif=%s, aa-hexdigest=%s, aa=[%s..%s]', + partial_cds['sequence'], partial_cds['start'], partial_cds['stop'], partial_cds['strand'], partial_cds['frame'], partial_cds['truncated'], partial_cds['start_type'], partial_cds['rbs_motif'], partial_cds['aa_hexdigest'], partial_cds['aa'][:10], partial_cds['aa'][-10:] ) for cds in cdss_per_sequence: # extract nt sequences - nt = bu.extract_feature_sequence(cds, contig) + nt = bu.extract_feature_sequence(cds, sequence) cds['nt'] = nt log.info( - 'contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s]', - cds['contig'], cds['start'], cds['stop'], cds['strand'], nt[:10], nt[-10:] + 'seq=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s]', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], nt[:10], nt[-10:] ) return cdss_per_sequence @@ -189,7 +189,7 @@ def import_user_cdss(genome: dict, import_path: Path): Parameters ---------- genome : dict - Genome dictionary holding sequence information (contigs) + Genome dictionary holding sequence information import_path : Path Path to GFF3 or Genbank file with regions or features. @@ -199,10 +199,10 @@ def import_user_cdss(genome: dict, import_path: Path): a list of CDS features - without functional annotations. """ user_cdss = [] - if(cfg.keep_contig_headers): - contigs_by_id = {c['id']: c for c in genome['contigs']} # use ID as it's not altered -> no 'orig_id' field + if(cfg.keep_sequence_headers): + sequences_by_id = {seq['id']: seq for seq in genome['sequences']} # use ID as it's not altered -> no 'orig_id' field else: - contigs_by_id = {c['orig_id']: c for c in genome['contigs']} # use 'orig_id' instead of autogenerated new 'id' + sequences_by_id = {seq['orig_id']: seq for seq in genome['sequences']} # use 'orig_id' instead of autogenerated new 'id' file_suffix = import_path.suffix.lower() if(file_suffix in ['.gff', '.gff3']): # parse GFF3 format try: @@ -215,45 +215,45 @@ def import_user_cdss(genome: dict, import_path: Path): elif(skip_lines or line[0] == '#'): continue else: - contig_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t') + sequence_id, tool, feature_type, start, stop, score, strand, phase, attributes = line.split('\t') if(feature_type.lower() == 'cds'): attributes = attributes.lower().split(';') - contig = contigs_by_id.get(contig_id, None) - if(contig is None): - log.error('user-provided CDS: No contig found for id=%s', contig_id) - raise Exception(f'user-provided CDS: No contig found for id={contig_id}') + seq = sequences_by_id.get(sequence_id, None) + if(seq is None): + log.error('user-provided CDS: No seq found for id=%s', sequence_id) + raise Exception(f'user-provided CDS: No sequence found for id={sequence_id}') edge = False start = int(start) stop = int(stop) - if(stop > contig['length']): # check for features spanning sequence edges - stop = stop - contig['length'] + if(stop > seq['length']): # check for features spanning sequence edges + stop = stop - seq['length'] edge = True - user_cds = create_cds(contig, start, stop, strand, edge, '', '') + user_cds = create_cds(seq, start, stop, strand, edge, '', '') user_cds['source'] = bc.CDS_SOURCE_USER if('pseudo=' in attributes or bc.INSDC_FEATURE_PSEUDOGENE in attributes): # skip pseudo genes log.debug( - 'skip user-provided CDS: reason=pseudogene contig=%s, start=%i, stop=%i, strand=%s', - user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'] + 'skip user-provided CDS: reason=pseudogene seq=%s, start=%i, stop=%i, strand=%s', + user_cds['sequence'], user_cds['start'], user_cds['stop'], user_cds['strand'] ) continue try: - nt = bu.extract_feature_sequence(user_cds, contig) + nt = bu.extract_feature_sequence(user_cds, seq) user_cds['nt'] = nt except: - log.error('user-provided CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop']) - raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}") + log.error('user-provided CDS out of range! seq=%s, start=%i, stop=%i', user_cds['sequence'], user_cds['start'], user_cds['stop']) + raise ValueError(f"User-provided CDS out of range! sequence={user_cds['sequence']}, start={user_cds['start']}, stop={user_cds['stop']}") try: aa = str(Seq(nt).translate(table=cfg.translation_table, cds=True)) user_cds['aa'] = aa user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) except: - log.error('user-provided CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt) - raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") + log.error('user-provided CDS could not be translated into a valid amino acid sequence! seq=%s, start=%i, stop=%i, cds=%s', user_cds['sequence'], user_cds['start'], user_cds['stop'], nt) + raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! sequence={user_cds['sequence']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") log.info( - 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', - user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] + 'user-provided CDS: seq=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', + user_cds['sequence'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] ) user_cdss.append(user_cds) except Exception as e: @@ -265,10 +265,10 @@ def import_user_cdss(genome: dict, import_path: Path): for record in SeqIO.parse(fh_in, 'genbank'): for feature in record.features: if(feature.type.lower() == 'cds'): - contig = contigs_by_id.get(record.id, None) - if(contig is None): - log.error('user-provided CDS: No contig found for id=%s', record.id) - raise Exception(f'user-provided CDS: No contig found for id={record.id}') + seq = sequences_by_id.get(record.id, None) + if(seq is None): + log.error('user-provided CDS: No seq found for id=%s', record.id) + raise Exception(f'user-provided CDS: No sequence found for id={record.id}') if(feature.location.strand is None): # weird mixed-stranded compound locations strand = bc.STRAND_UNKNOWN else: @@ -278,20 +278,20 @@ def import_user_cdss(genome: dict, import_path: Path): edge = False if('<' in str(feature.location.start) or '>' in str(feature.location.end)): log.debug( - 'skip user-provided CDS: reason=partial, contig=%s, start=%s, stop=%s, strand=%s', - contig['id'], feature.location.start, feature.location.end, strand + 'skip user-provided CDS: reason=partial, seq=%s, start=%s, stop=%s, strand=%s', + seq['id'], feature.location.start, feature.location.end, strand ) continue elif(bc.INSDC_FEATURE_PSEUDO in feature.qualifiers or bc.INSDC_FEATURE_PSEUDOGENE in feature.qualifiers): log.debug( - 'skip user-provided CDS: reason=pseudo, contig=%s, start=%i, stop=%i, strand=%s', - contig['id'], feature.location.start, feature.location.end, strand + 'skip user-provided CDS: reason=pseudo, seq=%s, start=%i, stop=%i, strand=%s', + seq['id'], feature.location.start, feature.location.end, strand ) continue elif('ribosomal_slippage' in feature.qualifiers): log.debug( - 'skip user-provided CDS: reason=ribosomal slippage, contig=%s, start=%i, stop=%i, strand=%s', - contig['id'], feature.location.start, feature.location.end, strand + 'skip user-provided CDS: reason=ribosomal slippage, seq=%s, start=%i, stop=%i, strand=%s', + seq['id'], feature.location.start, feature.location.end, strand ) continue elif(isinstance(feature.location, SeqFeature.CompoundLocation) and len(feature.location.parts) == 2): @@ -307,25 +307,25 @@ def import_user_cdss(genome: dict, import_path: Path): start = edge_right.start + 1 end = edge_left.end - user_cds = create_cds(contig, start, end, strand, edge, '', '') + user_cds = create_cds(seq, start, end, strand, edge, '', '') user_cds['source'] = bc.CDS_SOURCE_USER try: - nt = bu.extract_feature_sequence(user_cds, contig) + nt = bu.extract_feature_sequence(user_cds, seq) user_cds['nt'] = nt except: - log.error('user-provided CDS: CDS out of range! contig=%s, start=%i, stop=%i', user_cds['contig'], user_cds['start'], user_cds['stop']) - raise ValueError(f"User-provided CDS out of range! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}") + log.error('user-provided CDS: CDS out of range! seq=%s, start=%i, stop=%i', user_cds['sequence'], user_cds['start'], user_cds['stop']) + raise ValueError(f"User-provided CDS out of range! sequence={user_cds['sequence']}, start={user_cds['start']}, stop={user_cds['stop']}") try: aa = str(Seq(nt).translate(table=cfg.translation_table, cds=True)) user_cds['aa'] = aa user_cds['aa_digest'], user_cds['aa_hexdigest'] = bu.calc_aa_hash(aa) except: - log.error('user-provided CDS: CDS could not be translated into a valid amino acid sequence! contig=%s, start=%i, stop=%i, cds=%s', user_cds['contig'], user_cds['start'], user_cds['stop'], nt) - raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! contig={user_cds['contig']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") + log.error('user-provided CDS: CDS could not be translated into a valid amino acid sequence! seq=%s, start=%i, stop=%i, cds=%s', user_cds['sequence'], user_cds['start'], user_cds['stop'], nt) + raise ValueError(f"User-provided CDS could not be translated into a valid amino acid sequence! sequence={user_cds['sequence']}, start={user_cds['start']}, stop={user_cds['stop']}, cds={nt}") log.info( - 'user-provided CDS: contig=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', - user_cds['contig'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] + 'user-provided CDS: seq=%s, start=%i, stop=%i, strand=%s, nt=[%s..%s], aa=[%s..%s]', + user_cds['sequence'], user_cds['start'], user_cds['stop'], user_cds['strand'], nt[:10], nt[-10:], aa[:10], aa[-10:] ) user_cdss.append(user_cds) except Exception as e: @@ -370,8 +370,8 @@ def predict_pfam(cdss: Sequence[dict]) -> Sequence[dict]: pfam_hits.append(cds) cds_with_pfams_hits[aa_identifier] = cds log.info( - 'pfam detected: contig=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], + 'pfam detected: seq=%s, start=%i, stop=%i, strand=%s, pfam-id=%s, length=%i, aa-start=%i, aa-stop=%i, aa-cov=%1.1f, hmm-cov=%1.1f, evalue=%1.1e, bitscore=%1.1f, name=%s', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], pfam['id'], pfam['length'], pfam['start'], pfam['stop'], pfam['aa_cov'], pfam['hmm_cov'], pfam['evalue'], pfam['score'], pfam['name'] ) log.info('predicted-pfams=%i, CDS-w/-pfams=%i', len(pfam_hits), len(cds_with_pfams_hits)) @@ -386,16 +386,16 @@ def analyze_proteins(cdss: Sequence[dict]): seq_stats['molecular_weight'] = seq.molecular_weight() except: log.warning( - 'could not calc molecular weight! contig=%s, start=%i, stop=%i, strand=%s, frame=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame'] + 'could not calc molecular weight! seq=%s, start=%i, stop=%i, strand=%s, frame=%s', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], cds['frame'] ) seq_stats['molecular_weight'] = None try: seq_stats['isoelectric_point'] = seq.isoelectric_point() except: log.warning( - 'could not calc isoelectric point! contig=%s, start=%i, stop=%i, strand=%s, frame=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['frame'] + 'could not calc isoelectric point! seq=%s, start=%i, stop=%i, strand=%s, frame=%s', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], cds['frame'] ) seq_stats['isoelectric_point'] = None cds['seq_stats'] = seq_stats @@ -409,19 +409,19 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]): if(bc.FEATURE_NC_RNA_REGION not in genome['features']): # check if ncRNA regions have been detected, otherwise skip analysis and return return no_revised - contigs = {c['id']: c for c in genome['contigs']} + sequences = {seq['id']: seq for seq in genome['sequences']} # detect splitted orphan ORFs of selenocystein proteins that are subject to stop codon recoding. - cdss_per_contigs = {k['id']: [] for k in genome['contigs']} # get CDS per contig + cdss_per_sequences = {k['id']: [] for k in genome['sequences']} # get CDS per sequence for cds in cdss: - cdss_per_contig = cdss_per_contigs[cds['contig']] + cdss_per_sequence = cdss_per_sequences[cds['sequence']] if('truncated' not in cds): # exclude truncated CDS for now - cdss_per_contig.append(cds) - cds_pairs_per_contig = {k['id']: [] for k in genome['contigs']} # extract inframe primate CDS neighbouring pairs - for id, cdss_per_contig in cdss_per_contigs.items(): - cdss_per_contig = sorted(cdss_per_contig, key=lambda k: k['start']) - for i in range(1, len(cdss_per_contig)): - cds_a = cdss_per_contig[i-1] - cds_b = cdss_per_contig[i] + cdss_per_sequence.append(cds) + cds_pairs_per_sequence = {k['id']: [] for k in genome['sequences']} # extract inframe primate CDS neighbouring pairs + for id, cdss_per_sequence in cdss_per_sequences.items(): + cdss_per_sequence = sorted(cdss_per_sequence, key=lambda k: k['start']) + for i in range(1, len(cdss_per_sequence)): + cds_a = cdss_per_sequence[i-1] + cds_b = cdss_per_sequence[i] strand = cds_a['strand'] upstream_stop_codon = cds_a['nt'][-3:] if strand == bc.STRAND_FORWARD else cds_b['nt'][-3:] if( @@ -429,27 +429,27 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]): cds_a['frame'] == cds_b['frame'] and # up- and downstream ORFs on the same frame upstream_stop_codon == 'TGA' and # tRNAScan-SE 2.0 only predicts tRNA-Sec with UCA anticodons, therefore we can only detect TGA stop codons (cds_b['start'] - cds_a['stop']) < 100): # up- and downstream ORFs in close proximity - cds_pairs = cds_pairs_per_contig[cds_a['contig']] + cds_pairs = cds_pairs_per_sequence[cds_a['sequence']] cds_pairs.append((cds_a, cds_b)) recoding_regions = [ncrna_region for ncrna_region in genome['features'][bc.FEATURE_NC_RNA_REGION] if ncrna_region['class'] == so.SO_CIS_REG_RECODING_STIMULATION_REGION] # Selenocysteine insertion sequences for recoding_region in recoding_regions: if('selenocysteine' in recoding_region.get('product', '').lower()): - cds_pairs = cds_pairs_per_contig[recoding_region['contig']] + cds_pairs = cds_pairs_per_sequence[recoding_region['sequence']] for cds_a, cds_b in cds_pairs: # find CDS pair around recoding region strand = cds_a['strand'] if( strand == recoding_region['strand'] and # everything is on the same strand cds_a['start'] < recoding_region['start'] and recoding_region['stop'] < cds_b['stop']): # recoding region lies between up- and downstream ORFs log.debug( - 'selenocysteine recoding ncRNA/CDS pair detected: contig=%s, strand=%s, CDS-A=[%i...%i] (%s..%s), recoding-ie=[%i..%i], CDS-B=[%i...%i] (%s..%s)', - recoding_region['contig'], recoding_region['strand'], cds_a['start'], cds_a['stop'], cds_a['nt'][:10], cds_a['nt'][-10:], recoding_region['start'], recoding_region['stop'], cds_b['start'], cds_b['stop'], cds_b['nt'][:10], cds_b['nt'][-10:] + 'selenocysteine recoding ncRNA/CDS pair detected: seq=%s, strand=%s, CDS-A=[%i...%i] (%s..%s), recoding-ie=[%i..%i], CDS-B=[%i...%i] (%s..%s)', + recoding_region['sequence'], recoding_region['strand'], cds_a['start'], cds_a['stop'], cds_a['nt'][:10], cds_a['nt'][-10:], recoding_region['start'], recoding_region['stop'], cds_b['start'], cds_b['stop'], cds_b['nt'][:10], cds_b['nt'][-10:] ) seleno_cds = copy.deepcopy(cds_a) seleno_cds['stop'] = cds_b['stop'] seleno_cds['rbs_motif'] = cds_a['rbs_motif'] if strand == bc.STRAND_FORWARD else cds_b['rbs_motif'] - contig = contigs[seleno_cds['contig']] - nt = bu.extract_feature_sequence(seleno_cds, contig) + seq = sequences[seleno_cds['sequence']] + nt = bu.extract_feature_sequence(seleno_cds, seq) seleno_cds['nt'] = nt aa = str(Seq(nt).translate(table=cfg.translation_table, stop_symbol='*', to_stop=False, cds=False)) if( @@ -470,8 +470,8 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]): } cdss.append(seleno_cds) log.info( - 'selenocysteine CDS detected: contig=%s, start=%i, stop=%i, strand=%s, frame=%i, exception=[%i..%i], nt=[%s..%s], aa=[%s..%s], aa-hexdigest=%s', - seleno_cds['contig'], seleno_cds['start'], seleno_cds['stop'], seleno_cds['strand'], seleno_cds['frame'], seleno_cds['exception']['start'], seleno_cds['exception']['stop'], nt[:10], nt[-10:], aa[:10], aa[-10:], seleno_cds['aa_hexdigest'] + 'selenocysteine CDS detected: seq=%s, start=%i, stop=%i, strand=%s, frame=%i, exception=[%i..%i], nt=[%s..%s], aa=[%s..%s], aa-hexdigest=%s', + seleno_cds['sequence'], seleno_cds['start'], seleno_cds['stop'], seleno_cds['strand'], seleno_cds['frame'], seleno_cds['exception']['start'], seleno_cds['exception']['stop'], nt[:10], nt[-10:], aa[:10], aa[-10:], seleno_cds['aa_hexdigest'] ) discard = { # mark CDS a/b as discarded 'type': bc.DISCARD_TYPE_RECODING, @@ -482,8 +482,8 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]): no_revised += 1 else: log.warning( - 'spurious selenocysteine CDS detected: contig=%s, start=%i, stop=%i, strand=%s, frame=%i, nt=[%s], aa=[%s]', - seleno_cds['contig'], seleno_cds['start'], seleno_cds['stop'], seleno_cds['strand'], seleno_cds['frame'], nt, aa + 'spurious selenocysteine CDS detected: seq=%s, start=%i, stop=%i, strand=%s, frame=%i, nt=[%s], aa=[%s]', + seleno_cds['sequence'], seleno_cds['start'], seleno_cds['stop'], seleno_cds['strand'], seleno_cds['frame'], nt, aa ) return no_revised @@ -494,13 +494,13 @@ def revise_special_cases_annotated(genome: dict, cdss: Sequence[dict]): which often appear on re-annotated genomes. """ - contigs = {c['id']: c for c in genome['contigs']} + sequences = {seq['id']: seq for seq in genome['sequences']} # look for supposedly truncated dnaA genes on rotated chromosome starts: start=1, strand=+ dnaA = None for cds in cdss: - contig = contigs[cds['contig']] + seq = sequences[cds['sequence']] if( - contig['complete'] and + seq['complete'] and cds['start'] == 1 and cds['strand'] == bc.STRAND_FORWARD and cds['start_type'] == 'Edge' and @@ -512,16 +512,16 @@ def revise_special_cases_annotated(genome: dict, cdss: Sequence[dict]): dnaA.pop('truncated') gene = dnaA.get('gene', '-') log.info( - 'revise supposedly truncated dnaA gene on rotated chromosome start: contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s], aa=[%s..%s]', - dnaA['contig'], dnaA['start'], dnaA['stop'], dnaA['strand'], gene, dnaA['product'], dnaA['nt'][:10], dnaA['nt'][-10:], dnaA['aa'][:10], dnaA['aa'][-10:] + 'revise supposedly truncated dnaA gene on rotated chromosome start: seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s], aa=[%s..%s]', + dnaA['sequence'], dnaA['start'], dnaA['stop'], dnaA['strand'], gene, dnaA['product'], dnaA['nt'][:10], dnaA['nt'][-10:], dnaA['aa'][:10], dnaA['aa'][-10:] ) # look for supposedly truncated repA genes on rotated plasmid starts: start=1, strand=+ repAs = [] for cds in cdss: - contig = contigs[cds['contig']] + seq = sequences[cds['sequence']] if( - contig['complete'] and + seq['complete'] and cds['start'] == 1 and cds['strand'] == bc.STRAND_FORWARD and cds['start_type'] == 'Edge' and @@ -533,8 +533,8 @@ def revise_special_cases_annotated(genome: dict, cdss: Sequence[dict]): repA.pop('truncated') gene = repA.get('gene', '-') log.info( - 'revise supposedly truncated repA gene on rotated plasmid start: contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s], aa=[%s..%s]', - repA['contig'], repA['start'], repA['stop'], repA['strand'], gene, repA['product'], repA['nt'][:10], repA['nt'][-10:], repA['aa'][:10], repA['aa'][-10:] + 'revise supposedly truncated repA gene on rotated plasmid start: seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s], aa=[%s..%s]', + repA['sequence'], repA['start'], repA['stop'], repA['strand'], gene, repA['product'], repA['nt'][:10], repA['nt'][-10:], repA['aa'][:10], repA['aa'][-10:] ) @@ -604,8 +604,8 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]: } pseudo_candidates.append(cds) log.debug( - 'pseudogene-candidate: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id + 'pseudogene-candidate: seq=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id ) log.info('found: pseudogene-candidates=%i', len(pseudo_candidates)) return pseudo_candidates @@ -627,13 +627,13 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome: fh.write(f">{cluster_id}\n{faa_seq}\n") # Get extended cds sequences - contigs = {c['id']: c for c in genome['contigs']} + sequences = {seq['id']: seq for seq in genome['sequences']} candidates_extended_positions = {} with candidates_elongated_sequences_path.open(mode='w') as fh: for cds in candidates: - contig = contigs[cds['contig']] - cds_elongated = get_elongated_cds(cds, contig) - seq = bu.extract_feature_sequence(cds_elongated, contig) + seq = sequences[cds['sequence']] + cds_elongated = get_elongated_cds(cds, seq) + seq = bu.extract_feature_sequence(cds_elongated, seq) orf_key = orf.get_orf_key(cds) fh.write(f">{orf_key}\n{seq}\n") candidates_extended_positions[orf_key] = cds_elongated @@ -700,8 +700,8 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome: if alignment_length == len(cds['aa']): # skip non-extended genes (full match) log.debug( - 'no pseudogene (full match): contig=%s, start=%i, stop=%i, strand=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'] + 'no pseudogene (full match): seq=%s, start=%i, stop=%i, strand=%s', + cds['sequence'], cds['start'], cds['stop'], cds['strand'] ) continue @@ -763,8 +763,8 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome: cds.pop('hypothetical') pseudogenes.append(cds) log.info( - 'pseudogene: contig=%s, start=%i, stop=%i, strand=%s, insertions=%s, deletions=%s, mutations=%s, effect=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], observations.get(bc.PSEUDOGENE_CAUSE_INSERTION, []), observations.get(bc.PSEUDOGENE_CAUSE_DELETION, []), observations.get(bc.PSEUDOGENE_CAUSE_MUTATION, []), effects + 'pseudogene: seq=%s, start=%i, stop=%i, strand=%s, insertions=%s, deletions=%s, mutations=%s, effect=%s', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], observations.get(bc.PSEUDOGENE_CAUSE_INSERTION, []), observations.get(bc.PSEUDOGENE_CAUSE_DELETION, []), observations.get(bc.PSEUDOGENE_CAUSE_MUTATION, []), effects ) elif observations[bc.PSEUDOGENE_EXCEPTION_SELENOCYSTEINE] or observations[bc.PSEUDOGENE_EXCEPTION_PYROLYSINE]: @@ -777,7 +777,7 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome: return pseudogenes -def get_elongated_cds(cds: dict, contig: dict, offset: int = bc.PSEUDOGENE_OFFSET) -> Dict[str, Union[int, str, bool]]: +def get_elongated_cds(cds: dict, sequence: dict, offset: int = bc.PSEUDOGENE_OFFSET) -> Dict[str, Union[int, str, bool]]: """ Elongate the given CDS sequence with the offset in upstream and downstream direction, if possible. """ @@ -790,9 +790,9 @@ def get_elongated_cds(cds: dict, contig: dict, offset: int = bc.PSEUDOGENE_OFFSE 'elongation_downstream': offset } - contig_length = len(contig['sequence']) - if contig['topology'] == 'circular' and elongated_cds['start'] - offset < 0: - elongated_cds['start'] = contig_length + elongated_cds['start'] - offset + sequence_length = len(sequence['sequence']) + if sequence['topology'] == 'circular' and elongated_cds['start'] - offset < 0: + elongated_cds['start'] = sequence_length + elongated_cds['start'] - offset elongated_cds['edge'] = True elif elongated_cds['start'] - offset < 0: elongated_cds['start'] = 1 @@ -800,12 +800,12 @@ def get_elongated_cds(cds: dict, contig: dict, offset: int = bc.PSEUDOGENE_OFFSE else: elongated_cds['start'] = elongated_cds['start'] - offset - if contig['topology'] == 'circular' and elongated_cds['stop'] + offset > contig_length: - elongated_cds['stop'] = elongated_cds['stop'] + offset - contig_length + if sequence['topology'] == 'circular' and elongated_cds['stop'] + offset > sequence_length: + elongated_cds['stop'] = elongated_cds['stop'] + offset - sequence_length elongated_cds['edge'] = True - elif elongated_cds['stop'] + offset > contig_length: - elongated_cds['stop'] = contig_length - elongated_cds['elongation_downstream'] = contig_length - cds['stop'] + elif elongated_cds['stop'] + offset > sequence_length: + elongated_cds['stop'] = sequence_length + elongated_cds['elongation_downstream'] = sequence_length - cds['stop'] else: elongated_cds['stop'] = elongated_cds['stop'] + offset @@ -889,8 +889,8 @@ def compare_alignments(observations: dict, alignment: str, ref_alignment: str, c observations[bc.PSEUDOGENE_EFFECT_START].add(genome_position) observations['directions'].add(bc.FEATURE_END_3_PRIME) log.info( - 'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, original start=%i', - cds['contig'], cds['start'], cds['stop'], cds['strand'], cds['start'] + genome_position + 'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, original start=%i', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], cds['start'] + genome_position ) else: # RBS was predicted (protein iso-form) -> skip pass @@ -911,8 +911,8 @@ def compare_alignments(observations: dict, alignment: str, ref_alignment: str, c observations[bc.PSEUDOGENE_CAUSE_INSERTION].add(genome_position) observations['directions'].add(get_direction(alignment_position, edge)) log.info( - 'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, cause=insertion, position=%i', - cds['contig'], cds['start'], cds['stop'], cds['strand'], genome_position + 'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, cause=insertion, position=%i', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], genome_position ) alignment_position += 1 elif char == '/': # deletion @@ -921,23 +921,23 @@ def compare_alignments(observations: dict, alignment: str, ref_alignment: str, c observations[bc.PSEUDOGENE_CAUSE_DELETION].add(genome_position) observations['directions'].add(get_direction(alignment_position, edge)) log.info( - 'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, cause=deletion, position=%i', - cds['contig'], cds['start'], cds['stop'], cds['strand'], genome_position + 'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, cause=deletion, position=%i', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], genome_position ) elif char == '*': # stop codon, selenocysteine, pyrolysine if ref_char == 'U': # selenocysteine genome_position = get_abs_position(cds, start, alignment_position, edge) observations[bc.PSEUDOGENE_EXCEPTION_SELENOCYSTEINE].add(genome_position) log.info( - 'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, exception=selenocysteine, position=%i', - cds['contig'], cds['start'], cds['stop'], cds['strand'], genome_position + 'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, exception=selenocysteine, position=%i', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], genome_position ) elif ref_char == 'O': # pyrolysine genome_position = get_abs_position(cds, start, alignment_position, edge) observations[bc.PSEUDOGENE_EXCEPTION_PYROLYSINE].add(genome_position) log.info( - 'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, exception=pyrolysin, position=%i', - cds['contig'], cds['start'], cds['stop'], cds['strand'], genome_position + 'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, exception=pyrolysin, position=%i', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], genome_position ) else: # stop codon mutation = '' @@ -948,8 +948,8 @@ def compare_alignments(observations: dict, alignment: str, ref_alignment: str, c observations[bc.PSEUDOGENE_EFFECT_STOP].add(genome_position) observations['directions'].add(get_direction(alignment_position, edge)) log.info( - 'pseudogene observation: contig=%s, start=%i, stop=%i, strand=%s, effect=stop%s, position=%i', - cds['contig'], cds['start'], cds['stop'], cds['strand'], mutation, genome_position + 'pseudogene observation: seq=%s, start=%i, stop=%i, strand=%s, effect=stop%s, position=%i', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], mutation, genome_position ) alignment_position += 3 else: diff --git a/bakta/features/crispr.py b/bakta/features/crispr.py index b15818f2..02b41c20 100644 --- a/bakta/features/crispr.py +++ b/bakta/features/crispr.py @@ -17,13 +17,13 @@ log = logging.getLogger('CRISPR') -def predict_crispr(genome: dict, contigs_path: Path): +def predict_crispr(genome: dict, sequences_path: Path): """Predict CRISPR arrays with PILER-CR.""" output_path = cfg.tmp_path.joinpath('crispr.txt') cmd = [ 'pilercr', - '-in', str(contigs_path), + '-in', str(sequences_path), '-out', str(output_path), '-noinfo', # omit help in output '-quiet' # silent mode @@ -44,10 +44,10 @@ def predict_crispr(genome: dict, contigs_path: Path): # parse crispr arrays crispr_arrays = {} - contigs = {c['id']: c for c in genome['contigs']} + sequences = {seq['id']: seq for seq in genome['sequences']} with output_path.open() as fh: output_section = None - contig_id = None + sequence_id = None array_id = None skip_lines = True crispr_array = None @@ -77,8 +77,8 @@ def predict_crispr(genome: dict, contigs_path: Path): crispr_array['spacers'] = [] crispr_arrays[array_id] = crispr_array elif(line[0] == '>'): - contig_id = line[1:] - crispr_array['contig'] = contig_id + sequence_id = line[1:] + crispr_array['sequence'] = sequence_id elif(line[0] != '='): m = RE_CRISPR.fullmatch(line) if(m is not None): @@ -102,20 +102,20 @@ def predict_crispr(genome: dict, contigs_path: Path): crispr_spacer['stop'] = position + repeat_length + spacer_length - 1 - gap_count crispr_spacer['sequence'] = spacer_seq crispr_array['spacers'].append(crispr_spacer) - spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, contigs[contig_id]) + spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, sequences[sequence_id]) log.debug('spacer: array-id=%s, start=%i, stop=%i, genome-seq=%s, spacer-seq=%s', array_id, crispr_spacer['start'], crispr_spacer['stop'], spacer_genome_seq, spacer_seq) assert spacer_seq == spacer_genome_seq # assure PILER-CR provided sequence equals sequence extracted from genome elif(output_section == 'POSITION'): if(line[0] == '>'): - contig_id = line[1:] + sequence_id = line[1:] elif(line[0] != 'A' and line[0] != '='): cols = line.split() if(len(cols) == 8): - (array_id, contig, position, length, copies, repeat_length, spacer_length, repeat_consensus) = cols + (array_id, sequence, position, length, copies, repeat_length, spacer_length, repeat_consensus) = cols else: - (array_id, contig, position, length, copies, repeat_length, spacer_length, distance, repeat_consensus) = cols + (array_id, sequence, position, length, copies, repeat_length, spacer_length, distance, repeat_consensus) = cols crispr_array = crispr_arrays[array_id] - positions = [c['start'] for c in crispr_array['spacers']] + [c['stop'] for c in crispr_array['spacers']] + [c['start'] for c in crispr_array['repeats']] + [c['stop'] for c in crispr_array['repeats']] + positions = [seq['start'] for seq in crispr_array['spacers']] + [seq['stop'] for seq in crispr_array['spacers']] + [seq['start'] for seq in crispr_array['repeats']] + [seq['stop'] for seq in crispr_array['repeats']] crispr_array['start'] = min(positions) crispr_array['stop'] = max(positions) crispr_array['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}' @@ -125,11 +125,11 @@ def predict_crispr(genome: dict, contigs_path: Path): crispr_array['repeat_consensus'] = repeat_consensus crispr_array['db_xrefs'] = [so.SO_CRISPR.id] - nt = bu.extract_feature_sequence(crispr_array, contigs[contig_id]) # extract nt sequences + nt = bu.extract_feature_sequence(crispr_array, sequences[sequence_id]) # extract nt sequences crispr_array['nt'] = nt log.info( - 'contig=%s, start=%i, stop=%i, spacer-length=%i, repeat-length=%i, # repeats=%i, repeat-consensus=%s, nt=[%s..%s]', - crispr_array['contig'], crispr_array['start'], crispr_array['stop'], crispr_array['spacer_length'], crispr_array['repeat_length'], len(crispr_array['repeats']), crispr_array['repeat_consensus'], nt[:10], nt[-10:] + 'seq=%s, start=%i, stop=%i, spacer-length=%i, repeat-length=%i, # repeats=%i, repeat-consensus=%s, nt=[%s..%s]', + crispr_array['sequence'], crispr_array['start'], crispr_array['stop'], crispr_array['spacer_length'], crispr_array['repeat_length'], len(crispr_array['repeats']), crispr_array['repeat_consensus'], nt[:10], nt[-10:] ) crispr_arrays = crispr_arrays.values() log.info('predicted=%i', len(crispr_arrays)) diff --git a/bakta/features/gaps.py b/bakta/features/gaps.py index fb6342ff..5e052de6 100644 --- a/bakta/features/gaps.py +++ b/bakta/features/gaps.py @@ -13,14 +13,14 @@ def detect_assembly_gaps(genome: dict) -> Sequence[dict]: gaps = [] - for contig in genome['contigs']: - m = RE_ASSEMBLY_GAP.search(contig['sequence']) + for seq in genome['sequences']: + m = RE_ASSEMBLY_GAP.search(seq['sequence']) while m: start, end = m.span() gap = OrderedDict() gap['type'] = bc.FEATURE_GAP - gap['contig'] = contig['id'] + gap['sequence'] = seq['id'] gap['start'] = start + 1 gap['stop'] = end gap['strand'] = bc.STRAND_NA @@ -28,8 +28,8 @@ def detect_assembly_gaps(genome: dict) -> Sequence[dict]: gaps.append(gap) log.info( - 'contig=%s, start=%i, stop=%i, length=%s', - gap['contig'], gap['start'], gap['stop'], gap['length'] + 'seq=%s, start=%i, stop=%i, length=%s', + gap['sequence'], gap['start'], gap['stop'], gap['length'] ) - m = RE_ASSEMBLY_GAP.search(contig['sequence'], end + 1) + m = RE_ASSEMBLY_GAP.search(seq['sequence'], end + 1) return gaps diff --git a/bakta/features/nc_rna.py b/bakta/features/nc_rna.py index 207efa00..70050e09 100644 --- a/bakta/features/nc_rna.py +++ b/bakta/features/nc_rna.py @@ -17,7 +17,7 @@ log = logging.getLogger('NC_RNA') -def predict_nc_rnas(genome: dict, contigs_path: Path): +def predict_nc_rnas(genome: dict, sequences_path: Path): """Search for non-coding RNA genes.""" output_path = cfg.tmp_path.joinpath('ncrna-genes.tsv') @@ -35,7 +35,7 @@ def predict_nc_rnas(genome: dict, contigs_path: Path): cmd.append('-Z') cmd.append(str(2 * genome['size'] // 1000000)) cmd.append(str(cfg.db_path.joinpath('ncRNA-genes'))) - cmd.append(str(contigs_path)) + cmd.append(str(sequences_path)) log.debug('cmd=%s', cmd) proc = sp.run( cmd, @@ -61,12 +61,12 @@ def predict_nc_rnas(genome: dict, contigs_path: Path): rfam2go[rfam] = [go] ncrnas = [] - contigs = {c['id']: c for c in genome['contigs']} + sequences = {seq['id']: seq for seq in genome['sequences']} with output_path.open() as fh: for line in fh: if(line[0] != '#'): ( - subject, accession, contig_id, contig_acc, mdl, mdl_from, mdl_to, + subject, accession, sequence_id, sequence_acc, mdl, mdl_from, mdl_to, start, stop, strand, trunc, passed, gc, bias, score, evalue, inc, description ) = bc.RE_MULTIWHITESPACE.split(line.strip(), maxsplit=17) @@ -86,8 +86,8 @@ def predict_nc_rnas(genome: dict, contigs_path: Path): if(evalue > HIT_EVALUE): log.debug( - 'discard low E value: contig=%s, start=%i, stop=%i, strand=%s, gene=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e', - contig_id, start, stop, strand, subject, length, truncated, score, evalue + 'discard low E value: seq=%s, start=%i, stop=%i, strand=%s, gene=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e', + sequence_id, start, stop, strand, subject, length, truncated, score, evalue ) else: rfam_id = f'{bc.DB_XREF_RFAM}:{accession}' @@ -98,7 +98,7 @@ def predict_nc_rnas(genome: dict, contigs_path: Path): ncrna = OrderedDict() ncrna['type'] = bc.FEATURE_NC_RNA ncrna['class'] = determine_class(description) - ncrna['contig'] = contig_id + ncrna['sequence'] = sequence_id ncrna['start'] = start ncrna['stop'] = stop ncrna['strand'] = bc.STRAND_FORWARD if strand == '+' else bc.STRAND_REVERSE @@ -122,13 +122,13 @@ def predict_nc_rnas(genome: dict, contigs_path: Path): ncrna['evalue'] = evalue ncrna['db_xrefs'] = db_xrefs - nt = bu.extract_feature_sequence(ncrna, contigs[contig_id]) # extract nt sequences + nt = bu.extract_feature_sequence(ncrna, sequences[sequence_id]) # extract nt sequences ncrna['nt'] = nt ncrnas.append(ncrna) log.info( - 'contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e, nt=[%s..%s]', - ncrna['contig'], ncrna['start'], ncrna['stop'], ncrna['strand'], ncrna['gene'], ncrna['product'], length, truncated, ncrna['score'], ncrna['evalue'], nt[:10], nt[-10:] + 'seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e, nt=[%s..%s]', + ncrna['sequence'], ncrna['start'], ncrna['stop'], ncrna['strand'], ncrna['gene'], ncrna['product'], length, truncated, ncrna['score'], ncrna['evalue'], nt[:10], nt[-10:] ) log.info('predicted=%i', len(ncrnas)) return ncrnas diff --git a/bakta/features/nc_rna_region.py b/bakta/features/nc_rna_region.py index cdeaf797..b5e3500e 100644 --- a/bakta/features/nc_rna_region.py +++ b/bakta/features/nc_rna_region.py @@ -16,7 +16,7 @@ log = logging.getLogger('NC_RNA_REGION') -def predict_nc_rna_regions(genome: dict, contigs_path: Path): +def predict_nc_rna_regions(genome: dict, sequences_path: Path): """Search for non-coding RNA regions.""" output_path = cfg.tmp_path.joinpath('ncrna-regions.tsv') @@ -34,7 +34,7 @@ def predict_nc_rna_regions(genome: dict, contigs_path: Path): cmd.append('-Z') cmd.append(str(2 * genome['size'] // 1000000)) cmd.append(str(cfg.db_path.joinpath('ncRNA-regions'))) - cmd.append(str(contigs_path)) + cmd.append(str(sequences_path)) log.debug('cmd=%s', cmd) proc = sp.run( cmd, @@ -60,11 +60,11 @@ def predict_nc_rna_regions(genome: dict, contigs_path: Path): rfam2go[rfam] = [go] ncrnas = [] - contigs = {c['id']: c for c in genome['contigs']} + sequences = {seq['id']: seq for seq in genome['sequences']} with output_path.open() as fh: for line in fh: if(line[0] != '#'): - (subject, accession, contig_id, contig_acc, mdl, mdl_from, mdl_to, + (subject, accession, sequence_id, sequence_acc, mdl, mdl_from, mdl_to, start, stop, strand, trunc, passed, gc, bias, score, evalue, inc, description) = bc.RE_MULTIWHITESPACE.split(line.strip(), maxsplit=17) @@ -83,8 +83,8 @@ def predict_nc_rna_regions(genome: dict, contigs_path: Path): if(evalue > HIT_EVALUE): log.debug( - 'discard low E value: contig=%s, start=%i, stop=%i, strand=%s, gene=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e', - contig_id, start, stop, strand, subject, length, truncated, score, evalue + 'discard low E value: seq=%s, start=%i, stop=%i, strand=%s, gene=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e', + sequence_id, start, stop, strand, subject, length, truncated, score, evalue ) else: rfam_id = f'{bc.DB_XREF_RFAM}:{accession}' @@ -95,7 +95,7 @@ def predict_nc_rna_regions(genome: dict, contigs_path: Path): ncrna_region = OrderedDict() ncrna_region['type'] = bc.FEATURE_NC_RNA_REGION ncrna_region['class'] = determine_class(description) - ncrna_region['contig'] = contig_id + ncrna_region['sequence'] = sequence_id ncrna_region['start'] = start ncrna_region['stop'] = stop ncrna_region['strand'] = bc.STRAND_FORWARD if strand == '+' else bc.STRAND_REVERSE @@ -114,13 +114,13 @@ def predict_nc_rna_regions(genome: dict, contigs_path: Path): ncrna_region['evalue'] = evalue ncrna_region['db_xrefs'] = db_xrefs - nt = bu.extract_feature_sequence(ncrna_region, contigs[contig_id]) # extract nt sequences + nt = bu.extract_feature_sequence(ncrna_region, sequences[sequence_id]) # extract nt sequences ncrna_region['nt'] = nt ncrnas.append(ncrna_region) log.info( - 'contig=%s, start=%i, stop=%i, strand=%s, label=%s, product=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e', - ncrna_region['contig'], ncrna_region['start'], ncrna_region['stop'], ncrna_region['strand'], ncrna_region['label'], ncrna_region['product'], length, truncated, ncrna_region['score'], ncrna_region['evalue'] + 'seq=%s, start=%i, stop=%i, strand=%s, label=%s, product=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e', + ncrna_region['sequence'], ncrna_region['start'], ncrna_region['stop'], ncrna_region['strand'], ncrna_region['label'], ncrna_region['product'], length, truncated, ncrna_region['score'], ncrna_region['evalue'] ) log.info('predicted=%i', len(ncrnas)) return ncrnas diff --git a/bakta/features/orf.py b/bakta/features/orf.py index a58c72a6..61cf2790 100644 --- a/bakta/features/orf.py +++ b/bakta/features/orf.py @@ -25,8 +25,8 @@ def detect_spurious(orfs: Sequence[dict]): orf = orf_by_aa_digest[hit.name.decode()] if hit.evalue > bc.MIN_HMM_EVALUE: log.debug( - 'discard low spurious E value: contig=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f', - orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score + 'discard low spurious E value: seq=%s, start=%i, stop=%i, strand=%s, subject=%s, evalue=%1.1e, bitscore=%f', + orf['sequence'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score ) else: discard = OrderedDict() @@ -38,8 +38,8 @@ def detect_spurious(orfs: Sequence[dict]): orf['discarded'] = discard discarded_orfs.append(orf) log.info( - 'discard spurious: contig=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f', - orf['contig'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score + 'discard spurious: seq=%s, start=%i, stop=%i, strand=%s, homology=%s, evalue=%1.1e, bitscore=%f', + orf['sequence'], orf['start'], orf['stop'], orf['strand'], hit.best_domain.alignment.hmm_name.decode(), hit.evalue, hit.score ) log.info('discarded=%i', len(discarded_orfs)) return discarded_orfs @@ -47,7 +47,7 @@ def detect_spurious(orfs: Sequence[dict]): def get_orf_key(orf: dict) -> str: """Generate a standardized and unique ORF-like feature key for internal store/analyze/parse/retrieval cycles.""" - return f"{orf['aa_hexdigest']}-{orf['contig']}-{orf['start']}-{orf['stop']}-{orf['strand']}-{orf.get('source', 'internal')}" + return f"{orf['aa_hexdigest']}-{orf['sequence']}-{orf['start']}-{orf['stop']}-{orf['strand']}-{orf.get('source', 'internal')}" def get_orf_dictionary(orfs: Sequence[dict]) -> Dict[str, dict]: diff --git a/bakta/features/ori.py b/bakta/features/ori.py index 82281612..bc7c6e59 100644 --- a/bakta/features/ori.py +++ b/bakta/features/ori.py @@ -18,7 +18,7 @@ log = logging.getLogger('ORI') -def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[dict]: +def predict_oris(genome: dict, sequences_path: Path, ori_type: str) -> Sequence[dict]: """Search for oriT/C sequences.""" database = 'oric.fna' if ori_type == bc.FEATURE_ORIC else 'orit.fna' @@ -26,7 +26,7 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di cmd = [ 'blastn', '-query', str(cfg.db_path.joinpath(database)), - '-subject', str(contigs_path), + '-subject', str(sequences_path), '-culling_limit', '1', '-evalue', HIT_EVALUE, '-num_threads', str(cfg.threads), @@ -57,33 +57,33 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di 'ori_start': int(cols[1]), 'ori_end': int(cols[2]), 'ori_length': int(cols[3]), - 'contig': cols[4], - 'contig_start': int(cols[5]), - 'contig_stop': int(cols[6]), + 'sequence': cols[4], + 'sequence_start': int(cols[5]), + 'sequence_stop': int(cols[6]), 'strand': bc.STRAND_FORWARD if cols[9] == 'plus' else bc.STRAND_REVERSE, 'coverage': int(cols[7]) / int(cols[3]), 'identity': int(cols[8]) / int(cols[7]) } if(hit['strand'] == bc.STRAND_REVERSE): - hit['contig_start'], hit['contig_stop'] = hit['contig_stop'], hit['contig_start'] + hit['sequence_start'], hit['sequence_stop'] = hit['sequence_stop'], hit['sequence_start'] if(hit['coverage'] >= HIT_COVERAGE and hit['identity'] >= HIT_IDENTITY): - contig_hits = hits.get(hit['contig'], []) - contig_hits.append(hit) - if(len(contig_hits) == 1): - hits[hit['contig']] = contig_hits + sequence_hits = hits.get(hit['sequence'], []) + sequence_hits.append(hit) + if(len(sequence_hits) == 1): + hits[hit['sequence']] = sequence_hits log.debug( - 'raw hit: type=%s, contig=%s, start=%i, stop=%i, strand=%s, coverage=%0.3f, identity=%0.3f', - ori_type, hit['contig'], hit['contig_start'], hit['contig_stop'], hit['strand'], hit['coverage'], hit['identity'] + 'raw hit: type=%s, seq=%s, start=%i, stop=%i, strand=%s, coverage=%0.3f, identity=%0.3f', + ori_type, hit['sequence'], hit['sequence_start'], hit['sequence_stop'], hit['strand'], hit['coverage'], hit['identity'] ) # combine overlapping hits (simple 1D array peak detection) oris = [] - for contig in genome['contigs']: - contig_hits = hits.get(contig['id'], None) - if(contig_hits): - region_hits = [0] * (contig['length'] + 1) # init with extra leading slot (start at 1) - for hit in contig_hits: - for i in range(hit['contig_start'], hit['contig_stop'] + 1): + for seq in genome['sequences']: + sequence_hits = hits.get(seq['id'], None) + if(sequence_hits): + region_hits = [0] * (seq['length'] + 1) # init with extra leading slot (start at 1) + for hit in sequence_hits: + for i in range(hit['sequence_start'], hit['sequence_stop'] + 1): region_hits[i] += 1 start = -1 stop = -1 @@ -91,11 +91,11 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di if(hit_count == 0): if(start != -1): # new stop stop = i - 1 - if(ori_type == bc.FEATURE_ORIC and contig['type'] == bc.REPLICON_PLASMID): + if(ori_type == bc.FEATURE_ORIC and seq['type'] == bc.REPLICON_PLASMID): ori_type = bc.FEATURE_ORIV ori = OrderedDict() ori['type'] = ori_type - ori['contig'] = contig['id'] + ori['sequence'] = seq['id'] ori['start'] = start ori['stop'] = stop ori['strand'] = bc.STRAND_UNKNOWN @@ -109,12 +109,12 @@ def predict_oris(genome: dict, contigs_path: Path, ori_type: str) -> Sequence[di else: ori['product'] = 'origin of replication' - nt = bu.extract_feature_sequence(ori, contig) # extract nt sequences + nt = bu.extract_feature_sequence(ori, seq) # extract nt sequences ori['nt'] = nt log.info( - 'type=%s, contig=%s, start=%i, stop=%i, nt=[%s..%s]', - ori_type, ori['contig'], ori['start'], ori['stop'], nt[:10], nt[-10:] + 'type=%s, seq=%s, start=%i, stop=%i, nt=[%s..%s]', + ori_type, ori['sequence'], ori['start'], ori['stop'], nt[:10], nt[-10:] ) start = -1 stop = -1 diff --git a/bakta/features/r_rna.py b/bakta/features/r_rna.py index 4c23d72a..640ebfeb 100644 --- a/bakta/features/r_rna.py +++ b/bakta/features/r_rna.py @@ -17,7 +17,7 @@ log = logging.getLogger('R_RNA') -def predict_r_rnas(genome: dict, contigs_path: Path): +def predict_r_rnas(genome: dict, sequences_path: Path): """Search for ribosomal RNA sequences.""" output_path = cfg.tmp_path.joinpath('rrna.tsv') @@ -35,7 +35,7 @@ def predict_r_rnas(genome: dict, contigs_path: Path): cmd.append('-Z') cmd.append(str(2 * genome['size'] // 1000000)) cmd.append(str(cfg.db_path.joinpath('rRNA'))) - cmd.append(str(contigs_path)) + cmd.append(str(sequences_path)) log.debug('cmd=%s', cmd) proc = sp.run( cmd, @@ -51,12 +51,12 @@ def predict_r_rnas(genome: dict, contigs_path: Path): raise Exception(f'cmscan error! error code: {proc.returncode}') rrnas = [] - contigs = {c['id']: c for c in genome['contigs']} + sequences = {seq['id']: seq for seq in genome['sequences']} with output_path.open() as fh: for line in fh: if(line[0] != '#'): ( - subject, accession, contig_id, contig_acc, mdl, mdl_from, mdl_to, + subject, accession, sequence_id, sequence_acc, mdl, mdl_from, mdl_to, start, stop, strand, trunc, passed, gc, bias, score, evalue, inc, description ) = bc.RE_MULTIWHITESPACE.split(line.strip(), maxsplit=17) @@ -89,8 +89,8 @@ def predict_r_rnas(genome: dict, contigs_path: Path): consensus_length = 2925 else: log.warning( - 'unknown rRNA detected! accession=%s, contig=%s, start=%i, stop=%i, strand=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e', - accession, contig_id, start, stop, strand, length, truncated, score, evalue + 'unknown rRNA detected! accession=%s, seq=%s, start=%i, stop=%i, strand=%s, length=%i, truncated=%s, score=%1.1f, evalue=%1.1e', + accession, sequence_id, start, stop, strand, length, truncated, score, evalue ) continue @@ -100,13 +100,13 @@ def predict_r_rnas(genome: dict, contigs_path: Path): if(coverage < HIT_COVERAGE): log.debug( - 'discard low coverage: contig=%s, rRNA=%s, start=%i, stop=%i, strand=%s, length=%i, coverage=%0.3f, truncated=%s, score=%1.1f, evalue=%1.1e', - contig_id, rrna_tag, start, stop, strand, length, coverage, truncated, score, evalue + 'discard low coverage: seq=%s, rRNA=%s, start=%i, stop=%i, strand=%s, length=%i, coverage=%0.3f, truncated=%s, score=%1.1f, evalue=%1.1e', + sequence_id, rrna_tag, start, stop, strand, length, coverage, truncated, score, evalue ) else: rrna = OrderedDict() rrna['type'] = bc.FEATURE_R_RNA - rrna['contig'] = contig_id + rrna['sequence'] = sequence_id rrna['start'] = start rrna['stop'] = stop rrna['strand'] = bc.STRAND_FORWARD if strand == '+' else bc.STRAND_REVERSE @@ -126,13 +126,13 @@ def predict_r_rnas(genome: dict, contigs_path: Path): rrna['evalue'] = evalue rrna['db_xrefs'] = db_xrefs - nt = bu.extract_feature_sequence(rrna, contigs[contig_id]) # extract nt sequences + nt = bu.extract_feature_sequence(rrna, sequences[sequence_id]) # extract nt sequences rrna['nt'] = nt rrnas.append(rrna) log.info( - 'contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, length=%i, coverage=%0.3f, truncated=%s, score=%1.1f, evalue=%1.1e, nt=[%s..%s]', - rrna['contig'], rrna['start'], rrna['stop'], rrna['strand'], rrna['gene'], rrna['product'], length, coverage, truncated, score, evalue, nt[:10], nt[-10:] + 'seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, length=%i, coverage=%0.3f, truncated=%s, score=%1.1f, evalue=%1.1e, nt=[%s..%s]', + rrna['sequence'], rrna['start'], rrna['stop'], rrna['strand'], rrna['gene'], rrna['product'], length, coverage, truncated, score, evalue, nt[:10], nt[-10:] ) log.info('predicted=%i', len(rrnas)) diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py index 74124c32..500d52ef 100644 --- a/bakta/features/s_orf.py +++ b/bakta/features/s_orf.py @@ -23,11 +23,11 @@ def extract(genome: dict): """Predict open reading frames in mem via BioPython.""" orfs = [] - for contig in genome['contigs']: - dna_seq = Seq(contig['sequence']) - for strand, seq in [(bc.STRAND_FORWARD, dna_seq), (bc.STRAND_REVERSE, dna_seq.reverse_complement())]: # strands +/- + for seq in genome['sequences']: + nt_seq = Seq(seq['sequence']) + for strand, strand_nt_seq in [(bc.STRAND_FORWARD, nt_seq), (bc.STRAND_REVERSE, nt_seq.reverse_complement())]: # strands +/- for frame in range(3): # frames 1/2/3 -> 0, 1, 2 - seq_frame = seq[frame:] + seq_frame = strand_nt_seq[frame:] # remove non-triplet tail nucleotides residue = len(seq_frame) % 3 @@ -46,12 +46,12 @@ def extract(genome: dict): dna_start = aa_start * 3 + frame + 1 # +1: 0 based idx to 1 based dna_stop = aa_end * 3 + 2 + frame + 1 else: - dna_start = len(seq) - frame - (aa_end + 1) * 3 + 1 - dna_stop = len(seq) - frame - aa_start * 3 + dna_start = len(strand_nt_seq) - frame - (aa_end + 1) * 3 + 1 + dna_stop = len(strand_nt_seq) - frame - aa_start * 3 sorf = OrderedDict() sorf['type'] = bc.FEATURE_SORF - sorf['contig'] = contig['id'] + sorf['sequence'] = seq['id'] sorf['start'] = dna_start sorf['stop'] = dna_stop sorf['strand'] = strand @@ -63,13 +63,13 @@ def extract(genome: dict): sorf['aa_digest'] = aa_digest sorf['aa_hexdigest'] = aa_hexdigest - nt = bu.extract_feature_sequence(sorf, contig) # extract nt sequences + nt = bu.extract_feature_sequence(sorf, seq) # extract nt sequences sorf['nt'] = nt orfs.append(sorf) log.debug( - 'contig=%s, start=%i, stop=%i, strand=%s, frame=%i, aa-length=%i, aa=%s, nt=[%s..%s]', - contig['id'], sorf['start'], sorf['stop'], strand, frame, len(aa), aa, nt[:10], nt[-10:] + 'seq=%s, start=%i, stop=%i, strand=%s, frame=%i, aa-length=%i, aa=%s, nt=[%s..%s]', + seq['id'], sorf['start'], sorf['stop'], strand, frame, len(aa), aa, nt[:10], nt[-10:] ) aa_start = aa_seq.find('M', aa_start + 1) if(aa_start > aa_end): @@ -89,68 +89,68 @@ def get_feature_stop(feature: dict) -> int: def overlap_filter(genome: dict, orfs_raw: Sequence[dict]): """Filter in-mem ORFs by overlapping CDSs.""" - t_rnas_per_contig = {k['id']: [] for k in genome['contigs']} + t_rnas_per_sequence = {seq['id']: [] for seq in genome['sequences']} for t_rna in genome['features'].get(bc.FEATURE_T_RNA, []): - t_rnas = t_rnas_per_contig[t_rna['contig']] + t_rnas = t_rnas_per_sequence[t_rna['sequence']] t_rnas.append(t_rna) for tm_rna in genome['features'].get(bc.FEATURE_TM_RNA, []): - t_rnas = t_rnas_per_contig[tm_rna['contig']] + t_rnas = t_rnas_per_sequence[tm_rna['sequence']] t_rnas.append(tm_rna) - r_rna_per_contig = {k['id']: [] for k in genome['contigs']} + r_rna_per_sequence = {seq['id']: [] for seq in genome['sequences']} for r_rna in genome['features'].get(bc.FEATURE_R_RNA, []): - r_rnas = r_rna_per_contig[r_rna['contig']] + r_rnas = r_rna_per_sequence[r_rna['sequence']] r_rnas.append(r_rna) - # nc_rnas_per_contig = {k['id']: [] for k in genome['contigs']} + # nc_rnas_per_sequence = {k['id']: [] for k in genome['sequences']} # for nc_rna in genome['features'].get(bc.FEATURE_NC_RNA, []): - # nc_rnas = nc_rnas_per_contig[nc_rna['contig']] + # nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']] # nc_rnas.append(nc_rna) # for nc_rna in genome['features'].get(bc.FEATURE_NC_RNA_REGION, []): - # nc_rnas = nc_rnas_per_contig[nc_rna['contig']] + # nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']] # nc_rnas.append(nc_rna) - crispr_arrays_per_contig = {k['id']: [] for k in genome['contigs']} + crispr_arrays_per_sequence = {seq['id']: [] for seq in genome['sequences']} for crispr_array in genome['features'].get(bc.FEATURE_CRISPR, []): - crispr_arrays = crispr_arrays_per_contig[crispr_array['contig']] + crispr_arrays = crispr_arrays_per_sequence[crispr_array['sequence']] crispr_arrays.append(crispr_array) - cdss_per_contig = {k['id']: [] for k in genome['contigs']} + cdss_per_sequence = {k['id']: [] for k in genome['sequences']} for cds in genome['features'].get(bc.FEATURE_CDS, []): - cdss = cdss_per_contig[cds['contig']] + cdss = cdss_per_sequence[cds['sequence']] cdss.append(cds) - sorfs_per_contig = {k['id']: [] for k in genome['contigs']} + sorfs_per_sequence = {seq['id']: [] for seq in genome['sequences']} for sorf in orfs_raw: - orfs = sorfs_per_contig[sorf['contig']] + orfs = sorfs_per_sequence[sorf['sequence']] orfs.append(sorf) discarded_sorf_keys = set() with cf.ProcessPoolExecutor(max_workers=cfg.threads) as tpe: futures = [] - for contig in genome['contigs']: - contig_sorfs = sorfs_per_contig[contig['id']] - log.debug('filter: contig=%s, # sORFs=%i', contig['id'], len(contig_sorfs)) - if(len(contig_sorfs) < 100): # execute sORF filter task - sorf_keys = filter_sorf(contig_sorfs, cdss_per_contig[contig['id']], r_rna_per_contig[contig['id']], t_rnas_per_contig[contig['id']], crispr_arrays_per_contig[contig['id']]) + for seq in genome['sequences']: + sequence_sorfs = sorfs_per_sequence[seq['id']] + log.debug('filter: seq=%s, # sORFs=%i', seq['id'], len(sequence_sorfs)) + if(len(sequence_sorfs) < 100): # execute sORF filter task + sorf_keys = filter_sorf(sequence_sorfs, cdss_per_sequence[seq['id']], r_rna_per_sequence[seq['id']], t_rnas_per_sequence[seq['id']], crispr_arrays_per_sequence[seq['id']]) for sorf_key in [sk for sk in sorf_keys if sk is not None]: discarded_sorf_keys.add(sorf_key) - elif(len(contig_sorfs) < 1000): # submit sORF filter task to thread pool - futures.append(tpe.submit(filter_sorf, contig_sorfs, cdss_per_contig[contig['id']], r_rna_per_contig[contig['id']], t_rnas_per_contig[contig['id']], crispr_arrays_per_contig[contig['id']])) + elif(len(sequence_sorfs) < 1000): # submit sORF filter task to thread pool + futures.append(tpe.submit(filter_sorf, sequence_sorfs, cdss_per_sequence[seq['id']], r_rna_per_sequence[seq['id']], t_rnas_per_sequence[seq['id']], crispr_arrays_per_sequence[seq['id']])) else: # submit sORF chunk filter tasks to thread pool - chunk_size = math.ceil(len(contig_sorfs) / cfg.threads) if (len(contig_sorfs) >= cfg.threads * 1000) else 1000 + chunk_size = math.ceil(len(sequence_sorfs) / cfg.threads) if (len(sequence_sorfs) >= cfg.threads * 1000) else 1000 log.debug('filter: chunk-size=%i', chunk_size) - for i in range(0, len(contig_sorfs), chunk_size): - sorf_chunk = contig_sorfs[i:i + chunk_size] + for i in range(0, len(sequence_sorfs), chunk_size): + sorf_chunk = sequence_sorfs[i:i + chunk_size] log.debug('filter chunk: i=%i, chunk-elements=%i', i, len(sorf_chunk)) - futures.append(tpe.submit(filter_sorf, sorf_chunk, cdss_per_contig[contig['id']], r_rna_per_contig[contig['id']], t_rnas_per_contig[contig['id']], crispr_arrays_per_contig[contig['id']])) + futures.append(tpe.submit(filter_sorf, sorf_chunk, cdss_per_sequence[seq['id']], r_rna_per_sequence[seq['id']], t_rnas_per_sequence[seq['id']], crispr_arrays_per_sequence[seq['id']])) for f in futures: for sorf_key in [sk for sk in f.result() if sk is not None]: discarded_sorf_keys.add(sorf_key) valid_sorfs = [] discarded_sorfs = [] - for sorfs in sorfs_per_contig.values(): + for sorfs in sorfs_per_sequence.values(): for sorf in sorfs: key = orf.get_orf_key(sorf) if(key in discarded_sorf_keys): @@ -162,12 +162,12 @@ def overlap_filter(genome: dict, orfs_raw: Sequence[dict]): return valid_sorfs, discarded_sorfs -def filter_sorf(sorf_chunk: Sequence[dict], contig_cdss: Sequence[dict], contig_r_rnas: Sequence[dict], contig_t_rnas: Sequence[dict], contig_crispr_arrays: Sequence[dict]): +def filter_sorf(sorf_chunk: Sequence[dict], sequence_cdss: Sequence[dict], sequence_r_rnas: Sequence[dict], sequence_t_rnas: Sequence[dict], sequence_crispr_arrays: Sequence[dict]): discarded_sorf_keys = [] for sorf in sorf_chunk: break_flag = False # filter CDS overlapping ORFs - for cds in contig_cdss: + for cds in sequence_cdss: # log.debug('filter short ORFs by CDS: %s%i[%i->%i]', cds['strand'], cds['frame'], cds['start'], cds['stop']) if(sorf['strand'] == cds['strand']): if(sorf['frame'] == cds['frame']): @@ -218,7 +218,7 @@ def filter_sorf(sorf_chunk: Sequence[dict], contig_cdss: Sequence[dict], contig_ continue # filter rRNA overlapping ORFs - for r_rna in contig_r_rnas: + for r_rna in sequence_r_rnas: # log.debug('filter short ORFs by rRNA: %s[%i->%i]', r_rna['strand'], r_rna['start'], r_rna['stop']) # fast/simple overlap detection for rRNAs if(sorf['stop'] < r_rna['start'] or sorf['start'] > r_rna['stop']): @@ -232,7 +232,7 @@ def filter_sorf(sorf_chunk: Sequence[dict], contig_cdss: Sequence[dict], contig_ # filter tRNA overlapping ORFs # log.debug('filter short ORFs by tRNA: %s[%i->%i]', t_rna['strand'], t_rna['start'], t_rna['stop']) - for t_rna in contig_t_rnas: + for t_rna in sequence_t_rnas: # fast/simple overlap detection for tRNAs if(sorf['stop'] < t_rna['start'] or sorf['start'] > t_rna['stop']): continue @@ -245,7 +245,7 @@ def filter_sorf(sorf_chunk: Sequence[dict], contig_cdss: Sequence[dict], contig_ # filter CRISPR array overlapping ORFs # log.debug('filter short ORFs by CRISPR: [%i->%i]', crispr['start'], crispr['stop']) - for crispr in contig_crispr_arrays: + for crispr in sequence_crispr_arrays: # fast/simple overlap detection for CRISPR if(sorf['stop'] < crispr['start'] or sorf['start'] > crispr['stop']): continue @@ -365,8 +365,8 @@ def search(sorfs: Sequence[dict], cluster_type: str): result[psc.DB_PSC_COL_UNIREF90 if cluster_type == 'full' else pscc.DB_PSCC_COL_UNIREF50] = cluster_id sorf['psc' if cluster_type == 'full' else 'pscc'] = result log.info( - 'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef=%s', - sorf['contig'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id + 'homology: seq=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef=%s', + sorf['sequence'], sorf['start'], sorf['stop'], sorf['strand'], len(sorf['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id ) sorfs_found = [] diff --git a/bakta/features/signal_peptides.py b/bakta/features/signal_peptides.py index f1197f2a..4657f730 100644 --- a/bakta/features/signal_peptides.py +++ b/bakta/features/signal_peptides.py @@ -62,8 +62,8 @@ def search(orfs: Sequence[dict], orf_aa_path: Path): orf[bc.FEATURE_SIGNAL_PEPTIDE] = {} orf[bc.FEATURE_SIGNAL_PEPTIDE] = sig_pep log.debug( - 'hit: contig=%s, nt-start=%i, nt-stop=%i, aa-start=%i, aa-stop=%i, score=%0.2f', - orf['contig'], start_nt, stop_nt, start_aa, stop_aa, score + 'hit: seq=%s, nt-start=%i, nt-stop=%i, aa-start=%i, aa-stop=%i, score=%0.2f', + orf['sequence'], start_nt, stop_nt, start_aa, stop_aa, score ) sig_peps.append(sig_pep) else: diff --git a/bakta/features/t_rna.py b/bakta/features/t_rna.py index efa4a39b..901d3d9d 100644 --- a/bakta/features/t_rna.py +++ b/bakta/features/t_rna.py @@ -42,7 +42,7 @@ } -def predict_t_rnas(genome: dict, contigs_path: Path): +def predict_t_rnas(genome: dict, sequences_path: Path): """Search for tRNA sequences.""" txt_output_path = cfg.tmp_path.joinpath('trna.tsv') @@ -53,7 +53,7 @@ def predict_t_rnas(genome: dict, contigs_path: Path): '--output', str(txt_output_path), '--fasta', str(fasta_output_path), '--thread', str(cfg.threads), - str(contigs_path) + str(sequences_path) ] log.debug('cmd=%s', cmd) proc = sp.run( @@ -70,20 +70,20 @@ def predict_t_rnas(genome: dict, contigs_path: Path): raise Exception(f'tRNAscan-SE error! error code: {proc.returncode}') trnas = {} - contigs = {c['id']: c for c in genome['contigs']} + sequences = {seq['id']: seq for seq in genome['sequences']} with txt_output_path.open() as fh: for line in fh.readlines()[3:]: # skip first 3 lines - (contig_id, trna_id, start, stop, trna_type, anti_codon, intron_begin, bounds_end, score, note) = line.split('\t') + (sequence_id, trna_id, start, stop, trna_type, anti_codon, intron_begin, bounds_end, score, note) = line.split('\t') start, stop, strand = int(start), int(stop), bc.STRAND_FORWARD if(start > stop): # reverse start, stop = stop, start strand = bc.STRAND_REVERSE - contig_id = contig_id.strip() # bugfix for extra single whitespace in tRNAscan-SE output + sequence_id = sequence_id.strip() # bugfix for extra single whitespace in tRNAscan-SE output trna = OrderedDict() trna['type'] = bc.FEATURE_T_RNA - trna['contig'] = contig_id + trna['sequence'] = sequence_id trna['start'] = start trna['stop'] = stop trna['strand'] = strand @@ -101,7 +101,7 @@ def predict_t_rnas(genome: dict, contigs_path: Path): trna['score'] = float(score) - nt = bu.extract_feature_sequence(trna, contigs[contig_id]) # extract nt sequences + nt = bu.extract_feature_sequence(trna, sequences[sequence_id]) # extract nt sequences trna['nt'] = nt trna['db_xrefs'] = [] @@ -109,11 +109,11 @@ def predict_t_rnas(genome: dict, contigs_path: Path): if(so_term): trna['db_xrefs'].append(so_term.id) - key = f'{contig_id}.trna{trna_id}' + key = f'{sequence_id}.trna{trna_id}' trnas[key] = trna log.info( - 'contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, score=%1.1f, nt=[%s..%s]', - trna['contig'], trna['start'], trna['stop'], trna['strand'], trna.get('gene', ''), trna['product'], trna['score'], nt[:10], nt[-10:] + 'seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, score=%1.1f, nt=[%s..%s]', + trna['sequence'], trna['start'], trna['stop'], trna['strand'], trna.get('gene', ''), trna['product'], trna['score'], nt[:10], nt[-10:] ) with fasta_output_path.open() as fh: diff --git a/bakta/features/tm_rna.py b/bakta/features/tm_rna.py index 17abd1aa..26d0bc6c 100644 --- a/bakta/features/tm_rna.py +++ b/bakta/features/tm_rna.py @@ -13,7 +13,7 @@ log = logging.getLogger('TM_RNA') -def predict_tm_rnas(genome: dict, contigs_path: Path): +def predict_tm_rnas(genome: dict, sequences_path: Path): """Search for tmRNA sequences.""" txt_output_path = cfg.tmp_path.joinpath('tmrna.tsv') @@ -23,7 +23,7 @@ def predict_tm_rnas(genome: dict, contigs_path: Path): f'-gc{cfg.translation_table}', '-w', # batch mode '-o', str(txt_output_path), - str(contigs_path) + str(sequences_path) ] if(cfg.complete): cmd.append('-c') # complete circular sequence(s) @@ -45,14 +45,14 @@ def predict_tm_rnas(genome: dict, contigs_path: Path): raise Exception(f'aragorn error! error code: {proc.returncode}') tmrnas = [] - contigs = {c['id']: c for c in genome['contigs']} + sequences = {seq['id']: seq for seq in genome['sequences']} with txt_output_path.open() as fh: - contig_id = None + sequence_id = None for line in fh: line = line.strip() cols = line.split() if(line[0] == '>'): - contig_id = cols[0][1:] + sequence_id = cols[0][1:] elif(len(cols) == 5): (nr, type, location, tag_location, tag_aa) = line.split() strand = bc.STRAND_FORWARD @@ -66,7 +66,7 @@ def predict_tm_rnas(genome: dict, contigs_path: Path): if(start > 0 and stop > 0): # prevent edge tmRNA on linear sequences tmrna = OrderedDict() tmrna['type'] = bc.FEATURE_TM_RNA - tmrna['contig'] = contig_id + tmrna['sequence'] = sequence_id tmrna['start'] = start tmrna['stop'] = stop tmrna['strand'] = strand @@ -75,7 +75,7 @@ def predict_tm_rnas(genome: dict, contigs_path: Path): tmrna['tag_aa'] = tag_aa.replace('*', '') tmrna['db_xrefs'] = [so.SO_TMRNA.id] - nt = bu.extract_feature_sequence(tmrna, contigs[contig_id]) # extract nt sequences + nt = bu.extract_feature_sequence(tmrna, sequences[sequence_id]) # extract nt sequences tmrna['nt'] = nt if(start > stop): @@ -83,8 +83,8 @@ def predict_tm_rnas(genome: dict, contigs_path: Path): tmrnas.append(tmrna) log.info( - 'contig=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s]', - tmrna['contig'], tmrna['start'], tmrna['stop'], tmrna['strand'], tmrna['gene'], tmrna['product'], nt[:10], nt[-10:] + 'seq=%s, start=%i, stop=%i, strand=%s, gene=%s, product=%s, nt=[%s..%s]', + tmrna['sequence'], tmrna['start'], tmrna['stop'], tmrna['strand'], tmrna['gene'], tmrna['product'], nt[:10], nt[-10:] ) log.info('predicted=%i', len(tmrnas)) return tmrnas diff --git a/bakta/io.py b/bakta/io.py new file mode 100644 index 00000000..64c009e8 --- /dev/null +++ b/bakta/io.py @@ -0,0 +1,196 @@ +import atexit +import logging +import os +import sys + +from pathlib import Path + +import bakta +import bakta.constants as bc +import bakta.config as cfg +import bakta.utils as bu +import bakta.io.fasta as fasta +import bakta.io.json as json +import bakta.io.tsv as tsv +import bakta.io.gff as gff +import bakta.io.insdc as insdc +import bakta.plot as plot + + +log = logging.getLogger('IO') + + +def main(): + # parse options and arguments + parser = bu.init_parser(sub_command='_proteins') + parser.add_argument('input', metavar='', help='Bakta annotations in JSON format') + + arg_group_io = parser.add_argument_group('Input / Output') + arg_group_io.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)') + arg_group_io.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files') + arg_group_io.add_argument('--force', '-f', action='store_true', help='Force overwriting existing output folder') + + arg_group_general = parser.add_argument_group('General') + arg_group_general.add_argument('--help', '-h', action='help', help='Show this help message and exit') + arg_group_general.add_argument('--verbose', '-v', action='store_true', help='Print verbose information') + arg_group_general.add_argument('--debug', action='store_true', help='Run Bakta in debug mode. Temp data will not be removed.') + arg_group_general.add_argument('--version', '-V', action='version', version=f'%(prog)s {bakta.__version__}') + args = parser.parse_args() + + ############################################################################ + # Setup logging + ############################################################################ + cfg.prefix = args.prefix if args.prefix else Path(args.input).stem + output_path = cfg.check_output_path(args.output, args.force) + cfg.force = args.force + log.info('force=%s', args.force) + + bu.setup_logger(output_path, cfg.prefix, args) + log.info('prefix=%s', cfg.prefix) + log.info('output=%s', output_path) + + ############################################################################ + # Checks and configurations + # - check parameters and setup global configuration + # - test database + # - test binary dependencies + ############################################################################ + try: + if args.input == '': + raise ValueError('File path argument must be non-empty') + annotation_path = Path(args.input).resolve() + cfg.check_readability('annotation', annotation_path) + cfg.check_content_size('annotation', annotation_path) + except: + log.error('provided annotation file not valid! path=%s', args.input) + sys.exit(f'ERROR: annotation file ({args.input}) not valid!') + log.info('input-path=%s', annotation_path) + + cfg.check_tmp_path(args) + cfg.debug = args.debug + log.info('debug=%s', cfg.debug) + cfg.verbose = True if cfg.debug else args.verbose + log.info('verbose=%s', cfg.verbose) + cfg.user_proteins = cfg.check_user_proteins(args) + + if(cfg.verbose): + print(f'Bakta v{bakta.__version__}') + print('Options and arguments:') + print(f'\tinput: {annotation_path}') + print(f'\toutput: {cfg.output_path}') + print(f'\tprefix: {cfg.prefix}') + if(cfg.force): print(f'\tforce: {cfg.force}') + + if(cfg.debug): + print(f"\nBakta runs in DEBUG mode! Temporary data will not be destroyed at: {cfg.tmp_path}") + else: + atexit.register(bu.cleanup, log, cfg.tmp_path) # register cleanup exit hook + + ############################################################################ + # Import annotations from JSON + ############################################################################ + print('Parse genome annotations...') + with annotation_path.open('r') as fh: + annotation = json.load(fh) + features = annotation['features'] + sequences = annotation['sequences'] + genome = { + 'features': features, + 'sequence': sequences, + 'taxon': annotation['genome'] + } + features_by_sequence = {k['id']: [] for k in genome['sequences']} + for feature in genome['features']: + sequence_features = features_by_sequence.get(feature['sequence']) + sequence_features.append(feature) + + ############################################################################ + # Write output files + # - write optional output files in GFF3/GenBank/EMBL formats + # - measure runtime + # - write comprehensive annotation results as JSON + # - remove temp directory + ############################################################################ + print(f'\nExport annotation results to: {cfg.output_path}') + print('\thuman readable TSV...') + tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv') + tsv.write_features(genome['sequences'], features_by_sequence, tsv_path) + + print('\tGFF3...') + gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3') + gff.write_features(genome, features_by_sequence, gff3_path) + + print('\tINSDC GenBank & EMBL...') + genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff') + embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl') + insdc.write_features(genome, features, genbank_path, embl_path) + + print('\tgenome sequences...') + fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna') + fasta.export_sequences(genome['sequences'], fna_path, description=True, wrap=True) + + print('\tfeature nucleotide sequences...') + ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn') + fasta.write_ffn(features, ffn_path) + + print('\ttranslated CDS sequences...') + faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.faa') + fasta.write_faa(features, faa_path) + + print('\tfeature inferences...') + tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv') + tsv.write_feature_inferences(genome['sequences'], features_by_sequence, tsv_path) + + if(cfg.skip_plot or cfg.meta): + print('\tskip generation of circular genome plot...') + else: + print('\tcircular genome plot...') + plot.write(features, genome['sequences'], cfg.output_path) + + if(cfg.skip_cds is False): + hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat] + print('\thypothetical TSV...') + tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv') + tsv.write_hypotheticals(hypotheticals, tsv_path) + + print('\ttranslated hypothetical CDS sequences...') + faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.faa') + fasta.write_faa(hypotheticals, faa_path) + + print('\tGenome and annotation summary...') + summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt') + with summary_path.open('w') as fh_out: + genome_stats = bu.calc_genome_stats(genome, features) + fh_out.write('Sequence(s):\n') + fh_out.write(f"Length: {genome['size']:}\n") + fh_out.write(f"Count: {len(genome['sequences'])}\n") + fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n") + fh_out.write(f"N50: {genome_stats['n50']:}\n") + fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n") + fh_out.write(f"coding density: {100 * genome_stats['coding_ratio']:.1f}\n") + fh_out.write('\nAnnotation:\n') + fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n") + fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n") + fh_out.write(f"rRNAs: {len([f for f in features if f['type'] == bc.FEATURE_R_RNA])}\n") + fh_out.write(f"ncRNAs: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA])}\n") + fh_out.write(f"ncRNA regions: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA_REGION])}\n") + fh_out.write(f"CRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}\n") + cdss = [f for f in features if f['type'] == bc.FEATURE_CDS] + fh_out.write(f"CDSs: {len(cdss)}\n") + fh_out.write(f"pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}\n") + fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n") + fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n") + fh_out.write(f"sORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}\n") + fh_out.write(f"gaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}\n") + fh_out.write(f"oriCs: {len([f for f in features if f['type'] == bc.FEATURE_ORIC])}\n") + fh_out.write(f"oriVs: {len([f for f in features if f['type'] == bc.FEATURE_ORIV])}\n") + fh_out.write(f"oriTs: {len([f for f in features if f['type'] == bc.FEATURE_ORIT])}\n") + fh_out.write('\nBakta:\n') + fh_out.write(f'Software: v{bakta.__version__}\n') + fh_out.write(f"Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n") + fh_out.write('DOI: 10.1099/mgen.0.000685\n') + fh_out.write('URL: github.com/oschwengers/bakta\n') + + +if __name__ == '__main__': + main() diff --git a/bakta/io/fasta.py b/bakta/io/fasta.py index 240acbf8..076eae4d 100644 --- a/bakta/io/fasta.py +++ b/bakta/io/fasta.py @@ -18,60 +18,60 @@ FASTA_LINE_WRAPPING = 60 -def import_contigs(contigs_path: Path, is_genomic: bool=True, is_dna: bool=True) -> Sequence[dict]: - """Import raw contigs.""" - contigs = [] - with xopen(str(contigs_path), threads=0) as fh: +def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=True) -> Sequence[dict]: + """Import raw sequences from Fasta file.""" + sequences = [] + with xopen(str(sequences_path), threads=0) as fh: for record in SeqIO.parse(fh, 'fasta'): - seq = str(record.seq).upper() - if('-' in seq): - dash_count = seq.count('-') - seq = seq.replace('-', '') + raw_sequence = str(record.seq).upper() + if('-' in raw_sequence): + dash_count = raw_sequence.count('-') + raw_sequence = raw_sequence.replace('-', '') log.info('import: Discarded alignment gaps (dashes): id=%s, occurences=%i', record.id, dash_count) if(is_dna): - if(FASTA_DNA_SEQUENCE_PATTERN.fullmatch(seq) is None): + if(FASTA_DNA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None): log.error('import: Fasta sequence contains invalid DNA characters! id=%s', record.id) raise ValueError(f'Fasta sequence contains invalid DNA characters! id={record.id}') else: - if(seq[-1] == '*'): # remove trailing stop asterik - seq = seq[:-1] - log.debug('import: Removed trailing asterik! id=%s, seq=%s', record.id, seq) - if(FASTA_AA_SEQUENCE_PATTERN.fullmatch(seq) is None): - log.error('import: Fasta sequence contains invalid AA characters! id=%s, seq=%s', record.id, seq) + if(raw_sequence[-1] == '*'): # remove trailing stop asterik + raw_sequence = raw_sequence[:-1] + log.debug('import: Removed trailing asterik! id=%s, seq=%s', record.id, raw_sequence) + if(FASTA_AA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None): + log.error('import: Fasta sequence contains invalid AA characters! id=%s, seq=%s', record.id, raw_sequence) raise ValueError(f'Fasta sequence contains invalid AA characters! id={record.id}') - contig = { + sequence = { 'id': record.id, 'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else '', - 'sequence': seq, - 'length': len(seq) + 'sequence': raw_sequence, + 'length': len(raw_sequence) } if(is_genomic): - contig['complete'] = False - contig['type'] = bc.REPLICON_CONTIG - contig['topology'] = bc.TOPOLOGY_LINEAR + sequence['complete'] = False + sequence['type'] = bc.REPLICON_CONTIG + sequence['topology'] = bc.TOPOLOGY_LINEAR log.info( 'imported: id=%s, length=%i, description=%s, genomic=%s, dna=%s', - contig['id'], contig['length'], contig['description'], is_genomic, is_dna + sequence['id'], sequence['length'], sequence['description'], is_genomic, is_dna ) - contigs.append(contig) - return contigs + sequences.append(sequence) + return sequences -def export_contigs(contigs: Sequence[dict], fasta_path: Path, description: bool=False, wrap: bool=False): - """Write contigs to Fasta file.""" +def export_sequences(sequences: Sequence[dict], fasta_path: Path, description: bool=False, wrap: bool=False): + """Write sequences to Fasta file.""" log.info('write genome sequences: path=%s, description=%s, wrap=%s', fasta_path, description, wrap) with fasta_path.open('wt') as fh: - for contig in contigs: + for seq in sequences: if(description): - fh.write(f">{contig['id']} {contig['description']}\n") + fh.write(f">{seq['id']} {seq['description']}\n") else: - fh.write(f">{contig['id']}\n") + fh.write(f">{seq['id']}\n") if(wrap): - fh.write(wrap_sequence(contig['sequence'])) + fh.write(wrap_sequence(seq['sequence'])) else: - fh.write(contig['sequence']) + fh.write(seq['sequence']) fh.write('\n') diff --git a/bakta/io/gff.py b/bakta/io/gff.py index 3f615a54..9d7cd355 100644 --- a/bakta/io/gff.py +++ b/bakta/io/gff.py @@ -14,7 +14,7 @@ log = logging.getLogger('GFF') -def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Path): +def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_path: Path): """Export features in GFF3 format.""" log.info('write features: path=%s', gff3_path) @@ -31,24 +31,24 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: fh.write(f'# DOI: {bc.BAKTA_DOI}\n') fh.write(f'# URL: {bc.BAKTA_URL}\n') - for contig in genome['contigs']: # write features - fh.write(f"##sequence-region {contig['id']} 1 {contig['length']}\n") # sequence region + for seq in genome['sequences']: # write features + fh.write(f"##sequence-region {seq['id']} 1 {seq['length']}\n") # sequence region # write landmark region annotations = { - 'ID': contig['id'], - 'Name': contig['id'] + 'ID': seq['id'], + 'Name': seq['id'] } - if(contig['topology'] == bc.TOPOLOGY_CIRCULAR): + if(seq['topology'] == bc.TOPOLOGY_CIRCULAR): annotations['Is_circular'] = 'true' annotations = encode_annotations(annotations) - fh.write(f"{contig['id']}\tBakta\tregion\t1\t{str(contig['length'])}\t.\t+\t.\t{annotations}\n") + fh.write(f"{seq['id']}\tBakta\tregion\t1\t{str(seq['length'])}\t.\t+\t.\t{annotations}\n") - for feat in features_by_contig[contig['id']]: + for feat in features_by_sequence[seq['id']]: start = feat['start'] stop = feat['stop'] if('edge' in feat): - stop += contig['length'] + stop += seq['length'] if(feat['type'] == bc.FEATURE_T_RNA): annotations = { @@ -82,9 +82,9 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: if(bc.PSEUDOGENE in feat): gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNKNOWN gene_annotations = encode_annotations(gene_annotations) - fh.write(f"{feat['contig']}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") + fh.write(f"{feat['sequence']}\ttRNAscan-SE\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\ttRNAscan-SE\t{so.SO_TRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") elif(feat['type'] == bc.FEATURE_TM_RNA): annotations = { 'ID': feat['locus'], @@ -110,9 +110,9 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: if('truncated' in feat): gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True gene_annotations = encode_annotations(gene_annotations) - fh.write(f"{feat['contig']}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") + fh.write(f"{feat['sequence']}\tAragorn\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tAragorn\t{so.SO_TMRNA.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") elif(feat['type'] == bc.FEATURE_R_RNA): annotations = { 'ID': feat['locus'], @@ -138,9 +138,9 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: if('truncated' in feat): gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True gene_annotations = encode_annotations(gene_annotations) - fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") + fh.write(f"{feat['sequence']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_RRNA.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n") elif(feat['type'] == bc.FEATURE_NC_RNA): annotations = { 'ID': feat['locus'], @@ -167,9 +167,9 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: if('truncated' in feat): gene_annotations[bc.INSDC_FEATURE_PSEUDO] = True gene_annotations = encode_annotations(gene_annotations) - fh.write(f"{feat['contig']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") + fh.write(f"{feat['sequence']}\tInfernal\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_NCRNA_GENE.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n") elif(feat['type'] == bc.FEATURE_NC_RNA_REGION): annotations = { 'ID': feat['id'], @@ -185,7 +185,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: annotations['Dbxref'], annotations['Note'] = insdc.revise_dbxref_insdc(feat['db_xrefs']) # remove INSDC invalid DbXrefs annotations[bc.INSDC_FEATURE_REGULATORY_CLASS] = insdc.select_regulatory_class(feat) annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tInfernal\t{so.SO_REGULATORY_REGION.name}\t{start}\t{stop}\t{feat['evalue']}\t{feat['strand']}\t.\t{annotations}\n") elif(feat['type'] == bc.FEATURE_CRISPR): annotations = { 'ID': feat['id'], @@ -201,7 +201,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: annotations[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct' annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus'] annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") if(not cfg.compliant): i = 0 while i < len(feat['spacers']): @@ -211,7 +211,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: 'Parent': feat['id'] } annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n") spacer = feat['spacers'][i] annotations = { 'ID': f"{feat['id']}_spacer_{i+1}", @@ -219,13 +219,13 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: 'sequence': spacer['sequence'] } annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n") i += 1 if(len(feat['repeats']) - 1 == i): repeat = feat['repeats'][i] annotations = { 'ID': f"{feat['id']}_repeat_{i+1}" } annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n") elif(feat['type'] == bc.FEATURE_CDS): annotations = { 'ID': feat['locus'], @@ -258,7 +258,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: if(bc.PSEUDOGENE in feat): gene_annotations[bc.INSDC_FEATURE_PSEUDOGENE] = bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNPROCESSED if feat[bc.PSEUDOGENE]['paralog'] else bc.INSDC_FEATURE_PSEUDOGENE_TYPE_UNITARY gene_annotations = encode_annotations(gene_annotations) - fh.write(f"{feat['contig']}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") + fh.write(f"{feat['sequence']}\t{source}\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") if('exception' in feat): ex = feat['exception'] pos = f"{ex['start']}..{ex['stop']}" @@ -270,7 +270,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: if('Notes' not in annotations): annotations['Note'] = notes annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n") + fh.write(f"{feat['sequence']}\t{source}\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n") if(bc.FEATURE_SIGNAL_PEPTIDE in feat): write_signal_peptide(fh, feat) elif(feat['type'] == bc.FEATURE_SORF): @@ -298,9 +298,9 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: if(feat.get('gene', None)): gene_annotations['gene'] = feat['gene'] gene_annotations = encode_annotations(gene_annotations) - fh.write(f"{feat['contig']}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") + fh.write(f"{feat['sequence']}\tBakta\tgene\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{gene_annotations}\n") annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n") + fh.write(f"{feat['sequence']}\tBakta\t{so.SO_CDS.name}\t{start}\t{stop}\t.\t{feat['strand']}\t0\t{annotations}\n") if(bc.FEATURE_SIGNAL_PEPTIDE in feat): write_signal_peptide(fh, feat) elif(feat['type'] == bc.FEATURE_GAP): @@ -310,7 +310,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: 'product': f"gap ({feat['length']} bp)" } annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tBakta\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tBakta\t{so.SO_GAP.name}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") elif(feat['type'] == bc.FEATURE_ORIC): annotations = { 'ID': feat['id'], @@ -323,7 +323,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: annotations['inference'] = 'similar to DNA sequence' annotations = encode_annotations(annotations) feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name - fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") elif(feat['type'] == bc.FEATURE_ORIV): annotations = { 'ID': feat['id'], @@ -336,7 +336,7 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: annotations['inference'] = 'similar to DNA sequence' annotations = encode_annotations(annotations) feat_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION if cfg.compliant else so.SO_ORIC.name - fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") elif(feat['type'] == bc.FEATURE_ORIT): annotations = { 'ID': feat['id'], @@ -349,13 +349,13 @@ def write_features(genome: dict, features_by_contig: Dict[str, dict], gff3_path: annotations['inference'] = 'similar to DNA sequence' annotations = encode_annotations(annotations) feat_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER if cfg.compliant else so.SO_ORIT.name - fh.write(f"{feat['contig']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tBLAST+\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n") if(not cfg.compliant): fh.write('##FASTA\n') - for contig in genome['contigs']: # write sequences - fh.write(f">{contig['id']}\n") - fh.write(fasta.wrap_sequence(contig['sequence'])) + for seq in genome['sequences']: # write sequences + fh.write(f">{seq['id']}\n") + fh.write(fasta.wrap_sequence(seq['sequence'])) return @@ -393,4 +393,4 @@ def write_signal_peptide(fh, feat: dict): 'Parent': feat['locus'] } annotations = encode_annotations(annotations) - fh.write(f"{feat['contig']}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n") + fh.write(f"{feat['sequence']}\tDeepSig\t{so.SO_SIGNAL_PEPTIDE.name}\t{sig_peptide['start']}\t{sig_peptide['stop']}\t{sig_peptide['score']:.2f}\t{feat['strand']}\t.\t{annotations}\n") diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py index da56e3ef..d1d380e0 100644 --- a/bakta/io/insdc.py +++ b/bakta/io/insdc.py @@ -21,9 +21,9 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path): log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path) - contig_list = [] - for contig in genome['contigs']: - contig_features = [feat for feat in features if feat['contig'] == contig['id']] + sequence_list = [] + for seq in genome['sequences']: + sequence_features = [feat for feat in features if feat['sequence'] == seq['id']] comment = ( 'Annotated with Bakta', f"Software: v{bakta.__version__}\n", @@ -33,24 +33,24 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path: '\n', '##Genome Annotation Summary:##\n', f"{'Annotation Date':<30} :: {datetime.now().strftime('%m/%d/%Y, %H:%M:%S')}\n", - f"{'CDSs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_CDS or feat['type'] == bc.FEATURE_SORF]):5,}\n", - f"{'tRNAs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_T_RNA]):5,}\n", - f"{'tmRNAs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_TM_RNA]):5,}\n", - f"{'rRNAs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_R_RNA]):5,}\n", - f"{'ncRNAs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_NC_RNA]):5,}\n", - f"{'regulatory ncRNAs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_NC_RNA_REGION]):5,}\n", - f"{'CRISPR Arrays':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_CRISPR]):5,}", - f"{'oriCs/oriVs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_ORIC or feat['type'] == bc.FEATURE_ORIV]):5,}", - f"{'oriTs':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_ORIT]):5,}", - f"{'gaps':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_GAP]):5,}", - f"{'pseudogenes':<30} :: {len([feat for feat in contig_features if feat['type'] == bc.FEATURE_CDS and bc.PSEUDOGENE in feat]):5,}\n" + f"{'CDSs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_CDS or feat['type'] == bc.FEATURE_SORF]):5,}\n", + f"{'tRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_T_RNA]):5,}\n", + f"{'tmRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_TM_RNA]):5,}\n", + f"{'rRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_R_RNA]):5,}\n", + f"{'ncRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_NC_RNA]):5,}\n", + f"{'regulatory ncRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_NC_RNA_REGION]):5,}\n", + f"{'CRISPR Arrays':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_CRISPR]):5,}", + f"{'oriCs/oriVs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_ORIC or feat['type'] == bc.FEATURE_ORIV]):5,}", + f"{'oriTs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_ORIT]):5,}", + f"{'gaps':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_GAP]):5,}", + f"{'pseudogenes':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_CDS and bc.PSEUDOGENE in feat]):5,}\n" ) - contig_annotations = { + sequence_annotations = { 'molecule_type': 'DNA', 'source': genome['taxon'], 'date': date.today().strftime('%d-%b-%Y').upper(), - 'topology': contig['topology'], - 'data_file_division': 'HGT' if contig['type'] == bc.REPLICON_CONTIG else 'BCT', + 'topology': seq['topology'], + 'data_file_division': 'HGT' if seq['type'] == bc.REPLICON_CONTIG else 'BCT', # 'accession': '*', # hold back until EMBL output bug is fixed in BioPython (https://github.com/biopython/biopython/pull/3572) 'comment': comment # TODO: taxonomy @@ -62,32 +62,32 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path: description = '' if(genome['taxon']): - contig_annotations['organism'] = genome['taxon'] + sequence_annotations['organism'] = genome['taxon'] source_qualifiers['organism'] = genome['taxon'] description = genome['taxon'] if(genome['strain']): source_qualifiers['strain'] = genome['strain'] - if(contig['type'] == bc.REPLICON_PLASMID): - source_qualifiers['plasmid'] = contig['name'] if contig.get('name', None) else 'unnamed' - description = f"{description} plasmid {contig.get('name', 'unnamed')}" - description += ', complete sequence' if contig['complete'] else ', whole genome shotgun sequence' - elif(contig['type'] == bc.REPLICON_CHROMOSOME): - if contig.get('name', None): - source_qualifiers['chromosome'] = contig['name'] - description = f'{description} chromosome, complete genome' if contig['complete'] else f"{description} chromosome {contig['id']}, whole genome shotgun sequence" + if(seq['type'] == bc.REPLICON_PLASMID): + source_qualifiers['plasmid'] = seq['name'] if seq.get('name', None) else 'unnamed' + description = f"{description} plasmid {seq.get('name', 'unnamed')}" + description += ', complete sequence' if seq['complete'] else ', whole genome shotgun sequence' + elif(seq['type'] == bc.REPLICON_CHROMOSOME): + if seq.get('name', None): + source_qualifiers['chromosome'] = seq['name'] + description = f'{description} chromosome, complete genome' if seq['complete'] else f"{description} chromosome {seq['id']}, whole genome shotgun sequence" else: - description += f" {contig['id']}, whole genome shotgun sequence" + description += f" {seq['id']}, whole genome shotgun sequence" if(len(description) > 0 and description[0] == ' '): # discard potential leading whitespace description = description[1:] - contig_rec = SeqIO.SeqRecord(id=contig['id'], name=contig['id'], description=description, annotations=contig_annotations, seq=Seq(contig['sequence'])) + sequence_record = SeqIO.SeqRecord(id=seq['id'], name=seq['id'], description=description, annotations=sequence_annotations, seq=Seq(seq['sequence'])) - source = SeqFeature(FeatureLocation(0, contig['length'], strand=+1), type='source', qualifiers=source_qualifiers) + source = SeqFeature(FeatureLocation(0, seq['length'], strand=+1), type='source', qualifiers=source_qualifiers) seq_feature_list = [source] - for feature in contig_features: + for feature in sequence_features: insdc_feature_type = None qualifiers = { 'note': [] @@ -226,7 +226,7 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path: start = feature['start'] - 1 stop = feature['stop'] if('edge' in feature): - fl_1 = FeatureLocation(start, contig['length'], strand=strand) + fl_1 = FeatureLocation(start, seq['length'], strand=strand) fl_2 = FeatureLocation(0, stop, strand=strand) if(feature['strand'] == bc.STRAND_REVERSE): feature_location = CompoundLocation([fl_2, fl_1]) @@ -273,16 +273,16 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path: seq_feature_list.append(feat_seqfeat) for acc_feature in accompanying_features: # add accompanying features, e.g. signal peptides seq_feature_list.append(acc_feature) - contig_rec.features = seq_feature_list - contig_list.append(contig_rec) + sequence_record.features = seq_feature_list + sequence_list.append(sequence_record) with genbank_output_path.open('wt', encoding='utf-8') as fh: log.info('write GenBank: path=%s', genbank_output_path) - SeqIO.write(contig_list, fh, format='genbank') + SeqIO.write(sequence_list, fh, format='genbank') with embl_output_path.open('wt', encoding='utf-8') as fh: log.info('write EMBL: path=%s', embl_output_path) - SeqIO.write(contig_list, fh, format='embl') + SeqIO.write(sequence_list, fh, format='embl') def select_ncrna_class(feature: dict) -> str: diff --git a/bakta/io/json.py b/bakta/io/json.py index c27c60a9..819a1813 100644 --- a/bakta/io/json.py +++ b/bakta/io/json.py @@ -46,7 +46,7 @@ def write_json(genome: dict, features: Sequence[dict], json_path: Path): output['genome'] = ordered_genome stats = OrderedDict() - stats['no_sequences'] = len(genome['contigs']) + stats['no_sequences'] = len(genome['sequences']) stats['size'] = genome['size'] stats['gc'] = genome['gc'] stats['n_ratio'] = genome['n_ratio'] @@ -56,7 +56,7 @@ def write_json(genome: dict, features: Sequence[dict], json_path: Path): output['features'] = features if genome is not None: - output['sequences'] = genome['contigs'] + output['sequences'] = genome['sequences'] run = OrderedDict() run['start'] = cfg.run_start.strftime('%Y-%m-%d %H:%M:%S') diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py index 4b5d09e5..0f61ee49 100644 --- a/bakta/io/tsv.py +++ b/bakta/io/tsv.py @@ -16,7 +16,7 @@ log = logging.getLogger('TSV') -def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path): +def write_features(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path): """Export features in TSV format.""" log.info('write feature tsv: path=%s', tsv_path) @@ -28,8 +28,8 @@ def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict], fh.write(f'# URL: {bc.BAKTA_URL}\n') fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tGene\tProduct\tDbXrefs\n') - for contig in contigs: - for feat in features_by_contig[contig['id']]: + for seq in sequences: + for feat in features_by_sequence[seq['id']]: feat_type = feat['type'] if(feat_type == bc.FEATURE_GAP): feat_type = bc.INSDC_FEATURE_ASSEMBLY_GAP if feat['length'] >= 100 else bc.INSDC_FEATURE_GAP @@ -46,7 +46,7 @@ def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict], product = f"(partial) {product}" fh.write('\t'.join( [ - feat['contig'], + feat['sequence'], feat_type, str(feat['start']), str(feat['stop']), @@ -62,20 +62,20 @@ def write_features(contigs: Sequence[dict], features_by_contig: Dict[str, dict], i = 0 while i < len(feat['spacers']): repeat = feat['repeats'][i] - fh.write('\t'.join([feat['contig'], bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", ''])) + fh.write('\t'.join([feat['sequence'], bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", ''])) fh.write('\n') spacer = feat['spacers'][i] - fh.write('\t'.join([feat['contig'], bc.FEATURE_CRISPR_SPACER, str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", ''])) + fh.write('\t'.join([feat['sequence'], bc.FEATURE_CRISPR_SPACER, str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", ''])) fh.write('\n') i += 1 if(len(feat['repeats']) - 1 == i): repeat = feat['repeats'][i] - fh.write('\t'.join([feat['contig'], bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", ''])) + fh.write('\t'.join([feat['sequence'], bc.FEATURE_CRISPR_REPEAT, str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", ''])) fh.write('\n') return -def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_path: Path): +def write_feature_inferences(sequences: Sequence[dict], features_by_sequence: Dict[str, dict], tsv_path: Path): """Export feature inference statistics in TSV format.""" log.info('write tsv: path=%s', tsv_path) @@ -87,8 +87,8 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s fh.write(f'# URL: {bc.BAKTA_URL}\n') fh.write('#Sequence Id\tType\tStart\tStop\tStrand\tLocus Tag\tScore\tEvalue\tQuery Cov\tSubject Cov\tId\tAccession\n') - for contig in contigs: - for feat in features_by_contig[contig['id']]: + for seq in sequences: + for feat in features_by_sequence[seq['id']]: if(feat['type'] in [bc.FEATURE_CDS, bc.FEATURE_SORF]): score, evalue, query_cov, subject_cov, identity, accession = None, None, None, None, None, '-' if('ups' in feat or 'ips' in feat): @@ -107,7 +107,7 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s accession = f"{bc.DB_XREF_UNIREF}:{feat['psc'][bpsc.DB_PSC_COL_UNIREF90]}" if 'psc' in feat else f"{bc.DB_XREF_UNIREF}:{feat['pscc'][bpscc.DB_PSCC_COL_UNIREF50]}" fh.write('\t'.join( [ - feat['contig'], + feat['sequence'], feat['type'], str(feat['start']), str(feat['stop']), @@ -126,7 +126,7 @@ def write_feature_inferences(contigs: Sequence[dict], features_by_contig: Dict[s accession = '-' if feat['type'] == bc.FEATURE_T_RNA else [xref for xref in feat['db_xrefs'] if bc.DB_XREF_RFAM in xref][0] fh.write('\t'.join( [ - feat['contig'], + feat['sequence'], feat['type'], str(feat['start']), str(feat['stop']), @@ -173,5 +173,5 @@ def write_hypotheticals(hypotheticals: Sequence[dict], tsv_path: Path): seq_stats = hypo['seq_stats'] mol_weight = f"{(seq_stats['molecular_weight']/1000):.1f}" if seq_stats['molecular_weight'] else 'NA' iso_point = f"{seq_stats['isoelectric_point']:.1f}" if seq_stats['isoelectric_point'] else 'NA' - fh.write(f"{hypo['contig']}\t{hypo['start']}\t{hypo['stop']}\t{hypo['strand']}\t{hypo.get('locus', '')}\t{mol_weight}\t{iso_point}\t{', '.join(sorted(pfams))}\t{', '.join(sorted(hypo.get('db_xrefs', [])))}\n") + fh.write(f"{hypo['sequence']}\t{hypo['start']}\t{hypo['stop']}\t{hypo['strand']}\t{hypo.get('locus', '')}\t{mol_weight}\t{iso_point}\t{', '.join(sorted(pfams))}\t{', '.join(sorted(hypo.get('db_xrefs', [])))}\n") return diff --git a/bakta/ips.py b/bakta/ips.py index 1cebc6db..f93a6829 100644 --- a/bakta/ips.py +++ b/bakta/ips.py @@ -50,8 +50,8 @@ def lookup(features: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict]]: feature['ips'] = ips features_found.append(feature) log.debug( - 'lookup: contig=%s, start=%i, stop=%i, aa-length=%i, strand=%s, gene=%s, UniRef100=%s, UniRef90=%s', - feature['contig'], feature['start'], feature['stop'], len(feature['aa']), feature['strand'], ips.get(DB_IPS_COL_GENE, ''), ips.get(DB_IPS_COL_UNIREF100, ''), ips.get(DB_IPS_COL_UNIREF90, '') + 'lookup: seq=%s, start=%i, stop=%i, aa-length=%i, strand=%s, gene=%s, UniRef100=%s, UniRef90=%s', + feature['sequence'], feature['start'], feature['stop'], len(feature['aa']), feature['strand'], ips.get(DB_IPS_COL_GENE, ''), ips.get(DB_IPS_COL_UNIREF100, ''), ips.get(DB_IPS_COL_UNIREF90, '') ) else: features_not_found.append(feature) diff --git a/bakta/main.py b/bakta/main.py index 42e7205a..523e36f2 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -79,7 +79,7 @@ def main(): if(cfg.force): print(f'\tforce: {cfg.force}') print(f'\ttmp directory: {cfg.tmp_path}') if(cfg.compliant): print(f'\tINSDC compliant: {cfg.compliant}') - if(cfg.keep_contig_headers): print(f'\tkeep contig headers: {cfg.keep_contig_headers}') + if(cfg.keep_sequence_headers): print(f'\tkeep/sequence headers: {cfg.keep_sequence_headers}') print(f'\tprefix: {cfg.prefix}') print(f'\tthreads: {cfg.threads}') if(cfg.debug): print(f'\tdebug: {cfg.debug}') @@ -104,35 +104,35 @@ def main(): ############################################################################ # Import genome - # - parse contigs in Fasta file - # - apply contig length filter - # - rename contigs + # - parse sequences in Fasta file + # - apply sequence length filter + # - rename sequences ############################################################################ print('Parse genome sequences...') try: - contigs = fasta.import_contigs(cfg.genome_path) - log.info('imported sequences=%i', len(contigs)) - print(f'\timported: {len(contigs)}') + sequences = fasta.import_sequences(cfg.genome_path) + log.info('imported sequences=%i', len(sequences)) + print(f'\timported: {len(sequences)}') except: log.error('wrong genome file format!', exc_info=True) sys.exit('ERROR: wrong genome file format!') replicons = bu.parse_replicon_table(cfg.replicons) if cfg.replicons else None - contigs, complete_genome = bu.qc_contigs(contigs, replicons) - print(f'\tfiltered & revised: {len(contigs)}') - no_chromosomes = len([c for c in contigs if c['type'] == bc.REPLICON_CHROMOSOME]) + sequences, complete_genome = bu.qc_sequences(sequences, replicons) + print(f'\tfiltered & revised: {len(sequences)}') + no_chromosomes = len([seq for seq in sequences if seq['type'] == bc.REPLICON_CHROMOSOME]) if(no_chromosomes > 0): print(f"\tchromosomes: {no_chromosomes}") - no_plasmids = len([c for c in contigs if c['type'] == bc.REPLICON_PLASMID]) + no_plasmids = len([seq for seq in sequences if seq['type'] == bc.REPLICON_PLASMID]) if(no_plasmids > 0): print(f"\tplasmids: {no_plasmids}") - no_contigs = len([c for c in contigs if c['type'] == bc.REPLICON_CONTIG]) + no_contigs = len([seq for seq in sequences if seq['type'] == bc.REPLICON_CONTIG]) if(no_contigs > 0): print(f"\tcontigs: {no_contigs}") - if(len(contigs) == 0): - log.warning('no valid contigs!') - sys.exit('Error: input file contains no valid contigs.') - contigs_path = cfg.tmp_path.joinpath('contigs.fna') - fasta.export_contigs(contigs, contigs_path) + if(len(sequences) == 0): + log.warning('no valid sequences!') + sys.exit('Error: input file contains no valid sequences.') + sequences_path = cfg.tmp_path.joinpath('sequences.fna') + fasta.export_sequences(sequences, sequences_path) genome = { 'genus': cfg.genus, 'species': cfg.species, @@ -140,10 +140,10 @@ def main(): 'taxon': cfg.taxon, 'gram': cfg.gram, 'translation_table': cfg.translation_table, - 'size': sum([c['length'] for c in contigs]), + 'size': sum([seq['length'] for seq in sequences]), 'complete': cfg.complete or complete_genome, 'features': {}, - 'contigs': contigs + 'sequences': sequences } if(cfg.plasmid): genome['plasmid'] = cfg.plasmid @@ -157,7 +157,7 @@ def main(): else: print('predict tRNAs...') log.debug('start tRNA prediction') - genome['features'][bc.FEATURE_T_RNA] = t_rna.predict_t_rnas(genome, contigs_path) + genome['features'][bc.FEATURE_T_RNA] = t_rna.predict_t_rnas(genome, sequences_path) print(f"\tfound: {len(genome['features'][bc.FEATURE_T_RNA])}") ############################################################################ @@ -168,7 +168,7 @@ def main(): else: print('predict tmRNAs...') log.debug('start tmRNA prediction') - genome['features'][bc.FEATURE_TM_RNA] = tm_rna.predict_tm_rnas(genome, contigs_path) + genome['features'][bc.FEATURE_TM_RNA] = tm_rna.predict_tm_rnas(genome, sequences_path) print(f"\tfound: {len(genome['features'][bc.FEATURE_TM_RNA])}") ############################################################################ @@ -179,7 +179,7 @@ def main(): else: print('predict rRNAs...') log.debug('start rRNA prediction') - genome['features'][bc.FEATURE_R_RNA] = r_rna.predict_r_rnas(genome, contigs_path) + genome['features'][bc.FEATURE_R_RNA] = r_rna.predict_r_rnas(genome, sequences_path) print(f"\tfound: {len(genome['features'][bc.FEATURE_R_RNA])}") ############################################################################ @@ -190,7 +190,7 @@ def main(): else: print('predict ncRNAs...') log.debug('start ncRNA prediction') - genome['features'][bc.FEATURE_NC_RNA] = nc_rna.predict_nc_rnas(genome, contigs_path) + genome['features'][bc.FEATURE_NC_RNA] = nc_rna.predict_nc_rnas(genome, sequences_path) print(f"\tfound: {len(genome['features'][bc.FEATURE_NC_RNA])}") ############################################################################ @@ -201,7 +201,7 @@ def main(): else: print('predict ncRNA regions...') log.debug('start ncRNA region prediction') - genome['features'][bc.FEATURE_NC_RNA_REGION] = nc_rna_region.predict_nc_rna_regions(genome, contigs_path) + genome['features'][bc.FEATURE_NC_RNA_REGION] = nc_rna_region.predict_nc_rna_regions(genome, sequences_path) print(f"\tfound: {len(genome['features'][bc.FEATURE_NC_RNA_REGION])}") ############################################################################ @@ -212,7 +212,7 @@ def main(): else: print('predict CRISPR arrays...') log.debug('start CRISPR prediction') - genome['features'][bc.FEATURE_CRISPR] = crispr.predict_crispr(genome, contigs_path) + genome['features'][bc.FEATURE_CRISPR] = crispr.predict_crispr(genome, sequences_path) print(f"\tfound: {len(genome['features'][bc.FEATURE_CRISPR])}") ############################################################################ @@ -403,7 +403,7 @@ def main(): sorf_aa_path = cfg.tmp_path.joinpath('sorfs.faa') with sorf_aa_path.open(mode='wt') as fh: for sorf in sorfs_filtered: - fh.write(f">{sorf['aa_hexdigest']}-{sorf['contig']}-{sorf['start']}\n{sorf['aa']}\n") + fh.write(f">{sorf['aa_hexdigest']}-{sorf['sequence']}-{sorf['start']}\n{sorf['aa']}\n") sig_peptides_found = sig_peptides.search(sorfs_filtered, sorf_aa_path) print(f"\tsignal peptides: {len(sig_peptides_found)}") @@ -429,13 +429,13 @@ def main(): else: print('detect oriCs/oriVs...') log.debug('detect oriC/V') - oriCs = ori.predict_oris(genome, contigs_path, bc.FEATURE_ORIC) + oriCs = ori.predict_oris(genome, sequences_path, bc.FEATURE_ORIC) genome['features'][bc.FEATURE_ORIC] = oriCs print(f'\tfound: {len(oriCs)}') print('detect oriTs...') log.debug('detect oriT') - oriTs = ori.predict_oris(genome, contigs_path, bc.FEATURE_ORIT) + oriTs = ori.predict_oris(genome, sequences_path, bc.FEATURE_ORIT) genome['features'][bc.FEATURE_ORIT] = oriTs print(f'\tfound: {len(oriTs)}') @@ -456,40 +456,26 @@ def main(): ############################################################################ print('select features and create locus tags...') log.debug('start feature selection and creation of locus tags') - features_by_contig = {k['id']: [] for k in genome['contigs']} + features_by_sequence = {k['id']: [] for k in genome['sequences']} feature_id = 1 - feature_id_prefix = bu.create_locus_tag_prefix(contigs, length=10) - for feature_type in [ - bc.FEATURE_T_RNA, - bc.FEATURE_TM_RNA, - bc.FEATURE_R_RNA, - bc.FEATURE_NC_RNA, - bc.FEATURE_NC_RNA_REGION, - bc.FEATURE_CRISPR, - bc.FEATURE_CDS, - bc.FEATURE_SORF, - bc.FEATURE_GAP, - bc.FEATURE_ORIC, - bc.FEATURE_ORIV, - bc.FEATURE_ORIT - ]: - feature_list = genome['features'].get(feature_type, []) + feature_id_prefix = bu.create_locus_tag_prefix(sequences, length=10) + for feature_list in genome['features'].values(): for feature in feature_list: if('discarded' not in feature): feature['id'] = f'{feature_id_prefix}_{feature_id}' feature_id += 1 - contig_features = features_by_contig.get(feature['contig']) - contig_features.append(feature) + seq_features = features_by_sequence.get(feature['sequence']) + seq_features.append(feature) features = [] - for contig in genome['contigs']: - contig_features = features_by_contig[contig['id']] - contig_features.sort(key=lambda k: k['start']) - features.extend(contig_features) + for seq in genome['sequences']: + seq_features = features_by_sequence[seq['id']] + seq_features.sort(key=lambda k: k['start']) + features.extend(seq_features) log.info('selected features=%i', len(features)) print(f'\tselected: {len(features)}') # use user provided locus tag if not None/non-empty or generate a sequence based locus prefix - locus_tag_prefix = cfg.locus_tag if cfg.locus_tag else bu.create_locus_tag_prefix(contigs) + locus_tag_prefix = cfg.locus_tag if cfg.locus_tag else bu.create_locus_tag_prefix(sequences) log.info('locus tag prefix=%s', locus_tag_prefix) locus_tag_nr = cfg.locus_tag_increment for feature in features: @@ -514,7 +500,7 @@ def main(): print('\nGenome statistics:') genome_stats = bu.calc_genome_stats(genome, features) print(f"\tGenome size: {genome['size']:,} bp") - print(f"\tContigs/replicons: {len(genome['contigs'])}") + print(f"\tContigs/replicons: {len(genome['sequences'])}") print(f"\tGC: {100 * genome_stats['gc']:.1f} %") print(f"\tN50: {genome_stats['n50']:,}") print(f"\tN ratio: {100 * genome_stats['n_ratio']:.1f} %") @@ -547,11 +533,11 @@ def main(): print(f'\nExport annotation results to: {cfg.output_path}') print('\thuman readable TSV...') tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv') - tsv.write_features(genome['contigs'], features_by_contig, tsv_path) + tsv.write_features(genome['sequences'], features_by_sequence, tsv_path) print('\tGFF3...') gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3') - gff.write_features(genome, features_by_contig, gff3_path) + gff.write_features(genome, features_by_sequence, gff3_path) print('\tINSDC GenBank & EMBL...') genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff') @@ -560,7 +546,7 @@ def main(): print('\tgenome sequences...') fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna') - fasta.export_contigs(genome['contigs'], fna_path, description=True, wrap=True) + fasta.export_sequences(genome['sequences'], fna_path, description=True, wrap=True) print('\tfeature nucleotide sequences...') ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn') @@ -572,13 +558,13 @@ def main(): print('\tfeature inferences...') tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv') - tsv.write_feature_inferences(genome['contigs'], features_by_contig, tsv_path) + tsv.write_feature_inferences(genome['sequences'], features_by_sequence, tsv_path) if(cfg.skip_plot or cfg.meta): print('\tskip generation of circular genome plot...') else: print('\tcircular genome plot...') - plot.write(features, contigs, cfg.output_path) + plot.write(features, sequences, cfg.output_path) if(cfg.skip_cds is False): hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat] @@ -608,7 +594,7 @@ def main(): with summary_path.open('w') as fh_out: fh_out.write('Sequence(s):\n') fh_out.write(f"Length: {genome['size']:}\n") - fh_out.write(f"Count: {len(genome['contigs'])}\n") + fh_out.write(f"Count: {len(genome['sequences'])}\n") fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n") fh_out.write(f"N50: {genome_stats['n50']:}\n") fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n") diff --git a/bakta/plot.py b/bakta/plot.py index e6510981..37c2a3a2 100644 --- a/bakta/plot.py +++ b/bakta/plot.py @@ -171,7 +171,7 @@ def main(): with annotation_path.open('r') as fh: annotation = json.load(fh) features = annotation['features'] - contigs = annotation['sequences'] + sequences = annotation['sequences'] # load colors if specified colors = COLORS @@ -182,40 +182,40 @@ def main(): print('Draw plots...') if args.sequences == 'all': # write whole genome plot print(f'\tdraw circular genome plot (type={plot_type}) containing all sequences...') - write(features, contigs, output_path, colors, plot_type=plot_type) + write(features, sequences, output_path, colors, plot_type=plot_type) else: # write genome plot containing provided sequences only - plot_contigs = [] + plot_sequences = [] sequence_identifiers = [] for selected_sequence in args.sequences.split(','): - for i, contig in enumerate(contigs): + for i, seq in enumerate(sequences): sequence_no = str(i + 1) if selected_sequence == sequence_no: - plot_contigs.append(contig) + plot_sequences.append(seq) sequence_identifiers.append(sequence_no) - elif selected_sequence.lower() == contig['id'].lower(): - plot_contigs.append(contig) - sequence_identifiers.append(contig['id']) - if len(plot_contigs) > 0: + elif selected_sequence.lower() == seq['id'].lower(): + plot_sequences.append(seq) + sequence_identifiers.append(seq['id']) + if len(plot_sequences) > 0: print(f'\tdraw circular genome plot (type={plot_type}) containing sequences: {sequence_identifiers}...') plot_name_suffix = '_'.join(sequence_identifiers) - plot_contig_ids = [c['id'] for c in plot_contigs] - features = [feat for feat in features if feat['contig'] in plot_contig_ids] - write(features, plot_contigs, output_path, colors, plot_name_suffix=plot_name_suffix, plot_type=plot_type) + plot_sequence_ids = [seq['id'] for seq in plot_sequences] + features = [feat for feat in features if feat['sequence'] in plot_sequence_ids] + write(features, plot_sequences, output_path, colors, plot_name_suffix=plot_name_suffix, plot_type=plot_type) -def write(features, contigs, output_path, colors=COLORS, plot_name_suffix=None, plot_type=bc.PLOT_FEATURES): +def write(features, sequences, output_path, colors=COLORS, plot_name_suffix=None, plot_type=bc.PLOT_FEATURES): # config paths circos_path = cfg.tmp_path.joinpath(f'circos') circos_path.mkdir(parents=True, exist_ok=True) # fix edge features because Circos cannot handle them correctly non_edge_features = [feat for feat in features if not feat.get('edge', False)] - contigs_by_id = {c['id']: c for c in contigs} + sequences_by_id = {seq['id']: seq for seq in sequences} for feat in [feat for feat in features if feat.get('edge', False)]: - contig = contigs_by_id[feat['contig']] - log.info('split edge feature: contig=%s, start=%i, stop=%i, strand=%s, edge=%s', contig['id'], feat['start'], feat['stop'], feat['strand'], feat['edge']) + seq = sequences_by_id[feat['sequence']] + log.info('split edge feature: seq=%s, start=%i, stop=%i, strand=%s, edge=%s', seq['id'], feat['start'], feat['stop'], feat['strand'], feat['edge']) feat_1 = feat.copy() - feat_1['stop'] = contig['length'] + feat_1['stop'] = seq['length'] feat_1['edge'] = False non_edge_features.append(feat_1) feat_2 = feat.copy() @@ -226,19 +226,19 @@ def write(features, contigs, output_path, colors=COLORS, plot_name_suffix=None, # write feature files if plot_type == bc.PLOT_COG: - feature_paths = write_features_type_cog(features, contigs, circos_path, colors) + feature_paths = write_features_type_cog(features, sequences, circos_path, colors) else: - feature_paths = write_features_type_feature(features, contigs, circos_path, colors) + feature_paths = write_features_type_feature(features, sequences, circos_path, colors) # write gc content and gc skew files tracks_path = circos_path.joinpath('tracks.conf') - gc_content_path, max_gc, gc_skew_path, max_gc_skew = write_gc_content_skew(contigs, circos_path, colors) + gc_content_path, max_gc, gc_skew_path, max_gc_skew = write_gc_content_skew(sequences, circos_path, colors) write_tracks(tracks_path, feature_paths, gc_content_path, max_gc, gc_skew_path, max_gc_skew) # write main config file_name = cfg.prefix if plot_name_suffix is None else f'{cfg.prefix}_{plot_name_suffix}' - main_conf_path = write_main_config(circos_path, output_path, tracks_path, contigs, file_name, colors) + main_conf_path = write_main_config(circos_path, output_path, tracks_path, sequences, file_name, colors) # execute Circos log.info('write circular genome plot: file-name=%s, output-dir=%s', file_name, output_path) @@ -262,19 +262,19 @@ def write(features, contigs, output_path, colors=COLORS, plot_name_suffix=None, raise Exception(f'circos error! error code: {proc.returncode}') -def write_features_type_feature(features, contigs, circos_path, colors): +def write_features_type_feature(features, sequences, circos_path, colors): features_plus = [] features_minus = [] - contig_ids = set([c['id'] for c in contigs]) + sequence_ids = set([seq['id'] for seq in sequences]) for feat in features: - if feat['contig'] not in contig_ids: + if feat['sequence'] not in sequence_ids: continue - contig, start, stop, type = feat['contig'], feat['start'], feat['stop'], feat['type'] + seq, start, stop, type = feat['sequence'], feat['start'], feat['stop'], feat['type'] color = colors['features'].get(type, colors['features']['misc']) if feat['strand'] == bc.STRAND_FORWARD: - features_plus.append(f"{contig} {start} {stop} {bc.STRAND_FORWARD} color={hex_to_rgb(color)}") + features_plus.append(f"{seq} {start} {stop} {bc.STRAND_FORWARD} color={hex_to_rgb(color)}") else: - features_minus.append(f"{contig} {start} {stop} {bc.STRAND_REVERSE} color={hex_to_rgb(color)}") + features_minus.append(f"{seq} {start} {stop} {bc.STRAND_REVERSE} color={hex_to_rgb(color)}") features_plus_path = circos_path.joinpath('features-plus.txt') with features_plus_path.open('w') as fh: fh.write('\n'.join(features_plus)) @@ -286,15 +286,15 @@ def write_features_type_feature(features, contigs, circos_path, colors): return [features_plus_path, features_minus_path] -def write_features_type_cog(features, contigs, circos_path, colors): +def write_features_type_cog(features, sequences, circos_path, colors): features_plus = [] features_minus = [] features_extra = [] - contig_ids = set([c['id'] for c in contigs]) + sequence_ids = set([seq['id'] for seq in sequences]) for feat in features: - if feat['contig'] not in contig_ids: + if feat['sequence'] not in sequence_ids: continue - contig, start, stop = feat['contig'], feat['start'], feat['stop'] + seq, start, stop = feat['sequence'], feat['start'], feat['stop'] if feat['type'] == bc.FEATURE_CDS: color = colors['features'][bc.FEATURE_CDS] psc = feat.get('psc', None) @@ -305,11 +305,11 @@ def write_features_type_cog(features, contigs, circos_path, colors): cog = cog[:1] color = colors['cog-classes'].get(cog.upper(), colors['cog-classes']['S']) if feat['strand'] == bc.STRAND_FORWARD: - features_plus.append(f"{contig} {start} {stop} {feat['strand']} color={hex_to_rgb(color)}") + features_plus.append(f"{seq} {start} {stop} {feat['strand']} color={hex_to_rgb(color)}") else: - features_minus.append(f"{contig} {start} {stop} {feat['strand']} color={hex_to_rgb(color)}") + features_minus.append(f"{seq} {start} {stop} {feat['strand']} color={hex_to_rgb(color)}") else: - features_extra.append(f"{contig} {start} {stop} {feat['strand']} color={hex_to_rgb(colors['features']['misc'])}") + features_extra.append(f"{seq} {start} {stop} {feat['strand']} color={hex_to_rgb(colors['features']['misc'])}") features_plus_path = circos_path.joinpath('features-plus.txt') with features_plus_path.open('w') as fh: fh.write('\n'.join(features_plus)) @@ -325,8 +325,8 @@ def write_features_type_cog(features, contigs, circos_path, colors): return [features_plus_path, features_minus_path, features_extra_path] -def write_gc_content_skew(contigs, circos_path, colors): - sequence_length = sum([c['length'] for c in contigs]) +def write_gc_content_skew(sequences, circos_path, colors): + sequence_length = sum([seq['length'] for seq in sequences]) step_size = int(sequence_length / 3600) # 10 * 360° if step_size < 3: step_size = 3 @@ -338,33 +338,33 @@ def write_gc_content_skew(contigs, circos_path, colors): max_gc = 0 max_gc_skew = 0 if float(bp.__version__) >= 1.80: - gc_mean = SeqUtils.gc_fraction(''.join([c['sequence'] for c in contigs])) + gc_mean = SeqUtils.gc_fraction(''.join([seq['sequence'] for seq in sequences])) else: - gc_mean = SeqUtils.GC(''.join([c['sequence'] for c in contigs])) / 100 - for contig in contigs: - seq = contig['sequence'] - for w in range(0, len(seq), step_size): + gc_mean = SeqUtils.GC(''.join([seq['sequence'] for seq in sequences])) / 100 + for seq in sequences: + nt = seq['sequence'] + for w in range(0, len(nt), step_size): start = w - window_size if start < 0: - start += len(seq) + start += len(nt) stop = w + window_size - if stop > len(seq): - stop -= len(seq) - subseq = seq[start:stop] if start < stop else seq[start:] + seq[:stop] + if stop > len(nt): + stop -= len(nt) + nt_subseq = nt[start:stop] if start < stop else nt[start:] + nt[:stop] if float(bp.__version__) >= 1.80: - gc_value = gc_mean - SeqUtils.gc_fraction(subseq) + gc_value = gc_mean - SeqUtils.gc_fraction(nt_subseq) else: - gc_value = gc_mean - (SeqUtils.GC(subseq) / 100) + gc_value = gc_mean - (SeqUtils.GC(nt_subseq) / 100) if max_gc < abs(gc_value): max_gc = abs(gc_value) gc_color = colors['gc-positive'] if gc_value >= 0 else colors['gc-negative'] - gc_contents.append(f"{contig['id']} {w} {w} {gc_value} fill_color={hex_to_rgb(gc_color)}") - g, c = subseq.count('G'), subseq.count('C') + gc_contents.append(f"{seq['id']} {w} {w} {gc_value} fill_color={hex_to_rgb(gc_color)}") + g, c = nt_subseq.count('G'), nt_subseq.count('C') gc_skew = gc_skew = (g - c) / float(g + c) if (g + c) > 0 else 0.0 if max_gc_skew < abs(gc_skew): max_gc_skew = abs(gc_skew) gc_skew_color = colors['gc-skew-positive'] if gc_skew >= 0 else colors['gc-skew-negative'] - gc_skews.append(f"{contig['id']} {w} {w} {gc_skew} fill_color={hex_to_rgb(gc_skew_color)}") + gc_skews.append(f"{seq['id']} {w} {w} {gc_skew} fill_color={hex_to_rgb(gc_skew_color)}") log.debug('write gc config: seq-length=%i, step-size=%i, window-size=%i, max-gc=%i, max-gc-skew=%i', sequence_length, step_size, window_size, max_gc, max_gc_skew) gc_content_path = circos_path.joinpath('gc_content.txt') @@ -430,9 +430,9 @@ def hex_to_rgb(hex_string): return ','.join(rgb) -def write_main_config(circos_path, output_path, tracks_path, contigs, file_name, colors): +def write_main_config(circos_path, output_path, tracks_path, sequences, file_name, colors): karyotype_path = circos_path.joinpath('karyotype.txt') - sequence_length = sum([c['length'] for c in contigs]) + sequence_length = sum([seq['length'] for seq in sequences]) chromosomes_units = round(sequence_length/(10**(len(str(sequence_length)) - 1)))*(10**(len(str(sequence_length)) - 1)) if sequence_length > 10_000: @@ -508,7 +508,7 @@ def write_main_config(circos_path, output_path, tracks_path, contigs, file_name, # write karyotype file karyotypes = [] - for i, c in enumerate(contigs): + for i, c in enumerate(sequences): karyotypes.append(f"chr - {c['id']} {i + 1} 0 {c['length']} {hex_to_rgb(colors['backbone'])}") with karyotype_path.open('w') as fh: fh.write('\n'.join(karyotypes)) diff --git a/bakta/proteins.py b/bakta/proteins.py index e22a9aa4..a7835611 100644 --- a/bakta/proteins.py +++ b/bakta/proteins.py @@ -128,7 +128,7 @@ def main(): ############################################################################ try: print('Parse protein sequences...') - aas = fasta.import_contigs(aa_path, False, False) + aas = fasta.import_sequences(aa_path, False, False) log.info('imported sequences=%i', len(aas)) print(f'\timported: {len(aas)}') except: @@ -139,7 +139,7 @@ def main(): aa['type'] = bc.FEATURE_CDS aa['aa'] = aa['sequence'] aa['locus'] = aa['id'] - aa['contig'] = '-' + aa['sequence'] = '-' aa['start'] = mock_start aa['stop'] = -1 aa['strand'] = bc.STRAND_UNKNOWN @@ -166,11 +166,11 @@ def main(): tsv.write_protein_features(aas, header_columns, map_aa_columns, annotations_path) inference_path = output_path.joinpath(f'{cfg.prefix}.inference.tsv') print(f'\tfeature inferences (TSV): {inference_path}') - mock_contigs = [{'id': '-'}] - features_by_contig = {'-': aas} - tsv.write_feature_inferences(mock_contigs, features_by_contig, inference_path) + mock_sequences = [{'id': '-'}] + features_by_sequence = {'-': aas} + tsv.write_feature_inferences(mock_sequences, features_by_sequence, inference_path) for aa in aas: # cleanup mock attributes - aa.pop('contig', None) + aa.pop('sequence', None) aa.pop('start', None) aa.pop('stop', None) aa.pop('strand', None) diff --git a/bakta/psc.py b/bakta/psc.py index 24cb774b..0cb7398c 100644 --- a/bakta/psc.py +++ b/bakta/psc.py @@ -84,8 +84,8 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen 'valid': identity >= bc.MIN_PSC_IDENTITY # whether a valid PSC hit (id > 90%) } log.debug( - 'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id + 'homology: seq=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef90=%s', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id ) pscs_found = [] @@ -142,8 +142,8 @@ def lookup(features: Sequence[dict], pseudo: bool = False): feature['psc'] = psc no_psc_lookups += 1 log.debug( - 'lookup: contig=%s, start=%i, stop=%i, strand=%s, UniRef90=%s, EC=%s, gene=%s, product=%s', - feature['contig'], feature['start'], feature['stop'], feature['strand'], psc.get(DB_PSC_COL_UNIREF90, ''), psc.get(DB_PSC_COL_EC, ''), psc.get(DB_PSC_COL_GENE, ''), psc.get(DB_PSC_COL_PRODUCT, '') + 'lookup: seq=%s, start=%i, stop=%i, strand=%s, UniRef90=%s, EC=%s, gene=%s, product=%s', + feature['sequence'], feature['start'], feature['stop'], feature['strand'], psc.get(DB_PSC_COL_UNIREF90, ''), psc.get(DB_PSC_COL_EC, ''), psc.get(DB_PSC_COL_GENE, ''), psc.get(DB_PSC_COL_PRODUCT, '') ) else: log.debug('lookup: ID not found! uniref90_id=%s', uniref90_id) diff --git a/bakta/pscc.py b/bakta/pscc.py index b4b34906..372c4cbf 100644 --- a/bakta/pscc.py +++ b/bakta/pscc.py @@ -76,8 +76,8 @@ def search(cdss: Sequence[dict]) -> Tuple[Sequence[dict], Sequence[dict], Sequen 'evalue': evalue } log.debug( - 'homology: contig=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef50=%s', - cds['contig'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id + 'homology: seq=%s, start=%i, stop=%i, strand=%s, aa-length=%i, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, UniRef50=%s', + cds['sequence'], cds['start'], cds['stop'], cds['strand'], len(cds['aa']), query_cov, subject_cov, identity, bitscore, evalue, cluster_id ) psccs_found = [] @@ -129,8 +129,8 @@ def lookup(features: Sequence[dict], pseudo: bool = False): feature['pscc'] = pscc # add PSCC annotation info no_pscc_lookups += 1 log.debug( - 'lookup: contig=%s, start=%i, stop=%i, strand=%s, UniRef50=%s, product=%s', - feature['contig'], feature['start'], feature['stop'], feature['strand'], pscc.get(DB_PSCC_COL_UNIREF50, ''), pscc.get(DB_PSCC_COL_PRODUCT, '') + 'lookup: seq=%s, start=%i, stop=%i, strand=%s, UniRef50=%s, product=%s', + feature['sequence'], feature['start'], feature['stop'], feature['strand'], pscc.get(DB_PSCC_COL_UNIREF50, ''), pscc.get(DB_PSCC_COL_PRODUCT, '') ) else: log.debug('lookup: ID not found! uniref50_id=%s', uniref50_id) diff --git a/bakta/so.py b/bakta/so.py index 7c9cb344..809bc2ff 100644 --- a/bakta/so.py +++ b/bakta/so.py @@ -7,7 +7,7 @@ SO_REPLICON = SO('replicon', 'SO:0001235') SO_REPLICON_CHROMOSOME = SO('chromosome', 'SO:0000340') SO_REPLICON_PLASMID = SO('plasmid', 'SO:0000155') -SO_CONTIG = SO('contig', 'SO:0000149') +SO_CONTIG = SO('sequence', 'SO:0000149') SO_OPERON = SO('operon', 'SO:0000178') SO_PROMOTER = SO('promoter', 'SO:0000167') diff --git a/bakta/ups.py b/bakta/ups.py index 2d048c8e..e9f48bba 100644 --- a/bakta/ups.py +++ b/bakta/ups.py @@ -45,8 +45,8 @@ def lookup(features: Sequence[dict]): feature['ups'] = ups features_found.append(feature) log.debug( - 'lookup: contig=%s, start=%i, stop=%i, aa-length=%i, strand=%s, UniParc=%s, UniRef100=%s, NCBI NRP=%s', - feature['contig'], feature['start'], feature['stop'], len(feature['aa']), feature['strand'], ups.get(DB_UPS_COL_UNIPARC, ''), ups.get(DB_UPS_COL_UNIREF100, ''), ups.get(DB_UPS_COL_REFSEQ_NRP, '') + 'lookup: seq=%s, start=%i, stop=%i, aa-length=%i, strand=%s, UniParc=%s, UniRef100=%s, NCBI NRP=%s', + feature['sequence'], feature['start'], feature['stop'], len(feature['aa']), feature['strand'], ups.get(DB_UPS_COL_UNIPARC, ''), ups.get(DB_UPS_COL_UNIREF100, ''), ups.get(DB_UPS_COL_REFSEQ_NRP, '') ) else: features_not_found.append(feature) diff --git a/bakta/utils.py b/bakta/utils.py index 5b17aee9..04d02704 100644 --- a/bakta/utils.py +++ b/bakta/utils.py @@ -68,7 +68,7 @@ def parse_arguments(): arg_group_io = parser.add_argument_group('Input / Output') arg_group_io.add_argument('--db', '-d', action='store', default=None, help='Database path (default = /db). Can also be provided as BAKTA_DB environment variable.') - arg_group_io.add_argument('--min-contig-length', '-m', action='store', type=int, default=1, dest='min_contig_length', help='Minimum contig size (default = 1; 200 in compliant mode)') + arg_group_io.add_argument('--min-contig-length', '-m', action='store', type=int, default=1, dest='min_contig_length', help='Minimum contig/sequence size (default = 1; 200 in compliant mode)') arg_group_io.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files') arg_group_io.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)') arg_group_io.add_argument('--force', '-f', action='store_true', help='Force overwriting existing output folder (except for current working directory)') @@ -87,7 +87,7 @@ def parse_arguments(): arg_group_annotation.add_argument('--locus', action='store', default=None, help="Locus prefix (default = 'contig')") arg_group_annotation.add_argument('--locus-tag', action='store', default=None, dest='locus_tag', help='Locus tag prefix (default = autogenerated)') arg_group_annotation.add_argument('--locus-tag-increment', action='store', type=int, default=1, choices=[1, 5, 10], dest='locus_tag_increment', help='Locus tag increment: 1/5/10 (default = 1)') - arg_group_annotation.add_argument('--keep-contig-headers', action='store_true', dest='keep_contig_headers', help='Keep original contig headers') + arg_group_annotation.add_argument('--keep-contig-headers', action='store_true', dest='keep_contig_headers', help='Keep original contig/sequence headers') arg_group_annotation.add_argument('--compliant', action='store_true', help='Force Genbank/ENA/DDJB compliance') arg_group_annotation.add_argument('--replicons', '-r', action='store', default=None, dest='replicons', help='Replicon information table (tsv/csv)') arg_group_annotation.add_argument('--regions', action='store', default=None, help='Path to pre-annotated regions in GFF3 or Genbank format (regions only, no functional annotations).') @@ -261,11 +261,11 @@ def test_dependencies(): test_dependency(DEPENDENCY_CIRCOS) -def create_locus_tag_prefix(contigs: Sequence[dict], length: int=6) -> str: +def create_locus_tag_prefix(sequences: Sequence[dict], length: int=6) -> str: """Create either genus/species or sequence MD5 hex based locus tag prefix.""" hash = hashlib.md5() - for contig in contigs: - hash.update(str.encode(contig['sequence'])) + for seq in sequences: + hash.update(str.encode(seq['sequence'])) hexdigest = hash.hexdigest().upper() locus_prefix_chars = [] i = 0 @@ -300,10 +300,10 @@ def calc_genome_stats(genome: dict, features: Sequence[dict]): # N50 gc_sum = 0 n_sum = 0 - for contig in genome['contigs']: - seq = contig['sequence'] - gc_sum += seq.count('G') + seq.count('C') - n_sum += seq.count('N') + for seq in genome['sequences']: + nt = seq['sequence'] + gc_sum += nt.count('G') + nt.count('C') + n_sum += nt.count('N') gc_ratio = gc_sum / (genome_size - n_sum) genome['gc'] = gc_ratio log.info('GC=%0.3f', gc_ratio) @@ -313,21 +313,21 @@ def calc_genome_stats(genome: dict, features: Sequence[dict]): log.info('N=%0.3f', n_ratio) n50 = 0 - contig_length_sum = 0 - for contig in sorted(genome['contigs'], key=lambda x: x['length'], reverse=True): - contig_length = len(contig['sequence']) - contig_length_sum += contig_length - if(contig_length_sum >= genome_size / 2): - n50 = contig_length + sequence_length_sum = 0 + for seq in sorted(genome['sequences'], key=lambda x: x['length'], reverse=True): + nt_length = len(seq['sequence']) + sequence_length_sum += nt_length + if(sequence_length_sum >= genome_size / 2): + n50 = nt_length break genome['n50'] = n50 log.info('N50=%i', n50) - contigs_by_id = {c['id']: c for c in genome['contigs']} + sequence_by_id = {seq['id']: seq for seq in genome['sequences']} coding_nts = 0 for feat in features: if(feat.get('edge', False)): - sequence_length = contigs_by_id[feat['contig']]['length'] + sequence_length = sequence_by_id[feat['sequence']]['length'] coding_nts += feat['stop'] + (sequence_length - feat['start'] + 1) # feature coding nucleotides else: coding_nts += feat['stop'] - feat['start'] + 1 # feature coding nucleotides @@ -389,120 +389,120 @@ def parse_replicon_table(replicon_table_path: Path) -> Dict[str, dict]: return replicons -def qc_contigs(contigs: Sequence[dict], replicons: Dict[str, dict]) -> Tuple[Sequence[dict], bool]: - valid_contigs = [] - contig_counter = 1 - contig_prefix = cfg.locus if cfg.locus else 'contig' +def qc_sequences(sequences: Sequence[dict], replicons: Dict[str, dict]) -> Tuple[Sequence[dict], bool]: + valid_sequences = [] + sequence_counter = 1 + sequence_prefix = cfg.locus if cfg.locus else 'contig' complete_genome = True plasmid_number = 1 - contig_ids = set() - for contig in contigs: - if(contig['length'] >= cfg.min_contig_length): - contig_id_generated = f'{contig_prefix}_{contig_counter}' - contig['simple_id'] = contig_id_generated - contig_counter += 1 - - contig_description = contig['description'].lower() + sequence_ids = set() + for seq in sequences: + if(seq['length'] >= cfg.min_sequence_length): + sequence_id_generated = f'{sequence_prefix}_{sequence_counter}' + seq['simple_id'] = sequence_id_generated + sequence_counter += 1 + + sequence_description = seq['description'].lower() if(cfg.complete): - contig['complete'] = True - contig['topology'] = bc.TOPOLOGY_CIRCULAR - elif('circular=true' in contig_description): # detection of Unicycler circularized sequences - contig['complete'] = True - contig['topology'] = bc.TOPOLOGY_CIRCULAR - log.debug('qc: detected Unicycler circular topology via description: id=%s, description=%s', contig['id'], contig['description']) - elif('complete' in contig_description and 'complete=false' not in contig_description): # detection of public/described sequences - contig['complete'] = True - contig['topology'] = bc.TOPOLOGY_CIRCULAR - log.debug('qc: detected complete replicon via description: id=%s, description=%s', contig['id'], contig['description']) + seq['complete'] = True + seq['topology'] = bc.TOPOLOGY_CIRCULAR + elif('circular=true' in sequence_description): # detection of Unicycler circularized sequences + seq['complete'] = True + seq['topology'] = bc.TOPOLOGY_CIRCULAR + log.debug('qc: detected Unicycler circular topology via description: id=%s, description=%s', seq['id'], seq['description']) + elif('complete' in sequence_description and 'complete=false' not in sequence_description): # detection of public/described sequences + seq['complete'] = True + seq['topology'] = bc.TOPOLOGY_CIRCULAR + log.debug('qc: detected complete replicon via description: id=%s, description=%s', seq['id'], seq['description']) - if('chromosome' in contig_description): - contig['type'] = bc.REPLICON_CHROMOSOME - log.debug('qc: detected chromosome replicon type via description: id=%s, description=%s', contig['id'], contig['description']) - elif('plasmid' in contig_description): - contig['type'] = bc.REPLICON_PLASMID - log.debug('qc: detected plasmid replicon type via description: id=%s, description=%s', contig['id'], contig['description']) - - contig_desc = [] - if(cfg.keep_contig_headers): - if(contig['id'] in contig_ids): - log.error('Fasta import: duplicated contig id! contig-id=%s', contig['id']) - sys.exit(f"ERROR: Detected duplicated contig id! Contig ID ({contig['id']}) occures multiple times!") + if('chromosome' in sequence_description): + seq['type'] = bc.REPLICON_CHROMOSOME + log.debug('qc: detected chromosome replicon type via description: id=%s, description=%s', seq['id'], seq['description']) + elif('plasmid' in sequence_description): + seq['type'] = bc.REPLICON_PLASMID + log.debug('qc: detected plasmid replicon type via description: id=%s, description=%s', seq['id'], seq['description']) + + sequence_desc = [] + if(cfg.keep_sequence_headers): + if(seq['id'] in sequence_ids): + log.error('Fasta import: duplicated seq id! seq-id=%s', seq['id']) + sys.exit(f"ERROR: Detected duplicated sequence id! Sequence ID ({seq['id']}) occures multiple times!") else: - contig_ids.add(contig['id']) + sequence_ids.add(seq['id']) else: - contig['orig_id'] = contig['id'] - contig['id'] = contig_id_generated - contig['orig_description'] = contig['description'] + seq['orig_id'] = seq['id'] + seq['id'] = sequence_id_generated + seq['orig_description'] = seq['description'] if(cfg.genus is not None or cfg.species is not None): organism = ' '.join([t for t in [cfg.genus, cfg.species] if t is not None]) - contig_desc.append(f"[organism={organism}]") + sequence_desc.append(f"[organism={organism}]") if(cfg.strain): - contig_desc.append(f'[strain={cfg.strain}]') - contig_desc.append(f'[gcode={cfg.translation_table}]') - - if(contig['complete'] and contig['topology'] == bc.TOPOLOGY_CIRCULAR): # detection of chromosomes/plasmids via sequence length thresholds - if(contig['length'] >= bc.REPLICON_LENGTH_THRESHOLD_CHROMOSOME): - contig['type'] = bc.REPLICON_CHROMOSOME - log.debug('qc: detected replicon type via length: id=%s, type=%s, length=%i, description=%s', contig['id'], contig['type'], contig['length'], contig['description']) - elif(contig['length'] < bc.REPLICON_LENGTH_THRESHOLD_PLASMID): - contig['type'] = bc.REPLICON_PLASMID - log.debug('qc: detected replicon type via length: id=%s, type=%s, length=%i, description=%s', contig['id'], contig['type'], contig['length'], contig['description']) - valid_contigs.append(contig) - - if(len(contigs) == 1 and cfg.plasmid is not None): # use plasmid mode - contig['type'] = bc.REPLICON_PLASMID - contig['topology'] = bc.TOPOLOGY_CIRCULAR - contig['name'] = cfg.plasmid + sequence_desc.append(f'[strain={cfg.strain}]') + sequence_desc.append(f'[gcode={cfg.translation_table}]') + + if(seq['complete'] and seq['topology'] == bc.TOPOLOGY_CIRCULAR): # detection of chromosomes/plasmids via sequence length thresholds + if(seq['length'] >= bc.REPLICON_LENGTH_THRESHOLD_CHROMOSOME): + seq['type'] = bc.REPLICON_CHROMOSOME + log.debug('qc: detected replicon type via length: id=%s, type=%s, length=%i, description=%s', seq['id'], seq['type'], seq['length'], seq['description']) + elif(seq['length'] < bc.REPLICON_LENGTH_THRESHOLD_PLASMID): + seq['type'] = bc.REPLICON_PLASMID + log.debug('qc: detected replicon type via length: id=%s, type=%s, length=%i, description=%s', seq['id'], seq['type'], seq['length'], seq['description']) + valid_sequences.append(seq) + + if(len(sequences) == 1 and cfg.plasmid is not None): # use plasmid mode + seq['type'] = bc.REPLICON_PLASMID + seq['topology'] = bc.TOPOLOGY_CIRCULAR + seq['name'] = cfg.plasmid elif(replicons): # use user provided replicon table - contig_id = contig['orig_id'] if 'orig_id' in contig else contig['id'] - replicon = replicons.get(contig_id, None) + sequence_id = seq['orig_id'] if 'orig_id' in seq else seq['id'] + replicon = replicons.get(sequence_id, None) if(replicon): - contig['type'] = replicon['replicon_type'] - contig['topology'] = replicon['topology'] - contig['complete'] = replicon['replicon_type'] != bc.REPLICON_CONTIG + seq['type'] = replicon['replicon_type'] + seq['topology'] = replicon['topology'] + seq['complete'] = replicon['replicon_type'] != bc.REPLICON_CONTIG if(replicon['name']): - contig['name'] = replicon['name'] - if(not cfg.keep_contig_headers): - contig['id'] = replicon['new_locus_id'] if replicon['new_locus_id'] else contig['simple_id'] - contig.pop('simple_id') + seq['name'] = replicon['name'] + if(not cfg.keep_sequence_headers): + seq['id'] = replicon['new_locus_id'] if replicon['new_locus_id'] else seq['simple_id'] + seq.pop('simple_id') - if(not cfg.keep_contig_headers): - if(contig['complete']): - contig_desc.append('[completeness=complete]') - if(contig['topology'] != bc.REPLICON_CONTIG): - contig_desc.append(f"[topology={contig['topology']}]") - if(contig['type'] == bc.REPLICON_CHROMOSOME): - contig_desc.append('[location=chromosome]') - elif(contig['type'] == bc.REPLICON_PLASMID): - if(not contig.get('name', None)): - contig['name'] = f'unnamed{plasmid_number}' + if(not cfg.keep_sequence_headers): + if(seq['complete']): + sequence_desc.append('[completeness=complete]') + if(seq['topology'] != bc.REPLICON_CONTIG): + sequence_desc.append(f"[topology={seq['topology']}]") + if(seq['type'] == bc.REPLICON_CHROMOSOME): + sequence_desc.append('[location=chromosome]') + elif(seq['type'] == bc.REPLICON_PLASMID): + if(not seq.get('name', None)): + seq['name'] = f'unnamed{plasmid_number}' plasmid_number += 1 - contig_desc.append(f"[plasmid-name={contig['name']}]") - contig['description'] = ' '.join(list(dict.fromkeys(contig_desc))) # remove duplicates remaining order + sequence_desc.append(f"[plasmid-name={seq['name']}]") + seq['description'] = ' '.join(list(dict.fromkeys(sequence_desc))) # remove duplicates remaining order - if(contig['type'] == bc.REPLICON_CONTIG): + if(seq['type'] == bc.REPLICON_CONTIG): complete_genome = False if(cfg.compliant): # check INSDC compliance - if(len(contig['id']) > 25): # max 25 characters - log.error('INSDC compliance: contig id larger than 25! contig-id=%s', contig['id']) - sys.exit(f"ERROR: INSDC compliance failed! Contig ID ({contig['id']}) larger than 25 characers!") - if(bc.RE_INSDC_ID.fullmatch(contig['id']) is None): # invalid characters - log.error('INSDC compliance: contig id contains invalid characters! contig-id=%s', contig['id']) - sys.exit(f"ERROR: INSDC compliance failed! Contig ID ({contig['id']}) contains invalid characters!") + if(len(seq['id']) > 25): # max 25 characters + log.error('INSDC compliance: seq id larger than 25! seq-id=%s', seq['id']) + sys.exit(f"ERROR: INSDC compliance failed! Sequence ID ({seq['id']}) larger than 25 characers!") + if(bc.RE_INSDC_ID.fullmatch(seq['id']) is None): # invalid characters + log.error('INSDC compliance: seq id contains invalid characters! seq-id=%s', seq['id']) + sys.exit(f"ERROR: INSDC compliance failed! Sequence ID ({seq['id']}) contains invalid characters!") log.info( "qc: revised sequence: id=%s, orig-id=%s, type=%s, complete=%s, topology=%s, name=%s, description='%s', orig-description='%s'", - contig['id'], contig.get('orig_id', ''), contig['type'], contig['complete'], contig['topology'], contig.get('name', ''), contig['description'], contig.get('orig_description', '') + seq['id'], seq.get('orig_id', ''), seq['type'], seq['complete'], seq['topology'], seq.get('name', ''), seq['description'], seq.get('orig_description', '') ) - return valid_contigs, complete_genome + return valid_sequences, complete_genome -def extract_feature_sequence(feature: dict, contig: dict) -> str: +def extract_feature_sequence(feature: dict, sequence: dict) -> str: if(feature.get('edge', False)): - seq = contig['sequence'][feature['start']-1:] + contig['sequence'][:feature['stop']] + nt = sequence['sequence'][feature['start']-1:] + sequence['sequence'][:feature['stop']] else: - seq = contig['sequence'][feature['start']-1:feature['stop']] + nt = sequence['sequence'][feature['start']-1:feature['stop']] if(feature['strand'] == bc.STRAND_REVERSE): - seq = str(Seq(seq).reverse_complement()) - return seq + nt = str(Seq(nt).reverse_complement()) + return nt diff --git a/scripts/extract-region.py b/scripts/extract-region.py index b357b137..23da1605 100755 --- a/scripts/extract-region.py +++ b/scripts/extract-region.py @@ -43,9 +43,9 @@ with genome_path.open() as fh: genome = json.load(fh) -contig_id = args.sequence -if(contig_id is None): # take first sequence as default - contig_id = genome['sequences'][0]['id'] +sequence_id = args.sequence +if(sequence_id is None): # take first sequence as default + sequence_id = genome['sequences'][0]['id'] prefix = args.prefix if(prefix is None): # use input file prefix as default @@ -55,14 +55,14 @@ print('Extract features within selected region...') features_selected = [] for feat in genome['features']: - if(feat['contig'] == contig_id): + if(feat['sequence'] == sequence_id): if(feat['start'] >= args.min and feat['stop'] <= args.max): features_selected.append(feat) -features_by_contig = {contig_id: features_selected} # needed for GFF3 export +features_by_sequence = {sequence_id: features_selected} # needed for GFF3 export print(f'\t...selected features: {len(features_selected)}') genome['features'] = features_selected -genome['contigs'] = [sequence for sequence in genome['sequences'] if sequence['id'] == contig_id] +genome['sequences'] = [sequence for sequence in genome['sequences'] if sequence['id'] == sequence_id] genome['genus'] = genome['genome']['genus'] genome['species'] = genome['genome']['species'] genome['strain'] = genome['genome']['strain'] @@ -76,7 +76,7 @@ print('Write selected features...') output_path = Path(args.output).resolve() gff3_path = output_path.joinpath(f'{prefix}.gff3') -gff.write_features(genome, features_by_contig, gff3_path) +gff.write_features(genome, features_by_sequence, gff3_path) print('\t...INSDC GenBank & EMBL') genbank_path = output_path.joinpath(f'{prefix}.gbff') embl_path = output_path.joinpath(f'{prefix}.embl') diff --git a/test/test_edge_features.py b/test/test_edge_features.py index 38b381ce..35b164f3 100644 --- a/test/test_edge_features.py +++ b/test/test_edge_features.py @@ -7,7 +7,7 @@ def test_bakta_edge_features(tmpdir): - # test edge lable on mock CDS contig + # test edge lable on mock CDS sequence proc = run( [ 'bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--prefix', 'test', @@ -26,9 +26,9 @@ def test_bakta_edge_features(tmpdir): results = json.load(fh) for feat in results['features']: - if(feat['contig'] != 'dummy'): - if('forward' in feat['contig']): + if(feat['sequence'] != 'dummy'): + if('forward' in feat['sequence']): assert feat['strand'] == bc.STRAND_FORWARD - elif('reverse' in feat['contig']): + elif('reverse' in feat['sequence']): assert feat['strand'] == bc.STRAND_REVERSE - assert feat.get('edge', False) == ('edge' in feat['contig']) + assert feat.get('edge', False) == ('edge' in feat['sequence']) diff --git a/test/test_nt_sequences.py b/test/test_nt_sequences.py index 93e2bd2f..534243b7 100644 --- a/test/test_nt_sequences.py +++ b/test/test_nt_sequences.py @@ -27,7 +27,7 @@ def test_bakta_cds_nt_sequence(tmpdir): results = json.load(fh) for feat in results['features']: - if(feat['contig'] != 'dummy'): + if(feat['sequence'] != 'dummy'): assert feat['nt'] == CDS diff --git a/test/test_pseudo.py b/test/test_pseudo.py index a117882f..c35199e1 100644 --- a/test/test_pseudo.py +++ b/test/test_pseudo.py @@ -12,7 +12,7 @@ 'MKEGQFVGY-FKMKEQRKIPLTHIMIIGAFIFAFLQVVLLASLVHAVNVNNEIQEGLFQSGRIMVESLQHILSVQTGIN', # * { - 'contig': 'foo', + 'sequence': 'foo', 'start': 37, 'stop': 100, 'strand': bc.STRAND_FORWARD, @@ -36,7 +36,7 @@ 'MTQRPWSKLQREIYDLLTPTINLQIHCTRYPMRSQNGGSTDLPRYWITLDKNVIWDYPKDFIAGNGGVRNFHGETCWYPYLTDICSISDLLREYIDTPKAELLTKQFTSDKWGLVNILRAADRRIGMRRLDQLRRKTHNIAAL\\KIIA\\AVANNYMPGVASYAG', 'MTQRPWSKLQREIYDLLTPTINLQIHCTRYPMRSQNGGSTDLPRYWITLDKDVIWDYPKDFMAGNGGVRNFHGETCWYPYLTDICSISDLLREYIDTPKAELLTKQFTSDKWGLVNILRAADRRIGMRRLDQLRRKTHNIAAL-KIIA-PVANDYMPGVDSYAG', { - 'contig': 'foo', + 'sequence': 'foo', 'start': 1, 'stop': 100, 'strand': bc.STRAND_FORWARD, @@ -59,7 +59,7 @@ 'MSLYIKLILSIVREISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNDAS*GSWYNF', 'MPLYIKLILSIVRRISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNNASQGSWYNF', { - 'contig': 'foo', + 'sequence': 'foo', 'start': 1, 'stop': 100, 'strand': bc.STRAND_FORWARD, @@ -82,7 +82,7 @@ 'MSLYIKLILSIVREISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNDAS*GSWYNF', 'MPLYIKLILSIVRRISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNNASUGSWYNF', { - 'contig': 'foo', + 'sequence': 'foo', 'start': 1, 'stop': 100, 'strand': bc.STRAND_FORWARD, @@ -105,7 +105,7 @@ 'MSLYIKLILSIVREISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNDAS*GSWYNF', 'MPLYIKLILSIVRRISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNNASOGSWYNF', { - 'contig': 'foo', + 'sequence': 'foo', 'start': 1, 'stop': 100, 'strand': bc.STRAND_FORWARD, @@ -128,7 +128,7 @@ 'MLSIQSNRDWLSMSIFSDYSSSSEMHNNLTIDYYLALSSTKGSGITNIISIILQQAQDYDVAKIT', 'MLSIQSNRDWLSASIFSDYSSSSEMHNNLTIDYYLALSSTKGSGITNIISIILQQAQDYDVAKIT', { - 'contig': 'foo', + 'sequence': 'foo', 'start': 40, 'stop': 100, 'strand': bc.STRAND_FORWARD, @@ -153,7 +153,7 @@ 'MKEGQFVGY-FKMKEQRKIPLTHIMIIGAFIFAFLQVVLLASLVHAVNVNNEIQEGLFQSGRIMVESLQHILSVQTGIN', # * { - 'contig': 'foo', + 'sequence': 'foo', 'start': 10, 'stop': 200, 'strand': bc.STRAND_REVERSE, @@ -177,7 +177,7 @@ 'MTQRPWSKLQREIYDLLTPTINLQIHCTRYPMRSQNGGSTDLPRYWITLDKNVIWDYPKDFIAGNGGVRNFHGETCWYPYLTDICSISDLLREYIDTPKAELLTKQFTSDKWGLVNILRAADRRIGMRRLDQLRRKTHNIAAL\\KIIA\\AVANNYMPGVASYAG', 'MTQRPWSKLQREIYDLLTPTINLQIHCTRYPMRSQNGGSTDLPRYWITLDKDVIWDYPKDFMAGNGGVRNFHGETCWYPYLTDICSISDLLREYIDTPKAELLTKQFTSDKWGLVNILRAADRRIGMRRLDQLRRKTHNIAAL-KIIA-PVANDYMPGVDSYAG', { - 'contig': 'foo', + 'sequence': 'foo', 'start': 1, 'stop': 500, 'strand': bc.STRAND_REVERSE, @@ -200,7 +200,7 @@ 'MSLYIKLILSIVREISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNDAS*GSWYNF', 'MPLYIKLILSIVRRISVNTICSLIVVVALSLLSFSSVAKTITAVGSTINSTEKEISLQAEKQGKSYKILGAFFKNRVYMIAKLTPVSKNNASQGSWYNF', { - 'contig': 'foo', + 'sequence': 'foo', 'start': 100, 'stop': 500, 'strand': bc.STRAND_REVERSE, @@ -223,7 +223,7 @@ 'MLSIQSNRDWLSMSIFSDYSSSSEMHNNLTIDYYLALSSTKGSGITNIISIILQQAQDYDVAKIT', 'MLSIQSNRDWLSASIFSDYSSSSEMHNNLTIDYYLALSSTKGSGITNIISIILQQAQDYDVAKIT', { - 'contig': 'foo', + 'sequence': 'foo', 'start': 40, 'stop': 100, 'strand': bc.STRAND_REVERSE, @@ -262,7 +262,7 @@ def test_compare_alignments(alignment, ref_alignment, cds, coordinates, expected assert observations == expected_result -@pytest.mark.parametrize('cds, contig, expected_result', [ +@pytest.mark.parametrize('cds, sequence, expected_result', [ ( { 'start': 310, # linear fits cutoff @@ -327,5 +327,5 @@ def test_compare_alignments(alignment, ref_alignment, cds, coordinates, expected ) ] ) -def test_get_elongated_cds(cds, contig, expected_result): - assert feat_cds.get_elongated_cds(cds, contig, offset=300) == expected_result +def test_get_elongated_cds(cds, sequence, expected_result): + assert feat_cds.get_elongated_cds(cds, sequence, offset=300) == expected_result diff --git a/test/test_regions.py b/test/test_regions.py index 3c7ad373..d02bdb20 100644 --- a/test/test_regions.py +++ b/test/test_regions.py @@ -29,8 +29,8 @@ def test_wrong_seq_id_failiing(tmpdir): @pytest.mark.parametrize( 'keep_contig_headers', [ - ([]), # autogenerate contig ids - (['--keep-contig-headers']) # keep contig headers + ([]), # autogenerate sequence ids + (['--keep-contig-headers']) # keep sequence headers ] ) def test_regions_plasmid(regions, keep_contig_headers, tmpdir): diff --git a/test/test_sORF.py b/test/test_sORF.py index 72214a6a..56c21c9a 100644 --- a/test/test_sORF.py +++ b/test/test_sORF.py @@ -22,13 +22,13 @@ } GENOME_1 = { - 'contigs': [CONTIG_1] + 'sequences': [CONTIG_1] } GENOME_2 = { - 'contigs': [CONTIG_2] + 'sequences': [CONTIG_2] } GENOME_3 = { - 'contigs': [CONTIG_3] + 'sequences': [CONTIG_3] } From ec091963b1b744d820196f6ca7944a759651c0d0 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 15 Oct 2024 13:20:28 +0200 Subject: [PATCH 2/8] rename seq 'sequence' attribute to 'nt' --- bakta/features/cds.py | 8 ++++---- bakta/features/gaps.py | 4 ++-- bakta/features/s_orf.py | 2 +- bakta/io/fasta.py | 6 +++--- bakta/io/gff.py | 2 +- bakta/io/insdc.py | 2 +- bakta/plot.py | 6 +++--- bakta/utils.py | 10 +++++----- 8 files changed, 20 insertions(+), 20 deletions(-) diff --git a/bakta/features/cds.py b/bakta/features/cds.py index 177dda58..0475ff67 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -42,7 +42,7 @@ def predict(genome: dict): if(not prodigal_metamode): log.info('create prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed) gene_finder = pyrodigal.GeneFinder(meta=prodigal_metamode, closed=closed) - seqs = [seq['sequence'] for seq in genome['sequences']] + seqs = [seq['nt'] for seq in genome['sequences']] trainings_info = gene_finder.train(*seqs, translation_table=cfg.translation_table) else: log.info('skip creation of prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed) @@ -64,7 +64,7 @@ def predict(genome: dict): gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=True, mask=True) else: gene_finder = pyrodigal.GeneFinder(trainings_info, meta=False, closed=True, mask=True) - sequences = [seq['sequence'] for seq in linear_sequences] + sequences = [seq['nt'] for seq in linear_sequences] with cf.ThreadPoolExecutor(max_workers=cfg.threads) as tpe: for seq, genes in zip(linear_sequences, tpe.map(gene_finder.find_genes, sequences)): cdss_per_sequence = create_cdss(genes, seq) @@ -77,7 +77,7 @@ def predict(genome: dict): gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=False, mask=True) else: gene_finder = pyrodigal.GeneFinder(trainings_info, meta=False, closed=False, mask=True) - sequences = [seq['sequence'] for seq in circular_sequences] + sequences = [seq['nt'] for seq in circular_sequences] with cf.ThreadPoolExecutor(max_workers=cfg.threads) as tpe: for seq, genes in zip(circular_sequences, tpe.map(gene_finder.find_genes, sequences)): cdss_per_sequence = create_cdss(genes, seq) @@ -790,7 +790,7 @@ def get_elongated_cds(cds: dict, sequence: dict, offset: int = bc.PSEUDOGENE_OFF 'elongation_downstream': offset } - sequence_length = len(sequence['sequence']) + sequence_length = len(sequence['nt']) if sequence['topology'] == 'circular' and elongated_cds['start'] - offset < 0: elongated_cds['start'] = sequence_length + elongated_cds['start'] - offset elongated_cds['edge'] = True diff --git a/bakta/features/gaps.py b/bakta/features/gaps.py index 5e052de6..0e2610ba 100644 --- a/bakta/features/gaps.py +++ b/bakta/features/gaps.py @@ -14,7 +14,7 @@ def detect_assembly_gaps(genome: dict) -> Sequence[dict]: gaps = [] for seq in genome['sequences']: - m = RE_ASSEMBLY_GAP.search(seq['sequence']) + m = RE_ASSEMBLY_GAP.search(seq['nt']) while m: start, end = m.span() @@ -31,5 +31,5 @@ def detect_assembly_gaps(genome: dict) -> Sequence[dict]: 'seq=%s, start=%i, stop=%i, length=%s', gap['sequence'], gap['start'], gap['stop'], gap['length'] ) - m = RE_ASSEMBLY_GAP.search(seq['sequence'], end + 1) + m = RE_ASSEMBLY_GAP.search(seq['nt'], end + 1) return gaps diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py index 500d52ef..1ae202d4 100644 --- a/bakta/features/s_orf.py +++ b/bakta/features/s_orf.py @@ -24,7 +24,7 @@ def extract(genome: dict): """Predict open reading frames in mem via BioPython.""" orfs = [] for seq in genome['sequences']: - nt_seq = Seq(seq['sequence']) + nt_seq = Seq(seq['nt']) for strand, strand_nt_seq in [(bc.STRAND_FORWARD, nt_seq), (bc.STRAND_REVERSE, nt_seq.reverse_complement())]: # strands +/- for frame in range(3): # frames 1/2/3 -> 0, 1, 2 seq_frame = strand_nt_seq[frame:] diff --git a/bakta/io/fasta.py b/bakta/io/fasta.py index 076eae4d..7a824ef3 100644 --- a/bakta/io/fasta.py +++ b/bakta/io/fasta.py @@ -43,7 +43,7 @@ def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=T sequence = { 'id': record.id, 'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else '', - 'sequence': raw_sequence, + 'nt': raw_sequence, 'length': len(raw_sequence) } if(is_genomic): @@ -69,9 +69,9 @@ def export_sequences(sequences: Sequence[dict], fasta_path: Path, description: b else: fh.write(f">{seq['id']}\n") if(wrap): - fh.write(wrap_sequence(seq['sequence'])) + fh.write(wrap_sequence(seq['nt'])) else: - fh.write(seq['sequence']) + fh.write(seq['nt']) fh.write('\n') diff --git a/bakta/io/gff.py b/bakta/io/gff.py index 9d7cd355..741e1989 100644 --- a/bakta/io/gff.py +++ b/bakta/io/gff.py @@ -355,7 +355,7 @@ def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_pat fh.write('##FASTA\n') for seq in genome['sequences']: # write sequences fh.write(f">{seq['id']}\n") - fh.write(fasta.wrap_sequence(seq['sequence'])) + fh.write(fasta.wrap_sequence(seq['nt'])) return diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py index d1d380e0..aed9f16d 100644 --- a/bakta/io/insdc.py +++ b/bakta/io/insdc.py @@ -82,7 +82,7 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path: if(len(description) > 0 and description[0] == ' '): # discard potential leading whitespace description = description[1:] - sequence_record = SeqIO.SeqRecord(id=seq['id'], name=seq['id'], description=description, annotations=sequence_annotations, seq=Seq(seq['sequence'])) + sequence_record = SeqIO.SeqRecord(id=seq['id'], name=seq['id'], description=description, annotations=sequence_annotations, seq=Seq(seq['nt'])) source = SeqFeature(FeatureLocation(0, seq['length'], strand=+1), type='source', qualifiers=source_qualifiers) seq_feature_list = [source] diff --git a/bakta/plot.py b/bakta/plot.py index 37c2a3a2..6e1a4e24 100644 --- a/bakta/plot.py +++ b/bakta/plot.py @@ -338,11 +338,11 @@ def write_gc_content_skew(sequences, circos_path, colors): max_gc = 0 max_gc_skew = 0 if float(bp.__version__) >= 1.80: - gc_mean = SeqUtils.gc_fraction(''.join([seq['sequence'] for seq in sequences])) + gc_mean = SeqUtils.gc_fraction(''.join([seq['nt'] for seq in sequences])) else: - gc_mean = SeqUtils.GC(''.join([seq['sequence'] for seq in sequences])) / 100 + gc_mean = SeqUtils.GC(''.join([seq['nt'] for seq in sequences])) / 100 for seq in sequences: - nt = seq['sequence'] + nt = seq['nt'] for w in range(0, len(nt), step_size): start = w - window_size if start < 0: diff --git a/bakta/utils.py b/bakta/utils.py index 04d02704..c6f97cd7 100644 --- a/bakta/utils.py +++ b/bakta/utils.py @@ -265,7 +265,7 @@ def create_locus_tag_prefix(sequences: Sequence[dict], length: int=6) -> str: """Create either genus/species or sequence MD5 hex based locus tag prefix.""" hash = hashlib.md5() for seq in sequences: - hash.update(str.encode(seq['sequence'])) + hash.update(str.encode(seq['nt'])) hexdigest = hash.hexdigest().upper() locus_prefix_chars = [] i = 0 @@ -301,7 +301,7 @@ def calc_genome_stats(genome: dict, features: Sequence[dict]): gc_sum = 0 n_sum = 0 for seq in genome['sequences']: - nt = seq['sequence'] + nt = seq['nt'] gc_sum += nt.count('G') + nt.count('C') n_sum += nt.count('N') gc_ratio = gc_sum / (genome_size - n_sum) @@ -315,7 +315,7 @@ def calc_genome_stats(genome: dict, features: Sequence[dict]): n50 = 0 sequence_length_sum = 0 for seq in sorted(genome['sequences'], key=lambda x: x['length'], reverse=True): - nt_length = len(seq['sequence']) + nt_length = len(seq['nt']) sequence_length_sum += nt_length if(sequence_length_sum >= genome_size / 2): n50 = nt_length @@ -500,9 +500,9 @@ def qc_sequences(sequences: Sequence[dict], replicons: Dict[str, dict]) -> Tuple def extract_feature_sequence(feature: dict, sequence: dict) -> str: if(feature.get('edge', False)): - nt = sequence['sequence'][feature['start']-1:] + sequence['sequence'][:feature['stop']] + nt = sequence['nt'][feature['start']-1:] + sequence['nt'][:feature['stop']] else: - nt = sequence['sequence'][feature['start']-1:feature['stop']] + nt = sequence['nt'][feature['start']-1:feature['stop']] if(feature['strand'] == bc.STRAND_REVERSE): nt = str(Seq(nt).reverse_complement()) return nt From 88850bc6ea37bb49ac765417de077c5992a583d8 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 15 Oct 2024 15:24:29 +0200 Subject: [PATCH 3/8] rename genome data structure to data --- bakta/features/annotation.py | 34 +++++------ bakta/features/cds.py | 42 +++++++------- bakta/features/crispr.py | 4 +- bakta/features/gaps.py | 4 +- bakta/features/nc_rna.py | 8 +-- bakta/features/nc_rna_region.py | 8 +-- bakta/features/ori.py | 4 +- bakta/features/r_rna.py | 8 +-- bakta/features/s_orf.py | 34 +++++------ bakta/features/t_rna.py | 4 +- bakta/features/tm_rna.py | 4 +- bakta/io.py | 24 ++++---- bakta/io/gff.py | 10 ++-- bakta/io/insdc.py | 18 +++--- bakta/io/json.py | 36 ++++++------ bakta/main.py | 90 ++++++++++++++--------------- bakta/utils.py | 18 +++--- scripts/collect-annotation-stats.py | 52 ++++++++--------- scripts/extract-region.py | 28 ++++----- 19 files changed, 215 insertions(+), 215 deletions(-) diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py index 3cfb03ab..63165b47 100644 --- a/bakta/features/annotation.py +++ b/bakta/features/annotation.py @@ -145,46 +145,46 @@ def combine_annotation(feature: dict): feature['db_xrefs'] = sorted(list(db_xrefs)) -def detect_feature_overlaps(genome: dict): +def detect_feature_overlaps(data: dict): """Apply feature type specific hierarchical feature overlap filters. tRNA < tmRNA CDS < tmRNA, tRNA, rRNA, CRISPR sORF < mRNA, tRNA, rRNA, CRISPR, CDS (in-frame & entirely overlapping), sORF (shorter, weaker annotations) """ - sequence_t_rnas = {k['id']: [] for k in genome['sequences']} - for t_rna in genome['features'].get(bc.FEATURE_T_RNA, []): + sequence_t_rnas = {k['id']: [] for k in data['sequences']} + for t_rna in data['features'].get(bc.FEATURE_T_RNA, []): t_rnas = sequence_t_rnas[t_rna['sequence']] t_rnas.append(t_rna) - sequence_tm_rnas = {k['id']: [] for k in genome['sequences']} - for tm_rna in genome['features'].get(bc.FEATURE_TM_RNA, []): + sequence_tm_rnas = {k['id']: [] for k in data['sequences']} + for tm_rna in data['features'].get(bc.FEATURE_TM_RNA, []): tm_rnas = sequence_tm_rnas[tm_rna['sequence']] tm_rnas.append(tm_rna) - sequence_r_rnas = {k['id']: [] for k in genome['sequences']} - for r_rna in genome['features'].get(bc.FEATURE_R_RNA, []): + sequence_r_rnas = {k['id']: [] for k in data['sequences']} + for r_rna in data['features'].get(bc.FEATURE_R_RNA, []): r_rnas = sequence_r_rnas[r_rna['sequence']] r_rnas.append(r_rna) - sequence_ncrna_regions = {k['id']: [] for k in genome['sequences']} - for ncRNA_region in genome['features'].get(bc.FEATURE_NC_RNA_REGION, []): + sequence_ncrna_regions = {k['id']: [] for k in data['sequences']} + for ncRNA_region in data['features'].get(bc.FEATURE_NC_RNA_REGION, []): ncRNA_regions = sequence_ncrna_regions[ncRNA_region['sequence']] ncRNA_regions.append(ncRNA_region) - sequence_crispr_arrays = {k['id']: [] for k in genome['sequences']} - for crispr_array in genome['features'].get(bc.FEATURE_CRISPR, []): + sequence_crispr_arrays = {k['id']: [] for k in data['sequences']} + for crispr_array in data['features'].get(bc.FEATURE_CRISPR, []): crispr_arrays = sequence_crispr_arrays[crispr_array['sequence']] crispr_arrays.append(crispr_array) - sequence_cdss = {k['id']: [] for k in genome['sequences']} - sequence_cdss_user_provided = {k['id']: [] for k in genome['sequences']} - for cds in genome['features'].get(bc.FEATURE_CDS, []): + sequence_cdss = {k['id']: [] for k in data['sequences']} + sequence_cdss_user_provided = {k['id']: [] for k in data['sequences']} + for cds in data['features'].get(bc.FEATURE_CDS, []): if(cds.get('source', None) == bc.CDS_SOURCE_USER): cdss = sequence_cdss_user_provided[cds['sequence']] else: cdss = sequence_cdss[cds['sequence']] cdss.append(cds) - sequence_sorfs = {k['id']: [] for k in genome['sequences']} - for sorf in genome['features'].get(bc.FEATURE_SORF, []): + sequence_sorfs = {k['id']: [] for k in data['sequences']} + for sorf in data['features'].get(bc.FEATURE_SORF, []): sorfs = sequence_sorfs[sorf['sequence']] sorfs.append(sorf) - for seq in genome['sequences']: # find feature overlaps sequence-wise to increase the performance + for seq in data['sequences']: # find feature overlaps sequence-wise to increase the performance log.debug('filter features on seq: %s', seq['id']) # mark tRNAs overlapping with tmRNAs diff --git a/bakta/features/cds.py b/bakta/features/cds.py index 0475ff67..781f95f4 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -30,19 +30,19 @@ log = logging.getLogger('CDS') -def predict(genome: dict): +def predict(data: dict): """Predict open reading frames with Pyrodigal.""" # create Pyrodigal trainining file if not provided by the user prodigal_tf_path = cfg.prodigal_tf trainings_info = None - prodigal_metamode = cfg.meta or genome['size'] < pyrodigal.MIN_SINGLE_GENOME # 20_000 bp + prodigal_metamode = cfg.meta or data['size'] < pyrodigal.MIN_SINGLE_GENOME # 20_000 bp log.debug('prodigal mode: meta=%s', prodigal_metamode) if(prodigal_tf_path is None): - closed = not genome['complete'] + closed = not data['complete'] if(not prodigal_metamode): log.info('create prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed) gene_finder = pyrodigal.GeneFinder(meta=prodigal_metamode, closed=closed) - seqs = [seq['nt'] for seq in genome['sequences']] + seqs = [seq['nt'] for seq in data['sequences']] trainings_info = gene_finder.train(*seqs, translation_table=cfg.translation_table) else: log.info('skip creation of prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed) @@ -58,7 +58,7 @@ def predict(genome: dict): cdss = [] # predict genes on linear sequences - linear_sequences = [seq for seq in genome['sequences'] if seq['topology'] == bc.TOPOLOGY_LINEAR] + linear_sequences = [seq for seq in data['sequences'] if seq['topology'] == bc.TOPOLOGY_LINEAR] if(len(linear_sequences) > 0): if prodigal_metamode: gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=True, mask=True) @@ -71,7 +71,7 @@ def predict(genome: dict): cdss.extend(cdss_per_sequence) # predict genes on circular replicons (chromosomes/plasmids) - circular_sequences = [seq for seq in genome['sequences'] if seq['topology'] == bc.TOPOLOGY_CIRCULAR] + circular_sequences = [seq for seq in data['sequences'] if seq['topology'] == bc.TOPOLOGY_CIRCULAR] if(len(circular_sequences) > 0): if prodigal_metamode: gene_finder = pyrodigal.GeneFinder(meta=True, metagenomic_bins=None, closed=False, mask=True) @@ -182,14 +182,14 @@ def create_cdss(genes, sequence): return cdss_per_sequence -def import_user_cdss(genome: dict, import_path: Path): +def import_user_cdss(data: dict, import_path: Path): """Import user-provided CDS regions. Only CDS region information are imported skipping any existing functional annotations. Parameters ---------- - genome : dict - Genome dictionary holding sequence information + data : dict + data dictionary holding sequence information import_path : Path Path to GFF3 or Genbank file with regions or features. @@ -200,9 +200,9 @@ def import_user_cdss(genome: dict, import_path: Path): """ user_cdss = [] if(cfg.keep_sequence_headers): - sequences_by_id = {seq['id']: seq for seq in genome['sequences']} # use ID as it's not altered -> no 'orig_id' field + sequences_by_id = {seq['id']: seq for seq in data['sequences']} # use ID as it's not altered -> no 'orig_id' field else: - sequences_by_id = {seq['orig_id']: seq for seq in genome['sequences']} # use 'orig_id' instead of autogenerated new 'id' + sequences_by_id = {seq['orig_id']: seq for seq in data['sequences']} # use 'orig_id' instead of autogenerated new 'id' file_suffix = import_path.suffix.lower() if(file_suffix in ['.gff', '.gff3']): # parse GFF3 format try: @@ -401,22 +401,22 @@ def analyze_proteins(cdss: Sequence[dict]): cds['seq_stats'] = seq_stats -def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]): +def revise_translational_exceptions(data: dict, cdss: Sequence[dict]): """ Revise translational exceptions as for istance selenocystein proteins. """ no_revised = 0 - if(bc.FEATURE_NC_RNA_REGION not in genome['features']): # check if ncRNA regions have been detected, otherwise skip analysis and return + if(bc.FEATURE_NC_RNA_REGION not in data['features']): # check if ncRNA regions have been detected, otherwise skip analysis and return return no_revised - sequences = {seq['id']: seq for seq in genome['sequences']} + sequences = {seq['id']: seq for seq in data['sequences']} # detect splitted orphan ORFs of selenocystein proteins that are subject to stop codon recoding. - cdss_per_sequences = {k['id']: [] for k in genome['sequences']} # get CDS per sequence + cdss_per_sequences = {k['id']: [] for k in data['sequences']} # get CDS per sequence for cds in cdss: cdss_per_sequence = cdss_per_sequences[cds['sequence']] if('truncated' not in cds): # exclude truncated CDS for now cdss_per_sequence.append(cds) - cds_pairs_per_sequence = {k['id']: [] for k in genome['sequences']} # extract inframe primate CDS neighbouring pairs + cds_pairs_per_sequence = {k['id']: [] for k in data['sequences']} # extract inframe primate CDS neighbouring pairs for id, cdss_per_sequence in cdss_per_sequences.items(): cdss_per_sequence = sorted(cdss_per_sequence, key=lambda k: k['start']) for i in range(1, len(cdss_per_sequence)): @@ -432,7 +432,7 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]): cds_pairs = cds_pairs_per_sequence[cds_a['sequence']] cds_pairs.append((cds_a, cds_b)) - recoding_regions = [ncrna_region for ncrna_region in genome['features'][bc.FEATURE_NC_RNA_REGION] if ncrna_region['class'] == so.SO_CIS_REG_RECODING_STIMULATION_REGION] # Selenocysteine insertion sequences + recoding_regions = [ncrna_region for ncrna_region in data['features'][bc.FEATURE_NC_RNA_REGION] if ncrna_region['class'] == so.SO_CIS_REG_RECODING_STIMULATION_REGION] # Selenocysteine insertion sequences for recoding_region in recoding_regions: if('selenocysteine' in recoding_region.get('product', '').lower()): cds_pairs = cds_pairs_per_sequence[recoding_region['sequence']] @@ -488,13 +488,13 @@ def revise_translational_exceptions(genome: dict, cdss: Sequence[dict]): return no_revised -def revise_special_cases_annotated(genome: dict, cdss: Sequence[dict]): +def revise_special_cases_annotated(data: dict, cdss: Sequence[dict]): """ Revise rare but known special cases as for istance supposedly truncated dnaA genes on rotated chromosome starts which often appear on re-annotated genomes. """ - sequences = {seq['id']: seq for seq in genome['sequences']} + sequences = {seq['id']: seq for seq in data['sequences']} # look for supposedly truncated dnaA genes on rotated chromosome starts: start=1, strand=+ dnaA = None for cds in cdss: @@ -611,7 +611,7 @@ def predict_pseudo_candidates(hypotheticals: Sequence[dict]) -> Sequence[dict]: return pseudo_candidates -def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome: dict) -> Sequence[dict]: +def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], data: dict) -> Sequence[dict]: """ Conduct a BLASTX search of 5'/3'-extended sequences of pseudogene candidates against matching PSCs. Search for and determine possible pseudogenization causes in the resulting alignments. @@ -627,7 +627,7 @@ def detect_pseudogenes(candidates: Sequence[dict], cdss: Sequence[dict], genome: fh.write(f">{cluster_id}\n{faa_seq}\n") # Get extended cds sequences - sequences = {seq['id']: seq for seq in genome['sequences']} + sequences = {seq['id']: seq for seq in data['sequences']} candidates_extended_positions = {} with candidates_elongated_sequences_path.open(mode='w') as fh: for cds in candidates: diff --git a/bakta/features/crispr.py b/bakta/features/crispr.py index 02b41c20..5214a1d3 100644 --- a/bakta/features/crispr.py +++ b/bakta/features/crispr.py @@ -17,7 +17,7 @@ log = logging.getLogger('CRISPR') -def predict_crispr(genome: dict, sequences_path: Path): +def predict_crispr(data: dict, sequences_path: Path): """Predict CRISPR arrays with PILER-CR.""" output_path = cfg.tmp_path.joinpath('crispr.txt') @@ -44,7 +44,7 @@ def predict_crispr(genome: dict, sequences_path: Path): # parse crispr arrays crispr_arrays = {} - sequences = {seq['id']: seq for seq in genome['sequences']} + sequences = {seq['id']: seq for seq in data['sequences']} with output_path.open() as fh: output_section = None sequence_id = None diff --git a/bakta/features/gaps.py b/bakta/features/gaps.py index 0e2610ba..699a01f7 100644 --- a/bakta/features/gaps.py +++ b/bakta/features/gaps.py @@ -11,9 +11,9 @@ RE_ASSEMBLY_GAP = re.compile(r'N{1,}', flags=0) -def detect_assembly_gaps(genome: dict) -> Sequence[dict]: +def detect_assembly_gaps(data: dict) -> Sequence[dict]: gaps = [] - for seq in genome['sequences']: + for seq in data['sequences']: m = RE_ASSEMBLY_GAP.search(seq['nt']) while m: start, end = m.span() diff --git a/bakta/features/nc_rna.py b/bakta/features/nc_rna.py index 70050e09..4c669674 100644 --- a/bakta/features/nc_rna.py +++ b/bakta/features/nc_rna.py @@ -17,7 +17,7 @@ log = logging.getLogger('NC_RNA') -def predict_nc_rnas(genome: dict, sequences_path: Path): +def predict_nc_rnas(data: dict, sequences_path: Path): """Search for non-coding RNA genes.""" output_path = cfg.tmp_path.joinpath('ncrna-genes.tsv') @@ -31,9 +31,9 @@ def predict_nc_rnas(genome: dict, sequences_path: Path): '--cpu', str(cfg.threads), '--tblout', str(output_path) ] - if(genome['size'] >= 1000000): + if(data['size'] >= 1000000): cmd.append('-Z') - cmd.append(str(2 * genome['size'] // 1000000)) + cmd.append(str(2 * data['size'] // 1000000)) cmd.append(str(cfg.db_path.joinpath('ncRNA-genes'))) cmd.append(str(sequences_path)) log.debug('cmd=%s', cmd) @@ -61,7 +61,7 @@ def predict_nc_rnas(genome: dict, sequences_path: Path): rfam2go[rfam] = [go] ncrnas = [] - sequences = {seq['id']: seq for seq in genome['sequences']} + sequences = {seq['id']: seq for seq in data['sequences']} with output_path.open() as fh: for line in fh: if(line[0] != '#'): diff --git a/bakta/features/nc_rna_region.py b/bakta/features/nc_rna_region.py index b5e3500e..ec8ce92f 100644 --- a/bakta/features/nc_rna_region.py +++ b/bakta/features/nc_rna_region.py @@ -16,7 +16,7 @@ log = logging.getLogger('NC_RNA_REGION') -def predict_nc_rna_regions(genome: dict, sequences_path: Path): +def predict_nc_rna_regions(data: dict, sequences_path: Path): """Search for non-coding RNA regions.""" output_path = cfg.tmp_path.joinpath('ncrna-regions.tsv') @@ -30,9 +30,9 @@ def predict_nc_rna_regions(genome: dict, sequences_path: Path): '--cpu', str(cfg.threads), '--tblout', str(output_path) ] - if(genome['size'] >= 1000000): + if(data['size'] >= 1000000): cmd.append('-Z') - cmd.append(str(2 * genome['size'] // 1000000)) + cmd.append(str(2 * data['size'] // 1000000)) cmd.append(str(cfg.db_path.joinpath('ncRNA-regions'))) cmd.append(str(sequences_path)) log.debug('cmd=%s', cmd) @@ -60,7 +60,7 @@ def predict_nc_rna_regions(genome: dict, sequences_path: Path): rfam2go[rfam] = [go] ncrnas = [] - sequences = {seq['id']: seq for seq in genome['sequences']} + sequences = {seq['id']: seq for seq in data['sequences']} with output_path.open() as fh: for line in fh: if(line[0] != '#'): diff --git a/bakta/features/ori.py b/bakta/features/ori.py index bc7c6e59..04ebb35f 100644 --- a/bakta/features/ori.py +++ b/bakta/features/ori.py @@ -18,7 +18,7 @@ log = logging.getLogger('ORI') -def predict_oris(genome: dict, sequences_path: Path, ori_type: str) -> Sequence[dict]: +def predict_oris(data: dict, sequences_path: Path, ori_type: str) -> Sequence[dict]: """Search for oriT/C sequences.""" database = 'oric.fna' if ori_type == bc.FEATURE_ORIC else 'orit.fna' @@ -78,7 +78,7 @@ def predict_oris(genome: dict, sequences_path: Path, ori_type: str) -> Sequence[ # combine overlapping hits (simple 1D array peak detection) oris = [] - for seq in genome['sequences']: + for seq in data['sequences']: sequence_hits = hits.get(seq['id'], None) if(sequence_hits): region_hits = [0] * (seq['length'] + 1) # init with extra leading slot (start at 1) diff --git a/bakta/features/r_rna.py b/bakta/features/r_rna.py index 640ebfeb..7847921a 100644 --- a/bakta/features/r_rna.py +++ b/bakta/features/r_rna.py @@ -17,7 +17,7 @@ log = logging.getLogger('R_RNA') -def predict_r_rnas(genome: dict, sequences_path: Path): +def predict_r_rnas(data: dict, sequences_path: Path): """Search for ribosomal RNA sequences.""" output_path = cfg.tmp_path.joinpath('rrna.tsv') @@ -31,9 +31,9 @@ def predict_r_rnas(genome: dict, sequences_path: Path): '--cpu', str(cfg.threads), '--tblout', str(output_path) ] - if(genome['size'] >= 1000000): + if(data['size'] >= 1000000): cmd.append('-Z') - cmd.append(str(2 * genome['size'] // 1000000)) + cmd.append(str(2 * data['size'] // 1000000)) cmd.append(str(cfg.db_path.joinpath('rRNA'))) cmd.append(str(sequences_path)) log.debug('cmd=%s', cmd) @@ -51,7 +51,7 @@ def predict_r_rnas(genome: dict, sequences_path: Path): raise Exception(f'cmscan error! error code: {proc.returncode}') rrnas = [] - sequences = {seq['id']: seq for seq in genome['sequences']} + sequences = {seq['id']: seq for seq in data['sequences']} with output_path.open() as fh: for line in fh: if(line[0] != '#'): diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py index 1ae202d4..cc7ddb0a 100644 --- a/bakta/features/s_orf.py +++ b/bakta/features/s_orf.py @@ -20,10 +20,10 @@ log = logging.getLogger('S_ORF') -def extract(genome: dict): +def extract(data: dict): """Predict open reading frames in mem via BioPython.""" orfs = [] - for seq in genome['sequences']: + for seq in data['sequences']: nt_seq = Seq(seq['nt']) for strand, strand_nt_seq in [(bc.STRAND_FORWARD, nt_seq), (bc.STRAND_REVERSE, nt_seq.reverse_complement())]: # strands +/- for frame in range(3): # frames 1/2/3 -> 0, 1, 2 @@ -87,40 +87,40 @@ def get_feature_stop(feature: dict) -> int: return feature['stop'] if feature['strand'] == bc.STRAND_FORWARD else feature['start'] -def overlap_filter(genome: dict, orfs_raw: Sequence[dict]): +def overlap_filter(data: dict, orfs_raw: Sequence[dict]): """Filter in-mem ORFs by overlapping CDSs.""" - t_rnas_per_sequence = {seq['id']: [] for seq in genome['sequences']} - for t_rna in genome['features'].get(bc.FEATURE_T_RNA, []): + t_rnas_per_sequence = {seq['id']: [] for seq in data['sequences']} + for t_rna in data['features'].get(bc.FEATURE_T_RNA, []): t_rnas = t_rnas_per_sequence[t_rna['sequence']] t_rnas.append(t_rna) - for tm_rna in genome['features'].get(bc.FEATURE_TM_RNA, []): + for tm_rna in data['features'].get(bc.FEATURE_TM_RNA, []): t_rnas = t_rnas_per_sequence[tm_rna['sequence']] t_rnas.append(tm_rna) - r_rna_per_sequence = {seq['id']: [] for seq in genome['sequences']} - for r_rna in genome['features'].get(bc.FEATURE_R_RNA, []): + r_rna_per_sequence = {seq['id']: [] for seq in data['sequences']} + for r_rna in data['features'].get(bc.FEATURE_R_RNA, []): r_rnas = r_rna_per_sequence[r_rna['sequence']] r_rnas.append(r_rna) - # nc_rnas_per_sequence = {k['id']: [] for k in genome['sequences']} - # for nc_rna in genome['features'].get(bc.FEATURE_NC_RNA, []): + # nc_rnas_per_sequence = {k['id']: [] for k in data['sequences']} + # for nc_rna in data['features'].get(bc.FEATURE_NC_RNA, []): # nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']] # nc_rnas.append(nc_rna) - # for nc_rna in genome['features'].get(bc.FEATURE_NC_RNA_REGION, []): + # for nc_rna in data['features'].get(bc.FEATURE_NC_RNA_REGION, []): # nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']] # nc_rnas.append(nc_rna) - crispr_arrays_per_sequence = {seq['id']: [] for seq in genome['sequences']} - for crispr_array in genome['features'].get(bc.FEATURE_CRISPR, []): + crispr_arrays_per_sequence = {seq['id']: [] for seq in data['sequences']} + for crispr_array in data['features'].get(bc.FEATURE_CRISPR, []): crispr_arrays = crispr_arrays_per_sequence[crispr_array['sequence']] crispr_arrays.append(crispr_array) - cdss_per_sequence = {k['id']: [] for k in genome['sequences']} - for cds in genome['features'].get(bc.FEATURE_CDS, []): + cdss_per_sequence = {k['id']: [] for k in data['sequences']} + for cds in data['features'].get(bc.FEATURE_CDS, []): cdss = cdss_per_sequence[cds['sequence']] cdss.append(cds) - sorfs_per_sequence = {seq['id']: [] for seq in genome['sequences']} + sorfs_per_sequence = {seq['id']: [] for seq in data['sequences']} for sorf in orfs_raw: orfs = sorfs_per_sequence[sorf['sequence']] orfs.append(sorf) @@ -128,7 +128,7 @@ def overlap_filter(genome: dict, orfs_raw: Sequence[dict]): discarded_sorf_keys = set() with cf.ProcessPoolExecutor(max_workers=cfg.threads) as tpe: futures = [] - for seq in genome['sequences']: + for seq in data['sequences']: sequence_sorfs = sorfs_per_sequence[seq['id']] log.debug('filter: seq=%s, # sORFs=%i', seq['id'], len(sequence_sorfs)) if(len(sequence_sorfs) < 100): # execute sORF filter task diff --git a/bakta/features/t_rna.py b/bakta/features/t_rna.py index 901d3d9d..083fe28d 100644 --- a/bakta/features/t_rna.py +++ b/bakta/features/t_rna.py @@ -42,7 +42,7 @@ } -def predict_t_rnas(genome: dict, sequences_path: Path): +def predict_t_rnas(data: dict, sequences_path: Path): """Search for tRNA sequences.""" txt_output_path = cfg.tmp_path.joinpath('trna.tsv') @@ -70,7 +70,7 @@ def predict_t_rnas(genome: dict, sequences_path: Path): raise Exception(f'tRNAscan-SE error! error code: {proc.returncode}') trnas = {} - sequences = {seq['id']: seq for seq in genome['sequences']} + sequences = {seq['id']: seq for seq in data['sequences']} with txt_output_path.open() as fh: for line in fh.readlines()[3:]: # skip first 3 lines (sequence_id, trna_id, start, stop, trna_type, anti_codon, intron_begin, bounds_end, score, note) = line.split('\t') diff --git a/bakta/features/tm_rna.py b/bakta/features/tm_rna.py index 26d0bc6c..052ac9e8 100644 --- a/bakta/features/tm_rna.py +++ b/bakta/features/tm_rna.py @@ -13,7 +13,7 @@ log = logging.getLogger('TM_RNA') -def predict_tm_rnas(genome: dict, sequences_path: Path): +def predict_tm_rnas(data: dict, sequences_path: Path): """Search for tmRNA sequences.""" txt_output_path = cfg.tmp_path.joinpath('tmrna.tsv') @@ -45,7 +45,7 @@ def predict_tm_rnas(genome: dict, sequences_path: Path): raise Exception(f'aragorn error! error code: {proc.returncode}') tmrnas = [] - sequences = {seq['id']: seq for seq in genome['sequences']} + sequences = {seq['id']: seq for seq in data['sequences']} with txt_output_path.open() as fh: sequence_id = None for line in fh: diff --git a/bakta/io.py b/bakta/io.py index 64c009e8..5f62efac 100644 --- a/bakta/io.py +++ b/bakta/io.py @@ -94,13 +94,13 @@ def main(): annotation = json.load(fh) features = annotation['features'] sequences = annotation['sequences'] - genome = { + data = { 'features': features, 'sequence': sequences, 'taxon': annotation['genome'] } - features_by_sequence = {k['id']: [] for k in genome['sequences']} - for feature in genome['features']: + features_by_sequence = {k['id']: [] for k in data['sequences']} + for feature in data['features']: sequence_features = features_by_sequence.get(feature['sequence']) sequence_features.append(feature) @@ -114,20 +114,20 @@ def main(): print(f'\nExport annotation results to: {cfg.output_path}') print('\thuman readable TSV...') tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv') - tsv.write_features(genome['sequences'], features_by_sequence, tsv_path) + tsv.write_features(data['sequences'], features_by_sequence, tsv_path) print('\tGFF3...') gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3') - gff.write_features(genome, features_by_sequence, gff3_path) + gff.write_features(data, features_by_sequence, gff3_path) print('\tINSDC GenBank & EMBL...') genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff') embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl') - insdc.write_features(genome, features, genbank_path, embl_path) + insdc.write_features(data, features, genbank_path, embl_path) print('\tgenome sequences...') fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna') - fasta.export_sequences(genome['sequences'], fna_path, description=True, wrap=True) + fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True) print('\tfeature nucleotide sequences...') ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn') @@ -139,13 +139,13 @@ def main(): print('\tfeature inferences...') tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv') - tsv.write_feature_inferences(genome['sequences'], features_by_sequence, tsv_path) + tsv.write_feature_inferences(data['sequences'], features_by_sequence, tsv_path) if(cfg.skip_plot or cfg.meta): print('\tskip generation of circular genome plot...') else: print('\tcircular genome plot...') - plot.write(features, genome['sequences'], cfg.output_path) + plot.write(features, data['sequences'], cfg.output_path) if(cfg.skip_cds is False): hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat] @@ -160,10 +160,10 @@ def main(): print('\tGenome and annotation summary...') summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt') with summary_path.open('w') as fh_out: - genome_stats = bu.calc_genome_stats(genome, features) + genome_stats = bu.calc_genome_stats(data, features) fh_out.write('Sequence(s):\n') - fh_out.write(f"Length: {genome['size']:}\n") - fh_out.write(f"Count: {len(genome['sequences'])}\n") + fh_out.write(f"Length: {data['size']:}\n") + fh_out.write(f"Count: {len(data['sequences'])}\n") fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n") fh_out.write(f"N50: {genome_stats['n50']:}\n") fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n") diff --git a/bakta/io/gff.py b/bakta/io/gff.py index 741e1989..09fc2ca9 100644 --- a/bakta/io/gff.py +++ b/bakta/io/gff.py @@ -14,7 +14,7 @@ log = logging.getLogger('GFF') -def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_path: Path): +def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path: Path): """Export features in GFF3 format.""" log.info('write features: path=%s', gff3_path) @@ -22,8 +22,8 @@ def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_pat fh.write('##gff-version 3\n') # GFF version fh.write('##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo\n') # SO feature version - if(genome['taxon']): # write organism info - fh.write(f"# organism {genome['taxon']}\n") + if(data['taxon']): # write organism info + fh.write(f"# organism {data['taxon']}\n") fh.write('# Annotated with Bakta\n') fh.write(f'# Software: v{bakta.__version__}\n') @@ -31,7 +31,7 @@ def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_pat fh.write(f'# DOI: {bc.BAKTA_DOI}\n') fh.write(f'# URL: {bc.BAKTA_URL}\n') - for seq in genome['sequences']: # write features + for seq in data['sequences']: # write features fh.write(f"##sequence-region {seq['id']} 1 {seq['length']}\n") # sequence region # write landmark region @@ -353,7 +353,7 @@ def write_features(genome: dict, features_by_sequence: Dict[str, dict], gff3_pat if(not cfg.compliant): fh.write('##FASTA\n') - for seq in genome['sequences']: # write sequences + for seq in data['sequences']: # write sequences fh.write(f">{seq['id']}\n") fh.write(fasta.wrap_sequence(seq['nt'])) return diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py index aed9f16d..f49730e3 100644 --- a/bakta/io/insdc.py +++ b/bakta/io/insdc.py @@ -18,11 +18,11 @@ log = logging.getLogger('INSDC') -def write_features(genome: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path): +def write_features(data: dict, features: Sequence[dict], genbank_output_path: Path, embl_output_path: Path): log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path) sequence_list = [] - for seq in genome['sequences']: + for seq in data['sequences']: sequence_features = [feat for feat in features if feat['sequence'] == seq['id']] comment = ( 'Annotated with Bakta', @@ -47,7 +47,7 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path: ) sequence_annotations = { 'molecule_type': 'DNA', - 'source': genome['taxon'], + 'source': data['taxon'], 'date': date.today().strftime('%d-%b-%Y').upper(), 'topology': seq['topology'], 'data_file_division': 'HGT' if seq['type'] == bc.REPLICON_CONTIG else 'BCT', @@ -61,12 +61,12 @@ def write_features(genome: dict, features: Sequence[dict], genbank_output_path: } description = '' - if(genome['taxon']): - sequence_annotations['organism'] = genome['taxon'] - source_qualifiers['organism'] = genome['taxon'] - description = genome['taxon'] - if(genome['strain']): - source_qualifiers['strain'] = genome['strain'] + if(data['taxon']): + sequence_annotations['organism'] = data['taxon'] + source_qualifiers['organism'] = data['taxon'] + description = data['taxon'] + if(data['strain']): + source_qualifiers['strain'] = data['strain'] if(seq['type'] == bc.REPLICON_PLASMID): source_qualifiers['plasmid'] = seq['name'] if seq.get('name', None) else 'unnamed' diff --git a/bakta/io/json.py b/bakta/io/json.py index 819a1813..1d5e8256 100644 --- a/bakta/io/json.py +++ b/bakta/io/json.py @@ -13,7 +13,7 @@ log = logging.getLogger('JSON') -def write_json(genome: dict, features: Sequence[dict], json_path: Path): +def write_json(data: dict, features: Sequence[dict], json_path: Path): log.info('write JSON: path=%s', json_path) # clean feature attributes @@ -33,30 +33,30 @@ def write_json(genome: dict, features: Sequence[dict], json_path: Path): # replace features type dict by sorted feature list output = OrderedDict() - if genome is not None: + if data is not None: ordered_genome = OrderedDict() - ordered_genome['genus'] = genome['genus'] - ordered_genome['species'] = genome['species'] - ordered_genome['strain'] = genome['strain'] - if('plasmid' in genome): - ordered_genome['plasmid'] = genome['plasmid'] - ordered_genome['complete'] = genome['complete'] - ordered_genome['gram'] = genome['gram'] - ordered_genome['translation_table'] = genome['translation_table'] + ordered_genome['genus'] = data['genus'] + ordered_genome['species'] = data['species'] + ordered_genome['strain'] = data['strain'] + if('plasmid' in data): + ordered_genome['plasmid'] = data['plasmid'] + ordered_genome['complete'] = data['complete'] + ordered_genome['gram'] = data['gram'] + ordered_genome['translation_table'] = data['translation_table'] output['genome'] = ordered_genome stats = OrderedDict() - stats['no_sequences'] = len(genome['sequences']) - stats['size'] = genome['size'] - stats['gc'] = genome['gc'] - stats['n_ratio'] = genome['n_ratio'] - stats['n50'] = genome['n50'] - stats['coding_ratio'] = genome['coding_ratio'] + stats['no_sequences'] = len(data['sequences']) + stats['size'] = data['size'] + stats['gc'] = data['gc'] + stats['n_ratio'] = data['n_ratio'] + stats['n50'] = data['n50'] + stats['coding_ratio'] = data['coding_ratio'] output['stats'] = stats output['features'] = features - if genome is not None: - output['sequences'] = genome['sequences'] + if data is not None: + output['sequences'] = data['sequences'] run = OrderedDict() run['start'] = cfg.run_start.strftime('%Y-%m-%d %H:%M:%S') diff --git a/bakta/main.py b/bakta/main.py index 523e36f2..d54df130 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -133,7 +133,7 @@ def main(): sys.exit('Error: input file contains no valid sequences.') sequences_path = cfg.tmp_path.joinpath('sequences.fna') fasta.export_sequences(sequences, sequences_path) - genome = { + data = { 'genus': cfg.genus, 'species': cfg.species, 'strain': cfg.strain, @@ -146,7 +146,7 @@ def main(): 'sequences': sequences } if(cfg.plasmid): - genome['plasmid'] = cfg.plasmid + data['plasmid'] = cfg.plasmid print('\nStart annotation...') ############################################################################ @@ -157,8 +157,8 @@ def main(): else: print('predict tRNAs...') log.debug('start tRNA prediction') - genome['features'][bc.FEATURE_T_RNA] = t_rna.predict_t_rnas(genome, sequences_path) - print(f"\tfound: {len(genome['features'][bc.FEATURE_T_RNA])}") + data['features'][bc.FEATURE_T_RNA] = t_rna.predict_t_rnas(data, sequences_path) + print(f"\tfound: {len(data['features'][bc.FEATURE_T_RNA])}") ############################################################################ # tmRNA prediction @@ -168,8 +168,8 @@ def main(): else: print('predict tmRNAs...') log.debug('start tmRNA prediction') - genome['features'][bc.FEATURE_TM_RNA] = tm_rna.predict_tm_rnas(genome, sequences_path) - print(f"\tfound: {len(genome['features'][bc.FEATURE_TM_RNA])}") + data['features'][bc.FEATURE_TM_RNA] = tm_rna.predict_tm_rnas(data, sequences_path) + print(f"\tfound: {len(data['features'][bc.FEATURE_TM_RNA])}") ############################################################################ # rRNA prediction @@ -179,8 +179,8 @@ def main(): else: print('predict rRNAs...') log.debug('start rRNA prediction') - genome['features'][bc.FEATURE_R_RNA] = r_rna.predict_r_rnas(genome, sequences_path) - print(f"\tfound: {len(genome['features'][bc.FEATURE_R_RNA])}") + data['features'][bc.FEATURE_R_RNA] = r_rna.predict_r_rnas(data, sequences_path) + print(f"\tfound: {len(data['features'][bc.FEATURE_R_RNA])}") ############################################################################ # ncRNA gene prediction @@ -190,8 +190,8 @@ def main(): else: print('predict ncRNAs...') log.debug('start ncRNA prediction') - genome['features'][bc.FEATURE_NC_RNA] = nc_rna.predict_nc_rnas(genome, sequences_path) - print(f"\tfound: {len(genome['features'][bc.FEATURE_NC_RNA])}") + data['features'][bc.FEATURE_NC_RNA] = nc_rna.predict_nc_rnas(data, sequences_path) + print(f"\tfound: {len(data['features'][bc.FEATURE_NC_RNA])}") ############################################################################ # ncRNA region prediction @@ -201,8 +201,8 @@ def main(): else: print('predict ncRNA regions...') log.debug('start ncRNA region prediction') - genome['features'][bc.FEATURE_NC_RNA_REGION] = nc_rna_region.predict_nc_rna_regions(genome, sequences_path) - print(f"\tfound: {len(genome['features'][bc.FEATURE_NC_RNA_REGION])}") + data['features'][bc.FEATURE_NC_RNA_REGION] = nc_rna_region.predict_nc_rna_regions(data, sequences_path) + print(f"\tfound: {len(data['features'][bc.FEATURE_NC_RNA_REGION])}") ############################################################################ # CRISPR prediction @@ -212,8 +212,8 @@ def main(): else: print('predict CRISPR arrays...') log.debug('start CRISPR prediction') - genome['features'][bc.FEATURE_CRISPR] = crispr.predict_crispr(genome, sequences_path) - print(f"\tfound: {len(genome['features'][bc.FEATURE_CRISPR])}") + data['features'][bc.FEATURE_CRISPR] = crispr.predict_crispr(data, sequences_path) + print(f"\tfound: {len(data['features'][bc.FEATURE_CRISPR])}") ############################################################################ # CDS prediction @@ -230,7 +230,7 @@ def main(): else: print('predict & annotate CDSs...') log.debug('predict CDS') - cdss = feat_cds.predict(genome) + cdss = feat_cds.predict(data) print(f"\tpredicted: {len(cdss)} ") if(len(cdss) > 0): @@ -241,13 +241,13 @@ def main(): if(len(cdss) > 0): log.debug('revise translational exceptions') - no_revised = feat_cds.revise_translational_exceptions(genome, cdss) + no_revised = feat_cds.revise_translational_exceptions(data, cdss) print(f'\trevised translational exceptions: {no_revised}') cdss = [cds for cds in cdss if 'discarded' not in cds] if(cfg.regions): log.debug('import user-provided CDS regions') - imported_cdss = feat_cds.import_user_cdss(genome, cfg.regions) + imported_cdss = feat_cds.import_user_cdss(data, cfg.regions) print(f'\timported CDS regions: {len(imported_cdss)}') cdss.extend(imported_cdss) @@ -316,7 +316,7 @@ def main(): log.debug('search pseudogene candidates') pseudo_candidates = feat_cds.predict_pseudo_candidates(hypotheticals) print(f'\t\tcandidates: {len(pseudo_candidates)}') - pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, genome) if len(pseudo_candidates) > 0 else [] + pseudogenes = feat_cds.detect_pseudogenes(pseudo_candidates, cdss, data) if len(pseudo_candidates) > 0 else [] psc.lookup(pseudogenes, pseudo=True) pscc.lookup(pseudogenes, pseudo=True) for pseudogene in pseudogenes: @@ -334,9 +334,9 @@ def main(): print('\t\tcalculated proteins statistics') print('\trevise special cases...') - feat_cds.revise_special_cases_annotated(genome, cdss) + feat_cds.revise_special_cases_annotated(data, cdss) - genome['features'][bc.FEATURE_CDS] = cdss + data['features'][bc.FEATURE_CDS] = cdss ############################################################################ # sORF prediction @@ -351,11 +351,11 @@ def main(): else: print('detect & annotate sORF...') log.debug('extract sORF') - sorfs = s_orf.extract(genome) + sorfs = s_orf.extract(data) print(f'\tdetected: {len(sorfs)}') log.debug('apply sORF overlap filter') - sorfs, discarded_sorfs = s_orf.overlap_filter(genome, sorfs) + sorfs, discarded_sorfs = s_orf.overlap_filter(data, sorfs) print(f'\tdiscarded due to overlaps: {len(discarded_sorfs)}') if(len(sorfs) > 0): @@ -396,7 +396,7 @@ def main(): log.debug('combine sORF annotations') for feat in sorfs_filtered: anno.combine_annotation(feat) # combine IPS and PSC annotations - genome['features'][bc.FEATURE_SORF] = sorfs_filtered + data['features'][bc.FEATURE_SORF] = sorfs_filtered print(f'\tfiltered sORFs: {len(sorfs_filtered)}') if(cfg.gram != bc.GRAM_UNKNOWN and len(sorfs_filtered) > 0): @@ -417,8 +417,8 @@ def main(): else: print('detect gaps...') log.debug('detect gaps') - assembly_gaps = gaps.detect_assembly_gaps(genome) - genome['features'][bc.FEATURE_GAP] = assembly_gaps + assembly_gaps = gaps.detect_assembly_gaps(data) + data['features'][bc.FEATURE_GAP] = assembly_gaps print(f'\tfound: {len(assembly_gaps)}') ############################################################################ @@ -429,14 +429,14 @@ def main(): else: print('detect oriCs/oriVs...') log.debug('detect oriC/V') - oriCs = ori.predict_oris(genome, sequences_path, bc.FEATURE_ORIC) - genome['features'][bc.FEATURE_ORIC] = oriCs + oriCs = ori.predict_oris(data, sequences_path, bc.FEATURE_ORIC) + data['features'][bc.FEATURE_ORIC] = oriCs print(f'\tfound: {len(oriCs)}') print('detect oriTs...') log.debug('detect oriT') - oriTs = ori.predict_oris(genome, sequences_path, bc.FEATURE_ORIT) - genome['features'][bc.FEATURE_ORIT] = oriTs + oriTs = ori.predict_oris(data, sequences_path, bc.FEATURE_ORIT) + data['features'][bc.FEATURE_ORIT] = oriTs print(f'\tfound: {len(oriTs)}') ############################################################################ @@ -446,7 +446,7 @@ def main(): print('skip feature overlap filters...') else: print('apply feature overlap filters...') - anno.detect_feature_overlaps(genome) + anno.detect_feature_overlaps(data) ############################################################################ # Create annotations @@ -456,10 +456,10 @@ def main(): ############################################################################ print('select features and create locus tags...') log.debug('start feature selection and creation of locus tags') - features_by_sequence = {k['id']: [] for k in genome['sequences']} + features_by_sequence = {k['id']: [] for k in data['sequences']} feature_id = 1 feature_id_prefix = bu.create_locus_tag_prefix(sequences, length=10) - for feature_list in genome['features'].values(): + for feature_list in data['features'].values(): for feature in feature_list: if('discarded' not in feature): feature['id'] = f'{feature_id_prefix}_{feature_id}' @@ -467,7 +467,7 @@ def main(): seq_features = features_by_sequence.get(feature['sequence']) seq_features.append(feature) features = [] - for seq in genome['sequences']: + for seq in data['sequences']: seq_features = features_by_sequence[seq['id']] seq_features.sort(key=lambda k: k['start']) features.extend(seq_features) @@ -498,9 +498,9 @@ def main(): # - annotation stats ############################################################################ print('\nGenome statistics:') - genome_stats = bu.calc_genome_stats(genome, features) - print(f"\tGenome size: {genome['size']:,} bp") - print(f"\tContigs/replicons: {len(genome['sequences'])}") + genome_stats = bu.calc_genome_stats(data, features) + print(f"\tGenome size: {data['size']:,} bp") + print(f"\tContigs/replicons: {len(data['sequences'])}") print(f"\tGC: {100 * genome_stats['gc']:.1f} %") print(f"\tN50: {genome_stats['n50']:,}") print(f"\tN ratio: {100 * genome_stats['n_ratio']:.1f} %") @@ -533,20 +533,20 @@ def main(): print(f'\nExport annotation results to: {cfg.output_path}') print('\thuman readable TSV...') tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv') - tsv.write_features(genome['sequences'], features_by_sequence, tsv_path) + tsv.write_features(data['sequences'], features_by_sequence, tsv_path) print('\tGFF3...') gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3') - gff.write_features(genome, features_by_sequence, gff3_path) + gff.write_features(data, features_by_sequence, gff3_path) print('\tINSDC GenBank & EMBL...') genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff') embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl') - insdc.write_features(genome, features, genbank_path, embl_path) + insdc.write_features(data, features, genbank_path, embl_path) print('\tgenome sequences...') fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna') - fasta.export_sequences(genome['sequences'], fna_path, description=True, wrap=True) + fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True) print('\tfeature nucleotide sequences...') ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn') @@ -558,7 +558,7 @@ def main(): print('\tfeature inferences...') tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv') - tsv.write_feature_inferences(genome['sequences'], features_by_sequence, tsv_path) + tsv.write_feature_inferences(data['sequences'], features_by_sequence, tsv_path) if(cfg.skip_plot or cfg.meta): print('\tskip generation of circular genome plot...') @@ -579,7 +579,7 @@ def main(): # measure runtime at the latest possible cfg.run_end = datetime.now() run_duration = (cfg.run_end - cfg.run_start).total_seconds() - genome['run'] = { + data['run'] = { 'start': cfg.run_start.strftime('%Y-%m-%d %H:%M:%S'), 'end': cfg.run_end.strftime('%Y-%m-%d %H:%M:%S'), 'duration': f'{(run_duration / 60):.2f} min' @@ -587,14 +587,14 @@ def main(): print('\tmachine readable JSON...') json_path = cfg.output_path.joinpath(f'{cfg.prefix}.json') - json.write_json(genome, features, json_path) + json.write_json(data, features, json_path) print('\tGenome and annotation summary...') summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt') with summary_path.open('w') as fh_out: fh_out.write('Sequence(s):\n') - fh_out.write(f"Length: {genome['size']:}\n") - fh_out.write(f"Count: {len(genome['sequences'])}\n") + fh_out.write(f"Length: {data['size']:}\n") + fh_out.write(f"Count: {len(data['sequences'])}\n") fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n") fh_out.write(f"N50: {genome_stats['n50']:}\n") fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n") diff --git a/bakta/utils.py b/bakta/utils.py index c6f97cd7..eb7863e1 100644 --- a/bakta/utils.py +++ b/bakta/utils.py @@ -293,37 +293,37 @@ def has_annotation(feature: dict, attribute: str) -> bool: return False -def calc_genome_stats(genome: dict, features: Sequence[dict]): - genome_size = genome['size'] +def calc_genome_stats(data: dict, features: Sequence[dict]): + genome_size = data['size'] log.info('genome-size=%i', genome_size) # N50 gc_sum = 0 n_sum = 0 - for seq in genome['sequences']: + for seq in data['sequences']: nt = seq['nt'] gc_sum += nt.count('G') + nt.count('C') n_sum += nt.count('N') gc_ratio = gc_sum / (genome_size - n_sum) - genome['gc'] = gc_ratio + data['gc'] = gc_ratio log.info('GC=%0.3f', gc_ratio) n_ratio = n_sum / genome_size - genome['n_ratio'] = n_ratio + data['n_ratio'] = n_ratio log.info('N=%0.3f', n_ratio) n50 = 0 sequence_length_sum = 0 - for seq in sorted(genome['sequences'], key=lambda x: x['length'], reverse=True): + for seq in sorted(data['sequences'], key=lambda x: x['length'], reverse=True): nt_length = len(seq['nt']) sequence_length_sum += nt_length if(sequence_length_sum >= genome_size / 2): n50 = nt_length break - genome['n50'] = n50 + data['n50'] = n50 log.info('N50=%i', n50) - sequence_by_id = {seq['id']: seq for seq in genome['sequences']} + sequence_by_id = {seq['id']: seq for seq in data['sequences']} coding_nts = 0 for feat in features: if(feat.get('edge', False)): @@ -332,7 +332,7 @@ def calc_genome_stats(genome: dict, features: Sequence[dict]): else: coding_nts += feat['stop'] - feat['start'] + 1 # feature coding nucleotides coding_ratio = coding_nts / (genome_size - n_sum) - genome['coding_ratio'] = coding_ratio + data['coding_ratio'] = coding_ratio log.info('coding-ratio=%0.3f', coding_ratio) return { diff --git a/scripts/collect-annotation-stats.py b/scripts/collect-annotation-stats.py index 7a772a31..4786a709 100755 --- a/scripts/collect-annotation-stats.py +++ b/scripts/collect-annotation-stats.py @@ -66,36 +66,36 @@ ) ) fh_out.write('\n') - for genome in args.genomes: - genome_path = Path(genome).resolve() + for genome_file in args.genomes: + genome_path = Path(genome_file).resolve() try: with genome_path.open() as fh_in: - genome = json.load(fh_in) + data = json.load(fh_in) stats = [ genome_path.stem, - f"{' '.join([t for t in [genome['genome'].get('genus', None), genome['genome'].get('species', None), genome['genome'].get('strain', None)] if t is not None])}", - 'y' if genome['genome']['complete'] else 'n', - f"{genome['genome']['translation_table']}", - f"{genome['stats']['no_sequences']}", - f"{genome['stats']['size']}", - f"{100 * genome['stats']['gc']:.1f}", - f"{100 * genome['stats']['n_ratio']:.1f}", - f"{genome['stats']['n50']}", - f"{100 * genome['stats']['coding_ratio']:.1f}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_T_RNA])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_TM_RNA])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_R_RNA])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_NC_RNA_REGION])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CRISPR])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'hypothetical' in f])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_CDS and 'pseudogene' in f])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_SORF])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_GAP])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIC])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIV])}", - f"{len([f for f in genome['features'] if f['type'] == bc.FEATURE_ORIT])}", + f"{' '.join([t for t in [data['genome'].get('genus', None), data['genome'].get('species', None), data['genome'].get('strain', None)] if t is not None])}", + 'y' if data['genome']['complete'] else 'n', + f"{data['genome']['translation_table']}", + f"{data['stats']['no_sequences']}", + f"{data['stats']['size']}", + f"{100 * data['stats']['gc']:.1f}", + f"{100 * data['stats']['n_ratio']:.1f}", + f"{data['stats']['n50']}", + f"{100 * data['stats']['coding_ratio']:.1f}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_T_RNA])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_TM_RNA])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_R_RNA])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_NC_RNA])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_NC_RNA_REGION])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CRISPR])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS and 'hypothetical' in f])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS and 'pseudogene' in f])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_SORF])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_GAP])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIC])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIV])}", + f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIT])}", ] output_line = '\t'.join(stats) print(output_line) diff --git a/scripts/extract-region.py b/scripts/extract-region.py index 23da1605..144a1bc3 100755 --- a/scripts/extract-region.py +++ b/scripts/extract-region.py @@ -41,11 +41,11 @@ print('Load annotated genome...') genome_path = Path(args.genome).resolve() with genome_path.open() as fh: - genome = json.load(fh) + data = json.load(fh) sequence_id = args.sequence if(sequence_id is None): # take first sequence as default - sequence_id = genome['sequences'][0]['id'] + sequence_id = data['sequences'][0]['id'] prefix = args.prefix if(prefix is None): # use input file prefix as default @@ -54,33 +54,33 @@ print('Extract features within selected region...') features_selected = [] -for feat in genome['features']: +for feat in data['features']: if(feat['sequence'] == sequence_id): if(feat['start'] >= args.min and feat['stop'] <= args.max): features_selected.append(feat) features_by_sequence = {sequence_id: features_selected} # needed for GFF3 export print(f'\t...selected features: {len(features_selected)}') -genome['features'] = features_selected -genome['sequences'] = [sequence for sequence in genome['sequences'] if sequence['id'] == sequence_id] -genome['genus'] = genome['genome']['genus'] -genome['species'] = genome['genome']['species'] -genome['strain'] = genome['genome']['strain'] -genome['taxon'] = f"{genome['genome']['genus']} {genome['genome']['species']} {genome['genome']['strain']}" +data['features'] = features_selected +data['sequences'] = [sequence for sequence in data['sequences'] if sequence['id'] == sequence_id] +data['genus'] = data['genome']['genus'] +data['species'] = data['genome']['species'] +data['strain'] = data['genome']['strain'] +data['taxon'] = f"{data['genome']['genus']} {data['genome']['species']} {data['genome']['strain']}" cfg.db_info = { - 'major': genome['version']['db']['version'].split('.')[0], - 'minor': genome['version']['db']['version'].split('.')[1], - 'type': genome['version']['db']['type'] + 'major': data['version']['db']['version'].split('.')[0], + 'minor': data['version']['db']['version'].split('.')[1], + 'type': data['version']['db']['type'] } print('Write selected features...') output_path = Path(args.output).resolve() gff3_path = output_path.joinpath(f'{prefix}.gff3') -gff.write_features(genome, features_by_sequence, gff3_path) +gff.write_features(data, features_by_sequence, gff3_path) print('\t...INSDC GenBank & EMBL') genbank_path = output_path.joinpath(f'{prefix}.gbff') embl_path = output_path.joinpath(f'{prefix}.embl') -insdc.write_features(genome, features_selected, genbank_path, embl_path) +insdc.write_features(data, features_selected, genbank_path, embl_path) print('\t...feature nucleotide sequences') ffn_path = output_path.joinpath(f'{prefix}.ffn') fasta.write_ffn(features_selected, ffn_path) From 8c3b2fbd57ea1853efce9a4ec5e6cbb0502161d9 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 15 Oct 2024 16:34:22 +0200 Subject: [PATCH 4/8] reorganize data strucutre --- bakta/features/annotation.py | 18 +++--- bakta/features/cds.py | 8 +-- bakta/features/nc_rna.py | 4 +- bakta/features/nc_rna_region.py | 4 +- bakta/features/r_rna.py | 4 +- bakta/features/s_orf.py | 14 ++--- bakta/io.py | 11 ++-- bakta/io/fasta.py | 15 ++--- bakta/io/gff.py | 4 +- bakta/io/insdc.py | 14 ++--- bakta/io/json.py | 36 +----------- bakta/main.py | 101 +++++++++++++++++--------------- bakta/proteins.py | 3 +- bakta/utils.py | 21 +++---- test/test_pseudo.py | 6 +- test/test_sORF.py | 6 +- 16 files changed, 119 insertions(+), 150 deletions(-) diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py index 63165b47..7b771a32 100644 --- a/bakta/features/annotation.py +++ b/bakta/features/annotation.py @@ -152,35 +152,35 @@ def detect_feature_overlaps(data: dict): sORF < mRNA, tRNA, rRNA, CRISPR, CDS (in-frame & entirely overlapping), sORF (shorter, weaker annotations) """ sequence_t_rnas = {k['id']: [] for k in data['sequences']} - for t_rna in data['features'].get(bc.FEATURE_T_RNA, []): - t_rnas = sequence_t_rnas[t_rna['sequence']] - t_rnas.append(t_rna) + for trna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_T_RNA]: + t_rnas = sequence_t_rnas[trna['sequence']] + t_rnas.append(trna) sequence_tm_rnas = {k['id']: [] for k in data['sequences']} - for tm_rna in data['features'].get(bc.FEATURE_TM_RNA, []): + for tm_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_TM_RNA]: tm_rnas = sequence_tm_rnas[tm_rna['sequence']] tm_rnas.append(tm_rna) sequence_r_rnas = {k['id']: [] for k in data['sequences']} - for r_rna in data['features'].get(bc.FEATURE_R_RNA, []): + for r_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_R_RNA]: r_rnas = sequence_r_rnas[r_rna['sequence']] r_rnas.append(r_rna) sequence_ncrna_regions = {k['id']: [] for k in data['sequences']} - for ncRNA_region in data['features'].get(bc.FEATURE_NC_RNA_REGION, []): + for ncRNA_region in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA_REGION]: ncRNA_regions = sequence_ncrna_regions[ncRNA_region['sequence']] ncRNA_regions.append(ncRNA_region) sequence_crispr_arrays = {k['id']: [] for k in data['sequences']} - for crispr_array in data['features'].get(bc.FEATURE_CRISPR, []): + for crispr_array in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CRISPR]: crispr_arrays = sequence_crispr_arrays[crispr_array['sequence']] crispr_arrays.append(crispr_array) sequence_cdss = {k['id']: [] for k in data['sequences']} sequence_cdss_user_provided = {k['id']: [] for k in data['sequences']} - for cds in data['features'].get(bc.FEATURE_CDS, []): + for cds in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS]: if(cds.get('source', None) == bc.CDS_SOURCE_USER): cdss = sequence_cdss_user_provided[cds['sequence']] else: cdss = sequence_cdss[cds['sequence']] cdss.append(cds) sequence_sorfs = {k['id']: [] for k in data['sequences']} - for sorf in data['features'].get(bc.FEATURE_SORF, []): + for sorf in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_SORF]: sorfs = sequence_sorfs[sorf['sequence']] sorfs.append(sorf) diff --git a/bakta/features/cds.py b/bakta/features/cds.py index 781f95f4..4215b7d6 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -35,10 +35,10 @@ def predict(data: dict): # create Pyrodigal trainining file if not provided by the user prodigal_tf_path = cfg.prodigal_tf trainings_info = None - prodigal_metamode = cfg.meta or data['size'] < pyrodigal.MIN_SINGLE_GENOME # 20_000 bp + prodigal_metamode = cfg.meta or data['stats']['size'] < pyrodigal.MIN_SINGLE_GENOME # 20_000 bp log.debug('prodigal mode: meta=%s', prodigal_metamode) if(prodigal_tf_path is None): - closed = not data['complete'] + closed = not data['genome']['complete'] if(not prodigal_metamode): log.info('create prodigal training info object: meta=%s, closed=%s', prodigal_metamode, closed) gene_finder = pyrodigal.GeneFinder(meta=prodigal_metamode, closed=closed) @@ -406,7 +406,7 @@ def revise_translational_exceptions(data: dict, cdss: Sequence[dict]): Revise translational exceptions as for istance selenocystein proteins. """ no_revised = 0 - if(bc.FEATURE_NC_RNA_REGION not in data['features']): # check if ncRNA regions have been detected, otherwise skip analysis and return + if(len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA_REGION]) == 0): # check if ncRNA regions have been detected, otherwise skip analysis and return return no_revised sequences = {seq['id']: seq for seq in data['sequences']} @@ -432,7 +432,7 @@ def revise_translational_exceptions(data: dict, cdss: Sequence[dict]): cds_pairs = cds_pairs_per_sequence[cds_a['sequence']] cds_pairs.append((cds_a, cds_b)) - recoding_regions = [ncrna_region for ncrna_region in data['features'][bc.FEATURE_NC_RNA_REGION] if ncrna_region['class'] == so.SO_CIS_REG_RECODING_STIMULATION_REGION] # Selenocysteine insertion sequences + recoding_regions = [ncrna_region for ncrna_region in data['features'] if ncrna_region['type'] == bc.FEATURE_NC_RNA_REGION and ncrna_region['class'] == so.SO_CIS_REG_RECODING_STIMULATION_REGION] # Selenocysteine insertion sequences for recoding_region in recoding_regions: if('selenocysteine' in recoding_region.get('product', '').lower()): cds_pairs = cds_pairs_per_sequence[recoding_region['sequence']] diff --git a/bakta/features/nc_rna.py b/bakta/features/nc_rna.py index 4c669674..16b94404 100644 --- a/bakta/features/nc_rna.py +++ b/bakta/features/nc_rna.py @@ -31,9 +31,9 @@ def predict_nc_rnas(data: dict, sequences_path: Path): '--cpu', str(cfg.threads), '--tblout', str(output_path) ] - if(data['size'] >= 1000000): + if(data['stats']['size'] >= 1000000): cmd.append('-Z') - cmd.append(str(2 * data['size'] // 1000000)) + cmd.append(str(2 * data['stats']['size'] // 1000000)) cmd.append(str(cfg.db_path.joinpath('ncRNA-genes'))) cmd.append(str(sequences_path)) log.debug('cmd=%s', cmd) diff --git a/bakta/features/nc_rna_region.py b/bakta/features/nc_rna_region.py index ec8ce92f..3dfe88b9 100644 --- a/bakta/features/nc_rna_region.py +++ b/bakta/features/nc_rna_region.py @@ -30,9 +30,9 @@ def predict_nc_rna_regions(data: dict, sequences_path: Path): '--cpu', str(cfg.threads), '--tblout', str(output_path) ] - if(data['size'] >= 1000000): + if(data['stats']['size'] >= 1000000): cmd.append('-Z') - cmd.append(str(2 * data['size'] // 1000000)) + cmd.append(str(2 * data['stats']['size'] // 1000000)) cmd.append(str(cfg.db_path.joinpath('ncRNA-regions'))) cmd.append(str(sequences_path)) log.debug('cmd=%s', cmd) diff --git a/bakta/features/r_rna.py b/bakta/features/r_rna.py index 7847921a..b5f5cb92 100644 --- a/bakta/features/r_rna.py +++ b/bakta/features/r_rna.py @@ -31,9 +31,9 @@ def predict_r_rnas(data: dict, sequences_path: Path): '--cpu', str(cfg.threads), '--tblout', str(output_path) ] - if(data['size'] >= 1000000): + if(data['stats']['size'] >= 1000000): cmd.append('-Z') - cmd.append(str(2 * data['size'] // 1000000)) + cmd.append(str(2 * data['stats']['size'] // 1000000)) cmd.append(str(cfg.db_path.joinpath('rRNA'))) cmd.append(str(sequences_path)) log.debug('cmd=%s', cmd) diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py index cc7ddb0a..cb93f294 100644 --- a/bakta/features/s_orf.py +++ b/bakta/features/s_orf.py @@ -90,33 +90,33 @@ def get_feature_stop(feature: dict) -> int: def overlap_filter(data: dict, orfs_raw: Sequence[dict]): """Filter in-mem ORFs by overlapping CDSs.""" t_rnas_per_sequence = {seq['id']: [] for seq in data['sequences']} - for t_rna in data['features'].get(bc.FEATURE_T_RNA, []): + for t_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_T_RNA]: t_rnas = t_rnas_per_sequence[t_rna['sequence']] t_rnas.append(t_rna) - for tm_rna in data['features'].get(bc.FEATURE_TM_RNA, []): + for tm_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_TM_RNA]: t_rnas = t_rnas_per_sequence[tm_rna['sequence']] t_rnas.append(tm_rna) r_rna_per_sequence = {seq['id']: [] for seq in data['sequences']} - for r_rna in data['features'].get(bc.FEATURE_R_RNA, []): + for r_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_R_RNA]: r_rnas = r_rna_per_sequence[r_rna['sequence']] r_rnas.append(r_rna) # nc_rnas_per_sequence = {k['id']: [] for k in data['sequences']} - # for nc_rna in data['features'].get(bc.FEATURE_NC_RNA, []): + # for nc_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA]: # nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']] # nc_rnas.append(nc_rna) - # for nc_rna in data['features'].get(bc.FEATURE_NC_RNA_REGION, []): + # for nc_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA_REGION]: # nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']] # nc_rnas.append(nc_rna) crispr_arrays_per_sequence = {seq['id']: [] for seq in data['sequences']} - for crispr_array in data['features'].get(bc.FEATURE_CRISPR, []): + for crispr_array in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CRISPR]: crispr_arrays = crispr_arrays_per_sequence[crispr_array['sequence']] crispr_arrays.append(crispr_array) cdss_per_sequence = {k['id']: [] for k in data['sequences']} - for cds in data['features'].get(bc.FEATURE_CDS, []): + for cds in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS]: cdss = cdss_per_sequence[cds['sequence']] cdss.append(cds) diff --git a/bakta/io.py b/bakta/io.py index 5f62efac..8bb2bf08 100644 --- a/bakta/io.py +++ b/bakta/io.py @@ -160,14 +160,13 @@ def main(): print('\tGenome and annotation summary...') summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt') with summary_path.open('w') as fh_out: - genome_stats = bu.calc_genome_stats(data, features) fh_out.write('Sequence(s):\n') - fh_out.write(f"Length: {data['size']:}\n") + fh_out.write(f"Length: {data['stats']['size']:}\n") fh_out.write(f"Count: {len(data['sequences'])}\n") - fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n") - fh_out.write(f"N50: {genome_stats['n50']:}\n") - fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n") - fh_out.write(f"coding density: {100 * genome_stats['coding_ratio']:.1f}\n") + fh_out.write(f"GC: {100 * data['stats']['gc']:.1f}\n") + fh_out.write(f"N50: {data['stats']['n50']:}\n") + fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n") + fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n") fh_out.write('\nAnnotation:\n') fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n") fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n") diff --git a/bakta/io/fasta.py b/bakta/io/fasta.py index 7a824ef3..9c3c10cb 100644 --- a/bakta/io/fasta.py +++ b/bakta/io/fasta.py @@ -23,6 +23,11 @@ def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=T sequences = [] with xopen(str(sequences_path), threads=0) as fh: for record in SeqIO.parse(fh, 'fasta'): + sequence = { + 'id': record.id, + 'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else '' + } + raw_sequence = str(record.seq).upper() if('-' in raw_sequence): dash_count = raw_sequence.count('-') @@ -32,6 +37,7 @@ def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=T if(FASTA_DNA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None): log.error('import: Fasta sequence contains invalid DNA characters! id=%s', record.id) raise ValueError(f'Fasta sequence contains invalid DNA characters! id={record.id}') + sequence['nt'] = raw_sequence else: if(raw_sequence[-1] == '*'): # remove trailing stop asterik raw_sequence = raw_sequence[:-1] @@ -39,13 +45,8 @@ def import_sequences(sequences_path: Path, is_genomic: bool=True, is_dna: bool=T if(FASTA_AA_SEQUENCE_PATTERN.fullmatch(raw_sequence) is None): log.error('import: Fasta sequence contains invalid AA characters! id=%s, seq=%s', record.id, raw_sequence) raise ValueError(f'Fasta sequence contains invalid AA characters! id={record.id}') - - sequence = { - 'id': record.id, - 'description': record.description.split(' ', maxsplit=1)[1] if ' ' in record.description else '', - 'nt': raw_sequence, - 'length': len(raw_sequence) - } + sequence['aa'] = raw_sequence + sequence['length'] = len(raw_sequence) if(is_genomic): sequence['complete'] = False sequence['type'] = bc.REPLICON_CONTIG diff --git a/bakta/io/gff.py b/bakta/io/gff.py index 09fc2ca9..9acca552 100644 --- a/bakta/io/gff.py +++ b/bakta/io/gff.py @@ -22,8 +22,8 @@ def write_features(data: dict, features_by_sequence: Dict[str, dict], gff3_path: fh.write('##gff-version 3\n') # GFF version fh.write('##feature-ontology https://github.com/The-Sequence-Ontology/SO-Ontologies/blob/v3.1/so.obo\n') # SO feature version - if(data['taxon']): # write organism info - fh.write(f"# organism {data['taxon']}\n") + if(data['genome']['taxon']): # write organism info + fh.write(f"# organism {data['genome']['taxon']}\n") fh.write('# Annotated with Bakta\n') fh.write(f'# Software: v{bakta.__version__}\n') diff --git a/bakta/io/insdc.py b/bakta/io/insdc.py index f49730e3..698c8ff5 100644 --- a/bakta/io/insdc.py +++ b/bakta/io/insdc.py @@ -47,7 +47,7 @@ def write_features(data: dict, features: Sequence[dict], genbank_output_path: Pa ) sequence_annotations = { 'molecule_type': 'DNA', - 'source': data['taxon'], + 'source': data['genome']['taxon'], 'date': date.today().strftime('%d-%b-%Y').upper(), 'topology': seq['topology'], 'data_file_division': 'HGT' if seq['type'] == bc.REPLICON_CONTIG else 'BCT', @@ -61,12 +61,12 @@ def write_features(data: dict, features: Sequence[dict], genbank_output_path: Pa } description = '' - if(data['taxon']): - sequence_annotations['organism'] = data['taxon'] - source_qualifiers['organism'] = data['taxon'] - description = data['taxon'] - if(data['strain']): - source_qualifiers['strain'] = data['strain'] + if(data['genome']['taxon']): + sequence_annotations['organism'] = data['genome']['taxon'] + source_qualifiers['organism'] = data['genome']['taxon'] + description = data['genome']['taxon'] + if(data['genome']['strain']): + source_qualifiers['strain'] = data['genome']['strain'] if(seq['type'] == bc.REPLICON_PLASMID): source_qualifiers['plasmid'] = seq['name'] if seq.get('name', None) else 'unnamed' diff --git a/bakta/io/json.py b/bakta/io/json.py index 1d5e8256..a10cdc84 100644 --- a/bakta/io/json.py +++ b/bakta/io/json.py @@ -31,45 +31,13 @@ def write_json(data: dict, features: Sequence[dict], json_path: Path): if(psc): psc.pop('db_xrefs') - # replace features type dict by sorted feature list - output = OrderedDict() - if data is not None: - ordered_genome = OrderedDict() - ordered_genome['genus'] = data['genus'] - ordered_genome['species'] = data['species'] - ordered_genome['strain'] = data['strain'] - if('plasmid' in data): - ordered_genome['plasmid'] = data['plasmid'] - ordered_genome['complete'] = data['complete'] - ordered_genome['gram'] = data['gram'] - ordered_genome['translation_table'] = data['translation_table'] - output['genome'] = ordered_genome - - stats = OrderedDict() - stats['no_sequences'] = len(data['sequences']) - stats['size'] = data['size'] - stats['gc'] = data['gc'] - stats['n_ratio'] = data['n_ratio'] - stats['n50'] = data['n50'] - stats['coding_ratio'] = data['coding_ratio'] - output['stats'] = stats - - output['features'] = features - if data is not None: - output['sequences'] = data['sequences'] - - run = OrderedDict() - run['start'] = cfg.run_start.strftime('%Y-%m-%d %H:%M:%S') - run['end'] = cfg.run_end.strftime('%Y-%m-%d %H:%M:%S') - output['run'] = run - version = OrderedDict() version['bakta'] = bakta.__version__ version['db'] = { 'version': f"{cfg.db_info['major']}.{cfg.db_info['minor']}", 'type': cfg.db_info['type'] } - output['version'] = version + data['version'] = version with json_path.open('wt') as fh: - json.dump(output, fh, indent=4) + json.dump(data, fh, indent=4) diff --git a/bakta/main.py b/bakta/main.py index d54df130..22ac72d0 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -134,19 +134,23 @@ def main(): sequences_path = cfg.tmp_path.joinpath('sequences.fna') fasta.export_sequences(sequences, sequences_path) data = { - 'genus': cfg.genus, - 'species': cfg.species, - 'strain': cfg.strain, - 'taxon': cfg.taxon, - 'gram': cfg.gram, - 'translation_table': cfg.translation_table, - 'size': sum([seq['length'] for seq in sequences]), - 'complete': cfg.complete or complete_genome, - 'features': {}, + 'genome': { + 'genus': cfg.genus, + 'species': cfg.species, + 'strain': cfg.strain, + 'taxon': cfg.taxon, + 'complete': cfg.complete or complete_genome, + 'gram': cfg.gram, + 'translation_table': cfg.translation_table + }, + 'stats': { + 'size': sum([seq['length'] for seq in sequences]) + }, + 'features': [], 'sequences': sequences } if(cfg.plasmid): - data['plasmid'] = cfg.plasmid + data['genome']['plasmid'] = cfg.plasmid print('\nStart annotation...') ############################################################################ @@ -157,8 +161,9 @@ def main(): else: print('predict tRNAs...') log.debug('start tRNA prediction') - data['features'][bc.FEATURE_T_RNA] = t_rna.predict_t_rnas(data, sequences_path) - print(f"\tfound: {len(data['features'][bc.FEATURE_T_RNA])}") + trnas = t_rna.predict_t_rnas(data, sequences_path) + data['features'].extend(trnas) + print(f"\tfound: {len(trnas)}") ############################################################################ # tmRNA prediction @@ -168,8 +173,9 @@ def main(): else: print('predict tmRNAs...') log.debug('start tmRNA prediction') - data['features'][bc.FEATURE_TM_RNA] = tm_rna.predict_tm_rnas(data, sequences_path) - print(f"\tfound: {len(data['features'][bc.FEATURE_TM_RNA])}") + tmrnas = tm_rna.predict_tm_rnas(data, sequences_path) + data['features'].extend(tmrnas) + print(f"\tfound: {len(tmrnas)}") ############################################################################ # rRNA prediction @@ -179,8 +185,9 @@ def main(): else: print('predict rRNAs...') log.debug('start rRNA prediction') - data['features'][bc.FEATURE_R_RNA] = r_rna.predict_r_rnas(data, sequences_path) - print(f"\tfound: {len(data['features'][bc.FEATURE_R_RNA])}") + rrnas = r_rna.predict_r_rnas(data, sequences_path) + data['features'].extend(rrnas) + print(f"\tfound: {len(rrnas)}") ############################################################################ # ncRNA gene prediction @@ -190,8 +197,9 @@ def main(): else: print('predict ncRNAs...') log.debug('start ncRNA prediction') - data['features'][bc.FEATURE_NC_RNA] = nc_rna.predict_nc_rnas(data, sequences_path) - print(f"\tfound: {len(data['features'][bc.FEATURE_NC_RNA])}") + ncrnas = nc_rna.predict_nc_rnas(data, sequences_path) + data['features'].extend(ncrnas) + print(f"\tfound: {len(ncrnas)}") ############################################################################ # ncRNA region prediction @@ -201,8 +209,9 @@ def main(): else: print('predict ncRNA regions...') log.debug('start ncRNA region prediction') - data['features'][bc.FEATURE_NC_RNA_REGION] = nc_rna_region.predict_nc_rna_regions(data, sequences_path) - print(f"\tfound: {len(data['features'][bc.FEATURE_NC_RNA_REGION])}") + ncrna_regions = nc_rna_region.predict_nc_rna_regions(data, sequences_path) + data['features'].extend(ncrna_regions) + print(f"\tfound: {len(ncrna_regions)}") ############################################################################ # CRISPR prediction @@ -212,8 +221,9 @@ def main(): else: print('predict CRISPR arrays...') log.debug('start CRISPR prediction') - data['features'][bc.FEATURE_CRISPR] = crispr.predict_crispr(data, sequences_path) - print(f"\tfound: {len(data['features'][bc.FEATURE_CRISPR])}") + crisprs = crispr.predict_crispr(data, sequences_path) + data['features'].extend(crisprs) + print(f"\tfound: {len(crisprs)}") ############################################################################ # CDS prediction @@ -336,7 +346,7 @@ def main(): print('\trevise special cases...') feat_cds.revise_special_cases_annotated(data, cdss) - data['features'][bc.FEATURE_CDS] = cdss + data['features'].extend(cdss) ############################################################################ # sORF prediction @@ -396,7 +406,7 @@ def main(): log.debug('combine sORF annotations') for feat in sorfs_filtered: anno.combine_annotation(feat) # combine IPS and PSC annotations - data['features'][bc.FEATURE_SORF] = sorfs_filtered + data['features'].extend(sorfs_filtered) print(f'\tfiltered sORFs: {len(sorfs_filtered)}') if(cfg.gram != bc.GRAM_UNKNOWN and len(sorfs_filtered) > 0): @@ -418,7 +428,7 @@ def main(): print('detect gaps...') log.debug('detect gaps') assembly_gaps = gaps.detect_assembly_gaps(data) - data['features'][bc.FEATURE_GAP] = assembly_gaps + data['features'].extend(assembly_gaps) print(f'\tfound: {len(assembly_gaps)}') ############################################################################ @@ -430,13 +440,13 @@ def main(): print('detect oriCs/oriVs...') log.debug('detect oriC/V') oriCs = ori.predict_oris(data, sequences_path, bc.FEATURE_ORIC) - data['features'][bc.FEATURE_ORIC] = oriCs + data['features'].extend(oriCs) print(f'\tfound: {len(oriCs)}') print('detect oriTs...') log.debug('detect oriT') oriTs = ori.predict_oris(data, sequences_path, bc.FEATURE_ORIT) - data['features'][bc.FEATURE_ORIT] = oriTs + data['features'].extend(oriTs) print(f'\tfound: {len(oriTs)}') ############################################################################ @@ -459,18 +469,18 @@ def main(): features_by_sequence = {k['id']: [] for k in data['sequences']} feature_id = 1 feature_id_prefix = bu.create_locus_tag_prefix(sequences, length=10) - for feature_list in data['features'].values(): - for feature in feature_list: - if('discarded' not in feature): - feature['id'] = f'{feature_id_prefix}_{feature_id}' - feature_id += 1 - seq_features = features_by_sequence.get(feature['sequence']) - seq_features.append(feature) + for feature in data['features']: + if('discarded' not in feature): + feature['id'] = f'{feature_id_prefix}_{feature_id}' + feature_id += 1 + seq_features = features_by_sequence.get(feature['sequence']) + seq_features.append(feature) features = [] for seq in data['sequences']: seq_features = features_by_sequence[seq['id']] seq_features.sort(key=lambda k: k['start']) features.extend(seq_features) + data['features'] = features # overwrite feature list by final sorted feature list log.info('selected features=%i', len(features)) print(f'\tselected: {len(features)}') @@ -497,15 +507,14 @@ def main(): # - genome stats # - annotation stats ############################################################################ + bu.calc_genome_stats(data) print('\nGenome statistics:') - genome_stats = bu.calc_genome_stats(data, features) - print(f"\tGenome size: {data['size']:,} bp") + print(f"\tGenome size: {data['stats']['size']:,} bp") print(f"\tContigs/replicons: {len(data['sequences'])}") - print(f"\tGC: {100 * genome_stats['gc']:.1f} %") - print(f"\tN50: {genome_stats['n50']:,}") - print(f"\tN ratio: {100 * genome_stats['n_ratio']:.1f} %") - print(f"\tcoding density: {100 * genome_stats['coding_ratio']:.1f} %") - + print(f"\tGC: {100 * data['stats']['gc']:.1f} %") + print(f"\tN50: {data['stats']['n50']:,}") + print(f"\tN ratio: {100 * data['stats']['n_ratio']:.1f} %") + print(f"\tcoding density: {100 * data['stats']['coding_ratio']:.1f} %") print('\nannotation summary:') print(f"\ttRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}") print(f"\ttmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}") @@ -593,12 +602,12 @@ def main(): summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt') with summary_path.open('w') as fh_out: fh_out.write('Sequence(s):\n') - fh_out.write(f"Length: {data['size']:}\n") + fh_out.write(f"Length: {data['stats']['size']:}\n") fh_out.write(f"Count: {len(data['sequences'])}\n") - fh_out.write(f"GC: {100 * genome_stats['gc']:.1f}\n") - fh_out.write(f"N50: {genome_stats['n50']:}\n") - fh_out.write(f"N ratio: {100 * genome_stats['n_ratio']:.1f}\n") - fh_out.write(f"coding density: {100 * genome_stats['coding_ratio']:.1f}\n") + fh_out.write(f"GC: {100 * data['stats']['gc']:.1f}\n") + fh_out.write(f"N50: {data['stats']['n50']:}\n") + fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n") + fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n") fh_out.write('\nAnnotation:\n') fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n") fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n") diff --git a/bakta/proteins.py b/bakta/proteins.py index a7835611..d7f7d169 100644 --- a/bakta/proteins.py +++ b/bakta/proteins.py @@ -137,7 +137,6 @@ def main(): mock_start = 1 for aa in aas: # rename and mock feature attributes to reuse existing functions aa['type'] = bc.FEATURE_CDS - aa['aa'] = aa['sequence'] aa['locus'] = aa['id'] aa['sequence'] = '-' aa['start'] = mock_start @@ -177,7 +176,7 @@ def main(): aa.pop('frame', None) full_annotations_path = output_path.joinpath(f'{cfg.prefix}.json') print(f'\tfull annotations (JSON): {full_annotations_path}') - json.write_json(None, aas, full_annotations_path) + json.write_json({'features': aas}, aas, full_annotations_path) hypotheticals_path = output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv') header_columns = ['ID', 'Length', 'Mol Weight [kDa]', 'Iso El. Point', 'Pfam hits'] hypotheticals = hypotheticals = [aa for aa in aas if 'hypothetical' in aa] diff --git a/bakta/utils.py b/bakta/utils.py index eb7863e1..e405030e 100644 --- a/bakta/utils.py +++ b/bakta/utils.py @@ -293,8 +293,8 @@ def has_annotation(feature: dict, attribute: str) -> bool: return False -def calc_genome_stats(data: dict, features: Sequence[dict]): - genome_size = data['size'] +def calc_genome_stats(data: dict): + genome_size = data['stats']['size'] log.info('genome-size=%i', genome_size) # N50 @@ -305,11 +305,11 @@ def calc_genome_stats(data: dict, features: Sequence[dict]): gc_sum += nt.count('G') + nt.count('C') n_sum += nt.count('N') gc_ratio = gc_sum / (genome_size - n_sum) - data['gc'] = gc_ratio + data['stats']['gc'] = gc_ratio log.info('GC=%0.3f', gc_ratio) n_ratio = n_sum / genome_size - data['n_ratio'] = n_ratio + data['stats']['n_ratio'] = n_ratio log.info('N=%0.3f', n_ratio) n50 = 0 @@ -320,28 +320,21 @@ def calc_genome_stats(data: dict, features: Sequence[dict]): if(sequence_length_sum >= genome_size / 2): n50 = nt_length break - data['n50'] = n50 + data['stats']['n50'] = n50 log.info('N50=%i', n50) sequence_by_id = {seq['id']: seq for seq in data['sequences']} coding_nts = 0 - for feat in features: + for feat in data['features']: if(feat.get('edge', False)): sequence_length = sequence_by_id[feat['sequence']]['length'] coding_nts += feat['stop'] + (sequence_length - feat['start'] + 1) # feature coding nucleotides else: coding_nts += feat['stop'] - feat['start'] + 1 # feature coding nucleotides coding_ratio = coding_nts / (genome_size - n_sum) - data['coding_ratio'] = coding_ratio + data['stats']['coding_ratio'] = coding_ratio log.info('coding-ratio=%0.3f', coding_ratio) - return { - 'gc': gc_ratio, - 'n_ratio': n_ratio, - 'n50': n50, - 'coding_ratio': coding_ratio - } - def parse_replicon_table(replicon_table_path: Path) -> Dict[str, dict]: replicons = {} diff --git a/test/test_pseudo.py b/test/test_pseudo.py index c35199e1..e9a25b3b 100644 --- a/test/test_pseudo.py +++ b/test/test_pseudo.py @@ -271,7 +271,7 @@ def test_compare_alignments(alignment, ref_alignment, cds, coordinates, expected 'edge': False }, { - 'sequence': 'ACGT' * 200, + 'nt': 'ACGT' * 200, 'topology': 'linear' }, { @@ -291,7 +291,7 @@ def test_compare_alignments(alignment, ref_alignment, cds, coordinates, expected 'edge': False }, { - 'sequence': 'ACGT' * 50, # 200nt + 'nt': 'ACGT' * 50, # 200nt 'topology': 'linear' }, { @@ -313,7 +313,7 @@ def test_compare_alignments(alignment, ref_alignment, cds, coordinates, expected 'elongation_downstream': 300 }, { - 'sequence': 'ACGT' * 100, # 400nt + 'nt': 'ACGT' * 100, # 400nt 'topology': 'circular' }, { diff --git a/test/test_sORF.py b/test/test_sORF.py index 56c21c9a..4db8004a 100644 --- a/test/test_sORF.py +++ b/test/test_sORF.py @@ -8,17 +8,17 @@ CONTIG_1 = { 'id': 1, 'description': 'no sORFs', - 'sequence': 'GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG' + 'nt': 'GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG' } CONTIG_2 = { 'id': 2, 'description': 'out of limits', - 'sequence': 'ATGAAAAAATAGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG' + 'nt': 'ATGAAAAAATAGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG' } CONTIG_3 = { 'id': 3, 'description': 'two sORFs', - 'sequence': 'ATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG' + 'nt': 'ATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGATGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTAG' } GENOME_1 = { From 93af02415ad416cac1e1037fbf3d861334f1ce9b Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 15 Oct 2024 17:34:52 +0200 Subject: [PATCH 5/8] fix io --- bakta/io.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/bakta/io.py b/bakta/io.py index 8bb2bf08..1002befb 100644 --- a/bakta/io.py +++ b/bakta/io.py @@ -91,15 +91,9 @@ def main(): ############################################################################ print('Parse genome annotations...') with annotation_path.open('r') as fh: - annotation = json.load(fh) - features = annotation['features'] - sequences = annotation['sequences'] - data = { - 'features': features, - 'sequence': sequences, - 'taxon': annotation['genome'] - } - features_by_sequence = {k['id']: [] for k in data['sequences']} + data = json.load(fh) + features = data['features'] + features_by_sequence = {seq['id']: [] for seq in data['sequences']} for feature in data['features']: sequence_features = features_by_sequence.get(feature['sequence']) sequence_features.append(feature) From 1f11b1cd86d643a9a70642bff44157c692997b12 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 15 Oct 2024 17:37:42 +0200 Subject: [PATCH 6/8] refactor seq var name in list comprehensions --- bakta/features/annotation.py | 16 ++++++++-------- bakta/features/cds.py | 4 ++-- bakta/features/s_orf.py | 4 ++-- bakta/main.py | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/bakta/features/annotation.py b/bakta/features/annotation.py index 7b771a32..2c139162 100644 --- a/bakta/features/annotation.py +++ b/bakta/features/annotation.py @@ -151,35 +151,35 @@ def detect_feature_overlaps(data: dict): CDS < tmRNA, tRNA, rRNA, CRISPR sORF < mRNA, tRNA, rRNA, CRISPR, CDS (in-frame & entirely overlapping), sORF (shorter, weaker annotations) """ - sequence_t_rnas = {k['id']: [] for k in data['sequences']} + sequence_t_rnas = {seq['id']: [] for seq in data['sequences']} for trna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_T_RNA]: t_rnas = sequence_t_rnas[trna['sequence']] t_rnas.append(trna) - sequence_tm_rnas = {k['id']: [] for k in data['sequences']} + sequence_tm_rnas = {seq['id']: [] for seq in data['sequences']} for tm_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_TM_RNA]: tm_rnas = sequence_tm_rnas[tm_rna['sequence']] tm_rnas.append(tm_rna) - sequence_r_rnas = {k['id']: [] for k in data['sequences']} + sequence_r_rnas = {seq['id']: [] for seq in data['sequences']} for r_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_R_RNA]: r_rnas = sequence_r_rnas[r_rna['sequence']] r_rnas.append(r_rna) - sequence_ncrna_regions = {k['id']: [] for k in data['sequences']} + sequence_ncrna_regions = {seq['id']: [] for seq in data['sequences']} for ncRNA_region in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA_REGION]: ncRNA_regions = sequence_ncrna_regions[ncRNA_region['sequence']] ncRNA_regions.append(ncRNA_region) - sequence_crispr_arrays = {k['id']: [] for k in data['sequences']} + sequence_crispr_arrays = {seq['id']: [] for seq in data['sequences']} for crispr_array in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CRISPR]: crispr_arrays = sequence_crispr_arrays[crispr_array['sequence']] crispr_arrays.append(crispr_array) - sequence_cdss = {k['id']: [] for k in data['sequences']} - sequence_cdss_user_provided = {k['id']: [] for k in data['sequences']} + sequence_cdss = {seq['id']: [] for seq in data['sequences']} + sequence_cdss_user_provided = {seq['id']: [] for seq in data['sequences']} for cds in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS]: if(cds.get('source', None) == bc.CDS_SOURCE_USER): cdss = sequence_cdss_user_provided[cds['sequence']] else: cdss = sequence_cdss[cds['sequence']] cdss.append(cds) - sequence_sorfs = {k['id']: [] for k in data['sequences']} + sequence_sorfs = {seq['id']: [] for seq in data['sequences']} for sorf in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_SORF]: sorfs = sequence_sorfs[sorf['sequence']] sorfs.append(sorf) diff --git a/bakta/features/cds.py b/bakta/features/cds.py index 4215b7d6..8e5b0e99 100644 --- a/bakta/features/cds.py +++ b/bakta/features/cds.py @@ -411,12 +411,12 @@ def revise_translational_exceptions(data: dict, cdss: Sequence[dict]): sequences = {seq['id']: seq for seq in data['sequences']} # detect splitted orphan ORFs of selenocystein proteins that are subject to stop codon recoding. - cdss_per_sequences = {k['id']: [] for k in data['sequences']} # get CDS per sequence + cdss_per_sequences = {seq['id']: [] for seq in data['sequences']} # get CDS per sequence for cds in cdss: cdss_per_sequence = cdss_per_sequences[cds['sequence']] if('truncated' not in cds): # exclude truncated CDS for now cdss_per_sequence.append(cds) - cds_pairs_per_sequence = {k['id']: [] for k in data['sequences']} # extract inframe primate CDS neighbouring pairs + cds_pairs_per_sequence = {seq['id']: [] for seq in data['sequences']} # extract inframe primate CDS neighbouring pairs for id, cdss_per_sequence in cdss_per_sequences.items(): cdss_per_sequence = sorted(cdss_per_sequence, key=lambda k: k['start']) for i in range(1, len(cdss_per_sequence)): diff --git a/bakta/features/s_orf.py b/bakta/features/s_orf.py index cb93f294..d8d8a5ad 100644 --- a/bakta/features/s_orf.py +++ b/bakta/features/s_orf.py @@ -102,7 +102,7 @@ def overlap_filter(data: dict, orfs_raw: Sequence[dict]): r_rnas = r_rna_per_sequence[r_rna['sequence']] r_rnas.append(r_rna) - # nc_rnas_per_sequence = {k['id']: [] for k in data['sequences']} + # nc_rnas_per_sequence = {seq['id']: [] for seq in data['sequences']} # for nc_rna in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA]: # nc_rnas = nc_rnas_per_sequence[nc_rna['sequence']] # nc_rnas.append(nc_rna) @@ -115,7 +115,7 @@ def overlap_filter(data: dict, orfs_raw: Sequence[dict]): crispr_arrays = crispr_arrays_per_sequence[crispr_array['sequence']] crispr_arrays.append(crispr_array) - cdss_per_sequence = {k['id']: [] for k in data['sequences']} + cdss_per_sequence = {seq['id']: [] for seq in data['sequences']} for cds in [feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS]: cdss = cdss_per_sequence[cds['sequence']] cdss.append(cds) diff --git a/bakta/main.py b/bakta/main.py index 22ac72d0..664eb3aa 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -466,7 +466,7 @@ def main(): ############################################################################ print('select features and create locus tags...') log.debug('start feature selection and creation of locus tags') - features_by_sequence = {k['id']: [] for k in data['sequences']} + features_by_sequence = {seq['id']: [] for seq in data['sequences']} feature_id = 1 feature_id_prefix = bu.create_locus_tag_prefix(sequences, length=10) for feature in data['features']: From befe6eec861d141b64bfd82ba9682c4666fe3f4a Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 15 Oct 2024 17:49:37 +0200 Subject: [PATCH 7/8] remove io.py --- bakta/io.py | 189 ---------------------------------------------------- 1 file changed, 189 deletions(-) delete mode 100644 bakta/io.py diff --git a/bakta/io.py b/bakta/io.py deleted file mode 100644 index 1002befb..00000000 --- a/bakta/io.py +++ /dev/null @@ -1,189 +0,0 @@ -import atexit -import logging -import os -import sys - -from pathlib import Path - -import bakta -import bakta.constants as bc -import bakta.config as cfg -import bakta.utils as bu -import bakta.io.fasta as fasta -import bakta.io.json as json -import bakta.io.tsv as tsv -import bakta.io.gff as gff -import bakta.io.insdc as insdc -import bakta.plot as plot - - -log = logging.getLogger('IO') - - -def main(): - # parse options and arguments - parser = bu.init_parser(sub_command='_proteins') - parser.add_argument('input', metavar='', help='Bakta annotations in JSON format') - - arg_group_io = parser.add_argument_group('Input / Output') - arg_group_io.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)') - arg_group_io.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files') - arg_group_io.add_argument('--force', '-f', action='store_true', help='Force overwriting existing output folder') - - arg_group_general = parser.add_argument_group('General') - arg_group_general.add_argument('--help', '-h', action='help', help='Show this help message and exit') - arg_group_general.add_argument('--verbose', '-v', action='store_true', help='Print verbose information') - arg_group_general.add_argument('--debug', action='store_true', help='Run Bakta in debug mode. Temp data will not be removed.') - arg_group_general.add_argument('--version', '-V', action='version', version=f'%(prog)s {bakta.__version__}') - args = parser.parse_args() - - ############################################################################ - # Setup logging - ############################################################################ - cfg.prefix = args.prefix if args.prefix else Path(args.input).stem - output_path = cfg.check_output_path(args.output, args.force) - cfg.force = args.force - log.info('force=%s', args.force) - - bu.setup_logger(output_path, cfg.prefix, args) - log.info('prefix=%s', cfg.prefix) - log.info('output=%s', output_path) - - ############################################################################ - # Checks and configurations - # - check parameters and setup global configuration - # - test database - # - test binary dependencies - ############################################################################ - try: - if args.input == '': - raise ValueError('File path argument must be non-empty') - annotation_path = Path(args.input).resolve() - cfg.check_readability('annotation', annotation_path) - cfg.check_content_size('annotation', annotation_path) - except: - log.error('provided annotation file not valid! path=%s', args.input) - sys.exit(f'ERROR: annotation file ({args.input}) not valid!') - log.info('input-path=%s', annotation_path) - - cfg.check_tmp_path(args) - cfg.debug = args.debug - log.info('debug=%s', cfg.debug) - cfg.verbose = True if cfg.debug else args.verbose - log.info('verbose=%s', cfg.verbose) - cfg.user_proteins = cfg.check_user_proteins(args) - - if(cfg.verbose): - print(f'Bakta v{bakta.__version__}') - print('Options and arguments:') - print(f'\tinput: {annotation_path}') - print(f'\toutput: {cfg.output_path}') - print(f'\tprefix: {cfg.prefix}') - if(cfg.force): print(f'\tforce: {cfg.force}') - - if(cfg.debug): - print(f"\nBakta runs in DEBUG mode! Temporary data will not be destroyed at: {cfg.tmp_path}") - else: - atexit.register(bu.cleanup, log, cfg.tmp_path) # register cleanup exit hook - - ############################################################################ - # Import annotations from JSON - ############################################################################ - print('Parse genome annotations...') - with annotation_path.open('r') as fh: - data = json.load(fh) - features = data['features'] - features_by_sequence = {seq['id']: [] for seq in data['sequences']} - for feature in data['features']: - sequence_features = features_by_sequence.get(feature['sequence']) - sequence_features.append(feature) - - ############################################################################ - # Write output files - # - write optional output files in GFF3/GenBank/EMBL formats - # - measure runtime - # - write comprehensive annotation results as JSON - # - remove temp directory - ############################################################################ - print(f'\nExport annotation results to: {cfg.output_path}') - print('\thuman readable TSV...') - tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv') - tsv.write_features(data['sequences'], features_by_sequence, tsv_path) - - print('\tGFF3...') - gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3') - gff.write_features(data, features_by_sequence, gff3_path) - - print('\tINSDC GenBank & EMBL...') - genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff') - embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl') - insdc.write_features(data, features, genbank_path, embl_path) - - print('\tgenome sequences...') - fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna') - fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True) - - print('\tfeature nucleotide sequences...') - ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn') - fasta.write_ffn(features, ffn_path) - - print('\ttranslated CDS sequences...') - faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.faa') - fasta.write_faa(features, faa_path) - - print('\tfeature inferences...') - tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv') - tsv.write_feature_inferences(data['sequences'], features_by_sequence, tsv_path) - - if(cfg.skip_plot or cfg.meta): - print('\tskip generation of circular genome plot...') - else: - print('\tcircular genome plot...') - plot.write(features, data['sequences'], cfg.output_path) - - if(cfg.skip_cds is False): - hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat] - print('\thypothetical TSV...') - tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv') - tsv.write_hypotheticals(hypotheticals, tsv_path) - - print('\ttranslated hypothetical CDS sequences...') - faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.faa') - fasta.write_faa(hypotheticals, faa_path) - - print('\tGenome and annotation summary...') - summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt') - with summary_path.open('w') as fh_out: - fh_out.write('Sequence(s):\n') - fh_out.write(f"Length: {data['stats']['size']:}\n") - fh_out.write(f"Count: {len(data['sequences'])}\n") - fh_out.write(f"GC: {100 * data['stats']['gc']:.1f}\n") - fh_out.write(f"N50: {data['stats']['n50']:}\n") - fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n") - fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n") - fh_out.write('\nAnnotation:\n') - fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n") - fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n") - fh_out.write(f"rRNAs: {len([f for f in features if f['type'] == bc.FEATURE_R_RNA])}\n") - fh_out.write(f"ncRNAs: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA])}\n") - fh_out.write(f"ncRNA regions: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA_REGION])}\n") - fh_out.write(f"CRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}\n") - cdss = [f for f in features if f['type'] == bc.FEATURE_CDS] - fh_out.write(f"CDSs: {len(cdss)}\n") - fh_out.write(f"pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}\n") - fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n") - fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n") - fh_out.write(f"sORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}\n") - fh_out.write(f"gaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}\n") - fh_out.write(f"oriCs: {len([f for f in features if f['type'] == bc.FEATURE_ORIC])}\n") - fh_out.write(f"oriVs: {len([f for f in features if f['type'] == bc.FEATURE_ORIV])}\n") - fh_out.write(f"oriTs: {len([f for f in features if f['type'] == bc.FEATURE_ORIT])}\n") - fh_out.write('\nBakta:\n') - fh_out.write(f'Software: v{bakta.__version__}\n') - fh_out.write(f"Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n") - fh_out.write('DOI: 10.1099/mgen.0.000685\n') - fh_out.write('URL: github.com/oschwengers/bakta\n') - - -if __name__ == '__main__': - main() From fe9eece8d93bea1f9bc918066735edfd54405e00 Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 15 Oct 2024 18:15:04 +0200 Subject: [PATCH 8/8] refactor var names in list comprehensions --- bakta/main.py | 44 ++++++++++++++--------------- bakta/proteins.py | 14 ++++----- scripts/collect-annotation-stats.py | 28 +++++++++--------- test/test_bakta.py | 4 +-- 4 files changed, 45 insertions(+), 45 deletions(-) diff --git a/bakta/main.py b/bakta/main.py index 664eb3aa..881711be 100755 --- a/bakta/main.py +++ b/bakta/main.py @@ -516,21 +516,21 @@ def main(): print(f"\tN ratio: {100 * data['stats']['n_ratio']:.1f} %") print(f"\tcoding density: {100 * data['stats']['coding_ratio']:.1f} %") print('\nannotation summary:') - print(f"\ttRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}") - print(f"\ttmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}") - print(f"\trRNAs: {len([f for f in features if f['type'] == bc.FEATURE_R_RNA])}") - print(f"\tncRNAs: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA])}") - print(f"\tncRNA regions: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA_REGION])}") - print(f"\tCRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}") - cdss = [f for f in features if f['type'] == bc.FEATURE_CDS] + print(f"\ttRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_T_RNA])}") + print(f"\ttmRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_TM_RNA])}") + print(f"\trRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_R_RNA])}") + print(f"\tncRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_NC_RNA])}") + print(f"\tncRNA regions: {len([feat for feat in features if feat['type'] == bc.FEATURE_NC_RNA_REGION])}") + print(f"\tCRISPR arrays: {len([feat for feat in features if feat['type'] == bc.FEATURE_CRISPR])}") + cdss = [feat for feat in features if feat['type'] == bc.FEATURE_CDS] print(f"\tCDSs: {len(cdss)}") print(f"\t\thypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}") print(f"\t\tpseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}") print(f"\t\tsignal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}") - print(f"\tsORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}") - print(f"\tgaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}") - print(f"\toriCs/oriVs: {len([f for f in features if (f['type'] == bc.FEATURE_ORIC or f['type'] == bc.FEATURE_ORIV)])}") - print(f"\toriTs: {len([f for f in features if f['type'] == bc.FEATURE_ORIT])}") + print(f"\tsORFs: {len([feat for feat in features if feat['type'] == bc.FEATURE_SORF])}") + print(f"\tgaps: {len([feat for feat in features if feat['type'] == bc.FEATURE_GAP])}") + print(f"\toriCs/oriVs: {len([feat for feat in features if (feat['type'] == bc.FEATURE_ORIC or feat['type'] == bc.FEATURE_ORIV)])}") + print(f"\toriTs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIT])}") ############################################################################ # Write output files @@ -609,21 +609,21 @@ def main(): fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n") fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n") fh_out.write('\nAnnotation:\n') - fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n") - fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n") - fh_out.write(f"rRNAs: {len([f for f in features if f['type'] == bc.FEATURE_R_RNA])}\n") - fh_out.write(f"ncRNAs: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA])}\n") - fh_out.write(f"ncRNA regions: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA_REGION])}\n") - fh_out.write(f"CRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}\n") + fh_out.write(f"tRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_T_RNA])}\n") + fh_out.write(f"tmRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_TM_RNA])}\n") + fh_out.write(f"rRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_R_RNA])}\n") + fh_out.write(f"ncRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_NC_RNA])}\n") + fh_out.write(f"ncRNA regions: {len([feat for feat in features if feat['type'] == bc.FEATURE_NC_RNA_REGION])}\n") + fh_out.write(f"CRISPR arrays: {len([feat for feat in features if feat['type'] == bc.FEATURE_CRISPR])}\n") fh_out.write(f"CDSs: {len(cdss)}\n") fh_out.write(f"pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}\n") fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n") fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n") - fh_out.write(f"sORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}\n") - fh_out.write(f"gaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}\n") - fh_out.write(f"oriCs: {len([f for f in features if f['type'] == bc.FEATURE_ORIC])}\n") - fh_out.write(f"oriVs: {len([f for f in features if f['type'] == bc.FEATURE_ORIV])}\n") - fh_out.write(f"oriTs: {len([f for f in features if f['type'] == bc.FEATURE_ORIT])}\n") + fh_out.write(f"sORFs: {len([feat for feat in features if feat['type'] == bc.FEATURE_SORF])}\n") + fh_out.write(f"gaps: {len([feat for feat in features if feat['type'] == bc.FEATURE_GAP])}\n") + fh_out.write(f"oriCs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIC])}\n") + fh_out.write(f"oriVs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIV])}\n") + fh_out.write(f"oriTs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIT])}\n") fh_out.write('\nBakta:\n') fh_out.write(f'Software: v{bakta.__version__}\n') fh_out.write(f"Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n") diff --git a/bakta/proteins.py b/bakta/proteins.py index d7f7d169..f0485d25 100644 --- a/bakta/proteins.py +++ b/bakta/proteins.py @@ -199,12 +199,12 @@ def map_aa_columns(feat: dict) -> Sequence[str]: str(feat['length']), gene, feat['product'], - ','.join([k.replace('EC:', '') for k in feat['db_xrefs'] if 'EC:' in k]), - ','.join([k for k in feat['db_xrefs'] if 'GO:' in k]), - ','.join([k.replace('COG:', '') for k in feat['db_xrefs'] if 'COG:' in k]), - ','.join([k.replace('RefSeq:', '') for k in feat['db_xrefs'] if 'RefSeq:' in k]), - ','.join([k.replace('UniParc:', '') for k in feat['db_xrefs'] if 'UniParc:' in k]), - ','.join([k.replace('UniRef:', '') for k in feat['db_xrefs'] if 'UniRef' in k]) + ','.join([dbxref.replace('EC:', '') for dbxref in feat['db_xrefs'] if 'EC:' in dbxref]), + ','.join([dbxref for dbxref in feat['db_xrefs'] if 'GO:' in dbxref]), + ','.join([dbxref.replace('COG:', '') for dbxref in feat['db_xrefs'] if 'COG:' in dbxref]), + ','.join([dbxref.replace('RefSeq:', '') for dbxref in feat['db_xrefs'] if 'RefSeq:' in dbxref]), + ','.join([dbxref.replace('UniParc:', '') for dbxref in feat['db_xrefs'] if 'UniParc:' in dbxref]), + ','.join([dbxref.replace('UniRef:', '') for dbxref in feat['db_xrefs'] if 'UniRef' in dbxref]) ] @@ -214,7 +214,7 @@ def map_hypothetical_columns(feat: dict) -> Sequence[str]: str(feat['length']), f"{(feat['seq_stats']['molecular_weight']/1000):.1f}" if feat['seq_stats']['molecular_weight'] else 'NA' f"{feat['seq_stats']['isoelectric_point']:.1f}" if feat['seq_stats']['isoelectric_point'] else 'NA' - ','.join([k.replace('PFAM:', '') for k in feat['db_xrefs'] if 'PFAM:' in k]) + ','.join([dbxref.replace('PFAM:', '') for dbxref in feat['db_xrefs'] if 'PFAM:' in dbxref]) ] diff --git a/scripts/collect-annotation-stats.py b/scripts/collect-annotation-stats.py index 4786a709..58902e31 100755 --- a/scripts/collect-annotation-stats.py +++ b/scripts/collect-annotation-stats.py @@ -82,20 +82,20 @@ f"{100 * data['stats']['n_ratio']:.1f}", f"{data['stats']['n50']}", f"{100 * data['stats']['coding_ratio']:.1f}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_T_RNA])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_TM_RNA])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_R_RNA])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_NC_RNA])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_NC_RNA_REGION])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CRISPR])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS and 'hypothetical' in f])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_CDS and 'pseudogene' in f])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_SORF])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_GAP])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIC])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIV])}", - f"{len([f for f in data['features'] if f['type'] == bc.FEATURE_ORIT])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_T_RNA])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_TM_RNA])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_R_RNA])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_NC_RNA_REGION])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_CRISPR])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_CDS and 'pseudogene' in feat])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_SORF])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_GAP])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_ORIC])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_ORIV])}", + f"{len([feat for feat in data['features'] if feat['type'] == bc.FEATURE_ORIT])}", ] output_line = '\t'.join(stats) print(output_line) diff --git a/test/test_bakta.py b/test/test_bakta.py index d4c4f261..9ca62cba 100644 --- a/test/test_bakta.py +++ b/test/test_bakta.py @@ -81,7 +81,7 @@ def test_bakta_plasmid(tmpdir): bc.FEATURE_ORIT: 0 } for type, count in feature_counts_expected.items(): - assert len([f for f in features if f['type'] == type]) == count + assert len([feat for feat in features if feat['type'] == type]) == count @pytest.mark.parametrize( @@ -142,5 +142,5 @@ def test_bakta_genome(db, tmpdir): bc.FEATURE_ORIT: 0 } for type, count in feature_counts_expected.items(): - assert len([f for f in features if f['type'] == type]) == count + assert len([feat for feat in features if feat['type'] == type]) == count