Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

introduce bakta_io output file reconstruction #339

Merged
merged 4 commits into from
Oct 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -309,10 +309,10 @@ Annotation results are provided in standard bioinformatics file formats:
- `<prefix>.inference.tsv`: inference metrics (score, evalue, coverage, identity) for annotated accessions as TSV
- `<prefix>.hypotheticals.tsv`: further information on hypothetical protein CDS as simple human readble tab separated values
- `<prefix>.hypotheticals.faa`: hypothetical protein CDS amino acid sequences as FASTA
- `<prefix>.json`: all (internal) annotation & sequence information as JSON
- `<prefix>.txt`: summary as TXT
- `<prefix>.png`: circular genome annotation plot as PNG
- `<prefix>.svg`: circular genome annotation plot as SVG
- `<prefix>.json`: all (internal) annotation & sequence information as JSON

The `<prefix>` can be set via `--prefix <prefix>`. If no prefix is set, Bakta uses the input file prefix.

Expand Down Expand Up @@ -359,6 +359,14 @@ Of note, Bakta provides all detailed (internal) information on each annotated fe
}
```

Bakta provides a helper function to create above mentioned output files from the (GNU-zipped) *JSON* result file, thus helping potential long-term or large-scale annotation projects to reduce overall storage requirements.

```bash
bakta_io --output <output-path> --prefix <prefix> result.json.gz

bakta_io --help
```

Exemplary annotation result files for several genomes (mostly ESKAPE species) are hosted at Zenodo: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4770026.svg)](https://doi.org/10.5281/zenodo.4770026)

## Usage
Expand Down
198 changes: 198 additions & 0 deletions bakta/io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import atexit
import json
import logging
import os
import sys

from datetime import datetime
from pathlib import Path

from xopen import xopen

import bakta
import bakta.constants as bc
import bakta.config as cfg
import bakta.utils as bu
import bakta.io.fasta as fasta
import bakta.io.tsv as tsv
import bakta.io.gff as gff
import bakta.io.insdc as insdc
import bakta.plot as plot


log = logging.getLogger('IO')


def main():
# parse options and arguments
parser = bu.init_parser(sub_command='_proteins')
parser.add_argument('input', metavar='<input>', help='Bakta annotations in JSON format')

arg_group_io = parser.add_argument_group('Input / Output')
arg_group_io.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)')
arg_group_io.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files')
arg_group_io.add_argument('--force', '-f', action='store_true', help='Force overwriting existing output folder')

arg_group_general = parser.add_argument_group('General')
arg_group_general.add_argument('--help', '-h', action='help', help='Show this help message and exit')
arg_group_general.add_argument('--verbose', '-v', action='store_true', help='Print verbose information')
arg_group_general.add_argument('--debug', action='store_true', help='Run Bakta in debug mode. Temp data will not be removed.')
arg_group_general.add_argument('--tmp-dir', action='store', default=None, dest='tmp_dir', help='Location for temporary files (default = system dependent auto detection)')
arg_group_general.add_argument('--version', '-V', action='version', version=f'%(prog)s {bakta.__version__}')
args = parser.parse_args()

############################################################################
# Setup logging
############################################################################
cfg.prefix = args.prefix if args.prefix else Path(args.input).stem
output_path = cfg.check_output_path(args.output, args.force)
cfg.force = args.force
log.info('force=%s', args.force)

bu.setup_logger(output_path, f'{cfg.prefix}.reconstruction', args)
log.info('prefix=%s', cfg.prefix)
log.info('output=%s', output_path)

############################################################################
# Checks and configurations
# - check parameters and setup global configuration
# - test database
# - test binary dependencies
############################################################################
try:
if args.input == '':
raise ValueError('File path argument must be non-empty')
annotation_path = Path(args.input).resolve()
cfg.check_readability('annotation', annotation_path)
cfg.check_content_size('annotation', annotation_path)
except:
log.error('provided annotation file not valid! path=%s', args.input)
sys.exit(f'ERROR: annotation file ({args.input}) not valid!')
log.info('input-path=%s', annotation_path)

cfg.check_tmp_path(args)
cfg.debug = args.debug
log.info('debug=%s', cfg.debug)
cfg.verbose = True if cfg.debug else args.verbose
log.info('verbose=%s', cfg.verbose)

if(cfg.verbose):
print(f'Bakta v{bakta.__version__}')
print('Options and arguments:')
print(f'\tinput: {annotation_path}')
print(f'\toutput: {cfg.output_path}')
print(f'\tprefix: {cfg.prefix}')
if(cfg.force): print(f'\tforce: {cfg.force}')

if(cfg.debug):
print(f"\nBakta runs in DEBUG mode! Temporary data will not be destroyed at: {cfg.tmp_path}")
else:
atexit.register(bu.cleanup, log, cfg.tmp_path) # register cleanup exit hook

############################################################################
# Import annotations from JSON
############################################################################
print('Parse genome annotations...')
with xopen(str(annotation_path), threads=0) as fh:
data = json.load(fh)
features = data['features']
features_by_sequence = {seq['id']: [] for seq in data['sequences']}
for feature in data['features']:
sequence_features = features_by_sequence.get(feature['sequence'])
sequence_features.append(feature)

# set global config objects based on information from imported JSON document
cfg.db_info = {
'type': data['version']['db']['type'],
'major': data['version']['db']['version'].split('.')[0],
'minor': data['version']['db']['version'].split('.')[1]
}
cfg.translation_table = data['genome']['translation_table']
cfg.run_start = datetime.strptime(data['run']['start'], '%Y-%m-%d %H:%M:%S')
cfg.run_end = datetime.strptime(data['run']['end'], '%Y-%m-%d %H:%M:%S')

############################################################################
# Write output files
# - write optional output files in GFF3/GenBank/EMBL formats
# - measure runtime
# - write comprehensive annotation results as JSON
# - remove temp directory
############################################################################
print(f'\nExport annotation results to: {cfg.output_path}')
print('\thuman readable TSV...')
tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv')
tsv.write_features(data['sequences'], features_by_sequence, tsv_path)

print('\tGFF3...')
gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3')
gff.write_features(data, features_by_sequence, gff3_path)

print('\tINSDC GenBank & EMBL...')
genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff')
embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl')
insdc.write_features(data, features, genbank_path, embl_path)

print('\tgenome sequences...')
fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna')
fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True)

print('\tfeature nucleotide sequences...')
ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn')
fasta.write_ffn(features, ffn_path)

print('\ttranslated CDS sequences...')
faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.faa')
fasta.write_faa(features, faa_path)

print('\tfeature inferences...')
tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv')
tsv.write_feature_inferences(data['sequences'], features_by_sequence, tsv_path)

print('\tcircular genome plot...')
plot.write(features, data['sequences'], cfg.output_path)

hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat]
print('\thypothetical TSV...')
tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv')
tsv.write_hypotheticals(hypotheticals, tsv_path)

print('\ttranslated hypothetical CDS sequences...')
faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.faa')
fasta.write_faa(hypotheticals, faa_path)

print('\tGenome and annotation summary...')
summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt')
with summary_path.open('w') as fh_out:
fh_out.write('Sequence(s):\n')
fh_out.write(f"Length: {data['stats']['size']:}\n")
fh_out.write(f"Count: {len(data['sequences'])}\n")
fh_out.write(f"GC: {100 * data['stats']['gc']:.1f}\n")
fh_out.write(f"N50: {data['stats']['n50']:}\n")
fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n")
fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n")
fh_out.write('\nAnnotation:\n')
fh_out.write(f"tRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_T_RNA])}\n")
fh_out.write(f"tmRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_TM_RNA])}\n")
fh_out.write(f"rRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_R_RNA])}\n")
fh_out.write(f"ncRNAs: {len([feat for feat in features if feat['type'] == bc.FEATURE_NC_RNA])}\n")
fh_out.write(f"ncRNA regions: {len([feat for feat in features if feat['type'] == bc.FEATURE_NC_RNA_REGION])}\n")
fh_out.write(f"CRISPR arrays: {len([feat for feat in features if feat['type'] == bc.FEATURE_CRISPR])}\n")
cdss = [feat for feat in features if feat['type'] == bc.FEATURE_CDS]
fh_out.write(f"CDSs: {len(cdss)}\n")
fh_out.write(f"pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}\n")
fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n")
fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n")
fh_out.write(f"sORFs: {len([feat for feat in features if feat['type'] == bc.FEATURE_SORF])}\n")
fh_out.write(f"gaps: {len([feat for feat in features if feat['type'] == bc.FEATURE_GAP])}\n")
fh_out.write(f"oriCs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIC])}\n")
fh_out.write(f"oriVs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIV])}\n")
fh_out.write(f"oriTs: {len([feat for feat in features if feat['type'] == bc.FEATURE_ORIT])}\n")
fh_out.write('\nBakta:\n')
fh_out.write(f'Software: v{bakta.__version__}\n')
fh_out.write(f"Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n")
fh_out.write('DOI: 10.1099/mgen.0.000685\n')
fh_out.write('URL: github.com/oschwengers/bakta\n')


if __name__ == '__main__':
main()
2 changes: 1 addition & 1 deletion bakta/io/insdc.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def write_features(data: dict, features: Sequence[dict], genbank_output_path: Pa
f'URL: {bc.BAKTA_URL}\n',
'\n',
'##Genome Annotation Summary:##\n',
f"{'Annotation Date':<30} :: {datetime.now().strftime('%m/%d/%Y, %H:%M:%S')}\n",
f"{'Annotation Date':<30} :: {cfg.run_end.strftime('%m/%d/%Y, %H:%M:%S')}\n",
f"{'CDSs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_CDS or feat['type'] == bc.FEATURE_SORF]):5,}\n",
f"{'tRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_T_RNA]):5,}\n",
f"{'tmRNAs':<30} :: {len([feat for feat in sequence_features if feat['type'] == bc.FEATURE_TM_RNA]):5,}\n",
Expand Down
7 changes: 4 additions & 3 deletions bakta/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -534,11 +534,13 @@ def main():

############################################################################
# Write output files
# - write optional output files in GFF3/GenBank/EMBL formats
# - measure runtime
# - write optional output files in GFF3/GenBank/EMBL formats
# - write comprehensive annotation results as JSON
# - remove temp directory
############################################################################
cfg.run_end = datetime.now() # measure runtime

print(f'\nExport annotation results to: {cfg.output_path}')
print('\thuman readable TSV...')
tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv')
Expand Down Expand Up @@ -585,8 +587,7 @@ def main():
faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.faa')
fasta.write_faa(hypotheticals, faa_path)

# measure runtime at the latest possible
cfg.run_end = datetime.now()
# calc & store runtime
run_duration = (cfg.run_end - cfg.run_start).total_seconds()
data['run'] = {
'start': cfg.run_start.strftime('%Y-%m-%d %H:%M:%S'),
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
'bakta=bakta.main:main',
'bakta_proteins=bakta.proteins:main',
'bakta_db=bakta.db:main',
'bakta_plot=bakta.plot:main'
'bakta_plot=bakta.plot:main',
'bakta_io=bakta.io:main'
]
},
classifiers=[
Expand Down
Loading