From befe6eec861d141b64bfd82ba9682c4666fe3f4a Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Tue, 15 Oct 2024 17:49:37 +0200 Subject: [PATCH] remove io.py --- bakta/io.py | 189 ---------------------------------------------------- 1 file changed, 189 deletions(-) delete mode 100644 bakta/io.py diff --git a/bakta/io.py b/bakta/io.py deleted file mode 100644 index 1002bef..0000000 --- a/bakta/io.py +++ /dev/null @@ -1,189 +0,0 @@ -import atexit -import logging -import os -import sys - -from pathlib import Path - -import bakta -import bakta.constants as bc -import bakta.config as cfg -import bakta.utils as bu -import bakta.io.fasta as fasta -import bakta.io.json as json -import bakta.io.tsv as tsv -import bakta.io.gff as gff -import bakta.io.insdc as insdc -import bakta.plot as plot - - -log = logging.getLogger('IO') - - -def main(): - # parse options and arguments - parser = bu.init_parser(sub_command='_proteins') - parser.add_argument('input', metavar='', help='Bakta annotations in JSON format') - - arg_group_io = parser.add_argument_group('Input / Output') - arg_group_io.add_argument('--output', '-o', action='store', default=os.getcwd(), help='Output directory (default = current working directory)') - arg_group_io.add_argument('--prefix', '-p', action='store', default=None, help='Prefix for output files') - arg_group_io.add_argument('--force', '-f', action='store_true', help='Force overwriting existing output folder') - - arg_group_general = parser.add_argument_group('General') - arg_group_general.add_argument('--help', '-h', action='help', help='Show this help message and exit') - arg_group_general.add_argument('--verbose', '-v', action='store_true', help='Print verbose information') - arg_group_general.add_argument('--debug', action='store_true', help='Run Bakta in debug mode. Temp data will not be removed.') - arg_group_general.add_argument('--version', '-V', action='version', version=f'%(prog)s {bakta.__version__}') - args = parser.parse_args() - - ############################################################################ - # Setup logging - ############################################################################ - cfg.prefix = args.prefix if args.prefix else Path(args.input).stem - output_path = cfg.check_output_path(args.output, args.force) - cfg.force = args.force - log.info('force=%s', args.force) - - bu.setup_logger(output_path, cfg.prefix, args) - log.info('prefix=%s', cfg.prefix) - log.info('output=%s', output_path) - - ############################################################################ - # Checks and configurations - # - check parameters and setup global configuration - # - test database - # - test binary dependencies - ############################################################################ - try: - if args.input == '': - raise ValueError('File path argument must be non-empty') - annotation_path = Path(args.input).resolve() - cfg.check_readability('annotation', annotation_path) - cfg.check_content_size('annotation', annotation_path) - except: - log.error('provided annotation file not valid! path=%s', args.input) - sys.exit(f'ERROR: annotation file ({args.input}) not valid!') - log.info('input-path=%s', annotation_path) - - cfg.check_tmp_path(args) - cfg.debug = args.debug - log.info('debug=%s', cfg.debug) - cfg.verbose = True if cfg.debug else args.verbose - log.info('verbose=%s', cfg.verbose) - cfg.user_proteins = cfg.check_user_proteins(args) - - if(cfg.verbose): - print(f'Bakta v{bakta.__version__}') - print('Options and arguments:') - print(f'\tinput: {annotation_path}') - print(f'\toutput: {cfg.output_path}') - print(f'\tprefix: {cfg.prefix}') - if(cfg.force): print(f'\tforce: {cfg.force}') - - if(cfg.debug): - print(f"\nBakta runs in DEBUG mode! Temporary data will not be destroyed at: {cfg.tmp_path}") - else: - atexit.register(bu.cleanup, log, cfg.tmp_path) # register cleanup exit hook - - ############################################################################ - # Import annotations from JSON - ############################################################################ - print('Parse genome annotations...') - with annotation_path.open('r') as fh: - data = json.load(fh) - features = data['features'] - features_by_sequence = {seq['id']: [] for seq in data['sequences']} - for feature in data['features']: - sequence_features = features_by_sequence.get(feature['sequence']) - sequence_features.append(feature) - - ############################################################################ - # Write output files - # - write optional output files in GFF3/GenBank/EMBL formats - # - measure runtime - # - write comprehensive annotation results as JSON - # - remove temp directory - ############################################################################ - print(f'\nExport annotation results to: {cfg.output_path}') - print('\thuman readable TSV...') - tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.tsv') - tsv.write_features(data['sequences'], features_by_sequence, tsv_path) - - print('\tGFF3...') - gff3_path = cfg.output_path.joinpath(f'{cfg.prefix}.gff3') - gff.write_features(data, features_by_sequence, gff3_path) - - print('\tINSDC GenBank & EMBL...') - genbank_path = cfg.output_path.joinpath(f'{cfg.prefix}.gbff') - embl_path = cfg.output_path.joinpath(f'{cfg.prefix}.embl') - insdc.write_features(data, features, genbank_path, embl_path) - - print('\tgenome sequences...') - fna_path = cfg.output_path.joinpath(f'{cfg.prefix}.fna') - fasta.export_sequences(data['sequences'], fna_path, description=True, wrap=True) - - print('\tfeature nucleotide sequences...') - ffn_path = cfg.output_path.joinpath(f'{cfg.prefix}.ffn') - fasta.write_ffn(features, ffn_path) - - print('\ttranslated CDS sequences...') - faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.faa') - fasta.write_faa(features, faa_path) - - print('\tfeature inferences...') - tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.inference.tsv') - tsv.write_feature_inferences(data['sequences'], features_by_sequence, tsv_path) - - if(cfg.skip_plot or cfg.meta): - print('\tskip generation of circular genome plot...') - else: - print('\tcircular genome plot...') - plot.write(features, data['sequences'], cfg.output_path) - - if(cfg.skip_cds is False): - hypotheticals = [feat for feat in features if feat['type'] == bc.FEATURE_CDS and 'hypothetical' in feat] - print('\thypothetical TSV...') - tsv_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.tsv') - tsv.write_hypotheticals(hypotheticals, tsv_path) - - print('\ttranslated hypothetical CDS sequences...') - faa_path = cfg.output_path.joinpath(f'{cfg.prefix}.hypotheticals.faa') - fasta.write_faa(hypotheticals, faa_path) - - print('\tGenome and annotation summary...') - summary_path = cfg.output_path.joinpath(f'{cfg.prefix}.txt') - with summary_path.open('w') as fh_out: - fh_out.write('Sequence(s):\n') - fh_out.write(f"Length: {data['stats']['size']:}\n") - fh_out.write(f"Count: {len(data['sequences'])}\n") - fh_out.write(f"GC: {100 * data['stats']['gc']:.1f}\n") - fh_out.write(f"N50: {data['stats']['n50']:}\n") - fh_out.write(f"N ratio: {100 * data['stats']['n_ratio']:.1f}\n") - fh_out.write(f"coding density: {100 * data['stats']['coding_ratio']:.1f}\n") - fh_out.write('\nAnnotation:\n') - fh_out.write(f"tRNAs: {len([f for f in features if f['type'] == bc.FEATURE_T_RNA])}\n") - fh_out.write(f"tmRNAs: {len([f for f in features if f['type'] == bc.FEATURE_TM_RNA])}\n") - fh_out.write(f"rRNAs: {len([f for f in features if f['type'] == bc.FEATURE_R_RNA])}\n") - fh_out.write(f"ncRNAs: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA])}\n") - fh_out.write(f"ncRNA regions: {len([f for f in features if f['type'] == bc.FEATURE_NC_RNA_REGION])}\n") - fh_out.write(f"CRISPR arrays: {len([f for f in features if f['type'] == bc.FEATURE_CRISPR])}\n") - cdss = [f for f in features if f['type'] == bc.FEATURE_CDS] - fh_out.write(f"CDSs: {len(cdss)}\n") - fh_out.write(f"pseudogenes: {len([cds for cds in cdss if 'pseudogene' in cds])}\n") - fh_out.write(f"hypotheticals: {len([cds for cds in cdss if 'hypothetical' in cds])}\n") - fh_out.write(f"signal peptides: {len([cds for cds in cdss if bc.FEATURE_SIGNAL_PEPTIDE in cds])}\n") - fh_out.write(f"sORFs: {len([f for f in features if f['type'] == bc.FEATURE_SORF])}\n") - fh_out.write(f"gaps: {len([f for f in features if f['type'] == bc.FEATURE_GAP])}\n") - fh_out.write(f"oriCs: {len([f for f in features if f['type'] == bc.FEATURE_ORIC])}\n") - fh_out.write(f"oriVs: {len([f for f in features if f['type'] == bc.FEATURE_ORIV])}\n") - fh_out.write(f"oriTs: {len([f for f in features if f['type'] == bc.FEATURE_ORIT])}\n") - fh_out.write('\nBakta:\n') - fh_out.write(f'Software: v{bakta.__version__}\n') - fh_out.write(f"Database: v{cfg.db_info['major']}.{cfg.db_info['minor']}, {cfg.db_info['type']}\n") - fh_out.write('DOI: 10.1099/mgen.0.000685\n') - fh_out.write('URL: github.com/oschwengers/bakta\n') - - -if __name__ == '__main__': - main()