Skip to content

Commit

Permalink
Refactor internal data structure and major variable names (#338)
Browse files Browse the repository at this point in the history
* rename contig to more general sequence
* rename seq 'sequence' attribute to 'nt'
* rename genome data structure to data
* reorganize data structure
* fix io
* refactor seq var name in list comprehensions
* remove io.py
* refactor var names in list comprehensions
  • Loading branch information
oschwengers authored Oct 15, 2024
1 parent 9bd2227 commit 0ebd7cc
Show file tree
Hide file tree
Showing 42 changed files with 941 additions and 985 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ positional arguments:
Input / Output:
--db DB, -d DB Database path (default = <bakta_path>/db). Can also be provided as BAKTA_DB environment variable.
--min-contig-length MIN_CONTIG_LENGTH, -m MIN_CONTIG_LENGTH
Minimum contig size (default = 1; 200 in compliant mode)
Minimum contig/sequence size (default = 1; 200 in compliant mode)
--prefix PREFIX, -p PREFIX
Prefix for output files
--output OUTPUT, -o OUTPUT
Expand All @@ -409,7 +409,7 @@ Annotation:
Locus tag increment: 1/5/10 (default = 1)

--keep-contig-headers
Keep original contig headers
Keep original contig/sequence headers
--compliant Force Genbank/ENA/DDBJ compliance
--replicons REPLICONS, -r REPLICONS
Replicon information table (tsv/csv)
Expand Down
2 changes: 1 addition & 1 deletion bakta/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__version__ = '1.9.4'
__version__ = '1.10.0-beta'
__db_schema_version__ = 5
26 changes: 13 additions & 13 deletions bakta/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
db_info = None
tmp_path = None
genome_path = None
min_contig_length = None
min_sequence_length = None
prefix = None
output_path = None
force = None
Expand All @@ -46,7 +46,7 @@
complete = None
prodigal_tf = None
translation_table = None
keep_contig_headers = None
keep_sequence_headers = None
locus = None
locus_tag = None
locus_tag_increment = None
Expand Down Expand Up @@ -92,7 +92,7 @@ def setup(args):
verbose = True

# input / output path configurations
global db_path, db_info, tmp_path, genome_path, min_contig_length, prefix, output_path, force
global db_path, db_info, tmp_path, genome_path, min_sequence_length, prefix, output_path, force
db_path = check_db_path(args)
tmp_path = check_tmp_path(args)

Expand All @@ -108,11 +108,11 @@ def setup(args):
log.info('genome-path=%s', genome_path)

# input / output configurations
min_contig_length = args.min_contig_length
if(min_contig_length <= 0):
log.error("wrong argument for 'min-contig-length' parameter! min_contig_length=%s", min_contig_length)
sys.exit(f"ERROR: wrong argument ({min_contig_length}) for 'min- contig-length' parameter! Value must be larger than 0")
log.info('min_contig_length=%s', min_contig_length)
min_sequence_length = args.min_contig_length
if(min_sequence_length <= 0):
log.error("wrong argument for 'min-contig-length' parameter! min_contig_length=%s", min_sequence_length)
sys.exit(f"ERROR: wrong argument ({min_sequence_length}) for 'min- contig-length' parameter! Value must be larger than 0")
log.info('min_contig_length=%s', min_sequence_length)
log.info('prefix=%s', prefix) # set in main.py before global logger config
log.info('output-path=%s', output_path)
force = args.force
Expand Down Expand Up @@ -163,7 +163,7 @@ def setup(args):
taxon = None

# annotation configurations
global complete, prodigal_tf, translation_table, keep_contig_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
global complete, prodigal_tf, translation_table, keep_sequence_headers, locus, locus_tag, locus_tag_increment, gram, replicons, compliant, user_proteins, user_hmms, meta, regions
complete = args.complete
log.info('complete=%s', complete)
prodigal_tf = args.prodigal_tf
Expand All @@ -186,8 +186,8 @@ def setup(args):
compliant = args.compliant
log.info('compliant=%s', compliant)
if(compliant):
min_contig_length = 200
log.info('compliant mode! min_contig_length=%s', min_contig_length)
min_sequence_length = 200
log.info('compliant mode! min_contig_length=%s', min_sequence_length)
meta = args.meta
log.info('meta=%s', meta)
locus = args.locus
Expand Down Expand Up @@ -221,8 +221,8 @@ def setup(args):
log.info('locus-tag=%s', locus_tag)
locus_tag_increment = args.locus_tag_increment
log.info('locus-tag-increment=%s', locus_tag_increment)
keep_contig_headers = args.keep_contig_headers
log.info('keep_contig_headers=%s', keep_contig_headers)
keep_sequence_headers = args.keep_contig_headers
log.info('keep_contig_headers=%s', keep_sequence_headers)
replicons = args.replicons
if(replicons is not None):
try:
Expand Down
2 changes: 1 addition & 1 deletion bakta/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@
############################################################################
REPLICON_CHROMOSOME = 'chromosome'
REPLICON_PLASMID = 'plasmid'
REPLICON_CONTIG = 'contig'
REPLICON_CONTIG = 'sequence'
REPLICON_LENGTH_THRESHOLD_PLASMID = 112_000 # Nasuia deltocephalinicola -> DOI: 10.1093/gbe/evt118
REPLICON_LENGTH_THRESHOLD_CHROMOSOME = 2_800_000 # max plasmid length (except 1 outlier-> https://ccb-microbe.cs.uni-saarland.de/plsdb/plasmids/
TOPOLOGY_CIRCULAR = 'circular'
Expand Down
4 changes: 2 additions & 2 deletions bakta/expert/amrfinder.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path):
cds.setdefault('expert', [])
cds['expert'].append(hit)
log.debug(
'hit: gene=%s, product=%s, method=%s, target-cov=%0.3f, identity=%0.3f, contig=%s, start=%i, stop=%i, strand=%s',
gene, product, method, model_cov, identity, cds['contig'], cds['start'], cds['stop'], cds['strand']
'hit: gene=%s, product=%s, method=%s, target-cov=%0.3f, identity=%0.3f, seq=%s, start=%i, stop=%i, strand=%s',
gene, product, method, model_cov, identity, cds['sequence'], cds['start'], cds['stop'], cds['strand']
)
cds_found.add(aa_identifier)

Expand Down
8 changes: 4 additions & 4 deletions bakta/expert/protein_hmms.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ def search(cdss: Sequence[dict], user_hmms_path):
cds = orf_by_aa_digest[aa_identifier]
if hmm_query_hit.evalue > bc.MIN_HMM_EVALUE:
log.debug(
'discard low evalue: contig=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f',
cds['contig'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score
'discard low evalue: seq=%s, start=%i, stop=%i, strand=%s, id=%s, evalue=%1.1e, bitscore=%f',
cds['sequence'], cds['start'], cds['stop'], cds['strand'], hmm_id, hmm_query_hit.evalue, hmm_query_hit.score
)
else:
hit_domain_lengths_sum = sum([len(dom.alignment.hmm_sequence) for dom in hmm_query_hit.domains.included])
Expand Down Expand Up @@ -64,8 +64,8 @@ def search(cdss: Sequence[dict], user_hmms_path):
cds.setdefault('expert', [])
cds['expert'].append(hit)
log.debug(
'hit: source=UserHMMs, rank=99, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, hmm-id=%s, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
cds['contig'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hmm_id, hit['gene'], hit['product'], hit['evalue'], hit['score']
'hit: source=UserHMMs, rank=99, seq=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, model-cov=%0.3f, hmm-id=%s, gene=%s, product=%s, evalue=%1.1e, bitscore=%f',
cds['sequence'], cds['start'], cds['stop'], cds['strand'], hit['aa_cov'], hit['hmm_cov'], hmm_id, hit['gene'], hit['product'], hit['evalue'], hit['score']
)
cds_found.add(aa_identifier)

Expand Down
4 changes: 2 additions & 2 deletions bakta/expert/protein_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ def search(cdss: Sequence[dict], cds_fasta_path: Path, expert_system: str, db_pa
cds.setdefault('expert', [])
cds['expert'].append(hit)
log.debug(
'hit: source=%s, rank=%i, contig=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s',
source, rank, cds['contig'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product
'hit: source=%s, rank=%i, seq=%s, start=%i, stop=%i, strand=%s, query-cov=%0.3f, subject-cov=%0.3f, identity=%0.3f, score=%0.1f, evalue=%1.1e, gene=%s, product=%s',
source, rank, cds['sequence'], cds['start'], cds['stop'], cds['strand'], query_cov, model_cov, identity, bitscore, evalue, gene, product
)
cds_found.add(aa_identifier)

Expand Down
Loading

0 comments on commit 0ebd7cc

Please sign in to comment.