Skip to content

Commit

Permalink
Improve PILER-CR CRISPR parser (#249)
Browse files Browse the repository at this point in the history
* implement parsing of CRISPR repeats & spacers
* write CRISPR repeats & spacers to GFF
* add CRISPR repeat & spacer output to TSV
* add CRISPR PILER-CR test
* refactor test
* fix full black box test for expanded CRISPR features
* refactor test code
  • Loading branch information
oschwengers authored Oct 24, 2023
1 parent 358ac16 commit 53df2bf
Show file tree
Hide file tree
Showing 13 changed files with 269 additions and 152 deletions.
2 changes: 2 additions & 0 deletions bakta/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@
FEATURE_NC_RNA = 'ncRNA'
FEATURE_NC_RNA_REGION = 'ncRNA-region'
FEATURE_CRISPR = 'crispr'
FEATURE_CRISPR_REPEAT = 'crispr-repeat'
FEATURE_CRISPR_SPACER = 'crispr-spacer'
FEATURE_ORF = 'orf'
FEATURE_SORF = 'sorf'
FEATURE_CDS = 'cds'
Expand Down
93 changes: 71 additions & 22 deletions bakta/features/crispr.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,18 +38,72 @@ def predict_crispr(genome: dict, contigs_path: Path):
log.warning('CRISPRs failed! pilercr-error-code=%d', proc.returncode)
raise Exception(f'PILER-CR error! error code: {proc.returncode}')

# parse orfs
crispr_arrays = []
# parse crispr arrays
crispr_arrays = {}
contigs = {c['id']: c for c in genome['contigs']}
with output_path.open() as fh:
output_section = None
contig_id = None
array_id = None
skip_lines = True
crispr_array = None
gap_count = 0
for line in fh:
line = line.strip()
if(line == 'SUMMARY BY POSITION'):
if(line == ''):
continue
if(line == 'DETAIL REPORT'):
output_section = 'DETAIL'
skip_lines = False
elif(line == 'SUMMARY BY POSITION'):
output_section = 'POSITION'
skip_lines = False
elif(line == 'SUMMARY BY SIMILARITY'):
output_section = 'SIMILARITY'
skip_lines = False
elif(skip_lines is False):
if(len(line) > 0):
if(output_section == 'DETAIL'):
if(line[0:5] == 'Array'):
gap_count = 0
array_id = line.split()[1]
crispr_array = OrderedDict()
crispr_array['type'] = bc.FEATURE_CRISPR
crispr_array['strand'] = bc.STRAND_UNKNOWN
crispr_array['repeats'] = []
crispr_array['spacers'] = []
crispr_arrays[array_id] = crispr_array
elif(line[0] == '>'):
contig_id = line[1:]
crispr_array['contig'] = contig_id
elif(line[0] != '='):
cols = line.split()
if(len(cols) == 7 and cols[0] != 'Pos'):
(position, repeat_length, id, spacer_length, left_flank, repeat_seq, spacer_seq) = cols
position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length)
spacer_seq = spacer_seq.upper()
crispr_repeat = OrderedDict()
crispr_repeat['strand'] = bc.STRAND_UNKNOWN
crispr_repeat['start'] = position - gap_count
crispr_repeat['stop'] = position + repeat_length - 1 - gap_count
crispr_array['repeats'].append(crispr_repeat)
gap_count += repeat_seq.count('-') # correct wrong PILER-CR detail positions by gaps
crispr_spacer = OrderedDict()
crispr_spacer['strand'] = bc.STRAND_UNKNOWN
crispr_spacer['start'] = position + repeat_length - gap_count
crispr_spacer['stop'] = position + repeat_length + spacer_length - 1 - gap_count
crispr_spacer['sequence'] = spacer_seq
crispr_array['spacers'].append(crispr_spacer)
spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, contigs[contig_id])
assert spacer_seq == spacer_genome_seq # assure PILER-CR spacer sequence equal extraction from genome
elif(len(cols) == 6 and cols[0] != 'Pos'): # last line in array without spacer
(position, repeat_length, id, left_flank, repeat_seq, spacer_seq) = cols
position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length)
crispr_repeat = OrderedDict()
crispr_repeat['strand'] = bc.STRAND_UNKNOWN
crispr_repeat['start'] = position - gap_count
crispr_repeat['stop'] = position + repeat_length - 1 - gap_count
crispr_array['repeats'].append(crispr_repeat)
elif(output_section == 'POSITION'):
if(line[0] == '>'):
contig_id = line[1:]
elif(line[0] != 'A' and line[0] != '='):
Expand All @@ -58,27 +112,22 @@ def predict_crispr(genome: dict, contigs_path: Path):
(array_id, contig, position, length, copies, repeat_length, spacer_length, repeat_consensus) = cols
else:
(array_id, contig, position, length, copies, repeat_length, spacer_length, distance, repeat_consensus) = cols
crispr_array = crispr_arrays[array_id]
crispr_array['start'] = int(position)
crispr_array['stop'] = int(position) + int(length) - 1
crispr_array['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}'
crispr_array['spacer_length'] = int(spacer_length)
crispr_array['repeat_length'] = int(repeat_length)
assert len(crispr_array['repeats']) == int(copies), print(f"len(reps)={len(crispr_array['repeats'])}, int(copies)={int(copies)}")
crispr_array['repeat_consensus'] = repeat_consensus
crispr_array['db_xrefs'] = [so.SO_CRISPR.id]

crispr = OrderedDict()
crispr['type'] = bc.FEATURE_CRISPR
crispr['contig'] = contig_id
crispr['start'] = int(position)
crispr['stop'] = int(position) + int(length) - 1
crispr['strand'] = bc.STRAND_UNKNOWN
crispr['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}'
crispr['spacer_length'] = int(spacer_length)
crispr['repeat_length'] = int(repeat_length)
crispr['repeats'] = int(copies)
crispr['repeat_consensus'] = repeat_consensus
crispr['db_xrefs'] = [so.SO_CRISPR.id]

nt = bu.extract_feature_sequence(crispr, contigs[contig_id]) # extract nt sequences
crispr['nt'] = nt

crispr_arrays.append(crispr)
nt = bu.extract_feature_sequence(crispr_array, contigs[contig_id]) # extract nt sequences
crispr_array['nt'] = nt
log.info(
'contig=%s, start=%i, stop=%i, spacer-length=%i, repeat-length=%i, # repeats=%i, repeat-consensus=%s, nt=[%s..%s]',
crispr['contig'], crispr['start'], crispr['stop'], crispr['spacer_length'], crispr['repeat_length'], crispr['repeats'], crispr['repeat_consensus'], nt[:10], nt[-10:]
crispr_array['contig'], crispr_array['start'], crispr_array['stop'], crispr_array['spacer_length'], crispr_array['repeat_length'], len(crispr_array['repeats']), crispr_array['repeat_consensus'], nt[:10], nt[-10:]
)
crispr_arrays = crispr_arrays.values()
log.info('predicted=%i', len(crispr_arrays))
return crispr_arrays
23 changes: 23 additions & 0 deletions bakta/io/gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,29 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus']
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
if(not cfg.compliant):
i = 0
while i < len(feat['spacers']):
repeat = feat['repeats'][i]
annotations = {
'ID': f"{feat['id']}_repeat_{i+1}"
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
spacer = feat['spacers'][i]
annotations = {
'ID': f"{feat['id']}_spacer_{i+1}",
'sequence': spacer['sequence']
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
i += 1
repeat = feat['repeats'][i]
annotations = {
'ID': f"{feat['id']}_repeat_{i+1}"
}
annotations = encode_annotations(annotations)
fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
elif(feat['type'] is bc.FEATURE_CDS):
annotations = {
'ID': feat['locus'],
Expand Down
37 changes: 27 additions & 10 deletions bakta/io/tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,21 +27,38 @@ def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_
for contig in contigs:
for feat in features_by_contig[contig['id']]:
feat_type = feat['type']
if(feat['type'] == bc.FEATURE_GAP):
if(feat_type == bc.FEATURE_GAP):
feat_type = bc.INSDC_FEATURE_ASSEMBLY_GAP if feat['length'] >= 100 else bc.INSDC_FEATURE_GAP

gene = feat['gene'] if feat.get('gene', None) else ''
product = f"(pseudo) {feat.get('product', '')}" if feat.get('pseudo', False) else feat.get('product', '')
fh.write('\t'.join([feat['contig'],
feat_type,
str(feat['start']),
str(feat['stop']),
feat['strand'],
feat.get('locus', ''),
gene,
product,
', '.join(sorted(feat.get('db_xrefs', [])))]))
fh.write('\t'.join(
[
feat['contig'],
feat_type,
str(feat['start']),
str(feat['stop']),
feat['strand'],
feat.get('locus', ''),
gene,
product,
', '.join(sorted(feat.get('db_xrefs', [])))
])
)
fh.write('\n')
if(feat_type == bc.FEATURE_CRISPR):
i = 0
while i < len(feat['spacers']):
repeat = feat['repeats'][i]
fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
fh.write('\n')
spacer = feat['spacers'][i]
fh.write('\t'.join([feat['contig'], 'CRISPR spacer', str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", '']))
fh.write('\n')
i += 1
repeat = feat['repeats'][i]
fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
fh.write('\n')
return


Expand Down
4 changes: 2 additions & 2 deletions test/test_args.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from inspect import Parameter
import os
import pytest

from pathlib import Path
from subprocess import run

import pytest

from .conftest import FILES, SKIP_PARAMETERS


Expand Down
102 changes: 44 additions & 58 deletions test/test_bakta.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import json
import subprocess as sp
import sys

from pathlib import Path
from subprocess import run

import pytest

import bakta.constants as bc

from .conftest import FILES, SKIP_PARAMETERS


Expand Down Expand Up @@ -58,24 +60,28 @@ def test_bakta_plasmid(tmpdir):
assert Path.exists(output_path)
assert output_path.stat().st_size > 0

output_path = tmpdir_path.joinpath('test.tsv')
feature_count, feature_counts = count_features(output_path)
assert feature_count == 3
results_path = tmpdir_path.joinpath('test.json')
results = None
with results_path.open() as fh:
results = json.load(fh)
assert results is not None
features = results['features']
assert len(features) == 3
feature_counts_expected = {
'tRNA': 0,
'tmRNA': 0,
'rRNA': 0,
'ncRNA': 0,
'ncRNA-region': 0,
'crispr': 0,
'sorf': 0,
'oriV': 0,
'oriC': 0,
'oriT': 0,
'cds': 3
bc.FEATURE_T_RNA: 0,
bc.FEATURE_TM_RNA: 0,
bc.FEATURE_R_RNA: 0,
bc.FEATURE_NC_RNA: 0,
bc.FEATURE_NC_RNA_REGION: 0,
bc.FEATURE_CRISPR: 0,
bc.FEATURE_CDS: 3,
bc.FEATURE_SORF: 0,
bc.FEATURE_ORIC: 0,
bc.FEATURE_ORIV: 0,
bc.FEATURE_ORIT: 0
}
for type in feature_counts:
assert feature_counts[type] == feature_counts_expected[type]
for type, count in feature_counts_expected.items():
assert len([f for f in features if f['type'] == type]) == count


@pytest.mark.parametrize(
Expand All @@ -96,46 +102,26 @@ def test_bakta_genome(db, tmpdir):
assert Path.exists(output_path)
assert output_path.stat().st_size > 0

output_path = tmpdir_path.joinpath('test.tsv')
feature_count, feature_counts = count_features(output_path)
assert feature_count == 5551
results_path = tmpdir_path.joinpath('test.json')
results = None
with results_path.open() as fh:
results = json.load(fh)
assert results is not None
features = results['features']
assert len(features) == 5551
feature_counts_expected = {
'tRNA': 107,
'tmRNA': 1,
'rRNA': 7,
'ncRNA': 57,
'ncRNA-region': 1,
'crispr': 1,
'sorf': 2,
'oriV': 0,
'oriC': 0,
'oriT': 0,
'cds': 5375
bc.FEATURE_T_RNA: 107,
bc.FEATURE_TM_RNA: 1,
bc.FEATURE_R_RNA: 7,
bc.FEATURE_NC_RNA: 57,
bc.FEATURE_NC_RNA_REGION: 1,
bc.FEATURE_CRISPR: 1,
bc.FEATURE_CDS: 5375,
bc.FEATURE_SORF: 2,
bc.FEATURE_ORIC: 0,
bc.FEATURE_ORIV: 0,
bc.FEATURE_ORIT: 0
}
for type in feature_counts:
assert feature_counts[type] == feature_counts_expected[type]


def count_features(file_path):
with open(file_path, 'r') as fh:
feature_count = 0
feature_counts = {
'tRNA': 0,
'tmRNA': 0,
'rRNA': 0,
'ncRNA': 0,
'ncRNA-region': 0,
'crispr': 0,
'sorf': 0,
'oriV': 0,
'oriC': 0,
'oriT': 0,
'cds': 0
}
for line in fh:
if not line.startswith('#'):
feature_count += 1
feature = line.split('\t')[1]
if feature in feature_counts:
feature_counts[feature] += 1
return feature_count, feature_counts
for type, count in feature_counts_expected.items():
assert len([f for f in features if f['type'] == type]) == count

1 change: 1 addition & 0 deletions test/test_bakta_proteins.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pytest


FILES = [
'test.tsv',
'test.hypotheticals.tsv',
Expand Down
43 changes: 43 additions & 0 deletions test/test_crispr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import json

from pathlib import Path
from subprocess import run


CRISPR_ARRAYS = [
{
'repeat_consensus': 'CGGTTTATCCCCGCTGGCGCGGGGAACACA',
'spacers': [
'AACCGAAACACACGATCAATCCGAATATGAG',
'TTGGTGACAGTTTTTGTCACTGTTTTGGTGA',
'CTAAGCATACATATCTGTTTTTAAACA'
],
'repeats': 3
}
]


def test_crispr_arrays(tmpdir):
proc = run(
[
'bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--prefix', 'test',
'--skip-tmrna', '--skip-trna', '--skip-rrna', '--skip-ncrna', '--skip-ncrna-region', '--skip-cds', '--skip-sorf', '--skip-ori', '--skip-gap', '--skip-plot',
'test/data/GCF_000008865.2.fna.gz'
]
)
assert proc.returncode == 0

results_path = Path(tmpdir).joinpath('test.json')
assert Path.exists(results_path)

results = None
with results_path.open() as fh:
results = json.load(fh)
assert results is not None

crispr_arrays = [feat for feat in results['features'] if feat['type'] == 'crispr']
assert len(crispr_arrays) == 1

for idx, crispr_array in enumerate(crispr_arrays):
assert crispr_array['repeat_consensus'] == CRISPR_ARRAYS[idx]['repeat_consensus']
assert len(crispr_array['repeats']) == CRISPR_ARRAYS[idx]['repeats']
Loading

0 comments on commit 53df2bf

Please sign in to comment.