Improve PILER-CR CRISPR parser (#249)

* implement parsing of CRISPR repeats & spacers * write CRISPR repeats & spacers to GFF * add CRISPR repeat & spacer output to TSV * add CRISPR PILER-CR test * refactor test * fix full black box test for expanded CRISPR features * refactor test code
oschwengers · Oct 24, 2023 · 53df2bf · 53df2bf
1 parent 358ac16
commit 53df2bf
Show file tree

Hide file tree

Showing 13 changed files with 269 additions and 152 deletions.
diff --git a/bakta/constants.py b/bakta/constants.py
@@ -97,6 +97,8 @@
 FEATURE_NC_RNA = 'ncRNA'
 FEATURE_NC_RNA_REGION = 'ncRNA-region'
 FEATURE_CRISPR = 'crispr'
+FEATURE_CRISPR_REPEAT = 'crispr-repeat'
+FEATURE_CRISPR_SPACER = 'crispr-spacer'
 FEATURE_ORF = 'orf'
 FEATURE_SORF = 'sorf'
 FEATURE_CDS = 'cds'

diff --git a/bakta/features/crispr.py b/bakta/features/crispr.py
@@ -38,18 +38,72 @@ def predict_crispr(genome: dict, contigs_path: Path):
         log.warning('CRISPRs failed! pilercr-error-code=%d', proc.returncode)
         raise Exception(f'PILER-CR error! error code: {proc.returncode}')
 
-    # parse orfs
-    crispr_arrays = []
+    # parse crispr arrays
+    crispr_arrays = {}
     contigs = {c['id']: c for c in genome['contigs']}
     with output_path.open() as fh:
+        output_section = None
         contig_id = None
+        array_id = None
         skip_lines = True
+        crispr_array = None
+        gap_count = 0
         for line in fh:
             line = line.strip()
-            if(line == 'SUMMARY BY POSITION'):
+            if(line == ''):
+                continue
+            if(line == 'DETAIL REPORT'):
+                output_section = 'DETAIL'
+                skip_lines = False
+            elif(line == 'SUMMARY BY POSITION'):
+                output_section = 'POSITION'
+                skip_lines = False
+            elif(line == 'SUMMARY BY SIMILARITY'):
+                output_section = 'SIMILARITY'
                 skip_lines = False
             elif(skip_lines is False):
-                if(len(line) > 0):
+                if(output_section == 'DETAIL'):
+                    if(line[0:5] == 'Array'):
+                        gap_count = 0
+                        array_id = line.split()[1]
+                        crispr_array = OrderedDict()
+                        crispr_array['type'] = bc.FEATURE_CRISPR
+                        crispr_array['strand'] = bc.STRAND_UNKNOWN
+                        crispr_array['repeats'] = []
+                        crispr_array['spacers'] = []
+                        crispr_arrays[array_id] = crispr_array
+                    elif(line[0] == '>'):
+                        contig_id = line[1:]
+                        crispr_array['contig'] = contig_id
+                    elif(line[0] != '='):
+                        cols = line.split()
+                        if(len(cols) == 7  and  cols[0] != 'Pos'):
+                            (position, repeat_length, id, spacer_length, left_flank, repeat_seq, spacer_seq) = cols
+                            position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length)
+                            spacer_seq = spacer_seq.upper()
+                            crispr_repeat = OrderedDict()
+                            crispr_repeat['strand'] = bc.STRAND_UNKNOWN
+                            crispr_repeat['start'] = position - gap_count
+                            crispr_repeat['stop'] = position + repeat_length - 1 - gap_count
+                            crispr_array['repeats'].append(crispr_repeat)
+                            gap_count += repeat_seq.count('-')  # correct wrong PILER-CR detail positions by gaps
+                            crispr_spacer = OrderedDict()
+                            crispr_spacer['strand'] = bc.STRAND_UNKNOWN
+                            crispr_spacer['start'] = position + repeat_length  - gap_count
+                            crispr_spacer['stop'] = position + repeat_length + spacer_length - 1 - gap_count
+                            crispr_spacer['sequence'] = spacer_seq
+                            crispr_array['spacers'].append(crispr_spacer)
+                            spacer_genome_seq = bu.extract_feature_sequence(crispr_spacer, contigs[contig_id])
+                            assert spacer_seq == spacer_genome_seq  # assure PILER-CR spacer sequence equal extraction from genome
+                        elif(len(cols) == 6  and  cols[0] != 'Pos'):  # last line in array without spacer
+                            (position, repeat_length, id, left_flank, repeat_seq, spacer_seq) = cols
+                            position, repeat_length, spacer_length = int(position), int(repeat_length), int(spacer_length)
+                            crispr_repeat = OrderedDict()
+                            crispr_repeat['strand'] = bc.STRAND_UNKNOWN
+                            crispr_repeat['start'] = position - gap_count
+                            crispr_repeat['stop'] = position + repeat_length - 1 - gap_count
+                            crispr_array['repeats'].append(crispr_repeat)
+                elif(output_section == 'POSITION'):
                     if(line[0] == '>'):
                         contig_id = line[1:]
                     elif(line[0] != 'A' and line[0] != '='):
@@ -58,27 +112,22 @@ def predict_crispr(genome: dict, contigs_path: Path):
                             (array_id, contig, position, length, copies, repeat_length, spacer_length, repeat_consensus) = cols
                         else:
                             (array_id, contig, position, length, copies, repeat_length, spacer_length, distance, repeat_consensus) = cols
+                        crispr_array = crispr_arrays[array_id]
+                        crispr_array['start'] = int(position)
+                        crispr_array['stop'] = int(position) + int(length) - 1
+                        crispr_array['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}'
+                        crispr_array['spacer_length'] = int(spacer_length)
+                        crispr_array['repeat_length'] = int(repeat_length)
+                        assert len(crispr_array['repeats']) == int(copies), print(f"len(reps)={len(crispr_array['repeats'])}, int(copies)={int(copies)}")
+                        crispr_array['repeat_consensus'] = repeat_consensus
+                        crispr_array['db_xrefs'] = [so.SO_CRISPR.id]
 
-                        crispr = OrderedDict()
-                        crispr['type'] = bc.FEATURE_CRISPR
-                        crispr['contig'] = contig_id
-                        crispr['start'] = int(position)
-                        crispr['stop'] = int(position) + int(length) - 1
-                        crispr['strand'] = bc.STRAND_UNKNOWN
-                        crispr['product'] = f'CRISPR array with {copies} repeats of length {repeat_length}, consensus sequence {repeat_consensus} and spacer length {spacer_length}'
-                        crispr['spacer_length'] = int(spacer_length)
-                        crispr['repeat_length'] = int(repeat_length)
-                        crispr['repeats'] = int(copies)
-                        crispr['repeat_consensus'] = repeat_consensus
-                        crispr['db_xrefs'] = [so.SO_CRISPR.id]
-
-                        nt = bu.extract_feature_sequence(crispr, contigs[contig_id])  # extract nt sequences
-                        crispr['nt'] = nt
-
-                        crispr_arrays.append(crispr)
+                        nt = bu.extract_feature_sequence(crispr_array, contigs[contig_id])  # extract nt sequences
+                        crispr_array['nt'] = nt
                         log.info(
                             'contig=%s, start=%i, stop=%i, spacer-length=%i, repeat-length=%i, # repeats=%i, repeat-consensus=%s, nt=[%s..%s]',
-                            crispr['contig'], crispr['start'], crispr['stop'], crispr['spacer_length'], crispr['repeat_length'], crispr['repeats'], crispr['repeat_consensus'], nt[:10], nt[-10:]
+                            crispr_array['contig'], crispr_array['start'], crispr_array['stop'], crispr_array['spacer_length'], crispr_array['repeat_length'], len(crispr_array['repeats']), crispr_array['repeat_consensus'], nt[:10], nt[-10:]
                         )
+    crispr_arrays = crispr_arrays.values()                        
     log.info('predicted=%i', len(crispr_arrays))
     return crispr_arrays
diff --git a/bakta/io/gff.py b/bakta/io/gff.py
@@ -186,6 +186,29 @@ def write_gff3(genome: dict, features_by_contig: Dict[str, dict], gff3_path: Pat
                         annotations[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feat['repeat_consensus']
                     annotations = encode_annotations(annotations)
                     fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
+                    if(not cfg.compliant):
+                        i = 0
+                        while i < len(feat['spacers']):
+                            repeat = feat['repeats'][i]
+                            annotations = {
+                                'ID': f"{feat['id']}_repeat_{i+1}"
+                            }
+                            annotations = encode_annotations(annotations)
+                            fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_REPEAT}\t{repeat['start']}\t{repeat['stop']}\t.\t{repeat['strand']}\t.\t{annotations}\n")
+                            spacer = feat['spacers'][i]
+                            annotations = {
+                                'ID': f"{feat['id']}_spacer_{i+1}",
+                                'sequence': spacer['sequence']
+                            }
+                            annotations = encode_annotations(annotations)
+                            fh.write(f"{feat['contig']}\tPILER-CR\t{bc.FEATURE_CRISPR_SPACER}\t{spacer['start']}\t{spacer['stop']}\t.\t{spacer['strand']}\t.\t{annotations}\n")
+                            i += 1
+                        repeat = feat['repeats'][i]
+                        annotations = {
+                            'ID': f"{feat['id']}_repeat_{i+1}"
+                        }
+                        annotations = encode_annotations(annotations)
+                        fh.write(f"{feat['contig']}\tPILER-CR\t{feat_type}\t{start}\t{stop}\t.\t{feat['strand']}\t.\t{annotations}\n")
                 elif(feat['type'] is bc.FEATURE_CDS):
                     annotations = {
                         'ID': feat['locus'],

diff --git a/bakta/io/tsv.py b/bakta/io/tsv.py
@@ -27,21 +27,38 @@ def write_tsv(contigs: Sequence[dict], features_by_contig: Dict[str, dict], tsv_
         for contig in contigs:
             for feat in features_by_contig[contig['id']]:
                 feat_type = feat['type']
-                if(feat['type'] == bc.FEATURE_GAP):
+                if(feat_type == bc.FEATURE_GAP):
                     feat_type = bc.INSDC_FEATURE_ASSEMBLY_GAP if feat['length'] >= 100 else bc.INSDC_FEATURE_GAP
 
                 gene = feat['gene'] if feat.get('gene', None) else ''
                 product = f"(pseudo) {feat.get('product', '')}" if feat.get('pseudo', False) else feat.get('product', '')
-                fh.write('\t'.join([feat['contig'],
-                                    feat_type,
-                                    str(feat['start']),
-                                    str(feat['stop']),
-                                    feat['strand'],
-                                    feat.get('locus', ''),
-                                    gene,
-                                    product,
-                                    ', '.join(sorted(feat.get('db_xrefs', [])))]))
+                fh.write('\t'.join(
+                    [
+                        feat['contig'],
+                        feat_type,
+                        str(feat['start']),
+                        str(feat['stop']),
+                        feat['strand'],
+                        feat.get('locus', ''),
+                        gene,
+                        product,
+                        ', '.join(sorted(feat.get('db_xrefs', [])))
+                    ])
+                )
                 fh.write('\n')
+                if(feat_type == bc.FEATURE_CRISPR):
+                    i = 0
+                    while i < len(feat['spacers']):
+                        repeat = feat['repeats'][i]
+                        fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
+                        fh.write('\n')
+                        spacer = feat['spacers'][i]
+                        fh.write('\t'.join([feat['contig'], 'CRISPR spacer', str(spacer['start']), str(spacer['stop']), spacer['strand'], '', '', f"CRISPR spacer, sequence {spacer['sequence']}", '']))
+                        fh.write('\n')
+                        i += 1
+                    repeat = feat['repeats'][i]
+                    fh.write('\t'.join([feat['contig'], 'CRISPR repeat', str(repeat['start']), str(repeat['stop']), repeat['strand'], '', '', f"CRISPR repeat", '']))
+                    fh.write('\n')
     return
 
 

diff --git a/test/test_args.py b/test/test_args.py
@@ -1,10 +1,10 @@
-from inspect import Parameter
 import os
-import pytest
 
 from pathlib import Path
 from subprocess import run
 
+import pytest
+
 from .conftest import FILES, SKIP_PARAMETERS
 
 

diff --git a/test/test_bakta.py b/test/test_bakta.py
@@ -1,11 +1,13 @@
+import json
 import subprocess as sp
-import sys
 
 from pathlib import Path
 from subprocess import run
 
 import pytest
 
+import bakta.constants as bc
+
 from .conftest import FILES, SKIP_PARAMETERS
 
 
@@ -58,24 +60,28 @@ def test_bakta_plasmid(tmpdir):
         assert Path.exists(output_path)
         assert output_path.stat().st_size > 0
 
-    output_path = tmpdir_path.joinpath('test.tsv')
-    feature_count, feature_counts = count_features(output_path)
-    assert feature_count == 3
+    results_path = tmpdir_path.joinpath('test.json')
+    results = None
+    with results_path.open() as fh:
+        results = json.load(fh)
+    assert results is not None
+    features = results['features']
+    assert len(features) == 3
     feature_counts_expected = {
-        'tRNA': 0,
-        'tmRNA': 0,
-        'rRNA': 0,
-        'ncRNA': 0,
-        'ncRNA-region': 0,
-        'crispr': 0,
-        'sorf': 0,
-        'oriV': 0,
-        'oriC': 0,
-        'oriT': 0,
-        'cds': 3
+        bc.FEATURE_T_RNA: 0,
+        bc.FEATURE_TM_RNA: 0,
+        bc.FEATURE_R_RNA: 0,
+        bc.FEATURE_NC_RNA: 0,
+        bc.FEATURE_NC_RNA_REGION: 0,
+        bc.FEATURE_CRISPR: 0,
+        bc.FEATURE_CDS: 3,
+        bc.FEATURE_SORF: 0,
+        bc.FEATURE_ORIC: 0,
+        bc.FEATURE_ORIV: 0,
+        bc.FEATURE_ORIT: 0
     }
-    for type in feature_counts:
-        assert feature_counts[type] == feature_counts_expected[type]
+    for type, count in feature_counts_expected.items():
+        assert len([f for f in features if f['type'] == type]) == count
 
 
 @pytest.mark.parametrize(
@@ -96,46 +102,26 @@ def test_bakta_genome(db, tmpdir):
         assert Path.exists(output_path)
         assert output_path.stat().st_size > 0
 
-    output_path = tmpdir_path.joinpath('test.tsv')
-    feature_count, feature_counts = count_features(output_path)
-    assert feature_count == 5551
+    results_path = tmpdir_path.joinpath('test.json')
+    results = None
+    with results_path.open() as fh:
+        results = json.load(fh)
+    assert results is not None
+    features = results['features']
+    assert len(features) == 5551
     feature_counts_expected = {
-        'tRNA': 107,
-        'tmRNA': 1,
-        'rRNA': 7,
-        'ncRNA': 57,
-        'ncRNA-region': 1,
-        'crispr': 1,
-        'sorf': 2,
-        'oriV': 0,
-        'oriC': 0,
-        'oriT': 0,
-        'cds': 5375
+        bc.FEATURE_T_RNA: 107,
+        bc.FEATURE_TM_RNA: 1,
+        bc.FEATURE_R_RNA: 7,
+        bc.FEATURE_NC_RNA: 57,
+        bc.FEATURE_NC_RNA_REGION: 1,
+        bc.FEATURE_CRISPR: 1,
+        bc.FEATURE_CDS: 5375,
+        bc.FEATURE_SORF: 2,
+        bc.FEATURE_ORIC: 0,
+        bc.FEATURE_ORIV: 0,
+        bc.FEATURE_ORIT: 0
     }
-    for type in feature_counts:
-        assert feature_counts[type] == feature_counts_expected[type]
-
-
-def count_features(file_path):
-    with open(file_path, 'r') as fh:
-        feature_count = 0
-        feature_counts = {
-            'tRNA': 0,
-            'tmRNA': 0,
-            'rRNA': 0,
-            'ncRNA': 0,
-            'ncRNA-region': 0,
-            'crispr': 0,
-            'sorf': 0,
-            'oriV': 0,
-            'oriC': 0,
-            'oriT': 0,
-            'cds': 0
-        }
-        for line in fh:
-            if not line.startswith('#'):
-                feature_count += 1
-                feature = line.split('\t')[1]
-                if feature in feature_counts:
-                    feature_counts[feature] += 1
-    return feature_count, feature_counts
+    for type, count in feature_counts_expected.items():
+        assert len([f for f in features if f['type'] == type]) == count
+
diff --git a/test/test_bakta_proteins.py b/test/test_bakta_proteins.py
@@ -3,6 +3,7 @@
 
 import pytest
 
+
 FILES = [
     'test.tsv',
     'test.hypotheticals.tsv',

diff --git a/test/test_crispr.py b/test/test_crispr.py
@@ -0,0 +1,43 @@
+import json
+
+from pathlib import Path
+from subprocess import run
+
+
+CRISPR_ARRAYS = [
+    {
+        'repeat_consensus': 'CGGTTTATCCCCGCTGGCGCGGGGAACACA',
+        'spacers': [
+            'AACCGAAACACACGATCAATCCGAATATGAG',
+            'TTGGTGACAGTTTTTGTCACTGTTTTGGTGA',
+            'CTAAGCATACATATCTGTTTTTAAACA'
+        ],
+        'repeats': 3
+    }
+]
+
+
+def test_crispr_arrays(tmpdir):
+    proc = run(
+        [
+            'bin/bakta', '--db', 'test/db', '--output', tmpdir, '--force', '--prefix', 'test',
+            '--skip-tmrna', '--skip-trna', '--skip-rrna', '--skip-ncrna', '--skip-ncrna-region', '--skip-cds', '--skip-sorf', '--skip-ori', '--skip-gap', '--skip-plot',
+            'test/data/GCF_000008865.2.fna.gz'
+        ]
+    )
+    assert proc.returncode == 0
+
+    results_path = Path(tmpdir).joinpath('test.json')
+    assert Path.exists(results_path)
+
+    results = None
+    with results_path.open() as fh:
+        results = json.load(fh)
+    assert results is not None
+
+    crispr_arrays = [feat for feat in results['features'] if feat['type'] == 'crispr']
+    assert len(crispr_arrays) == 1
+
+    for idx, crispr_array in enumerate(crispr_arrays):
+        assert crispr_array['repeat_consensus'] == CRISPR_ARRAYS[idx]['repeat_consensus']
+        assert len(crispr_array['repeats']) == CRISPR_ARRAYS[idx]['repeats']