Skip to content

Commit

Permalink
Merge pull request #421 from apriltuesday/parallel-evidence
Browse files Browse the repository at this point in the history
Issue 416 - Parallelise evidence string generation
  • Loading branch information
apriltuesday authored Mar 7, 2024
2 parents 0bfba70 + 0668213 commit 1e1afdb
Show file tree
Hide file tree
Showing 11 changed files with 365 additions and 115 deletions.
29 changes: 29 additions & 0 deletions bin/aggregate_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/usr/bin/env python3

import argparse
import glob

from cmat.output_generation.report import Report

parser = argparse.ArgumentParser('Aggregate counts reports')
parser.add_argument('--counts-yml', nargs='+', help='YAML files containing intermediate counts', required=True)


if __name__ == '__main__':
args = parser.parse_args()

# Load all the reports
filenames = [f for files in args.counts_yml for f in glob.glob(files)]
reports = []
for filename in filenames:
r = Report()
r.load_from_file(filename)
reports.append(r)

# Sum them up and output the results
complete_report = sum(reports, start=Report())
complete_report.print_report()
complete_report.dump_to_file(dir_out='.')
complete_report.write_unmapped_terms(dir_out='.')
if not complete_report.check_counts():
raise RuntimeError('Aggregate counts not consistent')
13 changes: 13 additions & 0 deletions bin/count_clinvar_rcvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env python3

import argparse

from cmat.clinvar_xml_io import ClinVarDataset

parser = argparse.ArgumentParser('Count number of RCV records in the XML, print to stdout')
parser.add_argument('--clinvar-xml', help='ClinVar XML release', required=True)


if __name__ == '__main__':
args = parser.parse_args()
print(sum(1 for _ in ClinVarDataset(args.clinvar_xml)))
4 changes: 3 additions & 1 deletion bin/evidence_string_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
parser.add_argument('--gene-mapping', help='Variant to gene & consequence mappings', required=True)
parser.add_argument('--ot-schema', help='OpenTargets schema JSON', required=True)
parser.add_argument('--out', help='Output directory', required=True)
parser.add_argument('--start', help='Start index (inclusive)', required=False, type=int)
parser.add_argument('--end', help='End index (exclusive)', required=False, type=int)


if __name__ == '__main__':
args = parser.parse_args()
clinvar_to_evidence_strings.launch_pipeline(
clinvar_xml_file=args.clinvar_xml, efo_mapping_file=args.efo_mapping, gene_mapping_file=args.gene_mapping,
ot_schema_file=args.ot_schema, dir_out=args.out)
ot_schema_file=args.ot_schema, dir_out=args.out, start=args.start, end=args.end)
6 changes: 6 additions & 0 deletions cmat/clinvar_xml_io/clinvar_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ class ClinVarRecord:

# Some allele origin terms in ClinVar are essentially conveying lack of information and are thus not useful.
NONSPECIFIC_ALLELE_ORIGINS = {'unknown', 'not provided', 'not applicable', 'tested-inconclusive', 'not-reported'}
# Some records have been flagged by ClinVar and should not be used.
INVALID_CLINICAL_SIGNFICANCES = {'no classifications from unflagged records'}

def __init__(self, rcv, trait_class=ClinVarTrait, measure_class=ClinVarRecordMeasure):
"""Initialise a ClinVar record object from an RCV XML record."""
Expand Down Expand Up @@ -143,6 +145,10 @@ def clinical_significance_list(self):
See /data-exploration/clinvar-variant-types/README.md for further explanation."""
return sorted(list(set(re.split('/|, |; ', self.clinical_significance_raw.lower().replace('_', ' ')))))

@property
def valid_clinical_significances(self):
return [cs for cs in self.clinical_significance_list if cs.lower() not in self.INVALID_CLINICAL_SIGNFICANCES]

@property
def allele_origins(self):
return {elem.text for elem in find_elements(self.rcv, './ObservedIn/Sample/Origin')}
Expand Down
134 changes: 33 additions & 101 deletions cmat/output_generation/clinvar_to_evidence_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import re
import sys
import os
from collections import defaultdict, Counter
from collections import defaultdict

import jsonschema

from cmat.clinvar_xml_io import ClinVarDataset
from cmat.output_generation import consequence_type as CT
from cmat.output_generation.report import Report

logger = logging.getLogger(__package__)

Expand All @@ -22,90 +23,6 @@
# Output file names.
EVIDENCE_STRINGS_FILE_NAME = 'evidence_strings.json'
EVIDENCE_RECORDS_FILE_NAME = 'evidence_records.tsv'
UNMAPPED_TRAITS_FILE_NAME = 'unmapped_traits.tsv'


class Report:
"""Holds counters and other records for a pipeline run."""

def __init__(self, trait_mappings, consequence_mappings):
self.report_strings = []

# The main evidence string counter.
self.evidence_string_count = 0
# Complete evidence strings are ones with an EFO mapping.
self.complete_evidence_string_count = 0

# ClinVar record counters.
self.clinvar_total = 0
self.clinvar_fatal_no_valid_traits = 0
self.clinvar_skip_unsupported_variation = 0
self.clinvar_skip_no_functional_consequences = 0
self.clinvar_skip_missing_efo_mapping = 0
self.clinvar_skip_invalid_evidence_string = 0
self.clinvar_done_one_complete_evidence_string = 0
self.clinvar_done_multiple_complete_evidence_strings = 0

# Total number of trait-to-ontology mappings present in the database.
self.total_trait_mappings = sum([len(mappings) for mappings in trait_mappings.values()])
# All distinct (trait name, EFO ID) mappings used in the evidence strings.
self.used_trait_mappings = set()
# All unmapped trait names which prevented evidence string generation and their counts.
self.unmapped_trait_names = Counter()

# Variant-to-consequence mapping counts.
self.total_consequence_mappings = sum([len(mappings) for mappings in consequence_mappings.values()])
self.repeat_expansion_variants = 0
self.structural_variants = 0

def print_report_and_check_counts(self):
"""Print report of counts and return True if counts are consistent, False otherwise."""
# ClinVar tallies.
clinvar_fatal = self.clinvar_fatal_no_valid_traits
clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
clinvar_done = (self.clinvar_done_one_complete_evidence_string +
self.clinvar_done_multiple_complete_evidence_strings)

report = f'''Total number of evidence strings generated\t{self.evidence_string_count}
Total number of complete evidence strings generated\t{self.complete_evidence_string_count}
Total number of ClinVar records\t{self.clinvar_total}
Fatal: No traits with valid names\t{self.clinvar_fatal_no_valid_traits}
Skipped: Can be rescued by future improvements\t{clinvar_skipped}
Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
No functional consequences\t{self.clinvar_skip_no_functional_consequences}
Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping}
Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string}
Done: Generated at least one complete evidence string\t{clinvar_done}
One complete evidence string\t{self.clinvar_done_one_complete_evidence_string}
Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings}
Percentage of all potentially supportable ClinVar records which generated at least one complete evidence string\t{
clinvar_done / (clinvar_skipped + clinvar_done):.1%}
Total number of trait-to-ontology mappings in the database\t{self.total_trait_mappings}
The number of distinct trait-to-ontology mappings used in the evidence strings\t{
len(self.used_trait_mappings)}
The number of distinct unmapped trait names which prevented complete evidence string generation\t{
len(self.unmapped_trait_names)}
Total number of variant to consequence mappings\t{self.total_consequence_mappings}
Number of repeat expansion variants\t{self.repeat_expansion_variants}
Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n')
print(report)

# Confirm counts as expected, exit with error if not.
expected_total = clinvar_fatal + clinvar_skipped + clinvar_done
if expected_total != self.clinvar_total:
logger.error(f'ClinVar evidence string tallies do not add up to the total amount: '
f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}')
return False
return True

def write_unmapped_terms(self, dir_out):
with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file:
for trait, number_of_occurrences in sorted(self.unmapped_trait_names.items(), key=lambda x: -x[1]):
unmapped_traits_file.write(f'{trait}\t{number_of_occurrences}\n')


def validate_evidence_string(ev_string, ot_schema_contents):
Expand All @@ -122,29 +39,38 @@ def validate_evidence_string(ev_string, ot_schema_contents):
sys.exit(1)


def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out):
def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_schema_file, dir_out, start, end):
os.makedirs(dir_out, exist_ok=True)
string_to_efo_mappings, _ = load_ontology_mapping(efo_mapping_file)
variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)

report, exception_raised = clinvar_to_evidence_strings(
string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file,
output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME))
counts_consistent = report.print_report_and_check_counts()
report.write_unmapped_terms(dir_out)
output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME), start=start, end=end)
counts_consistent = report.check_counts()
report.print_report()
report.dump_to_file(dir_out)
if exception_raised or not counts_consistent:
sys.exit(1)


def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema,
output_evidence_strings):
output_evidence_strings, start=None, end=None):
report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings)
ot_schema_contents = json.loads(open(ot_schema).read())
output_evidence_strings_file = open(output_evidence_strings, 'wt')
exception_raised = False

logger.info('Processing ClinVar records')
i = -1
for clinvar_record in ClinVarDataset(clinvar_xml):
# If start & end provided, only process records in the range [start, end)
i += 1
if start and i < start:
continue
if end and i >= end:
break

report.clinvar_total += 1
if report.clinvar_total % 1000 == 0:
logger.info(f'{report.clinvar_total} records processed')
Expand All @@ -156,20 +82,25 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
if not clinvar_record.traits_with_valid_names:
report.clinvar_fatal_no_valid_traits += 1
continue
# Failure mode 2 (fatal). A ClinVar record contains no valid clinical significance terms, likely due to
# submissions being flagged.
if not clinvar_record.valid_clinical_significances:
report.clinvar_fatal_no_clinical_significance += 1
continue

# Failure mode 2 (skip). A ClinVar record contains an unsupported variation type.
# Failure mode 3 (skip). A ClinVar record contains an unsupported variation type.
if clinvar_record.measure is None:
report.clinvar_skip_unsupported_variation += 1
continue

# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid allele
# origins, (2) EFO mappings, and (3) genes where the variant has effect.
# Within each ClinVar record, an evidence string is generated for all possible permutations of (1) valid
# allele origins, (2) EFO mappings, and (3) genes where the variant has effect.
grouped_allele_origins = convert_allele_origins(clinvar_record.valid_allele_origins)
consequence_types, _ = get_consequence_types(clinvar_record.measure, variant_to_gene_mappings)
grouped_diseases = group_diseases_by_efo_mapping(clinvar_record.traits_with_valid_names,
string_to_efo_mappings)

# Failure mode 3 (skip). No functional consequences are available.
# Failure mode 4 (skip). No functional consequences are available.
if not consequence_types:
report.clinvar_skip_no_functional_consequences += 1
continue
Expand All @@ -180,9 +111,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
if is_structural_variant(clinvar_record.measure):
report.structural_variants += len(consequence_types)

# Failure mode 4 (skip). A ClinVar record has at least one trait with at least one valid name, but no suitable
# EFO mappings were found in the database. This will still generate an evidence string, but is tracked as a
# failure so we can continue to measure mapping coverage.
# Failure mode 5 (skip). A ClinVar record has at least one trait with at least one valid name, but no
# suitable EFO mappings were found in the database. This will still generate an evidence string, but is
# tracked as a failure so we can continue to measure mapping coverage.
if not any(group[-1] for group in grouped_diseases):
report.clinvar_skip_missing_efo_mapping += 1
unmapped_trait_name = clinvar_record.traits_with_valid_names[0].preferred_or_other_valid_name
Expand All @@ -196,8 +127,9 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
for allele_origins, disease_attributes, consequence_attributes in itertools.product(
grouped_allele_origins, grouped_diseases, consequence_types):
disease_name, disease_source_id, disease_mapped_efo_id = disease_attributes
evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name, disease_source_id,
disease_mapped_efo_id, consequence_attributes)
evidence_string = generate_evidence_string(clinvar_record, allele_origins, disease_name,
disease_source_id, disease_mapped_efo_id,
consequence_attributes)

# Validate and immediately output the evidence string (not keeping everything in memory).
is_valid = validate_evidence_string(evidence_string, ot_schema_contents)
Expand All @@ -208,7 +140,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
evidence_strings_generated += 1
if disease_mapped_efo_id is not None:
complete_evidence_strings_generated += 1
report.used_trait_mappings.add((disease_name, disease_mapped_efo_id))
report.used_trait_mappings.add(disease_mapped_efo_id)

if complete_evidence_strings_generated == 1:
report.clinvar_done_one_complete_evidence_string += 1
Expand Down Expand Up @@ -259,7 +191,7 @@ def generate_evidence_string(clinvar_record, allele_origins, disease_name, disea
'allelicRequirements': clinvar_record.mode_of_inheritance,

# Levels of clinical significance reported for the variant.
'clinicalSignificances': clinvar_record.clinical_significance_list,
'clinicalSignificances': clinvar_record.valid_clinical_significances,

# Confidence (review status).
'confidence': clinvar_record.review_status,
Expand Down
Loading

0 comments on commit 1e1afdb

Please sign in to comment.