diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 3134b5b2..74ffd48f 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -42,6 +42,7 @@ def __init__(self, trait_mappings, consequence_mappings): self.clinvar_skip_unsupported_variation = 0 self.clinvar_skip_no_functional_consequences = 0 self.clinvar_skip_missing_efo_mapping = 0 + self.clinvar_skip_invalid_evidence_string = 0 self.clinvar_done_one_complete_evidence_string = 0 self.clinvar_done_multiple_complete_evidence_strings = 0 @@ -57,17 +58,16 @@ def __init__(self, trait_mappings, consequence_mappings): self.repeat_expansion_variants = 0 self.structural_variants = 0 - def collate_report(self): + def print_report_and_check_counts(self): + """Print report of counts and return True if counts are consistent, False otherwise.""" # ClinVar tallies. clinvar_fatal = self.clinvar_fatal_no_valid_traits clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + - self.clinvar_skip_missing_efo_mapping) + self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string) clinvar_done = (self.clinvar_done_one_complete_evidence_string + self.clinvar_done_multiple_complete_evidence_strings) - assert clinvar_fatal + clinvar_skipped + clinvar_done == self.clinvar_total, \ - 'ClinVar evidence string tallies do not add up to the total amount.' - return f'''Total number of evidence strings generated\t{self.evidence_string_count} + report = f'''Total number of evidence strings generated\t{self.evidence_string_count} Total number of complete evidence strings generated\t{self.complete_evidence_string_count} Total number of ClinVar records\t{self.clinvar_total} @@ -76,6 +76,7 @@ def collate_report(self): Unsupported variation type\t{self.clinvar_skip_unsupported_variation} No functional consequences\t{self.clinvar_skip_no_functional_consequences} Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping} + Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string} Done: Generated at least one complete evidence string\t{clinvar_done} One complete evidence string\t{self.clinvar_done_one_complete_evidence_string} Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings} @@ -91,6 +92,15 @@ def collate_report(self): Total number of variant to consequence mappings\t{self.total_consequence_mappings} Number of repeat expansion variants\t{self.repeat_expansion_variants} Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n') + print(report) + + # Confirm counts as expected, exit with error if not. + expected_total = clinvar_fatal + clinvar_skipped + clinvar_done + if expected_total != self.clinvar_total: + logger.error(f'ClinVar evidence string tallies do not add up to the total amount: ' + f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}') + return False + return True def write_unmapped_terms(self, dir_out): with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file: @@ -117,11 +127,13 @@ def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_sc string_to_efo_mappings, _ = load_ontology_mapping(efo_mapping_file) variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file) - report = clinvar_to_evidence_strings( + report, exception_raised = clinvar_to_evidence_strings( string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file, output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME)) - print(report.collate_report()) + counts_consistent = report.print_report_and_check_counts() report.write_unmapped_terms(dir_out) + if exception_raised or not counts_consistent: + sys.exit(1) def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema, @@ -129,6 +141,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings) ot_schema_contents = json.loads(open(ot_schema).read()) output_evidence_strings_file = open(output_evidence_strings, 'wt') + exception_raised = False logger.info('Processing ClinVar records') for clinvar_record in ClinVarDataset(clinvar_xml): @@ -202,16 +215,22 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings elif complete_evidence_strings_generated > 1: report.clinvar_done_multiple_complete_evidence_strings += 1 + if evidence_strings_generated == 0: + report.clinvar_skip_invalid_evidence_string += 1 + report.complete_evidence_string_count += complete_evidence_strings_generated report.evidence_string_count += evidence_strings_generated except Exception as e: + # We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all + # records and printing the report. logger.error(f'Problem generating evidence for {clinvar_record.accession}') logger.error(f'Error: {e}') + exception_raised = True continue output_evidence_strings_file.close() - return report + return report, exception_raised def format_creation_date(s):