From cea3f43031cacaa9cc8b0bd9ee9144823ee29843 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 7 Feb 2024 08:59:59 +0000 Subject: [PATCH 1/3] modify counts for when evidence is invalid --- .../clinvar_to_evidence_strings.py | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 3134b5b2..f35a96e1 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -42,6 +42,7 @@ def __init__(self, trait_mappings, consequence_mappings): self.clinvar_skip_unsupported_variation = 0 self.clinvar_skip_no_functional_consequences = 0 self.clinvar_skip_missing_efo_mapping = 0 + self.clinvar_skip_invalid_evidence_string = 0 self.clinvar_done_one_complete_evidence_string = 0 self.clinvar_done_multiple_complete_evidence_strings = 0 @@ -57,17 +58,15 @@ def __init__(self, trait_mappings, consequence_mappings): self.repeat_expansion_variants = 0 self.structural_variants = 0 - def collate_report(self): + def print_report_and_check_counts(self): # ClinVar tallies. clinvar_fatal = self.clinvar_fatal_no_valid_traits clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + - self.clinvar_skip_missing_efo_mapping) + self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string) clinvar_done = (self.clinvar_done_one_complete_evidence_string + self.clinvar_done_multiple_complete_evidence_strings) - assert clinvar_fatal + clinvar_skipped + clinvar_done == self.clinvar_total, \ - 'ClinVar evidence string tallies do not add up to the total amount.' - return f'''Total number of evidence strings generated\t{self.evidence_string_count} + report = f'''Total number of evidence strings generated\t{self.evidence_string_count} Total number of complete evidence strings generated\t{self.complete_evidence_string_count} Total number of ClinVar records\t{self.clinvar_total} @@ -91,6 +90,14 @@ def collate_report(self): Total number of variant to consequence mappings\t{self.total_consequence_mappings} Number of repeat expansion variants\t{self.repeat_expansion_variants} Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n') + print(report) + + # Confirm counts as expected, exit with error if not. + expected_total = clinvar_fatal + clinvar_skipped + clinvar_done + if expected_total != self.clinvar_total: + logger.error(f'ClinVar evidence string tallies do not add up to the total amount: ' + f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}') + sys.exit(1) def write_unmapped_terms(self, dir_out): with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file: @@ -120,7 +127,7 @@ def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_sc report = clinvar_to_evidence_strings( string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file, output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME)) - print(report.collate_report()) + report.print_report_and_check_counts() report.write_unmapped_terms(dir_out) @@ -201,11 +208,15 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings report.clinvar_done_one_complete_evidence_string += 1 elif complete_evidence_strings_generated > 1: report.clinvar_done_multiple_complete_evidence_strings += 1 + else: + report.clinvar_skip_invalid_evidence_string += 1 report.complete_evidence_string_count += complete_evidence_strings_generated report.evidence_string_count += evidence_strings_generated except Exception as e: + # Note while we catch exceptions here, this may or may not cause inconsistencies in the counts, + # in which case the pipeline will crash after processing all records and printing the report. logger.error(f'Problem generating evidence for {clinvar_record.accession}') logger.error(f'Error: {e}') continue From 6b7d6064e9388b09433265b72832b46903cbf638 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 7 Feb 2024 10:23:29 +0000 Subject: [PATCH 2/3] fix count and report invalid evidence --- cmat/output_generation/clinvar_to_evidence_strings.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index f35a96e1..9ff2d2be 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -75,6 +75,7 @@ def print_report_and_check_counts(self): Unsupported variation type\t{self.clinvar_skip_unsupported_variation} No functional consequences\t{self.clinvar_skip_no_functional_consequences} Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping} + Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string} Done: Generated at least one complete evidence string\t{clinvar_done} One complete evidence string\t{self.clinvar_done_one_complete_evidence_string} Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings} @@ -208,7 +209,8 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings report.clinvar_done_one_complete_evidence_string += 1 elif complete_evidence_strings_generated > 1: report.clinvar_done_multiple_complete_evidence_strings += 1 - else: + + if evidence_strings_generated == 0: report.clinvar_skip_invalid_evidence_string += 1 report.complete_evidence_string_count += complete_evidence_strings_generated From be2dd91468a6046cc5af2e22d51e159f711bae61 Mon Sep 17 00:00:00 2001 From: April Shen Date: Wed, 7 Feb 2024 11:52:29 +0000 Subject: [PATCH 3/3] exit if any exception thrown and after all steps complete --- .../clinvar_to_evidence_strings.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py index 9ff2d2be..74ffd48f 100644 --- a/cmat/output_generation/clinvar_to_evidence_strings.py +++ b/cmat/output_generation/clinvar_to_evidence_strings.py @@ -59,6 +59,7 @@ def __init__(self, trait_mappings, consequence_mappings): self.structural_variants = 0 def print_report_and_check_counts(self): + """Print report of counts and return True if counts are consistent, False otherwise.""" # ClinVar tallies. clinvar_fatal = self.clinvar_fatal_no_valid_traits clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences + @@ -98,7 +99,8 @@ def print_report_and_check_counts(self): if expected_total != self.clinvar_total: logger.error(f'ClinVar evidence string tallies do not add up to the total amount: ' f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}') - sys.exit(1) + return False + return True def write_unmapped_terms(self, dir_out): with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file: @@ -125,11 +127,13 @@ def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_sc string_to_efo_mappings, _ = load_ontology_mapping(efo_mapping_file) variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file) - report = clinvar_to_evidence_strings( + report, exception_raised = clinvar_to_evidence_strings( string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file, output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME)) - report.print_report_and_check_counts() + counts_consistent = report.print_report_and_check_counts() report.write_unmapped_terms(dir_out) + if exception_raised or not counts_consistent: + sys.exit(1) def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema, @@ -137,6 +141,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings) ot_schema_contents = json.loads(open(ot_schema).read()) output_evidence_strings_file = open(output_evidence_strings, 'wt') + exception_raised = False logger.info('Processing ClinVar records') for clinvar_record in ClinVarDataset(clinvar_xml): @@ -217,14 +222,15 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings report.evidence_string_count += evidence_strings_generated except Exception as e: - # Note while we catch exceptions here, this may or may not cause inconsistencies in the counts, - # in which case the pipeline will crash after processing all records and printing the report. + # We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all + # records and printing the report. logger.error(f'Problem generating evidence for {clinvar_record.accession}') logger.error(f'Error: {e}') + exception_raised = True continue output_evidence_strings_file.close() - return report + return report, exception_raised def format_creation_date(s):