From cea3f43031cacaa9cc8b0bd9ee9144823ee29843 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Wed, 7 Feb 2024 08:59:59 +0000
Subject: [PATCH 1/3] modify counts for when evidence is invalid

---
 .../clinvar_to_evidence_strings.py            | 23 ++++++++++++++-----
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py
index 3134b5b2..f35a96e1 100644
--- a/cmat/output_generation/clinvar_to_evidence_strings.py
+++ b/cmat/output_generation/clinvar_to_evidence_strings.py
@@ -42,6 +42,7 @@ def __init__(self, trait_mappings, consequence_mappings):
         self.clinvar_skip_unsupported_variation = 0
         self.clinvar_skip_no_functional_consequences = 0
         self.clinvar_skip_missing_efo_mapping = 0
+        self.clinvar_skip_invalid_evidence_string = 0
         self.clinvar_done_one_complete_evidence_string = 0
         self.clinvar_done_multiple_complete_evidence_strings = 0
 
@@ -57,17 +58,15 @@ def __init__(self, trait_mappings, consequence_mappings):
         self.repeat_expansion_variants = 0
         self.structural_variants = 0
 
-    def collate_report(self):
+    def print_report_and_check_counts(self):
         # ClinVar tallies.
         clinvar_fatal = self.clinvar_fatal_no_valid_traits
         clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
-                           self.clinvar_skip_missing_efo_mapping)
+                           self.clinvar_skip_missing_efo_mapping + self.clinvar_skip_invalid_evidence_string)
         clinvar_done = (self.clinvar_done_one_complete_evidence_string +
                         self.clinvar_done_multiple_complete_evidence_strings)
-        assert clinvar_fatal + clinvar_skipped + clinvar_done == self.clinvar_total, \
-            'ClinVar evidence string tallies do not add up to the total amount.'
 
-        return f'''Total number of evidence strings generated\t{self.evidence_string_count}
+        report = f'''Total number of evidence strings generated\t{self.evidence_string_count}
             Total number of complete evidence strings generated\t{self.complete_evidence_string_count}
 
             Total number of ClinVar records\t{self.clinvar_total}
@@ -91,6 +90,14 @@ def collate_report(self):
             Total number of variant to consequence mappings\t{self.total_consequence_mappings}
                 Number of repeat expansion variants\t{self.repeat_expansion_variants}
                 Number of structural variants \t{self.structural_variants}'''.replace('\n' + ' ' * 12, '\n')
+        print(report)
+
+        # Confirm counts as expected, exit with error if not.
+        expected_total = clinvar_fatal + clinvar_skipped + clinvar_done
+        if expected_total != self.clinvar_total:
+            logger.error(f'ClinVar evidence string tallies do not add up to the total amount: '
+                         f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}')
+            sys.exit(1)
 
     def write_unmapped_terms(self, dir_out):
         with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file:
@@ -120,7 +127,7 @@ def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_sc
     report = clinvar_to_evidence_strings(
         string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file,
         output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME))
-    print(report.collate_report())
+    report.print_report_and_check_counts()
     report.write_unmapped_terms(dir_out)
 
 
@@ -201,11 +208,15 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
                 report.clinvar_done_one_complete_evidence_string += 1
             elif complete_evidence_strings_generated > 1:
                 report.clinvar_done_multiple_complete_evidence_strings += 1
+            else:
+                report.clinvar_skip_invalid_evidence_string += 1
 
             report.complete_evidence_string_count += complete_evidence_strings_generated
             report.evidence_string_count += evidence_strings_generated
 
         except Exception as e:
+            # Note while we catch exceptions here, this may or may not cause inconsistencies in the counts,
+            # in which case the pipeline will crash after processing all records and printing the report.
             logger.error(f'Problem generating evidence for {clinvar_record.accession}')
             logger.error(f'Error: {e}')
             continue

From 6b7d6064e9388b09433265b72832b46903cbf638 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Wed, 7 Feb 2024 10:23:29 +0000
Subject: [PATCH 2/3] fix count and report invalid evidence

---
 cmat/output_generation/clinvar_to_evidence_strings.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py
index f35a96e1..9ff2d2be 100644
--- a/cmat/output_generation/clinvar_to_evidence_strings.py
+++ b/cmat/output_generation/clinvar_to_evidence_strings.py
@@ -75,6 +75,7 @@ def print_report_and_check_counts(self):
                     Unsupported variation type\t{self.clinvar_skip_unsupported_variation}
                     No functional consequences\t{self.clinvar_skip_no_functional_consequences}
                     Missing EFO mapping\t{self.clinvar_skip_missing_efo_mapping}
+                    Invalid evidence string\t{self.clinvar_skip_invalid_evidence_string}
                 Done: Generated at least one complete evidence string\t{clinvar_done}
                     One complete evidence string\t{self.clinvar_done_one_complete_evidence_string}
                     Multiple complete evidence strings\t{self.clinvar_done_multiple_complete_evidence_strings}
@@ -208,7 +209,8 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
                 report.clinvar_done_one_complete_evidence_string += 1
             elif complete_evidence_strings_generated > 1:
                 report.clinvar_done_multiple_complete_evidence_strings += 1
-            else:
+
+            if evidence_strings_generated == 0:
                 report.clinvar_skip_invalid_evidence_string += 1
 
             report.complete_evidence_string_count += complete_evidence_strings_generated

From be2dd91468a6046cc5af2e22d51e159f711bae61 Mon Sep 17 00:00:00 2001
From: April Shen <ashen@ebi.ac.uk>
Date: Wed, 7 Feb 2024 11:52:29 +0000
Subject: [PATCH 3/3] exit if any exception thrown and after all steps complete

---
 .../clinvar_to_evidence_strings.py             | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/cmat/output_generation/clinvar_to_evidence_strings.py b/cmat/output_generation/clinvar_to_evidence_strings.py
index 9ff2d2be..74ffd48f 100644
--- a/cmat/output_generation/clinvar_to_evidence_strings.py
+++ b/cmat/output_generation/clinvar_to_evidence_strings.py
@@ -59,6 +59,7 @@ def __init__(self, trait_mappings, consequence_mappings):
         self.structural_variants = 0
 
     def print_report_and_check_counts(self):
+        """Print report of counts and return True if counts are consistent, False otherwise."""
         # ClinVar tallies.
         clinvar_fatal = self.clinvar_fatal_no_valid_traits
         clinvar_skipped = (self.clinvar_skip_unsupported_variation + self.clinvar_skip_no_functional_consequences +
@@ -98,7 +99,8 @@ def print_report_and_check_counts(self):
         if expected_total != self.clinvar_total:
             logger.error(f'ClinVar evidence string tallies do not add up to the total amount: '
                          f'fatal + skipped + done = {expected_total}, total = {self.clinvar_total}')
-            sys.exit(1)
+            return False
+        return True
 
     def write_unmapped_terms(self, dir_out):
         with open(os.path.join(dir_out, UNMAPPED_TRAITS_FILE_NAME), 'w') as unmapped_traits_file:
@@ -125,11 +127,13 @@ def launch_pipeline(clinvar_xml_file, efo_mapping_file, gene_mapping_file, ot_sc
     string_to_efo_mappings, _ = load_ontology_mapping(efo_mapping_file)
     variant_to_gene_mappings = CT.process_consequence_type_file(gene_mapping_file)
 
-    report = clinvar_to_evidence_strings(
+    report, exception_raised = clinvar_to_evidence_strings(
         string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml_file, ot_schema_file,
         output_evidence_strings=os.path.join(dir_out, EVIDENCE_STRINGS_FILE_NAME))
-    report.print_report_and_check_counts()
+    counts_consistent = report.print_report_and_check_counts()
     report.write_unmapped_terms(dir_out)
+    if exception_raised or not counts_consistent:
+        sys.exit(1)
 
 
 def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings, clinvar_xml, ot_schema,
@@ -137,6 +141,7 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
     report = Report(trait_mappings=string_to_efo_mappings, consequence_mappings=variant_to_gene_mappings)
     ot_schema_contents = json.loads(open(ot_schema).read())
     output_evidence_strings_file = open(output_evidence_strings, 'wt')
+    exception_raised = False
 
     logger.info('Processing ClinVar records')
     for clinvar_record in ClinVarDataset(clinvar_xml):
@@ -217,14 +222,15 @@ def clinvar_to_evidence_strings(string_to_efo_mappings, variant_to_gene_mappings
             report.evidence_string_count += evidence_strings_generated
 
         except Exception as e:
-            # Note while we catch exceptions here, this may or may not cause inconsistencies in the counts,
-            # in which case the pipeline will crash after processing all records and printing the report.
+            # We catch exceptions but record when one is thrown, so that the pipeline will crash after processing all
+            # records and printing the report.
             logger.error(f'Problem generating evidence for {clinvar_record.accession}')
             logger.error(f'Error: {e}')
+            exception_raised = True
             continue
 
     output_evidence_strings_file.close()
-    return report
+    return report, exception_raised
 
 
 def format_creation_date(s):