wip: messy initial work for supporting civic assertions

cancervariants · Jan 9, 2025 · 42b4e6c · 42b4e6c
1 parent 0ebb8a6
commit 42b4e6c
Show file tree

Hide file tree

Showing 4 changed files with 217 additions and 65 deletions.
diff --git a/src/metakb/transformers/civic.py b/src/metakb/transformers/civic.py
@@ -12,6 +12,7 @@
     Extension,
     MappableConcept,
     Relation,
+    code,
 )
 from ga4gh.va_spec.aac_2017.models import (
     VariantDiagnosticProposition,
@@ -25,6 +26,7 @@
     DiagnosticPredicate,
     Direction,
     Document,
+    EvidenceLine,
     PrognosticPredicate,
     TherapeuticResponsePredicate,
 )
@@ -117,8 +119,8 @@ class _TherapeuticMetadata(BaseModel):
     therapies: list[dict]
 
 
-class _CivicEvidenceType(str, Enum):
-    """Define constraints for CIViC evidence types supported by MetaKB
+class _CivicEvidenceAssertionType(str, Enum):
+    """Define constraints for CIViC evidence and assertion types supported by MetaKB
 
     DIAGNOSTIC, ONCOGENIC, PREDISPOSING are not currently supported
     """
@@ -181,6 +183,9 @@ def __init__(
             "genes": {},
         }
 
+        # Cache of processed evidence statements: {"civic.eid:<id>": statement}
+        self._evidence_cache = {}
+
     @staticmethod
     def _mp_to_variant_mapping(molecular_profiles: list[dict]) -> tuple[list, dict]:
         """Get mapping from Molecular Profile ID to Variant ID.
@@ -229,7 +234,16 @@ async def transform(self, harvested_data: CivicHarvestedData) -> None:
             e
             for e in evidence_items
             if e["status"] == "accepted"
-            and e["evidence_type"] in _CivicEvidenceType.__members__
+            and e["evidence_type"] in _CivicEvidenceAssertionType.__members__
+        ]
+
+        # Only want assertions with accepted status and a supported assertion type
+        assertions = harvested_data.assertions
+        assertions = [
+            assertion
+            for assertion in assertions
+            if assertion["status"] == "accepted"
+            and assertion["assertion_type"] in _CivicEvidenceAssertionType.__members__
         ]
 
         # Get all variant IDs from supported molecular profiles
@@ -238,6 +252,11 @@ async def transform(self, harvested_data: CivicHarvestedData) -> None:
             for e in evidence_items
             if e["molecular_profile_id"]
         }
+        vids |= {
+            mp_id_to_v_id_mapping[assertion["molecular_profile_id"]]
+            for assertion in assertions
+            if assertion["molecular_profile_id"]
+        }
 
         # Add variant (only supported) and gene (all) data
         # (mutates `variations` and `genes`)
@@ -257,52 +276,96 @@ async def transform(self, harvested_data: CivicHarvestedData) -> None:
         self._add_categorical_variants(mps, mp_id_to_v_id_mapping)
 
         for evidence_item in evidence_items:
-            self._add_variant_study_stmt(evidence_item, mp_id_to_v_id_mapping)
+            self._add_variant_study_stmt(
+                evidence_item, mp_id_to_v_id_mapping, is_evidence=True
+            )
+
+        for assertion in assertions:
+            self._add_variant_study_stmt(
+                assertion, mp_id_to_v_id_mapping, is_evidence=False
+            )
 
     def _add_variant_study_stmt(
-        self, evidence_item: dict, mp_id_to_v_id_mapping: dict
+        self, record: dict, mp_id_to_v_id_mapping: dict, is_evidence: bool = True
     ) -> None:
         """Create Variant Study Statement given CIViC Evidence Items.
         Will add associated values to ``processed_data`` instance variable
         (``therapies``, ``conditions``, and ``documents``).
         ``able_to_normalize`` and ``unable_to_normalize`` will also be mutated for
         associated therapies and conditions.
 
-        :param evidence_item: CIViC Evidence Item
+        :param record: CIViC Evidence Item or Assertion
         :param mp_id_to_v_id_mapping: Molecular Profile ID to Variant ID mapping
             {mp_id: v_id}
+        :param is_evidence: ``True`` if ``record`` is an evidence item. ``False`` if
+            ``record`` is an assertion.
         """
         # Check cache for molecular profile, variation and gene data
-        mp_id = f"civic.mpid:{evidence_item['molecular_profile_id']}"
+        mp_id = f"civic.mpid:{record['molecular_profile_id']}"
         mp = self.able_to_normalize["categorical_variants"].get(mp_id)
         if not mp:
             _logger.debug("mp_id not supported: %s", mp_id)
             return
 
         variant_id = (
-            f"civic.vid:{mp_id_to_v_id_mapping[evidence_item['molecular_profile_id']]}"
+            f"civic.vid:{mp_id_to_v_id_mapping[record['molecular_profile_id']]}"
         )
         variation_gene_map = self.able_to_normalize["variations"].get(variant_id)
         if not variation_gene_map:
             _logger.debug("variant_id not supported: %s", variant_id)
             return
 
-        # Add document
-        document = self._add_eid_document(evidence_item["source"])
-        if not document:
-            return
+        extensions = []
+        classification = None
+        record_prefix = "evidence" if is_evidence else "assertion"
+        direction = self._get_direction(record[f"{record_prefix}_direction"])
 
-        evidence_type = evidence_item["evidence_type"]
+        if is_evidence:
+            evidence_lines = None
+            document = self._add_eid_document(record["source"])
+            if not document:
+                return
+
+            reported_in = [document] if document else None
+            # Get strength
+            evidence_level = CivicEvidenceLevel[record["evidence_level"]]
+            strength = self.evidence_level_to_vicc_concept_mapping[evidence_level]
+        else:
+            strength = None
+            reported_in = None
+
+            if record["amp_level"]:
+                classification = self._get_classification(record["amp_level"])
+
+            evidence_lines = []
+            evidence_ids = []
+            for eid in record["evidence_ids"]:
+                civic_eid = f"civic.eid:{eid}"
+                evidence_ids.append(civic_eid)
+                evidence_item = self._evidence_cache.get(civic_eid)
+                if evidence_item:
+                    evidence_lines.append(
+                        EvidenceLine(
+                            hasEvidenceItems=[evidence_item],
+                            directionOfEvidenceProvided=Direction.SUPPORTS,  # TODO: Is this always supports?
+                        )
+                    )
+
+            # TODO: Figure out how to handle cases where CIViC evidence items can't be processed. Is this what we want?
+            if evidence_ids:
+                extensions.append(Extension(name="evidence_ids", value=evidence_ids))
+
+        record_type = record[f"{record_prefix}_type"]
 
         # Get predicate
-        predicate = CLIN_SIG_TO_PREDICATE.get(evidence_item["significance"])
+        predicate = CLIN_SIG_TO_PREDICATE.get(record["significance"])
 
         # Don't support evidence that has  `None`, "N/A", or "Unknown" predicate
         if not predicate:
             return
 
         # Add disease
-        disease = evidence_item["disease"]
+        disease = record["disease"]
         if not disease:
             return
 
@@ -311,8 +374,8 @@ def _add_variant_study_stmt(
             return
 
         civic_therapeutic = None
-        if evidence_type == _CivicEvidenceType.PREDICTIVE:
-            therapeutic_metadata = self._get_therapeutic_metadata(evidence_item)
+        if record_type == _CivicEvidenceAssertionType.PREDICTIVE:
+            therapeutic_metadata = self._get_therapeutic_metadata(record)
             if therapeutic_metadata:
                 civic_therapeutic = self._add_therapy(
                     therapeutic_metadata.therapy_id,
@@ -327,33 +390,49 @@ def _add_variant_study_stmt(
         else:
             condition_key = "objectCondition"
 
-        # Get strength
-        direction = self._get_evidence_direction(evidence_item["evidence_direction"])
-        evidence_level = CivicEvidenceLevel[evidence_item["evidence_level"]]
-        strength = self.evidence_level_to_vicc_concept_mapping[evidence_level]
-
         # Get qualifier
         civic_gene = self.able_to_normalize["genes"].get(
             variation_gene_map.civic_gene_id
         )
 
-        variant_origin = evidence_item["variant_origin"].upper()
+        variant_origin = record["variant_origin"].upper()
         if variant_origin == "SOMATIC":
             allele_origin_qualifier = MappableConcept(label="somatic")
         elif variant_origin in {"RARE_GERMLINE", "COMMON_GERMLINE"}:
             allele_origin_qualifier = MappableConcept(label="germline")
         else:
             allele_origin_qualifier = None
 
+        statement_id = record["name"].lower()
+        statement_id = (
+            statement_id.replace("eid", "civic.eid:")
+            if is_evidence
+            else statement_id.replace("aid", "civic.aid:")
+        )
+
+        mappings = [
+            ConceptMapping(
+                coding=Coding(
+                    code=str(record["id"]),
+                    system="https://civicdb.org/evidence/"
+                    if is_evidence
+                    else "https://civicdb.org/assertions/",
+                ),
+                relation=Relation.EXACT_MATCH,
+            )
+        ]
+
         stmt_params = {
-            "id": evidence_item["name"].lower().replace("eid", "civic.eid:"),
-            "description": evidence_item["description"]
-            if evidence_item["description"]
-            else None,
+            "id": statement_id,
+            "description": record["description"] or None,
             "direction": direction,
             "strength": strength,
             "specifiedBy": self.processed_data.methods[0],
-            "reportedIn": [document],
+            "reportedIn": reported_in,
+            "classification": classification,
+            "extensions": extensions or None,
+            "hasEvidenceLines": evidence_lines or None,
+            "mappings": mappings,
         }
 
         prop_params = {
@@ -364,26 +443,57 @@ def _add_variant_study_stmt(
             "subjectVariant": mp,
         }
 
-        if evidence_type == _CivicEvidenceType.PREDICTIVE:
+        if record_type == _CivicEvidenceAssertionType.PREDICTIVE:
             prop_params["objectTherapeutic"] = civic_therapeutic
             stmt_params["proposition"] = VariantTherapeuticResponseProposition(
                 **prop_params
             )
             statement = VariantTherapeuticResponseStudyStatement(**stmt_params)
-        elif evidence_type == _CivicEvidenceType.PROGNOSTIC:
+        elif record_type == _CivicEvidenceAssertionType.PROGNOSTIC:
             stmt_params["proposition"] = VariantPrognosticProposition(**prop_params)
             statement = VariantPrognosticStudyStatement(**stmt_params)
         else:
             stmt_params["proposition"] = VariantDiagnosticProposition(**prop_params)
             statement = VariantDiagnosticStudyStatement(**stmt_params)
 
+        if is_evidence:
+            self._evidence_cache[statement_id] = statement
         self.processed_data.statements.append(statement)
 
-    def _get_evidence_direction(self, direction: str) -> Direction | None:
-        """Get the normalized evidence direction
+    @staticmethod
+    def _get_classification(amp_level: str) -> MappableConcept | None:
+        """Get statement classification
+
+        :param amp_level: AMP/ASCO/CAP level in the form
+            ``TIER_<tier>[_LEVEL_<level>]`` (e.g. ``"TIER_I_LEVEL_A"``), or ``"NA"``
+        :return: Classification represented as a mappable concept. ``None`` if
+            ``amp_level`` is ``"NA"`` or is not in the expected format
+        """
+        if amp_level == "NA":
+            return None
+
+        # ``fullmatch`` rejects values with unexpected trailing text instead of
+        # silently truncating them to the leading tier/level
+        match = re.fullmatch(
+            r"TIER_(?P<tier>[IV]+)(?:_LEVEL_(?P<level>[A-D]))?", amp_level
+        )
+        if not match:
+            # An unmatched value used to raise ``AttributeError`` on
+            # ``None.groupdict()``; treat it as "no classification" instead
+            _logger.debug("amp_level not supported: %s", amp_level)
+            return None
+
+        # The level group is optional, so it may be ``None``
+        primary_code = f"{match['tier']}{match['level'] or ''}"
+        return MappableConcept(
+            conceptType="Guideline",
+            primaryCode=primary_code,
+            mappings=[
+                ConceptMapping(
+                    relation=Relation.EXACT_MATCH,
+                    coding=Coding(
+                        system="AMP/ASCO/CAP",
+                        systemVersion="2017",
+                        code=code(primary_code),
+                    ),
+                )
+            ],
+        )
+
+    def _get_direction(self, direction: str) -> Direction | None:
+        """Get the normalized evidence or assertion direction
 
-        :param direction: CIViC evidence item's direction
-        :return: Normalized evidence direction
+        :param direction: CIViC evidence item or assertion's direction
+        :return: Normalized evidence or assertion direction
         """
         direction_upper = direction.upper()
         if direction_upper == "SUPPORTS":

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1091,29 +1091,64 @@ def civic_eid1409_statement():
 
 
 @pytest.fixture(scope="session")
-def civic_aid6_statement():
+def civic_aid6_statement(
+    civic_method,
+    civic_mpid33,
+    civic_gid19,
+    civic_tid146,
+    civic_did8,
+    civic_eid2997_study_stmt,
+):
     """Create CIViC AID 6 test fixture."""
     return {
         "id": "civic.aid:6",
-        "description": "L858R is among the most common sensitizing EGFR mutations in NSCLC, and is assessed via DNA mutational analysis, including Sanger sequencing and next generation sequencing methods. Tyrosine kinase inhibitor afatinib is FDA approved, and is recommended (category 1) by NCCN guidelines along with erlotinib, gefitinib and osimertinib as first line systemic therapy in NSCLC with sensitizing EGFR mutation.",
+        "description": "L858R is among the most common sensitizing EGFR mutations in NSCLC, and is assessed via DNA mutational analysis, including Sanger sequencing and next generation sequencing methods. Tyrosine kinase inhibitor afatinib is FDA approved as a first line systemic therapy in NSCLC with sensitizing EGFR mutation (civic.EID:2997).",
+        "type": "Statement",
+        "extensions": [
+            {
+                "name": "evidence_ids",
+                "value": [
+                    "civic.eid:2997",
+                    "civic.eid:2629",
+                    "civic.eid:982",
+                    "civic.eid:968",
+                    "civic.eid:883",
+                    "civic.eid:879",
+                ],
+            }
+        ],
+        "specifiedBy": civic_method,
+        "proposition": {
+            "type": "VariantTherapeuticResponseProposition",
+            "subjectVariant": civic_mpid33,
+            "geneContextQualifier": civic_gid19,
+            "alleleOriginQualifier": {"label": "somatic"},
+            "predicate": "predictsSensitivityTo",
+            "objectTherapeutic": civic_tid146,
+            "conditionQualifier": civic_did8,
+        },
         "direction": "supports",
-        "evidence_level": "amp_asco_cap_2017_level:1A",
-        "proposition": "proposition:Zfp_VG0uvxwteCcJYO6_AJv1KDmJlFjs",
-        "variation_origin": "somatic",
-        "variation_descriptor": "civic.vid:33",
-        "therapy_descriptor": "civic.tid:146",
-        "disease_descriptor": "civic.did:8",
-        "method": "method:2",
-        "supported_by": [
-            "document:9WsQBGXOmTFRXBUanTaIec8Gvgg8bsMA",
-            "civic.eid:2997",
-            "civic.eid:2629",
-            "civic.eid:982",
-            "civic.eid:968",
-            "civic.eid:883",
-            "civic.eid:879",
+        "classification": {
+            "conceptType": "Guideline",
+            "primaryCode": "IA",
+            "mappings": [
+                {
+                    "coding": {
+                        "system": "AMP/ASCO/CAP",
+                        "systemVersion": "2017",
+                        "code": "IA",
+                    },
+                    "relation": "exactMatch",
+                }
+            ],
+        },
+        "hasEvidenceLines": [
+            {
+                "type": "EvidenceLine",
+                "hasEvidenceItems": [civic_eid2997_study_stmt],
+                "directionOfEvidenceProvided": "supports",
+            }
         ],
-        "type": "Statement",
     }