From 6faa7597ddb7de98210dd6f407267bef497ad37d Mon Sep 17 00:00:00 2001
From: Kori Kuzma
Date: Mon, 3 Feb 2025 12:21:40 -0500
Subject: [PATCH] feat!: moa disease conflict resolution should follow therapy conflict resolution

---
 src/metakb/transformers/moa.py                | 120 +++++++++++-------
 ...lict.json => moa_harvester_conflicts.json} |  12 +-
 .../test_moa_transformer_therapeutic.py       |  21 +++-
 3 files changed, 102 insertions(+), 51 deletions(-)
 rename tests/data/transformers/therapeutic/{moa_harvester_conflict.json => moa_harvester_conflicts.json} (93%)

diff --git a/src/metakb/transformers/moa.py b/src/metakb/transformers/moa.py
index b99d9a05..aac0d612 100644
--- a/src/metakb/transformers/moa.py
+++ b/src/metakb/transformers/moa.py
@@ -497,6 +497,71 @@ def _get_therapeutic_substitute_group(
         :return: None, since not supported by MOA
         """
 
+    def _resolve_concept_discrepancy(
+        self,
+        cached_id: str,
+        cached_obj: MappableConcept,
+        cached_label: str,
+        moa_concept_label: str,
+        is_disease: bool = False,
+    ) -> None:
+        """Resolve conflict where MOA diseases or therapies resolve to the same
+        normalized concept
+
+        The lexicographically smaller label will be used as the primary label for the
+        mappable concept, and the other label will be added as an alias in extensions
+        (an alias that is already present is not added again).
+        The cache will be updated with the updated object.
+        The cached object will be removed from ``self.processed_data``; the caller is
+        expected to add the updated object back.
+
+        :param cached_id: ID found in cache
+        :param cached_obj: Mappable concept found in cache for ``cached_id``. This will
+            be mutated
+        :param cached_label: Label for ``cached_obj``
+        :param moa_concept_label: MOA concept label
+        :param is_disease: ``True`` if ``cached_obj`` is a disease. ``False`` if
+            ``cached_obj`` is a therapy
+        """
+        logger.debug(
+            "MOA %s and %s resolve to same concept %s",
+            moa_concept_label,
+            cached_label,
+            cached_id,
+        )
+        alias = max(moa_concept_label, cached_label)
+        cached_obj.label = min(moa_concept_label, cached_label)
+        extensions = cached_obj.extensions or []
+
+        aliases_ext = next(
+            (ext for ext in extensions if ext.name == "aliases"),
+            None,
+        )
+        if aliases_ext:
+            # The new primary label must not also be listed as an alias
+            if cached_obj.label in aliases_ext.value:
+                aliases_ext.value.remove(cached_obj.label)
+            # Guard against duplicates when the same conflicting label is seen again
+            if alias not in aliases_ext.value:
+                aliases_ext.value.append(alias)
+        else:
+            extensions.append(Extension(name="aliases", value=[alias]))
+            cached_obj.extensions = extensions
+
+        # Drop the stale entry; the caller adds the updated object back
+        if is_disease:
+            self.processed_data.conditions = [
+                c for c in self.processed_data.conditions if c.id != cached_obj.id
+            ]
+            cache = self._cache.conditions
+        else:
+            self.processed_data.therapies = [
+                t for t in self.processed_data.therapies if t.id != cached_id
+            ]
+            cache = self._cache.normalized_therapies
+
+        cache[cached_id] = cached_obj
+
     def _get_therapy(self, therapy_id: str, therapy: dict) -> MappableConcept:
         """Get Therapy mappable concept for a MOA therapy name.
 
@@ -525,37 +584,13 @@ def _resolve_therapy_discrepancy( therapy_norm_obj = self._cache.normalized_therapies[cached_id] og_therapy_norm_label = therapy_norm_obj.label if moa_concept_label != og_therapy_norm_label: - logger.debug( - "MOA therapy %s and %s resolve to same concept %s", - moa_concept_label, - og_therapy_norm_label, + self._resolve_concept_discrepancy( cached_id, + therapy_norm_obj, + og_therapy_norm_label, + moa_concept_label, + is_disease=False, ) - alias = max(moa_concept_label, og_therapy_norm_label) - therapy_norm_obj.label = min(moa_concept_label, og_therapy_norm_label) - extensions = therapy_norm_obj.extensions or [] - - aliases_ext = next( - ( - ext - for ext in therapy_norm_obj.extensions - if ext.name == "aliases" - ), - None, - ) - if aliases_ext: - if therapy_norm_obj.label in aliases_ext.value: - aliases_ext.value.remove(therapy_norm_obj.label) - aliases_ext.value.append(alias) - else: - extensions.append(Extension(name="aliases", value=[alias])) - therapy_norm_obj.extensions = extensions - - # Remove from processed (it will be added back in _add_therapy) - self.processed_data.therapies = [ - t for t in self.processed_data.therapies if t.id != cached_id - ] - self._cache.normalized_therapies[cached_id] = therapy_norm_obj return therapy_norm_obj mappings = [] @@ -630,23 +665,18 @@ def _add_disease(self, disease: dict) -> MappableConcept | None: oncotree_kv = [f"{oncotree_key}:{oncotree_value}"] blob = json.dumps(oncotree_kv, separators=(",", ":")).encode("ascii") disease_id = sha512t24u(blob) - moa_disease = self._cache.conditions.get(disease_id) if moa_disease: source_disease_name = disease["name"] if source_disease_name != moa_disease.label: - if not moa_disease.extensions: - moa_disease.extensions = [ - Extension(name="aliases", value=[source_disease_name]) - ] - else: - for ext in moa_disease.extensions: - if ( - ext.name == "aliases" - and source_disease_name not in ext.value - ): - ext.value.append(source_disease_name) - break + 
self._resolve_concept_discrepancy( + disease_id, + moa_disease, + moa_disease.label, + source_disease_name, + is_disease=True, + ) + self.processed_data.conditions.append(moa_disease) return moa_disease moa_disease = None diff --git a/tests/data/transformers/therapeutic/moa_harvester_conflict.json b/tests/data/transformers/therapeutic/moa_harvester_conflicts.json similarity index 93% rename from tests/data/transformers/therapeutic/moa_harvester_conflict.json rename to tests/data/transformers/therapeutic/moa_harvester_conflicts.json index 1a22fbf8..a92b8bca 100644 --- a/tests/data/transformers/therapeutic/moa_harvester_conflict.json +++ b/tests/data/transformers/therapeutic/moa_harvester_conflicts.json @@ -6,9 +6,9 @@ "deprecated": false, "description": "The U.S. Food and Drug Administration (FDA) granted approval to selpercatinib for the treatment of adult patients with locally advanced or metastatic non-small cell lung cancer (NSCLC) with a RET gene fusion, as detected by an FDA-approved test.", "disease": { - "name": "Non-Small Cell Lung Cancer", - "oncotree_code": "NSCLC", - "oncotree_term": "Non-Small Cell Lung Cancer" + "name": "Myelodysplastic Syndromes", + "oncotree_code": "MDS", + "oncotree_term": "Myelodysplastic Syndromes" }, "therapy": { "name": "Selpercatinib", @@ -40,9 +40,9 @@ "deprecated": false, "description": "Certain missesnse mutations may predict resistance to RET inhibitors", "disease": { - "name": "Medullary Thyroid Cancer", - "oncotree_code": "THME", - "oncotree_term": "Medullary Thyroid Cancer" + "name": "Myelodysplasia", + "oncotree_code": "MDS", + "oncotree_term": "Myelodysplasia" }, "therapy": { "name": "LOXO-292", diff --git a/tests/unit/transformers/test_moa_transformer_therapeutic.py b/tests/unit/transformers/test_moa_transformer_therapeutic.py index 65ea9348..cd5aa03d 100644 --- a/tests/unit/transformers/test_moa_transformer_therapeutic.py +++ b/tests/unit/transformers/test_moa_transformer_therapeutic.py @@ -404,11 +404,11 @@ def 
test_moa_cdm_not_normalizable( @pytest.mark.asyncio() -async def test_moa_therapy_conflict(normalizers): - """Test that MOA therapy conflict merges concept correctly""" +async def test_moa_concept_conflicts(normalizers): + """Test that MOA therapy and disease conflict resolution works correctly""" t = MoaTransformer( data_dir=DATA_DIR, - harvester_path=DATA_DIR / "moa_harvester_conflict.json", + harvester_path=DATA_DIR / "moa_harvester_conflicts.json", normalizers=normalizers, ) harvested_data = t.extract_harvested_data() @@ -428,3 +428,18 @@ async def test_moa_therapy_conflict(normalizers): "name": "aliases", "value": ["Selpercatinib"], } + + conditions = t.processed_data.conditions + assert len(conditions) == 1 + + condition = conditions[0] + assert condition.id == "moa.normalize.disease.ncit:C3247" + assert condition.label == "Myelodysplasia" + condition_alias_ext = next( + (ext for ext in condition.extensions if ext.name == "aliases"), + None, + ) + assert condition_alias_ext.model_dump(exclude_none=True) == { + "name": "aliases", + "value": ["Myelodysplastic Syndromes"], + }