feat!: use cat-vrs-python + va-spec-python (#404)
close #388, #395, #309

* Breaking changes
  * Update/add ga4gh packages
    * Add `ga4gh.cat_vrs` + `ga4gh.va_spec` packages
      * The Pydantic models in these packages replace the manually created models in `metakb/schemas/annotation.py`, `metakb/schemas/categorical_variation.py`, and `metakb/schemas/variation_statement.py` (#388)
    * `ga4gh.vrs` version bumped
      * The models in all ga4gh packages caused breaking changes (mainly renaming) to the evidence models (such as #395)
  * Represent categorical variation members and constraints properly (#309)
  * Standardize how normalizer data is represented in extensions
    * `extension.name` will always be `vicc_normalizer_data`, and the value will contain `id`, `label`, and an optional `mondo_id` (for diseases); see the example sketch after this list
* Simplify assertion checks in tests
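
For reference, a minimal sketch of the standardized `vicc_normalizer_data` extension described above. Only the name and the `id` / `label` / `mondo_id` keys come from this change; every value below is an illustrative placeholder:

```python
# Hypothetical example of the standardized normalizer extension shape.
# "name" is always "vicc_normalizer_data"; "mondo_id" appears only for diseases.
disease_extension = {
    "name": "vicc_normalizer_data",
    "value": {
        "id": "ncit:C2926",                        # placeholder normalized concept ID
        "label": "Lung Non-Small Cell Carcinoma",  # placeholder label
        "mondo_id": "mondo:0005233",               # optional; diseases only
    },
}
```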
korikuzma authored Nov 20, 2024
1 parent 4c1ab8b commit 261c065
Showing 21 changed files with 715 additions and 1,143 deletions.
8 changes: 0 additions & 8 deletions docs/source/reference/api/metakb.schemas.annotation.rst

This file was deleted.

docs/source/reference/api/metakb.schemas.categorical_variation.rst

This file was deleted.

docs/source/reference/api/metakb.schemas.variation_statement.rst

This file was deleted.

3 changes: 0 additions & 3 deletions docs/source/reference/index.rst
@@ -34,11 +34,8 @@ Data Schemas
:toctree: api/
:template: module_summary.rst

- metakb.schemas.annotation
metakb.schemas.api
metakb.schemas.app
- metakb.schemas.categorical_variation
- metakb.schemas.variation_statement

Harvesters
----------
6 changes: 4 additions & 2 deletions pyproject.toml
@@ -23,7 +23,9 @@ requires-python = ">=3.10"
description = "A search interface for cancer variant interpretations assembled by aggregating and harmonizing across multiple cancer variant interpretation knowledgebases."
license = {file = "LICENSE"}
dependencies = [
- "ga4gh.vrs~=2.0.0a10",
+ "ga4gh.vrs~=2.0.0a12",
+ "ga4gh.cat_vrs~=0.1.0",
+ "ga4gh.va_spec~=0.1.0",
"gene-normalizer[etl]~=0.4.1",
"variation-normalizer~=0.10.0",
"disease-normalizer[etl]~=0.5.0",
@@ -42,7 +44,7 @@ dependencies = [
dynamic = ["version"]

[project.optional-dependencies]
- tests = ["pytest", "pytest-cov", "mock", "pytest-asyncio"]
+ tests = ["pytest", "pytest-cov", "mock", "pytest-asyncio", "deepdiff"]
dev = ["pre-commit>=3.7.1", "ruff==0.5.0"]
notebooks = ["ipykernel", "jupyterlab"]
docs = [
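
A quick way to confirm that an environment picked up the new pins, using only the standard library (assuming the installed distribution names match the dependency strings above):

```python
from importlib.metadata import PackageNotFoundError, version

# Report the installed version of each ga4gh distribution added or updated here.
for dist in ("ga4gh.vrs", "ga4gh.cat_vrs", "ga4gh.va_spec"):
    try:
        print(dist, version(dist))
    except PackageNotFoundError:
        print(dist, "is not installed")
```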
4 changes: 2 additions & 2 deletions src/metakb/database.py
@@ -78,11 +78,11 @@ def _get_credentials(
"disease_id_constraint": "CREATE CONSTRAINT disease_id_constraint IF NOT EXISTS FOR (n:Disease) REQUIRE n.id IS UNIQUE;",
"therapeuticprocedure_id_constraint": "CREATE CONSTRAINT therapeuticprocedure_id_constraint IF NOT EXISTS FOR (n:TherapeuticProcedure) REQUIRE n.id IS UNIQUE;",
"variation_id_constraint": "CREATE CONSTRAINT variation_id_constraint IF NOT EXISTS FOR (n:Variation) REQUIRE n.id IS UNIQUE;",
- "categoricalvariation_id_constraint": "CREATE CONSTRAINT categoricalvariation_id_constraint IF NOT EXISTS FOR (n:CategoricalVariation) REQUIRE n.id IS UNIQUE;",
+ "categoricalvariant_id_constraint": "CREATE CONSTRAINT categoricalvariant_id_constraint IF NOT EXISTS FOR (n:CategoricalVariant) REQUIRE n.id IS UNIQUE;",
"variantgroup_id_constraint": "CREATE CONSTRAINT variantgroup_id_constraint IF NOT EXISTS FOR (n:VariantGroup) REQUIRE n.id IS UNIQUE;",
"location_id_constraint": "CREATE CONSTRAINT location_id_constraint IF NOT EXISTS FOR (n:Location) REQUIRE n.id IS UNIQUE;",
"document_id_constraint": "CREATE CONSTRAINT document_id_constraint IF NOT EXISTS FOR (n:Document) REQUIRE n.id IS UNIQUE;",
- "study_id_constraint": "CREATE CONSTRAINT study_id_constraint IF NOT EXISTS FOR (n:Study) REQUIRE n.id IS UNIQUE;",
+ "statement_id_constraint": "CREATE CONSTRAINT statement_id_constraint IF NOT EXISTS FOR (n:Statement) REQUIRE n.id IS UNIQUE;",
"method_id_constraint": "CREATE CONSTRAINT method_id_constraint IF NOT EXISTS FOR (n:Method) REQUIRE n.id IS UNIQUE;",
}

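
A small sketch for checking that the renamed uniqueness constraints exist after the graph is rebuilt. The connection URI and credentials are placeholders; `SHOW CONSTRAINTS` requires Neo4j 4.2+:

```python
from neo4j import GraphDatabase

# Placeholder URI/credentials; adjust for your Neo4j instance.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

with driver.session() as session:
    constraint_names = {record["name"] for record in session.run("SHOW CONSTRAINTS")}

# The old CategoricalVariation/Study constraints are replaced by these two.
assert "categoricalvariant_id_constraint" in constraint_names
assert "statement_id_constraint" in constraint_names
driver.close()
```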
95 changes: 49 additions & 46 deletions src/metakb/load_data.py
Expand Up @@ -7,6 +7,7 @@
from neo4j import Driver, ManagedTransaction

from metakb.database import get_driver
+ from metakb.normalizers import VICC_NORMALIZER_DATA, ViccDiseaseNormalizerData

_logger = logging.getLogger(__name__)

@@ -16,7 +17,7 @@ def _create_parameterized_query(
) -> str:
"""Create parameterized query string for requested params if non-null in entity.
- :param entity: entity to check against, eg a Variation or Study
+ :param entity: entity to check against, eg a Variation or Statement
:param params: Parameter names to check
:param entity_param_prefix: Prefix for parameter names in entity object
:return: Parameterized query, such as (`name:$name`)
@@ -41,18 +42,23 @@ def _add_mappings_and_exts_to_obj(obj: dict, obj_keys: list[str]) -> None:

extensions = obj.get("extensions", [])
for ext in extensions:
- if ext["name"].endswith("_normalizer_data"):
- obj_type = ext["name"].split("_normalizer_data")[0]
- name = f"{obj_type}_normalizer_id"
- obj[name] = ext["value"]["normalized_id"]
+ if ext["name"] == VICC_NORMALIZER_DATA:
+ for normalized_field in ViccDiseaseNormalizerData.model_fields:
+ normalized_val = ext["value"].get(normalized_field)
+ if normalized_val is None:
+ continue
+
+ name = f"normalizer_{normalized_field}"
+ obj[name] = normalized_val
+ obj_keys.append(f"{name}:${name}")
else:
name = "_".join(ext["name"].split()).lower()
val = ext["value"]
if isinstance(val, (dict | list)):
obj[name] = json.dumps(val)
else:
obj[name] = val
- obj_keys.append(f"{name}:${name}")
+ obj_keys.append(f"{name}:${name}")


def _add_method(tx: ManagedTransaction, method: dict, ids_in_studies: set[str]) -> None:
@@ -69,11 +75,13 @@ def _add_method(tx: ManagedTransaction, method: dict, ids_in_studies: set[str])
MERGE (m:Method {id:$id, label:$label})
"""

- is_reported_in = method.get("isReportedIn")
+ is_reported_in = method.get("reportedIn")
if is_reported_in:
# Method's documents are unique and do not currently have IDs
- _add_document(tx, is_reported_in, ids_in_studies)
- doc_doi = is_reported_in["doi"]
+ # They also only have one document
+ document = is_reported_in[0]
+ _add_document(tx, document, ids_in_studies)
+ doc_doi = document["doi"]
query += f"""
MERGE (d:Document {{ doi:'{doc_doi}' }})
MERGE (m) -[:IS_REPORTED_IN] -> (d)
@@ -253,21 +261,21 @@ def _add_variation(tx: ManagedTransaction, variation_in: dict) -> None:
tx.run(query, **v)


- def _add_categorical_variation(
+ def _add_categorical_variant(
tx: ManagedTransaction,
- categorical_variation_in: dict,
+ categorical_variant_in: dict,
ids_in_studies: set[str],
) -> None:
- """Add categorical variation objects to DB.
+ """Add categorical variant objects to DB.
:param tx: Transaction object provided to transaction functions
- :param categorical_variation_in: Categorical variation CDM object
+ :param categorical_variant_in: Categorical variant CDM object
:param ids_in_studies: IDs found in studies
"""
- if categorical_variation_in["id"] not in ids_in_studies:
+ if categorical_variant_in["id"] not in ids_in_studies:
return

- cv = categorical_variation_in.copy()
+ cv = categorical_variant_in.copy()

mp_nonnull_keys = [
_create_parameterized_query(
@@ -278,7 +286,7 @@ def _add_categorical_variation(
_add_mappings_and_exts_to_obj(cv, mp_nonnull_keys)
mp_keys = ", ".join(mp_nonnull_keys)

- defining_context = cv["definingContext"]
+ defining_context = cv["constraints"][0]["definingContext"]
_add_variation(tx, defining_context)
dc_type = defining_context["type"]

@@ -293,9 +301,9 @@

query = f"""
{members_match}
- MERGE (dc:{dc_type}:Variation {{ id: '{defining_context['id']}' }})
+ MERGE (dc:Variation:{dc_type} {{ id: '{defining_context['id']}' }})
MERGE (dc) -[:HAS_LOCATION] -> (loc)
- MERGE (v:{cv['type']}:CategoricalVariation {{ {mp_keys} }})
+ MERGE (v:Variation:{cv['type']} {{ {mp_keys} }})
MERGE (v) -[:HAS_DEFINING_CONTEXT] -> (dc)
{members_relation}
"""
@@ -330,7 +338,7 @@ def _add_document(
document = document_in.copy()
formatted_keys = [
_create_parameterized_query(
- document, ("id", "label", "title", "pmid", "url", "doi")
+ document, ("id", "label", "title", "pmid", "urls", "doi")
)
]

@@ -365,11 +373,11 @@ def _add_obj_id_to_set(obj: dict, ids_set: set[str]) -> None:
for study in studies:
for obj in [
study.get("specifiedBy"), # method
- study.get("isReportedIn"),
- study.get("variant"),
- study.get("therapeutic"),
- study.get("tumorType"),
- study.get("qualifiers", {}).get("geneContext"),
+ study.get("reportedIn"),
+ study.get("subjectVariant"),
+ study.get("objectTherapeutic"),
+ study.get("conditionQualifier"),
+ study.get("geneContextQualifier"),
]:
if obj:
if isinstance(obj, list):
@@ -385,7 +393,7 @@ def _add_study(tx: ManagedTransaction, study_in: dict) -> None:
"""Add study node and its relationships
:param tx: Transaction object provided to transaction functions
- :param study_in: Study CDM object
+ :param study_in: Statement CDM object
"""
study = study_in.copy()
study_type = study["type"]
@@ -396,23 +404,22 @@
match_line = ""
rel_line = ""

- is_reported_in_docs = study.get("isReportedIn", [])
+ is_reported_in_docs = study.get("reportedIn", [])
for ri_doc in is_reported_in_docs:
ri_doc_id = ri_doc["id"]
name = f"doc_{ri_doc_id.split(':')[-1]}"
match_line += f"MERGE ({name} {{ id: '{ri_doc_id}'}})\n"
rel_line += f"MERGE (s) -[:IS_REPORTED_IN] -> ({name})\n"

- qualifiers = study.get("qualifiers")
- if qualifiers:
- allele_origin = qualifiers.get("alleleOrigin")
- study["alleleOrigin"] = allele_origin
- match_line += "SET s.alleleOrigin=$alleleOrigin\n"
+ allele_origin = study.get("alleleOriginQualifier")
+ if allele_origin:
+ study["alleleOriginQualifier"] = allele_origin
+ match_line += "SET s.alleleOriginQualifier=$alleleOriginQualifier\n"

- gene_context_id = qualifiers.get("geneContext", {}).get("id")
- if gene_context_id:
- match_line += f"MERGE (g:Gene {{id: '{gene_context_id}'}})\n"
- rel_line += "MERGE (s) -[:HAS_GENE_CONTEXT] -> (g)\n"
+ gene_context_id = study.get("geneContextQualifier", {}).get("id")
+ if gene_context_id:
+ match_line += f"MERGE (g:Gene {{id: '{gene_context_id}'}})\n"
+ rel_line += "MERGE (s) -[:HAS_GENE_CONTEXT] -> (g)\n"

method_id = study["specifiedBy"]["id"]
match_line += f"MERGE (m {{ id: '{method_id}' }})\n"
@@ -433,24 +440,20 @@
match_line += f"MERGE (c:Coding {{ {coding_keys} }})\n"
rel_line += "MERGE (s) -[:HAS_STRENGTH] -> (c)\n"

- variant_id = study["variant"]["id"]
- if study["variant"]["type"] == "ProteinSequenceConsequence":
- v_parent_type = "CategoricalVariation"
- else:
- v_parent_type = "Variation"
- match_line += f"MERGE (v:{v_parent_type} {{ id: '{variant_id}' }})\n"
+ variant_id = study["subjectVariant"]["id"]
+ match_line += f"MERGE (v:Variation {{ id: '{variant_id}' }})\n"
rel_line += "MERGE (s) -[:HAS_VARIANT] -> (v)\n"

- therapeutic_id = study["therapeutic"]["id"]
+ therapeutic_id = study["objectTherapeutic"]["id"]
match_line += f"MERGE (t:TherapeuticProcedure {{ id: '{therapeutic_id}' }})\n"
rel_line += "MERGE (s) -[:HAS_THERAPEUTIC] -> (t)\n"

- tumor_type_id = study["tumorType"]["id"]
+ tumor_type_id = study["conditionQualifier"]["id"]
match_line += f"MERGE (tt:Condition {{ id: '{tumor_type_id}' }})\n"
rel_line += "MERGE (s) -[:HAS_TUMOR_TYPE] -> (tt)\n"

query = f"""
- MERGE (s:{study_type}:Study {{ {study_keys} }})
+ MERGE (s:{study_type}:StudyStatement:Statement {{ {study_keys} }})
{match_line}
{rel_line}
"""
Expand All @@ -472,8 +475,8 @@ def add_transformed_data(driver: Driver, data: dict) -> None:
with driver.session() as session:
loaded_study_count = 0

- for cv in data.get("categorical_variations", []):
- session.execute_write(_add_categorical_variation, cv, ids_in_studies)
+ for cv in data.get("categorical_variants", []):
+ session.execute_write(_add_categorical_variant, cv, ids_in_studies)

for doc in data.get("documents", []):
session.execute_write(_add_document, doc, ids_in_studies)
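
To summarize the renames the loader now expects, here is a hedged sketch of the transformed CDM objects consumed by `_get_ids_from_studies`, `_add_study`, and `_add_categorical_variant`. Only the key names come from the diff above; every identifier, label, and type value is a made-up placeholder:

```python
# Statement shape with the renamed fields (all values are placeholders).
statement = {
    "id": "example.statement:1",
    "type": "VariantTherapeuticResponseStudyStatement",  # example statement type; used as a node label
    "specifiedBy": {"id": "example.method:1", "label": "Example curation SOP"},
    "reportedIn": [{"id": "example.source:1", "doi": "10.0000/example"}],  # was isReportedIn
    "subjectVariant": {"id": "example.cv:1"},             # was variant
    "objectTherapeutic": {"id": "example.tp:1"},          # was therapeutic
    "conditionQualifier": {"id": "example.condition:1"},  # was tumorType
    "geneContextQualifier": {"id": "example.gene:1"},     # was qualifiers.geneContext
    "alleleOriginQualifier": "somatic",                   # was qualifiers.alleleOrigin
}

# Categorical variant shape: the defining context now lives under constraints (#309).
categorical_variant = {
    "id": "example.cv:1",
    "type": "CategoricalVariant",
    "constraints": [
        {"definingContext": {"id": "ga4gh:VA.EXAMPLE", "type": "Allele"}},
    ],
    "members": [],
}
```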
24 changes: 24 additions & 0 deletions src/metakb/normalizers.py
@@ -4,6 +4,7 @@
import os
from collections.abc import Iterable
from enum import Enum
+ from typing import Literal

from botocore.exceptions import TokenRetrievalError
from disease.cli import update_db as update_disease_db
@@ -22,6 +23,7 @@
from gene.database.database import AWS_ENV_VAR_NAME as GENE_AWS_ENV_VAR_NAME
from gene.query import QueryHandler as GeneQueryHandler
from gene.schemas import NormalizeService as NormalizedGene
+ from pydantic import BaseModel
from therapy.cli import update_normalizer_db as update_therapy_db
from therapy.database import create_db as create_therapy_db
from therapy.database.database import AWS_ENV_VAR_NAME as THERAPY_AWS_ENV_VAR_NAME
@@ -42,6 +44,28 @@
_logger = logging.getLogger(__name__)


+ class ViccNormalizerData(BaseModel, extra="forbid"):
+ """Define model for representing VICC normalizer data"""
+
+ id: str
+ label: str
+
+
+ class ViccDiseaseNormalizerData(ViccNormalizerData, extra="forbid"):
+ """Define model for representing VICC disease normalizer data"""
+
+ mondo_id: str | None = None
+
+
+ VICC_NORMALIZER_DATA = "vicc_normalizer_data"
+
+
+ class ViccNormalizerDataExtension(Extension):
+ """Define model for representing VICC normalizer data as an Extension"""
+
+ name: Literal["vicc_normalizer_data"] = VICC_NORMALIZER_DATA
+
+
class ViccNormalizers:
"""Manage VICC concept normalization services.
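
A brief usage sketch of the new models, assuming it runs where `metakb.normalizers` is importable and that the underlying `Extension.value` accepts a plain dict (all concept values are placeholders):

```python
from metakb.normalizers import ViccDiseaseNormalizerData, ViccNormalizerDataExtension

# Placeholder normalized-concept values; mondo_id is populated for diseases only.
disease_data = ViccDiseaseNormalizerData(
    id="ncit:C2926",
    label="Lung Non-Small Cell Carcinoma",
    mondo_id="mondo:0005233",
)

# "name" is pinned to "vicc_normalizer_data" by the Literal default.
ext = ViccNormalizerDataExtension(value=disease_data.model_dump())
print(ext.name)   # vicc_normalizer_data
print(ext.value)  # {'id': 'ncit:C2926', 'label': ..., 'mondo_id': 'mondo:0005233'}
```

When the loader encounters this extension, `_add_mappings_and_exts_to_obj` flattens it into `normalizer_id`, `normalizer_label`, and `normalizer_mondo_id` node properties.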
(Diffs for the remaining changed files are not shown here.)