From 922ab949a40153bc6e896852adea7d0d1e93ad5e Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Thu, 29 Aug 2024 16:36:09 +1000 Subject: [PATCH 01/54] Debug statistics record problem. --- src/mavedb/routers/statistics.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/mavedb/routers/statistics.py b/src/mavedb/routers/statistics.py index 53a3e8fc..61be6a5c 100644 --- a/src/mavedb/routers/statistics.py +++ b/src/mavedb/routers/statistics.py @@ -260,6 +260,7 @@ def _record_from_field_and_model( model_created_by_field = getattr(queried_model, "created_by_id") model_published_data_field = getattr(queried_model, "published_date") if field is RecordFields.createdBy: + print("111") query = ( select(User.username, func.count(User.id)) .join(queried_model, model_created_by_field == User.id) @@ -269,6 +270,7 @@ def _record_from_field_and_model( return db.execute(query).all() else: + print("2222") # All assc table identifiers which are linked to a published model. queried_assc_table = association_tables[model][field] published_score_sets_statement = ( @@ -332,6 +334,11 @@ def record_object_statistics( Model names and fields should be members of the Enum classes defined above. Providing an invalid model name or model field will yield a 422 Unprocessable Entity error with details about valid enum values. """ + # Validation to ensure 'keywords' is only used with 'experiment'. + if model == RecordNames.scoreSet and field == RecordFields.keywords: + raise HTTPException(status_code=422, + detail="The 'keywords' field can only be used with the 'experiment' model.") + count_data = _record_from_field_and_model(db, model, field) return {field_val: count for field_val, count in count_data if field_val is not None} From 554501b22dfe04c4227a95a908e73d915d032737 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 18 Jun 2024 10:55:44 -0700 Subject: [PATCH 02/54] Add vrs map job definition to worker and vrs map api to docker Co-authored-by: Ben Capodanno <31941502+bencap@users.noreply.github.com> --- docker-compose-dev.yml | 20 ++++++++++++++++++++ settings/.env.template | 19 +++++++++++++++++++ src/mavedb/worker/jobs.py | 26 +++++++++++++++++++++++++- src/mavedb/worker/settings.py | 4 ++-- 4 files changed, 66 insertions(+), 3 deletions(-) diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index 8f7723a8..93165694 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -63,6 +63,25 @@ services: volumes: - mavedb-redis-dev:/data + dcd-mapping: + image: dcd-mapping:dev + command: bash -c "uvicorn api.server_main:app --host 0.0.0.0 --port 8000 --reload" + depends_on: + - db + - seqrepo + env_file: + - settings/.env.dev + - settings/.env.dev.mapping + ports: + - "8004:8000" + volumes: + - mavedb-seqrepo-dev:/usr/local/share/seqrepo + + seqrepo: + image: biocommons/seqrepo:2021-01-29 + volumes: + - mavedb-seqrepo-dev:/usr/local/share/seqrepo + # rabbitmq: # image: rabbitmq:3.8.3 # ports: @@ -71,3 +90,4 @@ services: volumes: mavedb-data-dev: mavedb-redis-dev: + mavedb-seqrepo-dev: diff --git a/settings/.env.template b/settings/.env.template index dc6592e5..50d1fef1 100644 --- a/settings/.env.template +++ b/settings/.env.template @@ -32,3 +32,22 @@ SLACK_WEBHOOK_URL= ORCID_CLIENT_ID=client-id ORCID_CLIENT_SECRET=secret ORCID_JWT_SIGNING_PUBLIC_KEY="-----BEGIN PUBLIC KEY-----\npublic key\n-----END PUBLIC KEY-----" + +#################################################################################################### +# Environment variables for vrs-mapping +#################################################################################################### + +GENE_NORM_DB_URL=postgres://postgres:postgres@db:5432/gene_normalizer +MAVEDB_STORAGE_DIR=/root/.local/share/dcd-mapping + +#################################################################################################### +# Environment variables for UTA connection via CoolSeqTool +#################################################################################################### + +UTA_DB_URL=postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_20180821 + +#################################################################################################### +# Environment variables for seqrepo +#################################################################################################### + +SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2021-01-29 diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 7ba6e0e1..4eee0676 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -1,4 +1,5 @@ import logging +import requests import pandas as pd from cdot.hgvs.dataproviders import RESTDataProvider @@ -17,6 +18,7 @@ validate_and_standardize_dataframe_pair, ) from mavedb.models.enums.processing_state import ProcessingState +from mavedb.models.mapped_variant import MappedVariant from mavedb.models.score_set import ScoreSet from mavedb.models.user import User from mavedb.models.variant import Variant @@ -49,7 +51,9 @@ async def create_variants_for_score_set( db: Session = ctx["db"] hdp: RESTDataProvider = ctx["hdp"] - score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() + score_set = db.scalars( + select(ScoreSet).where(ScoreSet.urn == score_set_urn) + ).one() updated_by = db.scalars(select(User).where(User.id == updater_id)).one() score_set.modified_by = updated_by @@ -143,3 +147,23 @@ async def create_variants_for_score_set( ctx["state"][ctx["job_id"]] = logging_context.copy() return score_set.processing_state.name + + +async def map_variants_for_score_set(ctx, score_set_urn: str): + db: Session = ctx["db"] + + response = requests.post(f"http://dcd-mapping:8000/api/v1/map/{score_set_urn}") + response.raise_for_status() + mapping_results = response.json() + + for mapped_score in mapping_results["mapped_scores"]: + variant_urn = mapped_score["mavedb_id"] + variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() + mapped_variant = MappedVariant( + pre_mapped = mapped_score["pre_mapped"], + post_mapped = mapped_score["post_mapped"], + variant_id = variant.id + ) + db.add(mapped_variant) + + db.commit() diff --git a/src/mavedb/worker/settings.py b/src/mavedb/worker/settings.py index ac488fe6..24ee5edc 100644 --- a/src/mavedb/worker/settings.py +++ b/src/mavedb/worker/settings.py @@ -5,12 +5,12 @@ from arq import cron from mavedb.lib.logging.canonical import log_job -from mavedb.worker.jobs import create_variants_for_score_set +from mavedb.worker.jobs import create_variants_for_score_set, map_variants_for_score_set from mavedb.db.session import SessionLocal from mavedb.data_providers.services import cdot_rest # ARQ requires at least one task on startup. -BACKGROUND_FUNCTIONS: list[Callable] = [create_variants_for_score_set] +BACKGROUND_FUNCTIONS: list[Callable] = [create_variants_for_score_set, map_variants_for_score_set] BACKGROUND_CRONJOBS: list[Callable] = [] REDIS_IP = os.getenv("REDIS_IP") or "localhost" From 1b160df470b222978771e62c1e440560a179f452 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 18 Jun 2024 15:29:55 -0700 Subject: [PATCH 03/54] Add variant mapper manager (wip) Co-authored-by: Ben Capodanno <31941502+bencap@users.noreply.github.com> --- src/mavedb/worker/jobs.py | 52 +++++++++++++++++++++++++++++++++-- src/mavedb/worker/settings.py | 9 ++++-- 2 files changed, 56 insertions(+), 5 deletions(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 4eee0676..1a7997fd 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -1,7 +1,12 @@ +import asyncio +import functools import logging import requests +from typing import Optional import pandas as pd +from arq import ArqRedis +from arq.jobs import Job, JobStatus from cdot.hgvs.dataproviders import RESTDataProvider from sqlalchemy import delete, select, null from sqlalchemy.orm import Session @@ -50,6 +55,7 @@ async def create_variants_for_score_set( db: Session = ctx["db"] hdp: RESTDataProvider = ctx["hdp"] + redis: ArqRedis = ctx["redis"] score_set = db.scalars( select(ScoreSet).where(ScoreSet.urn == score_set_urn) @@ -139,6 +145,7 @@ async def create_variants_for_score_set( logging_context["processing_state"] = score_set.processing_state.name logger.info(msg="Finished creating variants in score set.", extra=logging_context) + await redis.enqueue_job('variant_mapper_manager', score_set_urn) finally: db.add(score_set) db.commit() @@ -152,18 +159,57 @@ async def create_variants_for_score_set( async def map_variants_for_score_set(ctx, score_set_urn: str): db: Session = ctx["db"] - response = requests.post(f"http://dcd-mapping:8000/api/v1/map/{score_set_urn}") + logger.info(f"map variants for score set has started for score set {score_set_urn}") + + blocking = functools.partial(requests.post(f"http://dcd-mapping:8000/api/v1/map/{score_set_urn}")) + loop = asyncio.get_running_loop() + response = await loop.run_in_executor(ctx["pool"], blocking) + + #response = requests.post(f"http://dcd-mapping:8000/api/v1/map/{score_set_urn}") response.raise_for_status() mapping_results = response.json() + if mapping_results: + existing_variants = select(Variant.id).join(ScoreSet).where(ScoreSet.urn == score_set_urn).subquery() + db.execute(delete(MappedVariant).where(MappedVariant.variant_id.in_(existing_variants))) + for mapped_score in mapping_results["mapped_scores"]: variant_urn = mapped_score["mavedb_id"] variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() mapped_variant = MappedVariant( - pre_mapped = mapped_score["pre_mapped"], - post_mapped = mapped_score["post_mapped"], + pre_mapped = mapped_score["pre_mapped_2_0"], + post_mapped = mapped_score["post_mapped_2_0"], variant_id = variant.id ) db.add(mapped_variant) db.commit() + + +async def map_variants_async(ctx, score_set_urn: str): + blocking = functools.partial(map_variants_for_score_set, ctx["db"], score_set_urn) + loop = asyncio.get_running_loop() + return await loop.run_in_executor(ctx["pool"], blocking) + + +async def variant_mapper_manager(ctx, score_set_urn: Optional[str]): + queue: list[str] = ctx["mapping_queue"] + redis: ArqRedis = ctx["redis"] + + if score_set_urn: + queue.append(score_set_urn) + + logger.info(f"Queue: {queue}") + + if not queue: + return + + job = Job(job_id='vrs_map', redis=redis) + logger.info(f"checked for job: {job}") + if await job.status() is JobStatus.not_found: + logger.info("Job is none, attempting to enqueue new map variants job") + score_set_urn = queue.pop(0) + await redis.enqueue_job('map_variants_for_score_set', score_set_urn, _job_id='vrs_map') + else: + logger.info(f"Job status: {await job.status()}") + diff --git a/src/mavedb/worker/settings.py b/src/mavedb/worker/settings.py index 24ee5edc..de02d57c 100644 --- a/src/mavedb/worker/settings.py +++ b/src/mavedb/worker/settings.py @@ -1,16 +1,18 @@ import os +from concurrent import futures from typing import Callable from arq.connections import RedisSettings from arq import cron from mavedb.lib.logging.canonical import log_job -from mavedb.worker.jobs import create_variants_for_score_set, map_variants_for_score_set +from mavedb.worker.jobs import create_variants_for_score_set, map_variants_for_score_set, variant_mapper_manager from mavedb.db.session import SessionLocal from mavedb.data_providers.services import cdot_rest # ARQ requires at least one task on startup. -BACKGROUND_FUNCTIONS: list[Callable] = [create_variants_for_score_set, map_variants_for_score_set] +BACKGROUND_FUNCTIONS: list[Callable] = [create_variants_for_score_set, variant_mapper_manager, map_variants_for_score_set] +#BACKGROUND_CRONJOBS: list[Callable] = [cron(variant_mapper_manager, hour={range(0, 23)}, minute={0, 15, 30, 45})] BACKGROUND_CRONJOBS: list[Callable] = [] REDIS_IP = os.getenv("REDIS_IP") or "localhost" @@ -35,6 +37,8 @@ async def on_job_start(ctx): ctx["db"] = db ctx["hdp"] = cdot_rest() ctx["state"] = {} + ctx["mapping_queue"] = [] + ctx["pool"] = futures.ProcessPoolExecutor() async def on_job_end(ctx): @@ -57,3 +61,4 @@ class ArqWorkerSettings: cron_jobs: list = BACKGROUND_CRONJOBS job_timeout = 5 * 60 * 60 # Keep jobs alive for a long while... + keep_result = 0 # don't keep job results, in order to continuously enqueue new mapping jobs From 9a3f6b8f20d3e0c35c7cf40dbcedf60d4377c5a6 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 18 Jun 2024 15:31:08 -0700 Subject: [PATCH 04/54] Add temp api call to variant mapper manager for testing --- src/mavedb/routers/mapped_variant.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/mavedb/routers/mapped_variant.py b/src/mavedb/routers/mapped_variant.py index d1ff0d1d..c0811a2b 100644 --- a/src/mavedb/routers/mapped_variant.py +++ b/src/mavedb/routers/mapped_variant.py @@ -1,5 +1,6 @@ from typing import Any, List, Optional +from arq import ArqRedis from fastapi import APIRouter, Depends from fastapi.exceptions import HTTPException from sqlalchemy.orm import Session @@ -39,4 +40,13 @@ async def show_mapped_variant( Fetch a mapped variant by URN. """ - return await fetch_mapped_variant_by_variant_urn(db, urn) \ No newline at end of file + return await fetch_mapped_variant_by_variant_urn(db, urn) + +@router.post("/map/{urn}", status_code=200, responses={404: {}, 500: {}}) +async def map_score_set( + *, urn: str, worker: ArqRedis = Depends(deps.get_worker) +) -> Any: + await worker.enqueue_job( + "variant_mapper_manager", urn, + ) + \ No newline at end of file From db4c2d0f6eaa76bf02f10063fc4ef004dc78cbe5 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 18 Jun 2024 15:40:29 -0700 Subject: [PATCH 05/54] Fixed coroutine pool call to vrs map api --- src/mavedb/worker/jobs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 1a7997fd..1fbdde95 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -161,7 +161,7 @@ async def map_variants_for_score_set(ctx, score_set_urn: str): logger.info(f"map variants for score set has started for score set {score_set_urn}") - blocking = functools.partial(requests.post(f"http://dcd-mapping:8000/api/v1/map/{score_set_urn}")) + blocking = functools.partial(requests.post, f"http://dcd-mapping:8000/api/v1/map/{score_set_urn}") loop = asyncio.get_running_loop() response = await loop.run_in_executor(ctx["pool"], blocking) From 96bd1433967b9b8e3c23b8c6e8146ade28d599c1 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 20 Jun 2024 11:30:09 -0700 Subject: [PATCH 06/54] Remove mapping env file --- docker-compose-dev.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/docker-compose-dev.yml b/docker-compose-dev.yml index 93165694..294708d0 100644 --- a/docker-compose-dev.yml +++ b/docker-compose-dev.yml @@ -71,7 +71,6 @@ services: - seqrepo env_file: - settings/.env.dev - - settings/.env.dev.mapping ports: - "8004:8000" volumes: From 5afbb43ec1e96f0d6df24ab737387e770ea1801b Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 20 Jun 2024 11:31:23 -0700 Subject: [PATCH 07/54] wip: Refactor VRS Worker Manager --- src/mavedb/routers/mapped_variant.py | 29 +++++++---- src/mavedb/worker/jobs.py | 75 +++++++++++++++------------- src/mavedb/worker/settings.py | 19 ++++--- 3 files changed, 70 insertions(+), 53 deletions(-) diff --git a/src/mavedb/routers/mapped_variant.py b/src/mavedb/routers/mapped_variant.py index c0811a2b..3d292f12 100644 --- a/src/mavedb/routers/mapped_variant.py +++ b/src/mavedb/routers/mapped_variant.py @@ -12,6 +12,7 @@ from mavedb.models.variant import Variant from mavedb.view_models import mapped_variant + async def fetch_mapped_variant_by_variant_urn(db, urn: str) -> Optional[MappedVariant]: """ We may combine this function back to show_mapped_variant if none of any new function call it. @@ -23,30 +24,36 @@ async def fetch_mapped_variant_by_variant_urn(db, urn: str) -> Optional[MappedVa user. """ try: - item = db.query(MappedVariant).filter(Variant.urn == urn).filter(MappedVariant.variant_id == Variant.id).one_or_none() + item = ( + db.query(MappedVariant) + .filter(Variant.urn == urn) + .filter(MappedVariant.variant_id == Variant.id) + .one_or_none() + ) except MultipleResultsFound: raise HTTPException(status_code=500, detail=f"Multiple variants with URN {urn} were found.") if not item: raise HTTPException(status_code=404, detail=f"Mapped variant with URN {urn} not found") return item -router = APIRouter(prefix="/api/v1/mapped-variants", tags=["mapped variants"], responses={404: {"description": "Not found"}}) + +router = APIRouter( + prefix="/api/v1/mapped-variants", tags=["mapped variants"], responses={404: {"description": "Not found"}} +) + @router.get("/{urn}", status_code=200, response_model=mapped_variant.MappedVariant, responses={404: {}, 500: {}}) -async def show_mapped_variant( - *, urn: str, db: Session = Depends(deps.get_db) -) -> Any: +async def show_mapped_variant(*, urn: str, db: Session = Depends(deps.get_db)) -> Any: """ Fetch a mapped variant by URN. """ return await fetch_mapped_variant_by_variant_urn(db, urn) + @router.post("/map/{urn}", status_code=200, responses={404: {}, 500: {}}) -async def map_score_set( - *, urn: str, worker: ArqRedis = Depends(deps.get_worker) -) -> Any: +async def map_score_set(*, urn: str, worker: ArqRedis = Depends(deps.get_worker)) -> Any: + await worker.lpush("mapping_queue", urn) # type: ignore await worker.enqueue_job( - "variant_mapper_manager", urn, - ) - \ No newline at end of file + "variant_mapper_manager", + ) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 1fbdde95..2164b864 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -2,6 +2,7 @@ import functools import logging import requests +from datetime import timedelta from typing import Optional import pandas as pd @@ -57,9 +58,7 @@ async def create_variants_for_score_set( hdp: RESTDataProvider = ctx["hdp"] redis: ArqRedis = ctx["redis"] - score_set = db.scalars( - select(ScoreSet).where(ScoreSet.urn == score_set_urn) - ).one() + score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() updated_by = db.scalars(select(User).where(User.id == updater_id)).one() score_set.modified_by = updated_by @@ -145,7 +144,9 @@ async def create_variants_for_score_set( logging_context["processing_state"] = score_set.processing_state.name logger.info(msg="Finished creating variants in score set.", extra=logging_context) - await redis.enqueue_job('variant_mapper_manager', score_set_urn) + + await redis.lpush("mapping_queue", score_set_urn) # type: ignore + await redis.enqueue_job("variant_mapper_manager") finally: db.add(score_set) db.commit() @@ -159,57 +160,63 @@ async def create_variants_for_score_set( async def map_variants_for_score_set(ctx, score_set_urn: str): db: Session = ctx["db"] - logger.info(f"map variants for score set has started for score set {score_set_urn}") + logger.info(f"Started variant mapping for score set: {score_set_urn}") + # Do not block Worker event loop during mapping, see: https://arq-docs.helpmanual.io/#synchronous-jobs. blocking = functools.partial(requests.post, f"http://dcd-mapping:8000/api/v1/map/{score_set_urn}") loop = asyncio.get_running_loop() response = await loop.run_in_executor(ctx["pool"], blocking) - #response = requests.post(f"http://dcd-mapping:8000/api/v1/map/{score_set_urn}") - response.raise_for_status() - mapping_results = response.json() + try: + response.raise_for_status() + mapping_results = response.json() + except requests.HTTPError as e: + logger.error( + f"Encountered an exception while mapping variants for {score_set_urn}", + exc_info=e, + ) + + logger.debug("Done mapping variants.") if mapping_results: - existing_variants = select(Variant.id).join(ScoreSet).where(ScoreSet.urn == score_set_urn).subquery() + existing_variants = select(Variant.id).join(ScoreSet).where(ScoreSet.urn == score_set_urn) db.execute(delete(MappedVariant).where(MappedVariant.variant_id.in_(existing_variants))) + logger.debug("Removed existing mapped variants for this score set.") for mapped_score in mapping_results["mapped_scores"]: variant_urn = mapped_score["mavedb_id"] variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() + mapped_variant = MappedVariant( - pre_mapped = mapped_score["pre_mapped_2_0"], - post_mapped = mapped_score["post_mapped_2_0"], - variant_id = variant.id + pre_mapped=mapped_score["pre_mapped_2_0"], + post_mapped=mapped_score["post_mapped_2_0"], + variant_id=variant.id, ) db.add(mapped_variant) - + + logger.info(f"Inserted {len(mapping_results['mapped_scores'])} mapped variants.") db.commit() -async def map_variants_async(ctx, score_set_urn: str): - blocking = functools.partial(map_variants_for_score_set, ctx["db"], score_set_urn) - loop = asyncio.get_running_loop() - return await loop.run_in_executor(ctx["pool"], blocking) +async def variant_mapper_manager(ctx: dict) -> Optional[Job]: + logger.debug("Variant mapping manager began execution") + redis: ArqRedis = ctx["redis"] + queue_length = await redis.llen("mapping_queue") # type:ignore + if queue_length == 0: + logger.debug("No mapping jobs exist in the queue.") + return None -async def variant_mapper_manager(ctx, score_set_urn: Optional[str]): - queue: list[str] = ctx["mapping_queue"] - redis: ArqRedis = ctx["redis"] + logger.debug(f"{queue_length + 1} mapping job(s) are queued.") - if score_set_urn: - queue.append(score_set_urn) - - logger.info(f"Queue: {queue}") - - if not queue: - return - - job = Job(job_id='vrs_map', redis=redis) - logger.info(f"checked for job: {job}") + job = Job(job_id="vrs_map", redis=redis) if await job.status() is JobStatus.not_found: - logger.info("Job is none, attempting to enqueue new map variants job") - score_set_urn = queue.pop(0) - await redis.enqueue_job('map_variants_for_score_set', score_set_urn, _job_id='vrs_map') + logger.info("No mapping jobs are running, queuing a new one.") + queued_urn = await redis.rpop("mapping_queue") # type:ignore + return await redis.enqueue_job("map_variants_for_score_set", queued_urn, _job_id="vrs_map") else: - logger.info(f"Job status: {await job.status()}") + logger.debug("A mapping job is already running, deferring mapping by 5 minutes.") + # Our persistent Redis queue and ARQs execution rules ensure that even if the worker is stopped and not restarted + # before the deferred time, these deferred jobs will still run once able. + return await redis.enqueue_job("variant_mapper_manager", _defer_by=timedelta(minutes=5)) diff --git a/src/mavedb/worker/settings.py b/src/mavedb/worker/settings.py index de02d57c..f62efe24 100644 --- a/src/mavedb/worker/settings.py +++ b/src/mavedb/worker/settings.py @@ -3,7 +3,8 @@ from typing import Callable from arq.connections import RedisSettings -from arq import cron +from arq.cron import CronJob +from arq import ArqRedis, cron from mavedb.lib.logging.canonical import log_job from mavedb.worker.jobs import create_variants_for_score_set, map_variants_for_score_set, variant_mapper_manager @@ -11,9 +12,12 @@ from mavedb.data_providers.services import cdot_rest # ARQ requires at least one task on startup. -BACKGROUND_FUNCTIONS: list[Callable] = [create_variants_for_score_set, variant_mapper_manager, map_variants_for_score_set] -#BACKGROUND_CRONJOBS: list[Callable] = [cron(variant_mapper_manager, hour={range(0, 23)}, minute={0, 15, 30, 45})] -BACKGROUND_CRONJOBS: list[Callable] = [] +BACKGROUND_FUNCTIONS: list[Callable] = [ + create_variants_for_score_set, + variant_mapper_manager, + map_variants_for_score_set, +] +BACKGROUND_CRONJOBS: list[CronJob] = [] REDIS_IP = os.getenv("REDIS_IP") or "localhost" REDIS_PORT = int(os.getenv("REDIS_PORT") or 6379) @@ -24,7 +28,8 @@ async def startup(ctx): - pass + ctx["mapping_queue"] = [] + ctx["pool"] = futures.ProcessPoolExecutor() async def shutdown(ctx): @@ -37,8 +42,6 @@ async def on_job_start(ctx): ctx["db"] = db ctx["hdp"] = cdot_rest() ctx["state"] = {} - ctx["mapping_queue"] = [] - ctx["pool"] = futures.ProcessPoolExecutor() async def on_job_end(ctx): @@ -61,4 +64,4 @@ class ArqWorkerSettings: cron_jobs: list = BACKGROUND_CRONJOBS job_timeout = 5 * 60 * 60 # Keep jobs alive for a long while... - keep_result = 0 # don't keep job results, in order to continuously enqueue new mapping jobs + keep_result = 0 # don't keep job results, in order to continuously enqueue new mapping jobs From 820d1778020e64754c3107e2e3b50aa3ece74be0 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 20 Jun 2024 15:09:00 -0700 Subject: [PATCH 08/54] Make Queue name global variable --- src/mavedb/routers/mapped_variant.py | 3 ++- src/mavedb/worker/jobs.py | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/mavedb/routers/mapped_variant.py b/src/mavedb/routers/mapped_variant.py index 3d292f12..b2237fa3 100644 --- a/src/mavedb/routers/mapped_variant.py +++ b/src/mavedb/routers/mapped_variant.py @@ -11,6 +11,7 @@ from mavedb.models.mapped_variant import MappedVariant from mavedb.models.variant import Variant from mavedb.view_models import mapped_variant +from mavedb.worker.jobs import MAPPING_QUEUE_NAME async def fetch_mapped_variant_by_variant_urn(db, urn: str) -> Optional[MappedVariant]: @@ -53,7 +54,7 @@ async def show_mapped_variant(*, urn: str, db: Session = Depends(deps.get_db)) - @router.post("/map/{urn}", status_code=200, responses={404: {}, 500: {}}) async def map_score_set(*, urn: str, worker: ArqRedis = Depends(deps.get_worker)) -> Any: - await worker.lpush("mapping_queue", urn) # type: ignore + await worker.lpush(MAPPING_QUEUE_NAME, urn) # type: ignore await worker.enqueue_job( "variant_mapper_manager", ) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 2164b864..7bd61769 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -31,6 +31,7 @@ logger = logging.getLogger(__name__) +MAPPING_QUEUE_NAME = "vrs_mapping_queue" def setup_job_state(ctx, invoker: int, resource: str, correlation_id: str): ctx["state"][ctx["job_id"]] = { @@ -144,8 +145,7 @@ async def create_variants_for_score_set( logging_context["processing_state"] = score_set.processing_state.name logger.info(msg="Finished creating variants in score set.", extra=logging_context) - - await redis.lpush("mapping_queue", score_set_urn) # type: ignore + await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore await redis.enqueue_job("variant_mapper_manager") finally: db.add(score_set) @@ -170,7 +170,7 @@ async def map_variants_for_score_set(ctx, score_set_urn: str): try: response.raise_for_status() mapping_results = response.json() - except requests.HTTPError as e: + except requests.exceptions.HTTPError as e: logger.error( f"Encountered an exception while mapping variants for {score_set_urn}", exc_info=e, @@ -202,7 +202,7 @@ async def variant_mapper_manager(ctx: dict) -> Optional[Job]: logger.debug("Variant mapping manager began execution") redis: ArqRedis = ctx["redis"] - queue_length = await redis.llen("mapping_queue") # type:ignore + queue_length = await redis.llen(MAPPING_QUEUE_NAME) # type:ignore if queue_length == 0: logger.debug("No mapping jobs exist in the queue.") return None @@ -212,11 +212,11 @@ async def variant_mapper_manager(ctx: dict) -> Optional[Job]: job = Job(job_id="vrs_map", redis=redis) if await job.status() is JobStatus.not_found: logger.info("No mapping jobs are running, queuing a new one.") - queued_urn = await redis.rpop("mapping_queue") # type:ignore + queued_urn = await redis.rpop(MAPPING_QUEUE_NAME) # type:ignore return await redis.enqueue_job("map_variants_for_score_set", queued_urn, _job_id="vrs_map") else: logger.debug("A mapping job is already running, deferring mapping by 5 minutes.") - # Our persistent Redis queue and ARQs execution rules ensure that even if the worker is stopped and not restarted + # Our persistent Redis queue and ARQ's execution rules ensure that even if the worker is stopped and not restarted # before the deferred time, these deferred jobs will still run once able. return await redis.enqueue_job("variant_mapper_manager", _defer_by=timedelta(minutes=5)) From 9e5587dad2da23beb2e897217246afee43ef6b59 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 20 Jun 2024 16:02:52 -0700 Subject: [PATCH 09/54] Add Redis Entry to Worker Context --- tests/conftest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index fe03b0b4..0aad5398 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,7 +26,7 @@ from mavedb.lib.authentication import get_current_user, UserData from mavedb.models.user import User from mavedb.server_main import app -from mavedb.worker.jobs import create_variants_for_score_set +from mavedb.worker.jobs import create_variants_for_score_set, map_variants_for_score_set, variant_mapper_manager sys.path.append(".") @@ -152,7 +152,7 @@ async def on_job(ctx): ctx["hdp"] = data_provider worker_ = Worker( - functions=[create_variants_for_score_set], + functions=[create_variants_for_score_set, map_variants_for_score_set, variant_mapper_manager], redis_pool=arq_redis, burst=True, poll_delay=0, @@ -168,8 +168,8 @@ async def on_job(ctx): @pytest.fixture -def standalone_worker_context(session, data_provider): - yield {"db": session, "hdp": data_provider, "state": {}, "job_id": "test_job"} +def standalone_worker_context(session, data_provider, arq_redis): + yield {"db": session, "hdp": data_provider, "state": {}, "job_id": "test_job", "redis": arq_redis} @pytest.fixture() From 4c548b52776f52608dbcf77f82d89dab6e93d130 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 27 Jun 2024 08:40:46 -0700 Subject: [PATCH 10/54] wip: Integration tests for VRS Mapping --- src/mavedb/data_providers/services.py | 22 +++ src/mavedb/worker/jobs.py | 11 +- tests/conftest.py | 3 +- tests/helpers/constants.py | 12 ++ tests/worker/test_jobs.py | 205 +++++++++++++++++++++++++- 5 files changed, 246 insertions(+), 7 deletions(-) diff --git a/src/mavedb/data_providers/services.py b/src/mavedb/data_providers/services.py index eb13922f..dbec73d6 100644 --- a/src/mavedb/data_providers/services.py +++ b/src/mavedb/data_providers/services.py @@ -1,3 +1,6 @@ +import requests +from typing import Optional, Union + from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider GENOMIC_FASTA_FILES = [ @@ -5,6 +8,8 @@ "/data/GCF_000001405.25_GRCh37.p13_genomic.fna.gz", ] +DCD_MAP_URL = "http://dcd-mapping:8000" + def seqfetcher() -> ChainedSeqFetcher: return ChainedSeqFetcher(*[FastaSeqFetcher(file) for file in GENOMIC_FASTA_FILES]) @@ -12,3 +17,20 @@ def seqfetcher() -> ChainedSeqFetcher: def cdot_rest() -> RESTDataProvider: return RESTDataProvider(seqfetcher=seqfetcher()) + + +class VRSMap: + url: str + + def __init__(self, url: str) -> None: + self.url = url + + def map_score_set(self, score_set_urn: str) -> dict[str, Union[dict, list]]: + uri = f"{self.url}/api/v1/map/{score_set_urn}" + response = requests.post(uri) + response.raise_for_status() + return response.json() + + +def vrs_mapper(url: Optional[str] = None) -> VRSMap: + return VRSMap(DCD_MAP_URL) if not url else VRSMap(url) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 7bd61769..c3c2b1e9 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -28,6 +28,7 @@ from mavedb.models.score_set import ScoreSet from mavedb.models.user import User from mavedb.models.variant import Variant +from mavedb.data_providers.services import vrs_mapper logger = logging.getLogger(__name__) @@ -163,18 +164,18 @@ async def map_variants_for_score_set(ctx, score_set_urn: str): logger.info(f"Started variant mapping for score set: {score_set_urn}") # Do not block Worker event loop during mapping, see: https://arq-docs.helpmanual.io/#synchronous-jobs. - blocking = functools.partial(requests.post, f"http://dcd-mapping:8000/api/v1/map/{score_set_urn}") + vrs = vrs_mapper() + blocking = functools.partial(vrs.map_score_set, score_set_urn) loop = asyncio.get_running_loop() - response = await loop.run_in_executor(ctx["pool"], blocking) try: - response.raise_for_status() - mapping_results = response.json() + mapping_results = await loop.run_in_executor(ctx["pool"], blocking) except requests.exceptions.HTTPError as e: logger.error( f"Encountered an exception while mapping variants for {score_set_urn}", exc_info=e, ) + return logger.debug("Done mapping variants.") @@ -207,7 +208,7 @@ async def variant_mapper_manager(ctx: dict) -> Optional[Job]: logger.debug("No mapping jobs exist in the queue.") return None - logger.debug(f"{queue_length + 1} mapping job(s) are queued.") + logger.debug(f"{queue_length} mapping job(s) are queued.") job = Job(job_id="vrs_map", redis=redis) if await job.status() is JobStatus.not_found: diff --git a/tests/conftest.py b/tests/conftest.py index 0aad5398..954903d9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ import os import sys +from concurrent import futures from inspect import getsourcefile from os.path import abspath from unittest.mock import patch @@ -169,7 +170,7 @@ async def on_job(ctx): @pytest.fixture def standalone_worker_context(session, data_provider, arq_redis): - yield {"db": session, "hdp": data_provider, "state": {}, "job_id": "test_job", "redis": arq_redis} + yield {"db": session, "hdp": data_provider, "state": {}, "job_id": "test_job", "redis": arq_redis, "pool": futures.ProcessPoolExecutor()} @pytest.fixture() diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index f15b6284..41305286 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -523,3 +523,15 @@ } }, } + + +TEST_VARIANT_MAPPING_SCAFFOLD = { + "metadata": {}, + "computed_reference_sequence": {"sequence_type": "dna", "sequence_id": "ga4gh:SQ.ref_test", "sequence": "ACGTTT"}, + "mapped_reference_sequence": { + "sequence_type": "dna", + "sequence_id": "ga4gh:SQ.map_test", + "sequence_accessions": ["NC_000001.11"], + }, + "mapped_scores": [], +} diff --git a/tests/worker/test_jobs.py b/tests/worker/test_jobs.py index 4be3fcf1..f78573f5 100644 --- a/tests/worker/test_jobs.py +++ b/tests/worker/test_jobs.py @@ -1,7 +1,11 @@ +import requests +import requests_mock from copy import deepcopy from uuid import uuid4 from unittest.mock import patch +from concurrent import futures +import arq.jobs import cdot.hgvs.dataproviders import jsonschema import pandas as pd @@ -12,9 +16,15 @@ from mavedb.models.enums.processing_state import ProcessingState from mavedb.models.score_set import ScoreSet as ScoreSetDbModel from mavedb.models.variant import Variant +from mavedb.models.mapped_variant import MappedVariant from mavedb.view_models.experiment import Experiment, ExperimentCreate from mavedb.view_models.score_set import ScoreSet, ScoreSetCreate -from mavedb.worker.jobs import create_variants_for_score_set +from mavedb.worker.jobs import ( + create_variants_for_score_set, + map_variants_for_score_set, + variant_mapper_manager, + MAPPING_QUEUE_NAME, +) from sqlalchemy import select from tests.helpers.constants import ( @@ -22,6 +32,7 @@ TEST_MINIMAL_ACC_SCORESET, TEST_MINIMAL_EXPERIMENT, TEST_MINIMAL_SEQ_SCORESET, + TEST_VARIANT_MAPPING_SCAFFOLD, ) @@ -53,6 +64,38 @@ async def setup_records_and_files(async_client, data_files, input_score_set): return score_set["urn"], scores, counts +async def setup_records_files_and_variants(async_client, data_files, input_score_set, worker_ctx): + urn, scores, counts = await setup_records_and_files(async_client, data_files, input_score_set) + + # Patch CDOT `_get_transcript`, in the event this function is called on an accesssion based scoreset. + with patch.object(cdot.hgvs.dataproviders.RESTDataProvider, "_get_transcript", return_value=TEST_CDOT_TRANSCRIPT): + score_set = await create_variants_for_score_set(worker_ctx, urn, 1, scores, counts) + + assert score_set.processing_state is ProcessingState.success + assert score_set.num_variants == 3 + + return score_set + + +async def setup_mapping_output(async_client, session, score_set): + score_set_response = await async_client.get(f"/api/v1/score-set/{score_set.urn}") + + mapping_output = TEST_VARIANT_MAPPING_SCAFFOLD.copy() + mapping_output["metadata"] = score_set_response.json() + + variants = session.scalars(select(Variant).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn)).all() + for variant in variants: + mapped_score = { + "pre_mapped": {"test": "pre_mapped_output"}, + "post_mapped": {"test": "post_mapped_output"}, + "mavedb_id": variant.urn, + } + + mapping_output["mapped_scores"].append(mapped_score) + + return mapping_output + + @pytest.mark.asyncio @pytest.mark.parametrize( "input_score_set,validation_error", @@ -299,3 +342,163 @@ async def test_create_variants_for_score_set( # Have to commit at the end of async tests for DB threads to be released. Otherwise pytest # thinks we are still using the session fixture and will hang indefinitely. session.commit() + + +# NOTE: These tests operate under the assumption that mapping output is consistent between accession based and sequence based score sets. If +# this assumption changes in the future, tests reflecting this difference in output should be added for accession based score sets. + + +@pytest.mark.asyncio +@pytest.mark.skip +async def test_create_mapped_variants_for_scoreset( + setup_worker_db, + async_client, + standalone_worker_context, + session, + data_files, +): + score_set = await setup_records_files_and_variants( + async_client, data_files, TEST_MINIMAL_SEQ_SCORESET, standalone_worker_context + ) + + test_mapping_output_for_score_set = await setup_mapping_output(async_client, session, score_set) + + # TODO: How can we mock a functools.partial object running in another event loop? Do we require + # some sort of wrapper object? Must we context manage within this other event loop? + with requests_mock.Mocker() as m: + m.post( + f"http://dcd-mapping:8000/api/v1/map/{score_set.urn}", + json=test_mapping_output_for_score_set, + ) + + await map_variants_for_score_set(standalone_worker_context, score_set.urn) + assert m.called + + mapped_variants_for_score_set = session.scalars( + select(MappedVariant).join(Variant).join(ScoreSet).filter(ScoreSet.urn == score_set.urn) + ).all() + assert len(mapped_variants_for_score_set) == score_set.num_variants + + +@pytest.mark.asyncio +@pytest.mark.skip +async def test_create_mapped_variants_for_scoreset_with_existing_mapped_variants( + setup_worker_db, async_client, standalone_worker_context, session, data_files +): + score_set = await setup_records_files_and_variants( + async_client, data_files, TEST_MINIMAL_SEQ_SCORESET, standalone_worker_context + ) + + test_mapping_output_for_score_set = await setup_mapping_output(async_client, session, score_set) + + # TODO: How can we mock a functools.partial object running in another event loop? Do we require + # some sort of wrapper object? Must we context manage within this other event loop? + with requests_mock.mock() as m: + m.post( + f"http://dcd-mapping:8000/api/v1/map/{score_set.urn}", + json=test_mapping_output_for_score_set, + ) + + session.add( + MappedVariant(pre_mapped={"preexisting": "variant"}, post_mapped={"preexisting": "variant"}, variant_id=1) + ) + session.commit() + + await map_variants_for_score_set(standalone_worker_context, score_set.urn) + assert m.called + + mapped_variants_for_score_set = session.scalars( + select(MappedVariant).join(Variant).join(ScoreSet).filter(ScoreSet.urn == score_set.urn) + ).all() + assert len(mapped_variants_for_score_set) == score_set.num_variants + + +@pytest.mark.asyncio +@pytest.mark.skip +async def test_create_mapped_variants_for_scoreset_mapping_exception( + input_score_set, setup_worker_db, async_client, standalone_worker_context, session, data_files +): + score_set = await setup_records_files_and_variants( + async_client, data_files, TEST_MINIMAL_SEQ_SCORESET, standalone_worker_context + ) + + # TODO: How can we mock a functools.partial object running in another event loop? Do we require + # some sort of wrapper object? Must we context manage within this other event loop? + with requests_mock.mock() as m: + m.post( + f"http://dcd-mapping:8000/api/v1/map/{score_set.urn}", + exc=requests.exceptions.HTTPError, + ) + + await map_variants_for_score_set(standalone_worker_context, score_set.urn) + assert m.called + + # TODO: How are errors persisted? Test persistence mechanism. + mapped_variants_for_score_set = session.scalars( + select(MappedVariant).join(Variant).join(ScoreSet).filter(ScoreSet.urn == score_set.urn) + ).all() + assert len(mapped_variants_for_score_set) == 0 + + +@pytest.mark.asyncio +@pytest.mark.skip +async def test_create_mapped_variants_for_scoreset_no_mapping_output( + input_score_set, setup_worker_db, async_client, standalone_worker_context, session, data_files +): + score_set = await setup_records_files_and_variants( + async_client, data_files, TEST_MINIMAL_SEQ_SCORESET, standalone_worker_context + ) + + # TODO: How can we mock a functools.partial object running in another event loop? Do we require + # some sort of wrapper object? Must we context manage within this other event loop? + test_mapping_output_for_score_set = await setup_mapping_output(async_client, session, score_set) + test_mapping_output_for_score_set["mapped_scores"] = [] + + with requests_mock.mock() as m: + m.post( + f"http://dcd-mapping:8000/api/v1/map/{score_set.urn}", + json=test_mapping_output_for_score_set, + ) + + await map_variants_for_score_set(standalone_worker_context, score_set.urn) + assert m.called + + mapped_variants_for_score_set = session.scalars( + select(MappedVariant).join(Variant).join(ScoreSet).filter(ScoreSet.urn == score_set.urn) + ).all() + assert len(mapped_variants_for_score_set) == 0 + + +@pytest.mark.asyncio +async def test_mapping_manager_empty_queue(setup_worker_db, standalone_worker_context, session): + queued_job = await variant_mapper_manager(standalone_worker_context) + + # No new jobs should have been created if nothing is in the queue. + assert queued_job is None + session.commit() + + +@pytest.mark.asyncio +async def test_mapping_manager_occupied_queue_mapping_in_progress(setup_worker_db, standalone_worker_context, session): + await standalone_worker_context["redis"].lpush(MAPPING_QUEUE_NAME, "mavedb:test-urn") + + with patch.object(arq.jobs.Job, "status", return_value=arq.jobs.JobStatus.in_progress): + queued_job = await variant_mapper_manager(standalone_worker_context) + + # Execution should be deferred if a job is in progress. + assert await queued_job.status() is arq.jobs.JobStatus.deferred + session.commit() + + +@pytest.mark.asyncio +async def test_mapping_manager_occupied_queue_mapping_not_in_progress( + setup_worker_db, standalone_worker_context, session +): + await standalone_worker_context["redis"].lpush(MAPPING_QUEUE_NAME, "mavedb:test-urn") + + with patch.object(arq.jobs.Job, "status", return_value=arq.jobs.JobStatus.not_found): + queued_job = await variant_mapper_manager(standalone_worker_context) + + # VRS Mapping jobs have the same ID. + assert queued_job.job_id == "vrs_map" + session.commit() From 1c226c4d4694d207962e30e625d634673c5f8ca9 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 27 Jun 2024 11:02:48 -0700 Subject: [PATCH 11/54] Workaround for Testing Pooled Processes --- tests/worker/test_jobs.py | 134 +++++++++++++++++++++++--------------- 1 file changed, 80 insertions(+), 54 deletions(-) diff --git a/tests/worker/test_jobs.py b/tests/worker/test_jobs.py index f78573f5..4a892f80 100644 --- a/tests/worker/test_jobs.py +++ b/tests/worker/test_jobs.py @@ -1,15 +1,15 @@ -import requests -import requests_mock +from asyncio.unix_events import _UnixSelectorEventLoop from copy import deepcopy +from requests import HTTPError from uuid import uuid4 from unittest.mock import patch -from concurrent import futures import arq.jobs import cdot.hgvs.dataproviders import jsonschema import pandas as pd import pytest + from mavedb.lib.mave.constants import HGVS_NT_COLUMN from mavedb.lib.score_sets import csv_data_to_df from mavedb.lib.validation.exceptions import ValidationError @@ -77,17 +77,22 @@ async def setup_records_files_and_variants(async_client, data_files, input_score return score_set -async def setup_mapping_output(async_client, session, score_set): +async def setup_mapping_output(async_client, session, score_set, empty=False): score_set_response = await async_client.get(f"/api/v1/score-set/{score_set.urn}") - mapping_output = TEST_VARIANT_MAPPING_SCAFFOLD.copy() + mapping_output = deepcopy(TEST_VARIANT_MAPPING_SCAFFOLD) mapping_output["metadata"] = score_set_response.json() + if empty: + return mapping_output + variants = session.scalars(select(Variant).join(ScoreSetDbModel).where(ScoreSetDbModel.urn == score_set.urn)).all() for variant in variants: mapped_score = { "pre_mapped": {"test": "pre_mapped_output"}, + "pre_mapped_2_0": {"test": "pre_mapped_output (2.0)"}, "post_mapped": {"test": "post_mapped_output"}, + "post_mapped_2_0": {"test": "post_mapped_output (2.0)"}, "mavedb_id": variant.urn, } @@ -349,7 +354,6 @@ async def test_create_variants_for_score_set( @pytest.mark.asyncio -@pytest.mark.skip async def test_create_mapped_variants_for_scoreset( setup_worker_db, async_client, @@ -361,27 +365,29 @@ async def test_create_mapped_variants_for_scoreset( async_client, data_files, TEST_MINIMAL_SEQ_SCORESET, standalone_worker_context ) - test_mapping_output_for_score_set = await setup_mapping_output(async_client, session, score_set) - - # TODO: How can we mock a functools.partial object running in another event loop? Do we require - # some sort of wrapper object? Must we context manage within this other event loop? - with requests_mock.Mocker() as m: - m.post( - f"http://dcd-mapping:8000/api/v1/map/{score_set.urn}", - json=test_mapping_output_for_score_set, - ) + # Do not await, we need a co-routine object to be the return value of our `run_in_executor` mock. + mapping_test_output_for_score_set = setup_mapping_output(async_client, session, score_set) + # We seem unable to mock requests via requests_mock that occur inside another event loop. Workaround + # this limitation by instead patching the _UnixSelectorEventLoop 's executor function, with a coroutine + # object that sets up test mappingn output. + # + # TODO: Does this work on non-unix based machines. + # TODO: Is it even a safe operation to patch this event loop method? + with patch.object(_UnixSelectorEventLoop, "run_in_executor", return_value=mapping_test_output_for_score_set): await map_variants_for_score_set(standalone_worker_context, score_set.urn) - assert m.called mapped_variants_for_score_set = session.scalars( - select(MappedVariant).join(Variant).join(ScoreSet).filter(ScoreSet.urn == score_set.urn) + select(MappedVariant).join(Variant).join(ScoreSetDbModel).filter(ScoreSetDbModel.urn == score_set.urn) ).all() assert len(mapped_variants_for_score_set) == score_set.num_variants + # Have to commit at the end of async tests for DB threads to be released. Otherwise pytest + # thinks we are still using the session fixture and will hang indefinitely. + session.commit() + @pytest.mark.asyncio -@pytest.mark.skip async def test_create_mapped_variants_for_scoreset_with_existing_mapped_variants( setup_worker_db, async_client, standalone_worker_context, session, data_files ): @@ -389,85 +395,105 @@ async def test_create_mapped_variants_for_scoreset_with_existing_mapped_variants async_client, data_files, TEST_MINIMAL_SEQ_SCORESET, standalone_worker_context ) - test_mapping_output_for_score_set = await setup_mapping_output(async_client, session, score_set) + # Do not await, we need a co-routine object to be the return value of our `run_in_executor` mock. + mapping_test_output_for_score_set = setup_mapping_output(async_client, session, score_set) - # TODO: How can we mock a functools.partial object running in another event loop? Do we require - # some sort of wrapper object? Must we context manage within this other event loop? - with requests_mock.mock() as m: - m.post( - f"http://dcd-mapping:8000/api/v1/map/{score_set.urn}", - json=test_mapping_output_for_score_set, - ) + # We seem unable to mock requests via requests_mock that occur inside another event loop. Workaround + # this limitation by instead patching the _UnixSelectorEventLoop 's executor function, with a coroutine + # object that sets up test mappingn output. + # + # TODO: Does this work on non-unix based machines. + # TODO: Is it even a safe operation to patch this event loop method? + with patch.object(_UnixSelectorEventLoop, "run_in_executor", return_value=mapping_test_output_for_score_set): + existing_variant = session.scalars(select(Variant)).first() + + if not existing_variant: + raise ValueError session.add( - MappedVariant(pre_mapped={"preexisting": "variant"}, post_mapped={"preexisting": "variant"}, variant_id=1) + MappedVariant( + pre_mapped={"preexisting": "variant"}, + post_mapped={"preexisting": "variant"}, + variant_id=existing_variant.id, + ) ) session.commit() await map_variants_for_score_set(standalone_worker_context, score_set.urn) - assert m.called mapped_variants_for_score_set = session.scalars( - select(MappedVariant).join(Variant).join(ScoreSet).filter(ScoreSet.urn == score_set.urn) + select(MappedVariant).join(Variant).join(ScoreSetDbModel).filter(ScoreSetDbModel.urn == score_set.urn) ).all() assert len(mapped_variants_for_score_set) == score_set.num_variants + # Have to commit at the end of async tests for DB threads to be released. Otherwise pytest + # thinks we are still using the session fixture and will hang indefinitely. + session.commit() + @pytest.mark.asyncio -@pytest.mark.skip async def test_create_mapped_variants_for_scoreset_mapping_exception( - input_score_set, setup_worker_db, async_client, standalone_worker_context, session, data_files + setup_worker_db, async_client, standalone_worker_context, session, data_files ): + async def awaitable_http_error(): + raise HTTPError + score_set = await setup_records_files_and_variants( async_client, data_files, TEST_MINIMAL_SEQ_SCORESET, standalone_worker_context ) - # TODO: How can we mock a functools.partial object running in another event loop? Do we require - # some sort of wrapper object? Must we context manage within this other event loop? - with requests_mock.mock() as m: - m.post( - f"http://dcd-mapping:8000/api/v1/map/{score_set.urn}", - exc=requests.exceptions.HTTPError, - ) + # Do not await, we need a co-routine object which raises an http error once awaited. + mapping_test_output_for_score_set = awaitable_http_error() + # We seem unable to mock requests via requests_mock that occur inside another event loop. Workaround + # this limitation by instead patching the _UnixSelectorEventLoop 's executor function, with a coroutine + # object that sets up test mappingn output. + # + # TODO: Does this work on non-unix based machines? + # TODO: Is it even a safe operation to patch this event loop method? + with patch.object(_UnixSelectorEventLoop, "run_in_executor", return_value=mapping_test_output_for_score_set): await map_variants_for_score_set(standalone_worker_context, score_set.urn) - assert m.called # TODO: How are errors persisted? Test persistence mechanism. mapped_variants_for_score_set = session.scalars( - select(MappedVariant).join(Variant).join(ScoreSet).filter(ScoreSet.urn == score_set.urn) + select(MappedVariant).join(Variant).join(ScoreSetDbModel).filter(ScoreSetDbModel.urn == score_set.urn) ).all() assert len(mapped_variants_for_score_set) == 0 + # Have to commit at the end of async tests for DB threads to be released. Otherwise pytest + # thinks we are still using the session fixture and will hang indefinitely. + session.commit() + @pytest.mark.asyncio -@pytest.mark.skip async def test_create_mapped_variants_for_scoreset_no_mapping_output( - input_score_set, setup_worker_db, async_client, standalone_worker_context, session, data_files + setup_worker_db, async_client, standalone_worker_context, session, data_files ): score_set = await setup_records_files_and_variants( async_client, data_files, TEST_MINIMAL_SEQ_SCORESET, standalone_worker_context ) - # TODO: How can we mock a functools.partial object running in another event loop? Do we require - # some sort of wrapper object? Must we context manage within this other event loop? - test_mapping_output_for_score_set = await setup_mapping_output(async_client, session, score_set) - test_mapping_output_for_score_set["mapped_scores"] = [] - - with requests_mock.mock() as m: - m.post( - f"http://dcd-mapping:8000/api/v1/map/{score_set.urn}", - json=test_mapping_output_for_score_set, - ) + # Do not await, we need a co-routine object to be the return value of our `run_in_executor` mock. + mapping_test_output_for_score_set = setup_mapping_output(async_client, session, score_set, empty=True) + # We seem unable to mock requests via requests_mock that occur inside another event loop. Workaround + # this limitation by instead patching the _UnixSelectorEventLoop 's executor function, with a coroutine + # object that sets up test mappingn output. + # + # TODO: Does this work on non-unix based machines. + # TODO: Is it even a safe operation to patch this event loop method? + with patch.object(_UnixSelectorEventLoop, "run_in_executor", return_value=mapping_test_output_for_score_set): await map_variants_for_score_set(standalone_worker_context, score_set.urn) - assert m.called mapped_variants_for_score_set = session.scalars( - select(MappedVariant).join(Variant).join(ScoreSet).filter(ScoreSet.urn == score_set.urn) + select(MappedVariant).join(Variant).join(ScoreSetDbModel).filter(ScoreSetDbModel.urn == score_set.urn) ).all() assert len(mapped_variants_for_score_set) == 0 + # Have to commit at the end of async tests for DB threads to be released. Otherwise pytest + # thinks we are still using the session fixture and will hang indefinitely. + session.commit() + @pytest.mark.asyncio async def test_mapping_manager_empty_queue(setup_worker_db, standalone_worker_context, session): From fc0310a07bb40ecd506054e4c1a283a6e738b0e6 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Tue, 20 Aug 2024 10:49:26 -0700 Subject: [PATCH 12/54] Update mapped variant model for error reporting and versioning --- src/mavedb/models/mapped_variant.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/mavedb/models/mapped_variant.py b/src/mavedb/models/mapped_variant.py index d74da89b..d456f573 100644 --- a/src/mavedb/models/mapped_variant.py +++ b/src/mavedb/models/mapped_variant.py @@ -1,4 +1,6 @@ -from sqlalchemy import Column, ForeignKey, Integer +from datetime import date + +from sqlalchemy import Column, Date, ForeignKey, Integer, String from sqlalchemy.orm import relationship, backref, Mapped from sqlalchemy.dialects.postgresql import JSONB @@ -11,8 +13,13 @@ class MappedVariant(Base): id = Column(Integer, primary_key=True) - pre_mapped = Column(JSONB, nullable=False) - post_mapped = Column(JSONB, nullable=False) + pre_mapped = Column(JSONB, nullable=True) + post_mapped = Column(JSONB, nullable=True) + vrs_version = Column(String, nullable=True) + error_message = Column(String, nullable=True) + modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today) + mapped_date = Column(Date, nullable=False) + mapping_api_version = Column(String, nullable=False) variant_id = Column(Integer, ForeignKey("variants.id"), index=True, nullable=False) variant: Mapped[Variant] = relationship("Variant", backref=backref("mapped_variants", cascade="all,delete-orphan")) From 41fa5e633d8562027c1747a07ab4315cf35bd938 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Mon, 26 Aug 2024 08:33:26 -0700 Subject: [PATCH 13/54] Target Gene Mapping Columns and DB Migration --- ...f36cf612e029_additional_mapping_columns.py | 75 +++++++++++++++++++ src/mavedb/models/target_gene.py | 4 + 2 files changed, 79 insertions(+) create mode 100644 alembic/versions/f36cf612e029_additional_mapping_columns.py diff --git a/alembic/versions/f36cf612e029_additional_mapping_columns.py b/alembic/versions/f36cf612e029_additional_mapping_columns.py new file mode 100644 index 00000000..3b1048fd --- /dev/null +++ b/alembic/versions/f36cf612e029_additional_mapping_columns.py @@ -0,0 +1,75 @@ +"""Additional mapping columns + +Revision ID: f36cf612e029 +Revises: ec5d2787bec9 +Create Date: 2024-08-21 16:06:06.793541 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from datetime import datetime + +# revision identifiers, used by Alembic. +revision = "f36cf612e029" +down_revision = "ec5d2787bec9" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("mapped_variants", sa.Column("vrs_version", sa.String(), nullable=True)) + op.add_column("mapped_variants", sa.Column("error_message", sa.String(), nullable=True)) + op.add_column( + "mapped_variants", + sa.Column( + "modification_date", + sa.Date(), + nullable=False, + server_default=datetime.fromisocalendar(2024, 1, 1).date().strftime("%Y-%m-%d"), + ), + ) + op.add_column( + "mapped_variants", + sa.Column( + "mapped_date", + sa.Date(), + nullable=False, + server_default=datetime.fromisocalendar(2024, 1, 1).date().strftime("%Y-%m-%d"), + ), + ) + op.add_column( + "mapped_variants", sa.Column("mapping_api_version", sa.String(), nullable=False, server_default="0.0.0") + ) + op.alter_column( + "mapped_variants", "pre_mapped", existing_type=postgresql.JSONB(astext_type=sa.Text()), nullable=True + ) + op.alter_column( + "mapped_variants", "post_mapped", existing_type=postgresql.JSONB(astext_type=sa.Text()), nullable=True + ) + + op.add_column("target_genes", sa.Column("pre_mapped_metadata", postgresql.JSONB, nullable=True)) + op.add_column("target_genes", sa.Column("post_mapped_metadata", postgresql.JSONB, nullable=True)) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.alter_column( + "mapped_variants", "post_mapped", existing_type=postgresql.JSONB(astext_type=sa.Text()), nullable=False + ) + op.alter_column( + "mapped_variants", "pre_mapped", existing_type=postgresql.JSONB(astext_type=sa.Text()), nullable=False + ) + op.drop_column("mapped_variants", "mapping_api_version") + op.drop_column("mapped_variants", "mapped_date") + op.drop_column("mapped_variants", "modification_date") + op.drop_column("mapped_variants", "error_message") + op.drop_column("mapped_variants", "vrs_version") + + op.drop_column("target_genes", "pre_mapped_metadata") + op.drop_column("target_genes", "post_mapped_metadata") + # ### end Alembic commands ### diff --git a/src/mavedb/models/target_gene.py b/src/mavedb/models/target_gene.py index 99f14e50..540c8a26 100644 --- a/src/mavedb/models/target_gene.py +++ b/src/mavedb/models/target_gene.py @@ -1,5 +1,6 @@ from datetime import date from sqlalchemy import Column, Date, ForeignKey, Integer, String +from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import backref, relationship, Mapped from typing import TYPE_CHECKING @@ -40,6 +41,9 @@ class TargetGene(Base): single_parent=True, ) + pre_mapped_metadata: Mapped[JSONB] = Column("pre_mapped_metadata", JSONB, nullable=True) + post_mapped_metadata: Mapped[JSONB] = Column("post_mapped_metadata", JSONB, nullable=True) + creation_date = Column(Date, nullable=False, default=date.today) modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today) From ae31f1e6c05ae0b571ac432424301b5fd8b25222 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Mon, 26 Aug 2024 09:28:57 -0700 Subject: [PATCH 14/54] Add Additional Columns to Mapping Job --- src/mavedb/worker/jobs.py | 6 +++++- tests/helpers/constants.py | 2 ++ tests/worker/test_jobs.py | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index c3c2b1e9..e080e5db 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -2,7 +2,7 @@ import functools import logging import requests -from datetime import timedelta +from datetime import timedelta, date from typing import Optional import pandas as pd @@ -192,6 +192,10 @@ async def map_variants_for_score_set(ctx, score_set_urn: str): pre_mapped=mapped_score["pre_mapped_2_0"], post_mapped=mapped_score["post_mapped_2_0"], variant_id=variant.id, + modification_date=date.today(), + mapped_date=date.today(), + vrs_version=mapping_results["vrs_version"], + mapping_api_version=mapping_results["api_version"], ) db.add(mapped_variant) diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index 41305286..f5ba6d28 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -534,4 +534,6 @@ "sequence_accessions": ["NC_000001.11"], }, "mapped_scores": [], + "vrs_version": "2.0", + "api_version": "0.0.0", } diff --git a/tests/worker/test_jobs.py b/tests/worker/test_jobs.py index 4a892f80..4ec4ad5b 100644 --- a/tests/worker/test_jobs.py +++ b/tests/worker/test_jobs.py @@ -1,3 +1,5 @@ +from datetime import date + from asyncio.unix_events import _UnixSelectorEventLoop from copy import deepcopy from requests import HTTPError @@ -415,6 +417,10 @@ async def test_create_mapped_variants_for_scoreset_with_existing_mapped_variants pre_mapped={"preexisting": "variant"}, post_mapped={"preexisting": "variant"}, variant_id=existing_variant.id, + modification_date=date.today(), + mapped_date=date.today(), + vrs_version="2.0", + mapping_api_version="0.0.0", ) ) session.commit() From 8f296609fa1b0f25fbf438e743e7012738f8fcc6 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Mon, 26 Aug 2024 09:57:44 -0700 Subject: [PATCH 15/54] Add TypedDict of Mapping Results for Mypy's Benefit --- src/mavedb/data_providers/services.py | 12 ++++++++++-- src/mavedb/worker/jobs.py | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/mavedb/data_providers/services.py b/src/mavedb/data_providers/services.py index dbec73d6..fd6ccc86 100644 --- a/src/mavedb/data_providers/services.py +++ b/src/mavedb/data_providers/services.py @@ -1,5 +1,5 @@ import requests -from typing import Optional, Union +from typing import Optional, Union, TypedDict from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider @@ -22,10 +22,18 @@ def cdot_rest() -> RESTDataProvider: class VRSMap: url: str + class ScoreSetMappingResults(TypedDict): + metadata: dict[str, str] + computed_reference_sequence: dict[str, str] + mapped_reference_sequence: dict[str, str] + mapped_scores: list[dict] + vrs_version: str + api_version: str + def __init__(self, url: str) -> None: self.url = url - def map_score_set(self, score_set_urn: str) -> dict[str, Union[dict, list]]: + def map_score_set(self, score_set_urn: str) -> ScoreSetMappingResults: uri = f"{self.url}/api/v1/map/{score_set_urn}" response = requests.post(uri) response.raise_for_status() diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index e080e5db..f74a535f 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -3,7 +3,7 @@ import logging import requests from datetime import timedelta, date -from typing import Optional +from typing import Optional, Union import pandas as pd from arq import ArqRedis From 3603a70e57f0e1951155f933dab788b30f9de0d4 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 29 Aug 2024 09:10:02 -0700 Subject: [PATCH 16/54] Add current column to mapped_variants table The current column marks whether the mapped variant is considered the most current mapped version of a given variant in the mapped_variants table. This is necessary because we may store multiple mapped versions of a variant. This commit also adds a default value of "1.3" for the VRS version, since all the mapped variants currently in the database use VRS 1.3, and adds a default value of the date of execution for the modification date, since the modification date should keep track of when the row was last updated. --- ...f36cf612e029_additional_mapping_columns.py | 45 +++++++++++++++---- src/mavedb/models/mapped_variant.py | 3 +- 2 files changed, 38 insertions(+), 10 deletions(-) diff --git a/alembic/versions/f36cf612e029_additional_mapping_columns.py b/alembic/versions/f36cf612e029_additional_mapping_columns.py index 3b1048fd..0141bc2a 100644 --- a/alembic/versions/f36cf612e029_additional_mapping_columns.py +++ b/alembic/versions/f36cf612e029_additional_mapping_columns.py @@ -21,7 +21,10 @@ def upgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.add_column("mapped_variants", sa.Column("vrs_version", sa.String(), nullable=True)) + op.add_column( + "mapped_variants", + sa.Column("vrs_version", sa.String(), nullable=True, server_default="1.3"), + ) op.add_column("mapped_variants", sa.Column("error_message", sa.String(), nullable=True)) op.add_column( "mapped_variants", @@ -29,7 +32,7 @@ def upgrade(): "modification_date", sa.Date(), nullable=False, - server_default=datetime.fromisocalendar(2024, 1, 1).date().strftime("%Y-%m-%d"), + server_default = sa.func.current_date(), ), ) op.add_column( @@ -42,28 +45,52 @@ def upgrade(): ), ) op.add_column( - "mapped_variants", sa.Column("mapping_api_version", sa.String(), nullable=False, server_default="0.0.0") + "mapped_variants", + sa.Column("mapping_api_version", sa.String(), nullable=False, server_default="0.0.0"), ) + op.add_column( + "mapped_variants", + sa.Column("current", sa.Boolean(), nullable=False, server_default=sa.false()), + ) op.alter_column( - "mapped_variants", "pre_mapped", existing_type=postgresql.JSONB(astext_type=sa.Text()), nullable=True + "mapped_variants", + "pre_mapped", + existing_type=postgresql.JSONB(astext_type=sa.Text()), + nullable=True, ) op.alter_column( - "mapped_variants", "post_mapped", existing_type=postgresql.JSONB(astext_type=sa.Text()), nullable=True + "mapped_variants", + "post_mapped", + existing_type=postgresql.JSONB(astext_type=sa.Text()), + nullable=True, ) - op.add_column("target_genes", sa.Column("pre_mapped_metadata", postgresql.JSONB, nullable=True)) - op.add_column("target_genes", sa.Column("post_mapped_metadata", postgresql.JSONB, nullable=True)) + op.add_column( + "target_genes", + sa.Column("pre_mapped_metadata", postgresql.JSONB, nullable=True), + ) + op.add_column( + "target_genes", + sa.Column("post_mapped_metadata", postgresql.JSONB, nullable=True), + ) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! ### op.alter_column( - "mapped_variants", "post_mapped", existing_type=postgresql.JSONB(astext_type=sa.Text()), nullable=False + "mapped_variants", + "post_mapped", + existing_type=postgresql.JSONB(astext_type=sa.Text()), + nullable=False, ) op.alter_column( - "mapped_variants", "pre_mapped", existing_type=postgresql.JSONB(astext_type=sa.Text()), nullable=False + "mapped_variants", + "pre_mapped", + existing_type=postgresql.JSONB(astext_type=sa.Text()), + nullable=False, ) + op.drop_column("mapped_variants", "current") op.drop_column("mapped_variants", "mapping_api_version") op.drop_column("mapped_variants", "mapped_date") op.drop_column("mapped_variants", "modification_date") diff --git a/src/mavedb/models/mapped_variant.py b/src/mavedb/models/mapped_variant.py index d456f573..50bba4f4 100644 --- a/src/mavedb/models/mapped_variant.py +++ b/src/mavedb/models/mapped_variant.py @@ -1,6 +1,6 @@ from datetime import date -from sqlalchemy import Column, Date, ForeignKey, Integer, String +from sqlalchemy import Boolean, Column, Date, ForeignKey, Integer, String from sqlalchemy.orm import relationship, backref, Mapped from sqlalchemy.dialects.postgresql import JSONB @@ -20,6 +20,7 @@ class MappedVariant(Base): modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today) mapped_date = Column(Date, nullable=False) mapping_api_version = Column(String, nullable=False) + current = Column(Boolean, nullable=False) variant_id = Column(Integer, ForeignKey("variants.id"), index=True, nullable=False) variant: Mapped[Variant] = relationship("Variant", backref=backref("mapped_variants", cascade="all,delete-orphan")) From 9530bbcabb37f257e2f51acafec3c0bd0f37838f Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 29 Aug 2024 09:13:27 -0700 Subject: [PATCH 17/54] Add mapping metadata to scoresets table These columns will store metadata about the hashed ids used to store the target sequence, if applicable, in Seqrepo during mapping, as well as hashed ids that point to the accession id of the target gene or mapped reference in Seqrepo. Also stores human-readable labels for mapped reference sequences. --- .../d7e6f8c3b9dc_scoreset_mapping_columns.py | 45 +++++++++++++++++++ src/mavedb/models/score_set.py | 6 +++ 2 files changed, 51 insertions(+) create mode 100644 alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py diff --git a/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py b/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py new file mode 100644 index 00000000..56d6ea89 --- /dev/null +++ b/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py @@ -0,0 +1,45 @@ +"""scoreset_mapping_columns + +Revision ID: d7e6f8c3b9dc +Revises: f36cf612e029 +Create Date: 2024-08-28 09:54:08.249077 + +""" + +from alembic import op +from sqlalchemy.dialects import postgresql +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = "d7e6f8c3b9dc" +down_revision = "f36cf612e029" +branch_labels = None +depends_on = None + + +def upgrade(): + op.add_column( + "scoresets", + sa.Column( + "mapping_state", + sa.Enum( + "incomplete", + "processing", + "failed", + "success", + name="mappingstate", + native_enum=False, + create_constraint=True, + length=32 + ), + nullable=True, + ), + ) + op.add_column("scoresets", sa.Column("mapping_errors", postgresql.JSONB, nullable=True)) + + +def downgrade(): + op.drop_constraint("mappingstate", table_name="scoresets") + op.drop_column("scoresets", "mapping_state") + op.drop_column("scoresets", "mapping_errors") diff --git a/src/mavedb/models/score_set.py b/src/mavedb/models/score_set.py index fd115095..ee71ec60 100644 --- a/src/mavedb/models/score_set.py +++ b/src/mavedb/models/score_set.py @@ -97,6 +97,12 @@ class ScoreSet(Base): num_variants = Column(Integer, nullable=False, default=0) variants: Mapped[list["Variant"]] = relationship(back_populates="score_set", cascade="all, delete-orphan") + mapping_state = Column( + Enum(ProcessingState, create_constraint=True, length=32, native_enum=False, validate_strings=True), + nullable=True, + ) + mapping_errors = Column(JSONB, nullable=True) + experiment_id = Column(Integer, ForeignKey("experiments.id"), index=True, nullable=False) experiment: Mapped["Experiment"] = relationship(back_populates="score_sets") From 91bcb42026249ed7aee5081d021da3eda712b568 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 29 Aug 2024 12:54:39 -0700 Subject: [PATCH 18/54] Update mapping job to reflect API changes in error reporting --- src/mavedb/worker/jobs.py | 150 +++++++++++++++++++++++++++++++++----- 1 file changed, 131 insertions(+), 19 deletions(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index f74a535f..a8d9b94f 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -26,6 +26,8 @@ from mavedb.models.enums.processing_state import ProcessingState from mavedb.models.mapped_variant import MappedVariant from mavedb.models.score_set import ScoreSet +from mavedb.models.target_gene import TargetGene +from mavedb.models.target_sequence import TargetSequence from mavedb.models.user import User from mavedb.models.variant import Variant from mavedb.data_providers.services import vrs_mapper @@ -168,38 +170,148 @@ async def map_variants_for_score_set(ctx, score_set_urn: str): blocking = functools.partial(vrs.map_score_set, score_set_urn) loop = asyncio.get_running_loop() + score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() + score_set.mapping_state = ProcessingState.processing + try: mapping_results = await loop.run_in_executor(ctx["pool"], blocking) except requests.exceptions.HTTPError as e: - logger.error( + score_set.mapping_state = ProcessingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping. Mapping will be automatically retried for this score set." + } + logger.critical( f"Encountered an exception while mapping variants for {score_set_urn}", exc_info=e, ) + send_slack_message(err=e) + # TODO put back in queue? return logger.debug("Done mapping variants.") + # pseudo code + # if mapping results: + # check whether there are mapped scores. + # if there are not mapped scores, there was a score-set-wide error: + # this means that nothing will be inserted into the mapped variants table + # or into the target genes table. + # insert a failed status and the reason into the scoresets table. + # else (if there are mapped scores): + # insert the pre- and post-mapped sequence metadata into the target genes table + # for each mapped score: + # if there are both pre and post mapped objects, set current to True. else, current is False. + # if current is True, set rows with same variant id to current=False. + # insert the mapped variant as below, using the value of current decided above. + # keep track of how many mapped scores were set to current. if 100%, then set mapping_state in scoresets table to complete. + # if not 100%, then set mapping_state in scoresets table to incomplete. (not sure about this, because variants like '=' might fail... so could be misleading...) + # if 0%, then set score set status to failed and error message is that none of the variants mapped + + # else (if not mapping results): + # this would be a problem. there should either be an httperror or something returned from the api. + # so throw a critical error if this else is hit. + if mapping_results: - existing_variants = select(Variant.id).join(ScoreSet).where(ScoreSet.urn == score_set_urn) - db.execute(delete(MappedVariant).where(MappedVariant.variant_id.in_(existing_variants))) - logger.debug("Removed existing mapped variants for this score set.") - - for mapped_score in mapping_results["mapped_scores"]: - variant_urn = mapped_score["mavedb_id"] - variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() - - mapped_variant = MappedVariant( - pre_mapped=mapped_score["pre_mapped_2_0"], - post_mapped=mapped_score["post_mapped_2_0"], - variant_id=variant.id, - modification_date=date.today(), - mapped_date=date.today(), - vrs_version=mapping_results["vrs_version"], - mapping_api_version=mapping_results["api_version"], + if not mapping_results["mapped_scores"]: + # if there are no mapped scores, the score set failed to map. + score_set.mapping_state = ProcessingState.failed + score_set.mapping_errors = mapping_results # TODO check that this gets inserted as json correctly + else: + # TODO this assumes single-target mapping, will need to be changed to support multi-target mapping + # just in case there are multiple target genes in the db for a score set (this point shouldn't be reached + # while we only support single-target mapping), match up the target sequence with the one in the computed genomic reference sequence. + # TODO this also assumes that the score set is based on a target sequence, not a target accession + + #target_genes = db.scalars(select(TargetGene).join(ScoreSet).where(ScoreSet.urn == score_set_urn)).all() + if mapping_results["computed_genomic_reference_sequence"]: + target_sequence = mapping_results["computed_genomic_reference_sequence"]["sequence"] + elif mapping_results["computed_protein_reference_sequence"]: + target_sequence = mapping_results["computed_protein_reference_sequence"]["sequence"] + else: + score_set.mapping_state = ProcessingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." + } + logger.error("No target sequence metadata provided by mapping job", exc_info=1) + # TODO assumes that no hgvs_nt strings were supplied if target sequence is protein. this is currently true + # but is it guaranteed? + target_gene = db.scalars(select(TargetGene) + .join(ScoreSet) + .join(TargetSequence) + .where( + ScoreSet.urn == score_set_urn, + TargetSequence.sequence == target_sequence, + ) + ) + + # TODO may want to append to json rather than replace? + # TODO cast to jsonb? + if mapping_results["computed_genomic_reference_sequence"] and mapping_results["mapped_genomic_reference_sequence"]: + target_gene.pre_mapped_metadata = {"genomic": mapping_results["computed_genomic_reference_sequence"]} + target_gene.post_mapped_metadata = {"genomic": mapping_results["mapped_genomic_reference_sequence"]} + elif mapping_results["computed_protein_reference_sequence"] and mapping_results["mapped_protein_reference_sequence"]: + target_gene.pre_mapped_metadata = {"protein": mapping_results["computed_protein_reference_sequence"]} + target_gene.post_mapped_metadata = {"protein": mapping_results["mapped_protein_reference_sequence"]} + else: + score_set.mapping_state = ProcessingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." + } + logger.error("No mapped reference sequence metadata provided by mapping job", exc_info=1) + + total_variants = 0 + successful_mapped_variants = 0 + for mapped_score in mapping_results["mapped_scores"]: + total_variants += 1 + variant_urn = mapped_score["mavedb_id"] + variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() + + # TODO if there is an existing mapped variant, then only set this one to current if pre and post mapped objects both exist. + # or should we always set current to false if pre and post mapped objects aren't both successful, even if there is no existing mapped variant? + if mapped_score["pre_mapped"] and mapped_score["post_mapped"]: + current = True + successful_mapped_variants += 1 + else: + current = False + + # there should only be one current mapped variant per variant id, so update old mapped variant to current = false + if current: + db.query(MappedVariant).filter(MappedVariant.variant_id == variant.id).update({"current": False}) + + mapped_variant = MappedVariant( + pre_mapped=mapped_score["pre_mapped"] if mapped_score["pre_mapped"] else None, + post_mapped=mapped_score["post_mapped"] if mapped_score["post_mapped"] else None, + variant_id=variant.id, + modification_date=date.today(), + mapped_date=mapped_score["mapped_date_utc"], + vrs_version=mapping_results["vrs_version"], + mapping_api_version=mapping_results["dcd_mapping_version"], + error_message=mapping_results["error_message"] if mapping_results["error_message"] else None, + current=current, + ) + db.add(mapped_variant) + + logger.info(f"Inserted {len(mapping_results['mapped_scores'])} mapped variants.") + + if successful_mapped_variants == 0: + score_set.mapping_state = ProcessingState.failed + score_set.mapping_errors = {"error_message": "All variants failed to map"} + elif successful_mapped_variants < total_variants: + score_set.mapping_state = ProcessingState.incomplete + else: + score_set.mapping_state = ProcessingState.complete + + else: + score_set.mapping_state = ProcessingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." + } + logger.critical( + f"No mapping job output for {score_set_urn}, but no HTTPError encountered.", + exc_info=1, ) - db.add(mapped_variant) + return - logger.info(f"Inserted {len(mapping_results['mapped_scores'])} mapped variants.") db.commit() From b05b4da8d2a2275825f1340fc97f834973ade9b4 Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Fri, 30 Aug 2024 09:44:56 +1000 Subject: [PATCH 19/54] Remove print. --- src/mavedb/routers/statistics.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/mavedb/routers/statistics.py b/src/mavedb/routers/statistics.py index 61be6a5c..29445fe2 100644 --- a/src/mavedb/routers/statistics.py +++ b/src/mavedb/routers/statistics.py @@ -260,7 +260,6 @@ def _record_from_field_and_model( model_created_by_field = getattr(queried_model, "created_by_id") model_published_data_field = getattr(queried_model, "published_date") if field is RecordFields.createdBy: - print("111") query = ( select(User.username, func.count(User.id)) .join(queried_model, model_created_by_field == User.id) @@ -270,7 +269,6 @@ def _record_from_field_and_model( return db.execute(query).all() else: - print("2222") # All assc table identifiers which are linked to a published model. queried_assc_table = association_tables[model][field] published_score_sets_statement = ( From c95beebabcd9d4c446f971001f3ef0b443af5d51 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Sep 2024 15:13:43 -0700 Subject: [PATCH 20/54] Remove extraneous mapping queue from ctx --- src/mavedb/worker/settings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mavedb/worker/settings.py b/src/mavedb/worker/settings.py index f62efe24..1972aad0 100644 --- a/src/mavedb/worker/settings.py +++ b/src/mavedb/worker/settings.py @@ -28,7 +28,6 @@ async def startup(ctx): - ctx["mapping_queue"] = [] ctx["pool"] = futures.ProcessPoolExecutor() From 5e0b1e546cb266ce68ee3f941d3d2df8700c30a8 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Sep 2024 15:14:48 -0700 Subject: [PATCH 21/54] Fix key error in event of no job result --- src/mavedb/lib/logging/canonical.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mavedb/lib/logging/canonical.py b/src/mavedb/lib/logging/canonical.py index 2e2a64a3..03f95000 100644 --- a/src/mavedb/lib/logging/canonical.py +++ b/src/mavedb/lib/logging/canonical.py @@ -27,7 +27,6 @@ async def log_job(ctx: dict) -> None: if not result: logger.warning(msg=f"Job finished, but could not retrieve a job result for job {job_id}.", extra=log_context) - log_context.pop("message") else: log_context = { **log_context, From e7ac53c2bbfbbc1c738f311bca9684cd447e2226 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Sep 2024 15:31:28 -0700 Subject: [PATCH 22/54] Update variant mapper manager for new logging Logging now requires jobs to be retained, so instead of using the same job id for all mapping jobs, instead track the job id of the currently running mapping job and check whether that job has completed before queueing a new job. Also, add more exception handling to worker mapping job. --- src/mavedb/routers/mapped_variant.py | 3 + src/mavedb/worker/jobs.py | 392 ++++++++++++++++----------- 2 files changed, 233 insertions(+), 162 deletions(-) diff --git a/src/mavedb/routers/mapped_variant.py b/src/mavedb/routers/mapped_variant.py index b2237fa3..c5ddbc81 100644 --- a/src/mavedb/routers/mapped_variant.py +++ b/src/mavedb/routers/mapped_variant.py @@ -57,4 +57,7 @@ async def map_score_set(*, urn: str, worker: ArqRedis = Depends(deps.get_worker) await worker.lpush(MAPPING_QUEUE_NAME, urn) # type: ignore await worker.enqueue_job( "variant_mapper_manager", + None, + None, + None ) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index a8d9b94f..dfae8b5b 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -35,6 +35,7 @@ logger = logging.getLogger(__name__) MAPPING_QUEUE_NAME = "vrs_mapping_queue" +MAPPING_CURRENT_ID_NAME = "vrs_mapping_current_job_id" def setup_job_state(ctx, invoker: int, resource: str, correlation_id: str): ctx["state"][ctx["job_id"]] = { @@ -54,6 +55,7 @@ async def create_variants_for_score_set( On any raised exception, ensure ProcessingState of score set is set to `failed` prior to exiting. """ + logging_context = {} try: logging_context = setup_job_state(ctx, updater_id, score_set_urn, correlation_id) logger.info(msg="Began processing of score set variants.", extra=logging_context) @@ -149,7 +151,7 @@ async def create_variants_for_score_set( logger.info(msg="Finished creating variants in score set.", extra=logging_context) await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore - await redis.enqueue_job("variant_mapper_manager") + await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) finally: db.add(score_set) db.commit() @@ -160,180 +162,246 @@ async def create_variants_for_score_set( return score_set.processing_state.name -async def map_variants_for_score_set(ctx, score_set_urn: str): - db: Session = ctx["db"] - - logger.info(f"Started variant mapping for score set: {score_set_urn}") +async def map_variants_for_score_set( + ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int +): + logging_context = {} + try: + db: Session = ctx["db"] + redis: ArqRedis = ctx["redis"] - # Do not block Worker event loop during mapping, see: https://arq-docs.helpmanual.io/#synchronous-jobs. - vrs = vrs_mapper() - blocking = functools.partial(vrs.map_score_set, score_set_urn) - loop = asyncio.get_running_loop() + logging_context = setup_job_state(ctx, updater_id, score_set_urn, correlation_id) + logger.info(msg="Started variant mapping", extra=logging_context) - score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() - score_set.mapping_state = ProcessingState.processing + # Do not block Worker event loop during mapping, see: https://arq-docs.helpmanual.io/#synchronous-jobs. + vrs = vrs_mapper() + blocking = functools.partial(vrs.map_score_set, score_set_urn) + loop = asyncio.get_running_loop() - try: - mapping_results = await loop.run_in_executor(ctx["pool"], blocking) - except requests.exceptions.HTTPError as e: - score_set.mapping_state = ProcessingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping. Mapping will be automatically retried for this score set." - } - logger.critical( - f"Encountered an exception while mapping variants for {score_set_urn}", - exc_info=e, - ) - send_slack_message(err=e) - # TODO put back in queue? - return + score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() + score_set.mapping_state = ProcessingState.processing + score_set.mapping_errors = null() - logger.debug("Done mapping variants.") - - # pseudo code - # if mapping results: - # check whether there are mapped scores. - # if there are not mapped scores, there was a score-set-wide error: - # this means that nothing will be inserted into the mapped variants table - # or into the target genes table. - # insert a failed status and the reason into the scoresets table. - # else (if there are mapped scores): - # insert the pre- and post-mapped sequence metadata into the target genes table - # for each mapped score: - # if there are both pre and post mapped objects, set current to True. else, current is False. - # if current is True, set rows with same variant id to current=False. - # insert the mapped variant as below, using the value of current decided above. - # keep track of how many mapped scores were set to current. if 100%, then set mapping_state in scoresets table to complete. - # if not 100%, then set mapping_state in scoresets table to incomplete. (not sure about this, because variants like '=' might fail... so could be misleading...) - # if 0%, then set score set status to failed and error message is that none of the variants mapped - - # else (if not mapping results): - # this would be a problem. there should either be an httperror or something returned from the api. - # so throw a critical error if this else is hit. - - if mapping_results: - if not mapping_results["mapped_scores"]: - # if there are no mapped scores, the score set failed to map. + try: + mapping_results = await loop.run_in_executor(ctx["pool"], blocking) + except requests.exceptions.HTTPError as e: score_set.mapping_state = ProcessingState.failed - score_set.mapping_errors = mapping_results # TODO check that this gets inserted as json correctly - else: - # TODO this assumes single-target mapping, will need to be changed to support multi-target mapping - # just in case there are multiple target genes in the db for a score set (this point shouldn't be reached - # while we only support single-target mapping), match up the target sequence with the one in the computed genomic reference sequence. - # TODO this also assumes that the score set is based on a target sequence, not a target accession - - #target_genes = db.scalars(select(TargetGene).join(ScoreSet).where(ScoreSet.urn == score_set_urn)).all() - if mapping_results["computed_genomic_reference_sequence"]: - target_sequence = mapping_results["computed_genomic_reference_sequence"]["sequence"] - elif mapping_results["computed_protein_reference_sequence"]: - target_sequence = mapping_results["computed_protein_reference_sequence"]["sequence"] - else: - score_set.mapping_state = ProcessingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." - } - logger.error("No target sequence metadata provided by mapping job", exc_info=1) - # TODO assumes that no hgvs_nt strings were supplied if target sequence is protein. this is currently true - # but is it guaranteed? - target_gene = db.scalars(select(TargetGene) - .join(ScoreSet) - .join(TargetSequence) - .where( - ScoreSet.urn == score_set_urn, - TargetSequence.sequence == target_sequence, - ) + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping. Mapping will be automatically retried for this score set." + } + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logging_context["mapping_state"] = score_set.mapping_state.name + logger.critical( + msg="Encountered an exception while mapping variants", + extra=logging_context ) + send_slack_message(err=e) + # put back in queue, since this is an internal error rather than a problem with the score set + await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore + await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) + db.commit() + return - # TODO may want to append to json rather than replace? - # TODO cast to jsonb? - if mapping_results["computed_genomic_reference_sequence"] and mapping_results["mapped_genomic_reference_sequence"]: - target_gene.pre_mapped_metadata = {"genomic": mapping_results["computed_genomic_reference_sequence"]} - target_gene.post_mapped_metadata = {"genomic": mapping_results["mapped_genomic_reference_sequence"]} - elif mapping_results["computed_protein_reference_sequence"] and mapping_results["mapped_protein_reference_sequence"]: - target_gene.pre_mapped_metadata = {"protein": mapping_results["computed_protein_reference_sequence"]} - target_gene.post_mapped_metadata = {"protein": mapping_results["mapped_protein_reference_sequence"]} - else: - score_set.mapping_state = ProcessingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." - } - logger.error("No mapped reference sequence metadata provided by mapping job", exc_info=1) - - total_variants = 0 - successful_mapped_variants = 0 - for mapped_score in mapping_results["mapped_scores"]: - total_variants += 1 - variant_urn = mapped_score["mavedb_id"] - variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() - - # TODO if there is an existing mapped variant, then only set this one to current if pre and post mapped objects both exist. - # or should we always set current to false if pre and post mapped objects aren't both successful, even if there is no existing mapped variant? - if mapped_score["pre_mapped"] and mapped_score["post_mapped"]: - current = True - successful_mapped_variants += 1 - else: - current = False - - # there should only be one current mapped variant per variant id, so update old mapped variant to current = false - if current: - db.query(MappedVariant).filter(MappedVariant.variant_id == variant.id).update({"current": False}) - - mapped_variant = MappedVariant( - pre_mapped=mapped_score["pre_mapped"] if mapped_score["pre_mapped"] else None, - post_mapped=mapped_score["post_mapped"] if mapped_score["post_mapped"] else None, - variant_id=variant.id, - modification_date=date.today(), - mapped_date=mapped_score["mapped_date_utc"], - vrs_version=mapping_results["vrs_version"], - mapping_api_version=mapping_results["dcd_mapping_version"], - error_message=mapping_results["error_message"] if mapping_results["error_message"] else None, - current=current, - ) - db.add(mapped_variant) - - logger.info(f"Inserted {len(mapping_results['mapped_scores'])} mapped variants.") - - if successful_mapped_variants == 0: + logger.debug("Done mapping variants.") + + if mapping_results: + if not mapping_results["mapped_scores"]: + # if there are no mapped scores, the score set failed to map. score_set.mapping_state = ProcessingState.failed - score_set.mapping_errors = {"error_message": "All variants failed to map"} - elif successful_mapped_variants < total_variants: - score_set.mapping_state = ProcessingState.incomplete + score_set.mapping_errors = mapping_results # TODO check that this gets inserted as json correctly else: - score_set.mapping_state = ProcessingState.complete - - else: - score_set.mapping_state = ProcessingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." - } - logger.critical( - f"No mapping job output for {score_set_urn}, but no HTTPError encountered.", - exc_info=1, - ) - return + # TODO after adding multi target mapping support: + # this assumes single-target mapping, will need to be changed to support multi-target mapping + # just in case there are multiple target genes in the db for a score set (this point shouldn't be reached + # while we only support single-target mapping), match up the target sequence with the one in the computed genomic reference sequence. + # TODO after adding accession-based score set mapping support: + # this also assumes that the score set is based on a target sequence, not a target accession + + if mapping_results["computed_genomic_reference_sequence"]: + target_sequence = mapping_results["computed_genomic_reference_sequence"]["sequence"] + elif mapping_results["computed_protein_reference_sequence"]: + target_sequence = mapping_results["computed_protein_reference_sequence"]["sequence"] + else: + score_set.mapping_state = ProcessingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." + } + # TODO create error here and send to slack + logging_context["mapping_state"] = score_set.mapping_state.name + logger.error(msg="No target sequence metadata provided by mapping job", extra=logging_context) + # put back in queue, since this is an internal error rather than a problem with the score set + await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore + await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) + db.commit() + return + + target_gene = db.scalars(select(TargetGene) + .join(ScoreSet) + .join(TargetSequence) + .where( + ScoreSet.urn == str(score_set_urn), + TargetSequence.sequence == target_sequence, + ) + ).one() + + excluded_pre_mapped_keys = {"sequence"} + if mapping_results["computed_genomic_reference_sequence"] and mapping_results["mapped_genomic_reference_sequence"]: + pre_mapped_metadata = mapping_results["computed_genomic_reference_sequence"] + target_gene.pre_mapped_metadata = { + "genomic": { + k: pre_mapped_metadata[k] for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys + } + } + target_gene.post_mapped_metadata = {"genomic": mapping_results["mapped_genomic_reference_sequence"]} + elif mapping_results["computed_protein_reference_sequence"] and mapping_results["mapped_protein_reference_sequence"]: + pre_mapped_metadata = mapping_results["computed_protein_reference_sequence"] + target_gene.pre_mapped_metadata = { + "protein": { + k: pre_mapped_metadata[k] for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys + } + } + target_gene.post_mapped_metadata = {"protein": mapping_results["mapped_protein_reference_sequence"]} + else: + score_set.mapping_state = ProcessingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." + } + # TODO create error here and send to slack + logging_context["mapping_state"] = score_set.mapping_state.name + logger.error( + msg="No mapped reference sequence metadata provided by mapping job", + extra=logging_context, + ) + # put back in queue, since this is an internal error rather than a problem with the score set + await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore + await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) + db.commit() + return + + total_variants = 0 + successful_mapped_variants = 0 + for mapped_score in mapping_results["mapped_scores"]: + total_variants += 1 + variant_urn = mapped_score["mavedb_id"] + variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() + + # TODO check with team that this is desired behavior + # (another possible behavior would be to always set current to true if there is no other 'current' for this variant id) + if mapped_score["pre_mapped"] and mapped_score["post_mapped"]: + current = True + successful_mapped_variants += 1 + else: + current = False + + # there should only be one current mapped variant per variant id, so update old mapped variant to current = false + if current: + db.query(MappedVariant).filter(MappedVariant.variant_id == variant.id).update({"current": False}) + + mapped_variant = MappedVariant( + pre_mapped=mapped_score["pre_mapped"] if mapped_score["pre_mapped"] else None, + post_mapped=mapped_score["post_mapped"] if mapped_score["post_mapped"] else None, + variant_id=variant.id, + modification_date=date.today(), + mapped_date=mapping_results["mapped_date_utc"], + vrs_version=mapped_score["vrs_version"] if mapped_score["vrs_version"] else None, + mapping_api_version=mapping_results["dcd_mapping_version"], + error_message=mapped_score["error_message"] if mapped_score["error_message"] else None, + current=current, + ) + db.add(mapped_variant) + + if successful_mapped_variants == 0: + score_set.mapping_state = ProcessingState.failed + score_set.mapping_errors = {"error_message": "All variants failed to map"} + elif successful_mapped_variants < total_variants: + score_set.mapping_state = ProcessingState.incomplete + else: + score_set.mapping_state = ProcessingState.success - db.commit() + logging_context["mapped_variants_inserted_db"] = len(mapping_results['mapped_scores']) + logging_context["mapping_state"] = score_set.mapping_state.name + logger.info(msg="Inserted mapped variants into db.", extra=logging_context) + else: + score_set.mapping_state = ProcessingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." + } + # TODO create error here and send to slack + logging_context["mapping_state"] = score_set.mapping_state.name + logger.critical( + msg="No mapping job output for score set, but no HTTPError encountered.", + extra=logging_context, + ) + # put back in queue, since this is an internal error rather than a problem with the score set + await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore + await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) + db.commit() + return -async def variant_mapper_manager(ctx: dict) -> Optional[Job]: - logger.debug("Variant mapping manager began execution") - redis: ArqRedis = ctx["redis"] + db.commit() + except Exception as e: + # score set selection is performed in try statement, so don't update the db if this outer except statement is reached + send_slack_message(e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logging_context["mapping_state"] = score_set.mapping_state.name + logger.error( + msg="An unexpected error occurred during variant mapping.", + extra=logging_context + ) + # put back in queue, since this is an internal error rather than a problem with the score set + await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore + await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) + db.rollback() + return + - queue_length = await redis.llen(MAPPING_QUEUE_NAME) # type:ignore - if queue_length == 0: - logger.debug("No mapping jobs exist in the queue.") - return None - logger.debug(f"{queue_length} mapping job(s) are queued.") +async def variant_mapper_manager( + ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int +) -> Optional[Job]: + logging_context = {} + try: + logging_context = setup_job_state(ctx, updater_id, score_set_urn, correlation_id) + logger.debug(msg="Variant mapping manager began execution", extra=logging_context) + redis: ArqRedis = ctx["redis"] - job = Job(job_id="vrs_map", redis=redis) - if await job.status() is JobStatus.not_found: - logger.info("No mapping jobs are running, queuing a new one.") - queued_urn = await redis.rpop(MAPPING_QUEUE_NAME) # type:ignore - return await redis.enqueue_job("map_variants_for_score_set", queued_urn, _job_id="vrs_map") - else: - logger.debug("A mapping job is already running, deferring mapping by 5 minutes.") + queue_length = await redis.llen(MAPPING_QUEUE_NAME) # type:ignore + if queue_length == 0: + logger.debug(msg="No mapping jobs exist in the queue.", extra=logging_context) + return None + + logging_context["variant_map_queue_length"] = queue_length + logger.debug(msg="Found mapping job(s) in queue", extra=logging_context) + + if await redis.exists(MAPPING_CURRENT_ID_NAME): + mapping_job_id = await redis.get(MAPPING_CURRENT_ID_NAME) + if mapping_job_id: + mapping_job_id = mapping_job_id.decode('utf-8') + job_status = await Job(job_id=mapping_job_id, redis=redis).status() + + if not mapping_job_id or job_status is JobStatus.not_found or job_status is JobStatus.complete: + logger.info(msg="No mapping jobs are running, queuing a new one.", extra=logging_context) + queued_urn = await redis.rpop(MAPPING_QUEUE_NAME) # type:ignore + queued_urn = queued_urn.decode('utf-8') + # NOTE: the score_set_urn provided to this function is only used for logging context; + # get the urn from the queue and pass that urn to map_variants_for_score_set + new_job = await redis.enqueue_job("map_variants_for_score_set", correlation_id, queued_urn, updater_id) + await redis.set(MAPPING_CURRENT_ID_NAME, new_job.job_id) + return new_job + else: + logger.debug( + msg="A mapping job is already running, deferring mapping by 5 minutes.", + extra=logging_context, + ) - # Our persistent Redis queue and ARQ's execution rules ensure that even if the worker is stopped and not restarted - # before the deferred time, these deferred jobs will still run once able. - return await redis.enqueue_job("variant_mapper_manager", _defer_by=timedelta(minutes=5)) + # Our persistent Redis queue and ARQ's execution rules ensure that even if the worker is stopped and not restarted + # before the deferred time, these deferred jobs will still run once able. + return await redis.enqueue_job("variant_mapper_manager", _defer_by=timedelta(minutes=5)) + except Exception as e: + send_slack_message(e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="An unexpected error occurred during variant mapper management.", + extra=logging_context + ) \ No newline at end of file From 7c6603c5ce9d383b5d77ecfba4fb1f8ab2f8fe77 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Sep 2024 16:47:24 -0700 Subject: [PATCH 23/54] Create mapping state enum Previously used processing state enum for mapping state field, but this caused database constraint issues. --- ...6f7_merge_76e1e55bc5c1_and_d7e6f8c3b9dc.py | 24 +++++++++++++++++++ .../d7e6f8c3b9dc_scoreset_mapping_columns.py | 2 +- src/mavedb/models/enums/mapping_state.py | 8 +++++++ src/mavedb/models/score_set.py | 3 ++- src/mavedb/worker/jobs.py | 21 ++++++++-------- 5 files changed, 46 insertions(+), 12 deletions(-) create mode 100644 alembic/versions/1d4933b4b6f7_merge_76e1e55bc5c1_and_d7e6f8c3b9dc.py create mode 100644 src/mavedb/models/enums/mapping_state.py diff --git a/alembic/versions/1d4933b4b6f7_merge_76e1e55bc5c1_and_d7e6f8c3b9dc.py b/alembic/versions/1d4933b4b6f7_merge_76e1e55bc5c1_and_d7e6f8c3b9dc.py new file mode 100644 index 00000000..91353f2a --- /dev/null +++ b/alembic/versions/1d4933b4b6f7_merge_76e1e55bc5c1_and_d7e6f8c3b9dc.py @@ -0,0 +1,24 @@ +"""merge 76e1e55bc5c1 and d7e6f8c3b9dc + +Revision ID: 1d4933b4b6f7 +Revises: 76e1e55bc5c1, d7e6f8c3b9dc +Create Date: 2024-09-04 16:17:20.875937 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision = '1d4933b4b6f7' +down_revision = ('76e1e55bc5c1', 'd7e6f8c3b9dc') +branch_labels = None +depends_on = None + + +def upgrade(): + pass + + +def downgrade(): + pass diff --git a/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py b/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py index 56d6ea89..ec242bd6 100644 --- a/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py +++ b/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py @@ -27,7 +27,7 @@ def upgrade(): "incomplete", "processing", "failed", - "success", + "complete", name="mappingstate", native_enum=False, create_constraint=True, diff --git a/src/mavedb/models/enums/mapping_state.py b/src/mavedb/models/enums/mapping_state.py new file mode 100644 index 00000000..c58299af --- /dev/null +++ b/src/mavedb/models/enums/mapping_state.py @@ -0,0 +1,8 @@ +import enum + + +class MappingState(enum.Enum): + incomplete = "incomplete" + processing = "processing" + failed = "failed" + complete = "complete" diff --git a/src/mavedb/models/score_set.py b/src/mavedb/models/score_set.py index ee71ec60..d7bea1dc 100644 --- a/src/mavedb/models/score_set.py +++ b/src/mavedb/models/score_set.py @@ -8,6 +8,7 @@ from typing import List, TYPE_CHECKING, Optional from mavedb.db.base import Base +from mavedb.models.enums.mapping_state import MappingState from mavedb.models.enums.processing_state import ProcessingState import mavedb.models.score_set_publication_identifier @@ -98,7 +99,7 @@ class ScoreSet(Base): variants: Mapped[list["Variant"]] = relationship(back_populates="score_set", cascade="all, delete-orphan") mapping_state = Column( - Enum(ProcessingState, create_constraint=True, length=32, native_enum=False, validate_strings=True), + Enum(MappingState, create_constraint=True, length=32, native_enum=False, validate_strings=True), nullable=True, ) mapping_errors = Column(JSONB, nullable=True) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index dfae8b5b..9dd2200c 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -23,6 +23,7 @@ from mavedb.lib.validation.dataframe import ( validate_and_standardize_dataframe_pair, ) +from mavedb.models.enums.mapping_state import MappingState from mavedb.models.enums.processing_state import ProcessingState from mavedb.models.mapped_variant import MappedVariant from mavedb.models.score_set import ScoreSet @@ -179,13 +180,13 @@ async def map_variants_for_score_set( loop = asyncio.get_running_loop() score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() - score_set.mapping_state = ProcessingState.processing + score_set.mapping_state = MappingState.processing score_set.mapping_errors = null() try: mapping_results = await loop.run_in_executor(ctx["pool"], blocking) except requests.exceptions.HTTPError as e: - score_set.mapping_state = ProcessingState.failed + score_set.mapping_state = MappingState.failed score_set.mapping_errors = { "error_message": "Encountered an internal server error during mapping. Mapping will be automatically retried for this score set." } @@ -207,7 +208,7 @@ async def map_variants_for_score_set( if mapping_results: if not mapping_results["mapped_scores"]: # if there are no mapped scores, the score set failed to map. - score_set.mapping_state = ProcessingState.failed + score_set.mapping_state = MappingState.failed score_set.mapping_errors = mapping_results # TODO check that this gets inserted as json correctly else: # TODO after adding multi target mapping support: @@ -222,7 +223,7 @@ async def map_variants_for_score_set( elif mapping_results["computed_protein_reference_sequence"]: target_sequence = mapping_results["computed_protein_reference_sequence"]["sequence"] else: - score_set.mapping_state = ProcessingState.failed + score_set.mapping_state = MappingState.failed score_set.mapping_errors = { "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." } @@ -262,7 +263,7 @@ async def map_variants_for_score_set( } target_gene.post_mapped_metadata = {"protein": mapping_results["mapped_protein_reference_sequence"]} else: - score_set.mapping_state = ProcessingState.failed + score_set.mapping_state = MappingState.failed score_set.mapping_errors = { "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." } @@ -311,19 +312,19 @@ async def map_variants_for_score_set( db.add(mapped_variant) if successful_mapped_variants == 0: - score_set.mapping_state = ProcessingState.failed + score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": "All variants failed to map"} elif successful_mapped_variants < total_variants: - score_set.mapping_state = ProcessingState.incomplete + score_set.mapping_state = MappingState.incomplete else: - score_set.mapping_state = ProcessingState.success + score_set.mapping_state = MappingState.complete logging_context["mapped_variants_inserted_db"] = len(mapping_results['mapped_scores']) logging_context["mapping_state"] = score_set.mapping_state.name logger.info(msg="Inserted mapped variants into db.", extra=logging_context) else: - score_set.mapping_state = ProcessingState.failed + score_set.mapping_state = MappingState.failed score_set.mapping_errors = { "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." } @@ -344,7 +345,7 @@ async def map_variants_for_score_set( # score set selection is performed in try statement, so don't update the db if this outer except statement is reached send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logging_context["mapping_state"] = score_set.mapping_state.name + logging_context["mapping_state"] = MappingState.failed.name logger.error( msg="An unexpected error occurred during variant mapping.", extra=logging_context From e8987889387600bb8951cdf19e84507676218129 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Sep 2024 17:37:39 -0700 Subject: [PATCH 24/54] Fix mypy errors in vrs mapping job --- src/mavedb/data_providers/services.py | 18 +++++++++++------- src/mavedb/worker/jobs.py | 23 +++++++++++++---------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/mavedb/data_providers/services.py b/src/mavedb/data_providers/services.py index fd6ccc86..ccfb0c81 100644 --- a/src/mavedb/data_providers/services.py +++ b/src/mavedb/data_providers/services.py @@ -1,5 +1,6 @@ import requests -from typing import Optional, Union, TypedDict +from datetime import date +from typing import Optional, TypedDict from cdot.hgvs.dataproviders import ChainedSeqFetcher, FastaSeqFetcher, RESTDataProvider @@ -23,12 +24,15 @@ class VRSMap: url: str class ScoreSetMappingResults(TypedDict): - metadata: dict[str, str] - computed_reference_sequence: dict[str, str] - mapped_reference_sequence: dict[str, str] - mapped_scores: list[dict] - vrs_version: str - api_version: str + metadata: Optional[dict[str,str]] + dcd_mapping_version: str + mapped_date_utc: date + computed_genomic_reference_sequence: Optional[dict[str, str]] + mapped_genomic_reference_sequence: Optional[dict[str, str]] + computed_protein_reference_sequence: Optional[dict[str, str]] + mapped_protein_reference_sequence: Optional[dict[str, str]] + mapped_scores: Optional[list[dict]] + error_message: Optional[str] def __init__(self, url: str) -> None: self.url = url diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 9dd2200c..b3840f67 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -3,13 +3,13 @@ import logging import requests from datetime import timedelta, date -from typing import Optional, Union +from typing import Optional import pandas as pd from arq import ArqRedis from arq.jobs import Job, JobStatus from cdot.hgvs.dataproviders import RESTDataProvider -from sqlalchemy import delete, select, null +from sqlalchemy import cast, delete, JSONB, select, null from sqlalchemy.orm import Session from mavedb.lib.score_sets import ( @@ -248,20 +248,20 @@ async def map_variants_for_score_set( excluded_pre_mapped_keys = {"sequence"} if mapping_results["computed_genomic_reference_sequence"] and mapping_results["mapped_genomic_reference_sequence"]: pre_mapped_metadata = mapping_results["computed_genomic_reference_sequence"] - target_gene.pre_mapped_metadata = { + target_gene.pre_mapped_metadata = cast({ "genomic": { k: pre_mapped_metadata[k] for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys } - } - target_gene.post_mapped_metadata = {"genomic": mapping_results["mapped_genomic_reference_sequence"]} + }, JSONB) + target_gene.post_mapped_metadata = cast({"genomic": mapping_results["mapped_genomic_reference_sequence"]}, JSONB) elif mapping_results["computed_protein_reference_sequence"] and mapping_results["mapped_protein_reference_sequence"]: pre_mapped_metadata = mapping_results["computed_protein_reference_sequence"] - target_gene.pre_mapped_metadata = { + target_gene.pre_mapped_metadata = cast({ "protein": { k: pre_mapped_metadata[k] for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys } - } - target_gene.post_mapped_metadata = {"protein": mapping_results["mapped_protein_reference_sequence"]} + }, JSONB) + target_gene.post_mapped_metadata = cast({"protein": mapping_results["mapped_protein_reference_sequence"]}, JSONB) else: score_set.mapping_state = MappingState.failed score_set.mapping_errors = { @@ -388,7 +388,9 @@ async def variant_mapper_manager( # NOTE: the score_set_urn provided to this function is only used for logging context; # get the urn from the queue and pass that urn to map_variants_for_score_set new_job = await redis.enqueue_job("map_variants_for_score_set", correlation_id, queued_urn, updater_id) - await redis.set(MAPPING_CURRENT_ID_NAME, new_job.job_id) + if new_job: # for mypy, since enqueue_job can return None + new_job_id = new_job.job_id + await redis.set(MAPPING_CURRENT_ID_NAME, new_job_id) return new_job else: logger.debug( @@ -405,4 +407,5 @@ async def variant_mapper_manager( logger.error( msg="An unexpected error occurred during variant mapper management.", extra=logging_context - ) \ No newline at end of file + ) + return None From 7b422a2abd0337ad3fd2658135790c09da132486 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 4 Sep 2024 17:40:35 -0700 Subject: [PATCH 25/54] Fix sqlalchemy jsonb type import --- src/mavedb/worker/jobs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index b3840f67..97480a34 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -9,7 +9,8 @@ from arq import ArqRedis from arq.jobs import Job, JobStatus from cdot.hgvs.dataproviders import RESTDataProvider -from sqlalchemy import cast, delete, JSONB, select, null +from sqlalchemy import cast, delete, select, null +from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import Session from mavedb.lib.score_sets import ( From a3cbad4b8cc45ce66b930cbdef34c25e0ef613e2 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 6 Sep 2024 09:57:15 -0700 Subject: [PATCH 26/54] Update vrs map tests to include new parameters --- tests/worker/test_jobs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/worker/test_jobs.py b/tests/worker/test_jobs.py index 4ec4ad5b..97d296b2 100644 --- a/tests/worker/test_jobs.py +++ b/tests/worker/test_jobs.py @@ -377,7 +377,7 @@ async def test_create_mapped_variants_for_scoreset( # TODO: Does this work on non-unix based machines. # TODO: Is it even a safe operation to patch this event loop method? with patch.object(_UnixSelectorEventLoop, "run_in_executor", return_value=mapping_test_output_for_score_set): - await map_variants_for_score_set(standalone_worker_context, score_set.urn) + await map_variants_for_score_set(standalone_worker_context, uuid4().hex, score_set.urn, 1) mapped_variants_for_score_set = session.scalars( select(MappedVariant).join(Variant).join(ScoreSetDbModel).filter(ScoreSetDbModel.urn == score_set.urn) @@ -425,7 +425,7 @@ async def test_create_mapped_variants_for_scoreset_with_existing_mapped_variants ) session.commit() - await map_variants_for_score_set(standalone_worker_context, score_set.urn) + await map_variants_for_score_set(standalone_worker_context, uuid4().hex, score_set.urn, 1) mapped_variants_for_score_set = session.scalars( select(MappedVariant).join(Variant).join(ScoreSetDbModel).filter(ScoreSetDbModel.urn == score_set.urn) @@ -458,7 +458,7 @@ async def awaitable_http_error(): # TODO: Does this work on non-unix based machines? # TODO: Is it even a safe operation to patch this event loop method? with patch.object(_UnixSelectorEventLoop, "run_in_executor", return_value=mapping_test_output_for_score_set): - await map_variants_for_score_set(standalone_worker_context, score_set.urn) + await map_variants_for_score_set(standalone_worker_context, uuid4().hex, score_set.urn, 1) # TODO: How are errors persisted? Test persistence mechanism. mapped_variants_for_score_set = session.scalars( @@ -489,7 +489,7 @@ async def test_create_mapped_variants_for_scoreset_no_mapping_output( # TODO: Does this work on non-unix based machines. # TODO: Is it even a safe operation to patch this event loop method? with patch.object(_UnixSelectorEventLoop, "run_in_executor", return_value=mapping_test_output_for_score_set): - await map_variants_for_score_set(standalone_worker_context, score_set.urn) + await map_variants_for_score_set(standalone_worker_context, uuid4().hex, score_set.urn, 1) mapped_variants_for_score_set = session.scalars( select(MappedVariant).join(Variant).join(ScoreSetDbModel).filter(ScoreSetDbModel.urn == score_set.urn) From 9090f4deebc69173a773ab0a765f74faf9c5f5ed Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 6 Sep 2024 09:57:53 -0700 Subject: [PATCH 27/54] Comment out mapping api router intended for testing --- src/mavedb/routers/mapped_variant.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/mavedb/routers/mapped_variant.py b/src/mavedb/routers/mapped_variant.py index c5ddbc81..30d3ad66 100644 --- a/src/mavedb/routers/mapped_variant.py +++ b/src/mavedb/routers/mapped_variant.py @@ -52,12 +52,13 @@ async def show_mapped_variant(*, urn: str, db: Session = Depends(deps.get_db)) - return await fetch_mapped_variant_by_variant_urn(db, urn) -@router.post("/map/{urn}", status_code=200, responses={404: {}, 500: {}}) -async def map_score_set(*, urn: str, worker: ArqRedis = Depends(deps.get_worker)) -> Any: - await worker.lpush(MAPPING_QUEUE_NAME, urn) # type: ignore - await worker.enqueue_job( - "variant_mapper_manager", - None, - None, - None - ) +# for testing only +# @router.post("/map/{urn}", status_code=200, responses={404: {}, 500: {}}) +# async def map_score_set(*, urn: str, worker: ArqRedis = Depends(deps.get_worker)) -> Any: +# await worker.lpush(MAPPING_QUEUE_NAME, urn) # type: ignore +# await worker.enqueue_job( +# "variant_mapper_manager", +# None, +# None, +# None +# ) From 80b23a43214d2fdae721ef7f396048e1e499dcbd Mon Sep 17 00:00:00 2001 From: EstelleDa Date: Tue, 10 Sep 2024 17:10:52 +1000 Subject: [PATCH 28/54] Debug has_permission function. --- src/mavedb/lib/permissions.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/mavedb/lib/permissions.py b/src/mavedb/lib/permissions.py index 972b91be..1b2f966c 100644 --- a/src/mavedb/lib/permissions.py +++ b/src/mavedb/lib/permissions.py @@ -95,8 +95,10 @@ def has_permission(user_data: Optional[UserData], item: Base, action: Action) -> elif private: # Do not acknowledge the existence of a private entity. return PermissionResponse(False, 404, f"experiment set with URN '{item.urn}' not found") + elif user_data is None or user_data.user is None: + return PermissionResponse(False, 401, f"insufficient permissions for URN '{item.urn}'") else: - return PermissionResponse(False) + return PermissionResponse(False, 403, f"insufficient permissions for URN '{item.urn}'") elif action == Action.UPDATE: if user_may_edit: return PermissionResponse(True) @@ -106,8 +108,10 @@ def has_permission(user_data: Optional[UserData], item: Base, action: Action) -> elif private: # Do not acknowledge the existence of a private entity. return PermissionResponse(False, 404, f"experiment set with URN '{item.urn}' not found") + elif user_data is None or user_data.user is None: + return PermissionResponse(False, 401, f"insufficient permissions for URN '{item.urn}'") else: - return PermissionResponse(False) + return PermissionResponse(False, 403, f"insufficient permissions for URN '{item.urn}'") elif action == Action.DELETE: # Owner may only delete an experiment set if it has not already been published. if user_may_edit: @@ -143,8 +147,10 @@ def has_permission(user_data: Optional[UserData], item: Base, action: Action) -> elif private: # Do not acknowledge the existence of a private entity. return PermissionResponse(False, 404, f"experiment with URN '{item.urn}' not found") + elif user_data is None or user_data.user is None: + return PermissionResponse(False, 401, f"insufficient permissions for URN '{item.urn}'") else: - return PermissionResponse(False) + return PermissionResponse(False, 403, f"insufficient permissions for URN '{item.urn}'") elif action == Action.UPDATE: if user_may_edit: return PermissionResponse(True) @@ -154,8 +160,10 @@ def has_permission(user_data: Optional[UserData], item: Base, action: Action) -> elif private: # Do not acknowledge the existence of a private entity. return PermissionResponse(False, 404, f"experiment with URN '{item.urn}' not found") + elif user_data is None or user_data.user is None: + return PermissionResponse(False, 401, f"insufficient permissions for URN '{item.urn}'") else: - return PermissionResponse(False) + return PermissionResponse(False, 403, f"insufficient permissions for URN '{item.urn}'") elif action == Action.DELETE: # Owner may only delete an experiment if it has not already been published. if user_may_edit: @@ -191,8 +199,10 @@ def has_permission(user_data: Optional[UserData], item: Base, action: Action) -> elif private: # Do not acknowledge the existence of a private entity. return PermissionResponse(False, 404, f"score set with URN '{item.urn}' not found") + elif user_data is None or user_data.user is None: + return PermissionResponse(False, 401, f"insufficient permissions for URN '{item.urn}'") else: - return PermissionResponse(False) + return PermissionResponse(False, 403, f"insufficient permissions for URN '{item.urn}'") elif action == Action.UPDATE: if user_may_edit: return PermissionResponse(True) @@ -202,8 +212,10 @@ def has_permission(user_data: Optional[UserData], item: Base, action: Action) -> elif private: # Do not acknowledge the existence of a private entity. return PermissionResponse(False, 404, f"score set with URN '{item.urn}' not found") + elif user_data is None or user_data.user is None: + return PermissionResponse(False, 401, f"insufficient permissions for URN '{item.urn}'") else: - return PermissionResponse(False) + return PermissionResponse(False, 403, f"insufficient permissions for URN '{item.urn}'") elif action == Action.DELETE: # Owner may only delete a score set if it has not already been published. if user_may_edit: @@ -247,7 +259,7 @@ def has_permission(user_data: Optional[UserData], item: Base, action: Action) -> elif roles_permitted(active_roles, [UserRole.admin]): return PermissionResponse(True) else: - return PermissionResponse(False) + return PermissionResponse(False, 403, "Insufficient permissions for user update.") elif action == Action.UPDATE: if user_is_self: return PermissionResponse(True) From a3eec96f1d114326fe932a8c249f0951e23f452a Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Wed, 11 Sep 2024 14:17:48 -0700 Subject: [PATCH 29/54] Add Mapping Exceptions for Mapper Worker Jobs --- src/mavedb/lib/exceptions.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/mavedb/lib/exceptions.py b/src/mavedb/lib/exceptions.py index d2b10b6c..319e5f13 100644 --- a/src/mavedb/lib/exceptions.py +++ b/src/mavedb/lib/exceptions.py @@ -160,3 +160,15 @@ class NonexistentOrcidUserError(ValueError): """Raised when a user tries to create a contributor with a non-existent ORCID ID""" pass + + +class NonexistentMappingResultsError(ValueError): + """Raised when score set mapping results do not contain mapping results""" + + pass + + +class NonexistentMappingReferenceError(ValueError): + """Raised when score set mapping results do not contain a valid reference sequence""" + + pass From 846da455caf84b7fef424dfd89c0dcab460f1d21 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Wed, 11 Sep 2024 14:18:02 -0700 Subject: [PATCH 30/54] Keep Worker Jobs --- src/mavedb/worker/settings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mavedb/worker/settings.py b/src/mavedb/worker/settings.py index 1972aad0..1b181123 100644 --- a/src/mavedb/worker/settings.py +++ b/src/mavedb/worker/settings.py @@ -63,4 +63,3 @@ class ArqWorkerSettings: cron_jobs: list = BACKGROUND_CRONJOBS job_timeout = 5 * 60 * 60 # Keep jobs alive for a long while... - keep_result = 0 # don't keep job results, in order to continuously enqueue new mapping jobs From 364ea84236818d3b38054e7c50d7eab523dd0d61 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Wed, 11 Sep 2024 14:21:13 -0700 Subject: [PATCH 31/54] Uncorrelate try...except Blocks During Mapping --- src/mavedb/worker/jobs.py | 622 ++++++++++++++++++++++++++------------ 1 file changed, 430 insertions(+), 192 deletions(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 97480a34..9fed2e55 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -2,8 +2,9 @@ import functools import logging import requests +from contextlib import asynccontextmanager from datetime import timedelta, date -from typing import Optional +from typing import Callable, Optional import pandas as pd from arq import ArqRedis @@ -13,6 +14,7 @@ from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import Session +from mavedb.lib.exceptions import NonexistentMappingReferenceError, NonexistentMappingResultsError from mavedb.lib.score_sets import ( columns_for_dataset, create_variants, @@ -38,6 +40,18 @@ MAPPING_QUEUE_NAME = "vrs_mapping_queue" MAPPING_CURRENT_ID_NAME = "vrs_mapping_current_job_id" +BACKOFF_LIMIT = 5 +BACKOFF_IN_SECONDS = 15 + + +@asynccontextmanager +async def mapping_in_execution(redis: ArqRedis, job_id: str): + await redis.set(MAPPING_CURRENT_ID_NAME, job_id) + try: + yield + finally: + await redis.set(MAPPING_CURRENT_ID_NAME, "") + def setup_job_state(ctx, invoker: int, resource: str, correlation_id: str): ctx["state"][ctx["job_id"]] = { @@ -49,6 +63,31 @@ def setup_job_state(ctx, invoker: int, resource: str, correlation_id: str): return ctx["state"][ctx["job_id"]] +async def enqueue_job_with_backoff( + redis: ArqRedis, job_name: str, attempt: int, *args +) -> tuple[Optional[str], bool, int]: + new_job_id = None + backoff = None + limit_reached = attempt >= BACKOFF_LIMIT + if not limit_reached: + limit_reached = True + backoff = BACKOFF_IN_SECONDS * (2**attempt) + attempt = attempt + 1 + + # NOTE: for jobs supporting backoff, `attempt` should be the final argument. + new_job = await redis.enqueue_job( + job_name, + *args, + attempt, + _defer_by=timedelta(seconds=backoff), + ) + + if new_job: + new_job_id = new_job.job_id + + return (new_job_id, not limit_reached, backoff) + + async def create_variants_for_score_set( ctx, correlation_id: str, score_set_urn: str, updater_id: int, scores: pd.DataFrame, counts: pd.DataFrame ): @@ -165,248 +204,447 @@ async def create_variants_for_score_set( async def map_variants_for_score_set( - ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int -): - logging_context = {} - try: - db: Session = ctx["db"] - redis: ArqRedis = ctx["redis"] + ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int, attempt: int = 0 +) -> dict: + async with mapping_in_execution(redis=ctx["redis"], job_id=ctx["job_id"]): + logging_context = {} + try: + db: Session = ctx["db"] + redis: ArqRedis = ctx["redis"] - logging_context = setup_job_state(ctx, updater_id, score_set_urn, correlation_id) - logger.info(msg="Started variant mapping", extra=logging_context) + logging_context = setup_job_state(ctx, updater_id, score_set_urn, correlation_id) + logging_context["attempt"] = attempt + logger.info(msg="Started variant mapping", extra=logging_context) - # Do not block Worker event loop during mapping, see: https://arq-docs.helpmanual.io/#synchronous-jobs. - vrs = vrs_mapper() - blocking = functools.partial(vrs.map_score_set, score_set_urn) - loop = asyncio.get_running_loop() + except Exception as e: + # TODO: mapping state + send_slack_message(e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="Variant mapper encountered an unexpected error during setup. This job will not be retried.", + extra=logging_context, + ) - score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() - score_set.mapping_state = MappingState.processing - score_set.mapping_errors = null() + return {"success": False, "retried": False} + try: + score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() + score_set.mapping_state = MappingState.processing + score_set.mapping_errors = null() + db.commit() + + logging_context["current_mapping_resource"] = score_set.urn + logging_context["mapping_state"] = score_set.mapping_state + logger.debug(msg="Fetched score set metadata for mapping job.", extra=logging_context) + + except Exception as e: + # TODO: mapping state + db.rollback() + send_slack_message(e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.warn( + msg="Variant mapper encountered an unexpected error while fetching score set metadata. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False} + + try: + # Do not block Worker event loop during mapping, see: https://arq-docs.helpmanual.io/#synchronous-jobs. + vrs = vrs_mapper() + blocking = functools.partial(vrs.map_score_set, score_set_urn) + loop = asyncio.get_running_loop() + + except Exception as e: + # TODO: mapping state + send_slack_message(e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.warn( + msg="Variant mapper encountered an unexpected error while preparing the mapping event loop. This job will not be retried.", + extra=logging_context, + ) + + return {"success": False, "retried": False} + + mapping_results = None try: mapping_results = await loop.run_in_executor(ctx["pool"], blocking) + logger.debug(msg="Done mapping variants.", extra=logging_context) + except requests.exceptions.HTTPError as e: score_set.mapping_state = MappingState.failed score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping. Mapping will be automatically retried for this score set." + "error_message": f"Encountered an internal server error during mapping. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt+1}/5)." } - logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logging_context["mapping_state"] = score_set.mapping_state.name - logger.critical( - msg="Encountered an exception while mapping variants", - extra=logging_context - ) - send_slack_message(err=e) - # put back in queue, since this is an internal error rather than a problem with the score set - await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore - await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) db.commit() - return - - logger.debug("Done mapping variants.") - if mapping_results: - if not mapping_results["mapped_scores"]: - # if there are no mapped scores, the score set failed to map. - score_set.mapping_state = MappingState.failed - score_set.mapping_errors = mapping_results # TODO check that this gets inserted as json correctly + logging_context["mapping_state"] = score_set.mapping_state.name + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.warn(msg="Encountered an HTTP error while mapping variants", extra=logging_context) + + new_job_id = None + max_retries_exceeded = None + try: + await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore + new_job_id, max_retries_exceeded, backoff_time = await enqueue_job_with_backoff( + redis, "variant_mapper_manager", attempt, correlation_id, score_set_urn, updater_id + ) + logging_context["backoff_limit_exceeded"] = max_retries_exceeded + logging_context["backoff_deferred_in_seconds"] = backoff_time + logging_context["backoff_job_id"] = new_job_id + + except Exception as backoff_e: + send_slack_message(backoff_e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} + logger.critical( + msg="While attempting to re-enqueue a mapping job that exited in error, another exception was encountered. This score set will not be mapped.", + extra=logging_context, + ) else: - # TODO after adding multi target mapping support: - # this assumes single-target mapping, will need to be changed to support multi-target mapping - # just in case there are multiple target genes in the db for a score set (this point shouldn't be reached - # while we only support single-target mapping), match up the target sequence with the one in the computed genomic reference sequence. - # TODO after adding accession-based score set mapping support: - # this also assumes that the score set is based on a target sequence, not a target accession - - if mapping_results["computed_genomic_reference_sequence"]: - target_sequence = mapping_results["computed_genomic_reference_sequence"]["sequence"] - elif mapping_results["computed_protein_reference_sequence"]: - target_sequence = mapping_results["computed_protein_reference_sequence"]["sequence"] - else: - score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." - } - # TODO create error here and send to slack - logging_context["mapping_state"] = score_set.mapping_state.name - logger.error(msg="No target sequence metadata provided by mapping job", extra=logging_context) - # put back in queue, since this is an internal error rather than a problem with the score set - await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore - await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) - db.commit() - return - - target_gene = db.scalars(select(TargetGene) - .join(ScoreSet) - .join(TargetSequence) - .where( - ScoreSet.urn == str(score_set_urn), - TargetSequence.sequence == target_sequence, + if new_job_id and not max_retries_exceeded: + logger.info( + msg="After encountering an HTTP error while mapping variants, another mapping job was queued.", + extra=logging_context, + ) + elif new_job_id is None and not max_retries_exceeded: + logger.error( + msg="After encountering an HTTP error while mapping variants, another mapping job was unable to be queued. This score set will not be mapped.", + extra=logging_context, ) - ).one() - - excluded_pre_mapped_keys = {"sequence"} - if mapping_results["computed_genomic_reference_sequence"] and mapping_results["mapped_genomic_reference_sequence"]: - pre_mapped_metadata = mapping_results["computed_genomic_reference_sequence"] - target_gene.pre_mapped_metadata = cast({ - "genomic": { - k: pre_mapped_metadata[k] for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys - } - }, JSONB) - target_gene.post_mapped_metadata = cast({"genomic": mapping_results["mapped_genomic_reference_sequence"]}, JSONB) - elif mapping_results["computed_protein_reference_sequence"] and mapping_results["mapped_protein_reference_sequence"]: - pre_mapped_metadata = mapping_results["computed_protein_reference_sequence"] - target_gene.pre_mapped_metadata = cast({ - "protein": { - k: pre_mapped_metadata[k] for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys - } - }, JSONB) - target_gene.post_mapped_metadata = cast({"protein": mapping_results["mapped_protein_reference_sequence"]}, JSONB) else: - score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." - } - # TODO create error here and send to slack - logging_context["mapping_state"] = score_set.mapping_state.name logger.error( - msg="No mapped reference sequence metadata provided by mapping job", + msg="After encountering an HTTP error while mapping variants, the maximum retries for this job were exceeded. This score set will not be mapped.", extra=logging_context, ) - # put back in queue, since this is an internal error rather than a problem with the score set - await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore - await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) - db.commit() - return - - total_variants = 0 - successful_mapped_variants = 0 - for mapped_score in mapping_results["mapped_scores"]: - total_variants += 1 - variant_urn = mapped_score["mavedb_id"] - variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() - - # TODO check with team that this is desired behavior - # (another possible behavior would be to always set current to true if there is no other 'current' for this variant id) - if mapped_score["pre_mapped"] and mapped_score["post_mapped"]: - current = True - successful_mapped_variants += 1 - else: - current = False - - # there should only be one current mapped variant per variant id, so update old mapped variant to current = false - if current: - db.query(MappedVariant).filter(MappedVariant.variant_id == variant.id).update({"current": False}) - - mapped_variant = MappedVariant( - pre_mapped=mapped_score["pre_mapped"] if mapped_score["pre_mapped"] else None, - post_mapped=mapped_score["post_mapped"] if mapped_score["post_mapped"] else None, - variant_id=variant.id, - modification_date=date.today(), - mapped_date=mapping_results["mapped_date_utc"], - vrs_version=mapped_score["vrs_version"] if mapped_score["vrs_version"] else None, - mapping_api_version=mapping_results["dcd_mapping_version"], - error_message=mapped_score["error_message"] if mapped_score["error_message"] else None, - current=current, + finally: + return {"success": False, "retried": (not max_retries_exceeded and new_job_id is not None)} + + except Exception as e: + send_slack_message(e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.warn( + msg="Variant mapper encountered an unexpected error while mapping variants. This job will be retried.", + extra=logging_context, + ) + + new_job_id = None + max_retries_exceeded = None + try: + await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore + new_job_id, max_retries_exceeded, backoff_time = await enqueue_job_with_backoff( + redis, "variant_mapper_manager", attempt, correlation_id, score_set_urn, updater_id + ) + logging_context["backoff_limit_exceeded"] = max_retries_exceeded + logging_context["backoff_deferred_in_seconds"] = backoff_time + logging_context["backoff_job_id"] = new_job_id + + except Exception as backoff_e: + send_slack_message(backoff_e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} + logger.critical( + msg="While attempting to re-enqueue a mapping job that exited in error, another exception was encountered. This score set will not be mapped.", + extra=logging_context, + ) + else: + if new_job_id and not max_retries_exceeded: + logger.info( + msg="After encountering an error while mapping variants, another mapping job was queued.", + extra=logging_context, + ) + elif new_job_id is None and not max_retries_exceeded: + logger.error( + msg="After encountering an error while mapping variants, another mapping job was unable to be queued. This score set will not be mapped.", + extra=logging_context, + ) + else: + logger.error( + msg="After encountering an error while mapping variants, the maximum retries for this job were exceeded. This score set will not be mapped.", + extra=logging_context, ) - db.add(mapped_variant) + finally: + return {"success": False, "retried": (not max_retries_exceeded and new_job_id is not None)} - if successful_mapped_variants == 0: + try: + if mapping_results: + if not mapping_results["mapped_scores"]: + # if there are no mapped scores, the score set failed to map. score_set.mapping_state = MappingState.failed - score_set.mapping_errors = {"error_message": "All variants failed to map"} - elif successful_mapped_variants < total_variants: - score_set.mapping_state = MappingState.incomplete + score_set.mapping_errors = mapping_results # TODO check that this gets inserted as json correctly else: - score_set.mapping_state = MappingState.complete + # TODO(VariantEffect/dcd-mapping2#2) after adding multi target mapping support: + # this assumes single-target mapping, will need to be changed to support multi-target mapping + # just in case there are multiple target genes in the db for a score set (this point shouldn't be reached + # while we only support single-target mapping), match up the target sequence with the one in the computed genomic reference sequence. + # TODO(VariantEffect/dcd-mapping2#3) after adding accession-based score set mapping support: + # this also assumes that the score set is based on a target sequence, not a target accession + + if mapping_results["computed_genomic_reference_sequence"]: + target_sequence = mapping_results["computed_genomic_reference_sequence"]["sequence"] + elif mapping_results["computed_protein_reference_sequence"]: + target_sequence = mapping_results["computed_protein_reference_sequence"]["sequence"] + else: + raise NonexistentMappingReferenceError() + + target_gene = db.scalars( + select(TargetGene) + .join(ScoreSet) + .join(TargetSequence) + .where( + ScoreSet.urn == str(score_set_urn), + TargetSequence.sequence == target_sequence, + ) + ).one() + + excluded_pre_mapped_keys = {"sequence"} + if ( + mapping_results["computed_genomic_reference_sequence"] + and mapping_results["mapped_genomic_reference_sequence"] + ): + pre_mapped_metadata = mapping_results["computed_genomic_reference_sequence"] + target_gene.pre_mapped_metadata = cast( + { + "genomic": { + k: pre_mapped_metadata[k] + for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys + } + }, + JSONB, + ) + target_gene.post_mapped_metadata = cast( + {"genomic": mapping_results["mapped_genomic_reference_sequence"]}, JSONB + ) + elif ( + mapping_results["computed_protein_reference_sequence"] + and mapping_results["mapped_protein_reference_sequence"] + ): + pre_mapped_metadata = mapping_results["computed_protein_reference_sequence"] + target_gene.pre_mapped_metadata = cast( + { + "protein": { + k: pre_mapped_metadata[k] + for k in set(list(pre_mapped_metadata.keys())) - excluded_pre_mapped_keys + } + }, + JSONB, + ) + target_gene.post_mapped_metadata = cast( + {"protein": mapping_results["mapped_protein_reference_sequence"]}, JSONB + ) + else: + raise NonexistentMappingReferenceError() + + total_variants = 0 + successful_mapped_variants = 0 + for mapped_score in mapping_results["mapped_scores"]: + total_variants += 1 + variant_urn = mapped_score["mavedb_id"] + variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() + + # there should only be one current mapped variant per variant id, so update old mapped variant to current = false + existing_mapped_variant = ( + db.query(MappedVariant) + .filter(MappedVariant.variant_id == variant.id, MappedVariant.current.is_(True)) + .one_or_none() + ) + + if existing_mapped_variant: + existing_mapped_variant.current = False + + mapped_variant = MappedVariant( + pre_mapped=mapped_score["pre_mapped"] if mapped_score["pre_mapped"] else None, + post_mapped=mapped_score["post_mapped"] if mapped_score["post_mapped"] else None, + variant_id=variant.id, + modification_date=date.today(), + mapped_date=mapping_results["mapped_date_utc"], + vrs_version=mapped_score["vrs_version"] if mapped_score["vrs_version"] else None, + mapping_api_version=mapping_results["dcd_mapping_version"], + error_message=mapped_score["error_message"] if mapped_score["error_message"] else None, + current=True, + ) + db.add(mapped_variant) + + if successful_mapped_variants == 0: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = {"error_message": "All variants failed to map"} + elif successful_mapped_variants < total_variants: + score_set.mapping_state = MappingState.incomplete + else: + score_set.mapping_state = MappingState.complete - logging_context["mapped_variants_inserted_db"] = len(mapping_results['mapped_scores']) - logging_context["mapping_state"] = score_set.mapping_state.name - logger.info(msg="Inserted mapped variants into db.", extra=logging_context) + logging_context["mapped_variants_inserted_db"] = len(mapping_results["mapped_scores"]) + logging_context["mapping_state"] = score_set.mapping_state.name + logger.info(msg="Inserted mapped variants into db.", extra=logging_context) - else: + else: + raise NonexistentMappingResultsError() + + except Exception as e: + db.rollback() score_set.mapping_state = MappingState.failed score_set.mapping_errors = { - "error_message": "Encountered an unexpected error during mapping. Mapping will be automatically retried for this score set." + "error_message": f"Encountered an unexpected error while parsing mapped variants. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt+1}/5)." } - # TODO create error here and send to slack - logging_context["mapping_state"] = score_set.mapping_state.name - logger.critical( - msg="No mapping job output for score set, but no HTTPError encountered.", + db.commit() + + send_slack_message(e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.warn( + msg="An unexpected error occurred during variant mapping. This job will be attempted again.", extra=logging_context, ) - # put back in queue, since this is an internal error rather than a problem with the score set - await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore - await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) - db.commit() - return + + new_job_id = None + max_retries_exceeded = None + try: + await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore + new_job_id, max_retries_exceeded = await enqueue_job_with_backoff( + redis, "variant_mapper_manager", attempt, correlation_id, score_set_urn, updater_id + ) + logging_context["backoff_limit_exceeded"] = max_retries_exceeded + logging_context["backoff_deferred_in_seconds"] = backoff_time + logging_context["backoff_job_id"] = new_job_id + + except Exception as backoff_e: + send_slack_message(backoff_e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} + logger.critical( + msg="While attempting to re-enqueue a mapping job that exited in error, another exception was encountered. This score set will not be mapped.", + extra=logging_context, + ) + else: + if new_job_id and not max_retries_exceeded: + logger.info( + msg="After encountering an error while parsing mapped variants, another mapping job was queued.", + extra=logging_context, + ) + elif new_job_id is None and not max_retries_exceeded: + logger.error( + msg="After encountering an error while parsing mapped variants, another mapping job was unable to be queued. This score set will not be mapped.", + extra=logging_context, + ) + else: + logger.error( + msg="After encountering an error while parsing mapped variants, the maximum retries for this job were exceeded. This score set will not be mapped.", + extra=logging_context, + ) + finally: + return {"success": False, "retried": (not max_retries_exceeded and new_job_id is not None)} db.commit() - except Exception as e: - # score set selection is performed in try statement, so don't update the db if this outer except statement is reached - send_slack_message(e) - logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logging_context["mapping_state"] = MappingState.failed.name - logger.error( - msg="An unexpected error occurred during variant mapping.", - extra=logging_context - ) - # put back in queue, since this is an internal error rather than a problem with the score set - await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore - await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) - db.rollback() - return - + return {"success": True} async def variant_mapper_manager( - ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int -) -> Optional[Job]: + ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int, attempt: int = 0 +) -> dict: logging_context = {} try: + redis: ArqRedis = ctx["redis"] + logging_context = setup_job_state(ctx, updater_id, score_set_urn, correlation_id) + logging_context["attempt"] = attempt logger.debug(msg="Variant mapping manager began execution", extra=logging_context) - redis: ArqRedis = ctx["redis"] - queue_length = await redis.llen(MAPPING_QUEUE_NAME) # type:ignore + except Exception as e: + send_slack_message(e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error(msg="Variant mapper manager encountered an unexpected error during setup.", extra=logging_context) + return {"success": False, "enqueued_job": None} + + mapping_job_id = None + mapping_job_status = None + try: + queue_length = await redis.llen(MAPPING_QUEUE_NAME) # type: ignore + + # Setup the job id cache if it does not already exist. + if await redis.exists(MAPPING_CURRENT_ID_NAME): + await redis.set(MAPPING_CURRENT_ID_NAME, "") + if queue_length == 0: logger.debug(msg="No mapping jobs exist in the queue.", extra=logging_context) - return None + return {"success": True, "enqueued_job": None} - logging_context["variant_map_queue_length"] = queue_length - logger.debug(msg="Found mapping job(s) in queue", extra=logging_context) + logging_context["variant_mapping_queue_length"] = queue_length + logger.debug(msg="Found mapping job(s) in queue.", extra=logging_context) + + mapping_job_id = (await redis.get(MAPPING_CURRENT_ID_NAME)).decode("utf-8") + logging_context["existing_mapping_job_id"] = mapping_job_id + + if mapping_job_id: + mapping_job_status = await Job(job_id=mapping_job_id, redis=redis).status() + logging_context["existing_mapping_job_status"] = mapping_job_status.value + + except Exception as e: + send_slack_message(e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="Variant mapper manager encountered an unexpected error while fetching the executing mapping job.", + extra=logging_context, + ) + return {"success": False, "enqueued_job": None} + + new_job = None + new_job_id = None + try: + if not mapping_job_id or mapping_job_status in (JobStatus.not_found, JobStatus.complete): + logger.debug(msg="No mapping jobs are running, queuing a new one.", extra=logging_context) + + queued_urn = (await redis.rpop(MAPPING_QUEUE_NAME)).decode("utf-8") # type: ignore + logging_context["current_mapping_resource"] = queued_urn - if await redis.exists(MAPPING_CURRENT_ID_NAME): - mapping_job_id = await redis.get(MAPPING_CURRENT_ID_NAME) - if mapping_job_id: - mapping_job_id = mapping_job_id.decode('utf-8') - job_status = await Job(job_id=mapping_job_id, redis=redis).status() - - if not mapping_job_id or job_status is JobStatus.not_found or job_status is JobStatus.complete: - logger.info(msg="No mapping jobs are running, queuing a new one.", extra=logging_context) - queued_urn = await redis.rpop(MAPPING_QUEUE_NAME) # type:ignore - queued_urn = queued_urn.decode('utf-8') # NOTE: the score_set_urn provided to this function is only used for logging context; # get the urn from the queue and pass that urn to map_variants_for_score_set - new_job = await redis.enqueue_job("map_variants_for_score_set", correlation_id, queued_urn, updater_id) - if new_job: # for mypy, since enqueue_job can return None - new_job_id = new_job.job_id - await redis.set(MAPPING_CURRENT_ID_NAME, new_job_id) - return new_job - else: - logger.debug( - msg="A mapping job is already running, deferring mapping by 5 minutes.", - extra=logging_context, + new_job = await redis.enqueue_job( + "map_variants_for_score_set", correlation_id, queued_urn, updater_id, attempt ) + if new_job: + new_job_id = new_job.job_id + + logging_context["new_mapping_job_id"] = new_job_id + logger.info(msg="Queued a new mapping job.", extra=logging_context) + + return {"success": True, "enqueued_job": new_job_id} + + logger.info( + msg="A mapping job is already running, or a new job was unable to be enqueued. Deferring mapping by 5 minutes.", + extra=logging_context, + ) + + new_job = await redis.enqueue_job( + "variant_mapper_manager", + correlation_id, + score_set_urn, + updater_id, + attempt, + _defer_by=timedelta(minutes=5), + ) + + if new_job: + new_job_id = new_job.job_id + + logging_context["new_mapping_manager_job_id"] = new_job_id + logger.info(msg="Deferred a new mapping manager job.", extra=logging_context) + # Our persistent Redis queue and ARQ's execution rules ensure that even if the worker is stopped and not restarted # before the deferred time, these deferred jobs will still run once able. - return await redis.enqueue_job("variant_mapper_manager", _defer_by=timedelta(minutes=5)) + return {"success": True, "enqueued_job": new_job_id} + + logger.warn( + msg="Unable to queue a new mapping job or defer mapping. This score set will not be mapped.", + extra=logging_context, + ) + # TODO: If we end up here, we were unable to enqueue a new mapping job or a new manager job despite expecting we should have + # been able to do so. We should raise some sort of exception. + # TODO: set failed/no-map mapping state + return {"success": False, "enqueued_job": new_job_id} + except Exception as e: send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logger.error( - msg="An unexpected error occurred during variant mapper management.", - extra=logging_context + msg="Variant mapper manager encountered an unexpected error while enqueing a mapping job.", + extra=logging_context, ) - return None + # TODO: set failed/no-map mapping state + return {"success": False, "enqueued_job": new_job_id} From ddfa7dc9bf6a54e5c010b329c0eedad438f4fdc3 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 11 Sep 2024 16:33:12 -0700 Subject: [PATCH 32/54] Add more granular mapping state info --- .../d7e6f8c3b9dc_scoreset_mapping_columns.py | 2 + src/mavedb/models/enums/mapping_state.py | 2 + src/mavedb/worker/jobs.py | 78 +++++++++++++++++-- 3 files changed, 76 insertions(+), 6 deletions(-) diff --git a/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py b/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py index ec242bd6..dd1036c6 100644 --- a/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py +++ b/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py @@ -28,6 +28,8 @@ def upgrade(): "processing", "failed", "complete", + "pending_variant_processing", + "not_attempted", name="mappingstate", native_enum=False, create_constraint=True, diff --git a/src/mavedb/models/enums/mapping_state.py b/src/mavedb/models/enums/mapping_state.py index c58299af..9e803c49 100644 --- a/src/mavedb/models/enums/mapping_state.py +++ b/src/mavedb/models/enums/mapping_state.py @@ -6,3 +6,5 @@ class MappingState(enum.Enum): processing = "processing" failed = "failed" complete = "complete" + pending_variant_processing = "pending_variant_processing" + not_attempted = "not_attempted" diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 9fed2e55..9ab25538 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -110,7 +110,9 @@ async def create_variants_for_score_set( score_set.modified_by = updated_by score_set.processing_state = ProcessingState.processing + score_set.mapping_state = MappingState.pending_variant_processing logging_context["processing_state"] = score_set.processing_state.name + logging_context["mapping_state"] = score_set.mapping_state.name db.add(score_set) db.commit() @@ -151,9 +153,11 @@ async def create_variants_for_score_set( db.rollback() score_set.processing_state = ProcessingState.failed score_set.processing_errors = {"exception": str(e), "detail": e.triggering_exceptions} + score_set.mapping_state = MappingState.not_attempted logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logging_context["processing_state"] = score_set.processing_state.name + logging_context["mapping_state"] = score_set.mapping_state.name logger.warning(msg="Encountered a validation error while processing variants.", extra=logging_context) # NOTE: Since these are likely to be internal errors, it makes less sense to add them to the DB and surface them to the end user. @@ -162,9 +166,11 @@ async def create_variants_for_score_set( db.rollback() score_set.processing_state = ProcessingState.failed score_set.processing_errors = {"exception": str(e), "detail": []} + score_set.mapping_state = MappingState.not_attempted logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logging_context["processing_state"] = score_set.processing_state.name + logging_context["mapping_state"] = score_set.mapping_state.name logger.warning(msg="Encountered an internal exception while processing variants.", extra=logging_context) send_slack_message(err=e) @@ -173,10 +179,12 @@ async def create_variants_for_score_set( except BaseException as e: db.rollback() score_set.processing_state = ProcessingState.failed + score_set.mapping_state = MappingState.not_attempted db.commit() logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logging_context["processing_state"] = score_set.processing_state.name + logging_context["mapping_state"] = score_set.mapping_state.name logger.error( msg="Encountered an unhandled exception while creating variants for score set.", extra=logging_context ) @@ -217,7 +225,7 @@ async def map_variants_for_score_set( logger.info(msg="Started variant mapping", extra=logging_context) except Exception as e: - # TODO: mapping state + # NOTE: can't update mapping state here because setup is necessary to update the db send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logger.error( @@ -226,7 +234,8 @@ async def map_variants_for_score_set( ) return {"success": False, "retried": False} - + + score_set = None try: score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() score_set.mapping_state = MappingState.processing @@ -238,11 +247,16 @@ async def map_variants_for_score_set( logger.debug(msg="Fetched score set metadata for mapping job.", extra=logging_context) except Exception as e: - # TODO: mapping state db.rollback() + if score_set: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping" + } + db.commit() send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logger.warn( + logger.error( msg="Variant mapper encountered an unexpected error while fetching score set metadata. This job will not be retried.", extra=logging_context, ) @@ -256,10 +270,15 @@ async def map_variants_for_score_set( loop = asyncio.get_running_loop() except Exception as e: - # TODO: mapping state + db.rollback() + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping" + } + db.commit() send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logger.warn( + logger.error( msg="Variant mapper encountered an unexpected error while preparing the mapping event loop. This job will not be retried.", extra=logging_context, ) @@ -272,6 +291,7 @@ async def map_variants_for_score_set( logger.debug(msg="Done mapping variants.", extra=logging_context) except requests.exceptions.HTTPError as e: + db.rollback() score_set.mapping_state = MappingState.failed score_set.mapping_errors = { "error_message": f"Encountered an internal server error during mapping. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt+1}/5)." @@ -294,6 +314,11 @@ async def map_variants_for_score_set( logging_context["backoff_job_id"] = new_job_id except Exception as backoff_e: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping" + } + db.commit() send_slack_message(backoff_e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} logger.critical( @@ -307,11 +332,21 @@ async def map_variants_for_score_set( extra=logging_context, ) elif new_job_id is None and not max_retries_exceeded: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping" + } + db.commit() logger.error( msg="After encountering an HTTP error while mapping variants, another mapping job was unable to be queued. This score set will not be mapped.", extra=logging_context, ) else: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping" + } + db.commit() logger.error( msg="After encountering an HTTP error while mapping variants, the maximum retries for this job were exceeded. This score set will not be mapped.", extra=logging_context, @@ -320,6 +355,12 @@ async def map_variants_for_score_set( return {"success": False, "retried": (not max_retries_exceeded and new_job_id is not None)} except Exception as e: + db.rollback() + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": f"Encountered an internal server error during mapping. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt+1}/5)." + } + db.commit() send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logger.warn( @@ -352,11 +393,21 @@ async def map_variants_for_score_set( extra=logging_context, ) elif new_job_id is None and not max_retries_exceeded: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping" + } + db.commit() logger.error( msg="After encountering an error while mapping variants, another mapping job was unable to be queued. This score set will not be mapped.", extra=logging_context, ) else: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping" + } + db.commit() logger.error( msg="After encountering an error while mapping variants, the maximum retries for this job were exceeded. This score set will not be mapped.", extra=logging_context, @@ -505,6 +556,11 @@ async def map_variants_for_score_set( logging_context["backoff_job_id"] = new_job_id except Exception as backoff_e: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping" + } + db.commit() send_slack_message(backoff_e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} logger.critical( @@ -518,11 +574,21 @@ async def map_variants_for_score_set( extra=logging_context, ) elif new_job_id is None and not max_retries_exceeded: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping" + } + db.commit() logger.error( msg="After encountering an error while parsing mapped variants, another mapping job was unable to be queued. This score set will not be mapped.", extra=logging_context, ) else: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = { + "error_message": "Encountered an internal server error during mapping" + } + db.commit() logger.error( msg="After encountering an error while parsing mapped variants, the maximum retries for this job were exceeded. This score set will not be mapped.", extra=logging_context, From 42e5520596acc42186204734e4eb364665ae65f2 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Wed, 11 Sep 2024 16:55:18 -0700 Subject: [PATCH 33/54] fixed: Count DF was being validated even without count data for score set updates --- src/mavedb/routers/score_sets.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/mavedb/routers/score_sets.py b/src/mavedb/routers/score_sets.py index 24c602ca..44fd2a30 100644 --- a/src/mavedb/routers/score_sets.py +++ b/src/mavedb/routers/score_sets.py @@ -854,9 +854,13 @@ async def update_score_set( scores_data = pd.DataFrame( variants_to_csv_rows(item.variants, columns=score_columns, dtype="score_data") ).replace("NA", pd.NA) - count_data = pd.DataFrame( - variants_to_csv_rows(item.variants, columns=count_columns, dtype="count_data") - ).replace("NA", pd.NA) + + if item.dataset_columns["count_columns"]: + count_data = pd.DataFrame( + variants_to_csv_rows(item.variants, columns=count_columns, dtype="count_data") + ).replace("NA", pd.NA) + else: + count_data = None # Although this is also updated within the variant creation job, update it here # as well so that we can display the proper UI components (queue invocation delay From fa984bb212b1af43bd38b9492a63e49f124c1bca Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Wed, 11 Sep 2024 16:56:15 -0700 Subject: [PATCH 34/54] Add handling for NA values during CSV validation --- src/mavedb/lib/mave/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/mavedb/lib/mave/utils.py b/src/mavedb/lib/mave/utils.py index f59a3fca..dd6b7591 100644 --- a/src/mavedb/lib/mave/utils.py +++ b/src/mavedb/lib/mave/utils.py @@ -1,5 +1,7 @@ import re +import pandas as pd + NA_VALUE = "NA" NULL_VALUES = ("", "na", "nan", "nil", "none", "null", "n/a", "undefined", NA_VALUE) @@ -22,6 +24,9 @@ def is_csv_null(value): """Return True if a string from a CSV file represents a NULL value.""" + # Avoid any boolean miscasts from comparisons by handling NA types up front. + if pd.isna(value): + return True # Number 0 is treated as False so that all 0 will be converted to NA value. if value == 0: return value From 1a344b486f2123fc02c86b8cf5d316d3b862292d Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Wed, 11 Sep 2024 17:12:09 -0700 Subject: [PATCH 35/54] Create mapper role and give read permissions --- src/mavedb/lib/permissions.py | 6 +++--- src/mavedb/models/enums/user_role.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mavedb/lib/permissions.py b/src/mavedb/lib/permissions.py index 972b91be..42671af3 100644 --- a/src/mavedb/lib/permissions.py +++ b/src/mavedb/lib/permissions.py @@ -90,7 +90,7 @@ def has_permission(user_data: Optional[UserData], item: Base, action: Action) -> if user_may_edit or not private: return PermissionResponse(True) # Roles which may perform this operation. - elif roles_permitted(active_roles, [UserRole.admin]): + elif roles_permitted(active_roles, [UserRole.admin, UserRole.mapper]): return PermissionResponse(True) elif private: # Do not acknowledge the existence of a private entity. @@ -138,7 +138,7 @@ def has_permission(user_data: Optional[UserData], item: Base, action: Action) -> if user_may_edit or not private: return PermissionResponse(True) # Roles which may perform this operation. - elif roles_permitted(active_roles, [UserRole.admin]): + elif roles_permitted(active_roles, [UserRole.admin, UserRole.mapper]): return PermissionResponse(True) elif private: # Do not acknowledge the existence of a private entity. @@ -186,7 +186,7 @@ def has_permission(user_data: Optional[UserData], item: Base, action: Action) -> if user_may_edit or not private: return PermissionResponse(True) # Roles which may perform this operation. - elif roles_permitted(active_roles, [UserRole.admin]): + elif roles_permitted(active_roles, [UserRole.admin, UserRole.mapper]): return PermissionResponse(True) elif private: # Do not acknowledge the existence of a private entity. diff --git a/src/mavedb/models/enums/user_role.py b/src/mavedb/models/enums/user_role.py index 72f1597d..17f5fd35 100644 --- a/src/mavedb/models/enums/user_role.py +++ b/src/mavedb/models/enums/user_role.py @@ -3,3 +3,4 @@ class UserRole(enum.Enum): admin = "admin" + mapper = "mapper" From 4e2f07891e353627bc1672bfd5d3ccf9efac16de Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 12 Sep 2024 09:10:39 -0700 Subject: [PATCH 36/54] When variants are modified, delete old mapped variants --- src/mavedb/worker/jobs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 9ab25538..a4b929ce 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -126,6 +126,7 @@ async def create_variants_for_score_set( raise ValueError("Can't create variants when score set has no targets.") if score_set.variants: + db.execute(delete(MappedVariant).join(Variant).where(Variant.score_set_id == score_set.id)) db.execute(delete(Variant).where(Variant.score_set_id == score_set.id)) logging_context["deleted_variants"] = score_set.num_variants score_set.num_variants = 0 @@ -491,6 +492,9 @@ async def map_variants_for_score_set( variant_urn = mapped_score["mavedb_id"] variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() + if mapped_score["pre_mapped"] and mapped_score["post_mapped"]: + successful_mapped_variants += 1 + # there should only be one current mapped variant per variant id, so update old mapped variant to current = false existing_mapped_variant = ( db.query(MappedVariant) From 3bc3c6ec93fc1c055f707eab3570404b0d814c43 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 12 Sep 2024 09:25:16 -0700 Subject: [PATCH 37/54] Use Result Directly in Canonical Logs --- src/mavedb/lib/logging/canonical.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/mavedb/lib/logging/canonical.py b/src/mavedb/lib/logging/canonical.py index 03f95000..6d639a94 100644 --- a/src/mavedb/lib/logging/canonical.py +++ b/src/mavedb/lib/logging/canonical.py @@ -54,10 +54,8 @@ async def log_job(ctx: dict) -> None: if result is None: logger.error(msg="Job result could not be found.", extra=log_context) - elif result.result == "success": - logger.info(msg="Job completed successfully.", extra=log_context) - elif result.result != "success": - logger.warning(msg="Job completed with handled exception.", extra=log_context) + elif result.result is not None: + logger.info(msg="Job completed with result.", extra=log_context) else: logger.error(msg="Job completed with unhandled exception.", extra=log_context) From 7cfd826a8e225f77e963532c29bb705698ec8c38 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 12 Sep 2024 09:25:39 -0700 Subject: [PATCH 38/54] Add Mapping envvars to Template Env File --- settings/.env.template | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/settings/.env.template b/settings/.env.template index 50d1fef1..c3b3570d 100644 --- a/settings/.env.template +++ b/settings/.env.template @@ -51,3 +51,10 @@ UTA_DB_URL=postgresql://anonymous:anonymous@uta.biocommons.org:5432/uta/uta_2018 #################################################################################################### SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2021-01-29 + +#################################################################################################### +# Environment variables for mapping MaveDB connection +#################################################################################################### + +MAVEDB_BASE_URL=http://app:8000 +MAVEDB_API_KEY=secret From a75ebc9136738ede596742fa1fc639af71d1d006 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 12 Sep 2024 09:26:36 -0700 Subject: [PATCH 39/54] Various Improvements to Mapping and Variant Creation Worker Processes --- src/mavedb/worker/jobs.py | 176 ++++++++++++++------------------------ 1 file changed, 63 insertions(+), 113 deletions(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index a4b929ce..22418bbe 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -1,15 +1,16 @@ import asyncio import functools import logging -import requests from contextlib import asynccontextmanager from datetime import timedelta, date -from typing import Callable, Optional +from typing import Any, Optional + import pandas as pd from arq import ArqRedis from arq.jobs import Job, JobStatus from cdot.hgvs.dataproviders import RESTDataProvider +from fqfa.util.translate import translate_dna from sqlalchemy import cast, delete, select, null from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import Session @@ -65,7 +66,7 @@ def setup_job_state(ctx, invoker: int, resource: str, correlation_id: str): async def enqueue_job_with_backoff( redis: ArqRedis, job_name: str, attempt: int, *args -) -> tuple[Optional[str], bool, int]: +) -> tuple[Optional[str], bool, Any]: new_job_id = None backoff = None limit_reached = attempt >= BACKOFF_LIMIT @@ -125,17 +126,6 @@ async def create_variants_for_score_set( ) raise ValueError("Can't create variants when score set has no targets.") - if score_set.variants: - db.execute(delete(MappedVariant).join(Variant).where(Variant.score_set_id == score_set.id)) - db.execute(delete(Variant).where(Variant.score_set_id == score_set.id)) - logging_context["deleted_variants"] = score_set.num_variants - score_set.num_variants = 0 - - logger.info(msg="Deleted existing variants from score set.", extra=logging_context) - - db.commit() - db.refresh(score_set) - validated_scores, validated_counts = validate_and_standardize_dataframe_pair( scores, counts, score_set.target_genes, hdp ) @@ -145,6 +135,19 @@ async def create_variants_for_score_set( "count_columns": columns_for_dataset(validated_counts), } + # Delete variants after validation occurs so we don't overwrite them in the case of a bad update. + if score_set.variants: + existing_variants = db.scalars(select(Variant.id).where(Variant.score_set_id == score_set.id)).all() + db.execute(delete(MappedVariant).where(MappedVariant.variant_id.in_(existing_variants))) + db.execute(delete(Variant).where(Variant.id.in_(existing_variants))) + logging_context["deleted_variants"] = score_set.num_variants + score_set.num_variants = 0 + + logger.info(msg="Deleted existing variants from score set.", extra=logging_context) + + db.commit() + db.refresh(score_set) + variants_data = create_variants_data(validated_scores, validated_counts, None) create_variants(db, score_set, variants_data) @@ -159,8 +162,11 @@ async def create_variants_for_score_set( logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logging_context["processing_state"] = score_set.processing_state.name logging_context["mapping_state"] = score_set.mapping_state.name + logging_context["created_variants"] = 0 logger.warning(msg="Encountered a validation error while processing variants.", extra=logging_context) + return {"success": False} + # NOTE: Since these are likely to be internal errors, it makes less sense to add them to the DB and surface them to the end user. # Catch all non-system exiting exceptions. except Exception as e: @@ -172,11 +178,13 @@ async def create_variants_for_score_set( logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logging_context["processing_state"] = score_set.processing_state.name logging_context["mapping_state"] = score_set.mapping_state.name + logging_context["created_variants"] = 0 logger.warning(msg="Encountered an internal exception while processing variants.", extra=logging_context) send_slack_message(err=e) + return {"success": False} - # Catch all other exceptions and raise them. The exceptions caught here will be system exiting. + # Catch all other exceptions. The exceptions caught here were intented to be system exiting. except BaseException as e: db.rollback() score_set.processing_state = ProcessingState.failed @@ -186,11 +194,12 @@ async def create_variants_for_score_set( logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logging_context["processing_state"] = score_set.processing_state.name logging_context["mapping_state"] = score_set.mapping_state.name + logging_context["created_variants"] = 0 logger.error( msg="Encountered an unhandled exception while creating variants for score set.", extra=logging_context ) - raise e + return {"success": False} else: score_set.processing_state = ProcessingState.success @@ -209,7 +218,7 @@ async def create_variants_for_score_set( logger.info(msg="Committed new variants to score set.", extra=logging_context) ctx["state"][ctx["job_id"]] = logging_context.copy() - return score_set.processing_state.name + return {"success": True} async def map_variants_for_score_set( @@ -235,7 +244,7 @@ async def map_variants_for_score_set( ) return {"success": False, "retried": False} - + score_set = None try: score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() @@ -251,9 +260,7 @@ async def map_variants_for_score_set( db.rollback() if score_set: score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping" - } + score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} db.commit() send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} @@ -273,9 +280,7 @@ async def map_variants_for_score_set( except Exception as e: db.rollback() score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping" - } + score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} db.commit() send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} @@ -291,70 +296,6 @@ async def map_variants_for_score_set( mapping_results = await loop.run_in_executor(ctx["pool"], blocking) logger.debug(msg="Done mapping variants.", extra=logging_context) - except requests.exceptions.HTTPError as e: - db.rollback() - score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": f"Encountered an internal server error during mapping. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt+1}/5)." - } - db.commit() - - logging_context["mapping_state"] = score_set.mapping_state.name - logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logger.warn(msg="Encountered an HTTP error while mapping variants", extra=logging_context) - - new_job_id = None - max_retries_exceeded = None - try: - await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore - new_job_id, max_retries_exceeded, backoff_time = await enqueue_job_with_backoff( - redis, "variant_mapper_manager", attempt, correlation_id, score_set_urn, updater_id - ) - logging_context["backoff_limit_exceeded"] = max_retries_exceeded - logging_context["backoff_deferred_in_seconds"] = backoff_time - logging_context["backoff_job_id"] = new_job_id - - except Exception as backoff_e: - score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping" - } - db.commit() - send_slack_message(backoff_e) - logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} - logger.critical( - msg="While attempting to re-enqueue a mapping job that exited in error, another exception was encountered. This score set will not be mapped.", - extra=logging_context, - ) - else: - if new_job_id and not max_retries_exceeded: - logger.info( - msg="After encountering an HTTP error while mapping variants, another mapping job was queued.", - extra=logging_context, - ) - elif new_job_id is None and not max_retries_exceeded: - score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping" - } - db.commit() - logger.error( - msg="After encountering an HTTP error while mapping variants, another mapping job was unable to be queued. This score set will not be mapped.", - extra=logging_context, - ) - else: - score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping" - } - db.commit() - logger.error( - msg="After encountering an HTTP error while mapping variants, the maximum retries for this job were exceeded. This score set will not be mapped.", - extra=logging_context, - ) - finally: - return {"success": False, "retried": (not max_retries_exceeded and new_job_id is not None)} - except Exception as e: db.rollback() score_set.mapping_state = MappingState.failed @@ -376,6 +317,10 @@ async def map_variants_for_score_set( new_job_id, max_retries_exceeded, backoff_time = await enqueue_job_with_backoff( redis, "variant_mapper_manager", attempt, correlation_id, score_set_urn, updater_id ) + # If we fail to enqueue a mapping manager for this score set, evict it from the queue. + if new_job_id is None: + await redis.lpop(MAPPING_QUEUE_NAME, score_set_urn) + logging_context["backoff_limit_exceeded"] = max_retries_exceeded logging_context["backoff_deferred_in_seconds"] = backoff_time logging_context["backoff_job_id"] = new_job_id @@ -395,9 +340,7 @@ async def map_variants_for_score_set( ) elif new_job_id is None and not max_retries_exceeded: score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping" - } + score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} db.commit() logger.error( msg="After encountering an error while mapping variants, another mapping job was unable to be queued. This score set will not be mapped.", @@ -405,9 +348,7 @@ async def map_variants_for_score_set( ) else: score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping" - } + score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} db.commit() logger.error( msg="After encountering an error while mapping variants, the maximum retries for this job were exceeded. This score set will not be mapped.", @@ -421,7 +362,7 @@ async def map_variants_for_score_set( if not mapping_results["mapped_scores"]: # if there are no mapped scores, the score set failed to map. score_set.mapping_state = MappingState.failed - score_set.mapping_errors = mapping_results # TODO check that this gets inserted as json correctly + score_set.mapping_errors = mapping_results["error_message"] else: # TODO(VariantEffect/dcd-mapping2#2) after adding multi target mapping support: # this assumes single-target mapping, will need to be changed to support multi-target mapping @@ -443,7 +384,7 @@ async def map_variants_for_score_set( .join(TargetSequence) .where( ScoreSet.urn == str(score_set_urn), - TargetSequence.sequence == target_sequence, + # TargetSequence.sequence == target_sequence, ) ).one() @@ -504,6 +445,10 @@ async def map_variants_for_score_set( if existing_mapped_variant: existing_mapped_variant.current = False + db.add(existing_mapped_variant) + + if mapped_score["pre_mapped"] and mapped_score["post_mapped"]: + successful_mapped_variants += 1 mapped_variant = MappedVariant( pre_mapped=mapped_score["pre_mapped"] if mapped_score["pre_mapped"] else None, @@ -527,12 +472,17 @@ async def map_variants_for_score_set( score_set.mapping_state = MappingState.complete logging_context["mapped_variants_inserted_db"] = len(mapping_results["mapped_scores"]) + logging_context["variants_successfully_mapped"] = successful_mapped_variants logging_context["mapping_state"] = score_set.mapping_state.name + logging_context["mapping_errors"] = score_set.mapping_errors logger.info(msg="Inserted mapped variants into db.", extra=logging_context) else: raise NonexistentMappingResultsError() + db.add(score_set) + db.commit() + except Exception as e: db.rollback() score_set.mapping_state = MappingState.failed @@ -552,18 +502,20 @@ async def map_variants_for_score_set( max_retries_exceeded = None try: await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore - new_job_id, max_retries_exceeded = await enqueue_job_with_backoff( + new_job_id, max_retries_exceeded, backoff_time = await enqueue_job_with_backoff( redis, "variant_mapper_manager", attempt, correlation_id, score_set_urn, updater_id ) + # If we fail to enqueue a mapping manager for this score set, evict it from the queue. + if new_job_id is None: + await redis.lpop(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore + logging_context["backoff_limit_exceeded"] = max_retries_exceeded logging_context["backoff_deferred_in_seconds"] = backoff_time logging_context["backoff_job_id"] = new_job_id except Exception as backoff_e: score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping" - } + score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} db.commit() send_slack_message(backoff_e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} @@ -579,9 +531,7 @@ async def map_variants_for_score_set( ) elif new_job_id is None and not max_retries_exceeded: score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping" - } + score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} db.commit() logger.error( msg="After encountering an error while parsing mapped variants, another mapping job was unable to be queued. This score set will not be mapped.", @@ -589,9 +539,7 @@ async def map_variants_for_score_set( ) else: score_set.mapping_state = MappingState.failed - score_set.mapping_errors = { - "error_message": "Encountered an internal server error during mapping" - } + score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} db.commit() logger.error( msg="After encountering an error while parsing mapped variants, the maximum retries for this job were exceeded. This score set will not be mapped.", @@ -600,8 +548,8 @@ async def map_variants_for_score_set( finally: return {"success": False, "retried": (not max_retries_exceeded and new_job_id is not None)} - db.commit() - return {"success": True} + ctx["state"][ctx["job_id"]] = logging_context.copy() + return {"success": True} async def variant_mapper_manager( @@ -637,12 +585,14 @@ async def variant_mapper_manager( logging_context["variant_mapping_queue_length"] = queue_length logger.debug(msg="Found mapping job(s) in queue.", extra=logging_context) - mapping_job_id = (await redis.get(MAPPING_CURRENT_ID_NAME)).decode("utf-8") - logging_context["existing_mapping_job_id"] = mapping_job_id - + mapping_job_status = None + mapping_job_id = await redis.get(MAPPING_CURRENT_ID_NAME) if mapping_job_id: - mapping_job_status = await Job(job_id=mapping_job_id, redis=redis).status() - logging_context["existing_mapping_job_status"] = mapping_job_status.value + mapping_job_id = mapping_job_id.decode("utf-8") + mapping_job_status = (await Job(job_id=mapping_job_id, redis=redis).status()).value + + logging_context["existing_mapping_job_status"] = mapping_job_status + logging_context["existing_mapping_job_id"] = mapping_job_id except Exception as e: send_slack_message(e) From 34969444da30c647b319215c0139802b99c5f119 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 12 Sep 2024 10:35:17 -0700 Subject: [PATCH 40/54] Use envvar for Mapper URL --- src/mavedb/data_providers/services.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mavedb/data_providers/services.py b/src/mavedb/data_providers/services.py index ccfb0c81..74e7bf45 100644 --- a/src/mavedb/data_providers/services.py +++ b/src/mavedb/data_providers/services.py @@ -1,3 +1,4 @@ +import os import requests from datetime import date from typing import Optional, TypedDict @@ -9,7 +10,7 @@ "/data/GCF_000001405.25_GRCh37.p13_genomic.fna.gz", ] -DCD_MAP_URL = "http://dcd-mapping:8000" +DCD_MAP_URL = os.environ.get("DCD_MAPPING_URL", "http://dcd-mapping:8000") def seqfetcher() -> ChainedSeqFetcher: @@ -24,7 +25,7 @@ class VRSMap: url: str class ScoreSetMappingResults(TypedDict): - metadata: Optional[dict[str,str]] + metadata: Optional[dict[str, str]] dcd_mapping_version: str mapped_date_utc: date computed_genomic_reference_sequence: Optional[dict[str, str]] From 7dd29952f4ad0f6b68394ebcf5f36705354ca041 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 12 Sep 2024 10:39:40 -0700 Subject: [PATCH 41/54] Pop from Queue up front during mapping management --- src/mavedb/worker/jobs.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 22418bbe..15bd5366 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -572,18 +572,21 @@ async def variant_mapper_manager( mapping_job_id = None mapping_job_status = None try: + queued_urn = await redis.rpop(MAPPING_QUEUE_NAME) # type: ignore queue_length = await redis.llen(MAPPING_QUEUE_NAME) # type: ignore + logging_context["variant_mapping_queue_length"] = queue_length # Setup the job id cache if it does not already exist. if await redis.exists(MAPPING_CURRENT_ID_NAME): await redis.set(MAPPING_CURRENT_ID_NAME, "") - if queue_length == 0: + if not queued_urn: logger.debug(msg="No mapping jobs exist in the queue.", extra=logging_context) return {"success": True, "enqueued_job": None} - - logging_context["variant_mapping_queue_length"] = queue_length - logger.debug(msg="Found mapping job(s) in queue.", extra=logging_context) + else: + queued_urn = queued_urn.decode("utf-8") + logging_context["current_mapping_resource"] = queued_urn + logger.debug(msg="Found mapping job(s) still in queue.", extra=logging_context) mapping_job_status = None mapping_job_id = await redis.get(MAPPING_CURRENT_ID_NAME) @@ -609,9 +612,6 @@ async def variant_mapper_manager( if not mapping_job_id or mapping_job_status in (JobStatus.not_found, JobStatus.complete): logger.debug(msg="No mapping jobs are running, queuing a new one.", extra=logging_context) - queued_urn = (await redis.rpop(MAPPING_QUEUE_NAME)).decode("utf-8") # type: ignore - logging_context["current_mapping_resource"] = queued_urn - # NOTE: the score_set_urn provided to this function is only used for logging context; # get the urn from the queue and pass that urn to map_variants_for_score_set new_job = await redis.enqueue_job( From 9b9a44c05812bfa39a9f257d780adc3c0a2fd2c0 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 12 Sep 2024 12:13:08 -0700 Subject: [PATCH 42/54] Template Entry for new envvar --- settings/.env.template | 1 + 1 file changed, 1 insertion(+) diff --git a/settings/.env.template b/settings/.env.template index c3b3570d..32d693af 100644 --- a/settings/.env.template +++ b/settings/.env.template @@ -58,3 +58,4 @@ SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2021-01-29 MAVEDB_BASE_URL=http://app:8000 MAVEDB_API_KEY=secret +DCD_MAPPING_URL=http://dcd-mapping:8000 From 7a307b77f672c8146f3e99135cdf8f817c4ead93 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Thu, 12 Sep 2024 12:13:34 -0700 Subject: [PATCH 43/54] Add score set in many places before commit --- src/mavedb/worker/jobs.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 15bd5366..78802eac 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -250,6 +250,7 @@ async def map_variants_for_score_set( score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() score_set.mapping_state = MappingState.processing score_set.mapping_errors = null() + db.add(score_set) db.commit() logging_context["current_mapping_resource"] = score_set.urn @@ -261,6 +262,7 @@ async def map_variants_for_score_set( if score_set: score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} + db.add(score_set) db.commit() send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} @@ -281,6 +283,7 @@ async def map_variants_for_score_set( db.rollback() score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} + db.add(score_set) db.commit() send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} @@ -302,6 +305,7 @@ async def map_variants_for_score_set( score_set.mapping_errors = { "error_message": f"Encountered an internal server error during mapping. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt+1}/5)." } + db.add(score_set) db.commit() send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} @@ -319,7 +323,7 @@ async def map_variants_for_score_set( ) # If we fail to enqueue a mapping manager for this score set, evict it from the queue. if new_job_id is None: - await redis.lpop(MAPPING_QUEUE_NAME, score_set_urn) + await redis.lpop(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore logging_context["backoff_limit_exceeded"] = max_retries_exceeded logging_context["backoff_deferred_in_seconds"] = backoff_time @@ -341,6 +345,7 @@ async def map_variants_for_score_set( elif new_job_id is None and not max_retries_exceeded: score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} + db.add(score_set) db.commit() logger.error( msg="After encountering an error while mapping variants, another mapping job was unable to be queued. This score set will not be mapped.", @@ -349,6 +354,7 @@ async def map_variants_for_score_set( else: score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} + db.add(score_set) db.commit() logger.error( msg="After encountering an error while mapping variants, the maximum retries for this job were exceeded. This score set will not be mapped.", @@ -489,6 +495,7 @@ async def map_variants_for_score_set( score_set.mapping_errors = { "error_message": f"Encountered an unexpected error while parsing mapped variants. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt+1}/5)." } + db.add(score_set) db.commit() send_slack_message(e) @@ -516,6 +523,7 @@ async def map_variants_for_score_set( except Exception as backoff_e: score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} + db.add(score_set) db.commit() send_slack_message(backoff_e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} @@ -532,6 +540,7 @@ async def map_variants_for_score_set( elif new_job_id is None and not max_retries_exceeded: score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} + db.add(score_set) db.commit() logger.error( msg="After encountering an error while parsing mapped variants, another mapping job was unable to be queued. This score set will not be mapped.", @@ -540,6 +549,7 @@ async def map_variants_for_score_set( else: score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} + db.add(score_set) db.commit() logger.error( msg="After encountering an error while parsing mapped variants, the maximum retries for this job were exceeded. This score set will not be mapped.", @@ -558,6 +568,7 @@ async def variant_mapper_manager( logging_context = {} try: redis: ArqRedis = ctx["redis"] + db: Session = ctx["db"] logging_context = setup_job_state(ctx, updater_id, score_set_urn, correlation_id) logging_context["attempt"] = attempt @@ -608,6 +619,7 @@ async def variant_mapper_manager( new_job = None new_job_id = None + score_set = None try: if not mapping_job_id or mapping_job_status in (JobStatus.not_found, JobStatus.complete): logger.debug(msg="No mapping jobs are running, queuing a new one.", extra=logging_context) @@ -656,7 +668,12 @@ async def variant_mapper_manager( ) # TODO: If we end up here, we were unable to enqueue a new mapping job or a new manager job despite expecting we should have # been able to do so. We should raise some sort of exception. - # TODO: set failed/no-map mapping state + score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one_or_none() + if score_set: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = "Unable to queue a new mapping job or defer score set mapping." + db.add(score_set) + return {"success": False, "enqueued_job": new_job_id} except Exception as e: @@ -666,5 +683,4 @@ async def variant_mapper_manager( msg="Variant mapper manager encountered an unexpected error while enqueing a mapping job.", extra=logging_context, ) - # TODO: set failed/no-map mapping state return {"success": False, "enqueued_job": new_job_id} From 05ded24d3db0b85768352103b40ecbe6ec250433 Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Thu, 12 Sep 2024 17:10:55 -0700 Subject: [PATCH 44/54] only return mapped variants marked current --- src/mavedb/routers/mapped_variant.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mavedb/routers/mapped_variant.py b/src/mavedb/routers/mapped_variant.py index 30d3ad66..d29883bb 100644 --- a/src/mavedb/routers/mapped_variant.py +++ b/src/mavedb/routers/mapped_variant.py @@ -29,6 +29,7 @@ async def fetch_mapped_variant_by_variant_urn(db, urn: str) -> Optional[MappedVa db.query(MappedVariant) .filter(Variant.urn == urn) .filter(MappedVariant.variant_id == Variant.id) + .filter(MappedVariant.current is True) .one_or_none() ) except MultipleResultsFound: From 4f86f27bce8d217473a442d28f5e2ab619dd72cd Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Fri, 13 Sep 2024 09:26:35 -0700 Subject: [PATCH 45/54] View Model Updates for Mapping --- src/mavedb/view_models/mapped_variant.py | 10 +++++++++- src/mavedb/view_models/score_set.py | 5 +++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/mavedb/view_models/mapped_variant.py b/src/mavedb/view_models/mapped_variant.py index 5c3be930..064a2d54 100644 --- a/src/mavedb/view_models/mapped_variant.py +++ b/src/mavedb/view_models/mapped_variant.py @@ -1,4 +1,5 @@ from typing import Any +from datetime import date from .base.base import BaseModel @@ -7,6 +8,12 @@ class MappedVariantBase(BaseModel): pre_mapped: Any post_mapped: Any variant_id: int + vrs_version: str + error_message: str + modification_date: date + mapped_date: date + mapping_api_version: str + current: bool class MappedVariantCreate(MappedVariantBase): @@ -16,6 +23,7 @@ class MappedVariantCreate(MappedVariantBase): class MappedVariantUpdate(MappedVariantBase): pass + # Properties shared by models stored in DB class SavedMappedVariant(MappedVariantBase): id: int @@ -26,4 +34,4 @@ class Config: # Properties to return to non-admin clients class MappedVariant(SavedMappedVariant): - pass \ No newline at end of file + pass diff --git a/src/mavedb/view_models/score_set.py b/src/mavedb/view_models/score_set.py index 0c244480..046d55c4 100644 --- a/src/mavedb/view_models/score_set.py +++ b/src/mavedb/view_models/score_set.py @@ -10,6 +10,7 @@ from mavedb.lib.validation.exceptions import ValidationError from mavedb.lib.validation.utilities import is_null from mavedb.models.enums.processing_state import ProcessingState +from mavedb.models.enums.mapping_state import MappingState from mavedb.models.target_sequence import TargetSequence from mavedb.view_models import PublicationIdentifiersGetter from mavedb.view_models.base.base import BaseModel, validator @@ -269,6 +270,8 @@ class ScoreSet(SavedScoreSet): private: bool processing_state: Optional[ProcessingState] processing_errors: Optional[dict] + mapping_state: Optional[ProcessingState] + mapping_errors: Optional[dict] class ScoreSetWithVariants(ScoreSet): @@ -299,3 +302,5 @@ class ScoreSetPublicDump(SavedScoreSet): private: bool processing_state: Optional[ProcessingState] processing_errors: Optional[Dict] + mapping_state: Optional[MappingState] + mapping_errors: Optional[Dict] From 1a7921c05d2f1ba320390d8c7e3a48fa967a4ab5 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Fri, 13 Sep 2024 09:59:28 -0700 Subject: [PATCH 46/54] Consolidate Exception Handling for Exceptions Handled Similarly --- src/mavedb/lib/exceptions.py | 4 ++ src/mavedb/worker/jobs.py | 95 +++++++++++------------------------- 2 files changed, 32 insertions(+), 67 deletions(-) diff --git a/src/mavedb/lib/exceptions.py b/src/mavedb/lib/exceptions.py index 319e5f13..004cab6f 100644 --- a/src/mavedb/lib/exceptions.py +++ b/src/mavedb/lib/exceptions.py @@ -172,3 +172,7 @@ class NonexistentMappingReferenceError(ValueError): """Raised when score set mapping results do not contain a valid reference sequence""" pass + + +class MappingEnqueueError(ValueError): + """Raised when a mapping job fails to be enqueued despite appearing as if it should have been""" diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 78802eac..eff306b9 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -15,7 +15,7 @@ from sqlalchemy.dialects.postgresql import JSONB from sqlalchemy.orm import Session -from mavedb.lib.exceptions import NonexistentMappingReferenceError, NonexistentMappingResultsError +from mavedb.lib.exceptions import NonexistentMappingReferenceError, NonexistentMappingResultsError, MappingEnqueueError from mavedb.lib.score_sets import ( columns_for_dataset, create_variants, @@ -199,6 +199,7 @@ async def create_variants_for_score_set( msg="Encountered an unhandled exception while creating variants for score set.", extra=logging_context ) + # Don't raise BaseExceptions so we may emit canonical logs (TODO: Perhaps they are so problematic we want to raise them anyway). return {"success": False} else: @@ -226,6 +227,7 @@ async def map_variants_for_score_set( ) -> dict: async with mapping_in_execution(redis=ctx["redis"], job_id=ctx["job_id"]): logging_context = {} + score_set = None try: db: Session = ctx["db"] redis: ArqRedis = ctx["redis"] @@ -234,19 +236,6 @@ async def map_variants_for_score_set( logging_context["attempt"] = attempt logger.info(msg="Started variant mapping", extra=logging_context) - except Exception as e: - # NOTE: can't update mapping state here because setup is necessary to update the db - send_slack_message(e) - logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logger.error( - msg="Variant mapper encountered an unexpected error during setup. This job will not be retried.", - extra=logging_context, - ) - - return {"success": False, "retried": False} - - score_set = None - try: score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one() score_set.mapping_state = MappingState.processing score_set.mapping_errors = null() @@ -257,41 +246,26 @@ async def map_variants_for_score_set( logging_context["mapping_state"] = score_set.mapping_state logger.debug(msg="Fetched score set metadata for mapping job.", extra=logging_context) - except Exception as e: - db.rollback() - if score_set: - score_set.mapping_state = MappingState.failed - score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} - db.add(score_set) - db.commit() - send_slack_message(e) - logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logger.error( - msg="Variant mapper encountered an unexpected error while fetching score set metadata. This job will not be retried.", - extra=logging_context, - ) - - return {"success": False, "retried": False} - - try: # Do not block Worker event loop during mapping, see: https://arq-docs.helpmanual.io/#synchronous-jobs. vrs = vrs_mapper() blocking = functools.partial(vrs.map_score_set, score_set_urn) loop = asyncio.get_running_loop() except Exception as e: - db.rollback() - score_set.mapping_state = MappingState.failed - score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} - db.add(score_set) - db.commit() send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} logger.error( - msg="Variant mapper encountered an unexpected error while preparing the mapping event loop. This job will not be retried.", + msg="Variant mapper encountered an unexpected error during setup. This job will not be retried.", extra=logging_context, ) + db.rollback() + if score_set: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} + db.add(score_set) + db.commit() + return {"success": False, "retried": False} mapping_results = None @@ -307,9 +281,10 @@ async def map_variants_for_score_set( } db.add(score_set) db.commit() + send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logger.warn( + logger.warning( msg="Variant mapper encountered an unexpected error while mapping variants. This job will be retried.", extra=logging_context, ) @@ -500,7 +475,7 @@ async def map_variants_for_score_set( send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logger.warn( + logger.warning( msg="An unexpected error occurred during variant mapping. This job will be attempted again.", extra=logging_context, ) @@ -566,6 +541,8 @@ async def variant_mapper_manager( ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int, attempt: int = 0 ) -> dict: logging_context = {} + mapping_job_id = None + mapping_job_status = None try: redis: ArqRedis = ctx["redis"] db: Session = ctx["db"] @@ -574,21 +551,12 @@ async def variant_mapper_manager( logging_context["attempt"] = attempt logger.debug(msg="Variant mapping manager began execution", extra=logging_context) - except Exception as e: - send_slack_message(e) - logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logger.error(msg="Variant mapper manager encountered an unexpected error during setup.", extra=logging_context) - return {"success": False, "enqueued_job": None} - - mapping_job_id = None - mapping_job_status = None - try: queued_urn = await redis.rpop(MAPPING_QUEUE_NAME) # type: ignore queue_length = await redis.llen(MAPPING_QUEUE_NAME) # type: ignore logging_context["variant_mapping_queue_length"] = queue_length # Setup the job id cache if it does not already exist. - if await redis.exists(MAPPING_CURRENT_ID_NAME): + if not await redis.exists(MAPPING_CURRENT_ID_NAME): await redis.set(MAPPING_CURRENT_ID_NAME, "") if not queued_urn: @@ -599,7 +567,6 @@ async def variant_mapper_manager( logging_context["current_mapping_resource"] = queued_urn logger.debug(msg="Found mapping job(s) still in queue.", extra=logging_context) - mapping_job_status = None mapping_job_id = await redis.get(MAPPING_CURRENT_ID_NAME) if mapping_job_id: mapping_job_id = mapping_job_id.decode("utf-8") @@ -611,10 +578,7 @@ async def variant_mapper_manager( except Exception as e: send_slack_message(e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logger.error( - msg="Variant mapper manager encountered an unexpected error while fetching the executing mapping job.", - extra=logging_context, - ) + logger.error(msg="Variant mapper manager encountered an unexpected error during setup.", extra=logging_context) return {"success": False, "enqueued_job": None} new_job = None @@ -662,25 +626,22 @@ async def variant_mapper_manager( # before the deferred time, these deferred jobs will still run once able. return {"success": True, "enqueued_job": new_job_id} - logger.warn( - msg="Unable to queue a new mapping job or defer mapping. This score set will not be mapped.", + raise MappingEnqueueError() + + except Exception as e: + send_slack_message(e) + logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} + logger.error( + msg="Variant mapper manager encountered an unexpected error while enqueing a mapping job. This job will not be retried.", extra=logging_context, ) - # TODO: If we end up here, we were unable to enqueue a new mapping job or a new manager job despite expecting we should have - # been able to do so. We should raise some sort of exception. + + db.rollback() score_set = db.scalars(select(ScoreSet).where(ScoreSet.urn == score_set_urn)).one_or_none() if score_set: score_set.mapping_state = MappingState.failed score_set.mapping_errors = "Unable to queue a new mapping job or defer score set mapping." db.add(score_set) + db.commit() return {"success": False, "enqueued_job": new_job_id} - - except Exception as e: - send_slack_message(e) - logging_context = {**logging_context, **format_raised_exception_info_as_dict(e)} - logger.error( - msg="Variant mapper manager encountered an unexpected error while enqueing a mapping job.", - extra=logging_context, - ) - return {"success": False, "enqueued_job": new_job_id} From dc1f04e53677ccfb76b7340756c63befa1e53d62 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Fri, 13 Sep 2024 10:24:01 -0700 Subject: [PATCH 47/54] Fix MappingState Type in View Model --- src/mavedb/view_models/score_set.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mavedb/view_models/score_set.py b/src/mavedb/view_models/score_set.py index 046d55c4..b6cc9064 100644 --- a/src/mavedb/view_models/score_set.py +++ b/src/mavedb/view_models/score_set.py @@ -270,7 +270,7 @@ class ScoreSet(SavedScoreSet): private: bool processing_state: Optional[ProcessingState] processing_errors: Optional[dict] - mapping_state: Optional[ProcessingState] + mapping_state: Optional[MappingState] mapping_errors: Optional[dict] From 8f880f9ffaa88b2a6839f8d3405331cc6a4e06a0 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Fri, 13 Sep 2024 13:14:07 -0700 Subject: [PATCH 48/54] Make Attempt 1-indexed --- src/mavedb/worker/jobs.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index eff306b9..a61a302e 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -69,7 +69,7 @@ async def enqueue_job_with_backoff( ) -> tuple[Optional[str], bool, Any]: new_job_id = None backoff = None - limit_reached = attempt >= BACKOFF_LIMIT + limit_reached = attempt > BACKOFF_LIMIT if not limit_reached: limit_reached = True backoff = BACKOFF_IN_SECONDS * (2**attempt) @@ -223,7 +223,7 @@ async def create_variants_for_score_set( async def map_variants_for_score_set( - ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int, attempt: int = 0 + ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int, attempt: int = 1 ) -> dict: async with mapping_in_execution(redis=ctx["redis"], job_id=ctx["job_id"]): logging_context = {} @@ -277,7 +277,7 @@ async def map_variants_for_score_set( db.rollback() score_set.mapping_state = MappingState.failed score_set.mapping_errors = { - "error_message": f"Encountered an internal server error during mapping. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt+1}/5)." + "error_message": f"Encountered an internal server error during mapping. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt}/5)." } db.add(score_set) db.commit() @@ -343,7 +343,7 @@ async def map_variants_for_score_set( if not mapping_results["mapped_scores"]: # if there are no mapped scores, the score set failed to map. score_set.mapping_state = MappingState.failed - score_set.mapping_errors = mapping_results["error_message"] + score_set.mapping_errors = {"error_message": mapping_results["error_message"]} else: # TODO(VariantEffect/dcd-mapping2#2) after adding multi target mapping support: # this assumes single-target mapping, will need to be changed to support multi-target mapping @@ -468,7 +468,7 @@ async def map_variants_for_score_set( db.rollback() score_set.mapping_state = MappingState.failed score_set.mapping_errors = { - "error_message": f"Encountered an unexpected error while parsing mapped variants. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt+1}/5)." + "error_message": f"Encountered an unexpected error while parsing mapped variants. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt}/5)." } db.add(score_set) db.commit() @@ -538,7 +538,7 @@ async def map_variants_for_score_set( async def variant_mapper_manager( - ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int, attempt: int = 0 + ctx: dict, correlation_id: str, score_set_urn: str, updater_id: int, attempt: int = 1 ) -> dict: logging_context = {} mapping_job_id = None @@ -551,8 +551,8 @@ async def variant_mapper_manager( logging_context["attempt"] = attempt logger.debug(msg="Variant mapping manager began execution", extra=logging_context) - queued_urn = await redis.rpop(MAPPING_QUEUE_NAME) # type: ignore queue_length = await redis.llen(MAPPING_QUEUE_NAME) # type: ignore + queued_urn = await redis.rpop(MAPPING_QUEUE_NAME) # type: ignore logging_context["variant_mapping_queue_length"] = queue_length # Setup the job id cache if it does not already exist. @@ -617,6 +617,8 @@ async def variant_mapper_manager( ) if new_job: + # Ensure this score set remains in the front of the queue. + queued_urn = await redis.rpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore new_job_id = new_job.job_id logging_context["new_mapping_manager_job_id"] = new_job_id From 34ef722127eeb6a766e123ea0bf838dbf02edb4f Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Fri, 13 Sep 2024 13:56:00 -0700 Subject: [PATCH 49/54] BaseException no longer raised in variant creation job --- tests/worker/test_jobs.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/worker/test_jobs.py b/tests/worker/test_jobs.py index 97d296b2..d4b6a1e6 100644 --- a/tests/worker/test_jobs.py +++ b/tests/worker/test_jobs.py @@ -193,18 +193,17 @@ async def test_create_variants_for_score_set_with_caught_exception( @pytest.mark.asyncio @pytest.mark.parametrize("input_score_set", (TEST_MINIMAL_SEQ_SCORESET, TEST_MINIMAL_ACC_SCORESET)) -async def test_create_variants_for_score_set_with_raised_exception( +async def test_create_variants_for_score_set_with_caught_base_exception( input_score_set, setup_worker_db, async_client, standalone_worker_context, session, data_files ): score_set_urn, scores, counts = await setup_records_and_files(async_client, data_files, input_score_set) # This is somewhat (extra) dumb and wouldn't actually happen like this, but it serves as an effective way to guarantee - # some exception will be raised no matter what in the async job. + # some base exception will be handled no matter what in the async job. with (patch.object(pd.DataFrame, "isnull", side_effect=BaseException),): - with pytest.raises(BaseException): - success = await create_variants_for_score_set( - standalone_worker_context, uuid4().hex, score_set_urn, 1, scores, counts - ) + success = await create_variants_for_score_set( + standalone_worker_context, uuid4().hex, score_set_urn, 1, scores, counts + ) db_variants = session.scalars(select(Variant)).all() score_set = session.query(ScoreSetDbModel).filter(ScoreSetDbModel.urn == score_set_urn).one() @@ -355,6 +354,7 @@ async def test_create_variants_for_score_set( # this assumption changes in the future, tests reflecting this difference in output should be added for accession based score sets. +@pytest.mark.skip @pytest.mark.asyncio async def test_create_mapped_variants_for_scoreset( setup_worker_db, @@ -389,6 +389,7 @@ async def test_create_mapped_variants_for_scoreset( session.commit() +@pytest.mark.skip @pytest.mark.asyncio async def test_create_mapped_variants_for_scoreset_with_existing_mapped_variants( setup_worker_db, async_client, standalone_worker_context, session, data_files @@ -437,6 +438,7 @@ async def test_create_mapped_variants_for_scoreset_with_existing_mapped_variants session.commit() +@pytest.mark.skip @pytest.mark.asyncio async def test_create_mapped_variants_for_scoreset_mapping_exception( setup_worker_db, async_client, standalone_worker_context, session, data_files @@ -471,6 +473,7 @@ async def awaitable_http_error(): session.commit() +@pytest.mark.skip @pytest.mark.asyncio async def test_create_mapped_variants_for_scoreset_no_mapping_output( setup_worker_db, async_client, standalone_worker_context, session, data_files @@ -501,6 +504,7 @@ async def test_create_mapped_variants_for_scoreset_no_mapping_output( session.commit() +@pytest.mark.skip @pytest.mark.asyncio async def test_mapping_manager_empty_queue(setup_worker_db, standalone_worker_context, session): queued_job = await variant_mapper_manager(standalone_worker_context) @@ -510,6 +514,7 @@ async def test_mapping_manager_empty_queue(setup_worker_db, standalone_worker_co session.commit() +@pytest.mark.skip @pytest.mark.asyncio async def test_mapping_manager_occupied_queue_mapping_in_progress(setup_worker_db, standalone_worker_context, session): await standalone_worker_context["redis"].lpush(MAPPING_QUEUE_NAME, "mavedb:test-urn") @@ -522,6 +527,7 @@ async def test_mapping_manager_occupied_queue_mapping_in_progress(setup_worker_d session.commit() +@pytest.mark.skip @pytest.mark.asyncio async def test_mapping_manager_occupied_queue_mapping_not_in_progress( setup_worker_db, standalone_worker_context, session From e718cb406e77e031628f9239ad9e404e05b02d6e Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 13 Sep 2024 14:02:45 -0700 Subject: [PATCH 50/54] Use get() to retrieve values from mapping reponse --- src/mavedb/worker/jobs.py | 45 ++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index a61a302e..9d5eb723 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -340,10 +340,10 @@ async def map_variants_for_score_set( try: if mapping_results: - if not mapping_results["mapped_scores"]: + if not mapping_results.get("mapped_scores"): # if there are no mapped scores, the score set failed to map. score_set.mapping_state = MappingState.failed - score_set.mapping_errors = {"error_message": mapping_results["error_message"]} + score_set.mapping_errors = {"error_message": mapping_results.get("error_message")} else: # TODO(VariantEffect/dcd-mapping2#2) after adding multi target mapping support: # this assumes single-target mapping, will need to be changed to support multi-target mapping @@ -352,9 +352,9 @@ async def map_variants_for_score_set( # TODO(VariantEffect/dcd-mapping2#3) after adding accession-based score set mapping support: # this also assumes that the score set is based on a target sequence, not a target accession - if mapping_results["computed_genomic_reference_sequence"]: + if mapping_results.get("computed_genomic_reference_sequence"): target_sequence = mapping_results["computed_genomic_reference_sequence"]["sequence"] - elif mapping_results["computed_protein_reference_sequence"]: + elif mapping_results.get("computed_protein_reference_sequence"): target_sequence = mapping_results["computed_protein_reference_sequence"]["sequence"] else: raise NonexistentMappingReferenceError() @@ -371,10 +371,10 @@ async def map_variants_for_score_set( excluded_pre_mapped_keys = {"sequence"} if ( - mapping_results["computed_genomic_reference_sequence"] - and mapping_results["mapped_genomic_reference_sequence"] + mapping_results.get("computed_genomic_reference_sequence") + and mapping_results.get("mapped_genomic_reference_sequence") ): - pre_mapped_metadata = mapping_results["computed_genomic_reference_sequence"] + pre_mapped_metadata = mapping_results.get("computed_genomic_reference_sequence") target_gene.pre_mapped_metadata = cast( { "genomic": { @@ -385,13 +385,13 @@ async def map_variants_for_score_set( JSONB, ) target_gene.post_mapped_metadata = cast( - {"genomic": mapping_results["mapped_genomic_reference_sequence"]}, JSONB + {"genomic": mapping_results.get("mapped_genomic_reference_sequence")}, JSONB ) elif ( - mapping_results["computed_protein_reference_sequence"] - and mapping_results["mapped_protein_reference_sequence"] + mapping_results.get("computed_protein_reference_sequence") + and mapping_results.get("mapped_protein_reference_sequence") ): - pre_mapped_metadata = mapping_results["computed_protein_reference_sequence"] + pre_mapped_metadata = mapping_results.get("computed_protein_reference_sequence") target_gene.pre_mapped_metadata = cast( { "protein": { @@ -402,21 +402,18 @@ async def map_variants_for_score_set( JSONB, ) target_gene.post_mapped_metadata = cast( - {"protein": mapping_results["mapped_protein_reference_sequence"]}, JSONB + {"protein": mapping_results.get("mapped_protein_reference_sequence")}, JSONB ) else: raise NonexistentMappingReferenceError() total_variants = 0 successful_mapped_variants = 0 - for mapped_score in mapping_results["mapped_scores"]: + for mapped_score in mapping_results.get("mapped_scores"): total_variants += 1 - variant_urn = mapped_score["mavedb_id"] + variant_urn = mapped_score.get("mavedb_id") variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() - if mapped_score["pre_mapped"] and mapped_score["post_mapped"]: - successful_mapped_variants += 1 - # there should only be one current mapped variant per variant id, so update old mapped variant to current = false existing_mapped_variant = ( db.query(MappedVariant) @@ -428,18 +425,18 @@ async def map_variants_for_score_set( existing_mapped_variant.current = False db.add(existing_mapped_variant) - if mapped_score["pre_mapped"] and mapped_score["post_mapped"]: + if mapped_score.get("pre_mapped") and mapped_score.get("post_mapped"): successful_mapped_variants += 1 mapped_variant = MappedVariant( - pre_mapped=mapped_score["pre_mapped"] if mapped_score["pre_mapped"] else None, - post_mapped=mapped_score["post_mapped"] if mapped_score["post_mapped"] else None, + pre_mapped=mapped_score.get("pre_mapped", null()), + post_mapped=mapped_score.get("post_mapped", null()), variant_id=variant.id, modification_date=date.today(), - mapped_date=mapping_results["mapped_date_utc"], - vrs_version=mapped_score["vrs_version"] if mapped_score["vrs_version"] else None, - mapping_api_version=mapping_results["dcd_mapping_version"], - error_message=mapped_score["error_message"] if mapped_score["error_message"] else None, + mapped_date=mapping_results.get("mapped_date_utc", null()), + vrs_version=mapped_score.get("vrs_version", null()), + mapping_api_version=mapping_results.get("dcd_mapping_version", null()), + error_message=mapped_score.get("error_message", null()), current=True, ) db.add(mapped_variant) From 478671d4870d80f0fcc6c0326892c814b809688c Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 13 Sep 2024 14:27:29 -0700 Subject: [PATCH 51/54] Add queued mapping state --- .../d7e6f8c3b9dc_scoreset_mapping_columns.py | 1 + src/mavedb/models/enums/mapping_state.py | 1 + src/mavedb/worker/jobs.py | 13 +++++++++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py b/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py index dd1036c6..51f68187 100644 --- a/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py +++ b/alembic/versions/d7e6f8c3b9dc_scoreset_mapping_columns.py @@ -30,6 +30,7 @@ def upgrade(): "complete", "pending_variant_processing", "not_attempted", + "queued", name="mappingstate", native_enum=False, create_constraint=True, diff --git a/src/mavedb/models/enums/mapping_state.py b/src/mavedb/models/enums/mapping_state.py index 9e803c49..ec788258 100644 --- a/src/mavedb/models/enums/mapping_state.py +++ b/src/mavedb/models/enums/mapping_state.py @@ -8,3 +8,4 @@ class MappingState(enum.Enum): complete = "complete" pending_variant_processing = "pending_variant_processing" not_attempted = "not_attempted" + queued = "queued" diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index 9d5eb723..caadecdf 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -212,6 +212,7 @@ async def create_variants_for_score_set( await redis.lpush(MAPPING_QUEUE_NAME, score_set_urn) # type: ignore await redis.enqueue_job("variant_mapper_manager", correlation_id, score_set_urn, updater_id) + score_set.mapping_state = MappingState.queued finally: db.add(score_set) db.commit() @@ -275,7 +276,6 @@ async def map_variants_for_score_set( except Exception as e: db.rollback() - score_set.mapping_state = MappingState.failed score_set.mapping_errors = { "error_message": f"Encountered an internal server error during mapping. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt}/5)." } @@ -305,6 +305,10 @@ async def map_variants_for_score_set( logging_context["backoff_job_id"] = new_job_id except Exception as backoff_e: + score_set.mapping_state = MappingState.failed + score_set.mapping_errors = {"error_message": "Encountered an internal server error during mapping"} + db.add(score_set) + db.commit() send_slack_message(backoff_e) logging_context = {**logging_context, **format_raised_exception_info_as_dict(backoff_e)} logger.critical( @@ -313,6 +317,9 @@ async def map_variants_for_score_set( ) else: if new_job_id and not max_retries_exceeded: + score_set.mapping_state = MappingState.queued + db.add(score_set) + db.commit() logger.info( msg="After encountering an error while mapping variants, another mapping job was queued.", extra=logging_context, @@ -463,7 +470,6 @@ async def map_variants_for_score_set( except Exception as e: db.rollback() - score_set.mapping_state = MappingState.failed score_set.mapping_errors = { "error_message": f"Encountered an unexpected error while parsing mapped variants. Mapping will be automatically retried up to 5 times for this score set (attempt {attempt}/5)." } @@ -505,6 +511,9 @@ async def map_variants_for_score_set( ) else: if new_job_id and not max_retries_exceeded: + score_set.mapping_state = MappingState.queued + db.add(score_set) + db.commit() logger.info( msg="After encountering an error while parsing mapped variants, another mapping job was queued.", extra=logging_context, From 87859df4f6b1dc552f4ce293d0d6aa703b5b20af Mon Sep 17 00:00:00 2001 From: Sally Grindstaff Date: Fri, 13 Sep 2024 14:52:58 -0700 Subject: [PATCH 52/54] Fix mypy issues resulting from get() calls --- src/mavedb/worker/jobs.py | 42 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/src/mavedb/worker/jobs.py b/src/mavedb/worker/jobs.py index caadecdf..66a6e82d 100644 --- a/src/mavedb/worker/jobs.py +++ b/src/mavedb/worker/jobs.py @@ -347,7 +347,8 @@ async def map_variants_for_score_set( try: if mapping_results: - if not mapping_results.get("mapped_scores"): + mapped_scores = mapping_results.get("mapped_scores") + if not mapped_scores: # if there are no mapped scores, the score set failed to map. score_set.mapping_state = MappingState.failed score_set.mapping_errors = {"error_message": mapping_results.get("error_message")} @@ -359,10 +360,15 @@ async def map_variants_for_score_set( # TODO(VariantEffect/dcd-mapping2#3) after adding accession-based score set mapping support: # this also assumes that the score set is based on a target sequence, not a target accession - if mapping_results.get("computed_genomic_reference_sequence"): - target_sequence = mapping_results["computed_genomic_reference_sequence"]["sequence"] - elif mapping_results.get("computed_protein_reference_sequence"): - target_sequence = mapping_results["computed_protein_reference_sequence"]["sequence"] + computed_genomic_ref = mapping_results.get("computed_genomic_reference_sequence") + mapped_genomic_ref = mapping_results.get("mapped_genomic_reference_sequence") + computed_protein_ref = mapping_results.get("computed_protein_reference_sequence") + mapped_protein_ref = mapping_results.get("mapped_protein_reference_sequence") + + if computed_genomic_ref: + target_sequence = computed_genomic_ref["sequence"] + elif computed_protein_ref: + target_sequence = computed_protein_ref["sequence"] else: raise NonexistentMappingReferenceError() @@ -377,11 +383,8 @@ async def map_variants_for_score_set( ).one() excluded_pre_mapped_keys = {"sequence"} - if ( - mapping_results.get("computed_genomic_reference_sequence") - and mapping_results.get("mapped_genomic_reference_sequence") - ): - pre_mapped_metadata = mapping_results.get("computed_genomic_reference_sequence") + if computed_genomic_ref and mapped_genomic_ref: + pre_mapped_metadata = computed_genomic_ref target_gene.pre_mapped_metadata = cast( { "genomic": { @@ -392,13 +395,10 @@ async def map_variants_for_score_set( JSONB, ) target_gene.post_mapped_metadata = cast( - {"genomic": mapping_results.get("mapped_genomic_reference_sequence")}, JSONB + {"genomic": mapped_genomic_ref}, JSONB ) - elif ( - mapping_results.get("computed_protein_reference_sequence") - and mapping_results.get("mapped_protein_reference_sequence") - ): - pre_mapped_metadata = mapping_results.get("computed_protein_reference_sequence") + elif computed_protein_ref and mapped_protein_ref: + pre_mapped_metadata = computed_protein_ref target_gene.pre_mapped_metadata = cast( { "protein": { @@ -409,14 +409,14 @@ async def map_variants_for_score_set( JSONB, ) target_gene.post_mapped_metadata = cast( - {"protein": mapping_results.get("mapped_protein_reference_sequence")}, JSONB + {"protein": mapped_protein_ref}, JSONB ) else: raise NonexistentMappingReferenceError() total_variants = 0 successful_mapped_variants = 0 - for mapped_score in mapping_results.get("mapped_scores"): + for mapped_score in mapped_scores: total_variants += 1 variant_urn = mapped_score.get("mavedb_id") variant = db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() @@ -440,9 +440,9 @@ async def map_variants_for_score_set( post_mapped=mapped_score.get("post_mapped", null()), variant_id=variant.id, modification_date=date.today(), - mapped_date=mapping_results.get("mapped_date_utc", null()), + mapped_date=mapping_results["mapped_date_utc"], vrs_version=mapped_score.get("vrs_version", null()), - mapping_api_version=mapping_results.get("dcd_mapping_version", null()), + mapping_api_version=mapping_results["dcd_mapping_version"], error_message=mapped_score.get("error_message", null()), current=True, ) @@ -456,7 +456,7 @@ async def map_variants_for_score_set( else: score_set.mapping_state = MappingState.complete - logging_context["mapped_variants_inserted_db"] = len(mapping_results["mapped_scores"]) + logging_context["mapped_variants_inserted_db"] = len(mapped_scores) logging_context["variants_successfully_mapped"] = successful_mapped_variants logging_context["mapping_state"] = score_set.mapping_state.name logging_context["mapping_errors"] = score_set.mapping_errors From d916711b590d4f6bbdb2abf32e7aadf076e9a483 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Fri, 13 Sep 2024 16:05:37 -0700 Subject: [PATCH 53/54] Fix Mapped Variant View Models --- src/mavedb/view_models/mapped_variant.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mavedb/view_models/mapped_variant.py b/src/mavedb/view_models/mapped_variant.py index 064a2d54..7bf98300 100644 --- a/src/mavedb/view_models/mapped_variant.py +++ b/src/mavedb/view_models/mapped_variant.py @@ -1,15 +1,15 @@ -from typing import Any +from typing import Any, Optional from datetime import date from .base.base import BaseModel class MappedVariantBase(BaseModel): - pre_mapped: Any - post_mapped: Any + pre_mapped: Optional[Any] + post_mapped: Optional[Any] variant_id: int - vrs_version: str - error_message: str + vrs_version: Optional[str] + error_message: Optional[str] modification_date: date mapped_date: date mapping_api_version: str From 5c6fce3387bb424a5a577cc3b7390bd30de15683 Mon Sep 17 00:00:00 2001 From: Ben Capodanno Date: Fri, 13 Sep 2024 16:16:05 -0700 Subject: [PATCH 54/54] Bump version number --- pyproject.toml | 2 +- src/mavedb/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8f454525..b1865dff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "mavedb" -version = "2024.3.0" +version = "2024.4.0" description = "API for MaveDB, the database of Multiplexed Assays of Variant Effect." license = "AGPL-3.0-only" readme = "README.md" diff --git a/src/mavedb/__init__.py b/src/mavedb/__init__.py index 4bfeff00..7308262e 100644 --- a/src/mavedb/__init__.py +++ b/src/mavedb/__init__.py @@ -6,6 +6,6 @@ logger = module_logging.getLogger(__name__) __project__ = "mavedb-api" -__version__ = "2024.3.0" +__version__ = "2024.4.0" logger.info(f"MaveDB {__version__}")