From 8eeed0841a05920e78f738d6753f1d1ab04fe485 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Fri, 24 Nov 2023 14:08:54 -0500 Subject: [PATCH] enhancement!: clean up __init__.py files (#302) --- docs/scripts/generate_normalize_figure.py | 2 +- src/gene/__init__.py | 65 +---------------------- src/gene/cli.py | 3 +- src/gene/database/dynamodb.py | 10 +++- src/gene/etl/base.py | 11 ++-- src/gene/etl/ensembl.py | 3 +- src/gene/etl/hgnc.py | 4 +- src/gene/etl/ncbi.py | 4 +- src/gene/main.py | 3 +- src/gene/query.py | 18 ++++--- src/gene/schemas.py | 27 ++++++++++ 11 files changed, 65 insertions(+), 85 deletions(-) diff --git a/docs/scripts/generate_normalize_figure.py b/docs/scripts/generate_normalize_figure.py index 1a39a085..bb05c7c1 100644 --- a/docs/scripts/generate_normalize_figure.py +++ b/docs/scripts/generate_normalize_figure.py @@ -12,8 +12,8 @@ import gravis as gv -from gene import APP_ROOT from gene.database import create_db +from gene.etl.base import APP_ROOT from gene.query import QueryHandler from gene.schemas import UnmergedNormalizationService diff --git a/src/gene/__init__.py b/src/gene/__init__.py index 2c569554..d3dab1ac 100644 --- a/src/gene/__init__.py +++ b/src/gene/__init__.py @@ -1,65 +1,4 @@ """The VICC library for normalizing genes.""" -import logging -from os import environ -from pathlib import Path +from .version import __version__ -from .version import __version__ # noqa: F401 - -APP_ROOT = Path(__file__).resolve().parent - -logging.basicConfig( - filename="gene.log", format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s" -) -logger = logging.getLogger("gene") -logger.setLevel(logging.DEBUG) -logger.handlers = [] - -logging.getLogger("boto3").setLevel(logging.INFO) -logging.getLogger("botocore").setLevel(logging.INFO) -logging.getLogger("urllib3").setLevel(logging.INFO) -logging.getLogger("python_jsonschema_objects").setLevel(logging.INFO) -logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO) -logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO) - - -SEQREPO_ROOT_DIR = Path( - environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest") -) - - -class DownloadException(Exception): # noqa: N818 - """Exception for failures relating to source file downloads.""" - - -from gene.schemas import ( # noqa: E402 - NamespacePrefix, - RefType, - SourceIDAfterNamespace, - SourceName, -) - -ITEM_TYPES = {k.lower(): v.value for k, v in RefType.__members__.items()} - -# Sources we import directly (HGNC, Ensembl, NCBI) -SOURCES = { - source.value.lower(): source.value for source in SourceName.__members__.values() -} - -# Set of sources we import directly -XREF_SOURCES = {src.lower() for src in SourceName.__members__} - -# use to fetch source name from schema based on concept id namespace -# e.g. {"hgnc": "HGNC"} -PREFIX_LOOKUP = { - v.value: SourceName[k].value - for k, v in NamespacePrefix.__members__.items() - if k in SourceName.__members__.keys() -} - -# use to generate namespace prefix from source ID value -# e.g. {"ensg": "ensembl"} -NAMESPACE_LOOKUP = { - v.value.lower(): NamespacePrefix[k].value - for k, v in SourceIDAfterNamespace.__members__.items() - if v.value != "" -} +__all__ = ["__version__"] diff --git a/src/gene/cli.py b/src/gene/cli.py index ceb3e911..30beb762 100644 --- a/src/gene/cli.py +++ b/src/gene/cli.py @@ -7,7 +7,6 @@ import click -from gene import SOURCES from gene.database import ( AbstractDatabase, DatabaseReadException, @@ -15,7 +14,7 @@ create_db, ) from gene.database.database import DatabaseException -from gene.schemas import SourceName +from gene.schemas import SOURCES, SourceName logger = logging.getLogger("gene") logger.setLevel(logging.DEBUG) diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py index 5df9e0d0..161972e1 100644 --- a/src/gene/database/dynamodb.py +++ b/src/gene/database/dynamodb.py @@ -11,7 +11,6 @@ from boto3.dynamodb.conditions import Key from botocore.exceptions import ClientError -from gene import ITEM_TYPES, PREFIX_LOOKUP from gene.database.database import ( AWS_ENV_VAR_NAME, SKIP_AWS_DB_ENV_NAME, @@ -23,7 +22,14 @@ DatabaseWriteException, confirm_aws_db_use, ) -from gene.schemas import RecordType, RefType, SourceMeta, SourceName +from gene.schemas import ( + ITEM_TYPES, + PREFIX_LOOKUP, + RecordType, + RefType, + SourceMeta, + SourceName, +) logger = logging.getLogger(__name__) diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py index 020cd3ed..f46f0044 100644 --- a/src/gene/etl/base.py +++ b/src/gene/etl/base.py @@ -6,7 +6,7 @@ import shutil from abc import ABC, abstractmethod from ftplib import FTP -from os import remove +from os import environ, remove from pathlib import Path from typing import Callable, Dict, List, Optional @@ -15,14 +15,19 @@ from dateutil import parser from gffutils.feature import Feature -from gene import ITEM_TYPES, SEQREPO_ROOT_DIR from gene.database import AbstractDatabase -from gene.schemas import Gene, GeneSequenceLocation, MatchType, SourceName +from gene.schemas import ITEM_TYPES, Gene, GeneSequenceLocation, MatchType, SourceName logger = logging.getLogger("gene") logger.setLevel(logging.DEBUG) +APP_ROOT = Path(__file__).resolve().parent +SEQREPO_ROOT_DIR = Path( + environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest") +) + + class Base(ABC): """The ETL base class.""" diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py index 8ff78f23..f75ed034 100644 --- a/src/gene/etl/ensembl.py +++ b/src/gene/etl/ensembl.py @@ -10,9 +10,8 @@ import requests from gffutils.feature import Feature -from gene import APP_ROOT from gene.database import AbstractDatabase -from gene.etl.base import Base +from gene.etl.base import APP_ROOT, Base from gene.etl.exceptions import ( GeneFileVersionError, GeneNormalizerEtlError, diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py index c78ce294..bfd07a7e 100644 --- a/src/gene/etl/hgnc.py +++ b/src/gene/etl/hgnc.py @@ -10,15 +10,15 @@ from dateutil import parser -from gene import APP_ROOT, PREFIX_LOOKUP from gene.database import AbstractDatabase -from gene.etl.base import Base +from gene.etl.base import APP_ROOT, Base from gene.etl.exceptions import ( GeneFileVersionError, GeneNormalizerEtlError, GeneSourceFetchError, ) from gene.schemas import ( + PREFIX_LOOKUP, Annotation, Chromosome, NamespacePrefix, diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py index d57bc614..45d3c308 100644 --- a/src/gene/etl/ncbi.py +++ b/src/gene/etl/ncbi.py @@ -9,15 +9,15 @@ import gffutils -from gene import APP_ROOT, PREFIX_LOOKUP from gene.database import AbstractDatabase -from gene.etl.base import Base +from gene.etl.base import APP_ROOT, Base from gene.etl.exceptions import ( GeneFileVersionError, GeneNormalizerEtlError, GeneSourceFetchError, ) from gene.schemas import ( + PREFIX_LOOKUP, Annotation, Chromosome, NamespacePrefix, diff --git a/src/gene/main.py b/src/gene/main.py index 59195068..8135da4e 100644 --- a/src/gene/main.py +++ b/src/gene/main.py @@ -4,10 +4,11 @@ from fastapi import FastAPI, HTTPException, Query -from gene import SOURCES, __version__ +from gene import __version__ from gene.database import create_db from gene.query import QueryHandler from gene.schemas import ( + SOURCES, NormalizeService, SearchService, SourceName, diff --git a/src/gene/query.py b/src/gene/query.py index 64c8b536..d402390d 100644 --- a/src/gene/query.py +++ b/src/gene/query.py @@ -1,4 +1,5 @@ """Provides methods for handling queries.""" +import logging import re from datetime import datetime from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar @@ -6,9 +7,11 @@ from ga4gh.core import core_models, ga4gh_identify from ga4gh.vrs import models -from gene import ITEM_TYPES, NAMESPACE_LOOKUP, PREFIX_LOOKUP, logger from gene.database import AbstractDatabase, DatabaseReadException from gene.schemas import ( + ITEM_TYPES, + NAMESPACE_LOOKUP, + PREFIX_LOOKUP, BaseGene, BaseNormalizationService, Gene, @@ -28,6 +31,7 @@ ) from gene.version import __version__ +_logger = logging.getLogger(__name__) NormService = TypeVar("NormService", bound=BaseNormalizationService) @@ -72,7 +76,7 @@ def _emit_warnings(query_str: str) -> List: "non_breaking_space_characters": "Query contains non-breaking space characters" } ] - logger.warning( + _logger.warning( f"Query ({query_str}) contains non-breaking space characters." ) return warnings @@ -188,14 +192,14 @@ def _fetch_record( try: match = self.db.get_record_by_id(concept_id, case_sensitive=False) except DatabaseReadException as e: - logger.error( + _logger.error( f"Encountered DatabaseReadException looking up {concept_id}: {e}" ) else: if match: self._add_record(response, match, match_type) else: - logger.error( + _logger.error( f"Unable to find expected record for {concept_id} matching as {match_type}" ) # noqa: E501 @@ -263,7 +267,7 @@ def _get_search_response(self, query: str, sources: Iterable[SourceName]) -> Dic matched_concept_ids.append(ref) except DatabaseReadException as e: - logger.error( + _logger.error( f"Encountered DatabaseReadException looking up {item_type}" f" {term}: {e}" ) @@ -492,7 +496,7 @@ def _handle_failed_merge_ref(record: Dict, response: Dict, query: str) -> Dict: :param query: original query value :return: response with no match """ - logger.error( + _logger.error( f"Merge ref lookup failed for ref {record['merge_ref']} " f"in record {record['concept_id']} from query {query}" ) @@ -557,7 +561,7 @@ def _resolve_merge( merge = self.db.get_record_by_id(merge_ref, False, True) if merge is None: query = response.query - logger.error( + _logger.error( f"Merge ref lookup failed for ref {record['merge_ref']} " f"in record {record['concept_id']} from query `{query}`" ) diff --git a/src/gene/schemas.py b/src/gene/schemas.py index 6f85b1bc..e6cb5183 100644 --- a/src/gene/schemas.py +++ b/src/gene/schemas.py @@ -147,6 +147,12 @@ class SourceName(Enum): NCBI = "NCBI" +# lowercase imported source name to correctly-cased name, e.g. {"ensembl": "Ensembl"} +SOURCES = { + source.value.lower(): source.value for source in SourceName.__members__.values() +} + + class SourcePriority(IntEnum): """Define priorities for sources when building merged concepts.""" @@ -196,6 +202,23 @@ class NamespacePrefix(Enum): RFAM = "rfam" +# use to fetch source name from schema based on concept id namespace +# e.g. {"hgnc": "HGNC"} +PREFIX_LOOKUP = { + v.value: SourceName[k].value + for k, v in NamespacePrefix.__members__.items() + if k in SourceName.__members__.keys() +} + +# use to generate namespace prefix from source ID value +# e.g. {"ensg": "ensembl"} +NAMESPACE_LOOKUP = { + v.value.lower(): NamespacePrefix[k].value + for k, v in SourceIDAfterNamespace.__members__.items() + if v.value != "" +} + + class DataLicenseAttributes(BaseModel): """Define constraints for data license attributes.""" @@ -222,6 +245,10 @@ class RefType(str, Enum): ASSOCIATED_WITH = "associated_with" +# collective name to singular name, e.g. {"previous_symbols": "prev_symbol"} +ITEM_TYPES = {k.lower(): v.value for k, v in RefType.__members__.items()} + + class SourceMeta(BaseModel): """Metadata for a given source to return in response object."""