diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2b0ba8a9..1c696108 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -10,5 +10,6 @@ repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.1.2
hooks:
- - id: ruff
- id: ruff-format
+ - id: ruff
+ args: [ --fix, --exit-non-zero-on-fix ]
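With this ordering the formatter hook runs before the lint hook, and the lint hook now applies autofixes (--fix) and exits non-zero whenever it changes a file (--exit-non-zero-on-fix), so pre-commit flags the commit for re-staging. Assuming pre-commit is installed, the hooks can be exercised locally with: pre-commit run --all-files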
diff --git a/docs/scripts/generate_normalize_figure.py b/docs/scripts/generate_normalize_figure.py
index 1a39a085..d5863a38 100644
--- a/docs/scripts/generate_normalize_figure.py
+++ b/docs/scripts/generate_normalize_figure.py
@@ -18,9 +18,9 @@
from gene.schemas import UnmergedNormalizationService
COLORS = [
- "#F8766D",
- "#00BA38",
- "#00B9E3",
+ '#F8766D',
+ '#00BA38',
+ '#00B9E3',
]
@@ -30,50 +30,50 @@ def create_gjgf(result: UnmergedNormalizationService) -> Dict:
:param result: result from Unmerged Normalization search
"""
graph = {
- "graph": {
- "label": "tmp",
- "nodes": {},
- "edges": [],
- "metadata": {
- "arrow_size": 15,
- "node_size": 15,
- "node_label_size": 20,
- "edge_size": 2,
+ 'graph': {
+ 'label': 'tmp',
+ 'nodes': {},
+ 'edges': [],
+ 'metadata': {
+ 'arrow_size': 15,
+ 'node_size': 15,
+ 'node_label_size': 20,
+ 'edge_size': 2,
},
}
}
- for i, (source, matches) in enumerate(result.source_matches.items()):
+ for i, (_, matches) in enumerate(result.source_matches.items()):
for match in matches.records:
- graph["graph"]["nodes"][match.concept_id] = {
- "metadata": {
- "color": COLORS[i],
- "hover": f"{match.concept_id}\n{match.symbol}\n{match.label}", # noqa: E501
- "click": f"
{json.dumps(match.model_dump(), indent=2)}
", # noqa: E501
+ graph['graph']['nodes'][match.concept_id] = {
+ 'metadata': {
+ 'color': COLORS[i],
+ 'hover': f'{match.concept_id}\n{match.symbol}\n{match.label}',
+ 'click': f"{json.dumps(match.model_dump(), indent=2)}
",
}
}
for xref in match.xrefs:
- graph["graph"]["edges"].append(
- {"source": match.concept_id, "target": xref}
+ graph['graph']['edges'].append(
+ {'source': match.concept_id, 'target': xref}
)
included_edges = []
- for edge in graph["graph"]["edges"]:
+ for edge in graph['graph']['edges']:
if (
- edge["target"] in graph["graph"]["nodes"]
- and edge["source"] in graph["graph"]["nodes"]
+ edge['target'] in graph['graph']['nodes']
+ and edge['source'] in graph['graph']['nodes']
):
included_edges.append(edge)
- graph["graph"]["edges"] = included_edges
+ graph['graph']['edges'] = included_edges
- included_nodes = {k["source"] for k in graph["graph"]["edges"]}.union(
- {k["target"] for k in graph["graph"]["edges"]}
+ included_nodes = {k['source'] for k in graph['graph']['edges']}.union(
+ {k['target'] for k in graph['graph']['edges']}
)
new_nodes = {}
- for key, value in graph["graph"]["nodes"].items():
+ for key, value in graph['graph']['nodes'].items():
if key in included_nodes:
new_nodes[key] = value
- graph["graph"]["nodes"] = new_nodes
+ graph['graph']['nodes'] = new_nodes
return graph
@@ -82,8 +82,8 @@ def gen_norm_figure() -> None:
"""Generate normalized graph figure for docs."""
q = QueryHandler(create_db())
- otx2p1 = "OTX2P1"
- otx2p2 = "OTX2P2"
+ otx2p1 = 'OTX2P1'
+ otx2p2 = 'OTX2P2'
otx2p1_result = q.normalize_unmerged(otx2p1)
otx2p2_result = q.normalize_unmerged(otx2p2)
@@ -91,15 +91,15 @@ def gen_norm_figure() -> None:
otx2p1_graph = create_gjgf(otx2p1_result)
otx2p2_graph = create_gjgf(otx2p2_result)
- nodes = otx2p1_graph["graph"]["nodes"]
- nodes.update(otx2p2_graph["graph"]["nodes"])
+ nodes = otx2p1_graph['graph']['nodes']
+ nodes.update(otx2p2_graph['graph']['nodes'])
graph = {
- "graph": {
- "label": f"Reference network for {otx2p1} and {otx2p2}",
- "metadata": otx2p1_graph["graph"]["metadata"],
- "nodes": nodes,
- "edges": otx2p1_graph["graph"]["edges"] + otx2p2_graph["graph"]["edges"],
+ 'graph': {
+ 'label': f'Reference network for {otx2p1} and {otx2p2}',
+ 'metadata': otx2p1_graph['graph']['metadata'],
+ 'nodes': nodes,
+ 'edges': otx2p1_graph['graph']['edges'] + otx2p2_graph['graph']['edges'],
}
}
@@ -107,20 +107,20 @@ def gen_norm_figure() -> None:
data=graph,
graph_height=250,
node_hover_neighborhood=True,
- node_label_font="arial",
+ node_label_font='arial',
)
fig.export_html(
(
APP_ROOT.parents[0]
- / "docs"
- / "source"
- / "_static"
- / "html"
- / "normalize_example.html"
+ / 'docs'
+ / 'source'
+ / '_static'
+ / 'html'
+ / 'normalize_example.html'
).absolute(),
overwrite=True,
)
-if __name__ == "__main__":
+if __name__ == '__main__':
gen_norm_figure()
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 46a04cce..9526290b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -57,14 +57,14 @@
],
}
# -- autodoc things ----------------------------------------------------------
-import os # noqa: E402
-import sys # noqa: E402
+import os
+import sys
sys.path.insert(0, os.path.abspath("../../gene"))
autodoc_preserve_defaults = True
# -- get version -------------------------------------------------------------
-from gene import __version__ # noqa: E402
+from gene import __version__
version = __version__
release = version
@@ -77,7 +77,7 @@ def linkcode_resolve(domain, info):
if not info["module"]:
return None
filename = info["module"].replace(".", "/")
- return f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py" # noqa: E501
+ return f"https://github.com/cancervariants/gene-normalization/blob/main/{filename}.py"
# -- code block style --------------------------------------------------------
diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst
index 2ec80652..af533cba 100644
--- a/docs/source/contributing.rst
+++ b/docs/source/contributing.rst
@@ -48,7 +48,7 @@ When running the web server, enable hot-reloading on new code changes: ::
Style
-----
-Code style is managed by `Ruff `_ and `Black `_, and should be checked via pre-commit hook before commits. Final QC is applied with GitHub Actions to every pull request.
+Code style is managed by `Ruff `_, and should be checked via pre-commit hook before commits. Final QC is applied with GitHub Actions to every pull request.
Tests
-----
diff --git a/pyproject.toml b/pyproject.toml
index 41c8f7fd..f1981eb6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -92,38 +92,44 @@ exclude = ["docs/source/conf.py"]
# pycodestyle (E, W)
# Pyflakes (F)
# flake8-annotations (ANN)
-# flake8-quotes (Q)
# pydocstyle (D)
# pep8-naming (N)
# isort (I)
-select = ["E", "W", "F", "ANN", "Q", "D", "N", "I"]
-
+select = ["E", "W", "F", "ANN", "D", "N", "I"]
fixable = ["I", "F401"]
# D203 - one-blank-line-before-class
# D205 - blank-line-after-summary
+# D206 - indent-with-spaces*
# D213 - multi-line-summary-second-line
+# D300 - triple-single-quotes
# D400 - ends-in-period
# D415 - ends-in-punctuation
# ANN101 - missing-type-self
# ANN003 - missing-type-kwargs
-# E501 - line-too-long
-ignore = ["D203", "D205", "D213", "D400", "D415", "ANN101", "ANN003", "E501"]
+# E111 - indentation-with-invalid-multiple*
+# E114 - indentation-with-invalid-multiple-comment*
+# E117 - over-indented*
+# E501 - line-too-long*
+# W191 - tab-indentation*
+# *ignored for compatibility with formatter
+ignore = [
+ "D203", "D205", "D206", "D213", "D300", "D400", "D415",
+ "ANN101", "ANN003",
+ "E111", "E114", "E117", "E501",
+ "W191"
+]
-[tool.ruff.flake8-quotes]
-docstring-quotes = "double"
+[tool.ruff.format]
+quote-style = "single"
[tool.ruff.per-file-ignores]
# ANN001 - missing-type-function-argument
+# ANN102 - missing-type-cls
# ANN2 - missing-return-type
# ANN201 - Missing type annotation
-# ANN102 - missing-type-cls
-# D103 - Missing docstring in public function
-# F821 - undefined-name
-# F401 - unused-import
-# I001 - Import block unsorted or unformatted
+# D301 - escape-sequence-in-docstring
# N805 - invalid-first-argument-name-for-method
"tests/*" = ["ANN001", "ANN102", "ANN2"]
-"*__init__.py" = ["F401"]
-"gene/schemas.py" = ["ANN001", "ANN201", "N805"]
-"docs/source/conf.py" = ["D100", "I001", "D103", "ANN201", "ANN001"]
+"src/gene/schemas.py" = ["ANN001", "ANN201", "N805"]
+"src/gene/cli.py" = ["D301"]
diff --git a/src/gene/__init__.py b/src/gene/__init__.py
index 2c569554..9dd82081 100644
--- a/src/gene/__init__.py
+++ b/src/gene/__init__.py
@@ -3,27 +3,25 @@
from os import environ
from pathlib import Path
-from .version import __version__ # noqa: F401
-
APP_ROOT = Path(__file__).resolve().parent
logging.basicConfig(
- filename="gene.log", format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s"
+ filename='gene.log', format='[%(asctime)s] - %(name)s - %(levelname)s : %(message)s'
)
-logger = logging.getLogger("gene")
+logger = logging.getLogger('gene')
logger.setLevel(logging.DEBUG)
logger.handlers = []
-logging.getLogger("boto3").setLevel(logging.INFO)
-logging.getLogger("botocore").setLevel(logging.INFO)
-logging.getLogger("urllib3").setLevel(logging.INFO)
-logging.getLogger("python_jsonschema_objects").setLevel(logging.INFO)
-logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO)
-logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO)
+logging.getLogger('boto3').setLevel(logging.INFO)
+logging.getLogger('botocore').setLevel(logging.INFO)
+logging.getLogger('urllib3').setLevel(logging.INFO)
+logging.getLogger('python_jsonschema_objects').setLevel(logging.INFO)
+logging.getLogger('biocommons.seqrepo.seqaliasdb.seqaliasdb').setLevel(logging.INFO)
+logging.getLogger('biocommons.seqrepo.fastadir.fastadir').setLevel(logging.INFO)
SEQREPO_ROOT_DIR = Path(
- environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest")
+ environ.get('SEQREPO_ROOT_DIR', '/usr/local/share/seqrepo/latest')
)
@@ -61,5 +59,5 @@ class DownloadException(Exception): # noqa: N818
NAMESPACE_LOOKUP = {
v.value.lower(): NamespacePrefix[k].value
for k, v in SourceIDAfterNamespace.__members__.items()
- if v.value != ""
+ if v.value != ''
}
diff --git a/src/gene/cli.py b/src/gene/cli.py
index a14ce952..f19a6537 100644
--- a/src/gene/cli.py
+++ b/src/gene/cli.py
@@ -17,13 +17,13 @@
from gene.database.database import DatabaseException
from gene.schemas import SourceName
-logger = logging.getLogger("gene")
+logger = logging.getLogger('gene')
logger.setLevel(logging.DEBUG)
@click.command()
-@click.option("--db_url", help="URL endpoint for the application database.")
-@click.option("--verbose", "-v", is_flag=True, help="Print result to console if set.")
+@click.option('--db_url', help='URL endpoint for the application database.')
+@click.option('--verbose', '-v', is_flag=True, help='Print result to console if set.')
def check_db(db_url: str, verbose: bool = False) -> None:
"""Perform basic checks on DB health and population. Exits with status code 1
if DB schema is uninitialized or if critical tables appear to be empty.
@@ -31,25 +31,25 @@ def check_db(db_url: str, verbose: bool = False) -> None:
\f
:param db_url: URL to normalizer database
:param verbose: if true, print result to console
- """ # noqa: D301
+ """
db = create_db(db_url, False)
if not db.check_schema_initialized():
if verbose:
- click.echo("Health check failed: DB schema uninitialized.")
+ click.echo('Health check failed: DB schema uninitialized.')
click.get_current_context().exit(1)
if not db.check_tables_populated():
if verbose:
- click.echo("Health check failed: DB is incompletely populated.")
+ click.echo('Health check failed: DB is incompletely populated.')
click.get_current_context().exit(1)
if verbose:
- click.echo("DB health check successful: tables appear complete.")
+ click.echo('DB health check successful: tables appear complete.')
@click.command()
-@click.option("--data_url", help="URL to data dump")
-@click.option("--db_url", help="URL endpoint for the application database.")
+@click.option('--data_url', help='URL to data dump')
+@click.option('--db_url', help='URL endpoint for the application database.')
def update_from_remote(data_url: Optional[str], db_url: str) -> None:
"""Update data from remotely-hosted DB dump. By default, fetches from latest
available dump on VICC S3 bucket; specific URLs can be provided instead by
@@ -58,52 +58,52 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None:
\f
:param data_url: user-specified location to pull DB dump from
:param db_url: URL to normalizer database
- """ # noqa: D301
- if not click.confirm("Are you sure you want to overwrite existing data?"):
+ """
+ if not click.confirm('Are you sure you want to overwrite existing data?'):
click.get_current_context().exit()
if not data_url:
- data_url = os.environ.get("GENE_NORM_REMOTE_DB_URL")
+ data_url = os.environ.get('GENE_NORM_REMOTE_DB_URL')
db = create_db(db_url, False)
try:
db.load_from_remote(data_url)
except NotImplementedError:
click.echo(
- f"Error: Fetching remote data dump not supported for {db.__class__.__name__}"
- ) # noqa: E501
+ f'Error: Fetching remote data dump not supported for {db.__class__.__name__}'
+ )
click.get_current_context().exit(1)
except DatabaseException as e:
- click.echo(f"Encountered exception during update: {str(e)}")
+ click.echo(f'Encountered exception during update: {str(e)}')
click.get_current_context().exit(1)
@click.command()
@click.option(
- "--output_directory",
- "-o",
- help="Output location to write to",
+ '--output_directory',
+ '-o',
+ help='Output location to write to',
type=click.Path(exists=True, path_type=Path),
)
-@click.option("--db_url", help="URL endpoint for the application database.")
+@click.option('--db_url', help='URL endpoint for the application database.')
def dump_database(output_directory: Path, db_url: str) -> None:
"""Dump data from database into file.
\f
:param output_directory: path to existing directory
:param db_url: URL to normalizer database
- """ # noqa: D301
+ """
if not output_directory:
- output_directory = Path(".")
+ output_directory = Path('.')
db = create_db(db_url, False)
try:
db.export_db(output_directory)
except NotImplementedError:
click.echo(
- f"Error: Dumping data to file not supported for {db.__class__.__name__}"
- ) # noqa: E501
+ f'Error: Dumping data to file not supported for {db.__class__.__name__}'
+ )
click.get_current_context().exit(1)
except DatabaseException as e:
- click.echo(f"Encountered exception during update: {str(e)}")
+ click.echo(f'Encountered exception during update: {str(e)}')
click.get_current_context().exit(1)
@@ -138,20 +138,20 @@ def _delete_source(n: SourceName, db: AbstractDatabase) -> float:
:param db: database instance
:return: time taken (in seconds) to delete
"""
- msg = f"Deleting {n.value}..."
- click.echo(f"\n{msg}")
+ msg = f'Deleting {n.value}...'
+ click.echo(f'\n{msg}')
logger.info(msg)
start_delete = timer()
db.delete_source(n)
end_delete = timer()
delete_time = end_delete - start_delete
- msg = f"Deleted {n.value} in {delete_time:.5f} seconds."
- click.echo(f"{msg}\n")
+ msg = f'Deleted {n.value} in {delete_time:.5f} seconds.'
+ click.echo(f'{msg}\n')
logger.info(msg)
return delete_time
-_etl_dependency_help = "Are ETL dependencies installed? See the Installation page in the documentation for more info."
+_etl_dependency_help = 'Are ETL dependencies installed? See the Installation page in the documentation for more info.'
def _load_source(
@@ -170,7 +170,7 @@ def _load_source(
:param use_existing: if True, use most recent local data files instead of
fetching from remote
"""
- msg = f"Loading {n.value}..."
+ msg = f'Loading {n.value}...'
click.echo(msg)
logger.info(msg)
start_load = timer()
@@ -181,7 +181,7 @@ def _load_source(
from gene.etl.exceptions import GeneNormalizerEtlError
except ModuleNotFoundError as e:
click.echo(
- f"Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}"
+ f'Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}'
)
click.get_current_context().exit()
SourceClass = eval(n.value) # noqa: N806
@@ -191,14 +191,14 @@ def _load_source(
processed_ids += source.perform_etl(use_existing)
except GeneNormalizerEtlError as e:
logger.error(e)
- click.echo(f"Encountered error while loading {n}: {e}.")
+ click.echo(f'Encountered error while loading {n}: {e}.')
click.get_current_context().exit()
end_load = timer()
load_time = end_load - start_load
- msg = f"Loaded {n.value} in {load_time:.5f} seconds."
+ msg = f'Loaded {n.value} in {load_time:.5f} seconds.'
click.echo(msg)
logger.info(msg)
- msg = f"Total time for {n.value}: {(delete_time + load_time):.5f} seconds."
+ msg = f'Total time for {n.value}: {(delete_time + load_time):.5f} seconds.'
click.echo(msg)
logger.info(msg)
@@ -208,15 +208,15 @@ def _delete_normalized_data(database: AbstractDatabase) -> None:
:param database: DB instance
"""
- click.echo("\nDeleting normalized records...")
+ click.echo('\nDeleting normalized records...')
start_delete = timer()
try:
database.delete_normalized_concepts()
except (DatabaseReadException, DatabaseWriteException) as e:
- click.echo(f"Encountered exception during normalized data deletion: {e}")
+ click.echo(f'Encountered exception during normalized data deletion: {e}')
end_delete = timer()
delete_time = end_delete - start_delete
- click.echo(f"Deleted normalized records in {delete_time:.5f} seconds.")
+ click.echo(f'Deleted normalized records in {delete_time:.5f} seconds.')
def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None:
@@ -234,34 +234,34 @@ def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None:
from gene.etl.merge import Merge
except ModuleNotFoundError as e:
click.echo(
- f"Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}"
+ f'Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}'
)
click.get_current_context().exit()
merge = Merge(database=db)
- click.echo("Constructing normalized records...")
+ click.echo('Constructing normalized records...')
merge.create_merged_concepts(processed_ids)
end = timer()
click.echo(
- f"Merged concept generation completed in " f"{(end - start):.5f} seconds"
+ f'Merged concept generation completed in ' f'{(end - start):.5f} seconds'
)
@click.command()
-@click.option("--sources", help="The source(s) you wish to update separated by spaces.")
-@click.option("--aws_instance", is_flag=True, help="Using AWS DynamodDB instance.")
-@click.option("--db_url", help="URL endpoint for the application database.")
-@click.option("--update_all", is_flag=True, help="Update all normalizer sources.")
+@click.option('--sources', help='The source(s) you wish to update separated by spaces.')
+@click.option('--aws_instance', is_flag=True, help='Using AWS DynamodDB instance.')
+@click.option('--db_url', help='URL endpoint for the application database.')
+@click.option('--update_all', is_flag=True, help='Update all normalizer sources.')
@click.option(
- "--update_merged",
+ '--update_merged',
is_flag=True,
- help="Update concepts for normalize endpoint from accepted sources.",
+ help='Update concepts for normalize endpoint from accepted sources.',
)
@click.option(
- "--use_existing",
+ '--use_existing',
is_flag=True,
default=False,
- help="Use most recent local source data instead of fetching latest version",
+ help='Use most recent local source data instead of fetching latest version',
)
def update_normalizer_db(
sources: str,
@@ -285,7 +285,7 @@ def update_normalizer_db(
:param update_all: if true, update all sources (ignore `normalizer` parameter)
:param update_merged: if true, update normalized records
:param use_existing: if True, use most recent local data instead of fetching latest version
- """ # noqa: D301
+ """
db = create_db(db_url, aws_instance)
if update_all:
@@ -296,24 +296,24 @@ def update_normalizer_db(
else:
ctx = click.get_current_context()
click.echo(
- "Must either enter 1 or more sources, or use `--update_all` parameter"
- ) # noqa: E501
+ 'Must either enter 1 or more sources, or use `--update_all` parameter'
+ )
click.echo(ctx.get_help())
ctx.exit()
else:
sources_split = sources.lower().split()
if len(sources_split) == 0:
- raise Exception("Must enter 1 or more source names to update")
+ raise Exception('Must enter 1 or more source names to update')
non_sources = set(sources_split) - set(SOURCES)
if len(non_sources) != 0:
- raise Exception(f"Not valid source(s): {non_sources}")
+ raise Exception(f'Not valid source(s): {non_sources}')
parsed_source_names = {SourceName(SOURCES[s]) for s in sources_split}
_update_normalizer(parsed_source_names, db, update_merged, use_existing)
-if __name__ == "__main__":
+if __name__ == '__main__':
update_normalizer_db()
diff --git a/src/gene/database/__init__.py b/src/gene/database/__init__.py
index 3a71e721..216d9fb2 100644
--- a/src/gene/database/__init__.py
+++ b/src/gene/database/__init__.py
@@ -1,10 +1 @@
"""Provide database clients."""
-from .database import (
- AWS_ENV_VAR_NAME,
- AbstractDatabase,
- DatabaseException,
- DatabaseInitializationException,
- DatabaseReadException,
- DatabaseWriteException,
- create_db,
-)
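With these package-level re-exports removed, call sites import from the concrete module instead, as cli.py above already does for DatabaseException. A minimal sketch of the resulting import path, assuming the src/ layout shown in this diff:

    from gene.database.database import AbstractDatabase, create_db

    db: AbstractDatabase = create_db()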
diff --git a/src/gene/database/database.py b/src/gene/database/database.py
index 67bcafd6..93ef3cdf 100644
--- a/src/gene/database/database.py
+++ b/src/gene/database/database.py
@@ -61,12 +61,12 @@ def _check_delete_okay() -> bool:
:raise DatabaseWriteException: if skip confirmation variable is set -- manual
approval is required.
"""
- if environ.get(AWS_ENV_VAR_NAME, "") == AwsEnvName.PRODUCTION:
- if environ.get(SKIP_AWS_DB_ENV_NAME, "") == "true":
+ if environ.get(AWS_ENV_VAR_NAME, '') == AwsEnvName.PRODUCTION:
+ if environ.get(SKIP_AWS_DB_ENV_NAME, '') == 'true':
raise DatabaseWriteException(
- f"Must unset {SKIP_AWS_DB_ENV_NAME} env variable to enable drop_db()" # noqa: E501
+ f'Must unset {SKIP_AWS_DB_ENV_NAME} env variable to enable drop_db()'
)
- return click.confirm("Are you sure you want to delete existing data?")
+ return click.confirm('Are you sure you want to delete existing data?')
else:
return True
@@ -242,19 +242,19 @@ def export_db(self, export_location: Path) -> None:
# can be set to either `Dev`, `Staging`, or `Prod`
# ONLY set when wanting to access aws instance
-AWS_ENV_VAR_NAME = "GENE_NORM_ENV"
+AWS_ENV_VAR_NAME = 'GENE_NORM_ENV'
# Set to "true" if want to skip db confirmation check. Should ONLY be used for
# deployment needs
-SKIP_AWS_DB_ENV_NAME = "SKIP_AWS_CONFIRMATION"
+SKIP_AWS_DB_ENV_NAME = 'SKIP_AWS_CONFIRMATION'
class AwsEnvName(str, Enum):
"""AWS environment name that is being used"""
- DEVELOPMENT = "Dev"
- STAGING = "Staging"
- PRODUCTION = "Prod"
+ DEVELOPMENT = 'Dev'
+ STAGING = 'Staging'
+ PRODUCTION = 'Prod'
VALID_AWS_ENV_NAMES = {v.value for v in AwsEnvName.__members__.values()}
@@ -263,11 +263,11 @@ class AwsEnvName(str, Enum):
def confirm_aws_db_use(env_name: str) -> None:
"""Check to ensure that AWS instance should actually be used."""
if click.confirm(
- f"Are you sure you want to use the AWS {env_name} database?", default=False
+ f'Are you sure you want to use the AWS {env_name} database?', default=False
):
- click.echo(f"***GENE AWS {env_name.upper()} DATABASE IN USE***")
+ click.echo(f'***GENE AWS {env_name.upper()} DATABASE IN USE***')
else:
- click.echo("Exiting.")
+ click.echo('Exiting.')
sys.exit()
@@ -324,13 +324,13 @@ def create_db(
else:
if db_url:
endpoint_url = db_url
- elif "GENE_NORM_DB_URL" in environ.keys():
- endpoint_url = environ["GENE_NORM_DB_URL"]
+ elif 'GENE_NORM_DB_URL' in environ.keys():
+ endpoint_url = environ['GENE_NORM_DB_URL']
else:
- endpoint_url = "http://localhost:8000"
+ endpoint_url = 'http://localhost:8000'
# prefer DynamoDB unless connection explicitly reads like a libpq URI
- if endpoint_url.startswith("postgres"):
+ if endpoint_url.startswith('postgres'):
from gene.database.postgresql import PostgresDatabase
db = PostgresDatabase(endpoint_url)
diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py
index 5df9e0d0..b7658aa2 100644
--- a/src/gene/database/dynamodb.py
+++ b/src/gene/database/dynamodb.py
@@ -39,48 +39,48 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None:
* region_name: AWS region (defaults to "us-east-2")
:raise DatabaseInitializationException: if initial setup fails
"""
- self.gene_table = environ.get("GENE_DYNAMO_TABLE", "gene_normalizer")
- region_name = db_args.get("region_name", "us-east-2")
+ self.gene_table = environ.get('GENE_DYNAMO_TABLE', 'gene_normalizer')
+ region_name = db_args.get('region_name', 'us-east-2')
if AWS_ENV_VAR_NAME in environ:
- if "GENE_TEST" in environ:
+ if 'GENE_TEST' in environ:
raise DatabaseInitializationException(
- f"Cannot have both GENE_TEST and {AWS_ENV_VAR_NAME} set."
- ) # noqa: E501
+ f'Cannot have both GENE_TEST and {AWS_ENV_VAR_NAME} set.'
+ )
aws_env = environ[AWS_ENV_VAR_NAME]
if aws_env not in VALID_AWS_ENV_NAMES:
raise DatabaseInitializationException(
- f"{AWS_ENV_VAR_NAME} must be one of {VALID_AWS_ENV_NAMES}"
- ) # noqa: E501
+ f'{AWS_ENV_VAR_NAME} must be one of {VALID_AWS_ENV_NAMES}'
+ )
skip_confirmation = environ.get(SKIP_AWS_DB_ENV_NAME)
if (not skip_confirmation) or (
- skip_confirmation and skip_confirmation != "true"
- ): # noqa: E501
+ skip_confirmation and skip_confirmation != 'true'
+ ):
confirm_aws_db_use(environ[AWS_ENV_VAR_NAME])
- boto_params = {"region_name": region_name}
+ boto_params = {'region_name': region_name}
if aws_env == AwsEnvName.DEVELOPMENT:
self.gene_table = environ.get(
- "GENE_DYNAMO_TABLE", "gene_normalizer_nonprod"
+ 'GENE_DYNAMO_TABLE', 'gene_normalizer_nonprod'
)
else:
if db_url:
endpoint_url = db_url
- elif "GENE_NORM_DB_URL" in environ:
- endpoint_url = environ["GENE_NORM_DB_URL"]
+ elif 'GENE_NORM_DB_URL' in environ:
+ endpoint_url = environ['GENE_NORM_DB_URL']
else:
- endpoint_url = "http://localhost:8000"
- click.echo(f"***Using Gene Database Endpoint: {endpoint_url}***")
- boto_params = {"region_name": region_name, "endpoint_url": endpoint_url}
+ endpoint_url = 'http://localhost:8000'
+ click.echo(f'***Using Gene Database Endpoint: {endpoint_url}***')
+ boto_params = {'region_name': region_name, 'endpoint_url': endpoint_url}
- self.dynamodb = boto3.resource("dynamodb", **boto_params)
- self.dynamodb_client = boto3.client("dynamodb", **boto_params)
+ self.dynamodb = boto3.resource('dynamodb', **boto_params)
+ self.dynamodb_client = boto3.client('dynamodb', **boto_params)
# Only create tables for local instance
- envs_do_not_create_tables = {AWS_ENV_VAR_NAME, "GENE_TEST"}
+ envs_do_not_create_tables = {AWS_ENV_VAR_NAME, 'GENE_TEST'}
if not set(envs_do_not_create_tables) & set(environ):
self.initialize_db()
@@ -94,7 +94,7 @@ def list_tables(self) -> List[str]:
:return: Table names in DynamoDB
"""
- return self.dynamodb_client.list_tables()["TableNames"]
+ return self.dynamodb_client.list_tables()['TableNames']
def drop_db(self) -> None:
"""Delete all tables from database. Requires manual confirmation.
@@ -116,36 +116,36 @@ def _create_genes_table(self) -> None:
self.dynamodb.create_table(
TableName=self.gene_table,
KeySchema=[
- {"AttributeName": "label_and_type", "KeyType": "HASH"}, # Partition key
- {"AttributeName": "concept_id", "KeyType": "RANGE"}, # Sort key
+ {'AttributeName': 'label_and_type', 'KeyType': 'HASH'}, # Partition key
+ {'AttributeName': 'concept_id', 'KeyType': 'RANGE'}, # Sort key
],
AttributeDefinitions=[
- {"AttributeName": "label_and_type", "AttributeType": "S"},
- {"AttributeName": "concept_id", "AttributeType": "S"},
- {"AttributeName": "src_name", "AttributeType": "S"},
- {"AttributeName": "item_type", "AttributeType": "S"},
+ {'AttributeName': 'label_and_type', 'AttributeType': 'S'},
+ {'AttributeName': 'concept_id', 'AttributeType': 'S'},
+ {'AttributeName': 'src_name', 'AttributeType': 'S'},
+ {'AttributeName': 'item_type', 'AttributeType': 'S'},
],
GlobalSecondaryIndexes=[
{
- "IndexName": "src_index",
- "KeySchema": [{"AttributeName": "src_name", "KeyType": "HASH"}],
- "Projection": {"ProjectionType": "KEYS_ONLY"},
- "ProvisionedThroughput": {
- "ReadCapacityUnits": 10,
- "WriteCapacityUnits": 10,
+ 'IndexName': 'src_index',
+ 'KeySchema': [{'AttributeName': 'src_name', 'KeyType': 'HASH'}],
+ 'Projection': {'ProjectionType': 'KEYS_ONLY'},
+ 'ProvisionedThroughput': {
+ 'ReadCapacityUnits': 10,
+ 'WriteCapacityUnits': 10,
},
},
{
- "IndexName": "item_type_index",
- "KeySchema": [{"AttributeName": "item_type", "KeyType": "HASH"}],
- "Projection": {"ProjectionType": "KEYS_ONLY"},
- "ProvisionedThroughput": {
- "ReadCapacityUnits": 10,
- "WriteCapacityUnits": 10,
+ 'IndexName': 'item_type_index',
+ 'KeySchema': [{'AttributeName': 'item_type', 'KeyType': 'HASH'}],
+ 'Projection': {'ProjectionType': 'KEYS_ONLY'},
+ 'ProvisionedThroughput': {
+ 'ReadCapacityUnits': 10,
+ 'WriteCapacityUnits': 10,
},
},
],
- ProvisionedThroughput={"ReadCapacityUnits": 10, "WriteCapacityUnits": 10},
+ ProvisionedThroughput={'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10},
)
def check_schema_initialized(self) -> bool:
@@ -156,7 +156,7 @@ def check_schema_initialized(self) -> bool:
existing_tables = self.list_tables()
exists = self.gene_table in existing_tables
if not exists:
- logger.info(f"{self.gene_table} table is missing or unavailable.")
+ logger.info(f'{self.gene_table} table is missing or unavailable.')
return exists
def check_tables_populated(self) -> bool:
@@ -169,29 +169,29 @@ def check_tables_populated(self) -> bool:
:return: True if queries successful, false if DB appears empty
"""
sources = self.genes.query(
- IndexName="item_type_index",
- KeyConditionExpression=Key("item_type").eq("source"),
- ).get("Items", [])
+ IndexName='item_type_index',
+ KeyConditionExpression=Key('item_type').eq('source'),
+ ).get('Items', [])
if len(sources) < len(SourceName):
- logger.info("Gene sources table is missing expected sources.")
+ logger.info('Gene sources table is missing expected sources.')
return False
records = self.genes.query(
- IndexName="item_type_index",
- KeyConditionExpression=Key("item_type").eq("identity"),
+ IndexName='item_type_index',
+ KeyConditionExpression=Key('item_type').eq('identity'),
Limit=1,
)
- if len(records.get("Items", [])) < 1:
- logger.info("Gene records index is empty.")
+ if len(records.get('Items', [])) < 1:
+ logger.info('Gene records index is empty.')
return False
normalized_records = self.genes.query(
- IndexName="item_type_index",
- KeyConditionExpression=Key("item_type").eq(RecordType.MERGER.value),
+ IndexName='item_type_index',
+ KeyConditionExpression=Key('item_type').eq(RecordType.MERGER.value),
Limit=1,
)
- if len(normalized_records.get("Items", [])) < 1:
- logger.info("Normalized gene records index is empty.")
+ if len(normalized_records.get('Items', [])) < 1:
+ logger.info('Normalized gene records index is empty.')
return False
return True
@@ -211,14 +211,14 @@ def get_source_metadata(self, src_name: Union[str, SourceName]) -> Dict:
if src_name in self._cached_sources:
return self._cached_sources[src_name]
else:
- pk = f"{src_name.lower()}##source"
- concept_id = f"source:{src_name.lower()}"
+ pk = f'{src_name.lower()}##source'
+ concept_id = f'source:{src_name.lower()}'
metadata = self.genes.get_item(
- Key={"label_and_type": pk, "concept_id": concept_id}
- ).get("Item")
+ Key={'label_and_type': pk, 'concept_id': concept_id}
+ ).get('Item')
if not metadata:
raise DatabaseReadException(
- f"Unable to retrieve data for source {src_name}"
+ f'Unable to retrieve data for source {src_name}'
)
self._cached_sources[src_name] = metadata
return metadata
@@ -238,19 +238,19 @@ def get_record_by_id(
"""
try:
if merge:
- pk = f"{concept_id.lower()}##{RecordType.MERGER.value}"
+ pk = f'{concept_id.lower()}##{RecordType.MERGER.value}'
else:
- pk = f"{concept_id.lower()}##{RecordType.IDENTITY.value}"
+ pk = f'{concept_id.lower()}##{RecordType.IDENTITY.value}'
if case_sensitive:
match = self.genes.get_item(
- Key={"label_and_type": pk, "concept_id": concept_id}
+ Key={'label_and_type': pk, 'concept_id': concept_id}
)
- return match["Item"]
+ return match['Item']
else:
- exp = Key("label_and_type").eq(pk)
+ exp = Key('label_and_type').eq(pk)
response = self.genes.query(KeyConditionExpression=exp)
- record = response["Items"][0]
- del record["label_and_type"]
+ record = response['Items'][0]
+ del record['label_and_type']
return record
except ClientError as e:
logger.error(
@@ -270,11 +270,11 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
:param ref_type: type of match to look for.
:return: list of associated concept IDs. Empty if lookup fails.
"""
- pk = f"{search_term}##{ref_type.value.lower()}"
- filter_exp = Key("label_and_type").eq(pk)
+ pk = f'{search_term}##{ref_type.value.lower()}'
+ filter_exp = Key('label_and_type').eq(pk)
try:
matches = self.genes.query(KeyConditionExpression=filter_exp)
- return [m["concept_id"] for m in matches.get("Items", None)]
+ return [m['concept_id'] for m in matches.get('Items', None)]
except ClientError as e:
logger.error(
f"boto3 client error on get_refs_by_type for "
@@ -291,7 +291,7 @@ def get_all_concept_ids(self) -> Set[str]:
last_evaluated_key = None
concept_ids = []
params = {
- "ProjectionExpression": "concept_id",
+ 'ProjectionExpression': 'concept_id',
}
while True:
if last_evaluated_key:
@@ -300,10 +300,10 @@ def get_all_concept_ids(self) -> Set[str]:
)
else:
response = self.genes.scan(**params)
- records = response["Items"]
+ records = response['Items']
for record in records:
- concept_ids.append(record["concept_id"])
- last_evaluated_key = response.get("LastEvaluatedKey")
+ concept_ids.append(record['concept_id'])
+ last_evaluated_key = response.get('LastEvaluatedKey')
if not last_evaluated_key:
break
return set(concept_ids)
@@ -332,19 +332,19 @@ def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None
)
else:
response = self.genes.scan()
- records = response.get("Items", [])
+ records = response.get('Items', [])
for record in records:
- incoming_record_type = record.get("item_type")
+ incoming_record_type = record.get('item_type')
if record_type == RecordType.IDENTITY:
if incoming_record_type == record_type:
yield record
else:
if (
incoming_record_type == RecordType.IDENTITY
- and not record.get("merge_ref") # noqa: E501
+ and not record.get('merge_ref')
) or incoming_record_type == RecordType.MERGER:
yield record
- last_evaluated_key = response.get("LastEvaluatedKey")
+ last_evaluated_key = response.get('LastEvaluatedKey')
if not last_evaluated_key:
break
@@ -357,10 +357,10 @@ def add_source_metadata(self, src_name: SourceName, metadata: SourceMeta) -> Non
"""
src_name_value = src_name.value
metadata_item = metadata.model_dump()
- metadata_item["src_name"] = src_name_value
- metadata_item["label_and_type"] = f"{str(src_name_value).lower()}##source"
- metadata_item["concept_id"] = f"source:{str(src_name_value).lower()}"
- metadata_item["item_type"] = "source"
+ metadata_item['src_name'] = src_name_value
+ metadata_item['label_and_type'] = f'{str(src_name_value).lower()}##source'
+ metadata_item['concept_id'] = f'source:{str(src_name_value).lower()}'
+ metadata_item['item_type'] = 'source'
try:
self.genes.put_item(Item=metadata_item)
except ClientError as e:
@@ -372,11 +372,11 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
:param Dict record: record to upload
:param SourceName src_name: name of source for record
"""
- concept_id = record["concept_id"]
- record["src_name"] = src_name.value
- label_and_type = f"{concept_id.lower()}##identity"
- record["label_and_type"] = label_and_type
- record["item_type"] = "identity"
+ concept_id = record['concept_id']
+ record['src_name'] = src_name.value
+ label_and_type = f'{concept_id.lower()}##identity'
+ record['label_and_type'] = label_and_type
+ record['item_type'] = 'identity'
try:
self.batch.put_item(Item=record)
except ClientError as e:
@@ -395,7 +395,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
items = {item.lower() for item in value}
for item in items:
self._add_ref_record(
- item, record["concept_id"], item_type, src_name
+ item, record['concept_id'], item_type, src_name
)
def add_merged_record(self, record: Dict) -> None:
@@ -403,12 +403,12 @@ def add_merged_record(self, record: Dict) -> None:
:param record: merged record to add
"""
- concept_id = record["concept_id"]
- id_prefix = concept_id.split(":")[0].lower()
- record["src_name"] = PREFIX_LOOKUP[id_prefix]
- label_and_type = f"{concept_id.lower()}##{RecordType.MERGER.value}"
- record["label_and_type"] = label_and_type
- record["item_type"] = RecordType.MERGER.value
+ concept_id = record['concept_id']
+ id_prefix = concept_id.split(':')[0].lower()
+ record['src_name'] = PREFIX_LOOKUP[id_prefix]
+ label_and_type = f'{concept_id.lower()}##{RecordType.MERGER.value}'
+ record['label_and_type'] = label_and_type
+ record['item_type'] = RecordType.MERGER.value
try:
self.batch.put_item(Item=record)
except ClientError as e:
@@ -428,12 +428,12 @@ def _add_ref_record(
'associated_with'}
:param src_name: name of source for record
"""
- label_and_type = f"{term.lower()}##{ref_type}"
+ label_and_type = f'{term.lower()}##{ref_type}'
record = {
- "label_and_type": label_and_type,
- "concept_id": concept_id.lower(),
- "src_name": src_name.value,
- "item_type": ref_type,
+ 'label_and_type': label_and_type,
+ 'concept_id': concept_id.lower(),
+ 'src_name': src_name.value,
+ 'item_type': ref_type,
}
try:
self.batch.put_item(Item=record)
@@ -451,11 +451,11 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN
:param merge_ref: new ref value
:raise DatabaseWriteException: if attempting to update non-existent record
"""
- label_and_type = f"{concept_id.lower()}##identity"
- key = {"label_and_type": label_and_type, "concept_id": concept_id}
- update_expression = "set merge_ref=:r"
- update_values = {":r": merge_ref.lower()}
- condition_expression = "attribute_exists(label_and_type)"
+ label_and_type = f'{concept_id.lower()}##identity'
+ key = {'label_and_type': label_and_type, 'concept_id': concept_id}
+ update_expression = 'set merge_ref=:r'
+ update_values = {':r': merge_ref.lower()}
+ condition_expression = 'attribute_exists(label_and_type)'
try:
self.genes.update_item(
Key=key,
@@ -464,10 +464,10 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN
ConditionExpression=condition_expression,
)
except ClientError as e:
- code = e.response.get("Error", {}).get("Code")
- if code == "ConditionalCheckFailedException":
+ code = e.response.get('Error', {}).get('Code')
+ if code == 'ConditionalCheckFailedException':
raise DatabaseWriteException(
- f"No such record exists for keys {label_and_type}, {concept_id}"
+ f'No such record exists for keys {label_and_type}, {concept_id}'
)
else:
logger.error(
@@ -485,25 +485,25 @@ def delete_normalized_concepts(self) -> None:
"""
while True:
with self.genes.batch_writer(
- overwrite_by_pkeys=["label_and_type", "concept_id"]
+ overwrite_by_pkeys=['label_and_type', 'concept_id']
) as batch:
try:
response = self.genes.query(
- IndexName="item_type_index",
- KeyConditionExpression=Key("item_type").eq(
+ IndexName='item_type_index',
+ KeyConditionExpression=Key('item_type').eq(
RecordType.MERGER.value
),
)
except ClientError as e:
raise DatabaseReadException(e)
- records = response["Items"]
+ records = response['Items']
if not records:
break
for record in records:
batch.delete_item(
Key={
- "label_and_type": record["label_and_type"],
- "concept_id": record["concept_id"],
+ 'label_and_type': record['label_and_type'],
+ 'concept_id': record['concept_id'],
}
)
@@ -518,23 +518,23 @@ def delete_source(self, src_name: SourceName) -> None:
while True:
try:
response = self.genes.query(
- IndexName="src_index",
- KeyConditionExpression=Key("src_name").eq(src_name.value),
+ IndexName='src_index',
+ KeyConditionExpression=Key('src_name').eq(src_name.value),
)
except ClientError as e:
raise DatabaseReadException(e)
- records = response["Items"]
+ records = response['Items']
if not records:
break
with self.genes.batch_writer(
- overwrite_by_pkeys=["label_and_type", "concept_id"]
+ overwrite_by_pkeys=['label_and_type', 'concept_id']
) as batch:
for record in records:
try:
batch.delete_item(
Key={
- "label_and_type": record["label_and_type"],
- "concept_id": record["concept_id"],
+ 'label_and_type': record['label_and_type'],
+ 'concept_id': record['concept_id'],
}
)
except ClientError as e:
diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py
index f62a1819..90924c8c 100644
--- a/src/gene/database/postgresql.py
+++ b/src/gene/database/postgresql.py
@@ -29,7 +29,7 @@
logger = logging.getLogger(__name__)
-SCRIPTS_DIR = Path(__file__).parent / "postgresql"
+SCRIPTS_DIR = Path(__file__).parent / 'postgresql'
class PostgresDatabase(AbstractDatabase):
@@ -56,16 +56,16 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None:
"""
if db_url:
conninfo = db_url
- elif "GENE_NORM_DB_URL" in os.environ:
- conninfo = os.environ["GENE_NORM_DB_URL"]
+ elif 'GENE_NORM_DB_URL' in os.environ:
+ conninfo = os.environ['GENE_NORM_DB_URL']
else:
- user = db_args.get("user", "postgres")
- password = db_args.get("password", "")
- db_name = db_args.get("db_name", "gene_normalizer")
+ user = db_args.get('user', 'postgres')
+ password = db_args.get('password', '')
+ db_name = db_args.get('db_name', 'gene_normalizer')
if password:
- conninfo = f"dbname={db_name} user={user} password={password}"
+ conninfo = f'dbname={db_name} user={user} password={password}'
else:
- conninfo = f"dbname={db_name} user={user}"
+ conninfo = f'dbname={db_name} user={user}'
self.conn = psycopg.connect(conninfo)
self.initialize_db()
@@ -119,7 +119,7 @@ def drop_db(self) -> None:
with self.conn.cursor() as cur:
cur.execute(self._drop_db_query)
self.conn.commit()
- logger.info("Dropped all existing gene normalizer tables.")
+ logger.info('Dropped all existing gene normalizer tables.')
def check_schema_initialized(self) -> bool:
"""Check if database schema is properly initialized.
@@ -128,48 +128,48 @@ def check_schema_initialized(self) -> bool:
"""
try:
with self.conn.cursor() as cur:
- cur.execute((SCRIPTS_DIR / "create_tables.sql").read_bytes())
+ cur.execute((SCRIPTS_DIR / 'create_tables.sql').read_bytes())
except DuplicateTable:
self.conn.rollback()
else:
- logger.info("Gene table existence check failed.")
+ logger.info('Gene table existence check failed.')
self.conn.rollback()
return False
try:
with self.conn.cursor() as cur:
- cur.execute((SCRIPTS_DIR / "add_fkeys.sql").read_bytes())
+ cur.execute((SCRIPTS_DIR / 'add_fkeys.sql').read_bytes())
except DuplicateObject:
self.conn.rollback()
else:
- logger.info("Gene foreign key existence check failed.")
+ logger.info('Gene foreign key existence check failed.')
self.conn.rollback()
return False
try:
with self.conn.cursor() as cur:
cur.execute(
- (SCRIPTS_DIR / "create_record_lookup_view.sql").read_bytes()
+ (SCRIPTS_DIR / 'create_record_lookup_view.sql').read_bytes()
)
except DuplicateTable:
self.conn.rollback()
else:
- logger.info("Gene normalized view lookup failed.")
+ logger.info('Gene normalized view lookup failed.')
self.conn.rollback()
return False
try:
with self.conn.cursor() as cur:
- cur.execute((SCRIPTS_DIR / "add_indexes.sql").read_bytes())
+ cur.execute((SCRIPTS_DIR / 'add_indexes.sql').read_bytes())
except DuplicateTable:
self.conn.rollback()
else:
- logger.info("Gene indexes check failed.")
+ logger.info('Gene indexes check failed.')
self.conn.rollback()
return False
return True
- _check_sources_query = b"SELECT name FROM gene_sources;"
- _check_concepts_query = b"SELECT COUNT(1) FROM gene_concepts LIMIT 1;"
- _check_merged_query = b"SELECT COUNT(1) FROM gene_merged LIMIT 1;"
+ _check_sources_query = b'SELECT name FROM gene_sources;'
+ _check_concepts_query = b'SELECT COUNT(1) FROM gene_concepts LIMIT 1;'
+ _check_merged_query = b'SELECT COUNT(1) FROM gene_merged LIMIT 1;'
def check_tables_populated(self) -> bool:
"""Perform rudimentary checks to see if tables are populated.
@@ -184,21 +184,21 @@ def check_tables_populated(self) -> bool:
cur.execute(self._check_sources_query)
results = cur.fetchall()
if len(results) < len(SourceName):
- logger.info("Gene sources table is missing expected sources.")
+ logger.info('Gene sources table is missing expected sources.')
return False
with self.conn.cursor() as cur:
cur.execute(self._check_concepts_query)
result = cur.fetchone()
if not result or result[0] < 1:
- logger.info("Gene records table is empty.")
+ logger.info('Gene records table is empty.')
return False
with self.conn.cursor() as cur:
cur.execute(self._check_merged_query)
result = cur.fetchone()
if not result or result[0] < 1:
- logger.info("Normalized gene records table is empty.")
+ logger.info('Normalized gene records table is empty.')
return False
return True
@@ -213,12 +213,12 @@ def initialize_db(self) -> None:
def _create_views(self) -> None:
"""Create materialized views."""
- create_view_query = (SCRIPTS_DIR / "create_record_lookup_view.sql").read_bytes()
+ create_view_query = (SCRIPTS_DIR / 'create_record_lookup_view.sql').read_bytes()
with self.conn.cursor() as cur:
cur.execute(create_view_query)
self.conn.commit()
- _refresh_views_query = b"REFRESH MATERIALIZED VIEW record_lookup_view;"
+ _refresh_views_query = b'REFRESH MATERIALIZED VIEW record_lookup_view;'
def _refresh_views(self) -> None:
"""Update materialized views.
@@ -232,36 +232,36 @@ def _refresh_views(self) -> None:
def _add_fkeys(self) -> None:
"""Add fkey relationships."""
- add_fkey_query = (SCRIPTS_DIR / "add_fkeys.sql").read_bytes()
+ add_fkey_query = (SCRIPTS_DIR / 'add_fkeys.sql').read_bytes()
with self.conn.cursor() as cur:
cur.execute(add_fkey_query)
self.conn.commit()
def _drop_fkeys(self) -> None:
"""Drop fkey relationships."""
- drop_fkey_query = (SCRIPTS_DIR / "drop_fkeys.sql").read_bytes()
+ drop_fkey_query = (SCRIPTS_DIR / 'drop_fkeys.sql').read_bytes()
with self.conn.cursor() as cur:
cur.execute(drop_fkey_query)
self.conn.commit()
def _add_indexes(self) -> None:
"""Create core search indexes."""
- add_indexes_query = (SCRIPTS_DIR / "add_indexes.sql").read_bytes()
+ add_indexes_query = (SCRIPTS_DIR / 'add_indexes.sql').read_bytes()
with self.conn.cursor() as cur:
cur.execute(add_indexes_query)
self.conn.commit()
def _drop_indexes(self) -> None:
"""Drop all custom indexes."""
- drop_indexes_query = (SCRIPTS_DIR / "drop_indexes.sql").read_bytes()
+ drop_indexes_query = (SCRIPTS_DIR / 'drop_indexes.sql').read_bytes()
with self.conn.cursor() as cur:
cur.execute(drop_indexes_query)
self.conn.commit()
def _create_tables(self) -> None:
"""Create all tables, indexes, and views."""
- logger.debug("Creating new gene normalizer tables.")
- tables_query = (SCRIPTS_DIR / "create_tables.sql").read_bytes()
+ logger.debug('Creating new gene normalizer tables.')
+ tables_query = (SCRIPTS_DIR / 'create_tables.sql').read_bytes()
with self.conn.cursor() as cur:
cur.execute(tables_query)
@@ -278,30 +278,30 @@ def get_source_metadata(self, src_name: SourceName) -> Dict:
if src_name in self._cached_sources:
return self._cached_sources[src_name]
- metadata_query = "SELECT * FROM gene_sources WHERE name = %s;"
+ metadata_query = 'SELECT * FROM gene_sources WHERE name = %s;'
with self.conn.cursor() as cur:
cur.execute(metadata_query, [src_name])
metadata_result = cur.fetchone()
if not metadata_result:
- raise DatabaseReadException(f"{src_name} metadata lookup failed")
+ raise DatabaseReadException(f'{src_name} metadata lookup failed')
metadata = {
- "data_license": metadata_result[1],
- "data_license_url": metadata_result[2],
- "version": metadata_result[3],
- "data_url": metadata_result[4],
- "rdp_url": metadata_result[5],
- "data_license_attributes": {
- "non_commercial": metadata_result[6],
- "attribution": metadata_result[7],
- "share_alike": metadata_result[8],
+ 'data_license': metadata_result[1],
+ 'data_license_url': metadata_result[2],
+ 'version': metadata_result[3],
+ 'data_url': metadata_result[4],
+ 'rdp_url': metadata_result[5],
+ 'data_license_attributes': {
+ 'non_commercial': metadata_result[6],
+ 'attribution': metadata_result[7],
+ 'share_alike': metadata_result[8],
},
- "genome_assemblies": metadata_result[9],
+ 'genome_assemblies': metadata_result[9],
}
self._cached_sources[src_name] = metadata
return metadata
_get_record_query = (
- b"SELECT * FROM record_lookup_view WHERE lower(concept_id) = %s;" # noqa: E501
+ b'SELECT * FROM record_lookup_view WHERE lower(concept_id) = %s;'
)
def _format_source_record(self, source_row: Tuple) -> Dict:
@@ -311,21 +311,21 @@ def _format_source_record(self, source_row: Tuple) -> Dict:
:return: reformatted dictionary keying gene properties to row values
"""
gene_record = {
- "concept_id": source_row[0],
- "symbol_status": source_row[1],
- "label": source_row[2],
- "strand": source_row[3],
- "location_annotations": source_row[4],
- "locations": source_row[5],
- "gene_type": source_row[6],
- "aliases": source_row[7],
- "associated_with": source_row[8],
- "previous_symbols": source_row[9],
- "symbol": source_row[10],
- "xrefs": source_row[11],
- "src_name": source_row[12],
- "merge_ref": source_row[13],
- "item_type": RecordType.IDENTITY.value,
+ 'concept_id': source_row[0],
+ 'symbol_status': source_row[1],
+ 'label': source_row[2],
+ 'strand': source_row[3],
+ 'location_annotations': source_row[4],
+ 'locations': source_row[5],
+ 'gene_type': source_row[6],
+ 'aliases': source_row[7],
+ 'associated_with': source_row[8],
+ 'previous_symbols': source_row[9],
+ 'symbol': source_row[10],
+ 'xrefs': source_row[11],
+ 'src_name': source_row[12],
+ 'merge_ref': source_row[13],
+ 'item_type': RecordType.IDENTITY.value,
}
return {k: v for k, v in gene_record.items() if v}
@@ -354,28 +354,28 @@ def _format_merged_record(self, merged_row: Tuple) -> Dict:
:return: reformatted dictionary keying normalized gene properties to row values
"""
merged_record = {
- "concept_id": merged_row[0],
- "symbol": merged_row[1],
- "symbol_status": merged_row[2],
- "previous_symbols": merged_row[3],
- "label": merged_row[4],
- "strand": merged_row[5],
- "ensembl_locations": merged_row[6],
- "hgnc_locations": merged_row[7],
- "ncbi_locations": merged_row[8],
- "location_annotations": merged_row[9],
- "ensembl_biotype": merged_row[10],
- "hgnc_locus_type": merged_row[11],
- "ncbi_gene_type": merged_row[12],
- "aliases": merged_row[13],
- "associated_with": merged_row[14],
- "xrefs": merged_row[15],
- "item_type": RecordType.MERGER.value,
+ 'concept_id': merged_row[0],
+ 'symbol': merged_row[1],
+ 'symbol_status': merged_row[2],
+ 'previous_symbols': merged_row[3],
+ 'label': merged_row[4],
+ 'strand': merged_row[5],
+ 'ensembl_locations': merged_row[6],
+ 'hgnc_locations': merged_row[7],
+ 'ncbi_locations': merged_row[8],
+ 'location_annotations': merged_row[9],
+ 'ensembl_biotype': merged_row[10],
+ 'hgnc_locus_type': merged_row[11],
+ 'ncbi_gene_type': merged_row[12],
+ 'aliases': merged_row[13],
+ 'associated_with': merged_row[14],
+ 'xrefs': merged_row[15],
+ 'item_type': RecordType.MERGER.value,
}
return {k: v for k, v in merged_record.items() if v}
_get_merged_record_query = (
- b"SELECT * FROM gene_merged WHERE lower(concept_id) = %s;" # noqa: E501
+ b'SELECT * FROM gene_merged WHERE lower(concept_id) = %s;'
)
def _get_merged_record(
@@ -412,11 +412,11 @@ def get_record_by_id(
return self._get_record(concept_id, case_sensitive)
_ref_types_query = {
- RefType.SYMBOL: b"SELECT concept_id FROM gene_symbols WHERE lower(symbol) = %s;", # noqa: E501
- RefType.PREVIOUS_SYMBOLS: b"SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;", # noqa: E501
- RefType.ALIASES: b"SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;", # noqa: E501
- RefType.XREFS: b"SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;",
- RefType.ASSOCIATED_WITH: b"SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;", # noqa: E501
+ RefType.SYMBOL: b'SELECT concept_id FROM gene_symbols WHERE lower(symbol) = %s;',
+ RefType.PREVIOUS_SYMBOLS: b'SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;',
+ RefType.ALIASES: b'SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;',
+ RefType.XREFS: b'SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;',
+ RefType.ASSOCIATED_WITH: b'SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;',
}
def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
@@ -429,7 +429,7 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
"""
query = self._ref_types_query.get(ref_type)
if not query:
- raise ValueError("invalid reference type")
+ raise ValueError('invalid reference type')
with self.conn.cursor() as cur:
cur.execute(query, (search_term.lower(),))
@@ -439,7 +439,7 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
else:
return []
- _ids_query = b"SELECT concept_id FROM gene_concepts;"
+ _ids_query = b'SELECT concept_id FROM gene_concepts;'
def get_all_concept_ids(self) -> Set[str]:
"""Retrieve concept IDs for use in generating normalized records.
@@ -451,11 +451,11 @@ def get_all_concept_ids(self) -> Set[str]:
ids_tuple = cur.fetchall()
return {i[0] for i in ids_tuple}
- _get_all_normalized_records_query = b"SELECT * FROM gene_merged;"
+ _get_all_normalized_records_query = b'SELECT * FROM gene_merged;'
_get_all_unmerged_source_records_query = (
- b"SELECT * FROM record_lookup_view WHERE merge_ref IS NULL;" # noqa: E501
+ b'SELECT * FROM record_lookup_view WHERE merge_ref IS NULL;'
)
- _get_all_source_records_query = b"SELECT * FROM record_lookup_view;"
+ _get_all_source_records_query = b'SELECT * FROM record_lookup_view;'
def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None]:
"""Retrieve all source or normalized records. Either return all source records,
@@ -530,9 +530,9 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None:
meta.version,
json.dumps(meta.data_url),
meta.rdp_url,
- meta.data_license_attributes["non_commercial"],
- meta.data_license_attributes["attribution"],
- meta.data_license_attributes["share_alike"],
+ meta.data_license_attributes['non_commercial'],
+ meta.data_license_attributes['attribution'],
+ meta.data_license_attributes['share_alike'],
meta.genome_assemblies,
],
)
@@ -546,15 +546,15 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None:
VALUES (%s, %s, %s, %s, %s, %s, %s, %s);
"""
_ins_symbol_query = (
- b"INSERT INTO gene_symbols (symbol, concept_id) VALUES (%s, %s);"
+ b'INSERT INTO gene_symbols (symbol, concept_id) VALUES (%s, %s);'
)
_ins_prev_symbol_query = (
- b"INSERT INTO gene_previous_symbols (prev_symbol, concept_id) VALUES (%s, %s);"
+ b'INSERT INTO gene_previous_symbols (prev_symbol, concept_id) VALUES (%s, %s);'
)
- _ins_alias_query = b"INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);"
- _ins_xref_query = b"INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);"
+ _ins_alias_query = b'INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);'
+ _ins_xref_query = b'INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);'
_ins_assoc_query = (
- b"INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);"
+ b'INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);'
)
def add_record(self, record: Dict, src_name: SourceName) -> None:
@@ -563,8 +563,8 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
:param record: record to upload
:param src_name: name of source for record. Not used by PostgreSQL instance.
"""
- concept_id = record["concept_id"]
- locations = [json.dumps(loc) for loc in record.get("locations", [])]
+ concept_id = record['concept_id']
+ locations = [json.dumps(loc) for loc in record.get('locations', [])]
if not locations:
locations = None
with self.conn.cursor() as cur:
@@ -573,28 +573,28 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
self._add_record_query,
[
concept_id,
- record["src_name"],
- record.get("symbol_status"),
- record.get("label"),
- record.get("strand"),
- record.get("location_annotations"),
+ record['src_name'],
+ record.get('symbol_status'),
+ record.get('label'),
+ record.get('strand'),
+ record.get('location_annotations'),
locations,
- record.get("gene_type"),
+ record.get('gene_type'),
],
)
- for a in record.get("aliases", []):
+ for a in record.get('aliases', []):
cur.execute(self._ins_alias_query, [a, concept_id])
- for x in record.get("xrefs", []):
+ for x in record.get('xrefs', []):
cur.execute(self._ins_xref_query, [x, concept_id])
- for a in record.get("associated_with", []):
+ for a in record.get('associated_with', []):
cur.execute(self._ins_assoc_query, [a, concept_id])
- for p in record.get("previous_symbols", []):
+ for p in record.get('previous_symbols', []):
cur.execute(self._ins_prev_symbol_query, [p, concept_id])
- if record.get("symbol"):
- cur.execute(self._ins_symbol_query, [record["symbol"], concept_id])
+ if record.get('symbol'):
+ cur.execute(self._ins_symbol_query, [record['symbol'], concept_id])
self.conn.commit()
except UniqueViolation:
- logger.error(f"Record with ID {concept_id} already exists")
+ logger.error(f'Record with ID {concept_id} already exists')
self.conn.rollback()
_add_merged_record_query = b"""
@@ -612,35 +612,35 @@ def add_merged_record(self, record: Dict) -> None:
:param record: merged record to add
"""
- ensembl_locations = record.get("ensembl_locations")
+ ensembl_locations = record.get('ensembl_locations')
if ensembl_locations:
ensembl_locations = [json.dumps(i) for i in ensembl_locations]
- ncbi_locations = record.get("ncbi_locations")
+ ncbi_locations = record.get('ncbi_locations')
if ncbi_locations:
ncbi_locations = [json.dumps(i) for i in ncbi_locations]
- hgnc_locations = record.get("hgnc_locations")
+ hgnc_locations = record.get('hgnc_locations')
if hgnc_locations:
hgnc_locations = [json.dumps(i) for i in hgnc_locations]
with self.conn.cursor() as cur:
cur.execute(
self._add_merged_record_query,
[
- record["concept_id"],
- record.get("symbol"),
- record.get("symbol_status"),
- record.get("previous_symbols"),
- record.get("label"),
- record.get("strand"),
- record.get("location_annotations"),
+ record['concept_id'],
+ record.get('symbol'),
+ record.get('symbol_status'),
+ record.get('previous_symbols'),
+ record.get('label'),
+ record.get('strand'),
+ record.get('location_annotations'),
ensembl_locations,
hgnc_locations,
ncbi_locations,
- record.get("hgnc_locus_type"),
- record.get("ensembl_biotype"),
- record.get("ncbi_gene_type"),
- record.get("aliases"),
- record.get("associated_with"),
- record.get("xrefs"),
+ record.get('hgnc_locus_type'),
+ record.get('ensembl_biotype'),
+ record.get('ncbi_gene_type'),
+ record.get('aliases'),
+ record.get('associated_with'),
+ record.get('xrefs'),
],
)
self.conn.commit()
@@ -661,7 +661,7 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN
with self.conn.cursor() as cur:
cur.execute(
self._update_merge_ref_query,
- {"merge_ref": merge_ref, "concept_id": concept_id},
+ {'merge_ref': merge_ref, 'concept_id': concept_id},
)
row_count = cur.rowcount
self.conn.commit()
@@ -669,7 +669,7 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN
# UPDATE will fail silently unless we check the # of affected rows
if row_count < 1:
raise DatabaseWriteException(
- f"No such record exists for primary key {concept_id}"
+ f'No such record exists for primary key {concept_id}'
)
def delete_normalized_concepts(self) -> None:
@@ -687,7 +687,7 @@ def delete_normalized_concepts(self) -> None:
:raise DatabaseWriteException: if deletion call fails
"""
with self.conn.cursor() as cur:
- cur.execute((SCRIPTS_DIR / "delete_normalized_concepts.sql").read_bytes())
+ cur.execute((SCRIPTS_DIR / 'delete_normalized_concepts.sql').read_bytes())
self.conn.commit()
_drop_aliases_query = b"""
@@ -725,8 +725,8 @@ def delete_normalized_concepts(self) -> None:
WHERE gc.source = %s
);
"""
- _drop_concepts_query = b"DELETE FROM gene_concepts WHERE source = %s;"
- _drop_source_query = b"DELETE FROM gene_sources gs WHERE gs.name = %s;"
+ _drop_concepts_query = b'DELETE FROM gene_concepts WHERE source = %s;'
+ _drop_source_query = b'DELETE FROM gene_sources gs WHERE gs.name = %s;'
def delete_source(self, src_name: SourceName) -> None:
"""Delete all data for a source. Use when updating source data.
@@ -784,35 +784,35 @@ def load_from_remote(self, url: Optional[str]) -> None:
command fails
"""
if not url:
- url = "https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_latest.sql.tar.gz" # noqa: E501
+ url = 'https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_latest.sql.tar.gz'
with tempfile.TemporaryDirectory() as tempdir:
tempdir_path = Path(tempdir)
- temp_tarfile = tempdir_path / "gene_norm_latest.tar.gz"
+ temp_tarfile = tempdir_path / 'gene_norm_latest.tar.gz'
with requests.get(url, stream=True) as r:
try:
r.raise_for_status()
except requests.HTTPError:
raise DatabaseException(
- f"Unable to retrieve PostgreSQL dump file from {url}"
+ f'Unable to retrieve PostgreSQL dump file from {url}'
)
- with open(temp_tarfile, "wb") as h:
+ with open(temp_tarfile, 'wb') as h:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
h.write(chunk)
- tar = tarfile.open(temp_tarfile, "r:gz")
+ tar = tarfile.open(temp_tarfile, 'r:gz')
tar_dump_file = [
- f for f in tar.getmembers() if f.name.startswith("gene_norm_")
+ f for f in tar.getmembers() if f.name.startswith('gene_norm_')
][0]
tar.extractall(path=tempdir_path, members=[tar_dump_file])
dump_file = tempdir_path / tar_dump_file.name
if self.conn.info.password:
- pw_param = f"-W {self.conn.info.password}"
+ pw_param = f'-W {self.conn.info.password}'
else:
- pw_param = "-w"
+ pw_param = '-w'
self.drop_db()
- system_call = f"psql -d {self.conn.info.dbname} -U {self.conn.info.user} {pw_param} -f {dump_file.absolute()}" # noqa: E501
+ system_call = f'psql -d {self.conn.info.dbname} -U {self.conn.info.user} {pw_param} -f {dump_file.absolute()}'
result = os.system(system_call)
if result != 0:
raise DatabaseException(
@@ -831,19 +831,19 @@ def export_db(self, output_directory: Path) -> None:
if not output_directory.is_dir() or not output_directory.exists():
raise ValueError(
f"Output location {output_directory} isn't a directory or doesn't exist"
- ) # noqa: E501
- now = datetime.now().strftime("%Y%m%d%H%M%S")
- output_location = output_directory / f"gene_norm_{now}.sql"
+ )
+ now = datetime.now().strftime('%Y%m%d%H%M%S')
+ output_location = output_directory / f'gene_norm_{now}.sql'
user = self.conn.info.user
host = self.conn.info.host
port = self.conn.info.port
database_name = self.conn.info.dbname
if self.conn.info.password:
- pw_param = f"-W {self.conn.info.password}"
+ pw_param = f'-W {self.conn.info.password}'
else:
- pw_param = "-w"
+ pw_param = '-w'
- system_call = f"pg_dump -E UTF8 -f {output_location} -U {user} {pw_param} -h {host} -p {port} {database_name}" # noqa: E501
+ system_call = f'pg_dump -E UTF8 -f {output_location} -U {user} {pw_param} -h {host} -p {port} {database_name}'
result = os.system(system_call)
if result != 0:
raise DatabaseException(
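The PostgreSQL changes above only touch Python string quoting; the SQL statements are unaffected because every value is bound through `%s` placeholders rather than interpolated into the query text. A minimal sketch of that pattern with psycopg, assuming a local `gene_normalizer` database that already has the `gene_symbols` table (the connection string, table contents, and example identifiers are assumptions for illustration):

```python
import psycopg
from psycopg.errors import UniqueViolation

# The statement text is static; values never appear inside it, so the Python
# quote style (single vs. double) has no effect on the SQL sent to the server.
INSERT_SYMBOL = b'INSERT INTO gene_symbols (symbol, concept_id) VALUES (%s, %s);'


def add_symbol(conn: psycopg.Connection, symbol: str, concept_id: str) -> None:
    """Insert one symbol row, binding both values as parameters."""
    try:
        with conn.cursor() as cur:
            cur.execute(INSERT_SYMBOL, [symbol, concept_id])
        conn.commit()
    except UniqueViolation:
        # Mirrors the duplicate-record handling shown in the hunks above.
        conn.rollback()


if __name__ == '__main__':
    # Placeholder connection string for a local development database.
    with psycopg.connect('dbname=gene_normalizer') as conn:
        add_symbol(conn, 'BRAF', 'hgnc:1097')
```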
diff --git a/src/gene/etl/__init__.py b/src/gene/etl/__init__.py
index 569df1d7..1d7020b3 100644
--- a/src/gene/etl/__init__.py
+++ b/src/gene/etl/__init__.py
@@ -9,10 +9,10 @@
from .ncbi import NCBI
__all__ = [
- "Ensembl",
- "HGNC",
- "NCBI",
- "GeneNormalizerEtlError",
- "GeneFileVersionError",
- "GeneSourceFetchError",
+ 'Ensembl',
+ 'HGNC',
+ 'NCBI',
+ 'GeneNormalizerEtlError',
+ 'GeneFileVersionError',
+ 'GeneSourceFetchError',
]
diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py
index 77e9eee1..771e2294 100644
--- a/src/gene/etl/base.py
+++ b/src/gene/etl/base.py
@@ -15,7 +15,7 @@
from gene.database import AbstractDatabase
from gene.schemas import Gene, GeneSequenceLocation, MatchType, SourceName
-logger = logging.getLogger("gene")
+logger = logging.getLogger('gene')
logger.setLevel(logging.DEBUG)
@@ -71,7 +71,7 @@ def perform_etl(self, use_existing: bool = False) -> List[str]:
"""
self._extract_data(use_existing)
if not self._silent:
- click.echo("Transforming and loading data to DB...")
+ click.echo('Transforming and loading data to DB...')
self._add_meta()
self._transform_data()
self._database.complete_write_transaction()
@@ -110,12 +110,12 @@ def _load_gene(self, gene: Dict) -> None:
try:
assert Gene(match_type=MatchType.NO_MATCH, **gene)
except pydantic.ValidationError as e:
- logger.warning(f"Unable to load {gene} due to validation error: " f"{e}")
+ logger.warning(f'Unable to load {gene} due to validation error: ' f'{e}')
else:
- concept_id = gene["concept_id"]
- gene["label_and_type"] = f"{concept_id.lower()}##identity"
- gene["src_name"] = self._src_name.value
- gene["item_type"] = "identity"
+ concept_id = gene['concept_id']
+ gene['label_and_type'] = f'{concept_id.lower()}##identity'
+ gene['src_name'] = self._src_name.value
+ gene['item_type'] = 'identity'
for attr_type in ITEM_TYPES:
if attr_type in gene:
@@ -136,7 +136,7 @@ def get_seqrepo(self, seqrepo_dir: Path) -> SeqRepo:
:return: SeqRepo instance
"""
if not Path(seqrepo_dir).exists():
- raise NotADirectoryError(f"Could not find {seqrepo_dir}")
+ raise NotADirectoryError(f'Could not find {seqrepo_dir}')
return SeqRepo(seqrepo_dir)
def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None:
@@ -146,33 +146,33 @@ def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None:
:param arm_ix: The index of the q or p arm for a given location
:param location: VRS chromosome location. This will be mutated.
"""
- range_ix = re.search("-", loc).start() # type: ignore
+ range_ix = re.search('-', loc).start() # type: ignore
start = loc[arm_ix:range_ix]
- start_arm_ix = re.search("[pq]", start).start() # type: ignore
+ start_arm_ix = re.search('[pq]', start).start() # type: ignore
start_arm = start[start_arm_ix]
end = loc[range_ix + 1 :]
- end_arm_match = re.search("[pq]", end)
+ end_arm_match = re.search('[pq]', end)
if not end_arm_match:
# Does not specify the arm, so use the same as start's
- end = f"{start[0]}{end}"
- end_arm_match = re.search("[pq]", end)
+ end = f'{start[0]}{end}'
+ end_arm_match = re.search('[pq]', end)
end_arm_ix = end_arm_match.start() # type: ignore
end_arm = end[end_arm_ix]
if (start_arm == end_arm and start > end) or (
- start_arm != end_arm and start_arm == "p" and end_arm == "q"
+ start_arm != end_arm and start_arm == 'p' and end_arm == 'q'
):
- location["start"] = start
- location["end"] = end
+ location['start'] = start
+ location['end'] = end
elif (start_arm == end_arm and start < end) or (
- start_arm != end_arm and start_arm == "q" and end_arm == "p"
+ start_arm != end_arm and start_arm == 'q' and end_arm == 'p'
):
- location["start"] = end
- location["end"] = start
+ location['start'] = end
+ location['end'] = start
# Add back once VRS Chromosome Location is supported in 2.0-alpha
# def _get_chromosome_location(self, location: Dict, gene: Dict) -> Optional[Dict]:
@@ -209,9 +209,9 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]:
"""
aliases = []
try:
- aliases = self.seqrepo.translate_alias(seq_id, target_namespaces="ga4gh")
+ aliases = self.seqrepo.translate_alias(seq_id, target_namespaces='ga4gh')
except KeyError as e:
- logger.warning(f"SeqRepo raised KeyError: {e}")
+ logger.warning(f'SeqRepo raised KeyError: {e}')
return aliases
def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict:
@@ -230,7 +230,7 @@ def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Di
sequence = aliases[0]
- if gene.start != "." and gene.end != "." and sequence:
+ if gene.start != '.' and gene.end != '.' and sequence:
if 0 <= gene.start <= gene.end: # type: ignore
location = GeneSequenceLocation(
start=gene.start - 1, # type: ignore
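The `_set_cl_interval_range` hunk above parses cytogenetic band ranges such as `q13.2-q13.3`. The following is a stripped-down, hypothetical re-implementation of just the start/end split, using the same regex approach; it deliberately omits the arm-ordering branches shown in the diff:

```python
import re


def split_band_range(loc: str, arm_ix: int) -> dict:
    """Split a cytogenetic range like 'q13.2-q13.3' into start and end bands."""
    range_ix = re.search('-', loc).start()
    start = loc[arm_ix:range_ix]
    end = loc[range_ix + 1:]
    if not re.search('[pq]', end):
        # The end band omits the arm, so reuse the arm character from the start.
        end = f'{start[0]}{end}'
    return {'start': start, 'end': end}


print(split_band_range('q13.2-q13.3', 0))  # {'start': 'q13.2', 'end': 'q13.3'}
print(split_band_range('p11-q21', 0))      # {'start': 'p11', 'end': 'q21'}
```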
diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py
index 4a52975a..4e775afd 100644
--- a/src/gene/etl/ensembl.py
+++ b/src/gene/etl/ensembl.py
@@ -12,7 +12,7 @@
)
from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand
-logger = logging.getLogger("gene")
+logger = logging.getLogger('gene')
logger.setLevel(logging.DEBUG)
@@ -30,36 +30,36 @@ def _extract_data(self, use_existing: bool) -> None:
self._data_file, raw_version = self._data_source.get_latest(
from_local=use_existing
)
- match = re.match(r"(GRCh\d+)_(\d+)", raw_version)
+ match = re.match(r'(GRCh\d+)_(\d+)', raw_version)
self._assembly = match.groups()[0]
self._version = match.groups()[1]
def _transform_data(self) -> None:
"""Transform the Ensembl source."""
- logger.info("Transforming Ensembl...")
+ logger.info('Transforming Ensembl...')
db = gffutils.create_db(
str(self._data_file),
- dbfn=":memory:",
+ dbfn=':memory:',
force=True,
- merge_strategy="create_unique",
+ merge_strategy='create_unique',
keep_order=True,
)
# Get accession numbers
accession_numbers = dict()
- for item in db.features_of_type("scaffold"):
- accession_numbers[item[0]] = item[8]["Alias"][-1]
- for item in db.features_of_type("chromosome"):
- accession_numbers[item[0]] = item[8]["Alias"][-1]
+ for item in db.features_of_type('scaffold'):
+ accession_numbers[item[0]] = item[8]['Alias'][-1]
+ for item in db.features_of_type('chromosome'):
+ accession_numbers[item[0]] = item[8]['Alias'][-1]
for f in db.all_features():
- if f.attributes.get("ID"):
- f_id = f.attributes.get("ID")[0].split(":")[0]
- if f_id == "gene":
+ if f.attributes.get('ID'):
+ f_id = f.attributes.get('ID')[0].split(':')[0]
+ if f_id == 'gene':
gene = self._add_gene(f, accession_numbers)
if gene:
self._load_gene(gene)
- logger.info("Successfully transformed Ensembl.")
+ logger.info('Successfully transformed Ensembl.')
def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
"""Create a transformed gene record.
@@ -69,19 +69,19 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
:return: A gene dictionary containing data if the ID attribute exists.
"""
gene = dict()
- if f.strand == "-":
- gene["strand"] = Strand.REVERSE.value
- elif f.strand == "+":
- gene["strand"] = Strand.FORWARD.value
- gene["src_name"] = SourceName.ENSEMBL.value
+ if f.strand == '-':
+ gene['strand'] = Strand.REVERSE.value
+ elif f.strand == '+':
+ gene['strand'] = Strand.FORWARD.value
+ gene['src_name'] = SourceName.ENSEMBL.value
self._add_attributes(f, gene)
location = self._add_location(f, gene, accession_numbers)
if location:
- gene["locations"] = [location]
+ gene['locations'] = [location]
- gene["label_and_type"] = f"{gene['concept_id'].lower()}##identity"
- gene["item_type"] = "identity"
+ gene['label_and_type'] = f"{gene['concept_id'].lower()}##identity"
+ gene['item_type'] = 'identity'
return gene
@@ -92,10 +92,10 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None:
:param gene: A transformed gene record
"""
attributes = {
- "ID": "concept_id",
- "Name": "symbol",
- "description": "xrefs",
- "biotype": "gene_type",
+ 'ID': 'concept_id',
+ 'Name': 'symbol',
+ 'description': 'xrefs',
+ 'biotype': 'gene_type',
}
for attribute in f.attributes.items():
@@ -106,30 +106,30 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None:
if len(val) == 1:
val = val[0]
- if key == "ID":
- if val.startswith("gene"):
+ if key == 'ID':
+ if val.startswith('gene'):
val = (
f"{NamespacePrefix.ENSEMBL.value}:"
f"{val.split(':')[1]}"
)
- if key == "description":
- gene["label"] = val.split("[")[0].strip()
- if "Source:" in val:
+ if key == 'description':
+ gene['label'] = val.split('[')[0].strip()
+ if 'Source:' in val:
src_name = (
- val.split("[")[-1]
- .split("Source:")[-1]
- .split("Acc")[0]
- .split(";")[0]
+ val.split('[')[-1]
+ .split('Source:')[-1]
+ .split('Acc')[0]
+ .split(';')[0]
)
- src_id = val.split("Acc:")[-1].split("]")[0]
- if ":" in src_id:
- src_id = src_id.split(":")[-1]
+ src_id = val.split('Acc:')[-1].split(']')[0]
+ if ':' in src_id:
+ src_id = src_id.split(':')[-1]
source = self._get_xref_associated_with(src_name, src_id)
- if "xrefs" in source:
- gene["xrefs"] = source["xrefs"]
- elif "associated_with" in source:
- gene["associated_with"] = source["associated_with"]
+ if 'xrefs' in source:
+ gene['xrefs'] = source['xrefs']
+ elif 'associated_with' in source:
+ gene['associated_with'] = source['associated_with']
continue
gene[attributes[key]] = val
@@ -153,16 +153,16 @@ def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict:
:return: A dict containing an other identifier or xref
"""
source = dict()
- if src_name.startswith("HGNC"):
- source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"]
- elif src_name.startswith("NCBI"):
- source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"]
- elif src_name.startswith("UniProt"):
- source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"]
- elif src_name.startswith("miRBase"):
- source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"]
- elif src_name.startswith("RFAM"):
- source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"]
+ if src_name.startswith('HGNC'):
+ source['xrefs'] = [f'{NamespacePrefix.HGNC.value}:{src_id}']
+ elif src_name.startswith('NCBI'):
+ source['xrefs'] = [f'{NamespacePrefix.NCBI.value}:{src_id}']
+ elif src_name.startswith('UniProt'):
+ source['associated_with'] = [f'{NamespacePrefix.UNIPROT.value}:{src_id}']
+ elif src_name.startswith('miRBase'):
+ source['associated_with'] = [f'{NamespacePrefix.MIRBASE.value}:{src_id}']
+ elif src_name.startswith('RFAM'):
+ source['associated_with'] = [f'{NamespacePrefix.RFAM.value}:{src_id}']
return source
def _add_meta(self) -> None:
@@ -172,21 +172,21 @@ def _add_meta(self) -> None:
"""
if not self._version or not self._assembly:
raise GeneNormalizerEtlError(
- "Source metadata unavailable -- was data properly acquired before attempting to load DB?"
+ 'Source metadata unavailable -- was data properly acquired before attempting to load DB?'
)
metadata = SourceMeta(
- data_license="custom",
- data_license_url="https://useast.ensembl.org/info/about"
- "/legal/disclaimer.html",
+ data_license='custom',
+ data_license_url='https://useast.ensembl.org/info/about'
+ '/legal/disclaimer.html',
version=self._version,
data_url={
- "genome_annotations": f"ftp://ftp.ensembl.org/pub/release-{self._version}/gff3/homo_sapiens/Homo_sapiens.{self._assembly}.{self._version}.gff3.gz"
+ 'genome_annotations': f'ftp://ftp.ensembl.org/pub/release-{self._version}/gff3/homo_sapiens/Homo_sapiens.{self._assembly}.{self._version}.gff3.gz'
},
rdp_url=None,
data_license_attributes={
- "non_commercial": False,
- "share_alike": False,
- "attribution": False,
+ 'non_commercial': False,
+ 'share_alike': False,
+ 'attribution': False,
},
genome_assemblies=[self._assembly],
)
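The Ensembl loader above builds an in-memory gffutils database and walks gene features. A small standalone sketch of that pattern; the GFF3 path is a placeholder for a downloaded Ensembl annotation file:

```python
import gffutils

# Build the feature database in memory, as the transform step above does.
db = gffutils.create_db(
    'Homo_sapiens.GRCh38.110.gff3',  # placeholder path to a local Ensembl GFF3
    dbfn=':memory:',
    force=True,
    merge_strategy='create_unique',
    keep_order=True,
)

for feature in db.all_features():
    ids = feature.attributes.get('ID')
    if ids and ids[0].split(':')[0] == 'gene':
        # e.g. ID 'gene:ENSG00000157764' -> report symbol, contig, and strand
        name = feature.attributes.get('Name', ['?'])[0]
        print(name, feature.seqid, feature.strand)
```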
diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py
index 2fee6117..5e4f7c2a 100644
--- a/src/gene/etl/hgnc.py
+++ b/src/gene/etl/hgnc.py
@@ -18,7 +18,7 @@
SymbolStatus,
)
-logger = logging.getLogger("gene")
+logger = logging.getLogger('gene')
logger.setLevel(logging.DEBUG)
@@ -27,38 +27,38 @@ class HGNC(Base):
def _transform_data(self) -> None:
"""Transform the HGNC source."""
- logger.info("Transforming HGNC...")
- with open(self._data_file, "r") as f: # type: ignore
+ logger.info('Transforming HGNC...')
+ with open(self._data_file, 'r') as f: # type: ignore
data = json.load(f)
- records = data["response"]["docs"]
+ records = data['response']['docs']
for r in records:
gene = dict()
- gene["concept_id"] = r["hgnc_id"].lower()
- gene["label_and_type"] = f"{gene['concept_id']}##identity"
- gene["item_type"] = "identity"
- gene["symbol"] = r["symbol"]
- gene["label"] = r["name"]
- gene["src_name"] = SourceName.HGNC.value
- if r["status"]:
- if r["status"] == "Approved":
- gene["symbol_status"] = SymbolStatus.APPROVED.value
- elif r["status"] == "Entry Withdrawn":
- gene["symbol_status"] = SymbolStatus.WITHDRAWN.value
- gene["src_name"] = SourceName.HGNC.value
+ gene['concept_id'] = r['hgnc_id'].lower()
+ gene['label_and_type'] = f"{gene['concept_id']}##identity"
+ gene['item_type'] = 'identity'
+ gene['symbol'] = r['symbol']
+ gene['label'] = r['name']
+ gene['src_name'] = SourceName.HGNC.value
+ if r['status']:
+ if r['status'] == 'Approved':
+ gene['symbol_status'] = SymbolStatus.APPROVED.value
+ elif r['status'] == 'Entry Withdrawn':
+ gene['symbol_status'] = SymbolStatus.WITHDRAWN.value
+ gene['src_name'] = SourceName.HGNC.value
# store alias, xref, associated_with, prev_symbols, location
self._get_aliases(r, gene)
self._get_xrefs_associated_with(r, gene)
- if "prev_symbol" in r:
+ if 'prev_symbol' in r:
self._get_previous_symbols(r, gene)
- if "location" in r:
+ if 'location' in r:
self._get_location(r, gene)
- if "locus_type" in r:
- gene["gene_type"] = r["locus_type"]
+ if 'locus_type' in r:
+ gene['gene_type'] = r['locus_type']
self._load_gene(gene)
- logger.info("Successfully transformed HGNC.")
+ logger.info('Successfully transformed HGNC.')
def _get_aliases(self, r: Dict, gene: Dict) -> None:
"""Store aliases in a gene record.
@@ -68,14 +68,14 @@ def _get_aliases(self, r: Dict, gene: Dict) -> None:
"""
alias_symbol = list()
enzyme_id = list()
- if "alias_symbol" in r:
- alias_symbol = r["alias_symbol"]
+ if 'alias_symbol' in r:
+ alias_symbol = r['alias_symbol']
- if "enzyme_id" in r:
- enzyme_id = r["enzyme_id"]
+ if 'enzyme_id' in r:
+ enzyme_id = r['enzyme_id']
if alias_symbol or enzyme_id:
- gene["aliases"] = list(set(alias_symbol + enzyme_id))
+ gene['aliases'] = list(set(alias_symbol + enzyme_id))
def _get_previous_symbols(self, r: Dict, gene: Dict) -> None:
"""Store previous symbols in a gene record.
@@ -83,9 +83,9 @@ def _get_previous_symbols(self, r: Dict, gene: Dict) -> None:
:param r: A gene record in the HGNC data file
:param gene: A transformed gene record
"""
- prev_symbols = r["prev_symbol"]
+ prev_symbols = r['prev_symbol']
if prev_symbols:
- gene["previous_symbols"] = list(set(prev_symbols))
+ gene['previous_symbols'] = list(set(prev_symbols))
def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
"""Store xrefs and/or associated_with refs in a gene record.
@@ -96,40 +96,40 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
xrefs = list()
associated_with = list()
sources = [
- "entrez_id",
- "ensembl_gene_id",
- "vega_id",
- "ucsc_id",
- "ccds_id",
- "uniprot_ids",
- "pubmed_id",
- "cosmic",
- "omim_id",
- "mirbase",
- "homeodb",
- "snornabase",
- "orphanet",
- "horde_id",
- "merops",
- "imgt",
- "iuphar",
- "kznf_gene_catalog",
- "mamit-trnadb",
- "cd",
- "lncrnadb",
- "ena",
- "pseudogene.org",
- "refseq_accession",
+ 'entrez_id',
+ 'ensembl_gene_id',
+ 'vega_id',
+ 'ucsc_id',
+ 'ccds_id',
+ 'uniprot_ids',
+ 'pubmed_id',
+ 'cosmic',
+ 'omim_id',
+ 'mirbase',
+ 'homeodb',
+ 'snornabase',
+ 'orphanet',
+ 'horde_id',
+ 'merops',
+ 'imgt',
+ 'iuphar',
+ 'kznf_gene_catalog',
+ 'mamit-trnadb',
+ 'cd',
+ 'lncrnadb',
+ 'ena',
+ 'pseudogene.org',
+ 'refseq_accession',
]
for src in sources:
if src in r:
- if "-" in src:
- key = src.split("-")[0]
- elif "." in src:
- key = src.split(".")[0]
- elif "_" in src:
- key = src.split("_")[0]
+ if '-' in src:
+ key = src.split('-')[0]
+ elif '.' in src:
+ key = src.split('.')[0]
+ elif '_' in src:
+ key = src.split('_')[0]
else:
key = src
@@ -139,12 +139,12 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
else:
self._get_xref_associated_with(key, src, r, associated_with)
else:
- logger.warning(f"{key} not in schemas.py")
+ logger.warning(f'{key} not in schemas.py')
if xrefs:
- gene["xrefs"] = xrefs
+ gene['xrefs'] = xrefs
if associated_with:
- gene["associated_with"] = associated_with
+ gene['associated_with'] = associated_with
def _get_xref_associated_with(
self, key: str, src: str, r: Dict, src_type: Dict
@@ -158,11 +158,11 @@ def _get_xref_associated_with(
"""
if isinstance(r[src], list):
for xref in r[src]:
- src_type.append(f"{NamespacePrefix[key.upper()].value}:{xref}")
+ src_type.append(f'{NamespacePrefix[key.upper()].value}:{xref}')
else:
- if isinstance(r[src], str) and ":" in r[src]:
- r[src] = r[src].split(":")[-1].strip()
- src_type.append(f"{NamespacePrefix[key.upper()].value}" f":{r[src]}")
+ if isinstance(r[src], str) and ':' in r[src]:
+ r[src] = r[src].split(':')[-1].strip()
+ src_type.append(f'{NamespacePrefix[key.upper()].value}' f':{r[src]}')
def _get_location(self, r: Dict, gene: Dict) -> None:
"""Store GA4GH VRS ChromosomeLocation in a gene record.
@@ -172,20 +172,20 @@ def _get_location(self, r: Dict, gene: Dict) -> None:
:param gene: A transformed gene record
"""
# Get list of a gene's map locations
- if "and" in r["location"]:
- locations = r["location"].split("and")
+ if 'and' in r['location']:
+ locations = r['location'].split('and')
else:
- locations = [r["location"]]
+ locations = [r['location']]
location_list = list()
- gene["location_annotations"] = list()
+ gene['location_annotations'] = list()
for loc in locations:
loc = loc.strip()
loc = self._set_annotation(loc, gene)
if loc:
- if loc == "mitochondria":
- gene["location_annotations"].append(Chromosome.MITOCHONDRIA.value)
+ if loc == 'mitochondria':
+ gene['location_annotations'].append(Chromosome.MITOCHONDRIA.value)
else:
location = dict()
self._set_location(loc, location, gene)
@@ -194,9 +194,9 @@ def _get_location(self, r: Dict, gene: Dict) -> None:
# location_list.append(chr_location)
if location_list:
- gene["locations"] = location_list
- if not gene["location_annotations"]:
- del gene["location_annotations"]
+ gene['locations'] = location_list
+ if not gene['location_annotations']:
+ del gene['location_annotations']
def _set_annotation(self, loc: str, gene: Dict) -> None:
"""Set the annotations attribute if one is provided.
@@ -210,7 +210,7 @@ def _set_annotation(self, loc: str, gene: Dict) -> None:
for annotation in annotations:
if annotation in loc:
- gene["location_annotations"].append(annotation)
+ gene['location_annotations'].append(annotation)
# Check if location is also included
loc = loc.split(annotation)[0].strip()
if not loc:
@@ -224,24 +224,24 @@ def _set_location(self, loc: str, location: Dict, gene: Dict) -> None:
:param location: GA4GH location
:param gene: A transformed gene record
"""
- arm_match = re.search("[pq]", loc)
+ arm_match = re.search('[pq]', loc)
if arm_match:
# Location gives arm and sub / sub band
arm_ix = arm_match.start()
- location["chr"] = loc[:arm_ix]
+ location['chr'] = loc[:arm_ix]
- if "-" in loc:
+ if '-' in loc:
# Location gives both start and end
self._set_cl_interval_range(loc, arm_ix, location)
else:
# Location only gives start
start = loc[arm_ix:]
- location["start"] = start
- location["end"] = start
+ location['start'] = start
+ location['end'] = start
else:
# Only gives chromosome
- gene["location_annotations"].append(loc)
+ gene['location_annotations'].append(loc)
def _add_meta(self) -> None:
"""Add HGNC metadata.
@@ -250,20 +250,20 @@ def _add_meta(self) -> None:
"""
if not self._version:
raise GeneNormalizerEtlError(
- "Source metadata unavailable -- was data properly acquired before attempting to load DB?"
+ 'Source metadata unavailable -- was data properly acquired before attempting to load DB?'
)
metadata = SourceMeta(
- data_license="CC0",
- data_license_url="https://www.genenames.org/about/license/",
+ data_license='CC0',
+ data_license_url='https://www.genenames.org/about/license/',
version=self._version,
data_url={
- "complete_set_archive": "ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json"
+ 'complete_set_archive': 'ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json'
},
rdp_url=None,
data_license_attributes={
- "non_commercial": False,
- "share_alike": False,
- "attribution": False,
+ 'non_commercial': False,
+ 'share_alike': False,
+ 'attribution': False,
},
genome_assemblies=[],
)
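The HGNC transform above walks the `response.docs` array of the complete-set JSON. A minimal sketch of that traversal against a locally downloaded `hgnc_complete_set.json` (the path is an assumption, and the lowercase status strings stand in for the package's `SymbolStatus` enum values):

```python
import json

with open('hgnc_complete_set.json') as f:  # placeholder path to the HGNC dump
    docs = json.load(f)['response']['docs']

for r in docs[:5]:
    gene = {
        'concept_id': r['hgnc_id'].lower(),
        'symbol': r['symbol'],
        'label': r['name'],
        # Stand-in for SymbolStatus.*.value in the real loader.
        'symbol_status': 'approved' if r.get('status') == 'Approved' else 'withdrawn',
        'aliases': list(set(r.get('alias_symbol', []) + r.get('enzyme_id', []))),
    }
    print(gene)
```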
diff --git a/src/gene/etl/merge.py b/src/gene/etl/merge.py
index 8124d294..9121e498 100644
--- a/src/gene/etl/merge.py
+++ b/src/gene/etl/merge.py
@@ -7,7 +7,7 @@
from gene.database.database import DatabaseWriteException
from gene.schemas import GeneTypeFieldName, RecordType, SourcePriority
-logger = logging.getLogger("gene")
+logger = logging.getLogger('gene')
logger.setLevel(logging.DEBUG)
@@ -28,7 +28,7 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None:
:param record_ids: concept identifiers from which groups should be generated.
Should *not* include any records from excluded sources.
"""
- logger.info("Generating record ID sets...")
+ logger.info('Generating record ID sets...')
start = timer()
for record_id in record_ids:
new_group = self._create_record_id_set(record_id)
@@ -36,11 +36,11 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None:
for concept_id in new_group:
self._groups[concept_id] = new_group
end = timer()
- logger.debug(f"Built record ID sets in {end - start} seconds")
+ logger.debug(f'Built record ID sets in {end - start} seconds')
self._groups = {k: v for k, v in self._groups.items() if len(v) > 1}
- logger.info("Creating merged records and updating database...")
+ logger.info('Creating merged records and updating database...')
uploaded_ids = set()
start = timer()
for record_id, group in self._groups.items():
@@ -53,22 +53,22 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None:
# add updated references
for concept_id in group:
- merge_ref = merged_record["concept_id"]
+ merge_ref = merged_record['concept_id']
try:
self._database.update_merge_ref(concept_id, merge_ref)
except DatabaseWriteException as dw:
- if str(dw).startswith("No such record exists"):
+ if str(dw).startswith('No such record exists'):
logger.error(
- f"Updating nonexistent record: {concept_id} "
- f"for merge ref to {merge_ref}"
+ f'Updating nonexistent record: {concept_id} '
+ f'for merge ref to {merge_ref}'
)
else:
logger.error(str(dw))
uploaded_ids |= group
self._database.complete_write_transaction()
- logger.info("Merged concept generation successful.")
+ logger.info('Merged concept generation successful.')
end = timer()
- logger.debug(f"Generated and added concepts in {end - start} seconds")
+ logger.debug(f'Generated and added concepts in {end - start} seconds')
def _create_record_id_set(
self, record_id: str, observed_id_set: Optional[Set] = None
@@ -89,15 +89,15 @@ def _create_record_id_set(
db_record = self._database.get_record_by_id(record_id)
if not db_record:
logger.warning(
- f"Record ID set creator could not resolve "
- f"lookup for {record_id} in ID set: "
- f"{observed_id_set}"
+ f'Record ID set creator could not resolve '
+ f'lookup for {record_id} in ID set: '
+ f'{observed_id_set}'
)
return observed_id_set - {record_id}
- record_xrefs = db_record.get("xrefs")
+ record_xrefs = db_record.get('xrefs')
if not record_xrefs:
- return observed_id_set | {db_record["concept_id"]}
+ return observed_id_set | {db_record['concept_id']}
else:
local_id_set = set(record_xrefs)
merged_id_set = {record_id} | observed_id_set
@@ -125,40 +125,40 @@ def _generate_merged_record(self, record_id_set: Set[str]) -> Dict:
records.append(record)
else:
logger.error(
- f"Merge record generator could not retrieve "
- f"record for {record_id} in {record_id_set}"
+ f'Merge record generator could not retrieve '
+ f'record for {record_id} in {record_id_set}'
)
def record_order(record: Dict) -> Tuple:
"""Provide priority values of concepts for sort function."""
- src = record["src_name"].upper()
+ src = record['src_name'].upper()
if src in SourcePriority.__members__:
source_rank = SourcePriority[src].value
else:
raise Exception(
f"Prohibited source: {src} in concept_id " f"{record['concept_id']}"
)
- return source_rank, record["concept_id"]
+ return source_rank, record['concept_id']
records.sort(key=record_order)
# initialize merged record
merged_attrs = {
- "concept_id": records[0]["concept_id"],
- "aliases": set(),
- "associated_with": set(),
- "previous_symbols": set(),
- "hgnc_locus_type": set(),
- "ncbi_gene_type": set(),
- "ensembl_biotype": set(),
- "strand": set(),
+ 'concept_id': records[0]['concept_id'],
+ 'aliases': set(),
+ 'associated_with': set(),
+ 'previous_symbols': set(),
+ 'hgnc_locus_type': set(),
+ 'ncbi_gene_type': set(),
+ 'ensembl_biotype': set(),
+ 'strand': set(),
}
if len(records) > 1:
- merged_attrs["xrefs"] = list({r["concept_id"] for r in records[1:]})
+ merged_attrs['xrefs'] = list({r['concept_id'] for r in records[1:]})
# merge from constituent records
- set_fields = ["aliases", "associated_with", "previous_symbols", "strand"]
- scalar_fields = ["symbol", "symbol_status", "label", "location_annotations"]
+ set_fields = ['aliases', 'associated_with', 'previous_symbols', 'strand']
+ scalar_fields = ['symbol', 'symbol_status', 'label', 'location_annotations']
for record in records:
for field in set_fields:
merged_attrs[field] |= set(record.get(field, set()))
@@ -167,19 +167,19 @@ def record_order(record: Dict) -> Tuple:
if field not in merged_attrs and field in record:
merged_attrs[field] = record[field]
- locations = record.get("locations")
+ locations = record.get('locations')
if locations:
merged_attrs[f"{record['src_name'].lower()}_locations"] = locations
- gene_type = record.get("gene_type")
+ gene_type = record.get('gene_type')
if gene_type:
- merged_field = GeneTypeFieldName[record["src_name"].upper()]
+ merged_field = GeneTypeFieldName[record['src_name'].upper()]
merged_attrs[merged_field] |= {gene_type}
for field in set_fields + [
- "hgnc_locus_type",
- "ncbi_gene_type",
- "ensembl_biotype",
+ 'hgnc_locus_type',
+ 'ncbi_gene_type',
+ 'ensembl_biotype',
]:
field_value = merged_attrs[field]
if field_value:
@@ -188,12 +188,12 @@ def record_order(record: Dict) -> Tuple:
del merged_attrs[field]
# ensure no conflicting strands
- unique_strand_values = set(merged_attrs.get("strand", []))
+ unique_strand_values = set(merged_attrs.get('strand', []))
num_unique_strand_values = len(unique_strand_values)
if num_unique_strand_values > 1:
- del merged_attrs["strand"]
+ del merged_attrs['strand']
elif num_unique_strand_values == 1:
- merged_attrs["strand"] = list(unique_strand_values)[0]
+ merged_attrs['strand'] = list(unique_strand_values)[0]
- merged_attrs["item_type"] = RecordType.MERGER.value
+ merged_attrs['item_type'] = RecordType.MERGER.value
return merged_attrs
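The merge changes above center on `_create_record_id_set`, which groups records transitively through their xrefs. A toy, self-contained illustration of that grouping follows; the record dictionary is invented for the example, whereas the real groups are built from database lookups:

```python
from typing import Dict, Optional, Set

# Invented records: three identifiers for one gene plus one unrelated record.
RECORDS: Dict[str, Dict] = {
    'hgnc:1097': {'xrefs': ['ensembl:ENSG00000157764', 'ncbigene:673']},
    'ensembl:ENSG00000157764': {'xrefs': ['hgnc:1097']},
    'ncbigene:673': {'xrefs': ['hgnc:1097']},
    'hgnc:420': {'xrefs': []},
}


def record_id_set(record_id: str, seen: Optional[Set[str]] = None) -> Set[str]:
    """Collect every identifier reachable from record_id via xrefs."""
    seen = set() if seen is None else seen
    seen.add(record_id)
    for xref in RECORDS.get(record_id, {}).get('xrefs', []):
        if xref not in seen:
            record_id_set(xref, seen)
    return seen


print(record_id_set('hgnc:1097'))  # all three identifiers end up in one group
print(record_id_set('hgnc:420'))   # {'hgnc:420'}
```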
diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py
index a3b2e706..ba675dc3 100644
--- a/src/gene/etl/ncbi.py
+++ b/src/gene/etl/ncbi.py
@@ -24,7 +24,7 @@
SymbolStatus,
)
-logger = logging.getLogger("gene")
+logger = logging.getLogger('gene')
logger.setLevel(logging.DEBUG)
@@ -63,10 +63,10 @@ def _extract_data(self, use_existing: bool) -> None:
self._info_src = gene_paths.gene_info
self._history_src = gene_paths.gene_history
self._gene_url = (
- "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz"
+ 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
)
- self._history_url = "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz"
- self._assembly_url = "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/"
+ self._history_url = 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz'
+ self._assembly_url = 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/'
def _get_prev_symbols(self) -> Dict[str, str]:
"""Store a gene's symbol history.
@@ -74,14 +74,14 @@ def _get_prev_symbols(self) -> Dict[str, str]:
:return: A dictionary of a gene's previous symbols
"""
# get symbol history
- history_file = open(self._history_src, "r")
- history = csv.reader(history_file, delimiter="\t")
+ history_file = open(self._history_src, 'r')
+ history = csv.reader(history_file, delimiter='\t')
next(history)
prev_symbols = {}
for row in history:
# Only interested in rows that have homo sapiens tax id
- if row[0] == "9606":
- if row[1] != "-":
+ if row[0] == '9606':
+ if row[1] != '-':
gene_id = row[1]
if gene_id in prev_symbols.keys():
prev_symbols[gene_id].append(row[3])
@@ -90,9 +90,9 @@ def _get_prev_symbols(self) -> Dict[str, str]:
else:
# Load discontinued genes
params = {
- "concept_id": f"{NamespacePrefix.NCBI.value}:{row[2]}",
- "symbol": row[3],
- "symbol_status": SymbolStatus.DISCONTINUED.value,
+ 'concept_id': f'{NamespacePrefix.NCBI.value}:{row[2]}',
+ 'symbol': row[3],
+ 'symbol_status': SymbolStatus.DISCONTINUED.value,
}
self._load_gene(params)
history_file.close()
@@ -104,37 +104,37 @@ def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None:
:param val: A list of source ids for a given gene
:param params: A transformed gene record
"""
- params["xrefs"] = []
- params["associated_with"] = []
+ params['xrefs'] = []
+ params['associated_with'] = []
for src in val:
- src_name = src.split(":")[0].upper()
- src_id = src.split(":")[-1]
- if src_name == "GENEID":
- params["concept_id"] = f"{NamespacePrefix.NCBI.value}:{src_id}"
+ src_name = src.split(':')[0].upper()
+ src_id = src.split(':')[-1]
+ if src_name == 'GENEID':
+ params['concept_id'] = f'{NamespacePrefix.NCBI.value}:{src_id}'
elif (
src_name in NamespacePrefix.__members__
and NamespacePrefix[src_name].value in PREFIX_LOOKUP
):
- params["xrefs"].append(
- f"{NamespacePrefix[src_name].value}" f":{src_id}"
+ params['xrefs'].append(
+ f'{NamespacePrefix[src_name].value}' f':{src_id}'
)
else:
- if src_name.startswith("MIM"):
+ if src_name.startswith('MIM'):
prefix = NamespacePrefix.OMIM.value
- elif src_name.startswith("IMGT/GENE-DB"):
+ elif src_name.startswith('IMGT/GENE-DB'):
prefix = NamespacePrefix.IMGT_GENE_DB.value
- elif src_name.startswith("MIRBASE"):
+ elif src_name.startswith('MIRBASE'):
prefix = NamespacePrefix.MIRBASE.value
else:
prefix = None
if prefix:
- params["associated_with"].append(f"{prefix}:{src_id}")
+ params['associated_with'].append(f'{prefix}:{src_id}')
else:
- logger.info(f"{src_name} is not in NameSpacePrefix.")
- if not params["xrefs"]:
- del params["xrefs"]
- if not params["associated_with"]:
- del params["associated_with"]
+ logger.info(f'{src_name} is not in NameSpacePrefix.')
+ if not params['xrefs']:
+ del params['xrefs']
+ if not params['associated_with']:
+ del params['associated_with']
def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]:
"""Store genes from NCBI info file.
@@ -143,42 +143,42 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]:
:return: A dictionary of gene's from the NCBI info file.
"""
# open info file, skip headers
- info_file = open(self._info_src, "r")
- info = csv.reader(info_file, delimiter="\t")
+ info_file = open(self._info_src, 'r')
+ info = csv.reader(info_file, delimiter='\t')
next(info)
info_genes = dict()
for row in info:
params = dict()
- params["concept_id"] = f"{NamespacePrefix.NCBI.value}:{row[1]}"
+ params['concept_id'] = f'{NamespacePrefix.NCBI.value}:{row[1]}'
# get symbol
- params["symbol"] = row[2]
+ params['symbol'] = row[2]
# get aliases
- if row[4] != "-":
- params["aliases"] = row[4].split("|")
+ if row[4] != '-':
+ params['aliases'] = row[4].split('|')
else:
- params["aliases"] = []
+ params['aliases'] = []
# get associated_with
- if row[5] != "-":
- associated_with = row[5].split("|")
+ if row[5] != '-':
+ associated_with = row[5].split('|')
self._add_xrefs_associated_with(associated_with, params)
# get chromosome location
vrs_chr_location = self._get_vrs_chr_location(row, params)
- if "exclude" in vrs_chr_location:
+ if 'exclude' in vrs_chr_location:
# Exclude genes with multiple distinct locations (e.g. OMS)
continue
if not vrs_chr_location:
vrs_chr_location = []
- params["locations"] = vrs_chr_location
+ params['locations'] = vrs_chr_location
# get label
- if row[8] != "-":
- params["label"] = row[8]
+ if row[8] != '-':
+ params['label'] = row[8]
# add prev symbols
if row[1] in prev_symbols.keys():
- params["previous_symbols"] = prev_symbols[row[1]]
- info_genes[params["symbol"]] = params
+ params['previous_symbols'] = prev_symbols[row[1]]
+ info_genes[params['symbol']] = params
# get type
- params["gene_type"] = row[9]
+ params['gene_type'] = row[9]
return info_genes
def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None:
@@ -188,20 +188,20 @@ def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None:
:param info_genes: A dictionary of gene's from the NCBI info file.
"""
for f in db.all_features():
- if f.attributes.get("ID"):
- f_id = f.attributes.get("ID")[0]
- if f_id.startswith("gene"):
- symbol = f.attributes["Name"][0]
+ if f.attributes.get('ID'):
+ f_id = f.attributes.get('ID')[0]
+ if f_id.startswith('gene'):
+ symbol = f.attributes['Name'][0]
if symbol in info_genes:
# Just need to add SequenceLocation
params = info_genes.get(symbol)
vrs_sq_location = self._get_vrs_sq_location(db, params, f_id)
if vrs_sq_location:
- params["locations"].append(vrs_sq_location) # type: ignore
+ params['locations'].append(vrs_sq_location) # type: ignore
else:
# Need to add entire gene
gene = self._add_gff_gene(db, f, f_id)
- info_genes[gene["symbol"]] = gene
+ info_genes[gene['symbol']] = gene
def _add_gff_gene(
self, db: gffutils.FeatureDB, f: gffutils.Feature, f_id: str
@@ -214,14 +214,14 @@ def _add_gff_gene(
:return: A gene dictionary if the ID attribute exists. Else return None.
"""
params = dict()
- params["src_name"] = SourceName.NCBI.value
+ params['src_name'] = SourceName.NCBI.value
self._add_attributes(f, params)
sq_loc = self._get_vrs_sq_location(db, params, f_id)
if sq_loc:
- params["locations"] = [sq_loc]
+ params['locations'] = [sq_loc]
else:
- params["locations"] = list()
- params["label_and_type"] = f"{params['concept_id'].lower()}##identity"
+ params['locations'] = list()
+ params['label_and_type'] = f"{params['concept_id'].lower()}##identity"
return params
def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None:
@@ -230,20 +230,20 @@ def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None:
:param gffutils.feature.Feature f: A gene from the data
:param gene: A transformed gene record
"""
- attributes = ["ID", "Name", "description", "Dbxref"]
+ attributes = ['ID', 'Name', 'description', 'Dbxref']
for attribute in f.attributes.items():
key = attribute[0]
if key in attributes:
val = attribute[1]
- if len(val) == 1 and key != "Dbxref":
+ if len(val) == 1 and key != 'Dbxref':
val = val[0]
- if key == "Dbxref":
+ if key == 'Dbxref':
self._add_xrefs_associated_with(val, gene)
- elif key == "Name":
- gene["symbol"] = val
+ elif key == 'Name':
+ gene['symbol'] = val
def _get_vrs_sq_location(
self, db: gffutils.FeatureDB, params: Dict, f_id: str
@@ -257,7 +257,7 @@ def _get_vrs_sq_location(
:return: A GA4GH VRS SequenceLocation
"""
gene = db[f_id]
- params["strand"] = gene.strand
+ params['strand'] = gene.strand
return self._get_sequence_location(gene.seqid, gene, params)
def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict:
@@ -268,16 +268,16 @@ def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict:
:return: A dict containing an xref or associated_with ref
"""
source = dict()
- if src_name.startswith("HGNC"):
- source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"]
- elif src_name.startswith("NCBI"):
- source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"]
- elif src_name.startswith("UniProt"):
- source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"]
- elif src_name.startswith("miRBase"):
- source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"]
- elif src_name.startswith("RFAM"):
- source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"]
+ if src_name.startswith('HGNC'):
+ source['xrefs'] = [f'{NamespacePrefix.HGNC.value}:{src_id}']
+ elif src_name.startswith('NCBI'):
+ source['xrefs'] = [f'{NamespacePrefix.NCBI.value}:{src_id}']
+ elif src_name.startswith('UniProt'):
+ source['associated_with'] = [f'{NamespacePrefix.UNIPROT.value}:{src_id}']
+ elif src_name.startswith('miRBase'):
+ source['associated_with'] = [f'{NamespacePrefix.MIRBASE.value}:{src_id}']
+ elif src_name.startswith('RFAM'):
+ source['associated_with'] = [f'{NamespacePrefix.RFAM.value}:{src_id}']
return source
def _get_vrs_chr_location(self, row: List[str], params: Dict) -> List:
@@ -288,24 +288,24 @@ def _get_vrs_chr_location(self, row: List[str], params: Dict) -> List:
:param params: A transformed gene record
:return: A list of GA4GH VRS ChromosomeLocations
"""
- params["location_annotations"] = list()
+ params['location_annotations'] = list()
chromosomes_locations = self._set_chromsomes_locations(row, params)
- locations = chromosomes_locations["locations"]
- chromosomes = chromosomes_locations["chromosomes"]
- if chromosomes_locations["exclude"]:
- return ["exclude"]
+ locations = chromosomes_locations['locations']
+ chromosomes = chromosomes_locations['chromosomes']
+ if chromosomes_locations['exclude']:
+ return ['exclude']
location_list = list()
if chromosomes and not locations:
for chromosome in chromosomes:
- if chromosome == "MT":
- params["location_annotations"].append(Chromosome.MITOCHONDRIA.value)
+ if chromosome == 'MT':
+ params['location_annotations'].append(Chromosome.MITOCHONDRIA.value)
else:
- params["location_annotations"].append(chromosome.strip())
+ params['location_annotations'].append(chromosome.strip())
elif locations:
self._add_chromosome_location(locations, location_list, params)
- if not params["location_annotations"]:
- del params["location_annotations"]
+ if not params['location_annotations']:
+ del params['location_annotations']
return location_list
def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict:
@@ -316,29 +316,29 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict:
:return: A dictionary containing a gene's chromosomes and locations
"""
chromosomes = None
- if row[6] != "-":
- if "|" in row[6]:
- chromosomes = row[6].split("|")
+ if row[6] != '-':
+ if '|' in row[6]:
+ chromosomes = row[6].split('|')
else:
chromosomes = [row[6]]
if len(chromosomes) >= 2:
- if chromosomes and "X" not in chromosomes and "Y" not in chromosomes:
+ if chromosomes and 'X' not in chromosomes and 'Y' not in chromosomes:
logger.info(
- f"{row[2]} contains multiple distinct "
- f"chromosomes: {chromosomes}."
+ f'{row[2]} contains multiple distinct '
+ f'chromosomes: {chromosomes}.'
)
chromosomes = None
locations = None
exclude = False
- if row[7] != "-":
- if "|" in row[7]:
- locations = row[7].split("|")
- elif ";" in row[7]:
- locations = row[7].split(";")
- elif "and" in row[7]:
- locations = row[7].split("and")
+ if row[7] != '-':
+ if '|' in row[7]:
+ locations = row[7].split('|')
+ elif ';' in row[7]:
+ locations = row[7].split(';')
+ elif 'and' in row[7]:
+ locations = row[7].split('and')
else:
locations = [row[7]]
@@ -351,7 +351,7 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict:
# i.e. OMS: '10q26.3', '19q13.42-q13.43', '3p25.3'
if len(locations) > 2:
logger.info(
- f"{row[2]} contains multiple distinct " f"locations: {locations}."
+ f'{row[2]} contains multiple distinct ' f'locations: {locations}.'
)
locations = None
exclude = True
@@ -360,13 +360,13 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict:
if locations:
for i in range(len(locations)):
loc = locations[i].strip()
- if not re.match("^([1-9][0-9]?|X[pq]?|Y[pq]?)", loc):
+ if not re.match('^([1-9][0-9]?|X[pq]?|Y[pq]?)', loc):
logger.info(
- f"{row[2]} contains invalid map location:" f"{loc}."
+ f'{row[2]} contains invalid map location:' f'{loc}.'
)
- params["location_annotations"].append(loc)
+ params['location_annotations'].append(loc)
del locations[i]
- return {"locations": locations, "chromosomes": chromosomes, "exclude": exclude}
+ return {'locations': locations, 'chromosomes': chromosomes, 'exclude': exclude}
def _add_chromosome_location(
self, locations: List, location_list: List, params: Dict
@@ -382,42 +382,42 @@ def _add_chromosome_location(
location = dict()
if Annotation.ALT_LOC.value in loc:
- loc = loc.split(f"{Annotation.ALT_LOC.value}")[0].strip()
- params["location_annotations"].append(Annotation.ALT_LOC.value)
+ loc = loc.split(f'{Annotation.ALT_LOC.value}')[0].strip()
+ params['location_annotations'].append(Annotation.ALT_LOC.value)
contains_centromere = False
- if "cen" in loc:
+ if 'cen' in loc:
contains_centromere = True
- arm_match = re.search("[pq]", loc)
+ arm_match = re.search('[pq]', loc)
if arm_match and not contains_centromere:
arm_ix = arm_match.start()
chromosome = loc[:arm_ix].strip()
# NCBI sometimes stores invalid map locations
# i.e. 7637 stores 'map from Rosati ref via FISH [AFS]'
- if not re.match("^([1-9][0-9]?|X|Y|MT)$", chromosome):
+ if not re.match('^([1-9][0-9]?|X|Y|MT)$', chromosome):
continue
- location["chr"] = chromosome
+ location['chr'] = chromosome
# Check to see if there is a band / sub band included
if arm_ix != len(loc) - 1:
- if "-" in loc:
+ if '-' in loc:
self._set_cl_interval_range(loc, arm_ix, location)
else:
# Location only gives start
start = loc[arm_ix:]
- location["start"] = start
- location["end"] = start
+ location['start'] = start
+ location['end'] = start
else:
# Only arm is included
- location["start"] = loc[arm_ix]
- location["end"] = loc[arm_ix]
+ location['start'] = loc[arm_ix]
+ location['end'] = loc[arm_ix]
elif contains_centromere:
self._set_centromere_location(loc, location)
else:
# Location only gives chr
- params["location_annotations"].append(loc)
+ params['location_annotations'].append(loc)
# chr_location = self._get_chromosome_location(location, params)
# if chr_location:
@@ -429,36 +429,36 @@ def _set_centromere_location(self, loc: str, location: Dict) -> None:
:param loc: A gene location
:param location: GA4GH location
"""
- centromere_ix = re.search("cen", loc).start() # type: ignore
- if "-" in loc:
+ centromere_ix = re.search('cen', loc).start() # type: ignore
+ if '-' in loc:
# Location gives both start and end
- range_ix = re.search("-", loc).start() # type: ignore
- if "q" in loc:
- location["chr"] = loc[:centromere_ix].strip()
- location["start"] = "cen"
- location["end"] = loc[range_ix + 1 :]
- elif "p" in loc:
- p_ix = re.search("p", loc).start() # type: ignore
- location["chr"] = loc[:p_ix].strip()
- location["end"] = "cen"
- location["start"] = loc[:range_ix]
+ range_ix = re.search('-', loc).start() # type: ignore
+ if 'q' in loc:
+ location['chr'] = loc[:centromere_ix].strip()
+ location['start'] = 'cen'
+ location['end'] = loc[range_ix + 1 :]
+ elif 'p' in loc:
+ p_ix = re.search('p', loc).start() # type: ignore
+ location['chr'] = loc[:p_ix].strip()
+ location['end'] = 'cen'
+ location['start'] = loc[:range_ix]
else:
- location["chr"] = loc[:centromere_ix].strip()
- location["start"] = "cen"
- location["end"] = "cen"
+ location['chr'] = loc[:centromere_ix].strip()
+ location['start'] = 'cen'
+ location['end'] = 'cen'
def _transform_data(self) -> None:
"""Modify data and pass to loading functions."""
- logger.info("Transforming NCBI...")
+ logger.info('Transforming NCBI...')
prev_symbols = self._get_prev_symbols()
info_genes = self._get_gene_info(prev_symbols)
# create db for gff file
db = gffutils.create_db(
str(self._gff_src),
- dbfn=":memory:",
+ dbfn=':memory:',
force=True,
- merge_strategy="create_unique",
+ merge_strategy='create_unique',
keep_order=True,
)
@@ -466,7 +466,7 @@ def _transform_data(self) -> None:
for gene in info_genes.keys():
self._load_gene(info_genes[gene])
- logger.info("Successfully transformed NCBI.")
+ logger.info('Successfully transformed NCBI.')
def _add_meta(self) -> None:
"""Add Ensembl metadata.
@@ -483,22 +483,22 @@ def _add_meta(self) -> None:
]
):
raise GeneNormalizerEtlError(
- "Source metadata unavailable -- was data properly acquired before attempting to load DB?"
+ 'Source metadata unavailable -- was data properly acquired before attempting to load DB?'
)
metadata = SourceMeta(
- data_license="custom",
- data_license_url="https://www.ncbi.nlm.nih.gov/home/about/policies/",
+ data_license='custom',
+ data_license_url='https://www.ncbi.nlm.nih.gov/home/about/policies/',
version=self._version,
data_url={
- "info_file": self._gene_url,
- "history_file": self._history_url,
- "assembly_file": self._assembly_url,
+ 'info_file': self._gene_url,
+ 'history_file': self._history_url,
+ 'assembly_file': self._assembly_url,
},
- rdp_url="https://reusabledata.org/ncbi-gene.html",
+ rdp_url='https://reusabledata.org/ncbi-gene.html',
data_license_attributes={
- "non_commercial": False,
- "share_alike": False,
- "attribution": False,
+ 'non_commercial': False,
+ 'share_alike': False,
+ 'attribution': False,
},
genome_assemblies=[self._assembly],
)
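The NCBI transform above reads the tab-separated `gene_info` file with `csv.reader`. A short sketch of that parse, assuming an uncompressed local copy of `Homo_sapiens.gene_info`; the path is a placeholder, the column positions are taken from the hunks above, and the `ncbigene` prefix stands in for `NamespacePrefix.NCBI.value`:

```python
import csv

with open('Homo_sapiens.gene_info') as info_file:  # placeholder local path
    info = csv.reader(info_file, delimiter='\t')
    next(info)  # skip the header row
    for row in info:
        params = {
            'concept_id': f'ncbigene:{row[1]}',
            'symbol': row[2],
            'aliases': row[4].split('|') if row[4] != '-' else [],
            'gene_type': row[9],
        }
        print(params)
        break  # just show the first record
```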
diff --git a/src/gene/main.py b/src/gene/main.py
index e6c87223..31db7076 100644
--- a/src/gene/main.py
+++ b/src/gene/main.py
@@ -21,27 +21,27 @@
"""
app = FastAPI(
- title="Gene Normalizer",
+ title='Gene Normalizer',
description=description,
version=__version__,
contact={
- "name": "Alex H. Wagner",
- "email": "Alex.Wagner@nationwidechildrens.org",
- "url": "https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab", # noqa: E501
+ 'name': 'Alex H. Wagner',
+ 'email': 'Alex.Wagner@nationwidechildrens.org',
+ 'url': 'https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab',
},
license={
- "name": "MIT",
- "url": "https://github.com/cancervariants/gene-normalization/blob/main/LICENSE",
+ 'name': 'MIT',
+ 'url': 'https://github.com/cancervariants/gene-normalization/blob/main/LICENSE',
},
- docs_url="/gene",
- openapi_url="/gene/openapi.json",
- swagger_ui_parameters={"tryItOutEnabled": True},
+ docs_url='/gene',
+ openapi_url='/gene/openapi.json',
+ swagger_ui_parameters={'tryItOutEnabled': True},
)
-read_query_summary = "Given query, provide best-matching source records."
-response_description = "A response to a validly-formed query"
-q_descr = "Gene to normalize."
+read_query_summary = 'Given query, provide best-matching source records.'
+response_description = 'A response to a validly-formed query'
+q_descr = 'Gene to normalize.'
incl_descr = """Optional. Comma-separated list of source names to include in
response. Will exclude all other sources. Returns HTTP status code
422: Unprocessable Entity if both 'incl' and 'excl' parameters
@@ -51,21 +51,21 @@
code 422: Unprocessable Entity if both 'incl' and 'excl'
parameters are given."""
search_description = (
- "For each source, return strongest-match concepts "
- "for query string provided by user"
+ 'For each source, return strongest-match concepts '
+ 'for query string provided by user'
)
@app.get(
- "/gene/search",
+ '/gene/search',
summary=read_query_summary,
response_description=response_description,
response_model=SearchService,
description=search_description,
- tags=["Query"],
+ tags=['Query'],
)
def search(
- q: str = Query(..., description=q_descr), # noqa: D103
+ q: str = Query(..., description=q_descr),
incl: Optional[str] = Query(None, description=incl_descr),
excl: Optional[str] = Query(None, description=excl_descr),
) -> SearchService:
@@ -87,20 +87,20 @@ def search(
return resp
-normalize_summary = "Given query, provide merged normalized record."
-normalize_response_descr = "A response to a validly-formed query."
-normalize_descr = "Return merged highest-match concept for query."
-normalize_q_descr = "Gene to normalize."
+normalize_summary = 'Given query, provide merged normalized record.'
+normalize_response_descr = 'A response to a validly-formed query.'
+normalize_descr = 'Return merged highest-match concept for query.'
+normalize_q_descr = 'Gene to normalize.'
@app.get(
- "/gene/normalize",
+ '/gene/normalize',
summary=normalize_summary,
response_description=normalize_response_descr,
response_model=NormalizeService,
response_model_exclude_none=True,
description=normalize_descr,
- tags=["Query"],
+ tags=['Query'],
)
def normalize(q: str = Query(..., description=normalize_q_descr)) -> NormalizeService:
"""Return strongest match concepts to query string provided by user.
@@ -113,29 +113,29 @@ def normalize(q: str = Query(..., description=normalize_q_descr)) -> NormalizeSe
unmerged_matches_summary = (
- "Given query, provide source records corresponding to " "normalized concept."
+ 'Given query, provide source records corresponding to ' 'normalized concept.'
)
unmerged_response_descr = (
- "Response containing source records contained within " "normalized concept."
+ 'Response containing source records contained within ' 'normalized concept.'
)
unmerged_normalize_description = (
- "Return unmerged records associated with the "
- "normalized result of the user-provided query "
- "string."
+ 'Return unmerged records associated with the '
+ 'normalized result of the user-provided query '
+ 'string.'
)
@app.get(
- "/gene/normalize_unmerged",
+ '/gene/normalize_unmerged',
summary=unmerged_matches_summary,
- operation_id="getUnmergedRecords",
+ operation_id='getUnmergedRecords',
response_description=unmerged_response_descr,
response_model=UnmergedNormalizationService,
description=unmerged_normalize_description,
- tags=["Query"],
+ tags=['Query'],
)
def normalize_unmerged(
- q: str = Query(..., description=normalize_q_descr)
+ q: str = Query(..., description=normalize_q_descr),
) -> UnmergedNormalizationService:
"""Return all individual records associated with a normalized concept.
diff --git a/src/gene/query.py b/src/gene/query.py
index e30a79d8..59a5bd5b 100644
--- a/src/gene/query.py
+++ b/src/gene/query.py
@@ -28,7 +28,7 @@
)
from gene.version import __version__
-NormService = TypeVar("NormService", bound=BaseNormalizationService)
+NormService = TypeVar('NormService', bound=BaseNormalizationService)
class InvalidParameterException(Exception): # noqa: N818
@@ -65,15 +65,15 @@ def _emit_warnings(query_str: str) -> List:
:return: List of warnings
"""
warnings = []
- nbsp = re.search("\xa0| ", query_str)
+ nbsp = re.search('\xa0| ', query_str)
if nbsp:
warnings = [
{
- "non_breaking_space_characters": "Query contains non-breaking space characters"
+ 'non_breaking_space_characters': 'Query contains non-breaking space characters'
}
]
logger.warning(
- f"Query ({query_str}) contains non-breaking space characters."
+ f'Query ({query_str}) contains non-breaking space characters.'
)
return warnings
@@ -84,12 +84,12 @@ def _transform_sequence_location(loc: Dict) -> models.SequenceLocation:
:param loc: GeneSequenceLocation represented as a dict
:return: VRS sequence location
"""
- refget_ac = loc["sequence_id"].split("ga4gh:")[-1]
+ refget_ac = loc['sequence_id'].split('ga4gh:')[-1]
return models.SequenceLocation(
sequenceReference=models.SequenceReference(refgetAccession=refget_ac),
- start=int(loc["start"]),
- end=int(loc["end"]),
+ start=int(loc['start']),
+ end=int(loc['end']),
)
# @staticmethod
@@ -128,11 +128,11 @@ def _transform_locations(self, record: Dict) -> Dict:
:return: record with transformed locations attributes, if applicable
"""
record_locations = list()
- if "locations" in record:
- for loc in record["locations"]:
- if loc["type"] == "SequenceLocation":
+ if 'locations' in record:
+ for loc in record['locations']:
+ if loc['type'] == 'SequenceLocation':
record_locations.append(self._transform_location(loc))
- record["locations"] = record_locations
+ record['locations'] = record_locations
return record
def _get_src_name(self, concept_id: str) -> SourceName:
@@ -149,7 +149,7 @@ def _get_src_name(self, concept_id: str) -> SourceName:
elif concept_id.startswith(NamespacePrefix.HGNC.value):
return SourceName.HGNC
else:
- raise ValueError("Invalid or unrecognized concept ID provided")
+ raise ValueError('Invalid or unrecognized concept ID provided')
def _add_record(
self, response: Dict[str, Dict], item: Dict, match_type: MatchType
@@ -161,20 +161,20 @@ def _add_record(
:param match_type: match type for query
"""
item = self._transform_locations(item)
- item["match_type"] = match_type
+ item['match_type'] = match_type
gene = Gene(**item)
- src_name = item["src_name"]
+ src_name = item['src_name']
- matches = response["source_matches"]
+ matches = response['source_matches']
if src_name not in matches.keys():
pass
elif matches[src_name] is None:
matches[src_name] = {
- "records": [gene],
- "source_meta_": self.db.get_source_metadata(src_name),
+ 'records': [gene],
+ 'source_meta_': self.db.get_source_metadata(src_name),
}
else:
- matches[src_name]["records"].append(gene)
+ matches[src_name]['records'].append(gene)
def _fetch_record(
self, response: Dict[str, Dict], concept_id: str, match_type: MatchType
@@ -189,15 +189,15 @@ def _fetch_record(
match = self.db.get_record_by_id(concept_id, case_sensitive=False)
except DatabaseReadException as e:
logger.error(
- f"Encountered DatabaseReadException looking up {concept_id}: {e}"
+ f'Encountered DatabaseReadException looking up {concept_id}: {e}'
)
else:
if match:
self._add_record(response, match, match_type)
else:
logger.error(
- f"Unable to find expected record for {concept_id} matching as {match_type}"
- ) # noqa: E501
+ f'Unable to find expected record for {concept_id} matching as {match_type}'
+ )
def _post_process_resp(self, resp: Dict) -> Dict:
"""Fill all empty source_matches slots with NO_MATCH results and
@@ -207,15 +207,15 @@ def _post_process_resp(self, resp: Dict) -> Dict:
:return: response object with empty source slots filled with NO_MATCH results
and corresponding source metadata
"""
- for src_name in resp["source_matches"].keys():
- if resp["source_matches"][src_name] is None:
- resp["source_matches"][src_name] = {
- "match_type": MatchType.NO_MATCH,
- "records": [],
- "source_meta_": self.db.get_source_metadata(src_name),
+ for src_name in resp['source_matches'].keys():
+ if resp['source_matches'][src_name] is None:
+ resp['source_matches'][src_name] = {
+ 'match_type': MatchType.NO_MATCH,
+ 'records': [],
+ 'source_meta_': self.db.get_source_metadata(src_name),
}
else:
- records = resp["source_matches"][src_name]["records"]
+ records = resp['source_matches'][src_name]['records']
if len(records) > 1:
records = sorted(records, key=lambda k: k.match_type, reverse=True)
return resp
@@ -229,11 +229,11 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict:
:return: completed response object to return to client
"""
resp = {
- "query": query,
- "warnings": self._emit_warnings(query),
- "source_matches": {source: None for source in sources},
+ 'query': query,
+ 'warnings': self._emit_warnings(query),
+ 'source_matches': {source: None for source in sources},
}
- if query == "":
+ if query == '':
return self._post_process_resp(resp)
query_l = query.lower()
@@ -242,7 +242,7 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict:
queries.append((query_l, RecordType.IDENTITY.value))
for prefix in [p for p in NAMESPACE_LOOKUP.keys() if query_l.startswith(p)]:
- term = f"{NAMESPACE_LOOKUP[prefix].lower()}:{query_l}"
+ term = f'{NAMESPACE_LOOKUP[prefix].lower()}:{query_l}'
queries.append((term, RecordType.IDENTITY.value))
for match in ITEM_TYPES.values():
@@ -253,7 +253,7 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict:
try:
if item_type == RecordType.IDENTITY.value:
record = self.db.get_record_by_id(term, False)
- if record and record["concept_id"] not in matched_concept_ids:
+ if record and record['concept_id'] not in matched_concept_ids:
self._add_record(resp, record, MatchType.CONCEPT_ID)
else:
refs = self.db.get_refs_by_type(term, RefType(item_type))
@@ -264,8 +264,8 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict:
except DatabaseReadException as e:
logger.error(
- f"Encountered DatabaseReadException looking up {item_type}"
- f" {term}: {e}"
+ f'Encountered DatabaseReadException looking up {item_type}'
+ f' {term}: {e}'
)
continue
@@ -283,8 +283,8 @@ def _get_service_meta() -> ServiceMeta:
def search(
self,
query_str: str,
- incl: str = "",
- excl: str = "",
+ incl: str = '',
+ excl: str = '',
**params,
) -> SearchService:
"""Return highest match for each source.
@@ -316,10 +316,10 @@ def search(
if not incl and not excl:
query_sources = set(sources.values())
elif incl and excl:
- detail = "Cannot request both source inclusions and exclusions."
+ detail = 'Cannot request both source inclusions and exclusions.'
raise InvalidParameterException(detail)
elif incl:
- req_sources = [n.strip() for n in incl.split(",")]
+ req_sources = [n.strip() for n in incl.split(',')]
invalid_sources = []
query_sources = set()
for source in req_sources:
@@ -328,10 +328,10 @@ def search(
else:
invalid_sources.append(source)
if invalid_sources:
- detail = f"Invalid source name(s): {invalid_sources}"
+ detail = f'Invalid source name(s): {invalid_sources}'
raise InvalidParameterException(detail)
else:
- req_exclusions = [n.strip() for n in excl.lower().split(",")]
+ req_exclusions = [n.strip() for n in excl.lower().split(',')]
req_excl_dict = {r.lower(): r for r in req_exclusions}
invalid_sources = []
query_sources = set()
@@ -342,14 +342,14 @@ def search(
if src_l not in req_excl_dict.keys():
query_sources.add(src)
if invalid_sources:
- detail = f"Invalid source name(s): {invalid_sources}"
+ detail = f'Invalid source name(s): {invalid_sources}'
raise InvalidParameterException(detail)
query_str = query_str.strip()
resp = self._get_search_response(query_str, query_sources)
- resp["service_meta_"] = self._get_service_meta()
+ resp['service_meta_'] = self._get_service_meta()
return SearchService(**resp)
def _add_merged_meta(self, response: NormalizeService) -> NormalizeService:
@@ -360,7 +360,7 @@ def _add_merged_meta(self, response: NormalizeService) -> NormalizeService:
"""
sources_meta = {}
gene = response.gene
- sources = [response.normalized_id.split(":")[0]]
+ sources = [response.normalized_id.split(':')[0]]
if gene.mappings:
sources += [m.coding.system for m in gene.mappings]
@@ -391,13 +391,13 @@ def _add_alt_matches(
for concept_id in possible_concepts:
r = self.db.get_record_by_id(concept_id, True)
if r:
- merge_ref = r.get("merge_ref")
+ merge_ref = r.get('merge_ref')
if merge_ref:
norm_concepts.add(merge_ref)
- norm_concepts = norm_concepts - {record["concept_id"]}
+ norm_concepts = norm_concepts - {record['concept_id']}
if norm_concepts:
response.warnings.append(
- {"multiple_normalized_concepts_found": list(norm_concepts)}
+ {'multiple_normalized_concepts_found': list(norm_concepts)}
)
return response
@@ -418,14 +418,14 @@ def _add_gene(
"""
gene_obj = core_models.Gene(
id=f"normalize.gene.{record['concept_id']}",
- label=record["symbol"],
+ label=record['symbol'],
)
# mappings
- source_ids = record.get("xrefs", []) + record.get("associated_with", [])
+ source_ids = record.get('xrefs', []) + record.get('associated_with', [])
mappings = []
for source_id in source_ids:
- system, code = source_id.split(":")
+ system, code = source_id.split(':')
mappings.append(
core_models.Mapping(
coding=core_models.Coding(
@@ -439,7 +439,7 @@ def _add_gene(
# aliases
aliases = set()
- for key in ["previous_symbols", "aliases"]:
+ for key in ['previous_symbols', 'aliases']:
if key in record and record[key]:
val = record[key]
if isinstance(val, str):
@@ -451,11 +451,11 @@ def _add_gene(
# extensions
extensions = []
extension_and_record_labels = [
- ("symbol_status", "symbol_status"),
- ("approved_name", "label"),
- ("previous_symbols", "previous_symbols"),
- ("location_annotations", "location_annotations"),
- ("strand", "strand"),
+ ('symbol_status', 'symbol_status'),
+ ('approved_name', 'label'),
+ ('previous_symbols', 'previous_symbols'),
+ ('location_annotations', 'location_annotations'),
+ ('strand', 'strand'),
]
for ext_label, record_label in extension_and_record_labels:
if record_label in record and record[record_label]:
@@ -464,19 +464,19 @@ def _add_gene(
)
record_locations = {}
- if record["item_type"] == RecordType.IDENTITY:
- locs = record.get("locations")
+ if record['item_type'] == RecordType.IDENTITY:
+ locs = record.get('locations')
if locs:
record_locations[f"{record['src_name'].lower()}_locations"] = locs
- elif record["item_type"] == RecordType.MERGER:
+ elif record['item_type'] == RecordType.MERGER:
for k, v in record.items():
- if k.endswith("locations") and v:
+ if k.endswith('locations') and v:
record_locations[k] = v
for loc_name, locations in record_locations.items():
transformed_locs = []
for loc in locations:
- if loc["type"] == "SequenceLocation":
+ if loc['type'] == 'SequenceLocation':
transformed_locs.append(self._transform_location(loc))
if transformed_locs:
@@ -485,12 +485,12 @@ def _add_gene(
)
# handle gene types separately because they're wonky
- if record["item_type"] == RecordType.IDENTITY:
- gene_type = record.get("gene_type")
+ if record['item_type'] == RecordType.IDENTITY:
+ gene_type = record.get('gene_type')
if gene_type:
extensions.append(
core_models.Extension(
- name=GeneTypeFieldName[record["src_name"].upper()].value,
+ name=GeneTypeFieldName[record['src_name'].upper()].value,
value=gene_type,
)
)
@@ -509,7 +509,7 @@ def _add_gene(
if possible_concepts:
response = self._add_alt_matches(response, record, possible_concepts)
- response.normalized_id = record["concept_id"]
+ response.normalized_id = record['concept_id']
response.gene = gene_obj
response = self._add_merged_meta(response)
response.match_type = match_type
@@ -522,9 +522,9 @@ def _record_order(record: Dict) -> Tuple[int, str]:
:param record: individual record item in iterable to sort
:return: tuple with rank value and concept ID
"""
- src = record["src_name"].upper()
+ src = record['src_name'].upper()
source_rank = SourcePriority[src]
- return source_rank, record["concept_id"]
+ return source_rank, record['concept_id']
@staticmethod
def _handle_failed_merge_ref(record: Dict, response: Dict, query: str) -> Dict:
@@ -539,7 +539,7 @@ def _handle_failed_merge_ref(record: Dict, response: Dict, query: str) -> Dict:
f"Merge ref lookup failed for ref {record['merge_ref']} "
f"in record {record['concept_id']} from query {query}"
)
- response["match_type"] = MatchType.NO_MATCH
+ response['match_type'] = MatchType.NO_MATCH
return response
def _prepare_normalized_response(self, query: str) -> Dict[str, Any]:
@@ -549,10 +549,10 @@ def _prepare_normalized_response(self, query: str) -> Dict[str, Any]:
:return: basic normalization response boilerplate
"""
return {
- "query": query,
- "match_type": MatchType.NO_MATCH,
- "warnings": self._emit_warnings(query),
- "service_meta_": ServiceMeta(
+ 'query': query,
+ 'match_type': MatchType.NO_MATCH,
+ 'warnings': self._emit_warnings(query),
+ 'service_meta_': ServiceMeta(
version=__version__, response_datetime=str(datetime.now())
),
}
@@ -594,7 +594,7 @@ def _resolve_merge(
:param possible_concepts: alternate possible matches
:return: Normalized response object
"""
- merge_ref = record.get("merge_ref")
+ merge_ref = record.get('merge_ref')
if merge_ref:
# follow merge_ref
merge = self.db.get_record_by_id(merge_ref, False, True)
@@ -621,7 +621,7 @@ def _perform_normalized_lookup(
:param response_builder: response constructor callback method
:return: completed service response object
"""
- if query == "":
+ if query == '':
return response
query_str = query.lower().strip()
@@ -653,7 +653,7 @@ def _perform_normalized_lookup(
# attempt merge ref resolution until successful
for match in matching_records:
assert match is not None
- record = self.db.get_record_by_id(match["concept_id"], False)
+ record = self.db.get_record_by_id(match['concept_id'], False)
if record:
match_type_value = MatchType[match_type.value.upper()]
return self._resolve_merge(
@@ -682,23 +682,23 @@ def _add_normalized_records(
:return: Completed response object
"""
response.match_type = match_type
- response.normalized_concept_id = normalized_record["concept_id"]
- if normalized_record["item_type"] == RecordType.IDENTITY:
- record_source = SourceName[normalized_record["src_name"].upper()]
+ response.normalized_concept_id = normalized_record['concept_id']
+ if normalized_record['item_type'] == RecordType.IDENTITY:
+ record_source = SourceName[normalized_record['src_name'].upper()]
meta = self.db.get_source_metadata(record_source.value)
response.source_matches[record_source] = MatchesNormalized(
records=[BaseGene(**self._transform_locations(normalized_record))],
source_meta_=meta, # type: ignore
)
else:
- concept_ids = [normalized_record["concept_id"]] + normalized_record.get(
- "xrefs", []
+ concept_ids = [normalized_record['concept_id']] + normalized_record.get(
+ 'xrefs', []
)
for concept_id in concept_ids:
record = self.db.get_record_by_id(concept_id, case_sensitive=False)
if not record:
continue
- record_source = SourceName[record["src_name"].upper()]
+ record_source = SourceName[record['src_name'].upper()]
gene = BaseGene(**self._transform_locations(record))
if record_source in response.source_matches:
response.source_matches[record_source].records.append(gene)
diff --git a/src/gene/schemas.py b/src/gene/schemas.py
index 6f85b1bc..602c9abb 100644
--- a/src/gene/schemas.py
+++ b/src/gene/schemas.py
@@ -15,22 +15,22 @@
from gene.version import __version__
-CURIE = constr(pattern=r"^\w[^:]*:.+$")
+CURIE = constr(pattern=r'^\w[^:]*:.+$')
class SymbolStatus(str, Enum):
"""Define string constraints for symbol status attribute."""
- WITHDRAWN = "withdrawn"
- APPROVED = "approved"
- DISCONTINUED = "discontinued"
+ WITHDRAWN = 'withdrawn'
+ APPROVED = 'approved'
+ DISCONTINUED = 'discontinued'
class Strand(str, Enum):
"""Define string constraints for strand attribute."""
- FORWARD = "+"
- REVERSE = "-"
+ FORWARD = '+'
+ REVERSE = '-'
class Annotation(str, Enum):
@@ -38,16 +38,16 @@ class Annotation(str, Enum):
is absent.
"""
- NOT_FOUND_ON_REFERENCE = "not on reference assembly"
- UNPLACED = "unplaced"
- RESERVED = "reserved"
- ALT_LOC = "alternate reference locus"
+ NOT_FOUND_ON_REFERENCE = 'not on reference assembly'
+ UNPLACED = 'unplaced'
+ RESERVED = 'reserved'
+ ALT_LOC = 'alternate reference locus'
class Chromosome(str, Enum):
"""Define string constraints for chromosomes."""
- MITOCHONDRIA = "MT"
+ MITOCHONDRIA = 'MT'
class MatchType(IntEnum):
@@ -66,10 +66,10 @@ class MatchType(IntEnum):
class GeneSequenceLocation(BaseModel):
"""Sequence Location model when storing in DynamoDB."""
- type: Literal["SequenceLocation"] = "SequenceLocation"
+ type: Literal['SequenceLocation'] = 'SequenceLocation'
start: StrictInt
end: StrictInt
- sequence_id: constr(pattern=r"^ga4gh:SQ.[0-9A-Za-z_\-]{32}$") # noqa: F722
+ sequence_id: constr(pattern=r'^ga4gh:SQ.[0-9A-Za-z_\-]{32}$') # noqa: F722
# class GeneChromosomeLocation(BaseModel):
@@ -112,20 +112,20 @@ class Gene(BaseGene):
model_config = ConfigDict(
json_schema_extra={
- "example": {
- "label": None,
- "concept_id": "ensembl:ENSG00000157764",
- "symbol": "BRAF",
- "previous_symbols": [],
- "aliases": [],
- "xrefs": [],
- "symbol_status": None,
- "strand": "-",
- "locations": [],
- "location_annotations": [],
- "associated_with": [],
- "gene_type": None,
- "match_type": 100,
+ 'example': {
+ 'label': None,
+ 'concept_id': 'ensembl:ENSG00000157764',
+ 'symbol': 'BRAF',
+ 'previous_symbols': [],
+ 'aliases': [],
+ 'xrefs': [],
+ 'symbol_status': None,
+ 'strand': '-',
+ 'locations': [],
+ 'location_annotations': [],
+ 'associated_with': [],
+ 'gene_type': None,
+ 'match_type': 100,
}
}
)
@@ -142,9 +142,9 @@ class GeneGroup(Gene):
class SourceName(Enum):
"""Define string constraints to ensure consistent capitalization."""
- HGNC = "HGNC"
- ENSEMBL = "Ensembl"
- NCBI = "NCBI"
+ HGNC = 'HGNC'
+ ENSEMBL = 'Ensembl'
+ NCBI = 'NCBI'
class SourcePriority(IntEnum):
@@ -158,42 +158,42 @@ class SourcePriority(IntEnum):
class SourceIDAfterNamespace(Enum):
"""Define string constraints after namespace."""
- HGNC = ""
- ENSEMBL = "ENSG"
- NCBI = ""
+ HGNC = ''
+ ENSEMBL = 'ENSG'
+ NCBI = ''
class NamespacePrefix(Enum):
"""Define string constraints for namespace prefixes on concept IDs."""
- HGNC = "hgnc"
- ENSEMBL = "ensembl"
- NCBI = "ncbigene"
+ HGNC = 'hgnc'
+ ENSEMBL = 'ensembl'
+ NCBI = 'ncbigene'
ENTREZ = NCBI
- VEGA = "vega"
- UCSC = "ucsc"
- ENA = "ena.embl"
- REFSEQ = "refseq"
- CCDS = "ccds"
- UNIPROT = "uniprot"
- PUBMED = "pubmed"
- COSMIC = "cosmic"
- OMIM = "omim"
- MIRBASE = "mirbase"
- HOMEODB = "homeodb"
- SNORNABASE = "snornabase"
- ORPHANET = "orphanet"
- PSEUDOGENE = "pseudogene.org"
- HORDE = "hordedb"
- MEROPS = "merops"
- IUPHAR = "iuphar"
- KZNF = "knzfgc"
- MAMIT = "mamittrnadb"
- CD = "hcdmdb"
- LNCRNADB = "lncrnadb"
- IMGT = "imgt" # .hla? .ligm? leave as is?
- IMGT_GENE_DB = "imgt/gene-db" # redundant w/ above?
- RFAM = "rfam"
+ VEGA = 'vega'
+ UCSC = 'ucsc'
+ ENA = 'ena.embl'
+ REFSEQ = 'refseq'
+ CCDS = 'ccds'
+ UNIPROT = 'uniprot'
+ PUBMED = 'pubmed'
+ COSMIC = 'cosmic'
+ OMIM = 'omim'
+ MIRBASE = 'mirbase'
+ HOMEODB = 'homeodb'
+ SNORNABASE = 'snornabase'
+ ORPHANET = 'orphanet'
+ PSEUDOGENE = 'pseudogene.org'
+ HORDE = 'hordedb'
+ MEROPS = 'merops'
+ IUPHAR = 'iuphar'
+ KZNF = 'knzfgc'
+ MAMIT = 'mamittrnadb'
+ CD = 'hcdmdb'
+ LNCRNADB = 'lncrnadb'
+ IMGT = 'imgt' # .hla? .ligm? leave as is?
+ IMGT_GENE_DB = 'imgt/gene-db' # redundant w/ above?
+ RFAM = 'rfam'
class DataLicenseAttributes(BaseModel):
@@ -207,19 +207,19 @@ class DataLicenseAttributes(BaseModel):
class RecordType(str, Enum):
"""Record item types."""
- IDENTITY = "identity"
- MERGER = "merger"
+ IDENTITY = 'identity'
+ MERGER = 'merger'
class RefType(str, Enum):
"""Reference item types."""
# Must be in descending MatchType order.
- SYMBOL = "symbol"
- PREVIOUS_SYMBOLS = "prev_symbol"
- ALIASES = "alias"
- XREFS = "xref"
- ASSOCIATED_WITH = "associated_with"
+ SYMBOL = 'symbol'
+ PREVIOUS_SYMBOLS = 'prev_symbol'
+ ALIASES = 'alias'
+ XREFS = 'xref'
+ ASSOCIATED_WITH = 'associated_with'
class SourceMeta(BaseModel):
@@ -235,22 +235,22 @@ class SourceMeta(BaseModel):
model_config = ConfigDict(
json_schema_extra={
- "example": {
- "data_license": "custom",
- "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/",
- "version": "20201215",
- "data_url": {
- "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz",
- "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz",
- "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/",
+ 'example': {
+ 'data_license': 'custom',
+ 'data_license_url': 'https://www.ncbi.nlm.nih.gov/home/about/policies/',
+ 'version': '20201215',
+ 'data_url': {
+ 'info_file': 'ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz',
+ 'history_file': 'ftp.ncbi.nlm.nih.gov/gene/DATA/gene_history.gz',
+ 'assembly_file': 'ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/',
},
- "rdp_url": "https://reusabledata.org/ncbi-gene.html",
- "data_license_attributes": {
- "non_commercial": False,
- "share_alike": False,
- "attribution": False,
+ 'rdp_url': 'https://reusabledata.org/ncbi-gene.html',
+ 'data_license_attributes': {
+ 'non_commercial': False,
+ 'share_alike': False,
+ 'attribution': False,
},
- "genome_assemblies": [],
+ 'genome_assemblies': [],
}
}
)
@@ -262,26 +262,26 @@ class SourceSearchMatches(BaseModel):
records: List[Gene] = []
source_meta_: SourceMeta
- model_config = ConfigDict(json_schema_extra={"example": {}}) # TODO
+ model_config = ConfigDict(json_schema_extra={'example': {}}) # TODO
class ServiceMeta(BaseModel):
"""Metadata regarding the gene-normalization service."""
- name: Literal["gene-normalizer"] = "gene-normalizer"
+ name: Literal['gene-normalizer'] = 'gene-normalizer'
version: StrictStr
response_datetime: StrictStr
url: Literal[
- "https://github.com/cancervariants/gene-normalization"
- ] = "https://github.com/cancervariants/gene-normalization" # noqa: E501
+ 'https://github.com/cancervariants/gene-normalization'
+ ] = 'https://github.com/cancervariants/gene-normalization'
model_config = ConfigDict(
json_schema_extra={
- "example": {
- "name": "gene-normalizer",
- "version": __version__,
- "response_datetime": "2022-03-23 15:57:14.180908",
- "url": "https://github.com/cancervariants/gene-normalization",
+ 'example': {
+ 'name': 'gene-normalizer',
+ 'version': __version__,
+ 'response_datetime': '2022-03-23 15:57:14.180908',
+ 'url': 'https://github.com/cancervariants/gene-normalization',
}
}
)
@@ -303,9 +303,9 @@ class GeneTypeFieldName(str, Enum):
internal records.
"""
- HGNC = "hgnc_locus_type"
- NCBI = "ncbi_gene_type"
- ENSEMBL = "ensembl_biotype"
+ HGNC = 'hgnc_locus_type'
+ NCBI = 'ncbi_gene_type'
+ ENSEMBL = 'ensembl_biotype'
class BaseNormalizationService(BaseModel):
@@ -326,93 +326,93 @@ class NormalizeService(BaseNormalizationService):
model_config = ConfigDict(
json_schema_extra={
- "example": {
- "query": "BRAF",
- "warnings": [],
- "match_type": 100,
- "normalized_id": "hgnc:1037",
- "gene": {
- "type": "Gene",
- "id": "normalize.gene.hgnc:1097",
- "label": "BRAF",
- "mappings": [
+ 'example': {
+ 'query': 'BRAF',
+ 'warnings': [],
+ 'match_type': 100,
+ 'normalized_id': 'hgnc:1097',
+ 'gene': {
+ 'type': 'Gene',
+ 'id': 'normalize.gene.hgnc:1097',
+ 'label': 'BRAF',
+ 'mappings': [
{
- "coding": {"code": "673", "system": "ncbigene"},
- "relation": "relatedMatch",
+ 'coding': {'code': '673', 'system': 'ncbigene'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "ENSG00000157764", "system": "ensembl"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'ENSG00000157764', 'system': 'ensembl'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS5863", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS5863', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "1943", "system": "iuphar"},
- "relation": "relatedMatch",
+ 'coding': {'code': '1943', 'system': 'iuphar'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "119066", "system": "orphanet"},
- "relation": "relatedMatch",
+ 'coding': {'code': '119066', 'system': 'orphanet'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "BRAF", "system": "cosmic"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'BRAF', 'system': 'cosmic'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "2284096", "system": "pubmed"},
- "relation": "relatedMatch",
+ 'coding': {'code': '2284096', 'system': 'pubmed'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "uc003vwc.5", "system": "ucsc"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'uc003vwc.5', 'system': 'ucsc'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "164757", "system": "omim"},
- "relation": "relatedMatch",
+ 'coding': {'code': '164757', 'system': 'omim'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "NM_004333", "system": "refseq"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'NM_004333', 'system': 'refseq'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS87555", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS87555', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "P15056", "system": "uniprot"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'P15056', 'system': 'uniprot'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "M95712", "system": "ena.embl"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'M95712', 'system': 'ena.embl'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "OTTHUMG00000157457", "system": "vega"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'OTTHUMG00000157457', 'system': 'vega'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "1565476", "system": "pubmed"},
- "relation": "relatedMatch",
+ 'coding': {'code': '1565476', 'system': 'pubmed'},
+ 'relation': 'relatedMatch',
},
],
- "aliases": ["BRAF1", "RAFB1", "B-raf", "NS7", "B-RAF1"],
- "extensions": [
+ 'aliases': ['BRAF1', 'RAFB1', 'B-raf', 'NS7', 'B-RAF1'],
+ 'extensions': [
{
- "name": "approved_name",
- "value": "B-Raf proto-oncogene, serine/threonine kinase",
- "type": "Extension",
+ 'name': 'approved_name',
+ 'value': 'B-Raf proto-oncogene, serine/threonine kinase',
+ 'type': 'Extension',
},
{
- "name": "symbol_status",
- "value": "approved",
- "type": "Extension",
+ 'name': 'symbol_status',
+ 'value': 'approved',
+ 'type': 'Extension',
},
# {
# "name": "chromosome_location",
# "value": {
- # "id": "ga4gh:CL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", # noqa: E501
+ # "id": "ga4gh:CL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw",
# "type": "ChromosomeLocation",
# "species_id": "taxonomy:9606",
# "chr": "7",
@@ -423,60 +423,60 @@ class NormalizeService(BaseNormalizationService):
# }
],
},
- "source_meta_": {
- "HGNC": {
- "data_license": "custom",
- "data_license_url": "https://www.genenames.org/about/",
- "version": "20210810",
- "data_url": {
- "complete_set_archive": "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json"
+ 'source_meta_': {
+ 'HGNC': {
+ 'data_license': 'custom',
+ 'data_license_url': 'https://www.genenames.org/about/',
+ 'version': '20210810',
+ 'data_url': {
+ 'complete_set_archive': 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json'
},
- "rdp_url": None,
- "data_license_attributes": {
- "non_commercial": False,
- "attribution": False,
- "share_alike": False,
+ 'rdp_url': None,
+ 'data_license_attributes': {
+ 'non_commercial': False,
+ 'attribution': False,
+ 'share_alike': False,
},
- "genome_assemblies": [],
+ 'genome_assemblies': [],
},
- "Ensembl": {
- "data_license": "custom",
- "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", # noqa: E501
- "version": "104",
- "data_url": {
- "genome_annotations": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz"
+ 'Ensembl': {
+ 'data_license': 'custom',
+ 'data_license_url': 'https://useast.ensembl.org/info/about/legal/disclaimer.html',
+ 'version': '104',
+ 'data_url': {
+ 'genome_annotations': 'ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz'
},
- "rdp_url": None,
- "data_license_attributes": {
- "non_commercial": False,
- "attribution": False,
- "share_alike": False,
+ 'rdp_url': None,
+ 'data_license_attributes': {
+ 'non_commercial': False,
+ 'attribution': False,
+ 'share_alike': False,
},
- "genome_assemblies": ["GRCh38"],
+ 'genome_assemblies': ['GRCh38'],
},
- "NCBI": {
- "data_license": "custom",
- "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501
- "version": "20210813",
- "data_url": {
- "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz",
- "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz",
- "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/",
+ 'NCBI': {
+ 'data_license': 'custom',
+ 'data_license_url': 'https://www.ncbi.nlm.nih.gov/home/about/policies/',
+ 'version': '20210813',
+ 'data_url': {
+ 'info_file': 'ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz',
+ 'history_file': 'ftp.ncbi.nlm.nih.gov/gene/DATA/gene_history.gz',
+ 'assembly_file': 'ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/',
},
- "rdp_url": "https://reusabledata.org/ncbi-gene.html",
- "data_license_attributes": {
- "non_commercial": False,
- "attribution": False,
- "share_alike": False,
+ 'rdp_url': 'https://reusabledata.org/ncbi-gene.html',
+ 'data_license_attributes': {
+ 'non_commercial': False,
+ 'attribution': False,
+ 'share_alike': False,
},
- "genome_assemblies": ["GRCh38.p13"],
+ 'genome_assemblies': ['GRCh38.p13'],
},
},
- "service_meta_": {
- "name": "gene-normalizer",
- "version": __version__,
- "response_datetime": "2022-03-23 15:57:14.180908",
- "url": "https://github.com/cancervariants/gene-normalization",
+ 'service_meta_': {
+ 'name': 'gene-normalizer',
+ 'version': __version__,
+ 'response_datetime': '2022-03-23 15:57:14.180908',
+ 'url': 'https://github.com/cancervariants/gene-normalization',
},
}
}
@@ -501,168 +501,168 @@ class UnmergedNormalizationService(BaseNormalizationService):
model_config = ConfigDict(
json_schema_extra={
- "example": {
- "query": "hgnc:108",
- "warnings": [],
- "match_type": 100,
- "service_meta_": {
- "version": __version__,
- "response_datetime": "2022-04-26 14:20:54.180240",
- "name": "gene-normalizer",
- "url": "https://github.com/cancervariants/gene-normalization",
+ 'example': {
+ 'query': 'hgnc:108',
+ 'warnings': [],
+ 'match_type': 100,
+ 'service_meta_': {
+ 'version': __version__,
+ 'response_datetime': '2022-04-26 14:20:54.180240',
+ 'name': 'gene-normalizer',
+ 'url': 'https://github.com/cancervariants/gene-normalization',
},
- "normalized_concept_id": "hgnc:108",
- "source_matches": {
- "HGNC": {
- "records": [
+ 'normalized_concept_id': 'hgnc:108',
+ 'source_matches': {
+ 'HGNC': {
+ 'records': [
{
- "concept_id": "hgnc:108",
- "symbol": "ACHE",
- "symbol_status": "approved",
- "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501
- "strand": None,
- "location_annotations": [],
- "locations": [
+ 'concept_id': 'hgnc:108',
+ 'symbol': 'ACHE',
+ 'symbol_status': 'approved',
+ 'label': 'acetylcholinesterase (Cartwright blood group)',
+ 'strand': None,
+ 'location_annotations': [],
+ 'locations': [
# {
# "type": "ChromosomeLocation",
- # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501
+ # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD",
# "species_id": "taxonomy:9606",
# "chr": "7",
# "start": "q22.1",
# "end": "q22.1"
# }
],
- "aliases": ["3.1.1.7"],
- "previous_symbols": ["YT"],
- "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"],
- "associated_with": [
- "ucsc:uc003uxi.4",
- "vega:OTTHUMG00000157033",
- "merops:S09.979",
- "ccds:CCDS5710",
- "omim:100740",
- "iuphar:2465",
- "ccds:CCDS5709",
- "refseq:NM_015831",
- "pubmed:1380483",
- "uniprot:P22303",
- "ccds:CCDS64736",
+ 'aliases': ['3.1.1.7'],
+ 'previous_symbols': ['YT'],
+ 'xrefs': ['ncbigene:43', 'ensembl:ENSG00000087085'],
+ 'associated_with': [
+ 'ucsc:uc003uxi.4',
+ 'vega:OTTHUMG00000157033',
+ 'merops:S09.979',
+ 'ccds:CCDS5710',
+ 'omim:100740',
+ 'iuphar:2465',
+ 'ccds:CCDS5709',
+ 'refseq:NM_015831',
+ 'pubmed:1380483',
+ 'uniprot:P22303',
+ 'ccds:CCDS64736',
],
- "gene_type": "gene with protein product",
+ 'gene_type': 'gene with protein product',
}
],
- "source_meta_": {
- "data_license": "custom",
- "data_license_url": "https://www.genenames.org/about/",
- "version": "20220407",
- "data_url": {
- "complete_set_archive": "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json"
+ 'source_meta_': {
+ 'data_license': 'custom',
+ 'data_license_url': 'https://www.genenames.org/about/',
+ 'version': '20220407',
+ 'data_url': {
+ 'complete_set_archive': 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json'
},
- "rdp_url": None,
- "data_license_attributes": {
- "non_commercial": False,
- "share_alike": False,
- "attribution": False,
+ 'rdp_url': None,
+ 'data_license_attributes': {
+ 'non_commercial': False,
+ 'share_alike': False,
+ 'attribution': False,
},
- "genome_assemblies": [],
+ 'genome_assemblies': [],
},
},
- "Ensembl": {
- "records": [
+ 'Ensembl': {
+ 'records': [
{
- "concept_id": "ensembl:ENSG00000087085",
- "symbol": "ACHE",
- "symbol_status": None,
- "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501
- "strand": "-",
- "location_annotations": [],
- "locations": [
+ 'concept_id': 'ensembl:ENSG00000087085',
+ 'symbol': 'ACHE',
+ 'symbol_status': None,
+ 'label': 'acetylcholinesterase (Cartwright blood group)',
+ 'strand': '-',
+ 'location_annotations': [],
+ 'locations': [
{
- "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", # noqa: E501
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501
+ 'id': 'ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul',
},
- "start": 100889993,
- "end": 100896974,
+ 'start': 100889993,
+ 'end': 100896974,
}
],
- "aliases": [],
- "previous_symbols": [],
- "xrefs": ["hgnc:108"],
- "associated_with": [],
- "gene_type": "protein_coding",
+ 'aliases': [],
+ 'previous_symbols': [],
+ 'xrefs': ['hgnc:108'],
+ 'associated_with': [],
+ 'gene_type': 'protein_coding',
}
],
- "source_meta_": {
- "data_license": "custom",
- "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", # noqa: E501
- "version": "104",
- "data_url": {
- "genome_annotations": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz"
+ 'source_meta_': {
+ 'data_license': 'custom',
+ 'data_license_url': 'https://useast.ensembl.org/info/about/legal/disclaimer.html',
+ 'version': '104',
+ 'data_url': {
+ 'genome_annotations': 'ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz'
},
- "rdp_url": None,
- "data_license_attributes": {
- "non_commercial": False,
- "share_alike": False,
- "attribution": False,
+ 'rdp_url': None,
+ 'data_license_attributes': {
+ 'non_commercial': False,
+ 'share_alike': False,
+ 'attribution': False,
},
- "genome_assemblies": ["GRCh38"],
+ 'genome_assemblies': ['GRCh38'],
},
},
- "NCBI": {
- "records": [
+ 'NCBI': {
+ 'records': [
{
- "concept_id": "ncbigene:43",
- "symbol": "ACHE",
- "symbol_status": None,
- "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501
- "strand": "-",
- "location_annotations": [],
- "locations": [
+ 'concept_id': 'ncbigene:43',
+ 'symbol': 'ACHE',
+ 'symbol_status': None,
+ 'label': 'acetylcholinesterase (Cartwright blood group)',
+ 'strand': '-',
+ 'location_annotations': [],
+ 'locations': [
{
# "type": "ChromosomeLocation",
- # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501
+ # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD",
# "species_id": "taxonomy:9606",
# "chr": "7",
# "start": "q22.1",
# "end": "q22.1"
},
{
- "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", # noqa: E501
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501
+ 'id': 'ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul',
},
- "start": 100889993,
- "end": 100896994,
+ 'start': 100889993,
+ 'end': 100896994,
},
],
- "aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"],
- "previous_symbols": ["ACEE"],
- "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"],
- "associated_with": ["omim:100740"],
- "gene_type": "protein-coding",
+ 'aliases': ['YT', 'ARACHE', 'ACEE', 'N-ACHE'],
+ 'previous_symbols': ['ACEE'],
+ 'xrefs': ['hgnc:108', 'ensembl:ENSG00000087085'],
+ 'associated_with': ['omim:100740'],
+ 'gene_type': 'protein-coding',
}
],
- "source_meta_": {
- "data_license": "custom",
- "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501
- "version": "20220407",
- "data_url": {
- "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz",
- "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz",
- "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/",
+ 'source_meta_': {
+ 'data_license': 'custom',
+ 'data_license_url': 'https://www.ncbi.nlm.nih.gov/home/about/policies/',
+ 'version': '20220407',
+ 'data_url': {
+ 'info_file': 'ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz',
+ 'history_file': 'ftp.ncbi.nlm.nih.gov/gene/DATA/gene_history.gz',
+ 'assembly_file': 'ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/',
},
- "rdp_url": "https://reusabledata.org/ncbi-gene.html",
- "data_license_attributes": {
- "non_commercial": False,
- "share_alike": False,
- "attribution": False,
+ 'rdp_url': 'https://reusabledata.org/ncbi-gene.html',
+ 'data_license_attributes': {
+ 'non_commercial': False,
+ 'share_alike': False,
+ 'attribution': False,
},
- "genome_assemblies": ["GRCh38.p13"],
+ 'genome_assemblies': ['GRCh38.p13'],
},
},
},
diff --git a/src/gene/version.py b/src/gene/version.py
index 75c5d6c1..b4913868 100644
--- a/src/gene/version.py
+++ b/src/gene/version.py
@@ -1,2 +1,2 @@
"""Gene normalizer version"""
-__version__ = "0.3.0-dev1"
+__version__ = '0.3.0-dev1'
diff --git a/tests/conftest.py b/tests/conftest.py
index ad1a14a2..ba941b0a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,7 +6,7 @@
from gene.database import AbstractDatabase, create_db
-@pytest.fixture(scope="session")
+@pytest.fixture(scope='session')
def database() -> AbstractDatabase:
"""Create database instance."""
return create_db()
@@ -17,19 +17,19 @@ def pytest_addoption(parser):
See https://docs.pytest.org/en/7.1.x/reference/reference.html#parser
"""
parser.addoption(
- "--verbose-logs",
- action="store_true",
+ '--verbose-logs',
+ action='store_true',
default=False,
- help="show noisy module logs",
+ help='show noisy module logs',
)
def pytest_configure(config):
"""Configure pytest setup."""
- if not config.getoption("--verbose-logs"):
- logging.getLogger("botocore").setLevel(logging.ERROR)
- logging.getLogger("boto3").setLevel(logging.ERROR)
- logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
+ if not config.getoption('--verbose-logs'):
+ logging.getLogger('botocore').setLevel(logging.ERROR)
+ logging.getLogger('boto3').setLevel(logging.ERROR)
+ logging.getLogger('urllib3.connectionpool').setLevel(logging.ERROR)
def _compare_records(normalized_gene, test_gene, match_type):
@@ -53,7 +53,7 @@ def _compare_records(normalized_gene, test_gene, match_type):
assert normalized_gene.gene_type == test_gene.gene_type
-@pytest.fixture(scope="session")
+@pytest.fixture(scope='session')
def compare_records():
"""Provide record(s) comparison function"""
return _compare_records
@@ -65,7 +65,7 @@ def _check_resp_single_record(resp, test_gene, match_type):
_compare_records(resp.records[0], test_gene, match_type)
-@pytest.fixture(scope="session")
+@pytest.fixture(scope='session')
def check_resp_single_record():
"""Provide record comparison function for single record"""
return _check_resp_single_record
diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py
index 092cc6c3..58ef6461 100644
--- a/tests/unit/test_database_and_etl.py
+++ b/tests/unit/test_database_and_etl.py
@@ -12,28 +12,28 @@
from gene.schemas import RecordType
ALIASES = {
- "NC_000001.11": ["ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO"],
- "NC_000002.12": ["ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g"],
- "NC_000003.12": ["ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX"],
- "NC_000007.14": ["ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul"],
- "NC_000009.12": ["ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI"],
- "NC_000011.10": ["ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1"],
- "NC_000015.10": ["ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6"],
- "NC_000017.11": ["ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7"],
- "NC_000019.10": ["ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl"],
- "NC_000023.11": ["ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP"],
- "NC_000008.11": ["ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs"],
- "NC_000012.12": ["ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl"],
- "NC_000024.10": ["ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5"],
- "NT_167246.2": ["ga4gh:SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1"],
- "NT_167249.2": ["ga4gh:SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-"],
+ 'NC_000001.11': ['ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO'],
+ 'NC_000002.12': ['ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g'],
+ 'NC_000003.12': ['ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX'],
+ 'NC_000007.14': ['ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul'],
+ 'NC_000009.12': ['ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI'],
+ 'NC_000011.10': ['ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1'],
+ 'NC_000015.10': ['ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6'],
+ 'NC_000017.11': ['ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7'],
+ 'NC_000019.10': ['ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl'],
+ 'NC_000023.11': ['ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP'],
+ 'NC_000008.11': ['ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs'],
+ 'NC_000012.12': ['ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl'],
+ 'NC_000024.10': ['ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5'],
+ 'NT_167246.2': ['ga4gh:SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1'],
+ 'NT_167249.2': ['ga4gh:SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-'],
}
-IS_TEST_ENV = environ.get("GENE_TEST", "").lower() == "true"
-IS_DDB_TEST = not environ.get("GENE_NORM_DB_URL", "").lower().startswith("postgres")
+IS_TEST_ENV = environ.get('GENE_TEST', '').lower() == 'true'
+IS_DDB_TEST = not environ.get('GENE_NORM_DB_URL', '').lower().startswith('postgres')
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def db_fixture(database):
"""Create a database test fixture."""
@@ -49,7 +49,7 @@ def __init__(self):
return DB()
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def processed_ids():
"""Create a test fixture to store processed ids for merged concepts."""
return list()
@@ -64,33 +64,33 @@ def _get_aliases(seqid):
return ALIASES[seqid]
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def etl_data_path():
"""Create a test fixture to return etl data path."""
test_root = Path(__file__).resolve().parents[2]
- return test_root / "tests" / "unit" / "data" / "etl_data"
+ return test_root / 'tests' / 'unit' / 'data' / 'etl_data'
def test_tables_created(db_fixture):
"""Check that requisite tables are created."""
existing_tables = db_fixture.db.list_tables()
- if db_fixture.db_name == "PostgresDatabase":
+ if db_fixture.db_name == 'PostgresDatabase':
assert set(existing_tables) == {
- "gene_associations",
- "gene_symbols",
- "gene_previous_symbols",
- "gene_aliases",
- "gene_xrefs",
- "gene_concepts",
- "gene_merged",
- "gene_sources",
+ 'gene_associations',
+ 'gene_symbols',
+ 'gene_previous_symbols',
+ 'gene_aliases',
+ 'gene_xrefs',
+ 'gene_concepts',
+ 'gene_merged',
+ 'gene_sources',
}
else:
assert db_fixture.db.gene_table in existing_tables
-@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment")
-@patch.object(Ensembl, "get_seqrepo")
+@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment')
+@patch.object(Ensembl, 'get_seqrepo')
def test_ensembl_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path):
"""Test that ensembl etl methods work correctly."""
test_get_seqrepo.return_value = None
@@ -100,8 +100,8 @@ def test_ensembl_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path)
processed_ids += ensembl_ids
-@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment")
-@patch.object(HGNC, "get_seqrepo")
+@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment')
+@patch.object(HGNC, 'get_seqrepo')
def test_hgnc_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path):
"""Test that hgnc etl methods work correctly."""
test_get_seqrepo.return_value = None
@@ -110,8 +110,8 @@ def test_hgnc_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path):
processed_ids += hgnc_ids
-@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment")
-@patch.object(NCBI, "get_seqrepo")
+@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment')
+@patch.object(NCBI, 'get_seqrepo')
def test_ncbi_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path):
"""Test that ncbi etl methods work correctly."""
test_get_seqrepo.return_value = None
@@ -121,47 +121,47 @@ def test_ncbi_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path):
processed_ids += ncbi_ids
-@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment")
+@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment')
def test_merged_concepts(processed_ids, db_fixture):
"""Create merged concepts and load to db."""
db_fixture.merge.create_merged_concepts(processed_ids)
-@pytest.mark.skipif(not IS_DDB_TEST, reason="only applies to DynamoDB in test env")
+@pytest.mark.skipif(not IS_DDB_TEST, reason='only applies to DynamoDB in test env')
def test_item_type(db_fixture):
"""Check that items are tagged with item_type attribute."""
- filter_exp = Key("label_and_type").eq("ncbigene:8193##identity")
- item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0]
- assert "item_type" in item
- assert item["item_type"] == "identity"
-
- filter_exp = Key("label_and_type").eq("prkrap1##symbol")
- item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0]
- assert "item_type" in item
- assert item["item_type"] == "symbol"
-
- filter_exp = Key("label_and_type").eq("loc157663##prev_symbol")
- item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0]
- assert "item_type" in item
- assert item["item_type"] == "prev_symbol"
-
- filter_exp = Key("label_and_type").eq("flj23569##alias")
- item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0]
- assert "item_type" in item
- assert item["item_type"] == "alias"
-
- filter_exp = Key("label_and_type").eq("omim:606689##associated_with")
- item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0]
- assert "item_type" in item
- assert item["item_type"] == "associated_with"
-
- filter_exp = Key("label_and_type").eq("ensembl:ensg00000268895##xref")
- item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0]
- assert "item_type" in item
- assert item["item_type"] == "xref"
-
-
-@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment")
+ filter_exp = Key('label_and_type').eq('ncbigene:8193##identity')
+ item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0]
+ assert 'item_type' in item
+ assert item['item_type'] == 'identity'
+
+ filter_exp = Key('label_and_type').eq('prkrap1##symbol')
+ item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0]
+ assert 'item_type' in item
+ assert item['item_type'] == 'symbol'
+
+ filter_exp = Key('label_and_type').eq('loc157663##prev_symbol')
+ item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0]
+ assert 'item_type' in item
+ assert item['item_type'] == 'prev_symbol'
+
+ filter_exp = Key('label_and_type').eq('flj23569##alias')
+ item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0]
+ assert 'item_type' in item
+ assert item['item_type'] == 'alias'
+
+ filter_exp = Key('label_and_type').eq('omim:606689##associated_with')
+ item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0]
+ assert 'item_type' in item
+ assert item['item_type'] == 'associated_with'
+
+ filter_exp = Key('label_and_type').eq('ensembl:ensg00000268895##xref')
+ item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0]
+ assert 'item_type' in item
+ assert item['item_type'] == 'xref'
+
+
+@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment')
def test_get_all_records(db_fixture):
"""Basic test of get_all_records method.
@@ -171,10 +171,10 @@ def test_get_all_records(db_fixture):
"""
source_records = list(db_fixture.db.get_all_records(RecordType.IDENTITY))
assert len(source_records) == 63
- source_ids = {r["concept_id"] for r in source_records}
+ source_ids = {r['concept_id'] for r in source_records}
assert len(source_ids) == 63
normalized_records = list(db_fixture.db.get_all_records(RecordType.MERGER))
assert len(normalized_records) == 46
- normalized_ids = {r["concept_id"] for r in normalized_records}
+ normalized_ids = {r['concept_id'] for r in normalized_records}
assert len(normalized_ids) == 46
diff --git a/tests/unit/test_emit_warnings.py b/tests/unit/test_emit_warnings.py
index c8309aac..c28e7ae5 100644
--- a/tests/unit/test_emit_warnings.py
+++ b/tests/unit/test_emit_warnings.py
@@ -7,25 +7,25 @@ def test_emit_warnings():
"""Test that emit_warnings works correctly."""
expected_warnings = [
{
- "non_breaking_space_characters": "Query contains non-breaking space characters"
+ 'non_breaking_space_characters': 'Query contains non-breaking space characters'
}
]
db = create_db()
query_handler = QueryHandler(db)
# Test emit no warnings
- actual_warnings = query_handler._emit_warnings("spry3")
+ actual_warnings = query_handler._emit_warnings('spry3')
assert actual_warnings == []
# Test emit warnings
- actual_warnings = query_handler._emit_warnings("sp ry3")
+ actual_warnings = query_handler._emit_warnings('sp ry3')
assert expected_warnings == actual_warnings
- actual_warnings = query_handler._emit_warnings("sp\u00A0ry3")
+ actual_warnings = query_handler._emit_warnings('sp\u00A0ry3')
assert expected_warnings == actual_warnings
- actual_warnings = query_handler._emit_warnings("sp ry3")
+ actual_warnings = query_handler._emit_warnings('sp ry3')
assert expected_warnings == actual_warnings
- actual_warnings = query_handler._emit_warnings("sp\xa0ry3")
+ actual_warnings = query_handler._emit_warnings('sp\xa0ry3')
assert expected_warnings == actual_warnings
diff --git a/tests/unit/test_endpoints.py b/tests/unit/test_endpoints.py
index 0639e6a0..25e3aa05 100644
--- a/tests/unit/test_endpoints.py
+++ b/tests/unit/test_endpoints.py
@@ -10,7 +10,7 @@
from gene.main import app
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def api_client():
"""Provide test client fixture."""
return TestClient(app)
@@ -18,26 +18,26 @@ def api_client():
def test_search(api_client):
"""Test /search endpoint."""
- response = api_client.get("/gene/search?q=braf")
+ response = api_client.get('/gene/search?q=braf')
assert response.status_code == 200
assert (
- response.json()["source_matches"]["HGNC"]["records"][0]["concept_id"]
- == "hgnc:1097"
+ response.json()['source_matches']['HGNC']['records'][0]['concept_id']
+ == 'hgnc:1097'
)
- response = api_client.get("/gene/search?q=braf&incl=sdkl")
+ response = api_client.get('/gene/search?q=braf&incl=sdkl')
assert response.status_code == 422
def test_normalize(api_client):
"""Test /normalize endpoint."""
- response = api_client.get("/gene/normalize?q=braf")
+ response = api_client.get('/gene/normalize?q=braf')
assert response.status_code == 200
- assert response.json()["normalized_id"] == "hgnc:1097"
+ assert response.json()['normalized_id'] == 'hgnc:1097'
def test_normalize_unmerged(api_client):
"""Test /normalize_unmerged endpoint."""
- response = api_client.get("/gene/normalize_unmerged?q=braf")
+ response = api_client.get('/gene/normalize_unmerged?q=braf')
assert response.status_code == 200
- assert response.json()["normalized_concept_id"] == "hgnc:1097"
+ assert response.json()['normalized_concept_id'] == 'hgnc:1097'
diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py
index 7660be3e..0e012a78 100644
--- a/tests/unit/test_ensembl_source.py
+++ b/tests/unit/test_ensembl_source.py
@@ -5,7 +5,7 @@
from gene.schemas import Gene, MatchType, SourceName
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def ensembl(database):
"""Build ensembl test fixture."""
@@ -13,7 +13,7 @@ class QueryGetter:
def __init__(self):
self.query_handler = QueryHandler(database)
- def search(self, query_str, incl="ensembl"):
+ def search(self, query_str, incl='ensembl'):
resp = self.query_handler.search(query_str, incl=incl)
return resp.source_matches[SourceName.ENSEMBL]
@@ -21,162 +21,162 @@ def search(self, query_str, incl="ensembl"):
return e
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def ddx11l1():
"""Create a DDX11L1 fixutre."""
params = {
- "match_type": MatchType.NO_MATCH,
- "concept_id": "ensembl:ENSG00000223972",
- "symbol": "DDX11L1",
- "label": "DEAD/H-box helicase 11 like 1 (pseudogene)",
- "previous_symbols": [],
- "aliases": [],
- "xrefs": ["hgnc:37102"],
- "symbol_status": None,
- "location_annotations": [],
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'concept_id': 'ensembl:ENSG00000223972',
+ 'symbol': 'DDX11L1',
+ 'label': 'DEAD/H-box helicase 11 like 1 (pseudogene)',
+ 'previous_symbols': [],
+ 'aliases': [],
+ 'xrefs': ['hgnc:37102'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'locations': [
{
- "id": "ga4gh:SL.Ihi0T86UoFIEbH0DHttX2nIw_BdOkI5L",
- "end": 14409,
- "start": 11868,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO",
+ 'id': 'ga4gh:SL.Ihi0T86UoFIEbH0DHttX2nIw_BdOkI5L',
+ 'end': 14409,
+ 'start': 11868,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
}
],
- "strand": "+",
- "associated_with": [],
- "gene_type": "transcribed_unprocessed_pseudogene",
+ 'strand': '+',
+ 'associated_with': [],
+ 'gene_type': 'transcribed_unprocessed_pseudogene',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def tp53():
"""Create a TP53 fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "concept_id": "ensembl:ENSG00000141510",
- "symbol": "TP53",
- "label": "tumor protein p53",
- "previous_symbols": [],
- "aliases": [],
- "xrefs": ["hgnc:11998"],
- "symbol_status": None,
- "location_annotations": [],
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'concept_id': 'ensembl:ENSG00000141510',
+ 'symbol': 'TP53',
+ 'label': 'tumor protein p53',
+ 'previous_symbols': [],
+ 'aliases': [],
+ 'xrefs': ['hgnc:11998'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'locations': [
{
- "id": "ga4gh:SL.TlGoA-JmP3Xky3RhJ6_UU3eJKq8EpEp9",
- "end": 7687538,
- "start": 7661778,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7",
+ 'id': 'ga4gh:SL.TlGoA-JmP3Xky3RhJ6_UU3eJKq8EpEp9',
+ 'end': 7687538,
+ 'start': 7661778,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
}
],
- "strand": "-",
- "associated_with": [],
- "gene_type": "protein_coding",
+ 'strand': '-',
+ 'associated_with': [],
+ 'gene_type': 'protein_coding',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def ATP6AP1_DT(): # noqa: N802
"""Create a ATP6AP1-DT test fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "concept_id": "ensembl:ENSG00000197180",
- "symbol": "ATP6AP1-DT",
- "label": "ATP6AP1 divergent transcript",
- "previous_symbols": [],
- "aliases": [],
- "xrefs": ["hgnc:25138"],
- "symbol_status": None,
- "location_annotations": [],
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'concept_id': 'ensembl:ENSG00000197180',
+ 'symbol': 'ATP6AP1-DT',
+ 'label': 'ATP6AP1 divergent transcript',
+ 'previous_symbols': [],
+ 'aliases': [],
+ 'xrefs': ['hgnc:25138'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'locations': [
{
- "id": "ga4gh:SL.bPbeeEGSqjlZJ1Ddmg5T9ptJz9tKxYi3",
- "end": 154428526,
- "start": 154424377,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
+ 'id': 'ga4gh:SL.bPbeeEGSqjlZJ1Ddmg5T9ptJz9tKxYi3',
+ 'end': 154428526,
+ 'start': 154424377,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
}
],
- "strand": "-",
- "associated_with": [],
- "gene_type": "lncRNA",
+ 'strand': '-',
+ 'associated_with': [],
+ 'gene_type': 'lncRNA',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def hsa_mir_1253():
"""Create a hsa-miR-1253 test fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "concept_id": "ensembl:ENSG00000272920",
- "symbol": "hsa-mir-1253",
- "label": "hsa-mir-1253",
- "previous_symbols": [],
- "aliases": [],
- "xrefs": [],
- "symbol_status": None,
- "location_annotations": [],
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'concept_id': 'ensembl:ENSG00000272920',
+ 'symbol': 'hsa-mir-1253',
+ 'label': 'hsa-mir-1253',
+ 'previous_symbols': [],
+ 'aliases': [],
+ 'xrefs': [],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'locations': [
{
- "id": "ga4gh:SL.x4kOE6ZXG-xY7nm6bu2W7lvm6ljaJXzR",
- "end": 2748182,
- "start": 2748077,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7",
+ 'id': 'ga4gh:SL.x4kOE6ZXG-xY7nm6bu2W7lvm6ljaJXzR',
+ 'end': 2748182,
+ 'start': 2748077,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
}
],
- "strand": "+",
- "associated_with": ["mirbase:MI0006387"],
- "gene_type": "lncRNA",
+ 'strand': '+',
+ 'associated_with': ['mirbase:MI0006387'],
+ 'gene_type': 'lncRNA',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def spry3():
"""Create a SPRY3 test fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "concept_id": "ensembl:ENSG00000168939",
- "symbol": "SPRY3",
- "label": "sprouty RTK signaling antagonist 3",
- "previous_symbols": [],
- "aliases": [],
- "xrefs": ["hgnc:11271"],
- "symbol_status": None,
- "location_annotations": [],
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'concept_id': 'ensembl:ENSG00000168939',
+ 'symbol': 'SPRY3',
+ 'label': 'sprouty RTK signaling antagonist 3',
+ 'previous_symbols': [],
+ 'aliases': [],
+ 'xrefs': ['hgnc:11271'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'locations': [
{
- "id": "ga4gh:SL.fxU7Axal2_GbyOfW8NQf0plM-SUWFCB0",
- "end": 155782459,
- "start": 155612571,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
+ 'id': 'ga4gh:SL.fxU7Axal2_GbyOfW8NQf0plM-SUWFCB0',
+ 'end': 155782459,
+ 'start': 155612571,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
}
],
- "strand": "+",
- "associated_with": [],
- "gene_type": "protein_coding",
+ 'strand': '+',
+ 'associated_with': [],
+ 'gene_type': 'protein_coding',
}
return Gene(**params)
@@ -184,137 +184,137 @@ def spry3():
def test_ddx11l1(check_resp_single_record, ensembl, ddx11l1):
"""Test that DDX11L1 normalizes to correct gene concept."""
# Concept ID
- resp = ensembl.search("ensembl:ENSG00000223972")
+ resp = ensembl.search('ensembl:ENSG00000223972')
check_resp_single_record(resp, ddx11l1, MatchType.CONCEPT_ID)
- resp = ensembl.search("ENSEMBL:ENSG00000223972")
+ resp = ensembl.search('ENSEMBL:ENSG00000223972')
check_resp_single_record(resp, ddx11l1, MatchType.CONCEPT_ID)
- resp = ensembl.search("ENSG00000223972")
+ resp = ensembl.search('ENSG00000223972')
check_resp_single_record(resp, ddx11l1, MatchType.CONCEPT_ID)
# Symbol
- resp = ensembl.search("ddx11l1")
+ resp = ensembl.search('ddx11l1')
check_resp_single_record(resp, ddx11l1, MatchType.SYMBOL)
- resp = ensembl.search("DDX11L1")
+ resp = ensembl.search('DDX11L1')
check_resp_single_record(resp, ddx11l1, MatchType.SYMBOL)
def test_tp53(check_resp_single_record, ensembl, tp53):
"""Test that tp53 normalizes to correct gene concept."""
# Concept ID
- resp = ensembl.search("ensembl:ENSG00000141510")
+ resp = ensembl.search('ensembl:ENSG00000141510')
check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID)
- resp = ensembl.search("ENSEMBL:ENSG00000141510")
+ resp = ensembl.search('ENSEMBL:ENSG00000141510')
check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID)
- resp = ensembl.search("ENSG00000141510")
+ resp = ensembl.search('ENSG00000141510')
check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID)
# Symbol
- resp = ensembl.search("tp53")
+ resp = ensembl.search('tp53')
check_resp_single_record(resp, tp53, MatchType.SYMBOL)
- resp = ensembl.search("TP53")
+ resp = ensembl.search('TP53')
check_resp_single_record(resp, tp53, MatchType.SYMBOL)
def test_ATP6AP1_DT(check_resp_single_record, ensembl, ATP6AP1_DT): # noqa: N802 N803
"""Test that ATP6AP1-DT normalizes to correct gene concept."""
# Concept ID
- resp = ensembl.search("ensembl:ENSG00000197180")
+ resp = ensembl.search('ensembl:ENSG00000197180')
check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID)
- resp = ensembl.search("ENSEMBL:ENSG00000197180")
+ resp = ensembl.search('ENSEMBL:ENSG00000197180')
check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID)
- resp = ensembl.search("ENSG00000197180")
+ resp = ensembl.search('ENSG00000197180')
check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID)
# Symbol
- resp = ensembl.search("ATP6AP1-DT")
+ resp = ensembl.search('ATP6AP1-DT')
check_resp_single_record(resp, ATP6AP1_DT, MatchType.SYMBOL)
def test_hsa_mir_1253(check_resp_single_record, ensembl, hsa_mir_1253):
"""Test that hsa-mir-1253 normalizes to correct gene concept."""
# Concept ID
- resp = ensembl.search("ensembl:ENSG00000272920")
+ resp = ensembl.search('ensembl:ENSG00000272920')
check_resp_single_record(resp, hsa_mir_1253, MatchType.CONCEPT_ID)
- resp = ensembl.search("ENSEMBL:ENSG00000272920")
+ resp = ensembl.search('ENSEMBL:ENSG00000272920')
check_resp_single_record(resp, hsa_mir_1253, MatchType.CONCEPT_ID)
- resp = ensembl.search("ENSG00000272920")
+ resp = ensembl.search('ENSG00000272920')
check_resp_single_record(resp, hsa_mir_1253, MatchType.CONCEPT_ID)
# Symbol
- resp = ensembl.search("hsa-mir-1253")
+ resp = ensembl.search('hsa-mir-1253')
check_resp_single_record(resp, hsa_mir_1253, MatchType.SYMBOL)
# associated_with
- resp = ensembl.search("mirbase:MI0006387")
+ resp = ensembl.search('mirbase:MI0006387')
check_resp_single_record(resp, hsa_mir_1253, MatchType.ASSOCIATED_WITH)
def test_spry3(check_resp_single_record, ensembl, spry3):
"""Test that spry3 normalizes to correct gene concept."""
# Concept ID
- resp = ensembl.search("ensembl:EnSG00000168939")
+ resp = ensembl.search('ensembl:EnSG00000168939')
check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID)
- resp = ensembl.search("ENSEMBL:EnSG00000168939")
+ resp = ensembl.search('ENSEMBL:EnSG00000168939')
check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID)
- resp = ensembl.search("EnSG00000168939")
+ resp = ensembl.search('EnSG00000168939')
check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID)
# Symbol
- resp = ensembl.search("spry3")
+ resp = ensembl.search('spry3')
check_resp_single_record(resp, spry3, MatchType.SYMBOL)
def test_no_match(ensembl):
"""Test that a term normalizes to correct gene concept as a NO match."""
- resp = ensembl.search("A1BG - AS1")
+ resp = ensembl.search('A1BG - AS1')
assert len(resp.records) == 0
- resp = ensembl.search("hnc:5")
+ resp = ensembl.search('hnc:5')
assert len(resp.records) == 0
# Test empty query
- resp = ensembl.search("")
+ resp = ensembl.search('')
assert len(resp.records) == 0
# Do not search on label
- resp = ensembl.search("A1BG antisense RNA 1")
+ resp = ensembl.search('A1BG antisense RNA 1')
assert len(resp.records) == 0
- resp = ensembl.search("ensembl:ENSG00000278704")
+ resp = ensembl.search('ensembl:ENSG00000278704')
assert len(resp.records) == 0
- resp = ensembl.search("ensembl:ENSG00000284906")
+ resp = ensembl.search('ensembl:ENSG00000284906')
assert len(resp.records) == 0
def test_meta_info(ensembl):
"""Test that the meta field is correct."""
- resp = ensembl.search("chromosome:1")
- assert resp.source_meta_.data_license == "custom"
+ resp = ensembl.search('chromosome:1')
+ assert resp.source_meta_.data_license == 'custom'
assert (
resp.source_meta_.data_license_url
- == "https://useast.ensembl.org/info/about/legal/disclaimer.html"
+ == 'https://useast.ensembl.org/info/about/legal/disclaimer.html'
)
- assert resp.source_meta_.version == "110"
+ assert resp.source_meta_.version == '110'
assert resp.source_meta_.data_url == {
- "genome_annotations": "ftp://ftp.ensembl.org/pub/release-110/gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz"
+ 'genome_annotations': 'ftp://ftp.ensembl.org/pub/release-110/gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz'
}
assert resp.source_meta_.rdp_url is None
- assert resp.source_meta_.genome_assemblies == ["GRCh38"]
+ assert resp.source_meta_.genome_assemblies == ['GRCh38']
assert resp.source_meta_.data_license_attributes == {
- "non_commercial": False,
- "share_alike": False,
- "attribution": False,
+ 'non_commercial': False,
+ 'share_alike': False,
+ 'attribution': False,
}
diff --git a/tests/unit/test_hgnc_source.py b/tests/unit/test_hgnc_source.py
index 54d0aff0..1673c2ba 100644
--- a/tests/unit/test_hgnc_source.py
+++ b/tests/unit/test_hgnc_source.py
@@ -7,7 +7,7 @@
from gene.schemas import Gene, MatchType, SourceName
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def hgnc(database):
"""Build hgnc test fixture."""
@@ -15,7 +15,7 @@ class QueryGetter:
def __init__(self):
self.query_handler = QueryHandler(database)
- def search(self, query_str, incl="hgnc"):
+ def search(self, query_str, incl='hgnc'):
resp = self.query_handler.search(query_str, incl=incl)
return resp.source_matches[SourceName.HGNC]
@@ -26,17 +26,17 @@ def search(self, query_str, incl="hgnc"):
# Test Non Alt Loci Set
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def a1bg_as1():
"""Create an A1BG-AS1 gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "A1BG antisense RNA 1",
- "concept_id": "hgnc:37133",
- "symbol": "A1BG-AS1",
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'A1BG antisense RNA 1',
+ 'concept_id': 'hgnc:37133',
+ 'symbol': 'A1BG-AS1',
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.Rz-M5wA0_bIhQYLKi2ZPqlqW3nBPfAx5",
# "chr": "19",
@@ -46,34 +46,34 @@ def a1bg_as1():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": ["NCRNA00181", "A1BGAS", "A1BG-AS"],
- "aliases": ["FLJ23569"],
- "symbol_status": "approved",
- "associated_with": [
- "vega:OTTHUMG00000183508",
- "ucsc:uc002qse.3",
- "refseq:NR_015380",
- "ena.embl:BC040926",
- "refseq:NR_015380",
- "ena.embl:BC040926",
+ 'previous_symbols': ['NCRNA00181', 'A1BGAS', 'A1BG-AS'],
+ 'aliases': ['FLJ23569'],
+ 'symbol_status': 'approved',
+ 'associated_with': [
+ 'vega:OTTHUMG00000183508',
+ 'ucsc:uc002qse.3',
+ 'refseq:NR_015380',
+ 'ena.embl:BC040926',
+ 'refseq:NR_015380',
+ 'ena.embl:BC040926',
],
- "xrefs": ["ensembl:ENSG00000268895", "ncbigene:503538"],
- "gene_type": "RNA, long non-coding",
+ 'xrefs': ['ensembl:ENSG00000268895', 'ncbigene:503538'],
+ 'gene_type': 'RNA, long non-coding',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def tp53():
"""Create a TP53 gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "tumor protein p53",
- "concept_id": "hgnc:11998",
- "symbol": "TP53",
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'tumor protein p53',
+ 'concept_id': 'hgnc:11998',
+ 'symbol': 'TP53',
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.BPk3okUhv4BBatjkyC7eQQsyXL6YwmeF",
# "chr": "17",
@@ -83,51 +83,51 @@ def tp53():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": [],
- "aliases": ["p53", "LFS1"],
- "symbol_status": "approved",
- "associated_with": [
- "vega:OTTHUMG00000162125",
- "refseq:NM_000546",
- "cosmic:TP53",
- "omim:191170",
- "ucsc:uc060aur.1",
- "uniprot:P04637",
- "orphanet:120204",
- "ccds:CCDS73968",
- "ccds:CCDS73971",
- "ccds:CCDS73970",
- "ccds:CCDS73969",
- "ccds:CCDS73967",
- "ccds:CCDS73966",
- "ccds:CCDS73965",
- "ccds:CCDS73964",
- "ccds:CCDS73963",
- "ccds:CCDS11118",
- "ccds:CCDS45605",
- "ccds:CCDS45606",
- "ena.embl:AF307851",
- "pubmed:6396087",
- "pubmed:3456488",
- "pubmed:2047879",
+ 'previous_symbols': [],
+ 'aliases': ['p53', 'LFS1'],
+ 'symbol_status': 'approved',
+ 'associated_with': [
+ 'vega:OTTHUMG00000162125',
+ 'refseq:NM_000546',
+ 'cosmic:TP53',
+ 'omim:191170',
+ 'ucsc:uc060aur.1',
+ 'uniprot:P04637',
+ 'orphanet:120204',
+ 'ccds:CCDS73968',
+ 'ccds:CCDS73971',
+ 'ccds:CCDS73970',
+ 'ccds:CCDS73969',
+ 'ccds:CCDS73967',
+ 'ccds:CCDS73966',
+ 'ccds:CCDS73965',
+ 'ccds:CCDS73964',
+ 'ccds:CCDS73963',
+ 'ccds:CCDS11118',
+ 'ccds:CCDS45605',
+ 'ccds:CCDS45606',
+ 'ena.embl:AF307851',
+ 'pubmed:6396087',
+ 'pubmed:3456488',
+ 'pubmed:2047879',
],
- "xrefs": ["ensembl:ENSG00000141510", "ncbigene:7157"],
- "gene_type": "gene with protein product",
+ 'xrefs': ['ensembl:ENSG00000141510', 'ncbigene:7157'],
+ 'gene_type': 'gene with protein product',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def a3galt2():
"""Create an A3GALT2 gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "alpha 1,3-galactosyltransferase 2",
- "concept_id": "hgnc:30005",
- "symbol": "A3GALT2",
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'alpha 1,3-galactosyltransferase 2',
+ 'concept_id': 'hgnc:30005',
+ 'symbol': 'A3GALT2',
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.iiwv6oaDfVVkjMZ_OH6XEQmM0daVft4u",
# "chr": "1",
@@ -137,37 +137,37 @@ def a3galt2():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": ["A3GALT2P"],
- "aliases": ["IGBS3S", "IGB3S"],
- "symbol_status": "approved",
- "xrefs": ["ensembl:ENSG00000184389", "ncbigene:127550"],
- "associated_with": [
- "vega:OTTHUMG00000004125",
- "vega:OTTHUMG00000004125",
- "ucsc:uc031plq.1",
- "uniprot:U3KPV4",
- "ccds:CCDS60080",
- "pubmed:10854427",
- "pubmed:18630988",
- "refseq:NM_001080438",
- "omim:619850",
+ 'previous_symbols': ['A3GALT2P'],
+ 'aliases': ['IGBS3S', 'IGB3S'],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ensembl:ENSG00000184389', 'ncbigene:127550'],
+ 'associated_with': [
+ 'vega:OTTHUMG00000004125',
+ 'vega:OTTHUMG00000004125',
+ 'ucsc:uc031plq.1',
+ 'uniprot:U3KPV4',
+ 'ccds:CCDS60080',
+ 'pubmed:10854427',
+ 'pubmed:18630988',
+ 'refseq:NM_001080438',
+ 'omim:619850',
],
- "gene_type": "gene with protein product",
+ 'gene_type': 'gene with protein product',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def wdhd1():
"""Create a WDHD1 gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "WD repeat and HMG-box DNA binding protein 1",
- "concept_id": "hgnc:23170",
- "symbol": "WDHD1",
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'WD repeat and HMG-box DNA binding protein 1',
+ 'concept_id': 'hgnc:23170',
+ 'symbol': 'WDHD1',
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.sNe5mpPbxivH2KE6HdaDA3U29BkCQXc3",
# "chr": "14",
@@ -177,80 +177,80 @@ def wdhd1():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": [],
- "aliases": ["AND-1", "CTF4", "CHTF4"],
- "symbol_status": "approved",
- "xrefs": ["ensembl:ENSG00000198554", "ncbigene:11169"],
- "associated_with": [
- "vega:OTTHUMG00000140304",
- "refseq:NM_007086",
- "omim:608126",
- "ucsc:uc001xbm.3",
- "uniprot:O75717",
- "ccds:CCDS41955",
- "ccds:CCDS9721",
- "ena.embl:AJ006266",
- "pubmed:9175701",
- "pubmed:20028748",
+ 'previous_symbols': [],
+ 'aliases': ['AND-1', 'CTF4', 'CHTF4'],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ensembl:ENSG00000198554', 'ncbigene:11169'],
+ 'associated_with': [
+ 'vega:OTTHUMG00000140304',
+ 'refseq:NM_007086',
+ 'omim:608126',
+ 'ucsc:uc001xbm.3',
+ 'uniprot:O75717',
+ 'ccds:CCDS41955',
+ 'ccds:CCDS9721',
+ 'ena.embl:AJ006266',
+ 'pubmed:9175701',
+ 'pubmed:20028748',
],
- "gene_type": "gene with protein product",
+ 'gene_type': 'gene with protein product',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def g6pr():
"""Create a G6PR gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "glucose-6-phosphatase regulator",
- "concept_id": "hgnc:4059",
- "symbol": "G6PR",
- "location_annotations": ["reserved"],
- "locations": [],
- "strand": None,
- "previous_symbols": [],
- "aliases": ["GSD1aSP"],
- "symbol_status": "approved",
- "xrefs": ["ncbigene:2541"],
- "associated_with": ["pubmed:2172641", "pubmed:7814621", "pubmed:2996501"],
- "gene_type": "unknown",
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'glucose-6-phosphatase regulator',
+ 'concept_id': 'hgnc:4059',
+ 'symbol': 'G6PR',
+ 'location_annotations': ['reserved'],
+ 'locations': [],
+ 'strand': None,
+ 'previous_symbols': [],
+ 'aliases': ['GSD1aSP'],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ncbigene:2541'],
+ 'associated_with': ['pubmed:2172641', 'pubmed:7814621', 'pubmed:2996501'],
+ 'gene_type': 'unknown',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def pirc24():
"""Create a PIRC24 gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "piwi-interacting RNA cluster 24",
- "concept_id": "hgnc:37528",
- "symbol": "PIRC24",
- "location_annotations": ["6"],
- "locations": [],
- "strand": None,
- "previous_symbols": [],
- "aliases": [],
- "symbol_status": "approved",
- "xrefs": ["ncbigene:100313810"],
- "associated_with": ["pubmed:17881367"],
- "gene_type": "RNA, cluster",
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'piwi-interacting RNA cluster 24',
+ 'concept_id': 'hgnc:37528',
+ 'symbol': 'PIRC24',
+ 'location_annotations': ['6'],
+ 'locations': [],
+ 'strand': None,
+ 'previous_symbols': [],
+ 'aliases': [],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ncbigene:100313810'],
+ 'associated_with': ['pubmed:17881367'],
+ 'gene_type': 'RNA, cluster',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def gage4():
"""Create a GAGE4 gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "G antigen 4",
- "concept_id": "hgnc:4101",
- "symbol": "GAGE4",
- "location_annotations": ["not on reference assembly"],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'G antigen 4',
+ 'concept_id': 'hgnc:4101',
+ 'symbol': 'GAGE4',
+ 'location_annotations': ['not on reference assembly'],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.6KzwrFm2WeSXqwIIiNbAu-pKQQHt2q5Q",
# "chr": "X",
@@ -260,83 +260,83 @@ def gage4():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": [],
- "aliases": ["CT4.4"],
- "symbol_status": "approved",
- "xrefs": ["ncbigene:2576"],
- "associated_with": [
- "refseq:NM_001474",
- "omim:300597",
- "uniprot:P0DSO3",
- "ena.embl:U19145",
- "pubmed:7544395",
+ 'previous_symbols': [],
+ 'aliases': ['CT4.4'],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ncbigene:2576'],
+ 'associated_with': [
+ 'refseq:NM_001474',
+ 'omim:300597',
+ 'uniprot:P0DSO3',
+ 'ena.embl:U19145',
+ 'pubmed:7544395',
],
- "gene_type": "gene with protein product",
+ 'gene_type': 'gene with protein product',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def mafip():
"""Create a MAFIP gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "MAFF interacting protein",
- "concept_id": "hgnc:31102",
- "symbol": "MAFIP",
- "location_annotations": ["unplaced", "14"],
- "locations": [],
- "strand": None,
- "previous_symbols": [],
- "aliases": ["FLJ35473", "FLJ00219", "FLJ39633", "MIP", "pp5644", "TEKT4P4"],
- "symbol_status": "approved",
- "xrefs": ["ensembl:ENSG00000274847", "ncbigene:727764"],
- "associated_with": [
- "vega:OTTHUMG00000188065",
- "refseq:NR_046439",
- "uniprot:Q8WZ33",
- "ena.embl:AK074146",
- "ena.embl:AF289559",
- "pubmed:16549056",
- "pubmed:15881666",
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'MAFF interacting protein',
+ 'concept_id': 'hgnc:31102',
+ 'symbol': 'MAFIP',
+ 'location_annotations': ['unplaced', '14'],
+ 'locations': [],
+ 'strand': None,
+ 'previous_symbols': [],
+ 'aliases': ['FLJ35473', 'FLJ00219', 'FLJ39633', 'MIP', 'pp5644', 'TEKT4P4'],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ensembl:ENSG00000274847', 'ncbigene:727764'],
+ 'associated_with': [
+ 'vega:OTTHUMG00000188065',
+ 'refseq:NR_046439',
+ 'uniprot:Q8WZ33',
+ 'ena.embl:AK074146',
+ 'ena.embl:AF289559',
+ 'pubmed:16549056',
+ 'pubmed:15881666',
],
- "gene_type": "unknown",
+ 'gene_type': 'unknown',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def mt_7sdna():
"""Create a MT-7SDNA gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "mitochondrially encoded 7S DNA",
- "concept_id": "hgnc:7409",
- "symbol": "MT-7SDNA",
- "location_annotations": ["MT"],
- "locations": [],
- "strand": None,
- "previous_symbols": ["MT7SDNA"],
- "aliases": [],
- "symbol_status": "approved",
- "xrefs": [],
- "associated_with": ["pubmed:24709344", "pubmed:273237"],
- "gene_type": "region",
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'mitochondrially encoded 7S DNA',
+ 'concept_id': 'hgnc:7409',
+ 'symbol': 'MT-7SDNA',
+ 'location_annotations': ['MT'],
+ 'locations': [],
+ 'strand': None,
+ 'previous_symbols': ['MT7SDNA'],
+ 'aliases': [],
+ 'symbol_status': 'approved',
+ 'xrefs': [],
+ 'associated_with': ['pubmed:24709344', 'pubmed:273237'],
+ 'gene_type': 'region',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def cecr():
"""Create a CECR gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "cat eye syndrome chromosome region",
- "concept_id": "hgnc:1838",
- "symbol": "CECR",
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'cat eye syndrome chromosome region',
+ 'concept_id': 'hgnc:1838',
+ 'symbol': 'CECR',
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.AgASk5sB6LCeaB6rcqOwmrm16ise3pof",
# "chr": "22",
@@ -346,27 +346,27 @@ def cecr():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": [],
- "aliases": [],
- "symbol_status": "approved",
- "xrefs": ["ncbigene:1055"],
- "associated_with": [],
- "gene_type": "region",
+ 'previous_symbols': [],
+ 'aliases': [],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ncbigene:1055'],
+ 'associated_with': [],
+ 'gene_type': 'region',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def csf2ra():
"""Create a CSF2RA gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "colony stimulating factor 2 receptor subunit alpha",
- "concept_id": "hgnc:2435",
- "symbol": "CSF2RA",
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'colony stimulating factor 2 receptor subunit alpha',
+ 'concept_id': 'hgnc:2435',
+ 'symbol': 'CSF2RA',
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.cITg67iNn_QNZTKpJd0I-1JMMhW_yHGU",
# "chr": "X",
@@ -384,45 +384,45 @@ def csf2ra():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": ["CSF2R"],
- "aliases": ["CD116", "alphaGMR"],
- "symbol_status": "approved",
- "xrefs": ["ensembl:ENSG00000198223", "ncbigene:1438"],
- "associated_with": [
- "vega:OTTHUMG00000012533",
- "refseq:NM_001161529",
- "orphanet:209477",
- "iuphar:1707",
- "hcdmdb:CD116",
- "omim:306250",
- "omim:425000",
- "ucsc:uc010nvv.3",
- "uniprot:P15509",
- "ena.embl:M64445",
- "ccds:CCDS35190",
- "ccds:CCDS55360",
- "ccds:CCDS35191",
- "ccds:CCDS55359",
- "ccds:CCDS35192",
- "ccds:CCDS35193",
- "pubmed:1702217",
+ 'previous_symbols': ['CSF2R'],
+ 'aliases': ['CD116', 'alphaGMR'],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ensembl:ENSG00000198223', 'ncbigene:1438'],
+ 'associated_with': [
+ 'vega:OTTHUMG00000012533',
+ 'refseq:NM_001161529',
+ 'orphanet:209477',
+ 'iuphar:1707',
+ 'hcdmdb:CD116',
+ 'omim:306250',
+ 'omim:425000',
+ 'ucsc:uc010nvv.3',
+ 'uniprot:P15509',
+ 'ena.embl:M64445',
+ 'ccds:CCDS35190',
+ 'ccds:CCDS55360',
+ 'ccds:CCDS35191',
+ 'ccds:CCDS55359',
+ 'ccds:CCDS35192',
+ 'ccds:CCDS35193',
+ 'pubmed:1702217',
],
- "gene_type": "gene with protein product",
+ 'gene_type': 'gene with protein product',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def rps24p5():
"""Create a RPS24P5 gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "ribosomal protein S24 pseudogene 5",
- "concept_id": "hgnc:36026",
- "symbol": "RPS24P5",
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'ribosomal protein S24 pseudogene 5',
+ 'concept_id': 'hgnc:36026',
+ 'symbol': 'RPS24P5',
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.Ri0ddtMpe6DGzrC9_QGbL35gYAtU2bh_",
# "chr": "1",
@@ -432,27 +432,27 @@ def rps24p5():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": [],
- "aliases": [],
- "symbol_status": "approved",
- "xrefs": ["ncbigene:100271094"],
- "associated_with": ["refseq:NG_011274", "pubmed:19123937"],
- "gene_type": "pseudogene",
+ 'previous_symbols': [],
+ 'aliases': [],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ncbigene:100271094'],
+ 'associated_with': ['refseq:NG_011274', 'pubmed:19123937'],
+ 'gene_type': 'pseudogene',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def trl_cag2_1():
"""Create a TRL-CAG2-1 gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "tRNA-Leu (anticodon CAG) 2-1",
- "concept_id": "hgnc:34692",
- "symbol": "TRL-CAG2-1",
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'tRNA-Leu (anticodon CAG) 2-1',
+ 'concept_id': 'hgnc:34692',
+ 'symbol': 'TRL-CAG2-1',
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.aZ5aYHaC3GhDWgwhKkAcd9GBvkEo034v",
# "chr": "16",
@@ -462,27 +462,27 @@ def trl_cag2_1():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": ["TRNAL13"],
- "aliases": ["tRNA-Leu-CAG-2-1"],
- "symbol_status": "approved",
- "xrefs": ["ncbigene:100189130"],
- "associated_with": ["ena.embl:HG983896"],
- "gene_type": "RNA, transfer",
+ 'previous_symbols': ['TRNAL13'],
+ 'aliases': ['tRNA-Leu-CAG-2-1'],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ncbigene:100189130'],
+ 'associated_with': ['ena.embl:HG983896'],
+ 'gene_type': 'RNA, transfer',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def myo5b():
"""Create a MYO5B gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "myosin VB",
- "concept_id": "hgnc:7603",
- "symbol": "MYO5B",
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'myosin VB',
+ 'concept_id': 'hgnc:7603',
+ 'symbol': 'MYO5B',
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.hFukVqPVLD70cshAz1Gtmd6EC1imobpO",
# "chr": "18",
@@ -492,23 +492,23 @@ def myo5b():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": [],
- "aliases": ["KIAA1119"],
- "symbol_status": "approved",
- "xrefs": ["ensembl:ENSG00000167306", "ncbigene:4645"],
- "associated_with": [
- "vega:OTTHUMG00000179843",
- "refseq:NM_001080467",
- "omim:606540",
- "ucsc:uc002leb.3",
- "uniprot:Q9ULV0",
- "orphanet:171089",
- "ccds:CCDS42436",
- "ena.embl:AB032945",
- "pubmed:8884266",
- "pubmed:17462998",
+ 'previous_symbols': [],
+ 'aliases': ['KIAA1119'],
+ 'symbol_status': 'approved',
+ 'xrefs': ['ensembl:ENSG00000167306', 'ncbigene:4645'],
+ 'associated_with': [
+ 'vega:OTTHUMG00000179843',
+ 'refseq:NM_001080467',
+ 'omim:606540',
+ 'ucsc:uc002leb.3',
+ 'uniprot:Q9ULV0',
+ 'orphanet:171089',
+ 'ccds:CCDS42436',
+ 'ena.embl:AB032945',
+ 'pubmed:8884266',
+ 'pubmed:17462998',
],
- "gene_type": "gene with protein product",
+ 'gene_type': 'gene with protein product',
}
return Gene(**params)
@@ -516,17 +516,17 @@ def myo5b():
# Test Alt Loci Set
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def gstt1():
"""Create an GSTT1 gene fixture."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "glutathione S-transferase theta 1",
- "concept_id": "hgnc:4641",
- "symbol": "GSTT1",
- "location_annotations": ["alternate reference locus"],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'glutathione S-transferase theta 1',
+ 'concept_id': 'hgnc:4641',
+ 'symbol': 'GSTT1',
+ 'location_annotations': ['alternate reference locus'],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.g74mxFvAzPoenOlyMjY32j-UFMvjjas_",
# "chr": "22",
@@ -536,20 +536,20 @@ def gstt1():
# "type": "ChromosomeLocation"
# }
],
- "previous_symbols": [],
- "aliases": ["2.5.1.18"],
- "symbol_status": "approved",
- "associated_with": [
- "refseq:NM_000853",
- "omim:600436",
- "ucsc:uc002zze.4",
- "uniprot:P30711",
- "orphanet:470418",
- "ena.embl:KI270879",
- "pubmed:8617495",
+ 'previous_symbols': [],
+ 'aliases': ['2.5.1.18'],
+ 'symbol_status': 'approved',
+ 'associated_with': [
+ 'refseq:NM_000853',
+ 'omim:600436',
+ 'ucsc:uc002zze.4',
+ 'uniprot:P30711',
+ 'orphanet:470418',
+ 'ena.embl:KI270879',
+ 'pubmed:8617495',
],
- "xrefs": ["ensembl:ENSG00000277656", "ncbigene:2952"],
- "gene_type": "gene with protein product",
+ 'xrefs': ['ensembl:ENSG00000277656', 'ncbigene:2952'],
+ 'gene_type': 'gene with protein product',
}
return Gene(**params)
@@ -557,273 +557,273 @@ def gstt1():
def test_a1bg_as1(check_resp_single_record, a1bg_as1, hgnc):
"""Test that a1bg_as1 normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:37133")
+ resp = hgnc.search('hgnc:37133')
check_resp_single_record(resp, a1bg_as1, MatchType.CONCEPT_ID)
- resp = hgnc.search("HGNC:37133")
+ resp = hgnc.search('HGNC:37133')
check_resp_single_record(resp, a1bg_as1, MatchType.CONCEPT_ID)
- resp = hgnc.search("Hgnc:37133")
+ resp = hgnc.search('Hgnc:37133')
check_resp_single_record(resp, a1bg_as1, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("A1BG-AS1")
+ resp = hgnc.search('A1BG-AS1')
check_resp_single_record(resp, a1bg_as1, MatchType.SYMBOL)
- resp = hgnc.search("A1BG-as1")
+ resp = hgnc.search('A1BG-as1')
check_resp_single_record(resp, a1bg_as1, MatchType.SYMBOL)
# Previous Symbol
- resp = hgnc.search("NCRNA00181")
+ resp = hgnc.search('NCRNA00181')
check_resp_single_record(resp, a1bg_as1, MatchType.PREV_SYMBOL)
- resp = hgnc.search("A1BGAS")
+ resp = hgnc.search('A1BGAS')
check_resp_single_record(resp, a1bg_as1, MatchType.PREV_SYMBOL)
- resp = hgnc.search("A1BG-AS")
+ resp = hgnc.search('A1BG-AS')
check_resp_single_record(resp, a1bg_as1, MatchType.PREV_SYMBOL)
# Alias
- resp = hgnc.search("FLJ23569")
+ resp = hgnc.search('FLJ23569')
check_resp_single_record(resp, a1bg_as1, MatchType.ALIAS)
- resp = hgnc.search("flj23569")
+ resp = hgnc.search('flj23569')
check_resp_single_record(resp, a1bg_as1, MatchType.ALIAS)
def test_a3galt2(check_resp_single_record, a3galt2, hgnc):
"""Test that a3galt2 normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:30005")
+ resp = hgnc.search('hgnc:30005')
check_resp_single_record(resp, a3galt2, MatchType.CONCEPT_ID)
- resp = hgnc.search("HGNC:30005")
+ resp = hgnc.search('HGNC:30005')
check_resp_single_record(resp, a3galt2, MatchType.CONCEPT_ID)
- resp = hgnc.search("Hgnc:30005")
+ resp = hgnc.search('Hgnc:30005')
check_resp_single_record(resp, a3galt2, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("A3GALT2")
+ resp = hgnc.search('A3GALT2')
check_resp_single_record(resp, a3galt2, MatchType.SYMBOL)
- resp = hgnc.search("a3galt2")
+ resp = hgnc.search('a3galt2')
check_resp_single_record(resp, a3galt2, MatchType.SYMBOL)
# Previous Symbol
- resp = hgnc.search("A3GALT2P")
+ resp = hgnc.search('A3GALT2P')
check_resp_single_record(resp, a3galt2, MatchType.PREV_SYMBOL)
- resp = hgnc.search("A3GALT2p")
+ resp = hgnc.search('A3GALT2p')
check_resp_single_record(resp, a3galt2, MatchType.PREV_SYMBOL)
# Alias
- resp = hgnc.search("IGBS3S")
+ resp = hgnc.search('IGBS3S')
check_resp_single_record(resp, a3galt2, MatchType.ALIAS)
- resp = hgnc.search("igB3s")
+ resp = hgnc.search('igB3s')
check_resp_single_record(resp, a3galt2, MatchType.ALIAS)
def test_tp53(check_resp_single_record, tp53, hgnc):
"""Test that tp53 normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:11998")
+ resp = hgnc.search('hgnc:11998')
check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID)
- resp = hgnc.search("HGNC:11998")
+ resp = hgnc.search('HGNC:11998')
check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID)
- resp = hgnc.search("Hgnc:11998")
+ resp = hgnc.search('Hgnc:11998')
check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("tp53")
+ resp = hgnc.search('tp53')
check_resp_single_record(resp, tp53, MatchType.SYMBOL)
- resp = hgnc.search("TP53")
+ resp = hgnc.search('TP53')
check_resp_single_record(resp, tp53, MatchType.SYMBOL)
# Alias
- resp = hgnc.search("LFS1")
+ resp = hgnc.search('LFS1')
check_resp_single_record(resp, tp53, MatchType.ALIAS)
- resp = hgnc.search("p53")
+ resp = hgnc.search('p53')
check_resp_single_record(resp, tp53, MatchType.ALIAS)
def test_wdhd1(check_resp_single_record, wdhd1, hgnc):
"""Test that a1bg_as1 normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:23170")
+ resp = hgnc.search('hgnc:23170')
check_resp_single_record(resp, wdhd1, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("WDHD1")
+ resp = hgnc.search('WDHD1')
check_resp_single_record(resp, wdhd1, MatchType.SYMBOL)
def test_g6pr(check_resp_single_record, g6pr, hgnc):
"""Test that g6pr normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:4059")
+ resp = hgnc.search('hgnc:4059')
check_resp_single_record(resp, g6pr, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("G6PR")
+ resp = hgnc.search('G6PR')
check_resp_single_record(resp, g6pr, MatchType.SYMBOL)
def test_pirc24(check_resp_single_record, pirc24, hgnc):
"""Test that pirc24 normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:37528")
+ resp = hgnc.search('hgnc:37528')
check_resp_single_record(resp, pirc24, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("PIRC24")
+ resp = hgnc.search('PIRC24')
check_resp_single_record(resp, pirc24, MatchType.SYMBOL)
def test_gage4(check_resp_single_record, gage4, hgnc):
"""Test that gage4 normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:4101")
+ resp = hgnc.search('hgnc:4101')
check_resp_single_record(resp, gage4, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("GAGE4")
+ resp = hgnc.search('GAGE4')
check_resp_single_record(resp, gage4, MatchType.SYMBOL)
def test_mafip(check_resp_single_record, mafip, hgnc):
"""Test that mafip normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:31102")
+ resp = hgnc.search('hgnc:31102')
check_resp_single_record(resp, mafip, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("MAFIP")
+ resp = hgnc.search('MAFIP')
check_resp_single_record(resp, mafip, MatchType.SYMBOL)
def test_mt_7sdna(check_resp_single_record, mt_7sdna, hgnc):
"""Test that mt_7sdna normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:7409")
+ resp = hgnc.search('hgnc:7409')
check_resp_single_record(resp, mt_7sdna, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("MT-7SDNA")
+ resp = hgnc.search('MT-7SDNA')
check_resp_single_record(resp, mt_7sdna, MatchType.SYMBOL)
def test_cecr(check_resp_single_record, cecr, hgnc):
"""Test that cecr normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:1838")
+ resp = hgnc.search('hgnc:1838')
check_resp_single_record(resp, cecr, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("CECR")
+ resp = hgnc.search('CECR')
check_resp_single_record(resp, cecr, MatchType.SYMBOL)
def test_csf2ra(check_resp_single_record, csf2ra, hgnc):
"""Test that csf2ra normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:2435")
+ resp = hgnc.search('hgnc:2435')
check_resp_single_record(resp, csf2ra, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("CSF2RA")
+ resp = hgnc.search('CSF2RA')
check_resp_single_record(resp, csf2ra, MatchType.SYMBOL)
def test_rps24p5(check_resp_single_record, rps24p5, hgnc):
"""Test that rps24p5 normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:36026")
+ resp = hgnc.search('hgnc:36026')
check_resp_single_record(resp, rps24p5, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("rpS24P5")
+ resp = hgnc.search('rpS24P5')
check_resp_single_record(resp, rps24p5, MatchType.SYMBOL)
def test_trl_cag2_1(check_resp_single_record, trl_cag2_1, hgnc):
"""Test that trl_cag2_1 normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:34692")
+ resp = hgnc.search('hgnc:34692')
check_resp_single_record(resp, trl_cag2_1, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("TRL-CAG2-1")
+ resp = hgnc.search('TRL-CAG2-1')
check_resp_single_record(resp, trl_cag2_1, MatchType.SYMBOL)
def test_myo5b(check_resp_single_record, myo5b, hgnc):
"""Test that myo5b normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:7603")
+ resp = hgnc.search('hgnc:7603')
check_resp_single_record(resp, myo5b, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("MYO5B")
+ resp = hgnc.search('MYO5B')
check_resp_single_record(resp, myo5b, MatchType.SYMBOL)
# associated_with
- resp = hgnc.search("refseq:NM_001080467")
+ resp = hgnc.search('refseq:NM_001080467')
check_resp_single_record(resp, myo5b, MatchType.ASSOCIATED_WITH)
def test_gstt1(check_resp_single_record, gstt1, hgnc):
"""Test that gstt1 normalizes to correct gene concept."""
# Concept ID
- resp = hgnc.search("hgnc:4641")
+ resp = hgnc.search('hgnc:4641')
check_resp_single_record(resp, gstt1, MatchType.CONCEPT_ID)
# Symbol
- resp = hgnc.search("GSTT1")
+ resp = hgnc.search('GSTT1')
check_resp_single_record(resp, gstt1, MatchType.SYMBOL)
# associated_with
- resp = hgnc.search("omim:600436")
+ resp = hgnc.search('omim:600436')
check_resp_single_record(resp, gstt1, MatchType.ASSOCIATED_WITH)
def test_no_match(hgnc):
"""Test that a term normalizes to correct gene concept as a NO match."""
- resp = hgnc.search("A1BG - AS1")
+ resp = hgnc.search('A1BG - AS1')
assert len(resp.records) == 0
- resp = hgnc.search("hnc:5")
+ resp = hgnc.search('hnc:5')
assert len(resp.records) == 0
# Test empty query
- resp = hgnc.search("")
+ resp = hgnc.search('')
assert len(resp.records) == 0
# Do not search on label
- resp = hgnc.search("A1BG antisense RNA 1")
+ resp = hgnc.search('A1BG antisense RNA 1')
assert len(resp.records) == 0
def test_meta_info(hgnc):
"""Test that the meta field is correct."""
- resp = hgnc.search("HGNC:37133")
- assert resp.source_meta_.data_license == "CC0"
+ resp = hgnc.search('HGNC:37133')
+ assert resp.source_meta_.data_license == 'CC0'
assert (
- resp.source_meta_.data_license_url == "https://www.genenames.org/about/license/"
+ resp.source_meta_.data_license_url == 'https://www.genenames.org/about/license/'
)
- assert datetime.strptime(resp.source_meta_.version, "%Y%m%d")
+ assert datetime.strptime(resp.source_meta_.version, '%Y%m%d')
assert resp.source_meta_.data_url == {
- "complete_set_archive": "ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json"
+ 'complete_set_archive': 'ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json'
}
assert resp.source_meta_.rdp_url is None
assert resp.source_meta_.genome_assemblies == []
assert resp.source_meta_.data_license_attributes == {
- "non_commercial": False,
- "share_alike": False,
- "attribution": False,
+ 'non_commercial': False,
+ 'share_alike': False,
+ 'attribution': False,
}
diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py
index d0083a43..2476a725 100644
--- a/tests/unit/test_ncbi_source.py
+++ b/tests/unit/test_ncbi_source.py
@@ -25,7 +25,7 @@ def check_ncbi_discontinued_gene(normalizer_response, concept_id, symbol, match_
assert resp.associated_with == []
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def ncbi(database):
"""Build ncbi test fixture."""
@@ -33,7 +33,7 @@ class QueryGetter:
def __init__(self):
self.query_handler = QueryHandler(database)
- def search(self, query_str, incl="ncbi"):
+ def search(self, query_str, incl='ncbi'):
resp = self.query_handler.search(query_str, incl=incl)
return resp.source_matches[SourceName.NCBI]
@@ -41,22 +41,22 @@ def search(self, query_str, incl="ncbi"):
return n
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def dpf1():
"""Create gene fixture for DPF1."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "double PHD fingers 1",
- "concept_id": "ncbigene:8193",
- "symbol": "DPF1",
- "aliases": ["BAF45b", "NEUD4", "neuro-d4", "SMARCG1"],
- "xrefs": ["hgnc:20225", "ensembl:ENSG00000011332"],
- "previous_symbols": [],
- "associated_with": ["omim:601670"],
- "symbol_status": None,
- "location_annotations": [],
- "strand": "-",
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'double PHD fingers 1',
+ 'concept_id': 'ncbigene:8193',
+ 'symbol': 'DPF1',
+ 'aliases': ['BAF45b', 'NEUD4', 'neuro-d4', 'SMARCG1'],
+ 'xrefs': ['hgnc:20225', 'ensembl:ENSG00000011332'],
+ 'previous_symbols': [],
+ 'associated_with': ['omim:601670'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'strand': '-',
+ 'locations': [
# {
# "id": "ga4gh:CL.bzgLv8gt3KHK00OWTAEUNZcdgUjbHU8i",
# "chr": "19",
@@ -66,37 +66,37 @@ def dpf1():
# "type": "ChromosomeLocation"
# },
{
- "id": "ga4gh:SL.0bmpLh_dlBRrzfviiQY9Vg4iEH0XeR20",
- "end": 38229695,
- "start": 38211005,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
+ 'id': 'ga4gh:SL.0bmpLh_dlBRrzfviiQY9Vg4iEH0XeR20',
+ 'end': 38229695,
+ 'start': 38211005,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
}
],
- "gene_type": "protein-coding",
+ 'gene_type': 'protein-coding',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def pdp1_symbol():
"""Create gene fixture for PDP1 (ncbigene:54704)."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "pyruvate dehydrogenase phosphatase catalytic subunit 1",
- "concept_id": "ncbigene:54704",
- "symbol": "PDP1",
- "aliases": ["PDH", "PDP", "PDPC", "PPM2A", "PPM2C"],
- "xrefs": ["hgnc:9279", "ensembl:ENSG00000164951"],
- "previous_symbols": ["LOC157663", "PPM2C"],
- "associated_with": ["omim:605993"],
- "symbol_status": None,
- "location_annotations": [],
- "strand": "+",
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'pyruvate dehydrogenase phosphatase catalytic subunit 1',
+ 'concept_id': 'ncbigene:54704',
+ 'symbol': 'PDP1',
+ 'aliases': ['PDH', 'PDP', 'PDPC', 'PPM2A', 'PPM2C'],
+ 'xrefs': ['hgnc:9279', 'ensembl:ENSG00000164951'],
+ 'previous_symbols': ['LOC157663', 'PPM2C'],
+ 'associated_with': ['omim:605993'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'strand': '+',
+ 'locations': [
# {
# "id": "ga4gh:CL.cJsZWKrEtzpFn5psdCtgofb6NaEDVPfB",
# "chr": "8",
@@ -106,37 +106,37 @@ def pdp1_symbol():
# "type": "ChromosomeLocation"
# },
{
- "id": "ga4gh:SL.-455M-S51D8nXPFoGH0dYNFVFAJxm5dG",
- "end": 93926068,
- "start": 93916922,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs",
+ 'id': 'ga4gh:SL.-455M-S51D8nXPFoGH0dYNFVFAJxm5dG',
+ 'end': 93926068,
+ 'start': 93916922,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
}
],
- "gene_type": "protein-coding",
+ 'gene_type': 'protein-coding',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def pdp1_alias():
"""Create gene fixture for PDP1 (ncbigene:403313)."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "phospholipid phosphatase 6",
- "concept_id": "ncbigene:403313",
- "symbol": "PLPP6",
- "aliases": ["PDP1", "PSDP", "PPAPDC2", "bA6J24.6", "LPRP-B", "PA-PSP"],
- "xrefs": ["hgnc:23682", "ensembl:ENSG00000205808"],
- "previous_symbols": [],
- "associated_with": ["omim:611666"],
- "symbol_status": None,
- "location_annotations": [],
- "strand": "+",
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'phospholipid phosphatase 6',
+ 'concept_id': 'ncbigene:403313',
+ 'symbol': 'PLPP6',
+ 'aliases': ['PDP1', 'PSDP', 'PPAPDC2', 'bA6J24.6', 'LPRP-B', 'PA-PSP'],
+ 'xrefs': ['hgnc:23682', 'ensembl:ENSG00000205808'],
+ 'previous_symbols': [],
+ 'associated_with': ['omim:611666'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'strand': '+',
+ 'locations': [
# {
# "id": "ga4gh:CL.7ivmMgKAqiFiRh1qsbA909w2kUcPabr_",
# "chr": "9",
@@ -146,38 +146,38 @@ def pdp1_alias():
# "type": "ChromosomeLocation"
# },
{
- "id": "ga4gh:SL.VI_0P0-ei90MDsLjAeUrDfeXBlZVJtJY",
- "end": 4665258,
- "start": 4662293,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI",
+ 'id': 'ga4gh:SL.VI_0P0-ei90MDsLjAeUrDfeXBlZVJtJY',
+ 'end': 4665258,
+ 'start': 4662293,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
}
],
- "gene_type": "protein-coding",
+ 'gene_type': 'protein-coding',
}
return Gene(**params)
# X and Y chromosomes
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def spry3():
"""Create gene fixture for SPRY3."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "sprouty RTK signaling antagonist 3",
- "concept_id": "ncbigene:10251",
- "symbol": "SPRY3",
- "aliases": ["spry-3"],
- "xrefs": ["hgnc:11271", "ensembl:ENSG00000168939"],
- "previous_symbols": ["LOC170187", "LOC253479"],
- "associated_with": ["omim:300531"],
- "symbol_status": None,
- "location_annotations": [],
- "strand": "+",
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'sprouty RTK signaling antagonist 3',
+ 'concept_id': 'ncbigene:10251',
+ 'symbol': 'SPRY3',
+ 'aliases': ['spry-3'],
+ 'xrefs': ['hgnc:11271', 'ensembl:ENSG00000168939'],
+ 'previous_symbols': ['LOC170187', 'LOC253479'],
+ 'associated_with': ['omim:300531'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'strand': '+',
+ 'locations': [
# {
# "id": "ga4gh:CL.r8Qv_b-B3SoguReqdunL3GCkt1RH-es1",
# "chr": "Y",
@@ -195,92 +195,92 @@ def spry3():
# "type": "ChromosomeLocation"
# },
{
- "id": "ga4gh:SL.2N5aguRIvBdGemRgABZFutmLTV925dsV",
- "end": 155782459,
- "start": 155612585,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
+ 'id': 'ga4gh:SL.2N5aguRIvBdGemRgABZFutmLTV925dsV',
+ 'end': 155782459,
+ 'start': 155612585,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
},
{
- "id": "ga4gh:SL.U9E9WtQdzFc4elR3t1qw48nueHgfWFWL",
- "end": 56968979,
- "start": 56954315,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
+ 'id': 'ga4gh:SL.U9E9WtQdzFc4elR3t1qw48nueHgfWFWL',
+ 'end': 56968979,
+ 'start': 56954315,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
},
],
- "gene_type": "protein-coding",
+ 'gene_type': 'protein-coding',
}
return Gene(**params)
# chromosome but no map locations
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def adcp1():
"""Create gene fixture for ADCP1."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "adenosine deaminase complexing protein 1",
- "concept_id": "ncbigene:106",
- "symbol": "ADCP1",
- "aliases": [],
- "xrefs": ["hgnc:229"],
- "previous_symbols": [],
- "associated_with": [],
- "symbol_status": None,
- "strand": None,
- "location_annotations": ["6"],
- "locations": [],
- "gene_type": "unknown",
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'adenosine deaminase complexing protein 1',
+ 'concept_id': 'ncbigene:106',
+ 'symbol': 'ADCP1',
+ 'aliases': [],
+ 'xrefs': ['hgnc:229'],
+ 'previous_symbols': [],
+ 'associated_with': [],
+ 'symbol_status': None,
+ 'strand': None,
+ 'location_annotations': ['6'],
+ 'locations': [],
+ 'gene_type': 'unknown',
}
return Gene(**params)
# no chromosome or map locations
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def afa():
"""Create gene fixture for AFA."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "ankyloblepharon filiforme adnatum",
- "concept_id": "ncbigene:170",
- "symbol": "AFA",
- "aliases": [],
- "xrefs": [],
- "previous_symbols": [],
- "associated_with": ["omim:106250"],
- "symbol_status": None,
- "strand": None,
- "location_annotations": [],
- "locations": [],
- "gene_type": "unknown",
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'ankyloblepharon filiforme adnatum',
+ 'concept_id': 'ncbigene:170',
+ 'symbol': 'AFA',
+ 'aliases': [],
+ 'xrefs': [],
+ 'previous_symbols': [],
+ 'associated_with': ['omim:106250'],
+ 'symbol_status': None,
+ 'strand': None,
+ 'location_annotations': [],
+ 'locations': [],
+ 'gene_type': 'unknown',
}
return Gene(**params)
# Contains non cytogenic locations (i.e. "map from Rosati....")
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def znf84():
"""Create gene fixture for ZNF84."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "zinc finger protein 84",
- "concept_id": "ncbigene:7637",
- "symbol": "ZNF84",
- "aliases": ["HPF2"],
- "xrefs": ["hgnc:13159", "ensembl:ENSG00000198040"],
- "previous_symbols": ["LOC100287429"],
- "associated_with": ["omim:618554"],
- "symbol_status": None,
- "location_annotations": ["map from Rosati ref via FISH [AFS]"],
- "strand": "+",
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'zinc finger protein 84',
+ 'concept_id': 'ncbigene:7637',
+ 'symbol': 'ZNF84',
+ 'aliases': ['HPF2'],
+ 'xrefs': ['hgnc:13159', 'ensembl:ENSG00000198040'],
+ 'previous_symbols': ['LOC100287429'],
+ 'associated_with': ['omim:618554'],
+ 'symbol_status': None,
+ 'location_annotations': ['map from Rosati ref via FISH [AFS]'],
+ 'strand': '+',
+ 'locations': [
# {
# "id": "ga4gh:CL.6YvQEs6MuHuNvt0Vlv8r4hMKIOK5Ktq4",
# "chr": "12",
@@ -290,38 +290,38 @@ def znf84():
# "type": "ChromosomeLocation"
# },
{
- "id": "ga4gh:SL.IRsls9vud2-CiA7Jq4L3ry2VVK7LoNud",
- "end": 133063299,
- "start": 133037508,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl",
+ 'id': 'ga4gh:SL.IRsls9vud2-CiA7Jq4L3ry2VVK7LoNud',
+ 'end': 133063299,
+ 'start': 133037508,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
}
],
- "gene_type": "protein-coding",
+ 'gene_type': 'protein-coding',
}
return Gene(**params)
# No arm or sub band
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def slc25a6():
"""Create gene fixture for SLC25A6."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "solute carrier family 25 member 6",
- "concept_id": "ncbigene:293",
- "symbol": "SLC25A6",
- "aliases": ["AAC3", "ANT", "ANT 2", "ANT 3", "ANT3", "ANT3Y"],
- "xrefs": ["hgnc:10992", "ensembl:ENSG00000169100", "ensembl:ENSG00000292334"],
- "previous_symbols": ["ANT3Y"],
- "associated_with": ["omim:300151", "omim:403000"],
- "symbol_status": None,
- "location_annotations": [],
- "strand": "-",
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'solute carrier family 25 member 6',
+ 'concept_id': 'ncbigene:293',
+ 'symbol': 'SLC25A6',
+ 'aliases': ['AAC3', 'ANT', 'ANT 2', 'ANT 3', 'ANT3', 'ANT3Y'],
+ 'xrefs': ['hgnc:10992', 'ensembl:ENSG00000169100', 'ensembl:ENSG00000292334'],
+ 'previous_symbols': ['ANT3Y'],
+ 'associated_with': ['omim:300151', 'omim:403000'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'strand': '-',
+ 'locations': [
# {
# "id": "ga4gh:CL.Z5pOXNI2Bt8L2NpypNYsbbtgC9L1uyl4",
# "type": "ChromosomeLocation",
@@ -339,48 +339,48 @@ def slc25a6():
# "end": "p11.2"
# },
{
- "id": "ga4gh:SL.dvD-ZopQGZkVWx4Z-vFpP9ateicPHgQ6",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP",
+ 'id': 'ga4gh:SL.dvD-ZopQGZkVWx4Z-vFpP9ateicPHgQ6',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP',
},
- "start": 1386151,
- "end": 1392113,
+ 'start': 1386151,
+ 'end': 1392113,
},
{
- "id": "ga4gh:SL.bv3LobZZ-sERq5cIthyS4w_tmSwV2QSg",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5",
+ 'id': 'ga4gh:SL.bv3LobZZ-sERq5cIthyS4w_tmSwV2QSg',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5',
},
- "start": 1386151,
- "end": 1392113,
+ 'start': 1386151,
+ 'end': 1392113,
},
],
- "gene_type": "protein-coding",
+ 'gene_type': 'protein-coding',
}
return Gene(**params)
# Contains arm but no sub band
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def loc106783576():
"""Create gene fixture for ."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "nonconserved acetylation island sequence 68 enhancer",
- "concept_id": "ncbigene:106783576",
- "symbol": "LOC106783576",
- "aliases": [],
- "xrefs": [],
- "previous_symbols": [],
- "associated_with": [],
- "symbol_status": None,
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'nonconserved acetylation island sequence 68 enhancer',
+ 'concept_id': 'ncbigene:106783576',
+ 'symbol': 'LOC106783576',
+ 'aliases': [],
+ 'xrefs': [],
+ 'previous_symbols': [],
+ 'associated_with': [],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.YYGQrLtmKwKgp38asAkHT8AydAidnui8",
# "chr": "10",
@@ -390,28 +390,28 @@ def loc106783576():
# "type": "ChromosomeLocation"
# }
],
- "gene_type": "biological-region",
+ 'gene_type': 'biological-region',
}
return Gene(**params)
# Testing for cen
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def glc1b():
"""Create gene fixture for GLC1B."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "glaucoma 1, open angle, B (adult-onset)",
- "concept_id": "ncbigene:2722",
- "symbol": "GLC1B",
- "aliases": [],
- "xrefs": [],
- "previous_symbols": [],
- "associated_with": ["omim:606689"],
- "symbol_status": None,
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'glaucoma 1, open angle, B (adult-onset)',
+ 'concept_id': 'ncbigene:2722',
+ 'symbol': 'GLC1B',
+ 'aliases': [],
+ 'xrefs': [],
+ 'previous_symbols': [],
+ 'associated_with': ['omim:606689'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.8D0hLCktRxyPrx4Etgabq10vEq6TtU43",
# "chr": "2",
@@ -421,28 +421,28 @@ def glc1b():
# "type": "ChromosomeLocation"
# }
],
- "gene_type": "unknown",
+ 'gene_type': 'unknown',
}
return Gene(**params)
# Testing for ter ranges
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def hdpa():
"""Create gene fixture for HDPA."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "Hodgkin disease, susceptibility, pseudoautosomal",
- "concept_id": "ncbigene:50829",
- "symbol": "HDPA",
- "aliases": [],
- "xrefs": [],
- "previous_symbols": [],
- "associated_with": ["omim:300221"],
- "symbol_status": None,
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'Hodgkin disease, susceptibility, pseudoautosomal',
+ 'concept_id': 'ncbigene:50829',
+ 'symbol': 'HDPA',
+ 'aliases': [],
+ 'xrefs': [],
+ 'previous_symbols': [],
+ 'associated_with': ['omim:300221'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.kl9HXvnUCE6Z1ktXibt83NBdXvxnT2RA",
# "chr": "X",
@@ -452,29 +452,29 @@ def hdpa():
# "type": "ChromosomeLocation"
# }
],
- "gene_type": "unknown",
+ 'gene_type': 'unknown',
}
return Gene(**params)
# Testing for annotation
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def prkrap1():
"""Create gene fixture for PRKRAP1."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "protein activator of interferon induced protein kinase "
- "EIF2AK2 pseudogene 1",
- "concept_id": "ncbigene:731716",
- "symbol": "PRKRAP1",
- "aliases": [],
- "xrefs": ["hgnc:33447"],
- "previous_symbols": ["LOC100289695"],
- "associated_with": [],
- "symbol_status": None,
- "location_annotations": ["alternate reference locus"],
- "strand": "+",
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'protein activator of interferon induced protein kinase '
+ 'EIF2AK2 pseudogene 1',
+ 'concept_id': 'ncbigene:731716',
+ 'symbol': 'PRKRAP1',
+ 'aliases': [],
+ 'xrefs': ['hgnc:33447'],
+ 'previous_symbols': ['LOC100289695'],
+ 'associated_with': [],
+ 'symbol_status': None,
+ 'location_annotations': ['alternate reference locus'],
+ 'strand': '+',
+ 'locations': [
# {
# "id": "ga4gh:CL.FYt7UkCHZVLpkYe7zhNdMk1K6lxl_k7I",
# "chr": "6",
@@ -484,48 +484,48 @@ def prkrap1():
# "type": "ChromosomeLocation"
# },
{
- "id": "ga4gh:SL.LwWy5JYncZVnOM9hWiLWW_z0n2eY-peb",
- "end": 3941874,
- "start": 3940269,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1",
+ 'id': 'ga4gh:SL.LwWy5JYncZVnOM9hWiLWW_z0n2eY-peb',
+ 'end': 3941874,
+ 'start': 3940269,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
},
{
- "id": "ga4gh:SL.q36ql_fX4HrZy_G2EXX_SGWl-7X5Bq6c",
- "end": 3932085,
- "start": 3930480,
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-",
+ 'id': 'ga4gh:SL.q36ql_fX4HrZy_G2EXX_SGWl-7X5Bq6c',
+ 'end': 3932085,
+ 'start': 3930480,
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-',
},
- "type": "SequenceLocation",
+ 'type': 'SequenceLocation',
},
],
- "gene_type": "pseudo",
+ 'gene_type': 'pseudo',
}
return Gene(**params)
# start > end
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def mhb():
"""Create gene fixture for MHB."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "myopathy, hyaline body, autosomal recessive",
- "concept_id": "ncbigene:619511",
- "symbol": "MHB",
- "aliases": [],
- "xrefs": [],
- "previous_symbols": [],
- "associated_with": ["omim:255160"],
- "symbol_status": None,
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'myopathy, hyaline body, autosomal recessive',
+ 'concept_id': 'ncbigene:619511',
+ 'symbol': 'MHB',
+ 'aliases': [],
+ 'xrefs': [],
+ 'previous_symbols': [],
+ 'associated_with': ['omim:255160'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.6vlmdqdXYxSAGsJI9no7kLN5iLKpvr5X",
# "chr": "3",
@@ -535,28 +535,28 @@ def mhb():
# "type": "ChromosomeLocation"
# }
],
- "gene_type": "unknown",
+ 'gene_type': 'unknown',
}
return Gene(**params)
# Different arms
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def spg37():
"""Create gene fixture for SPG37."""
params = {
- "match_type": MatchType.NO_MATCH,
- "label": "spastic paraplegia 37 (autosomal dominant)",
- "concept_id": "ncbigene:100049159",
- "symbol": "SPG37",
- "aliases": [],
- "xrefs": [],
- "previous_symbols": [],
- "associated_with": ["omim:611945"],
- "symbol_status": None,
- "location_annotations": [],
- "strand": None,
- "locations": [
+ 'match_type': MatchType.NO_MATCH,
+ 'label': 'spastic paraplegia 37 (autosomal dominant)',
+ 'concept_id': 'ncbigene:100049159',
+ 'symbol': 'SPG37',
+ 'aliases': [],
+ 'xrefs': [],
+ 'previous_symbols': [],
+ 'associated_with': ['omim:611945'],
+ 'symbol_status': None,
+ 'location_annotations': [],
+ 'strand': None,
+ 'locations': [
# {
# "id": "ga4gh:CL.XWbwTwmJ95KD-aCuXfJcD8cNIvXbiXRh",
# "chr": "8",
@@ -566,349 +566,349 @@ def spg37():
# "type": "ChromosomeLocation"
# }
],
- "gene_type": "unknown",
+ 'gene_type': 'unknown',
}
return Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def source_urls():
"""Provide source data URLs fixture."""
return {
- "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz",
- "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz",
- "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/",
+ 'info_file': 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz',
+ 'history_file': 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz',
+ 'assembly_file': 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/',
}
def test_dpf1(check_resp_single_record, ncbi, dpf1):
"""Test that DPF1 normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("ncbigene:8193")
+ resp = ncbi.search('ncbigene:8193')
check_resp_single_record(resp, dpf1, MatchType.CONCEPT_ID)
- resp = ncbi.search("ncbIgene:8193")
+ resp = ncbi.search('ncbIgene:8193')
check_resp_single_record(resp, dpf1, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("DPF1")
+ resp = ncbi.search('DPF1')
check_resp_single_record(resp, dpf1, MatchType.SYMBOL)
- resp = ncbi.search("DpF1")
+ resp = ncbi.search('DpF1')
check_resp_single_record(resp, dpf1, MatchType.SYMBOL)
# Alias
- resp = ncbi.search("BAF45b")
+ resp = ncbi.search('BAF45b')
check_resp_single_record(resp, dpf1, MatchType.ALIAS)
- resp = ncbi.search("NEUD4")
+ resp = ncbi.search('NEUD4')
check_resp_single_record(resp, dpf1, MatchType.ALIAS)
- resp = ncbi.search("neuro-d4")
+ resp = ncbi.search('neuro-d4')
check_resp_single_record(resp, dpf1, MatchType.ALIAS)
# associated_with
- resp = ncbi.search("omim:601670")
+ resp = ncbi.search('omim:601670')
check_resp_single_record(resp, dpf1, MatchType.ASSOCIATED_WITH)
# No Match
- resp = ncbi.search("DPF 1")
+ resp = ncbi.search('DPF 1')
assert len(resp.records) == 0
- resp = ncbi.search("DPG1")
+ resp = ncbi.search('DPG1')
assert len(resp.records) == 0
def test_pdp1(compare_records, check_resp_single_record, ncbi, pdp1_symbol, pdp1_alias):
"""Test that PDP1 normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("ncbigene:54704")
+ resp = ncbi.search('ncbigene:54704')
check_resp_single_record(resp, pdp1_symbol, MatchType.CONCEPT_ID)
- resp = ncbi.search("NCBIGENE:54704")
+ resp = ncbi.search('NCBIGENE:54704')
check_resp_single_record(resp, pdp1_symbol, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("PDP1")
+ resp = ncbi.search('PDP1')
assert len(resp.records) == 2
# first record check (should always be symbol)
compare_records(resp.records[0], pdp1_symbol, MatchType.SYMBOL)
compare_records(resp.records[1], pdp1_alias, MatchType.ALIAS)
- resp = ncbi.search("pdp1")
+ resp = ncbi.search('pdp1')
assert len(resp.records) == 2
# first record check (should always be symbol)
compare_records(resp.records[0], pdp1_symbol, MatchType.SYMBOL)
compare_records(resp.records[1], pdp1_alias, MatchType.ALIAS)
# Previous Symbol
- resp = ncbi.search("LOC157663")
+ resp = ncbi.search('LOC157663')
check_resp_single_record(resp, pdp1_symbol, MatchType.PREV_SYMBOL)
- resp = ncbi.search("PPM2C")
+ resp = ncbi.search('PPM2C')
check_resp_single_record(resp, pdp1_symbol, MatchType.PREV_SYMBOL)
- resp = ncbi.search("loc157663")
+ resp = ncbi.search('loc157663')
check_resp_single_record(resp, pdp1_symbol, MatchType.PREV_SYMBOL)
# Alias
- resp = ncbi.search("pdh")
+ resp = ncbi.search('pdh')
check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS)
- resp = ncbi.search("PDP")
+ resp = ncbi.search('PDP')
check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS)
- resp = ncbi.search("PDPC")
+ resp = ncbi.search('PDPC')
check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS)
- resp = ncbi.search("PPM2A")
+ resp = ncbi.search('PPM2A')
check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS)
def test_spry3(check_resp_single_record, ncbi, spry3):
"""Test that SPRY3 normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:10251")
+ resp = ncbi.search('NCBIgene:10251')
check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("sprY3")
+ resp = ncbi.search('sprY3')
check_resp_single_record(resp, spry3, MatchType.SYMBOL)
# Alias
- resp = ncbi.search("SPRY-3")
+ resp = ncbi.search('SPRY-3')
check_resp_single_record(resp, spry3, MatchType.ALIAS)
def test_adcp1(check_resp_single_record, ncbi, adcp1):
"""Test that ADCP1 normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:106")
+ resp = ncbi.search('NCBIgene:106')
check_resp_single_record(resp, adcp1, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("ADCP1")
+ resp = ncbi.search('ADCP1')
check_resp_single_record(resp, adcp1, MatchType.SYMBOL)
def test_afa(check_resp_single_record, ncbi, afa):
"""Test that AFA normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:170")
+ resp = ncbi.search('NCBIgene:170')
check_resp_single_record(resp, afa, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("AFA")
+ resp = ncbi.search('AFA')
check_resp_single_record(resp, afa, MatchType.SYMBOL)
def test_znf84(check_resp_single_record, ncbi, znf84):
"""Test that ZNF84 normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:7637")
+ resp = ncbi.search('NCBIgene:7637')
check_resp_single_record(resp, znf84, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("ZNF84")
+ resp = ncbi.search('ZNF84')
check_resp_single_record(resp, znf84, MatchType.SYMBOL)
def test_slc25a6(check_resp_single_record, ncbi, slc25a6):
"""Test that SLC25A6 normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:293")
+ resp = ncbi.search('NCBIgene:293')
check_resp_single_record(resp, slc25a6, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("SLC25A6")
+ resp = ncbi.search('SLC25A6')
check_resp_single_record(resp, slc25a6, MatchType.SYMBOL)
def test_loc106783576(check_resp_single_record, ncbi, loc106783576):
"""Test that LOC106783576 normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:106783576")
+ resp = ncbi.search('NCBIgene:106783576')
check_resp_single_record(resp, loc106783576, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("LOC106783576")
+ resp = ncbi.search('LOC106783576')
check_resp_single_record(resp, loc106783576, MatchType.SYMBOL)
def test_oms(ncbi):
"""Test that OMS matches to correct gene concept."""
- resp = ncbi.search("NCBIgene:619538")
+ resp = ncbi.search('NCBIgene:619538')
assert len(resp.records) == 0
def test_glc1b(check_resp_single_record, ncbi, glc1b):
"""Test that GLC1B normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:2722")
+ resp = ncbi.search('NCBIgene:2722')
check_resp_single_record(resp, glc1b, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("GLC1B")
+ resp = ncbi.search('GLC1B')
check_resp_single_record(resp, glc1b, MatchType.SYMBOL)
# associated_with
- resp = ncbi.search("omim:606689")
+ resp = ncbi.search('omim:606689')
check_resp_single_record(resp, glc1b, MatchType.ASSOCIATED_WITH)
def test_hdpa(check_resp_single_record, ncbi, hdpa):
"""Test that HDPA normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:50829")
+ resp = ncbi.search('NCBIgene:50829')
check_resp_single_record(resp, hdpa, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("HDPA")
+ resp = ncbi.search('HDPA')
check_resp_single_record(resp, hdpa, MatchType.SYMBOL)
def test_prkrap1(check_resp_single_record, ncbi, prkrap1):
"""Test that PRKRAP1 normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:731716")
+ resp = ncbi.search('NCBIgene:731716')
check_resp_single_record(resp, prkrap1, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("PRKRAP1")
+ resp = ncbi.search('PRKRAP1')
check_resp_single_record(resp, prkrap1, MatchType.SYMBOL)
# xref
- resp = ncbi.search("hgnc:33447")
+ resp = ncbi.search('hgnc:33447')
check_resp_single_record(resp, prkrap1, MatchType.XREF)
def test_mhb(check_resp_single_record, ncbi, mhb):
"""Test that MHB normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:619511")
+ resp = ncbi.search('NCBIgene:619511')
check_resp_single_record(resp, mhb, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("MHB")
+ resp = ncbi.search('MHB')
check_resp_single_record(resp, mhb, MatchType.SYMBOL)
# associated_with
- resp = ncbi.search("OMIM:255160")
+ resp = ncbi.search('OMIM:255160')
check_resp_single_record(resp, mhb, MatchType.ASSOCIATED_WITH)
def test_spg37(check_resp_single_record, ncbi, spg37):
"""Test that SPG37 normalizes to correct gene concept."""
# Concept ID
- resp = ncbi.search("NCBIgene:100049159")
+ resp = ncbi.search('NCBIgene:100049159')
check_resp_single_record(resp, spg37, MatchType.CONCEPT_ID)
# Symbol
- resp = ncbi.search("SPG37")
+ resp = ncbi.search('SPG37')
check_resp_single_record(resp, spg37, MatchType.SYMBOL)
# associated_with
- resp = ncbi.search("omim:611945")
+ resp = ncbi.search('omim:611945')
check_resp_single_record(resp, spg37, MatchType.ASSOCIATED_WITH)
def test_discontinued_genes(ncbi):
"""Test searches for discontinued genes."""
# HOTS
- resp = ncbi.search("ncbigene:103344718")
+ resp = ncbi.search('ncbigene:103344718')
check_ncbi_discontinued_gene(
- resp, "ncbigene:103344718", "HOTS", MatchType.CONCEPT_ID
+ resp, 'ncbigene:103344718', 'HOTS', MatchType.CONCEPT_ID
)
- resp = ncbi.search("HOTS")
+ resp = ncbi.search('HOTS')
check_ncbi_discontinued_gene(
- resp, "ncbigene:103344718", "HOTS", MatchType.CONCEPT_ID
+ resp, 'ncbigene:103344718', 'HOTS', MatchType.CONCEPT_ID
)
- resp = ncbi.search("hots")
+ resp = ncbi.search('hots')
check_ncbi_discontinued_gene(
- resp, "ncbigene:103344718", "HOTS", MatchType.CONCEPT_ID
+ resp, 'ncbigene:103344718', 'HOTS', MatchType.CONCEPT_ID
)
# AASTH23
- resp = ncbi.search("ncbigene:544580")
+ resp = ncbi.search('ncbigene:544580')
check_ncbi_discontinued_gene(
- resp, "ncbigene:544580", "AASTH23", MatchType.CONCEPT_ID
+ resp, 'ncbigene:544580', 'AASTH23', MatchType.CONCEPT_ID
)
- resp = ncbi.search("AASTH23")
+ resp = ncbi.search('AASTH23')
check_ncbi_discontinued_gene(
- resp, "ncbigene:544580", "AASTH23", MatchType.CONCEPT_ID
+ resp, 'ncbigene:544580', 'AASTH23', MatchType.CONCEPT_ID
)
- resp = ncbi.search("aastH23")
+ resp = ncbi.search('aastH23')
check_ncbi_discontinued_gene(
- resp, "ncbigene:544580", "AASTH23", MatchType.CONCEPT_ID
+ resp, 'ncbigene:544580', 'AASTH23', MatchType.CONCEPT_ID
)
def test_no_match(ncbi, source_urls):
"""Test that nonexistent query doesn"t normalize to a match."""
- response = ncbi.search("cisplatin")
+ response = ncbi.search('cisplatin')
assert len(response.records) == 0
# double-check that meta still populates
- assert response.source_meta_.data_license == "custom"
+ assert response.source_meta_.data_license == 'custom'
assert (
response.source_meta_.data_license_url
- == "https://www.ncbi.nlm.nih.gov/home/about/policies/"
+ == 'https://www.ncbi.nlm.nih.gov/home/about/policies/'
)
- assert datetime.strptime(response.source_meta_.version, "%Y%m%d")
+ assert datetime.strptime(response.source_meta_.version, '%Y%m%d')
assert response.source_meta_.data_url == source_urls
- assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html"
- assert not response.source_meta_.data_license_attributes["non_commercial"]
- assert not response.source_meta_.data_license_attributes["share_alike"]
- assert not response.source_meta_.data_license_attributes["attribution"]
+ assert response.source_meta_.rdp_url == 'https://reusabledata.org/ncbi-gene.html'
+ assert not response.source_meta_.data_license_attributes['non_commercial']
+ assert not response.source_meta_.data_license_attributes['share_alike']
+ assert not response.source_meta_.data_license_attributes['attribution']
# check blank
- response = ncbi.search("")
+ response = ncbi.search('')
assert len(response.records) == 0
# check some strange characters
- response = ncbi.search("----")
+ response = ncbi.search('----')
assert len(response.records) == 0
response = ncbi.search("''")
assert len(response.records) == 0
- response = ncbi.search("~~~")
+ response = ncbi.search('~~~')
assert len(response.records) == 0
- response = ncbi.search(" ")
+ response = ncbi.search(' ')
assert len(response.records) == 0
# Incorrect Concept IDs
- response = ncbi.search("ncblgene:8193")
+ response = ncbi.search('ncblgene:8193')
assert len(response.records) == 0
- response = ncbi.search("NCBIGENE54704")
+ response = ncbi.search('NCBIGENE54704')
assert len(response.records) == 0
- response = ncbi.search("54704")
+ response = ncbi.search('54704')
assert len(response.records) == 0
- response = ncbi.search("ncbigene;54704")
+ response = ncbi.search('ncbigene;54704')
assert len(response.records) == 0
def test_meta(ncbi, source_urls):
"""Test NCBI source metadata."""
- response = ncbi.search("PDP1")
- assert response.source_meta_.data_license == "custom"
+ response = ncbi.search('PDP1')
+ assert response.source_meta_.data_license == 'custom'
assert (
response.source_meta_.data_license_url
- == "https://www.ncbi.nlm.nih.gov/home/about/policies/"
+ == 'https://www.ncbi.nlm.nih.gov/home/about/policies/'
)
- assert datetime.strptime(response.source_meta_.version, "%Y%m%d")
+ assert datetime.strptime(response.source_meta_.version, '%Y%m%d')
assert response.source_meta_.data_url == source_urls
- assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html"
- assert response.source_meta_.genome_assemblies == ["GRCh38.p14"]
+ assert response.source_meta_.rdp_url == 'https://reusabledata.org/ncbi-gene.html'
+ assert response.source_meta_.genome_assemblies == ['GRCh38.p14']
assert response.source_meta_.data_license_attributes == {
- "non_commercial": False,
- "share_alike": False,
- "attribution": False,
+ 'non_commercial': False,
+ 'share_alike': False,
+ 'attribution': False,
}
diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py
index f767ced1..31da2878 100644
--- a/tests/unit/test_query.py
+++ b/tests/unit/test_query.py
@@ -6,7 +6,7 @@
from gene.schemas import BaseGene, MatchType, SourceName
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def query_handler(database):
"""Build query_handler test fixture."""
@@ -14,7 +14,7 @@ class QueryGetter:
def __init__(self):
self.query_handler = QueryHandler(database)
- def search(self, query_str, incl="", excl=""):
+ def search(self, query_str, incl='', excl=''):
return self.query_handler.search(query_str=query_str, incl=incl, excl=excl)
def normalize(self, query_str):
@@ -26,79 +26,79 @@ def normalize_unmerged(self, query_str):
return QueryGetter()
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def normalized_ache():
"""Return normalized core Gene object for ACHE."""
params = {
- "type": "Gene",
- "id": "normalize.gene.hgnc:108",
- "label": "ACHE",
- "mappings": [
+ 'type': 'Gene',
+ 'id': 'normalize.gene.hgnc:108',
+ 'label': 'ACHE',
+ 'mappings': [
{
- "coding": {"code": "ENSG00000087085", "system": "ensembl"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'ENSG00000087085', 'system': 'ensembl'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "43", "system": "ncbigene"},
- "relation": "relatedMatch",
+ 'coding': {'code': '43', 'system': 'ncbigene'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "OTTHUMG00000157033", "system": "vega"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'OTTHUMG00000157033', 'system': 'vega'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "uc003uxi.4", "system": "ucsc"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'uc003uxi.4', 'system': 'ucsc'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS5710", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS5710', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS64736", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS64736', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS5709", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS5709', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "P22303", "system": "uniprot"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'P22303', 'system': 'uniprot'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "1380483", "system": "pubmed"},
- "relation": "relatedMatch",
+ 'coding': {'code': '1380483', 'system': 'pubmed'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "100740", "system": "omim"},
- "relation": "relatedMatch",
+ 'coding': {'code': '100740', 'system': 'omim'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "S09.979", "system": "merops"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'S09.979', 'system': 'merops'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "2465", "system": "iuphar"},
- "relation": "relatedMatch",
+ 'coding': {'code': '2465', 'system': 'iuphar'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "NM_015831", "system": "refseq"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'NM_015831', 'system': 'refseq'},
+ 'relation': 'relatedMatch',
},
],
- "aliases": ["3.1.1.7", "YT", "N-ACHE", "ARACHE", "ACEE"],
- "extensions": [
- {"name": "previous_symbols", "value": ["ACEE", "YT"], "type": "Extension"},
+ 'aliases': ['3.1.1.7', 'YT', 'N-ACHE', 'ARACHE', 'ACEE'],
+ 'extensions': [
+ {'name': 'previous_symbols', 'value': ['ACEE', 'YT'], 'type': 'Extension'},
{
- "name": "approved_name",
- "value": "acetylcholinesterase (Cartwright blood group)",
- "type": "Extension",
+ 'name': 'approved_name',
+ 'value': 'acetylcholinesterase (Cartwright blood group)',
+ 'type': 'Extension',
},
- {"name": "symbol_status", "value": "approved", "type": "Extension"},
+ {'name': 'symbol_status', 'value': 'approved', 'type': 'Extension'},
{
- "name": "ncbi_locations",
- "value": [
+ 'name': 'ncbi_locations',
+ 'value': [
# {
# "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5",
# "type": "ChromosomeLocation",
@@ -108,17 +108,17 @@ def normalized_ache():
# "start": "q22.1"
# },
{
- "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
+ 'id': 'ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul',
},
- "start": 100889993,
- "end": 100896994,
+ 'start': 100889993,
+ 'end': 100896994,
}
],
- "type": "Extension",
+ 'type': 'Extension',
},
# {
# "name": "hgnc_locations",
@@ -135,117 +135,117 @@ def normalized_ache():
# "type": "Extension"
# },
{
- "name": "ensembl_locations",
- "value": [
+ 'name': 'ensembl_locations',
+ 'value': [
{
- "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
+ 'id': 'ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul',
},
- "start": 100889993,
- "end": 100896974,
+ 'start': 100889993,
+ 'end': 100896974,
}
],
- "type": "Extension",
+ 'type': 'Extension',
},
- {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"},
+ {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'},
{
- "name": "hgnc_locus_type",
- "type": "Extension",
- "value": "gene with protein product",
+ 'name': 'hgnc_locus_type',
+ 'type': 'Extension',
+ 'value': 'gene with protein product',
},
- {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"},
- {"name": "strand", "type": "Extension", "value": "-"},
+ {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'},
+ {'name': 'strand', 'type': 'Extension', 'value': '-'},
],
}
return core_models.Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def normalized_braf():
"""Return normalized core Gene object for BRAF."""
params = {
- "type": "Gene",
- "id": "normalize.gene.hgnc:1097",
- "label": "BRAF",
- "mappings": [
+ 'type': 'Gene',
+ 'id': 'normalize.gene.hgnc:1097',
+ 'label': 'BRAF',
+ 'mappings': [
{
- "coding": {"code": "673", "system": "ncbigene"},
- "relation": "relatedMatch",
+ 'coding': {'code': '673', 'system': 'ncbigene'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "ENSG00000157764", "system": "ensembl"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'ENSG00000157764', 'system': 'ensembl'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS5863", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS5863', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "1943", "system": "iuphar"},
- "relation": "relatedMatch",
+ 'coding': {'code': '1943', 'system': 'iuphar'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "119066", "system": "orphanet"},
- "relation": "relatedMatch",
+ 'coding': {'code': '119066', 'system': 'orphanet'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "BRAF", "system": "cosmic"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'BRAF', 'system': 'cosmic'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "2284096", "system": "pubmed"},
- "relation": "relatedMatch",
+ 'coding': {'code': '2284096', 'system': 'pubmed'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "uc003vwc.5", "system": "ucsc"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'uc003vwc.5', 'system': 'ucsc'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "164757", "system": "omim"},
- "relation": "relatedMatch",
+ 'coding': {'code': '164757', 'system': 'omim'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "NM_004333", "system": "refseq"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'NM_004333', 'system': 'refseq'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS87555", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS87555', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "P15056", "system": "uniprot"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'P15056', 'system': 'uniprot'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "M95712", "system": "ena.embl"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'M95712', 'system': 'ena.embl'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "OTTHUMG00000157457", "system": "vega"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'OTTHUMG00000157457', 'system': 'vega'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "1565476", "system": "pubmed"},
- "relation": "relatedMatch",
+ 'coding': {'code': '1565476', 'system': 'pubmed'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS94219", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS94219', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS94218", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS94218', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
],
- "aliases": ["BRAF1", "BRAF-1", "RAFB1", "NS7", "B-RAF1", "B-raf"],
- "extensions": [
+ 'aliases': ['BRAF1', 'BRAF-1', 'RAFB1', 'NS7', 'B-RAF1', 'B-raf'],
+ 'extensions': [
{
- "name": "approved_name",
- "value": "B-Raf proto-oncogene, serine/threonine kinase",
- "type": "Extension",
+ 'name': 'approved_name',
+ 'value': 'B-Raf proto-oncogene, serine/threonine kinase',
+ 'type': 'Extension',
},
# {
# "name": "hgnc_locations",
@@ -262,24 +262,24 @@ def normalized_braf():
# "type": "Extension"
# },
{
- "name": "ensembl_locations",
- "value": [
+ 'name': 'ensembl_locations',
+ 'value': [
{
- "id": "ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
+ 'id': 'ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul',
},
- "start": 140719326,
- "end": 140924929,
+ 'start': 140719326,
+ 'end': 140924929,
}
],
- "type": "Extension",
+ 'type': 'Extension',
},
{
- "name": "ncbi_locations",
- "value": [
+ 'name': 'ncbi_locations',
+ 'value': [
# {
# "id": "ga4gh:CL.ZZZYpOwuW1BLLJXc_Dm4eVZ5E0smVYCc",
# "type": "ChromosomeLocation",
@@ -289,124 +289,124 @@ def normalized_braf():
# "end": "q34"
# },
{
- "id": "ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",
+ 'id': 'ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul',
},
- "start": 140713327,
- "end": 140924929,
+ 'start': 140713327,
+ 'end': 140924929,
}
],
- "type": "Extension",
+ 'type': 'Extension',
},
- {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"},
+ {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'},
{
- "name": "hgnc_locus_type",
- "type": "Extension",
- "value": "gene with protein product",
+ 'name': 'hgnc_locus_type',
+ 'type': 'Extension',
+ 'value': 'gene with protein product',
},
- {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"},
- {"name": "strand", "type": "Extension", "value": "-"},
- {"name": "symbol_status", "type": "Extension", "value": "approved"},
+ {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'},
+ {'name': 'strand', 'type': 'Extension', 'value': '-'},
+ {'name': 'symbol_status', 'type': 'Extension', 'value': 'approved'},
],
}
return core_models.Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def normalized_abl1():
"""Return normalized core Gene object for ABL1."""
params = {
- "type": "Gene",
- "id": "normalize.gene.hgnc:76",
- "label": "ABL1",
- "mappings": [
+ 'type': 'Gene',
+ 'id': 'normalize.gene.hgnc:76',
+ 'label': 'ABL1',
+ 'mappings': [
{
- "coding": {"code": "ENSG00000097007", "system": "ensembl"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'ENSG00000097007', 'system': 'ensembl'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "25", "system": "ncbigene"},
- "relation": "relatedMatch",
+ 'coding': {'code': '25', 'system': 'ncbigene'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "OTTHUMG00000020813", "system": "vega"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'OTTHUMG00000020813', 'system': 'vega'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "uc004bzv.4", "system": "ucsc"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'uc004bzv.4', 'system': 'ucsc'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS35166", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS35166', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS35165", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS35165', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "P00519", "system": "uniprot"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'P00519', 'system': 'uniprot'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "1857987", "system": "pubmed"},
- "relation": "relatedMatch",
+ 'coding': {'code': '1857987', 'system': 'pubmed'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "12626632", "system": "pubmed"},
- "relation": "relatedMatch",
+ 'coding': {'code': '12626632', 'system': 'pubmed'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "ABL1", "system": "cosmic"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'ABL1', 'system': 'cosmic'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "189980", "system": "omim"},
- "relation": "relatedMatch",
+ 'coding': {'code': '189980', 'system': 'omim'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "117691", "system": "orphanet"},
- "relation": "relatedMatch",
+ 'coding': {'code': '117691', 'system': 'orphanet'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "1923", "system": "iuphar"},
- "relation": "relatedMatch",
+ 'coding': {'code': '1923', 'system': 'iuphar'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "M14752", "system": "ena.embl"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'M14752', 'system': 'ena.embl'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "NM_007313", "system": "refseq"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'NM_007313', 'system': 'refseq'},
+ 'relation': 'relatedMatch',
},
],
- "aliases": [
- "c-ABL",
- "JTK7",
- "p150",
- "CHDSKM",
- "BCR-ABL",
- "v-abl",
- "c-ABL1",
- "bcr/abl",
- "LOC116063",
- "LOC112779",
- "ABL",
+ 'aliases': [
+ 'c-ABL',
+ 'JTK7',
+ 'p150',
+ 'CHDSKM',
+ 'BCR-ABL',
+ 'v-abl',
+ 'c-ABL1',
+ 'bcr/abl',
+ 'LOC116063',
+ 'LOC112779',
+ 'ABL',
],
- "extensions": [
+ 'extensions': [
{
- "name": "previous_symbols",
- "value": ["LOC116063", "LOC112779", "ABL"],
- "type": "Extension",
+ 'name': 'previous_symbols',
+ 'value': ['LOC116063', 'LOC112779', 'ABL'],
+ 'type': 'Extension',
},
{
- "name": "approved_name",
- "value": "ABL proto-oncogene 1, non-receptor tyrosine kinase",
- "type": "Extension",
+ 'name': 'approved_name',
+ 'value': 'ABL proto-oncogene 1, non-receptor tyrosine kinase',
+ 'type': 'Extension',
},
# {
# "name": "hgnc_locations",
@@ -423,8 +423,8 @@ def normalized_abl1():
# "type": "Extension"
# },
{
- "name": "ncbi_locations",
- "value": [
+ 'name': 'ncbi_locations',
+ 'value': [
# {
# "id": "ga4gh:CL.1vsxettosueUHyFIOoTPzwIFD1DodLuT",
# "type": "ChromosomeLocation",
@@ -434,111 +434,111 @@ def normalized_abl1():
# "end": "q34.12"
# },
{
- "id": "ga4gh:SL.F1QUtInXQaBEjAJNR1sYHXdp0XC000Qi",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI",
+ 'id': 'ga4gh:SL.F1QUtInXQaBEjAJNR1sYHXdp0XC000Qi',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI',
},
- "start": 130713042,
- "end": 130887675,
+ 'start': 130713042,
+ 'end': 130887675,
}
],
- "type": "Extension",
+ 'type': 'Extension',
},
{
- "name": "ensembl_locations",
- "value": [
+ 'name': 'ensembl_locations',
+ 'value': [
{
- "id": "ga4gh:SL.P9Qu87GYxoWPYh1BdAQC5bTLorjvvye7",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI",
+ 'id': 'ga4gh:SL.P9Qu87GYxoWPYh1BdAQC5bTLorjvvye7',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI',
},
- "start": 130713015,
- "end": 130887675,
+ 'start': 130713015,
+ 'end': 130887675,
}
],
- "type": "Extension",
+ 'type': 'Extension',
},
- {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"},
+ {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'},
{
- "name": "hgnc_locus_type",
- "type": "Extension",
- "value": "gene with protein product",
+ 'name': 'hgnc_locus_type',
+ 'type': 'Extension',
+ 'value': 'gene with protein product',
},
- {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"},
- {"name": "strand", "type": "Extension", "value": "+"},
- {"name": "symbol_status", "type": "Extension", "value": "approved"},
+ {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'},
+ {'name': 'strand', 'type': 'Extension', 'value': '+'},
+ {'name': 'symbol_status', 'type': 'Extension', 'value': 'approved'},
],
}
return core_models.Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def normalized_p150():
"""Return normalized core Gene object for p150."""
params = {
- "type": "Gene",
- "id": "normalize.gene.hgnc:1910",
- "label": "CHAF1A",
- "mappings": [
+ 'type': 'Gene',
+ 'id': 'normalize.gene.hgnc:1910',
+ 'label': 'CHAF1A',
+ 'mappings': [
{
- "coding": {"code": "ENSG00000167670", "system": "ensembl"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'ENSG00000167670', 'system': 'ensembl'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "10036", "system": "ncbigene"},
- "relation": "relatedMatch",
+ 'coding': {'code': '10036', 'system': 'ncbigene'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "601246", "system": "omim"},
- "relation": "relatedMatch",
+ 'coding': {'code': '601246', 'system': 'omim'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "CCDS32875", "system": "ccds"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'CCDS32875', 'system': 'ccds'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "7600578", "system": "pubmed"},
- "relation": "relatedMatch",
+ 'coding': {'code': '7600578', 'system': 'pubmed'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "OTTHUMG00000181922", "system": "vega"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'OTTHUMG00000181922', 'system': 'vega'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "Q13111", "system": "uniprot"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'Q13111', 'system': 'uniprot'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "NM_005483", "system": "refseq"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'NM_005483', 'system': 'refseq'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "U20979", "system": "ena.embl"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'U20979', 'system': 'ena.embl'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "uc002mal.4", "system": "ucsc"},
- "relation": "relatedMatch",
+ 'coding': {'code': 'uc002mal.4', 'system': 'ucsc'},
+ 'relation': 'relatedMatch',
},
],
- "aliases": [
- "CAF1P150",
- "MGC71229",
- "CAF-1",
- "P150",
- "CAF1B",
- "CAF1",
- "LOC107985297",
+ 'aliases': [
+ 'CAF1P150',
+ 'MGC71229',
+ 'CAF-1',
+ 'P150',
+ 'CAF1B',
+ 'CAF1',
+ 'LOC107985297',
],
- "extensions": [
+ 'extensions': [
{
- "name": "approved_name",
- "value": "chromatin assembly factor 1 subunit A",
- "type": "Extension",
+ 'name': 'approved_name',
+ 'value': 'chromatin assembly factor 1 subunit A',
+ 'type': 'Extension',
},
# {
# "name": "hgnc_locations",
@@ -555,23 +555,23 @@ def normalized_p150():
# "type": "Extension"
# },
{
- "name": "ensembl_locations",
- "value": [
+ 'name': 'ensembl_locations',
+ 'value': [
{
- "id": "ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
+ 'id': 'ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl',
},
- "start": 4402639,
- "end": 4445018,
+ 'start': 4402639,
+ 'end': 4445018,
}
],
},
{
- "name": "ncbi_locations",
- "value": [
+ 'name': 'ncbi_locations',
+ 'value': [
# {
# "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ",
# "type": "ChromosomeLocation",
@@ -581,54 +581,54 @@ def normalized_p150():
# "end": "p13.3"
# },
{
- "id": "ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl",
+ 'id': 'ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl',
},
- "start": 4402639,
- "end": 4450830,
+ 'start': 4402639,
+ 'end': 4450830,
}
],
},
- {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"},
+ {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'},
{
- "name": "hgnc_locus_type",
- "type": "Extension",
- "value": "gene with protein product",
+ 'name': 'hgnc_locus_type',
+ 'type': 'Extension',
+ 'value': 'gene with protein product',
},
- {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"},
+ {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'},
{
- "name": "previous_symbols",
- "type": "Extension",
- "value": ["LOC107985297"],
+ 'name': 'previous_symbols',
+ 'type': 'Extension',
+ 'value': ['LOC107985297'],
},
- {"name": "strand", "type": "Extension", "value": "+"},
- {"name": "symbol_status", "type": "Extension", "value": "approved"},
+ {'name': 'strand', 'type': 'Extension', 'value': '+'},
+ {'name': 'symbol_status', 'type': 'Extension', 'value': 'approved'},
],
}
return core_models.Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def normalized_loc_653303():
"""Provide test fixture for NCBI gene LOC653303. Used to validate
normalized results that don't merge records.
"""
params = {
- "type": "Gene",
- "label": "LOC653303",
- "aliases": ["LOC196266", "LOC654080", "LOC731196"],
- "extensions": [
+ 'type': 'Gene',
+ 'label': 'LOC653303',
+ 'aliases': ['LOC196266', 'LOC654080', 'LOC731196'],
+ 'extensions': [
{
- "type": "Extension",
- "name": "approved_name",
- "value": "proprotein convertase subtilisin/kexin type 7 pseudogene",
+ 'type': 'Extension',
+ 'name': 'approved_name',
+ 'value': 'proprotein convertase subtilisin/kexin type 7 pseudogene',
},
{
- "name": "ncbi_locations",
- "value": [
+ 'name': 'ncbi_locations',
+ 'value': [
# {
# "id": "ga4gh:CL.82tL1yxucvwp5U2Yo4jNYX06pru8zZYl",
# "type": "ChromosomeLocation",
@@ -638,48 +638,48 @@ def normalized_loc_653303():
# "end": "q23.3"
# },
{
- "id": "ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1",
+ 'id': 'ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1',
},
- "start": 117135528,
- "end": 117138867,
+ 'start': 117135528,
+ 'end': 117138867,
}
],
},
{
- "type": "Extension",
- "name": "previous_symbols",
- "value": ["LOC196266", "LOC731196", "LOC654080"],
+ 'type': 'Extension',
+ 'name': 'previous_symbols',
+ 'value': ['LOC196266', 'LOC731196', 'LOC654080'],
},
- {"type": "Extension", "name": "ncbi_gene_type", "value": "pseudo"},
- {"name": "strand", "type": "Extension", "value": "+"},
+ {'type': 'Extension', 'name': 'ncbi_gene_type', 'value': 'pseudo'},
+ {'name': 'strand', 'type': 'Extension', 'value': '+'},
],
- "id": "normalize.gene.ncbigene:653303",
+ 'id': 'normalize.gene.ncbigene:653303',
}
return core_models.Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def normalize_unmerged_loc_653303():
"""Provide fixture for NCBI gene LOC655303. Used to validate normalized results
that don't merge records.
"""
return {
- "normalized_concept_id": "ncbigene:653303",
- "source_matches": {
- "NCBI": {
- "records": [
+ 'normalized_concept_id': 'ncbigene:653303',
+ 'source_matches': {
+ 'NCBI': {
+ 'records': [
{
- "concept_id": "ncbigene:653303",
- "symbol": "LOC653303",
- "symbol_status": None,
- "label": "proprotein convertase subtilisin/kexin type 7 pseudogene", # noqa: E501
- "strand": "+",
- "location_annotations": [],
- "locations": [
+ 'concept_id': 'ncbigene:653303',
+ 'symbol': 'LOC653303',
+ 'symbol_status': None,
+ 'label': 'proprotein convertase subtilisin/kexin type 7 pseudogene',
+ 'strand': '+',
+ 'location_annotations': [],
+ 'locations': [
# {
# "type": "ChromosomeLocation",
# "id": "ga4gh:CL.82tL1yxucvwp5U2Yo4jNYX06pru8zZYl",
@@ -689,21 +689,21 @@ def normalize_unmerged_loc_653303():
# "end": "q23.3"
# },
{
- "id": "ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", # noqa: E501
+ 'id': 'ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1',
},
- "start": 117135528,
- "end": 117138867,
+ 'start': 117135528,
+ 'end': 117138867,
}
],
- "aliases": [],
- "previous_symbols": ["LOC196266", "LOC731196", "LOC654080"],
- "xrefs": [],
- "associated_with": [],
- "gene_type": "pseudo",
+ 'aliases': [],
+ 'previous_symbols': ['LOC196266', 'LOC731196', 'LOC654080'],
+ 'xrefs': [],
+ 'associated_with': [],
+ 'gene_type': 'pseudo',
}
]
}
@@ -711,22 +711,22 @@ def normalize_unmerged_loc_653303():
}
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def normalize_unmerged_chaf1a():
"""Return expected results from /normalize_unmerged for CHAF1A."""
return {
- "normalized_concept_id": "hgnc:1910",
- "source_matches": {
- "HGNC": {
- "records": [
+ 'normalized_concept_id': 'hgnc:1910',
+ 'source_matches': {
+ 'HGNC': {
+ 'records': [
{
- "concept_id": "hgnc:1910",
- "symbol": "CHAF1A",
- "symbol_status": "approved",
- "label": "chromatin assembly factor 1 subunit A",
- "strand": None,
- "location_annotations": [],
- "locations": [
+ 'concept_id': 'hgnc:1910',
+ 'symbol': 'CHAF1A',
+ 'symbol_status': 'approved',
+ 'label': 'chromatin assembly factor 1 subunit A',
+ 'strand': None,
+ 'location_annotations': [],
+ 'locations': [
# {
# "type": "ChromosomeLocation",
# "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ",
@@ -736,69 +736,69 @@ def normalize_unmerged_chaf1a():
# "end": "p13.3"
# }
],
- "aliases": [
- "CAF1P150",
- "P150",
- "CAF1",
- "CAF1B",
- "MGC71229",
- "CAF-1",
+ 'aliases': [
+ 'CAF1P150',
+ 'P150',
+ 'CAF1',
+ 'CAF1B',
+ 'MGC71229',
+ 'CAF-1',
],
- "previous_symbols": [],
- "xrefs": ["ensembl:ENSG00000167670", "ncbigene:10036"],
- "associated_with": [
- "vega:OTTHUMG00000181922",
- "ccds:CCDS32875",
- "ucsc:uc002mal.4",
- "pubmed:7600578",
- "uniprot:Q13111",
- "omim:601246",
- "ena.embl:U20979",
- "refseq:NM_005483",
+ 'previous_symbols': [],
+ 'xrefs': ['ensembl:ENSG00000167670', 'ncbigene:10036'],
+ 'associated_with': [
+ 'vega:OTTHUMG00000181922',
+ 'ccds:CCDS32875',
+ 'ucsc:uc002mal.4',
+ 'pubmed:7600578',
+ 'uniprot:Q13111',
+ 'omim:601246',
+ 'ena.embl:U20979',
+ 'refseq:NM_005483',
],
- "gene_type": "gene with protein product",
+ 'gene_type': 'gene with protein product',
}
],
},
- "Ensembl": {
- "records": [
+ 'Ensembl': {
+ 'records': [
{
- "concept_id": "ensembl:ENSG00000167670",
- "symbol": "CHAF1A",
- "symbol_status": None,
- "label": "chromatin assembly factor 1 subunit A",
- "strand": "+",
- "location_annotations": [],
- "locations": [
+ 'concept_id': 'ensembl:ENSG00000167670',
+ 'symbol': 'CHAF1A',
+ 'symbol_status': None,
+ 'label': 'chromatin assembly factor 1 subunit A',
+ 'strand': '+',
+ 'location_annotations': [],
+ 'locations': [
{
- "id": "ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", # noqa: E501
+ 'id': 'ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl',
},
- "start": 4402639,
- "end": 4445018,
+ 'start': 4402639,
+ 'end': 4445018,
}
],
- "aliases": [],
- "previous_symbols": [],
- "xrefs": ["hgnc:1910"],
- "associated_with": [],
- "gene_type": "protein_coding",
+ 'aliases': [],
+ 'previous_symbols': [],
+ 'xrefs': ['hgnc:1910'],
+ 'associated_with': [],
+ 'gene_type': 'protein_coding',
}
],
},
- "NCBI": {
- "records": [
+ 'NCBI': {
+ 'records': [
{
- "concept_id": "ncbigene:10036",
- "symbol": "CHAF1A",
- "symbol_status": None,
- "label": "chromatin assembly factor 1 subunit A",
- "strand": "+",
- "location_annotations": [],
- "locations": [
+ 'concept_id': 'ncbigene:10036',
+ 'symbol': 'CHAF1A',
+ 'symbol_status': None,
+ 'label': 'chromatin assembly factor 1 subunit A',
+ 'strand': '+',
+ 'location_annotations': [],
+ 'locations': [
# {
# "type": "ChromosomeLocation",
# "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ",
@@ -808,21 +808,21 @@ def normalize_unmerged_chaf1a():
# "end": "p13.3"
# },
{
- "id": "ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", # noqa: E501
+ 'id': 'ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl',
},
- "start": 4402639,
- "end": 4450830,
+ 'start': 4402639,
+ 'end': 4450830,
}
],
- "aliases": ["CAF1P150", "P150", "CAF1", "CAF1B", "CAF-1"],
- "previous_symbols": ["LOC107985297"],
- "xrefs": ["ensembl:ENSG00000167670", "hgnc:1910"],
- "associated_with": ["omim:601246"],
- "gene_type": "protein-coding",
+ 'aliases': ['CAF1P150', 'P150', 'CAF1', 'CAF1B', 'CAF-1'],
+ 'previous_symbols': ['LOC107985297'],
+ 'xrefs': ['ensembl:ENSG00000167670', 'hgnc:1910'],
+ 'associated_with': ['omim:601246'],
+ 'gene_type': 'protein-coding',
}
]
},
@@ -830,22 +830,22 @@ def normalize_unmerged_chaf1a():
}
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def normalize_unmerged_ache():
"""Provide ACHE fixture for unmerged normalize endpoint."""
return {
- "normalized_concept_id": "hgnc:108",
- "source_matches": {
- "NCBI": {
- "records": [
+ 'normalized_concept_id': 'hgnc:108',
+ 'source_matches': {
+ 'NCBI': {
+ 'records': [
{
- "concept_id": "ncbigene:43",
- "symbol": "ACHE",
- "symbol_status": None,
- "label": "acetylcholinesterase (Cartwright blood group)",
- "strand": "-",
- "location_annotations": [],
- "locations": [
+ 'concept_id': 'ncbigene:43',
+ 'symbol': 'ACHE',
+ 'symbol_status': None,
+ 'label': 'acetylcholinesterase (Cartwright blood group)',
+ 'strand': '-',
+ 'location_annotations': [],
+ 'locations': [
# {
# "type": "ChromosomeLocation",
# "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5",
@@ -855,63 +855,63 @@ def normalize_unmerged_ache():
# "end": "q22.1"
# },
{
- "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501
+ 'id': 'ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul',
},
- "start": 100889993,
- "end": 100896994,
+ 'start': 100889993,
+ 'end': 100896994,
}
],
- "aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"],
- "previous_symbols": ["ACEE"],
- "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"],
- "associated_with": ["omim:100740"],
- "gene_type": "protein-coding",
+ 'aliases': ['YT', 'ARACHE', 'ACEE', 'N-ACHE'],
+ 'previous_symbols': ['ACEE'],
+ 'xrefs': ['hgnc:108', 'ensembl:ENSG00000087085'],
+ 'associated_with': ['omim:100740'],
+ 'gene_type': 'protein-coding',
}
],
},
- "Ensembl": {
- "records": [
+ 'Ensembl': {
+ 'records': [
{
- "concept_id": "ensembl:ENSG00000087085",
- "symbol": "ACHE",
- "symbol_status": None,
- "label": "acetylcholinesterase (Cartwright blood group)",
- "strand": "-",
- "location_annotations": [],
- "locations": [
+ 'concept_id': 'ensembl:ENSG00000087085',
+ 'symbol': 'ACHE',
+ 'symbol_status': None,
+ 'label': 'acetylcholinesterase (Cartwright blood group)',
+ 'strand': '-',
+ 'location_annotations': [],
+ 'locations': [
{
- "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1",
- "type": "SequenceLocation",
- "sequenceReference": {
- "type": "SequenceReference",
- "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501
+ 'id': 'ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1',
+ 'type': 'SequenceLocation',
+ 'sequenceReference': {
+ 'type': 'SequenceReference',
+ 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul',
},
- "start": 100889993,
- "end": 100896974,
+ 'start': 100889993,
+ 'end': 100896974,
}
],
- "aliases": [],
- "previous_symbols": [],
- "xrefs": ["hgnc:108"],
- "associated_with": [],
- "gene_type": "protein_coding",
+ 'aliases': [],
+ 'previous_symbols': [],
+ 'xrefs': ['hgnc:108'],
+ 'associated_with': [],
+ 'gene_type': 'protein_coding',
}
]
},
- "HGNC": {
- "records": [
+ 'HGNC': {
+ 'records': [
{
- "concept_id": "hgnc:108",
- "symbol": "ACHE",
- "symbol_status": "approved",
- "label": "acetylcholinesterase (Cartwright blood group)",
- "strand": None,
- "location_annotations": [],
- "locations": [
+ 'concept_id': 'hgnc:108',
+ 'symbol': 'ACHE',
+ 'symbol_status': 'approved',
+ 'label': 'acetylcholinesterase (Cartwright blood group)',
+ 'strand': None,
+ 'location_annotations': [],
+ 'locations': [
# {
# "type": "ChromosomeLocation",
# "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5",
@@ -921,23 +921,23 @@ def normalize_unmerged_ache():
# "end": "q22.1"
# }
],
- "aliases": ["3.1.1.7"],
- "previous_symbols": ["YT"],
- "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"],
- "associated_with": [
- "ucsc:uc003uxi.4",
- "vega:OTTHUMG00000157033",
- "merops:S09.979",
- "ccds:CCDS5710",
- "omim:100740",
- "iuphar:2465",
- "ccds:CCDS5709",
- "refseq:NM_015831",
- "pubmed:1380483",
- "uniprot:P22303",
- "ccds:CCDS64736",
+ 'aliases': ['3.1.1.7'],
+ 'previous_symbols': ['YT'],
+ 'xrefs': ['ncbigene:43', 'ensembl:ENSG00000087085'],
+ 'associated_with': [
+ 'ucsc:uc003uxi.4',
+ 'vega:OTTHUMG00000157033',
+ 'merops:S09.979',
+ 'ccds:CCDS5710',
+ 'omim:100740',
+ 'iuphar:2465',
+ 'ccds:CCDS5709',
+ 'refseq:NM_015831',
+ 'pubmed:1380483',
+ 'uniprot:P22303',
+ 'ccds:CCDS64736',
],
- "gene_type": "gene with protein product",
+ 'gene_type': 'gene with protein product',
}
]
},
@@ -945,55 +945,55 @@ def normalize_unmerged_ache():
}
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def normalized_ifnr():
"""Return normalized core Gene object for IFNR."""
params = {
- "type": "Gene",
- "id": "normalize.gene.hgnc:5447",
- "label": "IFNR",
- "mappings": [
+ 'type': 'Gene',
+ 'id': 'normalize.gene.hgnc:5447',
+ 'label': 'IFNR',
+ 'mappings': [
{
- "coding": {"code": "3466", "system": "ncbigene"},
- "relation": "relatedMatch",
+ 'coding': {'code': '3466', 'system': 'ncbigene'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "1906174", "system": "pubmed"},
- "relation": "relatedMatch",
+ 'coding': {'code': '1906174', 'system': 'pubmed'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "147573", "system": "omim"},
- "relation": "relatedMatch",
+ 'coding': {'code': '147573', 'system': 'omim'},
+ 'relation': 'relatedMatch',
},
{
- "coding": {"code": "1193239", "system": "pubmed"},
- "relation": "relatedMatch",
+ 'coding': {'code': '1193239', 'system': 'pubmed'},
+ 'relation': 'relatedMatch',
},
],
- "aliases": ["IFNGM", "IFNGM2"],
- "extensions": [
- {
- "name": "approved_name",
- "value": "interferon production regulator",
- "type": "Extension",
- },
- {"name": "symbol_status", "value": "approved", "type": "Extension"},
- {"name": "symbol_status", "value": "approved", "type": "Extension"},
- {"name": "ncbi_gene_type", "type": "Extension", "value": "unknown"},
- {"name": "hgnc_locus_type", "type": "Extension", "value": "unknown"},
- {"name": "location_annotations", "type": "Extension", "value": ["16"]},
+ 'aliases': ['IFNGM', 'IFNGM2'],
+ 'extensions': [
+ {
+ 'name': 'approved_name',
+ 'value': 'interferon production regulator',
+ 'type': 'Extension',
+ },
+ {'name': 'symbol_status', 'value': 'approved', 'type': 'Extension'},
+ {'name': 'symbol_status', 'value': 'approved', 'type': 'Extension'},
+ {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'unknown'},
+ {'name': 'hgnc_locus_type', 'type': 'Extension', 'value': 'unknown'},
+ {'name': 'location_annotations', 'type': 'Extension', 'value': ['16']},
],
}
return core_models.Gene(**params)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def num_sources():
"""Get the number of sources."""
return len({s for s in SourceName})
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def source_meta():
"""Create test fixture for source meta"""
return [SourceName.HGNC, SourceName.ENSEMBL, SourceName.NCBI]
@@ -1002,18 +1002,18 @@ def source_meta():
def compare_warnings(actual_warnings, expected_warnings):
"""Compare response warnings against expected results."""
if expected_warnings:
- assert len(actual_warnings) == len(expected_warnings), "warnings len"
+ assert len(actual_warnings) == len(expected_warnings), 'warnings len'
for e_warnings in expected_warnings:
for r_warnings in actual_warnings:
for e_key, e_val in e_warnings.items():
for r_val in r_warnings.values():
if e_key == r_val:
if isinstance(e_val, list):
- assert set(r_val) == set(e_val), "warnings val"
+ assert set(r_val) == set(e_val), 'warnings val'
else:
- assert r_val == e_val, "warnings val"
+ assert r_val == e_val, 'warnings val'
else:
- assert actual_warnings == [], "warnings != []"
+ assert actual_warnings == [], 'warnings != []'
def compare_normalize_resp(
@@ -1028,7 +1028,7 @@ def compare_normalize_resp(
assert resp.query == expected_query
compare_warnings(resp.warnings, expected_warnings)
assert resp.match_type == expected_match_type
- assert resp.normalized_id == expected_gene.id.split("normalize.gene.")[-1]
+ assert resp.normalized_id == expected_gene.id.split('normalize.gene.')[-1]
compare_gene(expected_gene, resp.gene)
if not expected_source_meta:
assert resp.source_meta_ == {}
@@ -1036,7 +1036,7 @@ def compare_normalize_resp(
resp_source_meta_keys = resp.source_meta_.keys()
assert len(resp_source_meta_keys) == len(
expected_source_meta
- ), "source_meta_keys" # noqa: E501
+ ), 'source_meta_keys'
for src in expected_source_meta:
assert src in resp_source_meta_keys
compare_service_meta(resp.service_meta_)
@@ -1065,7 +1065,7 @@ def compare_unmerged_response(actual, query, warnings, match_type, fixture):
assert actual.query == query
compare_warnings(actual.warnings, warnings)
assert actual.match_type == match_type
- assert actual.normalized_concept_id == fixture["normalized_concept_id"]
+ assert actual.normalized_concept_id == fixture['normalized_concept_id']
for source, match in actual.source_matches.items():
assert match.source_meta_ # check that it's there
@@ -1073,20 +1073,20 @@ def compare_unmerged_response(actual, query, warnings, match_type, fixture):
concept_id = record.concept_id
fixture_gene = None
# get corresponding fixture record
- for gene in fixture["source_matches"][source.value]["records"]:
- if gene["concept_id"] == concept_id:
+ for gene in fixture['source_matches'][source.value]['records']:
+ if gene['concept_id'] == concept_id:
fixture_gene = BaseGene(**gene)
break
- assert fixture_gene, f"Unable to find fixture for {concept_id}"
+ assert fixture_gene, f'Unable to find fixture for {concept_id}'
compare_unmerged_record(record, fixture_gene)
def compare_service_meta(service_meta):
"""Check that service metadata is correct."""
- assert service_meta.name == "gene-normalizer"
- assert service_meta.version >= "0.1.0"
+ assert service_meta.name == 'gene-normalizer'
+ assert service_meta.version >= '0.1.0'
assert isinstance(service_meta.response_datetime, str)
- assert service_meta.url == "https://github.com/cancervariants/gene-normalization"
+ assert service_meta.url == 'https://github.com/cancervariants/gene-normalization'
def compare_gene(test, actual):
@@ -1109,15 +1109,15 @@ def compare_gene(test, actual):
assert no_matches == [], no_matches
assert len(actual.mappings) == len(test.mappings)
- assert set(actual.aliases) == set(test.aliases), "aliases"
- extensions_present = "extensions" in test.model_fields.keys()
- assert ("extensions" in actual.model_fields.keys()) == extensions_present
+ assert set(actual.aliases) == set(test.aliases), 'aliases'
+ extensions_present = 'extensions' in test.model_fields.keys()
+ assert ('extensions' in actual.model_fields.keys()) == extensions_present
if extensions_present:
actual_ext_names = sorted([ext.name for ext in actual.extensions])
unique_actual_ext_names = sorted(set(actual_ext_names))
- assert actual_ext_names == unique_actual_ext_names, "duplicate extension names"
+ assert actual_ext_names == unique_actual_ext_names, 'duplicate extension names'
test_ext_names = {ext.name for ext in test.extensions}
- assert set(actual_ext_names) == test_ext_names, "extension names dont match"
+ assert set(actual_ext_names) == test_ext_names, 'extension names dont match'
n_ext_correct = 0
for test_ext in test.extensions:
for actual_ext in actual.extensions:
@@ -1130,20 +1130,20 @@ def compare_gene(test, actual):
else:
assert set(actual_ext.value) == set(
test_ext.value
- ), f"{test_ext.value} value"
+ ), f'{test_ext.value} value'
else:
assert actual_ext.value == test_ext.value
else:
assert actual_ext.value == test_ext.value
assert actual_ext.type == test_ext.type
n_ext_correct += 1
- assert n_ext_correct == len(test.extensions), "number of correct extensions"
+ assert n_ext_correct == len(test.extensions), 'number of correct extensions'
def test_search_query(query_handler, num_sources):
"""Test that query returns properly-structured response."""
- resp = query_handler.search(" BRAF ")
- assert resp.query == "BRAF"
+ resp = query_handler.search(' BRAF ')
+ assert resp.query == 'BRAF'
matches = resp.source_matches
assert isinstance(matches, dict)
assert len(matches) == num_sources
@@ -1151,20 +1151,20 @@ def test_search_query(query_handler, num_sources):
def test_search_query_inc_exc(query_handler, num_sources):
"""Test that query incl and excl work correctly."""
- sources = "hgnc, ensembl, ncbi"
- resp = query_handler.search("BRAF", excl=sources)
+ sources = 'hgnc, ensembl, ncbi'
+ resp = query_handler.search('BRAF', excl=sources)
matches = resp.source_matches
assert len(matches) == num_sources - len(sources.split())
- sources = "Hgnc, NCBi"
- resp = query_handler.search("BRAF", incl=sources)
+ sources = 'Hgnc, NCBi'
+ resp = query_handler.search('BRAF', incl=sources)
matches = resp.source_matches
assert len(matches) == len(sources.split())
assert SourceName.HGNC in matches
assert SourceName.NCBI in matches
- sources = "HGnC"
- resp = query_handler.search("BRAF", excl=sources)
+ sources = 'HGnC'
+ resp = query_handler.search('BRAF', excl=sources)
matches = resp.source_matches
assert len(matches) == num_sources - len(sources.split())
assert SourceName.ENSEMBL in matches
@@ -1174,30 +1174,30 @@ def test_search_query_inc_exc(query_handler, num_sources):
def test_search_invalid_parameter_exception(query_handler):
"""Test that Invalid parameter exception works correctly."""
with pytest.raises(InvalidParameterException):
- _ = query_handler.search("BRAF", incl="hgn") # noqa: F841, E501
+ _ = query_handler.search('BRAF', incl='hgn') # noqa: F841
with pytest.raises(InvalidParameterException):
- resp = query_handler.search("BRAF", incl="hgnc", excl="hgnc") # noqa: F841
+ resp = query_handler.search('BRAF', incl='hgnc', excl='hgnc') # noqa: F841
def test_ache_query(query_handler, num_sources, normalized_ache, source_meta):
"""Test that ACHE concept_id shows xref matches."""
# Search
- resp = query_handler.search("ncbigene:43")
+ resp = query_handler.search('ncbigene:43')
matches = resp.source_matches
assert len(matches) == num_sources
assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF
assert len(matches[SourceName.ENSEMBL].records) == 0
assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID
- resp = query_handler.search("hgnc:108")
+ resp = query_handler.search('hgnc:108')
matches = resp.source_matches
assert len(matches) == num_sources
assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID
assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF
assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF
- resp = query_handler.search("ensembl:ENSG00000087085")
+ resp = query_handler.search('ensembl:ENSG00000087085')
matches = resp.source_matches
assert len(matches) == num_sources
assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF
@@ -1205,49 +1205,49 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta):
assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF
# Normalize
- q = "ACHE"
+ q = 'ACHE'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.SYMBOL, normalized_ache, expected_source_meta=source_meta
)
- q = "ache"
+ q = 'ache'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.SYMBOL, normalized_ache, expected_source_meta=source_meta
)
- q = "hgnc:108"
+ q = 'hgnc:108'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta
)
- q = "ensembl:ENSG00000087085"
+ q = 'ensembl:ENSG00000087085'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta
)
- q = "ncbigene:43"
+ q = 'ncbigene:43'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta
)
- q = "3.1.1.7"
+ q = '3.1.1.7'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.ALIAS, normalized_ache, expected_source_meta=source_meta
)
- q = "ARACHE"
+ q = 'ARACHE'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.ALIAS, normalized_ache, expected_source_meta=source_meta
)
- q = "YT"
+ q = 'YT'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp,
@@ -1257,7 +1257,7 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta):
expected_source_meta=source_meta,
)
- q = "ACEE"
+ q = 'ACEE'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp,
@@ -1267,7 +1267,7 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta):
expected_source_meta=source_meta,
)
- q = "omim:100740"
+ q = 'omim:100740'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp,
@@ -1281,21 +1281,21 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta):
def test_braf_query(query_handler, num_sources, normalized_braf, source_meta):
"""Test that BRAF concept_id shows xref matches."""
# Search
- resp = query_handler.search("ncbigene:673")
+ resp = query_handler.search('ncbigene:673')
matches = resp.source_matches
assert len(matches) == num_sources
assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF
assert len(matches[SourceName.ENSEMBL].records) == 0
assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID
- resp = query_handler.search("hgnc:1097")
+ resp = query_handler.search('hgnc:1097')
matches = resp.source_matches
assert len(matches) == num_sources
assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID
assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF
assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF
- resp = query_handler.search("ensembl:ENSG00000157764")
+ resp = query_handler.search('ensembl:ENSG00000157764')
matches = resp.source_matches
assert len(matches) == num_sources
assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF
@@ -1303,49 +1303,49 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta):
assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF
# Normalize
- q = "BRAF"
+ q = 'BRAF'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.SYMBOL, normalized_braf, expected_source_meta=source_meta
)
- q = "braf"
+ q = 'braf'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.SYMBOL, normalized_braf, expected_source_meta=source_meta
)
- q = "hgnc:1097"
+ q = 'hgnc:1097'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta
)
- q = "ensembl:ENSG00000157764"
+ q = 'ensembl:ENSG00000157764'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta
)
- q = "ncbigene:673"
+ q = 'ncbigene:673'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta
)
- q = "NS7"
+ q = 'NS7'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.ALIAS, normalized_braf, expected_source_meta=source_meta
)
- q = "b-raf"
+ q = 'b-raf'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.ALIAS, normalized_braf, expected_source_meta=source_meta
)
- q = "omim:164757"
+ q = 'omim:164757'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp,
@@ -1359,21 +1359,21 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta):
def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta):
"""Test that ABL1 concept_id shows xref matches."""
# Search
- resp = query_handler.search("ncbigene:25")
+ resp = query_handler.search('ncbigene:25')
matches = resp.source_matches
assert len(matches) == num_sources
assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF
assert len(matches[SourceName.ENSEMBL].records) == 0
assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID
- resp = query_handler.search("hgnc:76")
+ resp = query_handler.search('hgnc:76')
matches = resp.source_matches
assert len(matches) == num_sources
assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID
assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF
assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF
- resp = query_handler.search("ensembl:ENSG00000097007")
+ resp = query_handler.search('ensembl:ENSG00000097007')
matches = resp.source_matches
assert len(matches) == num_sources
assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF
@@ -1381,43 +1381,43 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta):
assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF
# Normalize
- q = "ABL1"
+ q = 'ABL1'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.SYMBOL, normalized_abl1, expected_source_meta=source_meta
)
- q = "abl1"
+ q = 'abl1'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.SYMBOL, normalized_abl1, expected_source_meta=source_meta
)
- q = "hgnc:76"
+ q = 'hgnc:76'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta
)
- q = "ensembl:ENSG00000097007"
+ q = 'ensembl:ENSG00000097007'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta
)
- q = "ncbigene:25"
+ q = 'ncbigene:25'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta
)
- q = "v-abl"
+ q = 'v-abl'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp, q, MatchType.ALIAS, normalized_abl1, expected_source_meta=source_meta
)
- q = "LOC116063"
+ q = 'LOC116063'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp,
@@ -1427,7 +1427,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta):
expected_source_meta=source_meta,
)
- q = "LOC112779"
+ q = 'LOC112779'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp,
@@ -1437,7 +1437,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta):
expected_source_meta=source_meta,
)
- q = "ABL"
+ q = 'ABL'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp,
@@ -1447,7 +1447,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta):
expected_source_meta=source_meta,
)
- q = "refseq:NM_007313"
+ q = 'refseq:NM_007313'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp,
@@ -1460,16 +1460,16 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta):
def test_multiple_norm_concepts(query_handler, normalized_p150, source_meta):
"""Tests where more than one normalized concept is found."""
- q = "P150"
+ q = 'P150'
resp = query_handler.normalize(q)
expected_warnings = [
{
- "multiple_normalized_concepts_found": [
- "hgnc:16850",
- "hgnc:76",
- "hgnc:17168",
- "hgnc:500",
- "hgnc:8982",
+ 'multiple_normalized_concepts_found': [
+ 'hgnc:16850',
+ 'hgnc:76',
+ 'hgnc:17168',
+ 'hgnc:500',
+ 'hgnc:8982',
]
}
]
@@ -1487,7 +1487,7 @@ def test_normalize_single_entry(query_handler, normalized_loc_653303):
"""Test that the normalized endpoint correctly shapes unmerged identity
records into core gene objects.
"""
- q = "LOC653303"
+ q = 'LOC653303'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp,
@@ -1502,7 +1502,7 @@ def test_normalize_no_locations(query_handler, normalized_ifnr):
"""Test that the normalized endpoint correcly shapes merged entity with no
locations
"""
- q = "IFNR"
+ q = 'IFNR'
resp = query_handler.normalize(q)
compare_normalize_resp(
resp,
@@ -1521,55 +1521,55 @@ def test_normalize_unmerged(
):
"""Test that unmerged normalization produces correct results."""
# concept ID
- q = "ncbigene:653303"
+ q = 'ncbigene:653303'
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(
resp, q, [], MatchType.CONCEPT_ID, normalize_unmerged_loc_653303
)
- q = "hgnc:1910"
+ q = 'hgnc:1910'
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(
resp, q, [], MatchType.CONCEPT_ID, normalize_unmerged_chaf1a
)
- q = "HGNC:108"
+ q = 'HGNC:108'
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(
resp, q, [], MatchType.CONCEPT_ID, normalize_unmerged_ache
)
# symbol
- q = "LOC653303"
+ q = 'LOC653303'
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(
resp, q, [], MatchType.SYMBOL, normalize_unmerged_loc_653303
)
# prev symbol
- q = "ACEE"
+ q = 'ACEE'
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(
resp, q, [], MatchType.PREV_SYMBOL, normalize_unmerged_ache
)
- q = "LOC196266"
+ q = 'LOC196266'
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(
resp, q, [], MatchType.PREV_SYMBOL, normalize_unmerged_loc_653303
)
# alias
- q = "P150"
+ q = 'P150'
resp = query_handler.normalize_unmerged(q)
expected_warnings = [
{
- "multiple_normalized_concepts_found": [
- "hgnc:500",
- "hgnc:8982",
- "hgnc:17168",
- "hgnc:16850",
- "hgnc:76",
+ 'multiple_normalized_concepts_found': [
+ 'hgnc:500',
+ 'hgnc:8982',
+ 'hgnc:17168',
+ 'hgnc:16850',
+ 'hgnc:76',
]
}
]
@@ -1577,22 +1577,22 @@ def test_normalize_unmerged(
resp, q, expected_warnings, MatchType.ALIAS, normalize_unmerged_chaf1a
)
- q = "ARACHE"
+ q = 'ARACHE'
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(resp, q, [], MatchType.ALIAS, normalize_unmerged_ache)
- q = "MGC71229"
+ q = 'MGC71229'
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(resp, q, [], MatchType.ALIAS, normalize_unmerged_chaf1a)
# assoc with
- q = "omim:100740"
+ q = 'omim:100740'
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(
resp, q, [], MatchType.ASSOCIATED_WITH, normalize_unmerged_ache
)
- q = "uniprot:Q13111"
+ q = 'uniprot:Q13111'
resp = query_handler.normalize_unmerged(q)
compare_unmerged_response(
resp, q, [], MatchType.ASSOCIATED_WITH, normalize_unmerged_chaf1a
@@ -1601,18 +1601,18 @@ def test_normalize_unmerged(
def test_invalid_queries(query_handler):
"""Test invalid queries"""
- resp = query_handler.normalize("B R A F")
+ resp = query_handler.normalize('B R A F')
assert resp.match_type is MatchType.NO_MATCH
with pytest.raises(TypeError):
- resp["match_type"]
+ resp['match_type']
- resp = query_handler.search("B R A F")
+ resp = query_handler.search('B R A F')
records = [r for matches in resp.source_matches.values() for r in matches.records]
assert len(records) == 0
def test_service_meta(query_handler):
"""Test service meta info in response."""
- resp = query_handler.search("pheno")
+ resp = query_handler.search('pheno')
compare_service_meta(resp.service_meta_)
diff --git a/tests/unit/test_schemas.py b/tests/unit/test_schemas.py
index 3d5fceed..fbf6f339 100644
--- a/tests/unit/test_schemas.py
+++ b/tests/unit/test_schemas.py
@@ -16,22 +16,22 @@
# )
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def sequence_location():
"""Create a valid sequence location test fixture."""
return models.SequenceLocation(
sequence=models.SequenceReference(
- refgetAccession="SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul"
+ refgetAccession='SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul'
),
start=140719327,
end=140924929,
)
-@pytest.fixture(scope="module")
+@pytest.fixture(scope='module')
def gene():
"""Create a valid gene test fixture."""
- return Gene(match_type=100, concept_id="hgnc:1097", symbol="BRAF")
+ return Gene(match_type=100, concept_id='hgnc:1097', symbol='BRAF')
def test_gene(gene, sequence_location):
@@ -39,77 +39,77 @@ def test_gene(gene, sequence_location):
assert gene
assert Gene(
match_type=100,
- concept_id="ensembl:1",
- symbol="GENE",
+ concept_id='ensembl:1',
+ symbol='GENE',
# locations=[chromosome_location, sequence_location]
locations=[sequence_location],
)
assert Gene(
match_type=100,
- concept_id="ensembl:1",
- symbol="GENE",
+ concept_id='ensembl:1',
+ symbol='GENE',
locations=[sequence_location],
)
assert Gene(
match_type=100,
- concept_id="ensembl:1",
- symbol="GENE",
+ concept_id='ensembl:1',
+ symbol='GENE',
locations=[sequence_location],
)
# id not a valid curie
with pytest.raises(pydantic.ValidationError):
- Gene(match_type=100, concept_id="hgnc1096", symbol="BRAF")
+ Gene(match_type=100, concept_id='hgnc1096', symbol='BRAF')
# symbol not a str
with pytest.raises(pydantic.ValidationError):
- Gene(match_type=100, concept_id="hgnc:1096", symbol=1)
+ Gene(match_type=100, concept_id='hgnc:1096', symbol=1)
# strand not -/+
with pytest.raises(pydantic.ValidationError):
- Gene(match_type=100, concept_id="hgnc:1096", symbol="BRAF", strand="positive")
+ Gene(match_type=100, concept_id='hgnc:1096', symbol='BRAF', strand='positive')
# xrefs not a valid curie
with pytest.raises(pydantic.ValidationError):
Gene(
match_type=100,
- concept_id="hgnc:1096",
- symbol="BRAF",
- xrefs=["hgnc", "hgnc:1"],
+ concept_id='hgnc:1096',
+ symbol='BRAF',
+ xrefs=['hgnc', 'hgnc:1'],
)
# associated_with not a valid curie
with pytest.raises(pydantic.ValidationError):
Gene(
match_type=100,
- concept_id="hgnc:1096",
- symbol="BRAF",
- associated_with=["hgnc", "hgnc:1"],
+ concept_id='hgnc:1096',
+ symbol='BRAF',
+ associated_with=['hgnc', 'hgnc:1'],
)
# symbol status invalid
with pytest.raises(pydantic.ValidationError):
Gene(
match_type=100,
- concept_id="hgnc:1096",
- symbol="BRAF",
- symbol_status="nothing",
+ concept_id='hgnc:1096',
+ symbol='BRAF',
+ symbol_status='nothing',
)
# locations not a sequence or chromosome location
with pytest.raises(pydantic.ValidationError):
Gene(
match_type=100,
- concept_id="hgnc:1096",
- symbol="BRAF",
- locations=["GRCh38:chr1"],
+ concept_id='hgnc:1096',
+ symbol='BRAF',
+ locations=['GRCh38:chr1'],
)
# location not a list
with pytest.raises(pydantic.ValidationError):
Gene(
match_type=100,
- concept_id="hgnc:1096",
- symbol="BRAF",
+ concept_id='hgnc:1096',
+ symbol='BRAF',
locations=sequence_location,
)