From c700e45b96ea520a78cb7dcf3e0848b45cca7a3a Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Fri, 8 Nov 2024 17:17:09 +0000 Subject: [PATCH 01/30] Update doc string and main call --- src/python/ensembl/io/genomio/database/meta_getter.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/python/ensembl/io/genomio/database/meta_getter.py b/src/python/ensembl/io/genomio/database/meta_getter.py index 01c4e0599..da75fcaae 100644 --- a/src/python/ensembl/io/genomio/database/meta_getter.py +++ b/src/python/ensembl/io/genomio/database/meta_getter.py @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""A simple helper script to connect to a core database and retrieve a single meta_value -or multiple meta_value and dump meta_key/value pairs to stdout / JSON.""" +"""Connect to a core database and retrieve a meta_key:meta_value pair(s) +and dump meta_key/value pairs to stdout / JSON.""" __all__ = ["get_meta_values"] @@ -91,7 +91,7 @@ def parse_args(arg_list: list[str] | None) -> argparse.Namespace: """ parser = ArgumentParser(description=__doc__) - parser.add_server_arguments(include_database=True, help="core database") + parser.add_server_arguments(include_database=True, help="server url and core database") parser.add_argument_src_path( "--meta_keys_list", help="Input File | List with >=2 meta_keys to query target database." ) @@ -104,7 +104,6 @@ def main(arg_list: list[str] | None = None) -> None: Args: arg_list: Arguments to parse passing list to parse_args(). 
- """ args = parse_args(arg_list) init_logging_with_args(args) From d76dd1364516a22a5ccb70c4f5b336b54401ecf8 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Fri, 8 Nov 2024 17:18:10 +0000 Subject: [PATCH 02/30] Refactor dump, parse_args + add functionality --- .../io/genomio/genome_metadata/dump.py | 111 ++++++++++++++---- 1 file changed, 91 insertions(+), 20 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 7cb7702b5..b29738efa 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -19,22 +19,27 @@ "filter_genome_meta", "check_assembly_version", "check_genebuild_version", + "metadata_dump_setup", ] +import argparse import json from typing import Any, Dict, Type import logging from sqlalchemy import select from sqlalchemy.orm import Session +from sqlalchemy.engine import URL from ensembl.core.models import Meta +from ensembl.io.genomio.utils.json_utils import get_json from ensembl.io.genomio.database import DBConnectionLite from ensembl.utils.argparse import ArgumentParser +from ensembl.utils import StrPath from ensembl.utils.logging import init_logging_with_args -METADATA_FILTER: Dict[str, Dict[str, Type]] = { +DEFAULT_FILTER: Dict[str, Dict[str, Type]] = { "added_seq": {"region_name": str}, "annotation": {"provider_name": str, "provider_url": str}, "assembly": { @@ -60,7 +65,7 @@ } -def get_genome_metadata(session: Session) -> Dict[str, Any]: +def get_genome_metadata(session: Session, db_name: Dict[str, str] | None) -> Dict[str, Any]: """Returns the meta table content from the core database in a nested dictionary. 
Args: @@ -68,6 +73,7 @@ def get_genome_metadata(session: Session) -> Dict[str, Any]: """ genome_metadata: Dict[str, Any] = {} + meta_statement = select(Meta) for row in session.execute(meta_statement).unique().all(): meta_key = row[0].meta_key @@ -81,6 +87,10 @@ def get_genome_metadata(session: Session) -> Dict[str, Any]: genome_metadata[main_key][subkey] = [meta_value] else: genome_metadata[main_key] = {subkey: [meta_value]} + + if db_name: + genome_metadata["database"] = {"name": f"{db_name}"} + # Parse genome metadata to simplify dictionary and check data consistency for main_key, subkeys_dict in genome_metadata.items(): # Replace single-value lists by the value itself @@ -96,19 +106,33 @@ def get_genome_metadata(session: Session) -> Dict[str, Any]: return genome_metadata -def filter_genome_meta(genome_metadata: Dict[str, Any]) -> Dict[str, Any]: +def filter_genome_meta( + genome_metadata: Dict[str, Any], metafilter: StrPath | None, restrict_filter: bool +) -> Dict[str, Any]: """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER. Also converts to expected data types (to follow the genome JSON schema). Args: genome_metadata: Nested metadata key values from the core metadata table. + metafilter: Input JSON containing subset of meta table values to filter on. + restrict_filter: Deactivates additional meta updating. 
+ """ filtered_metadata: Dict[str, Any] = {} - for key, subfilter in METADATA_FILTER.items(): + + if metafilter: + DYNAMIC_METADATA_FILTER: Dict[str, Dict[str, type]] = get_json(metafilter) + else: + DYNAMIC_METADATA_FILTER = DEFAULT_FILTER + + for key, subfilter in DYNAMIC_METADATA_FILTER.items(): if key in genome_metadata: filtered_metadata[key] = {} for subkey, value_type in subfilter.items(): + if isinstance(value_type, str): + value_type = type(value_type) + if subkey in genome_metadata[key]: value = genome_metadata[key][subkey] if isinstance(value, list): @@ -116,10 +140,14 @@ def filter_genome_meta(genome_metadata: Dict[str, Any]) -> Dict[str, Any]: else: value = value_type(value) filtered_metadata[key][subkey] = value - # Check assembly and genebuild versions - check_assembly_refseq(filtered_metadata) - check_assembly_version(filtered_metadata) - check_genebuild_version(filtered_metadata) + + # Optional assembly and genebuild based filtering: + if not restrict_filter: + # Check assembly and genebuild versions + check_assembly_refseq(filtered_metadata) + check_assembly_version(filtered_metadata) + check_genebuild_version(filtered_metadata) + return filtered_metadata @@ -188,19 +216,62 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: genome_metadata["genebuild"].pop("id", None) -def main() -> None: - """Main script entry-point.""" - parser = ArgumentParser( - description="Fetch the genome metadata from a core database and print it in JSON format." - ) - parser.add_server_arguments(include_database=True) - parser.add_log_arguments(add_log_file=True) - args = parser.parse_args() - init_logging_with_args(args) +# def metadata_dump_setup(db_url: URL, metafilter: StrPath | None, no_update: bool, append_db: bool) -> Dict[str, Any]: +def metadata_dump_setup(db_url: URL, metafilter: StrPath | None, no_update: bool, append_db: bool) -> None: + """ + Args: + db_url: Target core database URL. 
+ metafilter: Input JSON containing subset of meta table values to filter on. + no_update: Deactivate additional meta updating. + append_db: Append target core database name to output JSON. + + """ + dbc = DBConnectionLite(db_url) + db_name = None + if append_db: + db_name = db_url.database - dbc = DBConnectionLite(args.url) with dbc.session_scope() as session: - genome_meta = get_genome_metadata(session) - genome_meta = filter_genome_meta(genome_meta) + genome_meta = get_genome_metadata(session, db_name) + genome_meta = filter_genome_meta(genome_meta, metafilter, no_update) print(json.dumps(genome_meta, indent=2, sort_keys=True)) + + +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + """Return a populated namespace with the arguments parsed from a list or from the command line. + + Args: + arg_list: List of arguments to parse. If `None`, grab them from the command line. + + """ + parser = ArgumentParser(description=__doc__) + parser.add_server_arguments(include_database=True, help="server url and core database") + parser.add_argument_src_path( + "--metafilter", default=None, help="Input File | List with >=2 meta_keys to query target database." + ) + parser.add_argument( + "--no_update", + default=False, + type=bool, + help="Deactivate additional assembly and genebuild metadata update.", + ) + parser.add_argument( + "--append_db", default=False, type=bool, help="Append core database name to output JSON." + ) + parser.add_log_arguments(add_log_file=True) + return parser.parse_args(arg_list) + + +def main(arg_list: list[str] | None = None) -> None: + """Main script entry-point. + + Args: + arg_list: Arguments to parse passing list to parse_args(). 
+ """ + args = parse_args(arg_list) + init_logging_with_args(args) + + metadata_dump_setup( + db_url=args.url, metafilter=args.metafilter, no_update=args.no_update, append_db=args.append_db + ) From da5f9e60483d5bd18abba41e61999fa97b5ed1bd Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Fri, 8 Nov 2024 17:33:36 +0000 Subject: [PATCH 03/30] Add database name to default meta_data --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index b29738efa..b880362ea 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -40,6 +40,7 @@ DEFAULT_FILTER: Dict[str, Dict[str, Type]] = { + "database": {"name": str}, "added_seq": {"region_name": str}, "annotation": {"provider_name": str, "provider_url": str}, "assembly": { From 7a05032f6e70e002d04b514631f4098c1eacf633 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Fri, 8 Nov 2024 17:45:40 +0000 Subject: [PATCH 04/30] Mypy Fix func type hint --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index b880362ea..e9ea1f025 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -66,7 +66,7 @@ } -def get_genome_metadata(session: Session, db_name: Dict[str, str] | None) -> Dict[str, Any]: +def get_genome_metadata(session: Session, db_name: str | None) -> Dict[str, Any]: """Returns the meta table content from the core database in a nested dictionary. 
Args: From ed14d5bb8165c1b90321636a2dcb00fb4041e129 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Fri, 8 Nov 2024 18:00:29 +0000 Subject: [PATCH 05/30] Fix pylint --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index e9ea1f025..9cc6c5af9 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -217,7 +217,6 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: genome_metadata["genebuild"].pop("id", None) -# def metadata_dump_setup(db_url: URL, metafilter: StrPath | None, no_update: bool, append_db: bool) -> Dict[str, Any]: def metadata_dump_setup(db_url: URL, metafilter: StrPath | None, no_update: bool, append_db: bool) -> None: """ Args: From 399a06f02b542529abd093b1b7da8ac803c541e1 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Mon, 11 Nov 2024 18:46:27 +0000 Subject: [PATCH 06/30] Polish and refactor/update dump tests --- .../io/genomio/genome_metadata/dump.py | 30 ++- src/python/tests/genome_metadata/test_dump.py | 229 ++++++++++++++++-- .../test_dump/filter_noupdate.json | 5 + .../test_dump/species_filter.json | 7 + .../test_dump/version_filter.json | 9 + 5 files changed, 246 insertions(+), 34 deletions(-) create mode 100644 src/python/tests/genome_metadata/test_dump/filter_noupdate.json create mode 100644 src/python/tests/genome_metadata/test_dump/species_filter.json create mode 100644 src/python/tests/genome_metadata/test_dump/version_filter.json diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 9cc6c5af9..0d7e1c421 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -108,7 +108,7 @@ def get_genome_metadata(session: Session, db_name: str | 
None) -> Dict[str, Any] def filter_genome_meta( - genome_metadata: Dict[str, Any], metafilter: StrPath | None, restrict_filter: bool + genome_metadata: Dict[str, Any], metafilter: StrPath | None, meta_update: bool ) -> Dict[str, Any]: """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER. @@ -117,7 +117,7 @@ def filter_genome_meta( Args: genome_metadata: Nested metadata key values from the core metadata table. metafilter: Input JSON containing subset of meta table values to filter on. - restrict_filter: Deactivates additional meta updating. + meta_update: Deactivates additional meta updating. """ filtered_metadata: Dict[str, Any] = {} @@ -133,7 +133,6 @@ def filter_genome_meta( for subkey, value_type in subfilter.items(): if isinstance(value_type, str): value_type = type(value_type) - if subkey in genome_metadata[key]: value = genome_metadata[key][subkey] if isinstance(value, list): @@ -143,7 +142,7 @@ def filter_genome_meta( filtered_metadata[key][subkey] = value # Optional assembly and genebuild based filtering: - if not restrict_filter: + if meta_update: # Check assembly and genebuild versions check_assembly_refseq(filtered_metadata) check_assembly_version(filtered_metadata) @@ -217,8 +216,8 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: genome_metadata["genebuild"].pop("id", None) -def metadata_dump_setup(db_url: URL, metafilter: StrPath | None, no_update: bool, append_db: bool) -> None: - """ +def metadata_dump_setup(db_url: URL, metafilter: StrPath | None, meta_update: bool, append_db: bool) -> Dict[str, Any]: + """Setup main stages of genome meta dump from user input arguments provided. Args: db_url: Target core database URL. metafilter: Input JSON containing subset of meta table values to filter on. 
@@ -233,9 +232,9 @@ def metadata_dump_setup(db_url: URL, metafilter: StrPath | None, no_update: bool with dbc.session_scope() as session: genome_meta = get_genome_metadata(session, db_name) - genome_meta = filter_genome_meta(genome_meta, metafilter, no_update) + genome_meta = filter_genome_meta(genome_meta, metafilter, meta_update) - print(json.dumps(genome_meta, indent=2, sort_keys=True)) + return genome_meta def parse_args(arg_list: list[str] | None) -> argparse.Namespace: @@ -251,14 +250,11 @@ def parse_args(arg_list: list[str] | None) -> argparse.Namespace: "--metafilter", default=None, help="Input File | List with >=2 meta_keys to query target database." ) parser.add_argument( - "--no_update", - default=False, - type=bool, + "--meta_update", + action="store_true", help="Deactivate additional assembly and genebuild metadata update.", ) - parser.add_argument( - "--append_db", default=False, type=bool, help="Append core database name to output JSON." - ) + parser.add_argument("--append_db", action="store_true", help="Append core database name to output JSON.") parser.add_log_arguments(add_log_file=True) return parser.parse_args(arg_list) @@ -272,6 +268,8 @@ def main(arg_list: list[str] | None = None) -> None: args = parse_args(arg_list) init_logging_with_args(args) - metadata_dump_setup( - db_url=args.url, metafilter=args.metafilter, no_update=args.no_update, append_db=args.append_db + genome_meta = metadata_dump_setup( + db_url=args.url, metafilter=args.metafilter, meta_update=args.meta_update, append_db=args.append_db ) + + print(json.dumps(genome_meta, indent=2, sort_keys=True)) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 75c4a4494..d298bad99 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -18,15 +18,19 @@ $ pytest test_dump.py """ - +from pathlib import Path +from unittest.mock import Mock, patch +from typing import Any, 
ContextManager, Dict, List from collections import namedtuple from contextlib import nullcontext as does_not_raise -from typing import Any, ContextManager, Dict, List -from unittest.mock import Mock, patch - from deepdiff import DeepDiff import pytest +from pytest import param +from _pytest.capture import CaptureFixture +from sqlalchemy.engine import make_url, URL + +from ensembl.utils import StrPath from ensembl.io.genomio.genome_metadata import dump @@ -115,44 +119,112 @@ def test_check_genebuild_version( @patch("ensembl.io.genomio.genome_metadata.dump.check_genebuild_version", Mock()) @patch("ensembl.io.genomio.genome_metadata.dump.check_assembly_version", Mock()) @pytest.mark.parametrize( - "genome_metadata, output", + "genome_metadata, output, meta_filter, restrict_filter", [ - ({"species": {"taxonomy_id": "5485"}}, {"species": {"taxonomy_id": 5485}}), - ({"species": {"display_name": "Dog"}}, {"species": {"display_name": "Dog"}}), - ({"genebuild": {"new_key": "_"}}, {"genebuild": {}}), - ({"BRC5": "new_value"}, {}), - ({"meta": "key", "species": {"alias": "woof"}}, {"species": {"alias": "woof"}}), - ({"added_seq": {"region_name": [1, 2]}}, {"added_seq": {"region_name": ["1", "2"]}}), + pytest.param({"species": {"taxonomy_id": "5485"}}, {"species": {"taxonomy_id": 5485}}, + None, False, id="Meta matches, no filter, allow meta update"), + pytest.param({"species": {"taxonomy_id": "5485"}}, {"species": {"taxonomy_id": 5485}}, + None, + True, + id="Meta matches, no meta filter, prevent meta update"), + pytest.param({"genebuild": {"new_key": "_"}}, {"genebuild": {}}, + None, + False, + id="Filters on '_' value"), + pytest.param({"BRC5": "new_value"}, {}, None, False, id="BRC5 new value"), + pytest.param( + {"meta": "key", "species": {"alias": "woof"}}, + {"species": {"alias": "woof"}}, + None, + False, + id="Test alias"), + pytest.param( + {"added_seq": {"region_name": [1, 2]}}, + {"added_seq": {"region_name": ["1", "2"]}}, + None, + False, + id="Added seq 
region_name"), + pytest.param({}, {}, None, False, id="BRC5 new value"), + pytest.param( + {"species": + {"display_name": "Honeybee","annotation_source": "Ensembl", + "production_name": "apis_melifera_gca123v1", + "scientific_name": "apis_melifera", + "taxonomy_id": "70921" + } + }, + {"species": { + "display_name": "Honeybee", + "production_name": "apis_melifera_gca123v1", + "taxonomy_id": "70921"} + }, + "species_filter.json", + False, + id="Filter via input meta JSON"), + pytest.param( + { "annotation": { "provider_name": "ENA",}, + "assembly": { "accession": "GCA_000111222.3", "version": "1"}, + "genebuild": { "method": "import", "version": "1"} + }, + {"assembly": { "version": "1"}, + "genebuild": { "method": "import", "version": "1"}}, + "version_filter.json", False, id="Asm + Genebuild version filter"), + pytest.param( + { "annotation": { "provider_name": "ENA",}, + "assembly": { "accession": "GCA_000111222.3", "version": "1"}, + "genebuild": { "method": "import", "version": "1"} + }, + {"genebuild": { "method": "import"}}, + "filter_noupdate.json", + True, + id="Only geneBuild method, restrict update"), ], ) -def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, Any]) -> None: +def test_filter_genome_meta(data_dir: Path, genome_metadata: Dict[str, Any], output: Dict[str, Any], + meta_filter: StrPath, restrict_filter: bool) -> None: """Tests the `dump.filter_genome_meta()` method. Args: genome_metadata: Nested genome metadata key values. output: Expected change in the genome metadata dictionary. 
+ meta_filter: + restrict_filter: """ - result = dump.filter_genome_meta(genome_metadata) + if meta_filter is not None: + meta_filter_file = data_dir / meta_filter + result = dump.filter_genome_meta(genome_metadata, meta_filter_file, restrict_filter) + else: + result = dump.filter_genome_meta(genome_metadata, meta_filter, restrict_filter) assert not DeepDiff(result, output) + # assert not DeepDiff(expected_meta, meta_filter) @patch("sqlalchemy.engine.Result") @patch("sqlalchemy.orm.Session") @pytest.mark.parametrize( - "meta_data, output, expectation", + "db_name, meta_data, output, expectation", [ - pytest.param([], {}, does_not_raise(), id="Empty meta table"), + pytest.param(None, [], {}, does_not_raise(), id="Empty meta table"), + pytest.param("test_dbname_core_110_1", [], {"database": {"name": "test_dbname_core_110_1"}}, + does_not_raise(), id="db_name append, Empty meta table"), pytest.param( + None, [ [MetaRow("sample", "gene1")], [MetaRow("species.name", "dog")], [MetaRow("species.synonym", "puppy")], ], - {"sample": "gene1", "species": {"name": "dog", "synonym": "puppy"}}, + { + "sample": "gene1", + "species": { + "name": "dog", + "synonym": "puppy"} + }, does_not_raise(), id="Meta table with simple values", ), pytest.param( + None, [ [MetaRow("sample", "gene1")], [MetaRow("sample", "gene2")], @@ -164,16 +236,40 @@ def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, A id="Meta table with lists", ), pytest.param( - [[MetaRow("species", "dog")], [MetaRow("species.synonym", "puppy")]], + None, + [ + [MetaRow("species", "dog")], + [MetaRow("species.synonym", "puppy")] + ], {}, pytest.raises(ValueError), id="'species' and 'species.synonym' meta keys", ), + pytest.param( + "test_dbname_core_110_1", + [ + [MetaRow("assembly.accession", "GCA_000111222.3")], + [MetaRow("species.annotation_source", "Community")], + [MetaRow("species.production_name", "genus_species_gca000111222v3cm")]], + { + "assembly":{"accession":"GCA_000111222.3"}, + 
"database":{ + "name":"test_dbname_core_110_1" + }, + "species":{ + "annotation_source":"Community", + "production_name":"genus_species_gca000111222v3cm" + } + }, + does_not_raise(), + id="dbname append to meta", + ), ], ) def test_get_genome_metadata( mock_session: Mock, mock_result: Mock, + db_name: str | None, meta_data: List[MetaRow], output: Dict[str, Any], expectation: ContextManager, @@ -182,13 +278,110 @@ def test_get_genome_metadata( Args: mock_session: A mock of `sqlalchemy.orm.Session()` class. + db_name: meta_data: `meta` table content in a list of named tuples. output: Expected genome metadata dictionary. expectation: Context manager for the expected exception (if any). """ + # pylint: disable=too-many-positional-arguments mock_result.unique.return_value = mock_result mock_result.all.return_value = meta_data mock_session.execute.return_value = mock_result with expectation: - result = dump.get_genome_metadata(mock_session) + result = dump.get_genome_metadata(mock_session, db_name) assert not DeepDiff(result, output) + +@pytest.mark.parametrize( + "arg_list, expected", + [ + param( + ["--host", "localhost", "--port", "42", "--user", "me", "--database", "test_db"], + { + "host": "localhost", + "port": 42, + "user": "me", + "password": None, + "url": make_url("mysql://me@localhost:42/test_db"), + "database": "test_db", + "metafilter": None, + "meta_update": False, + "append_db": False, + "log_file": None, + "log_level": "WARNING", + "log_file_level": "DEBUG", + }, + id="Default args", + ), + param([ + "--host", "localhost", "--port", "42", "--user", "me", "--database", "test_db", + "--metafilter", f"{__file__}", "--append_db"], + { + "host": "localhost", + "port": 42, + "user": "me", + "password": None, + "url": make_url("mysql://me@localhost:42/test_db"), + "database": "test_db", + "metafilter": __file__, + "meta_update": False, + "append_db": True, + "log_file": None, + "log_level": "WARNING", + "log_file_level": "DEBUG", + }, + id="Filter, non-default 
args" + ), + ], +) +def test_parse_args(arg_list: list[str], expected: dict) -> None: + """Tests the `dump.parse_args()` function.""" + # pylint: disable=too-many-positional-arguments + args = dump.parse_args(arg_list) + if args.metafilter: + # DeepDiff is not able to compare two objects of Path type, so convert it to string + setattr(args, "metafilter", str(args.metafilter)) + assert not DeepDiff(vars(args), expected) + + +@pytest.mark.parametrize( + "arg_list, db_url, metafilter, meta_update, append_db, stdout", + [ + param( + [ + "--host", "localhost", "--port", "42", "--user", "me", + "--database", "test_dbname_core_110_1", "--append_db" + ], + make_url("mysql://me@localhost:42/test_dbname_core_110_1"), + None, + False, + True, + '{\n "database": {\n "name": "test_dbname_core_110_1"\n }\n}\n', + id="Call main and append_db", + ), + ], +) +@patch("ensembl.io.genomio.genome_metadata.dump.metadata_dump_setup") +def test_main( + mock_metadata_dump_setup: Mock, + capsys: CaptureFixture[str], + arg_list: list[str], + db_url: URL, + metafilter: StrPath, + meta_update: bool, + append_db: bool, + stdout: str, +) -> None: + """Tests the `dump.main()` function (entry point). 
+ + Fixtures: capsys + """ + # pylint: disable=too-many-positional-arguments + mock_metadata_dump_setup.return_value = {"database": {"name": "test_dbname_core_110_1"}} + dump.main(arg_list) + # Check that we have called the mocked function once with the expected parameters + mock_metadata_dump_setup.assert_called_once_with( + db_url=db_url, metafilter=metafilter, meta_update=meta_update, append_db=append_db + ) + # Check that the stdout is as expected + captured = capsys.readouterr() + assert captured.out == stdout diff --git a/src/python/tests/genome_metadata/test_dump/filter_noupdate.json b/src/python/tests/genome_metadata/test_dump/filter_noupdate.json new file mode 100644 index 000000000..59a5f8f06 --- /dev/null +++ b/src/python/tests/genome_metadata/test_dump/filter_noupdate.json @@ -0,0 +1,5 @@ +{ + "genebuild": { + "method": "str" + } +} diff --git a/src/python/tests/genome_metadata/test_dump/species_filter.json b/src/python/tests/genome_metadata/test_dump/species_filter.json new file mode 100644 index 000000000..31ca5ddd5 --- /dev/null +++ b/src/python/tests/genome_metadata/test_dump/species_filter.json @@ -0,0 +1,7 @@ +{ + "species": { + "display_name": "str", + "production_name": "str", + "taxonomy_id": "int" + } +} diff --git a/src/python/tests/genome_metadata/test_dump/version_filter.json b/src/python/tests/genome_metadata/test_dump/version_filter.json new file mode 100644 index 000000000..df77276e3 --- /dev/null +++ b/src/python/tests/genome_metadata/test_dump/version_filter.json @@ -0,0 +1,9 @@ +{ + "assembly": { + "version": "str" + }, + "genebuild": { + "version": "str", + "method": "str" + } +} From 3ab16b9855a591af343f1adb10ea66c59f6059eb Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Mon, 11 Nov 2024 18:50:57 +0000 Subject: [PATCH 07/30] Update vars in test --- src/python/tests/genome_metadata/test_dump.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/python/tests/genome_metadata/test_dump.py 
b/src/python/tests/genome_metadata/test_dump.py index d298bad99..e719ec38d 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -119,7 +119,7 @@ def test_check_genebuild_version( @patch("ensembl.io.genomio.genome_metadata.dump.check_genebuild_version", Mock()) @patch("ensembl.io.genomio.genome_metadata.dump.check_assembly_version", Mock()) @pytest.mark.parametrize( - "genome_metadata, output, meta_filter, restrict_filter", + "genome_metadata, output, meta_filter, meta_update", [ pytest.param({"species": {"taxonomy_id": "5485"}}, {"species": {"taxonomy_id": 5485}}, None, False, id="Meta matches, no filter, allow meta update"), @@ -181,7 +181,7 @@ def test_check_genebuild_version( ], ) def test_filter_genome_meta(data_dir: Path, genome_metadata: Dict[str, Any], output: Dict[str, Any], - meta_filter: StrPath, restrict_filter: bool) -> None: + meta_filter: StrPath, meta_update: bool) -> None: """Tests the `dump.filter_genome_meta()` method. 
Args: @@ -192,9 +192,9 @@ def test_filter_genome_meta(data_dir: Path, genome_metadata: Dict[str, Any], out """ if meta_filter is not None: meta_filter_file = data_dir / meta_filter - result = dump.filter_genome_meta(genome_metadata, meta_filter_file, restrict_filter) + result = dump.filter_genome_meta(genome_metadata, meta_filter_file, meta_update) else: - result = dump.filter_genome_meta(genome_metadata, meta_filter, restrict_filter) + result = dump.filter_genome_meta(genome_metadata, meta_filter, meta_update) assert not DeepDiff(result, output) # assert not DeepDiff(expected_meta, meta_filter) From 41620689141355525ab3f6f06c448925306343ee Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Mon, 11 Nov 2024 18:55:03 +0000 Subject: [PATCH 08/30] Update --meta_update description --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 0d7e1c421..bd1a69fd0 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -252,7 +252,7 @@ def parse_args(arg_list: list[str] | None) -> argparse.Namespace: parser.add_argument( "--meta_update", action="store_true", - help="Deactivate additional assembly and genebuild metadata update.", + help="Perform additional assembly and genebuild 'version' metadata updates.", ) parser.add_argument("--append_db", action="store_true", help="Append core database name to output JSON.") parser.add_log_arguments(add_log_file=True) From 10ca796f7f9f956698ff4ed6a868bad0617802bf Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Mon, 11 Nov 2024 18:56:37 +0000 Subject: [PATCH 09/30] Make pylint happy --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py 
b/src/python/ensembl/io/genomio/genome_metadata/dump.py index bd1a69fd0..de762827b 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -216,7 +216,8 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: genome_metadata["genebuild"].pop("id", None) -def metadata_dump_setup(db_url: URL, metafilter: StrPath | None, meta_update: bool, append_db: bool) -> Dict[str, Any]: +def metadata_dump_setup(db_url: URL, metafilter: StrPath | None, + meta_update: bool, append_db: bool) -> Dict[str, Any]: """Setup main stages of genome meta dump from user input arguments provided. Args: db_url: Target core database URL. From 3c9ac321f4c65b8f89359a2e4078a4b2f1c01ed0 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Mon, 11 Nov 2024 18:57:15 +0000 Subject: [PATCH 10/30] Make black happy --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index de762827b..6d02b56b9 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -216,8 +216,9 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: genome_metadata["genebuild"].pop("id", None) -def metadata_dump_setup(db_url: URL, metafilter: StrPath | None, - meta_update: bool, append_db: bool) -> Dict[str, Any]: +def metadata_dump_setup( + db_url: URL, metafilter: StrPath | None, meta_update: bool, append_db: bool +) -> Dict[str, Any]: """Setup main stages of genome meta dump from user input arguments provided. Args: db_url: Target core database URL. 
From 3fbf7addbf957bd5bba7b08062ea47349feaf04c Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Tue, 12 Nov 2024 09:35:37 +0000 Subject: [PATCH 11/30] black --- src/python/tests/genome_metadata/test_dump.py | 163 +++++++++++------- 1 file changed, 103 insertions(+), 60 deletions(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index e719ec38d..1abe60dd4 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -121,67 +121,95 @@ def test_check_genebuild_version( @pytest.mark.parametrize( "genome_metadata, output, meta_filter, meta_update", [ - pytest.param({"species": {"taxonomy_id": "5485"}}, {"species": {"taxonomy_id": 5485}}, - None, False, id="Meta matches, no filter, allow meta update"), - pytest.param({"species": {"taxonomy_id": "5485"}}, {"species": {"taxonomy_id": 5485}}, - None, - True, - id="Meta matches, no meta filter, prevent meta update"), - pytest.param({"genebuild": {"new_key": "_"}}, {"genebuild": {}}, + pytest.param( + {"species": {"taxonomy_id": "5485"}}, + {"species": {"taxonomy_id": 5485}}, None, False, - id="Filters on '_' value"), + id="Meta matches, no filter, allow meta update", + ), + pytest.param( + {"species": {"taxonomy_id": "5485"}}, + {"species": {"taxonomy_id": 5485}}, + None, + True, + id="Meta matches, no meta filter, prevent meta update", + ), + pytest.param( + {"genebuild": {"new_key": "_"}}, {"genebuild": {}}, None, False, id="Filters on '_' value" + ), pytest.param({"BRC5": "new_value"}, {}, None, False, id="BRC5 new value"), pytest.param( {"meta": "key", "species": {"alias": "woof"}}, {"species": {"alias": "woof"}}, None, False, - id="Test alias"), + id="Test alias", + ), pytest.param( {"added_seq": {"region_name": [1, 2]}}, {"added_seq": {"region_name": ["1", "2"]}}, None, False, - id="Added seq region_name"), + id="Added seq region_name", + ), pytest.param({}, {}, None, False, id="BRC5 new value"), 
pytest.param( - {"species": - {"display_name": "Honeybee","annotation_source": "Ensembl", + { + "species": { + "display_name": "Honeybee", + "annotation_source": "Ensembl", "production_name": "apis_melifera_gca123v1", "scientific_name": "apis_melifera", - "taxonomy_id": "70921" + "taxonomy_id": "70921", } }, - {"species": { - "display_name": "Honeybee", - "production_name": "apis_melifera_gca123v1", - "taxonomy_id": "70921"} + { + "species": { + "display_name": "Honeybee", + "production_name": "apis_melifera_gca123v1", + "taxonomy_id": "70921", + } }, - "species_filter.json", + "species_filter.json", False, - id="Filter via input meta JSON"), + id="Filter via input meta JSON", + ), pytest.param( - { "annotation": { "provider_name": "ENA",}, - "assembly": { "accession": "GCA_000111222.3", "version": "1"}, - "genebuild": { "method": "import", "version": "1"} + { + "annotation": { + "provider_name": "ENA", + }, + "assembly": {"accession": "GCA_000111222.3", "version": "1"}, + "genebuild": {"method": "import", "version": "1"}, }, - {"assembly": { "version": "1"}, - "genebuild": { "method": "import", "version": "1"}}, - "version_filter.json", False, id="Asm + Genebuild version filter"), + {"assembly": {"version": "1"}, "genebuild": {"method": "import", "version": "1"}}, + "version_filter.json", + False, + id="Asm + Genebuild version filter", + ), pytest.param( - { "annotation": { "provider_name": "ENA",}, - "assembly": { "accession": "GCA_000111222.3", "version": "1"}, - "genebuild": { "method": "import", "version": "1"} + { + "annotation": { + "provider_name": "ENA", + }, + "assembly": {"accession": "GCA_000111222.3", "version": "1"}, + "genebuild": {"method": "import", "version": "1"}, }, - {"genebuild": { "method": "import"}}, - "filter_noupdate.json", + {"genebuild": {"method": "import"}}, + "filter_noupdate.json", True, - id="Only geneBuild method, restrict update"), + id="Only geneBuild method, restrict update", + ), ], ) -def test_filter_genome_meta(data_dir: 
Path, genome_metadata: Dict[str, Any], output: Dict[str, Any], - meta_filter: StrPath, meta_update: bool) -> None: +def test_filter_genome_meta( + data_dir: Path, + genome_metadata: Dict[str, Any], + output: Dict[str, Any], + meta_filter: StrPath, + meta_update: bool, +) -> None: """Tests the `dump.filter_genome_meta()` method. Args: @@ -205,8 +233,13 @@ def test_filter_genome_meta(data_dir: Path, genome_metadata: Dict[str, Any], out "db_name, meta_data, output, expectation", [ pytest.param(None, [], {}, does_not_raise(), id="Empty meta table"), - pytest.param("test_dbname_core_110_1", [], {"database": {"name": "test_dbname_core_110_1"}}, - does_not_raise(), id="db_name append, Empty meta table"), + pytest.param( + "test_dbname_core_110_1", + [], + {"database": {"name": "test_dbname_core_110_1"}}, + does_not_raise(), + id="db_name append, Empty meta table", + ), pytest.param( None, [ @@ -214,12 +247,7 @@ def test_filter_genome_meta(data_dir: Path, genome_metadata: Dict[str, Any], out [MetaRow("species.name", "dog")], [MetaRow("species.synonym", "puppy")], ], - { - "sample": "gene1", - "species": { - "name": "dog", - "synonym": "puppy"} - }, + {"sample": "gene1", "species": {"name": "dog", "synonym": "puppy"}}, does_not_raise(), id="Meta table with simple values", ), @@ -237,10 +265,7 @@ def test_filter_genome_meta(data_dir: Path, genome_metadata: Dict[str, Any], out ), pytest.param( None, - [ - [MetaRow("species", "dog")], - [MetaRow("species.synonym", "puppy")] - ], + [[MetaRow("species", "dog")], [MetaRow("species.synonym", "puppy")]], {}, pytest.raises(ValueError), id="'species' and 'species.synonym' meta keys", @@ -250,16 +275,15 @@ def test_filter_genome_meta(data_dir: Path, genome_metadata: Dict[str, Any], out [ [MetaRow("assembly.accession", "GCA_000111222.3")], [MetaRow("species.annotation_source", "Community")], - [MetaRow("species.production_name", "genus_species_gca000111222v3cm")]], + [MetaRow("species.production_name", 
"genus_species_gca000111222v3cm")], + ], { - "assembly":{"accession":"GCA_000111222.3"}, - "database":{ - "name":"test_dbname_core_110_1" + "assembly": {"accession": "GCA_000111222.3"}, + "database": {"name": "test_dbname_core_110_1"}, + "species": { + "annotation_source": "Community", + "production_name": "genus_species_gca000111222v3cm", }, - "species":{ - "annotation_source":"Community", - "production_name":"genus_species_gca000111222v3cm" - } }, does_not_raise(), id="dbname append to meta", @@ -291,6 +315,7 @@ def test_get_genome_metadata( result = dump.get_genome_metadata(mock_session, db_name) assert not DeepDiff(result, output) + @pytest.mark.parametrize( "arg_list, expected", [ @@ -312,9 +337,20 @@ def test_get_genome_metadata( }, id="Default args", ), - param([ - "--host", "localhost", "--port", "42", "--user", "me", "--database", "test_db", - "--metafilter", f"{__file__}", "--append_db"], + param( + [ + "--host", + "localhost", + "--port", + "42", + "--user", + "me", + "--database", + "test_db", + "--metafilter", + f"{__file__}", + "--append_db", + ], { "host": "localhost", "port": 42, @@ -329,7 +365,7 @@ def test_get_genome_metadata( "log_level": "WARNING", "log_file_level": "DEBUG", }, - id="Filter, non-default args" + id="Filter, non-default args", ), ], ) @@ -338,7 +374,7 @@ def test_parse_args(arg_list: list[str], expected: dict) -> None: # pylint: disable=too-many-positional-arguments args = dump.parse_args(arg_list) if args.metafilter: - # DeepDiff is not able to compare two objects of Path type, so convert it to string + # DeepDiff is not able to compare two objects of Path type, so convert it to string setattr(args, "metafilter", str(args.metafilter)) assert not DeepDiff(vars(args), expected) @@ -348,8 +384,15 @@ def test_parse_args(arg_list: list[str], expected: dict) -> None: [ param( [ - "--host", "localhost", "--port", "42", "--user", "me", - "--database", "test_dbname_core_110_1", "--append_db" + "--host", + "localhost", + "--port", + "42", 
+ "--user", + "me", + "--database", + "test_dbname_core_110_1", + "--append_db", ], make_url("mysql://me@localhost:42/test_dbname_core_110_1"), None, From 01de7fec4a59134de4733e24a37b5ffb7ee69801 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Tue, 12 Nov 2024 10:14:36 +0000 Subject: [PATCH 12/30] Update factory main() docstring --- src/python/ensembl/io/genomio/database/factory.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/database/factory.py b/src/python/ensembl/io/genomio/database/factory.py index 450992431..e62e904cf 100644 --- a/src/python/ensembl/io/genomio/database/factory.py +++ b/src/python/ensembl/io/genomio/database/factory.py @@ -153,8 +153,8 @@ def main(arg_list: list[str] | None = None) -> None: """Main script entry-point. Args: - arg_list: TODO - + arg_list: Arguments to parse passing list to parse_args(). + """ args = parse_args(arg_list) init_logging_with_args(args) From 660a42836a1053911d5ee5164eed63d899b4b6d0 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Tue, 12 Nov 2024 11:51:26 +0000 Subject: [PATCH 13/30] black on database/factory --- src/python/ensembl/io/genomio/database/factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/database/factory.py b/src/python/ensembl/io/genomio/database/factory.py index e62e904cf..58a57b0df 100644 --- a/src/python/ensembl/io/genomio/database/factory.py +++ b/src/python/ensembl/io/genomio/database/factory.py @@ -154,7 +154,7 @@ def main(arg_list: list[str] | None = None) -> None: Args: arg_list: Arguments to parse passing list to parse_args(). 
- + """ args = parse_args(arg_list) init_logging_with_args(args) From 4bd220688f1817684719535d6596d61cbe760bfb Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Wed, 13 Nov 2024 10:17:15 +0000 Subject: [PATCH 14/30] Update argparse help info --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 6d02b56b9..ee3c6c492 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -249,12 +249,12 @@ def parse_args(arg_list: list[str] | None) -> argparse.Namespace: parser = ArgumentParser(description=__doc__) parser.add_server_arguments(include_database=True, help="server url and core database") parser.add_argument_src_path( - "--metafilter", default=None, help="Input File | List with >=2 meta_keys to query target database." + "--metafilter", default=None, help="JSON file of nested meta_key:meta_value to filter dump output." 
) parser.add_argument( "--meta_update", action="store_true", - help="Perform additional assembly and genebuild 'version' metadata updates.", + help="Perform assembly and genebuild 'version' metadata checks & update if needed.", ) parser.add_argument("--append_db", action="store_true", help="Append core database name to output JSON.") parser.add_log_arguments(add_log_file=True) From cc46c0e66b8585446e7221bf0036a2a3e6d153f0 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Thu, 14 Nov 2024 12:30:25 +0000 Subject: [PATCH 15/30] Update genome schema to account for optional database name meta info --- .../ensembl/io/genomio/data/schemas/genome.json | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/data/schemas/genome.json b/src/python/ensembl/io/genomio/data/schemas/genome.json index 54985bb7a..28f56583d 100644 --- a/src/python/ensembl/io/genomio/data/schemas/genome.json +++ b/src/python/ensembl/io/genomio/data/schemas/genome.json @@ -110,6 +110,14 @@ { "type" : "array", "items" : { "type" : "string" } } ] } + }, + "database_info" : { + "type": "object", + "additionalProperties": false, + "description" : "Optional name of target database where meta data was retrieved.", + "properties" : { + "name": { "type" : "string" } + } } }, @@ -123,7 +131,8 @@ "genebuild" : { "$ref" : "#/definitions/genebuild_info" }, "provider" : { "$ref" : "#/definitions/provider_info" }, "BRC4" : { "$ref" : "#/definitions/BRC4_info" }, - "added_seq" : { "$ref" : "#/definitions/added_sequence_info" } + "added_seq" : { "$ref" : "#/definitions/added_sequence_info" }, + "database" : { "$ref" : "#/definitions/database_info" } }, "required" : [ "species", From fe1baaec844d3363a05ae7f26634ae4671a51cc3 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Thu, 14 Nov 2024 16:02:53 +0000 Subject: [PATCH 16/30] Add meta filter type evaluation --- .../io/genomio/genome_metadata/dump.py | 47 +++++++++++++++---- 1 file changed, 39 insertions(+), 8 
deletions(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index ee3c6c492..d0d1428fb 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -108,7 +108,7 @@ def get_genome_metadata(session: Session, db_name: str | None) -> Dict[str, Any] def filter_genome_meta( - genome_metadata: Dict[str, Any], metafilter: StrPath | None, meta_update: bool + genome_metadata: Dict[str, Any], metafilter: dict | None, meta_update: bool ) -> Dict[str, Any]: """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER. @@ -123,7 +123,7 @@ def filter_genome_meta( filtered_metadata: Dict[str, Any] = {} if metafilter: - DYNAMIC_METADATA_FILTER: Dict[str, Dict[str, type]] = get_json(metafilter) + DYNAMIC_METADATA_FILTER: Dict[str, Dict[str, type]] = metafilter else: DYNAMIC_METADATA_FILTER = DEFAULT_FILTER @@ -133,6 +133,8 @@ def filter_genome_meta( for subkey, value_type in subfilter.items(): if isinstance(value_type, str): value_type = type(value_type) + if isinstance(value_type, int): + value_type = type(value_type) if subkey in genome_metadata[key]: value = genome_metadata[key][subkey] if isinstance(value, list): @@ -158,8 +160,17 @@ def check_assembly_refseq(gmeta_out: Dict[str, Any]) -> None: genome_metadata: Nested metadata key values from the core metadata table. 
""" assembly = gmeta_out.get("assembly", {}) - if assembly.get("provider_name", "") == "RefSeq": - assembly["accession"] = assembly["accession"].replace("GCA", "GCF") + if bool(assembly.get("provider_name")): + if assembly.get("provider_name", "") == "RefSeq": + assembly["accession"] = assembly["accession"].replace("GCA", "GCF") + logging.info("GCA accession updated to RefSeq GFC accession.") + else: + logging.info(f"Meta check 'assembly is RefSeq': Asm provider = {assembly.get('provider_name')}") + else: + logging.debug( + "Meta filter update to RefSeq accession not done: user meta filter \ + missing: 'assembly.provider_name'" + ) def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: @@ -216,25 +227,45 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: genome_metadata["genebuild"].pop("id", None) +def convert_dict(meta_dict: dict) -> dict: + """Converts text json to add type properties from string + + Args: + meta_dict: User meta dictionary with literal string typing to be converted + """ + new_dict = meta_dict.copy() + for key, value in meta_dict.items(): + if isinstance(value, dict): + new_dict[key] = convert_dict(value) + else: + new_dict[key] = eval(value) + return new_dict + + def metadata_dump_setup( - db_url: URL, metafilter: StrPath | None, meta_update: bool, append_db: bool + db_url: URL, input_filter: StrPath | None, meta_update: bool, append_db: bool ) -> Dict[str, Any]: """Setup main stages of genome meta dump from user input arguments provided. Args: db_url: Target core database URL. - metafilter: Input JSON containing subset of meta table values to filter on. + input_filter: Input JSON containing subset of meta table values to filter on. no_update: Deactivate additional meta updating. append_db: Append target core database name to output JSON. 
""" dbc = DBConnectionLite(db_url) db_name = None + meta_filter = {} if append_db: db_name = db_url.database + if input_filter: + unconverted_json = get_json(input_filter) + meta_filter = convert_dict(unconverted_json) + with dbc.session_scope() as session: genome_meta = get_genome_metadata(session, db_name) - genome_meta = filter_genome_meta(genome_meta, metafilter, meta_update) + genome_meta = filter_genome_meta(genome_meta, meta_filter, meta_update) return genome_meta @@ -271,7 +302,7 @@ def main(arg_list: list[str] | None = None) -> None: init_logging_with_args(args) genome_meta = metadata_dump_setup( - db_url=args.url, metafilter=args.metafilter, meta_update=args.meta_update, append_db=args.append_db + db_url=args.url, input_filter=args.metafilter, meta_update=args.meta_update, append_db=args.append_db ) print(json.dumps(genome_meta, indent=2, sort_keys=True)) From 2a4036cd140a0cad786f8d92302f4dc635eb209d Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:05:42 +0000 Subject: [PATCH 17/30] Use new typing Co-authored-by: J. 
Alvarez-Jarreta --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index d0d1428fb..084002acc 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -39,7 +39,7 @@ from ensembl.utils.logging import init_logging_with_args -DEFAULT_FILTER: Dict[str, Dict[str, Type]] = { +DEFAULT_FILTER: dict[str, dict[str, Type]] = { "database": {"name": str}, "added_seq": {"region_name": str}, "annotation": {"provider_name": str, "provider_url": str}, From afe9da8e31dafecac1c64d367ef44acc009d584a Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:06:04 +0000 Subject: [PATCH 18/30] Add missing docstring arg Co-authored-by: Disha Lodha <87130059+Dishalodha@users.noreply.github.com> --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 084002acc..e72bdf139 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -71,7 +71,7 @@ def get_genome_metadata(session: Session, db_name: str | None) -> Dict[str, Any] Args: session: Session for the current core. - + db_name: Target database name """ genome_metadata: Dict[str, Any] = {} From a5bb7f5db53a8db3e37ba6d8fd99621f07286dd6 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:07:02 +0000 Subject: [PATCH 19/30] Use new typing Co-authored-by: J. 
Alvarez-Jarreta --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index e72bdf139..731c9a295 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -66,7 +66,7 @@ } -def get_genome_metadata(session: Session, db_name: str | None) -> Dict[str, Any]: +def get_genome_metadata(session: Session, db_name: str | None) -> dict[str, Any]: """Returns the meta table content from the core database in a nested dictionary. Args: From af2c86bea394810a06db5f6776ddaebb0009f008 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Fri, 15 Nov 2024 12:30:41 +0000 Subject: [PATCH 20/30] Reorder imports Co-authored-by: J. Alvarez-Jarreta --- src/python/tests/genome_metadata/test_dump.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 1abe60dd4..5907f2b7a 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -18,9 +18,10 @@ $ pytest test_dump.py """ + from pathlib import Path -from unittest.mock import Mock, patch from typing import Any, ContextManager, Dict, List +from unittest.mock import Mock, patch from collections import namedtuple from contextlib import nullcontext as does_not_raise from deepdiff import DeepDiff From 17f2cc2a4bafaa704e99f26cbbb1ecb621e323bf Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Fri, 15 Nov 2024 12:31:04 +0000 Subject: [PATCH 21/30] Add missing docstring arg Co-authored-by: Disha Lodha <87130059+Dishalodha@users.noreply.github.com> --- src/python/tests/genome_metadata/test_dump.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 5907f2b7a..38bfca723 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -303,7 +303,7 @@ def test_get_genome_metadata( Args: mock_session: A mock of `sqlalchemy.orm.Session()` class. - db_name: + db_name: Target core database name. meta_data: `meta` table content in a list of named tuples. output: Expected genome metadata dictionary. expectation: Context manager for the expected exception (if any). From f83fef8c3391cf7102a1eff0670beb35113605b3 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Fri, 15 Nov 2024 15:33:20 +0000 Subject: [PATCH 22/30] PR review update changes, new function, update test --- .../io/genomio/genome_metadata/dump.py | 31 ++++---- src/python/tests/genome_metadata/test_dump.py | 79 +++++++++++-------- .../test_dump/filter_noupdate.json | 5 -- .../test_dump/species_filter.json | 7 -- .../test_dump/version_filter.json | 9 --- 5 files changed, 64 insertions(+), 67 deletions(-) delete mode 100644 src/python/tests/genome_metadata/test_dump/filter_noupdate.json delete mode 100644 src/python/tests/genome_metadata/test_dump/species_filter.json delete mode 100644 src/python/tests/genome_metadata/test_dump/version_filter.json diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 731c9a295..10cea0d47 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -24,8 +24,9 @@ import argparse import json -from typing import Any, Dict, Type +from typing import Any, Type import logging +from pydoc import locate from sqlalchemy import select from sqlalchemy.orm import Session @@ -73,7 +74,7 @@ def get_genome_metadata(session: Session, db_name: str | None) -> dict[str, Any] session: Session for the current core. 
db_name: Target database name """ - genome_metadata: Dict[str, Any] = {} + genome_metadata: dict[str, Any] = {} meta_statement = select(Meta) for row in session.execute(meta_statement).unique().all(): @@ -108,8 +109,8 @@ def get_genome_metadata(session: Session, db_name: str | None) -> dict[str, Any] def filter_genome_meta( - genome_metadata: Dict[str, Any], metafilter: dict | None, meta_update: bool -) -> Dict[str, Any]: + genome_metadata: dict[str, Any], metafilter: dict | None, meta_update: bool +) -> dict[str, Any]: """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER. Also converts to expected data types (to follow the genome JSON schema). @@ -120,14 +121,14 @@ def filter_genome_meta( meta_update: Deactivates additional meta updating. """ - filtered_metadata: Dict[str, Any] = {} + filtered_metadata: dict[str, Any] = {} if metafilter: - DYNAMIC_METADATA_FILTER: Dict[str, Dict[str, type]] = metafilter + metadata_filter: dict[str, dict[str, type]] = metafilter else: - DYNAMIC_METADATA_FILTER = DEFAULT_FILTER + metadata_filter = DEFAULT_FILTER - for key, subfilter in DYNAMIC_METADATA_FILTER.items(): + for key, subfilter in metadata_filter.items(): if key in genome_metadata: filtered_metadata[key] = {} for subkey, value_type in subfilter.items(): @@ -153,7 +154,7 @@ def filter_genome_meta( return filtered_metadata -def check_assembly_refseq(gmeta_out: Dict[str, Any]) -> None: +def check_assembly_refseq(gmeta_out: dict[str, Any]) -> None: """Update the GCA accession to use GCF if it is from RefSeq. 
Args: @@ -168,12 +169,12 @@ def check_assembly_refseq(gmeta_out: Dict[str, Any]) -> None: logging.info(f"Meta check 'assembly is RefSeq': Asm provider = {assembly.get('provider_name')}") else: logging.debug( - "Meta filter update to RefSeq accession not done: user meta filter \ - missing: 'assembly.provider_name'" + "Meta filter update to RefSeq accession not done: user meta filter missing: \ + 'assembly.provider_name'" ) -def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: +def check_assembly_version(genome_metadata: dict[str, Any]) -> None: """Updates the assembly version of the genome metadata provided. If `version` meta key is not and integer or it is not available, the assembly accession's version @@ -203,7 +204,7 @@ def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: logging.info(f'Located version [v{assembly["version"]}] info from meta data.') -def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: +def check_genebuild_version(genome_metadata: dict[str, Any]) -> None: """Updates the genebuild version (if not present) from the genebuild ID, removing the latter. Args: @@ -238,13 +239,13 @@ def convert_dict(meta_dict: dict) -> dict: if isinstance(value, dict): new_dict[key] = convert_dict(value) else: - new_dict[key] = eval(value) + new_dict[key] = locate(value) return new_dict def metadata_dump_setup( db_url: URL, input_filter: StrPath | None, meta_update: bool, append_db: bool -) -> Dict[str, Any]: +) -> dict[str, Any]: """Setup main stages of genome meta dump from user input arguments provided. Args: db_url: Target core database URL. 
diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 38bfca723..89d1d40dc 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -18,9 +18,10 @@ $ pytest test_dump.py """ +# pylint: disable=too-many-positional-arguments from pathlib import Path -from typing import Any, ContextManager, Dict, List +from typing import Any, ContextManager from unittest.mock import Mock, patch from collections import namedtuple from contextlib import nullcontext as does_not_raise @@ -31,15 +32,15 @@ from sqlalchemy.engine import make_url, URL -from ensembl.utils import StrPath from ensembl.io.genomio.genome_metadata import dump +from ensembl.utils import StrPath MetaRow = namedtuple("MetaRow", "meta_key meta_value") @pytest.mark.parametrize( - "genome_metadata, output, expectation", + ("genome_metadata", "output", "expectation"), [ pytest.param({"assembly": {"version": "1"}}, 1, does_not_raise(), id="Version is '1'"), pytest.param( @@ -63,7 +64,7 @@ ], ) def test_check_assembly_version( - genome_metadata: Dict[str, Any], output: int, expectation: ContextManager + genome_metadata: dict[str, Any], output: int, expectation: ContextManager ) -> None: """Tests the `dump.check_assembly_version()` method. @@ -78,7 +79,7 @@ def test_check_assembly_version( @pytest.mark.parametrize( - "genome_metadata, output, expectation", + ("genome_metadata", "output", "expectation"), [ pytest.param({}, {}, does_not_raise(), id="No 'genebuild' entry"), pytest.param( @@ -103,7 +104,7 @@ def test_check_assembly_version( ], ) def test_check_genebuild_version( - genome_metadata: Dict[str, Any], output: Dict[str, Any], expectation: ContextManager + genome_metadata: dict[str, Any], output: dict[str, Any], expectation: ContextManager ) -> None: """Tests the `dump.check_genebuild_version()` method. 
@@ -120,7 +121,7 @@ def test_check_genebuild_version( @patch("ensembl.io.genomio.genome_metadata.dump.check_genebuild_version", Mock()) @patch("ensembl.io.genomio.genome_metadata.dump.check_assembly_version", Mock()) @pytest.mark.parametrize( - "genome_metadata, output, meta_filter, meta_update", + ("genome_metadata", "output", "metafilter", "meta_update"), [ pytest.param( {"species": {"taxonomy_id": "5485"}}, @@ -134,7 +135,7 @@ def test_check_genebuild_version( {"species": {"taxonomy_id": 5485}}, None, True, - id="Meta matches, no meta filter, prevent meta update", + id="Meta matches, no filter, perform meta update", ), pytest.param( {"genebuild": {"new_key": "_"}}, {"genebuild": {}}, None, False, id="Filters on '_' value" @@ -172,7 +173,7 @@ def test_check_genebuild_version( "taxonomy_id": "70921", } }, - "species_filter.json", + {"species": {"display_name": "str", "production_name": "str", "taxonomy_id": "int"}}, False, id="Filter via input meta JSON", ), @@ -185,7 +186,7 @@ def test_check_genebuild_version( "genebuild": {"method": "import", "version": "1"}, }, {"assembly": {"version": "1"}, "genebuild": {"method": "import", "version": "1"}}, - "version_filter.json", + {"assembly": {"version": "str"}, "genebuild": {"version": "str", "method": "str"}}, False, id="Asm + Genebuild version filter", ), @@ -198,17 +199,16 @@ def test_check_genebuild_version( "genebuild": {"method": "import", "version": "1"}, }, {"genebuild": {"method": "import"}}, - "filter_noupdate.json", + {"genebuild": {"method": "str"}}, True, - id="Only geneBuild method, restrict update", + id="Only genebuild method, perform meta update", ), ], ) def test_filter_genome_meta( - data_dir: Path, - genome_metadata: Dict[str, Any], - output: Dict[str, Any], - meta_filter: StrPath, + genome_metadata: dict[str, Any], + output: dict[str, Any], + metafilter: StrPath, meta_update: bool, ) -> None: """Tests the `dump.filter_genome_meta()` method. 
@@ -216,22 +216,40 @@ def test_filter_genome_meta( Args: genome_metadata: Nested genome metadata key values. output: Expected change in the genome metadata dictionary. - meta_filter: - restrict_filter: + metafilter: Type evaluated meta filter. + meta_update: Permit meta updating. """ - if meta_filter is not None: - meta_filter_file = data_dir / meta_filter - result = dump.filter_genome_meta(genome_metadata, meta_filter_file, meta_update) - else: - result = dump.filter_genome_meta(genome_metadata, meta_filter, meta_update) + result = dump.filter_genome_meta(genome_metadata, metafilter, meta_update) assert not DeepDiff(result, output) # assert not DeepDiff(expected_meta, meta_filter) +@pytest.mark.parametrize( + ("meta_dict", "expected_dict"), + [ + pytest.param( + {"key1": {"sub1": "str"}, "key2": {"sub2": "float"}, "key3": {"sub3": "int"}}, + "{'key1': {'sub1': <class 'str'>}, 'key2': {'sub2': <class 'float'>}, 'key3': {'sub3': <class 'int'>}}", + id="Filter conversion", + ), + ], +) +def test_convert_dict(meta_dict: dict, expected_dict: dict) -> None: + """Tests the `dump.convert_dict()` method. + + Args: + meta_dict: Dict containing string based meta 'subkey' value pairs. + expected_dict: Dict with converted 'subkey' class types.
+ """ + convert_dict = dump.convert_dict(meta_dict) + string_convert = str(convert_dict) + assert not DeepDiff(string_convert, expected_dict) + + @patch("sqlalchemy.engine.Result") @patch("sqlalchemy.orm.Session") @pytest.mark.parametrize( - "db_name, meta_data, output, expectation", + ("db_name", "meta_data", "output", "expectation"), [ pytest.param(None, [], {}, does_not_raise(), id="Empty meta table"), pytest.param( @@ -239,7 +257,7 @@ def test_filter_genome_meta( [], {"database": {"name": "test_dbname_core_110_1"}}, does_not_raise(), - id="db_name append, Empty meta table", + id="db_name append, empty meta table", ), pytest.param( None, @@ -287,7 +305,7 @@ def test_filter_genome_meta( }, }, does_not_raise(), - id="dbname append to meta", + id="db_name append to meta", ), ], ) @@ -295,8 +313,8 @@ def test_get_genome_metadata( mock_session: Mock, mock_result: Mock, db_name: str | None, - meta_data: List[MetaRow], - output: Dict[str, Any], + meta_data: list[MetaRow], + output: dict[str, Any], expectation: ContextManager, ) -> None: """Tests the `dump.get_genome_metadata()` method. @@ -308,7 +326,6 @@ def test_get_genome_metadata( output: Expected genome metadata dictionary. expectation: Context manager for the expected exception (if any). 
""" - # pylint: disable=too-many-positional-arguments mock_result.unique.return_value = mock_result mock_result.all.return_value = meta_data mock_session.execute.return_value = mock_result @@ -381,7 +398,7 @@ def test_parse_args(arg_list: list[str], expected: dict) -> None: @pytest.mark.parametrize( - "arg_list, db_url, metafilter, meta_update, append_db, stdout", + ("arg_list", "db_url", "metafilter", "meta_update", "append_db", "stdout"), [ param( [ @@ -424,7 +441,7 @@ def test_main( dump.main(arg_list) # Check that we have called the mocked function once with the expected parameters mock_metadata_dump_setup.assert_called_once_with( - db_url=db_url, metafilter=metafilter, meta_update=meta_update, append_db=append_db + db_url=db_url, input_filter=metafilter, meta_update=meta_update, append_db=append_db ) # Check that the stdout is as expected captured = capsys.readouterr() diff --git a/src/python/tests/genome_metadata/test_dump/filter_noupdate.json b/src/python/tests/genome_metadata/test_dump/filter_noupdate.json deleted file mode 100644 index 59a5f8f06..000000000 --- a/src/python/tests/genome_metadata/test_dump/filter_noupdate.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "genebuild": { - "method": "str" - } -} diff --git a/src/python/tests/genome_metadata/test_dump/species_filter.json b/src/python/tests/genome_metadata/test_dump/species_filter.json deleted file mode 100644 index 31ca5ddd5..000000000 --- a/src/python/tests/genome_metadata/test_dump/species_filter.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "species": { - "display_name": "str", - "production_name": "str", - "taxonomy_id": "int" - } -} diff --git a/src/python/tests/genome_metadata/test_dump/version_filter.json b/src/python/tests/genome_metadata/test_dump/version_filter.json deleted file mode 100644 index df77276e3..000000000 --- a/src/python/tests/genome_metadata/test_dump/version_filter.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "assembly": { - "version": "str" - }, - "genebuild": { - "version": "str", - 
"method": "str" - } -} From f9440b9eb165540569acfbed9759cdf46a53e6ce Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Fri, 15 Nov 2024 15:35:02 +0000 Subject: [PATCH 23/30] Remove comment line Co-authored-by: J. Alvarez-Jarreta --- src/python/tests/genome_metadata/test_dump.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 89d1d40dc..ab49ec993 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -221,7 +221,6 @@ def test_filter_genome_meta( """ result = dump.filter_genome_meta(genome_metadata, metafilter, meta_update) assert not DeepDiff(result, output) - # assert not DeepDiff(expected_meta, meta_filter) @pytest.mark.parametrize( From 113f1d2440c3c5f75ffc0143405752646877c31d Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Fri, 15 Nov 2024 16:08:33 +0000 Subject: [PATCH 24/30] cicd fix --- src/python/tests/genome_metadata/test_dump.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index ab49ec993..2c207b252 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -20,7 +20,6 @@ """ # pylint: disable=too-many-positional-arguments -from pathlib import Path from typing import Any, ContextManager from unittest.mock import Mock, patch from collections import namedtuple @@ -227,8 +226,8 @@ def test_filter_genome_meta( ("meta_dict", "expected_dict"), [ pytest.param( - {"key1": {"sub1": "str"}, "key2": {"sub2": "float"}, "key3": {"sub3": "int"}}, - "{'key1': {'sub1': <class 'str'>}, 'key2': {'sub2': <class 'float'>}, 'key3': {'sub3': <class 'int'>}}", + {"k1": {"sk1": "str"}, "k2": {"sk2": "float"}, "k3": {"sk3": "int"}}, + "{'k1': {'sk1': <class 'str'>}, 'k2': {'sk2': <class 'float'>}, 'k3': {'sk3': <class 'int'>}}", id="Filter conversion", ), ], From
f3b844d459d13d4f68f478e51d033403289cda70 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Fri, 15 Nov 2024 16:33:13 +0000 Subject: [PATCH 25/30] Update src/python/ensembl/io/genomio/genome_metadata/dump.py Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 10cea0d47..e0b9583a1 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -161,7 +161,7 @@ def check_assembly_refseq(gmeta_out: dict[str, Any]) -> None: genome_metadata: Nested metadata key values from the core metadata table. """ assembly = gmeta_out.get("assembly", {}) - if bool(assembly.get("provider_name")): + if assembly.get("provider_name"): if assembly.get("provider_name", "") == "RefSeq": assembly["accession"] = assembly["accession"].replace("GCA", "GCF") logging.info("GCA accession updated to RefSeq GFC accession.") From ffcec2ffa413d54f73f844b1f8037268793667b8 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Fri, 15 Nov 2024 16:33:23 +0000 Subject: [PATCH 26/30] Update src/python/ensembl/io/genomio/genome_metadata/dump.py Co-authored-by: J. 
Alvarez-Jarreta --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index e0b9583a1..4d2073fb3 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -166,7 +166,7 @@ def check_assembly_refseq(gmeta_out: dict[str, Any]) -> None: assembly["accession"] = assembly["accession"].replace("GCA", "GCF") logging.info("GCA accession updated to RefSeq GFC accession.") else: - logging.info(f"Meta check 'assembly is RefSeq': Asm provider = {assembly.get('provider_name')}") + logging.info(f"Meta check 'assembly is RefSeq': Asm provider = {assembly['provider_name']}") else: logging.debug( "Meta filter update to RefSeq accession not done: user meta filter missing: \ From ad328528e744946d29d5ff89469d9ca74d8643bd Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Fri, 15 Nov 2024 16:33:32 +0000 Subject: [PATCH 27/30] Update src/python/ensembl/io/genomio/genome_metadata/dump.py Co-authored-by: J. 
Alvarez-Jarreta --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 4d2073fb3..433e72c14 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -162,7 +162,7 @@ def check_assembly_refseq(gmeta_out: dict[str, Any]) -> None: """ assembly = gmeta_out.get("assembly", {}) if assembly.get("provider_name"): - if assembly.get("provider_name", "") == "RefSeq": + if assembly["provider_name"] == "RefSeq": assembly["accession"] = assembly["accession"].replace("GCA", "GCF") logging.info("GCA accession updated to RefSeq GFC accession.") else: From 66397399af171930ab180222bc9604e31cba8705 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Fri, 15 Nov 2024 16:33:40 +0000 Subject: [PATCH 28/30] Update src/python/ensembl/io/genomio/genome_metadata/dump.py Co-authored-by: J. 
Alvarez-Jarreta --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 433e72c14..de90effc6 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -229,7 +229,7 @@ def check_genebuild_version(genome_metadata: dict[str, Any]) -> None: def convert_dict(meta_dict: dict) -> dict: - """Converts text json to add type properties from string + """Converts text JSON to add type properties from string Args: meta_dict: User meta dictionary with literal string typing to be converted From ae578fbf44469d41678976196b53e9bf003b4490 Mon Sep 17 00:00:00 2001 From: Lahcen Campbell <32962169+ens-LCampbell@users.noreply.github.com> Date: Fri, 15 Nov 2024 16:33:54 +0000 Subject: [PATCH 29/30] Update src/python/ensembl/io/genomio/genome_metadata/dump.py Co-authored-by: J. Alvarez-Jarreta --- src/python/ensembl/io/genomio/genome_metadata/dump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index de90effc6..b219bc6df 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -232,7 +232,7 @@ def convert_dict(meta_dict: dict) -> dict: """Converts text JSON to add type properties from string Args: - meta_dict: User meta dictionary with literal string typing to be converted + meta_dict: User meta dictionary with literal string typing to be converted. 
""" new_dict = meta_dict.copy() for key, value in meta_dict.items(): From 07f756dc1d201074b6dd4b8a3f85f1335d066872 Mon Sep 17 00:00:00 2001 From: ens-LCampbell Date: Fri, 15 Nov 2024 16:46:14 +0000 Subject: [PATCH 30/30] Fix test import PEP8 --- src/python/tests/genome_metadata/test_dump.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 2c207b252..3c22e7d09 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -20,17 +20,17 @@ """ # pylint: disable=too-many-positional-arguments -from typing import Any, ContextManager -from unittest.mock import Mock, patch from collections import namedtuple from contextlib import nullcontext as does_not_raise +from typing import Any, ContextManager +from unittest.mock import Mock, patch + from deepdiff import DeepDiff import pytest from pytest import param from _pytest.capture import CaptureFixture from sqlalchemy.engine import make_url, URL - from ensembl.io.genomio.genome_metadata import dump from ensembl.utils import StrPath