diff --git a/src/python/ensembl/io/genomio/data/schemas/genome.json b/src/python/ensembl/io/genomio/data/schemas/genome.json index 54985bb7a..28f56583d 100644 --- a/src/python/ensembl/io/genomio/data/schemas/genome.json +++ b/src/python/ensembl/io/genomio/data/schemas/genome.json @@ -110,6 +110,14 @@ { "type" : "array", "items" : { "type" : "string" } } ] } + }, + "database_info" : { + "type": "object", + "additionalProperties": false, + "description" : "Optional name of target database where meta data was retrieved.", + "properties" : { + "name": { "type" : "string" } + } } }, @@ -123,7 +131,8 @@ "genebuild" : { "$ref" : "#/definitions/genebuild_info" }, "provider" : { "$ref" : "#/definitions/provider_info" }, "BRC4" : { "$ref" : "#/definitions/BRC4_info" }, - "added_seq" : { "$ref" : "#/definitions/added_sequence_info" } + "added_seq" : { "$ref" : "#/definitions/added_sequence_info" }, + "database" : { "$ref" : "#/definitions/database_info" } }, "required" : [ "species", diff --git a/src/python/ensembl/io/genomio/database/factory.py b/src/python/ensembl/io/genomio/database/factory.py index 450992431..58a57b0df 100644 --- a/src/python/ensembl/io/genomio/database/factory.py +++ b/src/python/ensembl/io/genomio/database/factory.py @@ -153,7 +153,7 @@ def main(arg_list: list[str] | None = None) -> None: """Main script entry-point. Args: - arg_list: TODO + arg_list: Arguments to parse passing list to parse_args(). """ args = parse_args(arg_list) diff --git a/src/python/ensembl/io/genomio/database/meta_getter.py b/src/python/ensembl/io/genomio/database/meta_getter.py index 01c4e0599..da75fcaae 100644 --- a/src/python/ensembl/io/genomio/database/meta_getter.py +++ b/src/python/ensembl/io/genomio/database/meta_getter.py @@ -12,8 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""A simple helper script to connect to a core database and retrieve a single meta_value -or multiple meta_value and dump meta_key/value pairs to stdout / JSON.""" +"""Connect to a core database and retrieve a meta_key:meta_value pair(s) +and dump meta_key/value pairs to stdout / JSON.""" __all__ = ["get_meta_values"] @@ -91,7 +91,7 @@ def parse_args(arg_list: list[str] | None) -> argparse.Namespace: """ parser = ArgumentParser(description=__doc__) - parser.add_server_arguments(include_database=True, help="core database") + parser.add_server_arguments(include_database=True, help="server url and core database") parser.add_argument_src_path( "--meta_keys_list", help="Input File | List with >=2 meta_keys to query target database." ) @@ -104,7 +104,6 @@ def main(arg_list: list[str] | None = None) -> None: Args: arg_list: Arguments to parse passing list to parse_args(). - """ args = parse_args(arg_list) init_logging_with_args(args) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 7cb7702b5..b219bc6df 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -19,22 +19,29 @@ "filter_genome_meta", "check_assembly_version", "check_genebuild_version", + "metadata_dump_setup", ] +import argparse import json -from typing import Any, Dict, Type +from typing import Any, Type import logging +from pydoc import locate from sqlalchemy import select from sqlalchemy.orm import Session +from sqlalchemy.engine import URL from ensembl.core.models import Meta +from ensembl.io.genomio.utils.json_utils import get_json from ensembl.io.genomio.database import DBConnectionLite from ensembl.utils.argparse import ArgumentParser +from ensembl.utils import StrPath from ensembl.utils.logging import init_logging_with_args -METADATA_FILTER: Dict[str, Dict[str, Type]] = { +DEFAULT_FILTER: dict[str, dict[str, Type]] = { + "database": {"name": str}, "added_seq": {"region_name": str}, "annotation": {"provider_name": str, "provider_url": str}, "assembly": { @@ -60,14 +67,15 @@ } -def get_genome_metadata(session: Session) -> Dict[str, Any]: +def get_genome_metadata(session: Session, db_name: str | None) -> dict[str, Any]: """Returns the meta table content from the core database in a nested dictionary. Args: session: Session for the current core. - + db_name: Target database name """ - genome_metadata: Dict[str, Any] = {} + genome_metadata: dict[str, Any] = {} + meta_statement = select(Meta) for row in session.execute(meta_statement).unique().all(): meta_key = row[0].meta_key @@ -81,6 +89,10 @@ def get_genome_metadata(session: Session) -> Dict[str, Any]: genome_metadata[main_key][subkey] = [meta_value] else: genome_metadata[main_key] = {subkey: [meta_value]} + + if db_name: + genome_metadata["database"] = {"name": f"{db_name}"} + # Parse genome metadata to simplify dictionary and check data consistency for main_key, subkeys_dict in genome_metadata.items(): # Replace single-value lists by the value itself @@ -96,19 +108,34 @@ def get_genome_metadata(session: Session) -> Dict[str, Any]: return genome_metadata -def filter_genome_meta(genome_metadata: Dict[str, Any]) -> Dict[str, Any]: +def filter_genome_meta( + genome_metadata: dict[str, Any], metafilter: dict | None, meta_update: bool +) -> dict[str, Any]: """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER. Also converts to expected data types (to follow the genome JSON schema). Args: genome_metadata: Nested metadata key values from the core metadata table. + metafilter: Input JSON containing subset of meta table values to filter on. + meta_update: Deactivates additional meta updating. + """ - filtered_metadata: Dict[str, Any] = {} - for key, subfilter in METADATA_FILTER.items(): + filtered_metadata: dict[str, Any] = {} + + if metafilter: + metadata_filter: dict[str, dict[str, type]] = metafilter + else: + metadata_filter = DEFAULT_FILTER + + for key, subfilter in metadata_filter.items(): if key in genome_metadata: filtered_metadata[key] = {} for subkey, value_type in subfilter.items(): + if isinstance(value_type, str): + value_type = type(value_type) + if isinstance(value_type, int): + value_type = type(value_type) if subkey in genome_metadata[key]: value = genome_metadata[key][subkey] if isinstance(value, list): @@ -116,25 +143,38 @@ def filter_genome_meta(genome_metadata: Dict[str, Any]) -> Dict[str, Any]: else: value = value_type(value) filtered_metadata[key][subkey] = value - # Check assembly and genebuild versions - check_assembly_refseq(filtered_metadata) - check_assembly_version(filtered_metadata) - check_genebuild_version(filtered_metadata) + + # Optional assembly and genebuild based filtering: + if meta_update: + # Check assembly and genebuild versions + check_assembly_refseq(filtered_metadata) + check_assembly_version(filtered_metadata) + check_genebuild_version(filtered_metadata) + return filtered_metadata -def check_assembly_refseq(gmeta_out: Dict[str, Any]) -> None: +def check_assembly_refseq(gmeta_out: dict[str, Any]) -> None: """Update the GCA accession to use GCF if it is from RefSeq. Args: genome_metadata: Nested metadata key values from the core metadata table. """ assembly = gmeta_out.get("assembly", {}) - if assembly.get("provider_name", "") == "RefSeq": - assembly["accession"] = assembly["accession"].replace("GCA", "GCF") + if assembly.get("provider_name"): + if assembly["provider_name"] == "RefSeq": + assembly["accession"] = assembly["accession"].replace("GCA", "GCF") + logging.info("GCA accession updated to RefSeq GFC accession.") + else: + logging.info(f"Meta check 'assembly is RefSeq': Asm provider = {assembly['provider_name']}") + else: + logging.debug( + "Meta filter update to RefSeq accession not done: user meta filter missing: \ + 'assembly.provider_name'" + ) -def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: +def check_assembly_version(genome_metadata: dict[str, Any]) -> None: """Updates the assembly version of the genome metadata provided. If `version` meta key is not and integer or it is not available, the assembly accession's version @@ -164,7 +204,7 @@ def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: logging.info(f'Located version [v{assembly["version"]}] info from meta data.') -def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: +def check_genebuild_version(genome_metadata: dict[str, Any]) -> None: """Updates the genebuild version (if not present) from the genebuild ID, removing the latter. Args: @@ -188,19 +228,82 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: genome_metadata["genebuild"].pop("id", None) -def main() -> None: - """Main script entry-point.""" - parser = ArgumentParser( - description="Fetch the genome metadata from a core database and print it in JSON format." +def convert_dict(meta_dict: dict) -> dict: + """Converts text JSON to add type properties from string + + Args: + meta_dict: User meta dictionary with literal string typing to be converted. + """ + new_dict = meta_dict.copy() + for key, value in meta_dict.items(): + if isinstance(value, dict): + new_dict[key] = convert_dict(value) + else: + new_dict[key] = locate(value) + return new_dict + + +def metadata_dump_setup( + db_url: URL, input_filter: StrPath | None, meta_update: bool, append_db: bool +) -> dict[str, Any]: + """Setup main stages of genome meta dump from user input arguments provided. + Args: + db_url: Target core database URL. + input_filter: Input JSON containing subset of meta table values to filter on. + no_update: Deactivate additional meta updating. + append_db: Append target core database name to output JSON. + + """ + dbc = DBConnectionLite(db_url) + db_name = None + meta_filter = {} + if append_db: + db_name = db_url.database + + if input_filter: + unconverted_json = get_json(input_filter) + meta_filter = convert_dict(unconverted_json) + + with dbc.session_scope() as session: + genome_meta = get_genome_metadata(session, db_name) + genome_meta = filter_genome_meta(genome_meta, meta_filter, meta_update) + + return genome_meta + + +def parse_args(arg_list: list[str] | None) -> argparse.Namespace: + """Return a populated namespace with the arguments parsed from a list or from the command line. + + Args: + arg_list: List of arguments to parse. If `None`, grab them from the command line. + + """ + parser = ArgumentParser(description=__doc__) + parser.add_server_arguments(include_database=True, help="server url and core database") + parser.add_argument_src_path( + "--metafilter", default=None, help="JSON file of nested meta_key:meta_value to filter dump output." + ) + parser.add_argument( + "--meta_update", + action="store_true", + help="Perform assembly and genebuild 'version' metadata checks & update if needed.", ) - parser.add_server_arguments(include_database=True) + parser.add_argument("--append_db", action="store_true", help="Append core database name to output JSON.") parser.add_log_arguments(add_log_file=True) - args = parser.parse_args() + return parser.parse_args(arg_list) + + +def main(arg_list: list[str] | None = None) -> None: + """Main script entry-point. + + Args: + arg_list: Arguments to parse passing list to parse_args(). + """ + args = parse_args(arg_list) init_logging_with_args(args) - dbc = DBConnectionLite(args.url) - with dbc.session_scope() as session: - genome_meta = get_genome_metadata(session) - genome_meta = filter_genome_meta(genome_meta) + genome_meta = metadata_dump_setup( + db_url=args.url, input_filter=args.metafilter, meta_update=args.meta_update, append_db=args.append_db + ) print(json.dumps(genome_meta, indent=2, sort_keys=True)) diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 75c4a4494..3c22e7d09 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -18,23 +18,28 @@ $ pytest test_dump.py """ +# pylint: disable=too-many-positional-arguments from collections import namedtuple from contextlib import nullcontext as does_not_raise -from typing import Any, ContextManager, Dict, List +from typing import Any, ContextManager from unittest.mock import Mock, patch from deepdiff import DeepDiff import pytest +from pytest import param +from _pytest.capture import CaptureFixture +from sqlalchemy.engine import make_url, URL from ensembl.io.genomio.genome_metadata import dump +from ensembl.utils import StrPath MetaRow = namedtuple("MetaRow", "meta_key meta_value") @pytest.mark.parametrize( - "genome_metadata, output, expectation", + ("genome_metadata", "output", "expectation"), [ pytest.param({"assembly": {"version": "1"}}, 1, does_not_raise(), id="Version is '1'"), pytest.param( @@ -58,7 +63,7 @@ ], ) def test_check_assembly_version( - genome_metadata: Dict[str, Any], output: int, expectation: ContextManager + genome_metadata: dict[str, Any], output: int, expectation: ContextManager ) -> None: """Tests the `dump.check_assembly_version()` method. @@ -73,7 +78,7 @@ def test_check_assembly_version( @pytest.mark.parametrize( - "genome_metadata, output, expectation", + ("genome_metadata", "output", "expectation"), [ pytest.param({}, {}, does_not_raise(), id="No 'genebuild' entry"), pytest.param( @@ -98,7 +103,7 @@ def test_check_assembly_version( ], ) def test_check_genebuild_version( - genome_metadata: Dict[str, Any], output: Dict[str, Any], expectation: ContextManager + genome_metadata: dict[str, Any], output: dict[str, Any], expectation: ContextManager ) -> None: """Tests the `dump.check_genebuild_version()` method. @@ -115,34 +120,145 @@ def test_check_genebuild_version( @patch("ensembl.io.genomio.genome_metadata.dump.check_genebuild_version", Mock()) @patch("ensembl.io.genomio.genome_metadata.dump.check_assembly_version", Mock()) @pytest.mark.parametrize( - "genome_metadata, output", + ("genome_metadata", "output", "metafilter", "meta_update"), [ - ({"species": {"taxonomy_id": "5485"}}, {"species": {"taxonomy_id": 5485}}), - ({"species": {"display_name": "Dog"}}, {"species": {"display_name": "Dog"}}), - ({"genebuild": {"new_key": "_"}}, {"genebuild": {}}), - ({"BRC5": "new_value"}, {}), - ({"meta": "key", "species": {"alias": "woof"}}, {"species": {"alias": "woof"}}), - ({"added_seq": {"region_name": [1, 2]}}, {"added_seq": {"region_name": ["1", "2"]}}), + pytest.param( + {"species": {"taxonomy_id": "5485"}}, + {"species": {"taxonomy_id": 5485}}, + None, + False, + id="Meta matches, no filter, allow meta update", + ), + pytest.param( + {"species": {"taxonomy_id": "5485"}}, + {"species": {"taxonomy_id": 5485}}, + None, + True, + id="Meta matches, no filter, perform meta update", + ), + pytest.param( + {"genebuild": {"new_key": "_"}}, {"genebuild": {}}, None, False, id="Filters on '_' value" + ), + pytest.param({"BRC5": "new_value"}, {}, None, False, id="BRC5 new value"), + pytest.param( + {"meta": "key", "species": {"alias": "woof"}}, + {"species": {"alias": "woof"}}, + None, + False, + id="Test alias", + ), + pytest.param( + {"added_seq": {"region_name": [1, 2]}}, + {"added_seq": {"region_name": ["1", "2"]}}, + None, + False, + id="Added seq region_name", + ), + pytest.param({}, {}, None, False, id="BRC5 new value"), + pytest.param( + { + "species": { + "display_name": "Honeybee", + "annotation_source": "Ensembl", + "production_name": "apis_melifera_gca123v1", + "scientific_name": "apis_melifera", + "taxonomy_id": "70921", + } + }, + { + "species": { + "display_name": "Honeybee", + "production_name": "apis_melifera_gca123v1", + "taxonomy_id": "70921", + } + }, + {"species": {"display_name": "str", "production_name": "str", "taxonomy_id": "int"}}, + False, + id="Filter via input meta JSON", + ), + pytest.param( + { + "annotation": { + "provider_name": "ENA", + }, + "assembly": {"accession": "GCA_000111222.3", "version": "1"}, + "genebuild": {"method": "import", "version": "1"}, + }, + {"assembly": {"version": "1"}, "genebuild": {"method": "import", "version": "1"}}, + {"assembly": {"version": "str"}, "genebuild": {"version": "str", "method": "str"}}, + False, + id="Asm + Genebuild version filter", + ), + pytest.param( + { + "annotation": { + "provider_name": "ENA", + }, + "assembly": {"accession": "GCA_000111222.3", "version": "1"}, + "genebuild": {"method": "import", "version": "1"}, + }, + {"genebuild": {"method": "import"}}, + {"genebuild": {"method": "str"}}, + True, + id="Only genebuild method, perform meta update", + ), ], ) -def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, Any]) -> None: +def test_filter_genome_meta( + genome_metadata: dict[str, Any], + output: dict[str, Any], + metafilter: StrPath, + meta_update: bool, +) -> None: """Tests the `dump.filter_genome_meta()` method. Args: genome_metadata: Nested genome metadata key values. output: Expected change in the genome metadata dictionary. + metafilter: Type evaluated meta filter. + meta_update: Permit meta updating. """ - result = dump.filter_genome_meta(genome_metadata) + result = dump.filter_genome_meta(genome_metadata, metafilter, meta_update) assert not DeepDiff(result, output) +@pytest.mark.parametrize( + ("meta_dict", "expected_dict"), + [ + pytest.param( + {"k1": {"sk1": "str"}, "k2": {"sk2": "float"}, "k3": {"sk3": "int"}}, + "{'k1': {'sk1': }, 'k2': {'sk2': }, 'k3': {'sk3': }}", + id="Filter conversion", + ), + ], +) +def test_convert_dict(meta_dict: dict, expected_dict: dict) -> None: + """Tests the `dump.convert_dict()` method. + + Args: + meta_dict: Dict containing string based meta 'subkey' value pairs. + expected_dict: Dict with converted 'subkey' class types. + """ + convert_dict = dump.convert_dict(meta_dict) + string_convert = str(convert_dict) + assert not DeepDiff(string_convert, expected_dict) + + @patch("sqlalchemy.engine.Result") @patch("sqlalchemy.orm.Session") @pytest.mark.parametrize( - "meta_data, output, expectation", + ("db_name", "meta_data", "output", "expectation"), [ - pytest.param([], {}, does_not_raise(), id="Empty meta table"), + pytest.param(None, [], {}, does_not_raise(), id="Empty meta table"), + pytest.param( + "test_dbname_core_110_1", + [], + {"database": {"name": "test_dbname_core_110_1"}}, + does_not_raise(), + id="db_name append, empty meta table", + ), pytest.param( + None, [ [MetaRow("sample", "gene1")], [MetaRow("species.name", "dog")], @@ -153,6 +269,7 @@ def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, A id="Meta table with simple values", ), pytest.param( + None, [ [MetaRow("sample", "gene1")], [MetaRow("sample", "gene2")], @@ -164,24 +281,45 @@ def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, A id="Meta table with lists", ), pytest.param( + None, [[MetaRow("species", "dog")], [MetaRow("species.synonym", "puppy")]], {}, pytest.raises(ValueError), id="'species' and 'species.synonym' meta keys", ), + pytest.param( + "test_dbname_core_110_1", + [ + [MetaRow("assembly.accession", "GCA_000111222.3")], + [MetaRow("species.annotation_source", "Community")], + [MetaRow("species.production_name", "genus_species_gca000111222v3cm")], + ], + { + "assembly": {"accession": "GCA_000111222.3"}, + "database": {"name": "test_dbname_core_110_1"}, + "species": { + "annotation_source": "Community", + "production_name": "genus_species_gca000111222v3cm", + }, + }, + does_not_raise(), + id="db_name append to meta", + ), ], ) def test_get_genome_metadata( mock_session: Mock, mock_result: Mock, - meta_data: List[MetaRow], - output: Dict[str, Any], + db_name: str | None, + meta_data: list[MetaRow], + output: dict[str, Any], expectation: ContextManager, ) -> None: """Tests the `dump.get_genome_metadata()` method. Args: mock_session: A mock of `sqlalchemy.orm.Session()` class. + db_name: Target core database name. meta_data: `meta` table content in a list of named tuples. output: Expected genome metadata dictionary. expectation: Context manager for the expected exception (if any). @@ -190,5 +328,119 @@ def test_get_genome_metadata( mock_result.all.return_value = meta_data mock_session.execute.return_value = mock_result with expectation: - result = dump.get_genome_metadata(mock_session) + result = dump.get_genome_metadata(mock_session, db_name) assert not DeepDiff(result, output) + + +@pytest.mark.parametrize( + "arg_list, expected", + [ + param( + ["--host", "localhost", "--port", "42", "--user", "me", "--database", "test_db"], + { + "host": "localhost", + "port": 42, + "user": "me", + "password": None, + "url": make_url("mysql://me@localhost:42/test_db"), + "database": "test_db", + "metafilter": None, + "meta_update": False, + "append_db": False, + "log_file": None, + "log_level": "WARNING", + "log_file_level": "DEBUG", + }, + id="Default args", + ), + param( + [ + "--host", + "localhost", + "--port", + "42", + "--user", + "me", + "--database", + "test_db", + "--metafilter", + f"{__file__}", + "--append_db", + ], + { + "host": "localhost", + "port": 42, + "user": "me", + "password": None, + "url": make_url("mysql://me@localhost:42/test_db"), + "database": "test_db", + "metafilter": __file__, + "meta_update": False, + "append_db": True, + "log_file": None, + "log_level": "WARNING", + "log_file_level": "DEBUG", + }, + id="Filter, non-default args", + ), + ], +) +def test_parse_args(arg_list: list[str], expected: dict) -> None: + """Tests the `dump.parse_args()` function.""" + # pylint: disable=too-many-positional-arguments + args = dump.parse_args(arg_list) + if args.metafilter: + # DeepDiff is not able to compare two objects of Path type, so convert it to string + setattr(args, "metafilter", str(args.metafilter)) + assert not DeepDiff(vars(args), expected) + + +@pytest.mark.parametrize( + ("arg_list", "db_url", "metafilter", "meta_update", "append_db", "stdout"), + [ + param( + [ + "--host", + "localhost", + "--port", + "42", + "--user", + "me", + "--database", + "test_dbname_core_110_1", + "--append_db", + ], + make_url("mysql://me@localhost:42/test_dbname_core_110_1"), + None, + False, + True, + '{\n "database": {\n "name": "test_dbname_core_110_1"\n }\n}\n', + id="Call main and append_db", + ), + ], +) +@patch("ensembl.io.genomio.genome_metadata.dump.metadata_dump_setup") +def test_main( + mock_metadata_dump_setup: Mock, + capsys: CaptureFixture[str], + arg_list: list[str], + db_url: URL, + metafilter: StrPath, + meta_update: bool, + append_db: bool, + stdout: str, +) -> None: + """Tests the `dump.main()` function (entry point). + + Fixtures: capsys + """ + # pylint: disable=too-many-positional-arguments + mock_metadata_dump_setup.return_value = {"database": {"name": "test_dbname_core_110_1"}} + dump.main(arg_list) + # Check that we have called the mocked function once with the expected parameters + mock_metadata_dump_setup.assert_called_once_with( + db_url=db_url, input_filter=metafilter, meta_update=meta_update, append_db=append_db + ) + # Check that the stdout is as expected + captured = capsys.readouterr() + assert captured.out == stdout