diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f56d8df..9bc4e1e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -67,7 +67,7 @@ repos: - '^hotfix/([a-z][a-z0-9]*)(-[a-z0-9]+)*$' - '^refactor/([a-z][a-z0-9]*)(-[a-z0-9]+)*$' - '^review/([a-z][a-z0-9]*)(-[a-z0-9]+)*$' - - '^release/(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)(?:-(?P(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$' + - '^enhancement/([a-z][a-z0-9]*)(-[a-z0-9]+)*$' - repo: https://github.com/octue/pre-commit-hooks rev: 0.5.0 diff --git a/setup.py b/setup.py index 2fc16e7..79c367d 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( name="twined", - version="0.1.2", + version="0.2.0", py_modules=[], install_requires=["jsonschema ~= 3.2.0", "python-dotenv"], url="https://www.github.com/octue/twined", diff --git a/tests/data/apps/empty_app/twine.json b/tests/data/apps/empty_app/twine.json index 8c410a5..ad44e51 100644 --- a/tests/data/apps/empty_app/twine.json +++ b/tests/data/apps/empty_app/twine.json @@ -12,7 +12,7 @@ "credentials": [ ], "input_manifest": { - "datasets": [] + "datasets": {} }, "input_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", @@ -23,7 +23,7 @@ } }, "output_manifest": { - "datasets": [] + "datasets": {} }, "output_values_schema": { "title": "Output Values", diff --git a/tests/data/apps/example_app/twine.json b/tests/data/apps/example_app/twine.json index 1a14fc8..1f93e78 100644 --- a/tests/data/apps/example_app/twine.json +++ b/tests/data/apps/example_app/twine.json @@ -32,16 +32,14 @@ } ], "input_manifest": { - "datasets": [ - { - "key": "met_mast_data", + "datasets": { + "met_mast_data": { "purpose": "A dataset containing meteorological mast data" }, - { - "key": "scada_data", + "scada_data": { "purpose": "A dataset containing scada data" } - ] + } }, "input_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", @@ -57,12 +55,11 @@ } }, "output_manifest": { - "datasets": [ - { - "key": "production_data", + "datasets": { + "production_data": { "purpose": "A dataset containing production data" } - ] + } }, "output_values_schema": { "title": "Output Values", diff --git a/tests/data/apps/simple_app/twine.json b/tests/data/apps/simple_app/twine.json index 31fd99c..c3ceffa 100644 --- a/tests/data/apps/simple_app/twine.json +++ b/tests/data/apps/simple_app/twine.json @@ -68,6 +68,6 @@ } }, "output_manifest": { - "datasets": [] + "datasets": {} } } diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index a879e15..01a511e 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -1,6 +1,5 @@ import copy import os -import unittest from unittest.mock import patch from jsonschema.validators import RefResolver @@ -14,32 +13,28 @@ class TestManifestStrands(BaseTestCase): VALID_MANIFEST_STRAND = """ { "configuration_manifest": { - "datasets": [ - { - "key": "configuration_files_data", + "datasets": { + "configuration_files_data": { "purpose": "A dataset containing files used in configuration" } - ] + } }, "input_manifest": { - "datasets": [ - { - "key": "met_mast_data", + "datasets": { + "met_mast_data": { "purpose": "A dataset containing meteorological mast data" }, - { - "key": "scada_data", + "scada_data": { "purpose": "A dataset containing scada data" } - ] + } }, "output_manifest": { - "datasets": [ - { - "key": "output_files_data", + "datasets": { + "output_files_data": { "purpose": "A dataset containing output results" } - ] + } } } """ @@ -47,9 +42,8 @@ class TestManifestStrands(BaseTestCase): TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE = """ { "input_manifest": { - "datasets": [ - { - "key": "met_mast_data", + "datasets": { + "met_mast_data": { "purpose": "A dataset containing meteorological mast data", "file_tags_template": { "type": "object", @@ -75,7 +69,7 @@ class TestManifestStrands(BaseTestCase): ] } } - ] + } } } """ @@ -83,8 +77,8 @@ class TestManifestStrands(BaseTestCase): INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS = """ { "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - { + "datasets": { + "met_mast_data": { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "met_mast_data", "tags": {}, @@ -122,7 +116,7 @@ class TestManifestStrands(BaseTestCase): } ] } - ] + } } """ @@ -145,8 +139,8 @@ def test_valid_manifest_files(self): valid_configuration_manifest = """ { "id": "3ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - { + "datasets": { + "configuration_files_data": { "id": "34ad7669-8162-4f64-8cd5-4abe92509e17", "name": "configuration_files_data", "tags": {}, @@ -182,15 +176,15 @@ def test_valid_manifest_files(self): } ] } - ] + } } """ valid_input_manifest = """ { "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - { + "datasets": { + "met_mast_data": { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "met_mast_data", "tags": {}, @@ -226,15 +220,15 @@ def test_valid_manifest_files(self): } ] } - ] + } } """ valid_output_manifest = """ { "id": "2ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - { + "datasets": { + "output_files_data": { "id": "1ead7669-8162-4f64-8cd5-4abe92509e17", "name": "output_files_data", "tags": {}, @@ -270,7 +264,7 @@ def test_valid_manifest_files(self): } ] } - ] + } } """ @@ -348,8 +342,8 @@ def test_error_raised_when_required_tags_missing_for_validate_input_manifest(sel input_manifest = """ { "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - { + "datasets": { + "met_mast_data": { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "met_mast_data", "tags": {}, @@ -367,7 +361,7 @@ def test_error_raised_when_required_tags_missing_for_validate_input_manifest(sel } ] } - ] + } } """ @@ -383,8 +377,8 @@ def test_validate_input_manifest_raises_error_if_required_tags_are_not_of_requir input_manifest = """ { "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - { + "datasets": { + "met_mast_data": { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "met_mast_data", "tags": {}, @@ -402,7 +396,7 @@ def test_validate_input_manifest_raises_error_if_required_tags_are_not_of_requir } ] } - ] + } } """ @@ -431,15 +425,14 @@ def test_validate_input_manifest_with_required_tags_for_remote_tag_template_sche """ { "input_manifest": { - "datasets": [ - { - "key": "met_mast_data", + "datasets": { + "met_mast_data": { "purpose": "A dataset containing meteorological mast data", "file_tags_template": { "$ref": "%s" } } - ] + } } } """ @@ -481,9 +474,8 @@ def test_validate_input_manifest_with_required_tags_in_several_datasets(self): twine_with_input_manifest_with_required_tags_for_multiple_datasets = """ { "input_manifest": { - "datasets": [ - { - "key": "first_dataset", + "datasets": { + "first_dataset": { "purpose": "A dataset containing meteorological mast data", "file_tags_template": { "type": "object", @@ -497,8 +489,7 @@ def test_validate_input_manifest_with_required_tags_in_several_datasets(self): } } }, - { - "key": "second_dataset", + "second_dataset": { "file_tags_template": { "type": "object", "properties": { @@ -511,7 +502,7 @@ def test_validate_input_manifest_with_required_tags_in_several_datasets(self): } } } - ] + } } } """ @@ -519,8 +510,8 @@ def test_validate_input_manifest_with_required_tags_in_several_datasets(self): input_manifest = """ { "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - { + "datasets": { + "first_dataset": { "id": "7ead7669-8162-4f64-8cd5-4abe92509e19", "name": "first_dataset", "tags": {}, @@ -541,7 +532,7 @@ def test_validate_input_manifest_with_required_tags_in_several_datasets(self): } ] }, - { + "second_dataset": { "id": "7ead7669-8162-4f64-8cd5-4abe92509e18", "name": "second_dataset", "tags": {}, @@ -562,7 +553,7 @@ def test_validate_input_manifest_with_required_tags_in_several_datasets(self): } ] } - ] + } } """ @@ -574,30 +565,63 @@ def test_error_raised_if_multiple_datasets_have_same_name(self): input_manifest = """ { "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - { + "datasets": { + "met_mast_data": { "id": "7ead7669-8162-4f64-8cd5-4abe92509e19", "name": "met_mast_data", "tags": {}, "labels": [], "files": [] }, - { + "met_mast_data": { "id": "7ead7669-8162-4f64-8cd5-4abe92509e18", "name": "met_mast_data", "tags": {}, "labels": [], "files": [] } - ] + } } """ twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) - with self.assertRaises(exceptions.DatasetNameIsNotUnique): + with self.assertRaises(KeyError): twine.validate_input_manifest(source=input_manifest) + def test_deprecation_warning_issued_and_datasets_format_translated_if_datasets_given_as_list(self): + """Test that, if datasets are given as a list (the old format), a deprecation warning is issued and the list + is translated to a dictionary (the new format). + """ + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e19", + "name": "met_mast_data", + "tags": {}, + "labels": [], + "files": [] + } + ] + } + """ -if __name__ == "__main__": - unittest.main() + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + + with self.assertWarns(DeprecationWarning): + manifest = twine.validate_input_manifest(source=input_manifest) + + self.assertEqual( + manifest["datasets"], + { + "met_mast_data": { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e19", + "name": "met_mast_data", + "tags": {}, + "labels": [], + "files": [], + } + }, + ) diff --git a/tests/test_twine.py b/tests/test_twine.py index 5389823..d638a0d 100644 --- a/tests/test_twine.py +++ b/tests/test_twine.py @@ -1,7 +1,6 @@ import os -import unittest -from twined import Twine, exceptions +from twined import MANIFEST_STRANDS, Twine, exceptions from .base import BaseTestCase @@ -63,6 +62,37 @@ def test_broken_json_twine(self): with self.assertRaises(exceptions.InvalidTwineJson): Twine(source=invalid_json_twine) + def test_deprecation_warning_issued_if_datasets_not_given_as_dictionary_in_manifest_strands(self): + """Test that, if datasets are given as a list in the manifest strands, a deprecation warning is issued and the + list (the old form) is converted to a dictionary (the new form). + """ + for manifest_strand in MANIFEST_STRANDS: + with self.subTest(manifest_strand=manifest_strand): + invalid_twine = ( + """ + { + "%s": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data" + } + ] + } + } + """ + % manifest_strand + ) + + with self.assertWarns(DeprecationWarning): + twine = Twine(source=invalid_twine) -if __name__ == "__main__": - unittest.main() + self.assertEqual( + getattr(twine, manifest_strand)["datasets"], + { + "met_mast_data": { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + } + }, + ) diff --git a/twined/exceptions.py b/twined/exceptions.py index 03a01c9..fdcb25d 100644 --- a/twined/exceptions.py +++ b/twined/exceptions.py @@ -124,10 +124,6 @@ class InvalidManifestContents(InvalidManifest, ValidationError): """Raised when the manifest files are missing or do not match tags, sequences, clusters, extensions etc as required""" -class DatasetNameIsNotUnique(InvalidManifest): - """Raise when a dataset's name is not unique within its manifest.""" - - # --------------------- Exceptions relating to access of data using the Twine instance ------------------------ # TODO This is related to filtering files from a manifest. Determine whether this belongs here, diff --git a/twined/schema/manifest_schema.json b/twined/schema/manifest_schema.json index fdb2131..4bf3df2 100644 --- a/twined/schema/manifest_schema.json +++ b/twined/schema/manifest_schema.json @@ -21,62 +21,81 @@ "type": "string" }, "datasets": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "description": "ID of the dataset, typically a uuid", - "type": "string" - }, - "name": { - "description": "Name of the dataset", - "type": "string" - }, - "tags": {"$ref": "#/$defs/tags"}, - "labels": {"$ref": "#/$defs/labels"}, - "files": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "description": "A file id", - "type": "string" - }, - "path": { - "description": "Path at which the file can be found", - "type": "string" - }, - "extension": { - "description": "The file extension (not including a '.')", - "type": "string" + "type": "object", + "patternProperties": { + ".+": { + "type": "object", + "properties": { + "id": { + "description": "ID of the dataset, typically a uuid", + "type": "string" + }, + "name": { + "description": "Name of the dataset (the same as its key in the 'datasets' field).", + "type": "string" + }, + "tags": { + "$ref": "#/$defs/tags" + }, + "labels": { + "$ref": "#/$defs/labels" + }, + "files": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "description": "A file id", + "type": "string" + }, + "path": { + "description": "Path at which the file can be found", + "type": "string" + }, + "extension": { + "description": "The file extension (not including a '.')", + "type": "string" + }, + "sequence": { + "description": "The ordering on the file, if any, within its group/cluster", + "type": [ + "integer", + "null" + ] + }, + "cluster": { + "description": "The group, or cluster, to which the file belongs", + "type": "integer" + }, + "posix_timestamp": { + "description": "A posix based timestamp associated with the file. This may, but need not be, the created or modified time. ", + "type": "number" + }, + "tags": { + "$ref": "#/$defs/tags" + }, + "labels": { + "$ref": "#/$defs/labels" + } }, - "sequence": { - "description": "The ordering on the file, if any, within its group/cluster", - "type": ["integer", "null"] - }, - "cluster": { - "description": "The group, or cluster, to which the file belongs", - "type": "integer" - }, - "posix_timestamp": { - "description": "A posix based timestamp associated with the file. This may, but need not be, the created or modified time. ", - "type": "number" - }, - "tags": {"$ref": "#/$defs/tags"}, - "labels": {"$ref": "#/$defs/labels"} - }, - "required": [ - "id", - "path", - "tags", - "labels" - ] + "required": [ + "id", + "path", + "tags", + "labels" + ] + } } - } - }, - "required": ["id", "name", "tags", "labels", "files"] + }, + "required": [ + "id", + "name", + "tags", + "labels", + "files" + ] + } } } }, diff --git a/twined/schema/twine_schema.json b/twined/schema/twine_schema.json index 6e94a2e..06877a9 100644 --- a/twined/schema/twine_schema.json +++ b/twined/schema/twine_schema.json @@ -38,27 +38,23 @@ "type": "object", "properties": { "datasets": { - "type": "array", + "type": "object", "description": "A list of entries, each describing a dataset that should be attached to / made available to the digital twin", - "items": { - "type": "object", - "properties": { - "key": { - "description": "A textual key identifying this dataset within the application/twin", - "type": "string" - }, - "purpose": { - "description": "What data this dataset contains, eg 'the set of data files from the energy production calculation process'", - "type": "string", - "default": "" - }, - "file_tags_template": { - "$ref": "#/$defs/file_tags_template" + "patternProperties": { + ".+": { + "description": "A dataset representation whose property name/key uniquely identifies the dataset to the service", + "type": "object", + "properties": { + "purpose": { + "description": "What data this dataset contains, eg 'the set of data files from the energy production calculation process'", + "type": "string", + "default": "" + }, + "file_tags_template": { + "$ref": "#/$defs/file_tags_template" + } } - }, - "required": [ - "key" - ] + } } } }, diff --git a/twined/twine.py b/twined/twine.py index 34e497d..5cb2255 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -1,6 +1,7 @@ import json as jsonlib import logging import os +import warnings import pkg_resources from dotenv import load_dotenv from jsonschema import ValidationError, validate as jsonschema_validate @@ -52,14 +53,27 @@ def _load_twine(self, source=None): """Load twine from a *.json filename, file-like or a json string and validates twine contents.""" if source is None: # If loading an unspecified twine, return an empty one rather than raising error (like in _load_data()) - raw = {} + raw_twine = {} logger.warning("No twine source specified. Loading empty twine.") else: - raw = self._load_json("twine", source, allowed_kinds=("file-like", "filename", "string", "object")) + raw_twine = self._load_json("twine", source, allowed_kinds=("file-like", "filename", "string", "object")) + + for strand in set(MANIFEST_STRANDS) & raw_twine.keys(): + if isinstance(raw_twine[strand]["datasets"], list): + raw_twine[strand]["datasets"] = {dataset["key"]: dataset for dataset in raw_twine[strand]["datasets"]} + + warnings.warn( + message=( + f"Datasets in the {strand!r} strand of the `twine.json` file should be provided as a " + "dictionary mapping their name to themselves. Support for providing a list of datasets will be " + "phased out soon." + ), + category=DeprecationWarning, + ) - self._validate_against_schema("twine", raw) - self._validate_twine_version(twine_file_twined_version=raw.get("twined_version", None)) - return raw + self._validate_against_schema("twine", raw_twine) + self._validate_twine_version(twine_file_twined_version=raw_twine.get("twined_version", None)) + return raw_twine def _load_json(self, kind, source, **kwargs): """Load data from either a *.json file, an open file pointer or a json string. Directly returns any other data.""" @@ -165,9 +179,19 @@ def _validate_manifest(self, kind, source, cls=None, **kwargs): # TODO elegant way of cleaning up this nasty serialisation hack to manage conversion of outbound manifests to primitive inbound = True - if hasattr(data, "serialise"): + if hasattr(data, "to_primitive"): inbound = False - data = data.serialise() + data = data.to_primitive() + + if isinstance(data["datasets"], list): + data["datasets"] = {dataset["name"]: dataset for dataset in data["datasets"]} + warnings.warn( + message=( + "Datasets belonging to a manifest should be provided as a dictionary mapping their name to" + "themselves. Support for providing a list of datasets will be phased out soon." + ), + category=DeprecationWarning, + ) self._validate_against_schema(kind, data) self._validate_dataset_file_tags(manifest_kind=kind, manifest=data) @@ -189,20 +213,12 @@ def _validate_dataset_file_tags(self, manifest_kind, manifest): # This is the manifest schema included in the twine.json file, not the schema for manifest.json files. manifest_schema = getattr(self, manifest_kind) - for dataset_schema in manifest_schema["datasets"]: - datasets = [dataset for dataset in manifest["datasets"] if dataset["name"] == dataset_schema["key"]] + for dataset_name, dataset_schema in manifest_schema["datasets"].items(): + dataset = manifest["datasets"].get(dataset_name) - if not datasets: + if not dataset: continue - if len(datasets) > 1: - raise exceptions.DatasetNameIsNotUnique( - f"There is more than one dataset named {dataset_schema['key']!r} - ensure each dataset within a " - f"manifest is uniquely named." - ) - - dataset = datasets.pop(0) - file_tags_template = dataset_schema.get("file_tags_template") if not file_tags_template: diff --git a/twined/utils/load_json.py b/twined/utils/load_json.py index 264708a..cb78544 100644 --- a/twined/utils/load_json.py +++ b/twined/utils/load_json.py @@ -33,7 +33,7 @@ def check(kind): if isinstance(source, io.IOBase): logger.debug("Detected source is a file-like object, loading contents...") check("file-like") - return json.load(source, *args, **kwargs) + return json.load(source, object_pairs_hook=raise_error_if_duplicate_keys, *args, **kwargs) elif not isinstance(source, str): logger.debug("Source is not a string, bypassing (returning raw data)") @@ -44,9 +44,27 @@ def check(kind): logger.debug("Detected source is name of a *.json file, loading from %s", source) check("filename") with open(source) as f: - return json.load(f, *args, **kwargs) + return json.load(f, object_pairs_hook=raise_error_if_duplicate_keys, *args, **kwargs) else: logger.debug("Detected source is string containing json data, parsing...") check("string") - return json.loads(source, *args, **kwargs) + return json.loads(source, object_pairs_hook=raise_error_if_duplicate_keys, *args, **kwargs) + + +def raise_error_if_duplicate_keys(pairs): + """Raise an error if any of the given key-value pairs have the same key. + + :param list(tuple) pairs: a JSON object converted to a list of key-value pairs + :raise KeyError: if any of the pairs have the same key + :return dict: + """ + result = {} + + for key, value in pairs: + if key in result: + raise KeyError(f"Duplicate key detected: {key!r}.") + + result[key] = value + + return result