diff --git a/setup.py b/setup.py index c6e3c93..a9012bd 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( name="twined", - version="0.2.2", + version="0.3.0", py_modules=[], install_requires=["jsonschema ~= 4.4.0", "python-dotenv"], url="https://www.github.com/octue/twined", diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index 01a511e..a089069 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -1,7 +1,4 @@ -import copy import os -from unittest.mock import patch -from jsonschema.validators import RefResolver from twined import Twine, exceptions from .base import BaseTestCase @@ -39,87 +36,6 @@ class TestManifestStrands(BaseTestCase): } """ - TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE = """ - { - "input_manifest": { - "datasets": { - "met_mast_data": { - "purpose": "A dataset containing meteorological mast data", - "file_tags_template": { - "type": "object", - "properties": { - "manufacturer": { - "type": "string" - }, - "height": { - "type": "number" - }, - "is_recycled": { - "type": "boolean" - }, - "number_of_blades": { - "type": "number" - } - }, - "required": [ - "manufacturer", - "height", - "is_recycled", - "number_of_blades" - ] - } - } - } - } - } - """ - - INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS = """ - { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": { - "met_mast_data": { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "met_mast_data", - "tags": {}, - "labels": ["met", "mast", "wind"], - "files": [ - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "labels": ["mykeyword1", "mykeyword2"], - "tags": { - "manufacturer": "vestas", - "height": 500, - "is_recycled": true, - "number_of_blades": 3 - }, - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - }, - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 1, - "extension": "csv", - "labels": [], - "tags": { - "manufacturer": "vestas", - "height": 500, - "is_recycled": true, - "number_of_blades": 3 - }, - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - } - ] - } - } - } - """ - def test_missing_manifest_files(self): """Ensures that if you try to read values from missing files, the right exceptions get raised""" twine = Twine(source=self.VALID_MANIFEST_STRAND) @@ -134,6 +50,38 @@ def test_missing_manifest_files(self): with self.assertRaises(exceptions.OutputManifestFileNotFound): twine.validate_output_manifest(source=file) + def test_error_raised_if_datasets_are_missing_from_manifest(self): + """Test that an error is raised if a dataset is missing from a manifest.""" + twine = """ + { + "input_manifest": { + "datasets": { + "cat": { + "purpose": "blah" + }, + "dog": { + "purpose": "blah" + } + } + } + } + """ + + input_manifest = { + "id": "30d2c75c-a7b9-4f16-8627-9c8d5cc04bf4", + "datasets": {"my-dataset": "gs://my-bucket/my_dataset", "dog": "gs://dog-house/dog"}, + } + + twine = Twine(source=twine) + + with self.assertRaises(exceptions.InvalidManifestContents) as context: + twine.validate_input_manifest(source=input_manifest) + + self.assertEqual( + context.exception.message, + "A dataset named 'cat' is expected in the input_manifest but is missing.", + ) + def test_valid_manifest_files(self): """Ensures that a manifest file will validate.""" valid_configuration_manifest = """ @@ -219,7 +167,8 @@ def test_valid_manifest_files(self): "sha-512/256": "someothersha" } ] - } + }, + "scada_data": 
"gs://my-bucket/scada-data" } } """ @@ -335,233 +284,20 @@ def test_valid_manifest_files(self): # values_file = os.path.join(self.path, "configurations", "valid_with_extra.json") # twine.validate_configuration(file=values_file) - def test_error_raised_when_required_tags_missing_for_validate_input_manifest(self): - """Test that an error is raised when required tags from the file tags template for a dataset are missing when - validating the input manifest. - """ - input_manifest = """ - { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": { - "met_mast_data": { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "met_mast_data", - "tags": {}, - "labels": ["met", "mast", "wind"], - "files": [ - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "tags": {}, - "labels": [], - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - } - ] - } - } - } - """ - - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) - - with self.assertRaises(exceptions.InvalidManifestContents): - twine.validate_input_manifest(source=input_manifest) - - def test_validate_input_manifest_raises_error_if_required_tags_are_not_of_required_type(self): - """Test that an error is raised if the required tags from the file tags template for a dataset are present but - are not of the required type when validating an input manifest. - """ - input_manifest = """ - { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": { - "met_mast_data": { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "met_mast_data", - "tags": {}, - "labels": ["met", "mast", "wind"], - "files": [ - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "tags": %s, - "labels": [], - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - } - ] - } - } - } - """ - - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) - - for tags in ( - '{"manufacturer": "Vestas", "height": 350, "is_recycled": false, "number_of_blades": "3"}', - '{"manufacturer": "Vestas", "height": 350, "is_recycled": "no", "number_of_blades": 3}', - '{"manufacturer": false, "height": 350, "is_recycled": "false", "number_of_blades": 3}', - ): - with self.assertRaises(exceptions.InvalidManifestContents): - twine.validate_input_manifest(source=input_manifest % tags) - - def test_validate_input_manifest_with_required_tags(self): - """Test that validating an input manifest with required tags from the file tags template for a dataset works - for tags meeting the requirements. 
- """ - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) - twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) - - def test_validate_input_manifest_with_required_tags_for_remote_tag_template_schema(self): - """Test that a remote tag template can be used for validating tags on the datafiles in a manifest.""" - schema_url = "https://refs.schema.octue.com/octue/my-file-type-tag-template/0.0.0.json" - - twine_with_input_manifest_with_remote_tag_template = ( - """ + def test_error_raised_if_multiple_datasets_have_same_name(self): + """Test that an error is raised if the input manifest has more than one dataset with the same name.""" + twine = """ { "input_manifest": { "datasets": { "met_mast_data": { - "purpose": "A dataset containing meteorological mast data", - "file_tags_template": { - "$ref": "%s" - } + "purpose": "A dataset containing meteorological mast data" } } } } - """ - % schema_url - ) - - remote_schema = { - "type": "object", - "properties": { - "manufacturer": {"type": "string"}, - "height": {"type": "number"}, - "is_recycled": {"type": "boolean"}, - }, - "required": ["manufacturer", "height", "is_recycled"], - } - - twine = Twine(source=twine_with_input_manifest_with_remote_tag_template) - - original_resolve_from_url = copy.copy(RefResolver.resolve_from_url) - - def patch_if_url_is_schema_url(instance, url): - """Patch the jsonschema validator `RefResolver.resolve_from_url` if the url is the schema URL, otherwise - leave it unpatched. - - :param jsonschema.validators.RefResolver instance: - :param str url: - :return mixed: - """ - if url == schema_url: - return remote_schema - else: - return original_resolve_from_url(instance, url) - - with patch("jsonschema.validators.RefResolver.resolve_from_url", new=patch_if_url_is_schema_url): - twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) - - def test_validate_input_manifest_with_required_tags_in_several_datasets(self): - """Test that required tags from the file tags template are validated separately and correctly for each dataset.""" - twine_with_input_manifest_with_required_tags_for_multiple_datasets = """ - { - "input_manifest": { - "datasets": { - "first_dataset": { - "purpose": "A dataset containing meteorological mast data", - "file_tags_template": { - "type": "object", - "properties": { - "manufacturer": { - "type": "string" - }, - "height": { - "type": "number" - } - } - } - }, - "second_dataset": { - "file_tags_template": { - "type": "object", - "properties": { - "is_recycled": { - "type": "boolean" - }, - "number_of_blades": { - "type": "number" - } - } - } - } - } - } - } - """ - - input_manifest = """ - { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": { - "first_dataset": { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e19", - "name": "first_dataset", - "tags": {}, - "labels": [], - "files": [ - { - "path": "input/datasets/7ead7669/file_0.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "tags": { - "manufacturer": "Vestas", - "height": 503.7 - }, - "labels": [], - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_0.csv" - } - ] - }, - "second_dataset": { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e18", - "name": "second_dataset", - "tags": {}, - "labels": [], - "files": [ - { - "path": "input/datasets/blah/file_1.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "tags": { - "is_recycled": true, - "number_of_blades": 3 - }, - "labels": [], - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e82", - "name": 
"file_1.csv" - } - ] - } - } - } """ - twine = Twine(source=twine_with_input_manifest_with_required_tags_for_multiple_datasets) - twine.validate_input_manifest(source=input_manifest) - - def test_error_raised_if_multiple_datasets_have_same_name(self): - """Test that an error is raised if the input manifest has more than one dataset with the same name.""" input_manifest = """ { "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", @@ -584,7 +320,7 @@ def test_error_raised_if_multiple_datasets_have_same_name(self): } """ - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + twine = Twine(source=twine) with self.assertRaises(KeyError): twine.validate_input_manifest(source=input_manifest) @@ -593,6 +329,18 @@ def test_deprecation_warning_issued_and_datasets_format_translated_if_datasets_g """Test that, if datasets are given as a list (the old format), a deprecation warning is issued and the list is translated to a dictionary (the new format). """ + twine = """ + { + "input_manifest": { + "datasets": { + "met_mast_data": { + "purpose": "A dataset containing meteorological mast data" + } + } + } + } + """ + input_manifest = """ { "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", @@ -608,7 +356,7 @@ def test_deprecation_warning_issued_and_datasets_format_translated_if_datasets_g } """ - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + twine = Twine(source=twine) with self.assertWarns(DeprecationWarning): manifest = twine.validate_input_manifest(source=input_manifest) diff --git a/tests/test_twine.py b/tests/test_twine.py index eda8192..a347db9 100644 --- a/tests/test_twine.py +++ b/tests/test_twine.py @@ -62,6 +62,68 @@ def test_broken_json_twine(self): with self.assertRaises(exceptions.InvalidTwineJson): Twine(source=invalid_json_twine) + def test_available_strands_properties(self): + """Test that the `available_strands` and `available_manifest_strands` properties work correctly.""" + twine = """ + { + "configuration_values_schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "The example configuration form", + "description": "The configuration strand of an example twine", + "type": "object", + "properties": { + "n_iterations": { + "description": "An example of an integer configuration variable, called 'n_iterations'.", + "type": "integer", + "minimum": 1, + "maximum": 10, + "default": 5 + } + } + }, + "input_values_schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Input Values", + "description": "The input values strand of an example twine, with a required height value", + "type": "object", + "properties": { + "height": { + "description": "An example of an integer value called 'height'", + "type": "integer", + "minimum": 2 + } + }, + "required": ["height"] + }, + "output_values_schema": { + "title": "Output Values", + "description": "The output values strand of an example twine", + "type": "object", + "properties": { + "width": { + "description": "An example of an integer value called 'result'", + "type": "integer", + "minimum": 2 + } + } + }, + "output_manifest": { + "datasets": { + "my-dataset": {} + } + } + } + """ + + twine = Twine(source=twine) + + self.assertEqual( + twine.available_strands, + {"configuration_values", "input_values", "output_values", "output_manifest"}, + ) + + self.assertEqual(twine.available_manifest_strands, {"output_manifest"}) + def test_deprecation_warning_issued_if_datasets_not_given_as_dictionary_in_manifest_strands(self): """Test that, if datasets are given as a list 
in the manifest strands, a deprecation warning is issued and the list (the old form) is converted to a dictionary (the new form). diff --git a/twined/migrations/__init__.py b/twined/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/twined/migrations/manifest.py b/twined/migrations/manifest.py new file mode 100644 index 0000000..5722c01 --- /dev/null +++ b/twined/migrations/manifest.py @@ -0,0 +1,23 @@ +import warnings + + +def convert_dataset_list_to_dictionary(datasets): + """Convert a list of datasets into a dictionary of datasets. + + :param list datasets: + :return dict: + """ + converted_datasets = {} + + for i, dataset in enumerate(datasets): + converted_datasets[dataset.get("name", f"dataset_{i}")] = dataset + + warnings.warn( + message=( + "Datasets belonging to a manifest should be provided as a dictionary mapping their name to " + "themselves. Support for providing a list of datasets will be phased out soon." + ), + category=DeprecationWarning, + ) + + return converted_datasets diff --git a/twined/migrations/twine.py b/twined/migrations/twine.py new file mode 100644 index 0000000..912c536 --- /dev/null +++ b/twined/migrations/twine.py @@ -0,0 +1,22 @@ +import warnings + + +def convert_manifest_datasets_from_list_to_dictionary(datasets, strand): + """Convert the list of datasets in a manifest strand into a dictionary. + + :param list datasets: + :param str strand: + :return dict: + """ + datasets = {dataset["key"]: dataset for dataset in datasets} + + warnings.warn( + message=( + f"Datasets in the {strand!r} strand of the `twine.json` file should be provided as a " + "dictionary mapping their name to themselves. Support for providing a list of datasets will be " + "phased out soon." + ), + category=DeprecationWarning, + ) + + return datasets diff --git a/twined/schema/manifest_schema.json b/twined/schema/manifest_schema.json index cb8e311..57c2a25 100644 --- a/twined/schema/manifest_schema.json +++ b/twined/schema/manifest_schema.json @@ -22,76 +22,83 @@ "type": "object", "patternProperties": { ".+": { - "type": "object", - "properties": { - "id": { - "description": "ID of the dataset, typically a uuid", + "oneOf": [ + { "type": "string" }, - "name": { - "description": "Name of the dataset (the same as its key in the 'datasets' field).", - "type": "string" - }, - "tags": { - "$ref": "#/$defs/tags" - }, - "labels": { - "$ref": "#/$defs/labels" - }, - "files": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "description": "A file id", - "type": "string" - }, - "path": { - "description": "Path at which the file can be found", - "type": "string" - }, - "extension": { - "description": "The file extension (not including a '.')", - "type": "string" - }, - "sequence": { - "description": "The ordering on the file, if any, within its group/cluster", - "type": [ - "integer", - "null" + { + "type": "object", + "properties": { + "id": { + "description": "ID of the dataset, typically a uuid", + "type": "string" + }, + "name": { + "description": "Name of the dataset (the same as its key in the 'datasets' field).", + "type": "string" + }, + "tags": { + "$ref": "#/$defs/tags" + }, + "labels": { + "$ref": "#/$defs/labels" + }, + "files": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "description": "A file id", + "type": "string" + }, + "path": { + "description": "Path at which the file can be found", + "type": "string" + }, + "extension": { + "description": "The file extension (not including a 
'.')", + "type": "string" + }, + "sequence": { + "description": "The ordering on the file, if any, within its group/cluster", + "type": [ + "integer", + "null" + ] + }, + "cluster": { + "description": "The group, or cluster, to which the file belongs", + "type": "integer" + }, + "posix_timestamp": { + "description": "A posix based timestamp associated with the file. This may, but need not be, the created or modified time. ", + "type": "number" + }, + "tags": { + "$ref": "#/$defs/tags" + }, + "labels": { + "$ref": "#/$defs/labels" + } + }, + "required": [ + "id", + "path", + "tags", + "labels" ] - }, - "cluster": { - "description": "The group, or cluster, to which the file belongs", - "type": "integer" - }, - "posix_timestamp": { - "description": "A posix based timestamp associated with the file. This may, but need not be, the created or modified time. ", - "type": "number" - }, - "tags": { - "$ref": "#/$defs/tags" - }, - "labels": { - "$ref": "#/$defs/labels" } - }, - "required": [ - "id", - "path", - "tags", - "labels" - ] - } + } + }, + "required": [ + "id", + "name", + "tags", + "labels", + "files" + ] } - }, - "required": [ - "id", - "name", - "tags", - "labels", - "files" ] } } diff --git a/twined/twine.py b/twined/twine.py index 4faf357..081b026 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -1,11 +1,12 @@ import json as jsonlib import logging import os -import warnings import pkg_resources from dotenv import load_dotenv from jsonschema import ValidationError, validate as jsonschema_validate +import twined.migrations.manifest as manifest_migrations +import twined.migrations.twine as twine_migrations from . import exceptions from .utils import load_json, trim_suffix @@ -47,7 +48,8 @@ def __init__(self, **kwargs): for name, strand in self._load_twine(**kwargs).items(): setattr(self, name, strand) - self._available_strands = tuple(trim_suffix(name, "_schema") for name in vars(self)) + self._available_strands = set(trim_suffix(name, "_schema") for name in vars(self)) + self._available_manifest_strands = self._available_strands & set(MANIFEST_STRANDS) def _load_twine(self, source=None): """Load twine from a *.json filename, file-like or a json string and validates twine contents.""" @@ -60,15 +62,9 @@ def _load_twine(self, source=None): for strand in set(MANIFEST_STRANDS) & raw_twine.keys(): if isinstance(raw_twine[strand]["datasets"], list): - raw_twine[strand]["datasets"] = {dataset["key"]: dataset for dataset in raw_twine[strand]["datasets"]} - - warnings.warn( - message=( - f"Datasets in the {strand!r} strand of the `twine.json` file should be provided as a " - "dictionary mapping their name to themselves. Support for providing a list of datasets will be " - "phased out soon." - ), - category=DeprecationWarning, + raw_twine[strand]["datasets"] = twine_migrations.convert_manifest_datasets_from_list_to_dictionary( + datasets=raw_twine[strand]["datasets"], + strand=strand, ) self._validate_against_schema("twine", raw_twine) @@ -184,57 +180,52 @@ def _validate_manifest(self, kind, source, cls=None, **kwargs): data = data.to_primitive() if isinstance(data["datasets"], list): - data["datasets"] = {dataset["name"]: dataset for dataset in data["datasets"]} - warnings.warn( - message=( - "Datasets belonging to a manifest should be provided as a dictionary mapping their name to " - "themselves. Support for providing a list of datasets will be phased out soon." 
- ), - category=DeprecationWarning, - ) + data["datasets"] = manifest_migrations.convert_dataset_list_to_dictionary(data["datasets"]) self._validate_against_schema(kind, data) - self._validate_dataset_file_tags(manifest_kind=kind, manifest=data) + self._validate_all_expected_datasets_are_present_in_manifest(manifest_kind=kind, manifest=data) if cls and inbound: - # TODO verify that all the required keys etc are there return cls(**data) return data - def _validate_dataset_file_tags(self, manifest_kind, manifest): - """Validate the tags of the files of each dataset in the manifest against the file tags template in the - corresponding dataset field in the given manifest field of the twine. + def _validate_all_expected_datasets_are_present_in_manifest(self, manifest_kind, manifest): + """Check that all datasets specified in the corresponding manifest strand in the twine are present in the given + manifest. - :param str manifest_kind: - :param dict manifest: + :param str manifest_kind: the kind of manifest that's being validated (so the correct schema can be accessed) + :param dict manifest: the manifest whose datasets are to be validated + :raise twined.exceptions.InvalidManifestContents: if one or more of the expected datasets is missing :return None: """ - # This is the manifest schema included in the twine.json file, not the schema for manifest.json files. + # This is the manifest schema included in the twine.json file, not the schema for `manifest.json` files. manifest_schema = getattr(self, manifest_kind) - for dataset_name, dataset_schema in manifest_schema["datasets"].items(): - dataset = manifest["datasets"].get(dataset_name) - - if not dataset: - continue - - file_tags_template = dataset_schema.get("file_tags_template") - - if not file_tags_template: + for expected_dataset in manifest_schema["datasets"]: + if expected_dataset in manifest["datasets"]: continue - for file in dataset["files"]: - try: - jsonschema_validate(instance=file["tags"], schema=file_tags_template) - except ValidationError as e: - raise exceptions.invalid_contents_map[manifest_kind](str(e)) + raise exceptions.invalid_contents_map[manifest_kind]( + f"A dataset named {expected_dataset!r} is expected in the {manifest_kind} but is missing." + ) @property def available_strands(self): - """Tuple of strand names that are found in this twine""" + """Get the names of strands that are found in this twine. + + :return set: + """ return self._available_strands + @property + def available_manifest_strands(self): + """Get the names of the manifest strands that are found in this twine. + + :return set: + """ + return self._available_manifest_strands + def validate_children(self, source, **kwargs): """Validate that the children values, passed as either a file or a json string, are correct.""" # TODO cache this loaded data keyed on a hashed version of kwargs
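Illustrative usage of the changes above (a minimal sketch assembled from the new tests in this diff; the twine JSON, the manifest ID and the `gs://` paths are placeholder values, not anything defined in the repository):

    from twined import Twine, exceptions

    # A twine whose input manifest strand expects a single dataset named "met_mast_data".
    twine = Twine(
        source="""
        {
            "input_manifest": {
                "datasets": {
                    "met_mast_data": {"purpose": "A dataset containing meteorological mast data"}
                }
            }
        }
        """
    )

    # The updated manifest schema accepts a dataset given either as a full object or as a plain
    # string such as a cloud storage path.
    twine.validate_input_manifest(
        source={
            "id": "30d2c75c-a7b9-4f16-8627-9c8d5cc04bf4",
            "datasets": {"met_mast_data": "gs://my-bucket/met-mast-data"},
        }
    )

    # If an expected dataset is absent from the manifest, validation now fails with an explicit message.
    try:
        twine.validate_input_manifest(
            source={
                "id": "30d2c75c-a7b9-4f16-8627-9c8d5cc04bf4",
                "datasets": {"some_other_dataset": "gs://other-bucket/other-dataset"},
            }
        )
    except exceptions.InvalidManifestContents as error:
        # A dataset named 'met_mast_data' is expected in the input_manifest but is missing.
        print(error.message)

    # The new property exposes which of the twine's strands are manifest strands, as a set.
    print(twine.available_manifest_strands)  # {'input_manifest'}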