From 71a66263ca6ebe25648c813ae17777a38221965a Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 28 Mar 2022 12:15:44 +0100 Subject: [PATCH 01/14] ENH: Update manifest schema to only expect dataset paths BREAKING CHANGE: This changes the public manifest schema. --- setup.py | 2 +- twined/schema/manifest_schema.json | 72 +----------------------------- 2 files changed, 2 insertions(+), 72 deletions(-) diff --git a/setup.py b/setup.py index c6e3c93..a9012bd 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( name="twined", - version="0.2.2", + version="0.3.0", py_modules=[], install_requires=["jsonschema ~= 4.4.0", "python-dotenv"], url="https://www.github.com/octue/twined", diff --git a/twined/schema/manifest_schema.json b/twined/schema/manifest_schema.json index cb8e311..d4e6f8e 100644 --- a/twined/schema/manifest_schema.json +++ b/twined/schema/manifest_schema.json @@ -22,77 +22,7 @@ "type": "object", "patternProperties": { ".+": { - "type": "object", - "properties": { - "id": { - "description": "ID of the dataset, typically a uuid", - "type": "string" - }, - "name": { - "description": "Name of the dataset (the same as its key in the 'datasets' field).", - "type": "string" - }, - "tags": { - "$ref": "#/$defs/tags" - }, - "labels": { - "$ref": "#/$defs/labels" - }, - "files": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "description": "A file id", - "type": "string" - }, - "path": { - "description": "Path at which the file can be found", - "type": "string" - }, - "extension": { - "description": "The file extension (not including a '.')", - "type": "string" - }, - "sequence": { - "description": "The ordering on the file, if any, within its group/cluster", - "type": [ - "integer", - "null" - ] - }, - "cluster": { - "description": "The group, or cluster, to which the file belongs", - "type": "integer" - }, - "posix_timestamp": { - "description": "A posix based timestamp associated with the file. This may, but need not be, the created or modified time. ", - "type": "number" - }, - "tags": { - "$ref": "#/$defs/tags" - }, - "labels": { - "$ref": "#/$defs/labels" - } - }, - "required": [ - "id", - "path", - "tags", - "labels" - ] - } - } - }, - "required": [ - "id", - "name", - "tags", - "labels", - "files" - ] + "type": "string" } } } From dabb2f1763e6f737fc3befcbf9035a63b19d20ac Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 11:02:04 +0100 Subject: [PATCH 02/14] FIX: Allow new or old manifest datasets format in schema --- twined/schema/manifest_schema.json | 79 +++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/twined/schema/manifest_schema.json b/twined/schema/manifest_schema.json index d4e6f8e..57c2a25 100644 --- a/twined/schema/manifest_schema.json +++ b/twined/schema/manifest_schema.json @@ -22,7 +22,84 @@ "type": "object", "patternProperties": { ".+": { - "type": "string" + "oneOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "id": { + "description": "ID of the dataset, typically a uuid", + "type": "string" + }, + "name": { + "description": "Name of the dataset (the same as its key in the 'datasets' field).", + "type": "string" + }, + "tags": { + "$ref": "#/$defs/tags" + }, + "labels": { + "$ref": "#/$defs/labels" + }, + "files": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "description": "A file id", + "type": "string" + }, + "path": { + "description": "Path at which the file can be found", + "type": "string" + }, + "extension": { + "description": "The file extension (not including a '.')", + "type": "string" + }, + "sequence": { + "description": "The ordering on the file, if any, within its group/cluster", + "type": [ + "integer", + "null" + ] + }, + "cluster": { + "description": "The group, or cluster, to which the file belongs", + "type": "integer" + }, + "posix_timestamp": { + "description": "A posix based timestamp associated with the file. This may, but need not be, the created or modified time. ", + "type": "number" + }, + "tags": { + "$ref": "#/$defs/tags" + }, + "labels": { + "$ref": "#/$defs/labels" + } + }, + "required": [ + "id", + "path", + "tags", + "labels" + ] + } + } + }, + "required": [ + "id", + "name", + "tags", + "labels", + "files" + ] + } + ] } } } From 8c9d0916dfe4ebca0c373ee46f2428682aca0de0 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 11:11:36 +0100 Subject: [PATCH 03/14] REF: Move manifest datasets migration into migration package --- twined/migrations/__init__.py | 0 twined/migrations/manifest.py | 27 +++++++++++++++++++++++++++ twined/twine.py | 10 ++-------- 3 files changed, 29 insertions(+), 8 deletions(-) create mode 100644 twined/migrations/__init__.py create mode 100644 twined/migrations/manifest.py diff --git a/twined/migrations/__init__.py b/twined/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/twined/migrations/manifest.py b/twined/migrations/manifest.py new file mode 100644 index 0000000..95cf2e3 --- /dev/null +++ b/twined/migrations/manifest.py @@ -0,0 +1,27 @@ +import warnings + + +def convert_dataset_list_to_dictionary(datasets): + """Convert a list of datasets into a dictionary of datasets. + + :param list datasets: + :return dict: + """ + converted_datasets = {} + + for i, dataset in enumerate(datasets): + if isinstance(dataset, str): + converted_datasets[f"dataset_{i}"] = dataset + continue + + converted_datasets[dataset.get("name", f"dataset_{i}")] = dataset + + warnings.warn( + message=( + "Datasets belonging to a manifest should be provided as a dictionary mapping their name to " + "themselves. Support for providing a list of datasets will be phased out soon." + ), + category=DeprecationWarning, + ) + + return converted_datasets diff --git a/twined/twine.py b/twined/twine.py index 4faf357..bb11568 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -6,6 +6,7 @@ from dotenv import load_dotenv from jsonschema import ValidationError, validate as jsonschema_validate +from twined.migrations.manifest import convert_dataset_list_to_dictionary from . import exceptions from .utils import load_json, trim_suffix @@ -184,14 +185,7 @@ def _validate_manifest(self, kind, source, cls=None, **kwargs): data = data.to_primitive() if isinstance(data["datasets"], list): - data["datasets"] = {dataset["name"]: dataset for dataset in data["datasets"]} - warnings.warn( - message=( - "Datasets belonging to a manifest should be provided as a dictionary mapping their name to " - "themselves. Support for providing a list of datasets will be phased out soon." - ), - category=DeprecationWarning, - ) + data["datasets"] = convert_dataset_list_to_dictionary(data["datasets"]) self._validate_against_schema(kind, data) self._validate_dataset_file_tags(manifest_kind=kind, manifest=data) From 41880e17afb491eac9678517b68c02d12f2f8c5a Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 11:18:17 +0100 Subject: [PATCH 04/14] FIX: Skip file tag template validation on datasets with paths for files --- twined/twine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/twined/twine.py b/twined/twine.py index bb11568..16ffb0d 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -200,17 +200,17 @@ def _validate_dataset_file_tags(self, manifest_kind, manifest): """Validate the tags of the files of each dataset in the manifest against the file tags template in the corresponding dataset field in the given manifest field of the twine. - :param str manifest_kind: - :param dict manifest: + :param str manifest_kind: the kind of manifest that's being validated (so the correct schema can be accessed) + :param dict manifest: the manifest whose datasets' files are to be validated :return None: """ - # This is the manifest schema included in the twine.json file, not the schema for manifest.json files. + # This is the manifest schema included in the twine.json file, not the schema for `manifest.json` files. manifest_schema = getattr(self, manifest_kind) for dataset_name, dataset_schema in manifest_schema["datasets"].items(): dataset = manifest["datasets"].get(dataset_name) - if not dataset: + if not dataset or isinstance(dataset, str): continue file_tags_template = dataset_schema.get("file_tags_template") From 0bb2d51dccf55053737ffda07cfa975d4696b1de Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 11:29:01 +0100 Subject: [PATCH 05/14] REF: Move twine migration into migrations subpackage --- twined/migrations/twine.py | 22 ++++++++++++++++++++++ twined/twine.py | 18 ++++++------------ 2 files changed, 28 insertions(+), 12 deletions(-) create mode 100644 twined/migrations/twine.py diff --git a/twined/migrations/twine.py b/twined/migrations/twine.py new file mode 100644 index 0000000..912c536 --- /dev/null +++ b/twined/migrations/twine.py @@ -0,0 +1,22 @@ +import warnings + + +def convert_manifest_datasets_from_list_to_dictionary(datasets, strand): + """Convert the list of datasets in a manifest strand into a dictionary. + + :param list datasets: + :param str strand: + :return dict: + """ + datasets = {dataset["key"]: dataset for dataset in datasets} + + warnings.warn( + message=( + f"Datasets in the {strand!r} strand of the `twine.json` file should be provided as a " + "dictionary mapping their name to themselves. Support for providing a list of datasets will be " + "phased out soon." + ), + category=DeprecationWarning, + ) + + return datasets diff --git a/twined/twine.py b/twined/twine.py index 16ffb0d..768818f 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -1,12 +1,12 @@ import json as jsonlib import logging import os -import warnings import pkg_resources from dotenv import load_dotenv from jsonschema import ValidationError, validate as jsonschema_validate -from twined.migrations.manifest import convert_dataset_list_to_dictionary +import twined.migrations.manifest as manifest_migrations +import twined.migrations.twine as twine_migrations from . import exceptions from .utils import load_json, trim_suffix @@ -61,15 +61,9 @@ def _load_twine(self, source=None): for strand in set(MANIFEST_STRANDS) & raw_twine.keys(): if isinstance(raw_twine[strand]["datasets"], list): - raw_twine[strand]["datasets"] = {dataset["key"]: dataset for dataset in raw_twine[strand]["datasets"]} - - warnings.warn( - message=( - f"Datasets in the {strand!r} strand of the `twine.json` file should be provided as a " - "dictionary mapping their name to themselves. Support for providing a list of datasets will be " - "phased out soon." - ), - category=DeprecationWarning, + raw_twine[strand]["datasets"] = twine_migrations.convert_manifest_datasets_from_list_to_dictionary( + datasets=raw_twine[strand]["datasets"], + strand=strand, ) self._validate_against_schema("twine", raw_twine) @@ -185,7 +179,7 @@ def _validate_manifest(self, kind, source, cls=None, **kwargs): data = data.to_primitive() if isinstance(data["datasets"], list): - data["datasets"] = convert_dataset_list_to_dictionary(data["datasets"]) + data["datasets"] = manifest_migrations.convert_dataset_list_to_dictionary(data["datasets"]) self._validate_against_schema(kind, data) self._validate_dataset_file_tags(manifest_kind=kind, manifest=data) From 237fe722233f12480f6da5f46f1225404744b787 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 15:54:00 +0100 Subject: [PATCH 06/14] TST: Test manifest path-only datasets translation --- tests/test_manifest_strands.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index 01a511e..b79a1f2 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -625,3 +625,27 @@ def test_deprecation_warning_issued_and_datasets_format_translated_if_datasets_g } }, ) + + def test_deprecation_warning_issued_and_datasets_format_translated_if_datasets_given_as_list_of_paths(self): + """Test that, if datasets are given as a list (the old format) of paths, a deprecation warning is issued and the + list is translated to a dictionary (the new format). + """ + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + "path/to/met_mast_data", + "gs://blah/my_dataset" + ] + } + """ + + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + + with self.assertWarns(DeprecationWarning): + manifest = twine.validate_input_manifest(source=input_manifest) + + self.assertEqual( + manifest["datasets"], + {"dataset_0": "path/to/met_mast_data", "dataset_1": "gs://blah/my_dataset"}, + ) From 1ca8bc2d5bc1009235210b806955577349d120f3 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 17:02:18 +0100 Subject: [PATCH 07/14] ENH: Raise error if a dataset is missing from a manifest --- tests/test_manifest_strands.py | 35 +++++++++++++++++++++++++++++++++- twined/twine.py | 22 ++++++++++++++++++++- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index b79a1f2..e6c3580 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -134,6 +134,38 @@ def test_missing_manifest_files(self): with self.assertRaises(exceptions.OutputManifestFileNotFound): twine.validate_output_manifest(source=file) + def test_error_raised_if_datasets_are_missing_from_manifest(self): + """Test that an error is raised if a dataset is missing from a manifest.""" + twine = """ + { + "input_manifest": { + "datasets": { + "cat": { + "purpose": "blah" + }, + "dog": { + "purpose": "blah" + } + } + } + } + """ + + input_manifest = { + "id": "30d2c75c-a7b9-4f16-8627-9c8d5cc04bf4", + "datasets": {"my-dataset": "gs://my-bucket/my_dataset", "dog": "gs://dog-house/dog"}, + } + + twine = Twine(source=twine) + + with self.assertRaises(exceptions.InvalidManifestContents) as context: + twine.validate_input_manifest(source=input_manifest) + + self.assertEqual( + context.exception.message, + "A dataset named 'cat' is expected in the input_manifest but is missing.", + ) + def test_valid_manifest_files(self): """Ensures that a manifest file will validate.""" valid_configuration_manifest = """ @@ -219,7 +251,8 @@ def test_valid_manifest_files(self): "sha-512/256": "someothersha" } ] - } + }, + "scada_data": "gs://my-bucket/scada-data" } } """ diff --git a/twined/twine.py b/twined/twine.py index 768818f..4269084 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -182,14 +182,34 @@ def _validate_manifest(self, kind, source, cls=None, **kwargs): data["datasets"] = manifest_migrations.convert_dataset_list_to_dictionary(data["datasets"]) self._validate_against_schema(kind, data) + self._validate_all_expected_datasets_are_present_in_manifest(manifest_kind=kind, manifest=data) self._validate_dataset_file_tags(manifest_kind=kind, manifest=data) if cls and inbound: - # TODO verify that all the required keys etc are there return cls(**data) return data + def _validate_all_expected_datasets_are_present_in_manifest(self, manifest_kind, manifest): + """Check that all datasets specified in the corresponding manifest strand in the twine are present in the given + manifest. + + :param str manifest_kind: the kind of manifest that's being validated (so the correct schema can be accessed) + :param dict manifest: the manifest whose datasets are to be validated + :raise twined.exceptions.InvalidManifestContents: if one or more of the expected datasets is missing + :return None: + """ + # This is the manifest schema included in the twine.json file, not the schema for `manifest.json` files. + manifest_schema = getattr(self, manifest_kind) + + for expected_dataset in manifest_schema["datasets"]: + if expected_dataset in manifest["datasets"]: + continue + + raise exceptions.invalid_contents_map[manifest_kind]( + f"A dataset named {expected_dataset!r} is expected in the {manifest_kind} but is missing." + ) + def _validate_dataset_file_tags(self, manifest_kind, manifest): """Validate the tags of the files of each dataset in the manifest against the file tags template in the corresponding dataset field in the given manifest field of the twine. From ac84f092c2f4f111b5b608528b787b09f73d89f6 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 17:06:55 +0100 Subject: [PATCH 08/14] REV: Undo support for lists of paths for a manifest's datasets --- tests/test_manifest_strands.py | 24 ------------------------ twined/migrations/manifest.py | 4 ---- 2 files changed, 28 deletions(-) diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index e6c3580..8ac352b 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -658,27 +658,3 @@ def test_deprecation_warning_issued_and_datasets_format_translated_if_datasets_g } }, ) - - def test_deprecation_warning_issued_and_datasets_format_translated_if_datasets_given_as_list_of_paths(self): - """Test that, if datasets are given as a list (the old format) of paths, a deprecation warning is issued and the - list is translated to a dictionary (the new format). - """ - input_manifest = """ - { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - "path/to/met_mast_data", - "gs://blah/my_dataset" - ] - } - """ - - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) - - with self.assertWarns(DeprecationWarning): - manifest = twine.validate_input_manifest(source=input_manifest) - - self.assertEqual( - manifest["datasets"], - {"dataset_0": "path/to/met_mast_data", "dataset_1": "gs://blah/my_dataset"}, - ) diff --git a/twined/migrations/manifest.py b/twined/migrations/manifest.py index 95cf2e3..5722c01 100644 --- a/twined/migrations/manifest.py +++ b/twined/migrations/manifest.py @@ -10,10 +10,6 @@ def convert_dataset_list_to_dictionary(datasets): converted_datasets = {} for i, dataset in enumerate(datasets): - if isinstance(dataset, str): - converted_datasets[f"dataset_{i}"] = dataset - continue - converted_datasets[dataset.get("name", f"dataset_{i}")] = dataset warnings.warn( From 034a01e7d0435557311fa6f6000e69ff4e8fc65f Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 17:30:42 +0100 Subject: [PATCH 09/14] ENH: Move dataset files tag template checking to octue-sdk-python --- tests/test_manifest_strands.py | 228 --------------------------------- twined/twine.py | 29 ----- 2 files changed, 257 deletions(-) diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index 8ac352b..ba00105 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -1,7 +1,4 @@ -import copy import os -from unittest.mock import patch -from jsonschema.validators import RefResolver from twined import Twine, exceptions from .base import BaseTestCase @@ -368,231 +365,6 @@ def test_valid_manifest_files(self): # values_file = os.path.join(self.path, "configurations", "valid_with_extra.json") # twine.validate_configuration(file=values_file) - def test_error_raised_when_required_tags_missing_for_validate_input_manifest(self): - """Test that an error is raised when required tags from the file tags template for a dataset are missing when - validating the input manifest. - """ - input_manifest = """ - { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": { - "met_mast_data": { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "met_mast_data", - "tags": {}, - "labels": ["met", "mast", "wind"], - "files": [ - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "tags": {}, - "labels": [], - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - } - ] - } - } - } - """ - - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) - - with self.assertRaises(exceptions.InvalidManifestContents): - twine.validate_input_manifest(source=input_manifest) - - def test_validate_input_manifest_raises_error_if_required_tags_are_not_of_required_type(self): - """Test that an error is raised if the required tags from the file tags template for a dataset are present but - are not of the required type when validating an input manifest. - """ - input_manifest = """ - { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": { - "met_mast_data": { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "met_mast_data", - "tags": {}, - "labels": ["met", "mast", "wind"], - "files": [ - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "tags": %s, - "labels": [], - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - } - ] - } - } - } - """ - - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) - - for tags in ( - '{"manufacturer": "Vestas", "height": 350, "is_recycled": false, "number_of_blades": "3"}', - '{"manufacturer": "Vestas", "height": 350, "is_recycled": "no", "number_of_blades": 3}', - '{"manufacturer": false, "height": 350, "is_recycled": "false", "number_of_blades": 3}', - ): - with self.assertRaises(exceptions.InvalidManifestContents): - twine.validate_input_manifest(source=input_manifest % tags) - - def test_validate_input_manifest_with_required_tags(self): - """Test that validating an input manifest with required tags from the file tags template for a dataset works - for tags meeting the requirements. - """ - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) - twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) - - def test_validate_input_manifest_with_required_tags_for_remote_tag_template_schema(self): - """Test that a remote tag template can be used for validating tags on the datafiles in a manifest.""" - schema_url = "https://refs.schema.octue.com/octue/my-file-type-tag-template/0.0.0.json" - - twine_with_input_manifest_with_remote_tag_template = ( - """ - { - "input_manifest": { - "datasets": { - "met_mast_data": { - "purpose": "A dataset containing meteorological mast data", - "file_tags_template": { - "$ref": "%s" - } - } - } - } - } - """ - % schema_url - ) - - remote_schema = { - "type": "object", - "properties": { - "manufacturer": {"type": "string"}, - "height": {"type": "number"}, - "is_recycled": {"type": "boolean"}, - }, - "required": ["manufacturer", "height", "is_recycled"], - } - - twine = Twine(source=twine_with_input_manifest_with_remote_tag_template) - - original_resolve_from_url = copy.copy(RefResolver.resolve_from_url) - - def patch_if_url_is_schema_url(instance, url): - """Patch the jsonschema validator `RefResolver.resolve_from_url` if the url is the schema URL, otherwise - leave it unpatched. - - :param jsonschema.validators.RefResolver instance: - :param str url: - :return mixed: - """ - if url == schema_url: - return remote_schema - else: - return original_resolve_from_url(instance, url) - - with patch("jsonschema.validators.RefResolver.resolve_from_url", new=patch_if_url_is_schema_url): - twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) - - def test_validate_input_manifest_with_required_tags_in_several_datasets(self): - """Test that required tags from the file tags template are validated separately and correctly for each dataset.""" - twine_with_input_manifest_with_required_tags_for_multiple_datasets = """ - { - "input_manifest": { - "datasets": { - "first_dataset": { - "purpose": "A dataset containing meteorological mast data", - "file_tags_template": { - "type": "object", - "properties": { - "manufacturer": { - "type": "string" - }, - "height": { - "type": "number" - } - } - } - }, - "second_dataset": { - "file_tags_template": { - "type": "object", - "properties": { - "is_recycled": { - "type": "boolean" - }, - "number_of_blades": { - "type": "number" - } - } - } - } - } - } - } - """ - - input_manifest = """ - { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": { - "first_dataset": { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e19", - "name": "first_dataset", - "tags": {}, - "labels": [], - "files": [ - { - "path": "input/datasets/7ead7669/file_0.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "tags": { - "manufacturer": "Vestas", - "height": 503.7 - }, - "labels": [], - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_0.csv" - } - ] - }, - "second_dataset": { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e18", - "name": "second_dataset", - "tags": {}, - "labels": [], - "files": [ - { - "path": "input/datasets/blah/file_1.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "tags": { - "is_recycled": true, - "number_of_blades": 3 - }, - "labels": [], - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e82", - "name": "file_1.csv" - } - ] - } - } - } - """ - - twine = Twine(source=twine_with_input_manifest_with_required_tags_for_multiple_datasets) - twine.validate_input_manifest(source=input_manifest) - def test_error_raised_if_multiple_datasets_have_same_name(self): """Test that an error is raised if the input manifest has more than one dataset with the same name.""" input_manifest = """ diff --git a/twined/twine.py b/twined/twine.py index 4269084..d5693aa 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -183,7 +183,6 @@ def _validate_manifest(self, kind, source, cls=None, **kwargs): self._validate_against_schema(kind, data) self._validate_all_expected_datasets_are_present_in_manifest(manifest_kind=kind, manifest=data) - self._validate_dataset_file_tags(manifest_kind=kind, manifest=data) if cls and inbound: return cls(**data) @@ -210,34 +209,6 @@ def _validate_all_expected_datasets_are_present_in_manifest(self, manifest_kind, f"A dataset named {expected_dataset!r} is expected in the {manifest_kind} but is missing." ) - def _validate_dataset_file_tags(self, manifest_kind, manifest): - """Validate the tags of the files of each dataset in the manifest against the file tags template in the - corresponding dataset field in the given manifest field of the twine. - - :param str manifest_kind: the kind of manifest that's being validated (so the correct schema can be accessed) - :param dict manifest: the manifest whose datasets' files are to be validated - :return None: - """ - # This is the manifest schema included in the twine.json file, not the schema for `manifest.json` files. - manifest_schema = getattr(self, manifest_kind) - - for dataset_name, dataset_schema in manifest_schema["datasets"].items(): - dataset = manifest["datasets"].get(dataset_name) - - if not dataset or isinstance(dataset, str): - continue - - file_tags_template = dataset_schema.get("file_tags_template") - - if not file_tags_template: - continue - - for file in dataset["files"]: - try: - jsonschema_validate(instance=file["tags"], schema=file_tags_template) - except ValidationError as e: - raise exceptions.invalid_contents_map[manifest_kind](str(e)) - @property def available_strands(self): """Tuple of strand names that are found in this twine""" From df2129bde83940b51f3ca44e51fe75d0f50ce8c0 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 17:33:48 +0100 Subject: [PATCH 10/14] ENH: Add Twine.available_manifest_strands property --- twined/twine.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/twined/twine.py b/twined/twine.py index d5693aa..74fa185 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -49,6 +49,7 @@ def __init__(self, **kwargs): setattr(self, name, strand) self._available_strands = tuple(trim_suffix(name, "_schema") for name in vars(self)) + self._available_manifest_strands = tuple(set(self._available_strands) & set(MANIFEST_STRANDS)) def _load_twine(self, source=None): """Load twine from a *.json filename, file-like or a json string and validates twine contents.""" @@ -211,9 +212,20 @@ def _validate_all_expected_datasets_are_present_in_manifest(self, manifest_kind, @property def available_strands(self): - """Tuple of strand names that are found in this twine""" + """Get the names of strands that are found in this twine. + + :return tuple: + """ return self._available_strands + @property + def available_manifest_strands(self): + """Get the names of the manifest strands that are found in this twine. + + :return tuple: + """ + return self._available_manifest_strands + def validate_children(self, source, **kwargs): """Validate that the children values, passed as either a file or a json string, are correct.""" # TODO cache this loaded data keyed on a hashed version of kwargs From 30274336829ba51521656ed79e2f994a5ab9b429 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 17:50:23 +0100 Subject: [PATCH 11/14] TST: Move remaining dataset tag template test fixtures to octue --- tests/test_manifest_strands.py | 109 ++++++++------------------------- 1 file changed, 26 insertions(+), 83 deletions(-) diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index ba00105..a089069 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -36,87 +36,6 @@ class TestManifestStrands(BaseTestCase): } """ - TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE = """ - { - "input_manifest": { - "datasets": { - "met_mast_data": { - "purpose": "A dataset containing meteorological mast data", - "file_tags_template": { - "type": "object", - "properties": { - "manufacturer": { - "type": "string" - }, - "height": { - "type": "number" - }, - "is_recycled": { - "type": "boolean" - }, - "number_of_blades": { - "type": "number" - } - }, - "required": [ - "manufacturer", - "height", - "is_recycled", - "number_of_blades" - ] - } - } - } - } - } - """ - - INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS = """ - { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": { - "met_mast_data": { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "met_mast_data", - "tags": {}, - "labels": ["met", "mast", "wind"], - "files": [ - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "labels": ["mykeyword1", "mykeyword2"], - "tags": { - "manufacturer": "vestas", - "height": 500, - "is_recycled": true, - "number_of_blades": 3 - }, - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - }, - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 1, - "extension": "csv", - "labels": [], - "tags": { - "manufacturer": "vestas", - "height": 500, - "is_recycled": true, - "number_of_blades": 3 - }, - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - } - ] - } - } - } - """ - def test_missing_manifest_files(self): """Ensures that if you try to read values from missing files, the right exceptions get raised""" twine = Twine(source=self.VALID_MANIFEST_STRAND) @@ -367,6 +286,18 @@ def test_valid_manifest_files(self): def test_error_raised_if_multiple_datasets_have_same_name(self): """Test that an error is raised if the input manifest has more than one dataset with the same name.""" + twine = """ + { + "input_manifest": { + "datasets": { + "met_mast_data": { + "purpose": "A dataset containing meteorological mast data" + } + } + } + } + """ + input_manifest = """ { "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", @@ -389,7 +320,7 @@ def test_error_raised_if_multiple_datasets_have_same_name(self): } """ - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + twine = Twine(source=twine) with self.assertRaises(KeyError): twine.validate_input_manifest(source=input_manifest) @@ -398,6 +329,18 @@ def test_deprecation_warning_issued_and_datasets_format_translated_if_datasets_g """Test that, if datasets are given as a list (the old format), a deprecation warning is issued and the list is translated to a dictionary (the new format). """ + twine = """ + { + "input_manifest": { + "datasets": { + "met_mast_data": { + "purpose": "A dataset containing meteorological mast data" + } + } + } + } + """ + input_manifest = """ { "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", @@ -413,7 +356,7 @@ def test_deprecation_warning_issued_and_datasets_format_translated_if_datasets_g } """ - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + twine = Twine(source=twine) with self.assertWarns(DeprecationWarning): manifest = twine.validate_input_manifest(source=input_manifest) From 9b274c779b8c0e8709df2a99ecf767478c0dfe9b Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 18:03:16 +0100 Subject: [PATCH 12/14] ENH: Make Twine available strands properties sets BREAKING CHANGE: This changes a public property on the existing Twine interface. --- twined/twine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/twined/twine.py b/twined/twine.py index 74fa185..ea0fdf8 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -48,8 +48,8 @@ def __init__(self, **kwargs): for name, strand in self._load_twine(**kwargs).items(): setattr(self, name, strand) - self._available_strands = tuple(trim_suffix(name, "_schema") for name in vars(self)) - self._available_manifest_strands = tuple(set(self._available_strands) & set(MANIFEST_STRANDS)) + self._available_strands = set(trim_suffix(name, "_schema") for name in vars(self)) + self._available_manifest_strands = set(set(self._available_strands) & set(MANIFEST_STRANDS)) def _load_twine(self, source=None): """Load twine from a *.json filename, file-like or a json string and validates twine contents.""" From 103c4fdf97ab331e33666752220bbdc70b674006 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Mon, 4 Apr 2022 18:04:08 +0100 Subject: [PATCH 13/14] TST: Test Twine available strands properties --- tests/test_twine.py | 62 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tests/test_twine.py b/tests/test_twine.py index eda8192..a347db9 100644 --- a/tests/test_twine.py +++ b/tests/test_twine.py @@ -62,6 +62,68 @@ def test_broken_json_twine(self): with self.assertRaises(exceptions.InvalidTwineJson): Twine(source=invalid_json_twine) + def test_available_strands_properties(self): + """Test that the `available_strands` and `available_manifest_strands` properties work correctly.""" + twine = """ + { + "configuration_values_schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "The example configuration form", + "description": "The configuration strand of an example twine", + "type": "object", + "properties": { + "n_iterations": { + "description": "An example of an integer configuration variable, called 'n_iterations'.", + "type": "integer", + "minimum": 1, + "maximum": 10, + "default": 5 + } + } + }, + "input_values_schema": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Input Values", + "description": "The input values strand of an example twine, with a required height value", + "type": "object", + "properties": { + "height": { + "description": "An example of an integer value called 'height'", + "type": "integer", + "minimum": 2 + } + }, + "required": ["height"] + }, + "output_values_schema": { + "title": "Output Values", + "description": "The output values strand of an example twine", + "type": "object", + "properties": { + "width": { + "description": "An example of an integer value called 'result'", + "type": "integer", + "minimum": 2 + } + } + }, + "output_manifest": { + "datasets": { + "my-dataset": {} + } + } + } + """ + + twine = Twine(source=twine) + + self.assertEqual( + twine.available_strands, + {"configuration_values", "input_values", "output_values", "output_manifest"}, + ) + + self.assertEqual(twine.available_manifest_strands, {"output_manifest"}) + def test_deprecation_warning_issued_if_datasets_not_given_as_dictionary_in_manifest_strands(self): """Test that, if datasets are given as a list in the manifest strands, a deprecation warning is issued and the list (the old form) is converted to a dictionary (the new form). From d6df9e3f27b210a37e488f18ae8b337cb79aab36 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Tue, 5 Apr 2022 12:48:46 +0100 Subject: [PATCH 14/14] REF: Simplify Twine.available_manifest_strands property calculation --- twined/twine.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/twined/twine.py b/twined/twine.py index ea0fdf8..081b026 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -49,7 +49,7 @@ def __init__(self, **kwargs): setattr(self, name, strand) self._available_strands = set(trim_suffix(name, "_schema") for name in vars(self)) - self._available_manifest_strands = set(set(self._available_strands) & set(MANIFEST_STRANDS)) + self._available_manifest_strands = self._available_strands & set(MANIFEST_STRANDS) def _load_twine(self, source=None): """Load twine from a *.json filename, file-like or a json string and validates twine contents.""" @@ -214,7 +214,7 @@ def _validate_all_expected_datasets_are_present_in_manifest(self, manifest_kind, def available_strands(self): """Get the names of strands that are found in this twine. - :return tuple: + :return set: """ return self._available_strands @@ -222,7 +222,7 @@ def available_strands(self): def available_manifest_strands(self): """Get the names of the manifest strands that are found in this twine. - :return tuple: + :return set: """ return self._available_manifest_strands