From c37943150f625039e228ca45bc6fcea98c27c71e Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Thu, 13 May 2021 16:44:25 +0100 Subject: [PATCH 01/23] IMP: Make tags an object and add labels field to manifest schema --- twined/schema/manifest_schema.json | 42 ++++++++++++++++-------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/twined/schema/manifest_schema.json b/twined/schema/manifest_schema.json index ebbf19b..4c5cd25 100644 --- a/twined/schema/manifest_schema.json +++ b/twined/schema/manifest_schema.json @@ -1,4 +1,19 @@ { + "$defs": { + "tags": { + "description": "Key-value tags associated with the object.", + "type": "object" + }, + "labels": { + "description": "Textual labels associated with the object", + "type": "array", + "items": [ + { + "type": "string" + } + ] + } + }, "type": "object", "properties": { "kind": { @@ -32,17 +47,9 @@ "description": "Name of the dataset", "type": "string" }, - "tags": { - "description": "Textual tags associated with the dataset", - "type": "array", - "items": [ - { - "type": "string" - } - ] - }, + "tags": {"$ref": "#/$defs/tags"}, + "labels": {"$ref": "#/$defs/labels"}, "files": { - "description": "Textual tags associated with the manifest", "type": "array", "items": { "type": "object", @@ -71,20 +78,14 @@ "description": "A posix based timestamp associated with the file. This may, but need not be, the created or modified time. 
", "type": "number" }, - "tags": { - "description": "Textual tags associated with the file", - "type": "array", - "items": [ - { - "type": "string" - } - ] - } + "tags": {"$ref": "#/$defs/tags"}, + "labels": {"$ref": "#/$defs/labels"} }, "required": [ "id", "path", - "tags" + "tags", + "labels" ] } } @@ -92,6 +93,7 @@ "required": [ "id", "tags", + "labels", "files" ] } From a30143b1bd650adaf43e49e717d394533d12549b Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Thu, 13 May 2021 16:47:06 +0100 Subject: [PATCH 02/23] IMP: Add tags template to twine schema --- twined/schema/twine_schema.json | 99 ++++++++++++++++----------------- 1 file changed, 47 insertions(+), 52 deletions(-) diff --git a/twined/schema/twine_schema.json b/twined/schema/twine_schema.json index eda7828..9de28d7 100644 --- a/twined/schema/twine_schema.json +++ b/twined/schema/twine_schema.json @@ -1,4 +1,49 @@ { + "$defs": { + "tags_template": { + "type": "object", + "properties": { + "$schema": { + "type": "string" + }, + "type": { + "const": "object" + }, + "properties": { + "type": "object" + }, + "required": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["$schema", "type", "properties"] + }, + "manifest": { + "type": "array", + "description": "A list of entries, each describing a dataset that should be attached to / made available to the digital twin", + "items": { + "type": "object", + "properties": { + "key": { + "description": "A textual key identifying this dataset within the application/twin", + "type": "string" + }, + "purpose": { + "description": "What data this dataset contains, eg 'the set of data files from the energy production calculation process'", + "type": "string", + "default": "" + }, + "tags_template": { + "$ref": "#/defs/tags_template" + } + }, + "required": ["key"] + } + } + }, "type": "object", "$schema": "http://json-schema.org/2019-09/schema#", "properties": { @@ -52,61 +97,11 @@ "additionalProperties": false } }, - "input_manifest": { - 
"type": "array", - "description": "A list of entries, each describing a dataset that should be attached to / made available to the digital twin", - "items": { - "type": "object", - "properties": { - "key": { - "description": "A textual key identifying this dataset within the application/twin", - "type": "string" - }, - "purpose": { - "description": "What data this dataset contains, eg 'the set of data files from the energy production calculation process'", - "type": "string", - "default": "" - }, - "filters": { - "description": "A search term, using the Lucene Query Language, which can be used to automatically refine the list of available datasets down to ones suitable for use with this twin", - "type": "string", - "default": "" - } - }, - "required": ["key"] - } - }, + "input_manifest": {"$ref": "#/$defs/manifest"}, "input_values_schema": { "type": "object" }, - "output_manifest": { - "type": "array", - "description": "A list of entries, each describing a dataset that may be created/updated when the twin is run", - "items": { - "type": "object", - "properties": { - "key": { - "description": "A textual key identifying this dataset within the application/twin", - "type": "string" - }, - "purpose": { - "description": "What data this dataset contains, eg 'the set of data files from the energy production calculation process'", - "type": "string", - "default": "" - }, - "tags": { - "description": "Comma separated tags that will be applied to the dataset when created", - "type": "array", - "items": [ - { - "type": "string" - } - ] - } - }, - "required": ["key"] - } - }, + "output_manifest": {"$ref": "#/$defs/manifest"}, "output_values_schema": { "type": "object" } From f45c71fb0ec7fb6cef5da93d50c70d792e9eb5a1 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Thu, 13 May 2021 16:50:47 +0100 Subject: [PATCH 03/23] REF: Rename tags template in twine schema; make "$schema" field optional --- twined/schema/twine_schema.json | 8 +++----- 1 file changed, 3 insertions(+), 5 
deletions(-) diff --git a/twined/schema/twine_schema.json b/twined/schema/twine_schema.json index 9de28d7..dfecc25 100644 --- a/twined/schema/twine_schema.json +++ b/twined/schema/twine_schema.json @@ -1,6 +1,6 @@ { "$defs": { - "tags_template": { + "file_tags_template": { "type": "object", "properties": { "$schema": { @@ -19,7 +19,7 @@ } } }, - "required": ["$schema", "type", "properties"] + "required": ["type", "properties"] }, "manifest": { "type": "array", @@ -36,9 +36,7 @@ "type": "string", "default": "" }, - "tags_template": { - "$ref": "#/defs/tags_template" - } + "file_tags_template": {"$ref": "#/$defs/file_tags_template"} }, "required": ["key"] } From dfac6fc5962074c54e77a70347a01bfe13a645c5 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Thu, 13 May 2021 17:11:49 +0100 Subject: [PATCH 04/23] REF: Split getting schema from validating against schema --- twined/twine.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/twined/twine.py b/twined/twine.py index 2a4e18f..7fd6433 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -88,40 +88,60 @@ def _load_json(self, kind, source, **kwargs): return data - def _validate_against_schema(self, strand, data): - """Validates data against a schema, raises exceptions of type InvalidJson if not compliant. + def _get_schema(self, strand): + """Get the schema for the given strand. Can be used to validate: - the twine file contents itself against the present version twine spec - children data against the required schema for the present version twine spec - values data for compliance with schema written in the twine (for strands like input_values_schema) + + :param str strand: + :return dict: """ if strand == "twine": # The data is a twine. A twine *contains* schema, but we also need to verify that it matches a certain # schema itself. The twine schema is distributed with this packaged to ensure version consistency... 
- schema = jsonlib.loads(pkg_resources.resource_string("twined", "schema/twine_schema.json")) + schema_path = "schema/twine_schema.json" elif strand in CHILDREN_STRANDS: # The data is a list of children. The "children" strand of the twine describes matching criteria for # the children, not the schema of the "children" data, which is distributed with this package to ensure # version consistency... - schema = jsonlib.loads(pkg_resources.resource_string("twined", "schema/children_schema.json")) + schema_path = "schema/children_schema.json" elif strand in MANIFEST_STRANDS: # The data is a manifest of files. The "*_manifest" strands of the twine describe matching criteria used to # filter files appropriate for consumption by the digital twin, not the schema of the manifest data, which # is distributed with this package to ensure version consistency... - schema = jsonlib.loads(pkg_resources.resource_string("twined", "schema/manifest_schema.json")) + schema_path = "schema/manifest_schema.json" else: if strand not in SCHEMA_STRANDS: raise exceptions.UnknownStrand(f"Unknown strand {strand}. Try one of {ALL_STRANDS}.") schema_key = strand + "_schema" + try: - schema = getattr(self, schema_key) + return getattr(self, schema_key) except AttributeError: raise exceptions.StrandNotFound(f"Cannot validate - no {schema_key} strand in the twine") + return jsonlib.loads(pkg_resources.resource_string("twined", schema_path)) + + def _validate_against_schema(self, strand, data): + """Validates data against a schema, raises exceptions of type InvalidJson if not compliant. 
+ + Can be used to validate: + - the twine file contents itself against the present version twine spec + - children data against the required schema for the present version twine spec + - values data for compliance with schema written in the twine (for strands like input_values_schema) + + :param str strand: + :param dict data: + :return None: + """ + schema = self._get_schema(strand) + try: jsonschema_validate(instance=data, schema=schema) logger.debug("Validated %s against schema", strand) From 75d7d4537f70cef81c65d20c96483b1f9eefe99b Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Thu, 13 May 2021 17:16:40 +0100 Subject: [PATCH 05/23] IMP: Add file tag template validation for datasets --- twined/twine.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/twined/twine.py b/twined/twine.py index 7fd6433..4b19416 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -179,6 +179,7 @@ def _validate_manifest(self, kind, source, cls=None, **kwargs): data = data.serialise() self._validate_against_schema(kind, data) + self._validate_dataset_file_tags(manifest_kind=kind, manifest=data) if cls and inbound: # TODO verify that all the required keys etc are there @@ -186,6 +187,20 @@ def _validate_manifest(self, kind, source, cls=None, **kwargs): return data + def _validate_dataset_file_tags(self, manifest_kind, manifest): + """Validate the tags of the files of each dataset in the manifest against the file tags template in the + corresponding dataset field in the given manifest field of the twine. 
+ + :param str manifest_kind: + :param dict manifest: + :return None: + """ + manifest_schema = getattr(self, manifest_kind) + + for dataset_schema, dataset in zip(manifest_schema, manifest["datasets"]): + for file in dataset["files"]: + jsonschema_validate(instance=file["tags"], schema=dataset_schema["file_tags_template"]) + @property def available_strands(self): """Tuple of strand names that are found in this twine""" From 773a6f078d984bf6e362f060320ec46805817d03 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Thu, 13 May 2021 17:45:37 +0100 Subject: [PATCH 06/23] FIX: Skip file tag check for datasets with no template; raise manifest error on failed check --- twined/twine.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/twined/twine.py b/twined/twine.py index 4b19416..67ca7c2 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -198,8 +198,16 @@ def _validate_dataset_file_tags(self, manifest_kind, manifest): manifest_schema = getattr(self, manifest_kind) for dataset_schema, dataset in zip(manifest_schema, manifest["datasets"]): + file_tags_template = dataset_schema.get("file_tags_template") + + if not file_tags_template: + continue + for file in dataset["files"]: - jsonschema_validate(instance=file["tags"], schema=dataset_schema["file_tags_template"]) + try: + jsonschema_validate(instance=file["tags"], schema=file_tags_template) + except ValidationError as e: + raise exceptions.invalid_contents_map[manifest_kind](str(e)) @property def available_strands(self): From 4dcde49814ecc82fffb4f48a48353266e2164589 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Thu, 13 May 2021 17:54:13 +0100 Subject: [PATCH 07/23] TST: Update tests and test file tag templates --- tests/test_manifest_strands.py | 280 +++++++++++++++++++++++++++++++-- 1 file changed, 271 insertions(+), 9 deletions(-) diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index 5ca0bb0..a3dbd27 100644 --- a/tests/test_manifest_strands.py +++ 
b/tests/test_manifest_strands.py @@ -39,6 +39,40 @@ class TestManifestStrands(BaseTestCase): } """ + TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE = """ + { + "input_manifest": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "type": "object", + "properties": { + "manufacturer": { + "type": "string" + }, + "height": { + "type": "number" + }, + "is_recycled": { + "type": "boolean" + }, + "number_of_blades": { + "type": "number" + } + }, + "required": [ + "manufacturer", + "height", + "is_recycled", + "number_of_blades" + ] + } + } + ] + } + """ + def test_missing_manifest_files(self): """Ensures that if you try to read values from missing files, the right exceptions get raised""" twine = Twine(source=self.VALID_MANIFEST_STRAND) @@ -62,14 +96,16 @@ def test_valid_manifest_files(self): { "id": "34ad7669-8162-4f64-8cd5-4abe92509e17", "name": "my configuration dataset", - "tags": ["the", "config", "tags"], + "tags": {}, + "labels": ["the", "config", "labels"], "files": [ { "path": "configuration/datasets/7ead7669/file_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", @@ -82,7 +118,8 @@ def test_valid_manifest_files(self): "cluster": 0, "sequence": 1, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", "last_modified": "2019-02-28T22:40:40.633001Z", @@ -103,14 +140,16 @@ def test_valid_manifest_files(self): { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "my meteorological dataset", - "tags": ["met", "mast", "wind"], + "tags": {}, + "labels": ["met", "mast", "wind"], "files": [ { "path": "input/datasets/7ead7669/file_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": 
"abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", @@ -123,7 +162,8 @@ def test_valid_manifest_files(self): "cluster": 0, "sequence": 1, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", "last_modified": "2019-02-28T22:40:40.633001Z", @@ -144,14 +184,16 @@ def test_valid_manifest_files(self): { "id": "1ead7669-8162-4f64-8cd5-4abe92509e17", "name": "my output dataset", - "tags": ["the", "output", "tags"], + "tags": {}, + "labels": ["the", "output", "labels"], "files": [ { "path": "input/datasets/7ead7669/file_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", @@ -164,7 +206,8 @@ def test_valid_manifest_files(self): "cluster": 0, "sequence": 1, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", "last_modified": "2019-02-28T22:40:40.633001Z", @@ -245,6 +288,225 @@ def test_valid_manifest_files(self): # values_file = os.path.join(self.path, "configurations", "valid_with_extra.json") # twine.validate_configuration(file=values_file) + def test_error_raised_when_required_tags_missing_for_validate_input_manifest(self): + """Test that an error is raised when required tags from the file tags template for a dataset are missing when + validating the input manifest. 
+ """ + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "my meteorological dataset", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "tags": {}, + "labels": [], + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] + } + ] + } + """ + + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + + with self.assertRaises(exceptions.InvalidManifestContents): + twine.validate_input_manifest(source=input_manifest) + + def test_validate_input_manifest_raises_error_if_required_tags_are_not_of_required_type(self): + """Test that an error is raised if the required tags from the file tags template for a dataset are present but + are not of the required type when validating an input manifest. + """ + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "my meteorological dataset", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "tags": %s, + "labels": [], + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] + } + ] + } + """ + + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + + for tags in ( + '{"manufacturer": "Vestas", "height": 350, "is_recycled": false, "number_of_blades": "3"}', + '{"manufacturer": "Vestas", "height": 350, "is_recycled": "no", "number_of_blades": 3}', + '{"manufacturer": false, "height": 350, "is_recycled": "false", "number_of_blades": 3}', + ): + with self.assertRaises(exceptions.InvalidManifestContents): + twine.validate_input_manifest(source=input_manifest % tags) + + def 
test_validate_input_manifest_with_required_tags(self): + """Test that validating an input manifest with required tags from the file tags template for a dataset works + for tags meeting the requirements. + """ + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "my meteorological dataset", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "labels": ["mykeyword1", "mykeyword2"], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true, + "number_of_blades": 3 + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + }, + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 1, + "extension": "csv", + "labels": [], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true, + "number_of_blades": 3 + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] + } + ] + } + """ + + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + twine.validate_input_manifest(source=input_manifest) + + def test_validate_input_manifest_with_required_tags_in_several_datasets(self): + """Test that required tags from the file tags template are validated separately and correctly for each dataset.""" + TWINE_WITH_INPUT_MANIFEST_WITH_REQUIRED_TAGS_FOR_MULTIPLE_DATASETS = """ + { + "input_manifest": [ + { + "key": "first_dataset", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "type": "object", + "properties": { + "manufacturer": { + "type": "string" + }, + "height": { + "type": "number" + } + } + } + }, + { + "key": "second_dataset", + "file_tags_template": { + "type": "object", + "properties": { + "is_recycled": { + "type": "boolean" + }, + "number_of_blades": { + "type": "number" + } + } + } + } + 
] + } + """ + + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e19", + "name": "first dataset", + "tags": {}, + "labels": [], + "files": [ + { + "path": "input/datasets/7ead7669/file_0.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "tags": { + "manufacturer": "Vestas", + "height": 503.7 + }, + "labels": [], + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_0.csv" + } + ] + }, + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e18", + "name": "second dataset", + "tags": {}, + "labels": [], + "files": [ + { + "path": "input/datasets/blah/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "tags": { + "is_recycled": true, + "number_of_blades": 3 + }, + "labels": [], + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e82", + "name": "file_1.csv" + } + ] + } + ] + } + """ + + twine = Twine(source=TWINE_WITH_INPUT_MANIFEST_WITH_REQUIRED_TAGS_FOR_MULTIPLE_DATASETS) + twine.validate_input_manifest(source=input_manifest) + if __name__ == "__main__": unittest.main() From 7d0799f06d262bb8964a3c3593c2186ba6641e55 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Fri, 21 May 2021 12:16:34 +0100 Subject: [PATCH 08/23] REF: Separate old tags into labels and new tags --- .../data/configuration_manifest.json | 5 ++-- .../data/input_manifest.json | 14 +++++----- .../data/output_manifest.json | 5 ++-- .../data/output_manifest.json | 26 ++++++++++++------- .../data/apps/example_app/input/manifest.json | 7 ++--- tests/data/apps/example_app/twine.json | 3 +-- 6 files changed, 36 insertions(+), 24 deletions(-) diff --git a/examples/damage_classifier_service/data/configuration_manifest.json b/examples/damage_classifier_service/data/configuration_manifest.json index 0bfcfdd..005e057 100644 --- a/examples/damage_classifier_service/data/configuration_manifest.json +++ b/examples/damage_classifier_service/data/configuration_manifest.json @@ -5,14 +5,15 @@ "id": 
"7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "training data for system abc123", "organisation": "megacorp", - "tags": ["classifier", "damage", "system:abc123"], + "tags": {"system": "abc123"}, + "labels": ["classifier", "damage"], "files": [ { "path": "datasets/7ead7669/blade_damage.mdl", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": [], + "tags": {}, "posix_timestamp": 0, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", diff --git a/examples/met_mast_scada_service/data/input_manifest.json b/examples/met_mast_scada_service/data/input_manifest.json index b4f42be..68e0766 100644 --- a/examples/met_mast_scada_service/data/input_manifest.json +++ b/examples/met_mast_scada_service/data/input_manifest.json @@ -4,14 +4,15 @@ { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "meteorological mast dataset", - "tags": ["met", "mast", "wind", "location:108346"], + "tags": {"location": 108346}, + "labels": ["met", "mast", "wind"], "files": [ { "path": "input/datasets/7ead7669/mast_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": [], + "tags": {}, "posix_timestamp": 1551393630, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", @@ -24,7 +25,7 @@ "cluster": 0, "sequence": 1, "extension": "csv", - "tags": [], + "tags": {}, "posix_timestamp": 1551394230, "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", "last_modified": "2019-02-28T22:50:40.633001Z", @@ -37,14 +38,15 @@ { "id": "5cf9e445-c288-4567-9072-edc31003b022", "name": "scada data exports", - "tags": ["wind", "turbine", "scada", "system:ab32", "location:108346"], + "tags": {"location": 108346, "system": "ab32"}, + "labels": ["wind", "turbine", "scada"], "files": [ { "path": "input/datasets/7ead7669/export_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": [], + "tags": {}, "posix_timestamp": 1551393600, "id": "78fa511f-3e28-4bc2-aa28-7b6a2e8e6ef9", "last_modified": 
"2019-02-28T22:40:00.000000Z", @@ -57,7 +59,7 @@ "cluster": 0, "sequence": 1, "extension": "csv", - "tags": [], + "tags": {}, "posix_timestamp": 1551394200, "id": "204d7316-7ae6-45e3-8f90-443225b21226", "last_modified": "2019-02-28T22:50:00.000000Z", diff --git a/examples/met_mast_scada_service/data/output_manifest.json b/examples/met_mast_scada_service/data/output_manifest.json index 50f7e89..4406386 100644 --- a/examples/met_mast_scada_service/data/output_manifest.json +++ b/examples/met_mast_scada_service/data/output_manifest.json @@ -5,14 +5,15 @@ "id": "4564deca-5654-42e8-aadf-70690b393a30", "name": "visual cross check data", "organisation": "megacorp", - "tags": ["figure", "met", "mast", "scada", "check", "location:108346"], + "tags": {"location": 108346}, + "labels": ["figure", "met", "mast", "scada", "check"], "files": [ { "path": "datasets/7ead7669/cross_check.fig", "cluster": 0, "sequence": 0, "extension": "fig", - "tags": [], + "tags": {}, "posix_timestamp": 1551394800, "id": "38f77fe2-c8c0-49d1-a08c-0928d53a742f", "last_modified": "2019-02-28T23:00:00.000000Z", diff --git a/examples/wind_tunnel_datalogger_service/data/output_manifest.json b/examples/wind_tunnel_datalogger_service/data/output_manifest.json index ae2241e..727d7b3 100644 --- a/examples/wind_tunnel_datalogger_service/data/output_manifest.json +++ b/examples/wind_tunnel_datalogger_service/data/output_manifest.json @@ -5,14 +5,15 @@ "id": "1eba4346-daff-421b-921c-8f1c05d6997d", "name": "Test results from naca0012 section", "organisation": "megacorp", - "tags": ["section:naca0012"], + "tags": {"section": "naca0012"}, "files": [ { "path": "datasets/7ead7669/sys_temp.json", "cluster": 0, "sequence": 0, "extension": "json", - "tags": ["system", "temperature"], + "tags": {}, + "labels": ["system", "temperature"], "posix_timestamp": 1551394800, "id": "afcdef45-da6b-4805-95d6-7a889d81f5b9", "last_modified": "2020-02-28T13:12:42.000000Z", @@ -25,7 +26,8 @@ "cluster": 1, "sequence": 0, "extension": 
"json", - "tags": ["wind", "tunnel", "velocity", "profile", "background", "turbulence"], + "tags": {}, + "labels": ["wind", "tunnel", "velocity", "profile", "background", "turbulence"], "posix_timestamp": 1551394800, "id": "3667aa6d-ee64-4cd4-a2fd-e72bcdc65791", "last_modified": "2020-02-28T13:24:43.000000Z", @@ -38,7 +40,8 @@ "cluster": 2, "sequence": 0, "extension": "dat", - "tags": ["pressure", "coefficient", "cp", "profile", "reference"], + "tags": {}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "310bc665-fe8c-4948-b821-0ad43fcd480d", "last_modified": "2020-02-28T13:47:23.000000Z", @@ -51,7 +54,8 @@ "cluster": 3, "sequence": 0, "extension": "dat", - "tags": ["pressure", "coefficient", "cp", "profile", "reference", "alpha:0"], + "tags": {"alpha": 0}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "c3a6c14d-19d8-44da-9aa5-119798f53d15", "last_modified": "2020-02-28T13:54:24.000000Z", @@ -64,7 +68,8 @@ "cluster": 3, "sequence": 1, "extension": "dat", - "tags": ["pressure", "coefficient", "cp", "profile", "reference", "alpha:1"], + "tags": {"alpha": 1}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "fac62036-722c-481a-9daf-87897c4872ec", "last_modified": "2020-02-28T13:56:21.000000Z", @@ -77,7 +82,8 @@ "cluster": 3, "sequence": 2, "extension": "dat", - "tags": ["pressure", "coefficient", "cp", "profile", "reference", "alpha:2"], + "tags": {"alpha": 2}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "70cda7f6-c97d-4b99-9156-2ff6f5947b7e", "last_modified": "2020-02-28T13:57:03.000000Z", @@ -90,7 +96,8 @@ "cluster": 3, "sequence": 3, "extension": "dat", - "tags": ["pressure", "coefficient", "cp", "profile", "reference", "alpha:3"], + "tags": {"alpha": 3}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], 
"posix_timestamp": 1551394800, "id": "5ab4015a-608a-4ecd-9e30-95aee82d86d9", "last_modified": "2020-02-28T13:58:46.000000Z", @@ -103,7 +110,8 @@ "cluster": 3, "sequence": 4, "extension": "dat", - "tags": ["pressure", "coefficient", "cp", "profile", "reference", "alpha:4"], + "tags": {"alpha": 4}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "3ba97d4b-002d-4ca3-a6b0-54573a5eefde", "last_modified": "2020-02-28T13:59:32.000000Z", diff --git a/tests/data/apps/example_app/input/manifest.json b/tests/data/apps/example_app/input/manifest.json index 047cc10..19d7bcc 100644 --- a/tests/data/apps/example_app/input/manifest.json +++ b/tests/data/apps/example_app/input/manifest.json @@ -5,14 +5,15 @@ { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "my meteorological dataset", - "tags": ["met", "mast", "wind"], + "tags": {}, + "labels": ["met", "mast", "wind"], "files": [ { "path": "input/datasets/7ead7669/file_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": [], + "tags": {}, "posix_timestamp": null, "data_file": { "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", @@ -27,7 +28,7 @@ "cluster": 0, "sequence": 1, "extension": "csv", - "tags": [], + "tags": {}, "posix_timestamp": null, "data_file": { "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", diff --git a/tests/data/apps/example_app/twine.json b/tests/data/apps/example_app/twine.json index 4319080..0453713 100644 --- a/tests/data/apps/example_app/twine.json +++ b/tests/data/apps/example_app/twine.json @@ -57,8 +57,7 @@ "output_manifest": [ { "key": "production_data", - "purpose": "A dataset containing production data", - "tags": ["production", "wind"] + "purpose": "A dataset containing production data" } ], "output_values_schema": { From d8d94066994b1b579e4865d9dcd82f2fd3f4d778 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Fri, 21 May 2021 12:26:13 +0100 Subject: [PATCH 09/23] FIX: Add missing labels fields to files in example and test 
manifest files --- .../data/configuration_manifest.json | 1 + examples/met_mast_scada_service/data/input_manifest.json | 4 ++++ examples/met_mast_scada_service/data/output_manifest.json | 1 + .../wind_tunnel_datalogger_service/data/output_manifest.json | 1 + tests/data/apps/example_app/input/manifest.json | 2 ++ 5 files changed, 9 insertions(+) diff --git a/examples/damage_classifier_service/data/configuration_manifest.json b/examples/damage_classifier_service/data/configuration_manifest.json index 005e057..9fb2598 100644 --- a/examples/damage_classifier_service/data/configuration_manifest.json +++ b/examples/damage_classifier_service/data/configuration_manifest.json @@ -14,6 +14,7 @@ "sequence": 0, "extension": "csv", "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", diff --git a/examples/met_mast_scada_service/data/input_manifest.json b/examples/met_mast_scada_service/data/input_manifest.json index 68e0766..eddffba 100644 --- a/examples/met_mast_scada_service/data/input_manifest.json +++ b/examples/met_mast_scada_service/data/input_manifest.json @@ -13,6 +13,7 @@ "sequence": 0, "extension": "csv", "tags": {}, + "labels": [], "posix_timestamp": 1551393630, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", @@ -26,6 +27,7 @@ "sequence": 1, "extension": "csv", "tags": {}, + "labels": [], "posix_timestamp": 1551394230, "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", "last_modified": "2019-02-28T22:50:40.633001Z", @@ -47,6 +49,7 @@ "sequence": 0, "extension": "csv", "tags": {}, + "labels": [], "posix_timestamp": 1551393600, "id": "78fa511f-3e28-4bc2-aa28-7b6a2e8e6ef9", "last_modified": "2019-02-28T22:40:00.000000Z", @@ -60,6 +63,7 @@ "sequence": 1, "extension": "csv", "tags": {}, + "labels": [], "posix_timestamp": 1551394200, "id": "204d7316-7ae6-45e3-8f90-443225b21226", "last_modified": "2019-02-28T22:50:00.000000Z", diff --git 
a/examples/met_mast_scada_service/data/output_manifest.json b/examples/met_mast_scada_service/data/output_manifest.json index 4406386..2d37665 100644 --- a/examples/met_mast_scada_service/data/output_manifest.json +++ b/examples/met_mast_scada_service/data/output_manifest.json @@ -14,6 +14,7 @@ "sequence": 0, "extension": "fig", "tags": {}, + "labels": [], "posix_timestamp": 1551394800, "id": "38f77fe2-c8c0-49d1-a08c-0928d53a742f", "last_modified": "2019-02-28T23:00:00.000000Z", diff --git a/examples/wind_tunnel_datalogger_service/data/output_manifest.json b/examples/wind_tunnel_datalogger_service/data/output_manifest.json index 727d7b3..aae804d 100644 --- a/examples/wind_tunnel_datalogger_service/data/output_manifest.json +++ b/examples/wind_tunnel_datalogger_service/data/output_manifest.json @@ -6,6 +6,7 @@ "name": "Test results from naca0012 section", "organisation": "megacorp", "tags": {"section": "naca0012"}, + "labels": [], "files": [ { "path": "datasets/7ead7669/sys_temp.json", diff --git a/tests/data/apps/example_app/input/manifest.json b/tests/data/apps/example_app/input/manifest.json index 19d7bcc..8244102 100644 --- a/tests/data/apps/example_app/input/manifest.json +++ b/tests/data/apps/example_app/input/manifest.json @@ -14,6 +14,7 @@ "sequence": 0, "extension": "csv", "tags": {}, + "labels": [], "posix_timestamp": null, "data_file": { "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", @@ -29,6 +30,7 @@ "sequence": 1, "extension": "csv", "tags": {}, + "labels": [], "posix_timestamp": null, "data_file": { "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", From e3eef6584e4e16b20ce765f5c1ffe625414dae89 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Fri, 21 May 2021 16:07:36 +0100 Subject: [PATCH 10/23] FIX: Ensure datafiles are matched up by key to name between twine.json and manifest.json --- tests/test_manifest_strands.py | 18 +++++++++--------- twined/exceptions.py | 4 ++++ twined/schema/manifest_schema.json | 7 +------ twined/twine.py | 16 +++++++++++++++- 4 
files changed, 29 insertions(+), 16 deletions(-) diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index 6934d51..43b695b 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -84,14 +84,14 @@ def test_missing_manifest_files(self): twine.validate_output_manifest(source=file) def test_valid_manifest_files(self): - """Ensures that a manifest file will validate""" + """Ensures that a manifest file will validate.""" valid_configuration_manifest = """ { "id": "3ead7669-8162-4f64-8cd5-4abe92509e17", "datasets": [ { "id": "34ad7669-8162-4f64-8cd5-4abe92509e17", - "name": "my configuration dataset", + "name": "configuration_files_data", "tags": {}, "labels": ["the", "config", "labels"], "files": [ @@ -135,7 +135,7 @@ def test_valid_manifest_files(self): "datasets": [ { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "my meteorological dataset", + "name": "met_mast_data", "tags": {}, "labels": ["met", "mast", "wind"], "files": [ @@ -179,7 +179,7 @@ def test_valid_manifest_files(self): "datasets": [ { "id": "1ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "my output dataset", + "name": "output_files_data", "tags": {}, "labels": ["the", "output", "labels"], "files": [ @@ -294,7 +294,7 @@ def test_error_raised_when_required_tags_missing_for_validate_input_manifest(sel "datasets": [ { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "my meteorological dataset", + "name": "met_mast_data", "tags": {}, "labels": ["met", "mast", "wind"], "files": [ @@ -329,7 +329,7 @@ def test_validate_input_manifest_raises_error_if_required_tags_are_not_of_requir "datasets": [ { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "my meteorological dataset", + "name": "met_mast_data", "tags": {}, "labels": ["met", "mast", "wind"], "files": [ @@ -369,7 +369,7 @@ def test_validate_input_manifest_with_required_tags(self): "datasets": [ { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "my meteorological dataset", + 
"name": "met_mast_data", "tags": {}, "labels": ["met", "mast", "wind"], "files": [ @@ -456,7 +456,7 @@ def test_validate_input_manifest_with_required_tags_in_several_datasets(self): "datasets": [ { "id": "7ead7669-8162-4f64-8cd5-4abe92509e19", - "name": "first dataset", + "name": "first_dataset", "tags": {}, "labels": [], "files": [ @@ -477,7 +477,7 @@ def test_validate_input_manifest_with_required_tags_in_several_datasets(self): }, { "id": "7ead7669-8162-4f64-8cd5-4abe92509e18", - "name": "second dataset", + "name": "second_dataset", "tags": {}, "labels": [], "files": [ diff --git a/twined/exceptions.py b/twined/exceptions.py index 959556d..fb03716 100644 --- a/twined/exceptions.py +++ b/twined/exceptions.py @@ -124,6 +124,10 @@ class InvalidManifestContents(InvalidManifest, ValidationError): """Raised when the manifest files are missing or do not match tags, sequences, clusters, extensions etc as required""" +class DatasetNameIsNotUnique(InvalidManifest): + """Raise when a dataset's name is not unique within its manifest.""" + + # --------------------- Exceptions relating to access of data using the Twine instance ------------------------ # TODO This is related to filtering files from a manifest. Determine whether this belongs here, diff --git a/twined/schema/manifest_schema.json b/twined/schema/manifest_schema.json index a9165e7..fdb2131 100644 --- a/twined/schema/manifest_schema.json +++ b/twined/schema/manifest_schema.json @@ -76,12 +76,7 @@ } } }, - "required": [ - "id", - "tags", - "labels", - "files" - ] + "required": ["id", "name", "tags", "labels", "files"] } } }, diff --git a/twined/twine.py b/twined/twine.py index 67ca7c2..db1fa23 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -195,9 +195,23 @@ def _validate_dataset_file_tags(self, manifest_kind, manifest): :param dict manifest: :return None: """ + # This is the manifest schema included in the twine.json file, not the schema for manifest.json files. 
manifest_schema = getattr(self, manifest_kind) - for dataset_schema, dataset in zip(manifest_schema, manifest["datasets"]): + for dataset_schema in manifest_schema: + datasets = [dataset for dataset in manifest["datasets"] if dataset["name"] == dataset_schema["key"]] + + if not datasets: + continue + + if len(datasets) > 1: + raise exceptions.DatasetNameIsNotUnique( + f"There is more than one dataset named {dataset_schema['key']!r} - ensure each dataset within a " + f"manifest is uniquely named." + ) + + dataset = datasets.pop(0) + file_tags_template = dataset_schema.get("file_tags_template") if not file_tags_template: From 20c3503215d986318b3361b4d76dba6f6906ab9a Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Fri, 21 May 2021 16:29:23 +0100 Subject: [PATCH 11/23] IMP: Require datasets to be inside "datasets" field in "manifest" field of twine.json --- tests/data/apps/example_app/twine.json | 36 +++--- tests/data/apps/simple_app/twine.json | 5 +- tests/test_manifest_strands.py | 160 +++++++++++++------------ twined/schema/twine_schema.json | 46 ++++--- twined/twine.py | 2 +- 5 files changed, 137 insertions(+), 112 deletions(-) diff --git a/tests/data/apps/example_app/twine.json b/tests/data/apps/example_app/twine.json index 0453713..1a14fc8 100644 --- a/tests/data/apps/example_app/twine.json +++ b/tests/data/apps/example_app/twine.json @@ -31,16 +31,18 @@ "purpose": "A URI for accessing an external database from within a twin or analysis" } ], - "input_manifest": [ - { - "key": "met_mast_data", - "purpose": "A dataset containing meteorological mast data" - }, - { - "key": "scada_data", - "purpose": "A dataset containing scada data" - } - ], + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data" + }, + { + "key": "scada_data", + "purpose": "A dataset containing scada data" + } + ] + }, "input_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "Input Values", @@ 
-54,12 +56,14 @@ } } }, - "output_manifest": [ - { - "key": "production_data", - "purpose": "A dataset containing production data" - } - ], + "output_manifest": { + "datasets": [ + { + "key": "production_data", + "purpose": "A dataset containing production data" + } + ] + }, "output_values_schema": { "title": "Output Values", "description": "The output values strand of an example twine", diff --git a/tests/data/apps/simple_app/twine.json b/tests/data/apps/simple_app/twine.json index 340959a..31fd99c 100644 --- a/tests/data/apps/simple_app/twine.json +++ b/tests/data/apps/simple_app/twine.json @@ -67,6 +67,7 @@ } } }, - "output_manifest": [ - ] + "output_manifest": { + "datasets": [] + } } diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index 43b695b..f7897eb 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -10,62 +10,70 @@ class TestManifestStrands(BaseTestCase): VALID_MANIFEST_STRAND = """ { - "configuration_manifest": [ - { - "key": "configuration_files_data", - "purpose": "A dataset containing files used in configuration" - } - ], - "input_manifest": [ - { - "key": "met_mast_data", - "purpose": "A dataset containing meteorological mast data" - }, - { - "key": "scada_data", - "purpose": "A dataset containing scada data" - } - ], - "output_manifest": [ - { - "key": "output_files_data", - "purpose": "A dataset containing output results" - } - ] + "configuration_manifest": { + "datasets": [ + { + "key": "configuration_files_data", + "purpose": "A dataset containing files used in configuration" + } + ] + }, + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data" + }, + { + "key": "scada_data", + "purpose": "A dataset containing scada data" + } + ] + }, + "output_manifest": { + "datasets": [ + { + "key": "output_files_data", + "purpose": "A dataset containing output results" + } + ] + } } """ 
TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE = """ { - "input_manifest": [ - { - "key": "met_mast_data", - "purpose": "A dataset containing meteorological mast data", - "file_tags_template": { - "type": "object", - "properties": { - "manufacturer": { - "type": "string" - }, - "height": { - "type": "number" - }, - "is_recycled": { - "type": "boolean" + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "type": "object", + "properties": { + "manufacturer": { + "type": "string" + }, + "height": { + "type": "number" + }, + "is_recycled": { + "type": "boolean" + }, + "number_of_blades": { + "type": "number" + } }, - "number_of_blades": { - "type": "number" - } - }, - "required": [ - "manufacturer", - "height", - "is_recycled", - "number_of_blades" - ] + "required": [ + "manufacturer", + "height", + "is_recycled", + "number_of_blades" + ] + } } - } - ] + ] + } } """ @@ -416,37 +424,39 @@ def test_validate_input_manifest_with_required_tags_in_several_datasets(self): """Test that required tags from the file tags template are validated separately and correctly for each dataset.""" TWINE_WITH_INPUT_MANIFEST_WITH_REQUIRED_TAGS_FOR_MULTIPLE_DATASETS = """ { - "input_manifest": [ - { - "key": "first_dataset", - "purpose": "A dataset containing meteorological mast data", - "file_tags_template": { - "type": "object", - "properties": { - "manufacturer": { - "type": "string" - }, - "height": { - "type": "number" + "input_manifest": { + "datasets": [ + { + "key": "first_dataset", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "type": "object", + "properties": { + "manufacturer": { + "type": "string" + }, + "height": { + "type": "number" + } } } - } - }, - { - "key": "second_dataset", - "file_tags_template": { - "type": "object", - "properties": { - "is_recycled": { - "type": "boolean" - }, - "number_of_blades": { - "type": "number" + }, + 
{ + "key": "second_dataset", + "file_tags_template": { + "type": "object", + "properties": { + "is_recycled": { + "type": "boolean" + }, + "number_of_blades": { + "type": "number" + } } } } - } - ] + ] + } } """ diff --git a/twined/schema/twine_schema.json b/twined/schema/twine_schema.json index 6b3a015..9b3b71b 100644 --- a/twined/schema/twine_schema.json +++ b/twined/schema/twine_schema.json @@ -22,24 +22,34 @@ "required": ["type", "properties"] }, "manifest": { - "type": "array", - "description": "A list of entries, each describing a dataset that should be attached to / made available to the digital twin", - "items": { - "type": "object", - "properties": { - "key": { - "description": "A textual key identifying this dataset within the application/twin", - "type": "string" - }, - "purpose": { - "description": "What data this dataset contains, eg 'the set of data files from the energy production calculation process'", - "type": "string", - "default": "" - }, - "file_tags_template": {"$ref": "#/$defs/file_tags_template"} - }, - "required": ["key"] - } + "type": "object", + "properties": { + "datasets": { + "type": "array", + "description": "A list of entries, each describing a dataset that should be attached to / made available to the digital twin", + "items": { + "type": "object", + "properties": { + "key": { + "description": "A textual key identifying this dataset within the application/twin", + "type": "string" + }, + "purpose": { + "description": "What data this dataset contains, eg 'the set of data files from the energy production calculation process'", + "type": "string", + "default": "" + }, + "file_tags_template": { + "$ref": "#/$defs/file_tags_template" + } + }, + "required": [ + "key" + ] + } + } + }, + "required": ["datasets"] } }, "type": "object", diff --git a/twined/twine.py b/twined/twine.py index db1fa23..9ccb7e3 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -198,7 +198,7 @@ def _validate_dataset_file_tags(self, manifest_kind, manifest): # 
This is the manifest schema included in the twine.json file, not the schema for manifest.json files. manifest_schema = getattr(self, manifest_kind) - for dataset_schema in manifest_schema: + for dataset_schema in manifest_schema["datasets"]: datasets = [dataset for dataset in manifest["datasets"] if dataset["name"] == dataset_schema["key"]] if not datasets: From 9868d31f3679ff93a9ddbb976021ac14afc530e8 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Tue, 25 May 2021 17:22:08 +0100 Subject: [PATCH 12/23] IMP: Allow remote refs to be used for tag templates --- tests/test_manifest_strands.py | 107 +++++++++++++++++++------------- twined/schema/twine_schema.json | 3 +- 2 files changed, 65 insertions(+), 45 deletions(-) diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index f7897eb..b6af80f 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -77,6 +77,52 @@ class TestManifestStrands(BaseTestCase): } """ + INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "met_mast_data", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "labels": ["mykeyword1", "mykeyword2"], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true, + "number_of_blades": 3 + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + }, + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 1, + "extension": "csv", + "labels": [], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true, + "number_of_blades": 3 + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] + } + ] + } + """ + def test_missing_manifest_files(self): """Ensures that if you try to read values from missing files, 
the right exceptions get raised""" twine = Twine(source=self.VALID_MANIFEST_STRAND) @@ -371,54 +417,29 @@ def test_validate_input_manifest_with_required_tags(self): """Test that validating an input manifest with required tags from the file tags template for a dataset works for tags meeting the requirements. """ - input_manifest = """ + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) + + def test_validate_input_manifest_with_required_tags_for_remote_tag_template_schema(self): + """Test that a remote tag template can be used for validating tags on the datafiles in a manifest. """ + twine_with_input_manifest_with_remote_tag_template = """ { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "met_mast_data", - "tags": {}, - "labels": ["met", "mast", "wind"], - "files": [ - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "labels": ["mykeyword1", "mykeyword2"], - "tags": { - "manufacturer": "vestas", - "height": 500, - "is_recycled": true, - "number_of_blades": 3 - }, - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - }, - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 1, - "extension": "csv", - "labels": [], - "tags": { - "manufacturer": "vestas", - "height": 500, - "is_recycled": true, - "number_of_blades": 3 - }, - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "$ref": "https://refs.schema.octue.com/octue/my-file-type-tag-template/0.0.0.json" } - ] - } - ] + } + ] + } } """ - twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) - twine.validate_input_manifest(source=input_manifest) + 
twine = Twine(source=twine_with_input_manifest_with_remote_tag_template) + twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) def test_validate_input_manifest_with_required_tags_in_several_datasets(self): """Test that required tags from the file tags template are validated separately and correctly for each dataset.""" diff --git a/twined/schema/twine_schema.json b/twined/schema/twine_schema.json index 9b3b71b..23fb839 100644 --- a/twined/schema/twine_schema.json +++ b/twined/schema/twine_schema.json @@ -18,8 +18,7 @@ "type": "string" } } - }, - "required": ["type", "properties"] + } }, "manifest": { "type": "object", From fff398909f693d43a69c7db46c26c69a4601a5ab Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Tue, 25 May 2021 17:31:12 +0100 Subject: [PATCH 13/23] IMP: Make tag template schema more stringent --- twined/schema/twine_schema.json | 46 +++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/twined/schema/twine_schema.json b/twined/schema/twine_schema.json index 23fb839..6e94a2e 100644 --- a/twined/schema/twine_schema.json +++ b/twined/schema/twine_schema.json @@ -1,24 +1,38 @@ { "$defs": { "file_tags_template": { - "type": "object", - "properties": { - "$schema": { - "type": "string" - }, - "type": { - "const": "object" - }, - "properties": { - "type": "object" + "oneOf": [ + { + "type": "object", + "properties": { + "$schema": { + "type": "string" + }, + "type": { + "const": "object" + }, + "properties": { + "type": "object" + }, + "required": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["type", "properties"] }, - "required": { - "type": "array", - "items": { - "type": "string" - } + { + "type": "object", + "properties": { + "$ref": { + "type": "string" + } + }, + "required": ["$ref"] } - } + ] }, "manifest": { "type": "object", From c0015341acddb37add98d179d874d6ef6e2d5ad8 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Tue, 25 May 2021 
18:09:46 +0100 Subject: [PATCH 14/23] TST: Mock remote file tag schema in test --- tests/test_manifest_strands.py | 46 ++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index b6af80f..8466ef2 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -1,5 +1,8 @@ +import copy import os import unittest +from unittest.mock import patch +from jsonschema.validators import RefResolver from twined import Twine, exceptions from .base import BaseTestCase @@ -421,8 +424,11 @@ def test_validate_input_manifest_with_required_tags(self): twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) def test_validate_input_manifest_with_required_tags_for_remote_tag_template_schema(self): - """Test that a remote tag template can be used for validating tags on the datafiles in a manifest. """ - twine_with_input_manifest_with_remote_tag_template = """ + """Test that a remote tag template can be used for validating tags on the datafiles in a manifest.""" + schema_url = "https://refs.schema.octue.com/octue/my-file-type-tag-template/0.0.0.json" + + twine_with_input_manifest_with_remote_tag_template = ( + """ { "input_manifest": { "datasets": [ @@ -430,16 +436,46 @@ def test_validate_input_manifest_with_required_tags_for_remote_tag_template_sche "key": "met_mast_data", "purpose": "A dataset containing meteorological mast data", "file_tags_template": { - "$ref": "https://refs.schema.octue.com/octue/my-file-type-tag-template/0.0.0.json" + "$ref": "%s" } } ] } } - """ + """ + % schema_url + ) + + remote_schema = { + "type": "object", + "properties": { + "manufacturer": {"type": "string"}, + "height": {"type": "number"}, + "is_recycled": {"type": "boolean"}, + "number_of_blades": {"type": "number"}, + }, + "required": ["manufacturer", "height", "is_recycled", "number_of_blades"], + } twine = 
Twine(source=twine_with_input_manifest_with_remote_tag_template) - twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) + + original_resolve_from_url = copy.copy(RefResolver.resolve_from_url) + + def patch_if_url_is_schema_url(instance, url): + """Patch the jsonschema validator `RefResolver.resolve_from_url` if the url is the schema URL, otherwise + leave it unpatched. + + :param jsonschema.validators.RefResolver instance: + :param str url: + :return mixed: + """ + if url == schema_url: + return remote_schema + else: + return original_resolve_from_url(instance, url) + + with patch("jsonschema.validators.RefResolver.resolve_from_url", new=patch_if_url_is_schema_url): + twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) def test_validate_input_manifest_with_required_tags_in_several_datasets(self): """Test that required tags from the file tags template are validated separately and correctly for each dataset.""" From 4c4ca59b98a8fc4239708bd70f586d332d1f60f4 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Tue, 25 May 2021 18:18:23 +0100 Subject: [PATCH 15/23] TST: Test error is raised if >1 dataset has same name in manifest --- tests/test_manifest_strands.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index 8466ef2..956560e 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -570,6 +570,35 @@ def test_validate_input_manifest_with_required_tags_in_several_datasets(self): twine = Twine(source=TWINE_WITH_INPUT_MANIFEST_WITH_REQUIRED_TAGS_FOR_MULTIPLE_DATASETS) twine.validate_input_manifest(source=input_manifest) + def test_error_raised_if_multiple_datasets_have_same_name(self): + """Test that an error is raised if the input manifest has more than one dataset with the same name.""" + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": 
"7ead7669-8162-4f64-8cd5-4abe92509e19", + "name": "met_mast_data", + "tags": {}, + "labels": [], + "files": [] + }, + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e18", + "name": "met_mast_data", + "tags": {}, + "labels": [], + "files": [] + } + ] + } + """ + + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + + with self.assertRaises(exceptions.DatasetNameIsNotUnique): + twine.validate_input_manifest(source=input_manifest) + if __name__ == "__main__": unittest.main() From 8786d8dc8b8881e3e37f25577f5badcd98f2472f Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Wed, 26 May 2021 12:26:22 +0100 Subject: [PATCH 16/23] DOC: Update tags format and add labels in docs --- docs/source/anatomy_manifest.rst | 9 ++++++--- docs/source/examples.rst | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/source/anatomy_manifest.rst b/docs/source/anatomy_manifest.rst index eff3e78..c59fb59 100644 --- a/docs/source/anatomy_manifest.rst +++ b/docs/source/anatomy_manifest.rst @@ -171,7 +171,8 @@ for examples. "metadata": { }, "size_bytes": 59684813, - "tags": "lidar, helpful, information, like, sequence:1", // Searchable, parsable and filterable + "tags": {"sequence": 1}, + "labels": ["lidar", "helpful", "information", "like"], // Searchable, parsable and filterable }, { "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", @@ -181,7 +182,8 @@ for examples. "metadata": { }, "size_bytes": 59684813, - "tags": "lidar, helpful, information, like, sequence:2", // Searchable, parsable and filterable + "tags": {"sequence": 2}, + "labels": ["lidar", "helpful", "information", "like"] // Searchable, parsable and filterable }, { "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", @@ -191,7 +193,8 @@ for examples. 
"metadata": { }, "size_bytes": 484813, - "tags": "report", // Searchable, parsable and filterable + "tags": {}, + "labels": ["report"] // Searchable, parsable and filterable } ] }, diff --git a/docs/source/examples.rst b/docs/source/examples.rst index c6bf981..060902f 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -164,7 +164,8 @@ copied straight from the unit test cases, so you can always check there to see h { "key": "production_data", "purpose": "A dataset containing production data", - "tags": "production, wind" + "tags": {"cleaned": True}, + "labels": ["production", "wind"] } ], "output_values_schema": { From 3c4c50e2a220bd581441eb6f70deb72a121d3dba Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Wed, 26 May 2021 12:34:59 +0100 Subject: [PATCH 17/23] DOC: Remove mention of defaults from credentials strand docs skip_ci_tests --- docs/source/anatomy_credentials.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/source/anatomy_credentials.rst b/docs/source/anatomy_credentials.rst index 857d5b7..9f6e447 100644 --- a/docs/source/anatomy_credentials.rst +++ b/docs/source/anatomy_credentials.rst @@ -20,7 +20,7 @@ Defining the Credentials Strand =============================== This is the simplest of the strands, containing a list of credentials (whose ``NAMES_SHOULD_BE_SHOUTY_SNAKE_CASE``) with -a reminder of the purpose. Defaults can also be provided, useful for running on local or closed networks. +a reminder of the purpose. .. code-block:: javascript @@ -35,9 +35,7 @@ a reminder of the purpose. 
Defaults can also be provided, useful for running on "purpose": "Token for accessing a 3rd party API service" }, { - "name": "SECRET_THE_THIRD", - "purpose": "Usually a big secret but sometimes has a convenient non-secret default, like a sandbox or local database", - "default": "postgres://pguser:pgpassword@localhost:5432/pgdb" + "name": "SECRET_THE_THIRD" } ] } From 0d167dcf9b85ba9c88c69f3a9f9264b170e33bf5 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Wed, 26 May 2021 15:21:09 +0100 Subject: [PATCH 18/23] DOC: Use new manifest strand format in twine examples skip_ci_tests --- docs/source/anatomy_manifest.rst | 4 +- docs/source/examples.rst | 37 +++++++++++-------- examples/damage_classifier_service/twine.json | 18 +++++---- .../strands/input_manifest.json | 17 +++++++++ .../strands/input_manifest_filters.json | 15 -------- .../strands/output_manifest.json | 12 ++++++ .../strands/output_manifest_filters.json | 10 ----- .../strands/output_manifest.json | 12 ++++++ .../strands/output_manifest_filters.json | 10 ----- tests/data/apps/empty_app/twine.json | 10 +++-- 10 files changed, 80 insertions(+), 65 deletions(-) create mode 100644 examples/met_mast_scada_service/strands/input_manifest.json delete mode 100644 examples/met_mast_scada_service/strands/input_manifest_filters.json create mode 100644 examples/met_mast_scada_service/strands/output_manifest.json delete mode 100644 examples/met_mast_scada_service/strands/output_manifest_filters.json create mode 100644 examples/wind_tunnel_datalogger_service/strands/output_manifest.json delete mode 100644 examples/wind_tunnel_datalogger_service/strands/output_manifest_filters.json diff --git a/docs/source/anatomy_manifest.rst b/docs/source/anatomy_manifest.rst index c59fb59..d261169 100644 --- a/docs/source/anatomy_manifest.rst +++ b/docs/source/anatomy_manifest.rst @@ -103,7 +103,7 @@ for examples. .. accordion-row:: Show twine containing this strand - .. 
literalinclude:: ../../examples/met_mast_scada_service/strands/input_manifest_filters.json + .. literalinclude:: ../../examples/met_mast_scada_service/strands/input_manifest.json :language: javascript .. accordion-row:: Show a matching file manifest @@ -120,7 +120,7 @@ for examples. .. accordion-row:: Show twine containing this strand - .. literalinclude:: ../../examples/met_mast_scada_service/strands/output_manifest_filters.json + .. literalinclude:: ../../examples/met_mast_scada_service/strands/output_manifest.json :language: javascript .. accordion-row:: Show a matching file manifest diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 060902f..108da7a 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -81,8 +81,9 @@ copied straight from the unit test cases, so you can always check there to see h } } }, - "output_manifest": [ - ], + "output_manifest": { + "datasets": [] + }, "output_values_schema": { "title": "Output Values schema for the foundation cost twin", "description": "The response supplied to a change in input values will always conform to this schema.", @@ -125,12 +126,14 @@ copied straight from the unit test cases, so you can always check there to see h "purpose": "Token for accessing a 3rd party weather API service" } ], - "input_manifest": [ - { - "key": "wind_resource_data", - "purpose": "A dataset containing Wind Resource Grid files" - } - ], + "input_manifest": { + "datasets": [ + { + "key": "wind_resource_data", + "purpose": "A dataset containing Wind Resource Grid files" + } + ] + }, "input_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "Input Values for the weather service twin", @@ -160,14 +163,16 @@ copied straight from the unit test cases, so you can always check there to see h } } }, - "output_manifest": [ - { - "key": "production_data", - "purpose": "A dataset containing production data", - "tags": {"cleaned": True}, - "labels": ["production", "wind"] - } - ], + 
"output_manifest": { + "datasets": [ + { + "key": "production_data", + "purpose": "A dataset containing production data", + "tags": {"cleaned": true}, + "labels": ["production", "wind"] + } + ] + }, "output_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "Output Values for the metocean service twin", diff --git a/examples/damage_classifier_service/twine.json b/examples/damage_classifier_service/twine.json index 3788728..49ff280 100644 --- a/examples/damage_classifier_service/twine.json +++ b/examples/damage_classifier_service/twine.json @@ -1,11 +1,13 @@ { // Manifest strands contain lists, with one entry for each required dataset - "configuration_manifest": [ - { - // Once the inputs are validated, your analysis program can use this key to access the dataset - "key": "trained_model", - // General notes, which are helpful as a reminder to users of the service - "purpose": "The trained classifier" - } - ] + "configuration_manifest": { + "datasets": [ + { + // Once the inputs are validated, your analysis program can use this key to access the dataset + "key": "trained_model", + // General notes, which are helpful as a reminder to users of the service + "purpose": "The trained classifier" + } + ] + } } diff --git a/examples/met_mast_scada_service/strands/input_manifest.json b/examples/met_mast_scada_service/strands/input_manifest.json new file mode 100644 index 0000000..402202d --- /dev/null +++ b/examples/met_mast_scada_service/strands/input_manifest.json @@ -0,0 +1,17 @@ +{ + // Manifest strands contain lists, with one entry for each required dataset + "input_manifest": { + "datasets": [ + { + // Once the inputs are validated, your analysis program can use this key to access the dataset + "key": "met_mast_data", + // General notes, which are helpful as a reminder to users of the service + "purpose": "A dataset containing meteorological mast data" + }, + { + "key": "scada_data", + "purpose": "A dataset containing scada data" + } + ] + 
} +} diff --git a/examples/met_mast_scada_service/strands/input_manifest_filters.json b/examples/met_mast_scada_service/strands/input_manifest_filters.json deleted file mode 100644 index 9f1dd5f..0000000 --- a/examples/met_mast_scada_service/strands/input_manifest_filters.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - // Manifest strands contain lists, with one entry for each required dataset - "input_manifest_filters": [ - { - // Once the inputs are validated, your analysis program can use this key to access the dataset - "key": "met_mast_data", - // General notes, which are helpful as a reminder to users of the service - "purpose": "A dataset containing meteorological mast data" - }, - { - "key": "scada_data", - "purpose": "A dataset containing scada data" - } - ] -} diff --git a/examples/met_mast_scada_service/strands/output_manifest.json b/examples/met_mast_scada_service/strands/output_manifest.json new file mode 100644 index 0000000..86a43cd --- /dev/null +++ b/examples/met_mast_scada_service/strands/output_manifest.json @@ -0,0 +1,12 @@ +{ + "output_manifest": { + "datasets": [ + { + // Twined will prepare a manifest with this key, which you can add to during the analysis or once it's complete + "key": "met_scada_checks", + // General notes, which are helpful as a reminder to users of the service + "purpose": "A dataset containing figures showing correlations between mast and scada data" + } + ] + } +} diff --git a/examples/met_mast_scada_service/strands/output_manifest_filters.json b/examples/met_mast_scada_service/strands/output_manifest_filters.json deleted file mode 100644 index e08fa91..0000000 --- a/examples/met_mast_scada_service/strands/output_manifest_filters.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "output_manifest_filters": [ - { - // Twined will prepare a manifest with this key, which you can add to during the analysis or once its complete - "key": "met_scada_checks", - // General notes, which are helpful as a reminder to users of the service -
"purpose": "A dataset containing figures showing correlations between mast and scada data" - } - ] -} diff --git a/examples/wind_tunnel_datalogger_service/strands/output_manifest.json b/examples/wind_tunnel_datalogger_service/strands/output_manifest.json new file mode 100644 index 0000000..86a43cd --- /dev/null +++ b/examples/wind_tunnel_datalogger_service/strands/output_manifest.json @@ -0,0 +1,12 @@ +{ + "output_manifest": { + "datasets": [ + { + // Twined will prepare a manifest with this key, which you can add to during the analysis or once its complete + "key": "met_scada_checks", + // General notes, which are helpful as a reminder to users of the service + "purpose": "A dataset containing figures showing correlations between mast and scada data" + } + ] + } +} diff --git a/examples/wind_tunnel_datalogger_service/strands/output_manifest_filters.json b/examples/wind_tunnel_datalogger_service/strands/output_manifest_filters.json deleted file mode 100644 index e08fa91..0000000 --- a/examples/wind_tunnel_datalogger_service/strands/output_manifest_filters.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "output_manifest_filters": [ - { - // Twined will prepare a manifest with this key, which you can add to during the analysis or once its complete - "key": "met_scada_checks", - // General notes, which are helpful as a reminder to users of the service - "purpose": "A dataset containing figures showing correlations between mast and scada data" - } - ] -} diff --git a/tests/data/apps/empty_app/twine.json b/tests/data/apps/empty_app/twine.json index c83fb06..8c410a5 100644 --- a/tests/data/apps/empty_app/twine.json +++ b/tests/data/apps/empty_app/twine.json @@ -11,8 +11,9 @@ }, "credentials": [ ], - "input_manifest": [ - ], + "input_manifest": { + "datasets": [] + }, "input_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "Input Values", @@ -21,8 +22,9 @@ "properties": { } }, - "output_manifest": [ - ], + "output_manifest": { + "datasets": [] + }, 
"output_values_schema": { "title": "Output Values", "description": "The output values strand of an example twine", From c420c8bb96b950db5810b39d38ce51d51e5f5bcc Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Wed, 26 May 2021 15:48:40 +0100 Subject: [PATCH 19/23] DOC: Remove mention of filtering from manifest anatomy doc --- docs/source/anatomy_manifest.rst | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/docs/source/anatomy_manifest.rst b/docs/source/anatomy_manifest.rst index d261169..1d1c8e9 100644 --- a/docs/source/anatomy_manifest.rst +++ b/docs/source/anatomy_manifest.rst @@ -49,38 +49,13 @@ associated files) are required / produced. Describing Manifests ==================== -Manifest-based strands are a **description of what files are needed**, NOT a list of specific files or datasets. This is -a tricky concept, but important, since services should be reusable and applicable to a range of similar datasets. - -The purpose of the manifest strands is to provide a helper to a wider system providing datafiles to digital twins. - -The manifest strands therefore use **tagging** - they contain a ``filters`` field, which should be valid -`Apache Lucene `_ search syntax. This is a powerful syntax, whose tagging features allow -us to specify incredibly broad, or extremely narrow searches (even down to a known unique result). See the tabs below -for examples. - - -.. NOTE:: - - Tagging syntax is extremely powerful. 
Below, you'll see how this enables a digital twin to specify things like: - - *"OK, I need this digital twin to always have access to a model file for a particular system, containing trained model data"* - - *"Uh, so I need an ordered sequence of files, that are CSV files from a meteorological mast."* - - This allows **twined** to check that the input files contain what is needed, enables quick and easy - extraction of subgroups or particular sequences of files within a dataset, and enables management systems - to map candidate datasets to twins that might be used to process them. - - +Manifest-based strands are a **description of what files are needed**. The purpose of the manifest strands is to +provide a helper to a wider system providing datafiles to digital twins. .. tabs:: .. group-tab:: Configuration Manifest Strand - Here we construct an extremely tight filter, which connects this digital twin to - datasets from a specific system. - .. accordion:: .. accordion-row:: Show twine containing this strand @@ -113,9 +88,6 @@ for examples. .. group-tab:: Output Manifest Strand - Output figure files (with *.fig extension) containing figures enabling a visual check - of correlation between met mast and scada data. - .. accordion:: .. accordion-row:: Show twine containing this strand From a576dff2cb146efe810ce98fdeda3c1b711ef270 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Wed, 26 May 2021 16:05:37 +0100 Subject: [PATCH 20/23] DOC: Add file tag template documentation --- docs/source/anatomy_manifest.rst | 226 +++++++++++++++++++++---------- 1 file changed, 155 insertions(+), 71 deletions(-) diff --git a/docs/source/anatomy_manifest.rst b/docs/source/anatomy_manifest.rst index 1d1c8e9..f3a342b 100644 --- a/docs/source/anatomy_manifest.rst +++ b/docs/source/anatomy_manifest.rst @@ -100,78 +100,162 @@ provide a helper to a wider system providing datafiles to digital twins. .. 
literalinclude:: ../../examples/met_mast_scada_service/data/output_manifest.json :language: javascript -.. - TODO - clean up or remove this section - .. _how_filtering_works: - - How Filtering Works - =================== - - It's the job of **twined** to make sure of two things: - - 1. make sure the *twine* file itself is valid, - - - **File data (input, output)** - - Files are not streamed directly to the digital twin (this would require extreme bandwidth in whatever system is - orchestrating all the twins). Instead, files should be made available on the local storage system; i.e. a volume - mounted to whatever container or VM the digital twin runs in. - - Groups of files are described by a ``manifest``, where a manifest is (in essence) a catalogue of files in a - dataset. - - A digital twin might receive multiple manifests, if it uses multiple datasets. For example, it could use a 3D - point cloud LiDAR dataset, and a meteorological dataset. - - .. code-block:: javascript - - { - "manifests": [ - { - "type": "dataset", - "id": "3c15c2ba-6a32-87e0-11e9-3baa66a632fe", // UUID of the manifest - "files": [ - { - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", // UUID of that file - "sha1": "askjnkdfoisdnfkjnkjsnd" // for quality control to check correctness of file contents - "name": "Lidar - 4 to 10 Dec.csv", - "path": "local/file/path/to/folder/containing/it/", - "type": "csv", - "metadata": { - }, - "size_bytes": 59684813, - "tags": {"sequence": 1}, - "labels": ["lidar", "helpful", "information", "like"], // Searchable, parsable and filterable +.. _file_tag_templates: + +File tag templates +================== + +Datafiles can be tagged with key-value pairs of relevant metadata that can be used in analyses. Certain datasets might +need one set of metadata on each file, while others might need a different set. The required (or optional) file tags can be +specified in the twine in the ``file_tags_template`` property of each dataset of any ``manifest`` strand. 
Each file in +the corresponding manifest strand is then validated against its dataset's file tag template to ensure the required tags +are present. + +.. accordion:: + + .. accordion-row:: Show twine containing this strand with a file tag template + + .. code-block:: javascript + + { + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "type": "object", + "properties": { + "manufacturer": {"type": "string"}, + "height": {"type": "number"}, + "is_recycled": {"type": "boolean"} + }, + "required": ["manufacturer", "height", "is_recycled"] + } + } + ] + } + } + + .. accordion-row:: Show a matching file manifest + + .. code-block:: javascript + + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "met_mast_data", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "labels": ["mykeyword1", "mykeyword2"], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + }, + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 1, + "extension": "csv", + "labels": [], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] + } + ] + } + + +TODO - clean up or remove this section + +.. _how_filtering_works: + +How Filtering Works +=================== + +It's the job of **twined** to make sure of two things: + +1. make sure the *twine* file itself is valid, + + + **File data (input, output)** + + Files are not streamed directly to the digital twin (this would require extreme bandwidth in whatever system is + orchestrating all the twins). 
Instead, files should be made available on the local storage system; i.e. a volume + mounted to whatever container or VM the digital twin runs in. + + Groups of files are described by a ``manifest``, where a manifest is (in essence) a catalogue of files in a + dataset. + + A digital twin might receive multiple manifests, if it uses multiple datasets. For example, it could use a 3D + point cloud LiDAR dataset, and a meteorological dataset. + + .. code-block:: javascript + + { + "manifests": [ + { + "type": "dataset", + "id": "3c15c2ba-6a32-87e0-11e9-3baa66a632fe", // UUID of the manifest + "files": [ + { + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", // UUID of that file + "sha1": "askjnkdfoisdnfkjnkjsnd" // for quality control to check correctness of file contents + "name": "Lidar - 4 to 10 Dec.csv", + "path": "local/file/path/to/folder/containing/it/", + "type": "csv", + "metadata": { + }, + "size_bytes": 59684813, + "tags": {"sequence": 1}, + "labels": ["lidar", "helpful", "information", "like"], // Searchable, parsable and filterable + }, + { + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "Lidar - 11 to 18 Dec.csv", + "path": "local/file/path/to/folder/containing/it/", + "type": "csv", + "metadata": { }, - { - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "Lidar - 11 to 18 Dec.csv", - "path": "local/file/path/to/folder/containing/it/", - "type": "csv", - "metadata": { - }, - "size_bytes": 59684813, - "tags": {"sequence": 2}, - "labels": ["lidar", "helpful", "information", "like"] // Searchable, parsable and filterable + "size_bytes": 59684813, + "tags": {"sequence": 2}, + "labels": ["lidar", "helpful", "information", "like"] // Searchable, parsable and filterable + }, + { + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "Lidar report.pdf", + "path": "local/file/path/to/folder/containing/it/", + "type": "pdf", + "metadata": { }, - { - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "Lidar report.pdf", - "path": 
"local/file/path/to/folder/containing/it/", - "type": "pdf", - "metadata": { - }, - "size_bytes": 484813, - "tags": {}, - "labels": ["report"] // Searchable, parsable and filterable - } - ] - }, - { - // ... another dataset manifest ... - } - ] - } + "size_bytes": 484813, + "tags": {}, + "labels": ["report"] // Searchable, parsable and filterable + } + ] + }, + { + // ... another dataset manifest ... + } + ] + } From 53136d7c3d20267813a1496e176e8a53bd2f76ce Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Wed, 26 May 2021 16:49:10 +0100 Subject: [PATCH 21/23] DOC: Add example of remote tag templates --- docs/source/anatomy_manifest.rst | 250 ++++++++++++++++++++++--------- tests/test_manifest_strands.py | 3 +- 2 files changed, 180 insertions(+), 73 deletions(-) diff --git a/docs/source/anatomy_manifest.rst b/docs/source/anatomy_manifest.rst index f3a342b..365139f 100644 --- a/docs/source/anatomy_manifest.rst +++ b/docs/source/anatomy_manifest.rst @@ -113,77 +113,185 @@ specified in the twine in the ``file_tags_template`` property of each dataset of the corresponding manifest strand is then validated against its dataset's file tag template to ensure the required tags are present. -.. accordion:: - - .. accordion-row:: Show twine containing this strand with a file tag template - - .. code-block:: javascript - - { - "input_manifest": { - "datasets": [ - { - "key": "met_mast_data", - "purpose": "A dataset containing meteorological mast data", - "file_tags_template": { - "type": "object", - "properties": { - "manufacturer": {"type": "string"}, - "height": {"type": "number"}, - "is_recycled": {"type": "boolean"} - }, - "required": ["manufacturer", "height", "is_recycled"] - } - } - ] - } - } - - .. accordion-row:: Show a matching file manifest - - .. 
code-block:: javascript - - { - "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", - "datasets": [ - { - "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "met_mast_data", - "tags": {}, - "labels": ["met", "mast", "wind"], - "files": [ - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 0, - "extension": "csv", - "labels": ["mykeyword1", "mykeyword2"], - "tags": { - "manufacturer": "vestas", - "height": 500, - "is_recycled": true - }, - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - }, - { - "path": "input/datasets/7ead7669/file_1.csv", - "cluster": 0, - "sequence": 1, - "extension": "csv", - "labels": [], - "tags": { - "manufacturer": "vestas", - "height": 500, - "is_recycled": true - }, - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "file_1.csv" - } - ] - } - ] - } +.. tabs:: + + .. group-tab:: Manifest strand with file tag template + + The example below is for an input manifest, but the format is the same for configuration and output manifests. + + .. accordion:: + + .. accordion-row:: Show twine containing a manifest strand with a file tag template + + .. code-block:: javascript + + { + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "type": "object", + "properties": { + "manufacturer": {"type": "string"}, + "height": {"type": "number"}, + "is_recycled": {"type": "boolean"} + }, + "required": ["manufacturer", "height", "is_recycled"] + } + } + ] + } + } + + .. accordion-row:: Show a matching file manifest + + .. 
code-block:: javascript + + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "met_mast_data", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "labels": ["mykeyword1", "mykeyword2"], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + }, + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 1, + "extension": "csv", + "labels": [], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] + } + ] + } + + .. group-tab:: Manifest strand with a remote file tag template + + A remote reference can also be given for a file tag template. If the tag template is stored somewhere public, this is + useful for sharing the template between one or more teams working on the same type of data. + + The example below is for an input manifest, but the format is the same for configuration and output manifests. + It also shows two different tag templates being specified for two different types of dataset required by the + manifest. + + .. accordion:: + + .. accordion-row:: Show twine using a remote tag template + + .. code-block:: javascript + + { + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "$ref": "https://refs.schema.octue.com/octue/my-file-type-tag-template/0.0.0.json" + } + }, + { + "key": "some_other_kind_of_dataset", + "purpose": "A dataset containing something else", + "file_tags_template": { + "$ref": "https://refs.schema.octue.com/octue/another-file-type-tag-template/0.0.0.json" + } + } + ] + } + } + + ..
accordion-row:: Show a matching file manifest + + .. code-block:: javascript + + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "met_mast_data", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "labels": ["mykeyword1", "mykeyword2"], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + }, + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 1, + "extension": "csv", + "labels": [], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] + }, + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e29", + "name": "some_other_kind_of_dataset", + "tags": {}, + "labels": ["my-label"], + "files": [ + { + "path": "input/datasets/7eadpp9/interesting_file.dat", + "cluster": 0, + "sequence": 0, + "extension": "dat", + "labels": [], + "tags": { + "length": 864, + "orientation_angle": 85 + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae9071", + "name": "interesting_file.dat" + }] + } + ] + } TODO - clean up or remove this section diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index 956560e..c42a291 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -452,9 +452,8 @@ def test_validate_input_manifest_with_required_tags_for_remote_tag_template_sche "manufacturer": {"type": "string"}, "height": {"type": "number"}, "is_recycled": {"type": "boolean"}, - "number_of_blades": {"type": "number"}, }, - "required": ["manufacturer", "height", "is_recycled", "number_of_blades"], + "required": ["manufacturer", "height", "is_recycled"], } twine =
Twine(source=twine_with_input_manifest_with_remote_tag_template) From c38b07e4644b6ecb441b28cba434240f0125fbb0 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Wed, 2 Jun 2021 12:39:23 +0100 Subject: [PATCH 22/23] DOC: Remove confusing example of tags from docs --- docs/source/anatomy_manifest.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/anatomy_manifest.rst b/docs/source/anatomy_manifest.rst index 365139f..6d93ae4 100644 --- a/docs/source/anatomy_manifest.rst +++ b/docs/source/anatomy_manifest.rst @@ -335,7 +335,7 @@ It's the job of **twined** to make sure of two things: "metadata": { }, "size_bytes": 59684813, - "tags": {"sequence": 1}, + "tags": {"special_number": 1}, "labels": ["lidar", "helpful", "information", "like"], // Searchable, parsable and filterable }, { @@ -346,7 +346,7 @@ It's the job of **twined** to make sure of two things: "metadata": { }, "size_bytes": 59684813, - "tags": {"sequence": 2}, + "tags": {"special_number": 2}, "labels": ["lidar", "helpful", "information", "like"] // Searchable, parsable and filterable }, { From 9fee5670bfeef788ae4051235f80699479baea89 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Wed, 2 Jun 2021 13:09:48 +0100 Subject: [PATCH 23/23] DOC: Add purpose to credential in docs --- docs/source/anatomy_credentials.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/anatomy_credentials.rst b/docs/source/anatomy_credentials.rst index 9f6e447..cbed9f0 100644 --- a/docs/source/anatomy_credentials.rst +++ b/docs/source/anatomy_credentials.rst @@ -35,7 +35,8 @@ a reminder of the purpose. "purpose": "Token for accessing a 3rd party API service" }, { - "name": "SECRET_THE_THIRD" + "name": "SECRET_THE_THIRD", + "purpose": "Another secret, like a password for a sandbox or local database" } ] }