diff --git a/docs/source/anatomy_credentials.rst b/docs/source/anatomy_credentials.rst index 857d5b7..cbed9f0 100644 --- a/docs/source/anatomy_credentials.rst +++ b/docs/source/anatomy_credentials.rst @@ -20,7 +20,7 @@ Defining the Credentials Strand =============================== This is the simplest of the strands, containing a list of credentials (whose ``NAMES_SHOULD_BE_SHOUTY_SNAKE_CASE``) with -a reminder of the purpose. Defaults can also be provided, useful for running on local or closed networks. +a reminder of the purpose. .. code-block:: javascript @@ -36,8 +36,7 @@ a reminder of the purpose. Defaults can also be provided, useful for running on }, { "name": "SECRET_THE_THIRD", - "purpose": "Usually a big secret but sometimes has a convenient non-secret default, like a sandbox or local database", - "default": "postgres://pguser:pgpassword@localhost:5432/pgdb" + "purpose": "Another secret, like a password for a sandbox or local database" } ] } diff --git a/docs/source/anatomy_manifest.rst b/docs/source/anatomy_manifest.rst index eff3e78..6d93ae4 100644 --- a/docs/source/anatomy_manifest.rst +++ b/docs/source/anatomy_manifest.rst @@ -49,38 +49,13 @@ associated files) are required / produced. Describing Manifests ==================== -Manifest-based strands are a **description of what files are needed**, NOT a list of specific files or datasets. This is -a tricky concept, but important, since services should be reusable and applicable to a range of similar datasets. - -The purpose of the manifest strands is to provide a helper to a wider system providing datafiles to digital twins. - -The manifest strands therefore use **tagging** - they contain a ``filters`` field, which should be valid -`Apache Lucene `_ search syntax. This is a powerful syntax, whose tagging features allow -us to specify incredibly broad, or extremely narrow searches (even down to a known unique result). See the tabs below -for examples. - - -.. NOTE:: - - Tagging syntax is extremely powerful. Below, you'll see how this enables a digital twin to specify things like: - - *"OK, I need this digital twin to always have access to a model file for a particular system, containing trained model data"* - - *"Uh, so I need an ordered sequence of files, that are CSV files from a meteorological mast."* - - This allows **twined** to check that the input files contain what is needed, enables quick and easy - extraction of subgroups or particular sequences of files within a dataset, and enables management systems - to map candidate datasets to twins that might be used to process them. - - +Manifest-based strands are a **description of what files are needed**. The purpose of the manifest strands is to +provide a helper to a wider system providing datafiles to digital twins. .. tabs:: .. group-tab:: Configuration Manifest Strand - Here we construct an extremely tight filter, which connects this digital twin to - datasets from a specific system. - .. accordion:: .. accordion-row:: Show twine containing this strand @@ -103,7 +78,7 @@ for examples. .. accordion-row:: Show twine containing this strand - .. literalinclude:: ../../examples/met_mast_scada_service/strands/input_manifest_filters.json + .. literalinclude:: ../../examples/met_mast_scada_service/strands/input_manifest.json :language: javascript .. accordion-row:: Show a matching file manifest @@ -113,14 +88,11 @@ for examples. .. 
group-tab:: Output Manifest Strand - Output figure files (with *.fig extension) containing figures enabling a visual check - of correlation between met mast and scada data. - .. accordion:: .. accordion-row:: Show twine containing this strand - .. literalinclude:: ../../examples/met_mast_scada_service/strands/output_manifest_filters.json + .. literalinclude:: ../../examples/met_mast_scada_service/strands/output_manifest.json :language: javascript .. accordion-row:: Show a matching file manifest @@ -128,75 +100,270 @@ for examples. .. literalinclude:: ../../examples/met_mast_scada_service/data/output_manifest.json :language: javascript -.. - TODO - clean up or remove this section - .. _how_filtering_works: +.. _file_tag_templates: + +File tag templates +================== - How Filtering Works - =================== +Datafiles can be tagged with key-value pairs of relevant metadata that can be used in analyses. Certain datasets might +need one set of metadata on each file, while others might need a different set. The required (or optional) file tags can be +specified in the twine in the ``file_tags_template`` property of each dataset of any ``manifest`` strand. Each file in +the corresponding manifest strand is then validated against its dataset's file tag template to ensure the required tags +are present. + +.. tabs:: - It's the job of **twined** to make sure of two things: + .. group-tab:: Manifest strand with file tag template - 1. make sure the *twine* file itself is valid, + The example below is for an input manifest, but the format is the same for configuration and output manifests. + .. accordion:: - **File data (input, output)** + .. accordion-row:: Show twine containing a manifest strand with a file tag template - Files are not streamed directly to the digital twin (this would require extreme bandwidth in whatever system is - orchestrating all the twins). Instead, files should be made available on the local storage system; i.e. a volume - mounted to whatever container or VM the digital twin runs in. + .. code-block:: javascript - Groups of files are described by a ``manifest``, where a manifest is (in essence) a catalogue of files in a - dataset. + { + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "type": "object", + "properties": { + "manufacturer": {"type": "string"}, + "height": {"type": "number"}, + "is_recycled": {"type": "boolean"} + }, + "required": ["manufacturer", "height", "is_recycled"] + } + } + ] + } + } - A digital twin might receive multiple manifests, if it uses multiple datasets. For example, it could use a 3D - point cloud LiDAR dataset, and a meteorological dataset. + .. accordion-row:: Show a matching file manifest - .. code-block:: javascript + .. 
code-block:: javascript - { - "manifests": [ - { - "type": "dataset", - "id": "3c15c2ba-6a32-87e0-11e9-3baa66a632fe", // UUID of the manifest + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "met_mast_data", + "tags": {}, + "labels": ["met", "mast", "wind"], "files": [ - { - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", // UUID of that file - "sha1": "askjnkdfoisdnfkjnkjsnd" // for quality control to check correctness of file contents - "name": "Lidar - 4 to 10 Dec.csv", - "path": "local/file/path/to/folder/containing/it/", - "type": "csv", - "metadata": { - }, - "size_bytes": 59684813, - "tags": "lidar, helpful, information, like, sequence:1", // Searchable, parsable and filterable + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "labels": ["mykeyword1", "mykeyword2"], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true }, - { - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "Lidar - 11 to 18 Dec.csv", - "path": "local/file/path/to/folder/containing/it/", - "type": "csv", - "metadata": { - }, - "size_bytes": 59684813, - "tags": "lidar, helpful, information, like, sequence:2", // Searchable, parsable and filterable + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + }, + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 1, + "extension": "csv", + "labels": [], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true }, - { - "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", - "name": "Lidar report.pdf", - "path": "local/file/path/to/folder/containing/it/", - "type": "pdf", - "metadata": { - }, - "size_bytes": 484813, - "tags": "report", // Searchable, parsable and filterable - } + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } ] - }, - { - // ... another dataset manifest ... - } - ] - } + } + ] + } + + .. group-tab:: Manifest strand with a remote file tag template + + A remote reference can also be given for a file tag template. If the tag template somewhere public, this is + useful for sharing the template between one or more teams working on the same type of data. + + The example below is for an input manifest, but the format is the same for configuration and output manifests. + It also shows two different tag templates being specified for two different types of dataset required by the + manifest. + + .. accordion:: + + .. accordion-row:: Show twine using a remote tag template + + .. code-block:: javascript + + { + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "$ref": "https://refs.schema.octue.com/octue/my-file-type-tag-template/0.0.0.json" + } + }, + { + "key": "some_other_kind_of_dataset", + "purpose": "A dataset containing something else", + "file_tags_template": { + "$ref": "https://refs.schema.octue.com/octue/another-file-type-tag-template/0.0.0.json" + } + } + ] + } + } + + .. accordion-row:: Show a matching file manifest + + .. 
code-block:: javascript + + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "met_mast_data", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "labels": ["mykeyword1", "mykeyword2"], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + }, + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 1, + "extension": "csv", + "labels": [], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] + }, + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e29", + "name": "some_other_kind_of_dataset", + "tags": {}, + "labels": ["my-label"], + "files": [ + { + "path": "input/datasets/7eadpp9/interesting_file.dat", + "cluster": 0, + "sequence": 0, + "extension": "dat", + "labels": [], + "tags": { + "length": 864, + "orientation_angle": 85 + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae9071", + "name": "interesting_file.csv" + }, + } + ] + } + + +TODO - clean up or remove this section + +.. _how_filtering_works: + +How Filtering Works +=================== + +It's the job of **twined** to make sure of two things: + +1. make sure the *twine* file itself is valid, + + + **File data (input, output)** + + Files are not streamed directly to the digital twin (this would require extreme bandwidth in whatever system is + orchestrating all the twins). Instead, files should be made available on the local storage system; i.e. a volume + mounted to whatever container or VM the digital twin runs in. + + Groups of files are described by a ``manifest``, where a manifest is (in essence) a catalogue of files in a + dataset. + + A digital twin might receive multiple manifests, if it uses multiple datasets. For example, it could use a 3D + point cloud LiDAR dataset, and a meteorological dataset. + + .. code-block:: javascript + + { + "manifests": [ + { + "type": "dataset", + "id": "3c15c2ba-6a32-87e0-11e9-3baa66a632fe", // UUID of the manifest + "files": [ + { + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", // UUID of that file + "sha1": "askjnkdfoisdnfkjnkjsnd" // for quality control to check correctness of file contents + "name": "Lidar - 4 to 10 Dec.csv", + "path": "local/file/path/to/folder/containing/it/", + "type": "csv", + "metadata": { + }, + "size_bytes": 59684813, + "tags": {"special_number": 1}, + "labels": ["lidar", "helpful", "information", "like"], // Searchable, parsable and filterable + }, + { + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "Lidar - 11 to 18 Dec.csv", + "path": "local/file/path/to/folder/containing/it/", + "type": "csv", + "metadata": { + }, + "size_bytes": 59684813, + "tags": {"special_number": 2}, + "labels": ["lidar", "helpful", "information", "like"] // Searchable, parsable and filterable + }, + { + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "Lidar report.pdf", + "path": "local/file/path/to/folder/containing/it/", + "type": "pdf", + "metadata": { + }, + "size_bytes": 484813, + "tags": {}, + "labels": ["report"] // Searchable, parsable and filterable + } + ] + }, + { + // ... another dataset manifest ... 
+ } + ] + } diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 110ba65..108da7a 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -47,7 +47,7 @@ copied straight from the unit test cases, so you can always check there to see h "description": "This twine helps compute the cost of an installed foundation.", "children": [ ], - "configuration_schema": { + "configuration_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "Foundation cost twin configuration", "description": "Set config parameters and constants at startup of the twin.", @@ -81,8 +81,9 @@ copied straight from the unit test cases, so you can always check there to see h } } }, - "output_manifest": [ - ], + "output_manifest": { + "datasets": [] + }, "output_values_schema": { "title": "Output Values schema for the foundation cost twin", "description": "The response supplied to a change in input values will always conform to this schema.", @@ -125,13 +126,14 @@ copied straight from the unit test cases, so you can always check there to see h "purpose": "Token for accessing a 3rd party weather API service" } ], - "input_manifest": [ - { - "key": "wind_resource_data", - "purpose": "A dataset containing Wind Resource Grid files", - "filters": "tags:(wind AND resource) files:(extension:wrg)" - } - ], + "input_manifest": { + "datasets": [ + { + "key": "wind_resource_data", + "purpose": "A dataset containing Wind Resource Grid files" + } + ] + }, "input_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "Input Values for the weather service twin", @@ -161,13 +163,16 @@ copied straight from the unit test cases, so you can always check there to see h } } }, - "output_manifest": [ - { - "key": "production_data", - "purpose": "A dataset containing production data", - "tags": "production, wind" - } - ], + "output_manifest": { + "datasets": [ + { + "key": "production_data", + "purpose": "A dataset containing production data", + "tags": {"cleaned": true}, + "labels": ["production", "wind"] + } + ] + }, "output_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "Output Values for the metocean service twin", @@ -184,4 +189,4 @@ copied straight from the unit test cases, so you can always check there to see h } } } - } \ No newline at end of file + } diff --git a/examples/damage_classifier_service/data/configuration_manifest.json b/examples/damage_classifier_service/data/configuration_manifest.json index 8acf4c2..9fb2598 100644 --- a/examples/damage_classifier_service/data/configuration_manifest.json +++ b/examples/damage_classifier_service/data/configuration_manifest.json @@ -5,14 +5,16 @@ "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "training data for system abc123", "organisation": "megacorp", - "tags": "classifier, damage, system:abc123", + "tags": {"system": "abc123"}, + "labels": ["classifier", "damage"], "files": [ { "path": "datasets/7ead7669/blade_damage.mdl", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": "", + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", diff --git a/examples/damage_classifier_service/twine.json b/examples/damage_classifier_service/twine.json index 60f02b2..49ff280 100644 --- a/examples/damage_classifier_service/twine.json +++ b/examples/damage_classifier_service/twine.json @@ -1,14 +1,13 @@ { // Manifest strands contain lists, with one entry for each required dataset - 
"configuration_manifest": [ - { - // Once the inputs are validated, your analysis program can use this key to access the dataset - "key": "trained_model", - // General notes, which are helpful as a reminder to users of the service - "purpose": "The trained classifier", - // Issues a strict search for data provided by megacorp, containing *.mdl files tagged as - // classifiers for blade damage on system abc123 - "filters": "organisation: megacorp AND tags:(classifier AND damage AND system:abc123) AND files:(extension:mdl)" - } - ] + "configuration_manifest": { + "datasets": [ + { + // Once the inputs are validated, your analysis program can use this key to access the dataset + "key": "trained_model", + // General notes, which are helpful as a reminder to users of the service + "purpose": "The trained classifier" + } + ] + } } diff --git a/examples/met_mast_scada_service/data/input_manifest.json b/examples/met_mast_scada_service/data/input_manifest.json index a2b3242..eddffba 100644 --- a/examples/met_mast_scada_service/data/input_manifest.json +++ b/examples/met_mast_scada_service/data/input_manifest.json @@ -4,14 +4,16 @@ { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "meteorological mast dataset", - "tags": "met, mast, wind, location:108346", + "tags": {"location": 108346}, + "labels": ["met", "mast", "wind"], "files": [ { "path": "input/datasets/7ead7669/mast_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": "", + "tags": {}, + "labels": [], "posix_timestamp": 1551393630, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", @@ -24,7 +26,8 @@ "cluster": 0, "sequence": 1, "extension": "csv", - "tags": "", + "tags": {}, + "labels": [], "posix_timestamp": 1551394230, "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", "last_modified": "2019-02-28T22:50:40.633001Z", @@ -37,14 +40,16 @@ { "id": "5cf9e445-c288-4567-9072-edc31003b022", "name": "scada data exports", - "tags": "wind, turbine, scada, system:ab32, location:108346", + "tags": {"location": 108346, "system": "ab32"}, + "labels": ["wind", "turbine", "scada"], "files": [ { "path": "input/datasets/7ead7669/export_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": "", + "tags": {}, + "labels": [], "posix_timestamp": 1551393600, "id": "78fa511f-3e28-4bc2-aa28-7b6a2e8e6ef9", "last_modified": "2019-02-28T22:40:00.000000Z", @@ -57,7 +62,8 @@ "cluster": 0, "sequence": 1, "extension": "csv", - "tags": "", + "tags": {}, + "labels": [], "posix_timestamp": 1551394200, "id": "204d7316-7ae6-45e3-8f90-443225b21226", "last_modified": "2019-02-28T22:50:00.000000Z", diff --git a/examples/met_mast_scada_service/data/output_manifest.json b/examples/met_mast_scada_service/data/output_manifest.json index dc5aed0..2d37665 100644 --- a/examples/met_mast_scada_service/data/output_manifest.json +++ b/examples/met_mast_scada_service/data/output_manifest.json @@ -5,14 +5,16 @@ "id": "4564deca-5654-42e8-aadf-70690b393a30", "name": "visual cross check data", "organisation": "megacorp", - "tags": "figure, met, mast, scada, check, location:108346", + "tags": {"location": 108346}, + "labels": ["figure", "met", "mast", "scada", "check"], "files": [ { "path": "datasets/7ead7669/cross_check.fig", "cluster": 0, "sequence": 0, "extension": "fig", - "tags": "", + "tags": {}, + "labels": [], "posix_timestamp": 1551394800, "id": "38f77fe2-c8c0-49d1-a08c-0928d53a742f", "last_modified": "2019-02-28T23:00:00.000000Z", diff --git a/examples/met_mast_scada_service/strands/input_manifest.json 
b/examples/met_mast_scada_service/strands/input_manifest.json new file mode 100644 index 0000000..402202d --- /dev/null +++ b/examples/met_mast_scada_service/strands/input_manifest.json @@ -0,0 +1,17 @@ +{ + // Manifest strands contain lists, with one entry for each required dataset + "input_manifest": { + "datasets": [ + { + // Once the inputs are validated, your analysis program can use this key to access the dataset + "key": "met_mast_data", + // General notes, which are helpful as a reminder to users of the service + "purpose": "A dataset containing meteorological mast data" + }, + { + "key": "scada_data", + "purpose": "A dataset containing scada data" + } + ] + } +} diff --git a/examples/met_mast_scada_service/strands/input_manifest_filters.json b/examples/met_mast_scada_service/strands/input_manifest_filters.json deleted file mode 100644 index 5e3d851..0000000 --- a/examples/met_mast_scada_service/strands/input_manifest_filters.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - // Manifest strands contain lists, with one entry for each required dataset - "input_manifest_filters": [ - { - // Once the inputs are validated, your analysis program can use this key to access the dataset - "key": "met_mast_data", - // General notes, which are helpful as a reminder to users of the service - "purpose": "A dataset containing meteorological mast data", - // Searches datasets which are tagged "met*" (allowing for "met" and "meterological"), whose - // files are CSVs in a numbered sequence, and which occur at a particular location - "filters": "tags:(met* AND mast) AND files:(extension:csv AND sequence:>=0) AND location:10" - }, - { - "key": "scada_data", - "purpose": "A dataset containing scada data", - // The organisation: filter refines search to datasets owned by a particular organisation handle - "filters": "organisation: megacorp AND tags:(scada AND mast) AND files:(extension:csv AND sequence:>=0)" - } - ], -} diff --git a/examples/met_mast_scada_service/strands/output_manifest.json b/examples/met_mast_scada_service/strands/output_manifest.json new file mode 100644 index 0000000..86a43cd --- /dev/null +++ b/examples/met_mast_scada_service/strands/output_manifest.json @@ -0,0 +1,12 @@ +{ + "output_manifest": { + "datasets": [ + { + // Twined will prepare a manifest with this key, which you can add to during the analysis or once its complete + "key": "met_scada_checks", + // General notes, which are helpful as a reminder to users of the service + "purpose": "A dataset containing figures showing correlations between mast and scada data" + } + ] + } +} diff --git a/examples/met_mast_scada_service/strands/output_manifest_filters.json b/examples/met_mast_scada_service/strands/output_manifest_filters.json deleted file mode 100644 index 32626b0..0000000 --- a/examples/met_mast_scada_service/strands/output_manifest_filters.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "output_manifest_filters": [ - { - // Twined will prepare a manifest with this key, which you can add to during the analysis or once its complete - "key": "met_scada_checks", - // General notes, which are helpful as a reminder to users of the service - "purpose": "A dataset containing figures showing correlations between mast and scada data", - // Twined will check that the output file manifest has tags appropriate to the filters - "filters": "tags:(met* AND scada AND correlation) AND files:(extension:json) AND location:*" - } - ] -} diff --git a/examples/wind_tunnel_datalogger_service/data/output_manifest.json 
b/examples/wind_tunnel_datalogger_service/data/output_manifest.json index eac4ac4..aae804d 100644 --- a/examples/wind_tunnel_datalogger_service/data/output_manifest.json +++ b/examples/wind_tunnel_datalogger_service/data/output_manifest.json @@ -5,14 +5,16 @@ "id": "1eba4346-daff-421b-921c-8f1c05d6997d", "name": "Test results from naca0012 section", "organisation": "megacorp", - "tags": "section:naca0012", + "tags": {"section": "naca0012"}, + "labels": [], "files": [ { "path": "datasets/7ead7669/sys_temp.json", "cluster": 0, "sequence": 0, "extension": "json", - "tags": "system, temperature", + "tags": {}, + "labels": ["system", "temperature"], "posix_timestamp": 1551394800, "id": "afcdef45-da6b-4805-95d6-7a889d81f5b9", "last_modified": "2020-02-28T13:12:42.000000Z", @@ -25,7 +27,8 @@ "cluster": 1, "sequence": 0, "extension": "json", - "tags": "wind, tunnel, velocity, profile, background, turbulence", + "tags": {}, + "labels": ["wind", "tunnel", "velocity", "profile", "background", "turbulence"], "posix_timestamp": 1551394800, "id": "3667aa6d-ee64-4cd4-a2fd-e72bcdc65791", "last_modified": "2020-02-28T13:24:43.000000Z", @@ -38,7 +41,8 @@ "cluster": 2, "sequence": 0, "extension": "dat", - "tags": "pressure, coefficient, cp, profile, reference", + "tags": {}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "310bc665-fe8c-4948-b821-0ad43fcd480d", "last_modified": "2020-02-28T13:47:23.000000Z", @@ -51,7 +55,8 @@ "cluster": 3, "sequence": 0, "extension": "dat", - "tags": "pressure, coefficient, cp, profile, reference, alpha:0", + "tags": {"alpha": 0}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "c3a6c14d-19d8-44da-9aa5-119798f53d15", "last_modified": "2020-02-28T13:54:24.000000Z", @@ -64,7 +69,8 @@ "cluster": 3, "sequence": 1, "extension": "dat", - "tags": "pressure, coefficient, cp, profile, reference, alpha:1", + "tags": {"alpha": 1}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "fac62036-722c-481a-9daf-87897c4872ec", "last_modified": "2020-02-28T13:56:21.000000Z", @@ -77,7 +83,8 @@ "cluster": 3, "sequence": 2, "extension": "dat", - "tags": "pressure, coefficient, cp, profile, reference, alpha:2", + "tags": {"alpha": 2}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "70cda7f6-c97d-4b99-9156-2ff6f5947b7e", "last_modified": "2020-02-28T13:57:03.000000Z", @@ -90,7 +97,8 @@ "cluster": 3, "sequence": 3, "extension": "dat", - "tags": "pressure, coefficient, cp, profile, reference, alpha:3", + "tags": {"alpha": 3}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "5ab4015a-608a-4ecd-9e30-95aee82d86d9", "last_modified": "2020-02-28T13:58:46.000000Z", @@ -103,7 +111,8 @@ "cluster": 3, "sequence": 4, "extension": "dat", - "tags": "pressure, coefficient, cp, profile, reference, alpha:4", + "tags": {"alpha": 4}, + "labels": ["pressure", "coefficient", "cp", "profile", "reference"], "posix_timestamp": 1551394800, "id": "3ba97d4b-002d-4ca3-a6b0-54573a5eefde", "last_modified": "2020-02-28T13:59:32.000000Z", diff --git a/examples/wind_tunnel_datalogger_service/strands/output_manifest.json b/examples/wind_tunnel_datalogger_service/strands/output_manifest.json new file mode 100644 index 0000000..86a43cd --- /dev/null +++ b/examples/wind_tunnel_datalogger_service/strands/output_manifest.json @@ -0,0 +1,12 @@ 
+{ + "output_manifest": { + "datasets": [ + { + // Twined will prepare a manifest with this key, which you can add to during the analysis or once its complete + "key": "met_scada_checks", + // General notes, which are helpful as a reminder to users of the service + "purpose": "A dataset containing figures showing correlations between mast and scada data" + } + ] + } +} diff --git a/examples/wind_tunnel_datalogger_service/strands/output_manifest_filters.json b/examples/wind_tunnel_datalogger_service/strands/output_manifest_filters.json deleted file mode 100644 index 32626b0..0000000 --- a/examples/wind_tunnel_datalogger_service/strands/output_manifest_filters.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "output_manifest_filters": [ - { - // Twined will prepare a manifest with this key, which you can add to during the analysis or once its complete - "key": "met_scada_checks", - // General notes, which are helpful as a reminder to users of the service - "purpose": "A dataset containing figures showing correlations between mast and scada data", - // Twined will check that the output file manifest has tags appropriate to the filters - "filters": "tags:(met* AND scada AND correlation) AND files:(extension:json) AND location:*" - } - ] -} diff --git a/setup.py b/setup.py index be54718..207fcae 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( name="twined", - version="0.0.19", + version="0.0.20", py_modules=[], install_requires=["jsonschema ~= 3.2.0", "python-dotenv"], url="https://www.github.com/octue/twined", diff --git a/tests/data/apps/empty_app/twine.json b/tests/data/apps/empty_app/twine.json index 39b9e14..8c410a5 100644 --- a/tests/data/apps/empty_app/twine.json +++ b/tests/data/apps/empty_app/twine.json @@ -1,7 +1,7 @@ { "children": [ ], - "configuration_schema": { + "configuration_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "The example configuration form", "description": "The configuration strand of an example twine", @@ -11,8 +11,9 @@ }, "credentials": [ ], - "input_manifest": [ - ], + "input_manifest": { + "datasets": [] + }, "input_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "Input Values", @@ -21,8 +22,9 @@ "properties": { } }, - "output_manifest": [ - ], + "output_manifest": { + "datasets": [] + }, "output_values_schema": { "title": "Output Values", "description": "The output values strand of an example twine", diff --git a/tests/data/apps/example_app/input/manifest.json b/tests/data/apps/example_app/input/manifest.json index a994280..8244102 100644 --- a/tests/data/apps/example_app/input/manifest.json +++ b/tests/data/apps/example_app/input/manifest.json @@ -5,14 +5,16 @@ { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", "name": "my meteorological dataset", - "tags": "met, mast, wind", + "tags": {}, + "labels": ["met", "mast", "wind"], "files": [ { "path": "input/datasets/7ead7669/file_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": "", + "tags": {}, + "labels": [], "posix_timestamp": null, "data_file": { "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", @@ -27,7 +29,8 @@ "cluster": 0, "sequence": 1, "extension": "csv", - "tags": "", + "tags": {}, + "labels": [], "posix_timestamp": null, "data_file": { "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", diff --git a/tests/data/apps/example_app/twine.json b/tests/data/apps/example_app/twine.json index a86ee38..1a14fc8 100644 --- a/tests/data/apps/example_app/twine.json +++ b/tests/data/apps/example_app/twine.json @@ -6,7 +6,7 @@ "filters": 
"tags:(met* AND mast AND location) files:(extension:csv AND sequence:>=0) location:10" } ], - "configuration_schema": { + "configuration_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "The example configuration form", "description": "The configuration strand of an example twine", @@ -31,18 +31,18 @@ "purpose": "A URI for accessing an external database from within a twin or analysis" } ], - "input_manifest": [ - { - "key": "met_mast_data", - "purpose": "A dataset containing meteorological mast data", - "filters": "tags:(met* AND mast AND location) files:(extension:csv AND sequence:>=0) location:10" - }, - { - "key": "scada_data", - "purpose": "A dataset containing scada data", - "filters": "tags:(met* AND mast) files:(extension:csv AND sequence:>=0) location:10" - } - ], + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data" + }, + { + "key": "scada_data", + "purpose": "A dataset containing scada data" + } + ] + }, "input_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "Input Values", @@ -56,13 +56,14 @@ } } }, - "output_manifest": [ - { - "key": "production_data", - "purpose": "A dataset containing production data", - "tags": ["production", "wind"] - } - ], + "output_manifest": { + "datasets": [ + { + "key": "production_data", + "purpose": "A dataset containing production data" + } + ] + }, "output_values_schema": { "title": "Output Values", "description": "The output values strand of an example twine", diff --git a/tests/data/apps/simple_app/twine.json b/tests/data/apps/simple_app/twine.json index 8b86eac..31fd99c 100644 --- a/tests/data/apps/simple_app/twine.json +++ b/tests/data/apps/simple_app/twine.json @@ -1,5 +1,5 @@ { - "configuration_schema": { + "configuration_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "Configuration for a simple app", "description": "The app creates a mandelbrot plot", @@ -67,6 +67,7 @@ } } }, - "output_manifest": [ - ] + "output_manifest": { + "datasets": [] + } } diff --git a/tests/test_manifest_strands.py b/tests/test_manifest_strands.py index 5ca0bb0..c42a291 100644 --- a/tests/test_manifest_strands.py +++ b/tests/test_manifest_strands.py @@ -1,5 +1,8 @@ +import copy import os import unittest +from unittest.mock import patch +from jsonschema.validators import RefResolver from twined import Twine, exceptions from .base import BaseTestCase @@ -10,30 +13,114 @@ class TestManifestStrands(BaseTestCase): VALID_MANIFEST_STRAND = """ { - "configuration_manifest": [ - { - "key": "configuration_files_data", - "purpose": "A dataset containing files used in configuration", - "filters": "tags:(the AND config AND tags) files:(extension:csv AND sequence:>=0)" - } - ], - "input_manifest": [ - { - "key": "met_mast_data", - "purpose": "A dataset containing meteorological mast data", - "filters": "tags:(met* AND mast AND location) files:(extension:csv AND sequence:>=0) location:10" - }, - { - "key": "scada_data", - "purpose": "A dataset containing scada data", - "filters": "tags:(met* AND mast) files:(extension:csv AND sequence:>=0) location:10" - } - ], - "output_manifest": [ + "configuration_manifest": { + "datasets": [ + { + "key": "configuration_files_data", + "purpose": "A dataset containing files used in configuration" + } + ] + }, + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data" + }, + { + "key": "scada_data", + 
"purpose": "A dataset containing scada data" + } + ] + }, + "output_manifest": { + "datasets": [ + { + "key": "output_files_data", + "purpose": "A dataset containing output results" + } + ] + } + } + """ + + TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE = """ + { + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "type": "object", + "properties": { + "manufacturer": { + "type": "string" + }, + "height": { + "type": "number" + }, + "is_recycled": { + "type": "boolean" + }, + "number_of_blades": { + "type": "number" + } + }, + "required": [ + "manufacturer", + "height", + "is_recycled", + "number_of_blades" + ] + } + } + ] + } + } + """ + + INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ { - "key": "output_files_data", - "purpose": "A dataset containing output results", - "filters": "tags:(the AND output AND tags) files:(extension:csv AND sequence:>=0)" + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "met_mast_data", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "labels": ["mykeyword1", "mykeyword2"], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true, + "number_of_blades": 3 + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + }, + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 1, + "extension": "csv", + "labels": [], + "tags": { + "manufacturer": "vestas", + "height": 500, + "is_recycled": true, + "number_of_blades": 3 + }, + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] } ] } @@ -54,22 +141,24 @@ def test_missing_manifest_files(self): twine.validate_output_manifest(source=file) def test_valid_manifest_files(self): - """Ensures that a manifest file will validate""" + """Ensures that a manifest file will validate.""" valid_configuration_manifest = """ { "id": "3ead7669-8162-4f64-8cd5-4abe92509e17", "datasets": [ { "id": "34ad7669-8162-4f64-8cd5-4abe92509e17", - "name": "my configuration dataset", - "tags": ["the", "config", "tags"], + "name": "configuration_files_data", + "tags": {}, + "labels": ["the", "config", "labels"], "files": [ { "path": "configuration/datasets/7ead7669/file_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", @@ -82,7 +171,8 @@ def test_valid_manifest_files(self): "cluster": 0, "sequence": 1, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", "last_modified": "2019-02-28T22:40:40.633001Z", @@ -102,15 +192,17 @@ def test_valid_manifest_files(self): "datasets": [ { "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "my meteorological dataset", - "tags": ["met", "mast", "wind"], + "name": "met_mast_data", + "tags": {}, + "labels": ["met", "mast", "wind"], "files": [ { "path": "input/datasets/7ead7669/file_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", @@ -123,7 +215,8 @@ def test_valid_manifest_files(self): "cluster": 0, "sequence": 1, 
"extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", "last_modified": "2019-02-28T22:40:40.633001Z", @@ -143,15 +236,17 @@ def test_valid_manifest_files(self): "datasets": [ { "id": "1ead7669-8162-4f64-8cd5-4abe92509e17", - "name": "my output dataset", - "tags": ["the", "output", "tags"], + "name": "output_files_data", + "tags": {}, + "labels": ["the", "output", "labels"], "files": [ { "path": "input/datasets/7ead7669/file_1.csv", "cluster": 0, "sequence": 0, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", "last_modified": "2019-02-28T22:40:30.533005Z", @@ -164,7 +259,8 @@ def test_valid_manifest_files(self): "cluster": 0, "sequence": 1, "extension": "csv", - "tags": [], + "tags": {}, + "labels": [], "posix_timestamp": 0, "id": "bbff07bc-7c19-4ed5-be6d-a6546eae8e45", "last_modified": "2019-02-28T22:40:40.633001Z", @@ -179,7 +275,7 @@ def test_valid_manifest_files(self): """ twine = Twine(source=self.VALID_MANIFEST_STRAND) - twine.validate_input_manifest(source=valid_configuration_manifest) + twine.validate_configuration_manifest(source=valid_configuration_manifest) twine.validate_input_manifest(source=valid_input_manifest) twine.validate_output_manifest(source=valid_output_manifest) @@ -245,6 +341,263 @@ def test_valid_manifest_files(self): # values_file = os.path.join(self.path, "configurations", "valid_with_extra.json") # twine.validate_configuration(file=values_file) + def test_error_raised_when_required_tags_missing_for_validate_input_manifest(self): + """Test that an error is raised when required tags from the file tags template for a dataset are missing when + validating the input manifest. + """ + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "met_mast_data", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "tags": {}, + "labels": [], + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] + } + ] + } + """ + + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + + with self.assertRaises(exceptions.InvalidManifestContents): + twine.validate_input_manifest(source=input_manifest) + + def test_validate_input_manifest_raises_error_if_required_tags_are_not_of_required_type(self): + """Test that an error is raised if the required tags from the file tags template for a dataset are present but + are not of the required type when validating an input manifest. 
+ """ + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e17", + "name": "met_mast_data", + "tags": {}, + "labels": ["met", "mast", "wind"], + "files": [ + { + "path": "input/datasets/7ead7669/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "tags": %s, + "labels": [], + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_1.csv" + } + ] + } + ] + } + """ + + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + + for tags in ( + '{"manufacturer": "Vestas", "height": 350, "is_recycled": false, "number_of_blades": "3"}', + '{"manufacturer": "Vestas", "height": 350, "is_recycled": "no", "number_of_blades": 3}', + '{"manufacturer": false, "height": 350, "is_recycled": "false", "number_of_blades": 3}', + ): + with self.assertRaises(exceptions.InvalidManifestContents): + twine.validate_input_manifest(source=input_manifest % tags) + + def test_validate_input_manifest_with_required_tags(self): + """Test that validating an input manifest with required tags from the file tags template for a dataset works + for tags meeting the requirements. + """ + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) + + def test_validate_input_manifest_with_required_tags_for_remote_tag_template_schema(self): + """Test that a remote tag template can be used for validating tags on the datafiles in a manifest.""" + schema_url = "https://refs.schema.octue.com/octue/my-file-type-tag-template/0.0.0.json" + + twine_with_input_manifest_with_remote_tag_template = ( + """ + { + "input_manifest": { + "datasets": [ + { + "key": "met_mast_data", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "$ref": "%s" + } + } + ] + } + } + """ + % schema_url + ) + + remote_schema = { + "type": "object", + "properties": { + "manufacturer": {"type": "string"}, + "height": {"type": "number"}, + "is_recycled": {"type": "boolean"}, + }, + "required": ["manufacturer", "height", "is_recycled"], + } + + twine = Twine(source=twine_with_input_manifest_with_remote_tag_template) + + original_resolve_from_url = copy.copy(RefResolver.resolve_from_url) + + def patch_if_url_is_schema_url(instance, url): + """Patch the jsonschema validator `RefResolver.resolve_from_url` if the url is the schema URL, otherwise + leave it unpatched. 
+ + :param jsonschema.validators.RefResolver instance: + :param str url: + :return mixed: + """ + if url == schema_url: + return remote_schema + else: + return original_resolve_from_url(instance, url) + + with patch("jsonschema.validators.RefResolver.resolve_from_url", new=patch_if_url_is_schema_url): + twine.validate_input_manifest(source=self.INPUT_MANIFEST_WITH_CORRECT_FILE_TAGS) + + def test_validate_input_manifest_with_required_tags_in_several_datasets(self): + """Test that required tags from the file tags template are validated separately and correctly for each dataset.""" + TWINE_WITH_INPUT_MANIFEST_WITH_REQUIRED_TAGS_FOR_MULTIPLE_DATASETS = """ + { + "input_manifest": { + "datasets": [ + { + "key": "first_dataset", + "purpose": "A dataset containing meteorological mast data", + "file_tags_template": { + "type": "object", + "properties": { + "manufacturer": { + "type": "string" + }, + "height": { + "type": "number" + } + } + } + }, + { + "key": "second_dataset", + "file_tags_template": { + "type": "object", + "properties": { + "is_recycled": { + "type": "boolean" + }, + "number_of_blades": { + "type": "number" + } + } + } + } + ] + } + } + """ + + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e19", + "name": "first_dataset", + "tags": {}, + "labels": [], + "files": [ + { + "path": "input/datasets/7ead7669/file_0.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "tags": { + "manufacturer": "Vestas", + "height": 503.7 + }, + "labels": [], + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86", + "name": "file_0.csv" + } + ] + }, + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e18", + "name": "second_dataset", + "tags": {}, + "labels": [], + "files": [ + { + "path": "input/datasets/blah/file_1.csv", + "cluster": 0, + "sequence": 0, + "extension": "csv", + "tags": { + "is_recycled": true, + "number_of_blades": 3 + }, + "labels": [], + "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e82", + "name": "file_1.csv" + } + ] + } + ] + } + """ + + twine = Twine(source=TWINE_WITH_INPUT_MANIFEST_WITH_REQUIRED_TAGS_FOR_MULTIPLE_DATASETS) + twine.validate_input_manifest(source=input_manifest) + + def test_error_raised_if_multiple_datasets_have_same_name(self): + """Test that an error is raised if the input manifest has more than one dataset with the same name.""" + input_manifest = """ + { + "id": "8ead7669-8162-4f64-8cd5-4abe92509e17", + "datasets": [ + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e19", + "name": "met_mast_data", + "tags": {}, + "labels": [], + "files": [] + }, + { + "id": "7ead7669-8162-4f64-8cd5-4abe92509e18", + "name": "met_mast_data", + "tags": {}, + "labels": [], + "files": [] + } + ] + } + """ + + twine = Twine(source=self.TWINE_WITH_INPUT_MANIFEST_WITH_TAG_TEMPLATE) + + with self.assertRaises(exceptions.DatasetNameIsNotUnique): + twine.validate_input_manifest(source=input_manifest) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_twine.py b/tests/test_twine.py index 9e27977..5389823 100644 --- a/tests/test_twine.py +++ b/tests/test_twine.py @@ -49,7 +49,7 @@ def test_broken_json_twine(self): invalid_json_twine = """ { "children": [ - "configuration_schema": { + "configuration_values_schema": { "$schema": "http://json-schema.org/2019-09/schema#", "title": "The example configuration form", "description": "The configuration strand of an example twine", diff --git a/twined/exceptions.py b/twined/exceptions.py index 959556d..fb03716 100644 --- a/twined/exceptions.py +++ 
b/twined/exceptions.py @@ -124,6 +124,10 @@ class InvalidManifestContents(InvalidManifest, ValidationError): """Raised when the manifest files are missing or do not match tags, sequences, clusters, extensions etc as required""" +class DatasetNameIsNotUnique(InvalidManifest): + """Raise when a dataset's name is not unique within its manifest.""" + + # --------------------- Exceptions relating to access of data using the Twine instance ------------------------ # TODO This is related to filtering files from a manifest. Determine whether this belongs here, diff --git a/twined/schema/manifest_schema.json b/twined/schema/manifest_schema.json index ebbf19b..fdb2131 100644 --- a/twined/schema/manifest_schema.json +++ b/twined/schema/manifest_schema.json @@ -1,20 +1,21 @@ { - "type": "object", - "properties": { - "kind": { - "description": "The kind of the manifest, (only 'multi-dataset' available, reserved for future use)", - "type": "string", - "default": "multi-dataset", - "anyOf": [ + "$defs": { + "tags": { + "description": "Key-value tags associated with the object.", + "type": "object" + }, + "labels": { + "description": "Textual labels associated with the object", + "type": "array", + "items": [ { - "type": "string", - "enum": [ - "multi-dataset" - ], - "title": "Multi Dataset" + "type": "string" } ] - }, + } + }, + "type": "object", + "properties": { "id": { "description": "ID of the manifest, typically a uuid", "type": "string" @@ -32,17 +33,9 @@ "description": "Name of the dataset", "type": "string" }, - "tags": { - "description": "Textual tags associated with the dataset", - "type": "array", - "items": [ - { - "type": "string" - } - ] - }, + "tags": {"$ref": "#/$defs/tags"}, + "labels": {"$ref": "#/$defs/labels"}, "files": { - "description": "Textual tags associated with the manifest", "type": "array", "items": { "type": "object", @@ -71,29 +64,19 @@ "description": "A posix based timestamp associated with the file. This may, but need not be, the created or modified time. 
", "type": "number" }, - "tags": { - "description": "Textual tags associated with the file", - "type": "array", - "items": [ - { - "type": "string" - } - ] - } + "tags": {"$ref": "#/$defs/tags"}, + "labels": {"$ref": "#/$defs/labels"} }, "required": [ "id", "path", - "tags" + "tags", + "labels" ] } } }, - "required": [ - "id", - "tags", - "files" - ] + "required": ["id", "name", "tags", "labels", "files"] } } }, diff --git a/twined/schema/twine_schema.json b/twined/schema/twine_schema.json index eda7828..6e94a2e 100644 --- a/twined/schema/twine_schema.json +++ b/twined/schema/twine_schema.json @@ -1,4 +1,70 @@ { + "$defs": { + "file_tags_template": { + "oneOf": [ + { + "type": "object", + "properties": { + "$schema": { + "type": "string" + }, + "type": { + "const": "object" + }, + "properties": { + "type": "object" + }, + "required": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": ["type", "properties"] + }, + { + "type": "object", + "properties": { + "$ref": { + "type": "string" + } + }, + "required": ["$ref"] + } + ] + }, + "manifest": { + "type": "object", + "properties": { + "datasets": { + "type": "array", + "description": "A list of entries, each describing a dataset that should be attached to / made available to the digital twin", + "items": { + "type": "object", + "properties": { + "key": { + "description": "A textual key identifying this dataset within the application/twin", + "type": "string" + }, + "purpose": { + "description": "What data this dataset contains, eg 'the set of data files from the energy production calculation process'", + "type": "string", + "default": "" + }, + "file_tags_template": { + "$ref": "#/$defs/file_tags_template" + } + }, + "required": [ + "key" + ] + } + } + }, + "required": ["datasets"] + } + }, "type": "object", "$schema": "http://json-schema.org/2019-09/schema#", "properties": { @@ -27,7 +93,8 @@ ] } }, - "configuration_schema": { + "configuration_manifest": {"$ref": "#/$defs/manifest"}, + "configuration_values_schema": { "type": "object", "required": ["properties"] }, @@ -52,61 +119,11 @@ "additionalProperties": false } }, - "input_manifest": { - "type": "array", - "description": "A list of entries, each describing a dataset that should be attached to / made available to the digital twin", - "items": { - "type": "object", - "properties": { - "key": { - "description": "A textual key identifying this dataset within the application/twin", - "type": "string" - }, - "purpose": { - "description": "What data this dataset contains, eg 'the set of data files from the energy production calculation process'", - "type": "string", - "default": "" - }, - "filters": { - "description": "A search term, using the Lucene Query Language, which can be used to automatically refine the list of available datasets down to ones suitable for use with this twin", - "type": "string", - "default": "" - } - }, - "required": ["key"] - } - }, + "input_manifest": {"$ref": "#/$defs/manifest"}, "input_values_schema": { "type": "object" }, - "output_manifest": { - "type": "array", - "description": "A list of entries, each describing a dataset that may be created/updated when the twin is run", - "items": { - "type": "object", - "properties": { - "key": { - "description": "A textual key identifying this dataset within the application/twin", - "type": "string" - }, - "purpose": { - "description": "What data this dataset contains, eg 'the set of data files from the energy production calculation process'", - "type": "string", - "default": "" - }, - "tags": 
{ - "description": "Comma separated tags that will be applied to the dataset when created", - "type": "array", - "items": [ - { - "type": "string" - } - ] - } - }, - "required": ["key"] - } - }, + "output_manifest": {"$ref": "#/$defs/manifest"}, "output_values_schema": { "type": "object" } diff --git a/twined/twine.py b/twined/twine.py index 2a4e18f..9ccb7e3 100644 --- a/twined/twine.py +++ b/twined/twine.py @@ -88,40 +88,60 @@ def _load_json(self, kind, source, **kwargs): return data - def _validate_against_schema(self, strand, data): - """Validates data against a schema, raises exceptions of type InvalidJson if not compliant. + def _get_schema(self, strand): + """Get the schema for the given strand. Can be used to validate: - the twine file contents itself against the present version twine spec - children data against the required schema for the present version twine spec - values data for compliance with schema written in the twine (for strands like input_values_schema) + + :param str strand: + :return dict: """ if strand == "twine": # The data is a twine. A twine *contains* schema, but we also need to verify that it matches a certain # schema itself. The twine schema is distributed with this packaged to ensure version consistency... - schema = jsonlib.loads(pkg_resources.resource_string("twined", "schema/twine_schema.json")) + schema_path = "schema/twine_schema.json" elif strand in CHILDREN_STRANDS: # The data is a list of children. The "children" strand of the twine describes matching criteria for # the children, not the schema of the "children" data, which is distributed with this package to ensure # version consistency... - schema = jsonlib.loads(pkg_resources.resource_string("twined", "schema/children_schema.json")) + schema_path = "schema/children_schema.json" elif strand in MANIFEST_STRANDS: # The data is a manifest of files. The "*_manifest" strands of the twine describe matching criteria used to # filter files appropriate for consumption by the digital twin, not the schema of the manifest data, which # is distributed with this package to ensure version consistency... - schema = jsonlib.loads(pkg_resources.resource_string("twined", "schema/manifest_schema.json")) + schema_path = "schema/manifest_schema.json" else: if strand not in SCHEMA_STRANDS: raise exceptions.UnknownStrand(f"Unknown strand {strand}. Try one of {ALL_STRANDS}.") schema_key = strand + "_schema" + try: - schema = getattr(self, schema_key) + return getattr(self, schema_key) except AttributeError: raise exceptions.StrandNotFound(f"Cannot validate - no {schema_key} strand in the twine") + return jsonlib.loads(pkg_resources.resource_string("twined", schema_path)) + + def _validate_against_schema(self, strand, data): + """Validates data against a schema, raises exceptions of type InvalidJson if not compliant. 
+ + Can be used to validate: + - the twine file contents itself against the present version twine spec + - children data against the required schema for the present version twine spec + - values data for compliance with schema written in the twine (for strands like input_values_schema) + + :param str strand: + :param dict data: + :return None: + """ + schema = self._get_schema(strand) + try: jsonschema_validate(instance=data, schema=schema) logger.debug("Validated %s against schema", strand) @@ -159,6 +179,7 @@ def _validate_manifest(self, kind, source, cls=None, **kwargs): data = data.serialise() self._validate_against_schema(kind, data) + self._validate_dataset_file_tags(manifest_kind=kind, manifest=data) if cls and inbound: # TODO verify that all the required keys etc are there @@ -166,6 +187,42 @@ def _validate_manifest(self, kind, source, cls=None, **kwargs): return data + def _validate_dataset_file_tags(self, manifest_kind, manifest): + """Validate the tags of the files of each dataset in the manifest against the file tags template in the + corresponding dataset field in the given manifest field of the twine. + + :param str manifest_kind: + :param dict manifest: + :return None: + """ + # This is the manifest schema included in the twine.json file, not the schema for manifest.json files. + manifest_schema = getattr(self, manifest_kind) + + for dataset_schema in manifest_schema["datasets"]: + datasets = [dataset for dataset in manifest["datasets"] if dataset["name"] == dataset_schema["key"]] + + if not datasets: + continue + + if len(datasets) > 1: + raise exceptions.DatasetNameIsNotUnique( + f"There is more than one dataset named {dataset_schema['key']!r} - ensure each dataset within a " + f"manifest is uniquely named." + ) + + dataset = datasets.pop(0) + + file_tags_template = dataset_schema.get("file_tags_template") + + if not file_tags_template: + continue + + for file in dataset["files"]: + try: + jsonschema_validate(instance=file["tags"], schema=file_tags_template) + except ValidationError as e: + raise exceptions.invalid_contents_map[manifest_kind](str(e)) + @property def available_strands(self): """Tuple of strand names that are found in this twine"""
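
A minimal usage sketch of the ``file_tags_template`` validation introduced above, written against the API exercised in ``tests/test_manifest_strands.py`` (``Twine(source=...)`` and ``Twine.validate_input_manifest(source=...)``, which raises ``exceptions.InvalidManifestContents`` when a file's tags do not satisfy its dataset's template). The twine, manifest, tag names and values here are illustrative only, not taken from any real service.

.. code-block:: python

    import json

    from twined import Twine, exceptions

    # A twine whose input manifest requires three tags on every file in the
    # "met_mast_data" dataset (same shape as the templates used in the tests above).
    twine = Twine(source=json.dumps({
        "input_manifest": {
            "datasets": [
                {
                    "key": "met_mast_data",
                    "purpose": "A dataset containing meteorological mast data",
                    "file_tags_template": {
                        "type": "object",
                        "properties": {
                            "manufacturer": {"type": "string"},
                            "height": {"type": "number"},
                            "is_recycled": {"type": "boolean"},
                        },
                        "required": ["manufacturer", "height", "is_recycled"],
                    },
                }
            ]
        }
    }))

    # A manifest whose dataset name matches the twine's dataset key; each file
    # carries the tags the template requires.
    manifest = {
        "id": "8ead7669-8162-4f64-8cd5-4abe92509e17",
        "datasets": [
            {
                "id": "7ead7669-8162-4f64-8cd5-4abe92509e17",
                "name": "met_mast_data",
                "tags": {},
                "labels": [],
                "files": [
                    {
                        "id": "abff07bc-7c19-4ed5-be6d-a6546eae8e86",
                        "name": "file_1.csv",
                        "path": "input/datasets/7ead7669/file_1.csv",
                        "tags": {"manufacturer": "vestas", "height": 500, "is_recycled": True},
                        "labels": [],
                    }
                ],
            }
        ],
    }

    twine.validate_input_manifest(source=json.dumps(manifest))  # Passes.

    # Removing a required tag makes the file fail validation against the template.
    manifest["datasets"][0]["files"][0]["tags"].pop("height")
    try:
        twine.validate_input_manifest(source=json.dumps(manifest))
    except exceptions.InvalidManifestContents as error:
        print(f"Rejected as expected: {error}")

The same check applies when the template is supplied as a remote ``$ref`` (see ``test_validate_input_manifest_with_required_tags_for_remote_tag_template_schema`` above): the reference is resolved by jsonschema and the resolved schema is applied to each file's tags in the same way.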