From d50768214354e932d1ab75680b2d80ad3f755dff Mon Sep 17 00:00:00 2001 From: Gabriel2409 Date: Thu, 14 Sep 2023 16:17:22 +0200 Subject: [PATCH 01/10] added arguments to azuremldataasset in order to be able to specify the path within azureml where the dataset is saved --- kedro_azureml/datasets/asset_dataset.py | 4 ++++ kedro_azureml/generator.py | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/kedro_azureml/datasets/asset_dataset.py b/kedro_azureml/datasets/asset_dataset.py index 9745c09..cdedfe9 100644 --- a/kedro_azureml/datasets/asset_dataset.py +++ b/kedro_azureml/datasets/asset_dataset.py @@ -77,6 +77,8 @@ def __init__( self, azureml_dataset: str, dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]], + datastore: str = "workspaceblobstore", + azureml_root_dir: str = "kedro_azureml", # maybe combine with root_dir? root_dir: str = "data", filepath_arg: str = "filepath", azureml_type: AzureMLDataAssetType = "uri_folder", @@ -93,6 +95,8 @@ def __init__( """ super().__init__(dataset=dataset, root_dir=root_dir, filepath_arg=filepath_arg) + self._azureml_root_dir = azureml_root_dir + self._datastore = datastore self._azureml_dataset = azureml_dataset self._version = version # 1 entry for load version, 1 for save version diff --git a/kedro_azureml/generator.py b/kedro_azureml/generator.py index f64960b..ed4f450 100644 --- a/kedro_azureml/generator.py +++ b/kedro_azureml/generator.py @@ -177,7 +177,11 @@ def _get_output(self, name): "AzureMLAssetDataSets with azureml_type 'uri_file' cannot be used as outputs" ) # TODO: add versioning - return Output(type=ds._azureml_type, name=ds._azureml_dataset) + return Output( + type=ds._azureml_type, + name=ds._azureml_dataset, + path=f"azureml://datastores/{ds._datastore}/paths/{ds._azureml_root_dir}/{ds.resolve_save_version()}", + ) else: return Output(type="uri_folder") From 47217de16ab3fadcc31dd5422df4c62264478ad9 Mon Sep 17 00:00:00 2001 From: Gabriel2409 Date: Fri, 15 Sep 2023 11:17:53 +0200 Subject: [PATCH 02/10] removed checking if the output path exists in --az-output: - This lead to a failure when outputting uri_file - removing this check does not seem to have any negative impact --- kedro_azureml/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro_azureml/cli.py b/kedro_azureml/cli.py index 4b29988..0f13dad 100644 --- a/kedro_azureml/cli.py +++ b/kedro_azureml/cli.py @@ -381,7 +381,7 @@ def compile( @click.option( "--az-output", "azure_outputs", - type=(str, click.Path(exists=True, file_okay=True, dir_okay=True)), + type=(str, click.Path(exists=False)), multiple=True, help="Name and path of Azure ML Pipeline output", ) From 8d9ec103a47bd114f384abbdc2c2c95b16e86b1d Mon Sep 17 00:00:00 2001 From: Gabriel2409 Date: Fri, 15 Sep 2023 11:18:53 +0200 Subject: [PATCH 03/10] added support for uri_file output --- kedro_azureml/generator.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kedro_azureml/generator.py b/kedro_azureml/generator.py index ed4f450..ff01a00 100644 --- a/kedro_azureml/generator.py +++ b/kedro_azureml/generator.py @@ -172,15 +172,23 @@ def _get_output(self, name): if name in self.catalog.list() and isinstance( ds := self.catalog._get_dataset(name), AzureMLAssetDataSet ): + output_path = ( + f"azureml://datastores/{ds._datastore}/paths/{ds._azureml_root_dir}" + ) + + # versioning system: to be discussed + output_path = ( + f"{output_path}/{ds._azureml_dataset}/{ds.resolve_save_version()}" + ) + if ds._azureml_type == "uri_file": - raise ValueError( - "AzureMLAssetDataSets with azureml_type 'uri_file' cannot be used as outputs" - ) - # TODO: add versioning + output_path = f"{output_path}/{ds._dataset_config[ds._filepath_arg]}" + # note that this will always create a new version of the dataset, even if we + # have versioned set to false. return Output( type=ds._azureml_type, name=ds._azureml_dataset, - path=f"azureml://datastores/{ds._datastore}/paths/{ds._azureml_root_dir}/{ds.resolve_save_version()}", + path=output_path, ) else: return Output(type="uri_folder") From deae7c9b883a9ef88d242b27ae8440bb6acd247b Mon Sep 17 00:00:00 2001 From: Gabriel2409 Date: Sat, 16 Sep 2023 10:21:02 +0200 Subject: [PATCH 04/10] injection of the job id in the output path --- kedro_azureml/generator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kedro_azureml/generator.py b/kedro_azureml/generator.py index ff01a00..a6cc8d6 100644 --- a/kedro_azureml/generator.py +++ b/kedro_azureml/generator.py @@ -181,6 +181,9 @@ def _get_output(self, name): f"{output_path}/{ds._azureml_dataset}/{ds.resolve_save_version()}" ) + # add the job id to the path (actual value is injected when job is run) + output_path = f"{output_path}/{{name}}" + if ds._azureml_type == "uri_file": output_path = f"{output_path}/{ds._dataset_config[ds._filepath_arg]}" # note that this will always create a new version of the dataset, even if we From ebdc8506e5d8ef6e9c7d7a916dd98156c71ab118 Mon Sep 17 00:00:00 2001 From: Gabriel2409 Date: Sat, 16 Sep 2023 21:05:06 +0200 Subject: [PATCH 05/10] added description for new arguments + removed versioned arg in examples --- kedro_azureml/datasets/asset_dataset.py | 30 ++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/kedro_azureml/datasets/asset_dataset.py b/kedro_azureml/datasets/asset_dataset.py index cdedfe9..c0fad4e 100644 --- a/kedro_azureml/datasets/asset_dataset.py +++ b/kedro_azureml/datasets/asset_dataset.py @@ -43,6 +43,12 @@ class AzureMLAssetDataSet(AzureMLPipelineDataSet, AbstractVersionedDataSet): | - ``filepath_arg``: Filepath arg on the wrapped dataset, defaults to `filepath` | - ``azureml_type``: Either `uri_folder` or `uri_file` | - ``version``: Version of the AzureML dataset to be used in kedro format. + | - ``datastore``: datastore name, only used to resolve the path when using the + data asset as an output (non local runs). + | - azureml_root_dir: The folder where to save the data asset, only used to + resolve the path when using the data asset as an output (non local runs). + Final output path will be start with + "azureml://datastores//paths//" Example ------- @@ -52,19 +58,17 @@ class AzureMLAssetDataSet(AzureMLPipelineDataSet, AbstractVersionedDataSet): .. code-block:: yaml my_folder_dataset: - type: kedro_azureml.datasets.AzureMLAssetDataSet - azureml_dataset: my_azureml_folder_dataset - root_dir: data/01_raw/some_folder/ - versioned: True - dataset: - type: pandas.ParquetDataSet - filepath: "." + type: kedro_azureml.datasets.AzureMLAssetDataSet + azureml_dataset: my_azureml_folder_dataset + root_dir: data/01_raw/some_folder/ + dataset: + type: pandas.ParquetDataSet + filepath: "." my_file_dataset: type: kedro_azureml.datasets.AzureMLAssetDataSet azureml_dataset: my_azureml_file_dataset root_dir: data/01_raw/some_other_folder/ - versioned: True dataset: type: pandas.ParquetDataSet filepath: "companies.csv" @@ -77,12 +81,12 @@ def __init__( self, azureml_dataset: str, dataset: Union[str, Type[AbstractDataSet], Dict[str, Any]], - datastore: str = "workspaceblobstore", - azureml_root_dir: str = "kedro_azureml", # maybe combine with root_dir? root_dir: str = "data", filepath_arg: str = "filepath", azureml_type: AzureMLDataAssetType = "uri_folder", version: Optional[Version] = None, + datastore: str = "${{default_datastore}}", + azureml_root_dir: str = "kedro_azureml", # maybe combine with root_dir? ): """ azureml_dataset: Name of the AzureML file azureml_dataset. @@ -92,6 +96,12 @@ def __init__( filepath_arg: Filepath arg on the wrapped dataset, defaults to `filepath` azureml_type: Either `uri_folder` or `uri_file` version: Version of the AzureML dataset to be used in kedro format. + datastore: datastore name, only used to resolve the path when using the + data asset as an output (non local runs). Defaults to pipeline defaut + data store (resolved server side, see + https://learn.microsoft.com/en-us/azure/machine-learning/concept-expressions?view=azureml-api-2) + azureml_root_dir: The folder where to save the data asset, only used to + resolve the path when using the data asset as an output (non local runs). """ super().__init__(dataset=dataset, root_dir=root_dir, filepath_arg=filepath_arg) From dd897d2118aa968d0d65e791d05b75d13221a559 Mon Sep 17 00:00:00 2001 From: Gabriel2409 Date: Sat, 16 Sep 2023 21:05:38 +0200 Subject: [PATCH 06/10] changed output path for azureml dataasset --- kedro_azureml/generator.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kedro_azureml/generator.py b/kedro_azureml/generator.py index a6cc8d6..68f20dc 100644 --- a/kedro_azureml/generator.py +++ b/kedro_azureml/generator.py @@ -176,13 +176,8 @@ def _get_output(self, name): f"azureml://datastores/{ds._datastore}/paths/{ds._azureml_root_dir}" ) - # versioning system: to be discussed - output_path = ( - f"{output_path}/{ds._azureml_dataset}/{ds.resolve_save_version()}" - ) - # add the job id to the path (actual value is injected when job is run) - output_path = f"{output_path}/{{name}}" + output_path = f"{output_path}/${{{{name}}}}" if ds._azureml_type == "uri_file": output_path = f"{output_path}/{ds._dataset_config[ds._filepath_arg]}" From 07bb7491256a2e8611186b5b325c6095ea29a49b Mon Sep 17 00:00:00 2001 From: Gabriel2409 Date: Wed, 4 Oct 2023 10:11:15 +0200 Subject: [PATCH 07/10] Add tests for generated Output command Modify multi_catalog to include extra parameters. Update test_azureml_asset_dataset to ensure asset is constructured with correct type. Add checks on specific node outputs in test_can_generate_azure_pipeline --- kedro_azureml/datasets/asset_dataset.py | 2 +- tests/conftest.py | 4 ++++ tests/test_datasets.py | 4 ++++ tests/test_generator.py | 17 +++++++++++++++++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/kedro_azureml/datasets/asset_dataset.py b/kedro_azureml/datasets/asset_dataset.py index c0fad4e..d650236 100644 --- a/kedro_azureml/datasets/asset_dataset.py +++ b/kedro_azureml/datasets/asset_dataset.py @@ -45,7 +45,7 @@ class AzureMLAssetDataSet(AzureMLPipelineDataSet, AbstractVersionedDataSet): | - ``version``: Version of the AzureML dataset to be used in kedro format. | - ``datastore``: datastore name, only used to resolve the path when using the data asset as an output (non local runs). - | - azureml_root_dir: The folder where to save the data asset, only used to + | - ``azureml_root_dir``: The folder where to save the data asset, only used to resolve the path when using the data asset as an output (non local runs). Final output path will be start with "azureml://datastores//paths//" diff --git a/tests/conftest.py b/tests/conftest.py index 4ed2d90..7a3df9d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -256,6 +256,7 @@ def multi_catalog(): "filepath": "abc.csv", }, azureml_dataset="test_dataset", + azureml_type="uri_file", version=Version(None, None), ) parq = AzureMLAssetDataSet( @@ -264,6 +265,9 @@ def multi_catalog(): "filepath": "xyz.parq", }, azureml_dataset="test_dataset_2", + azureml_type="uri_folder", + azureml_root_dir="azureml_root", + datastore="mydatastore", version=Version(None, None), ) return DataCatalog({"input_data": csv, "i2": parq}) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index c01e00e..996ae28 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -333,12 +333,16 @@ def test_azureml_asset_dataset( local_run, download, ): + # ensure that AzureMLAssetDataSet type matches path in the mock_azureml_client + with mock_azureml_client() as mac: + azureml_type = mac.data.get().type ds = AzureMLAssetDataSet( dataset={ "type": dataset_type, "filepath": path_in_aml, }, azureml_dataset="test_dataset", + azureml_type=azureml_type, version=Version(None, None), ) ds._local_run = local_run diff --git a/tests/test_generator.py b/tests/test_generator.py index fe1cf2e..7aeae1c 100644 --- a/tests/test_generator.py +++ b/tests/test_generator.py @@ -71,6 +71,23 @@ def test_can_generate_azure_pipeline( for node in az_pipeline.jobs.values() ), "Invalid docker image set on commands" + # check Output of generated command for AzuremlDataAssets + + i2_job = az_pipeline.jobs["node1"].outputs["i2"]._to_job_output() + i2_dataset = multi_catalog.datasets.i2 + assert i2_dataset._azureml_type == i2_job.type, "Wrong Output type" + assert i2_dataset._azureml_dataset == i2_job.name, "Wrong Output name" + assert i2_dataset._datastore in i2_job.path, "datastore not passed to Output" + assert ( + i2_dataset._azureml_root_dir in i2_job.path + ), "azureml root dir not passed to Output" + + # check Output for non AzuremlDataAssets + i3_job = az_pipeline.jobs["node2"].outputs["i3"]._to_job_output() + assert i3_job.type == "uri_folder", "Wrong Output type" + assert i3_job.path is None, "Output path is not empty" + assert i3_job.name is None, "Output name is not empty" + def test_azure_pipeline_with_different_compute( dummy_pipeline_compute_tag, dummy_plugin_config, multi_catalog From 1197c48c09e9fd2502206f29c81792827b2c1003 Mon Sep 17 00:00:00 2001 From: Gabriel2409 Date: Wed, 4 Oct 2023 10:23:45 +0200 Subject: [PATCH 08/10] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 972e807..0e81ee1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,8 @@ ## [Unreleased] - Added `--on-job-scheduled` argument to `kedro azureml run` to plug-in custom behaviour after Azure ML job is scheduled [@Gabriel2409](https://github.com/Gabriel2409) +- Added two new arguments (`datastore` and `azureml_root_dir`, to `AzureMLAssetDataSet` (`datastore` and `azureml_root_dir`) allowing users to specify where to save the data if `AzureMLAssetDataSet` is used as a node output (non local runs) by [@Gabriel2409](https://github.com/Gabriel2409) +- Added support for using `AzureMLAssetDataSet uri_file` as node output by [@Gabriel2409](https://github.com/Gabriel2409) ## [0.6.0] - 2023-09-01 From 45efaceb8ed3d8c7021fc347b112d62f8c6dfeab Mon Sep 17 00:00:00 2001 From: Gabriel2409 Date: Thu, 5 Oct 2023 10:23:36 +0200 Subject: [PATCH 09/10] Add validation for datastore and azureml_root_dir parameter --- kedro_azureml/datasets/asset_dataset.py | 56 ++++++++++++++++++++-- tests/test_datasets.py | 63 +++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 3 deletions(-) diff --git a/kedro_azureml/datasets/asset_dataset.py b/kedro_azureml/datasets/asset_dataset.py index d650236..74f5d8b 100644 --- a/kedro_azureml/datasets/asset_dataset.py +++ b/kedro_azureml/datasets/asset_dataset.py @@ -1,4 +1,5 @@ import logging +import re from functools import partial from operator import attrgetter from pathlib import Path @@ -46,7 +47,8 @@ class AzureMLAssetDataSet(AzureMLPipelineDataSet, AbstractVersionedDataSet): | - ``datastore``: datastore name, only used to resolve the path when using the data asset as an output (non local runs). | - ``azureml_root_dir``: The folder where to save the data asset, only used to - resolve the path when using the data asset as an output (non local runs). + resolve the path when using the data asset as an output (non local runs). Must + be a linux file type without any spaces. Leading and trailing slash are stripped Final output path will be start with "azureml://datastores//paths//" @@ -86,7 +88,7 @@ def __init__( azureml_type: AzureMLDataAssetType = "uri_folder", version: Optional[Version] = None, datastore: str = "${{default_datastore}}", - azureml_root_dir: str = "kedro_azureml", # maybe combine with root_dir? + azureml_root_dir: str = "kedro_azureml", ): """ azureml_dataset: Name of the AzureML file azureml_dataset. @@ -101,10 +103,16 @@ def __init__( data store (resolved server side, see https://learn.microsoft.com/en-us/azure/machine-learning/concept-expressions?view=azureml-api-2) azureml_root_dir: The folder where to save the data asset, only used to - resolve the path when using the data asset as an output (non local runs). + resolve the path when using the data asset as an output (non local runs). Must + be a linux file type without any spaces. Leading and trailing slash are stripped """ super().__init__(dataset=dataset, root_dir=root_dir, filepath_arg=filepath_arg) + self._validate_datastore_name(datastore) + # needed to ensure there is no extra / in the string passed to Output + azureml_root_dir = azureml_root_dir.strip("/") + self._validate_azureml_root_dir(azureml_root_dir) + self._azureml_root_dir = azureml_root_dir self._datastore = datastore self._azureml_dataset = azureml_dataset @@ -129,6 +137,48 @@ def __init__( f"the dataset definition." ) + def _validate_datastore_name(self, datastore: str): + """ + Validates a datastore name to check it contains exclusively lowercase letters, + digits and underscores or if it matches ${{default_datastore}} + + :param datastore: The name of the datastore to be validated. + :type datastore: str + + :raises ValueError: If the provided `datastore` name is incorrect. + + :rtype: None + + Learn more about the special expression "${{default_datastore}}" at: + https://learn.microsoft.com/azure/machine-learning/concept-expressions + """ + if not re.match(r"(^[a-z0-9_]+$|^\${{default_datastore}}$)", datastore): + raise ValueError( + "The datastore name must exclusively contain " + "lowercase letters, numbers, and underscores. " + "You can also use ${[default_datastore]}: " + "https://learn.microsoft.com/azure/machine-learning/concept-expressions" + ) + + def _validate_azureml_root_dir(self, azureml_root_dir: str): + """ + Validates the azureml root_dir to check it is a valid unix file path + + :param azureml_root_dir: The name to be validated. + :type datastore: str + + :raises ValueError: If the provided `azureml_root_dir` name is incorrect. + + :rtype: None + """ + if not re.match(r"^([\w\-_]+\/?)+$", azureml_root_dir): + raise ValueError( + "azureml_root_dir must only contain " + "lowercase letters, numbers, and underscores. " + "Folders should be separated by a '/'. " + "Spaces are not allowed." + ) + @property def azure_config(self) -> AzureMLConfig: """AzureML config to be used by the dataset.""" diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 996ae28..254b756 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -363,6 +363,29 @@ def test_azureml_asset_dataset( assert (ds._load()["data"] == df["data"]).all() +@pytest.mark.parametrize( + "azureml_root_dir", + [ + "azureml/root", + "/azureml/root", + "azureml/root/", + "/azureml/root/", + ], + ids=("noslash", "leading", "trailing", "both"), +) +def test_azureml_root_dir_has_no_leading_and_trailing_slashes(azureml_root_dir): + aml_dataset = AzureMLAssetDataSet( + dataset={ + "type": PickleDataSet, + "filepath": "some/random/path/test.pickle", + }, + azureml_dataset="test_dataset", + version=Version(None, None), + azureml_root_dir=azureml_root_dir, + ) + assert aml_dataset._azureml_root_dir == azureml_root_dir.strip("/") + + def test_azureml_assetdataset_raises_DataSetError_azureml_type(): with pytest.raises(DataSetError, match="mltable"): AzureMLAssetDataSet( @@ -389,6 +412,46 @@ def test_azureml_assetdataset_raises_DataSetError_wrapped_dataset_versioned(): ) +@pytest.mark.parametrize( + "datastore", + [ + "my-datastore", + "my/datastore", + "MyDatastore", + ], + ids=("hyphen", "slash", "uppercase"), +) +def test_azureml_assetdataset_raises_ValueError_bad_datastore_name(datastore): + with pytest.raises(ValueError, match=r".*datastore.*"): + AzureMLAssetDataSet( + dataset={ + "type": PickleDataSet, + "filepath": "some/random/path/test.pickle", + }, + azureml_dataset="test_dataset", + version=Version(None, None), + datastore=datastore, + ) + + +@pytest.mark.parametrize( + "azureml_root_dir", + ["my dir", "my//dir", "my\\dir", "my$dir"], + ids=("space", "doubleslash", "backslash", "special_char"), +) +def test_azureml_assetdataset_raises_ValueError_bad_azureml_root_dir(azureml_root_dir): + with pytest.raises(ValueError, match=r".*azureml_root_dir.*"): + AzureMLAssetDataSet( + dataset={ + "type": PickleDataSet, + "filepath": "some/random/path/test.pickle", + }, + azureml_dataset="test_dataset", + version=Version(None, None), + azureml_root_dir=azureml_root_dir, + ) + + def test_azureml_pipeline_dataset(tmp_path: Path): ds = AzureMLPipelineDataSet( { From dd50aab4e11c65dfa22139a8a60c150d44a5ee21 Mon Sep 17 00:00:00 2001 From: Gabriel2409 Date: Tue, 17 Oct 2023 14:09:04 +0200 Subject: [PATCH 10/10] Fix regex exponential backtracking in _validate_azureml_root_dir --- kedro_azureml/datasets/asset_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kedro_azureml/datasets/asset_dataset.py b/kedro_azureml/datasets/asset_dataset.py index 74f5d8b..79cf27b 100644 --- a/kedro_azureml/datasets/asset_dataset.py +++ b/kedro_azureml/datasets/asset_dataset.py @@ -171,7 +171,7 @@ def _validate_azureml_root_dir(self, azureml_root_dir: str): :rtype: None """ - if not re.match(r"^([\w\-_]+\/?)+$", azureml_root_dir): + if not re.match(r"^\/?([\w\-_]\/?)+$", azureml_root_dir): raise ValueError( "azureml_root_dir must only contain " "lowercase letters, numbers, and underscores. "