Added arguments to AzureMLAssetDataSet for better control of output path #75

Open · wants to merge 11 commits into base: develop
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -3,6 +3,8 @@
## [Unreleased]

- Added `--on-job-scheduled` argument to `kedro azureml run` to plug-in custom behaviour after Azure ML job is scheduled [@Gabriel2409](https://github.com/Gabriel2409)
- Added two new arguments (`datastore` and `azureml_root_dir`) to `AzureMLAssetDataSet`, allowing users to specify where to save the data when an `AzureMLAssetDataSet` is used as a node output (non-local runs) by [@Gabriel2409](https://github.com/Gabriel2409)
- Added support for using an `AzureMLAssetDataSet` of type `uri_file` as a node output by [@Gabriel2409](https://github.com/Gabriel2409)

## [0.6.0] - 2023-09-01

2 changes: 1 addition & 1 deletion kedro_azureml/cli.py
@@ -399,7 +399,7 @@ def compile(
@click.option(
"--az-output",
"azure_outputs",
type=(str, click.Path(exists=False)),
multiple=True,
help="Name and path of Azure ML Pipeline output",
)
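The path check here was loosened from `click.Path(exists=True, file_okay=True, dir_okay=True)` to `click.Path(exists=False)`, presumably because an Azure ML pipeline output path need not exist locally when the command is invoked. Since this is a click tuple option with `multiple=True`, each occurrence of the flag takes a name and a path pair, e.g. `--az-output my_output data/07_model_output/` (illustrative values, not from the diff).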
30 changes: 22 additions & 8 deletions kedro_azureml/datasets/asset_dataset.py
@@ -43,6 +43,12 @@ class AzureMLAssetDataSet(AzureMLPipelineDataSet, AbstractVersionedDataSet):
| - ``filepath_arg``: Filepath arg on the wrapped dataset, defaults to `filepath`
| - ``azureml_type``: Either `uri_folder` or `uri_file`
| - ``version``: Version of the AzureML dataset to be used in kedro format.
| - ``datastore``: Datastore name, only used to resolve the path when using the
data asset as an output (non-local runs).
| - ``azureml_root_dir``: The folder where the data asset is saved, only used to
resolve the path when using the data asset as an output (non-local runs).
The final output path will start with
"azureml://datastores/<datastore>/paths/<azureml_root_dir>/<job_id>"

Example
-------
@@ -52,19 +58,17 @@
.. code-block:: yaml

my_folder_dataset:
  type: kedro_azureml.datasets.AzureMLAssetDataSet
  azureml_dataset: my_azureml_folder_dataset
  root_dir: data/01_raw/some_folder/
  dataset:
    type: pandas.ParquetDataSet
    filepath: "."

my_file_dataset:
  type: kedro_azureml.datasets.AzureMLAssetDataSet
  azureml_dataset: my_azureml_file_dataset
  root_dir: data/01_raw/some_other_folder/
  dataset:
    type: pandas.ParquetDataSet
    filepath: "companies.csv"
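
For illustration, a catalog entry that also sets the two new arguments might look as follows (the datastore name and paths are hypothetical, not taken from this diff):

.. code-block:: yaml

    my_output_dataset:
      type: kedro_azureml.datasets.AzureMLAssetDataSet
      azureml_dataset: my_azureml_output_dataset
      root_dir: data/07_model_output/
      datastore: my_datastore
      azureml_root_dir: kedro_azureml
      dataset:
        type: pandas.ParquetDataSet
        filepath: "output.parquet"

With these settings, per the docstring above, a non-local run would write the output under azureml://datastores/my_datastore/paths/kedro_azureml/<job_id>.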
@@ -81,6 +85,8 @@ def __init__(
filepath_arg: str = "filepath",
azureml_type: AzureMLDataAssetType = "uri_folder",
version: Optional[Version] = None,
datastore: str = "${{default_datastore}}",
azureml_root_dir: str = "kedro_azureml", # maybe combine with root_dir?
Contributor: Comment can be removed here

):
"""
azureml_dataset: Name of the AzureML dataset.
@@ -90,9 +96,17 @@
filepath_arg: Filepath arg on the wrapped dataset, defaults to `filepath`
azureml_type: Either `uri_folder` or `uri_file`
version: Version of the AzureML dataset to be used in kedro format.
datastore: Datastore name, only used to resolve the path when using the
data asset as an output (non-local runs). Defaults to the pipeline's
default datastore (resolved server-side; see
https://learn.microsoft.com/en-us/azure/machine-learning/concept-expressions?view=azureml-api-2)
azureml_root_dir: The folder where the data asset is saved, only used to
resolve the path when using the data asset as an output (non-local runs).
"""
super().__init__(dataset=dataset, root_dir=root_dir, filepath_arg=filepath_arg)

self._azureml_root_dir = azureml_root_dir
self._datastore = datastore
self._azureml_dataset = azureml_dataset
self._version = version
# 1 entry for load version, 1 for save version
20 changes: 15 additions & 5 deletions kedro_azureml/generator.py
@@ -172,12 +172,22 @@ def _get_output(self, name):
if name in self.catalog.list() and isinstance(
ds := self.catalog._get_dataset(name), AzureMLAssetDataSet
):
output_path = (
    f"azureml://datastores/{ds._datastore}/paths/{ds._azureml_root_dir}"
)

# add the job id to the path (actual value is injected when the job is run)
output_path = f"{output_path}/${{{{name}}}}"

Contributor: We probably need to strip off trailing slashes from ds._azureml_root_dir, or use some URI parsing lib.

Contributor (author): I think a URI parsing lib may be overkill. Moreover, datastore can be equal to ${{default_datastore}} and I don't know if that will pose a problem with these libraries. However, what about ds._azureml_root_dir.strip("/") to remove both the leading and trailing slashes? Another solution would be to add checks on the parameters at dataset creation and throw an error if they don't match the correct format. Actually, if we go this route, this is probably something we want to add to the datastore and root_dir parameters as well.

if ds._azureml_type == "uri_file":
    output_path = f"{output_path}/{ds._dataset_config[ds._filepath_arg]}"
# note that this will always create a new version of the dataset, even if we
# have versioned set to false
return Output(
    type=ds._azureml_type,
    name=ds._azureml_dataset,
    path=output_path,
)
else:
return Output(type="uri_folder")
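
For illustration only, here is how the three output_path assignments above compose for a hypothetical uri_file dataset (the values are made up; ${{name}} is left literal for Azure ML to resolve server-side):

datastore = "mydatastore"
azureml_root_dir = "kedro_azureml"
filepath = "companies.csv"

output_path = f"azureml://datastores/{datastore}/paths/{azureml_root_dir}"
output_path = f"{output_path}/${{{{name}}}}"  # doubled braces escape to a literal ${{name}}
output_path = f"{output_path}/{filepath}"  # appended only for uri_file outputs

print(output_path)
# azureml://datastores/mydatastore/paths/kedro_azureml/${{name}}/companies.csv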

4 changes: 4 additions & 0 deletions tests/conftest.py
@@ -256,6 +256,7 @@ def multi_catalog():
"filepath": "abc.csv",
},
azureml_dataset="test_dataset",
azureml_type="uri_file",
version=Version(None, None),
)
parq = AzureMLAssetDataSet(
@@ -264,6 +265,9 @@
"filepath": "xyz.parq",
},
azureml_dataset="test_dataset_2",
azureml_type="uri_folder",
azureml_root_dir="azureml_root",
datastore="mydatastore",
version=Version(None, None),
)
return DataCatalog({"input_data": csv, "i2": parq})
4 changes: 4 additions & 0 deletions tests/test_datasets.py
@@ -333,12 +333,16 @@ def test_azureml_asset_dataset(
local_run,
download,
):
# ensure that AzureMLAssetDataSet type matches path in the mock_azureml_client
with mock_azureml_client() as mac:
azureml_type = mac.data.get().type
ds = AzureMLAssetDataSet(
dataset={
"type": dataset_type,
"filepath": path_in_aml,
},
azureml_dataset="test_dataset",
azureml_type=azureml_type,
version=Version(None, None),
)
ds._local_run = local_run
17 changes: 17 additions & 0 deletions tests/test_generator.py
@@ -71,6 +71,23 @@ def test_can_generate_azure_pipeline(
for node in az_pipeline.jobs.values()
), "Invalid docker image set on commands"

# check Output of generated command for AzureMLAssetDataSets

i2_job = az_pipeline.jobs["node1"].outputs["i2"]._to_job_output()
i2_dataset = multi_catalog.datasets.i2
assert i2_dataset._azureml_type == i2_job.type, "Wrong Output type"
assert i2_dataset._azureml_dataset == i2_job.name, "Wrong Output name"
assert i2_dataset._datastore in i2_job.path, "datastore not passed to Output"
assert (
i2_dataset._azureml_root_dir in i2_job.path
), "azureml root dir not passed to Output"

# check Output for non-AzureMLAssetDataSets
i3_job = az_pipeline.jobs["node2"].outputs["i3"]._to_job_output()
assert i3_job.type == "uri_folder", "Wrong Output type"
assert i3_job.path is None, "Output path is not empty"
assert i3_job.name is None, "Output name is not empty"


def test_azure_pipeline_with_different_compute(
dummy_pipeline_compute_tag, dummy_plugin_config, multi_catalog