From 8f8f6aac4a1c062fe5274f734899d0faca0360d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Yolan=20Honor=C3=A9-Roug=C3=A9?= <29451317+Galileo-Galilei@users.noreply.github.com> Date: Sat, 26 Oct 2024 15:27:54 +0200 Subject: [PATCH] :loud_sound: Log model version information when a model is loaded from the registry (#552) (#588) * :loud_sound: Log model version information when a model is loaded from the registry (#552) * get run id from metadata * simplify and log only the run_id * add failing test * up changelog and add extra info on test * typo in changelog * remove unused client property * fix changelog typo --- CHANGELOG.md | 1 + .../models/mlflow_model_registry_dataset.py | 15 +++++- tests/framework/cli/test_cli_modelify.py | 1 - .../test_mlflow_model_registry_dataset.py | 46 +++++++++++++++++++ 4 files changed, 61 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bfcb8168..bf3310db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - :sparkles: Implement missing ``PipelineML`` filtering functionalities to let ``kedro`` display resume hints and avoid breaking ``kedro-viz`` ([#377](https://github.com/Galileo-Galilei/kedro-mlflow/pull/377), [#601, Calychas](https://github.com/Galileo-Galilei/kedro-mlflow/pull/601)) - :sparkles: Sanitize parameters name with unsupported characters to avoid ``mlflow`` errors when logging ([#595, pascalwhoop](https://github.com/Galileo-Galilei/kedro-mlflow/pull/595)) +- :loud_sound: Add logs about the exact ``run_id`` loaded within a ``MlflowRegistryDataset`` because some URI are confusing (e.g. ``latest``) and hard to debug ([#552](https://github.com/Galileo-Galilei/kedro-mlflow/pull/552)) ### Changed diff --git a/kedro_mlflow/io/models/mlflow_model_registry_dataset.py b/kedro_mlflow/io/models/mlflow_model_registry_dataset.py index 615a5bc5..11374b39 100644 --- a/kedro_mlflow/io/models/mlflow_model_registry_dataset.py +++ b/kedro_mlflow/io/models/mlflow_model_registry_dataset.py @@ -1,3 +1,4 @@ +from logging import Logger, getLogger from typing import Any, Dict, Optional, Union from kedro.io.core import DatasetError @@ -67,6 +68,10 @@ def __init__( else f"models:/{model_name}/{stage_or_version}" ) + @property + def _logger(self) -> Logger: + return getLogger(__name__) + def _load(self) -> Any: """Loads an MLflow model from local path or from MLflow run. @@ -77,10 +82,18 @@ def _load(self) -> Any: # If `run_id` is specified, pull the model from MLflow. # TODO: enable loading from another mlflow conf (with a client with another tracking uri) # Alternatively, use local path to load the model. - return self._mlflow_model_module.load_model( + model = self._mlflow_model_module.load_model( model_uri=self.model_uri, **self._load_args ) + # log some info because "latest" model is not very informative + # the model itself does not have information about its registry + # because the same run can be registered under several different names + # in the registry. See https://github.com/Galileo-Galilei/kedro-mlflow/issues/552 + + self._logger.info(f"Loading model from run_id='{model.metadata.run_id}'") + return model + def _save(self, model: Any) -> None: raise NotImplementedError( "The 'save' method is not implemented for MlflowModelRegistryDataset. You can pass 'registered_model_name' argument in 'MLflowModelTrackingDataset(..., save_args={registered_model_name='my_model'}' to save and register a model in the same step. " diff --git a/tests/framework/cli/test_cli_modelify.py b/tests/framework/cli/test_cli_modelify.py index 6d445d7c..bae70b13 100644 --- a/tests/framework/cli/test_cli_modelify.py +++ b/tests/framework/cli/test_cli_modelify.py @@ -484,7 +484,6 @@ def test_modelify_with_pip_requirements(monkeypatch, kp_for_modelify): runs_list_before_cmd = context.mlflow.server._mlflow_client.search_runs( context.mlflow.tracking.experiment._experiment.experiment_id ) - print(runs_list_before_cmd) cli_runner = CliRunner() result = cli_runner.invoke( diff --git a/tests/io/models/test_mlflow_model_registry_dataset.py b/tests/io/models/test_mlflow_model_registry_dataset.py index a8b42b37..bb5f47c7 100644 --- a/tests/io/models/test_mlflow_model_registry_dataset.py +++ b/tests/io/models/test_mlflow_model_registry_dataset.py @@ -33,6 +33,52 @@ def test_mlflow_model_registry_alias_and_stage_or_version_fails(tmp_path): ) +# this test is failing because of long standing issues like this : +# https://github.com/pytest-dev/pytest/issues/7335 +# https://github.com/pytest-dev/pytest/issues/5160 +# To make logging occur, we need to from kedro.framework.projcet import LOGGING at the beginning +# ironically, the sderr error reported by pytest shows that logging actually occurs! +# If I remove with mlflow.start_run(), caplog is indeed not empty, it seems mlflow flushes the internal loger +# probably related to https://github.com/mlflow/mlflow/issues/4957 +@pytest.mark.xfail +def test_mlflow_model_registry_logs_run_id(caplog, tmp_path, monkeypatch): + # we must change the working directory because when + # using mlflow with a local database tracking, the artifacts + # are stored in a relative mlruns/ folder so we need to have + # the same working directory that the one of the tracking uri + monkeypatch.chdir(tmp_path) + tracking_and_registry_uri = r"sqlite:///" + (tmp_path / "mlruns3.db").as_posix() + mlflow.set_tracking_uri(tracking_and_registry_uri) + mlflow.set_registry_uri(tracking_and_registry_uri) + + # setup: we train 2 version of a model under a single + # registered model and stage the 2nd one + run_ids = {} + for i in range(2): + with mlflow.start_run(): + model = DecisionTreeClassifier() + mlflow.sklearn.log_model( + model, artifact_path="demo_model", registered_model_name="demo_model" + ) + run_ids[i + 1] = mlflow.active_run().info.run_id + + # case 1: no version is provided, we take the last one + + ml_ds = MlflowModelRegistryDataset(model_name="demo_model", stage_or_version=1) + ml_ds.load() + + # caplog.text, caplog.messages, caplog.records are all empty ???, but th stderr will show them + assert run_ids[1] in caplog.text + + # case 2: a stage is provided, we take the last model with this stage + ml_ds = MlflowModelRegistryDataset( + model_name="demo_model", stage_or_version="latest" + ) + ml_ds._load() + + assert run_ids[2] in caplog.text + + def test_mlflow_model_registry_load_given_stage_or_version(tmp_path, monkeypatch): # we must change the working directory because when # using mlflow with a local database tracking, the artifacts