From 8f8f6aac4a1c062fe5274f734899d0faca0360d4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yolan=20Honor=C3=A9-Roug=C3=A9?=
 <29451317+Galileo-Galilei@users.noreply.github.com>
Date: Sat, 26 Oct 2024 15:27:54 +0200
Subject: [PATCH] :loud_sound: Log model version information when a model is
 loaded from the registry (#552) (#588)

* :loud_sound: Log model version information when a model is loaded from the registry (#552)

* get run  id from metadata

* simplify and log only the run_id

* add failing test

* up changelog and add extra info on test

* typo in changelog

* remove unused client property

* fix changelog typo
---
 CHANGELOG.md                                  |  1 +
 .../models/mlflow_model_registry_dataset.py   | 15 +++++-
 tests/framework/cli/test_cli_modelify.py      |  1 -
 .../test_mlflow_model_registry_dataset.py     | 46 +++++++++++++++++++
 4 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bfcb8168..bf3310db 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,7 @@
 
 -   :sparkles: Implement missing ``PipelineML`` filtering functionalities to let ``kedro`` display resume hints and avoid breaking ``kedro-viz`` ([#377](https://github.com/Galileo-Galilei/kedro-mlflow/pull/377), [#601, Calychas](https://github.com/Galileo-Galilei/kedro-mlflow/pull/601))
 -   :sparkles: Sanitize parameters name with unsupported characters to avoid ``mlflow`` errors when logging ([#595, pascalwhoop](https://github.com/Galileo-Galilei/kedro-mlflow/pull/595))
+-   :loud_sound: Add logs about the exact ``run_id`` loaded within a ``MlflowRegistryDataset`` because some URI are confusing (e.g. ``latest``) and hard to debug ([#552](https://github.com/Galileo-Galilei/kedro-mlflow/pull/552))
 
 ### Changed
 
diff --git a/kedro_mlflow/io/models/mlflow_model_registry_dataset.py b/kedro_mlflow/io/models/mlflow_model_registry_dataset.py
index 615a5bc5..11374b39 100644
--- a/kedro_mlflow/io/models/mlflow_model_registry_dataset.py
+++ b/kedro_mlflow/io/models/mlflow_model_registry_dataset.py
@@ -1,3 +1,4 @@
+from logging import Logger, getLogger
 from typing import Any, Dict, Optional, Union
 
 from kedro.io.core import DatasetError
@@ -67,6 +68,10 @@ def __init__(
             else f"models:/{model_name}/{stage_or_version}"
         )
 
+    @property
+    def _logger(self) -> Logger:
+        return getLogger(__name__)
+
     def _load(self) -> Any:
         """Loads an MLflow model from local path or from MLflow run.
 
@@ -77,10 +82,18 @@ def _load(self) -> Any:
         # If `run_id` is specified, pull the model from MLflow.
         # TODO: enable loading from another mlflow conf (with a client with another tracking uri)
         # Alternatively, use local path to load the model.
-        return self._mlflow_model_module.load_model(
+        model = self._mlflow_model_module.load_model(
             model_uri=self.model_uri, **self._load_args
         )
 
+        # log some info because "latest" model is not very informative
+        # the model itself does not have information about its registry
+        # because the same run can be registered under several different names
+        #  in the registry. See https://github.com/Galileo-Galilei/kedro-mlflow/issues/552
+
+        self._logger.info(f"Loading model from run_id='{model.metadata.run_id}'")
+        return model
+
     def _save(self, model: Any) -> None:
         raise NotImplementedError(
             "The 'save' method is not implemented for MlflowModelRegistryDataset. You can pass 'registered_model_name' argument in 'MLflowModelTrackingDataset(..., save_args={registered_model_name='my_model'}' to save and register a model in the same step. "
diff --git a/tests/framework/cli/test_cli_modelify.py b/tests/framework/cli/test_cli_modelify.py
index 6d445d7c..bae70b13 100644
--- a/tests/framework/cli/test_cli_modelify.py
+++ b/tests/framework/cli/test_cli_modelify.py
@@ -484,7 +484,6 @@ def test_modelify_with_pip_requirements(monkeypatch, kp_for_modelify):
     runs_list_before_cmd = context.mlflow.server._mlflow_client.search_runs(
         context.mlflow.tracking.experiment._experiment.experiment_id
     )
-    print(runs_list_before_cmd)
     cli_runner = CliRunner()
 
     result = cli_runner.invoke(
diff --git a/tests/io/models/test_mlflow_model_registry_dataset.py b/tests/io/models/test_mlflow_model_registry_dataset.py
index a8b42b37..bb5f47c7 100644
--- a/tests/io/models/test_mlflow_model_registry_dataset.py
+++ b/tests/io/models/test_mlflow_model_registry_dataset.py
@@ -33,6 +33,52 @@ def test_mlflow_model_registry_alias_and_stage_or_version_fails(tmp_path):
         )
 
 
+# this test is failing because of long standing issues like this :
+# https://github.com/pytest-dev/pytest/issues/7335
+# https://github.com/pytest-dev/pytest/issues/5160
+# To make logging occur, we need to from kedro.framework.projcet import LOGGING at the beginning
+# ironically, the sderr error reported by pytest shows that logging actually occurs!
+# If I remove with mlflow.start_run(), caplog is indeed not empty, it seems mlflow flushes the internal loger
+# probably related to https://github.com/mlflow/mlflow/issues/4957
+@pytest.mark.xfail
+def test_mlflow_model_registry_logs_run_id(caplog, tmp_path, monkeypatch):
+    # we must change the working directory because when
+    # using mlflow with a local database tracking, the artifacts
+    # are stored in a relative mlruns/ folder so we need to have
+    # the same working directory that the one of the tracking uri
+    monkeypatch.chdir(tmp_path)
+    tracking_and_registry_uri = r"sqlite:///" + (tmp_path / "mlruns3.db").as_posix()
+    mlflow.set_tracking_uri(tracking_and_registry_uri)
+    mlflow.set_registry_uri(tracking_and_registry_uri)
+
+    # setup: we train 2 version of a model under a single
+    #  registered model and stage the 2nd one
+    run_ids = {}
+    for i in range(2):
+        with mlflow.start_run():
+            model = DecisionTreeClassifier()
+            mlflow.sklearn.log_model(
+                model, artifact_path="demo_model", registered_model_name="demo_model"
+            )
+            run_ids[i + 1] = mlflow.active_run().info.run_id
+
+    # case 1: no version is provided, we take the last one
+
+    ml_ds = MlflowModelRegistryDataset(model_name="demo_model", stage_or_version=1)
+    ml_ds.load()
+
+    # caplog.text, caplog.messages, caplog.records are all empty ???, but th stderr will show them
+    assert run_ids[1] in caplog.text
+
+    # case 2: a stage is provided, we take the last model with this stage
+    ml_ds = MlflowModelRegistryDataset(
+        model_name="demo_model", stage_or_version="latest"
+    )
+    ml_ds._load()
+
+    assert run_ids[2] in caplog.text
+
+
 def test_mlflow_model_registry_load_given_stage_or_version(tmp_path, monkeypatch):
     # we must change the working directory because when
     # using mlflow with a local database tracking, the artifacts