From bec5bd7ef9aeefd364cab74476f93457e289e8d8 Mon Sep 17 00:00:00 2001 From: ThibaultFy <50656860+ThibaultFy@users.noreply.github.com> Date: Tue, 27 Feb 2024 16:54:55 +0100 Subject: [PATCH] feat: merge predict and test tasks (#376) Signed-off-by: ThibaultFy <50656860+ThibaultFy@users.noreply.github.com> Signed-off-by: ThibaultFy --- CHANGELOG.md | 12 ++++-- Makefile | 6 +-- .../get_started/run_mnist_torch.ipynb | 28 ++++++------- .../go_further/run_iris_sklearn.ipynb | 33 +++++----------- .../go_further/run_mnist_cyclic.ipynb | 39 ++++++++++++------- 5 files changed, 58 insertions(+), 60 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5af522093..1ddb6bbff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Changed + +- Test and predict tasks are now merged, after [SubstraFL #177](https://github.com/Substra/substrafl/pull/177) +- Rename `predictions_path` to `predictions` in metrics ([#376](https://github.com/Substra/substra-documentation/pull/376)) +- Pass `metric_functions` to `Strategy` instead of `TestDataNodes` ([#376](https://github.com/Substra/substra-documentation/pull/376)) + ## [0.35.0] ### Added @@ -17,15 +23,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Bump Sphinx to 7.2.6, and upgrade linked dependencies ([#388](https://github.com/Substra/substra-documentation/pull/388)) - Examples are not executed when building the documentation ([#388](https://github.com/Substra/substra-documentation/pull/388)) - + ### Fixed - Restor custom css on nbshpinx gallery ([#394](https://github.com/Substra/substra-documentation/pull/394)) - + ### Removed - Mentions to Orchestrator distributed mode ([#379](https://github.com/Substra/substra-documentation/pull/379)) - + ## [0.34.0] ### Added diff --git a/Makefile b/Makefile index c3ef32e92..f70dde700 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,16 @@ 
install-examples-dependencies: pip3 install -r examples_requirements.txt -examples: example-substra example-substrafl +examples: examples-substra examples-substrafl -example-substra: example-core-diabetes example-core-titanic +examples-substra: example-core-diabetes example-core-titanic example-core-diabetes: cd docs/source/examples/substra_core/diabetes_example/ && ipython -c "%run run_diabetes.ipynb" example-core-titanic: cd docs/source/examples/substra_core/titanic_example/ && ipython -c "%run run_titanic.ipynb" -example-substrafl: example-fl-mnist example-fl-iris example-fl-cyclic example-fl-diabetes +examples-substrafl: example-fl-mnist example-fl-iris example-fl-cyclic example-fl-diabetes example-fl-mnist: cd docs/source/examples/substrafl/get_started/ && ipython -c "%run run_mnist_torch.ipynb" diff --git a/docs/source/examples/substrafl/get_started/run_mnist_torch.ipynb b/docs/source/examples/substrafl/get_started/run_mnist_torch.ipynb index cae04b4c9..31f69498a 100644 --- a/docs/source/examples/substrafl/get_started/run_mnist_torch.ipynb +++ b/docs/source/examples/substrafl/get_started/run_mnist_torch.ipynb @@ -249,21 +249,19 @@ "import numpy as np\n", "\n", "\n", - "def accuracy(datasamples, predictions_path):\n", + "def accuracy(datasamples, predictions):\n", " y_true = datasamples[\"labels\"]\n", - " y_pred = np.load(predictions_path)\n", "\n", - " return accuracy_score(y_true, np.argmax(y_pred, axis=1))\n", + " return accuracy_score(y_true, np.argmax(predictions, axis=1))\n", "\n", "\n", - "def roc_auc(datasamples, predictions_path):\n", + "def roc_auc(datasamples, predictions):\n", " y_true = datasamples[\"labels\"]\n", - " y_pred = np.load(predictions_path)\n", "\n", " n_class = np.max(y_true) + 1\n", " y_true_one_hot = np.eye(n_class)[y_true]\n", "\n", - " return roc_auc_score(y_true_one_hot, y_pred)" + " return roc_auc_score(y_true_one_hot, predictions)" ] }, { @@ -483,7 +481,7 @@ "source": [ "from substrafl.strategies import FedAvg\n", "\n", - 
"strategy = FedAvg(algo=TorchCNN())" + "strategy = FedAvg(algo=TorchCNN(), metric_functions={\"Accuracy\": accuracy, \"ROC AUC\": roc_auc})" ] }, { @@ -556,7 +554,6 @@ " organization_id=org_id,\n", " data_manager_key=dataset_keys[org_id],\n", " test_data_sample_keys=[test_datasample_keys[org_id]],\n", - " metric_functions={\"Accuracy\": accuracy, \"ROC AUC\": roc_auc},\n", " )\n", " for org_id in DATA_PROVIDER_ORGS_ID\n", "]\n", @@ -576,8 +573,7 @@ "The [Dependency](https://docs.substra.org/en/stable/substrafl_doc/api/dependency.html) object is instantiated in order to install the right libraries in\n", "the Python environment of each organization.\n", "\n", - "The CPU torch version is installed here to have a `Dependency` object as light as possible as we don't use GPUs (`use_gpu` set to `False`). Remove the `--extra-index-url` to install the cuda torch version.\n", - "\n" + "The CPU torch version is installed here to have a `Dependency` object as light as possible as we don't use GPUs (`use_gpu` set to `False`). Remove the `--extra-index-url` to install the cuda torch version." 
] }, { @@ -647,13 +643,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The compute plan created is composed of 29 tasks:\n", + "The compute plan created is composed of 21 tasks:\n", "\n", - "* For each local training step, we create 3 tasks per organisation: training + prediction + evaluation -> 3 tasks.\n", - "* We are training on 2 data organizations; for each round, we have 3 * 2 local tasks + 1 aggregation task -> 7 tasks.\n", - "* We are training for 3 rounds: 3 * 7 -> 21 tasks.\n", - "* Before the first local training step, there is an initialization step on each data organization: 21 + 2 -> 23 tasks.\n", - "* After the last aggregation step, there are three more tasks: applying the last updates from the aggregator + prediction + evaluation, on both organizations: 23 + 2 * 3 -> 29 tasks\n", + "* For each local training step, we create 2 tasks per organisation: training + evaluation -> 2 tasks.\n", + "* We are training on 2 data organizations; for each round, we have 2 * 2 local tasks + 1 aggregation task -> 5 tasks.\n", + "* We are training for 3 rounds: 3 * 5 -> 15 tasks.\n", + "* Before the first local training step, there is an initialization step on each data organization: 15 + 2 -> 17 tasks.\n", + "* After the last aggregation step, there are two more tasks: applying the last updates from the aggregator + evaluation, on both organizations: 17 + 2 * 2 -> 21 tasks\n", "\n" ] }, diff --git a/docs/source/examples/substrafl/go_further/run_iris_sklearn.ipynb b/docs/source/examples/substrafl/go_further/run_iris_sklearn.ipynb index 040d7f86c..c5cd314a9 100644 --- a/docs/source/examples/substrafl/go_further/run_iris_sklearn.ipynb +++ b/docs/source/examples/substrafl/go_further/run_iris_sklearn.ipynb @@ -197,11 +197,10 @@ "import numpy as np\n", "\n", "\n", - "def accuracy(datasamples, predictions_path):\n", + "def accuracy(datasamples, predictions):\n", " y_true = datasamples[\"targets\"]\n", - " y_pred = np.load(predictions_path)\n", "\n", - " 
return accuracy_score(y_true, y_pred)" + " return accuracy_score(y_true, predictions)" ] }, { @@ -281,7 +280,7 @@ " The train method must accept as parameters `datasamples` and `shared_state`.\n", "- **predict** (method): a function to describe how to compute the\n", " predictions from the algo model.\n", - " The predict method must accept as parameters `datasamples`, `shared_state` and `predictions_path`.\n", + " The predict method must accept as parameters `datasamples` and `shared_state`.\n", "- **save** (method): specify how to save the important states of our algo.\n", "- **load** (method): specify how to load the important states of our algo from a previously saved filed\n", " by the `save` function describe above.\n", @@ -302,7 +301,6 @@ "\n", "import joblib\n", "from typing import Optional\n", - "import shutil\n", "\n", "# The Iris dataset proposes four attributes to predict three different classes.\n", "INPUT_SIZE = 4\n", @@ -390,29 +388,19 @@ " parameters_update=[p for p in delta_coef] + [delta_bias],\n", " )\n", "\n", - " @remote.remote_data\n", - " def predict(self, datasamples, shared_state, predictions_path):\n", - " \"\"\"The predict function to be executed on organizations containing\n", - " data we want to test our model on. The @remote_data decorator is mandatory\n", - " to allow this function to be sent and executed on the right organization.\n", + " def predict(self, datasamples, shared_state):\n", + " \"\"\"The predict function to be executed by the evaluation function on\n", + " data we want to test our model on. 
The predict method is mandatory and is \n", + " an `abstractmethod` of the `Algo` class.\n", "\n", " Args:\n", " datasamples: datasamples extracted from the organizations data using\n", " the given opener.\n", " shared_state: shared_state provided by the aggregator.\n", - " predictions_path: Path where to save the predictions.\n", - " This path is provided by Substra and the metric will automatically\n", - " get access to this path to load the predictions.\n", " \"\"\"\n", " predictions = self._model.predict(datasamples[\"data\"])\n", "\n", - " if predictions_path is not None:\n", - " np.save(predictions_path, predictions)\n", - "\n", - " # np.save() automatically adds a \".npy\" to the end of the file.\n", - " # We rename the file produced by removing the \".npy\" suffix, to make sure that\n", - " # predictions_path is the actual file name.\n", - " shutil.move(str(predictions_path) + \".npy\", predictions_path)\n", + " return predictions\n", "\n", " def save_local_state(self, path):\n", " joblib.dump(\n", @@ -448,8 +436,8 @@ "outputs": [], "source": [ "from substrafl.strategies import FedAvg\n", - "\n", - "strategy = FedAvg(algo=SklearnLogisticRegression(model=cls, seed=SEED))" + " \n", + "strategy = FedAvg(algo=SklearnLogisticRegression(model=cls, seed=SEED), metric_functions=accuracy)" ] }, { @@ -508,7 +496,6 @@ " organization_id=org_id,\n", " data_manager_key=dataset_keys[org_id],\n", " test_data_sample_keys=[test_datasample_keys[org_id]],\n", - " metric_functions=accuracy,\n", " )\n", " for org_id in DATA_PROVIDER_ORGS_ID\n", "]\n", diff --git a/docs/source/examples/substrafl/go_further/run_mnist_cyclic.ipynb b/docs/source/examples/substrafl/go_further/run_mnist_cyclic.ipynb index 789d77914..d7be4bd05 100644 --- a/docs/source/examples/substrafl/go_further/run_mnist_cyclic.ipynb +++ b/docs/source/examples/substrafl/go_further/run_mnist_cyclic.ipynb @@ -254,21 +254,19 @@ "import numpy as np\n", "\n", "\n", - "def accuracy(datasamples, predictions_path):\n", + 
"def accuracy(datasamples, predictions):\n", " y_true = datasamples[\"labels\"]\n", - " y_pred = np.load(predictions_path)\n", "\n", - " return accuracy_score(y_true, np.argmax(y_pred, axis=1))\n", + " return accuracy_score(y_true, np.argmax(predictions, axis=1))\n", "\n", "\n", - "def roc_auc(datasamples, predictions_path):\n", + "def roc_auc(datasamples, predictions):\n", " y_true = datasamples[\"labels\"]\n", - " y_pred = np.load(predictions_path)\n", "\n", " n_class = np.max(y_true) + 1\n", " y_true_one_hot = np.eye(n_class)[y_true]\n", "\n", - " return roc_auc_score(y_true_one_hot, y_pred)" + " return roc_auc_score(y_true_one_hot, predictions)" ] }, { @@ -446,7 +444,7 @@ "- `initialization_round`, to indicate what tasks to execute at round 0, in order to setup the variable\n", " and be able to compute the performances of the model before any training.\n", "- `perform_round`, to indicate what tasks and in which order we need to compute to execute a round of the strategy.\n", - "- `perform_predict`, to indicate how to compute the predictions and performances .\n", + "- `perform_evaluation`, to indicate how to compute the predictions and performances .\n", "\n", "\n" ] @@ -462,6 +460,8 @@ "from typing import Any\n", "from typing import List\n", "from typing import Optional\n", + "from typing import Dict\n", + "from typing import Callable\n", "\n", "from substrafl import strategies\n", "from substrafl.algorithms.algo import Algo\n", @@ -479,7 +479,13 @@ " strategy to trigger the tests tasks when needed.\n", " \"\"\"\n", "\n", - " def __init__(self, algo: Algo, *args, **kwargs):\n", + " def __init__(\n", + " self, \n", + " algo: Algo, \n", + " metric_functions: Optional[Dict[str, Callable]] = None, \n", + " *args, \n", + " **kwargs,\n", + " ):\n", " \"\"\"\n", " It is possible to add any arguments to a Strategy. It is important to pass these arguments as\n", " args or kwargs to the parent class, using the super().__init__(...) 
method.\n", @@ -490,8 +496,12 @@ " Args:\n", " algo (Algo): A Strategy takes an Algo as argument, in order to deal with framework\n", " specific function in a dedicated object.\n", + " metric_functions (Optional[Dict[str, Callable]]):\n", + " list of Functions that implement the different metrics. If a Dict is given, the keys will be used to\n", + " register the result of the associated function. If a Function or a List is given, function.__name__\n", + " will be used to store the result.\n", " \"\"\"\n", - " super().__init__(algo=algo, *args, **kwargs)\n", + " super().__init__(algo=algo, metric_functions=metric_functions, *args, **kwargs)\n", "\n", " self._cyclic_local_state = None\n", " self._cyclic_shared_state = None\n", @@ -590,14 +600,14 @@ " clean_models=clean_models,\n", " )\n", "\n", - " def perform_predict(\n", + " def perform_evaluation(\n", " self,\n", " test_data_nodes: List[TestDataNode],\n", " train_data_nodes: List[TrainDataNode],\n", " round_idx: int,\n", " ):\n", " \"\"\"This method is called regarding the given evaluation strategy. 
If the round is included\n", - " in the evaluation strategy, the ``perform_predict`` method will be called on the different concerned nodes.\n", + " in the evaluation strategy, the ``perform_evaluation`` method will be called on the different concerned nodes.\n", "\n", " We are using the last computed ``_cyclic_local_state`` to feed the test task, which mean that we will\n", " always test the model after its training on the last train data nodes of the list.\n", @@ -611,9 +621,9 @@ " for test_node in test_data_nodes:\n", " test_node.update_states(\n", " traintask_id=self._cyclic_local_state.key,\n", - " operation=self.algo.predict(\n", + " operation=self.evaluate(\n", " data_samples=test_node.test_data_sample_keys,\n", - " _algo_name=f\"Predicting with {self.algo.__class__.__name__}\",\n", + " _algo_name=f\"Evaluating with {self.__class__.__name__}\",\n", " ),\n", " round_idx=round_idx,\n", " )" @@ -823,7 +833,7 @@ " )\n", "\n", "\n", - "strategy = CyclicStrategy(algo=MyAlgo())" + "strategy = CyclicStrategy(algo=MyAlgo(), metric_functions={\"Accuracy\": accuracy, \"ROC AUC\": roc_auc})" ] }, { @@ -892,7 +902,6 @@ " organization_id=org_id,\n", " data_manager_key=dataset_keys[org_id],\n", " test_data_sample_keys=[test_datasample_keys[org_id]],\n", - " metric_functions={\"Accuracy\": accuracy, \"ROC AUC\": roc_auc},\n", " )\n", " for org_id in DATA_PROVIDER_ORGS_ID\n", "]\n",