Skip to content

Commit

Permalink
Migrate sklearn data preparation to use Flower Datasets (#2450)
Browse files Browse the repository at this point in the history
Co-authored-by: jafermarq <[email protected]>
  • Loading branch information
adam-narozniak and jafermarq authored Dec 18, 2023
1 parent fa7be1b commit 0eec9e5
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 62 deletions.
20 changes: 13 additions & 7 deletions examples/sklearn-logreg-mnist/README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Flower Example using scikit-learn

This example of Flower uses `scikit-learn`'s `LogisticRegression` model to train a federated learning system. It will help you understand how to adapt Flower for use with `scikit-learn`.
Running this example in itself is quite easy.
Running this example in itself is quite easy. This example uses [Flower Datasets](https://flower.dev/docs/datasets/) to download, partition and preprocess the MNIST dataset.

## Project Setup

Expand Down Expand Up @@ -57,18 +57,24 @@ Afterwards you are ready to start the Flower server as well as the clients. You
poetry run python3 server.py
```

Now you are ready to start the Flower clients which will participate in the learning. To do so simply open two more terminals and run the following command in each:
Now you are ready to start the Flower clients which will participate in the learning. To do so, simply open two or more terminals and start one client in each, as shown below:

Start client 1 in the first terminal:

```shell
poetry run python3 client.py
python3 client.py --node-id 0 # or any integer in {0-9}
```

Alternatively you can run all of it in one shell as follows:
Start client 2 in the second terminal:

```shell
poetry run python3 server.py &
poetry run python3 client.py &
poetry run python3 client.py
python3 client.py --node-id 1 # or any integer in {0-9}
```

Alternatively, you can run all of it in one shell as follows:

```bash
bash run.sh
```

You will see that Flower is starting a federated training.
30 changes: 23 additions & 7 deletions examples/sklearn-logreg-mnist/client.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,35 @@
import argparse
import warnings
import flwr as fl
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

import flwr as fl
import utils
from flwr_datasets import FederatedDataset

if __name__ == "__main__":
# Load MNIST dataset from https://www.openml.org/d/554
(X_train, y_train), (X_test, y_test) = utils.load_mnist()
N_CLIENTS = 10

parser = argparse.ArgumentParser(description="Flower")
parser.add_argument(
"--node-id",
type=int,
choices=range(0, N_CLIENTS),
required=True,
help="Specifies the artificial data partition",
)
args = parser.parse_args()
partition_id = args.node_id

# Load the partition data
fds = FederatedDataset(dataset="mnist", partitioners={"train": N_CLIENTS})

# Split train set into 10 partitions and randomly use one for training.
partition_id = np.random.choice(10)
(X_train, y_train) = utils.partition(X_train, y_train, 10)[partition_id]
dataset = fds.load_partition(partition_id, "train").with_format("numpy")
X, y = dataset["image"].reshape((len(dataset), -1)), dataset["label"]
# Split the on-edge data: 80% train, 20% test
X_train, X_test = X[: int(0.8 * len(X))], X[int(0.8 * len(X)) :]
y_train, y_test = y[: int(0.8 * len(y))], y[int(0.8 * len(y)) :]

# Create LogisticRegression Model
model = LogisticRegression(
Expand Down
4 changes: 2 additions & 2 deletions examples/sklearn-logreg-mnist/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ authors = [

[tool.poetry.dependencies]
python = "^3.8"
flwr = "^1.0.0"
flwr = ">=1.0,<2.0"
# flwr = { path = "../../", develop = true } # Development
flwr-datasets = { extras = ["vision"], version = ">=0.0.2,<1.0.0" }
scikit-learn = "^1.1.1"
openml = "^0.12.2"
4 changes: 2 additions & 2 deletions examples/sklearn-logreg-mnist/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
flwr~=1.4.0
flwr>=1.0, <2.0
flwr-datasets[vision]>=0.0.2, <1.0.0
numpy~=1.21.1
openml~=0.13.1
scikit_learn~=1.2.2
8 changes: 5 additions & 3 deletions examples/sklearn-logreg-mnist/run.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
#!/bin/bash
set -e
cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"/

echo "Starting server"
python server.py &
sleep 3 # Sleep for 3s to give the server enough time to start

for i in `seq 0 1`; do
for i in $(seq 0 1); do
echo "Starting client $i"
python client.py &
python client.py --node-id "${i}" &
done

# This will allow you to use CTRL+C to stop all background processes
trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM
trap 'trap - SIGTERM && kill -- -$$' SIGINT SIGTERM
# Wait for all background processes to complete
wait
6 changes: 5 additions & 1 deletion examples/sklearn-logreg-mnist/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from sklearn.linear_model import LogisticRegression
from typing import Dict

from flwr_datasets import FederatedDataset


def fit_round(server_round: int) -> Dict:
"""Send round number to client."""
Expand All @@ -14,7 +16,9 @@ def get_evaluate_fn(model: LogisticRegression):
"""Return an evaluation function for server-side evaluation."""

# Load test data here to avoid the overhead of doing it in `evaluate` itself
_, (X_test, y_test) = utils.load_mnist()
fds = FederatedDataset(dataset="mnist", partitioners={"train": 10})
dataset = fds.load_full("test").with_format("numpy")
X_test, y_test = dataset["image"].reshape((len(dataset), -1)), dataset["label"]

# The `evaluate` function will be called after every round
def evaluate(server_round, parameters: fl.common.NDArrays, config):
Expand Down
44 changes: 4 additions & 40 deletions examples/sklearn-logreg-mnist/utils.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
from typing import Tuple, Union, List
import numpy as np
from sklearn.linear_model import LogisticRegression
import openml

XY = Tuple[np.ndarray, np.ndarray]
Dataset = Tuple[XY, XY]
LogRegParams = Union[XY, Tuple[np.ndarray]]
XYList = List[XY]
from flwr.common import NDArrays


def get_model_parameters(model: LogisticRegression) -> LogRegParams:
"""Returns the paramters of a sklearn LogisticRegression model."""
def get_model_parameters(model: LogisticRegression) -> NDArrays:
"""Returns the parameters of a sklearn LogisticRegression model."""
if model.fit_intercept:
params = [
model.coef_,
Expand All @@ -23,9 +18,7 @@ def get_model_parameters(model: LogisticRegression) -> LogRegParams:
return params


def set_model_params(
model: LogisticRegression, params: LogRegParams
) -> LogisticRegression:
def set_model_params(model: LogisticRegression, params: NDArrays) -> LogisticRegression:
"""Sets the parameters of a sklearn LogisticRegression model."""
model.coef_ = params[0]
if model.fit_intercept:
Expand All @@ -47,32 +40,3 @@ def set_initial_params(model: LogisticRegression):
model.coef_ = np.zeros((n_classes, n_features))
if model.fit_intercept:
model.intercept_ = np.zeros((n_classes,))


def load_mnist() -> Dataset:
    """Load the MNIST dataset using OpenML.

    OpenML dataset link: https://www.openml.org/d/554

    Returns
    -------
    Dataset
        ``((x_train, y_train), (x_test, y_test))``. The first 60000 rows of
        the downloaded table form the train split and all remaining rows
        form the test split.
    """
    mnist_openml = openml.datasets.get_dataset(554)
    Xy, _, _, _ = mnist_openml.get_data(dataset_format="array")
    X = Xy[:, :-1]  # the last column contains labels
    y = Xy[:, -1]
    # First 60000 samples consist of the train set
    x_train, y_train = X[:60000], y[:60000]
    x_test, y_test = X[60000:], y[60000:]
    return (x_train, y_train), (x_test, y_test)


def shuffle(X: np.ndarray, y: np.ndarray) -> XY:
    """Shuffle X and y with one shared random permutation.

    A single index permutation of length ``len(X)`` is drawn from a fresh,
    unseeded ``np.random.default_rng()`` and applied to both arrays, so
    entry ``i`` of the returned ``X`` still pairs with entry ``i`` of the
    returned ``y``. The inputs are indexed, not modified in place.
    """
    rng = np.random.default_rng()
    idx = rng.permutation(len(X))
    return X[idx], y[idx]


def partition(X: np.ndarray, y: np.ndarray, num_partitions: int) -> XYList:
    """Split X and y into a number of partitions.

    Both arrays are cut with ``np.array_split`` using the same
    ``num_partitions`` and the resulting chunks are paired positionally,
    giving a list of ``(X_chunk, y_chunk)`` tuples in original order.
    """
    return list(
        zip(np.array_split(X, num_partitions), np.array_split(y, num_partitions))
    )

0 comments on commit 0eec9e5

Please sign in to comment.