Restore openml

adap · Nov 6, 2024 · 08138cf · 08138cf
1 parent e288878
commit 08138cf
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 24 deletions.
diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml
@@ -156,6 +156,9 @@ jobs:
 
           - directory: e2e-scikit-learn
             e2e: e2e_scikit_learn
+            dataset: |
+              import openml
+              openml.datasets.get_dataset(554)
 
           - directory: e2e-fastai
             e2e: e2e_fastai

diff --git a/e2e/e2e-scikit-learn/e2e_scikit_learn/client_app.py b/e2e/e2e-scikit-learn/e2e_scikit_learn/client_app.py
@@ -8,10 +8,12 @@
 from flwr.client import ClientApp, NumPyClient, start_client
 from flwr.common import Context
 
+# Load MNIST dataset from https://www.openml.org/d/554
+(X_train, y_train), (X_test, y_test) = utils.load_mnist()
+
 # Split train set into 10 partitions and randomly use one for training.
 partition_id = np.random.choice(10)
-X_train, X_test, y_train, y_test = utils.load_data(partition_id, num_partitions=10)
-
+(X_train, y_train) = utils.partition(X_train, y_train, 10)[partition_id]
 
 # Create LogisticRegression Model
 model = LogisticRegression(

diff --git a/e2e/e2e-scikit-learn/e2e_scikit_learn/utils.py b/e2e/e2e-scikit-learn/e2e_scikit_learn/utils.py
@@ -1,8 +1,7 @@
 from typing import List, Tuple, Union
 
 import numpy as np
-from flwr_datasets import FederatedDataset
-from flwr_datasets.partitioner import IidPartitioner
+import openml
 from sklearn.linear_model import LogisticRegression
 
 XY = Tuple[np.ndarray, np.ndarray]
@@ -51,23 +50,30 @@ def set_initial_params(model: LogisticRegression):
         model.intercept_ = np.zeros((n_classes,))
 
 
-fds = None  # Cache FederatedDataset
+def load_mnist() -> Dataset:
+    """Load a subset of the MNIST dataset using OpenML.
 
-
-def load_data(partition_id: int, num_partitions: int):
-    # Only initialize `FederatedDataset` once
-    global fds
-    if fds is None:
-        partitioner = IidPartitioner(num_partitions=num_partitions)
-        fds = FederatedDataset(
-            dataset="ylecun/mnist",
-            partitioners={"train": partitioner},
-        )
-
-    dataset = fds.load_partition(partition_id, "train").with_format("numpy")
-    X, y = dataset["image"].reshape((len(dataset), -1)), dataset["label"]
-    # Split the on edge data: 80% train, 20% test
-    X_train, X_test = X[: int(0.8 * len(X))], X[int(0.8 * len(X)) :]
-    y_train, y_test = y[: int(0.8 * len(y))], y[int(0.8 * len(y)) :]
-
-    return X_train, X_test, y_train, y_test
+    OpenML dataset link: https://www.openml.org/d/554
+    """
+    mnist_openml = openml.datasets.get_dataset(554)
+    Xy, _, _, _ = mnist_openml.get_data(dataset_format="array")
+    X = Xy[:, :-1]  # the last column contains labels
+    y = Xy[:, -1]
+    # First 60000 samples are the train set; only a subset of each split is kept
+    x_train, y_train = X[:1000], y[:1000]
+    x_test, y_test = X[60000:62000], y[60000:62000]
+    return (x_train, y_train), (x_test, y_test)
+
+
+def shuffle(X: np.ndarray, y: np.ndarray) -> XY:
+    """Shuffle X and y."""
+    rng = np.random.default_rng()
+    idx = rng.permutation(len(X))
+    return X[idx], y[idx]
+
+
+def partition(X: np.ndarray, y: np.ndarray, num_partitions: int) -> XYList:
+    """Split X and y into a number of partitions."""
+    return list(
+        zip(np.array_split(X, num_partitions), np.array_split(y, num_partitions))
+    )
diff --git a/e2e/e2e-scikit-learn/pyproject.toml b/e2e/e2e-scikit-learn/pyproject.toml
@@ -13,7 +13,6 @@ authors = [
 ]
 dependencies = [
     "flwr[simulation,rest] @ {root:parent:parent:uri}",
-    "flwr-datasets[vision]>=0.3.0",
     "scikit-learn>=1.1.1,<2.0.0",
     "numpy<2.0.0",
 ]