chore: make long notebooks much faster
RomanBredehoft committed Apr 25, 2024
1 parent 1234259 commit ec0ff23
Showing 8 changed files with 155 additions and 255 deletions.
16 changes: 9 additions & 7 deletions docs/advanced_examples/ClassifierComparison.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/advanced_examples/DecisionTreeClassifier.ipynb
@@ -90,7 +90,7 @@
" cv=10,\n",
" scoring=\"average_precision\",\n",
" error_score=\"raise\",\n",
" n_jobs=-1,\n",
" n_jobs=10,\n",
")\n",
"\n",
"gs_results = grid_search.fit(x_train, y_train)\n",
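For context, a minimal sketch of the grid search this hunk touches, now capped at ten parallel workers instead of `n_jobs=-1` (all cores). The estimator and parameter grid below are assumptions — a plain scikit-learn stand-in for the notebook's Concrete ML model — and only `cv=10`, `scoring="average_precision"`, `error_score="raise"`, and `n_jobs=10` come from the diff.

```python
# Hypothetical reconstruction of the surrounding grid search.
# The estimator and param_grid are assumptions, not taken from the notebook.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)

param_grid = {"max_depth": [2, 4, 6]}  # assumed grid, for illustration only

grid_search = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid,
    cv=10,
    scoring="average_precision",
    error_score="raise",
    n_jobs=10,  # fixed worker count instead of n_jobs=-1 (all available cores)
)

gs_results = grid_search.fit(x_train, y_train)
print(gs_results.best_params_, gs_results.best_score_)
```

A fixed worker count is easier to budget on shared runners than `-1`, which grabs every available core.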
33 changes: 10 additions & 23 deletions docs/advanced_examples/LogisticRegression.ipynb

Large diffs are not rendered by default.

221 changes: 65 additions & 156 deletions docs/advanced_examples/QuantizationAwareTraining.ipynb

Large diffs are not rendered by default.

79 changes: 46 additions & 33 deletions docs/advanced_examples/XGBRegressor.ipynb

Large diffs are not rendered by default.

31 changes: 13 additions & 18 deletions docs/advanced_examples/utils/classifier_comparison_utils.py
@@ -25,9 +25,7 @@
ALWAYS_USE_SIM = False

# pylint: disable=too-many-locals,too-many-statements,too-many-branches,invalid-name
def make_classifier_comparison(title, classifiers, decision_level, verbose=False, save_plot=False):

h = 0.04 # Step size in the mesh
def make_classifier_comparison(title, classifiers, decision_level, verbose=False, save_plot=False, simulate=False, h=0.04):
n_samples = 200

X, y = make_classification(
@@ -140,13 +138,16 @@ def make_classifier_comparison(title, classifiers, decision_level, verbose=False
if verbose:
print(f"Key generation time: {time.time() - time_begin:.4f} seconds")

# Compute the predictions in FHE using the Concrete ML model
fhe = "simulate" if simulate else "execute"

# Compute the predictions in FHE (with simulation or not) using the Concrete ML model
time_begin = time.time()
concrete_y_pred = concrete_model.predict(X_test, fhe="execute")
concrete_y_pred = concrete_model.predict(X_test, fhe=fhe)

if verbose:
print(
f"FHE Execution time: {(time.time() - time_begin) / len(X_test):.4f} "
"FHE " + "(simulation) " * simulate
+ f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} "
"seconds per sample\n"
)

@@ -169,23 +170,17 @@ def make_classifier_comparison(title, classifiers, decision_level, verbose=False
if not is_a_tree_based_model:
bitwidth = circuit.graph.maximum_integer_bit_width()

raveled_input = np.c_[xx.ravel(), yy.ravel()]

# Plot the decision boundaries.
# For that, a color is assigned to each point in the mesh, which is obtained as a
# cartesian product of [x_min, x_max] with [y_min, y_max].
if hasattr(sklearn_model, "decision_function"):
sklearn_Z = sklearn_model.decision_function(np.c_[xx.ravel(), yy.ravel()])
concrete_Z = concrete_model.decision_function(
np.c_[xx.ravel(), yy.ravel()],
fhe="simulate",
)
sklearn_Z = sklearn_model.decision_function(raveled_input)
concrete_Z = concrete_model.decision_function(raveled_input, fhe="simulate")
else:
sklearn_Z = sklearn_model.predict_proba(
np.c_[xx.ravel(), yy.ravel()].astype(np.float32)
)[:, 1]
concrete_Z = concrete_model.predict_proba(
np.c_[xx.ravel(), yy.ravel()],
fhe="simulate",
)[:, 1]
sklearn_Z = sklearn_model.predict_proba(raveled_input.astype(np.float32))[:, 1]
concrete_Z = concrete_model.predict_proba(raveled_input, fhe="simulate")[:, 1]

for k, (framework, score, Z) in enumerate(
zip(
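To isolate what the new `simulate` flag does, here is a minimal sketch of the `fhe = "simulate" if simulate else "execute"` switch outside the plotting code. The dataset, the choice of `LogisticRegression`, and `n_bits=8` are assumptions for illustration; the timing print mirrors the diff, and the estimator calls follow the usual Concrete ML pattern (`fit`, `compile`, then `predict` with an `fhe` mode).

```python
# Sketch of the simulate/execute switch added to make_classifier_comparison.
# The dataset, model choice, and bit-width are assumptions for illustration.
import time

from concrete.ml.sklearn import LogisticRegression
from sklearn.datasets import make_classification


def timed_predict(concrete_model, X_test, simulate=False, verbose=True):
    """Predict with FHE simulation (fast) or real FHE execution (slow)."""
    fhe = "simulate" if simulate else "execute"
    time_begin = time.time()
    y_pred = concrete_model.predict(X_test, fhe=fhe)
    if verbose:
        print(
            "FHE " + "(simulation) " * simulate
            + f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} "
            "seconds per sample"
        )
    return y_pred


X, y = make_classification(n_samples=200, n_features=2, n_redundant=0, random_state=0)
model = LogisticRegression(n_bits=8)
model.fit(X, y)
model.compile(X)
timed_predict(model, X[:10], simulate=True)  # simulation: no encrypted execution
```

Simulation evaluates the compiled circuit without any encrypted computation, which is why switching these notebooks over to it makes them so much faster than `fhe="execute"`.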
24 changes: 9 additions & 15 deletions docs/advanced_examples/utils/scaling_comparison_utils.py
Expand Up @@ -4,7 +4,7 @@
from sklearn.metrics import accuracy_score

# pylint: disable=too-many-locals
def plot_data(axs, X_train, y_train, X_test, y_test, model, name, h = 0.04, font_size_text = 20):
def plot_data(axs, X_train, y_train, X_test, y_test, model, name, h=0.04, font_size_text=20):
# Train the model and retrieve both the Concrete ML model and its equivalent one from
# scikit-learn
concrete_model, sklearn_model = model.fit_benchmark(X_train, y_train)
@@ -29,24 +29,18 @@ def plot_data(axs, X_train, y_train, X_test, y_test, model, name, h = 0.04, font
# pylint: disable-next=no-member
cm = plt.cm.RdBu
cm_bright = ListedColormap(["#FF0000", "#0000FF"])


raveled_input = np.c_[xx.ravel(), yy.ravel()]

# Plot the decision boundaries.
# For that, a color is assigned to each point in the mesh, which is obtained as a
# cartesian product of [x_min, x_max] with [y_min, y_max].
if hasattr(sklearn_model, "decision_function"):
sklearn_Z = sklearn_model.decision_function(np.c_[xx.ravel(), yy.ravel()])
concrete_Z = concrete_model.decision_function(
np.c_[xx.ravel(), yy.ravel()],
fhe="simulate",
)
sklearn_Z = sklearn_model.decision_function(raveled_input)
concrete_Z = concrete_model.decision_function(raveled_input, fhe="simulate")
else:
sklearn_Z = sklearn_model.predict_proba(
np.c_[xx.ravel(), yy.ravel()].astype(np.float32)
)[:, 1]
concrete_Z = concrete_model.predict_proba(
np.c_[xx.ravel(), yy.ravel()],
fhe="simulate",
)[:, 1]
sklearn_Z = sklearn_model.predict_proba(raveled_input.astype(np.float32))[:, 1]
concrete_Z = concrete_model.predict_proba(raveled_input, fhe="simulate")[:, 1]

for _, (ax, framework, score, Z) in enumerate(
zip(
@@ -107,4 +101,4 @@ def plot_data(axs, X_train, y_train, X_test, y_test, model, name, h = 0.04, font
f"bit-width={bitwidth}",
size=font_size_text,
horizontalalignment="right",
)
)
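Both utility files now build the prediction grid once as `raveled_input` instead of recreating `np.c_[xx.ravel(), yy.ravel()]` for every call. Below is a small sketch of that mesh-and-ravel pattern; the classifier is an assumption (a plain scikit-learn stand-in), while the step size `h=0.04` and the grid construction match the diff.

```python
# Sketch of the mesh-grid raveling now factored out as `raveled_input`.
# The model is a plain scikit-learn classifier used as a stand-in; the
# Concrete ML path in the utilities only adds fhe="simulate" to the call.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=2, n_redundant=0, random_state=0)
model = LogisticRegression().fit(X, y)

h = 0.04  # step size in the mesh, as in the utilities' defaults
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Build the (n_points, 2) grid once and reuse it for every predictor.
raveled_input = np.c_[xx.ravel(), yy.ravel()]
Z = model.decision_function(raveled_input).reshape(xx.shape)  # ready for contourf
```

Computing the raveled grid once keeps the scikit-learn and Concrete ML predictions on exactly the same points and avoids rebuilding the same array for each predictor.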
4 changes: 2 additions & 2 deletions script/make_utils/jupyter.sh
@@ -15,7 +15,7 @@ WHAT_TO_DO="open"

# Create a list of notebooks with long execution times in order not to consider them when refreshing
# all notebooks at the same time.
LONG_EXECUTION_TIMES_NOTEBOOKS=("docs/advanced_examples/LogisticRegression.ipynb" "docs/advanced_examples/ClassifierComparison.ipynb" "docs/advanced_examples/QuantizationAwareTraining.ipynb" "docs/advanced_examples/ExperimentPrivacyTreePaper.ipynb")
LONG_EXECUTION_TIMES_NOTEBOOKS=("docs/advanced_examples/ExperimentPrivacyTreePaper.ipynb")

while [ -n "$1" ]
do
@@ -63,7 +63,7 @@
echo "" > "${FAILED_NOTEBOOKS}"

# shellcheck disable=SC2207
LIST_OF_NOTEBOOKS=($(find ./docs/ -type f -name "*.ipynb" | grep -v ".nbconvert" | grep -v "_build" | grep -v "ipynb_checkpoints"))
LIST_OF_NOTEBOOKS=($(find ./docs -type f -name "*.ipynb" | grep -v ".nbconvert" | grep -v "_build" | grep -v "ipynb_checkpoints"))

# Remove notebooks with long execution times
for NOTEBOOK_TO_REMOVE in "${LONG_EXECUTION_TIMES_NOTEBOOKS[@]}"
