feat: support from_sklearn for trees

Support `from_sklearn` for tree-based models. Two options:
- Quantization from thresholds: the main idea is to use the thresholds of the tree nodes for quantization, so that no data is required.
- Quantization from data: build a quantizer from the data provided by the user and quantize the thresholds based on that.

This also raises the question of non-uniform input quantization. We could quantize the data based on the thresholds, thus reducing the number of bits required to log2(max_{feature}(node_{feature})). That would leak the thresholds used in the model per feature, but not the structure of the tree itself, while significantly increasing the number of bits required. We could try to automatically determine the n-bits to use to properly represent all thresholds, but this might result in a very high bit-width.

This commit also changes the comparison so that it uses truncation instead of rounding.
zama-ai · May 30, 2024 · af14572 · af14572
1 parent 9a8121c
commit af14572
Show file tree

Hide file tree

Showing 20 changed files with 1,689 additions and 63 deletions.
diff --git a/.github/workflows/refresh-one-notebook.yaml b/.github/workflows/refresh-one-notebook.yaml
@@ -22,6 +22,7 @@ on:
         - FullyConnectedNeuralNetworkOnMNIST \n
         - GLMComparison \n
         - HealthCarePrediction \n
+        - ImportingFromScikitLearn \n
         - KaggleTitanic \n
         - KNearestNeighbors \n
         - LinearRegression \n
@@ -66,6 +67,7 @@ env:
   FullyConnectedNeuralNetworkOnMNIST: "docs/advanced_examples/FullyConnectedNeuralNetworkOnMNIST.ipynb" 
   GLMComparison: "docs/advanced_examples/GLMComparison.ipynb" 
   HealthCarePrediction: "use_case_examples/disease_prediction/HealthCarePrediction.ipynb" 
+  ImportingFromScikitLearn: "docs/advanced_examples/ImportingFromScikitLearn.ipynb" 
   KaggleTitanic: "use_case_examples/titanic/KaggleTitanic.ipynb" 
   KNearestNeighbors: "docs/advanced_examples/KNearestNeighbors.ipynb" 
   LinearRegression: "docs/advanced_examples/LinearRegression.ipynb" 

diff --git a/.gitleaksignore b/.gitleaksignore
@@ -8,3 +8,4 @@ a99389ee01cbb972e46a892d3d0e9c7f8ee23f59:use_case_examples/training/analyze.ipyn
 a99389ee01cbb972e46a892d3d0e9c7f8ee23f59:use_case_examples/training/analyze.ipynb:aws-access-token:18379
 f41de03048a9ed27946b875e81b34138bb4bb17b:use_case_examples/training/analyze.ipynb:aws-access-token:6404
 e2904473898ddd325f245f4faca526a0e9520f49:builders/Dockerfile.zamalang-env:generic-api-key:5
+7d5e885816f1f1e432dd94da38c5c8267292056a:docs/advanced_examples/XGBRegressor.ipynb:aws-access-token:1026
diff --git a/deps_licenses/licenses_linux_user.txt b/deps_licenses/licenses_linux_user.txt
@@ -35,7 +35,7 @@ protobuf, 3.20.3, BSD-3-Clause
 psutil, 5.9.8, BSD License
 python-dateutil, 2.9.0.post0, Apache Software License; BSD License
 pytz, 2024.1, MIT License
-requests, 2.32.1, Apache Software License
+requests, 2.32.2, Apache Software License
 scikit-learn, 1.1.3, BSD License
 scipy, 1.10.1, BSD License
 six, 1.16.0, MIT License

diff --git a/deps_licenses/licenses_linux_user.txt.md5 b/deps_licenses/licenses_linux_user.txt.md5
@@ -1 +1 @@
-5e424e2fdfb0fe158f7f91eccd528367
+137e9f0fb1ee91035add06b2a5f29d41
diff --git a/deps_licenses/licenses_mac_intel_user.txt b/deps_licenses/licenses_mac_intel_user.txt
@@ -31,7 +31,7 @@ protobuf, 3.20.3, BSD-3-Clause
 psutil, 5.9.8, BSD License
 python-dateutil, 2.9.0.post0, Apache Software License; BSD License
 pytz, 2024.1, MIT License
-requests, 2.32.1, Apache Software License
+requests, 2.32.2, Apache Software License
 scikit-learn, 1.1.3, BSD License
 scipy, 1.10.1, BSD License
 six, 1.16.0, MIT License

diff --git a/deps_licenses/licenses_mac_intel_user.txt.md5 b/deps_licenses/licenses_mac_intel_user.txt.md5
@@ -1 +1 @@
-5e424e2fdfb0fe158f7f91eccd528367
+137e9f0fb1ee91035add06b2a5f29d41
diff --git a/deps_licenses/licenses_mac_silicon_user.txt b/deps_licenses/licenses_mac_silicon_user.txt
@@ -31,7 +31,7 @@ protobuf, 3.20.3, BSD-3-Clause
 psutil, 5.9.8, BSD License
 python-dateutil, 2.9.0.post0, Apache Software License; BSD License
 pytz, 2024.1, MIT License
-requests, 2.32.1, Apache Software License
+requests, 2.32.2, Apache Software License
 scikit-learn, 1.1.3, BSD License
 scipy, 1.10.1, BSD License
 six, 1.16.0, MIT License

diff --git a/deps_licenses/licenses_mac_silicon_user.txt.md5 b/deps_licenses/licenses_mac_silicon_user.txt.md5
@@ -1 +1 @@
-5e424e2fdfb0fe158f7f91eccd528367
+137e9f0fb1ee91035add06b2a5f29d41
diff --git a/docs/advanced_examples/ImportingFromScikitLearn.ipynb b/docs/advanced_examples/ImportingFromScikitLearn.ipynb
diff --git a/docs/advanced_examples/XGBClassifier.ipynb b/docs/advanced_examples/XGBClassifier.ipynb
@@ -587,5 +587,5 @@
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/docs/advanced_examples/utils/classifier_comparison_utils.py b/docs/advanced_examples/utils/classifier_comparison_utils.py
@@ -20,7 +20,9 @@
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import StandardScaler
 
-from concrete.ml.sklearn import DecisionTreeClassifier
+from concrete.ml.sklearn.base import BaseTreeEstimatorMixin
+from concrete.fhe import Configuration
+
 
 ALWAYS_USE_SIM = False
 
@@ -118,7 +120,7 @@ def make_classifier_comparison(title, classifiers, decision_level, verbose=False
 
             # Compile the Concrete ML model
             time_begin = time.time()
-            circuit = concrete_model.compile(X_train)
+            circuit = concrete_model.compile(X_train,)
 
             if verbose:
                 print(f"Compilation time: {(time.time() - time_begin):.4f} seconds\n")
@@ -155,9 +157,7 @@ def make_classifier_comparison(title, classifiers, decision_level, verbose=False
             sklearn_score = accuracy_score(sklearn_y_pred, y_test)
             concrete_score = accuracy_score(concrete_y_pred, y_test)
 
-            is_a_tree_based_model = concrete_model.__class__ in [
-                DecisionTreeClassifier,
-            ]
+            is_a_tree_based_model = isinstance(concrete_model, BaseTreeEstimatorMixin)
 
             # Compile the Concrete ML model with FHE simulation mode to evaluate the domain grid
             circuit = concrete_model.compile(
@@ -242,8 +242,255 @@ def make_classifier_comparison(title, classifiers, decision_level, verbose=False
                         horizontalalignment="right",
                     )
 
+    plt.tight_layout()
     if save_plot:
         plt.savefig(f"./{title}.png")
 
+    plt.show()
+
+
+def make_classifier_comparison_from_sklearn(title, classifiers, decision_level, verbose=False, save_plot=False, simulate=False, h=0.04):
+    n_samples = 200
+    num_models = 3
+
+    X, y = make_classification(
+        n_samples=n_samples,
+        n_features=2,
+        n_redundant=0,
+        n_informative=2,
+        random_state=1,
+        n_clusters_per_class=1,
+    )
+    assert isinstance(X, np.ndarray)
+    assert isinstance(y, np.ndarray)
+    # pylint: disable-next=no-member
+    rng = np.random.RandomState(2)
+    X += 2 * rng.uniform(size=X.shape)
+    linearly_separable = (X, y)
+
+    datasets = [
+        make_moons(n_samples=n_samples, noise=0.2, random_state=0),
+        make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),
+        linearly_separable,
+    ]
+
+    font_size_text = 20
+
+    num_y_plots = len(datasets)
+    num_x_plots = num_models * len(classifiers) + 1
+    fig, axs = plt.subplots(num_y_plots, num_x_plots, figsize=(num_x_plots*4, num_y_plots*4))
+    fig.suptitle(title, fontsize=20)
+    fig.patch.set_facecolor("white")
+    plt.subplots_adjust(top=0.9)
+
+    # Iterate over data-sets
+    for i, dataset in enumerate(datasets):
+        # Preprocess data-set
+        X, y = dataset
+        X = X.astype(np.float32)
+        X = StandardScaler().fit_transform(X)
+
+        # Split the data into training and test sets
+        # Use 15 percent (30 points for a data-set of 200 points) for prediction
+        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
+
+        x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
+        y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
+        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
+
+        # pylint: disable-next=no-member
+        cm = plt.cm.RdBu
+        cm_bright = ListedColormap(["#FF0000", "#0000FF"])
+        ax = axs[i, 0]
+        if i == 0:
+            ax.set_title("Input data", fontsize=font_size_text)
+
+        # Plot the training points
+        ax.scatter(
+            X_train[:, 0],
+            X_train[:, 1],
+            c=y_train,
+            cmap=cm_bright,
+            edgecolors="k",
+            label="Train data",
+        )
+
+        # Plot the testing points
+        ax.scatter(
+            X_test[:, 0],
+            X_test[:, 1],
+            marker="D",
+            c=y_test,
+            cmap=cm_bright,
+            alpha=0.6,
+            edgecolors="k",
+            label="Test data",
+        )
+        ax.legend()
+
+        ax.set_xlim(xx.min(), xx.max())
+        ax.set_ylim(yy.min(), yy.max())
+        ax.set_xticks(())
+        ax.set_yticks(())
+
+        # Iterate over the given classifiers
+        for j, (classifier, model_name) in enumerate(classifiers):
+            # Instantiate the model
+            model = classifier()
+
+            # Train the model and retrieve both the Concrete ML model and its equivalent one from
+            # scikit-learn
+            concrete_model, sklearn_model = model.fit_benchmark(X_train, y_train)
+
+            # TODO: from data or not?
+            sklearn_fhe_model = concrete_model.__class__.from_sklearn_model(sklearn_model, X=X_train)
+
+            # Compute the predictions in clear using the scikit-learn model
+            sklearn_y_pred = sklearn_model.predict(X_test)
+
+            # Compile the Concrete ML model
+            time_begin = time.time()
+            cfg = Configuration(detect_overflow_in_simulation=False)
+            circuit_cml = concrete_model.compile(X_train,)
+            circuit_sklearn = sklearn_fhe_model.compile(X_train,)
+
+            fhe = "simulate"
+            for circuit in [circuit_cml, circuit_sklearn]:
+                if verbose:
+                    print(f"Compilation time: {(time.time() - time_begin):.4f} seconds\n")
+
+                # If the prediction are done in FHE, generate the key
+                if not ALWAYS_USE_SIM:
+
+                    if verbose:
+                        print(
+                            "Generating a key for a "
+                            f"{circuit.graph.maximum_integer_bit_width()}-bit circuit"
+                        )
+
+                    time_begin = time.time()
+                    circuit.client.keygen(force=False)
+
+                    if verbose:
+                        print(f"Key generation time: {time.time() - time_begin:.4f} seconds")
+
+                fhe = "simulate" if simulate else "execute"
+
+            # Compute the predictions in FHE (with simulation or not) using the Concrete ML model
+            time_begin = time.time()
+            concrete_y_pred = concrete_model.predict(X_test, fhe=fhe)
+
+            if verbose:
+                print(
+                    "FHE " + "(simulation) " * simulate
+                    + f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} "
+                    "seconds per sample\n"
+                )
+
+            time_begin = time.time()
+            sklearn_fhe_y_pred = sklearn_fhe_model.predict(X_test, fhe=fhe)
+
+            if verbose:
+                print(
+                    "FHE " + "(simulation) " * simulate
+                    + f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} "
+                    "seconds per sample\n"
+                )
+
+            # Measure the accuracy scores
+            sklearn_score = accuracy_score(sklearn_y_pred, y_test)
+            sklearn_fhe_score = accuracy_score(sklearn_fhe_y_pred, y_test)
+            concrete_score = accuracy_score(concrete_y_pred, y_test)
+
+            is_a_tree_based_model = isinstance(concrete_model, BaseTreeEstimatorMixin)
+
+            # Compile the Concrete ML model with FHE simulation mode to evaluate the domain grid
+            circuit = concrete_model.compile(
+                X_train,
+            )
+
+            # If the model is not a tree-based model, retrieve the maximum integer bit-width
+            # reached within its circuit.
+            bitwidth = None
+            if not is_a_tree_based_model:
+                bitwidth = circuit.graph.maximum_integer_bit_width()
+
+            raveled_input = np.c_[xx.ravel(), yy.ravel()]
+
+            # Plot the decision boundaries.
+            # For that, a color is assigned to each point in the mesh, which is obtained as a
+            # cartesian product of [x_min, x_max] with [y_min, y_max].
+            if hasattr(sklearn_model, "decision_function"):
+                sklearn_Z = sklearn_model.decision_function(raveled_input)
+                concrete_Z = concrete_model.decision_function(raveled_input, fhe="simulate")
+                sklearn_fhe_Z = sklearn_fhe_model.decision_function(raveled_input, fhe="simulate")
+            else:
+                sklearn_Z = sklearn_model.predict_proba(raveled_input.astype(np.float32))[:, 1]
+                concrete_Z = concrete_model.predict_proba(raveled_input, fhe="simulate")[:, 1]
+                sklearn_fhe_Z = sklearn_fhe_model.predict_proba(raveled_input, fhe="simulate")[:, 1]
+
+            for k, (framework, score, Z) in enumerate(
+                zip(
+                    ["scikit-learn", "Concrete ML", "scikit-learn imported"],
+                    [sklearn_score, concrete_score, sklearn_fhe_score],
+                    [sklearn_Z, concrete_Z, sklearn_fhe_Z],
+                )
+            ):
+                ax = axs[i, num_models * j + k + 1]
+
+                # Put the result into a color plot
+                Z = Z.reshape(xx.shape)
+                ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)
+
+                # Plot the training points
+                ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k")
+
+                # Plot the testing points
+                ax.scatter(
+                    X_test[:, 0],
+                    X_test[:, 1],
+                    c=y_test,
+                    marker="D",
+                    cmap=cm_bright,
+                    edgecolors="k",
+                    alpha=0.6,
+                )
+
+                ax.contour(
+                    xx,
+                    yy,
+                    Z,
+                    levels=[decision_level],
+                    linewidths=2,
+                )
+
+                ax.set_xlim(xx.min(), xx.max())
+                ax.set_ylim(yy.min(), yy.max())
+                ax.set_xticks(())
+                ax.set_yticks(())
+
+                if i == 0:
+                    ax.set_title(model_name + f" ({framework})", fontsize=font_size_text)
+
+                ax.text(
+                    xx.max() - 0.3,
+                    yy.min() + 0.3,
+                    f"{score*100:0.1f}%",
+                    size=font_size_text,
+                    horizontalalignment="right",
+                )
+
+                if bitwidth and framework == "Concrete ML":
+                    ax.text(
+                        xx.max() - 0.3,
+                        yy.min() + 1.0,
+                        f"bit-width={bitwidth}",
+                        size=font_size_text,
+                        horizontalalignment="right",
+                    )
+
     plt.tight_layout()
+    if save_plot:
+        plt.savefig(f"./{title}.png")
+
     plt.show()
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		5e424e2fdfb0fe158f7f91eccd528367
		137e9f0fb1ee91035add06b2a5f29d41