Skip to content

Commit

Permalink
feat: support from_sklearn for trees
Browse files Browse the repository at this point in the history
Support `from_sklearn` for tree based models.

Two options:
- Quantization from thresholds: the main idea is to consider the
  thresholds of the nodes of the trees for quantization to not have
  to use data.
- Quantization from data: build a quantizer from the data provided by
  the user and quantize the thresholds based on that.

This also raises the question of non-uniform input quantization.
We could quantize the data based on the thresholds thus reducing the
number of bits required to log2(max_{feature}(node_{feature})).

That would leak the thresholds used in the model per feature but not the
structure of the tree itself while increasing significantly the number
of bits required.

We could try to automatically determine the n-bits to use to
properly represent all thresholds but this might result in a very high
bit-width.

This commit also changes the comparison so that it uses truncation
instead of rounding.
  • Loading branch information
fd0r committed May 30, 2024
1 parent 9a8121c commit af14572
Show file tree
Hide file tree
Showing 20 changed files with 1,689 additions and 63 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/refresh-one-notebook.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ on:
- FullyConnectedNeuralNetworkOnMNIST \n
- GLMComparison \n
- HealthCarePrediction \n
- ImportingFromScikitLearn \n
- KaggleTitanic \n
- KNearestNeighbors \n
- LinearRegression \n
Expand Down Expand Up @@ -66,6 +67,7 @@ env:
FullyConnectedNeuralNetworkOnMNIST: "docs/advanced_examples/FullyConnectedNeuralNetworkOnMNIST.ipynb"
GLMComparison: "docs/advanced_examples/GLMComparison.ipynb"
HealthCarePrediction: "use_case_examples/disease_prediction/HealthCarePrediction.ipynb"
ImportingFromScikitLearn: "docs/advanced_examples/ImportingFromScikitLearn.ipynb"
KaggleTitanic: "use_case_examples/titanic/KaggleTitanic.ipynb"
KNearestNeighbors: "docs/advanced_examples/KNearestNeighbors.ipynb"
LinearRegression: "docs/advanced_examples/LinearRegression.ipynb"
Expand Down
1 change: 1 addition & 0 deletions .gitleaksignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ a99389ee01cbb972e46a892d3d0e9c7f8ee23f59:use_case_examples/training/analyze.ipyn
a99389ee01cbb972e46a892d3d0e9c7f8ee23f59:use_case_examples/training/analyze.ipynb:aws-access-token:18379
f41de03048a9ed27946b875e81b34138bb4bb17b:use_case_examples/training/analyze.ipynb:aws-access-token:6404
e2904473898ddd325f245f4faca526a0e9520f49:builders/Dockerfile.zamalang-env:generic-api-key:5
7d5e885816f1f1e432dd94da38c5c8267292056a:docs/advanced_examples/XGBRegressor.ipynb:aws-access-token:1026
2 changes: 1 addition & 1 deletion deps_licenses/licenses_linux_user.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ protobuf, 3.20.3, BSD-3-Clause
psutil, 5.9.8, BSD License
python-dateutil, 2.9.0.post0, Apache Software License; BSD License
pytz, 2024.1, MIT License
requests, 2.32.1, Apache Software License
requests, 2.32.2, Apache Software License
scikit-learn, 1.1.3, BSD License
scipy, 1.10.1, BSD License
six, 1.16.0, MIT License
Expand Down
2 changes: 1 addition & 1 deletion deps_licenses/licenses_linux_user.txt.md5
Original file line number Diff line number Diff line change
@@ -1 +1 @@
5e424e2fdfb0fe158f7f91eccd528367
137e9f0fb1ee91035add06b2a5f29d41
2 changes: 1 addition & 1 deletion deps_licenses/licenses_mac_intel_user.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ protobuf, 3.20.3, BSD-3-Clause
psutil, 5.9.8, BSD License
python-dateutil, 2.9.0.post0, Apache Software License; BSD License
pytz, 2024.1, MIT License
requests, 2.32.1, Apache Software License
requests, 2.32.2, Apache Software License
scikit-learn, 1.1.3, BSD License
scipy, 1.10.1, BSD License
six, 1.16.0, MIT License
Expand Down
2 changes: 1 addition & 1 deletion deps_licenses/licenses_mac_intel_user.txt.md5
Original file line number Diff line number Diff line change
@@ -1 +1 @@
5e424e2fdfb0fe158f7f91eccd528367
137e9f0fb1ee91035add06b2a5f29d41
2 changes: 1 addition & 1 deletion deps_licenses/licenses_mac_silicon_user.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ protobuf, 3.20.3, BSD-3-Clause
psutil, 5.9.8, BSD License
python-dateutil, 2.9.0.post0, Apache Software License; BSD License
pytz, 2024.1, MIT License
requests, 2.32.1, Apache Software License
requests, 2.32.2, Apache Software License
scikit-learn, 1.1.3, BSD License
scipy, 1.10.1, BSD License
six, 1.16.0, MIT License
Expand Down
2 changes: 1 addition & 1 deletion deps_licenses/licenses_mac_silicon_user.txt.md5
Original file line number Diff line number Diff line change
@@ -1 +1 @@
5e424e2fdfb0fe158f7f91eccd528367
137e9f0fb1ee91035add06b2a5f29d41
823 changes: 823 additions & 0 deletions docs/advanced_examples/ImportingFromScikitLearn.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/advanced_examples/XGBClassifier.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -587,5 +587,5 @@
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
257 changes: 252 additions & 5 deletions docs/advanced_examples/utils/classifier_comparison_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from concrete.ml.sklearn import DecisionTreeClassifier
from concrete.ml.sklearn.base import BaseTreeEstimatorMixin
from concrete.fhe import Configuration


ALWAYS_USE_SIM = False

Expand Down Expand Up @@ -118,7 +120,7 @@ def make_classifier_comparison(title, classifiers, decision_level, verbose=False

# Compile the Concrete ML model
time_begin = time.time()
circuit = concrete_model.compile(X_train)
circuit = concrete_model.compile(X_train,)

if verbose:
print(f"Compilation time: {(time.time() - time_begin):.4f} seconds\n")
Expand Down Expand Up @@ -155,9 +157,7 @@ def make_classifier_comparison(title, classifiers, decision_level, verbose=False
sklearn_score = accuracy_score(sklearn_y_pred, y_test)
concrete_score = accuracy_score(concrete_y_pred, y_test)

is_a_tree_based_model = concrete_model.__class__ in [
DecisionTreeClassifier,
]
is_a_tree_based_model = isinstance(concrete_model, BaseTreeEstimatorMixin)

# Compile the Concrete ML model with FHE simulation mode to evaluate the domain grid
circuit = concrete_model.compile(
Expand Down Expand Up @@ -242,8 +242,255 @@ def make_classifier_comparison(title, classifiers, decision_level, verbose=False
horizontalalignment="right",
)

plt.tight_layout()
if save_plot:
plt.savefig(f"./{title}.png")

plt.show()


def make_classifier_comparison_from_sklearn(
    title,
    classifiers,
    decision_level,
    verbose=False,
    save_plot=False,
    simulate=False,
    h=0.04,
):
    """Plot scikit-learn, Concrete ML and imported-from-scikit-learn classifiers side by side.

    Three synthetic 2D data-sets (moons, circles, linearly separable) are generated. For each
    data-set and each classifier, three models are compared: the scikit-learn model, the
    Concrete ML model trained with ``fit_benchmark`` and the Concrete ML model built from the
    fitted scikit-learn model with ``from_sklearn_model``. Each one gets its own subplot showing
    its decision surface, the train/test points and its accuracy on the test set.

    Args:
        title: Figure title, also used as the file name when ``save_plot`` is set.
        classifiers: Sized iterable of ``(classifier, model_name)`` pairs. ``classifier`` is
            called without arguments and the resulting object must expose ``fit_benchmark``.
        decision_level: Level at which the decision boundary contour is drawn.
        verbose: Whether to print compilation, key generation and execution timings.
        save_plot: Whether to save the figure as ``./{title}.png``.
        simulate: Whether the test-set predictions use FHE simulation instead of actual FHE
            execution. Simulation is always used when ``ALWAYS_USE_SIM`` is set.
        h: Step of the mesh grid on which the decision surfaces are evaluated.
    """
    n_samples = 200

    # Number of models plotted per classifier: scikit-learn, Concrete ML, imported scikit-learn
    num_models = 3

    X, y = make_classification(
        n_samples=n_samples,
        n_features=2,
        n_redundant=0,
        n_informative=2,
        random_state=1,
        n_clusters_per_class=1,
    )
    assert isinstance(X, np.ndarray)
    assert isinstance(y, np.ndarray)
    # pylint: disable-next=no-member
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)

    datasets = [
        make_moons(n_samples=n_samples, noise=0.2, random_state=0),
        make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1),
        linearly_separable,
    ]

    font_size_text = 20

    # One row per data-set, one column for the input data plus one per (classifier, model)
    num_y_plots = len(datasets)
    num_x_plots = num_models * len(classifiers) + 1
    fig, axs = plt.subplots(
        num_y_plots, num_x_plots, figsize=(num_x_plots * 4, num_y_plots * 4)
    )
    fig.suptitle(title, fontsize=20)
    fig.patch.set_facecolor("white")
    plt.subplots_adjust(top=0.9)

    # The inference mode only depends on the module-level switch and the 'simulate' argument
    fhe = "simulate" if (ALWAYS_USE_SIM or simulate) else "execute"
    simulation_label = "(simulation) " if fhe == "simulate" else ""

    # Iterate over data-sets
    for i, dataset in enumerate(datasets):
        # Preprocess data-set
        X, y = dataset
        X = X.astype(np.float32)
        X = StandardScaler().fit_transform(X)

        # Split the data into training and test sets
        # Use 15 percent (30 points for a data-set of 200 points) for prediction
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.15, random_state=42
        )

        x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
        y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

        # pylint: disable-next=no-member
        cm = plt.cm.RdBu
        cm_bright = ListedColormap(["#FF0000", "#0000FF"])
        ax = axs[i, 0]
        if i == 0:
            ax.set_title("Input data", fontsize=font_size_text)

        # Plot the training points
        ax.scatter(
            X_train[:, 0],
            X_train[:, 1],
            c=y_train,
            cmap=cm_bright,
            edgecolors="k",
            label="Train data",
        )

        # Plot the testing points
        ax.scatter(
            X_test[:, 0],
            X_test[:, 1],
            marker="D",
            c=y_test,
            cmap=cm_bright,
            alpha=0.6,
            edgecolors="k",
            label="Test data",
        )
        ax.legend()

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())

        # Iterate over the given classifiers
        for j, (classifier, model_name) in enumerate(classifiers):
            # Instantiate the model
            model = classifier()

            # Train the model and retrieve both the Concrete ML model and its equivalent one from
            # scikit-learn
            concrete_model, sklearn_model = model.fit_benchmark(X_train, y_train)

            # TODO: from data or not?
            sklearn_fhe_model = concrete_model.__class__.from_sklearn_model(
                sklearn_model, X=X_train
            )

            # Compute the predictions in clear using the scikit-learn model
            sklearn_y_pred = sklearn_model.predict(X_test)

            # Compile both Concrete ML models, timing each compilation on its own so that the
            # reported durations are not mixed up with each other or with key generation
            circuits = []
            compilation_times = []
            for fhe_model in (concrete_model, sklearn_fhe_model):
                time_begin = time.time()
                circuits.append(fhe_model.compile(X_train))
                compilation_times.append(time.time() - time_begin)

            for circuit, compilation_time in zip(circuits, compilation_times):
                if verbose:
                    print(f"Compilation time: {compilation_time:.4f} seconds\n")

                # If the prediction are done in FHE, generate the key
                if not ALWAYS_USE_SIM:

                    if verbose:
                        print(
                            "Generating a key for a "
                            f"{circuit.graph.maximum_integer_bit_width()}-bit circuit"
                        )

                    time_begin = time.time()
                    circuit.client.keygen(force=False)

                    if verbose:
                        print(f"Key generation time: {time.time() - time_begin:.4f} seconds")

            # Compute the predictions in FHE (with simulation or not) using the Concrete ML model
            time_begin = time.time()
            concrete_y_pred = concrete_model.predict(X_test, fhe=fhe)

            if verbose:
                print(
                    "FHE " + simulation_label
                    + f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} "
                    "seconds per sample\n"
                )

            # Same predictions using the model imported from scikit-learn
            time_begin = time.time()
            sklearn_fhe_y_pred = sklearn_fhe_model.predict(X_test, fhe=fhe)

            if verbose:
                print(
                    "FHE " + simulation_label
                    + f"Execution time: {(time.time() - time_begin) / len(X_test):.4f} "
                    "seconds per sample\n"
                )

            # Measure the accuracy scores
            sklearn_score = accuracy_score(sklearn_y_pred, y_test)
            sklearn_fhe_score = accuracy_score(sklearn_fhe_y_pred, y_test)
            concrete_score = accuracy_score(concrete_y_pred, y_test)

            is_a_tree_based_model = isinstance(concrete_model, BaseTreeEstimatorMixin)

            # Compile the Concrete ML model with FHE simulation mode to evaluate the domain grid
            # NOTE(review): this recompiles with the same inputs as above; reusing the first
            # circuit looks possible but is left untouched here - confirm before removing
            circuit = concrete_model.compile(
                X_train,
            )

            # If the model is not a tree-based model, retrieve the maximum integer bit-width
            # reached within its circuit.
            bitwidth = None
            if not is_a_tree_based_model:
                bitwidth = circuit.graph.maximum_integer_bit_width()

            raveled_input = np.c_[xx.ravel(), yy.ravel()]

            # Plot the decision boundaries.
            # For that, a color is assigned to each point in the mesh, which is obtained as a
            # cartesian product of [x_min, x_max] with [y_min, y_max].
            if hasattr(sklearn_model, "decision_function"):
                sklearn_Z = sklearn_model.decision_function(raveled_input)
                concrete_Z = concrete_model.decision_function(raveled_input, fhe="simulate")
                sklearn_fhe_Z = sklearn_fhe_model.decision_function(
                    raveled_input, fhe="simulate"
                )
            else:
                sklearn_Z = sklearn_model.predict_proba(raveled_input.astype(np.float32))[:, 1]
                concrete_Z = concrete_model.predict_proba(raveled_input, fhe="simulate")[:, 1]
                sklearn_fhe_Z = sklearn_fhe_model.predict_proba(
                    raveled_input, fhe="simulate"
                )[:, 1]

            for k, (framework, score, Z) in enumerate(
                zip(
                    ["scikit-learn", "Concrete ML", "scikit-learn imported"],
                    [sklearn_score, concrete_score, sklearn_fhe_score],
                    [sklearn_Z, concrete_Z, sklearn_fhe_Z],
                )
            ):
                # Column 0 holds the input data, hence the offset of 1
                ax = axs[i, num_models * j + k + 1]

                # Put the result into a color plot
                Z = Z.reshape(xx.shape)
                ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)

                # Plot the training points
                ax.scatter(
                    X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors="k"
                )

                # Plot the testing points
                ax.scatter(
                    X_test[:, 0],
                    X_test[:, 1],
                    c=y_test,
                    marker="D",
                    cmap=cm_bright,
                    edgecolors="k",
                    alpha=0.6,
                )

                ax.contour(
                    xx,
                    yy,
                    Z,
                    levels=[decision_level],
                    linewidths=2,
                )

                ax.set_xlim(xx.min(), xx.max())
                ax.set_ylim(yy.min(), yy.max())
                ax.set_xticks(())
                ax.set_yticks(())

                if i == 0:
                    ax.set_title(model_name + f" ({framework})", fontsize=font_size_text)

                ax.text(
                    xx.max() - 0.3,
                    yy.min() + 0.3,
                    f"{score*100:0.1f}%",
                    size=font_size_text,
                    horizontalalignment="right",
                )

                # The bit-width is only retrieved for non tree-based Concrete ML models
                if bitwidth and framework == "Concrete ML":
                    ax.text(
                        xx.max() - 0.3,
                        yy.min() + 1.0,
                        f"bit-width={bitwidth}",
                        size=font_size_text,
                        horizontalalignment="right",
                    )

    plt.tight_layout()
    if save_plot:
        plt.savefig(f"./{title}.png")

    plt.show()
Loading

0 comments on commit af14572

Please sign in to comment.