From b174509caa285a8cc27a591f96398462d1e6f803 Mon Sep 17 00:00:00 2001
From: Roman <56846628+RomanBredehoft@users.noreply.github.com>
Date: Tue, 11 Jun 2024 13:38:04 +0200
Subject: [PATCH] docs: add schema example for encrypted data-frames (#715)
---
docs/advanced_examples/EncryptedPandas.ipynb | 284 ++++++++++--------
.../encrypted_pandas/client_1/df_left.csv | 4 +-
.../encrypted_pandas/client_2/df_right.csv | 4 +-
.../utils/classifier_comparison_utils.py | 1 -
src/concrete/ml/pandas/_processing.py | 5 -
5 files changed, 157 insertions(+), 141 deletions(-)
diff --git a/docs/advanced_examples/EncryptedPandas.ipynb b/docs/advanced_examples/EncryptedPandas.ipynb
index 2699d9ddf..40a6a55ac 100644
--- a/docs/advanced_examples/EncryptedPandas.ipynb
+++ b/docs/advanced_examples/EncryptedPandas.ipynb
@@ -51,7 +51,7 @@
"source": [
"### User 1\n",
"\n",
- "On the first user's side, load the private data using Pandas. For this example, we took the [Tips]( https://www.kaggle.com/code/sanjanabasu/tips-dataset/input) dataset and separated it into two csv files so that: \n",
+ "On the first user's side, load the private data using Pandas. This example uses the [Tips]( https://www.kaggle.com/code/sanjanabasu/tips-dataset/input) dataset. It was split into two csv files so that: \n",
"- all columns are different, except for column \"index\", representing the initial data-frame's index\n",
"- some indexes are common, some others are not"
]
@@ -92,7 +92,7 @@
"
\n",
" \n",
" 0 | \n",
- " 1 | \n",
+ " client_1 | \n",
" 12.54 | \n",
" 2.50 | \n",
" Male | \n",
@@ -100,7 +100,7 @@
"
\n",
" \n",
" 1 | \n",
- " 2 | \n",
+ " client_2 | \n",
" 11.17 | \n",
" 1.50 | \n",
" Female | \n",
@@ -108,7 +108,7 @@
"
\n",
" \n",
" 2 | \n",
- " 3 | \n",
+ " client_3 | \n",
" 20.29 | \n",
" 2.75 | \n",
" Female | \n",
@@ -116,7 +116,7 @@
"
\n",
" \n",
" 3 | \n",
- " 4 | \n",
+ " client_4 | \n",
" 14.07 | \n",
" 2.50 | \n",
" Male | \n",
@@ -124,7 +124,7 @@
"
\n",
" \n",
" 4 | \n",
- " 5 | \n",
+ " client_5 | \n",
" 15.69 | \n",
" 3.00 | \n",
" Male | \n",
@@ -132,7 +132,7 @@
"
\n",
" \n",
" 5 | \n",
- " 6 | \n",
+ " client_6 | \n",
" 18.29 | \n",
" 3.00 | \n",
" Male | \n",
@@ -140,7 +140,7 @@
"
\n",
" \n",
" 6 | \n",
- " 7 | \n",
+ " client_7 | \n",
" 16.93 | \n",
" 3.07 | \n",
" Female | \n",
@@ -148,7 +148,7 @@
"
\n",
" \n",
" 7 | \n",
- " 8 | \n",
+ " client_8 | \n",
" 24.27 | \n",
" 2.03 | \n",
" Male | \n",
@@ -156,7 +156,7 @@
"
\n",
" \n",
" 8 | \n",
- " 9 | \n",
+ " client_9 | \n",
" 8.77 | \n",
" 2.00 | \n",
" Male | \n",
@@ -167,16 +167,16 @@
""
],
"text/plain": [
- " index total_bill tip sex smoker\n",
- "0 1 12.54 2.50 Male No\n",
- "1 2 11.17 1.50 Female No\n",
- "2 3 20.29 2.75 Female No\n",
- "3 4 14.07 2.50 Male No\n",
- "4 5 15.69 3.00 Male Yes\n",
- "5 6 18.29 3.00 Male No\n",
- "6 7 16.93 3.07 Female No\n",
- "7 8 24.27 2.03 Male Yes\n",
- "8 9 8.77 2.00 Male No"
+ " index total_bill tip sex smoker\n",
+ "0 client_1 12.54 2.50 Male No\n",
+ "1 client_2 11.17 1.50 Female No\n",
+ "2 client_3 20.29 2.75 Female No\n",
+ "3 client_4 14.07 2.50 Male No\n",
+ "4 client_5 15.69 3.00 Male Yes\n",
+ "5 client_6 18.29 3.00 Male No\n",
+ "6 client_7 16.93 3.07 Female No\n",
+ "7 client_8 24.27 2.03 Male Yes\n",
+ "8 client_9 8.77 2.00 Male No"
]
},
"execution_count": 2,
@@ -196,7 +196,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "A `ClientEngine` instance is then initialized, which is used for managing keys (encryption, decryption)."
+ "In order to be encrypted, string values first need to be mapped to integers (see section below about `get_schema`). By default, this mapping is done automatically. However, a column whose mapping was computed automatically cannot be selected as the column on which to merge encrypted data-frames. This is because such an operator requires the data-frames' string mappings to match, otherwise values would be mixed up.\n",
+ "\n",
+ "This is exactly the case here: the index column only contains string values, so its mapping must be defined by the application developer. This mapping is then shared with the second client (see below) to make sure both clients use the same one. Other non-integer columns do not require any pre-computed mapping if they are not expected to be selected for merging. All mappings are grouped per column as a dictionary, called \"schema\". \n",
+ "\n",
+ "Therefore, let's define our schema:"
]
},
{
@@ -204,6 +208,22 @@
"execution_count": 3,
"metadata": {},
"outputs": [],
+ "source": [
+ "schema = {\"index\": {index_value: i + 1 for i, index_value in enumerate(df_left[\"index\"].values)}}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A `ClientEngine` instance is then initialized, which is used for managing keys (encryption, decryption)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
"source": [
"client_1_temp_dir = TemporaryDirectory(dir=str(CLIENT_1_DIR))\n",
"client_1_temp_path = Path(client_1_temp_dir.name)\n",
@@ -218,16 +238,16 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Using the `ClientEngine` instance, the user is now able to encrypt the Pandas data-frame, building a new `EncryptedDataFrame` instance."
+ "Using the `ClientEngine` instance, the user is now able to encrypt the Pandas data-frame, building a new `EncryptedDataFrame` instance. The schema, which includes the string mapping for column `index`, is provided as well."
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
- "df_left_enc = client_1.encrypt_from_pandas(df_left)"
+ "df_left_enc = client_1.encrypt_from_pandas(df_left, schema=schema)"
]
},
{
@@ -239,14 +259,14 @@
"- floating points: the values are quantized under a certain precision, and quantization parameters (scale, zero-point) are sent to the server\n",
"- strings: the values are mapped to integers using a dict, which is sent to the server as well\n",
"\n",
- "More generally, the quantized values need be within the range currently allowed. This notably means that the number of rows allowed in a data-frame are also limited, as we expect the keys on which to merge to be unique.\n",
+ "More generally, the quantized values must be within the range currently allowed. This notably means that the number of rows allowed in a data-frame is also limited, as keys on which to merge are expected to be unique.\n",
"\n",
"Once the inputs are quantized and encrypted, the user can print the encrypted data-frame's schema. A schema represents the data-frame's columns as well as their dtype and associated quantization parameters or mappings. "
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -280,13 +300,21 @@
"
\n",
" \n",
" dtype | \n",
- " int64 | \n",
+ " object | \n",
" float64 | \n",
" float64 | \n",
" object | \n",
" object | \n",
"
\n",
" \n",
+ " str_to_int | \n",
+ " {'client_1': 1, 'client_2': 2, 'client_3': 3, ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " {'Male': 1, 'Female': 2} | \n",
+ " {'No': 1, 'Yes': 2} | \n",
+ "
\n",
+ " \n",
" scale | \n",
" NaN | \n",
" 0.903226 | \n",
@@ -302,33 +330,25 @@
" NaN | \n",
" NaN | \n",
"
\n",
- " \n",
- " str_to_int | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " {'Male': 1, 'Female': 2} | \n",
- " {'No': 1, 'Yes': 2} | \n",
- "
\n",
" \n",
"\n",
""
],
"text/plain": [
- " index total_bill tip sex \\\n",
- "dtype int64 float64 float64 object \n",
- "scale NaN 0.903226 8.917197 NaN \n",
- "zero_point NaN 6.92129 12.375796 NaN \n",
- "str_to_int NaN NaN NaN {'Male': 1, 'Female': 2} \n",
+ " index total_bill \\\n",
+ "dtype object float64 \n",
+ "str_to_int {'client_1': 1, 'client_2': 2, 'client_3': 3, ... NaN \n",
+ "scale NaN 0.903226 \n",
+ "zero_point NaN 6.92129 \n",
"\n",
- " smoker \n",
- "dtype object \n",
- "scale NaN \n",
- "zero_point NaN \n",
- "str_to_int {'No': 1, 'Yes': 2} "
+ " tip sex smoker \n",
+ "dtype float64 object object \n",
+ "str_to_int NaN {'Male': 1, 'Female': 2} {'No': 1, 'Yes': 2} \n",
+ "scale 8.917197 NaN NaN \n",
+ "zero_point 12.375796 NaN NaN "
]
},
- "execution_count": 5,
+ "execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@@ -346,7 +366,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -365,7 +385,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -398,21 +418,21 @@
" \n",
" \n",
" 0 | \n",
- " 2 | \n",
+ " client_2 | \n",
" Thur | \n",
" Lunch | \n",
" 2 | \n",
"
\n",
" \n",
" 1 | \n",
- " 5 | \n",
+ " client_5 | \n",
" Sat | \n",
" Dinner | \n",
" 3 | \n",
"
\n",
" \n",
" 2 | \n",
- " 9 | \n",
+ " client_9 | \n",
" Sun | \n",
" Dinner | \n",
" 2 | \n",
@@ -422,13 +442,13 @@
""
],
"text/plain": [
- " index day time size\n",
- "0 2 Thur Lunch 2\n",
- "1 5 Sat Dinner 3\n",
- "2 9 Sun Dinner 2"
+ " index day time size\n",
+ "0 client_2 Thur Lunch 2\n",
+ "1 client_5 Sat Dinner 3\n",
+ "2 client_9 Sun Dinner 2"
]
},
- "execution_count": 7,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -445,12 +465,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Currently, the users need to share the private keys in order to be able to run an encrypted merge. We are currently working on new techniques that would avoid this."
+ "Currently, the users need to share the private keys in order to be able to run an encrypted merge. Future work will provide new techniques to avoid this."
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -470,12 +490,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Encrypt the second user's data-frame. It is possible to get the encrypted data-frame's representation by simply returning the variable."
+ "Encrypt the second user's data-frame. Here, the same schema used for client 1 is needed in order to make sure that the custom mappings match.\n",
+ "\n",
+ "It is possible to get the encrypted data-frame's representation by simply returning the variable."
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -492,40 +514,40 @@
" \n",
"
\n",
" \n",
- " ..f915460bb8.. | \n",
- " ..2516cc2849.. | \n",
- " ..863d7dfe70.. | \n",
- " ..def69f8873.. | \n",
+ " ..3416e4aa89.. | \n",
+ " ..625630ee2d.. | \n",
+ " ..f97b291b65.. | \n",
+ " ..533b84b338.. | \n",
"
\n",
" \n",
- " ..a71b8807d3.. | \n",
- " ..3898f1290c.. | \n",
- " ..d2ed2b92b5.. | \n",
- " ..35885dd5df.. | \n",
+ " ..4411510149.. | \n",
+ " ..eb98969b1c.. | \n",
+ " ..7aaf60bdc2.. | \n",
+ " ..e92c207904.. | \n",
"
\n",
" \n",
- " ..a09e0ec21b.. | \n",
- " ..c4c723ba41.. | \n",
- " ..0e80736a37.. | \n",
- " ..249c21a1d6.. | \n",
+ " ..0f5ea3f9af.. | \n",
+ " ..dc70ca2391.. | \n",
+ " ..7e6a4a58b4.. | \n",
+ " ..3eda8cd70d.. | \n",
"
\n",
" \n",
""
],
"text/plain": [
" index day time size\n",
- "..f915460bb8.. ..2516cc2849.. ..863d7dfe70.. ..def69f8873..\n",
- "..a71b8807d3.. ..3898f1290c.. ..d2ed2b92b5.. ..35885dd5df..\n",
- "..a09e0ec21b.. ..c4c723ba41.. ..0e80736a37.. ..249c21a1d6.."
+ "..3416e4aa89.. ..625630ee2d.. ..f97b291b65.. ..533b84b338..\n",
+ "..4411510149.. ..eb98969b1c.. ..7aaf60bdc2.. ..e92c207904..\n",
+ "..0f5ea3f9af.. ..dc70ca2391.. ..7e6a4a58b4.. ..3eda8cd70d.."
]
},
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df_right_enc = client_2.encrypt_from_pandas(df_right)\n",
+ "df_right_enc = client_2.encrypt_from_pandas(df_right, schema=schema)\n",
"\n",
"df_right_enc"
]
@@ -539,7 +561,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -560,7 +582,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -572,19 +594,19 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We now chose to run a left join on the encrypted data-frames' common column \"index\" using FHE. This step can take several seconds. "
+ "The server can now run a left join on the encrypted data-frames' common column \"index\" using FHE. This step can take several seconds. "
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "Total execution time: 8.59s\n"
+ "Total execution time: 7.11s\n"
]
}
],
@@ -605,7 +627,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -625,7 +647,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -641,7 +663,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
@@ -678,7 +700,7 @@
" \n",
" \n",
" 0 | \n",
- " 1 | \n",
+ " client_1 | \n",
" 12.091429 | \n",
" 2.509286 | \n",
" Male | \n",
@@ -689,7 +711,7 @@
"
\n",
" \n",
" 1 | \n",
- " 2 | \n",
+ " client_2 | \n",
" 10.984286 | \n",
" 1.500000 | \n",
" Female | \n",
@@ -700,7 +722,7 @@
"
\n",
" \n",
" 2 | \n",
- " 3 | \n",
+ " client_3 | \n",
" 19.841429 | \n",
" 2.733571 | \n",
" Female | \n",
@@ -711,7 +733,7 @@
"
\n",
" \n",
" 3 | \n",
- " 4 | \n",
+ " client_4 | \n",
" 14.305714 | \n",
" 2.509286 | \n",
" Male | \n",
@@ -722,7 +744,7 @@
"
\n",
" \n",
" 4 | \n",
- " 5 | \n",
+ " client_5 | \n",
" 15.412857 | \n",
" 2.957857 | \n",
" Male | \n",
@@ -733,7 +755,7 @@
"
\n",
" \n",
" 5 | \n",
- " 6 | \n",
+ " client_6 | \n",
" 18.734286 | \n",
" 2.957857 | \n",
" Male | \n",
@@ -744,7 +766,7 @@
"
\n",
" \n",
" 6 | \n",
- " 7 | \n",
+ " client_7 | \n",
" 16.520000 | \n",
" 3.070000 | \n",
" Female | \n",
@@ -755,7 +777,7 @@
"
\n",
" \n",
" 7 | \n",
- " 8 | \n",
+ " client_8 | \n",
" 24.270000 | \n",
" 2.060714 | \n",
" Male | \n",
@@ -766,7 +788,7 @@
"
\n",
" \n",
" 8 | \n",
- " 9 | \n",
+ " client_9 | \n",
" 8.770000 | \n",
" 1.948571 | \n",
" Male | \n",
@@ -780,19 +802,19 @@
""
],
"text/plain": [
- " index total_bill tip sex smoker day time size\n",
- "0 1 12.091429 2.509286 Male No NaN NaN NaN\n",
- "1 2 10.984286 1.500000 Female No Thur Lunch 2.0\n",
- "2 3 19.841429 2.733571 Female No NaN NaN NaN\n",
- "3 4 14.305714 2.509286 Male No NaN NaN NaN\n",
- "4 5 15.412857 2.957857 Male Yes Sat Dinner 3.0\n",
- "5 6 18.734286 2.957857 Male No NaN NaN NaN\n",
- "6 7 16.520000 3.070000 Female No NaN NaN NaN\n",
- "7 8 24.270000 2.060714 Male Yes NaN NaN NaN\n",
- "8 9 8.770000 1.948571 Male No Sun Dinner 2.0"
+ " index total_bill tip sex smoker day time size\n",
+ "0 client_1 12.091429 2.509286 Male No NaN NaN NaN\n",
+ "1 client_2 10.984286 1.500000 Female No Thur Lunch 2.0\n",
+ "2 client_3 19.841429 2.733571 Female No NaN NaN NaN\n",
+ "3 client_4 14.305714 2.509286 Male No NaN NaN NaN\n",
+ "4 client_5 15.412857 2.957857 Male Yes Sat Dinner 3.0\n",
+ "5 client_6 18.734286 2.957857 Male No NaN NaN NaN\n",
+ "6 client_7 16.520000 3.070000 Female No NaN NaN NaN\n",
+ "7 client_8 24.270000 2.060714 Male Yes NaN NaN NaN\n",
+ "8 client_9 8.770000 1.948571 Male No Sun Dinner 2.0"
]
},
- "execution_count": 15,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -809,12 +831,12 @@
"source": [
"### Concrete ML vs Pandas comparison\n",
"\n",
- "As this is only a demo in a notebook, we are able to compute Pandas' expected output (in a non-private setting) and compare it to the result above. "
+ "For this demo, expected output from Pandas (in a non-private setting) can be computed and compared to the result above. "
]
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -851,7 +873,7 @@
"
\n",
" \n",
" 0 | \n",
- " 1 | \n",
+ " client_1 | \n",
" 12.54 | \n",
" 2.50 | \n",
" Male | \n",
@@ -862,7 +884,7 @@
"
\n",
" \n",
" 1 | \n",
- " 2 | \n",
+ " client_2 | \n",
" 11.17 | \n",
" 1.50 | \n",
" Female | \n",
@@ -873,7 +895,7 @@
"
\n",
" \n",
" 2 | \n",
- " 3 | \n",
+ " client_3 | \n",
" 20.29 | \n",
" 2.75 | \n",
" Female | \n",
@@ -884,7 +906,7 @@
"
\n",
" \n",
" 3 | \n",
- " 4 | \n",
+ " client_4 | \n",
" 14.07 | \n",
" 2.50 | \n",
" Male | \n",
@@ -895,7 +917,7 @@
"
\n",
" \n",
" 4 | \n",
- " 5 | \n",
+ " client_5 | \n",
" 15.69 | \n",
" 3.00 | \n",
" Male | \n",
@@ -906,7 +928,7 @@
"
\n",
" \n",
" 5 | \n",
- " 6 | \n",
+ " client_6 | \n",
" 18.29 | \n",
" 3.00 | \n",
" Male | \n",
@@ -917,7 +939,7 @@
"
\n",
" \n",
" 6 | \n",
- " 7 | \n",
+ " client_7 | \n",
" 16.93 | \n",
" 3.07 | \n",
" Female | \n",
@@ -928,7 +950,7 @@
"
\n",
" \n",
" 7 | \n",
- " 8 | \n",
+ " client_8 | \n",
" 24.27 | \n",
" 2.03 | \n",
" Male | \n",
@@ -939,7 +961,7 @@
"
\n",
" \n",
" 8 | \n",
- " 9 | \n",
+ " client_9 | \n",
" 8.77 | \n",
" 2.00 | \n",
" Male | \n",
@@ -953,19 +975,19 @@
""
],
"text/plain": [
- " index total_bill tip sex smoker day time size\n",
- "0 1 12.54 2.50 Male No NaN NaN NaN\n",
- "1 2 11.17 1.50 Female No Thur Lunch 2.0\n",
- "2 3 20.29 2.75 Female No NaN NaN NaN\n",
- "3 4 14.07 2.50 Male No NaN NaN NaN\n",
- "4 5 15.69 3.00 Male Yes Sat Dinner 3.0\n",
- "5 6 18.29 3.00 Male No NaN NaN NaN\n",
- "6 7 16.93 3.07 Female No NaN NaN NaN\n",
- "7 8 24.27 2.03 Male Yes NaN NaN NaN\n",
- "8 9 8.77 2.00 Male No Sun Dinner 2.0"
+ " index total_bill tip sex smoker day time size\n",
+ "0 client_1 12.54 2.50 Male No NaN NaN NaN\n",
+ "1 client_2 11.17 1.50 Female No Thur Lunch 2.0\n",
+ "2 client_3 20.29 2.75 Female No NaN NaN NaN\n",
+ "3 client_4 14.07 2.50 Male No NaN NaN NaN\n",
+ "4 client_5 15.69 3.00 Male Yes Sat Dinner 3.0\n",
+ "5 client_6 18.29 3.00 Male No NaN NaN NaN\n",
+ "6 client_7 16.93 3.07 Female No NaN NaN NaN\n",
+ "7 client_8 24.27 2.03 Male Yes NaN NaN NaN\n",
+ "8 client_9 8.77 2.00 Male No Sun Dinner 2.0"
]
},
- "execution_count": 16,
+ "execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -980,12 +1002,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We can observe slight differences between Pandas and Concrete ML with floating points values. This is only due to quantization artifacts, as we currently only allow a few bits of precision. We can still see that both data-frames are equal under a small float relative tolerance."
+ "Slight differences can be observed between Pandas and Concrete ML with floating-point values. This is only due to quantization artifacts, as currently only 4 bits of precision are supported. Still, both data-frames are equal under a small float relative tolerance."
]
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 18,
"metadata": {},
"outputs": [
{
@@ -1007,7 +1029,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -1026,7 +1048,7 @@
"\n",
"#### Future Work\n",
"\n",
- "We are currently working on improving the encrypted data-frame feature. In the near future, we are planning on allowing bigger precisions, which would make encrypted data-frames able to handle larger integers, floating points with better precisions and more unique strings values, as well as provide more rows. We will also add support for more encrypted operations on data-frames. Additionally, we are working new techniques that would avoid users having to share a private keys between themselves. "
+ "In the near future, higher precisions will be supported, which would make encrypted data-frames able to handle larger integers, floating-point values with better precision and more unique string values, as well as more rows. Support for more encrypted operations on data-frames will also be added. While users need to share private keys with the current version of the API, threshold decryption, a multi-party key generation protocol, could allow them to compute on joint data without revealing it to each other."
]
}
],
diff --git a/docs/advanced_examples/data/encrypted_pandas/client_1/df_left.csv b/docs/advanced_examples/data/encrypted_pandas/client_1/df_left.csv
index cb141493c..11e431025 100644
--- a/docs/advanced_examples/data/encrypted_pandas/client_1/df_left.csv
+++ b/docs/advanced_examples/data/encrypted_pandas/client_1/df_left.csv
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:5eafbecd06d0ae93bcfa71cae12081575d4f740ac9f91e4cdba1dc6247e71dde
-size 222
+oid sha256:6a45941850d9f1916b83c164254bd1ac6cc1d1392878b0c2a8253d9db12d0b05
+size 285
diff --git a/docs/advanced_examples/data/encrypted_pandas/client_2/df_right.csv b/docs/advanced_examples/data/encrypted_pandas/client_2/df_right.csv
index 03333f809..493b81ca9 100644
--- a/docs/advanced_examples/data/encrypted_pandas/client_2/df_right.csv
+++ b/docs/advanced_examples/data/encrypted_pandas/client_2/df_right.csv
@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
-oid sha256:95219d1043be5bda2859aeadca26a5245f1014af99a6a402c8c42ad7c3525df7
-size 65
+oid sha256:0489dc076045f210c557f0f0867b5c197c7109547b80f8da64b36217dfd31667
+size 86
diff --git a/docs/advanced_examples/utils/classifier_comparison_utils.py b/docs/advanced_examples/utils/classifier_comparison_utils.py
index 50816c949..e8e436177 100644
--- a/docs/advanced_examples/utils/classifier_comparison_utils.py
+++ b/docs/advanced_examples/utils/classifier_comparison_utils.py
@@ -342,7 +342,6 @@ def make_classifier_comparison_from_sklearn(title, classifiers, decision_level,
# scikit-learn
concrete_model, sklearn_model = model.fit_benchmark(X_train, y_train)
- # TODO: from data or not?
sklearn_fhe_model = concrete_model.__class__.from_sklearn_model(sklearn_model, X=X_train)
# Compute the predictions in clear using the scikit-learn model
diff --git a/src/concrete/ml/pandas/_processing.py b/src/concrete/ml/pandas/_processing.py
index 5133d9abe..f47d46c77 100644
--- a/src/concrete/ml/pandas/_processing.py
+++ b/src/concrete/ml/pandas/_processing.py
@@ -128,8 +128,6 @@ def check_schema_format(pandas_dataframe: pandas.DataFrame, schema: Optional[Dic
for column_name, column_mapping in schema.items():
if column_name not in column_names:
- # TODO: Is this check actually relevant ? Can't the schema provide more columns than the
- # one found in the data-frame ?
raise ValueError(
f"Column name '{column_name}' found in the given schema cannot be found in the "
f"input data-frame. Expected one of {column_names}"
@@ -322,9 +320,6 @@ def pre_process_dtypes(
"supported."
)
- # TODO: Should all non-integers columns be considered by the schema if not None ? Currently,
- # mappings are computed automatically if schema is not set
-
return pandas_dataframe, dtype_mappings