From b174509caa285a8cc27a591f96398462d1e6f803 Mon Sep 17 00:00:00 2001
From: Roman <56846628+RomanBredehoft@users.noreply.github.com>
Date: Tue, 11 Jun 2024 13:38:04 +0200
Subject: [PATCH] docs: add schema example for encrypted data-frames (#715)

---
 docs/advanced_examples/EncryptedPandas.ipynb  | 284 ++++++++++--------
 .../encrypted_pandas/client_1/df_left.csv     |   4 +-
 .../encrypted_pandas/client_2/df_right.csv    |   4 +-
 .../utils/classifier_comparison_utils.py      |   1 -
 src/concrete/ml/pandas/_processing.py         |   5 -
 5 files changed, 157 insertions(+), 141 deletions(-)
diff --git a/docs/advanced_examples/EncryptedPandas.ipynb b/docs/advanced_examples/EncryptedPandas.ipynb
index 2699d9ddf..40a6a55ac 100644
--- a/docs/advanced_examples/EncryptedPandas.ipynb
+++ b/docs/advanced_examples/EncryptedPandas.ipynb
@@ -51,7 +51,7 @@
    "source": [
     "### User 1\n",
     "\n",
-    "On the first user's side, load the private data using Pandas. For this example, we took the [Tips]( https://www.kaggle.com/code/sanjanabasu/tips-dataset/input) dataset and separated it into two csv files so that: \n",
+    "On the first user's side, load the private data using Pandas. This example uses the [Tips]( https://www.kaggle.com/code/sanjanabasu/tips-dataset/input) dataset. It was split into two csv files so that: \n",
     "- all columns are different, except for column \"index\", representing the initial data-frame's index\n",
     "- some indexes are common, some others are not"
    ]
@@ -92,7 +92,7 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>1</td>\n",
+       "      <td>client_1</td>\n",
        "      <td>12.54</td>\n",
        "      <td>2.50</td>\n",
        "      <td>Male</td>\n",
@@ -100,7 +100,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>2</td>\n",
+       "      <td>client_2</td>\n",
        "      <td>11.17</td>\n",
        "      <td>1.50</td>\n",
        "      <td>Female</td>\n",
@@ -108,7 +108,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>3</td>\n",
+       "      <td>client_3</td>\n",
        "      <td>20.29</td>\n",
        "      <td>2.75</td>\n",
        "      <td>Female</td>\n",
@@ -116,7 +116,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>4</td>\n",
+       "      <td>client_4</td>\n",
        "      <td>14.07</td>\n",
        "      <td>2.50</td>\n",
        "      <td>Male</td>\n",
@@ -124,7 +124,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>5</td>\n",
+       "      <td>client_5</td>\n",
        "      <td>15.69</td>\n",
        "      <td>3.00</td>\n",
        "      <td>Male</td>\n",
@@ -132,7 +132,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
-       "      <td>6</td>\n",
+       "      <td>client_6</td>\n",
        "      <td>18.29</td>\n",
        "      <td>3.00</td>\n",
        "      <td>Male</td>\n",
@@ -140,7 +140,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
-       "      <td>7</td>\n",
+       "      <td>client_7</td>\n",
        "      <td>16.93</td>\n",
        "      <td>3.07</td>\n",
        "      <td>Female</td>\n",
@@ -148,7 +148,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
-       "      <td>8</td>\n",
+       "      <td>client_8</td>\n",
        "      <td>24.27</td>\n",
        "      <td>2.03</td>\n",
        "      <td>Male</td>\n",
@@ -156,7 +156,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
-       "      <td>9</td>\n",
+       "      <td>client_9</td>\n",
        "      <td>8.77</td>\n",
        "      <td>2.00</td>\n",
        "      <td>Male</td>\n",
@@ -167,16 +167,16 @@
        "</div>"
       ],
       "text/plain": [
-       "   index  total_bill   tip     sex smoker\n",
-       "0      1       12.54  2.50    Male     No\n",
-       "1      2       11.17  1.50  Female     No\n",
-       "2      3       20.29  2.75  Female     No\n",
-       "3      4       14.07  2.50    Male     No\n",
-       "4      5       15.69  3.00    Male    Yes\n",
-       "5      6       18.29  3.00    Male     No\n",
-       "6      7       16.93  3.07  Female     No\n",
-       "7      8       24.27  2.03    Male    Yes\n",
-       "8      9        8.77  2.00    Male     No"
+       "      index  total_bill   tip     sex smoker\n",
+       "0  client_1       12.54  2.50    Male     No\n",
+       "1  client_2       11.17  1.50  Female     No\n",
+       "2  client_3       20.29  2.75  Female     No\n",
+       "3  client_4       14.07  2.50    Male     No\n",
+       "4  client_5       15.69  3.00    Male    Yes\n",
+       "5  client_6       18.29  3.00    Male     No\n",
+       "6  client_7       16.93  3.07  Female     No\n",
+       "7  client_8       24.27  2.03    Male    Yes\n",
+       "8  client_9        8.77  2.00    Male     No"
       ]
      },
      "execution_count": 2,
@@ -196,7 +196,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "A `ClientEngine` instance is then initialized, which is used for managing keys (encryption, decryption)."
+    "In order to be encrypted, string values first need to be mapped to integers (see section below about `get_schema`). By default, this mapping is done automatically. However, a column whose mapping was generated automatically cannot be selected when merging encrypted data-frames. This is because such an operator requires the data-frames' string mappings to match; otherwise, values would be mixed up.\n",
+    "\n",
+    "This is exactly the case here: the index column only contains string values, so the mapping must be defined by the application developer. This mapping will then be shared with the second client (see below) in order to make sure both mappings match. Other non-integer columns do not require any pre-computed mapping if they are not expected to be selected for merging. All mappings are grouped per column as a dictionary, called \"schema\". \n",
+    "\n",
+    "Therefore, let's define our schema:"
    ]
   },
   {
@@ -204,6 +208,22 @@
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "schema = {\"index\": {index_value: i + 1 for i, index_value in enumerate(df_left[\"index\"].values)}}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A `ClientEngine` instance is then initialized, which is used for managing keys (encryption, decryption)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "client_1_temp_dir = TemporaryDirectory(dir=str(CLIENT_1_DIR))\n",
     "client_1_temp_path = Path(client_1_temp_dir.name)\n",
@@ -218,16 +238,16 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Using the `ClientEngine` instance, the user is now able to encrypt the Pandas data-frame, building a new `EncryptedDataFrame` instance."
+    "Using the `ClientEngine` instance, the user is now able to encrypt the Pandas data-frame, building a new `EncryptedDataFrame` instance. The schema, which includes the string mapping for column `index`, is provided as well."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_left_enc = client_1.encrypt_from_pandas(df_left)"
+    "df_left_enc = client_1.encrypt_from_pandas(df_left, schema=schema)"
    ]
   },
   {
@@ -239,14 +259,14 @@
     "- floating points: the values are quantized under a certain precision, and quantization parameters (scale, zero-point) are sent to the server\n",
     "- strings: the values are mapped to integers using a dict, which is sent to the server as well\n",
     "\n",
-    "More generally, the quantized values need be within the range currently allowed. This notably means that the number of rows allowed in a data-frame are also limited, as we expect the keys on which to merge to be unique.\n",
+    "More generally, the quantized values must be within the range currently allowed. This notably means that the number of rows allowed in a data-frame is also limited, as keys on which to merge are expected to be unique.\n",
     "\n",
     "Once the inputs are quantized and encrypted, the user can print the encrypted data-frame's schema. A schema represents the data-frame's columns as well as their dtype and associated quantization parameters or mappings.  "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -280,13 +300,21 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>dtype</th>\n",
-       "      <td>int64</td>\n",
+       "      <td>object</td>\n",
        "      <td>float64</td>\n",
        "      <td>float64</td>\n",
        "      <td>object</td>\n",
        "      <td>object</td>\n",
        "    </tr>\n",
        "    <tr>\n",
+       "      <th>str_to_int</th>\n",
+       "      <td>{'client_1': 1, 'client_2': 2, 'client_3': 3, ...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>{'Male': 1, 'Female': 2}</td>\n",
+       "      <td>{'No': 1, 'Yes': 2}</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
        "      <th>scale</th>\n",
        "      <td>NaN</td>\n",
        "      <td>0.903226</td>\n",
@@ -302,33 +330,25 @@
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>str_to_int</th>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>{'Male': 1, 'Female': 2}</td>\n",
-       "      <td>{'No': 1, 'Yes': 2}</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "            index total_bill        tip                       sex  \\\n",
-       "dtype       int64    float64    float64                    object   \n",
-       "scale         NaN   0.903226   8.917197                       NaN   \n",
-       "zero_point    NaN    6.92129  12.375796                       NaN   \n",
-       "str_to_int    NaN        NaN        NaN  {'Male': 1, 'Female': 2}   \n",
+       "                                                        index total_bill  \\\n",
+       "dtype                                                  object    float64   \n",
+       "str_to_int  {'client_1': 1, 'client_2': 2, 'client_3': 3, ...        NaN   \n",
+       "scale                                                     NaN   0.903226   \n",
+       "zero_point                                                NaN    6.92129   \n",
        "\n",
-       "                         smoker  \n",
-       "dtype                    object  \n",
-       "scale                       NaN  \n",
-       "zero_point                  NaN  \n",
-       "str_to_int  {'No': 1, 'Yes': 2}  "
+       "                  tip                       sex               smoker  \n",
+       "dtype         float64                    object               object  \n",
+       "str_to_int        NaN  {'Male': 1, 'Female': 2}  {'No': 1, 'Yes': 2}  \n",
+       "scale        8.917197                       NaN                  NaN  \n",
+       "zero_point  12.375796                       NaN                  NaN  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -346,7 +366,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -365,7 +385,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -398,21 +418,21 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>2</td>\n",
+       "      <td>client_2</td>\n",
        "      <td>Thur</td>\n",
        "      <td>Lunch</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>5</td>\n",
+       "      <td>client_5</td>\n",
        "      <td>Sat</td>\n",
        "      <td>Dinner</td>\n",
        "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>9</td>\n",
+       "      <td>client_9</td>\n",
        "      <td>Sun</td>\n",
        "      <td>Dinner</td>\n",
        "      <td>2</td>\n",
@@ -422,13 +442,13 @@
        "</div>"
       ],
       "text/plain": [
-       "   index   day    time  size\n",
-       "0      2  Thur   Lunch     2\n",
-       "1      5   Sat  Dinner     3\n",
-       "2      9   Sun  Dinner     2"
+       "      index   day    time  size\n",
+       "0  client_2  Thur   Lunch     2\n",
+       "1  client_5   Sat  Dinner     3\n",
+       "2  client_9   Sun  Dinner     2"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -445,12 +465,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Currently, the users need to share the private keys in order to be able to run an encrypted merge. We are currently working on new techniques that would avoid this."
+    "Currently, the users need to share the private keys in order to be able to run an encrypted merge. Future work will provide new techniques to avoid this."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -470,12 +490,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Encrypt the second user's data-frame. It is possible to get the encrypted data-frame's representation by simply returning the variable."
+    "Encrypt the second user's data-frame. Here, the same schema used for client 1 is needed in order to make sure that the custom mappings match.\n",
+    "\n",
+    "It is possible to get the encrypted data-frame's representation by simply returning the variable."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -492,40 +514,40 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <td>..f915460bb8..</td>\n",
-       "      <td>..2516cc2849..</td>\n",
-       "      <td>..863d7dfe70..</td>\n",
-       "      <td>..def69f8873..</td>\n",
+       "      <td>..3416e4aa89..</td>\n",
+       "      <td>..625630ee2d..</td>\n",
+       "      <td>..f97b291b65..</td>\n",
+       "      <td>..533b84b338..</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>..a71b8807d3..</td>\n",
-       "      <td>..3898f1290c..</td>\n",
-       "      <td>..d2ed2b92b5..</td>\n",
-       "      <td>..35885dd5df..</td>\n",
+       "      <td>..4411510149..</td>\n",
+       "      <td>..eb98969b1c..</td>\n",
+       "      <td>..7aaf60bdc2..</td>\n",
+       "      <td>..e92c207904..</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <td>..a09e0ec21b..</td>\n",
-       "      <td>..c4c723ba41..</td>\n",
-       "      <td>..0e80736a37..</td>\n",
-       "      <td>..249c21a1d6..</td>\n",
+       "      <td>..0f5ea3f9af..</td>\n",
+       "      <td>..dc70ca2391..</td>\n",
+       "      <td>..7e6a4a58b4..</td>\n",
+       "      <td>..3eda8cd70d..</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>"
       ],
       "text/plain": [
        "         index            day           time           size\n",
-       "..f915460bb8.. ..2516cc2849.. ..863d7dfe70.. ..def69f8873..\n",
-       "..a71b8807d3.. ..3898f1290c.. ..d2ed2b92b5.. ..35885dd5df..\n",
-       "..a09e0ec21b.. ..c4c723ba41.. ..0e80736a37.. ..249c21a1d6.."
+       "..3416e4aa89.. ..625630ee2d.. ..f97b291b65.. ..533b84b338..\n",
+       "..4411510149.. ..eb98969b1c.. ..7aaf60bdc2.. ..e92c207904..\n",
+       "..0f5ea3f9af.. ..dc70ca2391.. ..7e6a4a58b4.. ..3eda8cd70d.."
       ]
      },
-     "execution_count": 9,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df_right_enc = client_2.encrypt_from_pandas(df_right)\n",
+    "df_right_enc = client_2.encrypt_from_pandas(df_right, schema=schema)\n",
     "\n",
     "df_right_enc"
    ]
@@ -539,7 +561,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -560,7 +582,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -572,19 +594,19 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We now chose to run a left join on the encrypted data-frames' common column \"index\" using FHE. This step can take several seconds.  "
+    "The server can now run a left join on the encrypted data-frames' common column \"index\" using FHE. This step can take several seconds.  "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Total execution time: 8.59s\n"
+      "Total execution time: 7.11s\n"
      ]
     }
    ],
@@ -605,7 +627,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -625,7 +647,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -641,7 +663,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -678,7 +700,7 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>1</td>\n",
+       "      <td>client_1</td>\n",
        "      <td>12.091429</td>\n",
        "      <td>2.509286</td>\n",
        "      <td>Male</td>\n",
@@ -689,7 +711,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>2</td>\n",
+       "      <td>client_2</td>\n",
        "      <td>10.984286</td>\n",
        "      <td>1.500000</td>\n",
        "      <td>Female</td>\n",
@@ -700,7 +722,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>3</td>\n",
+       "      <td>client_3</td>\n",
        "      <td>19.841429</td>\n",
        "      <td>2.733571</td>\n",
        "      <td>Female</td>\n",
@@ -711,7 +733,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>4</td>\n",
+       "      <td>client_4</td>\n",
        "      <td>14.305714</td>\n",
        "      <td>2.509286</td>\n",
        "      <td>Male</td>\n",
@@ -722,7 +744,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>5</td>\n",
+       "      <td>client_5</td>\n",
        "      <td>15.412857</td>\n",
        "      <td>2.957857</td>\n",
        "      <td>Male</td>\n",
@@ -733,7 +755,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
-       "      <td>6</td>\n",
+       "      <td>client_6</td>\n",
        "      <td>18.734286</td>\n",
        "      <td>2.957857</td>\n",
        "      <td>Male</td>\n",
@@ -744,7 +766,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
-       "      <td>7</td>\n",
+       "      <td>client_7</td>\n",
        "      <td>16.520000</td>\n",
        "      <td>3.070000</td>\n",
        "      <td>Female</td>\n",
@@ -755,7 +777,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
-       "      <td>8</td>\n",
+       "      <td>client_8</td>\n",
        "      <td>24.270000</td>\n",
        "      <td>2.060714</td>\n",
        "      <td>Male</td>\n",
@@ -766,7 +788,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
-       "      <td>9</td>\n",
+       "      <td>client_9</td>\n",
        "      <td>8.770000</td>\n",
        "      <td>1.948571</td>\n",
        "      <td>Male</td>\n",
@@ -780,19 +802,19 @@
        "</div>"
       ],
       "text/plain": [
-       "   index  total_bill       tip     sex smoker   day    time  size\n",
-       "0      1   12.091429  2.509286    Male     No   NaN     NaN   NaN\n",
-       "1      2   10.984286  1.500000  Female     No  Thur   Lunch   2.0\n",
-       "2      3   19.841429  2.733571  Female     No   NaN     NaN   NaN\n",
-       "3      4   14.305714  2.509286    Male     No   NaN     NaN   NaN\n",
-       "4      5   15.412857  2.957857    Male    Yes   Sat  Dinner   3.0\n",
-       "5      6   18.734286  2.957857    Male     No   NaN     NaN   NaN\n",
-       "6      7   16.520000  3.070000  Female     No   NaN     NaN   NaN\n",
-       "7      8   24.270000  2.060714    Male    Yes   NaN     NaN   NaN\n",
-       "8      9    8.770000  1.948571    Male     No   Sun  Dinner   2.0"
+       "      index  total_bill       tip     sex smoker   day    time  size\n",
+       "0  client_1   12.091429  2.509286    Male     No   NaN     NaN   NaN\n",
+       "1  client_2   10.984286  1.500000  Female     No  Thur   Lunch   2.0\n",
+       "2  client_3   19.841429  2.733571  Female     No   NaN     NaN   NaN\n",
+       "3  client_4   14.305714  2.509286    Male     No   NaN     NaN   NaN\n",
+       "4  client_5   15.412857  2.957857    Male    Yes   Sat  Dinner   3.0\n",
+       "5  client_6   18.734286  2.957857    Male     No   NaN     NaN   NaN\n",
+       "6  client_7   16.520000  3.070000  Female     No   NaN     NaN   NaN\n",
+       "7  client_8   24.270000  2.060714    Male    Yes   NaN     NaN   NaN\n",
+       "8  client_9    8.770000  1.948571    Male     No   Sun  Dinner   2.0"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -809,12 +831,12 @@
    "source": [
     "### Concrete ML vs Pandas comparison\n",
     "\n",
-    "As this is only a demo in a notebook, we are able to compute Pandas' expected output (in a non-private setting) and compare it to the result above. "
+    "For this demo, expected output from Pandas (in a non-private setting) can be computed and compared to the result above. "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -851,7 +873,7 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>1</td>\n",
+       "      <td>client_1</td>\n",
        "      <td>12.54</td>\n",
        "      <td>2.50</td>\n",
        "      <td>Male</td>\n",
@@ -862,7 +884,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>2</td>\n",
+       "      <td>client_2</td>\n",
        "      <td>11.17</td>\n",
        "      <td>1.50</td>\n",
        "      <td>Female</td>\n",
@@ -873,7 +895,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>3</td>\n",
+       "      <td>client_3</td>\n",
        "      <td>20.29</td>\n",
        "      <td>2.75</td>\n",
        "      <td>Female</td>\n",
@@ -884,7 +906,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>4</td>\n",
+       "      <td>client_4</td>\n",
        "      <td>14.07</td>\n",
        "      <td>2.50</td>\n",
        "      <td>Male</td>\n",
@@ -895,7 +917,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>5</td>\n",
+       "      <td>client_5</td>\n",
        "      <td>15.69</td>\n",
        "      <td>3.00</td>\n",
        "      <td>Male</td>\n",
@@ -906,7 +928,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
-       "      <td>6</td>\n",
+       "      <td>client_6</td>\n",
        "      <td>18.29</td>\n",
        "      <td>3.00</td>\n",
        "      <td>Male</td>\n",
@@ -917,7 +939,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
-       "      <td>7</td>\n",
+       "      <td>client_7</td>\n",
        "      <td>16.93</td>\n",
        "      <td>3.07</td>\n",
        "      <td>Female</td>\n",
@@ -928,7 +950,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
-       "      <td>8</td>\n",
+       "      <td>client_8</td>\n",
        "      <td>24.27</td>\n",
        "      <td>2.03</td>\n",
        "      <td>Male</td>\n",
@@ -939,7 +961,7 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
-       "      <td>9</td>\n",
+       "      <td>client_9</td>\n",
        "      <td>8.77</td>\n",
        "      <td>2.00</td>\n",
        "      <td>Male</td>\n",
@@ -953,19 +975,19 @@
        "</div>"
       ],
       "text/plain": [
-       "   index  total_bill   tip     sex smoker   day    time  size\n",
-       "0      1       12.54  2.50    Male     No   NaN     NaN   NaN\n",
-       "1      2       11.17  1.50  Female     No  Thur   Lunch   2.0\n",
-       "2      3       20.29  2.75  Female     No   NaN     NaN   NaN\n",
-       "3      4       14.07  2.50    Male     No   NaN     NaN   NaN\n",
-       "4      5       15.69  3.00    Male    Yes   Sat  Dinner   3.0\n",
-       "5      6       18.29  3.00    Male     No   NaN     NaN   NaN\n",
-       "6      7       16.93  3.07  Female     No   NaN     NaN   NaN\n",
-       "7      8       24.27  2.03    Male    Yes   NaN     NaN   NaN\n",
-       "8      9        8.77  2.00    Male     No   Sun  Dinner   2.0"
+       "      index  total_bill   tip     sex smoker   day    time  size\n",
+       "0  client_1       12.54  2.50    Male     No   NaN     NaN   NaN\n",
+       "1  client_2       11.17  1.50  Female     No  Thur   Lunch   2.0\n",
+       "2  client_3       20.29  2.75  Female     No   NaN     NaN   NaN\n",
+       "3  client_4       14.07  2.50    Male     No   NaN     NaN   NaN\n",
+       "4  client_5       15.69  3.00    Male    Yes   Sat  Dinner   3.0\n",
+       "5  client_6       18.29  3.00    Male     No   NaN     NaN   NaN\n",
+       "6  client_7       16.93  3.07  Female     No   NaN     NaN   NaN\n",
+       "7  client_8       24.27  2.03    Male    Yes   NaN     NaN   NaN\n",
+       "8  client_9        8.77  2.00    Male     No   Sun  Dinner   2.0"
       ]
      },
-     "execution_count": 16,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -980,12 +1002,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We can observe slight differences between Pandas and Concrete ML with floating points values. This is only due to quantization artifacts, as we currently only allow a few bits of precision. We can still see that both data-frames are equal under a small float relative tolerance."
+    "Slight differences can be observed between Pandas and Concrete ML with floating-point values. This is only due to quantization artifacts, as currently only 4 bits of precision are supported. Still, both data-frames are equal under a small float relative tolerance."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -1007,7 +1029,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1026,7 +1048,7 @@
     "\n",
     "#### Future Work\n",
     "\n",
-    "We are currently working on improving the encrypted data-frame feature. In the near future, we are planning on allowing bigger precisions, which would make encrypted data-frames able to handle larger integers, floating points with better precisions and more unique strings values, as well as provide more rows. We will also add support for more encrypted operations on data-frames. Additionally, we are working new techniques that would avoid users having to share a private keys between themselves. "
+    "In the near future, higher precisions will be allowed, which would make encrypted data-frames able to handle larger integers, floating-point values with better precision, more unique string values, and more rows. Support for more encrypted operations on data-frames will also be added. While users need to share private keys with the current version of the API, threshold decryption, a multi-party key generation protocol, could allow them to compute on joint data without revealing it to each other."
    ]
   }
  ],
diff --git a/docs/advanced_examples/data/encrypted_pandas/client_1/df_left.csv b/docs/advanced_examples/data/encrypted_pandas/client_1/df_left.csv
index cb141493c..11e431025 100644
--- a/docs/advanced_examples/data/encrypted_pandas/client_1/df_left.csv
+++ b/docs/advanced_examples/data/encrypted_pandas/client_1/df_left.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5eafbecd06d0ae93bcfa71cae12081575d4f740ac9f91e4cdba1dc6247e71dde
-size 222
+oid sha256:6a45941850d9f1916b83c164254bd1ac6cc1d1392878b0c2a8253d9db12d0b05
+size 285
diff --git a/docs/advanced_examples/data/encrypted_pandas/client_2/df_right.csv b/docs/advanced_examples/data/encrypted_pandas/client_2/df_right.csv
index 03333f809..493b81ca9 100644
--- a/docs/advanced_examples/data/encrypted_pandas/client_2/df_right.csv
+++ b/docs/advanced_examples/data/encrypted_pandas/client_2/df_right.csv
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:95219d1043be5bda2859aeadca26a5245f1014af99a6a402c8c42ad7c3525df7
-size 65
+oid sha256:0489dc076045f210c557f0f0867b5c197c7109547b80f8da64b36217dfd31667
+size 86
diff --git a/docs/advanced_examples/utils/classifier_comparison_utils.py b/docs/advanced_examples/utils/classifier_comparison_utils.py
index 50816c949..e8e436177 100644
--- a/docs/advanced_examples/utils/classifier_comparison_utils.py
+++ b/docs/advanced_examples/utils/classifier_comparison_utils.py
@@ -342,7 +342,6 @@ def make_classifier_comparison_from_sklearn(title, classifiers, decision_level,
             # scikit-learn
             concrete_model, sklearn_model = model.fit_benchmark(X_train, y_train)
 
-            # TODO: from data or not?
             sklearn_fhe_model = concrete_model.__class__.from_sklearn_model(sklearn_model, X=X_train)
 
             # Compute the predictions in clear using the scikit-learn model
diff --git a/src/concrete/ml/pandas/_processing.py b/src/concrete/ml/pandas/_processing.py
index 5133d9abe..f47d46c77 100644
--- a/src/concrete/ml/pandas/_processing.py
+++ b/src/concrete/ml/pandas/_processing.py
@@ -128,8 +128,6 @@ def check_schema_format(pandas_dataframe: pandas.DataFrame, schema: Optional[Dic
 
     for column_name, column_mapping in schema.items():
         if column_name not in column_names:
-            # TODO: Is this check actually relevant ? Can't the schema provide more columns than the
-            # one found in the data-frame ?
             raise ValueError(
                 f"Column name '{column_name}' found in the given schema cannot be found in the "
                 f"input data-frame. Expected one of {column_names}"
@@ -322,9 +320,6 @@ def pre_process_dtypes(
                 "supported."
             )
 
-    # TODO: Should all non-integers columns be considered by the schema if not None ? Currently,
-    # mappings are computed automatically if schema is not set
-
     return pandas_dataframe, dtype_mappings