From d546e7d56110aaf701aa0f186b237e4fdd53a4df Mon Sep 17 00:00:00 2001 From: james-choncholas Date: Wed, 30 Oct 2024 06:45:50 +0000 Subject: [PATCH] Update examples and add demos for tf-shell features. --- examples/automatic_parameters_demo.ipynb | 162 +++++ examples/benchmark.ipynb | 40 +- examples/distributed_demo.ipynb | 183 ++++++ examples/intro.ipynb | 158 ++++- examples/intro_with_auto_param.ipynb | 157 ----- examples/label_dp_sgd.ipynb | 236 -------- examples/label_dp_sgd_post_scale.ipynb | 503 ---------------- examples/label_dp_sgd_sentiment.ipynb | 716 ----------------------- examples/parallelization_demo.ipynb | 181 ++++++ 9 files changed, 698 insertions(+), 1638 deletions(-) create mode 100644 examples/automatic_parameters_demo.ipynb create mode 100644 examples/distributed_demo.ipynb delete mode 100644 examples/intro_with_auto_param.ipynb delete mode 100644 examples/label_dp_sgd.ipynb delete mode 100644 examples/label_dp_sgd_post_scale.ipynb delete mode 100644 examples/label_dp_sgd_sentiment.ipynb create mode 100644 examples/parallelization_demo.ipynb diff --git a/examples/automatic_parameters_demo.ipynb b/examples/automatic_parameters_demo.ipynb new file mode 100644 index 0000000..3c14dc4 --- /dev/null +++ b/examples/automatic_parameters_demo.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Automatic HE Parameter Selection\n", + "\n", + "This notebook demonstrates how to use `tf-shell` to automatically choose low\n", + "level parameters (like the plaintext modulus and ciphertext moduli) for the BGV\n", + "HE scheme. While these parameters can be chosen manually, as shown in other\n", + "examples, it is convenient to let `tf-shell` choose them.\n", + "\n", + "Since the HE parameters depend on the depth of a computation, they must be\n", + "fixed before the computation starts. `tf-shell` does this by extending\n", + "TensorFlow's graph compiler, grappler, with some convenient\n", + "homomorphic-encryption (HE) specific features, one of which is automatic\n", + "parameter selection.\n", + "\n", + "As such, automatic parameter selection is only available when using TensorFlow's\n", + "deferred execution mode (graph mode). This way, the graph is available for\n", + "inspection (to estimate ciphertext noise growth) and modification (to inject\n", + "generated parameters) before it is executed." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-10-29 19:56:56.261116: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-10-29 19:56:56.287144: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "import tensorflow as tf\n", + "import tf_shell" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "a = [1, 2, 3]\n", + "b = [4, 5, 6]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here we define the function we'd like to compute. 
TensorFlow will first trace\n",
+    "this function (without executing it) to build a graph of the computation. Then,\n",
+    "during a graph compiler optimization pass, `tf-shell` will replace the\n",
+    "\"autocontext\" placeholder Op with parameters generated for this specific\n",
+    "computation based on statistical estimation of the noise growth and the initial\n",
+    "plaintext size.\n",
+    "\n",
+    "Note that the `create_autocontext64` function must be called from inside a\n",
+    "`tf.function` so that it executes in deferred mode."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tf.function\n",
+    "def foo(cleartext_a, cleartext_b):\n",
+    "    shell_context = tf_shell.create_autocontext64(\n",
+    "        log2_cleartext_sz=4, # Maximum size of the cleartexts (including the scaling factor).\n",
+    "        scaling_factor=1, # The scaling factor (analogous to fixed-point but not necessarily base 2).\n",
+    "        noise_offset_log2=0, # Extra buffer for noise growth.\n",
+    "    )\n",
+    "    key = tf_shell.create_key64(shell_context)\n",
+    "    a = tf_shell.to_encrypted(cleartext_a, key, shell_context)\n",
+    "    b = tf_shell.to_shell_plaintext(cleartext_b, shell_context)\n",
+    "\n",
+    "    intermediate = a * b\n",
+    "    result = tf_shell.to_tensorflow(intermediate, key)\n",
+    "    return result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Selected BGV parameters:\n",
+      "log_n: 11\n",
+      "t: 12289 (14 bits, min:4)\n",
+      "qs: 70371484213249 (47 bits, min:47 = t:14 + noise:33 + offset:0)\n",
+      "INFO: Generating key\n",
+      "tf.Tensor([ 4 10 18 ... 0 0 0], shape=(2048,), dtype=int32)\n"
+     ]
+    }
+   ],
+   "source": [
+    "tf_shell.enable_optimization() # Enable the autoparameter graph optimization pass.\n",
+    "\n",
+    "a = [1, 2, 3]\n",
+    "b = [4, 5, 6]\n",
+    "c = foo(a, b)\n",
+    "\n",
+    "print(c)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`tf-shell` selected the plaintext modulus `t` to be at least 4 bits, and the\n",
+    "ciphertext modulus `Q` as a product of smaller moduli `qs`, which represent\n",
+    "ciphertexts in RNS (residue number system) form. `Q` is chosen to be large\n",
+    "enough to support the noise growth of the computation without overflowing.\n",
+    "Since this computation is small, only one ciphertext modulus is needed.\n",
+    "\n",
+    "Note that `tf-shell` treats the first dimension of data as the packing dimension\n",
+    "of the BGV scheme (the slotting dimension). When the function is first traced,\n",
+    "the size of this dimension is unknown, because the ring degree of the\n",
+    "ciphertexts has not yet been chosen; it depends on `Q`, which in turn depends on\n",
+    "the estimated noise growth. In the example above, the three elements of the\n",
+    "input vectors are packed into this first dimension for efficiency purposes. The\n",
+    "remaining slots in the ciphertexts went unused."
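+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For comparison, the parameters selected above could instead be supplied\n",
+    "manually with `create_context64`, as done in other examples. The sketch below\n",
+    "is illustrative only: it simply mirrors the values printed for this particular\n",
+    "run, and a deeper computation would need different moduli.\n",
+    "\n",
+    "```python\n",
+    "# A sketch only: these values mirror the parameters selected above.\n",
+    "context = tf_shell.create_context64(\n",
+    "    log_n=11,\n",
+    "    main_moduli=[70371484213249],\n",
+    "    plaintext_modulus=12289,\n",
+    "    scaling_factor=1,\n",
+    ")\n",
+    "key = tf_shell.create_key64(context)\n",
+    "enc_a = tf_shell.to_encrypted([1, 2, 3], key, context)\n",
+    "pt_b = tf_shell.to_shell_plaintext([4, 5, 6], context)\n",
+    "print(tf_shell.to_tensorflow(enc_a * pt_b, key))\n",
+    "```"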
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/benchmark.ipynb b/examples/benchmark.ipynb index 8d4abce..8671910 100644 --- a/examples/benchmark.ipynb +++ b/examples/benchmark.ipynb @@ -4,10 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Introduction to tf-shell\n", - "\n", - "To get started, `pip install tf-shell`. tf-shell has a few modules, the one used\n", - "in this notebook is `tf_shell`." + "# Benchmarking tf-shell" ] }, { @@ -19,10 +16,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-09-12 15:46:51.029615: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-09-12 15:46:51.173136: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "2024-10-29 21:41:36.488386: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-10-29 21:41:36.514318: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO: Generating key\n", + "INFO: Generating rotation key\n", + "INFO: Generating rotation key\n" + ] } ], "source": [ @@ -58,7 +64,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.5914442079999844\n" + "0.5060953950014664\n" ] } ], @@ -79,7 +85,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.9706441910000194\n" + "0.17610475800029235\n" ] } ], @@ -100,7 +106,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.5826959020000686\n" + "0.5579067959988606\n" ] } ], @@ -121,7 +127,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.0182888270001058\n" + "0.7779848270001821\n" ] } ], @@ -142,7 +148,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.1156545989999813\n" + "0.44140414300272823\n" ] } ], @@ -163,7 +169,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "1.2263275259999773\n" + "1.415064643999358\n" ] } ], @@ -184,7 +190,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.9413914479998766\n" + "0.8980931189980765\n" ] } ], @@ -205,7 +211,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.7464927729999999\n" + "0.7085658289979619\n" ] } ], @@ -226,7 +232,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "26.568641370000023\n" + "27.201639023998723\n" ] } ], @@ -247,7 +253,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "350.995303421\n" + "360.84974249600054\n" ] } ], @@ 
-268,7 +274,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "5.003688195999985\n"
+     "6.758062808999966\n"
     ]
    }
   ],
diff --git a/examples/distributed_demo.ipynb b/examples/distributed_demo.ipynb
new file mode 100644
index 0000000..b91652b
--- /dev/null
+++ b/examples/distributed_demo.ipynb
@@ -0,0 +1,183 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using tf-shell with Multiple Machines\n",
+    "\n",
+    "`tf-shell` can be run on multiple machines using TensorFlow device placement.\n",
+    "\n",
+    "A TensorFlow cluster is set up by each machine running something like the\n",
+    "following:\n",
+    "\n",
+    "```python\n",
+    "cluster = tf.train.ClusterSpec('''{\n",
+    "    \"alice\": [\"alice.com:2222\"],\n",
+    "    \"bob\": [\"bob.com:2223\"],\n",
+    "}''')\n",
+    "\n",
+    "server = tf.distribute.Server(\n",
+    "    cluster,\n",
+    "    job_name=\"alice\",  # or \"bob\"\n",
+    "    task_index=0,\n",
+    ")\n",
+    "\n",
+    "tf.config.experimental_connect_to_cluster(cluster)\n",
+    "```\n",
+    "\n",
+    "In this notebook, we will emulate distributed execution on a single machine by\n",
+    "pointing both alice and bob at the special device\n",
+    "`/job:localhost/replica:0/task:0/device:CPU:0` and skipping the server setup."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "alice = \"/job:localhost/replica:0/task:0/device:CPU:0\"\n",
+    "bob = \"/job:localhost/replica:0/task:0/device:CPU:0\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Since `tf-shell` works with sensitive cryptographic material, it is important,\n",
+    "for security reasons, to tell TensorFlow to place ops only on the devices to\n",
+    "which they were explicitly assigned."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-10-30 05:37:22.799257: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+      "2024-10-30 05:37:22.825891: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import tensorflow as tf\n",
+    "tf.config.set_soft_device_placement(False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TensorFlow makes it easy to schedule operations on specific parties. In this\n",
+    "example, Alice will generate a secret key, encrypt the input `x`, and send it to\n",
+    "Bob. Bob will square the value and return it to Alice, who will decrypt it.\n",
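+    "\n",
+    "The same protocol runs unchanged on a real cluster; only the device strings\n",
+    "change to name each party's job from the `ClusterSpec` above, e.g.:\n",
+    "\n",
+    "```python\n",
+    "# Hypothetical device strings for a true two-machine deployment.\n",
+    "alice = \"/job:alice/replica:0/task:0/device:CPU:0\"\n",
+    "bob = \"/job:bob/replica:0/task:0/device:CPU:0\"\n",
+    "```"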
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selected BGV parameters:\n", + "log_n: 11\n", + "t: 12289 (14 bits, min:6)\n", + "qs: 2251800363651073 (52 bits, min:52 = t:14 + noise:38 + offset:0)\n", + "INFO: Generating key\n", + "tf.Tensor(25.0, shape=(), dtype=float32)\n" + ] + } + ], + "source": [ + "import tf_shell\n", + "\n", + "param_cache = \"/tmp/example_protocol_param_cache\"\n", + "\n", + "@tf.function\n", + "def example_protocol(x):\n", + " with tf.device(alice):\n", + " shell_context = tf_shell.create_autocontext64(\n", + " log2_cleartext_sz=6,\n", + " scaling_factor=1,\n", + " noise_offset_log2=0,\n", + " cache_path=param_cache,\n", + " )\n", + " key = tf_shell.create_key64(shell_context, param_cache)\n", + "\n", + " enc_x = tf_shell.to_encrypted(x, key, shell_context)\n", + "\n", + " with tf.device(bob):\n", + " enc_x_squared = enc_x * enc_x\n", + " \n", + " with tf.device(alice):\n", + " x_squared = tf_shell.to_tensorflow(enc_x_squared, key)\n", + " return x_squared\n", + "\n", + "# Turn on shell graph optimizers and deferred execution to use autocontext.\n", + "tf_shell.enable_optimization()\n", + "tf.config.run_functions_eagerly(False)\n", + "\n", + "res = example_protocol(tf.constant([5.0]))\n", + "print(res[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we used the `cache_path` arguments when creating the context\n", + "and key. This prevents regenerating the parameters every time the code is run.\n", + "If we call the function again, these parameters will be loaded from the cache." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO: Generating key\n", + "tf.Tensor(36.0, shape=(), dtype=float32)\n" + ] + } + ], + "source": [ + "res = example_protocol(tf.constant([6.0]))\n", + "print(res[0])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/intro.ipynb b/examples/intro.ipynb index 79a4c31..75c6634 100644 --- a/examples/intro.ipynb +++ b/examples/intro.ipynb @@ -19,14 +19,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024-09-12 07:20:19.615023: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-09-12 07:20:19.638581: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "2024-10-29 18:41:45.349763: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+      "2024-10-29 18:41:45.375597: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
       "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
      ]
     }
    ],
    "source": [
-    "import tf_shell"
+    "import tf_shell\n",
+    "import tensorflow as tf"
    ]
   },
   {
@@ -48,7 +49,15 @@
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO: Generating key\n"
+     ]
+    }
+   ],
    "source": [
     "context = tf_shell.create_context64(\n",
     "    log_n=10,\n",
@@ -82,12 +91,11 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "The first 3 elements of the data are [3.024391 2.5263894 6.0816336]\n"
+     "The first 3 elements of the data are [4.8077097 6.9659996 0.500679 ]\n"
     ]
    }
   ],
   "source": [
-    "import tensorflow as tf\n",
     "tf_data = tf.random.uniform([context.num_slots, 2], dtype=tf.float32, maxval=10)\n",
     "print(f\"The first 3 elements of the data are {tf_data[:3, 0]}\")\n",
     "\n",
@@ -145,9 +153,9 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "enc: [3. 2.6666667 6. ]\n",
-     "enc + enc: [ 6. 5.3333335 12. ]\n",
-     "enc * enc: [ 9. 7.111111 36. ]\n"
+     "enc: [4.6666665 7. 0.6666667]\n",
+     "enc + enc: [ 9.333333 14. 1.3333334]\n",
+     "enc * enc: [21.777779 49. 0.44444445]\n"
     ]
    }
   ],
@@ -160,6 +168,138 @@
     "print(f\"enc + enc: {add[:3, 0]}\")\n",
     "print(f\"enc * enc: {mul[:3, 0]}\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Scaling Factors\n",
+    "tf-shell keeps track of scaling factors, so you don't have to.\n",
+    "\n",
+    "In this example, we'll encrypt a value with scaling factor 3 (defined in\n",
+    "the context above) and multiply it by a plaintext. The resulting ciphertext will\n",
+    "have a scaling factor of 9."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      ".33 * .33 = 0.1111111119389534\n",
+      "Scaling factor: 9\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = tf.constant([.33], dtype=tf.float32)\n",
+    "enc_a = tf_shell.to_encrypted(a, secret_key, context)\n",
+    "\n",
+    "enc_mul = enc_a * .33\n",
+    "mul = tf_shell.to_tensorflow(enc_mul, secret_key)\n",
+    "print(f\".33 * .33 = {mul[0]}\")\n",
+    "print(f\"Scaling factor: {enc_mul.scaling_factor}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If we perform another multiplication, the scaling factor becomes 27."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      ".33 * .33 * .33 = 0.03703703731298447\n",
+      "Scaling factor: 27\n"
+     ]
+    }
+   ],
+   "source": [
+    "enc_mul_mul = enc_mul * .33\n",
+    "mul_mul = tf_shell.to_tensorflow(enc_mul_mul, secret_key)\n",
+    "print(f\".33 * .33 * .33 = {mul_mul[0]}\")\n",
+    "print(f\"Scaling factor: {enc_mul_mul.scaling_factor}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The output scaling factor of multiplication is the product of the scaling\n",
+    "factors of the operands, e.g. sf=3 \* sf=9 -> sf=27.\n",
+    "\n",
+    "Additive operations, on the other hand, have their scaling factors matched to\n",
+    "the LCM (least common multiple) of the two. 
Here we'll add a ciphertext with\n", + "scaling factor 9 to a ciphertext with scaling factor 27. The result will have a\n", + "scaling factor of 27." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(.33 * .33) + (.33 * .33 * .33) = 0.14814814925193787\n", + "Scaling factor: 27\n" + ] + } + ], + "source": [ + "enc_mul_mul = enc_mul + enc_mul_mul\n", + "mul_mul = tf_shell.to_tensorflow(enc_mul_mul, secret_key)\n", + "print(f\"(.33 * .33) + (.33 * .33 * .33) = {mul_mul[0]}\")\n", + "print(f\"Scaling factor: {enc_mul_mul.scaling_factor}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modulus Switching\n", + "`tf-shell` supports modulus switching and will keep track of the moduli just\n", + "like it does with scaling factors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sum: 0.6666666865348816\n", + "Level of arg1: 2 arg2: 1 sum: 1\n" + ] + } + ], + "source": [ + "a = tf.constant([.33], dtype=tf.float32)\n", + "enc_a = tf_shell.to_encrypted(a, secret_key, context)\n", + "\n", + "mod_reduced_a = tf_shell.mod_reduce_tensor64(enc_a)\n", + "reduced_sum = enc_a + mod_reduced_a # enc_a is mod_reduced before the addition.\n", + "\n", + "print(f\"Sum: {tf_shell.to_tensorflow(reduced_sum, secret_key)[0]}\")\n", + "print(f\"Level of arg1: {enc_a.level} arg2: {mod_reduced_a.level} sum: {reduced_sum.level}\")\n" + ] } ], "metadata": { diff --git a/examples/intro_with_auto_param.ipynb b/examples/intro_with_auto_param.ipynb deleted file mode 100644 index 7ee13a7..0000000 --- a/examples/intro_with_auto_param.ipynb +++ /dev/null @@ -1,157 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Introduction to tf-shell with AutoParameter Optimization\n", - "\n", - "To get started, `pip install tf-shell`.\n", - "\n", - "`tf-shell` is tightly integrated with TensorFlow's graph execution engine and\n", - "extends TensorFlow's graph compiler with some convenient\n", - "homomorphic-encryption (HE) specific features.\n", - "\n", - "In this notebook, we will demonstrate how to use `tf-shell` to automatically\n", - "choose low level parameters (like the plaintext modulus and ciphertext moduli)\n", - "for the BGV HE scheme.\n", - "These moduli depend on the depth of a computation, and must be fixed before\n", - "the computation starts.\n", - "In other examples, these moduli are chosen manually however it is convenient to\n", - "let `tf-shell` automatically choose these parameters for you." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-10-07 22:50:25.411435: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-10-07 22:50:25.436845: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" - ] - } - ], - "source": [ - "import tensorflow as tf\n", - "import tf_shell" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "a = [1, 2, 3]\n", - "b = [4, 5, 6]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here we define the function we'd like to compute.\n", - "TensorFlow will first trace this function (without executing it) to build a\n", - "graph of the computation.\n", - "Then, during a graph compiler optimization pass, `tf-shell` will replace the\n", - "\"autoparameters\" placeholder with moduli generated for this specific computation\n", - "based on statistical estimation of the noise growth, the initial plaintext size,\n", - "and the scaling factor.\n", - "\n", - "Note, the `create_autocontext64` function must be called from inside a\n", - "`tf.function` in order to execute in non-eager (deferred) mode.\n", - "This ensures TensorFlow creates the computation graph which is required to\n", - "choose the moduli." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "@tf.function\n", - "def foo(cleartext_a, cleartext_b):\n", - " shell_context = tf_shell.create_autocontext64(\n", - " log2_cleartext_sz=4, # Maximum size of the cleartexts (ignoring the scaling factor).\n", - " scaling_factor=1, # The scaling factor (analagous to fixed-point but not base 2).\n", - " noise_offset_log2=0, # Extra buffer for noise growth.\n", - " )\n", - " key = tf_shell.create_key64(shell_context)\n", - " a = tf_shell.to_encrypted(cleartext_a, key, shell_context)\n", - " b = tf_shell.to_shell_plaintext(cleartext_b, shell_context)\n", - "\n", - " intermediate = a * b\n", - " result = tf_shell.to_tensorflow(intermediate, key)\n", - " return result" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Selected BGV parameters:\n", - "log_n: 11\n", - "t: 12289 (14 bits)\n", - "qs: 68909633537 (37 bits)\n", - "INFO: Generating key\n", - "tf.Tensor([ 4 10 18 ... 0 0 0], shape=(2048,), dtype=int32)\n" - ] - } - ], - "source": [ - "tf_shell.enable_optimization()\n", - "\n", - "a = [1, 2, 3]\n", - "b = [4, 5, 6]\n", - "c = foo(a, b)\n", - "\n", - "print(c)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Recall `tf-shell` treats the first dimension of data as the packing dimension\n", - "of the BGV scheme.\n", - "In the example above, the three elements of the input vectors are packed into\n", - "this first dimension for efficiency purposes.\n", - "\n", - "Note however, that the remaining slots in the ciphertexts went unused and the\n", - "number of slots was chosen automatically during graph optimization." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/label_dp_sgd.ipynb b/examples/label_dp_sgd.ipynb deleted file mode 100644 index d080820..0000000 --- a/examples/label_dp_sgd.ipynb +++ /dev/null @@ -1,236 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Label DP SGD\n", - "\n", - "This notebook walks through how to train a model to recognize hand written\n", - "digits using label differentially private gradient decent and the MNIST dataset.\n", - "In this setting, one party has the images and the other party has the labels.\n", - "They would like to collaborate to train a model without revealing their data.\n", - "\n", - "Before starting, install the tf-shell package.\n", - "\n", - "```bash\n", - "pip install tf-shell\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, import some modules and set up tf-shell. The parameters are for the SHELL\n", - "encryption library, which tf-shell uses, and mostly depend on the multiplicative\n", - "depth of the computation to be performed. This example performs back\n", - "propagation, thus the multiplicative depth is determined by the number of\n", - "layers. For more information, see [SHELL](https://github.com/google/shell)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-09-29 06:18:05.175618: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-09-29 06:18:05.198038: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" - ] - } - ], - "source": [ - "import time\n", - "import os\n", - "from datetime import datetime\n", - "import tensorflow as tf\n", - "import keras\n", - "import numpy as np\n", - "import tf_shell\n", - "import tf_shell_ml" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Set a default batch size (must be less that the chosen ciphertext ring degree\n", - "# so anything less than 2**10 is fine). 
This will be used for validation, but\n", - "# for training using autocontext (as below) the batch size is determined by the\n", - "# ciphertext parameters.\n", - "batch_size = 2**10\n", - "\n", - "# Setup the dataset\n", - "(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n", - "x_train, x_test = np.reshape(x_train, (-1, 784)), np.reshape(x_test, (-1, 784))\n", - "x_train, x_test = x_train / np.float32(255.0), x_test / np.float32(255.0)\n", - "y_train, y_test = tf.one_hot(y_train, 10), tf.one_hot(y_test, 10)\n", - "\n", - "epochs = 1\n", - "train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))\n", - "train_dataset = (\n", - " train_dataset.shuffle(buffer_size=2048)\n", - " .batch(batch_size, drop_remainder=True)\n", - " .repeat(count=epochs)\n", - ")\n", - "\n", - "val_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))\n", - "val_dataset = val_dataset.batch(batch_size, drop_remainder=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Final parameters:\n", - "log_n: 12\n", - "t: 65537\n", - "qs: 288230376151760897 288230376152137729 \n" - ] - } - ], - "source": [ - "# Turn on the shell optimizer to use autocontext.\n", - "tf_shell.enable_optimization()\n", - "use_fast_reduce_sum = True\n", - "\n", - "m = tf_shell_ml.TfShellSequential(\n", - " [\n", - " tf_shell_ml.ShellDense(\n", - " 64,\n", - " activation=tf_shell_ml.relu,\n", - " activation_deriv=tf_shell_ml.relu_deriv,\n", - " use_fast_reduce_sum=use_fast_reduce_sum,\n", - " ),\n", - " tf_shell_ml.ShellDense(\n", - " 10,\n", - " activation=tf.nn.softmax,\n", - " use_fast_reduce_sum=use_fast_reduce_sum,\n", - " ),\n", - " ],\n", - " lambda: tf_shell.create_autocontext64(\n", - " log2_cleartext_sz=12,\n", - " scaling_factor=3,\n", - " noise_offset_log2=32,\n", - " ),\n", - " True,\n", - ")\n", - "\n", - "m.compile(\n", - " shell_loss=tf_shell_ml.CategoricalCrossentropy(),\n", - " optimizer=tf.keras.optimizers.Adam(0.1),\n", - " loss=tf.keras.losses.CategoricalCrossentropy(),\n", - " metrics=[tf.keras.metrics.CategoricalAccuracy()],\n", - ")\n", - "\n", - "train_datset = m.set_dataset_batching(train_dataset)\n", - "\n", - "# m.build([batch_size, 784]) # do not build if using autoparams\n", - "# m(train_dataset)\n", - "# m.summary()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-09-29 06:18:37.535177: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:104] Profiler session initializing.\n", - "2024-09-29 06:18:37.535200: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:119] Profiler session started.\n", - "2024-09-29 06:18:37.535262: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:131] Profiler session tear down.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Final parameters:\n", - "log_n: 12\n", - "t: 65537\n", - "qs: 288230376151760897 288230376152137729 \n", - "58/58 [==============================] - 1160s 20s/step - num_slots: 4096.0000 - val_categorical_accuracy: 0.1018\n" - ] - } - ], - "source": [ - "# Set up tensorboard logging.\n", - "stamp = datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n", - "fast_str = \"-fast\" if use_fast_reduce_sum else \"\"\n", - "logdir = os.path.abspath(\"\") + f\"/tflogs/dp-sgd{fast_str}-{stamp}\"\n", - "\n", - "tboard_callback = tf.keras.callbacks.TensorBoard(log_dir 
= logdir,\n", - " histogram_freq = 1,\n", - " profile_batch = '500,520')\n", - "\n", - "history = m.fit(train_dataset, epochs=1, validation_data=val_dataset, callbacks = [tboard_callback])" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: \"tf_shell_sequential\"\n", - "_________________________________________________________________\n", - " Layer (type) Output Shape Param # \n", - "=================================================================\n", - " shell_dense (ShellDense) multiple 50176 \n", - " \n", - " shell_dense_1 (ShellDense) multiple 640 \n", - " \n", - "=================================================================\n", - "Total params: 50816 (198.50 KB)\n", - "Trainable params: 50816 (198.50 KB)\n", - "Non-trainable params: 0 (0.00 Byte)\n", - "_________________________________________________________________\n" - ] - } - ], - "source": [ - "m.summary()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/label_dp_sgd_post_scale.ipynb b/examples/label_dp_sgd_post_scale.ipynb deleted file mode 100644 index 886bcf2..0000000 --- a/examples/label_dp_sgd_post_scale.ipynb +++ /dev/null @@ -1,503 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Label DP SGD (Post Scale)\n", - "\n", - "This notebook walks through how to train a model to recognize hand written\n", - "digits using label differentially private gradient decent and the MNIST dataset.\n", - "In this setting, one party has the images and the other party has the labels.\n", - "They would like to collaborate to train a model without revealing their data.\n", - "\n", - "This colab uses the post-scale approach to training.\n", - "\n", - "Before starting, install the tf-shell package.\n", - "\n", - "```bash\n", - "pip install tf-shell\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, import some modules and set up tf-shell. The parameters are for the SHELL\n", - "encryption library, which tf-shell uses, and mostly depend on the multiplicative\n", - "depth of the computation to be performed. This example performs back\n", - "propagation, thus the multiplicative depth is determined by the number of\n", - "layers. For more information, see [SHELL](https://github.com/google/shell)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-09-13 14:10:33.037606: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. 
To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-09-13 14:10:33.172913: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" - ] - } - ], - "source": [ - "import time\n", - "from datetime import datetime\n", - "import tensorflow as tf\n", - "import keras\n", - "import numpy as np\n", - "import tf_shell\n", - "import tf_shell_ml\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "use_fast_rotation_protocol = True\n", - "\n", - "if use_fast_rotation_protocol:\n", - " # Num plaintext bits: 19, noise bits: 39\n", - " # Max representable value: 61895\n", - " context = tf_shell.create_context64(\n", - " log_n=11,\n", - " main_moduli=[288230376151748609],\n", - " plaintext_modulus=557057,\n", - " scaling_factor=3,\n", - " )\n", - " # accuracy: 0.83642578125\n", - " # Total training time: 572.2677536010742 seconds\n", - "else:\n", - " # Num plaintext bits: 19, noise bits: 39\n", - " # Max representable value: 61895\n", - " context = tf_shell.create_context64(\n", - " log_n=11,\n", - " main_moduli=[288230376151748609],\n", - " plaintext_modulus=557057,\n", - " scaling_factor=3,\n", - " )\n", - " # accuracy: 0.82861328125\n", - " # Total training time: 2218.1095881462097 seconds\n", - " \n", - "\n", - "# Create the secret key for encryption and a rotation key (rotation key is\n", - "# an auxilary key required for operations like roll or matmul).\n", - "secret_key = tf_shell.create_key64(context)\n", - "public_rotation_key = tf_shell.create_rotation_key64(context, secret_key)\n", - "secret_fast_rotation_key = tf_shell.create_fast_rotation_key64(context, secret_key)\n", - "\n", - "# The batch size is determined by the ciphertext parameters, specifically the\n", - "# schemes polynomial's ring degree because tf-shell uses batch axis packing.\n", - "# Furthermore, two micro-batches to run in parallel.\n", - "batch_size = context.num_slots" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Setup MNIST dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()\n", - "x_train, x_test = np.reshape(x_train, (-1, 784)), np.reshape(x_test, (-1, 784))\n", - "x_train, x_test = x_train / np.float32(255.0), x_test / np.float32(255.0)\n", - "y_train, y_test = tf.one_hot(y_train, 10), tf.one_hot(y_test, 10)\n", - "\n", - "epochs = 1\n", - "train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))\n", - "train_dataset = (\n", - " train_dataset.shuffle(buffer_size=2048)\n", - " .batch(batch_size, drop_remainder=True)\n", - " .repeat(count=epochs)\n", - ")\n", - "\n", - "val_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))\n", - "val_dataset = val_dataset.batch(batch_size, drop_remainder=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a simple model with a hidden layer of size 64 and an output layer\n", - "of size 10 (for each of the 10 digits)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "mnist_layers = [\n", - " tf.keras.layers.Dense(64, activation=\"relu\"),\n", - " tf.keras.layers.Dense(10, activation=\"sigmoid\"),\n", - "]\n", - "\n", - "model = keras.Sequential(mnist_layers)\n", - "model.compile(\n", - " optimizer=\"adam\",\n", - " metrics=[\"accuracy\"],\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, define the `train_step` function which will be called for each batch on an\n", - "encrypted batch of labels, y. The function first does a forward on the plaintext\n", - "image x to compute a predicted label, then does backpropagation using the\n", - "encrypted label y." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "@tf.function\n", - "def train_step(x, y):\n", - " \"\"\"One step of training with using the \"post scale\" approach.\n", - "\n", - " High level idea:\n", - " For each output class, backprop to compute the gradient but exclude the loss\n", - " function. Now we have a _vector_ of model updates for one sample. The real\n", - " gradient update for the sample is a linear combination of the vector of\n", - " weight updates whose scale is determined by dJ_dyhat (the derivative of the\n", - " loss with respect to the predicted output yhat). Effectively, we have\n", - " factored out dJ_dyhat from the gradient. Separating out dJ_dyhat allows us\n", - " to scale the weight updates easily when the label is secret and the gradient\n", - " must be computed under encryption / multiparty computation because the\n", - " multiplicative depth of the computation is 1, however the number of\n", - " multiplications required now depends on the model size AND the number of\n", - " output classes. In contrast, standard backpropagation only requires\n", - " multiplications proportional to the model size, howver the multiplicative\n", - " depth is proportional to the model depth.\n", - " \"\"\"\n", - "\n", - " # Unset the activation function for the last layer so it is not used in\n", - " # computing the gradient. The effect of the last layer activation function\n", - " # is factored out of the gradient computation and accounted for below.\n", - " model.layers[-1].activation = tf.keras.activations.linear\n", - "\n", - " with tf.GradientTape() as tape:\n", - " y_pred = model(x, training=True) # forward pass\n", - " grads = tape.jacobian(y_pred, model.trainable_weights)\n", - " # ^ layers list x (batch size x num output classes x weights) matrix\n", - " # dy_pred_j/dW_sample_class\n", - "\n", - "\n", - " # Reset the activation function for the last layer and compute the real\n", - " # prediction.\n", - " model.layers[-1].activation = tf.keras.activations.sigmoid\n", - " y_pred = model(x, training=False)\n", - "\n", - " # Compute y_pred - y (where y is encrypted).\n", - " scalars = y.__rsub__(y_pred) # dJ/dy_pred\n", - " # ^ batch_size x num output classes.\n", - "\n", - " # Expand the last dim so that the subsequent multiplication is\n", - " # broadcasted.\n", - " scalars = tf_shell.expand_dims(scalars, axis=-1)\n", - " # ^ batch_size x num output classes x 1\n", - "\n", - " # Scale each gradient. Since 'scalars' may be a vector of ciphertexts, this\n", - " # requires multiplying plaintext gradient for the specific layer (2d) by the\n", - " # ciphertext (scalar). 
To do so efficiently under encryption requires\n", - " # flattening and packing the weights, as shown below.\n", - " ps_grads = []\n", - " for layer_grad_full in grads:\n", - " # Remember the original shape of the gradient in order to unpack them\n", - " # after the multiplication so they can be applied to the model.\n", - " batch_sz = layer_grad_full.shape[0]\n", - " num_output_classes = layer_grad_full.shape[1]\n", - " grad_shape = layer_grad_full.shape[2:]\n", - "\n", - " packable_grad = tf.reshape(layer_grad_full, [batch_sz, num_output_classes, -1])\n", - " # ^ batch_size x num output classes x flattened weights\n", - "\n", - " # Scale the gradient precursors.\n", - " scaled_grad = scalars * packable_grad\n", - " # ^ dJ/dW = dJ/dy_pred * dy_pred/dW \n", - "\n", - " # Sum over the output classes.\n", - " scaled_grad = tf_shell.reduce_sum(scaled_grad, axis=1)\n", - " # ^ batch_size x 1 x flattened weights\n", - "\n", - " # In the real world, this approach would also likely require clipping\n", - " # the gradient, aggregation, and adding DP noise.\n", - "\n", - " # Reshape to remove the '1' dimension in the middle.\n", - " scaled_grad = tf_shell.reshape(scaled_grad, [batch_sz] + grad_shape)\n", - " # ^ batch_size x weights\n", - "\n", - " # Sum over the batch.\n", - " if use_fast_rotation_protocol:\n", - " scaled_grad = tf_shell.fast_reduce_sum(scaled_grad)\n", - " else:\n", - " scaled_grad = tf_shell.reduce_sum(scaled_grad, axis=0, rotation_key=public_rotation_key)\n", - " # ^ batch_size x flattened weights\n", - " # Every [i, ...] is the same, the sum over the batching dim axis=0.\n", - "\n", - " ps_grads.append(scaled_grad)\n", - "\n", - " return ps_grads\n", - "\n", - "\n", - "@tf.function\n", - "def train_step_wrapper(x_batch, y_batch):\n", - " # Encrypt\n", - " enc_y_batch = tf_shell.to_encrypted(y_batch, secret_key, context)\n", - "\n", - " # Train\n", - " ps_grads = train_step(x_batch, enc_y_batch)\n", - "\n", - " # Decrypt\n", - " if use_fast_rotation_protocol:\n", - " decrypt_key = secret_fast_rotation_key\n", - " else:\n", - " decrypt_key = secret_key\n", - " batch_sz = context.num_slots\n", - " top_grads = [tf_shell.to_tensorflow(enc_g, decrypt_key)[0] for enc_g in ps_grads]\n", - " bottom_grads = [tf_shell.to_tensorflow(enc_g, decrypt_key)[batch_sz // 2] for enc_g in ps_grads]\n", - " # ^ take the first element of each batch because the grad sum is repeated over the batching dim.\n", - "\n", - " model.optimizer.apply_gradients(\n", - " zip(\n", - " top_grads,\n", - " model.trainable_weights\n", - " )\n", - " )\n", - " model.optimizer.apply_gradients(\n", - " zip(\n", - " bottom_grads,\n", - " model.trainable_weights\n", - " )\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is the training loop. Each inner iteration runs a batch of size 2^(11),\n", - "then meaures the model accuracy.\n", - "\n", - "Tensorboard can be used to visualize the training progress. See cell output for\n", - "command to start tensorboard." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "To start tensorboard, run: tensorboard --logdir ./ --host 0.0.0.0\n", - "\ttensorboard profiling requires: pip install tensorboard_plugin_profile\n", - "Batch: 0 / 29, Time Stamp: 0.06960606575012207\n", - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/ops/summary_ops_v2.py:1369: start (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "use `tf.profiler.experimental.start` instead.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-09-13 14:10:36.177098: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:104] Profiler session initializing.\n", - "2024-09-13 14:10:36.177118: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:119] Profiler session started.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/ops/summary_ops_v2.py:1420: stop (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "use `tf.profiler.experimental.stop` instead.\n", - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/ops/summary_ops_v2.py:1420: save (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "`tf.python.eager.profiler` has deprecated, use `tf.profiler` instead.\n", - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/eager/profiler.py:150: maybe_create_event_file (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "`tf.python.eager.profiler` has deprecated, use `tf.profiler` instead.\n", - "\taccuracy: 0.1240234375\n", - "Batch: 1 / 29, Time Stamp: 35.91403651237488\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-09-13 14:11:11.935723: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:70] Profiler session collecting data.\n", - "2024-09-13 14:11:11.946607: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:131] Profiler session tear down.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\taccuracy: 0.11767578125\n", - "Batch: 2 / 29, Time Stamp: 56.13673758506775\n", - "\taccuracy: 0.111328125\n", - "Batch: 3 / 29, Time Stamp: 75.9100878238678\n", - "\taccuracy: 0.11962890625\n", - "Batch: 4 / 29, Time Stamp: 95.8462266921997\n", - "\taccuracy: 0.13037109375\n", - "Batch: 5 / 29, Time Stamp: 115.84013247489929\n", - "\taccuracy: 0.142578125\n", - "Batch: 6 / 29, Time Stamp: 135.50198316574097\n", - "\taccuracy: 0.1513671875\n", - "Batch: 7 / 29, Time Stamp: 155.1141803264618\n", - "\taccuracy: 0.162109375\n", - "Batch: 8 / 29, Time Stamp: 175.40088200569153\n", - "\taccuracy: 0.18798828125\n", - "Batch: 9 / 29, Time Stamp: 195.27433037757874\n", - "\taccuracy: 0.23828125\n", - "Batch: 10 / 29, Time Stamp: 215.34290480613708\n", - "\taccuracy: 0.29638671875\n", - "Batch: 11 / 29, Time Stamp: 234.9976098537445\n", - "\taccuracy: 0.37548828125\n", - "Batch: 12 / 29, Time Stamp: 254.66605234146118\n", - "\taccuracy: 0.46875\n", - "Batch: 13 / 29, Time 
Stamp: 274.9420063495636\n", - "\taccuracy: 0.52978515625\n", - "Batch: 14 / 29, Time Stamp: 295.4427545070648\n", - "\taccuracy: 0.56884765625\n", - "Batch: 15 / 29, Time Stamp: 315.34135460853577\n", - "\taccuracy: 0.5927734375\n", - "Batch: 16 / 29, Time Stamp: 335.8522572517395\n", - "\taccuracy: 0.61572265625\n", - "Batch: 17 / 29, Time Stamp: 356.5795986652374\n", - "\taccuracy: 0.64111328125\n", - "Batch: 18 / 29, Time Stamp: 377.4952039718628\n", - "\taccuracy: 0.662109375\n", - "Batch: 19 / 29, Time Stamp: 398.5302703380585\n", - "\taccuracy: 0.69287109375\n", - "Batch: 20 / 29, Time Stamp: 419.59759402275085\n", - "\taccuracy: 0.71240234375\n", - "Batch: 21 / 29, Time Stamp: 439.87542271614075\n", - "\taccuracy: 0.7265625\n", - "Batch: 22 / 29, Time Stamp: 460.02534770965576\n", - "\taccuracy: 0.74267578125\n", - "Batch: 23 / 29, Time Stamp: 479.76108503341675\n", - "\taccuracy: 0.76171875\n", - "Batch: 24 / 29, Time Stamp: 499.2436821460724\n", - "\taccuracy: 0.7802734375\n", - "Batch: 25 / 29, Time Stamp: 519.1094493865967\n", - "\taccuracy: 0.80224609375\n", - "Batch: 26 / 29, Time Stamp: 538.8864521980286\n", - "\taccuracy: 0.82421875\n", - "Batch: 27 / 29, Time Stamp: 558.8192665576935\n", - "\taccuracy: 0.83642578125\n", - "Batch: 28 / 29, Time Stamp: 578.5930137634277\n", - "\taccuracy: 0.84423828125\n", - "Total training time: 598.0663740634918 seconds\n" - ] - } - ], - "source": [ - "start_time = time.time()\n", - "\n", - "# Set up tensorboard logging.\n", - "stamp = datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n", - "fast_str = \"-fast\" if use_fast_rotation_protocol else \"\"\n", - "logdir = os.path.abspath(\"\") + f\"/tflogs/post-scale{fast_str}-{stamp}\"\n", - "print(f\"To start tensorboard, run: tensorboard --logdir ./ --host 0.0.0.0\")\n", - "print(f\"\\ttensorboard profiling requires: pip install tensorboard_plugin_profile\")\n", - "writer = tf.summary.create_file_writer(logdir)\n", - "\n", - "# Iterate over the batches of the dataset.\n", - "for step, (x_batch, y_batch) in enumerate(train_dataset.take(batch_size)):\n", - " print(\n", - " f\"Batch: {step} / {len(train_dataset)}, Time Stamp: {time.time() - start_time}\"\n", - " )\n", - "\n", - " if step == 0:\n", - " tf.summary.trace_on(\n", - " graph=True,\n", - " profiler=True,\n", - " # profiler_outdir=logdir, # Only for tf 2.16+\n", - " )\n", - "\n", - " train_step_wrapper(x_batch, y_batch)\n", - "\n", - " if step == 0:\n", - " with writer.as_default():\n", - " tf.summary.trace_export(\n", - " name=\"label_dp_sgd_post_scale\",\n", - " step=step,\n", - " profiler_outdir=logdir,\n", - " )\n", - "\n", - " # Check the accuracy.\n", - " average_loss = 0\n", - " average_accuracy = 0\n", - " for x, y in val_dataset:\n", - " y_pred = model(x, training=False)\n", - " loss = tf.reduce_mean(tf.keras.losses.categorical_crossentropy(y, y_pred))\n", - " accuracy = tf.reduce_mean(\n", - " tf.cast(\n", - " tf.equal(tf.argmax(y, axis=1), tf.argmax(y_pred, axis=1)), tf.float32\n", - " )\n", - " )\n", - " average_accuracy += accuracy\n", - " average_loss += loss\n", - " average_loss /= len(val_dataset)\n", - " average_accuracy /= len(val_dataset)\n", - " tf.print(f\"\\taccuracy: {accuracy}\")\n", - "\n", - " with writer.as_default():\n", - " tf.summary.scalar(\"loss\", average_loss, step=step)\n", - " tf.summary.scalar(\"accuracy\", average_accuracy, step=step)\n", - "\n", - "\n", - "print(f\"Total training time: {time.time() - start_time} seconds\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - 
"language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/label_dp_sgd_sentiment.ipynb b/examples/label_dp_sgd_sentiment.ipynb deleted file mode 100644 index 365cd1f..0000000 --- a/examples/label_dp_sgd_sentiment.ipynb +++ /dev/null @@ -1,716 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Sentiment Analysis on IMDB dataset\n", - "\n", - "This notebook walks through how perform sentament analysis on the IMDB dataset.\n", - "In this setting, one party has the reviews and the other party has the labels.\n", - "The party with the labels is helping the party with the reviews train a model\n", - "without sharing the labels themselves.\n", - "\n", - "Before starting, install tf-shell and the dataset.\n", - "\n", - "```bash\n", - "pip install tf-shell\n", - "pip install tensorflow_hub tensorflow_datasets\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-09-13 05:53:52.002235: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", - "2024-09-13 05:53:52.025598: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", - "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" - ] - } - ], - "source": [ - "import time\n", - "from datetime import datetime\n", - "import tensorflow as tf\n", - "import tensorflow_hub as hub\n", - "import tensorflow_datasets as tfds\n", - "\n", - "import keras\n", - "import numpy as np\n", - "import tf_shell\n", - "import tf_shell_ml\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Set up parameters for the SHELL encryption library.\n", - "context = tf_shell.create_context64(\n", - " log_n=12,\n", - " main_moduli=[288230376151760897, 288230376152137729],\n", - " plaintext_modulus=4294991873,\n", - " scaling_factor=3,\n", - " seed=\"test_seed\",\n", - ")\n", - "\n", - "# Create the secret key for encryption and a rotation key (rotation key is\n", - "# an auxilary key required for operations like roll or matmul).\n", - "secret_key = tf_shell.create_key64(context)\n", - "public_rotation_key = tf_shell.create_rotation_key64(context, secret_key)\n", - "\n", - "# The batch size is determined by the ciphertext parameters, specifically the\n", - "# schemes polynomial's ring degree because tf-shell uses batch axis packing.\n", - "# Furthermore, two micro-batches to run in parallel.\n", - "batch_size = context.num_slots\n", - "\n", - "use_encryption = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Setup IMDB dataset." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-09-13 05:54:11.598078: W external/local_tsl/tsl/platform/cloud/google_auth_provider.cc:184] All attempts to get a Google authentication bearer token failed, returning an empty token. Retrieving token from files failed with \"NOT_FOUND: Could not locate the credentials file.\". Retrieving token from GCE failed with \"FAILED_PRECONDITION: Error executing an HTTP request: libcurl code 6 meaning 'Couldn't resolve host name', error details: Could not resolve host: metadata.google.internal\".\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /home/vscode/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/workspaces/tf-shell/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "Dl Size...: 100%|██████████| 80/80 [00:04<00:00, 18.44 MiB/s]rl]\n", - "Dl Completed...: 100%|██████████| 1/1 [00:04<00:00, 4.34s/ url]\n", - "2024-09-13 05:54:37.119837: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[1mDataset imdb_reviews downloaded and prepared to /home/vscode/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.\u001b[0m\n", - "Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. 
I could barely sit through it.\n", - "Label: 0\n", - "Most used words: ['', '[UNK]', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'it']\n", - "Dictionary size: 10000\n", - "Word 0 () count: 190260176\n", - "Word 1 ([UNK]) count: 1908048\n", - "Word 2 (the) count: 1640589\n", - "Word 3 (a) count: 793367\n", - "Word 4 (and) count: 792937\n", - "Word 5 (of) count: 713899\n", - "Word 6 (to) count: 663159\n", - "Word 7 (is) count: 522180\n", - "Word 8 (in) count: 454454\n", - "Word 9 (it) count: 379058\n", - "Word 10 (i) count: 376094\n", - "Word 11 (this) count: 372402\n", - "Word 12 (that) count: 342344\n", - "Word 13 (br) count: 280608\n", - "Word 14 (was) count: 237739\n", - "Word 15 (as) count: 227240\n", - "Word 16 (with) count: 215115\n", - "Word 17 (for) count: 215019\n", - "Word 18 (movie) count: 205242\n", - "Word 19 (but) count: 204007\n", - "Word 20 (film) count: 184563\n", - "Word 21 (on) count: 165692\n", - "Word 22 (not) count: 147470\n", - "Word 23 (you) count: 146530\n", - "Word 24 (are) count: 144606\n", - "Word 25 (his) count: 144318\n", - "Word 26 (have) count: 137392\n", - "Word 27 (be) count: 130623\n", - "Word 28 (he) count: 129926\n", - "Word 29 (one) count: 125435\n", - "Word 30 (its) count: 122657\n", - "Word 31 (all) count: 115104\n", - "Word 32 (at) count: 114646\n", - "Word 33 (by) count: 109158\n", - "Word 34 (they) count: 104446\n", - "Word 35 (an) count: 104373\n", - "Word 36 (who) count: 99613\n", - "Word 37 (from) count: 98877\n", - "Word 38 (so) count: 97604\n", - "Word 39 (like) count: 97158\n", - "Word 40 (or) count: 87118\n", - "Word 41 (her) count: 86980\n", - "Word 42 (just) count: 86533\n", - "Word 43 (about) count: 84370\n", - "Word 44 (if) count: 83054\n", - "Word 45 (out) count: 80859\n", - "Word 46 (has) count: 80649\n", - "Word 47 (some) count: 78007\n", - "Word 48 (there) count: 75396\n", - "Word 49 (what) count: 75491\n", - "Word 50 (good) count: 71475\n", - "Word 51 (very) count: 68814\n", - "Word 52 (when) count: 68908\n", - "Word 53 (more) count: 68895\n", - "Word 54 (my) count: 61713\n", - "Word 55 (even) count: 61146\n", - "Word 56 (she) count: 60811\n", - "Word 57 (would) count: 60268\n", - "Word 58 (no) count: 59316\n", - "Word 59 (up) count: 58955\n", - "Word 60 (time) count: 58343\n", - "Word 61 (really) count: 57514\n", - "Word 62 (only) count: 57442\n", - "Word 63 (which) count: 57578\n", - "Word 64 (had) count: 55691\n", - "Word 65 (see) count: 55699\n", - "Word 66 (were) count: 55339\n", - "Word 67 (their) count: 55382\n", - "Word 68 (story) count: 54809\n", - "Word 69 (can) count: 54071\n", - "Word 70 (me) count: 51803\n", - "Word 71 (we) count: 48530\n", - "Word 72 (than) count: 48370\n", - "Word 73 (much) count: 46081\n", - "Word 74 (well) count: 45461\n", - "Word 75 (been) count: 45473\n", - "Word 76 (get) count: 45216\n", - "Word 77 (do) count: 45029\n", - "Word 78 (will) count: 44979\n", - "Word 79 (also) count: 44631\n", - "Word 80 (bad) count: 44767\n", - "Word 81 (because) count: 44384\n", - "Word 82 (people) count: 44541\n", - "Word 83 (into) count: 44172\n", - "Word 84 (other) count: 44230\n", - "Word 85 (great) count: 43514\n", - "Word 86 (first) count: 43337\n", - "Word 87 (how) count: 43293\n", - "Word 88 (dont) count: 41866\n", - "Word 89 (most) count: 41685\n", - "Word 90 (him) count: 41019\n", - "Word 91 (then) count: 39080\n", - "Word 92 (movies) count: 38802\n", - "Word 93 (make) count: 38682\n", - "Word 94 (made) count: 38338\n", - "Word 95 (them) count: 38339\n", - "Word 96 (films) count: 38100\n", - "Word 97 (any) 
count: 37665\n", - "Word 98 (way) count: 37769\n", - "Word 99 (could) count: 37505\n", - "Word 100 (too) count: 37306\n", - "Word 101 (after) count: 36375\n", - "Word 102 (characters) count: 35590\n", - "Word 103 (think) count: 35321\n", - "Word 104 (watch) count: 34037\n", - "Word 105 (two) count: 32776\n", - "Word 106 (many) count: 32476\n", - "Word 107 (being) count: 32295\n", - "Word 108 (seen) count: 31988\n", - "Word 109 (character) count: 31853\n", - "Word 110 (never) count: 31977\n", - "Word 111 (plot) count: 31203\n", - "Word 112 (acting) count: 30877\n", - "Word 113 (best) count: 30728\n", - "Word 114 (did) count: 30577\n", - "Word 115 (love) count: 30399\n", - "Word 116 (little) count: 30455\n", - "Word 117 (where) count: 30332\n", - "Word 118 (life) count: 29390\n", - "Word 119 (show) count: 29199\n", - "Word 120 (know) count: 28775\n", - "Word 121 (ever) count: 28504\n", - "Word 122 (does) count: 28508\n", - "Word 123 (your) count: 28391\n", - "Word 124 (still) count: 27215\n", - "Word 125 (over) count: 27261\n", - "Word 126 (better) count: 27114\n", - "Word 127 (these) count: 26639\n", - "Word 128 (while) count: 26374\n", - "Word 129 (say) count: 26345\n", - "Word 130 (off) count: 25916\n", - "Word 131 (end) count: 25828\n", - "Word 132 (man) count: 25823\n", - "Word 133 (scene) count: 25368\n", - "Word 134 (here) count: 24971\n", - "Word 135 (such) count: 24965\n", - "Word 136 (go) count: 24883\n", - "Word 137 (scenes) count: 24693\n", - "Word 138 (why) count: 24798\n", - "Word 139 (through) count: 24340\n", - "Word 140 (should) count: 24113\n", - "Word 141 (something) count: 24027\n", - "Word 142 (im) count: 23657\n", - "Word 143 (back) count: 23500\n", - "Word 144 (doesnt) count: 22745\n", - "Word 145 (those) count: 22692\n", - "Word 146 (real) count: 22546\n", - "Word 147 (watching) count: 22416\n", - "Word 148 (thing) count: 22294\n", - "Word 149 (years) count: 21953\n", - "Word 150 (now) count: 21693\n", - "Word 151 (didnt) count: 21447\n", - "Word 152 (though) count: 21377\n", - "Word 153 (actors) count: 20909\n", - "Word 154 (find) count: 20483\n", - "Word 155 (nothing) count: 20554\n", - "Word 156 (actually) count: 20540\n", - "Word 157 (makes) count: 20544\n", - "Word 158 (new) count: 20233\n", - "Word 159 (work) count: 20396\n", - "Word 160 (before) count: 20393\n", - "Word 161 (old) count: 20295\n", - "Word 162 (another) count: 20287\n", - "Word 163 (going) count: 20187\n", - "Word 164 (funny) count: 19953\n", - "Word 165 (every) count: 20038\n", - "Word 166 (same) count: 20035\n", - "Word 167 (look) count: 19611\n", - "Word 168 (few) count: 19627\n", - "Word 169 (us) count: 19516\n", - "Word 170 (lot) count: 19084\n", - "Word 171 (part) count: 18994\n", - "Word 172 (director) count: 18975\n", - "Word 173 (again) count: 18872\n", - "Word 174 (cant) count: 18748\n", - "Word 175 (quite) count: 18581\n", - "Word 176 (cast) count: 18326\n", - "Word 177 (thats) count: 18250\n", - "Word 178 (want) count: 17881\n", - "Word 179 (pretty) count: 17967\n", - "Word 180 (seems) count: 17569\n", - "Word 181 (things) count: 17432\n", - "Word 182 (got) count: 17428\n", - "Word 183 (young) count: 17347\n", - "Word 184 (around) count: 17212\n", - "Word 185 (fact) count: 17087\n", - "Word 186 (enough) count: 16979\n", - "Word 187 (down) count: 16947\n", - "Word 188 (however) count: 16845\n", - "Word 189 (take) count: 16766\n", - "Word 190 (thought) count: 16623\n", - "Word 191 (may) count: 16721\n", - "Word 192 (world) count: 16394\n", - "Word 193 (both) count: 16363\n", - "Word 194 
(between) count: 16361\n", - "Word 195 (own) count: 16273\n", - "Word 196 (give) count: 16183\n", - "Word 197 (series) count: 16077\n", - "Word 198 (original) count: 16276\n", - "Word 199 (ive) count: 15961\n" - ] - } - ], - "source": [ - "# Split the training set into 60% and 40% to end up with 15,000 examples\n", - "# for training, 10,000 examples for validation and 25,000 examples for testing.\n", - "train_data, val_data, test_data = tfds.load(\n", - " name=\"imdb_reviews\", \n", - " split=('train[:60%]', 'train[60%:]', 'test'),\n", - " as_supervised=True)\n", - "\n", - "# Print the first example.\n", - "for review, label in train_data.take(1):\n", - " print(\"Review:\", review.numpy().decode('utf-8'))\n", - " print(\"Label:\", label.numpy())\n", - "\n", - "epochs = 10\n", - "train_data = train_data.shuffle(buffer_size=2048).batch(batch_size, drop_remainder=True).repeat(count=epochs)\n", - "val_data = val_data.shuffle(buffer_size=2048).batch(batch_size, drop_remainder=True)\n", - "test_data = test_data.shuffle(buffer_size=2048).batch(batch_size, drop_remainder=True)\n", - "\n", - "vocab_size = 10000 # This dataset has 92061 unique words.\n", - "max_length = 250\n", - "embedding_dim = 16\n", - "\n", - "vectorize_layer = tf.keras.layers.TextVectorization(\n", - " max_tokens=vocab_size,\n", - " output_mode='int',\n", - ")\n", - "\n", - "vectorize_layer.adapt(train_data.map(lambda text, label: text))\n", - "\n", - "print(\"Most used words:\", vectorize_layer.get_vocabulary()[:10])\n", - "print(\"Dictionary size:\", len(vectorize_layer.get_vocabulary()))\n", - "\n", - "# Count the top n words in the training set.\n", - "top_n = 200\n", - "word_counts = np.zeros(top_n, dtype=np.int64)\n", - "for review, label in train_data:\n", - " vectorized_reviews = vectorize_layer(review)\n", - " for i in range(len(word_counts)):\n", - " counts = tf.where(vectorized_reviews == i, 1, 0)\n", - " word_counts[i] += tf.reduce_sum(tf.cast(counts, dtype=tf.int64))\n", - "\n", - "for i in range(len(word_counts)):\n", - " print(f\"Word {i} ({vectorize_layer.get_vocabulary()[i]}) count: {word_counts[i]}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Create the trainable layers.\n", - "embedding_layer = tf_shell_ml.ShellEmbedding(\n", - " vocab_size + 1, # +1 for OOV token.\n", - " embedding_dim,\n", - " skip_embeddings_below_index=top_n,\n", - ")\n", - "# TODO dropout layer?\n", - "hidden_layer = tf_shell_ml.GlobalAveragePooling1D()\n", - "# TODO dropout layer?\n", - "output_layer = tf_shell_ml.ShellDense(\n", - " 2,\n", - " activation=tf.nn.softmax,\n", - ")\n", - "\n", - "layers = [\n", - " embedding_layer,\n", - " hidden_layer,\n", - " output_layer,\n", - "]\n", - "\n", - "loss_fn = tf_shell_ml.CategoricalCrossentropy()\n", - "optimizer = tf.keras.optimizers.Adam(0.1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, define the `train_step` function, which is called for each batch with an\n", - "encrypted batch of labels y. The function first runs a forward pass on the\n", - "plaintext input x to compute a predicted label, then backpropagates using the\n", - "encrypted label y."
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def train_step(x, enc_y):\n", - " # Forward pass always in plaintext\n", - " y_pred = x\n", - " for i, l in enumerate(layers):\n", - " y_pred = l(y_pred, training=True)\n", - "\n", - " # Backward pass.\n", - " dx = loss_fn.grad(enc_y, y_pred)\n", - " dJ_dw = []\n", - " dJ_dx = [dx,]\n", - " for l in reversed(layers):\n", - " if isinstance(l, tf_shell_ml.GlobalAveragePooling1D):\n", - " dw, dx = l.backward(dJ_dx[-1])\n", - " else:\n", - " dw, dx = l.backward(dJ_dx[-1], public_rotation_key)\n", - " dJ_dw.extend(dw)\n", - " dJ_dx.append(dx)\n", - "\n", - " return reversed(dJ_dw)\n", - "\n", - "\n", - "@tf.function\n", - "def train_step_wrapper(x_batch, y_batch):\n", - " if use_encryption:\n", - " # Encrypt the batch of secret labels y.\n", - " enc_y_batch = tf_shell.to_encrypted(y_batch, secret_key, context)\n", - " else:\n", - " enc_y_batch = y_batch\n", - "\n", - " # Run the training step. The top and bottom halves of the batch are\n", - " # treated as two separate mini-batches run in parallel.\n", - " enc_grads = train_step(x_batch, enc_y_batch)\n", - "\n", - " filtered_layers = [l for l in layers if len(l.weights) > 0]\n", - "\n", - " if use_encryption:\n", - " # Decrypt the weight gradients. In practice, the gradients should be\n", - " # noised before decrypting.\n", - " packed_grads = [tf_shell.to_tensorflow(g, secret_key) for g in enc_grads]\n", - " # Unpack the plaintext gradients using the corresponding layer.\n", - " grads = [l.unpack(g) for l, g in zip(filtered_layers, packed_grads)]\n", - " else:\n", - " grads = enc_grads\n", - "\n", - " weights = []\n", - " for l in filtered_layers:\n", - " weights += l.weights\n", - "\n", - " # Apply the gradients to the model.\n", - " optimizer.apply_gradients(zip(grads, weights))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is the training loop. Each inner iteration runs two micro-batches of size\n", - "$2^{11}$ (half of the $2^{12}$ ciphertext slots) simultaneously.\n", - "\n", - "TensorBoard can be used to visualize the training progress. See the cell output\n", - "for the command to start TensorBoard."
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "To start tensorboard, run: tensorboard --logdir ./ --host 0.0.0.0\n", - "\ttensorboard profiling requires: pip install tensorboard_plugin_profile\n", - "\tvalidation loss: 0.34678205847740173\taccuracy: 0.50244140625\n", - "Step: 0 / 30, Time Stamp: 0.4297327995300293\n", - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/ops/summary_ops_v2.py:1369: start (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "use `tf.profiler.experimental.start` instead.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/ops/summary_ops_v2.py:1369: start (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "use `tf.profiler.experimental.start` instead.\n", - "2024-09-13 05:55:43.043938: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:104] Profiler session initializing.\n", - "2024-09-13 05:55:43.043966: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:119] Profiler session started.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/ops/summary_ops_v2.py:1420: stop (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "use `tf.profiler.experimental.stop` instead.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/ops/summary_ops_v2.py:1420: stop (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "use `tf.profiler.experimental.stop` instead.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/ops/summary_ops_v2.py:1420: save (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "`tf.python.eager.profiler` has deprecated, use `tf.profiler` instead.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024-09-13 07:05:45.882663: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:70] Profiler session collecting data.\n", - "2024-09-13 07:05:45.932257: I external/local_tsl/tsl/profiler/lib/profiler_session.cc:131] Profiler session tear down.\n", - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/ops/summary_ops_v2.py:1420: save (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "`tf.python.eager.profiler` has deprecated, use `tf.profiler` instead.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/eager/profiler.py:150: maybe_create_event_file (from tensorflow.python.eager.profiler) is deprecated and will be removed after 
2020-07-01.\n", - "Instructions for updating:\n", - "`tf.python.eager.profiler` has deprecated, use `tf.profiler` instead.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /workspaces/tf-shell/.venv/lib/python3.10/site-packages/tensorflow/python/eager/profiler.py:150: maybe_create_event_file (from tensorflow.python.eager.profiler) is deprecated and will be removed after 2020-07-01.\n", - "Instructions for updating:\n", - "`tf.python.eager.profiler` has deprecated, use `tf.profiler` instead.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\ttrain loss: 0.3463047742843628\taccuracy: 0.508227527141571\n", - "\tvalidation loss: 0.3464067280292511\taccuracy: 0.5057373046875\n", - "Step: 1 / 30, Time Stamp: 4207.950785398483\n", - "\ttrain loss: 0.3458217680454254\taccuracy: 0.519848644733429\n", - "\tvalidation loss: 0.34607988595962524\taccuracy: 0.5130615234375\n", - "Step: 2 / 30, Time Stamp: 8758.485680818558\n", - "\ttrain loss: 0.34537193179130554\taccuracy: 0.5295491814613342\n", - "\tvalidation loss: 0.3455347418785095\taccuracy: 0.5269775390625\n", - "Step: 3 / 30, Time Stamp: 12703.130770683289\n", - "\ttrain loss: 0.3447962999343872\taccuracy: 0.5438232421875\n", - "\tvalidation loss: 0.34524139761924744\taccuracy: 0.5352783203125\n", - "Step: 4 / 30, Time Stamp: 17270.923393011093\n", - "\ttrain loss: 0.3442564010620117\taccuracy: 0.5548909306526184\n", - "\tvalidation loss: 0.3448839783668518\taccuracy: 0.5419921875\n", - "Step: 5 / 30, Time Stamp: 21087.202694416046\n", - "WARNING:tensorflow:5 out of the last 6 calls to triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:5 out of the last 6 calls to triggered tf.function retracing. Tracing is expensive and the excessive number of tracings could be due to (1) creating @tf.function repeatedly in a loop, (2) passing tensors with different shapes, (3) passing Python objects instead of tensors. For (1), please define your @tf.function outside of the loop. For (2), @tf.function has reduce_retracing=True option that can avoid unnecessary retracing. 
For (3), please refer to https://www.tensorflow.org/guide/function#controlling_retracing and https://www.tensorflow.org/api_docs/python/tf/function for more details.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\ttrain loss: 0.34387531876564026\taccuracy: 0.5640299320220947\n", - "\tvalidation loss: 0.34451237320899963\taccuracy: 0.5521240234375\n", - "Step: 6 / 30, Time Stamp: 24922.99744272232\n", - "\ttrain loss: 0.34310781955718994\taccuracy: 0.5768473148345947\n", - "\tvalidation loss: 0.3439071774482727\taccuracy: 0.561767578125\n", - "Step: 7 / 30, Time Stamp: 29491.1250500679\n" - ] - } - ], - "source": [ - "start_time = time.time()\n", - "tf.config.run_functions_eagerly(False)\n", - "\n", - "\n", - "def check_accuracy(dataset):\n", - " average_loss = 0\n", - " average_accuracy = 0\n", - " for x, y in dataset:\n", - " y = tf.one_hot(tf.cast(y, tf.int32), 2)\n", - "\n", - " y_pred = vectorize_layer(x)\n", - " # Do not filter when testing.\n", - " for i, l in enumerate(layers):\n", - " y_pred = l(y_pred)\n", - "\n", - " loss = tf.reduce_mean(loss_fn(y, y_pred))\n", - "\n", - " accuracy = tf.reduce_mean(\n", - " tf.cast(\n", - " tf.equal(tf.argmax(y, axis=1), tf.argmax(y_pred, axis=1)), tf.float32\n", - " )\n", - " )\n", - " average_loss += loss\n", - " average_accuracy += accuracy\n", - " average_loss /= len(dataset)\n", - " average_accuracy /= len(dataset)\n", - "\n", - " return average_loss, average_accuracy\n", - "\n", - "\n", - "# Set up tensorboard logging.\n", - "stamp = datetime.now().strftime(\"%Y%m%d-%H%M%S\")\n", - "logdir = os.path.abspath(\"\") + \"/tflogs/sentiment-%s\" % stamp\n", - "print(f\"To start tensorboard, run: tensorboard --logdir ./ --host 0.0.0.0\")\n", - "print(f\"\\ttensorboard profiling requires: pip install tensorboard_plugin_profile\")\n", - "writer = tf.summary.create_file_writer(logdir)\n", - "\n", - "# Initial accuracy\n", - "loss, accuracy = check_accuracy(val_data)\n", - "tf.print(f\"\\tvalidation loss: {loss}\\taccuracy: {accuracy}\")\n", - "\n", - "# Iterate over the batches of the dataset.\n", - "for step, (x_batch, y_batch) in enumerate(train_data.take(batch_size)):\n", - " print(f\"Step: {step} / {len(train_data)}, Time Stamp: {time.time() - start_time}\")\n", - "\n", - " y_batch = tf.one_hot(tf.cast(y_batch, tf.int32), 2)\n", - "\n", - " if step == 0:\n", - " tf.summary.trace_on(\n", - " graph=True,\n", - " profiler=True,\n", - " # profiler_outdir=logdir, # Only for tf 2.16+\n", - " )\n", - "\n", - " x_batch = vectorize_layer(x_batch) # No shape inference, do outside tf.function\n", - " train_step_wrapper(x_batch, y_batch)\n", - "\n", - " # tf.print(\"embedding layer slot counter:\")\n", - " # tf.print(embedding_layer._last_slot_count, summarize=-1)\n", - " # tf.print(\"embedding layer max slot counter:\")\n", - " # tf.print(tf.reduce_max(embedding_layer._last_slot_count), summarize=-1)\n", - "\n", - " if step == 0:\n", - " with writer.as_default():\n", - " tf.summary.trace_export(\n", - " name=\"sentiment\",\n", - " step=step,\n", - " profiler_outdir=logdir,\n", - " )\n", - "\n", - " loss, accuracy = check_accuracy(train_data)\n", - " tf.print(f\"\\ttrain loss: {loss}\\taccuracy: {accuracy}\")\n", - " loss, accuracy = check_accuracy(val_data)\n", - " tf.print(f\"\\tvalidation loss: {loss}\\taccuracy: {accuracy}\")\n", - "\n", - " with writer.as_default():\n", - " tf.summary.scalar(\"loss\", loss, step=step)\n", - " tf.summary.scalar(\"accuracy\", accuracy, step=step)\n", - "\n", - "\n", - "print(f\"Total 
training time: {time.time() - start_time} seconds\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/parallelization_demo.ipynb b/examples/parallelization_demo.ipynb new file mode 100644 index 0000000..1fa8eed --- /dev/null +++ b/examples/parallelization_demo.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parallelizing HE with tf-shell\n", + "\n", + "There are two forms of parallelism in `tf-shell`.\n", + "\n", + "First, many of tf-shell's operations internally parallelize over the dimension\n", + "of the input. For example, multiplying two [slots, 3] ciphertexts (i.e. a vector\n", + "of ciphertexts with length 3) may run the three element-wise multiplications in\n", + "parallel. This is performed using TensorFlow's thread pool.\n", + "\n", + "Second is graph-level parallelism, where multiple independent operations are run\n", + "in parallel. As an extension of TensorFlow, `tf-shell` supports this." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-10-29 19:47:41.337185: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-10-29 19:47:41.532076: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO: Generating key\n", + "INFO: Generating rotation key\n", + "INFO: Generating rotation key\n" + ] + } + ], + "source": [ + "import tf_shell\n", + "import tensorflow as tf\n", + "import timeit\n", + "\n", + "context = tf_shell.create_context64(\n", + " log_n=10,\n", + " main_moduli=[8556589057, 8388812801],\n", + " plaintext_modulus=40961,\n", + " scaling_factor=3,\n", + " seed=\"test_seed\",\n", + ")\n", + "\n", + "secret_key = tf_shell.create_key64(context)\n", + "rotation_key = tf_shell.create_rotation_key64(context, secret_key)\n", + "\n", + "single_pt = tf.random.uniform([context.num_slots, 1], dtype=tf.float32, maxval=10)\n", + "single_ct = tf_shell.to_encrypted(single_pt, secret_key, context)\n", + "\n", + "vector_pt = tf.random.uniform([context.num_slots, 8], dtype=tf.float32, maxval=10)\n", + "vector_ct = tf_shell.to_encrypted(vector_pt, secret_key, context)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Op Level Parallelism\n", + "\n", + "Benchmark the time taken to multiply two ciphertexts. The first test performs\n", + "multiplication between two individual ciphertexts (each has shape [slots, 1]).\n", + "The second test measures element-wise multiplication between [slots, 8]\n", + "ciphertexts, i.e.
a vector of ciphertexts of length 8.\n", + "\n", + "Without parallelism, the element-wise multiplication is expected to take 8 times\n", + "longer than the individual multiplication, but we show here this is not the\n", + "case." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Multiply single ct * single ct: 0.028574070000104257\n", + "Multiply vector ct * vector ct: 0.038458762001027935\n" + ] + } + ], + "source": [ + "def mul_single_ct_ct():\n", + " return single_ct * single_ct\n", + "\n", + "def mul_vector_ct_ct():\n", + " return vector_ct * vector_ct\n", + "\n", + "single_ct_ct_time = min(timeit.Timer(mul_single_ct_ct).repeat(repeat=10, number=100))\n", + "print(f\"Multiply single ct * single ct: {single_ct_ct_time}\")\n", + "\n", + "vector_ct_ct_time = min(timeit.Timer(mul_vector_ct_ct).repeat(repeat=10, number=100))\n", + "print(f\"Multiply vector ct * vector ct: {vector_ct_ct_time}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Graph Level Parallelism\n", + "\n", + "Benchmark the time taken to perform two additions on a large ciphertext. The\n", + "first test is run in TensorFlow's eager mode, meaning the two additions run\n", + "sequentially. The second test is run in graph mode, where TensorFlow may run\n", + "the two additions in parallel." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Imperative execution: 21.420444982000845\n", + "Graph-based execution: 9.077498362999904\n" + ] + } + ], + "source": [ + "large_pt = tf.random.uniform([context.num_slots, 10000], dtype=tf.float32, maxval=10)\n", + "large_ct = tf_shell.to_encrypted(large_pt, secret_key, context)\n", + "\n", + "def fn():\n", + " # The two operations may be run in parallel when in graph mode.\n", + " return [large_ct + 1, large_ct + 2]\n", + "\n", + "def eager():\n", + " return fn()\n", + "\n", + "@tf.function\n", + "def deferred():\n", + " return fn()\n", + "\n", + "eager_time = min(timeit.Timer(eager).repeat(repeat=1, number=100))\n", + "print(f\"Imperative execution: {eager_time}\")\n", + "\n", + "deferred_time = min(timeit.Timer(deferred).repeat(repeat=1, number=100))\n", + "print(f\"Graph-based execution: {deferred_time}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}
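One way to check that the graph-mode speedup in the parallelization demo really comes from running independent operations concurrently (rather than from tracing overhead or within-op threading) is to pin TensorFlow's inter-op thread pool to a single thread and re-time the deferred function. The following is a minimal sketch, not part of the patch above: it reuses the demo's parameters, and the thread-pool pinning (which must happen in a fresh process, before TensorFlow executes any op) is an assumption about the test environment. If the gap between eager and deferred execution largely disappears with one inter-op thread, the speedup was graph-level parallelism.

```python
import timeit

import tensorflow as tf
import tf_shell

# Assumption: run in a fresh process. Pinning the inter-op pool to one thread
# prevents independent ops in a graph from overlapping, and must be done
# before any TensorFlow op executes.
tf.config.threading.set_inter_op_parallelism_threads(1)

# Same parameters as the parallelization demo above.
context = tf_shell.create_context64(
    log_n=10,
    main_moduli=[8556589057, 8388812801],
    plaintext_modulus=40961,
    scaling_factor=3,
    seed="test_seed",
)
secret_key = tf_shell.create_key64(context)

large_pt = tf.random.uniform([context.num_slots, 10000], dtype=tf.float32, maxval=10)
large_ct = tf_shell.to_encrypted(large_pt, secret_key, context)


@tf.function
def deferred():
    # Two independent ciphertext additions. With a single inter-op thread they
    # are forced to run one after the other, even in graph mode.
    return [large_ct + 1, large_ct + 2]


deferred_time = min(timeit.Timer(deferred).repeat(repeat=1, number=100))
print(f"Graph-based execution, 1 inter-op thread: {deferred_time}")
```

Comparing this number against the unpinned graph-mode timing isolates the contribution of graph-level parallelism; the demo's op-level benchmark is unaffected by this setting only to the extent that tf-shell's internal parallelism uses the intra-op pool.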