
Commit

reduce sparse problem size for docs
apoorvalal committed Aug 27, 2024
1 parent 6a77952 commit 8282381
Showing 1 changed file with 32 additions and 20 deletions.
52 changes: 32 additions & 20 deletions docs/examples/example_sparse_inputs.ipynb
@@ -110,7 +110,7 @@
"source": [
"def generate_causal_data(\n",
" n_samples=100_000,\n",
" n_categories=1000,\n",
" n_categories=500,\n",
" n_features=100,\n",
" tau_magnitude=1.0,\n",
"):\n",
@@ -168,8 +168,9 @@
"\n",
"\n",
"X, W, Y, tau, propensity_score = generate_causal_data(\n",
" n_samples=10000, tau_magnitude=1.0\n",
")\n"
" n_samples=1000, tau_magnitude=1.0\n",
")\n",
"Xdf = pd.DataFrame(X)"
]
},
{
@@ -179,10 +180,14 @@
"outputs": [],
"source": [
"# sparse and dense X matrices\n",
"e1 = OneHotEncoder(sparse_output=True) # onehot encoder generates sparse output automatically\n",
"Xdf = pd.DataFrame(X)\n",
"e1 = OneHotEncoder(\n",
" sparse_output=True\n",
") # onehot encoder generates sparse output automatically\n",
"\n",
"X_csr = e1.fit_transform(X)\n",
"X_np = pd.get_dummies(Xdf, columns=Xdf.columns).values # dense onehot encoding with pandas"
"X_np = pd.get_dummies(\n",
" Xdf, columns=Xdf.columns\n",
").values # dense onehot encoding with pandas"
]
},
{
@@ -195,8 +200,8 @@
"output_type": "stream",
"text": [
"\n",
"Sparse data memory: 7.63MB\n",
"Dense data memory: 953.66MB\n"
"Sparse data memory: 0.76MB\n",
"Dense data memory: 41.28MB\n"
]
}
],
@@ -266,8 +271,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Sparse data - Runtime: 13.27s, Memory used: 345.56MB\n",
"(array([1.0007226]), array([0.02021719]))\n"
"Sparse data - Runtime: 3.06s, Memory used: 115.93MB\n",
"(array([1.0161235]), array([0.06374022]))\n"
]
}
],
@@ -291,26 +296,33 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Sparse data - Runtime: 149.10s, Memory used: 70.43MB\n",
"(array([1.00067664]), array([0.02021555]))\n"
"Sparse data - Runtime: 6.91s, Memory used: 131.66MB\n",
"(array([1.01609547]), array([0.06384197]))\n"
]
}
],
"source": [
"fit_drlearner_wrapper(X_np)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this (admittedly somewhat contrived) example, we that solving the DRLearner problem with sparse inputs takes around 1/8 of the time compared to dense inputs at the cost of some more memory usage in estimation. "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py311",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
},
"mystnb": {
"execution_timeout": 120
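As an aside, for readers who want to reproduce the sparse-versus-dense storage comparison from the changed cells outside the notebook, here is a minimal standalone sketch. The array sizes and the 50-level categorical design below are illustrative choices made here, not taken from the notebook, and the helper names are hypothetical; it assumes scikit-learn (>= 1.2, for the sparse_output argument the notebook itself uses), pandas, and numpy are installed.

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

rng = np.random.default_rng(0)
# Illustrative categorical design: 1,000 rows, 10 columns, 50 levels each
# (deliberately smaller than the notebook's n_samples=1000, n_categories=500 setup).
X = rng.integers(0, 50, size=(1_000, 10))
Xdf = pd.DataFrame(X)

# Sparse one-hot encoding (CSR) via scikit-learn.
X_csr = OneHotEncoder(sparse_output=True).fit_transform(X)
sparse_mb = (X_csr.data.nbytes + X_csr.indices.nbytes + X_csr.indptr.nbytes) / 1e6

# Dense one-hot encoding via pandas.
X_np = pd.get_dummies(Xdf, columns=Xdf.columns).values
dense_mb = X_np.nbytes / 1e6

print(f"Sparse data memory: {sparse_mb:.2f}MB")
print(f"Dense data memory: {dense_mb:.2f}MB")

Because only one entry per categorical column is nonzero in each row, the CSR representation stores roughly n_rows * n_columns values regardless of how many levels each column has, while the dense one-hot matrix grows with the total number of levels.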
