diff --git a/docs/examples/example_estimating_ates.ipynb b/docs/examples/example_estimating_ates.ipynb
index 0eda7327..4cbd597c 100644
--- a/docs/examples/example_estimating_ates.ipynb
+++ b/docs/examples/example_estimating_ates.ipynb
@@ -20,7 +20,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -40,7 +40,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -99,9 +99,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2.083595103597918, 0.06526671583747883)"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"naive_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}\", df) .fit(cov_type=\"HC1\")\n",
"naive_est = naive_lm.params.iloc[1], naive_lm.bse.iloc[1]\n",
@@ -110,9 +121,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(2.1433722387308025, 0.06345124983351998)"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"covaradjust_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}+{'+'.join(feature_columns)}\",\n",
" df) .fit(cov_type=\"HC1\")\n",
@@ -138,9 +160,42 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"from metalearners import DRLearner\n",
"from lightgbm import LGBMRegressor, LGBMClassifier\n",
@@ -149,9 +204,26 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "execution_count": 6,
+ "metadata": {
+ "editable": true,
+ "slideshow": {
+ "slide_type": ""
+ },
+ "tags": []
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(array([1.02931589]), array([0.06679633]))"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"metalearners_dr = DRLearner(\n",
" nuisance_model_factory=LGBMRegressor,\n",
@@ -558,9 +630,9 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "dev",
"language": "python",
- "name": "python3"
+ "name": "dev"
},
"language_info": {
"codemirror_mode": {
@@ -572,7 +644,10 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.11.9"
+ "version": "3.11.7"
+ },
+ "mystnb": {
+ "execution_timeout": 60
}
},
"nbformat": 4,
diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb
index 04677eca..50c797b3 100644
--- a/docs/examples/example_sparse_inputs.ipynb
+++ b/docs/examples/example_sparse_inputs.ipynb
@@ -20,9 +20,42 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
"source": [
"import time, psutil, os, gc\n",
"import numpy as np\n",
@@ -43,7 +76,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -71,7 +104,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -141,7 +174,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -154,9 +187,19 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Sparse data memory: 7.63MB\n",
+ "Dense data memory: 953.66MB\n"
+ ]
+ }
+ ],
"source": [
"print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n",
"print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")"
@@ -171,7 +214,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -207,15 +250,6 @@
" print(metalearners_est)"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "gc.collect()"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -225,12 +259,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sparse data - Runtime: 13.27s, Memory used: 345.56MB\n",
+ "(array([1.0007226]), array([0.02021719]))\n"
+ ]
+ }
+ ],
"source": [
- "fit_drlearner_wrapper(X_csr)\n",
- "gc.collect()"
+ "fit_drlearner_wrapper(X_csr)"
]
},
{
@@ -242,12 +284,20 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sparse data - Runtime: 149.10s, Memory used: 70.43MB\n",
+ "(array([1.00067664]), array([0.02021555]))\n"
+ ]
+ }
+ ],
"source": [
- "fit_drlearner_wrapper(X_np)\n",
- "gc.collect()"
+ "fit_drlearner_wrapper(X_np)"
]
},
{
@@ -256,147 +306,6 @@
"source": [
"In this (admittedly somewhat contrived) example, we that solving the DRLearner problem with sparse inputs takes around 1/8 of the time compared to dense inputs at the cost of some more memory usage in estimation. "
]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Prediction \n",
- "\n",
- "These benefits aren't limited to causal inference. We can also use sparse matrices for prediction tasks as well."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def generate_dummy_data(n_samples=100000, n_categories=1000, n_features=20):\n",
- " X = np.random.randint(0, n_categories, size=(n_samples, n_features))\n",
- " y = np.zeros(n_samples)\n",
- " # Select a few features for main effects\n",
- " main_effect_features = np.random.choice(n_features, 3, replace=False)\n",
- " # Create main effects\n",
- " for i in main_effect_features:\n",
- " # Create a random effect for each category\n",
- " category_effects = np.random.normal(0, 1, n_categories)\n",
- " y += category_effects[X[:, i]]\n",
- " # Select a couple of feature pairs for interaction effects\n",
- " interaction_pairs = [\n",
- " (i, j) for i in range(n_features) for j in range(i + 1, n_features)\n",
- " ]\n",
- " selected_interactions = np.random.choice(len(interaction_pairs), 2, replace=False)\n",
- " # Create interaction effects\n",
- " for idx in selected_interactions:\n",
- " i, j = interaction_pairs[idx]\n",
- " # Create a sparse interaction effect\n",
- " interaction_effect = np.random.choice(\n",
- " [-1, 0, 1], size=(n_categories, n_categories), p=[0.05, 0.9, 0.05]\n",
- " )\n",
- " y += interaction_effect[X[:, i], X[:, j]]\n",
- " # Add a non-linear effect for one feature\n",
- " nonlinear_feature = np.random.choice(n_features)\n",
- " y += np.square(X[:, nonlinear_feature] / (n_categories / 2) - 1)\n",
- " y = (y - np.mean(y)) / np.std(y)\n",
- " y += np.random.normal(0, 0.1, n_samples)\n",
- "\n",
- " return X, y"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def prepare_data(X):\n",
- " e1 = OneHotEncoder(sparse_output=True)\n",
- " # dense - use pd.get_dummies to mimic current practice\n",
- " Xdf = pd.DataFrame(X)\n",
- " return e1.fit_transform(X), pd.get_dummies(Xdf, columns=Xdf.columns).values\n",
- "\n",
- "def fit_and_measure(X_train, y_train, X_test, y_test):\n",
- " start_memory = get_memory_usage()\n",
- " start_time = time.time()\n",
- " m = LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, verbose=-1)\n",
- " m.fit(X_train, y_train)\n",
- " end_time = time.time()\n",
- " end_memory = get_memory_usage()\n",
- " runtime = end_time - start_time\n",
- " memory_used = end_memory - start_memory\n",
- "\n",
- " # Compute accuracy metrics\n",
- " y_pred = m.predict(X_test)\n",
- " mse = mean_squared_error(y_test, y_pred)\n",
- " r2 = r2_score(y_test, y_pred)\n",
- "\n",
- " return runtime, memory_used, mse, r2"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "X, y = generate_dummy_data()\n",
- "# Split the data into train and test sets\n",
- "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
- "X_train_sparse, X_train_dense = prepare_data(X_train)\n",
- "X_test_sparse, X_test_dense = prepare_data(X_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%time\n",
- "sparse_runtime, sparse_memory, sparse_mse, sparse_r2 = fit_and_measure(X_train_sparse, y_train, X_test_sparse, y_test)\n",
- "gc.collect()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%time\n",
- "dense_runtime, dense_memory, dense_mse, dense_r2 = fit_and_measure(X_train_dense, y_train, X_test_dense, y_test)\n",
- "gc.collect()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Mypy can't find these names/variables since they are assigned to via cell-magic.\n",
- "print(\n",
- " f\"Sparse data - Runtime: {sparse_runtime:.2f}s, \" # type: ignore[name-defined]\n",
- " f\"Memory used: {sparse_memory:.2f}MB, \" # type: ignore[name-defined]\n",
- " f\"MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}\" # type: ignore[name-defined]\n",
- ") \n",
- "print(\n",
- " f\"Dense data - Runtime: {dense_runtime:.2f}s, \" # type: ignore[name-defined]\n",
- " f\"Memory used: {dense_memory:.2f}MB, \" # type: ignore[name-defined]\n",
- " f\"MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}\" # type: ignore[name-defined]\n",
- ")\n",
- "\n",
- "print(f\"\\nSparse data memory: {X_train_sparse.data.nbytes / 1024 / 1024:.2f}MB\")\n",
- "print(f\"Dense data memory: {X_train_dense.nbytes / 1024 / 1024:.2f}MB\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -415,7 +324,10 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.12.3"
+ "version": "3.11.7"
+ },
+ "mystnb": {
+ "execution_timeout": 120
}
},
"nbformat": 4,