diff --git a/docs/examples/example_estimating_ates.ipynb b/docs/examples/example_estimating_ates.ipynb index 0eda7327..4cbd597c 100644 --- a/docs/examples/example_estimating_ates.ipynb +++ b/docs/examples/example_estimating_ates.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -99,9 +99,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2.083595103597918, 0.06526671583747883)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "naive_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}\", df).fit(cov_type=\"HC1\")\n", "naive_est = naive_lm.params.iloc[1], naive_lm.bse.iloc[1]\n", @@ -110,9 +121,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2.1433722387308025, 0.06345124983351998)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "covaradjust_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}+{'+'.join(feature_columns)}\",\n", " df).fit(cov_type=\"HC1\")\n", @@ -138,9 +160,42 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from metalearners import DRLearner\n", "from lightgbm import LGBMRegressor, LGBMClassifier\n", @@ -149,9 +204,26 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([1.02931589]), array([0.06679633]))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "metalearners_dr = DRLearner(\n", " nuisance_model_factory=LGBMRegressor,\n", @@ -558,9 +630,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "dev", "language": "python", - "name": "python3" + "name": "dev" }, "language_info": { "codemirror_mode": { @@ -572,7 +644,10 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.7" + }, + "mystnb": { + "execution_timeout": 60 } }, "nbformat": 4, diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb index 04677eca..50c797b3 100644 --- a/docs/examples/example_sparse_inputs.ipynb +++ b/docs/examples/example_sparse_inputs.ipynb @@ -20,9 +20,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import time, psutil, os, gc\n", "import numpy as np\n", @@ -43,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": 
[], "source": [ @@ -71,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -141,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -154,9 +187,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Sparse data memory: 7.63MB\n", + "Dense data memory: 953.66MB\n" + ] + } + ], "source": [ "print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n", "print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")" @@ -171,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -207,15 +250,6 @@ " print(metalearners_est)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gc.collect()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -225,12 +259,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sparse data - Runtime: 13.27s, Memory used: 345.56MB\n", + "(array([1.0007226]), array([0.02021719]))\n" + ] + } + ], "source": [ - "fit_drlearner_wrapper(X_csr)\n", - "gc.collect()" + "fit_drlearner_wrapper(X_csr)" ] }, { @@ -242,12 +284,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sparse data - Runtime: 149.10s, Memory used: 70.43MB\n", + "(array([1.00067664]), array([0.02021555]))\n" + ] + } + ], "source": [ - "fit_drlearner_wrapper(X_np)\n", - "gc.collect()" + "fit_drlearner_wrapper(X_np)" ] }, { @@ -256,147 +306,6 @@ "source": [ "In this (admittedly somewhat contrived) example, we that solving the DRLearner problem with sparse inputs takes around 1/8 of the time compared to dense inputs at the cost of some more memory usage in estimation. " ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prediction \n", - "\n", - "These benefits aren't limited to causal inference. We can also use sparse matrices for prediction tasks as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generate_dummy_data(n_samples=100000, n_categories=1000, n_features=20):\n", - " X = np.random.randint(0, n_categories, size=(n_samples, n_features))\n", - " y = np.zeros(n_samples)\n", - " # Select a few features for main effects\n", - " main_effect_features = np.random.choice(n_features, 3, replace=False)\n", - " # Create main effects\n", - " for i in main_effect_features:\n", - " # Create a random effect for each category\n", - " category_effects = np.random.normal(0, 1, n_categories)\n", - " y += category_effects[X[:, i]]\n", - " # Select a couple of feature pairs for interaction effects\n", - " interaction_pairs = [\n", - " (i, j) for i in range(n_features) for j in range(i + 1, n_features)\n", - " ]\n", - " selected_interactions = np.random.choice(len(interaction_pairs), 2, replace=False)\n", - " # Create interaction effects\n", - " for idx in selected_interactions:\n", - " i, j = interaction_pairs[idx]\n", - " # Create a sparse interaction effect\n", - " interaction_effect = np.random.choice(\n", - " [-1, 0, 1], size=(n_categories, n_categories), p=[0.05, 0.9, 0.05]\n", - " )\n", - " y += interaction_effect[X[:, i], X[:, j]]\n", - " # Add a non-linear effect for one feature\n", - " nonlinear_feature = np.random.choice(n_features)\n", - " y += np.square(X[:, nonlinear_feature] / (n_categories / 2) - 1)\n", - " y = (y - np.mean(y)) / np.std(y)\n", - " y += np.random.normal(0, 0.1, n_samples)\n", - "\n", - " return X, y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_data(X):\n", - " e1 = OneHotEncoder(sparse_output=True)\n", - " # dense - use pd.get_dummies to mimic current practice\n", - " Xdf = pd.DataFrame(X)\n", - " return e1.fit_transform(X), pd.get_dummies(Xdf, columns=Xdf.columns).values\n", - "\n", - "def fit_and_measure(X_train, y_train, X_test, y_test):\n", - " start_memory = get_memory_usage()\n", - " start_time = time.time()\n", - " m = LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, verbose=-1)\n", - " m.fit(X_train, y_train)\n", - " end_time = time.time()\n", - " end_memory = get_memory_usage()\n", - " runtime = end_time - start_time\n", - " memory_used = end_memory - start_memory\n", - "\n", - " # Compute accuracy metrics\n", - " y_pred = m.predict(X_test)\n", - " mse = mean_squared_error(y_test, y_pred)\n", - " r2 = r2_score(y_test, y_pred)\n", - "\n", - " return runtime, memory_used, mse, r2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X, y = generate_dummy_data()\n", - "# Split the data into train and test sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "X_train_sparse, X_train_dense = prepare_data(X_train)\n", - "X_test_sparse, X_test_dense = prepare_data(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "sparse_runtime, sparse_memory, sparse_mse, sparse_r2 = fit_and_measure(X_train_sparse, y_train, X_test_sparse, y_test)\n", - "gc.collect()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "dense_runtime, dense_memory, dense_mse, dense_r2 = fit_and_measure(X_train_dense, y_train, X_test_dense, y_test)\n", - "gc.collect()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Mypy can't find these names/variables since they are assigned to via cell-magic.\n", - "print(\n", - " f\"Sparse data - Runtime: {sparse_runtime:.2f}s, \" # type: ignore[name-defined]\n", - " f\"Memory used: {sparse_memory:.2f}MB, \" # type: ignore[name-defined]\n", - " f\"MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}\" # type: ignore[name-defined]\n", - ") \n", - "print(\n", - " f\"Dense data - Runtime: {dense_runtime:.2f}s, \" # type: ignore[name-defined]\n", - " f\"Memory used: {dense_memory:.2f}MB, \" # type: ignore[name-defined]\n", - " f\"MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}\" # type: ignore[name-defined]\n", - ")\n", - "\n", - "print(f\"\\nSparse data memory: {X_train_sparse.data.nbytes / 1024 / 1024:.2f}MB\")\n", - "print(f\"Dense data memory: {X_train_dense.nbytes / 1024 / 1024:.2f}MB\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -415,7 +324,10 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.11.7" + }, + "mystnb": { + "execution_timeout": 120 } }, "nbformat": 4,