From e4501b4450791ac8ab3ef2e44d31ff8c6864bcba Mon Sep 17 00:00:00 2001 From: Apoorva Lal Date: Wed, 28 Aug 2024 08:54:09 -0700 Subject: [PATCH] final touches --- CHANGELOG.rst | 8 ++ docs/examples/example_estimating_ates.ipynb | 100 ++++---------------- docs/examples/example_sparse_inputs.ipynb | 93 ++++-------------- metalearners/_utils.py | 2 +- 4 files changed, 45 insertions(+), 158 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 56fd87c7..34eb7f81 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,14 @@ Changelog ========= +0.11.0 (2024-09-xx) +------------------- + +**New features** + +* Add support for using ``scipy.sparse.csr_matrix`` as datastructure for covariates ``X``. + + 0.10.0 (2024-08-13) ------------------- diff --git a/docs/examples/example_estimating_ates.ipynb b/docs/examples/example_estimating_ates.ipynb index 79a69d52..ec88ec48 100644 --- a/docs/examples/example_estimating_ates.ipynb +++ b/docs/examples/example_estimating_ates.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -99,20 +99,9 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2.083595103597918, 0.06526671583747883)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "naive_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}\", df) .fit(cov_type=\"HC1\")\n", "naive_est = naive_lm.params.iloc[1], naive_lm.bse.iloc[1]\n", @@ -121,20 +110,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2.1433722387308025, 0.06345124983351998)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "covaradjust_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}+{'+'.join(feature_columns)}\",\n", " df) .fit(cov_type=\"HC1\")\n", @@ -160,42 +138,9 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from metalearners import DRLearner\n", "from lightgbm import LGBMRegressor, LGBMClassifier\n", @@ -204,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "editable": true, "slideshow": { @@ -212,18 +157,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([1.02931589]), array([0.06679633]))" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "metalearners_dr = DRLearner(\n", " nuisance_model_factory=LGBMRegressor,\n", @@ -629,8 +563,14 @@ } ], "metadata": { + "kernelspec": { + "display_name": "py311", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "name": "python", + "version": "3.11.7" }, "mystnb": { "execution_timeout": 120 diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb index cc351bfa..03cad0c2 100644 --- a/docs/examples/example_sparse_inputs.ipynb +++ b/docs/examples/example_sparse_inputs.ipynb @@ -20,42 +20,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import time, psutil, os, gc\n", "import numpy as np\n", @@ -76,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -104,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -175,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -192,19 +159,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Sparse data memory: 0.76MB\n", - "Dense data memory: 41.28MB\n" - ] - } - ], + "outputs": [], "source": [ "print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n", "print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")" @@ -219,11 +176,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "def fit_drlearner_wrapper(X):\n", + "def fit_drlearner_wrapper(X, name):\n", " start_memory = get_memory_usage()\n", " start_time = time.time()\n", " metalearners_dr = DRLearner(\n", @@ -251,7 +208,7 @@ " end_memory = get_memory_usage()\n", " runtime = end_time - start_time\n", " memory_used = end_memory - start_memory\n", - " print(f\"Sparse data - Runtime: {runtime:.2f}s, Memory used: {memory_used:.2f}MB\")\n", + " print(f\"{name} data - Runtime: {runtime:.2f}s, Memory used: {memory_used:.2f}MB\")\n", " print(metalearners_est)" ] }, @@ -264,20 +221,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sparse data - Runtime: 3.06s, Memory used: 115.93MB\n", - "(array([1.0161235]), array([0.06374022]))\n" - ] - } - ], + "outputs": [], "source": [ - "fit_drlearner_wrapper(X_csr)" + "fit_drlearner_wrapper(X_csr, \"Sparse\")" ] }, { @@ -289,20 +237,11 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sparse data - Runtime: 6.91s, Memory used: 131.66MB\n", - "(array([1.01609547]), array([0.06384197]))\n" - ] - } - ], + "outputs": [], "source": [ - "fit_drlearner_wrapper(X_np)" + "fit_drlearner_wrapper(X_np, \"Dense\")" ] } ], diff --git a/metalearners/_utils.py b/metalearners/_utils.py index c1d63d67..a5c02f37 100644 --- a/metalearners/_utils.py +++ b/metalearners/_utils.py @@ -25,7 +25,7 @@ default_rng = np.random.default_rng() -def safe_len(X): +def safe_len(X: Matrix) -> int: if scipy.sparse.issparse(X): return X.shape[0] return len(X)