From c1351d0184573f9325b9d88e8607a0e781e13216 Mon Sep 17 00:00:00 2001
From: Apoorva Lal
Date: Fri, 16 Aug 2024 15:54:32 -0700
Subject: [PATCH 1/9] add support for csr matrix

---
 docs/examples/example_sparse_inputs.ipynb | 550 ++++++++++++++++++++++
 docs/examples/index.rst                   |   1 +
 metalearners/_typing.py                   |   3 +-
 metalearners/_utils.py                    |   7 +
 metalearners/cross_fit_estimator.py       |  19 +-
 metalearners/drlearner.py                 |   7 +-
 metalearners/explainer.py                 |   4 +-
 metalearners/metalearner.py               |   5 +-
 metalearners/rlearner.py                  |   9 +-
 metalearners/slearner.py                  |   3 +-
 metalearners/utils.py                     |   5 +-
 metalearners/xlearner.py                  |   7 +-
 12 files changed, 596 insertions(+), 24 deletions(-)
 create mode 100644 docs/examples/example_sparse_inputs.ipynb

diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb
new file mode 100644
index 00000000..79a15786
--- /dev/null
+++ b/docs/examples/example_sparse_inputs.ipynb
@@ -0,0 +1,550 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "(example-sparse)=\n",
+    "\n",
+    "Example: Using Sparse Covariate Matrices\n",
+    "=============================\n",
+    "\n",
+    "Motivation\n",
+    "----------\n",
+    "\n",
+    "In many applications, we want to adjust for categorical covariates with many levels. A natural pre-processing step is to one-hot-encode these covariates, which yields a high-dimensional and typically very sparse covariate matrix. Many scikit-learn-style learners accept scipy's sparse matrices as input, which allows us to use them for treatment effect estimation as well.\n",
+    "\n",
+    "Example\n",
+    "-------"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import time, psutil, os, gc\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import scipy as sp\n",
+    "\n",
+    "from sklearn.dummy import DummyRegressor\n",
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import mean_squared_error, r2_score\n",
+    "\n",
+    "from lightgbm import LGBMRegressor, LGBMClassifier\n",
+    "from metalearners import DRLearner"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_memory_usage():\n",
+    "    process = psutil.Process(os.getpid())\n",
+    "    return process.memory_info().rss / 1024 / 1024  # in MB\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Causal Inference\n",
+    "\n",
+    "### DRLearner\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We generate some data where X comprises 100 categorical variables, each with 1000 possible levels. Naively one-hot-encoding this data produces a very large matrix with many zeroes, which makes it an ideal use case for `scipy.sparse.csr_matrix`. We then use the `DRLearner` to estimate the treatment effect.\n",
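+    "\n",
+    "To see why CSR helps here, consider a minimal, purely illustrative sketch (the tiny matrix below is a stand-in, not part of this example's data): a `csr_matrix` stores only the non-zero values plus their column indices and row pointers, and each one-hot-encoded row contains exactly one non-zero entry per original column.\n",
+    "\n",
+    "```python\n",
+    "# illustrative only: inspect what a CSR matrix actually stores\n",
+    "from scipy.sparse import csr_matrix\n",
+    "tiny = csr_matrix(np.eye(3))  # stand-in for a one-hot block\n",
+    "print(tiny.data)  # non-zero values: [1. 1. 1.]\n",
+    "print(tiny.indices, tiny.indptr)  # column indices and row pointers\n",
+    "```"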
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_causal_data(\n",
+    "    n_samples=100_000,\n",
+    "    n_categories=1000,\n",
+    "    n_features=100,\n",
+    "    tau_magnitude=1.0,\n",
+    "):\n",
+    "    ######################################################################\n",
+    "    # Generate covariate matrix X\n",
+    "    X = np.random.randint(0, n_categories, size=(n_samples, n_features))\n",
+    "    ######################################################################\n",
+    "    # Generate potential outcome y0\n",
+    "    y0 = np.zeros(n_samples)\n",
+    "    # Select a few features for main effects\n",
+    "    main_effect_features = np.random.choice(n_features, 3, replace=False)\n",
+    "    # Create main effects - fully dense\n",
+    "    for i in main_effect_features:\n",
+    "        category_effects = np.random.normal(0, 4, n_categories)\n",
+    "        y0 += category_effects[X[:, i]]\n",
+    "    # Select a couple of feature pairs for interaction effects\n",
+    "    interaction_pairs = [\n",
+    "        (i, j) for i in range(n_features) for j in range(i + 1, n_features)\n",
+    "    ]\n",
+    "    selected_interactions = np.random.choice(len(interaction_pairs), 2, replace=False)\n",
+    "    # Create interaction effects\n",
+    "    for idx in selected_interactions:\n",
+    "        i, j = interaction_pairs[idx]\n",
+    "        interaction_effect = np.random.choice(\n",
+    "            [-1, 0, 1], size=(n_categories, n_categories), p=[0.25, 0.5, 0.25]\n",
+    "        )\n",
+    "        y0 += interaction_effect[X[:, i], X[:, j]]\n",
+    "    # Normalize y0\n",
+    "    y0 = (y0 - np.mean(y0)) / np.std(y0)\n",
+    "    y0 += np.random.normal(0, 0.1, n_samples)\n",
+    "    ######################################################################\n",
+    "    # Generate treatment assignment W\n",
+    "    propensity_score = np.zeros(n_samples)\n",
+    "    for i in main_effect_features:\n",
+    "        category_effects = np.random.normal(0, 4, n_categories)\n",
+    "        propensity_score += category_effects[X[:, i]]\n",
+    "    # same interactions enter pscore\n",
+    "    # Create interaction effects\n",
+    "    for idx in selected_interactions:\n",
+    "        i, j = interaction_pairs[idx]\n",
+    "        interaction_effect = np.random.choice(\n",
+    "            [-1, 0, 1], size=(n_categories, n_categories), p=[0.25, 0.5, 0.25]\n",
+    "        )\n",
+    "        propensity_score += interaction_effect[X[:, i], X[:, j]]\n",
+    "    # Convert to probabilities using logistic function\n",
+    "    propensity_score = sp.special.expit(propensity_score)\n",
+    "    # Generate binary treatment\n",
+    "    W = np.random.binomial(1, propensity_score)\n",
+    "    ######################################################################\n",
+    "    # Generate treatment effect\n",
+    "    tau = tau_magnitude * np.ones(n_samples)\n",
+    "    # Generate final outcome\n",
+    "    Y = y0 + W * tau\n",
+    "    return X, W, Y, tau, propensity_score\n",
+    "\n",
+    "\n",
+    "X, W, Y, tau, propensity_score = generate_causal_data(\n",
+    "    n_samples=10000, tau_magnitude=1.0\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sparse and dense X matrices\n",
+    "e1 = OneHotEncoder(sparse_output=True)  # onehot encoder generates sparse output automatically\n",
+    "Xdf = pd.DataFrame(X)\n",
+    "X_csr = e1.fit_transform(X)\n",
+    "X_np = pd.get_dummies(Xdf, columns=Xdf.columns).values  # dense onehot encoding with pandas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Sparse data memory: 7.63MB\n",
+      "Dense data memory: 953.64MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n",
+    "print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As expected, the memory footprint of the sparse matrix is considerably smaller than that of the dense matrix.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fit_drlearner_wrapper(X):\n",
+    "    start_memory = get_memory_usage()\n",
+    "    start_time = time.time()\n",
+    "    metalearners_dr = DRLearner(\n",
+    "        nuisance_model_factory=LGBMRegressor,\n",
+    "        treatment_model_factory=DummyRegressor,\n",
+    "        propensity_model_factory=LGBMClassifier,\n",
+    "        is_classification=False,\n",
+    "        n_variants=2,\n",
+    "        nuisance_model_params={\"verbose\": -1},\n",
+    "        propensity_model_params={\"verbose\": -1},\n",
+    "    )\n",
+    "\n",
+    "    metalearners_dr.fit_all_nuisance(\n",
+    "        X=X,\n",
+    "        y=Y,\n",
+    "        w=W,\n",
+    "    )\n",
+    "    metalearners_est = metalearners_dr.average_treatment_effect(\n",
+    "        X=X,\n",
+    "        y=Y,\n",
+    "        w=W,\n",
+    "        is_oos=False,\n",
+    "    )\n",
+    "    end_time = time.time()\n",
+    "    end_memory = get_memory_usage()\n",
+    "    runtime = end_time - start_time\n",
+    "    memory_used = end_memory - start_memory\n",
+    "    print(f\"Runtime: {runtime:.2f}s, Memory used: {memory_used:.2f}MB\")\n",
+    "    print(metalearners_est)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gc.collect()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`scipy.sparse.csr_matrix` input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Runtime: 15.04s, Memory used: 324.86MB\n",
+      "(array([0.9523358]), array([0.0202085]))\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "318"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fit_drlearner_wrapper(X_csr)\n",
+    "gc.collect()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`np.ndarray` input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Runtime: 117.22s, Memory used: 87.21MB\n",
+      "(array([0.95124745]), array([0.02021724]))\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "190"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fit_drlearner_wrapper(X_np)\n",
+    "gc.collect()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this (admittedly somewhat contrived) example, we see that solving the DRLearner problem with sparse inputs takes around 1/8 of the time needed with dense inputs, at the cost of some additional memory usage during estimation. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prediction \n",
+    "\n",
+    "These benefits aren't limited to causal inference: we can use sparse matrices for prediction tasks as well."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_dummy_data(n_samples=100000, n_categories=1000, n_features=20):\n",
+    "    X = np.random.randint(0, n_categories, size=(n_samples, n_features))\n",
+    "    y = np.zeros(n_samples)\n",
+    "    # Select a few features for main effects\n",
+    "    main_effect_features = np.random.choice(n_features, 3, replace=False)\n",
+    "    # Create main effects\n",
+    "    for i in main_effect_features:\n",
+    "        # Create a random effect for each category\n",
+    "        category_effects = np.random.normal(0, 1, n_categories)\n",
+    "        y += category_effects[X[:, i]]\n",
+    "    # Select a couple of feature pairs for interaction effects\n",
+    "    interaction_pairs = [\n",
+    "        (i, j) for i in range(n_features) for j in range(i + 1, n_features)\n",
+    "    ]\n",
+    "    selected_interactions = np.random.choice(len(interaction_pairs), 2, replace=False)\n",
+    "    # Create interaction effects\n",
+    "    for idx in selected_interactions:\n",
+    "        i, j = interaction_pairs[idx]\n",
+    "        # Create a sparse interaction effect\n",
+    "        interaction_effect = np.random.choice(\n",
+    "            [-1, 0, 1], size=(n_categories, n_categories), p=[0.05, 0.9, 0.05]\n",
+    "        )\n",
+    "        y += interaction_effect[X[:, i], X[:, j]]\n",
+    "    # Add a non-linear effect for one feature\n",
+    "    nonlinear_feature = np.random.choice(n_features)\n",
+    "    y += np.square(X[:, nonlinear_feature] / (n_categories / 2) - 1)\n",
+    "    y = (y - np.mean(y)) / np.std(y)\n",
+    "    y += np.random.normal(0, 0.1, n_samples)\n",
+    "\n",
+    "    return X, y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prepare_data(X):\n",
+    "    e1 = OneHotEncoder(sparse_output=True)\n",
+    "    # dense - use pd.get_dummies to mimic current practice\n",
+    "    Xdf = pd.DataFrame(X)\n",
+    "    return e1.fit_transform(X), pd.get_dummies(Xdf, columns=Xdf.columns).values\n",
+    "\n",
+    "def fit_and_measure(X_train, y_train, X_test, y_test):\n",
+    "    start_memory = get_memory_usage()\n",
+    "    start_time = time.time()\n",
+    "    m = LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, verbose=-1)\n",
+    "    m.fit(X_train, y_train)\n",
+    "    end_time = time.time()\n",
+    "    end_memory = get_memory_usage()\n",
+    "    runtime = end_time - start_time\n",
+    "    memory_used = end_memory - start_memory\n",
+    "\n",
+    "    # Compute accuracy metrics\n",
+    "    y_pred = m.predict(X_test)\n",
+    "    mse = mean_squared_error(y_test, y_pred)\n",
+    "    r2 = r2_score(y_test, y_pred)\n",
+    "\n",
+    "    return runtime, memory_used, mse, r2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X, y = generate_dummy_data()\n",
+    "# Split the data into train and test sets\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+    "X_train_sparse, X_train_dense = prepare_data(X_train)\n",
+    "X_test_sparse, X_test_dense = prepare_data(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1min 4s, sys: 1.83 s, total: 1min 5s\n",
+      "Wall time: 8.6 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "39"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "sparse_runtime, sparse_memory, sparse_mse, sparse_r2 = fit_and_measure(X_train_sparse, y_train, X_test_sparse, y_test)\n",
+    "gc.collect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 2min 50s, sys: 6.88 s, total: 2min 57s\n",
+      "Wall time: 31.6 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "35"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "dense_runtime, dense_memory, dense_mse, dense_r2 = fit_and_measure(X_train_dense, y_train, X_test_dense, y_test)\n",
+    "gc.collect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sparse data - Runtime: 8.48s, Memory used: 69.88MB, MSE: 0.8659, R2: 0.1396\n",
+      "Dense data - Runtime: 30.58s, Memory used: 1.21MB, MSE: 0.8659, R2: 0.1396\n",
+      "\n",
+      "Sparse data memory: 12.21MB\n",
+      "Dense data memory: 1525.88MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\n",
+    "    f\"Sparse data - Runtime: {sparse_runtime:.2f}s, Memory used: {sparse_memory:.2f}MB, MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}\"\n",
+    ")\n",
+    "print(\n",
+    "    f\"Dense data - Runtime: {dense_runtime:.2f}s, Memory used: {dense_memory:.2f}MB, MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}\"\n",
+    ")\n",
+    "\n",
+    "print(f\"\\nSparse data memory: {X_train_sparse.data.nbytes / 1024 / 1024:.2f}MB\")\n",
+    "print(f\"Dense data memory: {X_train_dense.nbytes / 1024 / 1024:.2f}MB\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py311",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/examples/index.rst b/docs/examples/index.rst
index b91cea15..81401fdc 100644
--- a/docs/examples/index.rst
+++ b/docs/examples/index.rst
@@ -16,3 +16,4 @@ Examples
    Estimating CATEs for survival analysis
    What if I know the propensity score?
    Converting a MetaLearner to ONNX
+   Using Sparse Covariate Matrices
diff --git a/metalearners/_typing.py b/metalearners/_typing.py
index 95b66b8b..10a6f4d7 100644
--- a/metalearners/_typing.py
+++ b/metalearners/_typing.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+import scipy.sparse as sps
 
 PredictMethod = Literal["predict", "predict_proba"]
 
@@ -21,7 +22,7 @@
 # ruff is not happy about the usage of Union.
 Vector = Union[pd.Series, np.ndarray]  # noqa
-Matrix = Union[pd.DataFrame, np.ndarray]  # noqa
+Matrix = Union[pd.DataFrame, np.ndarray, sps.csr_matrix]  # noqa
 
 
 class _ScikitModel(Protocol):
diff --git a/metalearners/_utils.py b/metalearners/_utils.py
index 2337c421..c1d63d67 100644
--- a/metalearners/_utils.py
+++ b/metalearners/_utils.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 import pandas as pd
+import scipy
 from sklearn.base import check_array, check_X_y, is_classifier, is_regressor
 from sklearn.ensemble import (
     HistGradientBoostingClassifier,
@@ -24,6 +25,12 @@
 default_rng = np.random.default_rng()
 
 
+def safe_len(X):
+    if scipy.sparse.issparse(X):
+        return X.shape[0]
+    return len(X)
+
+
 def index_matrix(matrix: Matrix, rows: Vector) -> Matrix:
     """Subselect certain rows from a matrix."""
     if isinstance(rows, pd.Series):
diff --git a/metalearners/cross_fit_estimator.py b/metalearners/cross_fit_estimator.py
index aa112c03..568ca274 100644
--- a/metalearners/cross_fit_estimator.py
+++ b/metalearners/cross_fit_estimator.py
@@ -16,7 +16,12 @@
 from typing_extensions import Self
 
 from metalearners._typing import Matrix, OosMethod, PredictMethod, SplitIndices, Vector
-from metalearners._utils import _ScikitModel, index_matrix, validate_number_positive
+from metalearners._utils import (
+    _ScikitModel,
+    index_matrix,
+    safe_len,
+    validate_number_positive,
+)
 
 OVERALL: OosMethod = "overall"
 MEDIAN: OosMethod = "median"
@@ -157,7 +162,7 @@ def fit(
             (train_indices, test_indices) tuples indicating how to split the
             data at hand into train and test/estimation sets for different folds.
         """
-        _validate_data_match_prior_split(len(X), self._test_indices)
+        _validate_data_match_prior_split(safe_len(X), self._test_indices)
 
         if fit_params is None:
             fit_params = dict()
@@ -215,13 +220,13 @@ def _n_outputs(self, method: PredictMethod) -> int:
     def _predict_all(self, X: Matrix, method: PredictMethod) -> np.ndarray:
         n_outputs = self._n_outputs(method)
         predictions = self._initialize_prediction_tensor(
-            n_observations=len(X),
+            n_observations=safe_len(X),
             n_outputs=n_outputs,
             n_folds=self.n_folds,
         )
         for i, estimator in enumerate(self._estimators):
             predictions[:, :, i] = np.reshape(
-                getattr(estimator, method)(X), (len(X), n_outputs)
+                getattr(estimator, method)(X), (safe_len(X), n_outputs)
             )
         if n_outputs == 1:
             return predictions[:, 0, :]
@@ -242,15 +247,15 @@ def _predict_in_sample(
     ) -> np.ndarray:
         if not self._test_indices:
             raise ValueError()
-        if len(X) != sum(len(fold) for fold in self._test_indices):
+        if safe_len(X) != sum(len(fold) for fold in self._test_indices):
             raise ValueError(
                 "Trying to predict in-sample on data that is unlike data encountered in training. "
                 f"Training data included {sum(len(fold) for fold in self._test_indices)} "
-                f"observations while prediction data includes {len(X)} observations."
+                f"observations while prediction data includes {safe_len(X)} observations."
             )
         n_outputs = self._n_outputs(method)
         predictions = self._initialize_prediction_tensor(
-            n_observations=len(X),
+            n_observations=safe_len(X),
             n_outputs=n_outputs,
             n_folds=1,
         )
diff --git a/metalearners/drlearner.py b/metalearners/drlearner.py
index 7bff6fb5..6c03ab36 100644
--- a/metalearners/drlearner.py
+++ b/metalearners/drlearner.py
@@ -28,6 +28,7 @@
     get_predict_proba,
     index_matrix,
     infer_input_dict,
+    safe_len,
     validate_valid_treatment_variant_not_control,
     warning_experimental_feature,
 )
@@ -253,7 +254,7 @@ def predict(
         oos_method: OosMethod = OVERALL,
     ) -> np.ndarray:
         n_outputs = 2 if self.is_classification else 1
-        estimates = np.zeros((len(X), self.n_variants - 1, n_outputs))
+        estimates = np.zeros((safe_len(X), self.n_variants - 1, n_outputs))
         for treatment_variant in range(1, self.n_variants):
             estimates_variant = self.predict_treatment(
                 X,
@@ -365,7 +366,7 @@ def average_treatment_effect(
             raise ValueError(
                 "The nuisance models need to be fitted before computing the treatment effect."
             )
-        gamma_matrix = np.zeros((len(X), self.n_variants - 1))
+        gamma_matrix = np.zeros((safe_len(X), self.n_variants - 1))
         for treatment_variant in range(1, self.n_variants):
             gamma_matrix[:, treatment_variant - 1] = self._pseudo_outcome(
                 X=X,
@@ -375,7 +376,7 @@
                 is_oos=is_oos,
             )
         treatment_effect = gamma_matrix.mean(axis=0)
-        standard_error = gamma_matrix.std(axis=0) / np.sqrt(len(X))
+        standard_error = gamma_matrix.std(axis=0) / np.sqrt(safe_len(X))
         return treatment_effect, standard_error
 
     def _pseudo_outcome(
diff --git a/metalearners/explainer.py b/metalearners/explainer.py
index 721514c3..8449bc11 100644
--- a/metalearners/explainer.py
+++ b/metalearners/explainer.py
@@ -8,7 +8,7 @@
 import shap
 
 from metalearners._typing import Matrix, _ScikitModel
-from metalearners._utils import simplify_output_2d
+from metalearners._utils import safe_len, simplify_output_2d
 from metalearners.metalearner import Params
 
 
@@ -59,7 +59,7 @@ def from_estimates(
             The ``cate_estimates`` should be the raw outcome of a MetaLearner with 3
             dimensions and should not be simplified.
         """
-        if len(X) != len(cate_estimates) or len(X) == 0:
+        if safe_len(X) != len(cate_estimates) or safe_len(X) == 0:
             raise ValueError(
                 "X and cate_estimates should contain the same number of observations "
                 "and not be empty."
diff --git a/metalearners/metalearner.py b/metalearners/metalearner.py
index 5aca3169..1ec53c90 100644
--- a/metalearners/metalearner.py
+++ b/metalearners/metalearner.py
@@ -30,6 +30,7 @@
     ONNX_PROBABILITIES_OUTPUTS,
     default_metric,
     index_matrix,
+    safe_len,
     validate_model_and_predict_method,
     validate_number_positive,
 )
@@ -120,7 +121,7 @@ def _filter_x_columns(X: Matrix, feature_set: Features) -> Matrix:
     if feature_set is None:
         X_filtered = X
     elif len(feature_set) == 0:
-        X_filtered = np.ones((len(X), 1))
+        X_filtered = np.ones((safe_len(X), 1))
     else:
         if isinstance(X, pd.DataFrame):
             X_filtered = X[list(feature_set)]
@@ -1347,7 +1348,7 @@
                 "typically set during fitting, is None."
             )
         # TODO: Consider multiprocessing
-        n_obs = len(X)
+        n_obs = safe_len(X)
         nuisance_tensors = self._nuisance_tensors(n_obs)
         conditional_average_outcomes_list = nuisance_tensors[VARIANT_OUTCOME_MODEL]
diff --git a/metalearners/rlearner.py b/metalearners/rlearner.py
index b86e9764..95cc237e 100644
--- a/metalearners/rlearner.py
+++ b/metalearners/rlearner.py
@@ -20,6 +20,7 @@
     get_predict_proba,
     index_matrix,
     infer_input_dict,
+    safe_len,
     validate_all_vectors_same_index,
     validate_valid_treatment_variant_not_control,
     warning_experimental_feature,
@@ -277,7 +278,7 @@ def predict(
         oos_method: OosMethod = OVERALL,
     ) -> np.ndarray:
         n_outputs = 2 if self.is_classification else 1
-        tau_hat = np.zeros((len(X), self.n_variants - 1, n_outputs))
+        tau_hat = np.zeros((safe_len(X), self.n_variants - 1, n_outputs))
 
         if is_oos:
 
@@ -298,7 +299,7 @@
                 variant_estimates = np.stack(
                     [-variant_estimates, variant_estimates], axis=-1
                 )
-            variant_estimates = variant_estimates.reshape(len(X), n_outputs)
+            variant_estimates = variant_estimates.reshape(safe_len(X), n_outputs)
             tau_hat[:, treatment_variant - 1, :] = variant_estimates
 
         return tau_hat
@@ -486,7 +487,7 @@ def _pseudo_outcome_and_weights(
             constant ``epsilon`` to the denominator in order to avoid numerical problems.
         """
         if mask is None:
-            mask = np.ones(len(X), dtype=bool)
+            mask = np.ones(safe_len(X), dtype=bool)
 
         validate_valid_treatment_variant_not_control(treatment_variant, self.n_variants)
@@ -560,7 +561,7 @@ def predict_conditional_average_outcomes(
         where :math:`K` is the number of treatment variants.
         """
-        n_obs = len(X)
+        n_obs = safe_len(X)
 
         cate_estimates = self.predict(
             X=X,
diff --git a/metalearners/slearner.py b/metalearners/slearner.py
index 7bda6a13..d4f48bee 100644
--- a/metalearners/slearner.py
+++ b/metalearners/slearner.py
@@ -21,6 +21,7 @@
 from metalearners._utils import (
     convert_treatment,
     get_one,
+    safe_len,
     supports_categoricals,
 )
 from metalearners.cross_fit_estimator import OVERALL, CrossFitEstimator
@@ -231,7 +232,7 @@ def evaluate(
     def predict_conditional_average_outcomes(
         self, X: Matrix, is_oos: bool, oos_method: OosMethod = OVERALL
     ) -> np.ndarray:
-        n_obs = len(X)
+        n_obs = safe_len(X)
 
         conditional_average_outcomes_list = []
         for treatment_variant in range(self.n_variants):
diff --git a/metalearners/utils.py b/metalearners/utils.py
index 587bef67..7f8ece7f 100644
--- a/metalearners/utils.py
+++ b/metalearners/utils.py
@@ -9,6 +9,7 @@
 from typing_extensions import Self
 
 from metalearners._typing import Matrix, Vector
+from metalearners._utils import safe_len
 from metalearners.drlearner import DRLearner
 from metalearners.metalearner import MetaLearner
 from metalearners.rlearner import RLearner
@@ -104,4 +105,6 @@ def predict(self, X: Matrix) -> np.ndarray[Any, Any]:
         return np.argmax(self.predict_proba(X), axis=1)
 
     def predict_proba(self, X: pd.DataFrame) -> np.ndarray[Any, Any]:
-        return np.full((len(X), 2), [1 - self.propensity_score, self.propensity_score])
+        return np.full(
+            (safe_len(X), 2), [1 - self.propensity_score, self.propensity_score]
+        )
diff --git a/metalearners/xlearner.py b/metalearners/xlearner.py
index 28bee892..016a726e 100644
--- a/metalearners/xlearner.py
+++ b/metalearners/xlearner.py
@@ -18,6 +18,7 @@
     index_matrix,
     infer_input_dict,
     infer_probabilities_output,
+    safe_len,
     validate_valid_treatment_variant_not_control,
     warning_experimental_feature,
 )
@@ -231,7 +232,7 @@ def predict(
             "typically set during fitting, is None."
         )
         n_outputs = 2 if self.is_classification else 1
-        tau_hat = np.zeros((len(X), self.n_variants - 1, n_outputs))
+        tau_hat = np.zeros((safe_len(X), self.n_variants - 1, n_outputs))
         # Propensity score model is always a classifier so we can't use MEDIAN
         propensity_score_oos = OVERALL if oos_method == MEDIAN else oos_method
         propensity_score = self.predict_nuisance(
@@ -266,8 +267,8 @@
                 oos_method=oos_method,
             )
         else:
-            tau_hat_treatment = np.zeros(len(X))
-            tau_hat_control = np.zeros(len(X))
+            tau_hat_treatment = np.zeros(safe_len(X))
+            tau_hat_control = np.zeros(safe_len(X))
 
             tau_hat_treatment[non_treatment_variant_indices] = (
                 self.predict_treatment(

From ff810340ce5321c8c25f23bd2dc00125d75133f4 Mon Sep 17 00:00:00 2001
From: kklein
Date: Tue, 27 Aug 2024 10:12:41 +0200
Subject: [PATCH 2/9] Appease mypy about notebook.

---
 docs/examples/example_sparse_inputs.ipynb | 217 +++++-----------------
 1 file changed, 45 insertions(+), 172 deletions(-)

diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb
index 79a15786..04677eca 100644
--- a/docs/examples/example_sparse_inputs.ipynb
+++ b/docs/examples/example_sparse_inputs.ipynb
@@ -20,42 +20,9 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
-  "outputs": [
-   {
-    "data": {
-     "text/html": [
-      "\n"
-     ],
-     "text/plain": [
-      ""
-     ]
-    },
-    "metadata": {},
-    "output_type": "display_data"
-   }
-  ],
+  "outputs": [],
   "source": [
    "import time, psutil, os, gc\n",
    "import numpy as np\n",
@@ -68,12 +35,15 @@
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "\n",
    "from lightgbm import LGBMRegressor, LGBMClassifier\n",
-   "from metalearners import DRLearner"
+   "from metalearners import DRLearner\n",
+   "\n",
+   "# This is required for when nbconvert converts the cell-magic to regular function calls.\n",
+   "from IPython import get_ipython"
   ]
  },
 {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -101,7 +71,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -171,7 +141,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -184,19 +154,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 15,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "\n",
-    "Sparse data memory: 7.63MB\n",
-    "Dense data memory: 953.64MB\n"
-   ]
-  }
- ],
+ "outputs": [],
  "source": [
   "print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n",
   "print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")"
  ]
@@ -211,7 +171,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -249,20 +209,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "data": {
-    "text/plain": [
-     "0"
-    ]
-   },
-   "execution_count": 6,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "gc.collect()"
  ]
@@ -276,28 +225,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "Runtime: 15.04s, Memory used: 324.86MB\n",
-    "(array([0.9523358]), array([0.0202085]))\n"
-   ]
-  },
-  {
-   "data": {
-    "text/plain": [
-     "318"
-    ]
-   },
-   "execution_count": 7,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "fit_drlearner_wrapper(X_csr)\n",
   "gc.collect()"
  ]
@@ -312,28 +242,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "Runtime: 117.22s, Memory used: 87.21MB\n",
-    "(array([0.95124745]), array([0.02021724]))\n"
-   ]
-  },
-  {
-   "data": {
-    "text/plain": [
-     "190"
-    ]
-   },
-   "execution_count": 8,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "fit_drlearner_wrapper(X_np)\n",
   "gc.collect()"
  ]
@@ -357,7 +268,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -395,7 +306,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -425,7 +336,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -438,28 +349,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "CPU times: user 1min 4s, sys: 1.83 s, total: 1min 5s\n",
-    "Wall time: 8.6 s\n"
-   ]
-  },
-  {
-   "data": {
-    "text/plain": [
-     "39"
-    ]
-   },
-   "execution_count": 12,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "%%time\n",
   "sparse_runtime, sparse_memory, sparse_mse, sparse_r2 = fit_and_measure(X_train_sparse, y_train, X_test_sparse, y_test)\n",
   "gc.collect()"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "CPU times: user 2min 50s, sys: 6.88 s, total: 2min 57s\n",
-    "Wall time: 31.6 s\n"
-   ]
-  },
-  {
-   "data": {
-    "text/plain": [
-     "35"
-    ]
-   },
-   "execution_count": 13,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "%%time\n",
   "dense_runtime, dense_memory, dense_mse, dense_r2 = fit_and_measure(X_train_dense, y_train, X_test_dense, y_test)\n",
   "gc.collect()"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "Sparse data - Runtime: 8.48s, Memory used: 69.88MB, MSE: 0.8659, R2: 0.1396\n",
-    "Dense data - Runtime: 30.58s, Memory used: 1.21MB, MSE: 0.8659, R2: 0.1396\n",
-    "\n",
-    "Sparse data memory: 12.21MB\n",
-    "Dense data memory: 1525.88MB\n"
-   ]
-  }
- ],
+ "outputs": [],
  "source": [
+  "# Mypy can't find these names/variables since they are assigned to via cell-magic.\n",
   "print(\n",
-  "    f\"Sparse data - Runtime: {sparse_runtime:.2f}s, Memory used: {sparse_memory:.2f}MB, MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}\"\n",
-  ")\n",
+  "    f\"Sparse data - Runtime: {sparse_runtime:.2f}s, \"  # type: ignore[name-defined]\n",
+  "    f\"Memory used: {sparse_memory:.2f}MB, \"  # type: ignore[name-defined]\n",
+  "    f\"MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}\"  # type: ignore[name-defined]\n",
+  ")\n",
   "print(\n",
-  "    f\"Dense data - Runtime: {dense_runtime:.2f}s, Memory used: {dense_memory:.2f}MB, MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}\"\n",
+  "    f\"Dense data - Runtime: {dense_runtime:.2f}s, \"  # type: ignore[name-defined]\n",
+  "    f\"Memory used: {dense_memory:.2f}MB, \"  # type: ignore[name-defined]\n",
+  "    f\"MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}\"  # type: ignore[name-defined]\n",
   ")\n",
   "\n",
   "print(f\"\\nSparse data memory: {X_train_sparse.data.nbytes / 1024 / 1024:.2f}MB\")\n",
   "print(f\"Dense data memory: {X_train_dense.nbytes / 1024 / 1024:.2f}MB\")"
  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
-  "display_name": "py311",
+  "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -542,9 +415,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.11.7"
+  "version": "3.12.3"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }

From 47dac106b9f6ce24c3723ca7ee7655b8ca7652f8 Mon Sep 17 00:00:00 2001
From: kklein
Date: Tue, 27 Aug 2024 12:41:09 +0200
Subject: [PATCH 3/9] Test against csr in test_cross_fit_estimator.py

---
 tests/test_cross_fit_estimator.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/test_cross_fit_estimator.py b/tests/test_cross_fit_estimator.py
index 4165a06c..abdcbf4a 100644
--- a/tests/test_cross_fit_estimator.py
+++ b/tests/test_cross_fit_estimator.py
@@ -6,6 +6,7 @@
 import numpy as np
 import pytest
 from lightgbm import LGBMClassifier, LGBMRegressor
+from scipy.sparse import csr_matrix
 from sklearn.base import is_classifier, is_regressor
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.metrics import accuracy_score, log_loss
@@ -24,10 +25,10 @@
 @pytest.mark.parametrize("predict_proba", [True, False])
 @pytest.mark.parametrize("is_oos", [True, False])
 @pytest.mark.parametrize("oos_method", ["overall", "mean", "median"])
-@pytest.mark.parametrize("use_np", [True, False])
+@pytest.mark.parametrize("backend", ["np", "pd", "csr"])
 @pytest.mark.parametrize("pass_cv", [True, False])
 def test_crossfitestimator_oos_smoke(
-    mindset_data, rng, use_clf, predict_proba, is_oos, oos_method, use_np, pass_cv
+    mindset_data, rng, use_clf, predict_proba, is_oos, oos_method, backend, pass_cv
 ):
     if not use_clf and predict_proba:
         pytest.skip()
@@ -50,9 +51,12 @@
     # Arbitrary cut-off
     y = y > 0.8
 
-    if use_np:
+    if backend == "np":
         X = X.to_numpy()
         y = y.to_numpy()
+    if backend == "csr":
+        X = csr_matrix(df.values)
+        y = y.to_numpy()
 
     cfe = CrossFitEstimator(
         n_folds=5,

From 3d2d84e15b39ee3e3c53810b0d559c25470819b5 Mon Sep 17 00:00:00 2001
From: kklein
Date: Tue, 27 Aug 2024 12:51:08 +0200
Subject: [PATCH 4/9] Test against csr in test_utils.py

---
 tests/test_cross_fit_estimator.py |  2 +-
 tests/test_utils.py               | 26 ++++++++++++++++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/tests/test_cross_fit_estimator.py b/tests/test_cross_fit_estimator.py
index abdcbf4a..eb709735 100644
--- a/tests/test_cross_fit_estimator.py
+++ b/tests/test_cross_fit_estimator.py
@@ -54,7 +54,7 @@
     if backend == "np":
         X = X.to_numpy()
         y = y.to_numpy()
-    if backend == "csr":
+    elif backend == "csr":
         X = csr_matrix(df.values)
         y = y.to_numpy()
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d88b5d14..f634b772 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import pytest
 from lightgbm import LGBMRegressor
+from scipy.sparse import csr_matrix
 
 from metalearners.metalearner import MetaLearner
 from metalearners.utils import (
@@ -59,8 +60,8 @@ def test_simplify_output_raises(input):
         simplify_output(input)
 
 
-@pytest.mark.parametrize("use_pd", [True, False])
-def test_fixed_binary_propensity(use_pd):
+@pytest.mark.parametrize("backend", ["np", "pd", "csr"])
+def test_fixed_binary_propensity(backend):
     propensity_score = 0.3
     dominant_class = propensity_score >= 0.5
 
@@ -69,19 +70,24 @@
     n_samples = 5
     X_train = np.ones((n_samples, 5))
     y_train = np.ones(n_samples)
-    if use_pd:
+
+    n_test_samples = 3
+    X_test = np.zeros((n_test_samples, 5))
+
+    expected_result = np.array(np.ones(n_test_samples) * dominant_class)
+
+    if backend == "pd":
         X_train = pd.DataFrame(X_train)
         y_train = pd.Series(y_train)
+        X_test = pd.DataFrame(X_test)
+    elif backend == "csr":
+        X_train = csr_matrix(X_train)
+        X_test = csr_matrix(X_test)
 
     model.fit(X_train, y_train)
-
-    n_test_samples = 3
-    X_test = np.zeros(n_test_samples)
-
     class_predictions = model.predict(X_test)
-    assert np.array_equal(
-        class_predictions, np.array(np.ones(n_test_samples) * dominant_class)
-    )
+
+    assert np.array_equal(class_predictions, expected_result)
 
     probability_estimates = model.predict_proba(X_test)
     assert np.array_equal(

From 7e892f2751ef40ca6a9f6e073603de926effcf6c Mon Sep 17 00:00:00 2001
From: kklein
Date: Tue, 27 Aug 2024 13:25:54 +0200
Subject: [PATCH 5/9] Adapt S-Learner to work with csr matrix.

---
 metalearners/slearner.py | 21 +++++++++++++++------
 tests/test_slearner.py   | 29 +++++++++++++++++++++--------
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/metalearners/slearner.py b/metalearners/slearner.py
index d4f48bee..bd4395d3 100644
--- a/metalearners/slearner.py
+++ b/metalearners/slearner.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+from scipy.sparse import csr_matrix, hstack
 from typing_extensions import Self
 
 from metalearners._typing import (
@@ -57,21 +58,29 @@ def _append_treatment_to_covariates(
         # names are integers and some strings.
         X_with_w.columns = X_with_w.columns.astype(str)
         return X_with_w
+    elif isinstance(X, csr_matrix):
+        return hstack((X, w_dummies), format="csr")
     else:
         return np.concatenate([X, w_dummies], axis=1)
 
+    # This is necessary as each model works differently with categoricals,
+    # in some you need to specify them on instantiation while some others on
+    # fitting. This solution converts it to a pd.DataFrame as most of the models
+    # have some "automatic" detection of categorical features based on pandas
+    # dtypes. Theoretically it would be possible to get around this conversion
+    # but it would require loads of model specific code.
     if isinstance(X, np.ndarray):
-        # This is necessary as each model works differently with categoricals,
-        # in some you need to specify them on instantiation while some others on
-        # fitting. This solutions converts it to a pd.DataFrame as most of the models
-        # have some "automatic" detection of categorical features based on pandas
-        # dtypes. Theoretically it would be possible to get around this conversion
-        # but it would require loads of model specific code.
         warnings.warn(
             "Converting the input covariates X from np.ndarray to a "
             f"pd.DataFrame as the {_BASE_MODEL} supports categorical variables."
         )
        X = pd.DataFrame(X, copy=True)
+    elif isinstance(X, csr_matrix):
+        warnings.warn(
+            "Converting the input covariates X from a scipy csr_matrix to a "
+            f"pd.DataFrame as the {_BASE_MODEL} supports categorical variables."
+        )
+        X = pd.DataFrame.sparse.from_spmatrix(X)
 
     X_with_w = pd.concat([X, pd.Series(w, dtype="category", name="treatment")], axis=1)
     X_with_w.columns = X_with_w.columns.astype(str)
diff --git a/tests/test_slearner.py b/tests/test_slearner.py
index 37f6086c..15f0eaa1 100644
--- a/tests/test_slearner.py
+++ b/tests/test_slearner.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import pytest
 from lightgbm import LGBMRegressor
+from scipy.sparse import csr_matrix
 from sklearn.linear_model import LinearRegression
 
 from metalearners.slearner import SLearner, _append_treatment_to_covariates
@@ -31,16 +32,20 @@ def test_feature_set_doesnt_raise(rng):
 @pytest.mark.parametrize(
     "model, supports_categoricals", [(LinearRegression, False), (LGBMRegressor, True)]
 )
-@pytest.mark.parametrize("use_pd", [False, True])
+@pytest.mark.parametrize("backend", ["np", "pd", "csr"])
 def test_append_treatment_to_covariates(
     model,
     supports_categoricals,
-    use_pd,
+    backend,
     sample_size,
     request,
 ):
-    dataset_name = "mixed" if use_pd else "numerical"
+    dataset_name = "mixed" if backend == "pd" else "numerical"
     covariates, _, _ = request.getfixturevalue(f"{dataset_name}_covariates")
+
+    if backend == "csr":
+        covariates = csr_matrix(covariates)
+
     treatment = np.array([0] * sample_size)
     n_variants = 4
     X_with_w = _append_treatment_to_covariates(
@@ -52,20 +57,28 @@
         list(range(n_variants))
     )
 
-    if not use_pd and not supports_categoricals:
-        assert isinstance(X_with_w, np.ndarray)
+    if backend in ["np", "csr"] and not supports_categoricals:
+        if backend == "np":
+            assert isinstance(X_with_w, np.ndarray)
+        elif backend == "csr":
+            assert isinstance(X_with_w, csr_matrix)
         assert (
             (
                 X_with_w[:, -3:]
-                == pd.get_dummies(treatment_pd, dtype=int, drop_first=True)
+                == pd.get_dummies(treatment_pd, dtype=int, drop_first=True).values
             )
             .all()
            .all()
         )
-        assert np.all(X_with_w[:, :-3] == covariates)
+        assert (X_with_w[:, :-3] != covariates).sum() < 1
     else:
         assert isinstance(X_with_w, pd.DataFrame)
-        covariates_pd = pd.DataFrame(covariates) if not use_pd else covariates
+        if backend == "np":
+            covariates_pd = pd.DataFrame(covariates)
+        elif backend == "csr":
+            covariates_pd = pd.DataFrame.sparse.from_spmatrix(covariates)
+        else:
+            covariates_pd = covariates
         covariates_pd.columns = covariates_pd.columns.astype(str)
         if not supports_categoricals:
             assert X_with_w[["treatment_1", "treatment_2", "treatment_3"]].equals(

From 43da98913292501d36bbb1e36078a505210009e1 Mon Sep 17 00:00:00 2001
From: kklein
Date: Tue, 27 Aug 2024 13:26:49 +0200
Subject: [PATCH 6/9] Test against csr matrix in test_learner and test_metalearner.
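
Analogously to the other test modules, the affected tests are parametrized
over a `backend` value instead of a boolean `use_pandas` flag. A sketch of
the shared pattern (names as in the diffs below):

    if backend == "pd":
        X = pd.DataFrame(X)
    elif backend == "csr":
        X = csr_matrix(X)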
--- tests/test_learner.py | 9 ++++++--- tests/test_metalearner.py | 25 ++++++++++++++++--------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/tests/test_learner.py b/tests/test_learner.py index 01d33754..afe26342 100644 --- a/tests/test_learner.py +++ b/tests/test_learner.py @@ -5,6 +5,7 @@ import pandas as pd import pytest from lightgbm import LGBMClassifier, LGBMRegressor +from scipy.sparse import csr_matrix from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.metrics import make_scorer, root_mean_squared_error from sklearn.model_selection import train_test_split @@ -939,16 +940,18 @@ def test_model_reusage(outcome_kind, request): ), ], ) -@pytest.mark.parametrize("use_pandas", [False, True]) -def test_evaluate_feature_set_smoke(metalearner_factory, feature_set, rng, use_pandas): +@pytest.mark.parametrize("backend", ["np", "pd", "csr"]) +def test_evaluate_feature_set_smoke(metalearner_factory, feature_set, rng, backend): n_samples = 100 X = rng.standard_normal((n_samples, 5)) y = rng.standard_normal(n_samples) w = rng.integers(0, 2, n_samples) - if use_pandas: + if backend == "pd": X = pd.DataFrame(X) y = pd.Series(y) w = pd.Series(w) + elif backend == "csr": + X = csr_matrix(X) ml = metalearner_factory( n_variants=2, diff --git a/tests/test_metalearner.py b/tests/test_metalearner.py index d9ac1f68..133d3d7f 100644 --- a/tests/test_metalearner.py +++ b/tests/test_metalearner.py @@ -9,6 +9,7 @@ import pandas as pd import pytest from lightgbm import LGBMClassifier, LGBMRegressor +from scipy.sparse import csr_matrix from shap import TreeExplainer, summary_plot from sklearn.base import BaseEstimator from sklearn.linear_model import LinearRegression, LogisticRegression @@ -480,8 +481,8 @@ def test_combine_propensity_and_nuisance_specs( ), ], ) -@pytest.mark.parametrize("use_pandas", [False, True]) -def test_feature_set(feature_set, expected_n_features, use_pandas, rng): +@pytest.mark.parametrize("backend", ["np", "pd", "csr"]) +def test_feature_set(feature_set, expected_n_features, backend, rng): ml = _TestMetaLearner( nuisance_model_factory=LGBMRegressor, is_classification=False, @@ -495,10 +496,12 @@ def test_feature_set(feature_set, expected_n_features, use_pandas, rng): X = rng.standard_normal((sample_size, n_features)) y = rng.standard_normal(sample_size) w = rng.integers(0, 2, sample_size) - if use_pandas: + if backend == "pd": X = pd.DataFrame(X) y = pd.Series(y) w = pd.Series(w) + elif backend == "csr": + X = csr_matrix(X) ml.fit(X, y, w) for model_kind, model_kind_list in ml._nuisance_models.items(): @@ -1078,15 +1081,17 @@ def test_n_jobs_base_learners(implementation, rng): "implementation", [TLearner, SLearner, XLearner, RLearner, DRLearner], ) -@pytest.mark.parametrize("use_pandas", [False, True]) -def test_validate_outcome_one_class(implementation, use_pandas, rng): +@pytest.mark.parametrize("backend", ["np", "pd", "csr"]) +def test_validate_outcome_one_class(implementation, backend, rng): X = rng.standard_normal((10, 2)) y = np.zeros(10) w = rng.integers(0, 2, 10) - if use_pandas: + if backend == "pandas": X = pd.DataFrame(X) y = pd.Series(y) w = pd.Series(w) + elif backend == "csr": + X = csr_matrix(X) ml = implementation( True, @@ -1106,15 +1111,17 @@ def test_validate_outcome_one_class(implementation, use_pandas, rng): "implementation", [TLearner, SLearner, XLearner, RLearner, DRLearner], ) -@pytest.mark.parametrize("use_pandas", [False, True]) -def test_validate_outcome_different_classes(implementation, use_pandas, rng): 
+@pytest.mark.parametrize("backend", ["np", "pd", "csr"]) +def test_validate_outcome_different_classes(implementation, backend, rng): X = rng.standard_normal((4, 2)) y = np.array([0, 1, 0, 0]) w = np.array([0, 0, 1, 1]) - if use_pandas: + if backend == "pd": X = pd.DataFrame(X) y = pd.Series(y) w = pd.Series(w) + elif backend == "csr": + X = csr_matrix(X) ml = implementation( True, From 9a6ed735d47db12458ac18e241d48042d867cfe7 Mon Sep 17 00:00:00 2001 From: Apoorva Lal Date: Tue, 27 Aug 2024 13:10:40 -0700 Subject: [PATCH 7/9] fix notebook metadata --- docs/examples/example_estimating_ates.ipynb | 119 ++++++--- docs/examples/example_sparse_inputs.ipynb | 262 ++++++-------------- 2 files changed, 170 insertions(+), 211 deletions(-) diff --git a/docs/examples/example_estimating_ates.ipynb b/docs/examples/example_estimating_ates.ipynb index 0eda7327..79a69d52 100644 --- a/docs/examples/example_estimating_ates.ipynb +++ b/docs/examples/example_estimating_ates.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -99,9 +99,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2.083595103597918, 0.06526671583747883)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "naive_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}\", df) .fit(cov_type=\"HC1\")\n", "naive_est = naive_lm.params.iloc[1], naive_lm.bse.iloc[1]\n", @@ -110,9 +121,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2.1433722387308025, 0.06345124983351998)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "covaradjust_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}+{'+'.join(feature_columns)}\",\n", " df) .fit(cov_type=\"HC1\")\n", @@ -138,9 +160,42 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from metalearners import DRLearner\n", "from lightgbm import LGBMRegressor, LGBMClassifier\n", @@ -149,9 +204,26 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([1.02931589]), array([0.06679633]))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "metalearners_dr = DRLearner(\n", " nuisance_model_factory=LGBMRegressor,\n", @@ -557,22 +629,11 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 
"3.11.9" + "name": "python" + }, + "mystnb": { + "execution_timeout": 120 } }, "nbformat": 4, diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb index 04677eca..56589580 100644 --- a/docs/examples/example_sparse_inputs.ipynb +++ b/docs/examples/example_sparse_inputs.ipynb @@ -20,9 +20,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import time, psutil, os, gc\n", "import numpy as np\n", @@ -43,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -71,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -141,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -154,9 +187,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Sparse data memory: 7.63MB\n", + "Dense data memory: 953.66MB\n" + ] + } + ], "source": [ "print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n", "print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")" @@ -171,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -207,15 +250,6 @@ " print(metalearners_est)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gc.collect()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -225,12 +259,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sparse data - Runtime: 13.27s, Memory used: 345.56MB\n", + "(array([1.0007226]), array([0.02021719]))\n" + ] + } + ], "source": [ - "fit_drlearner_wrapper(X_csr)\n", - "gc.collect()" + "fit_drlearner_wrapper(X_csr)" ] }, { @@ -242,12 +284,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sparse data - Runtime: 149.10s, Memory used: 70.43MB\n", + "(array([1.00067664]), array([0.02021555]))\n" + ] + } + ], "source": [ - "fit_drlearner_wrapper(X_np)\n", - "gc.collect()" + "fit_drlearner_wrapper(X_np)" ] }, { @@ -256,168 +306,16 @@ "source": [ "In this (admittedly somewhat contrived) example, we that solving the DRLearner problem with sparse inputs takes around 1/8 of the time compared to dense inputs at the cost of some more memory usage in estimation. " ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prediction \n", - "\n", - "These benefits aren't limited to causal inference. We can also use sparse matrices for prediction tasks as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generate_dummy_data(n_samples=100000, n_categories=1000, n_features=20):\n", - " X = np.random.randint(0, n_categories, size=(n_samples, n_features))\n", - " y = np.zeros(n_samples)\n", - " # Select a few features for main effects\n", - " main_effect_features = np.random.choice(n_features, 3, replace=False)\n", - " # Create main effects\n", - " for i in main_effect_features:\n", - " # Create a random effect for each category\n", - " category_effects = np.random.normal(0, 1, n_categories)\n", - " y += category_effects[X[:, i]]\n", - " # Select a couple of feature pairs for interaction effects\n", - " interaction_pairs = [\n", - " (i, j) for i in range(n_features) for j in range(i + 1, n_features)\n", - " ]\n", - " selected_interactions = np.random.choice(len(interaction_pairs), 2, replace=False)\n", - " # Create interaction effects\n", - " for idx in selected_interactions:\n", - " i, j = interaction_pairs[idx]\n", - " # Create a sparse interaction effect\n", - " interaction_effect = np.random.choice(\n", - " [-1, 0, 1], size=(n_categories, n_categories), p=[0.05, 0.9, 0.05]\n", - " )\n", - " y += interaction_effect[X[:, i], X[:, j]]\n", - " # Add a non-linear effect for one feature\n", - " nonlinear_feature = np.random.choice(n_features)\n", - " y += np.square(X[:, nonlinear_feature] / (n_categories / 2) - 1)\n", - " y = (y - np.mean(y)) / np.std(y)\n", - " y += np.random.normal(0, 0.1, n_samples)\n", - "\n", - " return X, y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_data(X):\n", - " e1 = OneHotEncoder(sparse_output=True)\n", - " # dense - use pd.get_dummies to mimic current practice\n", - " Xdf = pd.DataFrame(X)\n", - " return e1.fit_transform(X), pd.get_dummies(Xdf, columns=Xdf.columns).values\n", - "\n", - "def fit_and_measure(X_train, y_train, X_test, y_test):\n", - " start_memory = get_memory_usage()\n", - " start_time = time.time()\n", - " m = LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, verbose=-1)\n", - " m.fit(X_train, y_train)\n", - " end_time = time.time()\n", - " end_memory = get_memory_usage()\n", - " runtime = end_time - start_time\n", - " memory_used = end_memory - start_memory\n", - "\n", - " # Compute accuracy metrics\n", - " y_pred = m.predict(X_test)\n", - " mse = mean_squared_error(y_test, y_pred)\n", - " r2 = r2_score(y_test, y_pred)\n", - "\n", - " return runtime, memory_used, mse, r2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X, y = generate_dummy_data()\n", - "# Split the data into train and test sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "X_train_sparse, X_train_dense = prepare_data(X_train)\n", - "X_test_sparse, X_test_dense = prepare_data(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "sparse_runtime, sparse_memory, sparse_mse, sparse_r2 = fit_and_measure(X_train_sparse, y_train, X_test_sparse, y_test)\n", - "gc.collect()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "dense_runtime, dense_memory, dense_mse, dense_r2 = fit_and_measure(X_train_dense, y_train, X_test_dense, y_test)\n", - "gc.collect()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Mypy can't find these names/variables since they are assigned to via cell-magic.\n", - "print(\n", - " f\"Sparse data - Runtime: {sparse_runtime:.2f}s, \" # type: ignore[name-defined]\n", - " f\"Memory used: {sparse_memory:.2f}MB, \" # type: ignore[name-defined]\n", - " f\"MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}\" # type: ignore[name-defined]\n", - ") \n", - "print(\n", - " f\"Dense data - Runtime: {dense_runtime:.2f}s, \" # type: ignore[name-defined]\n", - " f\"Memory used: {dense_memory:.2f}MB, \" # type: ignore[name-defined]\n", - " f\"MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}\" # type: ignore[name-defined]\n", - ")\n", - "\n", - "print(f\"\\nSparse data memory: {X_train_sparse.data.nbytes / 1024 / 1024:.2f}MB\")\n", - "print(f\"Dense data memory: {X_train_dense.nbytes / 1024 / 1024:.2f}MB\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" + "name": "python" + }, + "mystnb": { + "execution_timeout": 120 } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 2 } From 8282381c277ec27b9cd834bfa9f66a12337a6432 Mon Sep 17 00:00:00 2001 From: Apoorva Lal Date: Tue, 27 Aug 2024 13:44:35 -0700 Subject: [PATCH 8/9] reduce sparse problem size for docs --- docs/examples/example_sparse_inputs.ipynb | 52 ++++++++++++++--------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb index 56589580..cc351bfa 100644 --- a/docs/examples/example_sparse_inputs.ipynb +++ b/docs/examples/example_sparse_inputs.ipynb @@ -110,7 +110,7 @@ "source": [ "def generate_causal_data(\n", " n_samples=100_000,\n", - " n_categories=1000,\n", + " n_categories=500,\n", " n_features=100,\n", " tau_magnitude=1.0,\n", "):\n", @@ -168,8 +168,9 @@ "\n", "\n", "X, W, Y, tau, propensity_score = generate_causal_data(\n", - " n_samples=10000, tau_magnitude=1.0\n", - ")\n" + " n_samples=1000, tau_magnitude=1.0\n", + ")\n", + "Xdf = pd.DataFrame(X)" ] }, { @@ -179,10 +180,14 @@ "outputs": [], "source": [ "# sparse and dense X matrices\n", - "e1 = OneHotEncoder(sparse_output=True) # onehot encoder generates sparse output automatically\n", - "Xdf = pd.DataFrame(X)\n", + "e1 = OneHotEncoder(\n", + " sparse_output=True\n", + ") # onehot encoder generates sparse output automatically\n", + "\n", "X_csr = e1.fit_transform(X)\n", - "X_np = pd.get_dummies(Xdf, columns=Xdf.columns).values # dense onehot encoding with pandas" + "X_np = pd.get_dummies(\n", + " Xdf, columns=Xdf.columns\n", + ").values # dense onehot encoding with pandas" ] }, { @@ -195,8 +200,8 @@ "output_type": "stream", "text": [ "\n", - "Sparse data memory: 7.63MB\n", - "Dense data memory: 953.66MB\n" + "Sparse data memory: 0.76MB\n", + "Dense data memory: 41.28MB\n" ] } ], @@ -266,8 +271,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sparse data - Runtime: 13.27s, Memory used: 345.56MB\n", - "(array([1.0007226]), array([0.02021719]))\n" + "Sparse data - Runtime: 3.06s, Memory used: 115.93MB\n", + "(array([1.0161235]), array([0.06374022]))\n" ] } ], 
@@ -291,26 +296,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sparse data - Runtime: 149.10s, Memory used: 70.43MB\n", - "(array([1.00067664]), array([0.02021555]))\n" + "Sparse data - Runtime: 6.91s, Memory used: 131.66MB\n", + "(array([1.01609547]), array([0.06384197]))\n" ] } ], "source": [ "fit_drlearner_wrapper(X_np)" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this (admittedly somewhat contrived) example, we that solving the DRLearner problem with sparse inputs takes around 1/8 of the time compared to dense inputs at the cost of some more memory usage in estimation. " - ] } ], "metadata": { + "kernelspec": { + "display_name": "py311", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" }, "mystnb": { "execution_timeout": 120 From e4501b4450791ac8ab3ef2e44d31ff8c6864bcba Mon Sep 17 00:00:00 2001 From: Apoorva Lal Date: Wed, 28 Aug 2024 08:54:09 -0700 Subject: [PATCH 9/9] final touches --- CHANGELOG.rst | 8 ++ docs/examples/example_estimating_ates.ipynb | 100 ++++---------------- docs/examples/example_sparse_inputs.ipynb | 93 ++++-------------- metalearners/_utils.py | 2 +- 4 files changed, 45 insertions(+), 158 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 56fd87c7..34eb7f81 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,14 @@ Changelog ========= +0.11.0 (2024-09-xx) +------------------- + +**New features** + +* Add support for using ``scipy.sparse.csr_matrix`` as data structure for covariates ``X``. + + 0.10.0 (2024-08-13) ------------------- diff --git a/docs/examples/example_estimating_ates.ipynb b/docs/examples/example_estimating_ates.ipynb index 79a69d52..ec88ec48 100644 --- a/docs/examples/example_estimating_ates.ipynb +++ b/docs/examples/example_estimating_ates.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -99,20 +99,9 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2.083595103597918, 0.06526671583747883)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "naive_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}\", df) .fit(cov_type=\"HC1\")\n", "naive_est = naive_lm.params.iloc[1], naive_lm.bse.iloc[1]\n", @@ -121,20 +110,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2.1433722387308025, 0.06345124983351998)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "covaradjust_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}+{'+'.join(feature_columns)}\",\n", " df) .fit(cov_type=\"HC1\")\n", @@ -160,42 +138,9 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - 
], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from metalearners import DRLearner\n", "from lightgbm import LGBMRegressor, LGBMClassifier\n", @@ -204,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "editable": true, "slideshow": { @@ -212,18 +157,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([1.02931589]), array([0.06679633]))" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "metalearners_dr = DRLearner(\n", " nuisance_model_factory=LGBMRegressor,\n", @@ -629,8 +563,14 @@ } ], "metadata": { + "kernelspec": { + "display_name": "py311", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "name": "python", + "version": "3.11.7" }, "mystnb": { "execution_timeout": 120 diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb index cc351bfa..03cad0c2 100644 --- a/docs/examples/example_sparse_inputs.ipynb +++ b/docs/examples/example_sparse_inputs.ipynb @@ -20,42 +20,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import time, psutil, os, gc\n", "import numpy as np\n", @@ -76,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -104,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -175,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -192,19 +159,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Sparse data memory: 0.76MB\n", - "Dense data memory: 41.28MB\n" - ] - } - ], + "outputs": [], "source": [ "print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n", "print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")" @@ -219,11 +176,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "def fit_drlearner_wrapper(X):\n", + "def fit_drlearner_wrapper(X, name):\n", " start_memory = get_memory_usage()\n", " start_time = time.time()\n", " metalearners_dr = DRLearner(\n", @@ -251,7 +208,7 @@ " end_memory = get_memory_usage()\n", " runtime = end_time - start_time\n", " memory_used = end_memory - start_memory\n", - " print(f\"Sparse data - Runtime: {runtime:.2f}s, Memory used: {memory_used:.2f}MB\")\n", + " print(f\"{name} data - Runtime: {runtime:.2f}s, Memory used: {memory_used:.2f}MB\")\n", " print(metalearners_est)" ] }, @@ -264,20 +221,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sparse data - Runtime: 3.06s, Memory used: 115.93MB\n", - "(array([1.0161235]), array([0.06374022]))\n" - ] - } - ], + "outputs": [], "source": [ - "fit_drlearner_wrapper(X_csr)" + "fit_drlearner_wrapper(X_csr, \"Sparse\")" ] }, { @@ -289,20 +237,11 @@ }, { "cell_type": "code", - "execution_count": 8, + 
"execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sparse data - Runtime: 6.91s, Memory used: 131.66MB\n", - "(array([1.01609547]), array([0.06384197]))\n" - ] - } - ], + "outputs": [], "source": [ - "fit_drlearner_wrapper(X_np)" + "fit_drlearner_wrapper(X_np, \"Dense\")" ] } ], diff --git a/metalearners/_utils.py b/metalearners/_utils.py index c1d63d67..a5c02f37 100644 --- a/metalearners/_utils.py +++ b/metalearners/_utils.py @@ -25,7 +25,7 @@ default_rng = np.random.default_rng() -def safe_len(X): +def safe_len(X: Matrix) -> int: if scipy.sparse.issparse(X): return X.shape[0] return len(X)