From c1351d0184573f9325b9d88e8607a0e781e13216 Mon Sep 17 00:00:00 2001
From: Apoorva Lal
Date: Fri, 16 Aug 2024 15:54:32 -0700
Subject: [PATCH 1/9] add support for csr matrix

---
 docs/examples/example_sparse_inputs.ipynb | 550 ++++++++++++++++++++++
 docs/examples/index.rst                   |   1 +
 metalearners/_typing.py                   |   3 +-
 metalearners/_utils.py                    |   7 +
 metalearners/cross_fit_estimator.py       |  19 +-
 metalearners/drlearner.py                 |   7 +-
 metalearners/explainer.py                 |   4 +-
 metalearners/metalearner.py               |   5 +-
 metalearners/rlearner.py                  |   9 +-
 metalearners/slearner.py                  |   3 +-
 metalearners/utils.py                     |   5 +-
 metalearners/xlearner.py                  |   7 +-
 12 files changed, 596 insertions(+), 24 deletions(-)
 create mode 100644 docs/examples/example_sparse_inputs.ipynb

diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb
new file mode 100644
index 00000000..79a15786
--- /dev/null
+++ b/docs/examples/example_sparse_inputs.ipynb
@@ -0,0 +1,550 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "(example-sparse)=\n",
+    "\n",
+    "Example: Using Sparse Covariate Matrices\n",
+    "=============================\n",
+    "\n",
+    "Motivation\n",
+    "----------\n",
+    "\n",
+    "In many applications, we want to adjust for categorical covariates with many levels. A natural pre-processing step is to one-hot-encode these covariates, which yields a high-dimensional and typically very sparse covariate matrix. Many scikit-learn-style learners accept scipy's sparse matrices as input, which allows us to use them for treatment effect estimation as well.\n",
+    "\n",
+    "Example\n",
+    "-------"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import time, psutil, os, gc\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import scipy as sp\n",
+    "\n",
+    "from sklearn.dummy import DummyRegressor\n",
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import mean_squared_error, r2_score\n",
+    "\n",
+    "from lightgbm import LGBMRegressor, LGBMClassifier\n",
+    "from metalearners import DRLearner"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_memory_usage():\n",
+    "    process = psutil.Process(os.getpid())\n",
+    "    return process.memory_info().rss / 1024 / 1024  # in MB\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Causal Inference\n",
+    "\n",
+    "### DRLearner\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We generate some data where X comprises 100 categorical variables, each with 1000 possible levels. Naively one-hot-encoding this data produces a very large matrix with many zeroes, which makes it an ideal use case for `scipy.sparse.csr_matrix`. We then use the `DRLearner` to estimate the treatment effect.\n",
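+    "\n",
+    "To see why CSR helps here, consider a minimal, purely illustrative sketch (the tiny matrix below is a stand-in, not part of this example's data): a `csr_matrix` stores only the non-zero values plus their column indices and row pointers, and each one-hot-encoded row contains exactly one non-zero entry per original column.\n",
+    "\n",
+    "```python\n",
+    "# illustrative only: inspect what a CSR matrix actually stores\n",
+    "from scipy.sparse import csr_matrix\n",
+    "tiny = csr_matrix(np.eye(3))  # stand-in for a one-hot block\n",
+    "print(tiny.data)  # non-zero values: [1. 1. 1.]\n",
+    "print(tiny.indices, tiny.indptr)  # column indices and row pointers\n",
+    "```"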
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_causal_data(\n",
+    "    n_samples=100_000,\n",
+    "    n_categories=1000,\n",
+    "    n_features=100,\n",
+    "    tau_magnitude=1.0,\n",
+    "):\n",
+    "    ######################################################################\n",
+    "    # Generate covariate matrix X\n",
+    "    X = np.random.randint(0, n_categories, size=(n_samples, n_features))\n",
+    "    ######################################################################\n",
+    "    # Generate potential outcome y0\n",
+    "    y0 = np.zeros(n_samples)\n",
+    "    # Select a few features for main effects\n",
+    "    main_effect_features = np.random.choice(n_features, 3, replace=False)\n",
+    "    # Create main effects - fully dense\n",
+    "    for i in main_effect_features:\n",
+    "        category_effects = np.random.normal(0, 4, n_categories)\n",
+    "        y0 += category_effects[X[:, i]]\n",
+    "    # Select a couple of feature pairs for interaction effects\n",
+    "    interaction_pairs = [\n",
+    "        (i, j) for i in range(n_features) for j in range(i + 1, n_features)\n",
+    "    ]\n",
+    "    selected_interactions = np.random.choice(len(interaction_pairs), 2, replace=False)\n",
+    "    # Create interaction effects\n",
+    "    for idx in selected_interactions:\n",
+    "        i, j = interaction_pairs[idx]\n",
+    "        interaction_effect = np.random.choice(\n",
+    "            [-1, 0, 1], size=(n_categories, n_categories), p=[0.25, 0.5, 0.25]\n",
+    "        )\n",
+    "        y0 += interaction_effect[X[:, i], X[:, j]]\n",
+    "    # Normalize y0\n",
+    "    y0 = (y0 - np.mean(y0)) / np.std(y0)\n",
+    "    y0 += np.random.normal(0, 0.1, n_samples)\n",
+    "    ######################################################################\n",
+    "    # Generate treatment assignment W\n",
+    "    propensity_score = np.zeros(n_samples)\n",
+    "    for i in main_effect_features:\n",
+    "        category_effects = np.random.normal(0, 4, n_categories)\n",
+    "        propensity_score += category_effects[X[:, i]]\n",
+    "    # same interactions enter pscore\n",
+    "    # Create interaction effects\n",
+    "    for idx in selected_interactions:\n",
+    "        i, j = interaction_pairs[idx]\n",
+    "        interaction_effect = np.random.choice(\n",
+    "            [-1, 0, 1], size=(n_categories, n_categories), p=[0.25, 0.5, 0.25]\n",
+    "        )\n",
+    "        propensity_score += interaction_effect[X[:, i], X[:, j]]\n",
+    "    # Convert to probabilities using logistic function\n",
+    "    propensity_score = sp.special.expit(propensity_score)\n",
+    "    # Generate binary treatment\n",
+    "    W = np.random.binomial(1, propensity_score)\n",
+    "    ######################################################################\n",
+    "    # Generate treatment effect\n",
+    "    tau = tau_magnitude * np.ones(n_samples)\n",
+    "    # Generate final outcome\n",
+    "    Y = y0 + W * tau\n",
+    "    return X, W, Y, tau, propensity_score\n",
+    "\n",
+    "\n",
+    "X, W, Y, tau, propensity_score = generate_causal_data(\n",
+    "    n_samples=10000, tau_magnitude=1.0\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sparse and dense X matrices\n",
+    "e1 = OneHotEncoder(sparse_output=True)  # onehot encoder generates sparse output automatically\n",
+    "Xdf = pd.DataFrame(X)\n",
+    "X_csr = e1.fit_transform(X)\n",
+    "X_np = pd.get_dummies(Xdf, columns=Xdf.columns).values  # dense onehot encoding with pandas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Sparse data memory: 7.63MB\n",
+      "Dense data memory: 953.64MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n",
+    "print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As expected, the memory footprint of the sparse matrix is considerably smaller than that of the dense matrix.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def fit_drlearner_wrapper(X):\n",
+    "    start_memory = get_memory_usage()\n",
+    "    start_time = time.time()\n",
+    "    metalearners_dr = DRLearner(\n",
+    "        nuisance_model_factory=LGBMRegressor,\n",
+    "        treatment_model_factory=DummyRegressor,\n",
+    "        propensity_model_factory=LGBMClassifier,\n",
+    "        is_classification=False,\n",
+    "        n_variants=2,\n",
+    "        nuisance_model_params={\"verbose\": -1},\n",
+    "        propensity_model_params={\"verbose\": -1},\n",
+    "    )\n",
+    "\n",
+    "    metalearners_dr.fit_all_nuisance(\n",
+    "        X=X,\n",
+    "        y=Y,\n",
+    "        w=W,\n",
+    "    )\n",
+    "    metalearners_est = metalearners_dr.average_treatment_effect(\n",
+    "        X=X,\n",
+    "        y=Y,\n",
+    "        w=W,\n",
+    "        is_oos=False,\n",
+    "    )\n",
+    "    end_time = time.time()\n",
+    "    end_memory = get_memory_usage()\n",
+    "    runtime = end_time - start_time\n",
+    "    memory_used = end_memory - start_memory\n",
+    "    print(f\"Runtime: {runtime:.2f}s, Memory used: {memory_used:.2f}MB\")\n",
+    "    print(metalearners_est)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "gc.collect()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`scipy.sparse.csr_matrix` input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Runtime: 15.04s, Memory used: 324.86MB\n",
+      "(array([0.9523358]), array([0.0202085]))\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "318"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fit_drlearner_wrapper(X_csr)\n",
+    "gc.collect()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`np.ndarray` input"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Runtime: 117.22s, Memory used: 87.21MB\n",
+      "(array([0.95124745]), array([0.02021724]))\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "190"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fit_drlearner_wrapper(X_np)\n",
+    "gc.collect()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In this (admittedly somewhat contrived) example, we see that solving the DRLearner problem with sparse inputs takes around 1/8 of the time needed with dense inputs, at the cost of some additional memory usage during estimation. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prediction \n",
+    "\n",
+    "These benefits aren't limited to causal inference: we can use sparse matrices for prediction tasks as well."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_dummy_data(n_samples=100000, n_categories=1000, n_features=20):\n",
+    "    X = np.random.randint(0, n_categories, size=(n_samples, n_features))\n",
+    "    y = np.zeros(n_samples)\n",
+    "    # Select a few features for main effects\n",
+    "    main_effect_features = np.random.choice(n_features, 3, replace=False)\n",
+    "    # Create main effects\n",
+    "    for i in main_effect_features:\n",
+    "        # Create a random effect for each category\n",
+    "        category_effects = np.random.normal(0, 1, n_categories)\n",
+    "        y += category_effects[X[:, i]]\n",
+    "    # Select a couple of feature pairs for interaction effects\n",
+    "    interaction_pairs = [\n",
+    "        (i, j) for i in range(n_features) for j in range(i + 1, n_features)\n",
+    "    ]\n",
+    "    selected_interactions = np.random.choice(len(interaction_pairs), 2, replace=False)\n",
+    "    # Create interaction effects\n",
+    "    for idx in selected_interactions:\n",
+    "        i, j = interaction_pairs[idx]\n",
+    "        # Create a sparse interaction effect\n",
+    "        interaction_effect = np.random.choice(\n",
+    "            [-1, 0, 1], size=(n_categories, n_categories), p=[0.05, 0.9, 0.05]\n",
+    "        )\n",
+    "        y += interaction_effect[X[:, i], X[:, j]]\n",
+    "    # Add a non-linear effect for one feature\n",
+    "    nonlinear_feature = np.random.choice(n_features)\n",
+    "    y += np.square(X[:, nonlinear_feature] / (n_categories / 2) - 1)\n",
+    "    y = (y - np.mean(y)) / np.std(y)\n",
+    "    y += np.random.normal(0, 0.1, n_samples)\n",
+    "\n",
+    "    return X, y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def prepare_data(X):\n",
+    "    e1 = OneHotEncoder(sparse_output=True)\n",
+    "    # dense - use pd.get_dummies to mimic current practice\n",
+    "    Xdf = pd.DataFrame(X)\n",
+    "    return e1.fit_transform(X), pd.get_dummies(Xdf, columns=Xdf.columns).values\n",
+    "\n",
+    "def fit_and_measure(X_train, y_train, X_test, y_test):\n",
+    "    start_memory = get_memory_usage()\n",
+    "    start_time = time.time()\n",
+    "    m = LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, verbose=-1)\n",
+    "    m.fit(X_train, y_train)\n",
+    "    end_time = time.time()\n",
+    "    end_memory = get_memory_usage()\n",
+    "    runtime = end_time - start_time\n",
+    "    memory_used = end_memory - start_memory\n",
+    "\n",
+    "    # Compute accuracy metrics\n",
+    "    y_pred = m.predict(X_test)\n",
+    "    mse = mean_squared_error(y_test, y_pred)\n",
+    "    r2 = r2_score(y_test, y_pred)\n",
+    "\n",
+    "    return runtime, memory_used, mse, r2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X, y = generate_dummy_data()\n",
+    "# Split the data into train and test sets\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+    "X_train_sparse, X_train_dense = prepare_data(X_train)\n",
+    "X_test_sparse, X_test_dense = prepare_data(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1min 4s, sys: 1.83 s, total: 1min 5s\n",
+      "Wall time: 8.6 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "39"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "sparse_runtime, sparse_memory, sparse_mse, sparse_r2 = fit_and_measure(X_train_sparse, y_train, X_test_sparse, y_test)\n",
+    "gc.collect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 2min 50s, sys: 6.88 s, total: 2min 57s\n",
+      "Wall time: 31.6 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "35"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "dense_runtime, dense_memory, dense_mse, dense_r2 = fit_and_measure(X_train_dense, y_train, X_test_dense, y_test)\n",
+    "gc.collect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sparse data - Runtime: 8.48s, Memory used: 69.88MB, MSE: 0.8659, R2: 0.1396\n",
+      "Dense data - Runtime: 30.58s, Memory used: 1.21MB, MSE: 0.8659, R2: 0.1396\n",
+      "\n",
+      "Sparse data memory: 12.21MB\n",
+      "Dense data memory: 1525.88MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\n",
+    "    f\"Sparse data - Runtime: {sparse_runtime:.2f}s, Memory used: {sparse_memory:.2f}MB, MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}\"\n",
+    ")\n",
+    "print(\n",
+    "    f\"Dense data - Runtime: {dense_runtime:.2f}s, Memory used: {dense_memory:.2f}MB, MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}\"\n",
+    ")\n",
+    "\n",
+    "print(f\"\\nSparse data memory: {X_train_sparse.data.nbytes / 1024 / 1024:.2f}MB\")\n",
+    "print(f\"Dense data memory: {X_train_dense.nbytes / 1024 / 1024:.2f}MB\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py311",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/examples/index.rst b/docs/examples/index.rst
index b91cea15..81401fdc 100644
--- a/docs/examples/index.rst
+++ b/docs/examples/index.rst
@@ -16,3 +16,4 @@ Examples
    Estimating CATEs for survival analysis
    What if I know the propensity score?
    Converting a MetaLearner to ONNX
+   Using Sparse Covariate Matrices
diff --git a/metalearners/_typing.py b/metalearners/_typing.py
index 95b66b8b..10a6f4d7 100644
--- a/metalearners/_typing.py
+++ b/metalearners/_typing.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+import scipy.sparse as sps
 
 PredictMethod = Literal["predict", "predict_proba"]
 
@@ -21,7 +22,7 @@
 # ruff is not happy about the usage of Union.
 Vector = Union[pd.Series, np.ndarray]  # noqa
-Matrix = Union[pd.DataFrame, np.ndarray]  # noqa
+Matrix = Union[pd.DataFrame, np.ndarray, sps.csr_matrix]  # noqa
 
 
 class _ScikitModel(Protocol):
diff --git a/metalearners/_utils.py b/metalearners/_utils.py
index 2337c421..c1d63d67 100644
--- a/metalearners/_utils.py
+++ b/metalearners/_utils.py
@@ -9,6 +9,7 @@
 
 import numpy as np
 import pandas as pd
+import scipy
 from sklearn.base import check_array, check_X_y, is_classifier, is_regressor
 from sklearn.ensemble import (
     HistGradientBoostingClassifier,
@@ -24,6 +25,12 @@
 default_rng = np.random.default_rng()
 
 
+def safe_len(X):
+    if scipy.sparse.issparse(X):
+        return X.shape[0]
+    return len(X)
+
+
 def index_matrix(matrix: Matrix, rows: Vector) -> Matrix:
     """Subselect certain rows from a matrix."""
     if isinstance(rows, pd.Series):
diff --git a/metalearners/cross_fit_estimator.py b/metalearners/cross_fit_estimator.py
index aa112c03..568ca274 100644
--- a/metalearners/cross_fit_estimator.py
+++ b/metalearners/cross_fit_estimator.py
@@ -16,7 +16,12 @@
 from typing_extensions import Self
 
 from metalearners._typing import Matrix, OosMethod, PredictMethod, SplitIndices, Vector
-from metalearners._utils import _ScikitModel, index_matrix, validate_number_positive
+from metalearners._utils import (
+    _ScikitModel,
+    index_matrix,
+    safe_len,
+    validate_number_positive,
+)
 
 OVERALL: OosMethod = "overall"
 MEDIAN: OosMethod = "median"
@@ -157,7 +162,7 @@ def fit(
             (train_indices, test_indices) tuples indicating how to split the
             data at hand into train and test/estimation sets for different folds.
         """
-        _validate_data_match_prior_split(len(X), self._test_indices)
+        _validate_data_match_prior_split(safe_len(X), self._test_indices)
 
         if fit_params is None:
             fit_params = dict()
@@ -215,13 +220,13 @@ def _n_outputs(self, method: PredictMethod) -> int:
     def _predict_all(self, X: Matrix, method: PredictMethod) -> np.ndarray:
         n_outputs = self._n_outputs(method)
         predictions = self._initialize_prediction_tensor(
-            n_observations=len(X),
+            n_observations=safe_len(X),
             n_outputs=n_outputs,
             n_folds=self.n_folds,
         )
         for i, estimator in enumerate(self._estimators):
             predictions[:, :, i] = np.reshape(
-                getattr(estimator, method)(X), (len(X), n_outputs)
+                getattr(estimator, method)(X), (safe_len(X), n_outputs)
             )
         if n_outputs == 1:
             return predictions[:, 0, :]
@@ -242,15 +247,15 @@ def _predict_in_sample(
     ) -> np.ndarray:
         if not self._test_indices:
             raise ValueError()
-        if len(X) != sum(len(fold) for fold in self._test_indices):
+        if safe_len(X) != sum(len(fold) for fold in self._test_indices):
             raise ValueError(
                 "Trying to predict in-sample on data that is unlike data encountered in training. "
                 f"Training data included {sum(len(fold) for fold in self._test_indices)} "
-                f"observations while prediction data includes {len(X)} observations."
+                f"observations while prediction data includes {safe_len(X)} observations."
             )
         n_outputs = self._n_outputs(method)
         predictions = self._initialize_prediction_tensor(
-            n_observations=len(X),
+            n_observations=safe_len(X),
             n_outputs=n_outputs,
             n_folds=1,
         )
diff --git a/metalearners/drlearner.py b/metalearners/drlearner.py
index 7bff6fb5..6c03ab36 100644
--- a/metalearners/drlearner.py
+++ b/metalearners/drlearner.py
@@ -28,6 +28,7 @@
     get_predict_proba,
     index_matrix,
     infer_input_dict,
+    safe_len,
     validate_valid_treatment_variant_not_control,
     warning_experimental_feature,
 )
@@ -253,7 +254,7 @@ def predict(
         oos_method: OosMethod = OVERALL,
     ) -> np.ndarray:
         n_outputs = 2 if self.is_classification else 1
-        estimates = np.zeros((len(X), self.n_variants - 1, n_outputs))
+        estimates = np.zeros((safe_len(X), self.n_variants - 1, n_outputs))
         for treatment_variant in range(1, self.n_variants):
             estimates_variant = self.predict_treatment(
                 X,
@@ -365,7 +366,7 @@ def average_treatment_effect(
             raise ValueError(
                 "The nuisance models need to be fitted before computing the treatment effect."
             )
-        gamma_matrix = np.zeros((len(X), self.n_variants - 1))
+        gamma_matrix = np.zeros((safe_len(X), self.n_variants - 1))
         for treatment_variant in range(1, self.n_variants):
             gamma_matrix[:, treatment_variant - 1] = self._pseudo_outcome(
                 X=X,
@@ -375,7 +376,7 @@
                 is_oos=is_oos,
             )
         treatment_effect = gamma_matrix.mean(axis=0)
-        standard_error = gamma_matrix.std(axis=0) / np.sqrt(len(X))
+        standard_error = gamma_matrix.std(axis=0) / np.sqrt(safe_len(X))
         return treatment_effect, standard_error
 
     def _pseudo_outcome(
diff --git a/metalearners/explainer.py b/metalearners/explainer.py
index 721514c3..8449bc11 100644
--- a/metalearners/explainer.py
+++ b/metalearners/explainer.py
@@ -8,7 +8,7 @@
 import shap
 
 from metalearners._typing import Matrix, _ScikitModel
-from metalearners._utils import simplify_output_2d
+from metalearners._utils import safe_len, simplify_output_2d
 from metalearners.metalearner import Params
 
 
@@ -59,7 +59,7 @@ def from_estimates(
             The ``cate_estimates`` should be the raw outcome of a MetaLearner with 3
             dimensions and should not be simplified.
         """
-        if len(X) != len(cate_estimates) or len(X) == 0:
+        if safe_len(X) != len(cate_estimates) or safe_len(X) == 0:
             raise ValueError(
                 "X and cate_estimates should contain the same number of observations "
                 "and not be empty."
diff --git a/metalearners/metalearner.py b/metalearners/metalearner.py
index 5aca3169..1ec53c90 100644
--- a/metalearners/metalearner.py
+++ b/metalearners/metalearner.py
@@ -30,6 +30,7 @@
     ONNX_PROBABILITIES_OUTPUTS,
     default_metric,
     index_matrix,
+    safe_len,
     validate_model_and_predict_method,
     validate_number_positive,
 )
@@ -120,7 +121,7 @@ def _filter_x_columns(X: Matrix, feature_set: Features) -> Matrix:
     if feature_set is None:
         X_filtered = X
     elif len(feature_set) == 0:
-        X_filtered = np.ones((len(X), 1))
+        X_filtered = np.ones((safe_len(X), 1))
     else:
         if isinstance(X, pd.DataFrame):
             X_filtered = X[list(feature_set)]
@@ -1347,7 +1348,7 @@
                 "typically set during fitting, is None."
             )
         # TODO: Consider multiprocessing
-        n_obs = len(X)
+        n_obs = safe_len(X)
         nuisance_tensors = self._nuisance_tensors(n_obs)
         conditional_average_outcomes_list = nuisance_tensors[VARIANT_OUTCOME_MODEL]
diff --git a/metalearners/rlearner.py b/metalearners/rlearner.py
index b86e9764..95cc237e 100644
--- a/metalearners/rlearner.py
+++ b/metalearners/rlearner.py
@@ -20,6 +20,7 @@
     get_predict_proba,
     index_matrix,
     infer_input_dict,
+    safe_len,
     validate_all_vectors_same_index,
     validate_valid_treatment_variant_not_control,
     warning_experimental_feature,
@@ -277,7 +278,7 @@ def predict(
         oos_method: OosMethod = OVERALL,
     ) -> np.ndarray:
         n_outputs = 2 if self.is_classification else 1
-        tau_hat = np.zeros((len(X), self.n_variants - 1, n_outputs))
+        tau_hat = np.zeros((safe_len(X), self.n_variants - 1, n_outputs))
 
         if is_oos:
 
@@ -298,7 +299,7 @@
                 variant_estimates = np.stack(
                     [-variant_estimates, variant_estimates], axis=-1
                 )
-            variant_estimates = variant_estimates.reshape(len(X), n_outputs)
+            variant_estimates = variant_estimates.reshape(safe_len(X), n_outputs)
             tau_hat[:, treatment_variant - 1, :] = variant_estimates
 
         return tau_hat
@@ -486,7 +487,7 @@ def _pseudo_outcome_and_weights(
             constant ``epsilon`` to the denominator in order to avoid numerical problems.
         """
         if mask is None:
-            mask = np.ones(len(X), dtype=bool)
+            mask = np.ones(safe_len(X), dtype=bool)
 
         validate_valid_treatment_variant_not_control(treatment_variant, self.n_variants)
@@ -560,7 +561,7 @@ def predict_conditional_average_outcomes(
         where :math:`K` is the number of treatment variants.
         """
-        n_obs = len(X)
+        n_obs = safe_len(X)
 
         cate_estimates = self.predict(
             X=X,
diff --git a/metalearners/slearner.py b/metalearners/slearner.py
index 7bda6a13..d4f48bee 100644
--- a/metalearners/slearner.py
+++ b/metalearners/slearner.py
@@ -21,6 +21,7 @@
 from metalearners._utils import (
     convert_treatment,
     get_one,
+    safe_len,
     supports_categoricals,
 )
 from metalearners.cross_fit_estimator import OVERALL, CrossFitEstimator
@@ -231,7 +232,7 @@ def evaluate(
     def predict_conditional_average_outcomes(
         self, X: Matrix, is_oos: bool, oos_method: OosMethod = OVERALL
     ) -> np.ndarray:
-        n_obs = len(X)
+        n_obs = safe_len(X)
 
         conditional_average_outcomes_list = []
         for treatment_variant in range(self.n_variants):
diff --git a/metalearners/utils.py b/metalearners/utils.py
index 587bef67..7f8ece7f 100644
--- a/metalearners/utils.py
+++ b/metalearners/utils.py
@@ -9,6 +9,7 @@
 from typing_extensions import Self
 
 from metalearners._typing import Matrix, Vector
+from metalearners._utils import safe_len
 from metalearners.drlearner import DRLearner
 from metalearners.metalearner import MetaLearner
 from metalearners.rlearner import RLearner
@@ -104,4 +105,6 @@ def predict(self, X: Matrix) -> np.ndarray[Any, Any]:
         return np.argmax(self.predict_proba(X), axis=1)
 
     def predict_proba(self, X: pd.DataFrame) -> np.ndarray[Any, Any]:
-        return np.full((len(X), 2), [1 - self.propensity_score, self.propensity_score])
+        return np.full(
+            (safe_len(X), 2), [1 - self.propensity_score, self.propensity_score]
+        )
diff --git a/metalearners/xlearner.py b/metalearners/xlearner.py
index 28bee892..016a726e 100644
--- a/metalearners/xlearner.py
+++ b/metalearners/xlearner.py
@@ -18,6 +18,7 @@
     index_matrix,
     infer_input_dict,
     infer_probabilities_output,
+    safe_len,
     validate_valid_treatment_variant_not_control,
     warning_experimental_feature,
 )
@@ -231,7 +232,7 @@ def predict(
             "typically set during fitting, is None."
         )
         n_outputs = 2 if self.is_classification else 1
-        tau_hat = np.zeros((len(X), self.n_variants - 1, n_outputs))
+        tau_hat = np.zeros((safe_len(X), self.n_variants - 1, n_outputs))
         # Propensity score model is always a classifier so we can't use MEDIAN
         propensity_score_oos = OVERALL if oos_method == MEDIAN else oos_method
         propensity_score = self.predict_nuisance(
@@ -266,8 +267,8 @@
                 oos_method=oos_method,
             )
         else:
-            tau_hat_treatment = np.zeros(len(X))
-            tau_hat_control = np.zeros(len(X))
+            tau_hat_treatment = np.zeros(safe_len(X))
+            tau_hat_control = np.zeros(safe_len(X))
 
             tau_hat_treatment[non_treatment_variant_indices] = (
                 self.predict_treatment(

From ff810340ce5321c8c25f23bd2dc00125d75133f4 Mon Sep 17 00:00:00 2001
From: kklein
Date: Tue, 27 Aug 2024 10:12:41 +0200
Subject: [PATCH 2/9] Appease mypy about notebook.

---
 docs/examples/example_sparse_inputs.ipynb | 217 +++++-----------------
 1 file changed, 45 insertions(+), 172 deletions(-)

diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb
index 79a15786..04677eca 100644
--- a/docs/examples/example_sparse_inputs.ipynb
+++ b/docs/examples/example_sparse_inputs.ipynb
@@ -20,42 +20,9 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": null,
   "metadata": {},
-  "outputs": [
-   {
-    "data": {
-     "text/html": [
-      "\n"
-     ],
-     "text/plain": [
-      ""
-     ]
-    },
-    "metadata": {},
-    "output_type": "display_data"
-   }
-  ],
+  "outputs": [],
   "source": [
    "import time, psutil, os, gc\n",
    "import numpy as np\n",
@@ -68,12 +35,15 @@
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "\n",
    "from lightgbm import LGBMRegressor, LGBMClassifier\n",
-   "from metalearners import DRLearner"
+   "from metalearners import DRLearner\n",
+   "\n",
+   "# This is required for when nbconvert converts the cell-magic to regular function calls.\n",
+   "from IPython import get_ipython"
   ]
  },
 {
  "cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -101,7 +71,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -171,7 +141,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -184,19 +154,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 15,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "\n",
-    "Sparse data memory: 7.63MB\n",
-    "Dense data memory: 953.64MB\n"
-   ]
-  }
- ],
+ "outputs": [],
  "source": [
   "print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n",
   "print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")"
  ]
@@ -211,7 +171,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -249,20 +209,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "data": {
-    "text/plain": [
-     "0"
-    ]
-   },
-   "execution_count": 6,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "gc.collect()"
  ]
@@ -276,28 +225,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "Runtime: 15.04s, Memory used: 324.86MB\n",
-    "(array([0.9523358]), array([0.0202085]))\n"
-   ]
-  },
-  {
-   "data": {
-    "text/plain": [
-     "318"
-    ]
-   },
-   "execution_count": 7,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "fit_drlearner_wrapper(X_csr)\n",
   "gc.collect()"
  ]
@@ -312,28 +242,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "Runtime: 117.22s, Memory used: 87.21MB\n",
-    "(array([0.95124745]), array([0.02021724]))\n"
-   ]
-  },
-  {
-   "data": {
-    "text/plain": [
-     "190"
-    ]
-   },
-   "execution_count": 8,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "fit_drlearner_wrapper(X_np)\n",
   "gc.collect()"
  ]
@@ -357,7 +268,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -395,7 +306,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -425,7 +336,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -438,28 +349,9 @@
 },
 {
  "cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "CPU times: user 1min 4s, sys: 1.83 s, total: 1min 5s\n",
-    "Wall time: 8.6 s\n"
-   ]
-  },
-  {
-   "data": {
-    "text/plain": [
-     "39"
-    ]
-   },
-   "execution_count": 12,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "%%time\n",
   "sparse_runtime, sparse_memory, sparse_mse, sparse_r2 = fit_and_measure(X_train_sparse, y_train, X_test_sparse, y_test)\n",
   "gc.collect()"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "CPU times: user 2min 50s, sys: 6.88 s, total: 2min 57s\n",
-    "Wall time: 31.6 s\n"
-   ]
-  },
-  {
-   "data": {
-    "text/plain": [
-     "35"
-    ]
-   },
-   "execution_count": 13,
-   "metadata": {},
-   "output_type": "execute_result"
-  }
- ],
+ "outputs": [],
  "source": [
   "%%time\n",
   "dense_runtime, dense_memory, dense_mse, dense_r2 = fit_and_measure(X_train_dense, y_train, X_test_dense, y_test)\n",
   "gc.collect()"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
  "metadata": {},
- "outputs": [
-  {
-   "name": "stdout",
-   "output_type": "stream",
-   "text": [
-    "Sparse data - Runtime: 8.48s, Memory used: 69.88MB, MSE: 0.8659, R2: 0.1396\n",
-    "Dense data - Runtime: 30.58s, Memory used: 1.21MB, MSE: 0.8659, R2: 0.1396\n",
-    "\n",
-    "Sparse data memory: 12.21MB\n",
-    "Dense data memory: 1525.88MB\n"
-   ]
-  }
- ],
+ "outputs": [],
  "source": [
+  "# Mypy can't find these names/variables since they are assigned to via cell-magic.\n",
   "print(\n",
-  "    f\"Sparse data - Runtime: {sparse_runtime:.2f}s, Memory used: {sparse_memory:.2f}MB, MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}\"\n",
-  ")\n",
+  "    f\"Sparse data - Runtime: {sparse_runtime:.2f}s, \"  # type: ignore[name-defined]\n",
+  "    f\"Memory used: {sparse_memory:.2f}MB, \"  # type: ignore[name-defined]\n",
+  "    f\"MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}\"  # type: ignore[name-defined]\n",
+  ")\n",
   "print(\n",
-  "    f\"Dense data - Runtime: {dense_runtime:.2f}s, Memory used: {dense_memory:.2f}MB, MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}\"\n",
+  "    f\"Dense data - Runtime: {dense_runtime:.2f}s, \"  # type: ignore[name-defined]\n",
+  "    f\"Memory used: {dense_memory:.2f}MB, \"  # type: ignore[name-defined]\n",
+  "    f\"MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}\"  # type: ignore[name-defined]\n",
   ")\n",
   "\n",
   "print(f\"\\nSparse data memory: {X_train_sparse.data.nbytes / 1024 / 1024:.2f}MB\")\n",
   "print(f\"Dense data memory: {X_train_dense.nbytes / 1024 / 1024:.2f}MB\")"
  ]
+ },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
-  "display_name": "py311",
+  "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
@@ -542,9 +415,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.11.7"
+  "version": "3.12.3"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }

From 47dac106b9f6ce24c3723ca7ee7655b8ca7652f8 Mon Sep 17 00:00:00 2001
From: kklein
Date: Tue, 27 Aug 2024 12:41:09 +0200
Subject: [PATCH 3/9] Test against csr in test_cross_fit_estimator.py

---
 tests/test_cross_fit_estimator.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/test_cross_fit_estimator.py b/tests/test_cross_fit_estimator.py
index 4165a06c..abdcbf4a 100644
--- a/tests/test_cross_fit_estimator.py
+++ b/tests/test_cross_fit_estimator.py
@@ -6,6 +6,7 @@
 import numpy as np
 import pytest
 from lightgbm import LGBMClassifier, LGBMRegressor
+from scipy.sparse import csr_matrix
 from sklearn.base import is_classifier, is_regressor
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.metrics import accuracy_score, log_loss
@@ -24,10 +25,10 @@
 @pytest.mark.parametrize("predict_proba", [True, False])
 @pytest.mark.parametrize("is_oos", [True, False])
 @pytest.mark.parametrize("oos_method", ["overall", "mean", "median"])
-@pytest.mark.parametrize("use_np", [True, False])
+@pytest.mark.parametrize("backend", ["np", "pd", "csr"])
 @pytest.mark.parametrize("pass_cv", [True, False])
 def test_crossfitestimator_oos_smoke(
-    mindset_data, rng, use_clf, predict_proba, is_oos, oos_method, use_np, pass_cv
+    mindset_data, rng, use_clf, predict_proba, is_oos, oos_method, backend, pass_cv
 ):
     if not use_clf and predict_proba:
         pytest.skip()
@@ -50,9 +51,12 @@
     # Arbitrary cut-off
     y = y > 0.8
 
-    if use_np:
+    if backend == "np":
         X = X.to_numpy()
         y = y.to_numpy()
+    if backend == "csr":
+        X = csr_matrix(df.values)
+        y = y.to_numpy()
 
     cfe = CrossFitEstimator(
         n_folds=5,

From 3d2d84e15b39ee3e3c53810b0d559c25470819b5 Mon Sep 17 00:00:00 2001
From: kklein
Date: Tue, 27 Aug 2024 12:51:08 +0200
Subject: [PATCH 4/9] Test against csr in test_utils.py

---
 tests/test_cross_fit_estimator.py |  2 +-
 tests/test_utils.py               | 26 ++++++++++++++++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/tests/test_cross_fit_estimator.py b/tests/test_cross_fit_estimator.py
index abdcbf4a..eb709735 100644
--- a/tests/test_cross_fit_estimator.py
+++ b/tests/test_cross_fit_estimator.py
@@ -54,7 +54,7 @@
     if backend == "np":
         X = X.to_numpy()
         y = y.to_numpy()
-    if backend == "csr":
+    elif backend == "csr":
         X = csr_matrix(df.values)
         y = y.to_numpy()
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index d88b5d14..f634b772 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import pytest
 from lightgbm import LGBMRegressor
+from scipy.sparse import csr_matrix
 
 from metalearners.metalearner import MetaLearner
 from metalearners.utils import (
@@ -59,8 +60,8 @@ def test_simplify_output_raises(input):
         simplify_output(input)
 
 
-@pytest.mark.parametrize("use_pd", [True, False])
-def test_fixed_binary_propensity(use_pd):
+@pytest.mark.parametrize("backend", ["np", "pd", "csr"])
+def test_fixed_binary_propensity(backend):
     propensity_score = 0.3
     dominant_class = propensity_score >= 0.5
 
@@ -69,19 +70,24 @@
     n_samples = 5
     X_train = np.ones((n_samples, 5))
     y_train = np.ones(n_samples)
-    if use_pd:
+
+    n_test_samples = 3
+    X_test = np.zeros((n_test_samples, 5))
+
+    expected_result = np.array(np.ones(n_test_samples) * dominant_class)
+
+    if backend == "pd":
         X_train = pd.DataFrame(X_train)
         y_train = pd.Series(y_train)
+        X_test = pd.DataFrame(X_test)
+    elif backend == "csr":
+        X_train = csr_matrix(X_train)
+        X_test = csr_matrix(X_test)
 
     model.fit(X_train, y_train)
-
-    n_test_samples = 3
-    X_test = np.zeros(n_test_samples)
-
     class_predictions = model.predict(X_test)
-    assert np.array_equal(
-        class_predictions, np.array(np.ones(n_test_samples) * dominant_class)
-    )
+
+    assert np.array_equal(class_predictions, expected_result)
 
     probability_estimates = model.predict_proba(X_test)
     assert np.array_equal(

From 7e892f2751ef40ca6a9f6e073603de926effcf6c Mon Sep 17 00:00:00 2001
From: kklein
Date: Tue, 27 Aug 2024 13:25:54 +0200
Subject: [PATCH 5/9] Adapt S-Learner to work with csr matrix.

---
 metalearners/slearner.py | 21 +++++++++++++++------
 tests/test_slearner.py   | 29 +++++++++++++++++++++--------
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/metalearners/slearner.py b/metalearners/slearner.py
index d4f48bee..bd4395d3 100644
--- a/metalearners/slearner.py
+++ b/metalearners/slearner.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+from scipy.sparse import csr_matrix, hstack
 from typing_extensions import Self
 
 from metalearners._typing import (
@@ -57,21 +58,29 @@ def _append_treatment_to_covariates(
         # names are integers and some strings.
         X_with_w.columns = X_with_w.columns.astype(str)
         return X_with_w
+    elif isinstance(X, csr_matrix):
+        return hstack((X, w_dummies), format="csr")
     else:
         return np.concatenate([X, w_dummies], axis=1)
 
+    # This is necessary as each model works differently with categoricals,
+    # in some you need to specify them on instantiation while some others on
+    # fitting. This solution converts it to a pd.DataFrame as most of the models
+    # have some "automatic" detection of categorical features based on pandas
+    # dtypes. Theoretically it would be possible to get around this conversion
+    # but it would require loads of model specific code.
     if isinstance(X, np.ndarray):
-        # This is necessary as each model works differently with categoricals,
-        # in some you need to specify them on instantiation while some others on
-        # fitting. This solutions converts it to a pd.DataFrame as most of the models
-        # have some "automatic" detection of categorical features based on pandas
-        # dtypes. Theoretically it would be possible to get around this conversion
-        # but it would require loads of model specific code.
         warnings.warn(
             "Converting the input covariates X from np.ndarray to a "
             f"pd.DataFrame as the {_BASE_MODEL} supports categorical variables."
         )
        X = pd.DataFrame(X, copy=True)
+    elif isinstance(X, csr_matrix):
+        warnings.warn(
+            "Converting the input covariates X from a scipy csr_matrix to a "
+            f"pd.DataFrame as the {_BASE_MODEL} supports categorical variables."
+        )
+        X = pd.DataFrame.sparse.from_spmatrix(X)
 
     X_with_w = pd.concat([X, pd.Series(w, dtype="category", name="treatment")], axis=1)
     X_with_w.columns = X_with_w.columns.astype(str)
diff --git a/tests/test_slearner.py b/tests/test_slearner.py
index 37f6086c..15f0eaa1 100644
--- a/tests/test_slearner.py
+++ b/tests/test_slearner.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import pytest
 from lightgbm import LGBMRegressor
+from scipy.sparse import csr_matrix
 from sklearn.linear_model import LinearRegression
 
 from metalearners.slearner import SLearner, _append_treatment_to_covariates
@@ -31,16 +32,20 @@ def test_feature_set_doesnt_raise(rng):
 @pytest.mark.parametrize(
     "model, supports_categoricals", [(LinearRegression, False), (LGBMRegressor, True)]
 )
-@pytest.mark.parametrize("use_pd", [False, True])
+@pytest.mark.parametrize("backend", ["np", "pd", "csr"])
 def test_append_treatment_to_covariates(
     model,
     supports_categoricals,
-    use_pd,
+    backend,
     sample_size,
     request,
 ):
-    dataset_name = "mixed" if use_pd else "numerical"
+    dataset_name = "mixed" if backend == "pd" else "numerical"
     covariates, _, _ = request.getfixturevalue(f"{dataset_name}_covariates")
+
+    if backend == "csr":
+        covariates = csr_matrix(covariates)
+
     treatment = np.array([0] * sample_size)
     n_variants = 4
     X_with_w = _append_treatment_to_covariates(
@@ -52,20 +57,28 @@
         list(range(n_variants))
     )
 
-    if not use_pd and not supports_categoricals:
-        assert isinstance(X_with_w, np.ndarray)
+    if backend in ["np", "csr"] and not supports_categoricals:
+        if backend == "np":
+            assert isinstance(X_with_w, np.ndarray)
+        elif backend == "csr":
+            assert isinstance(X_with_w, csr_matrix)
         assert (
             (
                 X_with_w[:, -3:]
-                == pd.get_dummies(treatment_pd, dtype=int, drop_first=True)
+                == pd.get_dummies(treatment_pd, dtype=int, drop_first=True).values
             )
             .all()
            .all()
         )
-        assert np.all(X_with_w[:, :-3] == covariates)
+        assert (X_with_w[:, :-3] != covariates).sum() < 1
     else:
         assert isinstance(X_with_w, pd.DataFrame)
-        covariates_pd = pd.DataFrame(covariates) if not use_pd else covariates
+        if backend == "np":
+            covariates_pd = pd.DataFrame(covariates)
+        elif backend == "csr":
+            covariates_pd = pd.DataFrame.sparse.from_spmatrix(covariates)
+        else:
+            covariates_pd = covariates
         covariates_pd.columns = covariates_pd.columns.astype(str)
         if not supports_categoricals:
             assert X_with_w[["treatment_1", "treatment_2", "treatment_3"]].equals(

From 43da98913292501d36bbb1e36078a505210009e1 Mon Sep 17 00:00:00 2001
From: kklein
Date: Tue, 27 Aug 2024 13:26:49 +0200
Subject: [PATCH 6/9] Test against csr matrix in test_learner and test_metalearner.
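
Analogously to the other test modules, the affected tests are parametrized
over a `backend` value instead of a boolean `use_pandas` flag. A sketch of
the shared pattern (names as in the diffs below):

    if backend == "pd":
        X = pd.DataFrame(X)
    elif backend == "csr":
        X = csr_matrix(X)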
--- tests/test_learner.py | 9 ++++++--- tests/test_metalearner.py | 25 ++++++++++++++++--------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/tests/test_learner.py b/tests/test_learner.py index 01d33754..afe26342 100644 --- a/tests/test_learner.py +++ b/tests/test_learner.py @@ -5,6 +5,7 @@ import pandas as pd import pytest from lightgbm import LGBMClassifier, LGBMRegressor +from scipy.sparse import csr_matrix from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.metrics import make_scorer, root_mean_squared_error from sklearn.model_selection import train_test_split @@ -939,16 +940,18 @@ def test_model_reusage(outcome_kind, request): ), ], ) -@pytest.mark.parametrize("use_pandas", [False, True]) -def test_evaluate_feature_set_smoke(metalearner_factory, feature_set, rng, use_pandas): +@pytest.mark.parametrize("backend", ["np", "pd", "csr"]) +def test_evaluate_feature_set_smoke(metalearner_factory, feature_set, rng, backend): n_samples = 100 X = rng.standard_normal((n_samples, 5)) y = rng.standard_normal(n_samples) w = rng.integers(0, 2, n_samples) - if use_pandas: + if backend == "pd": X = pd.DataFrame(X) y = pd.Series(y) w = pd.Series(w) + elif backend == "csr": + X = csr_matrix(X) ml = metalearner_factory( n_variants=2, diff --git a/tests/test_metalearner.py b/tests/test_metalearner.py index d9ac1f68..133d3d7f 100644 --- a/tests/test_metalearner.py +++ b/tests/test_metalearner.py @@ -9,6 +9,7 @@ import pandas as pd import pytest from lightgbm import LGBMClassifier, LGBMRegressor +from scipy.sparse import csr_matrix from shap import TreeExplainer, summary_plot from sklearn.base import BaseEstimator from sklearn.linear_model import LinearRegression, LogisticRegression @@ -480,8 +481,8 @@ def test_combine_propensity_and_nuisance_specs( ), ], ) -@pytest.mark.parametrize("use_pandas", [False, True]) -def test_feature_set(feature_set, expected_n_features, use_pandas, rng): +@pytest.mark.parametrize("backend", ["np", "pd", "csr"]) +def test_feature_set(feature_set, expected_n_features, backend, rng): ml = _TestMetaLearner( nuisance_model_factory=LGBMRegressor, is_classification=False, @@ -495,10 +496,12 @@ def test_feature_set(feature_set, expected_n_features, use_pandas, rng): X = rng.standard_normal((sample_size, n_features)) y = rng.standard_normal(sample_size) w = rng.integers(0, 2, sample_size) - if use_pandas: + if backend == "pd": X = pd.DataFrame(X) y = pd.Series(y) w = pd.Series(w) + elif backend == "csr": + X = csr_matrix(X) ml.fit(X, y, w) for model_kind, model_kind_list in ml._nuisance_models.items(): @@ -1078,15 +1081,17 @@ def test_n_jobs_base_learners(implementation, rng): "implementation", [TLearner, SLearner, XLearner, RLearner, DRLearner], ) -@pytest.mark.parametrize("use_pandas", [False, True]) -def test_validate_outcome_one_class(implementation, use_pandas, rng): +@pytest.mark.parametrize("backend", ["np", "pd", "csr"]) +def test_validate_outcome_one_class(implementation, backend, rng): X = rng.standard_normal((10, 2)) y = np.zeros(10) w = rng.integers(0, 2, 10) - if use_pandas: + if backend == "pandas": X = pd.DataFrame(X) y = pd.Series(y) w = pd.Series(w) + elif backend == "csr": + X = csr_matrix(X) ml = implementation( True, @@ -1106,15 +1111,17 @@ def test_validate_outcome_one_class(implementation, use_pandas, rng): "implementation", [TLearner, SLearner, XLearner, RLearner, DRLearner], ) -@pytest.mark.parametrize("use_pandas", [False, True]) -def test_validate_outcome_different_classes(implementation, use_pandas, rng): 
+@pytest.mark.parametrize("backend", ["np", "pd", "csr"]) +def test_validate_outcome_different_classes(implementation, backend, rng): X = rng.standard_normal((4, 2)) y = np.array([0, 1, 0, 0]) w = np.array([0, 0, 1, 1]) - if use_pandas: + if backend == "pd": X = pd.DataFrame(X) y = pd.Series(y) w = pd.Series(w) + elif backend == "csr": + X = csr_matrix(X) ml = implementation( True, From 9a6ed735d47db12458ac18e241d48042d867cfe7 Mon Sep 17 00:00:00 2001 From: Apoorva Lal Date: Tue, 27 Aug 2024 13:10:40 -0700 Subject: [PATCH 7/9] fix notebook metadata --- docs/examples/example_estimating_ates.ipynb | 119 ++++++--- docs/examples/example_sparse_inputs.ipynb | 262 ++++++-------------- 2 files changed, 170 insertions(+), 211 deletions(-) diff --git a/docs/examples/example_estimating_ates.ipynb b/docs/examples/example_estimating_ates.ipynb index 0eda7327..79a69d52 100644 --- a/docs/examples/example_estimating_ates.ipynb +++ b/docs/examples/example_estimating_ates.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -99,9 +99,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2.083595103597918, 0.06526671583747883)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "naive_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}\", df) .fit(cov_type=\"HC1\")\n", "naive_est = naive_lm.params.iloc[1], naive_lm.bse.iloc[1]\n", @@ -110,9 +121,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2.1433722387308025, 0.06345124983351998)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "covaradjust_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}+{'+'.join(feature_columns)}\",\n", " df) .fit(cov_type=\"HC1\")\n", @@ -138,9 +160,42 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "from metalearners import DRLearner\n", "from lightgbm import LGBMRegressor, LGBMClassifier\n", @@ -149,9 +204,26 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([1.02931589]), array([0.06679633]))" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "metalearners_dr = DRLearner(\n", " nuisance_model_factory=LGBMRegressor,\n", @@ -557,22 +629,11 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": 
"3.11.9" + "name": "python" + }, + "mystnb": { + "execution_timeout": 120 } }, "nbformat": 4, diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb index 04677eca..56589580 100644 --- a/docs/examples/example_sparse_inputs.ipynb +++ b/docs/examples/example_sparse_inputs.ipynb @@ -20,9 +20,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import time, psutil, os, gc\n", "import numpy as np\n", @@ -43,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -71,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -141,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -154,9 +187,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Sparse data memory: 7.63MB\n", + "Dense data memory: 953.66MB\n" + ] + } + ], "source": [ "print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n", "print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")" @@ -171,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -207,15 +250,6 @@ " print(metalearners_est)" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gc.collect()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -225,12 +259,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sparse data - Runtime: 13.27s, Memory used: 345.56MB\n", + "(array([1.0007226]), array([0.02021719]))\n" + ] + } + ], "source": [ - "fit_drlearner_wrapper(X_csr)\n", - "gc.collect()" + "fit_drlearner_wrapper(X_csr)" ] }, { @@ -242,12 +284,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sparse data - Runtime: 149.10s, Memory used: 70.43MB\n", + "(array([1.00067664]), array([0.02021555]))\n" + ] + } + ], "source": [ - "fit_drlearner_wrapper(X_np)\n", - "gc.collect()" + "fit_drlearner_wrapper(X_np)" ] }, { @@ -256,168 +306,16 @@ "source": [ "In this (admittedly somewhat contrived) example, we that solving the DRLearner problem with sparse inputs takes around 1/8 of the time compared to dense inputs at the cost of some more memory usage in estimation. " ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prediction \n", - "\n", - "These benefits aren't limited to causal inference. We can also use sparse matrices for prediction tasks as well." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def generate_dummy_data(n_samples=100000, n_categories=1000, n_features=20):\n", - " X = np.random.randint(0, n_categories, size=(n_samples, n_features))\n", - " y = np.zeros(n_samples)\n", - " # Select a few features for main effects\n", - " main_effect_features = np.random.choice(n_features, 3, replace=False)\n", - " # Create main effects\n", - " for i in main_effect_features:\n", - " # Create a random effect for each category\n", - " category_effects = np.random.normal(0, 1, n_categories)\n", - " y += category_effects[X[:, i]]\n", - " # Select a couple of feature pairs for interaction effects\n", - " interaction_pairs = [\n", - " (i, j) for i in range(n_features) for j in range(i + 1, n_features)\n", - " ]\n", - " selected_interactions = np.random.choice(len(interaction_pairs), 2, replace=False)\n", - " # Create interaction effects\n", - " for idx in selected_interactions:\n", - " i, j = interaction_pairs[idx]\n", - " # Create a sparse interaction effect\n", - " interaction_effect = np.random.choice(\n", - " [-1, 0, 1], size=(n_categories, n_categories), p=[0.05, 0.9, 0.05]\n", - " )\n", - " y += interaction_effect[X[:, i], X[:, j]]\n", - " # Add a non-linear effect for one feature\n", - " nonlinear_feature = np.random.choice(n_features)\n", - " y += np.square(X[:, nonlinear_feature] / (n_categories / 2) - 1)\n", - " y = (y - np.mean(y)) / np.std(y)\n", - " y += np.random.normal(0, 0.1, n_samples)\n", - "\n", - " return X, y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_data(X):\n", - " e1 = OneHotEncoder(sparse_output=True)\n", - " # dense - use pd.get_dummies to mimic current practice\n", - " Xdf = pd.DataFrame(X)\n", - " return e1.fit_transform(X), pd.get_dummies(Xdf, columns=Xdf.columns).values\n", - "\n", - "def fit_and_measure(X_train, y_train, X_test, y_test):\n", - " start_memory = get_memory_usage()\n", - " start_time = time.time()\n", - " m = LGBMRegressor(n_estimators=100, max_depth=5, learning_rate=0.1, verbose=-1)\n", - " m.fit(X_train, y_train)\n", - " end_time = time.time()\n", - " end_memory = get_memory_usage()\n", - " runtime = end_time - start_time\n", - " memory_used = end_memory - start_memory\n", - "\n", - " # Compute accuracy metrics\n", - " y_pred = m.predict(X_test)\n", - " mse = mean_squared_error(y_test, y_pred)\n", - " r2 = r2_score(y_test, y_pred)\n", - "\n", - " return runtime, memory_used, mse, r2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X, y = generate_dummy_data()\n", - "# Split the data into train and test sets\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "X_train_sparse, X_train_dense = prepare_data(X_train)\n", - "X_test_sparse, X_test_dense = prepare_data(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "sparse_runtime, sparse_memory, sparse_mse, sparse_r2 = fit_and_measure(X_train_sparse, y_train, X_test_sparse, y_test)\n", - "gc.collect()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%time\n", - "dense_runtime, dense_memory, dense_mse, dense_r2 = fit_and_measure(X_train_dense, y_train, X_test_dense, y_test)\n", - "gc.collect()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Mypy can't find these names/variables since they are assigned to via cell-magic.\n", - "print(\n", - " f\"Sparse data - Runtime: {sparse_runtime:.2f}s, \" # type: ignore[name-defined]\n", - " f\"Memory used: {sparse_memory:.2f}MB, \" # type: ignore[name-defined]\n", - " f\"MSE: {sparse_mse:.4f}, R2: {sparse_r2:.4f}\" # type: ignore[name-defined]\n", - ") \n", - "print(\n", - " f\"Dense data - Runtime: {dense_runtime:.2f}s, \" # type: ignore[name-defined]\n", - " f\"Memory used: {dense_memory:.2f}MB, \" # type: ignore[name-defined]\n", - " f\"MSE: {dense_mse:.4f}, R2: {dense_r2:.4f}\" # type: ignore[name-defined]\n", - ")\n", - "\n", - "print(f\"\\nSparse data memory: {X_train_sparse.data.nbytes / 1024 / 1024:.2f}MB\")\n", - "print(f\"Dense data memory: {X_train_dense.nbytes / 1024 / 1024:.2f}MB\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.3" + "name": "python" + }, + "mystnb": { + "execution_timeout": 120 } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 2 } From 8282381c277ec27b9cd834bfa9f66a12337a6432 Mon Sep 17 00:00:00 2001 From: Apoorva Lal Date: Tue, 27 Aug 2024 13:44:35 -0700 Subject: [PATCH 8/9] reduce sparse problem size for docs --- docs/examples/example_sparse_inputs.ipynb | 52 ++++++++++++++--------- 1 file changed, 32 insertions(+), 20 deletions(-) diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb index 56589580..cc351bfa 100644 --- a/docs/examples/example_sparse_inputs.ipynb +++ b/docs/examples/example_sparse_inputs.ipynb @@ -110,7 +110,7 @@ "source": [ "def generate_causal_data(\n", " n_samples=100_000,\n", - " n_categories=1000,\n", + " n_categories=500,\n", " n_features=100,\n", " tau_magnitude=1.0,\n", "):\n", @@ -168,8 +168,9 @@ "\n", "\n", "X, W, Y, tau, propensity_score = generate_causal_data(\n", - " n_samples=10000, tau_magnitude=1.0\n", - ")\n" + " n_samples=1000, tau_magnitude=1.0\n", + ")\n", + "Xdf = pd.DataFrame(X)" ] }, { @@ -179,10 +180,14 @@ "outputs": [], "source": [ "# sparse and dense X matrices\n", - "e1 = OneHotEncoder(sparse_output=True) # onehot encoder generates sparse output automatically\n", - "Xdf = pd.DataFrame(X)\n", + "e1 = OneHotEncoder(\n", + " sparse_output=True\n", + ") # onehot encoder generates sparse output automatically\n", + "\n", "X_csr = e1.fit_transform(X)\n", - "X_np = pd.get_dummies(Xdf, columns=Xdf.columns).values # dense onehot encoding with pandas" + "X_np = pd.get_dummies(\n", + " Xdf, columns=Xdf.columns\n", + ").values # dense onehot encoding with pandas" ] }, { @@ -195,8 +200,8 @@ "output_type": "stream", "text": [ "\n", - "Sparse data memory: 7.63MB\n", - "Dense data memory: 953.66MB\n" + "Sparse data memory: 0.76MB\n", + "Dense data memory: 41.28MB\n" ] } ], @@ -266,8 +271,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sparse data - Runtime: 13.27s, Memory used: 345.56MB\n", - "(array([1.0007226]), array([0.02021719]))\n" + "Sparse data - Runtime: 3.06s, Memory used: 115.93MB\n", + "(array([1.0161235]), array([0.06374022]))\n" ] } ], 
@@ -291,26 +296,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "Sparse data - Runtime: 149.10s, Memory used: 70.43MB\n", - "(array([1.00067664]), array([0.02021555]))\n" + "Sparse data - Runtime: 6.91s, Memory used: 131.66MB\n", + "(array([1.01609547]), array([0.06384197]))\n" ] } ], "source": [ "fit_drlearner_wrapper(X_np)" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this (admittedly somewhat contrived) example, we that solving the DRLearner problem with sparse inputs takes around 1/8 of the time compared to dense inputs at the cost of some more memory usage in estimation. " - ] } ], "metadata": { + "kernelspec": { + "display_name": "py311", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" }, "mystnb": { "execution_timeout": 120 From e4501b4450791ac8ab3ef2e44d31ff8c6864bcba Mon Sep 17 00:00:00 2001 From: Apoorva Lal Date: Wed, 28 Aug 2024 08:54:09 -0700 Subject: [PATCH 9/9] final touches --- CHANGELOG.rst | 8 ++ docs/examples/example_estimating_ates.ipynb | 100 ++++---------------- docs/examples/example_sparse_inputs.ipynb | 93 ++++-------------- metalearners/_utils.py | 2 +- 4 files changed, 45 insertions(+), 158 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 56fd87c7..34eb7f81 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,14 @@ Changelog ========= +0.11.0 (2024-09-xx) +------------------- + +**New features** + +* Add support for using ``scipy.sparse.csr_matrix`` as data structure for covariates ``X``. + + 0.10.0 (2024-08-13) ------------------- diff --git a/docs/examples/example_estimating_ates.ipynb b/docs/examples/example_estimating_ates.ipynb index 79a69d52..ec88ec48 100644 --- a/docs/examples/example_estimating_ates.ipynb +++ b/docs/examples/example_estimating_ates.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -99,20 +99,9 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2.083595103597918, 0.06526671583747883)" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "naive_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}\", df) .fit(cov_type=\"HC1\")\n", "naive_est = naive_lm.params.iloc[1], naive_lm.bse.iloc[1]\n", @@ -121,20 +110,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2.1433722387308025, 0.06345124983351998)" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "covaradjust_lm = smf.ols(f\"{outcome_column} ~ {treatment_column}+{'+'.join(feature_columns)}\",\n", " df) .fit(cov_type=\"HC1\")\n", @@ -160,42 +138,9 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - 
], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "from metalearners import DRLearner\n", "from lightgbm import LGBMRegressor, LGBMClassifier\n", @@ -204,7 +149,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "editable": true, "slideshow": { @@ -212,18 +157,7 @@ }, "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "(array([1.02931589]), array([0.06679633]))" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "metalearners_dr = DRLearner(\n", " nuisance_model_factory=LGBMRegressor,\n", @@ -629,8 +563,14 @@ } ], "metadata": { + "kernelspec": { + "display_name": "py311", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "name": "python", + "version": "3.11.7" }, "mystnb": { "execution_timeout": 120 diff --git a/docs/examples/example_sparse_inputs.ipynb b/docs/examples/example_sparse_inputs.ipynb index cc351bfa..03cad0c2 100644 --- a/docs/examples/example_sparse_inputs.ipynb +++ b/docs/examples/example_sparse_inputs.ipynb @@ -20,42 +20,9 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "import time, psutil, os, gc\n", "import numpy as np\n", @@ -76,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -104,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -175,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -192,19 +159,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Sparse data memory: 0.76MB\n", - "Dense data memory: 41.28MB\n" - ] - } - ], + "outputs": [], "source": [ "print(f\"\\nSparse data memory: {X_csr.data.nbytes / 1024 / 1024:.2f}MB\")\n", "print(f\"Dense data memory: {X_np.nbytes / 1024 / 1024:.2f}MB\")" @@ -219,11 +176,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "def fit_drlearner_wrapper(X):\n", + "def fit_drlearner_wrapper(X, name):\n", " start_memory = get_memory_usage()\n", " start_time = time.time()\n", " metalearners_dr = DRLearner(\n", @@ -251,7 +208,7 @@ " end_memory = get_memory_usage()\n", " runtime = end_time - start_time\n", " memory_used = end_memory - start_memory\n", - " print(f\"Sparse data - Runtime: {runtime:.2f}s, Memory used: {memory_used:.2f}MB\")\n", + " print(f\"{name} data - Runtime: {runtime:.2f}s, Memory used: {memory_used:.2f}MB\")\n", " print(metalearners_est)" ] }, @@ -264,20 +221,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sparse data - Runtime: 3.06s, Memory used: 115.93MB\n", - "(array([1.0161235]), array([0.06374022]))\n" - ] - } - ], + "outputs": [], "source": [ - "fit_drlearner_wrapper(X_csr)" + "fit_drlearner_wrapper(X_csr, \"Sparse\")" ] }, { @@ -289,20 +237,11 @@ }, { "cell_type": "code", - "execution_count": 8, + 
"execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sparse data - Runtime: 6.91s, Memory used: 131.66MB\n", - "(array([1.01609547]), array([0.06384197]))\n" - ] - } - ], + "outputs": [], "source": [ - "fit_drlearner_wrapper(X_np)" + "fit_drlearner_wrapper(X_np, \"Dense\")" ] } ], diff --git a/metalearners/_utils.py b/metalearners/_utils.py index c1d63d67..a5c02f37 100644 --- a/metalearners/_utils.py +++ b/metalearners/_utils.py @@ -25,7 +25,7 @@ default_rng = np.random.default_rng() -def safe_len(X): +def safe_len(X: Matrix) -> int: if scipy.sparse.issparse(X): return X.shape[0] return len(X)