Improve memory usage of MetaLearnerGridSearch (#62)
* Add options for storing

* Tests

* Finish TODO

* Reduce memory usage by not creating metalearner object

* Update CHANGELOG

* Use generator_unordered

* Add grid_size_ and move attributes initialization to fit

* Fix

* Fix

* grid_size_ docstring

* Add new options to tutorial

* Remove check empty generator

* Apply suggestions from code review

Co-authored-by: Kevin Klein <[email protected]>

* Add explanation grid_size_

---------

Co-authored-by: Kevin Klein <[email protected]>
FrancescMartiEscofetQC and kklein authored Jul 22, 2024
1 parent 9406ef7 commit 80ce219
Showing 4 changed files with 173 additions and 33 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.rst
@@ -15,6 +15,12 @@ Changelog
* Added :meth:`metalearners.metalearner.MetaLearner.fit_all_nuisance` and
:meth:`metalearners.metalearner.MetaLearner.fit_all_treatment`.

* Added optional ``store_raw_results`` and ``store_results`` parameters to :class:`metalearners.grid_search.MetaLearnerGridSearch`.

* Renamed :class:`metalearners.grid_search._GSResult` to :class:`metalearners.grid_search.GSResult`.

* Added ``grid_size_`` attribute to :class:`metalearners.grid_search.MetaLearnerGridSearch`.

* Implement :meth:`metalearners.cross_fit_estimator.CrossFitEstimator.score`.

**Bug fixes**
20 changes: 20 additions & 0 deletions docs/examples/example_gridsearch.ipynb
@@ -327,6 +327,26 @@
"gs.results_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"What if I run out of memory?\n",
"----------------------------\n",
"\n",
"If you're conducting an optimization task over a large grid with a substantial dataset,\n",
"it is possible that memory usage issues may arise. To try to solve these, you can minimize\n",
"memory usage by adjusting your settings.\n",
"\n",
"In that case you can set ``store_raw_results=False``, the grid search will then operate\n",
"with a generator rather than a list, significantly reducing memory usage.\n",
"\n",
"If the ``results_ DataFrame`` is what you're after, you can simply set ``store_results=True``.\n",
"However, if you aim to iterate over the {class}`~metalearners.metalearner.MetaLearner` objects,\n",
"you can set ``store_results=False``. Consequently, ``raw_results_`` will become a generator\n",
"object yielding {class}`~metalearners.grid_search.GSResult`."
]
},
{
"cell_type": "markdown",
"metadata": {},
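A minimal usage sketch (not part of this commit) of the two flags described in the new notebook cell above. The data is synthetic and purely illustrative; the grid mirrors the one used in this commit's tests, and the import paths are assumed to match the package layout seen in this diff.

import numpy as np
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression

from metalearners import SLearner
from metalearners.grid_search import MetaLearnerGridSearch

# Synthetic data: 100 observations, 3 features, a binary treatment.
rng = np.random.default_rng(42)
X = rng.normal(size=(100, 3))
w = rng.integers(0, 2, size=100)
y = X[:, 0] + 0.5 * w + rng.normal(size=100)

gs = MetaLearnerGridSearch(
    metalearner_factory=SLearner,
    metalearner_params={"is_classification": False, "n_variants": 2, "n_folds": 2},
    base_learner_grid={"base_model": [LinearRegression, LGBMRegressor]},
    param_grid={"base_model": {"LGBMRegressor": {"n_estimators": [1, 2]}}},
    store_raw_results=False,  # fit via a generator instead of keeping a list
    store_results=True,       # still materialize the processed results_ DataFrame
)
gs.fit(X, y, w)
print(gs.results_)  # raw_results_ is None in this configuration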
114 changes: 81 additions & 33 deletions metalearners/grid_search.py
@@ -2,7 +2,7 @@
# SPDX-License-Identifier: BSD-3-Clause

import time
from collections.abc import Mapping, Sequence
from collections.abc import Generator, Mapping, Sequence
from dataclasses import dataclass
from typing import Any

@@ -17,7 +17,8 @@

@dataclass(frozen=True)
class _FitAndScoreJob:
metalearner: MetaLearner
metalearner_factory: type[MetaLearner]
metalearner_params: dict[str, Any]
X_train: Matrix
y_train: Vector
w_train: Vector
@@ -32,7 +33,7 @@ class _FitAndScoreJob:


@dataclass(frozen=True)
class _GSResult:
class GSResult:
r"""Result from a single grid search evaluation."""

metalearner: MetaLearner
@@ -42,23 +42,22 @@ class _GSResult:
score_time: float


def _fit_and_score(job: _FitAndScoreJob) -> _GSResult:
def _fit_and_score(job: _FitAndScoreJob) -> GSResult:
start_time = time.time()
job.metalearner.fit(
job.X_train, job.y_train, job.w_train, **job.metalerner_fit_params
)
ml = job.metalearner_factory(**job.metalearner_params)
ml.fit(job.X_train, job.y_train, job.w_train, **job.metalerner_fit_params)
fit_time = time.time() - start_time
start_time = time.time()

train_scores = job.metalearner.evaluate(
train_scores = ml.evaluate(
X=job.X_train,
y=job.y_train,
w=job.w_train,
is_oos=False,
scoring=job.scoring,
)
if job.X_test is not None and job.y_test is not None and job.w_test is not None:
test_scores = job.metalearner.evaluate(
test_scores = ml.evaluate(
X=job.X_test,
y=job.y_test,
w=job.w_test,
@@ -69,16 +69,18 @@ def _fit_and_score(job: _FitAndScoreJob) -> _GSResult:
else:
test_scores = None
score_time = time.time() - start_time
return _GSResult(
metalearner=job.metalearner,
return GSResult(
metalearner=ml,
fit_time=fit_time,
score_time=score_time,
train_scores=train_scores,
test_scores=test_scores,
)


def _format_results(results: Sequence[_GSResult]) -> pd.DataFrame:
def _format_results(
results: list[GSResult] | Generator[GSResult, None, None]
) -> pd.DataFrame:
rows = []
for result in results:
row: dict[str, str | int | float] = {}
@@ -180,11 +182,33 @@ class MetaLearnerGridSearch:
``verbose`` will be passed to `joblib.Parallel <https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation>`_.
After fitting, a DataFrame with the results will be available in ``results_``.
``store_raw_results`` and ``store_results`` define which results are stored, and in
which form, after calling :meth:`~metalearners.grid_search.MetaLearnerGridSearch.fit`:

* Both are ``True`` (default): ``raw_results_`` will be a list of
  :class:`~metalearners.grid_search.GSResult` with all the results and ``results_``
  will be a DataFrame with the processed results.
* ``store_raw_results=True`` and ``store_results=False``: ``raw_results_`` will be a
  list of :class:`~metalearners.grid_search.GSResult` with all the results
  and ``results_`` will be ``None``.
* ``store_raw_results=False`` and ``store_results=True``: ``raw_results_`` will be
  ``None`` and ``results_`` will be a DataFrame with the processed results.
* Both are ``False``: ``raw_results_`` will be a generator which yields a
  :class:`~metalearners.grid_search.GSResult` for each configuration and ``results_``
  will be ``None``. This configuration can be useful when the grid is large and you
  do not want to keep all MetaLearner objects in memory, but would rather evaluate
  each one right after it is fitted and store only what you need.

``grid_size_`` will contain the number of hyperparameter combinations after fitting.
This attribute is particularly useful when ``store_raw_results=False`` and
``store_results=False``: in that case, the generator returned in ``raw_results_``
doesn't trigger the fitting of individual metalearners until explicitly requested,
e.g. in a loop, so ``grid_size_`` can be used to track progress, for instance by
creating a progress bar or a similar utility.
For an illustration see :ref:`our example on Tuning hyperparameters of a MetaLearner with MetaLearnerGridSearch <example-grid-search>`.
"""

# TODO: Add a reference to a docs example once it is written.

def __init__(
self,
metalearner_factory: type[MetaLearner],
Expand All @@ -195,16 +219,17 @@ def __init__(
n_jobs: int | None = None,
random_state: int | None = None,
verbose: int = 0,
store_raw_results: bool = True,
store_results: bool = True,
):
self.metalearner_factory = metalearner_factory
self.metalearner_params = metalearner_params
self.scoring = scoring
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose

self.raw_results_: Sequence[_GSResult] | None = None
self.results_: pd.DataFrame | None = None
self.store_raw_results = store_raw_results
self.store_results = store_results

all_base_models = set(
metalearner_factory.nuisance_model_specifications().keys()
@@ -286,20 +311,33 @@ def fit(
}
propensity_model_params = params.get(PROPENSITY_MODEL, None)

ml = self.metalearner_factory(
**self.metalearner_params,
nuisance_model_factory=nuisance_model_factory,
treatment_model_factory=treatment_model_factory,
propensity_model_factory=propensity_model_factory,
nuisance_model_params=nuisance_model_params,
treatment_model_params=treatment_model_params,
propensity_model_params=propensity_model_params,
random_state=self.random_state,
)
grid_metalearner_params = {
"nuisance_model_factory": nuisance_model_factory,
"treatment_model_factory": treatment_model_factory,
"propensity_model_factory": propensity_model_factory,
"nuisance_model_params": nuisance_model_params,
"treatment_model_params": treatment_model_params,
"propensity_model_params": propensity_model_params,
"random_state": self.random_state,
}

if (
len(
shared_keys := set(grid_metalearner_params.keys())
& set(self.metalearner_params.keys())
)
> 0
):
raise ValueError(
f"{shared_keys} should not be specified in metalearner_params as "
"they are used internally. Please use the correct parameters."
)

jobs.append(
_FitAndScoreJob(
metalearner=ml,
metalearner_factory=self.metalearner_factory,
metalearner_params=dict(self.metalearner_params)
| grid_metalearner_params,
X_train=X,
y_train=y,
w_train=w,
Expand All @@ -312,7 +350,17 @@ def fit(
)
)

parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
raw_results = parallel(delayed(_fit_and_score)(job) for job in jobs)
self.raw_results_ = raw_results
self.results_ = _format_results(results=raw_results)
self.grid_size_ = len(jobs)
self.raw_results_: list[GSResult] | Generator[GSResult, None, None] | None
self.results_: pd.DataFrame | None = None

return_as = "list" if self.store_raw_results else "generator_unordered"
parallel = Parallel(
n_jobs=self.n_jobs, verbose=self.verbose, return_as=return_as
)
self.raw_results_ = parallel(delayed(_fit_and_score)(job) for job in jobs)
if self.store_results:
self.results_ = _format_results(results=self.raw_results_) # type: ignore
if not self.store_raw_results:
# The generator will be empty so we replace it with None
self.raw_results_ = None
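To complement the docstring above, here is a sketch (again, not part of this commit) of the fully lazy mode that ``grid_size_`` is designed for. It reuses the synthetic data and grid from the earlier sketch; ``tqdm`` and the selection criterion are illustrative assumptions.

from tqdm import tqdm

gs = MetaLearnerGridSearch(
    metalearner_factory=SLearner,
    metalearner_params={"is_classification": False, "n_variants": 2, "n_folds": 2},
    base_learner_grid={"base_model": [LinearRegression, LGBMRegressor]},
    param_grid={"base_model": {"LGBMRegressor": {"n_estimators": [1, 2]}}},
    store_raw_results=False,
    store_results=False,
)
gs.fit(X, y, w)

best_result, best_score = None, float("-inf")
# raw_results_ is a generator: each iteration fits and scores exactly one
# configuration, so only the current GSResult is held in memory.
for result in tqdm(gs.raw_results_, total=gs.grid_size_):
    # Stand-in criterion: pick whichever entry of train_scores matters to you.
    score = sum(result.train_scores.values())
    if score > best_score:
        best_result, best_score = result, score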
66 changes: 66 additions & 0 deletions tests/test_grid_search.py
@@ -2,7 +2,10 @@
# SPDX-License-Identifier: BSD-3-Clause


from types import GeneratorType

import numpy as np
import pandas as pd
import pytest
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
@@ -153,6 +156,7 @@ def test_metalearnergridsearch_smoke(
assert gs.results_ is not None
assert gs.results_.shape[0] == expected_n_configs
assert gs.results_.index.names == expected_index_cols
assert gs.grid_size_ == expected_n_configs

train_scores_cols = set(
c[6:] for c in list(gs.results_.columns) if c.startswith("train_")
@@ -259,3 +263,65 @@ def test_metalearnergridsearch_reuse_propensity_smoke(grid_search_data):
assert gs.results_ is not None
assert gs.results_.shape[0] == 2
assert len(gs.results_.index.names) == 5


@pytest.mark.parametrize(
"store_raw_results, store_results, expected_type_raw_results, expected_type_results",
[
(True, True, list, pd.DataFrame),
(True, False, list, type(None)),
(False, True, type(None), pd.DataFrame),
(False, False, GeneratorType, type(None)),
],
)
def test_metalearnergridsearch_store(
store_raw_results,
store_results,
expected_type_raw_results,
expected_type_results,
grid_search_data,
):
X, _, y, w, X_test, _, y_test, w_test = grid_search_data
n_variants = len(np.unique(w))

metalearner_params = {
"is_classification": False,
"n_variants": n_variants,
"n_folds": 2,
}

gs = MetaLearnerGridSearch(
metalearner_factory=SLearner,
metalearner_params=metalearner_params,
base_learner_grid={"base_model": [LinearRegression, LGBMRegressor]},
param_grid={"base_model": {"LGBMRegressor": {"n_estimators": [1, 2]}}},
store_raw_results=store_raw_results,
store_results=store_results,
)

gs.fit(X, y, w, X_test, y_test, w_test)
assert isinstance(gs.raw_results_, expected_type_raw_results)
assert isinstance(gs.results_, expected_type_results)


def test_metalearnergridsearch_error(grid_search_data):
X, _, y, w, X_test, _, y_test, w_test = grid_search_data
n_variants = len(np.unique(w))

metalearner_params = {
"is_classification": False,
"n_variants": n_variants,
"n_folds": 2,
"random_state": 1,
}

gs = MetaLearnerGridSearch(
metalearner_factory=SLearner,
metalearner_params=metalearner_params,
base_learner_grid={"base_model": [LinearRegression, LGBMRegressor]},
param_grid={"base_model": {"LGBMRegressor": {"n_estimators": [1, 2]}}},
)
with pytest.raises(
ValueError, match="should not be specified in metalearner_params"
):
gs.fit(X, y, w, X_test, y_test, w_test)
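
Background for the store-option tests above: the memory saving hinges on joblib's ``return_as`` argument; to the best of our knowledge, ``"generator_unordered"`` requires joblib >= 1.4. A standalone sketch of the mechanism, independent of metalearners:

from joblib import Parallel, delayed

def work(i):
    return i * i

# return_as="list" (the default) materializes every result before returning;
# return_as="generator_unordered" yields each result as soon as a worker
# finishes, so consumed results can be garbage-collected immediately.
gen = Parallel(n_jobs=2, return_as="generator_unordered")(
    delayed(work)(i) for i in range(5)
)
print(sorted(gen))  # completion order is not guaranteed, hence the sort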
