Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve memory usage of MetaLearnerGridSearch #62

Merged
merged 17 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ Changelog
* Added :meth:`metalearners.metalearner.MetaLearner.fit_all_nuisance` and
:meth:`metalearners.metalearner.MetaLearner.fit_all_treatment`.

* Add optional ``store_raw_results`` and ``store_results`` parameters to :class:`metalearners.grid_search.MetaLearnerGridSearch`.

* Renamed :class:`metalearners.grid_search._GSResult` to :class:`metalearners.grid_search.GSResult`.

* Added ``grid_size_`` attribute to :class:`metalearners.grid_search.MetaLearnerGridSearch`.

* Implement :meth:`metalearners.cross_fit_estimator.CrossFitEstimator.score`.

**Bug fixes**
Expand Down
20 changes: 20 additions & 0 deletions docs/examples/example_gridsearch.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,26 @@
"gs.results_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"What if I run out of memory?\n",
"----------------------------\n",
"\n",
"If you're conducting an optimization task over a large grid with a substantial dataset,\n",
"it is possible that memory usage issues may arise. To try to solve these, you can minimize\n",
"memory usage by adjusting your settings.\n",
"\n",
"In that case you can set ``store_raw_results=False``, the grid search will then operate\n",
"with a generator rather than a list, significantly reducing memory usage.\n",
"\n",
"If the ``results_ DataFrame`` is what you're after, you can simply set ``store_results=True``.\n",
"However, if you aim to iterate over the {class}`~metalearners.metalearner.MetaLearner` objects,\n",
"you can set ``store_results=False``. Consequently, ``raw_results_`` will become a generator\n",
"object yielding {class}`~metalearners.grid_search.GSResult`."
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
110 changes: 77 additions & 33 deletions metalearners/grid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-License-Identifier: BSD-3-Clause

import time
from collections.abc import Mapping, Sequence
from collections.abc import Generator, Mapping, Sequence
from dataclasses import dataclass
from typing import Any

Expand All @@ -17,7 +17,8 @@

@dataclass(frozen=True)
class _FitAndScoreJob:
metalearner: MetaLearner
metalearner_factory: type[MetaLearner]
metalearner_params: dict[str, Any]
X_train: Matrix
y_train: Vector
w_train: Vector
Expand All @@ -32,7 +33,7 @@ class _FitAndScoreJob:


@dataclass(frozen=True)
class _GSResult:
class GSResult:
r"""Result from a single grid search evaluation."""

metalearner: MetaLearner
Expand All @@ -42,23 +43,22 @@ class _GSResult:
score_time: float


def _fit_and_score(job: _FitAndScoreJob) -> _GSResult:
def _fit_and_score(job: _FitAndScoreJob) -> GSResult:
start_time = time.time()
job.metalearner.fit(
job.X_train, job.y_train, job.w_train, **job.metalerner_fit_params
)
ml = job.metalearner_factory(**job.metalearner_params)
ml.fit(job.X_train, job.y_train, job.w_train, **job.metalerner_fit_params)
fit_time = time.time() - start_time
start_time = time.time()

train_scores = job.metalearner.evaluate(
train_scores = ml.evaluate(
X=job.X_train,
y=job.y_train,
w=job.w_train,
is_oos=False,
scoring=job.scoring,
)
if job.X_test is not None and job.y_test is not None and job.w_test is not None:
test_scores = job.metalearner.evaluate(
test_scores = ml.evaluate(
X=job.X_test,
y=job.y_test,
w=job.w_test,
Expand All @@ -69,16 +69,18 @@ def _fit_and_score(job: _FitAndScoreJob) -> _GSResult:
else:
test_scores = None
score_time = time.time() - start_time
return _GSResult(
metalearner=job.metalearner,
return GSResult(
metalearner=ml,
fit_time=fit_time,
score_time=score_time,
train_scores=train_scores,
test_scores=test_scores,
)


def _format_results(results: Sequence[_GSResult]) -> pd.DataFrame:
def _format_results(
results: list[GSResult] | Generator[GSResult, None, None]
) -> pd.DataFrame:
rows = []
for result in results:
row: dict[str, str | int | float] = {}
Expand Down Expand Up @@ -180,11 +182,29 @@ class MetaLearnerGridSearch:

``verbose`` will be passed to `joblib.Parallel <https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation>`_.

After fitting a dataframe with the results will be available in `results_`.
``store_raw_results`` and ``store_results`` define which and how the results are saved
after calling :meth:`~metalearners.grid_search.MetaLearnerGridSearch.fit` depending on
their values:

* Both are ``True`` (default): ``raw_results_`` will be a list of
:class:`~metalearners.grid_search.GSResult` with all the results and ``results_``
will be a DataFrame with the processed results.
* ``store_raw_results=True`` and ``store_results=False``: ``raw_results_`` will be a
list of :class:`~metalearners.grid_search.GSResult` with all the results
and ``results_`` will be ``None``.
* ``store_raw_results=False`` and ``store_results=True``: ``raw_results_`` will be
``None`` and ``results_`` will be a DataFrame with the processed results.
* Both are ``False``: ``raw_results_`` will be a generator which yields a
:class:`~metalearners.grid_search.GSResult` for each configuration and ``results_``
will be ``None``. This configuration can be useful when the grid is large and you
do not want to keep all MetaLearner objects in memory; instead, you can evaluate
each one right after it is fitted and store only what you need.

``grid_size_`` will contain the number of hyperparameter combinations after fitting.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't quite grasped yet what the motivation for this attribute is. If the user expects a generator, won't they just be used to querying it until it no longer yields?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes! But I think it may be useful in the case the user wants to show a progress bar or similar, this way it's easier for them to access the number of metalearners to fit instead of having to calculate it manually.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I understand what you mean. At the same time this use case is not super clear to me. Shall we simplify and remove it until we witness that there is a need for it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I may not explain it correctly, in the case we set store_raw_results = False and store_results = False then the fit method finishes "instantly" (it does not wait for fitting the individual metalearners). Then, afaict these are only fitted when the user requests them by iterating over the generator where it may be of use to use this grid_size_ to display some progress bar or similar. If you still think it's not clear lmk and we can discuss it further.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed, I added some explanation about this in the docstring:
1590e94


For an illustration see :ref:`our example on Tuning hyperparameters of a MetaLearner with MetaLearnerGridSearch <example-grid-search>`.
"""

# TODO: Add a reference to a docs example once it is written.

def __init__(
self,
metalearner_factory: type[MetaLearner],
Expand All @@ -195,16 +215,17 @@ def __init__(
n_jobs: int | None = None,
random_state: int | None = None,
verbose: int = 0,
store_raw_results: bool = True,
store_results: bool = True,
):
self.metalearner_factory = metalearner_factory
self.metalearner_params = metalearner_params
self.scoring = scoring
self.n_jobs = n_jobs
self.random_state = random_state
self.verbose = verbose

self.raw_results_: Sequence[_GSResult] | None = None
self.results_: pd.DataFrame | None = None
self.store_raw_results = store_raw_results
self.store_results = store_results

all_base_models = set(
metalearner_factory.nuisance_model_specifications().keys()
Expand Down Expand Up @@ -286,20 +307,33 @@ def fit(
}
propensity_model_params = params.get(PROPENSITY_MODEL, None)

ml = self.metalearner_factory(
**self.metalearner_params,
nuisance_model_factory=nuisance_model_factory,
treatment_model_factory=treatment_model_factory,
propensity_model_factory=propensity_model_factory,
nuisance_model_params=nuisance_model_params,
treatment_model_params=treatment_model_params,
propensity_model_params=propensity_model_params,
random_state=self.random_state,
)
grid_metalearner_params = {
"nuisance_model_factory": nuisance_model_factory,
"treatment_model_factory": treatment_model_factory,
"propensity_model_factory": propensity_model_factory,
"nuisance_model_params": nuisance_model_params,
"treatment_model_params": treatment_model_params,
"propensity_model_params": propensity_model_params,
"random_state": self.random_state,
}

if (
len(
shared_keys := set(grid_metalearner_params.keys())
& set(self.metalearner_params.keys())
)
> 0
):
raise ValueError(
f"{shared_keys} should not be specified in metalearner_params as "
"they are used internally. Please use the correct parameters."
)

jobs.append(
_FitAndScoreJob(
metalearner=ml,
metalearner_factory=self.metalearner_factory,
metalearner_params=dict(self.metalearner_params)
| grid_metalearner_params,
X_train=X,
y_train=y,
w_train=w,
Expand All @@ -312,7 +346,17 @@ def fit(
)
)

parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
raw_results = parallel(delayed(_fit_and_score)(job) for job in jobs)
self.raw_results_ = raw_results
self.results_ = _format_results(results=raw_results)
self.grid_size_ = len(jobs)
self.raw_results_: list[GSResult] | Generator[GSResult, None, None] | None
self.results_: pd.DataFrame | None = None

return_as = "list" if self.store_raw_results else "generator_unordered"
parallel = Parallel(
n_jobs=self.n_jobs, verbose=self.verbose, return_as=return_as
)
self.raw_results_ = parallel(delayed(_fit_and_score)(job) for job in jobs)
if self.store_results:
self.results_ = _format_results(results=self.raw_results_) # type: ignore
if not self.store_raw_results:
# The generator will be empty so we replace it with None
self.raw_results_ = None
66 changes: 66 additions & 0 deletions tests/test_grid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@
# SPDX-License-Identifier: BSD-3-Clause


from types import GeneratorType

import numpy as np
import pandas as pd
import pytest
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
Expand Down Expand Up @@ -153,6 +156,7 @@ def test_metalearnergridsearch_smoke(
assert gs.results_ is not None
assert gs.results_.shape[0] == expected_n_configs
assert gs.results_.index.names == expected_index_cols
assert gs.grid_size_ == expected_n_configs

train_scores_cols = set(
c[6:] for c in list(gs.results_.columns) if c.startswith("train_")
Expand Down Expand Up @@ -259,3 +263,65 @@ def test_metalearnergridsearch_reuse_propensity_smoke(grid_search_data):
assert gs.results_ is not None
assert gs.results_.shape[0] == 2
assert len(gs.results_.index.names) == 5


@pytest.mark.parametrize(
    "store_raw_results, store_results, expected_type_raw_results, expected_type_results",
    [
        (True, True, list, pd.DataFrame),
        (True, False, list, type(None)),
        (False, True, type(None), pd.DataFrame),
        (False, False, GeneratorType, type(None)),
    ],
)
def test_metalearnergridsearch_store(
    store_raw_results,
    store_results,
    expected_type_raw_results,
    expected_type_results,
    grid_search_data,
):
    """Verify that the storage flags control the types of ``raw_results_`` and ``results_``."""
    X, _, y, w, X_test, _, y_test, w_test = grid_search_data

    gs = MetaLearnerGridSearch(
        metalearner_factory=SLearner,
        metalearner_params={
            "is_classification": False,
            "n_variants": len(np.unique(w)),
            "n_folds": 2,
        },
        base_learner_grid={"base_model": [LinearRegression, LGBMRegressor]},
        param_grid={"base_model": {"LGBMRegressor": {"n_estimators": [1, 2]}}},
        store_raw_results=store_raw_results,
        store_results=store_results,
    )
    gs.fit(X, y, w, X_test, y_test, w_test)

    # Each flag combination maps to a distinct pair of result-attribute types.
    assert isinstance(gs.raw_results_, expected_type_raw_results)
    assert isinstance(gs.results_, expected_type_results)


def test_metalearnergridsearch_error(grid_search_data):
    """A user-supplied parameter clashing with internally managed ones should raise."""
    X, _, y, w, X_test, _, y_test, w_test = grid_search_data

    # ``random_state`` is set internally by the grid search, so passing it
    # through ``metalearner_params`` must be rejected.
    gs = MetaLearnerGridSearch(
        metalearner_factory=SLearner,
        metalearner_params={
            "is_classification": False,
            "n_variants": len(np.unique(w)),
            "n_folds": 2,
            "random_state": 1,
        },
        base_learner_grid={"base_model": [LinearRegression, LGBMRegressor]},
        param_grid={"base_model": {"LGBMRegressor": {"n_estimators": [1, 2]}}},
    )

    with pytest.raises(
        ValueError, match="should not be specified in metalearner_params"
    ):
        gs.fit(X, y, w, X_test, y_test, w_test)
Loading