Disable cv to be able to reuse models
FrancescMartiEscofetQC committed Jul 4, 2024
1 parent 82d38d9 commit 3b841e5
Showing 2 changed files with 187 additions and 111 deletions.
metalearners/grid_search.py: 108 additions & 102 deletions
@@ -8,10 +8,9 @@

 import pandas as pd
 from joblib import Parallel, delayed
-from sklearn.model_selection import KFold, ParameterGrid
+from sklearn.model_selection import ParameterGrid

 from metalearners._typing import Matrix, OosMethod, Scoring, Vector, _ScikitModel
-from metalearners._utils import index_matrix, index_vector
 from metalearners.cross_fit_estimator import OVERALL
 from metalearners.metalearner import PROPENSITY_MODEL, MetaLearner
@@ -22,28 +21,26 @@ class _FitAndScoreJob:
     X_train: Matrix
     y_train: Vector
     w_train: Vector
-    X_test: Matrix
-    y_test: Vector
-    w_test: Vector
+    X_test: Matrix | None
+    y_test: Vector | None
+    w_test: Vector | None
     oos_method: OosMethod
     scoring: Scoring | None
     kwargs: dict
-    cv_index: int


 @dataclass(frozen=True)
-class _CVResult:
+class _GSResult:
     r"""Cross Validation Result."""

     metalearner: MetaLearner
     train_scores: dict
-    test_scores: dict
+    test_scores: dict | None
     fit_time: float
     score_time: float
-    cv_index: int


-def _fit_and_score(job: _FitAndScoreJob) -> _CVResult:
+def _fit_and_score(job: _FitAndScoreJob) -> _GSResult:
     start_time = time.time()
     job.metalearner.fit(job.X_train, job.y_train, job.w_train, **job.kwargs)
     fit_time = time.time() - start_time
@@ -55,31 +52,36 @@ def _fit_and_score(job: _FitAndScoreJob) -> _CVResult:
         is_oos=False,
         scoring=job.scoring,
     )
-    test_scores = job.metalearner.evaluate(
-        X=job.X_test,
-        y=job.y_test,
-        w=job.w_test,
-        is_oos=True,
-        oos_method=job.oos_method,
-        scoring=job.scoring,
-    )
+    if job.X_test is not None and job.y_test is not None and job.w_test is not None:
+        test_scores = job.metalearner.evaluate(
+            X=job.X_test,
+            y=job.y_test,
+            w=job.w_test,
+            is_oos=True,
+            oos_method=job.oos_method,
+            scoring=job.scoring,
+        )
+    else:
+        test_scores = None
     score_time = time.time() - fit_time
-    return _CVResult(
+    return _GSResult(
         metalearner=job.metalearner,
         fit_time=fit_time,
         score_time=score_time,
         train_scores=train_scores,
         test_scores=test_scores,
-        cv_index=job.cv_index,
     )


-def _format_results(results: Sequence[_CVResult]) -> pd.DataFrame:
+def _format_results(results: Sequence[_GSResult]) -> pd.DataFrame:
     rows = []
     for result in results:
         row: dict[str, str | int | float] = {}
         row["metalearner"] = result.metalearner.__class__.__name__
-        nuisance_models = set(result.metalearner.nuisance_model_specifications().keys())
+        nuisance_models = (
+            set(result.metalearner.nuisance_model_specifications().keys())
+            - result.metalearner._prefitted_nuisance_models
+        )
         treatment_models = set(
             result.metalearner.treatment_model_specifications().keys()
         )
@@ -99,19 +101,19 @@ def _format_results(results: Sequence[_CVResult]) -> pd.DataFrame:
                 model_kind
             ].items():
                 row[f"{model_kind}_{param}"] = value
-        row["cv_index"] = result.cv_index
         row["fit_time"] = result.fit_time
         row["score_time"] = result.score_time
         for name, value in result.train_scores.items():
             row[f"train_{name}"] = value
-        for name, value in result.test_scores.items():
-            row[f"test_{name}"] = value
+        if result.test_scores is not None:
+            for name, value in result.test_scores.items():
+                row[f"test_{name}"] = value
         rows.append(row)
     df = pd.DataFrame(rows)
     return df


-class MetaLearnerGridSearchCV:
+class MetaLearnerGridSearch:
     """Exhaustive search over specified parameter values for a MetaLearner.

     ``metalearner_params`` should contain the necessary params for the MetaLearner initialization
@@ -167,28 +169,37 @@ def __init__(
         base_learner_grid: Mapping[str, Sequence[type[_ScikitModel]]],
         param_grid: Mapping[str, Mapping[str, Mapping[str, Sequence]]],
         scoring: Scoring | None = None,
-        cv: int = 5,
         n_jobs: int | None = None,
         random_state: int | None = None,
         verbose: int = 0,
     ):
         self.metalearner_factory = metalearner_factory
         self.metalearner_params = metalearner_params
         self.scoring = scoring
-        self.cv = cv
         self.n_jobs = n_jobs
         self.random_state = random_state
         self.verbose = verbose

-        self.raw_results_: Sequence[_CVResult] | None = None
-        self.cv_results_: pd.DataFrame | None = None
+        self.raw_results_: Sequence[_GSResult] | None = None
+        self.results_: pd.DataFrame | None = None

-        expected_base_models = set(
+        all_base_models = set(
            metalearner_factory.nuisance_model_specifications().keys()
         ) | set(metalearner_factory.treatment_model_specifications().keys())

-        if set(base_learner_grid.keys()) != expected_base_models:
-            raise ValueError("base_learner_grid keys don't match the model names.")
+        self.fitted_models = set(
+            metalearner_params.get("fitted_nuisance_models", {}).keys()
+        )
+        if metalearner_params.get("fitted_propensity_model", None) is not None:
+            self.fitted_models |= {PROPENSITY_MODEL}
+
+        self.models_to_fit = all_base_models - self.fitted_models
+
+        if set(base_learner_grid.keys()) != self.models_to_fit:
+            raise ValueError(
+                "base_learner_grid keys don't match the expected model names. base_learner_grid "
+                f"keys were expected to be {self.models_to_fit}."
+            )
         self.base_learner_grid = list(ParameterGrid(base_learner_grid))

         self.param_grid = param_grid
@@ -198,94 +209,89 @@ def fit(
         X: Matrix,
         y: Vector,
         w: Vector,
+        X_test: Matrix | None = None,
+        y_test: Vector | None = None,
+        w_test: Vector | None = None,
         oos_method: OosMethod = OVERALL,
         **kwargs,
     ):
         """Run fit with all sets of parameters.

+        ``X_test``, ``y_test`` and ``w_test`` are optional, in case they are passed all the
+        fitted metalearners will be evaluated on it.
+
         ``kwargs`` will be passed through to the :meth:`~metalearners.metalearner.MetaLearner.fit`
         call of each individual MetaLearner.
         """
-        cv = KFold(n_splits=self.cv, shuffle=True, random_state=self.random_state)
+        nuisance_models_no_propensity = set.intersection(
+            set(self.metalearner_factory.nuisance_model_specifications().keys())
+            - {PROPENSITY_MODEL},
+            self.models_to_fit,
+        )

-        nuisance_models_no_propensity = set(
-            self.metalearner_factory.nuisance_model_specifications().keys()
-        ) - {PROPENSITY_MODEL}
+        # We don't need to intersect as treatment models can't be reused
         treatment_models = set(
             self.metalearner_factory.treatment_model_specifications().keys()
         )

-        all_models = set(
-            self.metalearner_factory.nuisance_model_specifications().keys()
-        ) | set(self.metalearner_factory.treatment_model_specifications().keys())
-
         jobs: list[_FitAndScoreJob] = []
-        for cv_index, (train_indices, test_indices) in enumerate(cv.split(X)):
-            X_train = index_matrix(X, train_indices)
-            X_test = index_matrix(X, test_indices)
-            y_train = index_vector(y, train_indices)
-            y_test = index_vector(y, test_indices)
-            w_train = index_vector(w, train_indices)
-            w_test = index_vector(w, test_indices)
-            for base_learners in self.base_learner_grid:
-                nuisance_model_factory = {
-                    model_kind: base_learners[model_kind]
-                    for model_kind in nuisance_models_no_propensity
-                }
-                treatment_model_factory = {
-                    model_kind: base_learners[model_kind]
-                    for model_kind in treatment_models
-                }
-                propensity_model_factory = base_learners.get(PROPENSITY_MODEL, None)
-                base_learner_param_grids = {
-                    model_kind: list(
-                        ParameterGrid(
-                            self.param_grid.get(model_kind, {}).get(
-                                base_learners[model_kind].__name__, {}
-                            )
-                        )
-                    )
-                    for model_kind in all_models
-                }
-                for params in ParameterGrid(base_learner_param_grids):
-                    nuisance_model_params = {
-                        model_kind: params[model_kind]
-                        for model_kind in nuisance_models_no_propensity
-                    }
-                    treatment_model_params = {
-                        model_kind: params[model_kind]
-                        for model_kind in treatment_models
-                    }
-                    propensity_model_params = params.get(PROPENSITY_MODEL, None)
-
-                    ml = self.metalearner_factory(
-                        **self.metalearner_params,
-                        nuisance_model_factory=nuisance_model_factory,
-                        treatment_model_factory=treatment_model_factory,
-                        propensity_model_factory=propensity_model_factory,
-                        nuisance_model_params=nuisance_model_params,
-                        treatment_model_params=treatment_model_params,
-                        propensity_model_params=propensity_model_params,
-                        random_state=self.random_state,
-                    )
-
-                    jobs.append(
-                        _FitAndScoreJob(
-                            metalearner=ml,
-                            X_train=X_train,
-                            y_train=y_train,
-                            w_train=w_train,
-                            X_test=X_test,
-                            y_test=y_test,
-                            w_test=w_test,
-                            oos_method=oos_method,
-                            scoring=self.scoring,
-                            kwargs=kwargs,
-                            cv_index=cv_index,
-                        )
-                    )
+        for base_learners in self.base_learner_grid:
+            nuisance_model_factory = {
+                model_kind: base_learners[model_kind]
+                for model_kind in nuisance_models_no_propensity
+            }
+            treatment_model_factory = {
+                model_kind: base_learners[model_kind] for model_kind in treatment_models
+            }
+            propensity_model_factory = base_learners.get(PROPENSITY_MODEL, None)
+            base_learner_param_grids = {
+                model_kind: list(
+                    ParameterGrid(
+                        self.param_grid.get(model_kind, {}).get(
+                            base_learners[model_kind].__name__, {}
+                        )
+                    )
+                )
+                for model_kind in self.models_to_fit
+            }
+            for params in ParameterGrid(base_learner_param_grids):
+                nuisance_model_params = {
+                    model_kind: params[model_kind]
+                    for model_kind in nuisance_models_no_propensity
+                }
+                treatment_model_params = {
+                    model_kind: params[model_kind] for model_kind in treatment_models
+                }
+                propensity_model_params = params.get(PROPENSITY_MODEL, None)
+
+                ml = self.metalearner_factory(
+                    **self.metalearner_params,
+                    nuisance_model_factory=nuisance_model_factory,
+                    treatment_model_factory=treatment_model_factory,
+                    propensity_model_factory=propensity_model_factory,
+                    nuisance_model_params=nuisance_model_params,
+                    treatment_model_params=treatment_model_params,
+                    propensity_model_params=propensity_model_params,
+                    random_state=self.random_state,
+                )
+
+                jobs.append(
+                    _FitAndScoreJob(
+                        metalearner=ml,
+                        X_train=X,
+                        y_train=y,
+                        w_train=w,
+                        X_test=X_test,
+                        y_test=y_test,
+                        w_test=w_test,
+                        oos_method=oos_method,
+                        scoring=self.scoring,
+                        kwargs=kwargs,
+                    )
+                )

         parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)
         raw_results = parallel(delayed(_fit_and_score)(job) for job in jobs)
         self.raw_results_ = raw_results
-        self.cv_results_ = _format_results(results=raw_results)
+        self.results_ = _format_results(results=raw_results)
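
For orientation, a minimal usage sketch of the renamed MetaLearnerGridSearch after this change. The synthetic data, the DRLearner choice, and the model-kind keys ("variant_outcome_model", "propensity_model", "treatment_model") are illustrative assumptions, not part of this diff; what the diff does establish is that train/test data are now passed explicitly instead of being derived from an internal KFold split.

import numpy as np
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from metalearners import DRLearner
from metalearners.grid_search import MetaLearnerGridSearch

# Synthetic data for illustration only.
rng = np.random.default_rng(42)
X = rng.normal(size=(1000, 5))
w = rng.integers(0, 2, size=1000)
y = X[:, 0] + w * X[:, 1] + rng.normal(size=1000)

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, w, random_state=42
)

gs = MetaLearnerGridSearch(
    metalearner_factory=DRLearner,
    metalearner_params={"is_classification": False, "n_variants": 2},
    # One key per model kind that still needs fitting. A model passed via
    # metalearner_params["fitted_nuisance_models"] or ["fitted_propensity_model"]
    # must be omitted here, since the new __init__ subtracts it from models_to_fit.
    base_learner_grid={
        "variant_outcome_model": [LinearRegression, LGBMRegressor],
        "propensity_model": [LGBMClassifier],
        "treatment_model": [LGBMRegressor],
    },
    # param_grid maps model kind -> base-learner class name -> parameter grid.
    param_grid={
        "treatment_model": {"LGBMRegressor": {"n_estimators": [5, 10]}},
        "propensity_model": {"LGBMClassifier": {"n_estimators": [5]}},
    },
)

# Test data are optional; when provided, held-out scores appear as "test_*"
# columns of the results DataFrame (now gs.results_, no longer gs.cv_results_).
gs.fit(X_train, y_train, w_train, X_test, y_test, w_test)
print(gs.results_)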