enabled custom evaluators with states via update function #52

Merged · 3 commits · Mar 20, 2024
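
For context, this is roughly how the new stateful-evaluator hook is meant to be used: evaluate() in commons.py now calls an optional update(pl, scores) method on the evaluator after every evaluation, and reports all-NaN scores to that method when the evaluation raises. The sketch below is modelled on the Evaluator class added to the test suite further down; the class name and the commented-out fit call are illustrative, not part of the library API.

import numpy as np
import sklearn.model_selection
import naiveautoml


class HistoryEvaluator:
    """Stateful evaluator that remembers every (pipeline, scores) pair it is told about."""

    def __init__(self):
        self.history = []

    def __call__(self, pl, X, y, scoring_functions):
        # cross-validate the candidate pipeline for every requested scoring function
        return {
            s: np.mean(sklearn.model_selection.cross_validate(pl, X, y, scoring=s)["test_score"])
            for s in scoring_functions
        }

    def update(self, pl, results):
        # called back by naiveautoml after each evaluation; failed evaluations arrive with NaN scores
        self.history.append([pl, results])


evaluator = HistoryEvaluator()
naml = naiveautoml.NaiveAutoML(evaluation_fun=evaluator, max_hpo_iterations=10)
# naml.fit(X, y)  # afterwards, evaluator.history parallels naml.history
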
122 changes: 66 additions & 56 deletions python/naiveautoml/commons.py
@@ -220,66 +220,76 @@ def evaluate(self, pl, timeout=None):
warnings.filterwarnings('ignore', module='sklearn')
warnings.filterwarnings('ignore', module='numpy')

if is_pipeline_forbidden(pl):
self.logger.info(f"Preventing evaluation of forbidden pipeline {pl}")
return {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}
try:

process = psutil.Process(os.getpid())
mem = int(process.memory_info().rss / 1024 / 1024)
self.logger.info(f"Initializing evaluation of {pl}. Current memory consumption {mem}MB. Now awaiting results.")
if is_pipeline_forbidden(pl):
self.logger.info(f"Preventing evaluation of forbidden pipeline {pl}")
return {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}

start_outer = time.time()
spl = str(pl)
if self.use_caching and spl in self.cache:
out = {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}
out[get_scoring_name(self.scoring)] = np.round(np.mean(self.cache[spl][1]), 4)
return out
timestamp = time.time()
if timeout is not None:
if timeout > 1:
with pynisher.limit(self.evaluation_fun, wall_time=timeout) as limited_evaluation:
if hasattr(self.evaluation_fun, "errors"):
scores = limited_evaluation(
pl,
self.X,
self.y,
[self.scoring] + self.side_scores,
errors="ignore"
)
else:
scores = limited_evaluation(
pl,
self.X,
self.y,
[self.scoring] + self.side_scores
)
else: # no time left
scores = None
else:
scores = self.evaluation_fun(pl, self.X, self.y, [self.scoring] + self.side_scores)
if scores is None:
return {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}
runtime = time.time() - start_outer

# if scores is a 2-tuple, it is assumed that the evaluator object returned itself (in an altered version)
if isinstance(scores, tuple):
if not isinstance(scores[1], type(self.evaluation_fun)):
raise ValueError(
"If an evaluation function returns an object in its second output,"
"the type must coincide to the previous one!"
)
self.evaluation_fun = scores[1]
scores = scores[0]
process = psutil.Process(os.getpid())
mem = int(process.memory_info().rss / 1024 / 1024)
self.logger.info(
f"Initializing evaluation of {pl}. Current memory consumption {mem}MB. Now awaiting results."
)

if not isinstance(scores, dict):
raise TypeError(f"""
scores is of type {type(scores)} but must be a dictionary with entries for {get_scoring_name(self.scoring)}.
Probably you inserted an evaluation_fun argument that does not return a proper dictionary."""
start_outer = time.time()
spl = str(pl)
if self.use_caching and spl in self.cache:
out = {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}
out[get_scoring_name(self.scoring)] = np.round(np.mean(self.cache[spl][1]), 4)
return out
timestamp = time.time()
if timeout is not None:
if timeout > 1:
with pynisher.limit(self.evaluation_fun, wall_time=timeout) as limited_evaluation:
if hasattr(self.evaluation_fun, "errors"):
scores = limited_evaluation(
pl,
self.X,
self.y,
[self.scoring] + self.side_scores,
errors="ignore"
)

self.logger.info(f"Completed evaluation of {spl} after {runtime}s. Scores are {scores}")
self.tellEvaluation(pl, scores[get_scoring_name(self.scoring)], timestamp)
return {scoring: np.round(np.mean(scores[scoring]), 4) for scoring in scores}
else:
scores = limited_evaluation(
pl,
self.X,
self.y,
[self.scoring] + self.side_scores
)
else: # no time left
scores = None
else:
scores = self.evaluation_fun(pl, self.X, self.y, [self.scoring] + self.side_scores)

# here we give the evaluator the chance to update itself
# this looks funny, but it is done because the evaluation could have been done with a copy of the evaluator
if hasattr(self.evaluation_fun, "update"):
self.evaluation_fun.update(pl, scores)

# if no score was observed, return results here
if scores is None:
return {get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores}
runtime = time.time() - start_outer

if not isinstance(scores, dict):
raise TypeError(f"""
scores is of type {type(scores)} but must be a dictionary
with entries for {get_scoring_name(self.scoring)}. Probably you inserted an
evaluation_fun argument that does not return a proper dictionary."""
)

self.logger.info(f"Completed evaluation of {spl} after {runtime}s. Scores are {scores}")
self.tellEvaluation(pl, scores[get_scoring_name(self.scoring)], timestamp)
return {scoring: np.round(np.mean(scores[scoring]), 4) for scoring in scores}

# if there was an exception, then tell the evaluator function about a nan
except Exception:
if hasattr(self.evaluation_fun, "update"):
self.evaluation_fun.update(pl, {
get_scoring_name(scoring): np.nan for scoring in [self.scoring] + self.side_scores
})
raise


def fullname(o):
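
Besides the new update hook, the rewritten evaluate() keeps the existing probe hasattr(self.evaluation_fun, "errors"): if the attribute is present, the time-limited call additionally passes errors="ignore". Below is a minimal sketch of a function-style evaluator that opts into this, assuming the attribute-based wiring shown in the hunk above; the error_score mapping is an illustration, not library-mandated behaviour.

import numpy as np
import sklearn.model_selection


def lenient_evaluation(pl, X, y, scoring_functions, errors="raise"):
    # aggregate cross-validation scores; with errors="ignore", failing folds score NaN instead of raising
    error_score = np.nan if errors == "ignore" else "raise"
    return {
        s: np.mean(
            sklearn.model_selection.cross_validate(pl, X, y, scoring=s, error_score=error_score)["test_score"]
        )
        for s in scoring_functions
    }


# a plain function can expose the attribute that hasattr(evaluation_fun, "errors") checks for
lenient_evaluation.errors = "ignore"
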
1 change: 0 additions & 1 deletion python/naiveautoml/naiveautoml.py
@@ -714,7 +714,6 @@ def eval_history(self, X, y):
return scores

def predict(self, X):
print(self.pl)
return self.pl.predict(X)

def predict_proba(self, X):
2 changes: 1 addition & 1 deletion python/naiveautoml/searchspace-classification.json
@@ -767,7 +767,7 @@
{
"name": "loss",
"type": "constant",
"value": "auto"
"value": "log_loss"
},
{
"name": "max_bins",
2 changes: 1 addition & 1 deletion python/setup.py
@@ -13,7 +13,7 @@
install_requires=[
'numpy',
'pandas',
'scikit-learn',
'scikit-learn>=1.4',
'configspace<0.7.1',
'scipy',
'pynisher',
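
The two small changes above belong together: newer scikit-learn releases no longer accept the deprecated loss="auto" for histogram gradient boosting, so the search-space constant becomes "log_loss" and the dependency floor is raised to scikit-learn>=1.4. Assuming the search-space entry is passed to sklearn's estimator unchanged, it amounts to the following (illustrative) construction.

from sklearn.ensemble import HistGradientBoostingClassifier

# "log_loss" is the accepted value in current scikit-learn; "auto" was deprecated and later removed
clf = HistGradientBoostingClassifier(loss="log_loss")
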
102 changes: 96 additions & 6 deletions python/test/test_naiveautoml.py
@@ -16,6 +16,8 @@
import openml
import pandas as pd

from typing import Callable


def get_dataset(openmlid, as_numpy = True):
ds = openml.datasets.get_dataset(openmlid)
@@ -32,10 +34,11 @@ def get_dataset(openmlid, as_numpy = True):
y = y.values
print(f"Data is of shape {X.shape}.")
return X, y


class TestNaiveAutoML(unittest.TestCase):

@staticmethod
def setUpClass():
# setup logger for this test suite
logger = logging.getLogger('naml_test')
@@ -225,7 +228,7 @@ def test_naml_results_classification(self, openmlid, exp_runtime, exp_result):
@parameterized.expand([
(41021, 120, 650), # moneyball
#(183, 260, 15), # abalone
(212, 120, 15) # diabetes, has decimal targets
(212, 120, 15) # diabetes, has decimal targets

])
def test_naml_results_regression(self, openmlid, exp_runtime, exp_result):
@@ -245,7 +248,7 @@ def test_naml_results_regression(self, openmlid, exp_runtime, exp_result):
start = time.time()
naml = naiveautoml.NaiveAutoML(
logger_name="naml",
timeout=120,
timeout=75,
max_hpo_iterations=10,
show_progress=True,
task_type="regression",
@@ -321,7 +324,13 @@ def test_individual_scoring(self, openmlid, exp_runtime, exp_result):

# run naml
start = time.time()
naml = naiveautoml.NaiveAutoML(logger_name="naml", max_hpo_iterations=10, show_progress=True, scoring = scoring1, side_scores=[scoring2])
naml = naiveautoml.NaiveAutoML(
logger_name="naml",
max_hpo_iterations=10,
show_progress=True,
scoring = scoring1,
side_scores=[scoring2]
)
naml.fit(X_train, y_train)
end = time.time()
runtime = end - start
@@ -394,4 +403,85 @@ def evaluation(pl, X, y, scoring_functions):
score_mean = np.round(np.mean(scores), 2)
self.assertTrue(runtime_mean <= exp_runtime, msg=f"Permitted runtime exceeded. Expected was {exp_runtime}s but true runtime was {runtime_mean}")
self.assertTrue(score_mean >= exp_result, msg=f"Returned solution was bad. Expected was at least {exp_result} but true avg score was {score_mean}")
self.logger.info(f"Test on dataset {openmlid} finished. Mean runtimes was {runtime_mean}s and avg accuracy was {score_mean}")
self.logger.info(f"Test on dataset {openmlid} finished. Mean runtimes was {runtime_mean}s and avg accuracy was {score_mean}")


@parameterized.expand([
(61, 30, 0.9),
# (188, 60, 0.5), # eucalyptus. Very important because has both missing values and categorical attributes
# (1485, 240, 0.82),
# (1515, 240, 0.85),
# (1468, 120, 0.94),
# (1489, 180, 0.89),
# (23512, 600, 0.65),
# (23517, 600, 0.5),
# (4534, 180, 0.92),
# (4538, 400, 0.66),
# (4134, 400, 0.79),

])
def test_individual_stateful_evaluation(self, openmlid, exp_runtime, exp_result):
X, y = get_dataset(openmlid)
self.logger.info(f"Start result test for NaiveAutoML on classification dataset {openmlid}")

class Evaluator(Callable):

def __init__(self):
self.history = []

def reset(self):
self.history = []

def __call__(self, pl, X, y, scoring_functions):
results = {
s: np.mean(sklearn.model_selection.cross_validate(pl, X, y, scoring=s)["test_score"])
for s in scoring_functions
}
return results

def update(self, pl, results):
self.history.append([pl, results])

scorer = sklearn.metrics.get_scorer("accuracy")
evaluation = Evaluator()

# run naml
scores = []
runtimes = []
for seed in range(1, self.num_seeds + 1):

evaluation.reset()

# create split
self.logger.debug(f"Running test on seed {seed}/{self.num_seeds}")
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)

# run naml
start = time.time()
naml = naiveautoml.NaiveAutoML(logger_name="naml", max_hpo_iterations=10, show_progress=True,
evaluation_fun=evaluation)
naml.fit(X_train, y_train)
end = time.time()
runtime = end - start
runtimes.append(runtime)

# compute test performance
self.logger.debug(
f"finished training on seed {seed} after {int(np.round(runtime))}s. Now computing performance of solution.")
score = scorer(naml, X_test, y_test)
scores.append(score)
self.logger.debug(f"finished test on seed {seed}. Test score for this run is {score}")

self.assertEquals(len(naml.history), len(evaluation.history), "History lengths don't match!")

# check conditions
runtime_mean = int(np.round(np.mean(runtimes)))
score_mean = np.round(np.mean(scores), 2)
self.assertTrue(runtime_mean <= exp_runtime,
msg=f"Permitted runtime exceeded. Expected was {exp_runtime}s but true runtime was {runtime_mean}")

# we also check the score, because the result here *should* be good. if not, the values might not be used
self.assertTrue(score_mean >= exp_result,
msg=f"Returned solution was bad. Expected was at least {exp_result} but true avg score was {score_mean}")
self.logger.info(
f"Test on dataset {openmlid} finished. Mean runtimes was {runtime_mean}s and avg accuracy was {score_mean}")