Skip to content

Commit

Permalink
[ADD] Enable long running regression (automl#251)
Browse files Browse the repository at this point in the history
* Early stop on metric

* Enable long run regression

* Move from deterministic score to lower bound
  • Loading branch information
franchuterivera authored Jun 16, 2021
1 parent 6a8155f commit eda89f4
Show file tree
Hide file tree
Showing 6 changed files with 267 additions and 6 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/long_regression_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
name: Tests

on:
schedule:
# Every Tuesday at 7AM UTC
# TODO: temporarily set to every day just for the PR
#- cron: '0 07 * * 2'
- cron: '0 07 * * *'


jobs:
ubuntu:

runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.8]
fail-fast: false

steps:
- uses: actions/checkout@v2
with:
ref: development
- name: Setup Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install test dependencies
run: |
git submodule update --init --recursive
python -m pip install --upgrade pip
pip install -e .[test]
- name: Run tests
run: |
python -m pytest --durations=200 cicd/test_preselected_configs.py -vs
2 changes: 2 additions & 0 deletions autoPyTorch/api/base_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,6 +838,8 @@ def _search(
self._metric = get_metrics(
names=[optimize_metric], dataset_properties=dataset_properties)[0]

self.pipeline_options['optimize_metric'] = optimize_metric

self.search_space = self.get_search_space(dataset)

budget_config: Dict[str, Union[float, str]] = {}
Expand Down
5 changes: 3 additions & 2 deletions autoPyTorch/pipeline/components/training/trainer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,7 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic
self.run_summary = RunSummary(
total_parameter_count,
trainable_parameter_count,
optimize_metric=None if not X['metrics_during_training'] else X.get('optimize_metric'),
)

epoch = 1
Expand Down Expand Up @@ -329,9 +330,9 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> 'TrainerChoic

# wrap up -- add score if not evaluating every epoch
if not self.eval_valid_each_epoch(X):
val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'])
val_loss, val_metrics = self.choice.evaluate(X['val_data_loader'], epoch, writer)
if 'test_data_loader' in X and X['val_data_loader']:
test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'])
test_loss, test_metrics = self.choice.evaluate(X['test_data_loader'], epoch, writer)
self.run_summary.add_performance(
epoch=epoch,
start_time=start_time,
Expand Down
27 changes: 23 additions & 4 deletions autoPyTorch/pipeline/components/training/trainer/base_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from autoPyTorch.constants import REGRESSION_TASKS
from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent
from autoPyTorch.pipeline.components.training.metrics.metrics import CLASSIFICATION_METRICS, REGRESSION_METRICS
from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score
from autoPyTorch.utils.implementations import get_loss_weight_strategy

Expand Down Expand Up @@ -61,6 +62,7 @@ def __init__(
self,
total_parameter_count: float,
trainable_parameter_count: float,
optimize_metric: Optional[str] = None,
):
"""
A useful object to track performance per epoch.
Expand All @@ -77,6 +79,7 @@ def __init__(

self.total_parameter_count = total_parameter_count
self.trainable_parameter_count = trainable_parameter_count
self.optimize_metric = optimize_metric

# Allow to track the training performance
self.performance_tracker['train_loss'] = {}
Expand Down Expand Up @@ -116,10 +119,26 @@ def add_performance(self,
self.performance_tracker['test_metrics'][epoch] = test_metrics

def get_best_epoch(self, loss_type: str = 'val_loss') -> int:
    """
    Return the epoch (epochs start at 1) with the best tracked performance.

    If an ``optimize_metric`` was given to this run summary, the validation
    metric of that name is preferred over the loss and ``loss_type`` is
    ignored. Otherwise the epoch with the smallest ``loss_type`` value wins.

    Args:
        loss_type (str):
            Key of ``self.performance_tracker`` holding the per-epoch loss
            to minimise when no ``optimize_metric`` is available.

    Returns:
        int:
            The 1-based index of the best epoch.

    Raises:
        KeyError:
            If ``optimize_metric`` is neither a known classification nor a
            known regression metric, or if an epoch is missing from the tracker.
        ValueError:
            If no epoch has been tracked yet (raised by numpy on empty input).
    """
    if self.optimize_metric is None:
        # No validation score is computed during training: fall back to the loss
        tracked = self.performance_tracker[loss_type]
        per_epoch = [tracked[e] for e in range(1, len(tracked) + 1)]
        select_best = np.argmin
    else:
        # If we compute validation scores, prefer the performance
        # metric to the loss
        if self.optimize_metric in CLASSIFICATION_METRICS:
            scorer = CLASSIFICATION_METRICS[self.optimize_metric]
        else:
            scorer = REGRESSION_METRICS[self.optimize_metric]
        # Some metrics maximize, other minimize!
        select_best = np.argmax if scorer._sign > 0 else np.argmin
        tracked = self.performance_tracker['val_metrics']
        per_epoch = [tracked[e][self.optimize_metric] for e in range(1, len(tracked) + 1)]

    # The list is 0-indexed whereas epochs start at 1
    return int(select_best(per_epoch)) + 1

def get_last_epoch(self) -> int:
if 'train_loss' not in self.performance_tracker:
Expand Down
11 changes: 11 additions & 0 deletions cicd/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Continuous integration and continuous delivery/deployment

This part of the code is responsible for making sure that we can perform reliable NAS.
To this end, we rely on pytest to run some long-running configurations from both
the greedy portfolio and the default configuration.

```
python -m pytest cicd/test_preselected_configs.py -vs
```
193 changes: 193 additions & 0 deletions cicd/test_preselected_configs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
import copy
import logging.handlers
import os
import random
import tempfile
import time

import numpy as np

import openml

import pytest

import sklearn.datasets
import sklearn.model_selection

import torch

from autoPyTorch.automl_common.common.utils.backend import create
from autoPyTorch.data.tabular_validator import TabularInputValidator
from autoPyTorch.datasets.resampling_strategy import (
    CrossValTypes,
    HoldoutValTypes,
)
from autoPyTorch.datasets.tabular_dataset import TabularDataset
from autoPyTorch.optimizer.utils import read_return_initial_configurations
from autoPyTorch.pipeline.components.training.metrics.metrics import (
    accuracy,
    balanced_accuracy,
    roc_auc,
)
from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
from autoPyTorch.utils.pipeline import get_dataset_requirements


def get_backend_dirs_for_openml_task(openml_task_id):
    """
    Build the backend directory paths used for a given OpenML task.

    Both paths live under the system temporary directory and embed the task id
    together with the current timestamp. Nothing is created on disk here.

    Args:
        openml_task_id: identifier of the OpenML task, embedded in both names.

    Returns:
        A ``(temporary_directory, output_directory)`` tuple of path strings.
    """
    tmp_root = tempfile.gettempdir()
    # The timestamp is read once per path, first for "tmp" and then for "out"
    temporary_directory, output_directory = (
        os.path.join(tmp_root, f"{prefix}_{openml_task_id}_{time.time()}")
        for prefix in ("tmp", "out")
    )
    return temporary_directory, output_directory


def get_fit_dictionary(openml_task_id):
    """
    Build the dictionary that is handed to ``pipeline.fit`` for an OpenML task.

    The function seeds python, numpy and torch, downloads the dataset of the
    task, splits it into train/test, wraps it in a ``TabularDataset`` and
    stores that datamanager through a freshly created backend.

    Args:
        openml_task_id: identifier of the OpenML task to load. It is also
            reused as the ``num_run`` of the fit dictionary.

    Returns:
        dict: the fit dictionary. It contains the training tensors, the
        indices of the first split, the dataset properties, the backend and
        the fixed training settings (epochs, early stopping, device, ...).
    """
    # Make sure everything from here onwards is reproducible
    # Add CUDA for future testing also
    seed = 42
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

    task = openml.tasks.get_task(openml_task_id)
    temporary_directory, output_directory = get_backend_dirs_for_openml_task(openml_task_id)
    # Keep the folders after termination, the datamanager saved below lives there
    backend = create(
        temporary_directory=temporary_directory,
        output_directory=output_directory,
        delete_tmp_folder_after_terminate=False,
        delete_output_folder_after_terminate=False,
        prefix='autoPyTorch'
    )
    X, y = sklearn.datasets.fetch_openml(data_id=task.dataset_id, return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=seed)
    # The validator is fitted on a copy of the complete data (train and test)
    validator = TabularInputValidator(
        is_classification='classification' in task.task_type.lower()).fit(X.copy(), y.copy())
    datamanager = TabularDataset(
        dataset_name=openml.datasets.get_dataset(task.dataset_id, download_data=False).name,
        X=X_train, Y=y_train,
        validator=validator,
        X_test=X_test, Y_test=y_test,
        # Follow the estimation procedure of the task when it asks for cross validation
        resampling_strategy=CrossValTypes.stratified_k_fold_cross_validation
        if 'cross' in str(task.estimation_procedure) else HoldoutValTypes.holdout_validation
    )

    info = datamanager.get_required_dataset_info()

    dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info))
    fit_dictionary = {
        'X_train': datamanager.train_tensors[0],
        'y_train': datamanager.train_tensors[1],
        # Only the first split is used, also when the strategy is cross validation
        'train_indices': datamanager.splits[0][0],
        'val_indices': datamanager.splits[0][1],
        'dataset_properties': dataset_properties,
        'num_run': openml_task_id,
        'device': 'cpu',
        'budget_type': 'epochs',
        'epochs': 200,
        'torch_num_threads': 1,
        'early_stopping': 100,
        # Same root as the backend directories instead of a hard-coded POSIX path
        'working_dir': tempfile.gettempdir(),
        'use_tensorboard_logger': False,
        'metrics_during_training': True,
        'split_id': 0,
        'backend': backend,
        'logger_port': logging.handlers.DEFAULT_TCP_LOGGING_PORT,
    }
    backend.save_datamanager(datamanager)
    return fit_dictionary


@pytest.mark.parametrize(
    'openml_task_id,configuration,scorer,lower_bound_score',
    (
        # Australian
        (146818, 0, balanced_accuracy, 0.85),
        (146818, 1, roc_auc, 0.90),
        (146818, 2, balanced_accuracy, 0.80),
        (146818, 3, balanced_accuracy, 0.85),
        # credit-g
        (31, 0, accuracy, 0.75),
        (31, 1, accuracy, 0.75),
        (31, 2, accuracy, 0.75),
        (31, 3, accuracy, 0.70),
        (31, 4, accuracy, 0.70),
        # segment
        (146822, 'default', accuracy, 0.90),
        # kr-vs-kp
        (3, 'default', accuracy, 0.90),
        # vehicle
        (53, 'default', accuracy, 0.75),
    ),
)
def test_can_properly_fit_a_config(openml_task_id, configuration, scorer, lower_bound_score):
    """
    Fit a preselected configuration on an OpenML task and sanity check the result.

    Three properties are checked:
        * the validation score reaches at least ``lower_bound_score``,
        * the training metric improves over the epochs,
        * no layer of the network outputs only zeros in train mode.

    Args:
        openml_task_id: OpenML task providing the data.
        configuration: ``'default'`` for the default configuration, else the
            index of the configuration in the greedy portfolio.
        scorer: metric used both to optimize and to score the pipeline.
        lower_bound_score: minimum validation score that must be reached.
    """
    fit_dictionary = get_fit_dictionary(openml_task_id)
    fit_dictionary['additional_metrics'] = [scorer.name]
    fit_dictionary['optimize_metric'] = scorer.name

    pipeline = TabularClassificationPipeline(
        dataset_properties=fit_dictionary['dataset_properties'])
    cs = pipeline.get_hyperparameter_search_space()
    if configuration == 'default':
        config = cs.get_default_configuration()
    else:
        # Else configuration indicates what index of the greedy config
        config = read_return_initial_configurations(
            config_space=cs,
            portfolio_selection="greedy",
        )[configuration]
    pipeline.set_hyperparameters(config)
    pipeline.fit(copy.deepcopy(fit_dictionary))

    # First we make sure the performance reaches a lower bound.
    # As we use the validation performance for early stopping, this is
    # not the true generalization performance, but our goal is to test
    # that we can learn the data and capture wrong configurations

    # Sadly, when using batch norm we have results that are dependent on the current
    # torch manual seed. Set seed zero here to make this test reproducible
    torch.manual_seed(0)
    val_indices = fit_dictionary['val_indices']
    train_data, _ = fit_dictionary['backend'].load_datamanager().train_tensors
    predictions = pipeline.predict(train_data[val_indices])
    score = scorer(fit_dictionary['y_train'][val_indices], predictions)
    # pytest.approx only supports == and !=, ordering comparisons raise a TypeError,
    # so the raw score is compared against the bound
    assert score >= lower_bound_score, (
        f"{scorer.name}={score} is below the expected lower bound {lower_bound_score}"
    )

    run_summary = pipeline.named_steps['trainer'].run_summary

    # Then check that the training progressed nicely
    # We fit a line to have the trajectory-tendency
    # Some epochs might be bad, but overall we should make progress
    train_metrics = run_summary.performance_tracker['train_metrics']
    train_scores = [train_metrics[e][scorer.name] for e in range(1, len(train_metrics) + 1)]
    slope, _ = np.polyfit(np.arange(len(train_scores)), train_scores, 1)
    if scorer._sign > 0:
        # We expect an increasing trajectory of training
        assert train_scores[0] < train_scores[-1]
        assert slope > 0
    else:
        # We expect a decreasing trajectory of training
        assert train_scores[0] > train_scores[-1]
        assert slope < 0

    # We do not expect the network to output zeros during training.
    # We add this check to prevent a dropout bug we had, where dropout probability
    # was a bool, not a float
    network = pipeline.named_steps['network'].network
    network.train()
    global_accumulator = {}

    def forward_hook(module, X_in, X_out):
        global_accumulator[f"{id(module)}_{module.__class__.__name__}"] = torch.mean(X_out)

    hook_handles = [module.register_forward_hook(forward_hook) for module in network.modules()]
    try:
        pipeline.predict(train_data[val_indices])
    finally:
        # Do not leave the hooks attached to the fitted network
        for handle in hook_handles:
            handle.remove()

    for module_name, mean_tensor in global_accumulator.items():
        # The global accumulator has the output of each layer
        # of the network. If an output of any layer is zero, this
        # check will fail
        assert mean_tensor != 0, module_name

0 comments on commit eda89f4

Please sign in to comment.