Merge pull request #154 from optimas-org/feature/failed_trials

Improve handling of failed evaluations
optimas-org · Feb 12, 2024 · 48d42fc · 48d42fc
2 parents c354790 + 43b7795
commit 48d42fc
Show file tree

Hide file tree

Showing 18 changed files with 473 additions and 136 deletions.
diff --git a/optimas/core/__init__.py b/optimas/core/__init__.py
@@ -1,7 +1,7 @@
 from .evaluation import Evaluation
 from .parameter import Parameter, VaryingParameter, TrialParameter, Objective
 from .task import Task
-from .trial import Trial
+from .trial import Trial, TrialStatus
 
 
 __all__ = [
@@ -12,4 +12,5 @@
     "Objective",
     "Task",
     "Trial",
+    "TrialStatus",
 ]
diff --git a/optimas/core/trial.py b/optimas/core/trial.py
@@ -1,13 +1,23 @@
 """Contains the definition of the Trial class."""
 
 from typing import List, Dict, Optional
+from enum import Enum
 
 import numpy as np
 
 from .parameter import VaryingParameter, Objective, Parameter, TrialParameter
 from .evaluation import Evaluation
 
 
+class TrialStatus(int, Enum):
+    """Enum of trial status, based on the Ax implementation."""
+
+    CANDIDATE = 0
+    RUNNING = 1
+    COMPLETED = 2
+    FAILED = 3
+
+
 class Trial:
     """Defines a trial to be evaluated.
 
@@ -67,6 +77,7 @@ def __init__(
             self._mapped_evaluations[par.name] = None
         for ev in evaluations:
             self._mapped_evaluations[ev.parameter.name] = ev
+        self.mark_as(TrialStatus.CANDIDATE)
 
     @property
     def varying_parameters(self) -> List[VaryingParameter]:
@@ -121,6 +132,36 @@ def custom_parameters(self) -> List[TrialParameter]:
         """Get the list of custom trial parameters."""
         return self._custom_parameters
 
+    @property
+    def status(self) -> TrialStatus:
+        """Get current trial status."""
+        return self._status
+
+    @property
+    def completed(self) -> bool:
+        """Determine whether the trial has been successfully evaluated."""
+        return self._status == TrialStatus.COMPLETED
+
+    @property
+    def failed(self) -> bool:
+        """Determine whether the trial evaluation has failed."""
+        return self._status == TrialStatus.FAILED
+
+    @property
+    def evaluated(self) -> bool:
+        """Determine whether the trial has been evaluated."""
+        return self.completed or self.failed
+
+    def mark_as(self, status) -> None:
+        """Set trial status.
+
+        Parameters
+        ----------
+        status : int
+            A valid trial status (use ``TrialStatus`` enum).
+        """
+        self._status = status
+
     def complete_evaluation(self, evaluation: Evaluation) -> None:
         """Complete the evaluation of an objective or analyzed parameter.
 
@@ -134,6 +175,7 @@ def complete_evaluation(self, evaluation: Evaluation) -> None:
         assert evaluated_parameter in self._mapped_evaluations
         if self._mapped_evaluations[evaluated_parameter] is None:
             self._mapped_evaluations[evaluated_parameter] = evaluation
+        self.mark_as(TrialStatus.COMPLETED)
 
     def parameters_as_dict(self) -> Dict:
         """Get a mapping between names and values of the varying parameters."""
@@ -165,10 +207,3 @@ def analyzed_parameters_as_dict(self) -> Dict:
             ev = self._mapped_evaluations[par.name]
             params[par.name] = (ev.value, ev.sem)
         return params
-
-    def completed(self) -> bool:
-        """Determine whether the trial has been completed."""
-        for par, ev in self._mapped_evaluations.items():
-            if ev is None:
-                return False
-        return True
diff --git a/optimas/evaluators/base.py b/optimas/evaluators/base.py
@@ -21,6 +21,9 @@ class Evaluator:
     n_gpus : int, optional
         The number of GPUs that will be made available for each evaluation. By
         default, 0.
+    fail_on_nan : bool, optional
+        Whether to mark an evaluation as failed if the value of any of the
+        objectives is NaN. By default, ``True``.
 
     """
 
@@ -29,6 +32,7 @@ def __init__(
         sim_function: Callable,
         n_procs: Optional[int] = None,
         n_gpus: Optional[int] = None,
+        fail_on_nan: Optional[bool] = True,
     ) -> None:
         self.sim_function = sim_function
         # If no resources are specified, use 1 CPU an 0 GPUs.
@@ -44,6 +48,7 @@ def __init__(
             n_gpus = 0
         self._n_procs = n_procs
         self._n_gpus = n_gpus
+        self._fail_on_nan = fail_on_nan
         self._initialized = False
 
     def get_sim_specs(
@@ -68,14 +73,14 @@ def get_sim_specs(
             "in": [var.name for var in varying_parameters],
             "out": (
                 [(obj.name, obj.dtype) for obj in objectives]
-                # f is the single float output that LibEnsemble minimizes.
                 + [(par.name, par.dtype) for par in analyzed_parameters]
-                # input parameters
-                + [(var.name, var.dtype) for var in varying_parameters]
+                + [("trial_status", str, 10)]
             ),
             "user": {
                 "n_procs": self._n_procs,
                 "n_gpus": self._n_gpus,
+                "fail_on_nan": self._fail_on_nan,
+                "objectives": [obj.name for obj in objectives],
             },
         }
         return sim_specs

diff --git a/optimas/evaluators/multitask_evaluator.py b/optimas/evaluators/multitask_evaluator.py
@@ -56,10 +56,8 @@ def get_sim_specs(
             self.tasks[0].name: sim_specs_1["user"],
             self.tasks[1].name: sim_specs_2["user"],
         }
-        # Add task name to sim_specs in and out.
-        task_len = max([len(self.tasks[0].name), len(self.tasks[1].name)])
+        # Add task name to sim_specs in.
         sim_specs["in"].append("task")
-        sim_specs["out"].append(("task", str, task_len))
         return sim_specs
 
     def get_libe_specs(self) -> Dict:

diff --git a/optimas/explorations/base.py b/optimas/explorations/base.py
@@ -15,6 +15,7 @@
 from libensemble.alloc_funcs.start_only_persistent import only_persistent_gens
 from libensemble.executors.mpi_executor import MPIExecutor
 
+from optimas.core.trial import TrialStatus
 from optimas.generators.base import Generator
 from optimas.evaluators.base import Evaluator
 from optimas.evaluators.function_evaluator import FunctionEvaluator
@@ -128,7 +129,7 @@ def __init__(
     def history(self) -> pd.DataFrame:
         """Get the exploration history."""
         history = convert_to_dataframe(self._libe_history.H)
-        ordered_columns = ["trial_index"]
+        ordered_columns = ["trial_index", "trial_status"]
         ordered_columns += [p.name for p in self.generator.varying_parameters]
         ordered_columns += [p.name for p in self.generator.objectives]
         ordered_columns += [p.name for p in self.generator.analyzed_parameters]
@@ -147,6 +148,13 @@ def run(self, n_evals: Optional[int] = None) -> None:
             run until the number of evaluations reaches `max_evals`.
 
         """
+        # Store current working directory. It has been observed that sometimes
+        # (especially when using `local_threading`) the working directory
+        # is changed to the exploration directory after the call to `libE`.
+        # As a workaround, the cwd is stored and then set again at the end of
+        # `run`.
+        cwd = os.getcwd()
+
         # Set exit criteria to maximum number of evaluations.
         remaining_evals = self.max_evals - self._n_evals
         if remaining_evals < 1:
@@ -162,7 +170,7 @@ def run(self, n_evals: Optional[int] = None) -> None:
             exit_criteria["sim_max"] = sim_max
 
         # Get initial number of generator trials.
-        n_evals_initial = self.generator.n_completed_trials
+        n_evals_initial = self.generator.n_evaluated_trials
 
         # Create persis_info.
         persis_info = add_unique_random_streams({}, self.sim_workers + 2)
@@ -209,8 +217,11 @@ def run(self, n_evals: Optional[int] = None) -> None:
         self.generator._update(persis_info[1]["generator"])
 
         # Update number of evaluation in this exploration.
-        n_trials_final = self.generator.n_completed_trials
-        self._n_evals += n_trials_final - n_evals_initial
+        n_evals_final = self.generator.n_evaluated_trials
+        self._n_evals += n_evals_final - n_evals_initial
+
+        # Reset `cwd` to initial value before `libE` was called.
+        os.chdir(cwd)
 
     def attach_trials(
         self,
@@ -420,10 +431,24 @@ def attach_evaluations(
                 self.generator._trial_count + n_evals,
                 dtype=int,
             )
+        if "trial_status" not in fields:
+            history_new["trial_status"] = TrialStatus.COMPLETED.name
 
         # Incorporate new history into generator.
         self.generator.incorporate_history(history_new)
 
+    def mark_evaluation_as_failed(self, trial_index):
+        """Mark an already evaluated trial as failed.
+
+        Parameters
+        ----------
+        trial_index : int
+            The index of the trial.
+        """
+        self.generator.mark_trial_as_failed(trial_index)
+        i = np.where(self._libe_history.H["trial_index"] == trial_index)[0][0]
+        self._libe_history.H[i]["trial_status"] = TrialStatus.FAILED.name
+
     def _create_executor(self) -> None:
         """Create libEnsemble executor."""
         self.executor = MPIExecutor()

diff --git a/optimas/gen_functions.py b/optimas/gen_functions.py
@@ -13,6 +13,7 @@
 from libensemble.resources.resources import Resources
 
 from optimas.core import Evaluation
+from optimas.core.trial import TrialStatus
 
 
 def persistent_generator(H, persis_info, gen_specs, libE_info):
@@ -99,11 +100,15 @@ def persistent_generator(H, persis_info, gen_specs, libE_info):
             # Update the GP with latest simulation results
             for i in range(n):
                 trial_index = int(calc_in["trial_index"][i])
+                trial_status = calc_in["trial_status"][i]
                 trial = generator.get_trial(trial_index)
-                for par in objectives + analyzed_parameters:
-                    y = calc_in[par.name][i]
-                    ev = Evaluation(parameter=par, value=y)
-                    trial.complete_evaluation(ev)
+                if trial_status == TrialStatus.FAILED.name:
+                    trial.mark_as(TrialStatus.FAILED)
+                else:
+                    for par in objectives + analyzed_parameters:
+                        y = calc_in[par.name][i]
+                        ev = Evaluation(parameter=par, value=y)
+                        trial.complete_evaluation(ev)
                 # Register trial with unknown SEM
                 generator.tell([trial])
             # Set the number of points to generate to that number:

diff --git a/optimas/generators/ax/developer/multitask.py b/optimas/generators/ax/developer/multitask.py
@@ -38,6 +38,7 @@
     Parameter,
     Task,
     Trial,
+    TrialStatus,
 )
 from .ax_metric import AxMetric
 
@@ -225,10 +226,13 @@ def _incorporate_external_data(self, trials: List[Trial]) -> None:
             ax_trial.run()
             # Incorporate observations.
             for trial in trials_i:
-                objective_eval = {}
-                oe = trial.objective_evaluations[0]
-                objective_eval["f"] = (oe.value, oe.sem)
-                ax_trial.run_metadata[trial.arm_name] = objective_eval
+                if trial.status != TrialStatus.FAILED:
+                    objective_eval = {}
+                    oe = trial.objective_evaluations[0]
+                    objective_eval["f"] = (oe.value, oe.sem)
+                    ax_trial.run_metadata[trial.arm_name] = objective_eval
+                else:
+                    ax_trial.mark_arm_abandoned(trial.arm_name)
             # Mark batch trial as completed.
             ax_trial.mark_completed()
             # Keep track of high-fidelity trials.
@@ -245,10 +249,13 @@ def _complete_evaluations(self, trials: List[Trial]) -> None:
                 "External data can only be loaded into generator before "
                 "initialization."
             )
-            objective_eval = {}
-            oe = trial.objective_evaluations[0]
-            objective_eval["f"] = (oe.value, oe.sem)
-            self.current_trial.run_metadata[trial.arm_name] = objective_eval
+            if trial.status != TrialStatus.FAILED:
+                objective_eval = {}
+                oe = trial.objective_evaluations[0]
+                objective_eval["f"] = (oe.value, oe.sem)
+                self.current_trial.run_metadata[trial.arm_name] = objective_eval
+            else:
+                self.current_trial.mark_arm_abandoned(trial.arm_name)
             if trial.trial_type == self.lofi_task.name:
                 self.returned_lofi_trials += 1
                 if self.returned_lofi_trials == self.n_gen_lofi:
@@ -447,7 +454,7 @@ def _save_model_to_file(self) -> None:
         file_path = os.path.join(
             self._model_history_dir,
             "ax_experiment_at_eval_{}.json".format(
-                self._n_completed_trials_last_saved
+                self._n_evaluated_trials_last_saved
             ),
         )
         save_experiment(

diff --git a/optimas/generators/ax/service/ax_client.py b/optimas/generators/ax/service/ax_client.py
@@ -26,6 +26,9 @@ class AxClientGenerator(AxServiceGenerator):
     analyzed_parameters : list of Parameter, optional
         List of parameters to analyze at each trial, but which are not
         optimization objectives. By default ``None``.
+    abandon_failed_trials : bool, optional
+        Whether failed trials should be abandoned (i.e., not suggested again).
+        By default, ``True``.
     gpu_id : int, optional
         The ID of the GPU in which to run the generator. By default, ``0``.
         This parameter will only have an effect if any ``GenerationStep`` in
@@ -61,6 +64,7 @@ def __init__(
         self,
         ax_client: AxClient,
         analyzed_parameters: Optional[List[Parameter]] = None,
+        abandon_failed_trials: Optional[bool] = True,
         gpu_id: Optional[int] = 0,
         dedicated_resources: Optional[bool] = False,
         save_model: Optional[bool] = True,
@@ -79,6 +83,7 @@ def __init__(
             objectives=objectives,
             analyzed_parameters=analyzed_parameters,
             enforce_n_init=True,
+            abandon_failed_trials=abandon_failed_trials,
             use_cuda=use_cuda,
             gpu_id=gpu_id,
             dedicated_resources=dedicated_resources,