From 7de96cbd8fcc888ee0797fa6871ab2ae7e3147e8 Mon Sep 17 00:00:00 2001 From: ephoris Date: Wed, 28 Aug 2024 13:04:16 -0400 Subject: [PATCH] [Feat] Change callback functions to return state --- endure.py | 4 ++-- endure/util/trainer.py | 4 ++-- jobs/ltune_train.py | 12 +++++------- jobs/mlos_exp_runs.py | 36 ++++++++++++++++++++++++------------ requirements.txt | 1 + 5 files changed, 34 insertions(+), 23 deletions(-) mode change 100644 => 100755 jobs/mlos_exp_runs.py diff --git a/endure.py b/endure.py index 84a2bf3..c2011d0 100755 --- a/endure.py +++ b/endure.py @@ -21,7 +21,7 @@ def __init__(self, config: dict[str, Any]) -> None: format=config["log"]["format"], datefmt=config["log"]["datefmt"] ) self.log: logging.Logger = logging.getLogger(config["log"]["name"]) - self.log.setLevel(logging.getLevelName(config["log"]["level"])) + self.log.setLevel(getattr(logging, config["log"]["level"])) log_level = logging.getLevelName(self.log.getEffectiveLevel()) self.log.debug(f"Log level: {log_level}") @@ -40,7 +40,7 @@ def run(self): for job_name in jobs_list: job = jobs.get(job_name, None) if job is None: - self.log.warn(f"No job associated with {job_name}") + self.log.warning(f"No job associated with {job_name}") continue job = job(config) _ = job.run() diff --git a/endure/util/trainer.py b/endure/util/trainer.py index af6b97f..6db7e19 100644 --- a/endure/util/trainer.py +++ b/endure/util/trainer.py @@ -25,7 +25,7 @@ def __init__( model_test_kwargs: dict[str, Any] = {}, disable_tqdm: bool = False, no_checkpoint: bool = False, - train_callback: Optional[Callable[[dict], None]] = None, + train_callback: Optional[Callable[[dict], dict]] = None, ) -> None: self.log = log self.model = model @@ -95,7 +95,7 @@ def _train_loop(self) -> float: self.scheduler.step() if self.train_callback is not None: - self.train_callback(self.model_train_kwargs) + self.model_train_kwargs = self.train_callback(self.model_train_kwargs) if self.train_len == 0: self.train_len = batch + 1 diff --git 
a/jobs/ltune_train.py b/jobs/ltune_train.py index cf138c0..f1bf8ab 100644 --- a/jobs/ltune_train.py +++ b/jobs/ltune_train.py @@ -120,28 +120,26 @@ def gumbel_temp_schedule( train_kwargs: dict, decay_rate: float = 0.95, floor: float = 0.01, - ) -> None: + ) -> dict: train_kwargs["temp"] *= decay_rate if train_kwargs["temp"] < floor: train_kwargs["temp"] = floor - return + return train_kwargs @staticmethod def reinmax_temp_schedule( train_kwargs: dict, decay_rate: float = 0.9, floor: float = 1, - ) -> None: + ) -> dict: train_kwargs["temp"] *= decay_rate if train_kwargs["temp"] < floor: train_kwargs["temp"] = floor - return + return train_kwargs - def get_train_callback(self) -> Optional[Callable[[dict], None]]: - if not self.design == Policy.KHybrid: - return None + def get_train_callback(self) -> Optional[Callable[[dict], dict]]: if self.config["ltune"]["model"]["categorical_mode"] == "reinmax": return lambda train_kwargs: self.reinmax_temp_schedule(train_kwargs) # default train_callback will be gumbel softmax diff --git a/jobs/mlos_exp_runs.py b/jobs/mlos_exp_runs.py old mode 100644 new mode 100755 index d9438f1..fe0d9cc --- a/jobs/mlos_exp_runs.py +++ b/jobs/mlos_exp_runs.py @@ -9,8 +9,8 @@ from endure.lsm.types import LSMBounds, LSMDesign, Policy, System, Workload from mlos_core.optimizers import SmacOptimizer -NUM_SAMPLES = 100 -NUM_ROUNDS = 20 +NUM_ROUNDS = 100 +NUM_TRIALS = 10 class ExperimentMLOS: @@ -64,7 +64,8 @@ def _create_optimizer(self, parameter_space: CS.ConfigurationSpace): def _train_model( self, - workload_id: int, + wl_id: int, + trial: int, workload: Workload, system: System, num_rounds: int = NUM_ROUNDS, @@ -81,20 +82,26 @@ def _train_model( optimizer.register( configs=suggestion, scores=pd.DataFrame([{"cost": cost}]) ) - self.log.info(f"Round {round}: Cost: {cost}") - self.db.log_round(workload_id, round, design, cost) + self.log.info(f"[ID {wl_id}][Trial {trial}][Round {round}] Cost: {cost}") + self.db.log_round(wl_id, trial, round, design, 
cost) return def run(self) -> None: - for _ in range(NUM_SAMPLES): - workload = Workload(*self.gen._sample_workload(4)) - system = self.gen._sample_system() + system = System() + for rep_wl in self.config["workloads"]: + workload = Workload( + z0=rep_wl["z0"], + z1=rep_wl["z1"], + q=rep_wl["q"], + w=rep_wl["w"], + ) row_id = self.db.log_workload(workload, system) self.log.info(f"Workload: {workload}") self.log.info(f"System: {system}") - self.log.info(f"Environment ID: {row_id}") - self._train_model(row_id, workload, system) + for trial in range(NUM_TRIALS): + self.log.info(f"(Workload ID, Trial): ({row_id}, {trial})") + self._train_model(row_id, trial, workload, system) return @@ -127,6 +134,7 @@ def __init__(self, config: dict, db_path: str = "mlos_exp.db") -> None: CREATE TABLE IF NOT EXISTS tunings ( idx INTEGER PRIMARY KEY AUTOINCREMENT, env_id INTEGER, + trial INTEGER, round INTEGER, bits_per_elem REAL, size_ratio INTEGER, @@ -182,6 +190,7 @@ def log_workload(self, workload: Workload, system: System) -> int: def log_round( self, workload_id: int, + trial: int, round: int, design: LSMDesign, cost: float, @@ -191,6 +200,7 @@ def log_round( """ INSERT INTO tunings ( env_id, + trial, round, bits_per_elem, size_ratio, @@ -200,9 +210,11 @@ def log_round( kap15, kap16, kap17, kap18, kap19, cost ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, - ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) """, - (workload_id, round, design.h, int(design.T)) + tuple(design.K) + (cost,), + (workload_id, trial, round, design.h, int(design.T)) + + tuple(design.K) + + (cost,), ) self.connector.commit() diff --git a/requirements.txt b/requirements.txt index a9dee5e..3dce06e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,3 +12,4 @@ toml torch torchdata tqdm +mlos-core