diff --git a/flagscale/auto_tuner/tuner.py b/flagscale/auto_tuner/tuner.py
index 66b9223a6..fc48dbf73 100644
--- a/flagscale/auto_tuner/tuner.py
+++ b/flagscale/auto_tuner/tuner.py
@@ -123,7 +123,7 @@ def tune(self):
         # get best strategy
         best_strategy = self.get_best()
         if best_strategy:
-            self.logger.info(f"Best strategy tuned so far: {best_strategy}, and performance is {best_strategy["performance"]}.")
+            self.logger.info(f"Best strategy tuned so far: {best_strategy}, and performance is {best_strategy['performance']}.")
         else:
             self.logger.info(f"No strategy can run so far.")
         tuner_end_time = time.time()
diff --git a/flagscale/auto_tuner/utils.py b/flagscale/auto_tuner/utils.py
index 3e4409e5c..d1fed7c3a 100644
--- a/flagscale/auto_tuner/utils.py
+++ b/flagscale/auto_tuner/utils.py
@@ -67,13 +67,15 @@ def sort_by_memory(strategy):
         -strategy["use_distributed_optimizer"],
         strategy["micro_batch_size"],
         -strategy["use_recompute"],
+        -strategy["sequence_parallel"],
     )
 
 
 def sort_by_performance(strategy):
     return (
-        -strategy["micro_batch_size"],
-        strategy["use_recompute"],
-        strategy["tensor_model_parallel_size"],
+        -strategy["use_recompute"],
+        (strategy["tensor_model_parallel_size"] % 4),
+        (strategy["micro_batch_size"] % 4),
         strategy["pipeline_model_parallel_size"],
+        -strategy["sequence_parallel"],
     )
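
For context, a minimal sketch (not part of the patch) of how the revised sort_by_performance key function could order candidate strategies. Only the key function below mirrors the patched utils.py; the sample strategy dicts and the sorted() call are illustrative assumptions about how the tuner applies it.

def sort_by_performance(strategy):
    # Tuple key from the patched utils.py: earlier elements take precedence.
    return (
        -strategy["use_recompute"],
        (strategy["tensor_model_parallel_size"] % 4),
        (strategy["micro_batch_size"] % 4),
        strategy["pipeline_model_parallel_size"],
        -strategy["sequence_parallel"],
    )

# Hypothetical candidate strategies (field values chosen only for illustration).
candidates = [
    {"use_recompute": 0, "tensor_model_parallel_size": 8, "micro_batch_size": 4,
     "pipeline_model_parallel_size": 2, "sequence_parallel": 0},
    {"use_recompute": 1, "tensor_model_parallel_size": 2, "micro_batch_size": 1,
     "pipeline_model_parallel_size": 1, "sequence_parallel": 1},
]

# Sort candidates by the tuple key; ties on one element fall through to the next.
for strategy in sorted(candidates, key=sort_by_performance):
    print(strategy)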