Skip to content

Commit

Permalink
Update autotuner environment variable name from AUTO_TUNER to FLAGSCALE_AUTOTUNER
Browse files — browse the repository at this point in the history
  • Loading branch information
caozhou committed May 31, 2024
1 parent fa85bd7 commit d27d136
Show file tree
Hide file tree
Showing 3 changed files with 5 additions and 4 deletions.
4 changes: 2 additions & 2 deletions flagscale/train/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ def pretrain(train_valid_test_dataset_provider,

print_datetime('after training is done')

if not os.environ.get("AUTO_TUNER", False):
if not os.environ.get("FLAGSCALE_AUTOTUNER", False):
if args.save and iteration != 0 and iteration % args.save_interval != 0:
save_checkpoint(
iteration,
Expand Down Expand Up @@ -951,7 +951,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, decoupled_learning_r
total_loss_dict[skipped_iters_key] = 0
total_loss_dict[nan_iters_key] = 0
print_rank_last(log_string)
if not os.environ.get("AUTO_TUNER", False):
if not os.environ.get("FLAGSCALE_AUTOTUNER", False):
if report_memory_flag and learning_rate > 0.0:
# Report memory after optimizer state has been initialized.
if torch.distributed.get_rank() == 0:
Expand Down
4 changes: 2 additions & 2 deletions megatron/megatron/training/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def pretrain(train_valid_test_dataset_provider,
extra_valid_dataset_provider)

print_datetime('after training is done')
if not os.environ.get("AUTO_TUNER", False):
if not os.environ.get("FLAGSCALE_AUTOTUNER", False):
if args.save and iteration != 0 and iteration % args.save_interval != 0:
save_checkpoint(iteration, model, optimizer, opt_param_scheduler,
num_floating_point_operations_so_far, checkpointing_context)
Expand Down Expand Up @@ -952,7 +952,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, decoupled_learning_r
total_loss_dict[skipped_iters_key] = 0
total_loss_dict[nan_iters_key] = 0
print_rank_last(log_string)
if not os.environ.get("AUTO_TUNER", False):
if not os.environ.get("FLAGSCALE_AUTOTUNER", False):
if report_memory_flag and learning_rate > 0.:
# Report memory after optimizer state has been initialized.
if torch.distributed.get_rank() == 0:
Expand Down
1 change: 1 addition & 0 deletions megatron/megatron/training/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def report_memory(name):
string += ' | max reserved: {}'.format(
torch.cuda.max_memory_reserved() / mega_bytes)
if not os.environ.get("FLAGSCALE_AUTOTUNER", False):
# Each rank prints the memory report.
if mpu.get_data_parallel_rank() == 0:
print("[Rank {}] {}".format(torch.distributed.get_rank(), string),
flush=True)
Expand Down

0 comments on commit d27d136

Please sign in to comment.