diff --git a/vissl/hooks/log_hooks.py b/vissl/hooks/log_hooks.py index 3fcaa462e..a3f66f830 100644 --- a/vissl/hooks/log_hooks.py +++ b/vissl/hooks/log_hooks.py @@ -18,6 +18,7 @@ from fvcore.common.file_io import PathManager from vissl.utils.checkpoint import is_checkpoint_phase from vissl.utils.env import get_machine_local_and_dist_rank +from vissl.utils.events import VisslEventStorage from vissl.utils.io import create_file_symlink, save_file from vissl.utils.logger import log_gpu_stats from vissl.utils.perf_stats import PerfStats @@ -153,6 +154,7 @@ def on_update(self, task: "tasks.ClassyTask") -> None: train_phase_idx = task.train_phase_idx log_freq = task.config["LOG_FREQUENCY"] iteration = task.iteration + evt_stg: VisslEventStorage = task.event_storage if torch.cuda.is_available(): peak_mem_used = int(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0) @@ -172,22 +174,21 @@ def on_update(self, task: "tasks.ClassyTask") -> None: avg_time = sum(batch_times) / len(batch_times) eta_secs = avg_time * (task.max_iteration - iteration) - eta_string = str(datetime.timedelta(seconds=int(eta_secs))) if isinstance(task.optimizer.options_view.lr, set): lr_val = list(task.optimizer.options_view.lr) else: lr_val = round(task.optimizer.options_view.lr, 5) batch_time = int(1000.0 * avg_time) rank = get_rank() - log_str = ( - f"Rank: {rank}; " - f"[ep: {train_phase_idx}] " - f"iter: {iteration}; " - f"lr: {lr_val}; " - f"loss: {loss_val}; " - f"btime(ms): {batch_time}; " - f"eta: {eta_string}; " - f"peak_mem: {peak_mem_used}M" + evt_stg.put_scalars( + rank=rank, + epoch=train_phase_idx, + iteration=iteration, + lr=lr_val, + loss=loss_val, + batch_time=batch_time, + eta=eta_secs, + peak_mem_used=peak_mem_used, ) if self.btime_freq and len(batch_times) >= self.btime_freq: rolling_avg_time = ( @@ -200,12 +201,11 @@ def on_update(self, task: "tasks.ClassyTask") -> None: datetime.timedelta(seconds=int(rolling_eta_secs)) ) rolling_btime = int(1000.0 * rolling_avg_time) - log_str = ( - f"{log_str}; " - f"btime({self.btime_freq}iters): {rolling_btime} ms; " - f"rolling_eta: {rolling_eta_str}" + evt_stg.put_scalars( + rolling_btime=rolling_btime, rolling_eta=rolling_eta_str ) - logging.info(log_str) + for writer in task.event_storage_writers: + writer.write() class LogLossMetricsCheckpointHook(ClassyHook): diff --git a/vissl/trainer/train_task.py b/vissl/trainer/train_task.py index 9a4780732..a614e03d4 100644 --- a/vissl/trainer/train_task.py +++ b/vissl/trainer/train_task.py @@ -111,6 +111,7 @@ def __init__(self, config: AttrDict): # communication as much as possible self.set_ddp_bucket_cap_mb() self.use_gpu = self.device.type == "cuda" + self.event_storage_writers = [] def initiate_vissl_event_storage(self): from vissl.utils.events import create_event_storage, get_event_storage @@ -118,6 +119,13 @@ def initiate_vissl_event_storage(self): create_event_storage() self._event_storage = get_event_storage() + def build_event_storage_writers(self): + from vissl.utils.events import JsonWriter + + self.event_storage_writers = [ + JsonWriter(f"{self.checkpoint_folder}/stdout.json") + ] + @property def event_storage(self): return self._event_storage diff --git a/vissl/trainer/trainer_main.py b/vissl/trainer/trainer_main.py index b6b08aba4..94181ab96 100644 --- a/vissl/trainer/trainer_main.py +++ b/vissl/trainer/trainer_main.py @@ -89,6 +89,7 @@ def __init__( self.local_rank, self.distributed_rank = get_machine_local_and_dist_rank() self.task.initiate_vissl_event_storage() + self.task.build_event_storage_writers() self.setup_distributed(self.task.device.type == "cuda") def setup_distributed(self, use_gpu: bool): @@ -186,7 +187,7 @@ def train(self): logging.info("CUDA cache cleared") task = train_step_fn(task) iteration_num += 1 - task.local_iteration_num = iteration_num + task.local_iteration_num = task.event_storage.iter = iteration_num task.run_hooks(SSLClassyHookFunctions.on_step.name) except StopIteration: break diff --git a/vissl/utils/events.py b/vissl/utils/events.py index 7a698b8ad..adc06eaea 100644 --- a/vissl/utils/events.py +++ b/vissl/utils/events.py @@ -1,5 +1,7 @@ +import json from collections import defaultdict +from fvcore.common.file_io import PathManager from fvcore.common.history_buffer import HistoryBuffer @@ -114,3 +116,28 @@ def put_histogram(self, hist_name, hist_tensor, bins=1000): def put_image(self, img_name, img_tensor): # implement later for tensorboard return NotImplementedError + + +class JsonWriter(VisslEventWriter): + def __init__(self, json_file): + """ + Args: + json_file: path to the json file. New data will be appended if the file + exists. + """ + self._file_handle = PathManager.open(json_file, "a") + + def write(self): + storage: VisslEventStorage = get_event_storage() + to_save = defaultdict(dict) + + for k, (v, iter) in storage.latest().items(): + # keep scalars that have not been written + to_save[iter][k] = v + for itr, scalars_per_iter in to_save.items(): + scalars_per_iter["iteration"] = itr + self._file_handle.write(json.dumps(scalars_per_iter, sort_keys=True) + "\n") + self._file_handle.flush() + + def close(self): + self._file_handle.close()