This repository has been archived by the owner on Mar 19, 2024. It is now read-only.

Commit 5fdea84

Implement a JSON writer for logging
Standard output via logging still needs to be added back.
akainth015 committed Apr 8, 2021
1 parent 5a69b36 commit 5fdea84
Showing 4 changed files with 52 additions and 16 deletions.
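
Taken together, the change routes per-iteration training stats through a VisslEventStorage and flushes them to a JSON-lines file instead of formatting a log string (hence the note above that standard output via logging still needs to be added back). A minimal usage sketch of the new flow, assembled from the names introduced below; the output path and scalar values are illustrative, and the exact behavior of create_event_storage and get_event_storage is assumed from how they are used in this diff:

from vissl.utils.events import JsonWriter, create_event_storage, get_event_storage

# Set up the process-wide event storage, as TrainTask.initiate_vissl_event_storage does.
create_event_storage()
storage = get_event_storage()

# A writer that appends one JSON object per logged iteration to a file.
writer = JsonWriter("/tmp/checkpoints/stdout.json")  # illustrative path

for iteration in range(3):
    storage.iter = iteration                       # the trainer loop keeps this in sync
    storage.put_scalars(rank=0, loss=2.3, lr=0.1)  # what the hook records each update
    writer.write()                                 # flush the latest scalars as one JSON line

writer.close()
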
30 changes: 15 additions & 15 deletions vissl/hooks/log_hooks.py
@@ -18,6 +18,7 @@
 from fvcore.common.file_io import PathManager
 from vissl.utils.checkpoint import is_checkpoint_phase
 from vissl.utils.env import get_machine_local_and_dist_rank
+from vissl.utils.events import VisslEventStorage
 from vissl.utils.io import create_file_symlink, save_file
 from vissl.utils.logger import log_gpu_stats
 from vissl.utils.perf_stats import PerfStats
@@ -153,6 +154,7 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
             train_phase_idx = task.train_phase_idx
             log_freq = task.config["LOG_FREQUENCY"]
             iteration = task.iteration
+            evt_stg: VisslEventStorage = task.event_storage

             if torch.cuda.is_available():
                 peak_mem_used = int(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0)
@@ -172,22 +174,21 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
                 avg_time = sum(batch_times) / len(batch_times)

                 eta_secs = avg_time * (task.max_iteration - iteration)
-                eta_string = str(datetime.timedelta(seconds=int(eta_secs)))
                 if isinstance(task.optimizer.options_view.lr, set):
                     lr_val = list(task.optimizer.options_view.lr)
                 else:
                     lr_val = round(task.optimizer.options_view.lr, 5)
                 batch_time = int(1000.0 * avg_time)
                 rank = get_rank()
-                log_str = (
-                    f"Rank: {rank}; "
-                    f"[ep: {train_phase_idx}] "
-                    f"iter: {iteration}; "
-                    f"lr: {lr_val}; "
-                    f"loss: {loss_val}; "
-                    f"btime(ms): {batch_time}; "
-                    f"eta: {eta_string}; "
-                    f"peak_mem: {peak_mem_used}M"
+                evt_stg.put_scalars(
+                    rank=rank,
+                    epoch=train_phase_idx,
+                    iteration=iteration,
+                    lr=lr_val,
+                    loss=loss_val,
+                    batch_time=batch_time,
+                    eta=eta_secs,
+                    peak_mem_used=peak_mem_used,
                 )
                 if self.btime_freq and len(batch_times) >= self.btime_freq:
                     rolling_avg_time = (
@@ -200,12 +201,11 @@ def on_update(self, task: "tasks.ClassyTask") -> None:
                         datetime.timedelta(seconds=int(rolling_eta_secs))
                     )
                     rolling_btime = int(1000.0 * rolling_avg_time)
-                    log_str = (
-                        f"{log_str}; "
-                        f"btime({self.btime_freq}iters): {rolling_btime} ms; "
-                        f"rolling_eta: {rolling_eta_str}"
+                    evt_stg.put_scalars(
+                        rolling_btime=rolling_btime, rolling_eta=rolling_eta_str
                     )
-            logging.info(log_str)
+            for writer in task.event_storage_writers:
+                writer.write()


 class LogLossMetricsCheckpointHook(ClassyHook):
8 changes: 8 additions & 0 deletions vissl/trainer/train_task.py
@@ -111,13 +111,21 @@ def __init__(self, config: AttrDict):
         # communication as much as possible
         self.set_ddp_bucket_cap_mb()
         self.use_gpu = self.device.type == "cuda"
+        self.event_storage_writers = []

     def initiate_vissl_event_storage(self):
         from vissl.utils.events import create_event_storage, get_event_storage

         create_event_storage()
         self._event_storage = get_event_storage()

+    def build_event_storage_writers(self):
+        from vissl.utils.events import JsonWriter
+
+        self.event_storage_writers = [
+            JsonWriter(f"{self.checkpoint_folder}/stdout.json")
+        ]
+
     @property
     def event_storage(self):
         return self._event_storage
3 changes: 2 additions & 1 deletion vissl/trainer/trainer_main.py
@@ -89,6 +89,7 @@ def __init__(

         self.local_rank, self.distributed_rank = get_machine_local_and_dist_rank()
         self.task.initiate_vissl_event_storage()
+        self.task.build_event_storage_writers()
         self.setup_distributed(self.task.device.type == "cuda")

     def setup_distributed(self, use_gpu: bool):
@@ -186,7 +187,7 @@ def train(self):
logging.info("CUDA cache cleared")
task = train_step_fn(task)
iteration_num += 1
task.local_iteration_num = iteration_num
task.local_iteration_num = task.event_storage.iter = iteration_num
task.run_hooks(SSLClassyHookFunctions.on_step.name)
except StopIteration:
break
27 changes: 27 additions & 0 deletions vissl/utils/events.py
@@ -1,5 +1,7 @@
+import json
 from collections import defaultdict

+from fvcore.common.file_io import PathManager
 from fvcore.common.history_buffer import HistoryBuffer


@@ -114,3 +116,28 @@ def put_histogram(self, hist_name, hist_tensor, bins=1000):
     def put_image(self, img_name, img_tensor):
         # implement later for tensorboard
         return NotImplementedError
+
+
+class JsonWriter(VisslEventWriter):
+    def __init__(self, json_file):
+        """
+        Args:
+            json_file: path to the json file. New data will be appended if the file
+            exists.
+        """
+        self._file_handle = PathManager.open(json_file, "a")
+
+    def write(self):
+        storage: VisslEventStorage = get_event_storage()
+        to_save = defaultdict(dict)
+
+        for k, (v, iter) in storage.latest().items():
+            # keep scalars that have not been written
+            to_save[iter][k] = v
+        for itr, scalars_per_iter in to_save.items():
+            scalars_per_iter["iteration"] = itr
+            self._file_handle.write(json.dumps(scalars_per_iter, sort_keys=True) + "\n")
+        self._file_handle.flush()
+
+    def close(self):
+        self._file_handle.close()
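
Because JsonWriter emits one JSON object per line, the resulting stdout.json can be read back with a plain loop. A small read-back sketch, assuming a file produced by the writer above; the path and the record shown in the comment are illustrative, with keys taken from the scalars the hook records plus the "iteration" field added in write():

import json

# Each line is an independent JSON object, for example (illustrative values):
# {"batch_time": 312, "epoch": 0, "eta": 5400.0, "iteration": 100, "loss": 2.31, "lr": 0.1, "peak_mem_used": 1024, "rank": 0}
with open("/tmp/checkpoints/stdout.json") as f:  # illustrative path
    records = [json.loads(line) for line in f if line.strip()]

loss_by_iteration = {r["iteration"]: r.get("loss") for r in records}
print(loss_by_iteration)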
