From 0ef00c76bb4195fa8c515a004a67d0fb1b12fcd3 Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 23 Apr 2018 09:52:37 +0200 Subject: [PATCH 1/2] lowered tensorflow version --- models/__init__.py | 0 models/tf_details/resnet_details.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 models/__init__.py diff --git a/models/__init__.py b/models/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/models/tf_details/resnet_details.py b/models/tf_details/resnet_details.py index eb2c45b..2c8d31d 100644 --- a/models/tf_details/resnet_details.py +++ b/models/tf_details/resnet_details.py @@ -33,7 +33,7 @@ def can_train(): warnings.simplefilter(action='ignore', category=FutureWarning) from tensorflow import __version__ as tfv - required = "1.7.0" + required = "1.6.0" #only require major and minor release number as the patch number may contain 'rc' etc if versiontuple(tfv,2) >= versiontuple(required,2): From 227a4d2db727500c57f81fa819846b1a906e74ea Mon Sep 17 00:00:00 2001 From: Peter Steinbach Date: Mon, 23 Apr 2018 11:05:39 +0200 Subject: [PATCH 2/2] ditched hook timer as it showed unreliable triggering on epoch end --- models/resnet.py | 4 ++-- models/tf_details/resnet_details.py | 5 +++++ models/tf_details/resnet_run_loop.py | 7 +------ models/tf_details/utils/logging/hooks.py | 8 +++++--- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/models/resnet.py b/models/resnet.py index 05f1052..00f0ef5 100644 --- a/models/resnet.py +++ b/models/resnet.py @@ -121,7 +121,7 @@ def versions(self): value = "" - if "keras" in self.backend.lower(): + if self.backend.lower().startswith("keras"): import keras from keras import backend as K @@ -143,7 +143,7 @@ def versions(self): else: - if "tensorflow" in self.backend.lower(): + if self.backend.lower() == "tensorflow" or self.backend.lower() == "tf": import tensorflow as tf value = "tensorflow:{ver}".format(ver=tf.__version__) else: diff --git a/models/tf_details/resnet_details.py b/models/tf_details/resnet_details.py index 2c8d31d..551192e 100644 --- a/models/tf_details/resnet_details.py +++ b/models/tf_details/resnet_details.py @@ -111,4 +111,9 @@ def train(train, test, datafraction, opts): logging.info('handing over \n >> %s \n >> %s',flags,opts) history, timings = run_loop.resnet_main(flags, cfmain.cifar10_model_fn, cfmain.input_fn, opts) + if not opts['checkpoint_epochs']: + logging.info("unable to ensure pure no-checkpoint behavior with resnet in pure tensorflow, removing result directory") + import shutil + shutil.rmtree(model_dir) + return history, timings, { 'num_weights' : None } diff --git a/models/tf_details/resnet_run_loop.py b/models/tf_details/resnet_run_loop.py index 6397cbd..14beba4 100644 --- a/models/tf_details/resnet_run_loop.py +++ b/models/tf_details/resnet_run_loop.py @@ -327,6 +327,7 @@ def resnet_main(flags, model_function, input_function, opts = None): logging.warning("batch sizes differ in model %i %s", flags.batch_size, opts["batch_size"]) if ngpus > 1: + steps_per_epoch -= 1 validate_batch_size_for_multi_gpu(flags.batch_size) # There are two steps required if using multi-GPU: (1) wrap the model_fn, # and (2) wrap the optimizer. The first happens here, and (2) happens @@ -407,12 +408,6 @@ def input_fn_eval(): validation_results = classifier.evaluate(input_fn=input_fn_eval, steps=flags.max_train_steps) - - # for (k,v) in train_hooks["CaptureTensorsHook"].captured.items(): - # print(">> ",k,v[:5],v[-2:]) - - #epoch_times.extend(train_hooks["TimePerEpochHook"].epoch_durations) - for k in validation_results.keys(): if "global_step" in k: continue diff --git a/models/tf_details/utils/logging/hooks.py b/models/tf_details/utils/logging/hooks.py index 370a9da..6db11ee 100644 --- a/models/tf_details/utils/logging/hooks.py +++ b/models/tf_details/utils/logging/hooks.py @@ -50,7 +50,7 @@ def add(self,other): class TimePerEpochHook(tf.train.SessionRunHook): def __init__(self, every_n_steps, - warm_steps=0): + warm_steps=-1): self.every_n_steps = every_n_steps logging.info("TimePerEpochHook triggering every %i steps",every_n_steps) @@ -112,8 +112,8 @@ def after_run(self, run_context, run_values): # pylint: disable=unused-argument global_step = run_values.results sess = run_context.session - - if self._timer.should_trigger_for_step(global_step) and global_step > self._warm_steps: + #if self._timer.should_trigger_for_step(global_step) and global_step > self._warm_steps: + if self._step % self.every_n_steps == 0: elapsed_time, elapsed_steps = self._timer.update_last_triggered_step( global_step) if elapsed_time is not None: @@ -124,6 +124,8 @@ def after_run(self, run_context, run_values): # pylint: disable=unused-argument tf.logging.info('Epoch [%g steps]: %g (%s)', self._total_steps,self._epoch_train_time,str(self.epoch_durations)) self._epoch_train_time = 0 + else: + logging.warning("step %i, elapsed_time is None!", global_step)