From 0609ad47388abba37d074736014a239d0839a951 Mon Sep 17 00:00:00 2001
From: Eugene Fedorenko
Date: Thu, 30 Apr 2020 14:07:38 -0700
Subject: [PATCH] Fixes. End-to-End Pipeline Example on Azure (#788)

* Fixes

* Fixes
---
 .../azurepipeline/code/deploy/Dockerfile      |   0
 pipelines/azurepipeline/code/deploy/score.py  | 115 +++---
 .../azurepipeline/code/preprocess/Dockerfile  |   0
 .../azurepipeline/code/preprocess/data.py     | 188 +++------
 .../azurepipeline/code/register/Dockerfile    |   0
 .../azurepipeline/code/register/register.py   | 166 +++-----
 .../azurepipeline/code/training/Dockerfile    |   0
 .../azurepipeline/code/training/train.py      | 301 +++++++++---------
 8 files changed, 397 insertions(+), 373 deletions(-)
 mode change 100644 => 100755 pipelines/azurepipeline/code/deploy/Dockerfile
 mode change 100644 => 100755 pipelines/azurepipeline/code/deploy/score.py
 mode change 100644 => 100755 pipelines/azurepipeline/code/preprocess/Dockerfile
 mode change 100644 => 100755 pipelines/azurepipeline/code/preprocess/data.py
 mode change 100644 => 100755 pipelines/azurepipeline/code/register/Dockerfile
 mode change 100644 => 100755 pipelines/azurepipeline/code/register/register.py
 mode change 100644 => 100755 pipelines/azurepipeline/code/training/Dockerfile
 mode change 100644 => 100755 pipelines/azurepipeline/code/training/train.py

diff --git a/pipelines/azurepipeline/code/deploy/Dockerfile b/pipelines/azurepipeline/code/deploy/Dockerfile
old mode 100644
new mode 100755
diff --git a/pipelines/azurepipeline/code/deploy/score.py b/pipelines/azurepipeline/code/deploy/score.py
old mode 100644
new mode 100755
index bafdcf4a8..da092dce7
--- a/pipelines/azurepipeline/code/deploy/score.py
+++ b/pipelines/azurepipeline/code/deploy/score.py
@@ -11,83 +11,84 @@
 def init():
-  if Model.get_model_path('tacosandburritos'):
-    model_path = Model.get_model_path('tacosandburritos')
-  else:
-    model_path = '/model/latest.h5'
+    global model
+    if Model.get_model_path('tacosandburritos'):
+        model_path = Model.get_model_path('tacosandburritos')
+    else:
+        model_path = '/model/latest.h5'
 
-  print('Attempting to load model')
-  model = tf.keras.models.load_model(model_path)
-  model.summary()
-  print('Done!')
+    print('Attempting to load model')
+    model = tf.keras.models.load_model(model_path)
+    model.summary()
+    print('Done!')
 
-  print('Initialized model "{}" at {}'.format(model_path, datetime.datetime.now()))
-  return model
+    print('Initialized model "{}" at {}'.format(
+        model_path, datetime.datetime.now()))
 
 
-def run(raw_data, model):
-  prev_time = time.time()
+def run(raw_data):
+    prev_time = time.time()
 
-  post = json.loads(raw_data)
-  img_path = post['image']
+    post = json.loads(raw_data)
+    img_path = post['image']
 
-  current_time = time.time()
+    current_time = time.time()
 
-  tensor = process_image(img_path, 160)
-  t = tf.reshape(tensor, [-1, 160, 160, 3])
-  o = model.predict(t, steps=1)  # [0][0]
-  print(o)
-  o = o[0][0]
-  inference_time = datetime.timedelta(seconds=current_time - prev_time)
-  payload = {
-    'time': inference_time.total_seconds(),
-    'prediction': 'burrito' if o > 0.5 else 'tacos',
-    'scores': str(o)
-  }
+    tensor = process_image(img_path, 160)
+    t = tf.reshape(tensor, [-1, 160, 160, 3])
+    o = model.predict(t, steps=1)  # [0][0]
+    print(o)
+    o = o[0][0]
+    inference_time = datetime.timedelta(seconds=current_time - prev_time)
+    payload = {
+        'time': inference_time.total_seconds(),
+        'prediction': 'burrito' if o > 0.5 else 'tacos',
+        'scores': str(o)
+    }
 
-  print('Input ({}), Prediction ({})'.format(post['image'], payload))
+    print('Input ({}), Prediction ({})'.format(post['image'], payload))
 
-  return payload
+    return payload
 
 
 def process_image(path, image_size):
-  # Extract image (from web or path)
-  if path.startswith('http'):
-    response = requests.get(path)
-    img = np.array(Image.open(BytesIO(response.content)))
-  else:
-    img = np.array(Image.open(path))
+    # Extract image (from web or path)
+    if path.startswith('http'):
+        response = requests.get(path)
+        img = np.array(Image.open(BytesIO(response.content)))
+    else:
+        img = np.array(Image.open(path))
 
-  img_tensor = tf.convert_to_tensor(img, dtype=tf.float32)
-  # tf.image.decode_jpeg(img_raw, channels=3)
-  img_final = tf.image.resize(img_tensor, [image_size, image_size]) / 255
-  return img_final
+    img_tensor = tf.convert_to_tensor(img, dtype=tf.float32)
+    # tf.image.decode_jpeg(img_raw, channels=3)
+    img_final = tf.image.resize(img_tensor, [image_size, image_size]) / 255
+    return img_final
 
 
 def info(msg, char="#", width=75):
-  print("")
-  print(char * width)
-  print(char + " %0*s" % ((-1 * width) + 5, msg) + char)
-  print(char * width)
+    print("")
+    print(char * width)
+    print(char + " %0*s" % ((-1 * width) + 5, msg) + char)
+    print(char * width)
 
 
 if __name__ == "__main__":
-  images = {
-    'tacos': 'https://c1.staticflickr.com/5/4022/4401140214_f489c708f0_b.jpg',
-    'burrito': 'https://www.exploreveg.org/files/2015/05/sofritas-burrito.jpeg'
-  }
+    images = {
+        'tacos': 'https://c1.staticflickr.com/5/4022/4401140214_f489c708f0_b.jpg',  # noqa: E501
+        'burrito': 'https://www.exploreveg.org/files/2015/05/sofritas-burrito.jpeg'  # noqa: E501
+    }
 
-  my_model = init()
+    init()
 
-  for k, v in images.items():
-    print('{} => {}'.format(k, v))
+    for k, v in images.items():
+        print('{} => {}'.format(k, v))
 
-  info('Taco Test')
-  taco = json.dumps({'image': images['tacos']})
-  print(taco)
-  run(taco, my_model)
+    info('Taco Test')
+    taco = json.dumps({'image': images['tacos']})
+    print(taco)
+    run(taco)
 
-  info('Burrito Test')
-  burrito = json.dumps({'image': images['burrito']})
-  print(burrito)
-  run(burrito, my_model)
+    info('Burrito Test')
+    burrito = json.dumps({'image': images['burrito']})
+    print(burrito)
+    run(burrito)
diff --git a/pipelines/azurepipeline/code/preprocess/Dockerfile b/pipelines/azurepipeline/code/preprocess/Dockerfile
old mode 100644
new mode 100755
diff --git a/pipelines/azurepipeline/code/preprocess/data.py b/pipelines/azurepipeline/code/preprocess/data.py
old mode 100644
new mode 100755
index b21c13dd1..05165bde7
--- a/pipelines/azurepipeline/code/preprocess/data.py
+++ b/pipelines/azurepipeline/code/preprocess/data.py
@@ -8,111 +8,115 @@
 def check_dir(path):
-  if not os.path.exists(path):
-    os.makedirs(path)
-  return Path(path).resolve(strict=False)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    return Path(path).resolve(strict=False)
 
 
 def download(source, target, force_clear=False):
-  if force_clear and os.path.exists(target):
-    print('Removing {}...'.format(target))
-    shutil.rmtree(target)
+    if force_clear and os.path.exists(target):
+        print('Removing {}...'.format(target))
+        shutil.rmtree(target)
 
-  check_dir(target)
+    check_dir(target)
 
-  targt_file = str(Path(target).joinpath('data.zip'))
-  if os.path.exists(targt_file) and not force_clear:
-    print('data already exists, skipping download')
-    return
+    targt_file = str(Path(target).joinpath('data.zip'))
+    if os.path.exists(targt_file) and not force_clear:
+        print('data already exists, skipping download')
+        return
 
-  if source.startswith('http'):
-    print("Downloading from {} to {}".format(source, target))
-    wget.download(source, targt_file)
-    print("Done!")
-  else:
-    print("Copying from {} to {}".format(source, target))
-    shutil.copyfile(source, targt_file)
+    if source.startswith('http'):
+        print("Downloading from {} to {}".format(source, target))
+        wget.download(source, targt_file)
+        print("Done!")
+    else:
+        print("Copying from {} to {}".format(source, target))
+        shutil.copyfile(source, targt_file)
 
-  print('Unzipping {}'.format(targt_file))
-  zipr = zipfile.ZipFile(targt_file)
-  zipr.extractall(target)
-  zipr.close()
+    print('Unzipping {}'.format(targt_file))
+    zipr = zipfile.ZipFile(targt_file)
+    zipr.extractall(target)
+    zipr.close()
 
 
 def process_image(path, image_size=160):
-  img_raw = tf.io.read_file(path)
-  img_tensor = tf.image.decode_jpeg(img_raw, channels=3)
-  img_final = tf.image.resize(img_tensor, [image_size, image_size]) / 255
-  return img_final
+    img_raw = tf.io.read_file(path)
+    img_tensor = tf.image.decode_jpeg(img_raw, channels=3)
+    img_final = tf.image.resize(img_tensor, [image_size, image_size]) / 255
+    return img_final
 
 
 def walk_images(path, image_size=160):
-  imgs = []
-  print('Scanning {}'.format(path))
-  # find subdirectories in base path
-  # (they should be the labels)
-  labels = []
-  for (_, dirs, _) in os.walk(path):
-    print('Found {}'.format(dirs))
-    labels = dirs
-    break
-
-  for d in labels:
-    tmp_path = os.path.join(path, d)
-    print('Processing {}'.format(tmp_path))
-    # only care about files in directory
-    for item in os.listdir(tmp_path):
-      if not item.lower().endswith('.jpg'):
-        print('skipping {}'.format(item))
-        continue
-
-      image = os.path.join(tmp_path, item)
-      try:
-        img = process_image(image, image_size)
-        assert img.shape[2] == 3, "Invalid channel count"
-        # write out good images
-        imgs.append(image)
-      except img.shape[2] != 3:
-        print('{}\n'.format(image))
-
-  return imgs
+    imgs = []
+    print('Scanning {}'.format(path))
+    # find subdirectories in base path
+    # (they should be the labels)
+    labels = []
+    for (_, dirs, _) in os.walk(path):
+        print('Found {}'.format(dirs))
+        labels = dirs
+        break
+
+    for d in labels:
+        tmp_path = os.path.join(path, d)
+        print('Processing {}'.format(tmp_path))
+        # only care about files in directory
+        for item in os.listdir(tmp_path):
+            if not item.lower().endswith('.jpg'):
+                print('skipping {}'.format(item))
+                continue
+
+            image = os.path.join(tmp_path, item)
+            try:
+                img = process_image(image, image_size)
+                assert img.shape[2] == 3, "Invalid channel count"
+                # write out good images
+                imgs.append(image)
+            except img.shape[2] != 3:
+                print('{}\n'.format(image))
+
+    return imgs
 
 
 if __name__ == "__main__":
-  parser = argparse.ArgumentParser(description='data cleaning for binary image task')
-  parser.add_argument('-b', '--base_path', help='directory to base data', default='../../data')
-  parser.add_argument('-d', '--data', help='directory to training data', default='train')
-  parser.add_argument('-t', '--target', help='target file to hold good data', default='train.txt')
-  parser.add_argument('-i', '--img_size', help='target image size to verify', default=160, type=int)
-  parser.add_argument('-z', '--zipfile', help='source data zip file', default='../../tacodata.zip')
-  parser.add_argument('-f', '--force',
-                      help='force clear all data', default=False, action='store_true')
-  args = parser.parse_args()
-  print(args)
-
-  print('Using TensorFlow v.{}'.format(tf.__version__))
-
-  base_path = Path(args.base_path).resolve(strict=False)
-  print('Base Path: {}'.format(base_path))
-  data_path = base_path.joinpath(args.data).resolve(strict=False)
-  print('Train Path: {}'.format(data_path))
-  target_path = Path(base_path).resolve(strict=False).joinpath(args.target)
-  print('Train File: {}'.format(target_path))
-  zip_path = args.zipfile
-
-  print('Acquiring data...')
-  download('https://aiadvocate.blob.core.windows.net/public/tacodata.zip',
-           str(base_path), args.force)
-
-  if os.path.exists(str(target_path)):
-    print('dataset text file already exists, skipping check')
-  else:
-    print('Testing images...')
-    images = walk_images(str(data_path), args.img_size)
-
-    # save file
-    print('writing dataset to {}'.format(target_path))
-    with open(str(target_path), 'w+') as f:
-      f.write('\n'.join(images))
-
-  # python data.py -z https://aiadvocate.blob.core.windows.net/public/tacodata.zip -t train.txt
+    parser = argparse.ArgumentParser(
+        description='data cleaning for binary image task')
+    parser.add_argument('-b', '--base_path',
+                        help='directory to base data', default='../../data')
+    parser.add_argument(
+        '-d', '--data', help='directory to training data', default='train')
+    parser.add_argument(
+        '-t', '--target', help='target file to hold good data', default='train.txt')  # noqa: E501
+    parser.add_argument(
+        '-i', '--img_size', help='target image size to verify', default=160, type=int)  # noqa: E501
+    parser.add_argument(
+        '-z', '--zipfile', help='source data zip file', default='../../tacodata.zip')  # noqa: E501
+    parser.add_argument('-f', '--force',
+                        help='force clear all data', default=False, action='store_true')  # noqa: E501
+    args = parser.parse_args()
+    print(args)
+
+    print('Using TensorFlow v.{}'.format(tf.__version__))
+
+    base_path = Path(args.base_path).resolve(strict=False)
+    print('Base Path: {}'.format(base_path))
+    data_path = base_path.joinpath(args.data).resolve(strict=False)
+    print('Train Path: {}'.format(data_path))
+    target_path = Path(base_path).resolve(strict=False).joinpath(args.target)
+    print('Train File: {}'.format(target_path))
+    zip_path = args.zipfile
+
+    print('Acquiring data...')
+    download('https://aiadvocate.blob.core.windows.net/public/tacodata.zip',
+             str(base_path), args.force)
+
+    if os.path.exists(str(target_path)):
+        print('dataset text file already exists, skipping check')
+    else:
+        print('Testing images...')
+        images = walk_images(str(data_path), args.img_size)
+
+        # save file
+        print('writing dataset to {}'.format(target_path))
+        with open(str(target_path), 'w+') as f:
+            f.write('\n'.join(images))
diff --git a/pipelines/azurepipeline/code/register/Dockerfile b/pipelines/azurepipeline/code/register/Dockerfile
old mode 100644
new mode 100755
diff --git a/pipelines/azurepipeline/code/register/register.py b/pipelines/azurepipeline/code/register/register.py
old mode 100644
new mode 100755
index 8ef55c723..8de597af4
--- a/pipelines/azurepipeline/code/register/register.py
+++ b/pipelines/azurepipeline/code/register/register.py
@@ -9,94 +9,102 @@
 def info(msg, char="#", width=75):
-  print("")
-  print(char * width)
-  print(char + " %0*s" % ((-1 * width) + 5, msg) + char)
-  print(char * width)
+    print("")
+    print(char * width)
+    print(char + " %0*s" % ((-1 * width) + 5, msg) + char)
+    print(char * width)
 
 
 def get_ws(tenant_id, service_principal_id,
-           service_principal_password, subscription_id, resource_group, workspace):
-  auth_args = {
-    'tenant_id': tenant_id,
-    'service_principal_id': service_principal_id,
-    'service_principal_password': service_principal_password
-  }
-
-  ws_args = {
-    'auth': ServicePrincipalAuthentication(**auth_args),
-    'subscription_id': subscription_id,
-    'resource_group': resource_group
-  }
-  ws = Workspace.get(workspace, **ws_args)
-  return ws
+           service_principal_password, subscription_id, resource_group, workspace):  # noqa: E501
+    auth_args = {
+        'tenant_id': tenant_id,
+        'service_principal_id': service_principal_id,
+        'service_principal_password': service_principal_password
+    }
+
+    ws_args = {
+        'auth': ServicePrincipalAuthentication(**auth_args),
+        'subscription_id': subscription_id,
+        'resource_group': resource_group
+    }
+    ws = Workspace.get(workspace, **ws_args)
+    return ws
 
 
 def run(mdl_path, model_name, ws, tgs):
-  print(ws.get_details())
+    print(ws.get_details())
 
-  print('\nSaving model {} to {}'.format(mdl_path, model_name))
+    print('\nSaving model {} to {}'.format(mdl_path, model_name))
 
-  # Model Path needs to be relative
-  mdl_path = relpath(mdl_path, '.')
+    # Model Path needs to be relative
+    mdl_path = relpath(mdl_path, '.')
 
-  Model.register(ws, model_name=model_name, model_path=mdl_path, tags=tgs)
-  print('Done!')
+    Model.register(ws, model_name=model_name, model_path=mdl_path, tags=tgs)
+    print('Done!')
 
 
 if __name__ == "__main__":
-  # argparse stuff for model path and model name
-  parser = argparse.ArgumentParser(description='sanity check on model')
-  parser.add_argument('-b', '--base_path', help='directory to base folder', default='../../data')
-  parser.add_argument('-m', '--model', help='path to model file', default='/model/latest.h5')
-  parser.add_argument('-n', '--model_name', help='AML Model name', default='tacosandburritos')
-  parser.add_argument('-t', '--tenant_id', help='tenant_id')
-  parser.add_argument('-s', '--service_principal_id', help='service_principal_id')
-  parser.add_argument('-p', '--service_principal_password', help='service_principal_password')
-  parser.add_argument('-u', '--subscription_id', help='subscription_id')
-  parser.add_argument('-r', '--resource_group', help='resource_group')
-  parser.add_argument('-w', '--workspace', help='workspace')
-  args = parser.parse_args()
-
-  print('Azure ML SDK Version: {}'.format(azureml.core.VERSION))
-  args.model = 'model/' + args.model
-  model_path = str(Path(args.base_path).resolve(
-    strict=False).joinpath(args.model).resolve(strict=False))
-  params_path = str(Path(args.base_path).resolve(
-    strict=False).joinpath('params.json').resolve(strict=False))
-  wsrgs = {
-    'tenant_id': args.tenant_id,
-    'service_principal_id': args.service_principal_id,
-    'service_principal_password': args.service_principal_password,
-    'subscription_id': args.subscription_id,
-    'resource_group': args.resource_group,
-    'workspace': args.workspace
-  }
-  rgs = {
-    'mdl_path': model_path,
-    'model_name': args.model_name
-  }
-
-  # printing out args for posterity
-  for i in wsrgs:
-    if i == 'service_principal_password':
-      print('{} => **********'.format(i))
-    else:
-      print('{} => {}'.format(i, wsrgs[i]))
-
-  with(open(str(params_path), 'r')) as f:
-    tags = json.load(f)
-
-  print('\n\nUsing the following tags:')
-  for tag in tags:
-    print('{} => {}'.format(tag, tags[tag]))
-
-  rgs['tgs'] = tags
-
-  workspc = get_ws(**wsrgs)
-  rgs['ws'] = workspc
-  run(**rgs)
-
-  # python register.py --model_path v --model_name c --tenant_id c
-  # --service_principal_id v --service_principal_password v
-  # --subscription_id v --resource_group x --workspace c
+    # argparse stuff for model path and model name
+    parser = argparse.ArgumentParser(description='sanity check on model')
+    parser.add_argument('-b', '--base_path',
+                        help='directory to base folder', default='../../data')
+    parser.add_argument(
+        '-m', '--model', help='path to model file', default='/model/latest.h5')
+    parser.add_argument('-n', '--model_name',
+                        help='AML Model name', default='tacosandburritos')
+    parser.add_argument('-t', '--tenant_id', help='tenant_id')
+    parser.add_argument('-s', '--service_principal_id',
+                        help='service_principal_id')
+    parser.add_argument('-p', '--service_principal_password',
+                        help='service_principal_password')
+    parser.add_argument('-u', '--subscription_id', help='subscription_id')
+    parser.add_argument('-r', '--resource_group', help='resource_group')
+    parser.add_argument('-w', '--workspace', help='workspace')
+    args = parser.parse_args()
+
+    print('Azure ML SDK Version: {}'.format(azureml.core.VERSION))
+    args.model = 'model/' + args.model
+    model_path = str(Path(args.base_path).resolve(
+        strict=False).joinpath(args.model).resolve(strict=False))
+    params_path = str(Path(args.base_path).resolve(
+        strict=False).joinpath('params.json').resolve(strict=False))
+    wsrgs = {
+        'tenant_id': args.tenant_id,
+        'service_principal_id': args.service_principal_id,
+        'service_principal_password': args.service_principal_password,
+        'subscription_id': args.subscription_id,
+        'resource_group': args.resource_group,
+        'workspace': args.workspace
+    }
+    rgs = {
+        'mdl_path': model_path,
+        'model_name': args.model_name
+    }
+
+    # printing out args for posterity
+    for i in wsrgs:
+        if i == 'service_principal_password':
+            print('{} => **********'.format(i))
+        else:
+            print('{} => {}'.format(i, wsrgs[i]))
+
+    for i in rgs:
+        print('{} => {}'.format(i, rgs[i]))
+
+    with(open(str(params_path), 'r')) as f:
+        tags = json.load(f)
+
+    print('\n\nUsing the following tags:')
+    for tag in tags:
+        print('{} => {}'.format(tag, tags[tag]))
+
+    rgs['tgs'] = tags
+
+    workspc = get_ws(**wsrgs)
+    rgs['ws'] = workspc
+    run(**rgs)
+
+    # python register.py --model_path v --model_name c --tenant_id c
+    # --service_principal_id v --service_principal_password v
+    # --subscription_id v --resource_group x --workspace c
diff --git a/pipelines/azurepipeline/code/training/Dockerfile b/pipelines/azurepipeline/code/training/Dockerfile
old mode 100644
new mode 100755
diff --git a/pipelines/azurepipeline/code/training/train.py b/pipelines/azurepipeline/code/training/train.py
old mode 100644
new mode 100755
index 885608268..a67f23f63
--- a/pipelines/azurepipeline/code/training/train.py
+++ b/pipelines/azurepipeline/code/training/train.py
@@ -13,188 +13,199 @@
 def info(msg, char="#", width=75):
-  print("")
-  print(char * width)
-  print(char + " %0*s" % ((-1 * width) + 5, msg) + char)
-  print(char * width)
+    print("")
+    print(char * width)
+    print(char + " %0*s" % ((-1 * width) + 5, msg) + char)
+    print(char * width)
 
 
 def check_dir(path):
-  if not os.path.exists(path):
-    os.makedirs(path)
-  return Path(path).resolve(strict=False)
+    if not os.path.exists(path):
+        os.makedirs(path)
+    return Path(path).resolve(strict=False)
 
 
 def process_image(path, label, img_size):
-  img_raw = tf.io.read_file(path)
-  img_tensor = tf.image.decode_jpeg(img_raw, channels=3)
-  img_final = tf.image.resize(img_tensor, [img_size, img_size]) / 255
-  return img_final, label
+    img_raw = tf.io.read_file(path)
+    img_tensor = tf.image.decode_jpeg(img_raw, channels=3)
+    img_final = tf.image.resize(img_tensor, [img_size, img_size]) / 255
+    return img_final, label
 
 
 def load_dataset(base_path, dset, split=None):
-  # normalize splits
-  if split is None:
-    split = [8, 1, 1]
-  splits = np.array(split) / np.sum(np.array(split))
-
-  # find labels - parent folder names
-  labels = {}
-  for (_, dirs, _) in os.walk(base_path):
-    print('found {}'.format(dirs))
-    labels = {k: v for (v, k) in enumerate(dirs)}
-    print('using {}'.format(labels))
-    break
+    # normalize splits
+    if split is None:
+        split = [8, 1, 1]
+    splits = np.array(split) / np.sum(np.array(split))
+
+    # find labels - parent folder names
+    labels = {}
+    for (_, dirs, _) in os.walk(base_path):
+        print('found {}'.format(dirs))
+        labels = {k: v for (v, k) in enumerate(dirs)}
+        print('using {}'.format(labels))
+        break
 
-  # load all files along with idx label
-  print('loading dataset from {}'.format(dset))
-  with open(dset, 'r') as d:
-    data = [(str(Path(line.strip()).absolute()),
-             labels[Path(line.strip()).parent.name]) for line in d.readlines()]
+    # load all files along with idx label
+    print('loading dataset from {}'.format(dset))
+    with open(dset, 'r') as d:
+        data = [(str(Path(line.strip()).absolute()),
+                 labels[Path(line.strip()).parent.name]) for line in d.readlines()]  # noqa: E501
 
-  print('dataset size: {}\nsuffling data...'.format(len(data)))
+    print('dataset size: {}\nsuffling data...'.format(len(data)))
 
-  # shuffle data
-  shuffle(data)
+    # shuffle data
+    shuffle(data)
 
-  print('splitting data...')
-  # split data
-  train_idx = int(len(data) * splits[0])
+    print('splitting data...')
+    # split data
+    train_idx = int(len(data) * splits[0])
 
-  return data[:train_idx]
+    return data[:train_idx]
 
 
 # @print_info
 def run(
-    dpath,
-    img_size=160,
-    epochs=10,
-    batch_size=32,
-    learning_rate=0.0001,
-    output='model',
-    dset=None):
-  img_shape = (img_size, img_size, 3)
+        dpath,
+        img_size=160,
+        epochs=10,
+        batch_size=32,
+        learning_rate=0.0001,
+        output='model',
+        dset=None):
+    img_shape = (img_size, img_size, 3)
 
-  info('Loading Data Set')
-  # load dataset
-  train = load_dataset(dpath, dset)
+    info('Loading Data Set')
+    # load dataset
+    train = load_dataset(dpath, dset)
 
-  # training data
-  train_data, train_labels = zip(*train)
-  train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
-                          Dataset.from_tensor_slices(list(train_labels)),
-                          Dataset.from_tensor_slices([img_size]*len(train_data))))
+    # training data
+    train_data, train_labels = zip(*train)
+    train_ds = Dataset.zip((Dataset.from_tensor_slices(list(train_data)),
+                            Dataset.from_tensor_slices(list(train_labels)),
+                            Dataset.from_tensor_slices([img_size]*len(train_data))))
 
-  train_ds = train_ds.map(map_func=process_image,
-                          num_parallel_calls=5)
+    print(train_ds)
+    train_ds = train_ds.map(map_func=process_image,
+                            num_parallel_calls=5)
 
-  train_ds = train_ds.apply(tf.data.experimental.ignore_errors())
+    train_ds = train_ds.apply(tf.data.experimental.ignore_errors())
 
-  train_ds = train_ds.batch(batch_size)
-  train_ds = train_ds.prefetch(buffer_size=5)
-  train_ds = train_ds.repeat()
+    train_ds = train_ds.batch(batch_size)
+    train_ds = train_ds.prefetch(buffer_size=5)
+    train_ds = train_ds.repeat()
 
-  # model
-  info('Creating Model')
-  base_model = tf.keras.applications.MobileNetV2(input_shape=img_shape,
-                                                 include_top=False,
-                                                 weights='imagenet')
-  base_model.trainable = True
+    # model
+    info('Creating Model')
+    base_model = tf.keras.applications.MobileNetV2(input_shape=img_shape,
+                                                   include_top=False,
+                                                   weights='imagenet')
+    base_model.trainable = True
 
-  model = tf.keras.Sequential([
-    base_model,
-    tf.keras.layers.GlobalAveragePooling2D(),
-    tf.keras.layers.Dense(1, activation='sigmoid')
-  ])
+    model = tf.keras.Sequential([
+        base_model,
+        tf.keras.layers.GlobalAveragePooling2D(),
+        tf.keras.layers.Dense(1, activation='sigmoid')
+    ])
 
-  model.compile(optimizer=tf.keras.optimizers.Adam(lr=learning_rate),
-                loss='binary_crossentropy',
-                metrics=['accuracy'])
+    model.compile(optimizer=tf.keras.optimizers.Adam(lr=learning_rate),
+                  loss='binary_crossentropy',
+                  metrics=['accuracy'])
 
-  model.summary()
+    model.summary()
 
-  # training
-  info('Training')
-  steps_per_epoch = math.ceil(len(train) / batch_size)
-  model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)
+    # training
+    info('Training')
+    steps_per_epoch = math.ceil(len(train) / batch_size)
 
-  # save model
-  info('Saving Model')
+    model.fit(train_ds, epochs=epochs, steps_per_epoch=steps_per_epoch)
 
-  # check existence of base model folder
-  output = check_dir(output)
+    # save model
+    info('Saving Model')
 
-  print('Serializing into saved_model format')
-  tf.saved_model.save(model, str(output))
-  print('Done!')
+    # check existence of base model folder
+    output = check_dir(output)
 
-  # add time prefix folder
-  file_output = str(Path(output).joinpath('latest.h5'))
-  print('Serializing h5 model to:\n{}'.format(file_output))
-  model.save(file_output)
+    print('Serializing into saved_model format')
+    tf.saved_model.save(model, str(output))
+    print('Done!')
 
-  return generate_hash(file_output, 'kf_pipeline')
+    # add time prefix folder
+    file_output = str(Path(output).joinpath('latest.h5'))
+    print('Serializing h5 model to:\n{}'.format(file_output))
+    model.save(file_output)
+
+    return generate_hash(file_output, 'kf_pipeline')
 
 
 def generate_hash(dfile, key):
-  print('Generating hash for {}'.format(dfile))
-  m = hmac.new(str.encode(key), digestmod=hashlib.sha256)
-  BUF_SIZE = 65536
-  with open(str(dfile), 'rb') as myfile:
-    while True:
-      data = myfile.read(BUF_SIZE)
-      if not data:
-        break
-      m.update(data)
+    print('Generating hash for {}'.format(dfile))
+    m = hmac.new(str.encode(key), digestmod=hashlib.sha256)
+    BUF_SIZE = 65536
+    with open(str(dfile), 'rb') as myfile:
+        while True:
+            data = myfile.read(BUF_SIZE)
+            if not data:
+                break
+            m.update(data)
 
-  return m.hexdigest()
+    return m.hexdigest()
 
 
 if __name__ == "__main__":
-  parser = argparse.ArgumentParser(description='transfer learning for binary image task')
-  parser.add_argument('-s', '--base_path', help='directory to base data', default='../../data')
-  parser.add_argument('-d', '--data', help='directory to training and test data', default='train')
-  parser.add_argument('-e', '--epochs', help='number of epochs', default=10, type=int)
-  parser.add_argument('-b', '--batch', help='batch size', default=32, type=int)
-  parser.add_argument('-i', '--image_size', help='image size', default=160, type=int)
-  parser.add_argument('-l', '--lr', help='learning rate', default=0.0001, type=float)
-  parser.add_argument('-o', '--outputs', help='output directory', default='model')
-  parser.add_argument('-f', '--dataset', help='cleaned data listing')
-  args = parser.parse_args()
-
-  info('Using TensorFlow v.{}'.format(tf.__version__))
-
-  data_path = Path(args.base_path).joinpath(args.data).resolve(strict=False)
-  target_path = Path(args.base_path).resolve(strict=False).joinpath(args.outputs)
-  dataset = Path(args.base_path).joinpath(args.dataset)
-  image_size = args.image_size
-
-  params = Path(args.base_path).joinpath('params.json')
-
-  args = {
-    "dpath": str(data_path),
-    "img_size": image_size,
-    "epochs": args.epochs,
-    "batch_size": args.batch,
-    "learning_rate": args.lr,
-    "output": str(target_path),
-    "dset": str(dataset)
-  }
-
-  dataset_signature = generate_hash(dataset, 'kf_pipeline')
-  # printing out args for posterity
-  for i in args:
-    print('{} => {}'.format(i, args[i]))
-
-  model_signature = run(**args)
-
-  args['dataset_signature'] = dataset_signature.upper()
-  args['model_signature'] = model_signature.upper()
-  args['model_type'] = 'tfv2-MobileNetV2'
-  print('Writing out params...', end='')
-  with open(str(params), 'w') as f:
-    json.dump(args, f)
-
-  print(' Saved to {}'.format(str(params)))
-
-  # python train.py -d train -e 3 -b 32 -l 0.0001 -o model -f train.txt
+    parser = argparse.ArgumentParser(
+        description='transfer learning for binary image task')
+    parser.add_argument('-s', '--base_path',
+                        help='directory to base data', default='../../data')
+    parser.add_argument(
+        '-d', '--data', help='directory to training and test data', default='train')  # noqa: E501
+    parser.add_argument(
+        '-e', '--epochs', help='number of epochs', default=10, type=int)
+    parser.add_argument('-b', '--batch', help='batch size',
+                        default=32, type=int)
+    parser.add_argument('-i', '--image_size',
+                        help='image size', default=160, type=int)
+    parser.add_argument('-l', '--lr', help='learning rate',
+                        default=0.0001, type=float)
+    parser.add_argument('-o', '--outputs',
+                        help='output directory', default='model')
+    parser.add_argument('-f', '--dataset', help='cleaned data listing')
+    args = parser.parse_args()
+
+    info('Using TensorFlow v.{}'.format(tf.__version__))
+
+    data_path = Path(args.base_path).joinpath(args.data).resolve(strict=False)
+    target_path = Path(args.base_path).resolve(
+        strict=False).joinpath(args.outputs)
+    dataset = Path(args.base_path).joinpath(args.dataset)
+    image_size = args.image_size
+
+    params = Path(args.base_path).joinpath('params.json')
+
+    args = {
+        "dpath": str(data_path),
+        "img_size": image_size,
+        "epochs": args.epochs,
+        "batch_size": args.batch,
+        "learning_rate": args.lr,
+        "output": str(target_path),
+        "dset": str(dataset)
+    }
+
+    dataset_signature = generate_hash(dataset, 'kf_pipeline')
+    # printing out args for posterity
+    for i in args:
+        print('{} => {}'.format(i, args[i]))
+
+    model_signature = run(**args)
+
+    args['dataset_signature'] = dataset_signature.upper()
+    args['model_signature'] = model_signature.upper()
+    args['model_type'] = 'tfv2-MobileNetV2'
+    print('Writing out params...', end='')
+    with open(str(params), 'w') as f:
+        json.dump(args, f)
+
+    print(' Saved to {}'.format(str(params)))
+
+    # python train.py -d train -e 3 -b 32 -l 0.0001 -o model -f train.txt