diff --git a/README.md b/README.md
index 3180eaf2..38b6a279 100755
--- a/README.md
+++ b/README.md
@@ -11,6 +11,8 @@
 * **Flesh Digressions**: @aydao’s circular constant layer script edited to work with ADA see aydao_flesh_digressions.py
 * **Raw dataset creations**: Taken from the @skyflynil repo, reduces the size of datasets dramatically. Use `create_from images_raw` and `create_from image_folders_raw` in dataset creation, and use `--use-raw=True` in training (False by default!)
 * **align faces script**: From @pbaylies, this script will align images better for projection.
+* **top-k training**: Improve generator training by back-propagating gradients only from a top-k subset of the generated images in each minibatch, selected by discriminator score; see [Sinha et al., "Top-k Training of GANs"](https://arxiv.org/abs/2002.06224).
+* **@aydao's config**: Extra-large config for huge datasets (more than 100k images); select it with `--cfg=aydao`.
 
 ## StyleGAN2 with adaptive discriminator augmentation (ADA)<br>&mdash; Official TensorFlow implementation
 
diff --git a/dataset_tool.py b/dataset_tool.py
index 822465fc..8a22a720 100755
--- a/dataset_tool.py
+++ b/dataset_tool.py
@@ -695,6 +695,8 @@ def create_from_images(tfrecord_dir, image_dir, shuffle):
         error('No input images found')
 
     img = np.asarray(PIL.Image.open(image_filenames[0]))
+    print(img.shape)
+    shape = img.shape
     resolution = img.shape[0]
     channels = img.shape[2] if img.ndim == 3 else 1
     if img.shape[1] != resolution:
@@ -709,9 +711,13 @@ def create_from_images(tfrecord_dir, image_dir, shuffle):
         for idx in range(order.size):
             img = np.asarray(PIL.Image.open(image_filenames[order[idx]]))
             if channels == 1:
+                print("Greyscale, adding dimension:", image_filenames[order[idx]], img.shape)
                 img = img[np.newaxis, :, :] # HW => CHW
             else:
                 img = img.transpose([2, 0, 1]) # HWC => CHW
+            if img.shape != shape:
+                print("Wrong shape:", image_filenames[order[idx]], img.shape, "should be", shape)
+                continue
             tfr.add_image(img)
 
 #----------------------------------------------------------------------------
@@ -750,7 +756,7 @@ def create_from_images_raw(tfrecord_dir, image_dir, shuffle, resolution_log2=7,
         order = tfr.choose_shuffled_order() if shuffle else np.arange(len(image_filenames))
         tfr.create_tfr_writer(img.shape)
         for idx in range(order.size):
-            print('loading: ' + image_filenames[order[idx]])
+            # print('loading: ' + image_filenames[order[idx]])
             # img = np.asarray(PIL.Image.open(image_filenames[order[idx]]))
             # if (img.shape[1] != 1024) or (img.shape[0] != 1024):
             #     error('Input images must have the same width and height')
diff --git a/dnnlib/tflib/network.py b/dnnlib/tflib/network.py
index ff0c169e..45bb5f9c 100755
--- a/dnnlib/tflib/network.py
+++ b/dnnlib/tflib/network.py
@@ -120,6 +120,7 @@ def _init_fields(self, name: str, static_kwargs: dict, build_func: Callable, bui
         self._trainables            = None
         self._var_global_to_local   = None
         self._run_cache             = dict()
+        self.epochs = tf.Variable(0., dtype=tf.float32, name='epochs')
 
     def _init_graph(self) -> None:
         assert self._var_inits is not None
@@ -537,6 +538,12 @@ def setup_as_moving_average_of(self, src_net: "Network", beta: TfExpressionEx =
                     ops.append(var.assign(new_value))
             return tf.group(*ops)
 
+    def update_epochs(self, epochs: TfExpressionEx = 0) -> tf.Operation:
+        """Construct and return a TensorFlow op that assigns `epochs` (default 0) to this network's `self.epochs` variable."""
+        with tfutil.absolute_name_scope(self.scope + "/_Epochs"):
+            op = self.epochs.assign(epochs)
+            return op
+
     def run(self,
             *in_arrays: Tuple[Union[np.ndarray, None], ...],
             input_transform: dict = None,
diff --git a/train.py b/train.py
index ac3d3aa9..ba37cd96 100755
--- a/train.py
+++ b/train.py
@@ -172,6 +172,7 @@ def setup_training_options(
 
     cfg_specs = {
         'auto':          dict(ref_gpus=-1, kimg=25000,  mb=-1, mbstd=-1, fmaps=-1,  lrate=-1,     gamma=-1,   ema=-1,  ramp=0.05, map=2), # populated dynamically based on 'gpus' and 'res'
+        'aydao':     dict(ref_gpus=2,  kimg=25000,  mb=16, mbstd=8,  fmaps=1,   lrate=0.002,  gamma=10,   ema=10,  ramp=None, map=8), # uses mixed-precision, 11GB GPU
         '11gb-gpu':     dict(ref_gpus=1,  kimg=25000,  mb=4, mbstd=4,  fmaps=1,   lrate=0.002,  gamma=10,   ema=10,  ramp=None, map=8), # uses mixed-precision, 11GB GPU
         '11gb-gpu-complex':     dict(ref_gpus=1,  kimg=25000,  mb=4, mbstd=4,  fmaps=1,   lrate=0.002,  gamma=10,   ema=10,  ramp=None, map=8), # uses mixed-precision, 11GB GPU
         '24gb-gpu':     dict(ref_gpus=1,  kimg=25000,  mb=8, mbstd=8,  fmaps=1,   lrate=0.002,  gamma=10,   ema=10,  ramp=None, map=8), # uses mixed-precision, 24GB GPU
@@ -211,6 +212,26 @@ def setup_training_options(
     args.G_args.num_fp16_res = args.D_args.num_fp16_res = 4 # enable mixed-precision training
     args.G_args.conv_clamp = args.D_args.conv_clamp = 256 # clamp activations to avoid float16 overflow
 
+    if cfg == 'aydao':
+        # disable path length and style mixing regularization
+        args.loss_args.pl_weight = 0
+        args.G_args.style_mixing_prob = None
+
+        # double generator capacity
+        args.G_args.fmap_base = 32 << 10
+        args.G_args.fmap_max = 1024
+
+        # enable top k training
+        args.loss_args.G_top_k = True
+        # args.loss_args.G_top_k_gamma = 0.99 # takes ~70% of full training from scratch to decay to 0.5
+        # args.loss_args.G_top_k_gamma = 0.9862 # takes 12500 kimg to decay to 0.5 (~1/2 of total_kimg when training from scratch)
+        args.loss_args.G_top_k_gamma = 0.9726 # takes 6250 kimg to decay to 0.5 (~1/4 of total_kimg when training from scratch)
+        args.loss_args.G_top_k_frac = 0.5
+
+        # reduce in-memory size, you need a BIG GPU for this model
+        args.minibatch_gpu = 4 # probably will need to set this pretty low with such a large G, higher values work better for top-k training though
+        args.G_args.num_fp16_res = 6 # making more layers fp16 can help as well
+
     if cfg == 'cifar' or cfg.split('-')[-1] == 'complex':
         args.loss_args.pl_weight = 0 # disable path length regularization
         args.G_args.style_mixing_prob = None # disable style mixing
@@ -560,7 +581,7 @@ def main():
     group.add_argument('--metricdata', help='Dataset to evaluate metrics against (optional)', metavar='PATH')
 
     group = parser.add_argument_group('base config')
-    group.add_argument('--cfg',   help='Base config (default: auto)', choices=['auto', '11gb-gpu','11gb-gpu-complex', '24gb-gpu','24gb-gpu-complex', '48gb-gpu', 'stylegan2', 'paper256', 'paper512', 'paper1024', 'cifar', 'cifarbaseline'])
+    group.add_argument('--cfg',   help='Base config (default: auto)', choices=['auto', '11gb-gpu','11gb-gpu-complex', '24gb-gpu','24gb-gpu-complex', '48gb-gpu', 'stylegan2', 'paper256', 'paper512', 'paper1024', 'cifar', 'cifarbaseline', 'aydao'])
     group.add_argument('--gamma', help='Override R1 gamma', type=float, metavar='FLOAT')
     group.add_argument('--kimg',  help='Override training duration', type=int, metavar='INT')
 
diff --git a/training/loss.py b/training/loss.py
index 9d819d29..12e0660f 100755
--- a/training/loss.py
+++ b/training/loss.py
@@ -88,7 +88,7 @@ def eval_D(D, aug, images, labels, report=None, augment_inputs=True, return_aux=
 # Non-saturating logistic loss with R1 and path length regularizers, used
 # in the paper "Analyzing and Improving the Image Quality of StyleGAN".
 
-def stylegan2(G, D, aug, fake_labels, real_images, real_labels, r1_gamma=10, pl_minibatch_shrink=2, pl_decay=0.01, pl_weight=2, **_kwargs):
+def stylegan2(G, D, aug, fake_labels, real_images, real_labels, r1_gamma=10, pl_minibatch_shrink=2, pl_decay=0.01, pl_weight=2, G_top_k = False, G_top_k_gamma = 0.9, G_top_k_frac = 0.5,  **_kwargs):
     # Evaluate networks for the main loss.
     minibatch_size = tf.shape(fake_labels)[0]
     fake_latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:])
@@ -98,7 +98,14 @@ def stylegan2(G, D, aug, fake_labels, real_images, real_labels, r1_gamma=10, pl_
 
     # Non-saturating logistic loss from "Generative Adversarial Nets".
     with tf.name_scope('Loss_main'):
-        G_loss = tf.nn.softplus(-D_fake.scores) # -log(sigmoid(D_fake.scores)), pylint: disable=invalid-unary-operand-type
+        D_fake_scores = D_fake.scores
+        if G_top_k:
+            k_frac = tf.maximum(G_top_k_gamma ** G.epochs, G_top_k_frac)
+            k = tf.cast(tf.ceil(tf.cast(minibatch_size, tf.float32) * k_frac), tf.int32)
+            lowest_k_scores, _ = tf.nn.top_k(-tf.squeeze(D_fake_scores), k=k) # want smallest probabilities not largest
+            D_fake_scores = tf.expand_dims(-lowest_k_scores, axis=1)
+        G_loss = tf.nn.softplus(-D_fake_scores) # -log(sigmoid(D_fake_scores)), pylint: disable=invalid-unary-operand-type
+
         D_loss = tf.nn.softplus(D_fake.scores) # -log(1 - sigmoid(D_fake.scores))
         D_loss += tf.nn.softplus(-D_real.scores) # -log(sigmoid(D_real.scores)), pylint: disable=invalid-unary-operand-type
         G_reg = 0
diff --git a/training/training_loop.py b/training/training_loop.py
index 7b5e2b3b..7e709776 100755
--- a/training/training_loop.py
+++ b/training/training_loop.py
@@ -202,6 +202,8 @@ def training_loop(
     D_reg_op = D_reg_opt.apply_updates(allow_no_op=True)
     Gs_beta_in = tf.placeholder(tf.float32, name='Gs_beta_in', shape=[])
     Gs_update_op = Gs.setup_as_moving_average_of(G, beta=Gs_beta_in)
+    Gs_epochs = tf.placeholder(tf.float32, name='Gs_epochs', shape=[])
+    Gs_epochs_op = Gs.update_epochs(Gs_epochs)
     tflib.init_uninitialized_vars()
     with tf.device('/gpu:0'):
         peak_gpu_mem_op = tf.contrib.memory_stats.MaxBytesInUse()
@@ -234,6 +236,8 @@ def training_loop(
             Gs_nimg = min(Gs_nimg, cur_nimg * G_smoothing_rampup)
         Gs_beta = 0.5 ** (minibatch_size / max(Gs_nimg, 1e-8))
 
+        epochs = float(100 * cur_nimg / (total_kimg * 1000)) # 100 total top k "epochs" in total_kimg
+
         # Run training ops.
         for _repeat_idx in range(minibatch_repeats):
             rounds = range(0, minibatch_size, minibatch_gpu * num_gpus)
@@ -247,7 +251,7 @@ def training_loop(
                 tflib.run([G_train_op, data_fetch_op])
                 if run_G_reg:
                     tflib.run(G_reg_op)
-                tflib.run([D_train_op, Gs_update_op], {Gs_beta_in: Gs_beta})
+                tflib.run([D_train_op, Gs_update_op, Gs_epochs_op], {Gs_beta_in: Gs_beta, Gs_epochs: epochs})
                 if run_D_reg:
                     tflib.run(D_reg_op)
 
@@ -257,7 +261,7 @@ def training_loop(
                     tflib.run(G_train_op)
                     if run_G_reg:
                         tflib.run(G_reg_op)
-                tflib.run(Gs_update_op, {Gs_beta_in: Gs_beta})
+                tflib.run([Gs_update_op, Gs_epochs_op], {Gs_beta_in: Gs_beta, Gs_epochs: epochs})
                 for _round in rounds:
                     tflib.run(data_fetch_op)
                     tflib.run(D_train_op)