Minor revision from 'live' demo
JMGaljaard committed Mar 28, 2022
1 parent d3d68c8 commit ee107eb
Showing 4 changed files with 34 additions and 85 deletions.
4 changes: 0 additions & 4 deletions Dockerfile
@@ -24,7 +24,3 @@ RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install -r require
ADD fltk fltk
ADD configs configs
ADD charts charts

# Update relevant runtime configuration for experiment
COPY configs/ configs/

64 changes: 8 additions & 56 deletions configs/tasks/example_arrival_config.json
@@ -1,64 +1,14 @@
[ {
"jobClassParameters": [
{
"networkConfiguration": {
"network": "FashionMNISTCNN",
"dataset": "MNIST"
},
"systemParameters": {
"dataParallelism": "3",
"executorCores": "1",
"executorMemory": "1Gi",
"action": "train"
},
"hyperParameters": {
"batchSize": 128,
"maxEpoch": 10,
"learningRate": 0.01,
"learningrateDecay": 0.0002,
"decayStepSize": 50
},
"classProbability": 0.1,
"priorities": [
{ "priority": 1, "probability": 1 }
]
}
],
"lambda": 40,
"preemptJobs": 0
},
[
{
"jobClassParameters": [
{
"networkConfiguration": {
"network": "FashionMNISTResNet",
"dataset": "MNIST"
},
"systemParameters": {
"dataParallelism": "1",
"executorCores": "1",
"executorMemory": "1Gi",
"action": "train"
},
"hyperParameters": {
"batchSize": "128",
"maxEpoch": "5",
"learningRate": "0.01",
"learningrateDecay": "0.0002"
},
"classProbability": 0.1,
"priorities": [
{ "priority": 1, "probability": 1 }
]
},
{
"networkConfiguration": {
"network": "FashionMNISTCNN",
"dataset": "MNIST"
"dataset": "FashionMNIST"
},
"systemParameters": {
"dataParallelism": "1",
"executorCores": "1",
"executorCores": "1000m",
"executorMemory": "1Gi",
"action": "train"
},
@@ -68,10 +18,12 @@
"learningRate": "0.01",
"learningrateDecay": "0.0002"
},
"classProbability": 0.8,
"classProbability": 0.1,
"priorities": [
{ "priority": 0, "probability": 0.9 },
{ "priority": 1, "probability": 0.1 }
{
"priority": 1,
"probability": 1
}
]
}
],
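
For reference, each entry in this arrival config describes a job class (network and dataset, requested resources, hyperparameters) together with a classProbability, a priority distribution, and an arrival rate "lambda". A minimal sketch of how one such entry could be consumed, written as a standalone illustration rather than FLTK's own arrival generator, and assuming "lambda" is read as a mean inter-arrival time in seconds:

# Illustrative sketch only, not the FLTK orchestrator.
import json
import random

with open("configs/tasks/example_arrival_config.json") as fp:
    train_tasks = json.load(fp)

task = train_tasks[0]

# Pick a job class according to classProbability (random.choices normalises the weights).
job_class = random.choices(
    task["jobClassParameters"],
    weights=[c["classProbability"] for c in task["jobClassParameters"]],
)[0]

# Pick a priority for this job according to the per-priority probabilities.
priority = random.choices(
    job_class["priorities"],
    weights=[p["probability"] for p in job_class["priorities"]],
)[0]["priority"]

# Draw the time until the next arrival; treating "lambda" as the mean
# inter-arrival time is an assumption made for this sketch.
inter_arrival_s = random.expovariate(1.0 / task["lambda"])

print(job_class["networkConfiguration"]["network"], priority, round(inter_arrival_s, 2))
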
49 changes: 25 additions & 24 deletions fltk/client.py
@@ -206,31 +206,32 @@ def run_epochs(self) -> List[EpochData]:
epoch_results = []
for epoch in range(1, max_epoch):
train_loss = self.train(epoch)

# Let only the 'master node' work on training. Possibly DDP can be used
# to have a distributed test loader as well to speed up (would require
# aggregation of data.
# Example https://github.com/fabio-deep/Distributed-Pytorch-Boilerplate/blob/0206247150720ca3e287e9531cb20ef68dc9a15f/src/datasets.py#L271-L303.
elapsed_time_train = datetime.datetime.now() - start_time_train
train_time_ms = int(elapsed_time_train.total_seconds() * 1000)

start_time_test = datetime.datetime.now()
accuracy, test_loss, class_precision, class_recall, confusion_mat = self.test()

elapsed_time_test = datetime.datetime.now() - start_time_test
test_time_ms = int(elapsed_time_test.total_seconds() * 1000)

data = EpochData(epoch_id=epoch,
duration_train=train_time_ms,
duration_test=test_time_ms,
loss_train=train_loss,
accuracy=accuracy,
loss=test_loss,
class_precision=class_precision,
class_recall=class_recall,
confusion_mat=confusion_mat)

epoch_results.append(data)
if self._id == 0:
# Let only the 'master node' work on training. Possibly DDP can be used
# to have a distributed test loader as well to speed up (would require
# aggregation of data.
# Example https://github.com/fabio-deep/Distributed-Pytorch-Boilerplate/blob/0206247150720ca3e287e9531cb20ef68dc9a15f/src/datasets.py#L271-L303.
elapsed_time_train = datetime.datetime.now() - start_time_train
train_time_ms = int(elapsed_time_train.total_seconds() * 1000)

start_time_test = datetime.datetime.now()
accuracy, test_loss, class_precision, class_recall, confusion_mat = self.test()

elapsed_time_test = datetime.datetime.now() - start_time_test
test_time_ms = int(elapsed_time_test.total_seconds() * 1000)

data = EpochData(epoch_id=epoch,
duration_train=train_time_ms,
duration_test=test_time_ms,
loss_train=train_loss,
accuracy=accuracy,
loss=test_loss,
class_precision=class_precision,
class_recall=class_recall,
confusion_mat=confusion_mat)

epoch_results.append(data)
self.log_progress(data, epoch)
return epoch_results

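The comment kept in this hunk hints at a possible follow-up: rather than letting only the rank-0 'master node' run the test pass, the test set could be sharded across workers and the metrics aggregated afterwards, as in the linked boilerplate. A minimal sketch of that idea, assuming an already-initialised torch.distributed process group and generic model and test_set objects (none of these names come from fltk/client.py):

# Sketch only, not part of this commit.
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler

def distributed_test(model, test_set, device, batch_size=128):
    # Each rank evaluates its own shard of the test set. Note that
    # DistributedSampler pads with duplicate samples when the dataset size
    # is not divisible by the world size, which slightly skews the counts.
    sampler = DistributedSampler(test_set, shuffle=False)
    loader = DataLoader(test_set, batch_size=batch_size, sampler=sampler)

    correct = torch.zeros(1, device=device)
    total = torch.zeros(1, device=device)
    model.eval()
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            predictions = model(inputs).argmax(dim=1)
            correct += (predictions == targets).sum()
            total += targets.size(0)

    # Sum the per-rank counts so every rank ends up with the global accuracy.
    dist.all_reduce(correct, op=dist.ReduceOp.SUM)
    dist.all_reduce(total, op=dist.ReduceOp.SUM)
    return (correct / total).item()
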
2 changes: 1 addition & 1 deletion fltk/util/cluster/client.py
@@ -334,7 +334,7 @@ def construct(self) -> V1PyTorchJob:
job = V1PyTorchJob(
api_version="kubeflow.org/v1",
kind="PyTorchJob",
metadata=V1ObjectMeta(name=str(self._buildDescription.id), namespace='test'),
metadata=V1ObjectMeta(name=f'trainjob-{self._buildDescription.id}', namespace='test'),
spec=self._buildDescription.spec)
return job

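The functional change here prefixes the generated resource name with 'trainjob-', presumably to make the PyTorchJob objects easier to recognise in the cluster. Whatever prefix is chosen, Kubernetes still requires metadata.name to be a valid DNS subdomain; a small standalone check, illustrative only, simplified to a single regex and not part of this repository, could look like this:

# Illustrative sketch; build_job_name is a hypothetical helper mirroring the
# f'trainjob-{...}' pattern used above.
import re
import uuid

# Simplified RFC 1123 subdomain rule: lowercase alphanumerics, '-' and '.',
# starting and ending with an alphanumeric, at most 253 characters.
DNS_SUBDOMAIN = re.compile(r"^[a-z0-9]([a-z0-9.-]{0,251}[a-z0-9])?$")

def build_job_name(description_id) -> str:
    name = f"trainjob-{description_id}"
    if not DNS_SUBDOMAIN.match(name):
        raise ValueError(f"{name!r} is not a valid Kubernetes object name")
    return name

print(build_job_name(uuid.uuid4()))  # e.g. trainjob-3f2b9c0e-...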
