Minor revision from 'live' demo
JMGaljaard committed Mar 28, 2022
1 parent d3d68c8 commit ee107eb
Showing 4 changed files with 34 additions and 85 deletions.
4 changes: 0 additions & 4 deletions Dockerfile
@@ -24,7 +24,3 @@ RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install -r require
ADD fltk fltk
ADD configs configs
ADD charts charts

# Update relevant runtime configuration for experiment
COPY configs/ configs/

64 changes: 8 additions & 56 deletions configs/tasks/example_arrival_config.json
@@ -1,64 +1,14 @@
[ {
"jobClassParameters": [
{
"networkConfiguration": {
"network": "FashionMNISTCNN",
"dataset": "MNIST"
},
"systemParameters": {
"dataParallelism": "3",
"executorCores": "1",
"executorMemory": "1Gi",
"action": "train"
},
"hyperParameters": {
"batchSize": 128,
"maxEpoch": 10,
"learningRate": 0.01,
"learningrateDecay": 0.0002,
"decayStepSize": 50
},
"classProbability": 0.1,
"priorities": [
{ "priority": 1, "probability": 1 }
]
}
],
"lambda": 40,
"preemptJobs": 0
},
[
{
"jobClassParameters": [
{
"networkConfiguration": {
"network": "FashionMNISTResNet",
"dataset": "MNIST"
},
"systemParameters": {
"dataParallelism": "1",
"executorCores": "1",
"executorMemory": "1Gi",
"action": "train"
},
"hyperParameters": {
"batchSize": "128",
"maxEpoch": "5",
"learningRate": "0.01",
"learningrateDecay": "0.0002"
},
"classProbability": 0.1,
"priorities": [
{ "priority": 1, "probability": 1 }
]
},
{
"networkConfiguration": {
"network": "FashionMNISTCNN",
"dataset": "MNIST"
"dataset": "FashionMNIST"
},
"systemParameters": {
"dataParallelism": "1",
"executorCores": "1",
"executorCores": "1000m",
"executorMemory": "1Gi",
"action": "train"
},
@@ -68,10 +18,12 @@
"learningRate": "0.01",
"learningrateDecay": "0.0002"
},
"classProbability": 0.8,
"classProbability": 0.1,
"priorities": [
{ "priority": 0, "probability": 0.9 },
{ "priority": 1, "probability": 0.1 }
{
"priority": 1,
"probability": 1
}
]
}
],
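
For reference, each entry in this arrival config describes a job class (network and dataset, requested resources, hyperparameters) together with a classProbability, a priority distribution, and an arrival rate "lambda". A minimal sketch of how one such entry could be consumed, written as a standalone illustration rather than FLTK's own arrival generator, and assuming "lambda" is read as a mean inter-arrival time in seconds:

# Illustrative sketch only, not the FLTK orchestrator.
import json
import random

with open("configs/tasks/example_arrival_config.json") as fp:
    train_tasks = json.load(fp)

task = train_tasks[0]

# Pick a job class according to classProbability (random.choices normalises the weights).
job_class = random.choices(
    task["jobClassParameters"],
    weights=[c["classProbability"] for c in task["jobClassParameters"]],
)[0]

# Pick a priority for this job according to the per-priority probabilities.
priority = random.choices(
    job_class["priorities"],
    weights=[p["probability"] for p in job_class["priorities"]],
)[0]["priority"]

# Draw the time until the next arrival; treating "lambda" as the mean
# inter-arrival time is an assumption made for this sketch.
inter_arrival_s = random.expovariate(1.0 / task["lambda"])

print(job_class["networkConfiguration"]["network"], priority, round(inter_arrival_s, 2))
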
49 changes: 25 additions & 24 deletions fltk/client.py
@@ -206,31 +206,32 @@ def run_epochs(self) -> List[EpochData]:
epoch_results = []
for epoch in range(1, max_epoch):
train_loss = self.train(epoch)

# Let only the 'master node' work on training. Possibly DDP can be used
# to have a distributed test loader as well to speed up (would require
# aggregation of data.
# Example https://github.com/fabio-deep/Distributed-Pytorch-Boilerplate/blob/0206247150720ca3e287e9531cb20ef68dc9a15f/src/datasets.py#L271-L303.
elapsed_time_train = datetime.datetime.now() - start_time_train
train_time_ms = int(elapsed_time_train.total_seconds() * 1000)

start_time_test = datetime.datetime.now()
accuracy, test_loss, class_precision, class_recall, confusion_mat = self.test()

elapsed_time_test = datetime.datetime.now() - start_time_test
test_time_ms = int(elapsed_time_test.total_seconds() * 1000)

data = EpochData(epoch_id=epoch,
duration_train=train_time_ms,
duration_test=test_time_ms,
loss_train=train_loss,
accuracy=accuracy,
loss=test_loss,
class_precision=class_precision,
class_recall=class_recall,
confusion_mat=confusion_mat)

epoch_results.append(data)
if self._id == 0:
# Let only the 'master node' work on training. Possibly DDP can be used
# to have a distributed test loader as well to speed up (would require
# aggregation of data.
# Example https://github.com/fabio-deep/Distributed-Pytorch-Boilerplate/blob/0206247150720ca3e287e9531cb20ef68dc9a15f/src/datasets.py#L271-L303.
elapsed_time_train = datetime.datetime.now() - start_time_train
train_time_ms = int(elapsed_time_train.total_seconds() * 1000)

start_time_test = datetime.datetime.now()
accuracy, test_loss, class_precision, class_recall, confusion_mat = self.test()

elapsed_time_test = datetime.datetime.now() - start_time_test
test_time_ms = int(elapsed_time_test.total_seconds() * 1000)

data = EpochData(epoch_id=epoch,
duration_train=train_time_ms,
duration_test=test_time_ms,
loss_train=train_loss,
accuracy=accuracy,
loss=test_loss,
class_precision=class_precision,
class_recall=class_recall,
confusion_mat=confusion_mat)

epoch_results.append(data)
self.log_progress(data, epoch)
return epoch_results

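The comment kept in this hunk hints at a possible follow-up: rather than letting only the rank-0 'master node' run the test pass, the test set could be sharded across workers and the metrics aggregated afterwards, as in the linked boilerplate. A minimal sketch of that idea, assuming an already-initialised torch.distributed process group and generic model and test_set objects (none of these names come from fltk/client.py):

# Sketch only, not part of this commit.
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler

def distributed_test(model, test_set, device, batch_size=128):
    # Each rank evaluates its own shard of the test set. Note that
    # DistributedSampler pads with duplicate samples when the dataset size
    # is not divisible by the world size, which slightly skews the counts.
    sampler = DistributedSampler(test_set, shuffle=False)
    loader = DataLoader(test_set, batch_size=batch_size, sampler=sampler)

    correct = torch.zeros(1, device=device)
    total = torch.zeros(1, device=device)
    model.eval()
    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            predictions = model(inputs).argmax(dim=1)
            correct += (predictions == targets).sum()
            total += targets.size(0)

    # Sum the per-rank counts so every rank ends up with the global accuracy.
    dist.all_reduce(correct, op=dist.ReduceOp.SUM)
    dist.all_reduce(total, op=dist.ReduceOp.SUM)
    return (correct / total).item()
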
2 changes: 1 addition & 1 deletion fltk/util/cluster/client.py
@@ -334,7 +334,7 @@ def construct(self) -> V1PyTorchJob:
job = V1PyTorchJob(
api_version="kubeflow.org/v1",
kind="PyTorchJob",
metadata=V1ObjectMeta(name=str(self._buildDescription.id), namespace='test'),
metadata=V1ObjectMeta(name=f'trainjob-{self._buildDescription.id}', namespace='test'),
spec=self._buildDescription.spec)
return job

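The functional change here prefixes the generated resource name with 'trainjob-', presumably to make the PyTorchJob objects easier to recognise in the cluster. Whatever prefix is chosen, Kubernetes still requires metadata.name to be a valid DNS subdomain; a small standalone check, illustrative only, simplified to a single regex and not part of this repository, could look like this:

# Illustrative sketch; build_job_name is a hypothetical helper mirroring the
# f'trainjob-{...}' pattern used above.
import re
import uuid

# Simplified RFC 1123 subdomain rule: lowercase alphanumerics, '-' and '.',
# starting and ending with an alphanumeric, at most 253 characters.
DNS_SUBDOMAIN = re.compile(r"^[a-z0-9]([a-z0-9.-]{0,251}[a-z0-9])?$")

def build_job_name(description_id) -> str:
    name = f"trainjob-{description_id}"
    if not DNS_SUBDOMAIN.match(name):
        raise ValueError(f"{name!r} is not a valid Kubernetes object name")
    return name

print(build_job_name(uuid.uuid4()))  # e.g. trainjob-3f2b9c0e-...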
