From 902d8256882eaa60c5ea7f8b1af4f5a50daf48e7 Mon Sep 17 00:00:00 2001 From: Lydia Date: Thu, 13 Jan 2022 10:57:29 +0100 Subject: [PATCH] Update offloading --- Dockerfile | 1 + configs/experiment.yaml | 1 + configs/experiment_deadline.yaml | 13 ++++--- configs/experiment_freeze.yaml | 28 ++++++++++++++ configs/experiment_offload.yaml | 28 ++++++++++++++ configs/experiment_swyh.yaml | 28 ++++++++++++++ configs/experiment_swyh_first_long.yaml | 29 ++++++++++++++ configs/experiment_swyh_warmup.yaml | 28 ++++++++++++++ configs/experiment_vanilla.yaml | 1 + deploy/templates/client_stub_default.yml | 1 + deploy/templates/client_stub_medium.yml | 2 +- deploy/templates/client_stub_slow.yml | 4 +- deploy/templates/system_stub.yml | 1 + fltk/client.py | 47 +++++++++++++---------- fltk/federator.py | 41 ++++++++++++++++---- fltk/util/analyze.py | 33 +++++++++++++++- fltk/util/base_config.py | 6 +++ run_multi_exp.bash | 48 ++++++++++++++++++++++++ 18 files changed, 303 insertions(+), 37 deletions(-) create mode 100644 configs/experiment_freeze.yaml create mode 100644 configs/experiment_offload.yaml create mode 100644 configs/experiment_swyh.yaml create mode 100644 configs/experiment_swyh_first_long.yaml create mode 100644 configs/experiment_swyh_warmup.yaml create mode 100644 run_multi_exp.bash diff --git a/Dockerfile b/Dockerfile index 006c97d0..abb7ce0a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,4 +46,5 @@ COPY fltk ./fltk COPY configs ./configs #CMD python3 ./fltk/__main__.py single configs/experiment.yaml --rank=$RANK CMD python3 -m fltk single configs/experiment_vanilla.yaml --rank=$RANK +CMD python3 -m fltk single $EXP_CONFIG --rank=$RANK #CMD python3 setup.py \ No newline at end of file diff --git a/configs/experiment.yaml b/configs/experiment.yaml index 62ee3a93..70296073 100644 --- a/configs/experiment.yaml +++ b/configs/experiment.yaml @@ -11,6 +11,7 @@ experiment_prefix: 'experiment_sample' offload_stategy: vanilla profiling_time: 100 deadline: 500 +warmup_round: true output_location: 'output' tensor_board_active: true clients_per_round: 2 diff --git a/configs/experiment_deadline.yaml b/configs/experiment_deadline.yaml index 5ffdca23..c038e3a6 100644 --- a/configs/experiment_deadline.yaml +++ b/configs/experiment_deadline.yaml @@ -1,19 +1,22 @@ --- # Experiment configuration -total_epochs: 4 +total_epochs: 20 epochs_per_cycle: 1 wait_for_clients: true net: Cifar10CNN dataset: cifar10 # Use cuda is available; setting to false will force CPU cuda: false -experiment_prefix: 'offloading_deadline' -offload_stategy: offload +experiment_prefix: 'exp_offload_deadline' +offload_stategy: deadline +profiling_time: 50 +deadline: 140 +warmup_round: false output_location: 'output' tensor_board_active: true clients_per_round: 2 -# sampler: "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) -sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +sampler: "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +#sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) sampler_args: - 0.07 # label limit || q probability || alpha || unused - 42 # random seed || random seed || random seed || unused diff --git a/configs/experiment_freeze.yaml b/configs/experiment_freeze.yaml new file mode 100644 index 00000000..78631070 --- /dev/null +++ b/configs/experiment_freeze.yaml @@ -0,0 +1,28 @@ +--- +# Experiment configuration +total_epochs: 20 +epochs_per_cycle: 1 +wait_for_clients: true +net: Cifar10CNN +dataset: cifar10 +# Use cuda is available; setting to false will force CPU +cuda: false +experiment_prefix: 'exp_freeze_deadline' +offload_stategy: freeze +profiling_time: 50 +deadline: 140 +warmup_round: false +output_location: 'output' +tensor_board_active: true +clients_per_round: 2 +sampler: "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +#sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +sampler_args: + - 0.07 # label limit || q probability || alpha || unused + - 42 # random seed || random seed || random seed || unused +system: + federator: + hostname: '10.5.0.11' + nic: 'eth0' + clients: + amount: 2 diff --git a/configs/experiment_offload.yaml b/configs/experiment_offload.yaml new file mode 100644 index 00000000..ccf8c0c1 --- /dev/null +++ b/configs/experiment_offload.yaml @@ -0,0 +1,28 @@ +--- +# Experiment configuration +total_epochs: 1 +epochs_per_cycle: 1 +wait_for_clients: true +net: Cifar10CNN +dataset: cifar10 +# Use cuda is available; setting to false will force CPU +cuda: false +experiment_prefix: 'exp_model_offload_deadline_fedavg_test' +offload_stategy: offload +profiling_time: 50 +deadline: 140 +warmup_round: false +output_location: 'output' +tensor_board_active: true +clients_per_round: 2 +sampler: "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +#sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +sampler_args: + - 0.07 # label limit || q probability || alpha || unused + - 42 # random seed || random seed || random seed || unused +system: + federator: + hostname: '10.5.0.11' + nic: 'eth0' + clients: + amount: 2 diff --git a/configs/experiment_swyh.yaml b/configs/experiment_swyh.yaml new file mode 100644 index 00000000..86b185fd --- /dev/null +++ b/configs/experiment_swyh.yaml @@ -0,0 +1,28 @@ +--- +# Experiment configuration +total_epochs: 20 +epochs_per_cycle: 1 +wait_for_clients: true +net: Cifar10CNN +dataset: cifar10 +# Use cuda is available; setting to false will force CPU +cuda: false +experiment_prefix: 'exp_swyh_deadline' +offload_stategy: swyh +profiling_time: 50 +deadline: 140 +warmup_round: false +output_location: 'output' +tensor_board_active: true +clients_per_round: 2 +sampler: "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +#sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +sampler_args: + - 0.07 # label limit || q probability || alpha || unused + - 42 # random seed || random seed || random seed || unused +system: + federator: + hostname: '10.5.0.11' + nic: 'eth0' + clients: + amount: 2 diff --git a/configs/experiment_swyh_first_long.yaml b/configs/experiment_swyh_first_long.yaml new file mode 100644 index 00000000..7089d52b --- /dev/null +++ b/configs/experiment_swyh_first_long.yaml @@ -0,0 +1,29 @@ +--- +# Experiment configuration +total_epochs: 20 +epochs_per_cycle: 1 +wait_for_clients: true +net: Cifar10CNN +dataset: cifar10 +# Use cuda is available; setting to false will force CPU +cuda: false +experiment_prefix: 'exp_swyh_first_long_deadline' +offload_stategy: swyh +profiling_time: 50 +deadline: 140 +first_deadline: 400 +warmup_round: false +output_location: 'output' +tensor_board_active: true +clients_per_round: 2 +sampler: "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +#sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +sampler_args: + - 0.07 # label limit || q probability || alpha || unused + - 42 # random seed || random seed || random seed || unused +system: + federator: + hostname: '10.5.0.11' + nic: 'eth0' + clients: + amount: 2 diff --git a/configs/experiment_swyh_warmup.yaml b/configs/experiment_swyh_warmup.yaml new file mode 100644 index 00000000..72400588 --- /dev/null +++ b/configs/experiment_swyh_warmup.yaml @@ -0,0 +1,28 @@ +--- +# Experiment configuration +total_epochs: 20 +epochs_per_cycle: 1 +wait_for_clients: true +net: Cifar10CNN +dataset: cifar10 +# Use cuda is available; setting to false will force CPU +cuda: false +experiment_prefix: 'exp_swyh__warmup_deadline' +offload_stategy: swyh +profiling_time: 50 +deadline: 140 +warmup_round: true +output_location: 'output' +tensor_board_active: true +clients_per_round: 2 +sampler: "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +#sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +sampler_args: + - 0.07 # label limit || q probability || alpha || unused + - 42 # random seed || random seed || random seed || unused +system: + federator: + hostname: '10.5.0.11' + nic: 'eth0' + clients: + amount: 2 diff --git a/configs/experiment_vanilla.yaml b/configs/experiment_vanilla.yaml index a8c10a79..2ab96331 100644 --- a/configs/experiment_vanilla.yaml +++ b/configs/experiment_vanilla.yaml @@ -11,6 +11,7 @@ experiment_prefix: 'exp_offload_vanilla' offload_stategy: vanilla profiling_time: 100 deadline: 500 +warmup_round: false output_location: 'output' tensor_board_active: true clients_per_round: 2 diff --git a/deploy/templates/client_stub_default.yml b/deploy/templates/client_stub_default.yml index 838cf699..5ff5eeb5 100644 --- a/deploy/templates/client_stub_default.yml +++ b/deploy/templates/client_stub_default.yml @@ -12,6 +12,7 @@ client_name: # name can be anything - PYTHONUNBUFFERED=1 - RANK={rank} - WORLD_SIZE={world_size} + - EXP_CONFIG=${EXP_CONFIG_FILE} ports: - "5002:5000" # {machine-port}:{docker-port} depends_on: diff --git a/deploy/templates/client_stub_medium.yml b/deploy/templates/client_stub_medium.yml index 6037ce44..0d3ded62 100644 --- a/deploy/templates/client_stub_medium.yml +++ b/deploy/templates/client_stub_medium.yml @@ -12,6 +12,7 @@ client_name: # name can be anything - PYTHONUNBUFFERED=1 - RANK={rank} - WORLD_SIZE={world_size} + - EXP_CONFIG=${EXP_CONFIG_FILE} ports: - "5002:5000" # {machine-port}:{docker-port} depends_on: @@ -20,4 +21,3 @@ client_name: # name can be anything resources: limits: cpus: '1' - memory: 1024M diff --git a/deploy/templates/client_stub_slow.yml b/deploy/templates/client_stub_slow.yml index 7d541d65..5f39b9b3 100644 --- a/deploy/templates/client_stub_slow.yml +++ b/deploy/templates/client_stub_slow.yml @@ -12,6 +12,7 @@ client_name: # name can be anything - PYTHONUNBUFFERED=1 - RANK={rank} - WORLD_SIZE={world_size} + - EXP_CONFIG=${EXP_CONFIG_FILE} ports: - "5002:5000" # {machine-port}:{docker-port} depends_on: @@ -19,5 +20,4 @@ client_name: # name can be anything deploy: resources: limits: - cpus: '0.5' - memory: 1024M + cpus: '0.5' \ No newline at end of file diff --git a/deploy/templates/system_stub.yml b/deploy/templates/system_stub.yml index 4f05dbfc..53159b83 100644 --- a/deploy/templates/system_stub.yml +++ b/deploy/templates/system_stub.yml @@ -14,6 +14,7 @@ services: - PYTHONUNBUFFERED=1 - RANK=0 - WORLD_SIZE={world_size} + - EXP_CONFIG=${EXP_CONFIG_FILE} ports: - "5000:5000" # {machine-port}:{docker-port} networks: diff --git a/fltk/client.py b/fltk/client.py index 7c5fa710..2826c053 100644 --- a/fltk/client.py +++ b/fltk/client.py @@ -15,6 +15,7 @@ from torch.distributed.rpc import RRef from fltk.schedulers import MinCapableStepLR +from fltk.strategy.aggregation import FedAvg from fltk.strategy.offloading import OffloadingStrategy from fltk.util.arguments import Arguments from fltk.util.fed_avg import average_nn_parameters @@ -25,10 +26,15 @@ from fltk.util.profiler import Profiler from fltk.util.results import EpochData -logging.basicConfig(level=logging.DEBUG) +logging.basicConfig( + level=logging.DEBUG, + + format='%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s', +) global_dict = {} global_model_weights = {} +global_model_data_size = 0 global_offload_received = False @@ -76,9 +82,10 @@ def __init__(self, id, log_rref, rank, world_size, config = None): logging.info(f'Welcome to client {id}') self.id = id global_dict['id'] = id - global global_model_weights, global_offload_received + global global_model_weights, global_offload_received, global_model_data_size global_model_weights = None global_offload_received = False + global_model_data_size = 0 self.log_rref = log_rref self.rank = rank self.world_size = world_size @@ -262,10 +269,11 @@ def report_performance_estimate(self, performance_data): return _remote_method_async(Federator.perf_est_endpoint, self.server_ref, self.id, performance_data) @staticmethod - def offload_receive_endpoint(model_weights): + def offload_receive_endpoint(model_weights, num_train_samples): print(f'Got the offload_receive_endpoint endpoint') - global global_model_weights, global_offload_received + global global_model_weights, global_offload_received, global_model_data_size global_model_weights = copy.deepcopy(model_weights.copy()) + global_model_data_size = num_train_samples global_offload_received = True @staticmethod @@ -294,7 +302,7 @@ def unfreeze_layers(self): for param in self.net.parameters(): param.requires_grad = True - def train(self, epoch, deadline: int = None): + def train(self, epoch, deadline: int = None, warmup=False): """ Different modes: @@ -318,13 +326,11 @@ def train(self, epoch, deadline: int = None): :type epoch: int """ start_time = time.time() - deadline_threshold = 5 + deadline_threshold = 10 train_stop_time = None if self.deadline_enabled and deadline is not None: train_stop_time = start_time + deadline - deadline_threshold - strategy = OffloadingStrategy.VANILLA - # Ignore profiler for now # p = Profiler() # p.attach(self.net) @@ -356,13 +362,13 @@ def train(self, epoch, deadline: int = None): for i, (inputs, labels) in enumerate(self.dataset.get_train_loader(), 0): start_train_time = time.time() - if self.offload_enabled: + if self.offload_enabled and not warmup: # Check if there is a call to offload if self.call_to_offload: self.args.get_logger().info('Got call to offload model') model_weights = self.get_nn_parameters() - ret = rpc.rpc_sync(self.client_to_offload_to, Client.offload_receive_endpoint, args=([model_weights])) + ret = rpc.rpc_sync(self.client_to_offload_to, Client.offload_receive_endpoint, args=([model_weights, i])) print(f'Result of rref: {ret}') self.call_to_offload = False @@ -375,13 +381,15 @@ def train(self, epoch, deadline: int = None): if global_offload_received: self.args.get_logger().info('Merging offloaded model') self.args.get_logger().info('FedAvg locally with offloaded model') - updated_weights = average_nn_parameters([self.get_nn_parameters(), global_model_weights]) + updated_weights = FedAvg({'own': self.get_nn_parameters(), 'remote': global_model_weights}, {'own': i, 'remote': global_model_data_size}) + + # updated_weights = average_nn_parameters([self.get_nn_parameters(), global_model_weights]) self.args.get_logger().info('Updating local weights due to offloading') self.update_nn_parameters(updated_weights) global_offload_received = False global_model_weights = None - if self.deadline_enabled: + if self.deadline_enabled and not warmup: # Deadline if train_stop_time is not None: if time.time() >= train_stop_time: @@ -435,7 +443,7 @@ def train(self, epoch, deadline: int = None): logging.info(f'Estimated training time is {est_total_time}') self.report_performance_estimate((time_per_batch, est_total_time, number_of_training_samples)) - if self.freeze_layers_enabled: + if self.freeze_layers_enabled and not warmup: logging.info(f'Checking if need to freeze layers ? {est_total_time} > {deadline}') if est_total_time > deadline: logging.info('Will freeze layers to speed up computation') @@ -445,7 +453,7 @@ def train(self, epoch, deadline: int = None): # logging.info(f'Batch time is {batch_duration}') # Break away from loop for debug purposes - # if i > 50: + # if i > 5: # break control_end_time = time.time() @@ -453,8 +461,8 @@ def train(self, epoch, deadline: int = None): logging.info(f'Measure end time is {(control_end_time - control_start_time)}') logging.info(f'Trained on {training_process} samples') - - self.scheduler.step() + if not warmup: + self.scheduler.step() # Reset the layers self.unfreeze_layers() @@ -502,13 +510,14 @@ def test(self): return accuracy, loss, class_precision, class_recall - def run_epochs(self, num_epoch, deadline: int = None): + def run_epochs(self, num_epoch, deadline: int = None, warmup=False): start_time_train = datetime.datetime.now() self.dataset.get_train_sampler().set_epoch_size(num_epoch) # Train locally - loss, weights, training_process = self.train(self.epoch_counter, deadline) - self.epoch_counter += num_epoch + loss, weights, training_process = self.train(self.epoch_counter, deadline, warmup) + if not warmup: + self.epoch_counter += num_epoch elapsed_time_train = datetime.datetime.now() - start_time_train train_time_ms = int(elapsed_time_train.total_seconds()*1000) diff --git a/fltk/federator.py b/fltk/federator.py index 8790747c..b9f72ffc 100644 --- a/fltk/federator.py +++ b/fltk/federator.py @@ -26,11 +26,14 @@ from pathlib import Path import logging -from fltk.util.profile_plots import stability_plot, parse_stability_data +# from fltk.util.profile_plots import stability_plot, parse_stability_data from fltk.util.results import EpochData from fltk.util.tensor_converter import convert_distributed_data_into_numpy -logging.basicConfig(level=logging.DEBUG) +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s', +) def _call_method(method, rref, *args, **kwargs): @@ -119,6 +122,9 @@ class Federator: swyh_enabled = False freeze_layers_enabled = False offload_enabled = False + warmup_active = False + + exp_start_time = 0 strategy = OffloadingStrategy.VANILLA @@ -278,10 +284,15 @@ def ask_client_to_offload(self, client1_ref, client2_ref): _remote_method(Client.call_to_offload_endpoint, client1_ref, client2_ref) logging.info(f'Done with call to offload') - def remote_run_epoch(self, epochs): + def remote_run_epoch(self, epochs, warmup=False, first_epoch=False): + if warmup: + logging.info('This is a WARMUP round') start_epoch_time = time.time() deadline = self.config.deadline deadline_time = self.config.deadline + if first_epoch: + deadline = self.config.first_deadline + deadline_time = self.config.first_deadline """ 1. Client selection 2. Run local updates @@ -318,7 +329,7 @@ def remote_run_epoch(self, epochs): deadline = 0 responses: List[ClientResponse] = [] for client in selected_clients: - cr = ClientResponse(self.response_id, client, _remote_method_async(Client.run_epochs, client.ref, num_epoch=epochs, deadline=deadline)) + cr = ClientResponse(self.response_id, client, _remote_method_async(Client.run_epochs, client.ref, num_epoch=epochs, deadline=deadline, warmup=warmup)) self.response_id += 1 self.response_list.append(cr) responses.append(cr) @@ -343,7 +354,7 @@ def reached_deadline(): has_not_called = True show_perf_data = True - while not all_finished and not (self.deadline_enabled and reached_deadline()): + while not all_finished and not ((self.deadline_enabled and reached_deadline()) or warmup): # if self.deadline_enabled and reached_deadline() # if has_not_called and (time.time() -start) > 10: # logging.info('Sending call to offload') @@ -376,7 +387,7 @@ def reached_deadline(): # weak_client = k # else: # strong_client = k - if self.offload_enabled: + if self.offload_enabled and not warmup: weak_client = est_keys[0] strong_client = est_keys[1] if self.performance_estimate[est_keys[1]][1] > self.performance_estimate[est_keys[0]][1]: @@ -398,9 +409,9 @@ def reached_deadline(): all_finished = False time.sleep(0.1) logging.info(f'Stopped waiting due to all_finished={all_finished} and deadline={reached_deadline()}') - for client_response in responses: - + if warmup: + break client = client_response.client logging.info(f'{client} had a exec time of {client_response.duration()} dropped?={client_response.dropped}') if client_response.dropped: @@ -415,6 +426,7 @@ def reached_deadline(): logging.info(f'{client} had a loss of {epoch_data.loss}') logging.info(f'{client} had a epoch data of {epoch_data}') logging.info(f'{client} has trained on {epoch_data.training_process} samples') + elapsed_time = client_response.end_time - self.exp_start_time client.tb_writer.add_scalar('training loss', epoch_data.loss_train, # for every 1000 minibatches @@ -424,6 +436,9 @@ def reached_deadline(): epoch_data.accuracy, # for every 1000 minibatches self.epoch_counter * client.data_size) + client.tb_writer.add_scalar('accuracy wall time', + epoch_data.accuracy, # for every 1000 minibatches + elapsed_time) client.tb_writer.add_scalar('training loss per epoch', epoch_data.loss_train, # for every 1000 minibatches self.epoch_counter) @@ -448,6 +463,10 @@ def reached_deadline(): # self.tb_writer.add_scalar('training loss', loss, self.epoch_counter * self.test_data.get_client_datasize()) # does not seem to work :( ) self.tb_writer.add_scalar('accuracy', accuracy, self.epoch_counter * self.test_data.get_client_datasize()) self.tb_writer.add_scalar('accuracy per epoch', accuracy, self.epoch_counter) + elapsed_time = time.time() - self.exp_start_time + self.tb_writer.add_scalar('accuracy wall time', + accuracy, # for every 1000 minibatches + elapsed_time) end_epoch_time = time.time() duration = end_epoch_time - start_epoch_time @@ -523,6 +542,12 @@ def run(self): addition = 0 epoch_to_run = self.config.epochs epoch_size = self.config.epochs_per_cycle + + if self.config.warmup_round: + logging.info('Running warmup round') + self.remote_run_epoch(epoch_size, warmup=True) + + self.exp_start_time = time.time() for epoch in range(epoch_to_run): self.process_response_list() logging.info(f'Running epoch {epoch}') diff --git a/fltk/util/analyze.py b/fltk/util/analyze.py index 985bc080..5c304464 100644 --- a/fltk/util/analyze.py +++ b/fltk/util/analyze.py @@ -6,10 +6,39 @@ if __name__ == '__main__': - df = pd.read_csv('output/general_data.csv') + exp_name = 'output/exp_offload_vanilla' + + general_file = f'{exp_name}-general_data.csv' + print(f'Loading data file: {general_file}') + df = pd.read_csv(general_file) print(df) plt.figure() sns.pointplot(data=df, x='epoch', y='accuracy') - plt.show() \ No newline at end of file + plt.title('Accuracy per epoch') + plt.show() + + plt.figure() + # sns.pointplot(data=df[df['epoch'] > 1], x='epoch', y='duration') + sns.pointplot(data=df, x='epoch', y='duration') + plt.title('Train time per epoch') + plt.show() + + dfs = [] + for file in [f'{exp_name}_client1_epochs.csv', f'{exp_name}_client2_epochs.csv']: + dfs.append(pd.read_csv(file)) + client_df = pd.concat(dfs, ignore_index=True) + + print('Loading client data') + plt.figure() + # sns.pointplot(data=client_df[client_df['epoch_id'] > 1], x='epoch_id', y='duration_train', hue='client_id') + sns.pointplot(data=client_df, x='epoch_id', y='duration_train', hue='client_id') + plt.title('Train time per epoch clients') + plt.show() + + plt.figure() + sns.pointplot(data=client_df, x='epoch_id', y='accuracy', hue='client_id') + plt.title('Accuracy per epoch clients') + plt.show() + diff --git a/fltk/util/base_config.py b/fltk/util/base_config.py index e41b92b9..284a3f51 100644 --- a/fltk/util/base_config.py +++ b/fltk/util/base_config.py @@ -46,6 +46,8 @@ def __init__(self): self.offload_strategy = 'vanilla' self.profiling_size = 100 self.deadline = 400 + self.first_deadline = 400 + self.warmup_round = False self.federator_host = '0.0.0.0' self.rank = 0 @@ -119,6 +121,10 @@ def merge_yaml(self, cfg = {}): self.profiling_size = cfg['profiling_size'] if 'deadline' in cfg: self.deadline = cfg['deadline'] + if 'first_deadline' in cfg: + self.first_deadline = cfg['first_deadline'] + if 'warmup_round' in cfg: + self.warmup_round = cfg['warmup_round'] if 'experiment_prefix' in cfg: self.experiment_prefix = cfg['experiment_prefix'] else: diff --git a/run_multi_exp.bash b/run_multi_exp.bash new file mode 100644 index 00000000..b833e6a0 --- /dev/null +++ b/run_multi_exp.bash @@ -0,0 +1,48 @@ +#!/bin/bash + +## declare an array variable +declare -a arr=("configs/experiment_vanilla.yaml" + # "configs/experiment_deadline.yaml" + # "configs/experiment_swyh.yaml" + # "configs/experiment_freeze.yaml" + # "configs/experiment_offload.yaml" + ) +EVENT_FILE="exp_events.txt" +# Check if all files are present +for i in "${arr[@]}" +do +# echo "$i" + if [ ! -f $i ]; then + echo "File not found! Cannot find: $i" +# exit + fi + # or do whatever with individual element of the array +done + +read -p "Do you wish to continue? (y/n)?" choice +case "$choice" in + y|Y ) ;; + n|N ) exit;; + * ) exit;; +esac + +echo "" > $EVENT_FILE + +# Start running experiments +## now loop through the above array +for i in "${arr[@]}" +do + export EXP_CONFIG_FILE="$i" + echo "[$(date +"%T")] Starting $EXP_CONFIG_FILE" + echo "[$(date +"%T")] Starting $EXP_CONFIG_FILE" >> $EVENT_FILE + start_time=$(date +%s) + docker-compose up --build 2>&1 | tee dc_log.txt + end_time=$(date +%s) + # elapsed time with second resolution + elapsed=$(( end_time - start_time )) + echo "[$(date +"%T")] Finished with $EXP_CONFIG_FILE in $elapsed seconds" >> $EVENT_FILE +# docker-compose up + # or do whatever with individual element of the array +done +echo "[$(date +"%T")] Finished all experiments" +echo "[$(date +"%T")] Finished all experiments" >> $EVENT_FILE