From 4e7523519a5820e0308a76761728059b4a0eb4b6 Mon Sep 17 00:00:00 2001 From: Srivatsan Krisnan Date: Fri, 6 Oct 2023 13:54:24 -0400 Subject: [PATCH 01/12] update files oss --- arch_gym/envs/AstraSimEnv.py | 134 +++++++++----- arch_gym/envs/AstraSimWrapper.py | 19 +- arch_gym/envs/envHelpers.py | 8 +- sims/AstraSim/AstraSimRL.csv | 5 + sims/AstraSim/astrasim-archgym | 1 + sims/AstraSim/bo_logs/metadata.riegeli | Bin 0 -> 485 bytes sims/AstraSim/exp_config.ini | 7 + sims/AstraSim/general_workload.txt | 20 ++ sims/AstraSim/gridSearchAstraSim.py | 159 ++++++++++++++++ sims/AstraSim/restructure.py | 32 ---- sims/AstraSim/run_3dfrs.sh | 15 ++ sims/AstraSim/run_general.sh | 7 - sims/AstraSim/trainACOAstraSim.py | 2 +- sims/AstraSim/trainGAAstraSim.py | 9 +- sims/AstraSim/trainRandomWalkerAstraSim.py | 29 ++- sims/AstraSim/trainSingleAgentAstraSim.py | 205 +++++++++++++++++++++ 16 files changed, 538 insertions(+), 114 deletions(-) create mode 100644 sims/AstraSim/AstraSimRL.csv create mode 160000 sims/AstraSim/astrasim-archgym create mode 100644 sims/AstraSim/bo_logs/metadata.riegeli create mode 100644 sims/AstraSim/exp_config.ini create mode 100644 sims/AstraSim/general_workload.txt create mode 100644 sims/AstraSim/gridSearchAstraSim.py delete mode 100644 sims/AstraSim/restructure.py create mode 100755 sims/AstraSim/run_3dfrs.sh create mode 100644 sims/AstraSim/trainSingleAgentAstraSim.py diff --git a/arch_gym/envs/AstraSimEnv.py b/arch_gym/envs/AstraSimEnv.py index 737c5b25..10d0156d 100644 --- a/arch_gym/envs/AstraSimEnv.py +++ b/arch_gym/envs/AstraSimEnv.py @@ -8,6 +8,8 @@ import csv import random +from envHelpers import helpers + settings_file_path = os.path.realpath(__file__) settings_dir_path = os.path.dirname(settings_file_path) proj_root_path = os.path.join(settings_dir_path, '..', '..') @@ -16,18 +18,25 @@ # astra-sim environment class AstraSimEnv(gym.Env): - def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_formulation="None", reward_scaling=1): - # action space = set of all possible actions. Space.sample() returns a random action - self.action_space = gym.spaces.Discrete(16) - # observation space = set of all possible observations - self.observation_space = gym.spaces.Discrete(1) + def __init__(self, rl_form="sa1", max_steps=5, num_agents=1, reward_formulation="None", reward_scaling=1,): + self.rl_form = rl_form + + if self.rl_form == 'sa1': + # action space = set of all possible actions. 
Space.sample() returns a random action
+            # observation space = set of all possible observations
+            self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)  # Box is an n-dimensional array with the given shape
+            self.action_space = gym.spaces.Box(low=0, high=1, shape=(4,), dtype=np.float32)
+            self.helpers = helpers()
+        else:
+            self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
+            self.action_space = gym.spaces.Discrete(16)
+
+
+        # set parameters
         self.max_steps = max_steps
         self.counter = 0
         self.useful_counter = 0
-
-        self.rl_form = rl_form
         self.num_agents = num_agents
         self.reward_formulation = reward_formulation
         self.reward_scaling = reward_scaling
@@ -49,9 +58,15 @@ def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_fo
         self.networks_folder = os.path.join(sim_path, "astrasim-archgym/dse/archgen_v1_knobs/templates/network")
         self.workloads_folder = os.path.join(sim_path, "astrasim-archgym/themis/inputs/workload")
 
+        # Default configs; step() overrides these when the action supplies paths
         self.network_config = os.path.join(self.networks_folder, "3d_fc_ring_switch.json")
-        self.workload_config = os.path.join(sim_path, "realworld_workloads/transformer_1t_fused_only_t.txt")
+        self.workload_config = os.path.join(self.workloads_folder, "all_reduce/allreduce_0.65.txt")
+        self.astrasim_archgym = os.path.join(sim_path, "astrasim-archgym")
+        self.systems_folder = os.path.join(self.astrasim_archgym, "themis/inputs/system")
+        self.network_file = "4d_ring_fc_ring_switch.json"
+        self.system_file = os.path.join(self.systems_folder, "4d_ring_fc_ring_switch_baseline.txt")
+        self.workload_file = "all_reduce/allreduce_0.65.txt"
 
         print("_____________________*****************************_____________________")
@@ -60,6 +75,7 @@
     # reset function
     def reset(self):
+        self.counter = 0
         # get results folder path
         results_folder_path = os.path.join(sim_path, "results", "run_general")
@@ -72,7 +88,13 @@
             csv_files = os.path.join(results_folder_path, csv_files)
             if os.path.exists(csv_files):
                 os.remove(csv_files)
-        return
+
+        # TODO:
+        obs = np.zeros(self.observation_space.shape)
+
+        return obs
+
+
 
     # parses a result csv file and stores it in a dictionary
     def parse_result(self, file_name):
@@ -113,13 +135,45 @@
         print(sum)
         return 1 / (sum ** 0.5)
+
+    # parse system_file (above is the content) into dict
+    def parse_system(self, system_file, action_dict):
+        action_dict['system'] = {}
+        with open(system_file, 'r') as file:
+            lines = file.readlines()
+
+        for line in lines:
+            key, value = line.strip().split(': ')
+            action_dict['system'][key] = value
+
     # give it one action: one set of parameters from json file
     def step(self, action_dict):
-        # write the three config files
-        # with open(self.network_config, "w") as outfile:
-        #     outfile.write(json.dumps(action_dict['network'], indent=4))
-        print(action_dict)
+        if not isinstance(action_dict, dict):
+            with open(settings_dir_path + "/AstraSimRL_2.csv", 'a') as f:
+                writer = csv.writer(f)
+                writer.writerow(action_dict)
+
+            print("STEP: action_dict is a list")
+            action_dict_decoded = {}
+            action_dict_decoded['network'] = {"path": self.network_file}
+            action_dict_decoded['workload'] = {"path": self.workload_file}
+
+            # parse system: initial values
+            self.parse_system(self.system_file, action_dict_decoded)
+
+            # decode the raw action values into named system parameters
+            action_decoded = self.helpers.action_decoder_ga_astraSim(action_dict)
+
+            # change all variables decoded into 
action_dict + for sect in action_decoded: + for key in action_decoded[sect]: + action_dict_decoded[sect][key] = action_decoded[sect][key] + + action_dict = action_dict_decoded + + + if "path" in action_dict["network"]: self.network_config = action_dict["network"]["path"] @@ -135,12 +189,7 @@ def step(self, action_dict): # the action is actually the parsed parameter files print("Step: " + str(self.counter)) - if (self.counter == self.max_steps): - self.done = True - print("Maximum steps reached") - self.reset() - else: - self.counter += 1 + self.counter += 1 # start subrpocess to run the simulation # $1: network, $2: system, $3: workload @@ -174,6 +223,12 @@ def step(self, action_dict): sample_all_reduce_dimension_utilization = self.parse_result(sim_path + '/results/run_general/sample_all_reduce_dimension_utilization.csv') + if (self.counter == self.max_steps): + self.done = True + print("Maximum steps reached") + self.reset() + + # test if the csv files exist (if they don't, the config files are invalid) if ((len(backend_dim_info) == 0 or len(backend_end_to_end) == 0 or len(detailed) == 0 or len(end_to_end) == 0 or @@ -181,50 +236,33 @@ def step(self, action_dict): # set reward to be extremely negative reward = float("-inf") print("reward: ", reward) - return [[], reward, self.done, {"useful_counter": self.useful_counter}, self.state] + return [], reward, self.done, {"useful_counter": self.useful_counter}, self.state else: # only recording the first line because apparently they are all the same? TODO - self.observations = [ - backend_end_to_end["CommsTime"][0], + observations = [ + float(backend_end_to_end["CommsTime"][0]) # end_to_end["fwd compute"][0], # end_to_end["wg compute"][0], # end_to_end["ig compute"][0], # end_to_end["total exposed comm"][0] ] - reward = self.calculate_reward(self.observations) - print("reward: ", reward) - print("observations: ", self.observations) + + reward = self.calculate_reward(observations) + + print("reward: ", reward) + + # reshape observations with shape of observation space + observations = np.reshape(observations, self.observation_space.shape) self.useful_counter += 1 - return [self.observations, reward, self.done, {"useful_counter": self.useful_counter}, self.state] + return observations, reward, self.done, {"useful_counter": self.useful_counter}, self.state if __name__ == "__main__": print("Testing AstraSimEnv") - env = AstraSimEnv(rl_form='random_walker', + env = AstraSimEnv(rl_form='sa1', max_steps=10, num_agents=1, reward_formulation='reward_formulation_1', reward_scaling=1) - - - - - - - """ - Everytime rest happens: - - zero out the observation - - 3/24: - Communication Time (unit: microseconds) - Time breakdowns (forward pass, weight gradient, input gradient) - Exposed communication - - - 3/31: - Catch errors by giving it high negative reward. This way we can test the range. - - - """ diff --git a/arch_gym/envs/AstraSimWrapper.py b/arch_gym/envs/AstraSimWrapper.py index 02ef73d9..d0724abc 100644 --- a/arch_gym/envs/AstraSimWrapper.py +++ b/arch_gym/envs/AstraSimWrapper.py @@ -13,7 +13,7 @@ # limitations under the License. 
"""Wraps an OpenAI Gym environment to be used as a dm_env environment.""" -import sys, os +import sys from typing import Any, Dict, List, Optional from acme import specs @@ -25,8 +25,8 @@ import numpy as np import tree -os.sys.path.insert(0, os.path.abspath('../../')) -from arch_gym.envs.AstraSimEnv import AstraSimEnv +from AstraSimEnv import AstraSimEnv +from envHelpers import helpers # dm = deepmind class AstraSimEnvWrapper(dm_env.Environment): @@ -41,6 +41,7 @@ def __init__(self, environment: gym.Env, self._environment = environment self._reset_next_step = True self._last_info = None + self.helper = helpers() self.env_wrapper_sel = env_wrapper_sel # set useful counter @@ -182,12 +183,12 @@ def _convert_to_spec(space: gym.Space, else: raise ValueError('Unexpected gym space: {}'.format(space)) -def make_astraSim_env(seed: int = 12345, - rl_form = 'macme', +def make_astraSim_env(seed: int = 12234, + rl_form = 'sa1', reward_formulation = 'power', reward_scaling = 'false', - max_steps: int = 100, - num_agents: int = 10) -> dm_env.Environment: + max_steps: int = 1, + num_agents: int = 1) -> dm_env.Environment: """Returns DRAMSys environment.""" print("[DEBUG][Seed]", seed) print("[DEBUG][RL Form]", rl_form) @@ -206,6 +207,6 @@ def make_astraSim_env(seed: int = 12345, env_wrapper_sel = rl_form ) environment = wrappers.SinglePrecisionWrapper(environment) - if(rl_form == 'sa' or rl_form == 'tdm'): - environment = wrappers.CanonicalSpecWrapper(environment, clip=True) + if(rl_form == 'sa1' or rl_form == 'tdm'): + environment = wrappers.CanonicalSpecWrapper(environment, clip=False) return environment \ No newline at end of file diff --git a/arch_gym/envs/envHelpers.py b/arch_gym/envs/envHelpers.py index d6836057..784fb789 100644 --- a/arch_gym/envs/envHelpers.py +++ b/arch_gym/envs/envHelpers.py @@ -797,10 +797,10 @@ def action_decoder_ga_astraSim(self, act_encoded): interDimension_mapper = {0: "baseline", 1: "themis"} # Modified system parameters - act_decoded["system"]["scheduling-policy"] = schedulePolicy_mapper[int(act_encoded[0])] - act_decoded["system"]["collective-optimization"] = collectiveOptimization_mapper[int(act_encoded[1])] - act_decoded["system"]["intra-dimension-scheduling"] = intraDimension_mapper[int(act_encoded[2])] - act_decoded["system"]["inter-dimension-scheduling"] = interDimension_mapper[int(act_encoded[3])] + act_decoded["system"]["scheduling-policy"] = schedulePolicy_mapper[int(round(act_encoded[0]))] + act_decoded["system"]["collective-optimization"] = collectiveOptimization_mapper[int(round(act_encoded[1]))] + act_decoded["system"]["intra-dimension-scheduling"] = intraDimension_mapper[int(round(act_encoded[2]))] + act_decoded["system"]["inter-dimension-scheduling"] = interDimension_mapper[int(round(act_encoded[3]))] return act_decoded diff --git a/sims/AstraSim/AstraSimRL.csv b/sims/AstraSim/AstraSimRL.csv new file mode 100644 index 00000000..4624ae37 --- /dev/null +++ b/sims/AstraSim/AstraSimRL.csv @@ -0,0 +1,5 @@ +0.17908713,0.44089648,0.83359694,0.2673431 +0.5,0.5,0.5,0.5 +0.0,0.0,0.0,1.0 +0.0,0.0,0.0,1.0 +0.17908713,0.44089648,0.83359694,0.2673431 diff --git a/sims/AstraSim/astrasim-archgym b/sims/AstraSim/astrasim-archgym new file mode 160000 index 00000000..2ff6b732 --- /dev/null +++ b/sims/AstraSim/astrasim-archgym @@ -0,0 +1 @@ +Subproject commit 2ff6b7325d0e21229124f1101a7d2941f434267c diff --git a/sims/AstraSim/bo_logs/metadata.riegeli b/sims/AstraSim/bo_logs/metadata.riegeli new file mode 100644 index 
0000000000000000000000000000000000000000..1eb1858ebbf0650c1a6dcbbe7088ba3d5959d3db GIT binary patch literal 485 zcmZo(UvQDP!^@rl3LK#H#9fDMCbd6Y2@`)fU--bwjfabhVFFP4@}Hg2S5w^Y*D^u` zCjQKRFma7RbqSO&&I}PxYG7<bZWM0IisgtV^xja#`{UoT07RP9YChy=ecPf3w3OD73SvQB}Lw|NqU* zvfFDN^Z(3|oP5+oXy*I(2fyv;p3mQZ{Bhxqd3n=wxBSxUXKhrqkXz&s%%hpcSg~*i zGlPQcf+OFTTX?RS(5ajBmT6%EH-kd`{NXT-Y7&RN*o)Y$nd z@~_$Ydt$sZ!{zhe&IrA^Z|eQ_@G$k%lJ|=hq&F@5_ciXeo9%fYX7gj;FJ@glpEQH_ zzr6EDuD>o{&P;8(mJ-&#%{6tYrRt>It;~Lq&z@q5^m5vAzb=^35w5m82CJ7+CJ4-CB1*J645D%hmTp_|=npW6C?S6&L_N C(8rko literal 0 HcmV?d00001 diff --git a/sims/AstraSim/exp_config.ini b/sims/AstraSim/exp_config.ini new file mode 100644 index 00000000..b75f3bc8 --- /dev/null +++ b/sims/AstraSim/exp_config.ini @@ -0,0 +1,7 @@ +[experiment_configuration] +exp_name = resnet18_random_state_2_num_iter_16 +trajectory_dir = ./bo_trajectories/power/resnet18_random_state_2_num_iter_16 +log_dir = ./bo_logs/power/resnet18_random_state_2_num_iter_16 +reward_formulation = power +use_envlogger = True + diff --git a/sims/AstraSim/general_workload.txt b/sims/AstraSim/general_workload.txt new file mode 100644 index 00000000..c95a1f2a --- /dev/null +++ b/sims/AstraSim/general_workload.txt @@ -0,0 +1,20 @@ +HYBRID_TRANSFORMER_FWD_IN_BCKWD model_parallel_NPU_group: 128 checkpoints: 2 0 9 checkpoint_initiates: 2 17 8 +18 +Q1 -1 2343750 NONE 0 2343750 ALLREDUCE 805306368 2343750 ALLREDUCE 240316416 10 +K1 -1 2343750 NONE 0 2343750 NONE 0 2343750 NONE 0 10 +V1 -1 2343750 NONE 0 2343750 NONE 0 2343750 NONE 0 10 +QK1 -1 97656 NONE 0 97656 NONE 0 97656 NONE 0 10 +softmax1 -1 97656 NONE 0 97656 NONE 0 97656 NONE 0 10 +concat1 -1 2343750 ALLREDUCE 805306368 2343750 ALLGATHER 6291456 2343750 NONE 0 10 +X1W1b1 -1 9375000 NONE 0 9375000 ALLREDUCE 805306368 9375000 NONE 0 10 +X1W2b2 -1 9375000 ALLREDUCE 805306368 9375000 NONE 0 9375000 NONE 0 10 +layerNorm1 -1 12207 NONE 0 12207 NONE 0 12207 NONE 0 10 +Q2 -1 2343750 NONE 0 2343750 ALLREDUCE 805306368 2343750 NONE 0 10 +K2 -1 2343750 NONE 0 2343750 NONE 0 2343750 NONE 0 10 +V2 -1 2343750 NONE 0 2343750 NONE 0 2343750 NONE 0 10 +QK2 -1 97656 NONE 0 97656 NONE 0 97656 NONE 0 10 +softmax2 -1 97656 NONE 0 97656 NONE 0 97656 NONE 0 10 +concat2 -1 2343750 ALLREDUCE 805306368 2343750 ALLGATHER 6291456 2343750 NONE 0 10 +X2W1b1 -1 9375000 NONE 0 9375000 ALLREDUCE 805306368 9375000 NONE 0 10 +X2W2b2 -1 9375000 ALLREDUCE 805306368 9375000 NONE 0 9375000 NONE 0 10 +layerNorm2 -1 12207 NONE 0 12207 NONE 0 12207 NONE 0 10 \ No newline at end of file diff --git a/sims/AstraSim/gridSearchAstraSim.py b/sims/AstraSim/gridSearchAstraSim.py new file mode 100644 index 00000000..9c63a6eb --- /dev/null +++ b/sims/AstraSim/gridSearchAstraSim.py @@ -0,0 +1,159 @@ +import os +import sys +import pickle + +from absl import app + +os.sys.path.insert(0, os.path.abspath('../../')) +os.sys.path.insert(0, os.path.abspath('../../arch_gym')) + +from configs import arch_gym_configs + +from arch_gym.envs.envHelpers import helpers +from arch_gym.envs import AstraSimWrapper, AstraSimEnv +import envlogger +import numpy as np +import pandas as pd +import random +import time +import json + +# systems: parse from file into json into generate_random_actions +""" +system_file content: +scheduling-policy: LIFO +endpoint-delay: 1 +active-chunks-per-dimension: 1 +preferred-dataset-splits: 64 +boost-mode: 1 +all-reduce-implementation: direct_ring_halvingDoubling +all-gather-implementation: direct_ring_halvingDoubling +reduce-scatter-implementation: direct_ring_halvingDoubling +all-to-all-implementation: 
direct_direct_direct +collective-optimization: localBWAware +intra-dimension-scheduling: FIFO +inter-dimension-scheduling: baseline +""" +def parse_system(system_file, action_dict): + # parse system_file (above is the content) into dict + action_dict['system'] = {} + with open(system_file, 'r') as file: + lines = file.readlines() + + for line in lines: + key, value = line.strip().split(': ') + action_dict['system'][key] = value + +# parses knobs that we want to experiment with +def parse_knobs(knobs_spec): + SYSTEM_KNOBS = {} + NETWORK_KNOBS = {} + + with open(knobs_spec, 'r') as file: + file_contents = file.read() + parsed_dicts = {} + + # Evaluate the file contents and store the dictionaries in the parsed_dicts dictionary + exec(file_contents, parsed_dicts) + + # Access the dictionaries + SYSTEM_KNOBS = parsed_dicts['SYSTEM_KNOBS'] + NETWORK_KNOBS = parsed_dicts['NETWORK_KNOBS'] + + return SYSTEM_KNOBS, NETWORK_KNOBS + + +# action_type = specify 'network' or 'system +# new_params = parsed knobs from experiment file +def generate_random_actions(action_dict, system_knob, network_knob, args): + dicts = [(system_knob, 'system'), (network_knob, 'network')] + for dict_type, dict_name in dicts: + i = 0 + for knob in dict_type.keys(): + if isinstance(dict_type[knob], set): + action_dict[dict_name][knob] = list(dict_type[knob])[args[i]] + i += 1 + + return action_dict + + +def main(_): + settings_file_path = os.path.realpath(__file__) + settings_dir_path = os.path.dirname(settings_file_path) + proj_root_path = os.path.abspath(settings_dir_path) + + astrasim_archgym = os.path.join(proj_root_path, "astrasim-archgym") + + # TODO: V1 SPEC: + archgen_v1_knobs = os.path.join(astrasim_archgym, "dse/archgen_v1_knobs") + knobs_spec = os.path.join(archgen_v1_knobs, "archgen_v1_knobs_spec.py") + networks_folder = os.path.join(archgen_v1_knobs, "templates/network") + systems_folder = os.path.join(astrasim_archgym, "themis/inputs/system") + workloads_folder = os.path.join(astrasim_archgym, "themis/inputs/workload") + + # DEFINE NETWORK AND SYSTEM AND WORKLOAD + network_file = "4d_ring_fc_ring_switch.json" + system_file = os.path.join(systems_folder, "4d_ring_fc_ring_switch_baseline.txt") + workload_file = "all_reduce/allreduce_0.65.txt" + + env = AstraSimWrapper.make_astraSim_env(rl_form='random_walker') + # env = AstraSimEnv.AstraSimEnv(rl_form='random_walker') + + astrasim_helper = helpers() + + start = time.time() + + step_results = {} + + all_results = [] + best_reward, best_observation, best_actions = 0.0, 0.0, {} + + for sp in [0, 1]: + for co in [0, 1]: + for intra in [0, 1]: + for inter in [0, 1]: + # INITIATE action dict + action_dict = {} + args = [sp, co, intra, inter] + + # if path exists, use path, else parse the sub-dict + action_dict['network'] = {"path": network_file} + action_dict['workload'] = {"path": workload_file} + + # TODO: parse system + parse_system(system_file, action_dict) + + # TODO: parse knobs (all variables to change in action_dict) + system_knob, network_knob = parse_knobs(knobs_spec) + + # pass into generate_random_actions(dimension, knobs) + action_dict = generate_random_actions(action_dict, system_knob, network_knob, args) + + # with open("general_workload.txt", 'w') as file: + # file.write(action["workload"]["value"]) + + # step_result wrapped in TimeStep object + step_result = env.step(action_dict) + step_type, reward, discount, observation = step_result + + step_results['reward'] = [reward] + step_results['action'] = action_dict + step_results['obs'] = observation + + 
all_results.append((reward, observation)) + + if reward and reward > best_reward: + best_reward = reward + best_observation = observation + best_actions = action_dict + + end = time.time() + + print("Best Reward: ", best_reward) + print("Best Observation: ", best_observation) + print("Best Parameters: ", best_actions) + print("All Results: ", all_results) + + +if __name__ == '__main__': + app.run(main) diff --git a/sims/AstraSim/restructure.py b/sims/AstraSim/restructure.py deleted file mode 100644 index f17e131e..00000000 --- a/sims/AstraSim/restructure.py +++ /dev/null @@ -1,32 +0,0 @@ -import os -import csv - -def read_csv_column(file_path): - with open(file_path, 'r', newline='') as csvfile: - reader = csv.reader(csvfile) - column_data = [row if row else "NA" for row in reader] - return column_data - -def merge_columns(file1_path, file2_path, output_file_path): - column1_data = read_csv_column(file1_path) - column2_data = read_csv_column(file2_path) - - # Ensure both columns have the same length - if len(column1_data) != len(column2_data): - raise ValueError("The columns must have the same length.") - # Merge the columns into a list of dictionaries - merged_data = [{'x': column1_data[i], 'y': column2_data[i]} for i in range(len(column1_data))] - - # Write the merged data to a new CSV file - with open(output_file_path, 'w', newline='') as outfile: - fieldnames = ['x', 'y'] - writer = csv.DictWriter(outfile, fieldnames=fieldnames) - writer.writeheader() - writer.writerows(merged_data) - -if __name__ == "__main__": - log_path = "random_walker_logs/latency/resnet18_num_steps_6_num_episodes_1/" - file1_path = os.path.join(log_path, "actions.csv") - file2_path = os.path.join(log_path, "observations.csv") - output_file_path = os.path.join(log_path, "merged.csv") - merge_columns(file1_path, file2_path, output_file_path) diff --git a/sims/AstraSim/run_3dfrs.sh b/sims/AstraSim/run_3dfrs.sh new file mode 100755 index 00000000..657eb6f8 --- /dev/null +++ b/sims/AstraSim/run_3dfrs.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +SCRIPT_DIR=$(dirname "$(realpath $0)") +BINARY="${SCRIPT_DIR:?}"/astrasim-archgym/astra-sim/build/astra_analytical/build/AnalyticalAstra/bin/AnalyticalAstra +WORKLOAD="${SCRIPT_DIR:?}"/3d_frs_baseline_allreduce065/allreduce_065.txt +SYSTEM="${SCRIPT_DIR:?}"/3d_frs_baseline_allreduce065/general_system.txt +NETWORK="${SCRIPT_DIR:?}"/3d_frs_baseline_allreduce065/3d_fc_ring_switch.json +RUN_DIR="${SCRIPT_DIR:?}"/3d_frs_baseline_allreduce065/ + +"${BINARY}" \ + --workload-configuration="${WORKLOAD}" \ + --system-configuration="${SYSTEM}" \ + --network-configuration="${NETWORK}" \ + --path="${RUN_DIR}" \ + --run-name="3d_frs_baseline_allreduce065" > ${RUN_DIR}/stdout \ No newline at end of file diff --git a/sims/AstraSim/run_general.sh b/sims/AstraSim/run_general.sh index 72b54b54..b7813d87 100755 --- a/sims/AstraSim/run_general.sh +++ b/sims/AstraSim/run_general.sh @@ -4,7 +4,6 @@ SCRIPT_DIR=$(dirname "$(realpath $0)") # Absolute paths to useful directories - BINARY="${SCRIPT_DIR:?}"/astrasim-archgym/astra-sim/build/astra_analytical/build/AnalyticalAstra/bin/AnalyticalAstra SYSTEM="${SCRIPT_DIR:?}"/general_system.txt NETWORK="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/network/analytical/$1 @@ -14,8 +13,6 @@ echo "SH NETWORK: ${NETWORK}" echo "SH SYSTEM: ${SYSTEM}" echo "SH WORKLOAD: ${WORKLOAD}" -# WORKLOAD="${SCRIPT_DIR:?}"/astra-sim/inputs/workload/Transformer_HybridParallel.txt # CHANGE THIS - STATS="${SCRIPT_DIR:?}"/results/run_general rm -rf "${STATS}" @@ -27,8 +24,4 @@ mkdir 
"${STATS}" --workload-configuration="${WORKLOAD}" \ --path="${STATS}/" \ --run-name="sample_all_reduce" \ ---num-passes=5 \ ---comm-scale=50 \ ---total-stat-rows=1 \ ---stat-row=0 diff --git a/sims/AstraSim/trainACOAstraSim.py b/sims/AstraSim/trainACOAstraSim.py index 56804567..2ea47fb2 100644 --- a/sims/AstraSim/trainACOAstraSim.py +++ b/sims/AstraSim/trainACOAstraSim.py @@ -22,7 +22,7 @@ flags.DEFINE_string('aco_log_dir', 'aco_logs', 'Directory to store logs.') flags.DEFINE_string('workload', 'stream.stl', 'Which workload to run') flags.DEFINE_string('exp_config_file', 'exp_config.ini', 'Experiment config file.') -flags.DEFINE_integer('depth', 1, 'Depth of the network.') +flags.DEFINE_integer('depth', 10, 'Depth of the network.') flags.DEFINE_string('summary_dir', '.', 'Directory to store summaries.') flags.DEFINE_string('reward_formulation', 'power', 'Reward formulation to use.') flags.DEFINE_bool('use_envlogger', True, 'Use EnvLogger to log environment data.') diff --git a/sims/AstraSim/trainGAAstraSim.py b/sims/AstraSim/trainGAAstraSim.py index 2c4a6854..b7bcf8a0 100644 --- a/sims/AstraSim/trainGAAstraSim.py +++ b/sims/AstraSim/trainGAAstraSim.py @@ -20,7 +20,7 @@ import pandas as pd import matplotlib.pyplot as plt -flags.DEFINE_integer('num_steps', 10, 'Number of training steps.') +flags.DEFINE_integer('num_steps', 20, 'Number of training steps.') flags.DEFINE_integer('num_agents', 4, 'Number of agents.') flags.DEFINE_float('prob_mutation', 0.1, 'Probability of mutation.') flags.DEFINE_string('workload','resnet18', 'ML model name') @@ -86,9 +86,9 @@ def AstraSim_optimization_function(p): workloads_folder = os.path.join(astrasim_archgym, "themis/inputs/workload") # DEFINE NETWORK AND SYSTEM AND WORKLOAD - network_file = "3d_fc_ring_switch.json" - system_file = os.path.join(systems_folder, "3d_fc_ring_switch_baseline.txt") - workload_file = "gnmt_fp16_fused.txt" + network_file = "4d_ring_fc_ring_switch.json" + system_file = os.path.join(systems_folder, "4d_ring_fc_ring_switch_baseline.txt") + workload_file = "all_reduce/allreduce_0.65.txt" # parse knobs system_knob, network_knob = parse_knobs(knobs_spec) @@ -122,7 +122,6 @@ def AstraSim_optimization_function(p): # parse system parse_system(system_file, action_dict) - action_dict_decoded = astraSim_helper.action_decoder_ga_astraSim(p) # change all variables decoded into action_dict diff --git a/sims/AstraSim/trainRandomWalkerAstraSim.py b/sims/AstraSim/trainRandomWalkerAstraSim.py index 29914f71..f6393975 100644 --- a/sims/AstraSim/trainRandomWalkerAstraSim.py +++ b/sims/AstraSim/trainRandomWalkerAstraSim.py @@ -9,7 +9,10 @@ os.sys.path.insert(0, os.path.abspath('../../')) os.sys.path.insert(0, os.path.abspath('../../arch_gym')) -from arch_gym.envs import AstraSimWrapper +from configs import arch_gym_configs + +from arch_gym.envs.envHelpers import helpers +from arch_gym.envs import AstraSimWrapper, AstraSimEnv import envlogger import numpy as np import pandas as pd @@ -20,7 +23,7 @@ # define workload in run_general.sh flags.DEFINE_string('workload', 'resnet18', 'Which AstraSim workload to run?') -flags.DEFINE_integer('num_steps', 2, 'Number of training steps.') +flags.DEFINE_integer('num_steps', 50, 'Number of training steps.') flags.DEFINE_integer('num_episodes', 1, 'Number of training episodes.') flags.DEFINE_bool('use_envlogger', True, 'Use envlogger to log the data.') flags.DEFINE_string('traject_dir', @@ -194,8 +197,6 @@ def parse_system(system_file, action_dict): key, value = line.strip().split(': ') action_dict['system'][key] = 
value - - # def parse_workload(workload_file): @@ -283,9 +284,9 @@ def main(_): workloads_folder = os.path.join(astrasim_archgym, "themis/inputs/workload") # DEFINE NETWORK AND SYSTEM AND WORKLOAD - network_file = "3d_fc_ring_switch.json" - system_file = os.path.join(systems_folder, "3d_fc_ring_switch_baseline.txt") - workload_file = "gnmt_fp16_fused.txt" + network_file = "4d_ring_fc_ring_switch.json" + system_file = os.path.join(systems_folder, "4d_ring_fc_ring_switch_baseline.txt") + workload_file = "all_reduce/allreduce_0.65.txt" exe_path = os.path.join(proj_root_path, "run_general.sh") @@ -297,6 +298,8 @@ def main(_): env = AstraSimWrapper.make_astraSim_env(rl_form='random_walker') # env = AstraSimEnv.AstraSimEnv(rl_form='random_walker') + astrasim_helper = helpers() + # experiment name exp_name = str(FLAGS.workload)+"_num_steps_" + str(FLAGS.num_steps) + "_num_episodes_" + str(FLAGS.num_episodes) # append logs to base path @@ -322,7 +325,7 @@ def main(_): # INITIATE action dict action_dict = {} - # TODO: load network and workloads + # if path exists, use path, else parse the sub-dict action_dict['network'] = {"path": network_file} action_dict['workload'] = {"path": workload_file} @@ -332,6 +335,8 @@ def main(_): # TODO: parse knobs (all variables to change in action_dict) system_knob, network_knob = parse_knobs(knobs_spec) + best_reward, best_observation, best_actions = 0.0, 0.0, {} + for i in range(FLAGS.num_episodes): logging.info('Episode %r', i) @@ -350,11 +355,19 @@ def main(_): step_results['reward'] = [reward] step_results['action'] = action_dict step_results['obs'] = observation + + if reward and reward > best_reward: + best_reward = reward + best_observation = observation + best_actions = action_dict log_results_to_csv(log_path, step_results) end = time.time() + print("Best Reward: ", best_reward) + print("Best Observation: ", best_observation) + print("Best Parameters: ", best_actions) print("Total Time Taken: ", end - start) print("Total Useful Steps: ", env.useful_counter) diff --git a/sims/AstraSim/trainSingleAgentAstraSim.py b/sims/AstraSim/trainSingleAgentAstraSim.py new file mode 100644 index 00000000..54cd4a1e --- /dev/null +++ b/sims/AstraSim/trainSingleAgentAstraSim.py @@ -0,0 +1,205 @@ +import os +import sys +import json +from typing import Optional + +os.sys.path.insert(0, os.path.abspath('../../')) +from configs import arch_gym_configs +os.sys.path.insert(0, os.path.abspath('../../acme/')) +# print(os.sys.path) +# sys.exit() +import envlogger +from acme.agents.jax import ppo +from acme.agents.jax import sac +from acme import wrappers +from acme import specs + +from absl import app +from absl import flags +from absl import logging +from acme.utils import lp_utils +from acme.jax import experiments +from acme.agents.jax import normalization + +from acme.utils.loggers.tf_summary import TFSummaryLogger +from acme.utils.loggers.terminal import TerminalLogger +from acme.utils.loggers.csv import CSVLogger +from acme.utils.loggers import aggregators +from acme.utils.loggers import base + +from arch_gym.envs import AstraSimWrapper + +FLAGS = flags.FLAGS + +# Workload to run for training + +_WORKLOAD = flags.DEFINE_string('workload', 'resnet18', 'Workload to run for training') + +# select which RL algorithm to use +_RL_AGO = flags.DEFINE_string('rl_algo', 'ppo', 'RL algorithm.') + +# select which RL form to use +_RL_FORM = flags.DEFINE_string('rl_form', 'sa1', 'RL form.') + +# Acceptable values for reward: power, latency, and both (both means latency & power) +_REWARD_FORM 
= flags.DEFINE_string('reward_form', 'both', 'Reward form.') + +# Scale reward +_REWARD_SCALE = flags.DEFINE_string('reward_scale', 'false', 'Scale reward.') + +# Hyperparameters for each RL algorithm +_NUM_STEPS = flags.DEFINE_integer('num_steps', 100, 'Number of training steps.') +_EVAL_EVERY = flags.DEFINE_integer('eval_every', 50, 'Number of evaluation steps.') +_EVAL_EPISODES = flags.DEFINE_integer('eval_episodes', 1, 'Number of evaluation episode.') +_SEED = flags.DEFINE_integer('seed', 1, 'Random seed.') +_LEARNING_RATE = flags.DEFINE_float('learning_rate', 1e-5, 'Learning rate.') + + +# Hyperparameters for PPO +_ENTROPY_COST = flags.DEFINE_float('entropy_cost', 0.1, 'Entropy cost.') +_PPO_CLIPPING_EPSILON = flags.DEFINE_float('ppo_clipping_epsilon', 0.2, 'PPO clipping epsilon.') +_CLIP_VALUE = flags.DEFINE_bool('clip_value', False, 'Clip value.') + +# Experiment setup related parameters +_SUMMARYDIR = flags.DEFINE_string('summarydir', './logs', 'Directory to save summaries.') +_ENVLOGGER_DIR = flags.DEFINE_string('envlogger_dir', 'trajectory', 'Directory to save envlogger.') +_USE_ENVLOGGER = flags.DEFINE_bool('use_envlogger', False, 'Use envlogger.') +_RUN_DISTRIBUTED = flags.DEFINE_bool( + 'run_distributed', False, 'Should an agent be executed in a ' + 'distributed way (the default is a single-threaded agent)') + +# Experimental feature to scale RL policy parameters. Ideally want to keep it same as number +# of agents used in multi-agent training. +flags.DEFINE_integer("params_scaling", 1, "Number of training steps") + +def get_directory_name(): + _EXP_NAME = 'Algo_{}_rlform_{}_num_steps_{}_seed_{}_lr_{}_entropy_{}'.format(_RL_AGO.value, _RL_FORM.value,_NUM_STEPS.value, _SEED.value, _LEARNING_RATE.value, _ENTROPY_COST.value) + + return _EXP_NAME + + +def wrap_in_envlogger(env, envlogger_dir): + metadata = { + 'agent_type': FLAGS.rl_algo, + 'rl_form': FLAGS.rl_form, + 'num_steps': FLAGS.num_steps, + 'env_type': type(env).__name__, + } + env = envlogger.EnvLogger(env, + data_directory = envlogger_dir, + metadata = metadata, + max_episodes_per_file = 1000) + return env + + +def _logger_factory(logger_label: str, steps_key: Optional[str] = None, task_instance: Optional[int]=0) -> base.Logger: + """logger factory.""" + _EXP_NAME = get_directory_name() + if logger_label == 'actor': + terminal_logger = TerminalLogger(label=logger_label, print_fn=logging.info) + summarydir = os.path.join(FLAGS.summarydir,_EXP_NAME, logger_label) + tb_logger = TFSummaryLogger(summarydir, label=logger_label, steps_key=steps_key) + csv_logger = CSVLogger(summarydir, label=logger_label) + serialize_fn = base.to_numpy + logger = aggregators.Dispatcher([terminal_logger, tb_logger, csv_logger], serialize_fn) + return logger + elif logger_label == 'learner': + terminal_logger = TerminalLogger(label=logger_label, print_fn=logging.info) + summarydir = os.path.join(FLAGS.summarydir,_EXP_NAME, logger_label) + tb_logger = TFSummaryLogger(summarydir, label=logger_label, steps_key=steps_key) + csv_logger = CSVLogger(summarydir, label=logger_label) + serialize_fn = base.to_numpy + logger = aggregators.Dispatcher([terminal_logger, tb_logger, csv_logger], serialize_fn) + return logger + elif logger_label == 'evaluator': + terminal_logger = TerminalLogger(label=logger_label, print_fn=logging.info) + summarydir = os.path.join(FLAGS.summarydir,_EXP_NAME, logger_label) + tb_logger = TFSummaryLogger(summarydir, label=logger_label, steps_key=steps_key) + csv_logger = CSVLogger(summarydir, label=logger_label) + serialize_fn = 
base.to_numpy + logger = aggregators.Dispatcher([terminal_logger, tb_logger, csv_logger], serialize_fn) + return logger + else: + raise ValueError( + f'Improper value for logger label. Logger_label is {logger_label}') + +def build_experiment_config(): + """Builds the experiment configuration.""" + + if(FLAGS.rl_form == 'tdm'): + env = AstraSimWrapper.make_astraSim_env( + reward_formulation = _REWARD_FORM.value, + reward_scaling = _REWARD_SCALE.value + ) + else: + env = AstraSimWrapper.make_astraSim_env( + rl_form=FLAGS.rl_form, + reward_formulation = _REWARD_FORM.value, + reward_scaling = _REWARD_SCALE.value) + if FLAGS.use_envlogger: + envlogger_dir = os.path.join(FLAGS.summarydir, get_directory_name(), FLAGS.envlogger_dir) + if(not os.path.exists(envlogger_dir)): + os.makedirs(envlogger_dir) + env = wrap_in_envlogger(env, envlogger_dir) + + env_spec = specs.make_environment_spec(env) #TODO + if FLAGS.rl_algo == 'ppo': + config = ppo.PPOConfig(entropy_cost=FLAGS.entropy_cost, + learning_rate=FLAGS.learning_rate, + ppo_clipping_epsilon=FLAGS.ppo_clipping_epsilon, + clip_value=FLAGS.clip_value, + ) + ppo_builder = ppo.PPOBuilder(config) + + if FLAGS.params_scaling > 1: + size = 32 * FLAGS.params_scaling + layer_sizes = (size, size, size) + else: + layer_sizes = (32, 32, 32) + make_eval_policy = lambda network: ppo.make_inference_fn(network, True) + + return experiments.ExperimentConfig( + builder=ppo_builder, + environment_factory=lambda seed: env, + network_factory=lambda spec: ppo.make_networks(env_spec, layer_sizes), + policy_network_factory = ppo.make_inference_fn, + eval_policy_network_factory = make_eval_policy, + seed = FLAGS.seed, + logger_factory=_logger_factory, + max_num_actor_steps=_NUM_STEPS.value) + elif FLAGS.rl_algo == 'sac': + config = sac.SACConfig( + learning_rate=FLAGS.learning_rate, + n_step=FLAGS.n_step, + ) + sac_builder = sac.builder.SACBuilder(config) + size = 32 * FLAGS.params_scaling + return experiments.ExperimentConfig( + builder = sac_builder, + environment_factory = lambda seed: env, + network_factory = lambda spec: sac.make_networks(env_spec, (size, size, size)), + seed = FLAGS.seed, + logger_factory = _logger_factory, + max_num_actor_steps = FLAGS.num_steps) + else: + raise ValueError(f'Improper value for rl_algo. 
rl_algo is {FLAGS.rl_algo}') + +def main(_): + + sim_config = arch_gym_configs.sim_config + config = build_experiment_config() #TODO + if FLAGS.run_distributed: + program = experiments.make_distributed_experiment( + experiment=config, num_actors=4) + lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program)) + else: + experiments.run_experiment( + experiment=config, + eval_every=FLAGS.eval_every, + num_eval_episodes=FLAGS.eval_episodes) + +if __name__ == '__main__': + app.run(main) + + + From 05eab926f36bce87940bf72c56f37db69d53ccf1 Mon Sep 17 00:00:00 2001 From: jared-ni Date: Fri, 6 Oct 2023 13:59:05 -0400 Subject: [PATCH 02/12] try again --- arch_gym/envs/AstraSimWrapper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/arch_gym/envs/AstraSimWrapper.py b/arch_gym/envs/AstraSimWrapper.py index d0724abc..de42ce3b 100644 --- a/arch_gym/envs/AstraSimWrapper.py +++ b/arch_gym/envs/AstraSimWrapper.py @@ -206,6 +206,7 @@ def make_astraSim_env(seed: int = 12234, ), env_wrapper_sel = rl_form ) + environment = wrappers.SinglePrecisionWrapper(environment) if(rl_form == 'sa1' or rl_form == 'tdm'): environment = wrappers.CanonicalSpecWrapper(environment, clip=False) From e2fabc4fd0a5b087fcc6fdd7958996d451b35536 Mon Sep 17 00:00:00 2001 From: jared-ni Date: Fri, 6 Oct 2023 15:44:26 -0400 Subject: [PATCH 03/12] test --- sims/AstraSim/astrasim-archgym | 1 - 1 file changed, 1 deletion(-) delete mode 160000 sims/AstraSim/astrasim-archgym diff --git a/sims/AstraSim/astrasim-archgym b/sims/AstraSim/astrasim-archgym deleted file mode 160000 index 2ff6b732..00000000 --- a/sims/AstraSim/astrasim-archgym +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2ff6b7325d0e21229124f1101a7d2941f434267c From 1e50b1ec9fb5c67a2251ed53ca3fb88c70863c5c Mon Sep 17 00:00:00 2001 From: jared-ni Date: Fri, 6 Oct 2023 15:51:45 -0400 Subject: [PATCH 04/12] test --- .gitmodules | 3 +++ sims/AstraSim/astrasim-archgym | 1 + sims/CFU-Playground/CFU-Playground | 1 + 3 files changed, 5 insertions(+) create mode 160000 sims/AstraSim/astrasim-archgym create mode 160000 sims/CFU-Playground/CFU-Playground diff --git a/.gitmodules b/.gitmodules index 9d7547a5..9425415f 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,3 +2,6 @@ [submodule "sims/AstraSim/astra-sim"] path = sims/AstraSim/astra-sim url = https://github.com/astra-sim/astra-sim.git +[submodule "sims/AstraSim/astrasim-archgym"] + path = sims/AstraSim/astrasim-archgym + url = https://github.com/astra-sim/astrasim-archgym.git diff --git a/sims/AstraSim/astrasim-archgym b/sims/AstraSim/astrasim-archgym new file mode 160000 index 00000000..5a6101e5 --- /dev/null +++ b/sims/AstraSim/astrasim-archgym @@ -0,0 +1 @@ +Subproject commit 5a6101e50371f366b93522b15ed4f5b771207d34 diff --git a/sims/CFU-Playground/CFU-Playground b/sims/CFU-Playground/CFU-Playground new file mode 160000 index 00000000..fb43f80f --- /dev/null +++ b/sims/CFU-Playground/CFU-Playground @@ -0,0 +1 @@ +Subproject commit fb43f80f44c32f81c9cdc1d7c015946f0265fa08 From 0997925ae505a7a4dd50feff32755fbd78037e25 Mon Sep 17 00:00:00 2001 From: jared-ni Date: Fri, 6 Oct 2023 15:55:18 -0400 Subject: [PATCH 05/12] remove again to submodule --- sims/AstraSim/astrasim-archgym | 1 - 1 file changed, 1 deletion(-) delete mode 160000 sims/AstraSim/astrasim-archgym diff --git a/sims/AstraSim/astrasim-archgym b/sims/AstraSim/astrasim-archgym deleted file mode 160000 index 5a6101e5..00000000 --- a/sims/AstraSim/astrasim-archgym +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 5a6101e50371f366b93522b15ed4f5b771207d34 From 
fa347239f3824e8b672a89a94175555cb6179829 Mon Sep 17 00:00:00 2001
From: jared-ni
Date: Fri, 6 Oct 2023 16:03:11 -0400
Subject: [PATCH 06/12] added submodules

---
 sims/AstraSim/astrasim-archgym | 1 +
 1 file changed, 1 insertion(+)
 create mode 160000 sims/AstraSim/astrasim-archgym

diff --git a/sims/AstraSim/astrasim-archgym b/sims/AstraSim/astrasim-archgym
new file mode 160000
index 00000000..5a6101e5
--- /dev/null
+++ b/sims/AstraSim/astrasim-archgym
@@ -0,0 +1 @@
+Subproject commit 5a6101e50371f366b93522b15ed4f5b771207d34

From d3dcde03230245d086ea5adfbe1a2d5f67f2e37e Mon Sep 17 00:00:00 2001
From: Changhai Man
Date: Fri, 20 Oct 2023 15:15:23 +0000
Subject: [PATCH 07/12] update astrasim version, add missing config file, add
 gitkeep to output folder in astrasim env

	modified:   .gitmodules
	new file:   configs/arch_gym_configs.py
	modified:   sims/AstraSim/astrasim-archgym
	new file:   sims/AstraSim/results/run_general/.gitkeep
---
 .gitmodules                                |   2 +-
 configs/arch_gym_configs.py                | 170 +++++++++++++++++++++
 sims/AstraSim/astrasim-archgym             |   2 +-
 sims/AstraSim/results/run_general/.gitkeep |   0
 4 files changed, 172 insertions(+), 2 deletions(-)
 create mode 100644 configs/arch_gym_configs.py
 create mode 100644 sims/AstraSim/results/run_general/.gitkeep

diff --git a/.gitmodules b/.gitmodules
index 9425415f..642139be 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,4 +4,4 @@
 url = https://github.com/astra-sim/astra-sim.git
 [submodule "sims/AstraSim/astrasim-archgym"]
 	path = sims/AstraSim/astrasim-archgym
-	url = https://github.com/astra-sim/astrasim-archgym.git
+	url = https://github.com/changhai0109/astrasim-archgym-public.git
diff --git a/configs/arch_gym_configs.py b/configs/arch_gym_configs.py
new file mode 100644
index 00000000..204db30c
--- /dev/null
+++ b/configs/arch_gym_configs.py
@@ -0,0 +1,170 @@
+import os
+import sys
+import psutil
+
+settings_file_path = os.path.realpath(__file__)
+settings_dir_path = os.path.dirname(settings_file_path)
+proj_root_path = os.path.abspath(settings_dir_path + "/..")
+
+os.sys.path.insert(0, proj_root_path)
+os.sys.path.insert(0, proj_root_path + "/arch_gym")
+os.sys.path.insert(0, proj_root_path + "/arch_gym/sims")
+os.sys.path.insert(0, proj_root_path + "/arch_gym/sims/Timeloop")
+os.sys.path.insert(0, proj_root_path + "/arch_gym/sims/Sniper")
+os.sys.path.insert(0, proj_root_path + "/arch_gym/sims/DRAM")
+
+os.sys.path.insert(0, proj_root_path + "/arch_gym/envs")
+'''
+def check_paths(paths):
+    for path in paths:
+        if not os.path.exists(path):
+            # print in red color if there is an error
+            print("\033[91m", end="")
+            print("Path: {} does not exist".format(path))
+        else:
+            # print in green color if there is no error
+            print("\033[92m", end="")
+            print("Path: {} exists".format(path))
+'''
+
+###############################
+#  Algorithm Configurations   #
+###############################
+# ACO
+aco_config = "default_astrasim.yaml"
+aco_batch_mode = False
+aco_base_path = proj_root_path
+
+ant_count = [2, 4, 8, 16, 32, 64]  # 6
+evaporation = [0.1, 0.25, 0.5, 0.75, 1.0]  # 5
+greediness = [0.0, 0.25, 0.5, 0.75, 1.0]  # 5
+depth = [2, 4, 8, 16]  # 4
+
+# GA
+ga_batch_mode = False
+num_agents = [2, 4, 8, 16, 32, 64]
+num_iter_ga = [8, 16, 32, 64]
+prob_mut = [0.01, 0.05, 0.001]
+
+# BO
+num_iter_bo = [16, 32, 64]
+rand_state_bo = [1, 2, 3, 4, 5]
+
+# Random Walk
+num_steps = [10000, 20000]
+
+# Reinforcement Learning PPO
+rl_agent = False
+
+########################
+#  Target Spec Sniper  #
+########################
+
+# Gainestown latency for 600
+# 30% improvement in latency
+# 15% 
improvement in area +# 25% improvement in power + +target_latency = 23160713.1 +target_power = 30 +target_area = 81 + +################ +# DRAMSys # +################ + +dram_mem_controller_config = os.path.join(proj_root_path, "sims/DRAM/DRAMSys/library/resources/configs/mcconfigs") +dram_mem_controller_config_file = os.path.join(dram_mem_controller_config, "policy.json") +binary_name = "DRAMSys" +exe_path = os.path.join(proj_root_path, "sims/DRAM/binary/DRAMSys") +sim_config = os.path.join(proj_root_path, "sims/DRAM/DRAMSys/library/simulations/ddr3-example.json") +experiment_name = "random_walk.csv" +logdir = os.path.join(proj_root_path, "logs") +dramsys_envlogger_path = os.path.join(proj_root_path, "sims/DRAM/envlogger") +target_power = 1 # mw +target_latency = 0.1 # ns +# Make sure these files exists +dram_sys_paths = [] +dram_sys_paths.append(dram_mem_controller_config) +dram_sys_paths.append(exe_path) +dram_sys_paths.append(sim_config) +dram_sys_paths.append(dram_mem_controller_config_file) +dram_sys_paths.append(logdir) + +dram_sys_workload = ['stream.stl', 'random.stl', 'cloud-1.stl', 'cloud-2.stl'] +#check_paths(dram_sys_paths) +################ +# Sniper # +################ + +sniper_config = os.path.join(proj_root_path, "sims/Sniper/arch_gym_x86.cfg") +sniper_binary_name = "simulate_benchmark.py" +sniper_binary_path = os.path.join(proj_root_path, "sims/Sniper") +sniper_logdir = os.path.join(proj_root_path, "sims/Sniper/logs") +sniper_workload = "600" +sniper_numcores = str(psutil.cpu_count()) +sniper_metric_log = "sniper_metric_log.csv" +spec_workload = "602" +sniper_envlogger_path = os.path.join(proj_root_path, "sims/Sniper/envlogger") +# Todo: Set this to a random string when not using Sniper +sniper_mode = "batch" +dummy_power_file = os.path.join(proj_root_path, "sims/Sniper/") + +# Make sure these files exists +sniper_sim_paths = [] +sniper_sim_paths.append(sniper_config) +sniper_sim_paths.append(sniper_binary_path) +sniper_sim_paths.append(sniper_logdir) +sniper_sim_paths.append(dummy_power_file) +#check_paths(sniper_sim_paths) + +################ +# Timeloop # +################ + +timeloop_binary_name = "simulate_timeloop.py" +timeloop_binary_path = os.path.join(proj_root_path, "sims/Timeloop/") +timeloop_parameters = os.path.join(proj_root_path, "sims/Timeloop/parameters.ini") +timeloop_scriptdir = os.path.join(proj_root_path, "sims/Timeloop/script") +timeloop_outputdir = os.path.join(proj_root_path, "sims/Timeloop/output") +timeloop_archdir = os.path.join(proj_root_path, "sims/Timeloop/arch") +timeloop_mapperdir = os.path.join(proj_root_path, "sims/Timeloop/mapper") +timeloop_workloaddir = os.path.join(proj_root_path, "sims/Timeloop/layer_shapes/AlexNet") +timeloop_numcores = str(psutil.cpu_count()) + +# Make sure these files exists +timeloop_sim_paths = [] +timeloop_sim_paths.append(timeloop_binary_path) +timeloop_sim_paths.append(timeloop_scriptdir) +timeloop_sim_paths.append(timeloop_outputdir) +timeloop_sim_paths.append(timeloop_archdir) +timeloop_sim_paths.append(timeloop_mapperdir) +timeloop_sim_paths.append(timeloop_workloaddir) +#check_paths(timeloop_sim_paths) + +########################## +# Target Spec Timeloop # +########################## + +# 30% improvement in energy +# 15% improvement in area +# 20% improvement in cycles + +target_energy_improv = 0.3 +target_area_improv = 0.15 +target_cycle_improv = 0.2 + +target_energy = 29206 * (1 - target_energy_improv) +target_area = 2.03 * (1 - target_area_improv) +target_cycles = 7885704 * (1 - target_cycle_improv) 
+ +########################## +# Target Spec Mastero # +########################## +mastero_model_path = os.path.join(proj_root_path, "sims/gamma/data/model") +exe_file = os.path.join(proj_root_path, "sims/gamma/cost_model/maestro") +aco_config_file = os.path.join(proj_root_path, "settings/default_maestro.yaml") + +# switch back to default color +print("\033[0m", end="") + diff --git a/sims/AstraSim/astrasim-archgym b/sims/AstraSim/astrasim-archgym index 5a6101e5..53574d2f 160000 --- a/sims/AstraSim/astrasim-archgym +++ b/sims/AstraSim/astrasim-archgym @@ -1 +1 @@ -Subproject commit 5a6101e50371f366b93522b15ed4f5b771207d34 +Subproject commit 53574d2f1d20e26d5143eb0ff0c98b6af338da61 diff --git a/sims/AstraSim/results/run_general/.gitkeep b/sims/AstraSim/results/run_general/.gitkeep new file mode 100644 index 00000000..e69de29b From 794704e2aab8afdc082d4817634e2eb716b033be Mon Sep 17 00:00:00 2001 From: Changhai Date: Sat, 21 Oct 2023 11:38:52 -0400 Subject: [PATCH 08/12] switch astrasim knobs to themis modified: sims/AstraSim/trainGAAstraSim.py modified: sims/AstraSim/trainRandomWalkerAstraSim.py --- sims/AstraSim/trainGAAstraSim.py | 2 +- sims/AstraSim/trainRandomWalkerAstraSim.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sims/AstraSim/trainGAAstraSim.py b/sims/AstraSim/trainGAAstraSim.py index b7bcf8a0..34c9be4a 100644 --- a/sims/AstraSim/trainGAAstraSim.py +++ b/sims/AstraSim/trainGAAstraSim.py @@ -80,7 +80,7 @@ def AstraSim_optimization_function(p): # TODO: V1 SPEC: archgen_v1_knobs = os.path.join(astrasim_archgym, "dse/archgen_v1_knobs") - knobs_spec = os.path.join(archgen_v1_knobs, "archgen_v1_knobs_spec.py") + knobs_spec = os.path.join(archgen_v1_knobs, "themis_knobs_spec.py") networks_folder = os.path.join(archgen_v1_knobs, "templates/network") systems_folder = os.path.join(astrasim_archgym, "themis/inputs/system") workloads_folder = os.path.join(astrasim_archgym, "themis/inputs/workload") diff --git a/sims/AstraSim/trainRandomWalkerAstraSim.py b/sims/AstraSim/trainRandomWalkerAstraSim.py index f6393975..ed5bc9e5 100644 --- a/sims/AstraSim/trainRandomWalkerAstraSim.py +++ b/sims/AstraSim/trainRandomWalkerAstraSim.py @@ -278,7 +278,7 @@ def main(_): # TODO: V1 SPEC: archgen_v1_knobs = os.path.join(astrasim_archgym, "dse/archgen_v1_knobs") - knobs_spec = os.path.join(archgen_v1_knobs, "archgen_v1_knobs_spec.py") + knobs_spec = os.path.join(archgen_v1_knobs, "themis_knobs_spec.py") networks_folder = os.path.join(archgen_v1_knobs, "templates/network") systems_folder = os.path.join(astrasim_archgym, "themis/inputs/system") workloads_folder = os.path.join(astrasim_archgym, "themis/inputs/workload") From 7d722d75e36d2b82b584b62176c6b5b1c7e8e73e Mon Sep 17 00:00:00 2001 From: Jared Ni Date: Tue, 7 Nov 2023 22:22:12 -0500 Subject: [PATCH 09/12] docker and launch gcp --- sims/AstraSim/Dockerfile | 38 +++++++ sims/AstraSim/launch_gcp.py | 213 ++++++++++++++++++++++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 sims/AstraSim/Dockerfile create mode 100644 sims/AstraSim/launch_gcp.py diff --git a/sims/AstraSim/Dockerfile b/sims/AstraSim/Dockerfile new file mode 100644 index 00000000..bb728e75 --- /dev/null +++ b/sims/AstraSim/Dockerfile @@ -0,0 +1,38 @@ +# Dockerfile for Arch-Gym project for other agents (BO, ACO, GA, RW) +# start with miniconda image + + +FROM continuumio/miniconda3 +RUN if ! 
id 1000; then useradd -m -u 1000 clouduser; fi +RUN mkdir /workdir +WORKDIR /workdir +RUN echo "recloning arch-gym rep00000o0" +#RUN git clone --recursive https://srivatsankrishnan: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@github.com/srivatsankrishnan/arch-gym.git +RUN git clone --recursive https://srivatsankrishnan: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@github.com/srivatsankrishnan/arch-gym.git + +RUN cd arch-gym && conda env create -f environment.yml +RUN apt-get update && apt-get -y install libgmp-dev gcc g++ libboost-all-dev +RUN echo "conda activate arch-gym" >> ~/.bashrc +SHELL ["/bin/bash", "--login", "-c"] + + +RUN cd arch-gym/acme && pip install .[jax,tf,testing,envs] && pip install envlogger[tfds] scons && apt-get update && apt-get -y install libgmp-dev && pip install scikit-optimize sympy plotly && conda install --channel conda-forge pygraphviz + + +# The code to run when container is started: + + +RUN chown -R 1000:root /workdir && chmod -R 775 /workdir + + +WORKDIR /workdir/arch-gym/sims/gamma/ + + +# Install cost model +RUN python build.py + +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/arch-gym/lib +RUN echo $LD_LIBRARY_PATH +WORKDIR /workdir/arch-gym/sims/AstraSim/ + +ENTRYPOINT ["conda", "run", "-n", "arch-gym", "python", "launch_gcp.py"] \ No newline at end of file diff --git a/sims/AstraSim/launch_gcp.py b/sims/AstraSim/launch_gcp.py new file mode 100644 index 00000000..5dcfce6b --- /dev/null +++ b/sims/AstraSim/launch_gcp.py @@ -0,0 +1,213 @@ +import subprocess +import numpy as np +from itertools import product +import sys +import os +import yaml +import json +from datetime import date, datetime +os.sys.path.insert(0, os.path.abspath('../../../../')) +from configs import arch_gym_configs + +from absl import flags +from absl import app + +FLAGS = flags.FLAGS + +flags.DEFINE_string('algo', 'ga', 'Which Algorithm to run') +flags.DEFINE_string('workload', 'resnet18', 'Which workload to run') +flags.DEFINE_string('summary_dir', '', 'Directory to store the summary') +flags.DEFINE_integer('num_iter', 10, 'Number of iterations') +flags.DEFINE_string('reward_formulation', 'energy', 'Reward formulation to use') + + +# BO +flags.DEFINE_integer('rand_state', 0, 'Random state') + +# GAa +flags.DEFINE_integer('num_agents', 10, 'Number of agents') +flags.DEFINE_float('prob_mutation', 0.1, 'Probability of mutation.') + +# ACO +flags.DEFINE_integer('ant_count', 2, 'Number of ants') +flags.DEFINE_float('evaporation', 0.25, 'Evaporation rate') +flags.DEFINE_float('greediness', 0.25, 'Greedy rate') + +def update_aco_agent_configs(agent_config, aco_hyperparams): + print("Agent Config File", agent_config) + print("Agent Hyperparams", aco_hyperparams) + + # read the yaml file + with open(agent_config, "r") as stream: + try: + data = yaml.safe_load(stream) + except yaml.YAMLError as exc: + print(exc) + + data['DeepSwarm']['max_depth'] = aco_hyperparams["depth"] + data['DeepSwarm']['aco']['ant_count'] = aco_hyperparams["ant_count"] + data['DeepSwarm']['aco']['greediness'] = aco_hyperparams["greediness"] + data['DeepSwarm']['aco']['pheromone']['evaporation'] = aco_hyperparams["evaporation"] + + # write back the yaml data to agent_config file + with open(agent_config, "w") as stream: + yaml.dump(data, stream, default_flow_style=True) + +def run_task(task): + + if ("algo" in task.keys()): + if (task["algo"] in ["ga", "bo", "aco", "rw", "rl"]): + if (task["algo"] == "aco"): + algo = 
"aco" + elif (task["algo"] == "bo"): + algo = "bo" + elif (task["algo"] == "ga"): + algo = "ga" + elif (task["algo"] == "rl"): + algo = "rl" + elif(task["algo"] == "rw"): + algo = "rw" + else: + print("This algorithm is not supported.") + exit(0) + else: + print("Need to provide an algorithm.") + exit(0) + + workload = task['workload'] + if(algo == "ga"): + prob_mut = task["prob_mut"] + num_agents = task["num_agents"] + num_iter = task["num_iter"] + summary_dir = task["summary_dir"] + reward_formulation = task["reward_formulation"] + unqiue_ids = [algo, workload, str(prob_mut), str(num_agents)] + elif (algo == "rw"): + num_iter = task["num_iter"] + summary_dir = task["summary_dir"] + reward_formulation = task["reward_formulation"] + unqiue_ids = [algo, workload] + elif (algo == "bo"): + num_iter = task["num_iter"] + rand_state = task["rand_state"] + summary_dir = task["summary_dir"] + reward_formulation = task["reward_formulation"] + unqiue_ids = [algo, workload, str(rand_state)] + elif (algo == "aco"): + num_iter = task["num_iter"] + ant_count = task["ant_count"] + evaporation = task["evaporation"] + greediness = task["greediness"] + summary_dir = task["summary_dir"] + reward_formulation = task["reward_formulation"] + depth = task["num_iter"] + unqiue_ids = [algo, workload, str(ant_count), str(evaporation), str(greediness)] + else: + raise NotImplementedError + + if algo == "ga": + print("train_ga_DRAMSys") + cmd = "python train_ga_maestro.py " + \ + "--workload=" + str(workload) + " " \ + "--num_steps=" + str(num_iter) + " " \ + "--prob_mutation=" + str(prob_mut) + " "\ + "--num_agents=" + str(num_agents) + " "\ + "--summary_dir=" + str(summary_dir) + " "\ + "--reward_formulation=" + str(reward_formulation) + print("Shell Command", cmd) + + elif algo == "rw": + print("train_randomwalker_maestro") + cmd = "python train_randomwalker_maestro.py " + \ + "--workload=" + str(workload) + " " \ + "--num_steps=" + str(num_iter) + " " \ + "--summary_dir=" + str(summary_dir) + " "\ + "--reward_formulation=" + str(reward_formulation) + print("Shell Command", cmd) + elif algo == "bo": + print("train_bo_maestro") + cmd = "python train_bo_maestro.py " + \ + "--workload=" + str(workload) + " " \ + "--num_iter=" + str(num_iter) + " " \ + "--random_state=" + str(rand_state) + " " \ + "--summary_dir=" + str(summary_dir) + " "\ + "--reward_formulation=" + str(reward_formulation) + print("Shell Command", cmd) + elif algo == "aco": + aco_agent_config_file = os.path.join( + arch_gym_configs.proj_root_path, + "settings", + arch_gym_configs.aco_config) + aco_hyperparams = {"evaporation": evaporation, + "ant_count": ant_count, + "greediness": greediness, + "depth": depth} + update_aco_agent_configs(aco_agent_config_file, aco_hyperparams) + + print("train_aco_maestro") + cmd = "python train_aco_maestro.py " + \ + "--workload=" + str(workload) + " " \ + "--depth=" + str(num_iter) + " " \ + "--ant_count=" + str(ant_count) + " " \ + "--evaporation=" + str(evaporation) + " " \ + "--greediness=" + str(greediness) + " " \ + "--summary_dir=" + str(summary_dir) + " "\ + "--reward_formulation=" + str(reward_formulation) + print("Shell Command", cmd) + else: + raise NotImplementedError + + # run the command + os.system(cmd) + + + +def main(_): + taskList = [] + + if FLAGS.algo == "ga": + task = {"algo": FLAGS.algo, + "workload": FLAGS.workload, + "num_agents": FLAGS.num_agents, + "num_iter": FLAGS.num_iter, + "prob_mut": FLAGS.prob_mutation, + 'summary_dir': FLAGS.summary_dir, + 'reward_formulation': 
+                'reward_formulation': FLAGS.reward_formulation}
+        taskList.append(task)
+    elif FLAGS.algo == "rw":
+        task = {"algo": FLAGS.algo,
+                "workload": FLAGS.workload,
+                "num_iter": FLAGS.num_iter,
+                'summary_dir': FLAGS.summary_dir,
+                'reward_formulation': FLAGS.reward_formulation}
+        taskList.append(task)
+    elif FLAGS.algo == "bo":
+        task = {"algo": FLAGS.algo,
+                "workload": FLAGS.workload,
+                "num_iter": FLAGS.num_iter,
+                "rand_state": FLAGS.rand_state,
+                'summary_dir': FLAGS.summary_dir,
+                'reward_formulation': FLAGS.reward_formulation}
+        taskList.append(task)
+    elif FLAGS.algo == "aco":
+        task = {"algo": FLAGS.algo,
+                "workload": FLAGS.workload,
+                "num_iter": FLAGS.num_iter,
+                "ant_count": FLAGS.ant_count,
+                "evaporation": FLAGS.evaporation,
+                "greediness": FLAGS.greediness,
+                'summary_dir': FLAGS.summary_dir,
+                'reward_formulation': FLAGS.reward_formulation}
+        taskList.append(task)
+
+    else:
+        raise NotImplementedError
+
+
+    for each_task in taskList:
+        # launch the selected AstraSim trainer for this task
+        run_task(each_task)
+
+
+if __name__ == '__main__':
+    app.run(main)
\ No newline at end of file
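
For reference, a typical invocation of the launcher added above, using only
the flags it defines (values are illustrative; most match the flag defaults):

    python launch_gcp.py --algo=ga --workload=resnet18 --num_iter=10 \
        --prob_mutation=0.1 --num_agents=10 --summary_dir=./logs \
        --reward_formulation=energy

which run_task expands into roughly:

    python train_ga_maestro.py --workload=resnet18 --num_steps=10 \
        --prob_mutation=0.1 --num_agents=10 --summary_dir=./logs \
        --reward_formulation=energy

update_aco_agent_configs assumes the DeepSwarm YAML named by
arch_gym_configs.aco_config has roughly this shape (a sketch inferred from
the keys it rewrites; the values shown are the flag defaults, not the
shipped settings file):

    DeepSwarm:
      max_depth: 10
      aco:
        ant_count: 2
        greediness: 0.25
        pheromone:
          evaporation: 0.25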
From b0577060954c187054946f96cdac74472a54a13d Mon Sep 17 00:00:00 2001
From: Jared Ni
Date: Tue, 7 Nov 2023 22:25:00 -0500
Subject: [PATCH 10/12] workspace docker

---
 sims/AstraSim/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sims/AstraSim/Dockerfile b/sims/AstraSim/Dockerfile
index bb728e75..d9167bca 100644
--- a/sims/AstraSim/Dockerfile
+++ b/sims/AstraSim/Dockerfile
@@ -25,7 +25,7 @@ RUN cd arch-gym/acme && pip install .[jax,tf,testing,envs] && pip install envlog
 
 RUN chown -R 1000:root /workdir && chmod -R 775 /workdir
 
-WORKDIR /workdir/arch-gym/sims/gamma/
+WORKDIR /workdir/arch-gym/sims/AstraSim/
 
 
 # Install cost model

From aef2a490725f3fe4d634734c25598c5cd291d3f7 Mon Sep 17 00:00:00 2001
From: AditiR_42
Date: Tue, 7 Nov 2023 23:02:14 -0500
Subject: [PATCH 11/12] update random walker

---
 sims/AstraSim/trainRandomWalkerAstraSim.py | 226 +++++----------------
 1 file changed, 50 insertions(+), 176 deletions(-)

diff --git a/sims/AstraSim/trainRandomWalkerAstraSim.py b/sims/AstraSim/trainRandomWalkerAstraSim.py
index ed5bc9e5..ec82ca51 100644
--- a/sims/AstraSim/trainRandomWalkerAstraSim.py
+++ b/sims/AstraSim/trainRandomWalkerAstraSim.py
@@ -1,3 +1,12 @@
+import json
+import time
+import random
+import pandas as pd
+import numpy as np
+import envlogger
+from arch_gym.envs import AstraSimWrapper, AstraSimEnv
+from arch_gym.envs.envHelpers import helpers
+from configs import arch_gym_configs
 import os
 import sys
 import pickle
@@ -9,161 +18,22 @@
 os.sys.path.insert(0, os.path.abspath('../../'))
 os.sys.path.insert(0, os.path.abspath('../../arch_gym'))
 
-from configs import arch_gym_configs
-
-from arch_gym.envs.envHelpers import helpers
-from arch_gym.envs import AstraSimWrapper, AstraSimEnv
-import envlogger
-import numpy as np
-import pandas as pd
-import random
-import time
-import json
 
 # define workload in run_general.sh
 flags.DEFINE_string('workload', 'resnet18', 'Which AstraSim workload to run?')
 flags.DEFINE_integer('num_steps', 50, 'Number of training steps.')
 flags.DEFINE_integer('num_episodes', 1, 'Number of training episodes.')
-flags.DEFINE_bool('use_envlogger', True, 'Use envlogger to log the data.')
-flags.DEFINE_string('traject_dir',
-                    'random_walker_trajectories',
-                    'Directory to save the dataset.')
+flags.DEFINE_bool('use_envlogger', True, 'Use envlogger to log the data.')
+flags.DEFINE_string('traject_dir',
+                    'random_walker_trajectories',
+                    'Directory to save the dataset.')
 flags.DEFINE_string('summary_dir', ".", 'Directory to save the dataset.')
-flags.DEFINE_string('reward_formulation', 'latency', 'Which reward formulation to use?')
+flags.DEFINE_string('reward_formulation', 'latency',
+                    'Which reward formulation to use?')
 
 FLAGS = flags.FLAGS
 
-def write_network(dimension):
-    def rand_dim_helper(dim, vals):
-        return [random.choice(vals) for _ in range(dim)]
-
-    links_count = {"Ring": 2, "FullyConnected": 7, "Switch": 1}
-
-    def rand_num_helper(dim, min, max):
-        return [random.randint(min, max) for _ in range(dim)]
-
-    def rand_float_helper(dim, min, max):
-        return [round(random.uniform(min, max), 1) for _ in range(dim)]
-
-    network = {
-        "topology-name": random.choice(["Hierarchical"]),
-        "topologies-per-dim": rand_dim_helper(dimension, ["Ring", "FullyConnected", "Switch"]),
-        # NEED TO CHECK HOW RANDOM DIM TYPE CAN BE
-        # "dimension-type": rand_dim_helper(dimension, ["T", "N", "P"]),
-        "dimension-type": rand_dim_helper(dimension, ["N"]),
-        # "dimensions-count": (int, 1, 5, 2),
-        "dimensions-count": dimension,
-        "units-count": rand_num_helper(dimension, 2, 1024),
-        "links-count": rand_num_helper(dimension, 1, 10),
-        "link-latency": rand_num_helper(dimension, 1, 1000),
-        "link-bandwidth": rand_float_helper(dimension, 0.00001, 100000),
-        # SHOULD THIS BE ONLY ZEROS?
-        "nic-latency": rand_num_helper(dimension, 0, 1000),
-        "router-latency": rand_num_helper(dimension, 0, 1000),
-        "hbm-latency": rand_num_helper(dimension, 1, 1),
-        "hbm-bandwidth": rand_num_helper(dimension, 1, 1),
-        "hbm-scale": rand_num_helper(dimension, 0, 0),
-    }
-
-    return network
-
-
-def write_system(dimension):
-    def implementation_helper(dim, val):
-        if val in ["oneRing", "oneDirect"]:
-            return val
-        else:
-            value = ""
-            for _ in range(dim):
-                value += val + "_"
-            return value[:-1]
-
-    system = {
-        "scheduling-policy": random.choice(["LIFO", "FIFO"]),
-        "endpoint-delay": random.randint(1, 1000),
-        "active-chunks-per-dimension": random.randint(1, 32),
-        # whenever dataset splits is high, it takes a long time to run
-        "preferred-dataset-splits": random.randint(16, 1024),
-        "boost-mode": 1,
-        "all-reduce-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect", "hierarchicalRing", "halvingDoubling", "oneHalvingDoubling"])),
-        "all-gather-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect", "hierarchicalRing", "halvingDoubling", "oneHalvingDoubling"])),
-        "reduce-scatter-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect", "hierarchicalRing", "halvingDoubling", "oneHalvingDoubling"])),
-        "all-to-all-implementation": implementation_helper(dimension, random.choice(["ring", "direct", "doubleBinaryTree", "oneRing", "oneDirect", "hierarchicalRing", "halvingDoubling", "oneHalvingDoubling"])),
-
-        "collective-optimization": random.choice(["baseline", "localBWAware"]),
-        "intra-dimension-scheduling": random.choice(["FIFO", "SCF"]),
-        "inter-dimension-scheduling": random.choice(["baseline", "themis"])
-    }
-    return system
-
-
-def write_workload():
-    value = ""
-    # randomize workload type
-    workload_type = random.choice(["DATA\n", "HYBRID_TRANSFORMER\n", "HYBRID_DLRM\n", "MICRO\n"])
-    # randomize number of DNN layers
-    layers_count = random.randint(1, 50)
-    if workload_type == "MICRO\n":
-        layers_count = 1
-    value += workload_type
-
-    value += str(layers_count) + "\n"
-    # configure each layer
-    for i in range(layers_count):
-        # layer name and reserved variable
-        value += "layer" + str(i) + "\t-1\t"
-        # forward pass compute time
-        forward_time = str(random.randint(1, 42000000)) + "\t"
-        # forward_time = str(random.randint(1, 4200)) + "\t"
-        if workload_type == "MICRO\n":
-            forward_time = "5\t"
-        value += forward_time
-
-        # forward pass communication type
-        forward_type = random.choice(["ALLREDUCE", "ALLGATHER", "ALLTOALL", "NONE"]) + "\t"
-        if workload_type == "MICRO\n":
-            forward_type = "NONE\t"
-        value += forward_type
-        # forward pass communication size
-        forward_size = "0\t" if forward_type == "NONE\t" else str(random.randint(0, 70000000)) + "\t"
-        value += forward_size
-
-        # input grad compute time
-        grad_time = str(random.randint(1, 42000000)) + " "
-
-        if workload_type == "MICRO\n":
-            grad_time = "5\t"
-        value += grad_time
-        # input grad communication type
-        grad_type = random.choice(["ALLREDUCE", "ALLGATHER", "ALLTOALL", "NONE"]) + "\t"
-        if workload_type == "MICRO\n":
-            grad_type = "NONE\t"
-        value += grad_type
-        # input grad communication size
-        grad_size = "0\t" if grad_type == "NONE\t" else str(random.randint(0, 70000000)) + "\t"
-        value += grad_size
-
-        # weight grad compute time
-        weight_time = str(random.randint(1, 42000000)) + "\t"
-        # weight_time = str(random.randint(1, 4200)) + "\t"
-        if workload_type == "MICRO\n":
-            weight_time = "5\t"
-        value += weight_time
-        # weight grad communication type
-        weight_type = random.choice(["ALLREDUCE", "ALLGATHER", "ALLTOALL", "NONE"]) + "\t"
-        value += weight_type
-        # weight grad communication size
-        weight_size = "0\t" if weight_type == "NONE\t" else str(random.randint(0, 70000000)) + "\t"
-
-        value += weight_size
-        # delay per entire weight/input/output update after the collective is finished
-        value += str(random.randint(5, 5000)) + "\n"
-        # value += str(random.randint(5, 50)) + "\n"
-
-    return {"value": value}
-
-
 # parses the network file
 # def parse_network(network_file):
 #     with open(network_file) as f:
@@ -187,6 +57,8 @@ def write_workload():
 intra-dimension-scheduling: FIFO
 inter-dimension-scheduling: baseline
 """
+
+
 def parse_system(system_file, action_dict):
     # parse system_file (above is the content) into dict
     action_dict['system'] = {}
@@ -199,8 +71,7 @@ def parse_system(system_file, action_dict):
 
 
 # def parse_workload(workload_file):
-
-
+
 # parses knobs that we want to experiment with
 def parse_knobs(knobs_spec):
@@ -217,9 +88,8 @@
     # Access the dictionaries
     SYSTEM_KNOBS = parsed_dicts['SYSTEM_KNOBS']
     NETWORK_KNOBS = parsed_dicts['NETWORK_KNOBS']
-
+
     return SYSTEM_KNOBS, NETWORK_KNOBS
-
 
 # action_type = specify 'network' or 'system'
@@ -229,25 +99,27 @@ def generate_random_actions(action_dict, system_knob, network_knob):
     for dict_type, dict_name in dicts:
         for knob in dict_type.keys():
             if isinstance(dict_type[knob], set):
-                action_dict[dict_name][knob] = random.choice(list(dict_type[knob]))
+                action_dict[dict_name][knob] = random.choice(
+                    list(dict_type[knob]))
             else:
-                action_dict[dict_name][knob] = random.randint(dict_type[knob][1], dict_type[knob][2])
-
+                action_dict[dict_name][knob] = random.randint(
+                    dict_type[knob][1], dict_type[knob][2])
+
     return action_dict
 
 
 def log_results_to_csv(filename, fitness_dict):
-    df = pd.DataFrame([fitness_dict['reward']])
-    csvfile = os.path.join(filename, "rewards.csv")
-    df.to_csv(csvfile, index=False, header=False, mode='a')
+    df = pd.DataFrame([fitness_dict['reward']])
+    csvfile = os.path.join(filename, "rewards.csv")
+    df.to_csv(csvfile, index=False, header=False, mode='a')
-    df = pd.DataFrame([fitness_dict['action']])
-    csvfile = os.path.join(filename, "actions.csv")
-    df.to_csv(csvfile, index=False, header=False, mode='a')
+    df = pd.DataFrame([fitness_dict['action']])
+    csvfile = os.path.join(filename, "actions.csv")
+    df.to_csv(csvfile, index=False, header=False, mode='a')
 
-    df = pd.DataFrame([fitness_dict['obs']])
-    csvfile = os.path.join(filename, "observations.csv")
-    df.to_csv(csvfile, index=False, header=False, mode='a')
+    df = pd.DataFrame([fitness_dict['obs']])
+    csvfile = os.path.join(filename, "observations.csv")
+    df.to_csv(csvfile, index=False, header=False, mode='a')
 
 
 # Random walker then random walker, else use other
@@ -285,27 +157,29 @@ def main(_):
 
     # DEFINE NETWORK AND SYSTEM AND WORKLOAD
     network_file = "4d_ring_fc_ring_switch.json"
-    system_file = os.path.join(systems_folder, "4d_ring_fc_ring_switch_baseline.txt")
+    system_file = os.path.join(
+        systems_folder, "4d_ring_fc_ring_switch_baseline.txt")
     workload_file = "all_reduce/allreduce_0.65.txt"
 
-    exe_path = os.path.join(proj_root_path, "run_general.sh")
     network_config = os.path.join(proj_root_path, "general_network.json")
     system_config = os.path.join(proj_root_path, "general_system.txt")
     workload_config = os.path.join(proj_root_path, "general_workload.txt")
-
     env = AstraSimWrapper.make_astraSim_env(rl_form='random_walker')
     # env = AstraSimEnv.AstraSimEnv(rl_form='random_walker')
 
     astrasim_helper = helpers()
 
-    # experiment name
-    exp_name = str(FLAGS.workload)+"_num_steps_" + str(FLAGS.num_steps) + "_num_episodes_" + str(FLAGS.num_episodes)
+    # experiment name
+    exp_name = str(FLAGS.workload)+"_num_steps_" + \
+        str(FLAGS.num_steps) + "_num_episodes_" + str(FLAGS.num_episodes)
 
     # append logs to base path
-    log_path = os.path.join(FLAGS.summary_dir, 'random_walker_logs', FLAGS.reward_formulation, exp_name)
+    log_path = os.path.join(
+        FLAGS.summary_dir, 'random_walker_logs', FLAGS.reward_formulation, exp_name)
 
     # get the current working directory and append the exp name
-    traject_dir = os.path.join(FLAGS.summary_dir, FLAGS.traject_dir, FLAGS.reward_formulation, exp_name)
+    traject_dir = os.path.join(
+        FLAGS.summary_dir, FLAGS.traject_dir, FLAGS.reward_formulation, exp_name)
 
     # check if log_path exists else create it
     if not os.path.exists(log_path):
        os.makedirs(log_path)
@@ -321,15 +195,15 @@ def main(_):
 
     start = time.time()
 
     step_results = {}
-
+
     # INITIATE action dict
     action_dict = {}
 
     # if path exists, use path, else parse the sub-dict
     action_dict['network'] = {"path": network_file}
     action_dict['workload'] = {"path": workload_file}
-
-    # TODO: parse system
+
+    # TODO: parse system
     parse_system(system_file, action_dict)
 
     # TODO: parse knobs (all variables to change in action_dict)
     system_knob, network_knob = parse_knobs(knobs_spec)
 
     # every step of the current training
     for step in range(FLAGS.num_steps):
         # pass into generate_random_actions(dimension, knobs)
-        action_dict = generate_random_actions(action_dict, system_knob, network_knob)
+        action_dict = generate_random_actions(
+            action_dict, system_knob, network_knob)
 
         # with open("general_workload.txt", 'w') as file:
         #     file.write(action["workload"]["value"])
 
         # step_result wrapped in TimeStep object
         step_result = env.step(action_dict)
         step_type, reward, discount, observation = step_result
-
+
         step_results['reward'] = [reward]
         step_results['action'] = action_dict
         step_results['obs'] = observation
 
             best_reward = reward
             best_observation = observation
             best_actions = action_dict
-
+
         log_results_to_csv(log_path, step_results)
 
     end = time.time()
@@ -372,6 +247,5 @@
     print("Total Useful Steps: ", env.useful_counter)
 
-
 if __name__ == '__main__':
-    app.run(main)
+    app.run(main)

From 6924d38ae3c51d0a296ad0b637f2016345dca29a Mon Sep 17 00:00:00 2001
From: AditiR_42
Date: Tue, 7 Nov 2023 23:07:54 -0500
Subject: [PATCH 12/12] update random walk

---
 sims/AstraSim/trainRandomWalkerAstraSim.py | 43 ++++++----------------
 1 file changed, 11 insertions(+), 32 deletions(-)

diff --git a/sims/AstraSim/trainRandomWalkerAstraSim.py b/sims/AstraSim/trainRandomWalkerAstraSim.py
index ec82ca51..69ef5451 100644
--- a/sims/AstraSim/trainRandomWalkerAstraSim.py
+++ b/sims/AstraSim/trainRandomWalkerAstraSim.py
@@ -34,31 +34,17 @@
 
 FLAGS = flags.FLAGS
 
-# parses the network file
-# def parse_network(network_file):
-#     with open(network_file) as f:
-#         network = json.load(f)
-#         return network
+# network: parses the network file
+def parse_network(network_file, action_dict):
+    with open(network_file) as f:
+        network = json.load(f)
-
+    for key in network.keys():
+        action_dict['network'][key] = network[key]
 
+    return network
 
+# systems: parse from file into json into generate_random_actions
 def parse_system(system_file, action_dict):
     # parse system_file (above is the content) into dict
     action_dict['system'] = {}
@@ -70,9 +56,6 @@ def parse_system(system_file, action_dict):
         action_dict['system'][key] = value
 
 
-# def parse_workload(workload_file):
-
-
 # parses knobs that we want to experiment with
 def parse_knobs(knobs_spec):
@@ -156,7 +139,7 @@ def main(_):
     workloads_folder = os.path.join(astrasim_archgym, "themis/inputs/workload")
 
     # DEFINE NETWORK AND SYSTEM AND WORKLOAD
-    network_file = "4d_ring_fc_ring_switch.json"
+    network_file = os.path.join(networks_folder, "4d_ring_fc_ring_switch.json")
     system_file = os.path.join(
         systems_folder, "4d_ring_fc_ring_switch_baseline.txt")
     workload_file = "all_reduce/allreduce_0.65.txt"
@@ -188,10 +171,6 @@ def main(_):
         os.makedirs(traject_dir)
     env = wrap_in_envlogger(env, traject_dir)
 
-    # get the dimension of the network
-    # the dimension is now defined in the template
-    dimension = random.randint(2, 3)
-
     start = time.time()
 
     step_results = {}
@@ -200,11 +179,11 @@ def main(_):
     action_dict = {}
 
     # if path exists, use path, else parse the sub-dict
-    action_dict['network'] = {"path": network_file}
     action_dict['workload'] = {"path": workload_file}
 
-    # TODO: parse system
+    # TODO: parse system and network
     parse_system(system_file, action_dict)
+    parse_network(network_file, action_dict)
 
     # TODO: parse knobs (all variables to change in action_dict)
     system_knob, network_knob = parse_knobs(knobs_spec)
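
For reference, parse_knobs above expects the knobs file to define two plain
Python dicts, SYSTEM_KNOBS and NETWORK_KNOBS. A minimal sketch consistent
with generate_random_actions, where a set value means "choose one member at
random" and an indexable (type, min, max) value means "randint over
[min, max]"; the knob names below are illustrative, drawn from the system
and network configs used in this file:

    SYSTEM_KNOBS = {
        'scheduling-policy': {'LIFO', 'FIFO'},
        'active-chunks-per-dimension': ('int', 1, 32),
    }
    NETWORK_KNOBS = {
        'links-count': ('int', 1, 10),
    }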