
Add Bayesian optimization #29

Closed
wants to merge 2 commits into from
134 changes: 86 additions & 48 deletions arch_gym/envs/AstraSimEnv.py
@@ -8,6 +8,8 @@
import csv
import random

from envHelpers import helpers

settings_file_path = os.path.realpath(__file__)
settings_dir_path = os.path.dirname(settings_file_path)
proj_root_path = os.path.join(settings_dir_path, '..', '..')
@@ -16,18 +18,25 @@

# astra-sim environment
class AstraSimEnv(gym.Env):
def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_formulation="None", reward_scaling=1):
# action space = set of all possible actions. Space.sample() returns a random action
self.action_space = gym.spaces.Discrete(16)
# observation space = set of all possible observations
self.observation_space = gym.spaces.Discrete(1)
def __init__(self, rl_form="sa1", max_steps=5, num_agents=1, reward_formulation="None", reward_scaling=1,):
self.rl_form = rl_form

if self.rl_form == 'sa1':
# action space = set of all possible actions. Space.sample() returns a random action
# observation space = set of all possible observations
self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32) # Box is a continuous array-valued space with the given shape
self.action_space = gym.spaces.Box(low=0, high=1, shape=(4,), dtype=np.float32)
self.helpers = helpers()

else:
self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
self.action_space = gym.spaces.Discrete(16)


# set parameters
self.max_steps = max_steps
self.counter = 0
self.useful_counter = 0

self.rl_form = rl_form
self.num_agents = num_agents
self.reward_formulation = reward_formulation
self.reward_scaling = reward_scaling
@@ -49,9 +58,15 @@ def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_fo
self.networks_folder = os.path.join(sim_path, "astrasim-archgym/dse/archgen_v1_knobs/templates/network")
self.workloads_folder = os.path.join(sim_path, "astrasim-archgym/themis/inputs/workload")

# Config does not matter
self.network_config = os.path.join(self.networks_folder, "3d_fc_ring_switch.json")
self.workload_config = os.path.join(sim_path, "realworld_workloads/transformer_1t_fused_only_t.txt")
self.workload_config = os.path.join(self.workloads_folder, "all_reduce/allreduce_0.65.txt")
self.astrasim_archgym = os.path.join(sim_path, "astrasim-archgym")
self.systems_folder = os.path.join(self.astrasim_archgym, "themis/inputs/system")

self.network_file = "4d_ring_fc_ring_switch.json"
self.system_file = os.path.join(self.systems_folder, "4d_ring_fc_ring_switch_baseline.txt")
self.workload_file = "all_reduce/allreduce_0.65.txt"

print("_____________________*****************************_____________________")

@@ -60,6 +75,7 @@ def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_fo
# reset function

def reset(self):

self.counter = 0
# get results folder path
results_folder_path = os.path.join(sim_path, "results", "run_general")
@@ -72,7 +88,13 @@ def reset(self):
csv_files = os.path.join(results_folder_path, csv_files)
if os.path.exists(csv_files):
os.remove(csv_files)
return

# TODO:
obs = np.zeros(self.observation_space.shape)

return obs



# parses a result csv file and stores it in a dictionary
def parse_result(self, file_name):
@@ -113,13 +135,45 @@ def calculate_reward(self, observations):
print(sum)
return 1 / (sum ** 0.5)
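
For intuition, a minimal sketch of the reward shape returned above, assuming `sum` aggregates the observation values (its accumulation is outside this hunk): the reward is the reciprocal square root, so a larger communication time yields a smaller reward.

    comms_time = 120.0                  # hypothetical CommsTime observation, in microseconds
    reward = 1 / (comms_time ** 0.5)    # ~0.091; shrinks as the communication time grows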


# parse system_file (above is the content) into dict
def parse_system(self, system_file, action_dict):
action_dict['system'] = {}
with open(system_file, 'r') as file:
lines = file.readlines()

for line in lines:
key, value = line.strip().split(': ')
action_dict['system'][key] = value
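
As a rough illustration of what parse_system produces (the keys and values below are assumptions based on the mappers in envHelpers, not the contents of the real baseline file), a system file of "key: value" lines becomes a nested dict under the 'system' key:

    # hypothetical system file, one "key: value" pair per line:
    #   scheduling-policy: FIFO
    #   inter-dimension-scheduling: baseline
    action_dict = {}
    env.parse_system(env.system_file, action_dict)
    # action_dict == {'system': {'scheduling-policy': 'FIFO',
    #                            'inter-dimension-scheduling': 'baseline'}}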

# give it one action: one set of parameters from json file
def step(self, action_dict):

# write the three config files
# with open(self.network_config, "w") as outfile:
# outfile.write(json.dumps(action_dict['network'], indent=4))
print(action_dict)
if not isinstance(action_dict, dict):
with open(settings_dir_path + "/AstraSimRL_2.csv", 'a') as f:
writer = csv.writer(f)
writer.writerow(action_dict)

print("STEP: action_dict is a list")
action_dict_decoded = {}
action_dict_decoded['network'] = {"path": self.network_file}
action_dict_decoded['workload'] = {"path": self.workload_file}

# parse system: initial values
self.parse_system(self.system_file, action_dict_decoded)

# decode the encoded action values into a nested parameter dict
action_decoded = self.helpers.action_decoder_ga_astraSim(action_dict)

# change all variables decoded into action_dict
for sect in action_decoded:
for key in action_decoded[sect]:
action_dict_decoded[sect][key] = action_decoded[sect][key]

action_dict = action_dict_decoded



if "path" in action_dict["network"]:
self.network_config = action_dict["network"]["path"]

@@ -135,12 +189,7 @@ def step(self, action_dict):

# the action is actually the parsed parameter files
print("Step: " + str(self.counter))
if (self.counter == self.max_steps):
self.done = True
print("Maximum steps reached")
self.reset()
else:
self.counter += 1
self.counter += 1

# start subprocess to run the simulation
# $1: network, $2: system, $3: workload
@@ -174,57 +223,46 @@ def step(self, action_dict):
sample_all_reduce_dimension_utilization = self.parse_result(sim_path +
'/results/run_general/sample_all_reduce_dimension_utilization.csv')

if (self.counter == self.max_steps):
self.done = True
print("Maximum steps reached")
self.reset()


# test if the csv files exist (if they don't, the config files are invalid)
if ((len(backend_dim_info) == 0 or len(backend_end_to_end) == 0 or
len(detailed) == 0 or len(end_to_end) == 0 or
len(sample_all_reduce_dimension_utilization) == 0)):
# set reward to be extremely negative
reward = float("-inf")
print("reward: ", reward)
return [[], reward, self.done, {"useful_counter": self.useful_counter}, self.state]
return [], reward, self.done, {"useful_counter": self.useful_counter}, self.state
else:
# only recording the first line because apparently they are all the same? TODO
self.observations = [
backend_end_to_end["CommsTime"][0],
observations = [
float(backend_end_to_end["CommsTime"][0])
# end_to_end["fwd compute"][0],
# end_to_end["wg compute"][0],
# end_to_end["ig compute"][0],
# end_to_end["total exposed comm"][0]
]
reward = self.calculate_reward(self.observations)
print("reward: ", reward)
print("observations: ", self.observations)


reward = self.calculate_reward(observations)

print("reward: ", reward)

# reshape observations with shape of observation space
observations = np.reshape(observations, self.observation_space.shape)
self.useful_counter += 1

return [self.observations, reward, self.done, {"useful_counter": self.useful_counter}, self.state]
return observations, reward, self.done, {"useful_counter": self.useful_counter}, self.state


if __name__ == "__main__":
print("Testing AstraSimEnv")
env = AstraSimEnv(rl_form='random_walker',
env = AstraSimEnv(rl_form='sa1',
max_steps=10,
num_agents=1,
reward_formulation='reward_formulation_1',
reward_scaling=1)






"""
Every time reset happens:
- zero out the observation

3/24:
Communication Time (unit: microseconds)
Time breakdowns (forward pass, weight gradient, input gradient)
Exposed communication


3/31:
Catch errors by giving it high negative reward. This way we can test the range.


"""
20 changes: 11 additions & 9 deletions arch_gym/envs/AstraSimWrapper.py
@@ -13,7 +13,7 @@
# limitations under the License.

"""Wraps an OpenAI Gym environment to be used as a dm_env environment."""
import sys, os
import sys
from typing import Any, Dict, List, Optional

from acme import specs
@@ -25,8 +25,8 @@
import numpy as np
import tree

os.sys.path.insert(0, os.path.abspath('../../'))
from arch_gym.envs.AstraSimEnv import AstraSimEnv
from AstraSimEnv import AstraSimEnv
from envHelpers import helpers

# dm = deepmind
class AstraSimEnvWrapper(dm_env.Environment):
Expand All @@ -41,6 +41,7 @@ def __init__(self, environment: gym.Env,
self._environment = environment
self._reset_next_step = True
self._last_info = None
self.helper = helpers()
self.env_wrapper_sel = env_wrapper_sel

# set useful counter
@@ -182,12 +183,12 @@ def _convert_to_spec(space: gym.Space,
else:
raise ValueError('Unexpected gym space: {}'.format(space))

def make_astraSim_env(seed: int = 12345,
rl_form = 'macme',
def make_astraSim_env(seed: int = 12234,
rl_form = 'sa1',
reward_formulation = 'power',
reward_scaling = 'false',
max_steps: int = 100,
num_agents: int = 10) -> dm_env.Environment:
max_steps: int = 1,
num_agents: int = 1) -> dm_env.Environment:
"""Returns DRAMSys environment."""
print("[DEBUG][Seed]", seed)
print("[DEBUG][RL Form]", rl_form)
@@ -205,7 +206,8 @@ def make_astraSim_env(seed: int = 12345,
),
env_wrapper_sel = rl_form
)

environment = wrappers.SinglePrecisionWrapper(environment)
if(rl_form == 'sa' or rl_form == 'tdm'):
environment = wrappers.CanonicalSpecWrapper(environment, clip=True)
if(rl_form == 'sa1' or rl_form == 'tdm'):
environment = wrappers.CanonicalSpecWrapper(environment, clip=False)
return environment
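
A small sketch of constructing the wrapped environment, assuming the acme/dm_env stack is installed; generate_value() just produces a placeholder action that conforms to the environment's action spec.

    env = make_astraSim_env(rl_form='sa1', max_steps=1)
    timestep = env.reset()                        # dm_env TimeStep with a float32 observation
    action = env.action_spec().generate_value()   # placeholder action matching the spec
    timestep = env.step(action)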
8 changes: 4 additions & 4 deletions arch_gym/envs/envHelpers.py
@@ -797,10 +797,10 @@ def action_decoder_ga_astraSim(self, act_encoded):
interDimension_mapper = {0: "baseline", 1: "themis"}

# Modified system parameters
act_decoded["system"]["scheduling-policy"] = schedulePolicy_mapper[int(act_encoded[0])]
act_decoded["system"]["collective-optimization"] = collectiveOptimization_mapper[int(act_encoded[1])]
act_decoded["system"]["intra-dimension-scheduling"] = intraDimension_mapper[int(act_encoded[2])]
act_decoded["system"]["inter-dimension-scheduling"] = interDimension_mapper[int(act_encoded[3])]
act_decoded["system"]["scheduling-policy"] = schedulePolicy_mapper[int(round(act_encoded[0]))]
act_decoded["system"]["collective-optimization"] = collectiveOptimization_mapper[int(round(act_encoded[1]))]
act_decoded["system"]["intra-dimension-scheduling"] = intraDimension_mapper[int(round(act_encoded[2]))]
act_decoded["system"]["inter-dimension-scheduling"] = interDimension_mapper[int(round(act_encoded[3]))]

return act_decoded
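
The switch from int() to int(round()) matters because the sa1 action space is continuous in [0, 1], so plain truncation would map almost every value to index 0. A tiny illustration:

    act = 0.83          # e.g. one entry of the Box(4,) action vector
    int(act)            # -> 0: truncation always picks the first option
    int(round(act))     # -> 1: nearest choice, e.g. "themis" for inter-dimension-scheduling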

5 changes: 5 additions & 0 deletions sims/AstraSim/AstraSimRL.csv
@@ -0,0 +1,5 @@
0.17908713,0.44089648,0.83359694,0.2673431
0.5,0.5,0.5,0.5
0.0,0.0,0.0,1.0
0.0,0.0,0.0,1.0
0.17908713,0.44089648,0.83359694,0.2673431
1 change: 1 addition & 0 deletions sims/AstraSim/astrasim-archgym
Submodule astrasim-archgym added at 2ff6b7
Binary file added sims/AstraSim/bo_logs/metadata.riegeli
Binary file not shown.
7 changes: 7 additions & 0 deletions sims/AstraSim/exp_config.ini
@@ -0,0 +1,7 @@
[experiment_configuration]
exp_name = resnet18_random_state_2_num_iter_16
trajectory_dir = ./bo_trajectories/power/resnet18_random_state_2_num_iter_16
log_dir = ./bo_logs/power/resnet18_random_state_2_num_iter_16
reward_formulation = power
use_envlogger = True
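
For reference, the experiment configuration can be read back with Python's standard-library configparser; a sketch only, not necessarily how the training scripts consume it.

    import configparser
    cfg = configparser.ConfigParser()
    cfg.read("sims/AstraSim/exp_config.ini")
    exp = cfg["experiment_configuration"]
    print(exp["exp_name"], exp.getboolean("use_envlogger"))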

20 changes: 20 additions & 0 deletions sims/AstraSim/general_workload.txt
@@ -0,0 +1,20 @@
HYBRID_TRANSFORMER_FWD_IN_BCKWD model_parallel_NPU_group: 128 checkpoints: 2 0 9 checkpoint_initiates: 2 17 8
18
Q1 -1 2343750 NONE 0 2343750 ALLREDUCE 805306368 2343750 ALLREDUCE 240316416 10
K1 -1 2343750 NONE 0 2343750 NONE 0 2343750 NONE 0 10
V1 -1 2343750 NONE 0 2343750 NONE 0 2343750 NONE 0 10
QK1 -1 97656 NONE 0 97656 NONE 0 97656 NONE 0 10
softmax1 -1 97656 NONE 0 97656 NONE 0 97656 NONE 0 10
concat1 -1 2343750 ALLREDUCE 805306368 2343750 ALLGATHER 6291456 2343750 NONE 0 10
X1W1b1 -1 9375000 NONE 0 9375000 ALLREDUCE 805306368 9375000 NONE 0 10
X1W2b2 -1 9375000 ALLREDUCE 805306368 9375000 NONE 0 9375000 NONE 0 10
layerNorm1 -1 12207 NONE 0 12207 NONE 0 12207 NONE 0 10
Q2 -1 2343750 NONE 0 2343750 ALLREDUCE 805306368 2343750 NONE 0 10
K2 -1 2343750 NONE 0 2343750 NONE 0 2343750 NONE 0 10
V2 -1 2343750 NONE 0 2343750 NONE 0 2343750 NONE 0 10
QK2 -1 97656 NONE 0 97656 NONE 0 97656 NONE 0 10
softmax2 -1 97656 NONE 0 97656 NONE 0 97656 NONE 0 10
concat2 -1 2343750 ALLREDUCE 805306368 2343750 ALLGATHER 6291456 2343750 NONE 0 10
X2W1b1 -1 9375000 NONE 0 9375000 ALLREDUCE 805306368 9375000 NONE 0 10
X2W2b2 -1 9375000 ALLREDUCE 805306368 9375000 NONE 0 9375000 NONE 0 10
layerNorm2 -1 12207 NONE 0 12207 NONE 0 12207 NONE 0 10