Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Uploading installaiton.md #13

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion aco/DeepSwarm/deepswarm/backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,7 @@ def __init__(self, path, exp_name, traject_dir, log_dir, reward_formulation, use
# SET UP ACTION DICT
self.action_dict = {"network": {}, "workload": {}}
self.action_dict["network"]['path'] = "3d_fc_ring_switch.json"
self.action_dict["workload"]['path'] = "gnmt_fp16_fused.txt"
self.action_dict["workload"]['path'] = "all_reduce/allreduce_0.65.txt"

# PARSE SYSTEM FILE
self.parse_system(self.system_file, self.action_dict)
Expand Down
17 changes: 9 additions & 8 deletions arch_gym/envs/AstraSimEnv.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
class AstraSimEnv(gym.Env):
def __init__(self, rl_form="random_walker", max_steps=5, num_agents=1, reward_formulation="None", reward_scaling=1):
# action space = set of all possible actions. Space.sample() returns a random action
self.action_space = gym.spaces.Discrete(2)
self.action_space = gym.spaces.Discrete(16)
# observation space = set of all possible observations
self.observation_space = gym.spaces.Discrete(1)

Expand Down Expand Up @@ -107,9 +107,9 @@ def close(self):
def calculate_reward(self, observations):
print("Calculating reward...")
print(observations)
sum = 0
sum = 1.0
for obs in observations:
sum += ((float(obs[0]) - 1) ** 2)
sum += ((float(obs) - 1) ** 2)
print(sum)
return 1 / (sum ** 0.5)

Expand All @@ -119,6 +119,7 @@ def step(self, action_dict):
# write the three config files
# with open(self.network_config, "w") as outfile:
# outfile.write(json.dumps(action_dict['network'], indent=4))
print(action_dict)
if "path" in action_dict["network"]:
self.network_config = action_dict["network"]["path"]

Expand Down Expand Up @@ -178,17 +179,17 @@ def step(self, action_dict):
len(detailed) == 0 or len(end_to_end) == 0 or
len(sample_all_reduce_dimension_utilization) == 0)):
# set reward to be extremely negative
reward = -100000
reward = float("-inf")
print("reward: ", reward)
return [[], reward, self.done, {"useful_counter": self.useful_counter}, self.state]
else:
# only recording the first line because apparently they are all the same? TODO
self.observations = [
backend_end_to_end["CommsTime"][0],
end_to_end["fwd compute"][0],
end_to_end["wg compute"][0],
end_to_end["ig compute"][0],
end_to_end["total exposed comm"][0]
# end_to_end["fwd compute"][0],
# end_to_end["wg compute"][0],
# end_to_end["ig compute"][0],
# end_to_end["total exposed comm"][0]
]
reward = self.calculate_reward(self.observations)
print("reward: ", reward)
Expand Down
225 changes: 225 additions & 0 deletions bo/AstraSimEstimator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
from sklearn.base import BaseEstimator, ClassifierMixin
import os
os.sys.path.insert(0, os.path.abspath('/../arch_gym/envs/'))
os.sys.path.insert(0, os.path.abspath('/../'))
from configs import arch_gym_configs
import json
from arch_gym.envs.envHelpers import helpers
from arch_gym.envs.AstraSimEnv import AstraSimEnv
from arch_gym.envs.AstraSimWrapper import make_astraSim_env
import configparser
import envlogger
import sys
import numpy as np
import pandas as pd
import time

from absl import logging
from absl import flags


class AstraSimEstimator(BaseEstimator):

def __init__(self, scheduling_policy="FIFO", collective_optimization="baseline",
             intra_dimension_scheduling="FIFO", inter_dimension_scheduling="baseline",
             exp_name="test", traject_dir="traj"):
    """Store the tunable AstraSim knobs and the fixed experiment inputs.

    The four knobs are kept in ``self.action_dict``; ``get_params`` reads
    them back and ``set_params`` overwrites them.  ``fit`` copies them into
    the ``system`` section of the action it sends to the environment.

    Args:
        scheduling_policy: value later written to ``scheduling-policy``.
        collective_optimization: value later written to
            ``collective-optimization``.
        intra_dimension_scheduling: value later written to
            ``intra-dimension-scheduling``.
        inter_dimension_scheduling: value later written to
            ``inter-dimension-scheduling``.
        exp_name: experiment name; stored and printed.
        traject_dir: trajectory directory; stored and printed.
    """
    # TODO: implement further default parameters.
    self.env = AstraSimEnv()
    self.helper = helpers()

    # Resolve the AstraSim system-input folder relative to this file so the
    # estimator does not depend on the current working directory.
    this_dir = os.path.dirname(os.path.realpath(__file__))
    proj_root_path = os.path.join(this_dir, '..')
    astrasim_archgym = os.path.join(proj_root_path, "sims/AstraSim/astrasim-archgym")
    systems_folder = os.path.join(astrasim_archgym, "themis/inputs/system")

    # Fixed inputs.  The network and workload are passed on as relative
    # "path" entries by fit(); the system file is parsed by parse_system().
    self.network_file = "4d_ring_fc_ring_switch.json"
    self.system_file = os.path.join(systems_folder, "4d_ring_fc_ring_switch_baseline.txt")
    self.workload_file = "all_reduce/allreduce_0.20.txt"

    # Knobs explored by the optimizer (underscore names; fit() maps them to
    # the hyphenated keys of the system configuration).
    self.action_dict = {
        "scheduling_policy": scheduling_policy,
        "collective_optimization": collective_optimization,
        "intra_dimension_scheduling": intra_dimension_scheduling,
        "inter_dimension_scheduling": inter_dimension_scheduling,
    }

    self.exp_name = exp_name
    self.traject_dir = traject_dir
    self.fitness_hist = []
    self.exp_log_dir = os.path.join(os.getcwd(), "bo_logs")
    self.reward_formulation = 'power'

    print("[Experiment]: ", self.exp_name)
    print("[Trajectory Log path]: ", self.traject_dir)

    # Number of fit() calls so far; fit() uses it to log only once.
    self.bo_steps = 0


def parse_system(self, system_file, action_dict):
    """Parse a ``key: value`` system file into ``action_dict['system']``.

    Each non-blank line is split at its first ``': '``; everything after
    that separator (including any further ``': '``) is kept as the value.
    Values are stored as strings.  Blank lines are skipped.

    Args:
        system_file: path of the text file to read.
        action_dict: dict that receives a fresh ``'system'`` sub-dict
            (any existing ``'system'`` entry is replaced).

    Raises:
        ValueError: if a non-blank line contains no ``': '`` separator.
    """
    action_dict['system'] = {}
    with open(system_file, 'r') as file:
        # Iterate the file lazily instead of materialising every line.
        for lineno, raw_line in enumerate(file, start=1):
            entry = raw_line.strip()
            if not entry:
                # Tolerate blank / trailing empty lines.
                continue
            key, separator, value = entry.partition(': ')
            if not separator:
                raise ValueError(
                    f"{system_file}:{lineno}: expected 'key: value', got {entry!r}"
                )
            action_dict['system'][key] = value


def wrap_in_envlogger(self, env, envlogger_dir, use_envlogger):
    """Optionally wrap ``env`` in an ``envlogger.EnvLogger``.

    Args:
        env: environment to wrap.
        envlogger_dir: directory handed to the logger as ``data_directory``.
        use_envlogger: the string ``'True'`` enables wrapping; any other
            value returns ``env`` untouched.

    Returns:
        The wrapped environment, or ``env`` itself when logging is disabled.
    """
    # Guard clause: only the exact string 'True' turns logging on.
    if use_envlogger != 'True':
        print("Not using envlogger")
        return env

    logging.info('Wrapping environment with EnvironmentLogger...')
    logged_env = envlogger.EnvLogger(
        env,
        data_directory=envlogger_dir,
        max_episodes_per_file=1000,
        metadata={
            'agent_type': 'Bayesian Optimization',
            'env_type': type(env).__name__,
        },
    )
    logging.info('Done wrapping environment with EnvironmentLogger.')
    return logged_env


def fit (self, X, y=None):
'''
Run one AstraSim environment step with the current knobs and return its reward.

1) Call the AstraSim simulator and return performance, power, and energy
2) The parameter must be updated before the calling the AstraSim simulator
3) X is the trace files (.e., Workload)

Note: X and y are accepted but not read in this method, and the value
returned is the reward from env.step(), not the estimator itself.
'''
# Count invocations; used below so the CSV logging happens only once.
self.bo_steps += 1

# NOTE(review): step_fn is defined but never referenced in this method.
def step_fn(unused_timestep, unused_action, unused_env):
return {'timestamp': time.time()}

# Placeholder; overwritten by the value unpacked from env.step() below.
reward = 0
self.fitness_hist = {}

# read from the config file
# (path is relative, so it is resolved against the current working directory)
config = configparser.ConfigParser()
config.read("exp_config.ini")

# read the all the parameters from exp_config.ini
traj_dir = config.get("experiment_configuration", "trajectory_dir")
# NOTE(review): exp_name is read here but not used afterwards in this method.
exp_name = config.get("experiment_configuration", "exp_name")
log_dir = config.get("experiment_configuration", "log_dir")
reward_formulation = config.get("experiment_configuration", "reward_formulation")
# Kept as a string; compared against the literal 'True' below.
use_envlogger = config.get("experiment_configuration", "use_envlogger")

env_wrapper = make_astraSim_env(reward_formulation = reward_formulation,
rl_form = 'bo')

# check if trajectory directory exists
if use_envlogger == 'True':
if not os.path.exists(traj_dir):
os.makedirs(traj_dir)
# check if log directory exists
if not os.path.exists(log_dir):
os.makedirs(log_dir)

# NOTE(review): the logger directory passed here is self.exp_log_dir, not the
# traj_dir created above — confirm which one is intended.
env = self.wrap_in_envlogger(env_wrapper, self.exp_log_dir, use_envlogger)
env.reset()
print("Action dict: ", self.action_dict)

# convert the action dict to a list with the same order
# action_list = []

# Build the action sent to the environment: fixed network/workload paths plus
# the parsed system file, with the four tunable knobs overriding its entries.
actual_action = {}
actual_action['network'] = {"path": self.network_file}
actual_action['workload'] = {"path": self.workload_file}
self.parse_system(self.system_file, actual_action)

# Map the underscore knob names onto the hyphenated system-file keys.
actual_action["system"]["scheduling-policy"] = self.action_dict["scheduling_policy"]
actual_action["system"]["collective-optimization"] = self.action_dict["collective_optimization"]
actual_action["system"]["intra-dimension-scheduling"] = self.action_dict["intra_dimension_scheduling"]
actual_action["system"]["inter-dimension-scheduling"] = self.action_dict["inter_dimension_scheduling"]

# assumes the wrapped env's step() yields exactly four values — TODO confirm
# against make_astraSim_env
_, reward, _, info = env.step(actual_action)

self.fitness_hist['reward'] = reward
self.fitness_hist['action'] = self.action_dict
self.fitness_hist['obs'] = info

# NOTE(review): fitness_filename is assigned but not used afterwards.
fitness_filename = os.path.join(self.exp_name)

# logging twice due to the cv. So we will track the bo_steps and log only once
if self.bo_steps == 1:
self.log_fitness_to_csv(log_dir)

# clear the self.fitness_hist
# (reset to a list here, although it is used as a dict above)
self.fitness_hist = []

return reward

def predict(self, X, y):
    """Prediction is not supported by this estimator.

    Raises:
        NotImplementedError: always.  The exception is raised rather than
            returned so a caller cannot mistake the exception class for a
            prediction result.
    """
    raise NotImplementedError

def score(self, X, y=None):
    """Scoring is not supported by this estimator.

    Raises:
        NotImplementedError: always.  The exception is raised rather than
            returned so a caller cannot mistake the exception class for a
            numeric score.
    """
    raise NotImplementedError

def get_params(self, deep=False):
    """Return the four tunable knobs currently held in ``self.action_dict``.

    ``deep`` is accepted for signature compatibility but is not consulted.
    """
    knob_names = (
        "scheduling_policy",
        "collective_optimization",
        "intra_dimension_scheduling",
        "inter_dimension_scheduling",
    )
    return {name: self.action_dict[name] for name in knob_names}

def set_params(self, **params):
    """Update any of the four tunable knobs in ``self.action_dict``.

    Only the knobs actually supplied are changed, so a caller may update a
    subset (previously a missing knob raised ``KeyError``).  Keyword
    arguments that are not one of the four knobs are ignored, as before.

    Sample system configuration these knobs relate to (presumably the
    file read by ``parse_system`` — verify against the system inputs):

        scheduling-policy: LIFO
        endpoint-delay: 1
        active-chunks-per-dimension: 1
        preferred-dataset-splits: 64
        boost-mode: 1
        all-reduce-implementation: direct_ring_halvingDoubling
        all-gather-implementation: direct_ring_halvingDoubling
        reduce-scatter-implementation: direct_ring_halvingDoubling
        all-to-all-implementation: direct_direct_direct
        collective-optimization: localBWAware
        intra-dimension-scheduling: FIFO
        inter-dimension-scheduling: baseline

    Returns:
        ``self``, so calls can be chained.
    """
    knob_names = (
        "scheduling_policy",
        "collective_optimization",
        "intra_dimension_scheduling",
        "inter_dimension_scheduling",
    )
    for name in knob_names:
        if name in params:
            self.action_dict[name] = params[name]

    return self


def calculate_reward(self, energy, latency):
    """Return the reciprocal of the distance between ``latency`` and 1.

    ``energy`` is accepted but not used in the computation.

    NOTE(review): a latency of exactly 1 makes the distance zero and the
    division raises ``ZeroDivisionError``.
    """
    squared_error = (float(latency) - 1) ** 2
    distance = squared_error ** 0.5
    return 1 / distance


def log_fitness_to_csv(self, filename):
    """Append the latest fitness record to two CSV files.

    ``filename`` is used as a directory: the reward alone is appended to
    ``fitness.csv`` and the whole ``self.fitness_hist`` record is appended
    to ``actions.csv``.  Both are written without header or index.
    """
    rows_by_file = (
        ("fitness.csv", self.fitness_hist['reward']),
        ("actions.csv", self.fitness_hist),
    )
    for csv_name, row in rows_by_file:
        target = os.path.join(filename, csv_name)
        pd.DataFrame([row]).to_csv(target, index=False, header=False, mode='a')
Binary file added docs/installation_images/file_preview.png

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this file.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
35 changes: 35 additions & 0 deletions installation.md

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove the file as well. I think the new_VM_setup_archgym should be the main one.

Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Installation for Ubuntu VM
## Step-by-Step Guide

This guide assumes you already have Ubuntu running in a virtual machine (VM) on your system. If not, follow a recent tutorial (for example, on YouTube) and make sure Ubuntu runs successfully in the VM before continuing.

1. Open Your Virtual Machine
2. Open Terminal (ctrl + T)
3. Clone this repo https://github.com/google/CFU-Playground by using ```git clone https://github.com/google/CFU-Playground```
4. Go to this directory "CFU-Playground/third_party/python/vizier/" using ```cd CFU-Playground/third_party/python/vizier/```
5. Now you will see that there is a CFU-Playground folder. Now go to "CFU-Playground/third_party/python/vizier/" and see if vizier folder is empty or not! ![Alt text](./docs/installation_images/file_preview.png?raw=true "Title")
10. Go to python folder using terminal (location: CFU-Playground/third_party/python/) using command like ```cd CFU-Playground/third_party/python/```
11. run ```rm -rf vizier```
13. Clone vizier repo, run ```git clone https://github.com/ShvetankPrakash/vizier```
15. ```cd CFU-Playground/``` -> ```cd scripts/``` -> Run setup_vizier.sh file using command ```./setup_vizier.sh```
19. This might give some errors; if so, activate a conda environment or create one. If you have existing environments, you can list them with ```conda env list```. If you don't find one, create it as follows: open an Anaconda terminal and run ```conda create -n myenv```, replacing myenv with the environment name. Now activate that environment using ```conda activate myenv```
1. Install anaconda
2.
22. Now run ```./setup_vizier.sh``` again
23. Some Errors might occur due to version of python.
24. run ```sudo apt install build-essential```
25. run ```pip install cvxopt```
26. run ```export CVXOPT_BUILD_FFTW=1```
27. ```pip install cvxopt --no-binary cvxopt```
28. ```conda install -c conda-forge cvxopt```
29. ```./setup_vizier.sh```
30. Might give errors related to ale-py
31. Go to file requirements-benchmarks.txt:
32. third party -> python -> vizier -> vizier -> requirements-benchmarks.txt
33. Comment out all the lines
34. Now running setup_vizier.sh file would not give ale-py error
35. Now, try some other example - go to CFU-Playground -> proj -> dse_template -> vizier_dse.py
36. Go to line 40, comment it out and add this line
37. cycles, cells = 1, 1
38. Run this file. It should run successfully without any errors.

66 changes: 66 additions & 0 deletions new_VM_setup_archgym.md

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove references to the username etc.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# Setting Up a New Virtual Machine and Arch Gym Env Installation !

Follow the instructions below to set up remote access to a virtual machine and perform the necessary installation for creating the arch-gym environment.


## Initializing and Starting VM
1. Generating the ssh public and private key using : `ssh-keygen -t rsa -b 2048 -C [USERNAME]`
2. Get SSH access from the administrator
3. Open terminal and run :`ssh -i <PATH_TO_PRIVATE_KEY> <USERNAME@IP_ADDRESS>` . Example : `ssh -i C:\Users\yashc\.ssh/id_rsa [email protected]`
4. Open VS code and download Remote-SSH extension by microsoft
5. Press F1, select Remote-SSH: Connect to Host..., and use the same `USERNAME@IP_ADDRESS` as in step 3
6. New VS Code window will be opened and If VS Code cannot automatically detect the type of server you are connecting to, you will be asked to select the type manually.

## Installing Conda
In terminal run the following commands to install conda for your remote virtual machine
1. `curl -O https://repo.anaconda.com/archive/Anaconda3-2023.07-1-Linux-x86_64.sh`
2. `sha256sum Anaconda3-2023.07-1-Linux-x86_64.sh`
3. `bash Anaconda3-2023.07-1-Linux-x86_64.sh ( press yes and enter for all steps )`
4. `source ~/.bashrc`


## Creating Arch-Gym Environment
Follow the below steps for setting up vizier and arch-gym. In terminal
1. `git clone https://github.com/srivatsankrishnan/oss-arch-gym.git`
2. `cd oss-arch-gym/`
3. `conda env create -f environment.yml`
4. `conda activate arch-gym`
5. `cd ..`
6. `git clone https://github.com/ShvetankPrakash/vizier.git`
7. cd into vizier directory
8. `sudo apt-get install -y libprotobuf-dev`
9. `pip install -r requirements.txt --use-deprecated=legacy-resolver` ( you may see some package compatibility issues, ignore them )
10. `pip install -e .` ( you may see some package compatibility issues, ignore them )
11. `./build_protos.sh`
12. `pip install -r requirements-algorithms.txt` (you may probably end up with gcc compiler issue, ignore as of now)
13. `pip install -r requirements-benchmarks.txt` ( you may see some package compatibility issues, ignore them )
14. Open VS code and make a copy of this script: [https://github.com/google/CFU-Playground/blob/main/proj/dse_template/vizier_dse.py](https://github.com/google/CFU-Playground/blob/main/proj/dse_template/vizier_dse.py)
15. Remove line 10 of your local copy
16. Go to line 40, comment it out and add this line `cycles, cells = 1, 1`
17. In terminal Run `python vizier_dse.py` to test working. Note : all this should be done with arch-gym virtual env activated only
18. If you get ModuleNotFoundError: No module named 'emukit'. Run `pip install emukit`
19. If pip install emukit throws error related to gcc compiler, then try to install it using this - `sudo apt update && sudo apt install -y build-essential`
20. Run `pip install emukit` again
21. Run `python vizier_dse.py` to test its working
The output should look like
`Suggested Parameters (bypass, cfu, dCacheSize, hardwareDiv, iCacheSize, mulDiv, prediction, safe, singleCycleShift, singleCycleMulDiv): True False 8192.0 True 4096.0 True static False True False.............`


## Testing Overall Installation

Come out of vizier directory in terminal using `cd ..`

1. `cd oss-arch-gym/acme`
2. `pip install .[jax,tf,testing,envs]`
3. `which python`
Output eg : `/home/yashc/anaconda3/envs/arch-gym/bin/python`
Replace `bin/python` with `lib` and copy it : `/home/yashc/anaconda3/envs/arch-gym/lib`
#### In VS Code
1. Go to .bashrc file inside your username folder ( for me its yashc)
2. Paste this in last : `export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/<USERNAME>/anaconda3/envs/arch-gym/lib/"`

#### In terminal
Remember that your arch-gym env should be activated at all times
1. Run `sudo apt-get install libgmp-dev`
2. `cd oss-arch-gym/sims/customenv`
3. Run all the Python files ( all should run without error )
2 changes: 1 addition & 1 deletion sims/AstraSim/run_general.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ SCRIPT_DIR=$(dirname "$(realpath $0)")
BINARY="${SCRIPT_DIR:?}"/astrasim-archgym/astra-sim/build/astra_analytical/build/AnalyticalAstra/bin/AnalyticalAstra
SYSTEM="${SCRIPT_DIR:?}"/general_system.txt
NETWORK="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/network/analytical/$1
WORKLOAD="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/workload/realworld_workloads/$3
WORKLOAD="${SCRIPT_DIR:?}"/astrasim-archgym/themis/inputs/workload/$3

echo "SH NETWORK: ${NETWORK}"
echo "SH SYSTEM: ${SYSTEM}"
Expand Down
Loading