Merge pull request argonne-lcf#289 from argonne-lcf/feature/Cerebrase…

…_updates_1.9.2 updates (minor) for release 1.9.2, which was just an OS patch
saforem2 · Nov 4, 2023 · 9ab1162 · 9ab1162
2 parents 79715b9 + 8b8cef9
commit 9ab1162
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 42 deletions.
diff --git a/docs/ai-testbed/cerebras/customizing-environment.md b/docs/ai-testbed/cerebras/customizing-environment.md
@@ -7,15 +7,15 @@
 ```console
 #Make your home directory navigable
 chmod a+xr ~/
-mkdir ~/R_1.9.1
-chmod a+x ~/R_1.9.1/
-cd ~/R_1.9.1
+mkdir ~/R_1.9.2
+chmod a+x ~/R_1.9.2/
+cd ~/R_1.9.2
 # Note: "deactivate" does not actually work in scripts.
 deactivate
 rm -r venv_pt
 /software/cerebras/python3.8/bin/python3.8 -m venv venv_pt
 source venv_pt/bin/activate
-pip3 install /opt/cerebras/wheels/cerebras_pytorch-1.9.1+1cf4d0632b-cp38-cp38-linux_x86_64.whl --find-links=/opt/cerebras/wheels
+pip3 install /opt/cerebras/wheels/cerebras_pytorch-1.9.2+92b4fad15b-cp38-cp38-linux_x86_64.whl --find-links=/opt/cerebras/wheels
 pip install numpy==1.23.4
 pip install datasets transformers
 ```
@@ -24,17 +24,17 @@ pip install datasets transformers
 
 ```console
 chmod a+xr ~/
-mkdir ~/R_1.9.1
-chmod a+x ~/R_1.9.1/
-cd ~/R_1.9.1
+mkdir ~/R_1.9.2
+chmod a+x ~/R_1.9.2/
+cd ~/R_1.9.2
 # Note: "deactivate" does not actually work in scripts.
 deactivate
 rm -r venv_tf
 /software/cerebras/python3.8/bin/python3.8 -m venv venv_tf
 source venv_tf/bin/activate
 #pip install tensorflow_datasets
 #pip install spacy
-pip3 install /opt/cerebras/wheels/cerebras_tensorflow-1.9.1+1cf4d0632b-cp38-cp38-linux_x86_64.whl --find-links=/opt/cerebras/wheels/
+pip3 install /opt/cerebras/wheels/cerebras_tensorflow-1.9.2+92b4fad15b-cp38-cp38-linux_x86_64.whl --find-links=/opt/cerebras/wheels/
 pip install numpy==1.23.4
 ```
 
@@ -43,13 +43,13 @@ pip install numpy==1.23.4
 To activate one of these virtual environments,
 
 ```console
-source ~/R_1.9.1/venv_pt/bin/activate
+source ~/R_1.9.2/venv_pt/bin/activate
 ```
 
 or
 
 ```console
-source ~/R_1.9.1/venv_tf/bin/activate
+source ~/R_1.9.2/venv_tf/bin/activate
 ```
 
 To deactivate a virtual environment,

diff --git a/docs/ai-testbed/cerebras/example-programs.md b/docs/ai-testbed/cerebras/example-programs.md
@@ -4,12 +4,12 @@
 Make a working directory and a local copy of the Cerebras **modelzoo** and **anl_shared** repository, if not previously done, as follows.
 
 ```bash
-mkdir ~/R_1.9.1
-cd ~/R_1.9.1
+mkdir ~/R_1.9.2
+cd ~/R_1.9.2
 git clone https://github.com/Cerebras/modelzoo.git
 ```
 <!---
-cp -r /software/cerebras/model_zoo/anl_shared/ ~/R_1.9.1/anl_shared
+cp -r /software/cerebras/model_zoo/anl_shared/ ~/R_1.9.2/anl_shared
 --->
 
 ## UNet
@@ -19,17 +19,17 @@ To run Unet with the <a href="https://www.kaggle.com/c/severstal-steel-defect-de
 First, source a Cerebras PyTorch virtual environment.
 
 ```console
-source ~/R_1.9.1/venv_pt/bin/activate
+source ~/R_1.9.2/venv_pt/bin/activate
 ```
 
 Then
 
 ```console
-cd ~/R_1.9.1/modelzoo/modelzoo/vision/pytorch/unet
+cd ~/R_1.9.2/modelzoo/modelzoo/vision/pytorch/unet
 cp /software/cerebras/dataset/severstal-steel-defect-detection/params_severstal_binary_rawds.yaml configs/params_severstal_binary_rawds.yaml
 export MODEL_DIR=model_dir_unet
 if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
-python run.py CSX --job_labels name=unet_pt --params configs/params_severstal_binary_rawds.yaml --model_dir $MODEL_DIR --mode train --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_1.9.1/modelzoo/ --compile_dir $(whoami) |& tee mytest.log 
+python run.py CSX --job_labels name=unet_pt --params configs/params_severstal_binary_rawds.yaml --model_dir $MODEL_DIR --mode train --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_1.9.2/modelzoo/ --compile_dir $(whoami) |& tee mytest.log 
 ```
 
 <!--- Appears to not have been ported to 1.7.1
@@ -43,7 +43,7 @@ The BraggNN model has two versions:<br>
 
 ```console
 TODO
-cd ~/R_1.9.1/anl_shared/braggnn/tf
+cd ~/R_1.9.2/anl_shared/braggnn/tf
 # This yaml has a correct path to a BraggNN dataset
 cp /software/cerebras/dataset/BraggN/params_bragg_nonlocal_sampleds.yaml configs/params_bragg_nonlocal_sampleds.yaml
 export MODEL_DIR=model_dir_braggnn
@@ -63,17 +63,17 @@ source /software/cerebras/venvs/venv_pt/bin/activate
 # or your personal venv
 --->
 ```console
-source ~/R_1.9.1/venv_pt/bin/activate
+source ~/R_1.9.2/venv_pt/bin/activate
 ```
 
 Then
 
 ```console
-cd ~/R_1.9.1/modelzoo/modelzoo/transformers/pytorch/bert
+cd ~/R_1.9.2/modelzoo/modelzoo/transformers/pytorch/bert
 cp /software/cerebras/dataset/bert_large/bert_large_MSL128_sampleds.yaml configs/bert_large_MSL128_sampleds.yaml
 export MODEL_DIR=model_dir_bert_large_pytorch
 if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
-python run.py CSX --job_labels name=bert_pt --params configs/bert_large_MSL128_sampleds.yaml --num_workers_per_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.1/modelzoo/ --compile_dir $(whoami) |& tee mytest.log
+python run.py CSX --job_labels name=bert_pt --params configs/bert_large_MSL128_sampleds.yaml --num_workers_per_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.2/modelzoo/ --compile_dir $(whoami) |& tee mytest.log
 ```
 
 The last parts of the output should resemble the following, with messages about cuda that should be ignored and are not shown.
@@ -104,17 +104,17 @@ This BERT-large msl128 example uses a single sample dataset for both training an
 First, source a Cerebras TensorFlow virtual environment.
 
 ```console
-source ~/R_1.9.1/venv_tf/bin/activate
+source ~/R_1.9.2/venv_tf/bin/activate
 ```
 
 Then
 
 ```console
-cd ~/R_1.9.1/modelzoo/modelzoo/transformers/tf/bert
+cd ~/R_1.9.2/modelzoo/modelzoo/transformers/tf/bert
 cp /software/cerebras/dataset/bert_large/params_bert_large_msl128_sampleds.yaml configs/params_bert_large_msl128_sampleds.yaml
 export MODEL_DIR=mytest
 if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
-python run.py CSX --job_labels name=bert_tf --max_steps 1000 --params configs/params_bert_large_msl128_sampleds.yaml --num_workers_per_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.1/modelzoo/ --compile_dir $(whoami) |& tee mytest.log
+python run.py CSX --job_labels name=bert_tf --max_steps 1000 --params configs/params_bert_large_msl128_sampleds.yaml --num_workers_per_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.2/modelzoo/ --compile_dir $(whoami) |& tee mytest.log
 ```
 
 The last parts of the output should resemble the following, with messages about cuda that should be ignored and are not shown.
@@ -147,17 +147,17 @@ This PyTorch GPT-J 6B parameter pretraining sample uses 2 CS2s.
 First, source a Cerebras PyTorch virtual environment.
 
 ```console
-source ~/R_1.9.1/venv_pt/bin/activate
+source ~/R_1.9.2/venv_pt/bin/activate
 ```
 
 Then
 
 ```console
-cd ~/R_1.9.1/modelzoo/modelzoo/transformers/pytorch/gptj
+cd ~/R_1.9.2/modelzoo/modelzoo/transformers/pytorch/gptj
 cp /software/cerebras/dataset/gptj/params_gptj_6B_sampleds.yaml configs/params_gptj_6B_sampleds.yaml
 export MODEL_DIR=model_dir_gptj
 if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
-python run.py CSX --job_labels name=gptj_pt --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_1.9.1/modelzoo/ --compile_dir $(whoami) |& tee mytest.log
+python run.py CSX --job_labels name=gptj_pt --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_1.9.2/modelzoo/ --compile_dir $(whoami) |& tee mytest.log
 ```
 
 The last parts of the output should resemble the following:
@@ -187,17 +187,17 @@ source /software/cerebras/venvs/venv_tf/bin/activate
 # or your personal venv
 
 ```console
-source ~/R_1.9.1/venv_tf/bin/activate
+source ~/R_1.9.2/venv_tf/bin/activate
 ```
 
 Then
 
 ```console
-cd ~/R_1.9.1/modelzoo/modelzoo/transformers/tf/gptj
+cd ~/R_1.9.2/modelzoo/modelzoo/transformers/tf/gptj
 cp /software/cerebras/dataset/gptj/params_gptj_6B_tf_sampleds.yaml configs/params_gptj_6B_sampleds.yaml
 export MODEL_DIR=model_dir_gptj_tf
 if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
-python run.py CSX --job_labels name=gptj_tf --max_steps 500 --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.1/modelzoo/ --compile_dir $(whoami) |& tee mytest.log
+python run.py CSX --job_labels name=gptj_tf --max_steps 500 --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.2/modelzoo/ --compile_dir $(whoami) |& tee mytest.log
 ```
 
 The last parts of the output should resemble the following:

diff --git a/docs/ai-testbed/cerebras/running-a-model-or-program.md b/docs/ai-testbed/cerebras/running-a-model-or-program.md
@@ -25,20 +25,22 @@ Follow these instructions to compile and train the `fc_mnist` TensorFlow and PyT
 
 First, make virtual environments for Cerebras for PyTorch and/or TensorFlow.
 See [Customizing Environments](./customizing-environment.md) for the procedures for making PyTorch and/or TensorFlow virtual environments for Cerebras.
-If the environments are made in ```~/R_1.9.1/```, then they would be activated as follows:
+If the environments are made in ```~/R_1.9.2/```, then they would be activated as follows:
 ```console
-source ~/R_1.9.1/venv_pt/bin/activate
+source ~/R_1.9.2/venv_pt/bin/activate
 ```
 or
 ```console
-source ~/R_1.9.1/vent_tf/bin/activate
+source ~/R_1.9.2/venv_tf/bin/activate
 ```
 
 ### Clone the Cerebras modelzoo
 
+Note: For virtual environment R_1.9.2, the modelzoo is unchanged from R_1.9.1.
+
 ```console
-mkdir ~/R_1.9.1
-cd ~/R_1.9.1
+mkdir ~/R_1.9.2
+cd ~/R_1.9.2
 git clone https://github.com/Cerebras/modelzoo.git
 cd modelzoo
 git tag
@@ -49,23 +51,23 @@ git checkout Release_1.9.1
 ### Activate your PyTorch virtual environment, and change to the working directory
 
 ```console
-source ~/R_1.9.1/venv_pt/bin/activate
-cd ~/R_1.9.1/modelzoo/modelzoo/fc_mnist/pytorch
+source ~/R_1.9.2/venv_pt/bin/activate
+cd ~/R_1.9.2/modelzoo/modelzoo/fc_mnist/pytorch
 ```
 
 Next, edit configs/params.yaml, making the following changes:
 
 ```text
  train_input:
--    data_dir: "./data/mnist/train"
+-    data_dir: "./mnist"
 +    data_dir: "/software/cerebras/dataset/fc_mnist/data/mnist/train"
 ```
 
 and
 
 ```text
  eval_input:
--    data_dir: "./data/mnist/val"
+-    data_dir: "./mnist"
 +    data_dir: "/software/cerebras/dataset/fc_mnist/data/mnist/train"
 ```
 
@@ -79,7 +81,7 @@ To run the sample:
 export MODEL_DIR=model_dir
 # deletion of the model_dir is only needed if sample has been previously run
 if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
-python run.py CSX --job_labels name=pt_smoketest --params configs/params.yaml --num_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_1.9.1/modelzoo --compile_dir /$(whoami) |& tee mytest.log
+python run.py CSX --job_labels name=pt_smoketest --params configs/params.yaml --num_csx=1 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_1.9.2/modelzoo --compile_dir /$(whoami) |& tee mytest.log
 ```
 
 A successful fc_mnist PyTorch training run should finish with output resembling the following:
@@ -100,8 +102,8 @@ A successful fc_mnist PyTorch training run should finish with output resembling
 ### Activate your TensorFlow virtual environment and change to the working directory
 
 ```console
-source ~/R_1.9.1/venv_tf/bin/activate
-cd ~/R_1.9.1/modelzoo/modelzoo/fc_mnist/tf/
+source ~/R_1.9.2/venv_tf/bin/activate
+cd ~/R_1.9.2/modelzoo/modelzoo/fc_mnist/tf/
 ```
 
 Next, edit configs/params.yaml, making the following change. Cerebras requires that the data_dir be an absolute path.
@@ -125,7 +127,7 @@ Next, edit configs/params.yaml, making the following change. Cerebras requires t
 export MODEL_DIR=model_dir
 # deletion of the model_dir is only needed if sample has been previously run
 if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi
-python run.py CSX pipeline --job_labels name=tf_fc_mnist --params configs/params.yaml --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.1/modelzoo/ --compile_dir /$(whoami) |& tee mytest.log
+python run.py CSX pipeline --job_labels name=tf_fc_mnist --params configs/params.yaml --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software/ --python_paths /home/$(whoami)/R_1.9.2/modelzoo/ --compile_dir /$(whoami) |& tee mytest.log
 ```
 
 A successful fc_mnist TensorFlow training run should finish with output resembling the following: