Commit
Showing 44 changed files with 2,171 additions and 131 deletions.
@@ -0,0 +1,5 @@
[html]
directory = coverage

[run]
data_file = .coverage_$LOCAL_RANK
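The data_file = .coverage_$LOCAL_RANK setting relies on coverage.py's environment-variable expansion so that each torchrun worker writes its own data file instead of overwriting a shared .coverage. As a minimal sketch (not part of this commit), the per-rank files could later be merged into the HTML directory configured above with the coverage.py API:

import glob
from coverage import Coverage

# Sketch only: merge the per-rank data files (.coverage_0, .coverage_1, ...) written by
# the torchrun workers, then emit the HTML report into the directory set in [html].
cov = Coverage()                        # reads the [run]/[html] sections of the config file
cov.combine(glob.glob(".coverage_*"))   # explicitly pass the per-rank files to merge
cov.html_report(directory="coverage")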
@@ -0,0 +1,33 @@
name: approve

on:
  pull_request:
    branches: [ "main" ]
    types: [opened, synchronize, reopened]

jobs:
  approve:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Code
        uses: actions/checkout@v2

      - name: Get PR Approvals
        run: |
          PR_NUMBER=$(jq --raw-output .number "$GITHUB_EVENT_PATH")
          APPROVERS=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
            "https://api.github.com/repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
            | jq -r '.[] | select(.state == "APPROVED") | .user.login')
          echo "APPROVERS=$APPROVERS" >> $GITHUB_ENV

      - name: Check for Specific Approver
        run: |
          SPECIFIC_APPROVER="aoyulong"
          if echo "$APPROVERS" | grep -q "$SPECIFIC_APPROVER"; then
            echo "Specific approver has approved the PR."
          else
            echo "The PR has not been approved by the specific approver."
            exit 1
          fi
@@ -1,48 +1,54 @@
 # This workflow will install Python dependencies, run tests and lint with a single version of Python
 # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 
-name: flagscale-test
+name: test
 
 on:
   push:
-    branches: [ "main", "add_CICD" ]
+    branches: [ "main" ]
   pull_request:
     branches: [ "main" ]
 
 jobs:
-  container-test-job:
+  test:
     runs-on: self-hosted
     container:
-      image: localhost:5000/flagscale_cicd:v1.1
-      env:
-        NODE_ENV: development
+      image: localhost:5000/flagscale_cicd:v1.3
       ports:
         - 80
-      options: --gpus all --hostname flagscale_cicd
+      volumes:
+        - /home/flagscale_cicd/flask/static:/workspace/report
+      options: --gpus all --hostname flagscale_cicd
     steps:
-      - name: checkout-code
+      - name: Checkout Code
         uses: actions/checkout@v2
 
-      - name: unit_test-megatron
+      - name: Megatron Unit Test
         run: |
-          export PYTHONPATH=./megatron:$PYTHONPATH
-          export PYTHONPATH=./../../FlagScale/:$PYTHONPATH
           cd megatron
-          # torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/fusions
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/pipeline_parallel
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/*.py
-      - name: unit_test-flagscale
+          export PYTHONPATH=..:$PYTHONPATH
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/data
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/dist_checkpointing
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/fusions
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/models
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/pipeline_parallel
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/tensor_parallel
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/transformer
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/*.py
+      - name: Megatron Unit Test Coverage Online Report
+        run: |
+          echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-megatron/index.html"
+      - name: Flagscale Unit Test
         run: |
           export PYTHONPATH=./flagscale:$PYTHONPATH
-          torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/launcher
-      - name: functional_test-flagscale
+          torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-flagscale --cov=flagscale -q -x tests/unit_tests/launcher
+      - name: Flagscale Unit Test Coverage Online Report
+        run: |
+          echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-flagscale/index.html"
+      - name: Flagscale Functional Test
         run: |
           python run.py --config-path tests/functional_tests/aquila/conf --config-name config action=test
           pytest -s tests/functional_tests/test_result.py --test_reaults_path=./tests/functional_tests/aquila/test_result
Empty file.
@@ -0,0 +1,40 @@
FROM nvcr.io/nvidia/pytorch:24.05-py3

ENV DEBIAN_FRONTEND noninteractive
ENV TZ=Asia/Shanghai

##############################################################################
# Change apt source to Ksyun
##############################################################################
RUN sed -i "s#\S\+#http://apt.ksyun.cn/ubuntu/#2" /etc/apt/sources.list && \
    > /etc/apt/apt.conf.d/docker-clean && \
    > /etc/dpkg/dpkg.cfg.d/pkg-config-hook-config

##############################################################################
# Install basic utilities
##############################################################################
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl wget vim tmux less unzip \
        htop iftop iotop ca-certificates openssh-client openssh-server \
        rsync iputils-ping net-tools \
        tzdata psmisc screen && \
    apt-get clean

##############################################################################
# SSH configuration (not secure, only for development purpose)
##############################################################################
RUN mkdir -p /run/sshd && \
    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    echo "StrictHostKeyChecking no\n" >> /etc/ssh/ssh_config

##############################################################################
# Install Miniconda
##############################################################################
RUN mkdir -p ~/miniconda3 && \
    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh && \
    bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 || { echo 'Miniconda installation failed' ; exit 1; } && \
    rm -rf ~/miniconda3/miniconda.sh && \
    ~/miniconda3/bin/conda init bash || { echo 'conda init failed' ; exit 1; } && \
    ~/miniconda3/bin/conda config --set auto_activate_base false || { echo 'conda config failed' ; exit 1; }
@@ -0,0 +1,43 @@
defaults:
  - train: demo
  - _self_

experiment:
  exp_name: aquila2
  exp_dir: ./outputs
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 1
    nproc_per_node: 8
  envs:
    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
    CUDA_DEVICE_MAX_CONNECTIONS: 1
  auto_tuner:
    space:
      data_parallel_size: "auto"
      use_distributed_optimizer: [true, false]
      tensor_model_parallel_size: [2, 4, 8]
      sequence_parallel: [true]
      pipeline_model_parallel_size: "auto"
      num_layers_per_virtual_pipeline_stage: [1]
      context_parallel_size: "auto"
      expert_model_parallel_size: [1]
      micro_batch_size: "auto"
      use_recompute: [true]
      recompute_method: "auto"
      recompute_granularity: "auto"
      recompute_num_layers: "auto"
    control:
      max_time_per_task: 300
      train_iters: 5
      max_time: 600

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
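For orientation, the space block above is the grid the auto tuner searches; dimensions set to "auto" are filled in from the cluster size (nnodes x nproc_per_node = 8 GPUs in this config). The following is an illustrative sketch, not FlagScale's actual tuner logic, of how such a space can be expanded into candidate strategies and pruned to layouts that tile the available GPUs; the function name enumerate_strategies and the concrete "auto" expansions are assumptions.

import itertools

WORLD_SIZE = 8  # nnodes * nproc_per_node from the runner section above

# Hypothetical expansion of the "auto" dimensions; the real tuner derives these itself.
space = {
    "tensor_model_parallel_size": [2, 4, 8],
    "pipeline_model_parallel_size": [1, 2, 4],   # "auto"
    "micro_batch_size": [1, 2, 4],               # "auto"
    "use_distributed_optimizer": [True, False],
}

def enumerate_strategies(space, world_size=WORLD_SIZE):
    keys = list(space)
    for values in itertools.product(*(space[k] for k in keys)):
        strategy = dict(zip(keys, values))
        tp = strategy["tensor_model_parallel_size"]
        pp = strategy["pipeline_model_parallel_size"]
        if world_size % (tp * pp):
            continue  # skip layouts that do not evenly tile the available GPUs
        strategy["data_parallel_size"] = world_size // (tp * pp)
        yield strategy

print(sum(1 for _ in enumerate_strategies(space)))  # number of candidate strategies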
@@ -0,0 +1 @@
from .tuner import AutoTuner
@@ -0,0 +1,97 @@
import os
import copy


class Generator:

    def __init__(self, config):
        self.config = config
        # TODO: Just a temporary solution; needs to be configured by the user
        if "args_mapping" in config.experiment.auto_tuner:
            self.args_mapping = config.experiment.auto_tuner.args_mapping
        else:
            self.args_mapping = {
                "data_parallel_size": "data_parallel_size",
                "use_distributed_optimizer": "use_distributed_optimizer",
                "tensor_model_parallel_size": "tensor_model_parallel_size",
                "sequence_parallel": "sequence_parallel",
                "pipeline_model_parallel_size": "pipeline_model_parallel_size",
                "num_layers_per_virtual_pipeline_stage":
                    "num_layers_per_virtual_pipeline_stage",
                "recompute_method": "recompute_method",
                "recompute_granularity": "recompute_granularity",
                "recompute_num_layers": "recompute_num_layers",
                "micro_batch_size": "micro_batch_size",
                "context_parallel_size": "context_parallel_size",
                "expert_model_parallel_size": "expert_model_parallel_size",
            }

    def _set_value(self, strategy, config):
        for key, value in self.args_mapping.items():
            if key in ["micro_batch_size"]:
                config.train.model[value] = strategy[key]
            elif key in ["data_parallel_size"]:
                continue
            else:
                if strategy[key] is None:
                    if value in config.train.system:
                        del config.train.system[value]
                    continue
                config.train.system[value] = strategy[key]

    def gen(self, strategy):
        config = copy.deepcopy(self.config)
        self._set_value(strategy, config)

        # Logging interval should be 1
        config.train.system.logging.log_interval = 1

        # Set redirect and tee
        config.experiment.runner.tee = 3
        config.experiment.runner.redirects = 3

        # auto_tune should be true: it skips checkpoint saving when training ends and reports memory every iteration
        config.train.system.auto_tune = True

        # Del lr_warmup_samples and train_samples to run megatron.
        assert "optimizer" in config.train.model
        assert "lr_scheduler" in config.train.model.optimizer
        if "lr_warmup_samples" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.lr_warmup_samples
        # Del lr_decay_samples to run megatron.
        if "lr_decay_samples" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.lr_decay_samples
        # Del rampup_batch_size to run megatron.
        if "rampup_batch_size" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.rampup_batch_size
        # Del lr_warmup_fraction to run megatron.
        if "lr_warmup_fraction" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.lr_warmup_fraction

        if "train_samples" in config.train.model:
            del config.train.model.train_samples

        # Del checkpoint load
        if "checkpoint" in config.train.system:
            if "load" in config.train.system.checkpoint:
                del config.train.system.checkpoint.load
            if "save_interval" in config.train.system.checkpoint:
                config.train.system.checkpoint.save_interval = 2000

        # Set train_iters of each task
        if "control" in config.experiment.auto_tuner:
            config.train.model.train_iters = config.experiment.auto_tuner.control.get(
                "train_iters", 5)
        else:
            config.train.model.train_iters = 5

        # log dir
        config.experiment.exp_dir = os.path.join(config.experiment.exp_dir,
                                                 "auto_tuner",
                                                 f"task_{strategy['idx']}")

        return config

    def gen_best_task(self, strategy, config):
        self._set_value(strategy, config)
        return config
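A minimal usage sketch of Generator under assumed inputs: the OmegaConf config below only mocks the sections the class touches (experiment, train.model, train.system), the import path is hypothetical, and the strategy values are placeholders rather than anything produced by the tuner.

from omegaconf import OmegaConf
from generator import Generator  # hypothetical import path for the class above

# Toy config with just the fields Generator reads or rewrites.
config = OmegaConf.create({
    "experiment": {
        "exp_dir": "./outputs",
        "runner": {},
        "auto_tuner": {"control": {"train_iters": 5}},
    },
    "train": {
        "model": {
            "micro_batch_size": 1,
            "optimizer": {"lr_scheduler": {"lr_warmup_samples": 100}},
        },
        "system": {
            "logging": {"log_interval": 10},
            "checkpoint": {"load": "./ckpt", "save_interval": 500},
        },
    },
})

# One candidate strategy; every key of the default args_mapping must be present.
strategy = {
    "idx": 0,
    "data_parallel_size": 1,
    "use_distributed_optimizer": True,
    "tensor_model_parallel_size": 4,
    "sequence_parallel": True,
    "pipeline_model_parallel_size": 2,
    "num_layers_per_virtual_pipeline_stage": None,
    "recompute_method": "uniform",
    "recompute_granularity": "full",
    "recompute_num_layers": 1,
    "micro_batch_size": 2,
    "context_parallel_size": 1,
    "expert_model_parallel_size": 1,
}

task_config = Generator(config).gen(strategy)
print(task_config.experiment.exp_dir)       # ./outputs/auto_tuner/task_0
print(task_config.train.system.checkpoint)  # load removed, save_interval forced to 2000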
@@ -0,0 +1 @@
from .pruner import Pruner