Skip to content

Commit

Permalink
Merge branch 'main' into hetero_tp
Browse files Browse the repository at this point in the history
  • Loading branch information
njuerect authored Jun 7, 2024
2 parents d5c38a9 + ac373cb commit 27577d9
Show file tree
Hide file tree
Showing 44 changed files with 2,171 additions and 131 deletions.
5 changes: 5 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Coverage.py configuration (used by the CI unit-test jobs).

[html]
# Directory for the generated HTML coverage report.
directory = coverage

[run]
# One data file per torchrun worker, keyed by the LOCAL_RANK env var, so
# parallel ranks don't clobber each other's coverage data. Coverage.py
# expands $VAR references in config values at load time.
data_file = .coverage_$LOCAL_RANK
33 changes: 33 additions & 0 deletions .github/workflows/approve.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Gate PRs on an approving review from a specific maintainer: the check
# fails until that user has approved the pull request.
name: approve

on:
  pull_request:
    branches: [ "main" ]
    types: [opened, synchronize, reopened]

jobs:
  approve:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout Code
        uses: actions/checkout@v2

      - name: Get PR Approvals
        run: |
          PR_NUMBER=$(jq --raw-output .number "$GITHUB_EVENT_PATH")
          APPROVERS=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
            "https://api.github.com/repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
            | jq -r '.[] | select(.state == "APPROVED") | .user.login')
          # APPROVERS holds one login per line when the PR has several
          # approvals. A plain KEY=VALUE append would corrupt $GITHUB_ENV
          # for multiline values, so use the heredoc delimiter syntax
          # GitHub requires for them.
          {
            echo "APPROVERS<<EOF"
            echo "$APPROVERS"
            echo "EOF"
          } >> "$GITHUB_ENV"

      - name: Check for Specific Approver
        run: |
          SPECIFIC_APPROVER="aoyulong"
          if echo "$APPROVERS" | grep -q "$SPECIFIC_APPROVER"; then
            echo "Specific approver has approved the PR."
          else
            echo "The PR has not been approved by the specific approver."
            exit 1
          fi
54 changes: 30 additions & 24 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -1,48 +1,54 @@
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: flagscale-test
name: test

on:
push:
branches: [ "main", "add_CICD" ]
branches: [ "main" ]
pull_request:
branches: [ "main" ]

jobs:
container-test-job:
test:
runs-on: self-hosted
container:
image: localhost:5000/flagscale_cicd:v1.1
env:
NODE_ENV: development
image: localhost:5000/flagscale_cicd:v1.3
ports:
- 80
options: --gpus all --hostname flagscale_cicd
volumes:
- /home/flagscale_cicd/flask/static:/workspace/report
options: --gpus all --hostname flagscale_cicd
steps:
- name: checkout-code
- name: Checkout Code
uses: actions/checkout@v2

- name: unit_test-megatron
- name: Megatron Unit Test
run: |
export PYTHONPATH=./megatron:$PYTHONPATH
export PYTHONPATH=./../../FlagScale/:$PYTHONPATH
cd megatron
# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/fusions
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/pipeline_parallel
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/*.py
- name: unit_test-flagscale
export PYTHONPATH=..:$PYTHONPATH
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/data
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/dist_checkpointing
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/fusions
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/models
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/pipeline_parallel
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/tensor_parallel
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/transformer
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-megatron --cov=megatron/core -q -x tests/unit_tests/*.py
- name: Megatron Unit Test Coverage Online Report
run: |
echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-megatron/index.html"
- name: Flagscale Unit Test
run: |
export PYTHONPATH=./flagscale:$PYTHONPATH
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/launcher
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib --cov-append --cov-report=html:/workspace/report/${{github.sha}}/cov-report-flagscale --cov=flagscale -q -x tests/unit_tests/launcher
- name: functional_test-flagscale
- name: Flagscale Unit Test Coverage Online Report
run: |
echo "You can access the test coverage report at the http://120.92.110.224:8081/${{github.sha}}/cov-report-flagscale/index.html"
- name: Flagscale Functional Test
run: |
python run.py --config-path tests/functional_tests/aquila/conf --config-name config action=test
pytest -s tests/functional_tests/test_result.py --test_reaults_path=./tests/functional_tests/aquila/test_result
Empty file removed __init__.py
Empty file.
40 changes: 40 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# CI/development image for FlagScale, based on NVIDIA's PyTorch container.
FROM nvcr.io/nvidia/pytorch:24.05-py3

# Non-interactive apt for build-time installs; fixed timezone for tzdata.
# Use the ENV key=value form — the space-separated form is legacy syntax.
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Shanghai

##############################################################################
# Change apt source to Ksyun
##############################################################################
# The sed replaces the mirror URL (2nd field) in every sources.list entry.
# Truncating docker-clean / pkg-config-hook-config disables apt cache
# cleanup hooks that interfere with layer caching.
RUN sed -i "s#\S\+#http://apt.ksyun.cn/ubuntu/#2" /etc/apt/sources.list && \
    > /etc/apt/apt.conf.d/docker-clean && \
    > /etc/dpkg/dpkg.cfg.d/pkg-config-hook-config

##############################################################################
# Install basic utilities
##############################################################################
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl wget vim tmux less unzip \
        htop iftop iotop ca-certificates openssh-client openssh-server \
        rsync iputils-ping net-tools \
        tzdata psmisc screen && \
    apt-get clean

##############################################################################
# SSH configuration (not secure, only for development purpose)
##############################################################################
# NOTE(review): the key pair is baked into the image at build time, so every
# container from this image shares it — acceptable only for throwaway dev/CI.
# printf is used instead of echo because echo's handling of "\n" differs
# between shells (dash interprets it, bash does not).
RUN mkdir -p /run/sshd && \
    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    printf "StrictHostKeyChecking no\n" >> /etc/ssh/ssh_config

##############################################################################
# Install Miniconda
##############################################################################
RUN mkdir -p ~/miniconda3 && \
    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh && \
    bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 || { echo 'Miniconda installation failed' ; exit 1; } && \
    rm -rf ~/miniconda3/miniconda.sh && \
    ~/miniconda3/bin/conda init bash || { echo 'conda init failed' ; exit 1; } && \
    ~/miniconda3/bin/conda config --set auto_activate_base false || { echo 'conda config failed' ; exit 1; }
43 changes: 43 additions & 0 deletions examples/aquila/conf/config_auto_tuner.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Auto-tuner experiment config for Aquila: searches parallelism/recompute
# strategies, launching one short training task per candidate.
defaults:
  - train: demo
  - _self_

experiment:
  exp_name: aquila2
  exp_dir: ./outputs
  task:
    type: train
    backend: megatron
    entrypoint: ./flagscale/train/train_aquila.py
  runner:
    backend: torchrun
    nnodes: 1
    nproc_per_node: 8
  envs:
    CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7
    CUDA_DEVICE_MAX_CONNECTIONS: 1
  auto_tuner:
    # Search space: "auto" lets the tuner enumerate a dimension itself;
    # a list pins the candidate values explicitly.
    space:
      data_parallel_size: "auto"
      use_distributed_optimizer: [true, false]
      tensor_model_parallel_size: [2, 4, 8]
      sequence_parallel: [true]
      pipeline_model_parallel_size: "auto"
      num_layers_per_virtual_pipeline_stage: [1]
      context_parallel_size: "auto"
      expert_model_parallel_size: [1]
      micro_batch_size: "auto"
      use_recompute: [true]
      recompute_method: "auto"
      recompute_granularity: "auto"
      recompute_num_layers: "auto"
    # Budgets: per-task wall-clock cap (seconds), iterations per trial,
    # and the overall tuning time cap (seconds).
    control:
      max_time_per_task: 300
      train_iters: 5
      max_time: 600

action: run

hydra:
  run:
    dir: ${experiment.exp_dir}/hydra
2 changes: 1 addition & 1 deletion examples/aquila/conf/train/train_aquila_7b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,4 +61,4 @@ data:
vocab_file: ./examples/aquila/tokenizer/vocab.json
merge_file: ./examples/aquila/tokenizer/merges.txt
special_tokens_file: ./examples/aquila/tokenizer/special_tokens.txt
vocab_size: 100008
vocab_size: 100008
1 change: 1 addition & 0 deletions flagscale/auto_tuner/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .tuner import AutoTuner
97 changes: 97 additions & 0 deletions flagscale/auto_tuner/generate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import os
import copy


class Generator:
    """Materializes runnable training configs from auto-tuner strategies.

    A "strategy" is a dict of parallelism/recompute knobs (plus an 'idx')
    chosen by the search. This class deep-copies the base config and writes
    those knobs into the places the training backend reads them from.
    """

    def __init__(self, config):
        # Base OmegaConf-style config; deep-copied per generated task in gen().
        self.config = config
        # TODO: Just a temporary solution, need to be configured by user
        if "args_mapping" in config.experiment.auto_tuner:
            self.args_mapping = config.experiment.auto_tuner.args_mapping
        else:
            # Default strategy-key -> backend-argument mapping (identity
            # names for the Megatron backend).
            self.args_mapping = {
                "data_parallel_size": "data_parallel_size",
                "use_distributed_optimizer": "use_distributed_optimizer",
                "tensor_model_parallel_size": "tensor_model_parallel_size",
                "sequence_parallel": "sequence_parallel",
                "pipeline_model_parallel_size": "pipeline_model_parallel_size",
                "num_layers_per_virtual_pipeline_stage":
                "num_layers_per_virtual_pipeline_stage",
                "recompute_method": "recompute_method",
                "recompute_granularity": "recompute_granularity",
                "recompute_num_layers": "recompute_num_layers",
                "micro_batch_size": "micro_batch_size",
                "context_parallel_size": "context_parallel_size",
                "expert_model_parallel_size": "expert_model_parallel_size",
            }

    def _set_value(self, strategy, config):
        """Write every mapped strategy knob into `config` in place.

        micro_batch_size goes under train.model; data_parallel_size is
        never written (skipped here); all other knobs go under
        train.system. A None value deletes the key so the backend falls
        back to its own default.
        """
        for key, value in self.args_mapping.items():
            if key in ["micro_batch_size"]:
                config.train.model[value] = strategy[key]
            elif key in ["data_parallel_size"]:
                continue
            else:
                if strategy[key] is None:
                    if value in config.train.system:
                        del config.train.system[value]
                    continue
                config.train.system[value] = strategy[key]

    def gen(self, strategy):
        """Build and return a fresh per-task config for one tuning trial.

        Deep-copies the base config, applies `strategy`, then adjusts
        logging/checkpointing/scheduling so the short profiling run is
        cheap and its output is parseable.
        """
        config = copy.deepcopy(self.config)
        self._set_value(strategy, config)

        # Logging interval should be 1 so every iteration is recorded.
        config.train.system.logging.log_interval = 1

        # Set redirect and tee so the runner captures per-rank stdout/stderr.
        config.experiment.runner.tee = 3
        config.experiment.runner.redirects = 3

        # auto_tune should be true, it will not save ckpt when train ended and report memory every iteration
        config.train.system.auto_tune = True

        # Iteration-based train_iters is set below, so strip the
        # sample-based lr-scheduler knobs Megatron would otherwise reject.
        assert "optimizer" in config.train.model
        assert "lr_scheduler" in config.train.model.optimizer
        # Del lr_warmup_samples to run megatron.
        if "lr_warmup_samples" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.lr_warmup_samples
        # Del lr_decay_samples to run megatron.
        if "lr_decay_samples" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.lr_decay_samples
        # Del rampup_batch_size to run megatron.
        if "rampup_batch_size" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.rampup_batch_size
        # Del lr_warmup_fraction to run megatron.
        if "lr_warmup_fraction" in config.train.model.optimizer.lr_scheduler:
            del config.train.model.optimizer.lr_scheduler.lr_warmup_fraction

        # train_samples conflicts with the iteration-based run length.
        if "train_samples" in config.train.model:
            del config.train.model.train_samples

        # Del checkpoint load; tuning trials always start from scratch.
        if "checkpoint" in config.train.system:
            if "load" in config.train.system.checkpoint:
                del config.train.system.checkpoint.load
            if "save_interval" in config.train.system.checkpoint:
                # Large enough that a short trial never saves a checkpoint.
                config.train.system.checkpoint.save_interval = 2000

        # Set train_iters of each task (default 5 when no control section).
        if "control" in config.experiment.auto_tuner:
            config.train.model.train_iters = config.experiment.auto_tuner.control.get(
                "train_iters", 5)
        else:
            config.train.model.train_iters = 5

        # Per-task log dir: <exp_dir>/auto_tuner/task_<idx>.
        config.experiment.exp_dir = os.path.join(config.experiment.exp_dir,
                                                 "auto_tuner",
                                                 f"task_{strategy['idx']}")

        return config

    def gen_best_task(self, strategy, config):
        """Apply `strategy` onto `config` in place — no trial-only tweaks —
        and return it; used to emit the final best configuration."""
        self._set_value(strategy, config)
        return config
1 change: 1 addition & 0 deletions flagscale/auto_tuner/prune/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .pruner import Pruner
Loading

0 comments on commit 27577d9

Please sign in to comment.