Skip to content

Commit

Permalink
Merge branch 'main' into base_autotuner
Browse files Browse the repository at this point in the history
  • Loading branch information
Caozhou1995 authored Jun 6, 2024
2 parents 7199af2 + f57bde0 commit 5778d21
Show file tree
Hide file tree
Showing 21 changed files with 193 additions and 50 deletions.
24 changes: 11 additions & 13 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ name: flagscale-test

on:
push:
branches: [ "main", "add_CICD" ]
branches: [ "main" ]
pull_request:
branches: [ "main" ]

Expand All @@ -23,22 +23,20 @@ jobs:

- name: unit_test-megatron
run: |
export PYTHONPATH=./megatron:$PYTHONPATH
export PYTHONPATH=./../../FlagScale/:$PYTHONPATH
cd megatron
# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/fusions
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/pipeline_parallel
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/*.py
export PYTHONPATH=..:$PYTHONPATH
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/data
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/dist_checkpointing
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/fusions
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/models
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/pipeline_parallel
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/tensor_parallel
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/transformer
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/*.py
- name: unit_test-flagscale
run: |
export PYTHONPATH=./flagscale:$PYTHONPATH
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/launcher
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/launcher
- name: functional_test-flagscale
run: |
Expand Down
Empty file removed __init__.py
Empty file.
40 changes: 40 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Development image: NVIDIA PyTorch 24.05 base plus common CLI utilities,
# a passwordless SSH setup and a Miniconda installation under ~/miniconda3.
FROM nvcr.io/nvidia/pytorch:24.05-py3

# Use the key=value form; the space-separated `ENV key value` form is deprecated.
ENV DEBIAN_FRONTEND=noninteractive \
    TZ=Asia/Shanghai

##############################################################################
# Change apt source to Ksyun
##############################################################################
# The sed expression replaces the second whitespace-delimited token on every
# line of sources.list (the repository URL on a `deb <url> ...` line) with the
# Ksyun mirror. The two `>` redirections truncate the named files to zero
# length. NOTE(review): presumably this disables the base image's apt/dpkg
# hook configuration -- confirm against the base image.
RUN sed -i "s#\S\+#http://apt.ksyun.cn/ubuntu/#2" /etc/apt/sources.list && \
    > /etc/apt/apt.conf.d/docker-clean && \
    > /etc/dpkg/dpkg.cfg.d/pkg-config-hook-config

##############################################################################
# Install basic utilities
##############################################################################
# `update` and `install` stay in one layer so the package index is never stale.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        curl wget vim tmux less unzip \
        htop iftop iotop ca-certificates openssh-client openssh-server \
        rsync iputils-ping net-tools \
        tzdata psmisc screen && \
    apt-get clean

##############################################################################
# SSH configuration (not secure, only for development purpose)
##############################################################################
# Generates a passphrase-less key pair, authorizes it for the build user, and
# turns off host-key checking for outgoing connections.
# printf is used instead of `echo "...\n"`: whether echo expands `\n` depends on
# the shell (dash expands it, bash does not), so echo could write either an
# extra blank line or a literal "\n" into ssh_config.
RUN mkdir -p /run/sshd ~/.ssh && \
    chmod 700 ~/.ssh && \
    ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    printf 'StrictHostKeyChecking no\n' >> /etc/ssh/ssh_config

##############################################################################
# Install Miniconda
##############################################################################
# NOTE(review): `Miniconda3-latest` is unpinned and not checksum-verified, so
# rebuilds are not reproducible; consider pinning a release and its SHA-256.
# Each `|| { ...; exit 1; }` aborts the build with a message naming the step
# that failed. The installer script is removed in the same layer that
# downloaded it so it does not persist in the image.
RUN mkdir -p ~/miniconda3 && \
    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh && \
    bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 || { echo 'Miniconda installation failed' ; exit 1; } && \
    rm -rf ~/miniconda3/miniconda.sh && \
    ~/miniconda3/bin/conda init bash || { echo 'conda init failed' ; exit 1; } && \
    ~/miniconda3/bin/conda config --set auto_activate_base false || { echo 'conda config failed' ; exit 1; }
Original file line number Diff line number Diff line change
Expand Up @@ -495,4 +495,4 @@ def _get_size_per_split_per_dataset(
for weight in normalized_weights
]

return sizes_per_dataset
return sizes_per_dataset
1 change: 0 additions & 1 deletion megatron/megatron/training/global_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,4 +304,3 @@ def set_device_type(args):
# Apply the following patch during the import time
import patches


25 changes: 25 additions & 0 deletions megatron/tests/unit_tests/data/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Build a mocked argument namespace for the data unit tests and pass it to
# megatron.training.global_vars.set_args.
# NOTE(review): this view of the file has lost its indentation, so the nesting
# of the statements below (in particular what sits inside the `with` block) is
# inferred rather than visible -- confirm against the repository copy.
def set_mock_args():
from unittest import mock
# Assign the fixed attribute values listed below onto `args` and return it.
def init_mock_args(args):
args.data_parallel_random_init = False
args.virtual_pipeline_model_parallel_size = None
args.bf16 = True
args.accumulate_allreduce_grads_in_fp32 = False
args.overlap_grad_reduce = False
args.use_distributed_optimizer = True
args.load = None
args.save_param_index_maps_only = False
args.rampup_batch_size = None
args.global_batch_size = 8
args.micro_batch_size = 1
args.data_parallel_size = 8
args.adlr_autoresume = False
args.timing_log_option = 'minmax'
args.timing_log_level = 0
args.pretrained_checkpoint = None
return args

# Replace megatron.training.training.get_args with a mock while the `with`
# block is active. The extra keyword is forwarded to the created mock, so it
# sets data_parallel_random_init=False on the mock itself; init_mock_args then
# sets the same attribute (and the rest) on the mock's return_value.
with mock.patch('megatron.training.training.get_args', data_parallel_random_init=False) as mock_args:
init_mock_args(mock_args.return_value)
# Imported at call time rather than at module import time.
from megatron.training.global_vars import set_args
# Hand the populated mock return value to set_args.
set_args(mock_args.return_value)
5 changes: 4 additions & 1 deletion megatron/tests/unit_tests/data/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ def __len__(self) -> int:
def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]:
return {"text": self.dataset[self.sample_index[idx]]}

from tests.unit_tests.data import set_mock_args
set_mock_args()

with tempfile.TemporaryDirectory() as temp_dir:

paths = do_setup(temp_dir)
Expand Down Expand Up @@ -364,4 +367,4 @@ def __getitem__(self, idx: int) -> Dict[str, numpy.ndarray]:


if __name__ == "__main__":
test_builder()
test_builder()
5 changes: 4 additions & 1 deletion megatron/tests/unit_tests/data/test_gpt_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ def test_mock_gpt_dataset():
tokenizer=tokenizer,
)

from tests.unit_tests.data import set_mock_args
set_mock_args()

datasets = BlendedMegatronDatasetBuilder(
MockGPTDataset, [100, 100, 100], lambda: True, config
).build()
Expand Down Expand Up @@ -114,4 +117,4 @@ def test_mock_gpt_dataset():


if __name__ == "__main__":
test_mock_gpt_dataset()
test_mock_gpt_dataset()
5 changes: 4 additions & 1 deletion megatron/tests/unit_tests/data/test_multimodal_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ def test_mock_multimodal_dataset():
tokenizer=_NullTokenizer(vocab_size=_MOCK_VOCAB_SIZE),
)

from tests.unit_tests.data import set_mock_args
set_mock_args()

datasets = BlendedMegatronDatasetBuilder(
MockMultimodalDataset, [100, 100, 100], lambda: True, config
).build()
Expand All @@ -55,4 +58,4 @@ def test_mock_multimodal_dataset():


if __name__ == "__main__":
test_mock_multimodal_dataset()
test_mock_multimodal_dataset()
10 changes: 7 additions & 3 deletions megatron/tools/preprocess_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import multiprocessing
try:
import nltk
import shutil
nltk_available = True
except ImportError:
nltk_available = False
Expand Down Expand Up @@ -270,7 +271,12 @@ def main():

if args.split_sentences:
if nltk_available:
nltk.download("punkt", quiet=True, download_dir=os.environ.get("NLTK_DATA"))
try:
punkt_path = os.environ.get("NLTK_DATA") + "/tokenizers/punkt"
if not os.path.exists(punkt_path):
shutil.copytree('/root/nltk_data/tokenizers/punkt', punkt_path)
except:
nltk.download("punkt", quiet=True, download_dir=os.environ.get("NLTK_DATA"))
else:
raise Exception(
"nltk library required for sentence splitting is not available.")
Expand Down Expand Up @@ -404,6 +410,4 @@ def main():


if __name__ == '__main__':

main()

6 changes: 0 additions & 6 deletions requirements.txt

This file was deleted.

25 changes: 25 additions & 0 deletions requirements/requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# build tools
packaging
setuptools>=49.4.0
wheel

# testing
pytest
pytest_mock
pytest-cov
pytest-forked
pytest-asyncio
pytest-rerunfailures
pytest-shard

# formatting
black==24.4.2
codespell==2.2.6
isort==5.13.2
clang-format==18.1.5

# type checking
mypy==1.10.0

# pre-commit
pre-commit>=2.20.0
16 changes: 16 additions & 0 deletions requirements/requirements-inf-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# mirror from vllm repo with some modifications

# testing utils
awscli
einops # required for MPT
httpx
peft
requests
ray
sentence-transformers # required for embedding

# Benchmarking
aiohttp

# Multimodal
pillow
18 changes: 18 additions & 0 deletions requirements/requirements-inf.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# mirror from vllm repo with some modifications
torch
fastapi
aiohttp
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
lm-format-enforcer == 0.10.1
outlines == 0.0.34 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4

# Uncomment the following lines if you don't compile them from source
# xformers
# vllm-flash-attn
# vllm-nccl-cu12
3 changes: 3 additions & 0 deletions requirements/requirements-train-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
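# pip install git+https://github.com/fanshiqing/grouped_gemm@<tag>  (package name and tag were mangled by e-mail obfuscation in this capture; confirm the tag upstream)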
nltk
wrapt
4 changes: 4 additions & 0 deletions requirements/requirements-train.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
torch
wandb
deepspeed
dlrover[torch]
18 changes: 18 additions & 0 deletions requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
cmake >= 3.21
hydra-core
ninja
psutil
numpy
requests
py-cpuinfo
sentencepiece
transformers >= 4.40.0
tokenizers >= 0.19.1
tiktoken >= 0.6.0
ray >= 2.9
matplotlib
py3nvml
graphviz
tqdm
mpi4py
nvidia-ml-py
10 changes: 0 additions & 10 deletions requirements_dev.txt

This file was deleted.

2 changes: 1 addition & 1 deletion tests/functional_tests/aquila/gold_result/gold_result.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"lm loss:": {"values": [11.61173, 11.61412, 11.47692, 11.37522, 11.28096, 11.21686, 11.16553, 11.13099, 11.10483, 11.08886]}}
{"lm loss:": {"values": [11.61173, 11.61412, 11.47692, 11.37523, 11.28096, 11.21686, 11.16553, 11.13099, 11.10483, 11.08886]}}
3 changes: 1 addition & 2 deletions tests/scripts/unit_test_flagscale.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
export PYTHONPATH=./flagscale:$PYTHONPATH
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/launcher
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/launcher
21 changes: 11 additions & 10 deletions tests/scripts/unit_test_megatron.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
export PYTHONPATH=./megatron:$PYTHONPATH
export PYTHONPATH=./../../FlagScale/:$PYTHONPATH
cd megatron
# torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/data
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/dist_checkpointing
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/fusions
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/models
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/pipeline_parallel
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/tensor_parallel
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/transformer
torchrun --nproc_per_node=8 -m pytest -q -x tests/unit_tests/*.py

export PYTHONPATH=..:$PYTHONPATH

torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/data
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/dist_checkpointing
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/fusions
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/models
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/pipeline_parallel
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/tensor_parallel
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/transformer
torchrun --nproc_per_node=8 -m pytest --import-mode=importlib -q -x tests/unit_tests/*.py

0 comments on commit 5778d21

Please sign in to comment.